|
typedef vl_simd_vec4_f32 (*vl_simd_load_vec4f32_fn)(const vl_float32_t *);
typedef void (*vl_simd_store_vec4f32_fn)(vl_float32_t *, vl_simd_vec4_f32);
typedef vl_simd_vec4_f32 (*vl_simd_splat_vec4f32_fn)(vl_float32_t);
typedef vl_simd_vec4_f32 (*vl_simd_add_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32);
typedef vl_simd_vec4_f32 (*vl_simd_sub_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32);
typedef vl_simd_vec4_f32 (*vl_simd_mul_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32);
typedef vl_simd_vec4_f32 (*vl_simd_div_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32);
typedef vl_simd_vec4_f32 (*vl_simd_fma_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32, vl_simd_vec4_f32);
typedef vl_float32_t (*vl_simd_hsum_vec4f32_fn)(vl_simd_vec4_f32);
typedef vl_simd_vec8_f32 (*vl_simd_load_vec8f32_fn)(const vl_float32_t *);
typedef void (*vl_simd_store_vec8f32_fn)(vl_float32_t *, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_add_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_mul_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_fma_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_splat_vec8f32_fn)(vl_float32_t);
typedef vl_simd_vec8_f32 (*vl_simd_sub_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_lt_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_gt_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_eq_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_and_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_or_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_xor_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_not_vec8f32_fn)(vl_simd_vec8_f32);
typedef vl_simd_vec4_f32 (*vl_simd_cmp_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32);
typedef vl_simd_vec4_f32 (*vl_simd_bitwise_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32);
typedef vl_simd_vec4_f32 (*vl_simd_not_vec4f32_fn)(vl_simd_vec4_f32);
typedef vl_float32_t (*vl_simd_hmax_vec4f32_fn)(vl_simd_vec4_f32);
typedef vl_float32_t (*vl_simd_hmin_vec4f32_fn)(vl_simd_vec4_f32);
typedef vl_float32_t (*vl_simd_hprod_vec4f32_fn)(vl_simd_vec4_f32);
typedef vl_float32_t (*vl_simd_extract_lane_vec4f32_fn)(vl_simd_vec4_f32, int);
typedef vl_simd_vec4_f32 (*vl_simd_broadcast_lane_vec4f32_fn)(vl_simd_vec4_f32, int);
typedef vl_simd_vec4_i32 (*vl_simd_load_vec4i32_fn)(const vl_int32_t *);
typedef void (*vl_simd_store_vec4i32_fn)(vl_int32_t *, vl_simd_vec4_i32);
typedef vl_simd_vec4_i32 (*vl_simd_add_vec4i32_fn)(vl_simd_vec4_i32, vl_simd_vec4_i32);
typedef vl_simd_vec4_i32 (*vl_simd_mul_vec4i32_fn)(vl_simd_vec4_i32, vl_simd_vec4_i32);
typedef vl_simd_vec8_i16 (*vl_simd_load_vec8i16_fn)(const vl_int16_t *);
typedef void (*vl_simd_store_vec8i16_fn)(vl_int16_t *, vl_simd_vec8_i16);
typedef vl_simd_vec8_i16 (*vl_simd_add_vec8i16_fn)(vl_simd_vec8_i16, vl_simd_vec8_i16);
typedef vl_simd_vec32_u8 (*vl_simd_load_vec32u8_fn)(const vl_uint8_t *);
typedef void (*vl_simd_store_vec32u8_fn)(vl_uint8_t *, vl_simd_vec32_u8);
Transparent runtime-selected SIMD abstraction layer.
Veritable Lasagna — a data structures and algorithms library for C11.
Copyright 2026 Jesse Walker, released under the MIT license. Git Repository: https://github.com/walkerje/veritable_lasagna
Provides a unified, architecture-agnostic interface to SIMD operations with automatic CPU capability detection and backend selection at initialization time.
Overview
This module abstracts away platform-specific SIMD intrinsics (SSE2, AVX2, NEON, etc.) behind a portable C API. The best available implementation is selected once at application startup via vlSIMDInit(), with graceful fallback to portable C implementations on all platforms.
Key design principles:
- Zero runtime cost after init: All backend selection happens once; after that, function pointers are just table lookups.
- No global state pollution: Selection is stored locally in vlSIMDFunctions.
- Transparent API: Users call simple macros; backend is invisible.
- Portable C always available: The abstraction itself is proven correct before optimizations layer on top.
Supported Architectures & Backends
x86 / x86-64
- Portable C: Always available, unoptimized scalar fallback.
- SSE2: Available on all x86-64 CPUs and most modern x86. Provides 128-bit operations on 4 float or 4 int32. Integer 32-bit multiply falls back to scalar.
- AVX2: Intel Haswell (2013+), AMD Excavator (2015+). Extends SSE with 256-bit operations (8 float or 8 int32), true FMA, and better integer operations.
ARM / ARM64
- NEON (ARMv7): 128-bit operations on 4 float or mixed-width integers. Division uses reciprocal approximation with one Newton-Raphson iteration (~11-12 bits accuracy).
- NEON64 (ARMv8+): Enhanced NEON with true FMA and improved precision. Handles 8-wide operations via two 128-bit registers.
Selection priority (checked in order):
- x86: AVX2 > SSE2 > Portable C
- ARM64: NEON64 > Portable C
- ARM32: NEON > Portable C
- Other: Portable C
Usage Pattern
int main() {
    const char *backend = vlSIMDInit();  /* select the best available backend once */
    printf("Using SIMD backend: %s\n", backend);
    vlSIMDStoreVec4F32(output_ptr, result);
    float sum = vlSIMDHsumVec4F32(result);
    float max_val = vlSIMDHmaxVec4F32(result);
    return 0;
}
const char * vlSIMDInit(void)
Initializes the SIMD subsystem and selects the best available backend.
Definition vl_simd.c:135
Transparent runtime-selected SIMD abstraction layer.
vl_simd_vec4_f32
Definition vl_simd.h:223
Vector Types
All vector types are struct-based with element arrays, enabling transparent interchange between backends. Alignment hints are provided for cache efficiency.
- 4-wide float (F32): 4 × 32-bit floats, 16-byte aligned
- 8-wide float (F32): 8 × 32-bit floats, 32-byte aligned (for AVX)
- 4-wide int32 (I32): 4 × 32-bit signed integers, 16-byte aligned
- 8-wide int16 (I16): 8 × 16-bit signed integers, 32-byte aligned
- 32-wide uint8 (U8): 32 × 8-bit unsigned integers, 32-byte aligned
Operation Categories
Arithmetic (4-wide F32)
- Basic: vlSIMDAddVec4F32, vlSIMDSubVec4F32, vlSIMDMulVec4F32, vlSIMDDivVec4F32
- Advanced: vlSIMDFmaVec4F32 (fused multiply-add, hardware native when available)
Arithmetic (8-wide F32)
- Basic: vlSIMDAddVec8F32, vlSIMDMulVec8F32
- Advanced: vlSIMDFmaVec8F32
Horizontal Reductions (4-wide F32)
Reduce a vector to a single scalar by combining all lanes:
- vlSIMDHsumVec4F32: Sum all 4 elements
- vlSIMDHmaxVec4F32: Maximum element
- vlSIMDHminVec4F32: Minimum element
- vlSIMDHprodVec4F32: Product of all elements
Comparisons (4-wide F32)
Return element-wise masks (0xFFFFFFFF for true, 0x00000000 for false):
- vlSIMDLtVec4F32: Less-than
- vlSIMDGtVec4F32: Greater-than
- vlSIMDEqVec4F32: Equality
Bitwise Operations (4-wide F32)
Treat float bits as integers:
- vlSIMDAndVec4F32, vlSIMDOrVec4F32, vlSIMDXorVec4F32, vlSIMDNotVec4F32
Lane Operations (4-wide F32)
- vlSIMDExtractLaneVec4F32: Extract single lane to scalar
- vlSIMDBroadcastLaneVec4F32: Replicate single lane to all lanes
Integer Operations
- I32: Load, store, add, multiply (4-wide)
- I16: Load, store, add (8-wide)
- U8: Load, store (32-wide)
Important Notes on Precision & Behavior
Division on NEON (ARMv7/ARMv8)
NEON does not have native division. The implementation uses reciprocal approximation with Newton-Raphson refinement:
recip = vrecpeq_f32(b)
recip = recip * vrecpsq_f32(b, recip)
result = a * recip
This achieves ~11-12 bits of accuracy, sufficient for graphics but not for high-precision numerical work. Use portable C or compute higher-precision divisions on CPU if needed.
Integer Multiply on SSE2
SSE2 lacks 32-bit integer multiply, so vlSIMDMulVec4I32 falls back to scalar operations on this backend.
Comparison Results as Float Bits
Comparison operations (lt, gt, eq) return masks stored as float bit patterns:
- True: 0xFFFFFFFF (all bits set)
- False: 0x00000000
These can be used in bitwise operations or with FMA for blending.
Memory Alignment
Load/store operations use unaligned variants (_mm_loadu_ps, _mm256_loadu_ps, etc.) to accept arbitrary pointers. If data is guaranteed aligned, consider manual optimization to aligned variants for performance.
Thread Safety
Initialization: vlSIMDInit() is thread-safe. Subsequent calls return immediately and are safe to call from multiple threads.
Runtime use: vlSIMDFunctions is read-only after initialization. All threads can safely call SIMD operations without synchronization.
Performance Considerations
Load/Store Overhead
The struct-based design requires loading vectors from component arrays and storing back after operations. Modern compilers optimize these to single instructions when possible, but be aware of this pattern in hot loops.
Macro API
All operations are exposed as macros (e.g., vlSIMDAddVec4F32) that defer to function pointers. This incurs one indirect call per operation. For very tight inner loops, consider caching frequently used functions:
vl_simd_add_vec4f32_fn add_fn = vlSIMDFunctions.add_vec4f32;
for (...) {
    result = add_fn(a, b);
}
vl_simd_functions_t vlSIMDFunctions
Global SIMD function table.
Definition vl_simd.c:34
vl_simd_vec4_f32(* vl_simd_add_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32)
Definition vl_simd.h:314
vl_simd_add_vec4f32_fn add_vec4f32
Definition vl_simd.h:385
8-Wide Operations
8-wide operations on NEON/SSE2 are synthesized from two 128-bit registers.
See also:
- vlSIMDInit
- vl_simd_functions_t