Veritable Lasagna
An Allocator & Data Structure Library for C.
Loading...
Searching...
No Matches
vl_simd.h File Reference

Transparent runtime-selected SIMD abstraction layer. More...

#include <vl/vl_memory.h>
#include <vl/vl_numtypes.h>
+ Include dependency graph for vl_simd.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  vl_simd_functions_t
 Master SIMD function dispatch table. More...
 

Typedefs

typedef vl_simd_vec4_f32(* vl_simd_load_vec4f32_fn) (const vl_float32_t *)
 
typedef void(* vl_simd_store_vec4f32_fn) (vl_float32_t *, vl_simd_vec4_f32)
 
typedef vl_simd_vec4_f32(* vl_simd_splat_vec4f32_fn) (vl_float32_t)
 
typedef vl_simd_vec4_f32(* vl_simd_add_vec4f32_fn) (vl_simd_vec4_f32, vl_simd_vec4_f32)
 
typedef vl_simd_vec4_f32(* vl_simd_sub_vec4f32_fn) (vl_simd_vec4_f32, vl_simd_vec4_f32)
 
typedef vl_simd_vec4_f32(* vl_simd_mul_vec4f32_fn) (vl_simd_vec4_f32, vl_simd_vec4_f32)
 
typedef vl_simd_vec4_f32(* vl_simd_div_vec4f32_fn) (vl_simd_vec4_f32, vl_simd_vec4_f32)
 
typedef vl_simd_vec4_f32(* vl_simd_fma_vec4f32_fn) (vl_simd_vec4_f32, vl_simd_vec4_f32, vl_simd_vec4_f32)
 
typedef vl_float32_t(* vl_simd_hsum_vec4f32_fn) (vl_simd_vec4_f32)
 
typedef vl_simd_vec8_f32(* vl_simd_load_vec8f32_fn) (const vl_float32_t *)
 
typedef void(* vl_simd_store_vec8f32_fn) (vl_float32_t *, vl_simd_vec8_f32)
 
typedef vl_simd_vec8_f32(* vl_simd_add_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)
 
typedef vl_simd_vec8_f32(* vl_simd_mul_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)
 
typedef vl_simd_vec8_f32(* vl_simd_fma_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32, vl_simd_vec8_f32)
 
typedef vl_simd_vec8_f32(* vl_simd_splat_vec8f32_fn) (vl_float32_t)
 
typedef vl_simd_vec8_f32(* vl_simd_sub_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)
 
typedef vl_simd_vec8_f32(* vl_simd_lt_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)
 
typedef vl_simd_vec8_f32(* vl_simd_gt_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)
 
typedef vl_simd_vec8_f32(* vl_simd_eq_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)
 
typedef vl_simd_vec8_f32(* vl_simd_and_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)
 
typedef vl_simd_vec8_f32(* vl_simd_or_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)
 
typedef vl_simd_vec8_f32(* vl_simd_xor_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)
 
typedef vl_simd_vec8_f32(* vl_simd_not_vec8f32_fn) (vl_simd_vec8_f32)
 
typedef vl_simd_vec4_f32(* vl_simd_cmp_vec4f32_fn) (vl_simd_vec4_f32, vl_simd_vec4_f32)
 
typedef vl_simd_vec4_f32(* vl_simd_bitwise_vec4f32_fn) (vl_simd_vec4_f32, vl_simd_vec4_f32)
 
typedef vl_simd_vec4_f32(* vl_simd_not_vec4f32_fn) (vl_simd_vec4_f32)
 
typedef vl_float32_t(* vl_simd_hmax_vec4f32_fn) (vl_simd_vec4_f32)
 
typedef vl_float32_t(* vl_simd_hmin_vec4f32_fn) (vl_simd_vec4_f32)
 
typedef vl_float32_t(* vl_simd_hprod_vec4f32_fn) (vl_simd_vec4_f32)
 
typedef vl_float32_t(* vl_simd_extract_lane_vec4f32_fn) (vl_simd_vec4_f32, int)
 
typedef vl_simd_vec4_f32(* vl_simd_broadcast_lane_vec4f32_fn) (vl_simd_vec4_f32, int)
 
typedef vl_simd_vec4_i32(* vl_simd_load_vec4i32_fn) (const vl_int32_t *)
 
typedef void(* vl_simd_store_vec4i32_fn) (vl_int32_t *, vl_simd_vec4_i32)
 
typedef vl_simd_vec4_i32(* vl_simd_add_vec4i32_fn) (vl_simd_vec4_i32, vl_simd_vec4_i32)
 
typedef vl_simd_vec4_i32(* vl_simd_mul_vec4i32_fn) (vl_simd_vec4_i32, vl_simd_vec4_i32)
 
typedef vl_simd_vec8_i16(* vl_simd_load_vec8i16_fn) (const vl_int16_t *)
 
typedef void(* vl_simd_store_vec8i16_fn) (vl_int16_t *, vl_simd_vec8_i16)
 
typedef vl_simd_vec8_i16(* vl_simd_add_vec8i16_fn) (vl_simd_vec8_i16, vl_simd_vec8_i16)
 
typedef vl_simd_vec32_u8(* vl_simd_load_vec32u8_fn) (const vl_uint8_t *)
 
typedef void(* vl_simd_store_vec32u8_fn) (vl_uint8_t *, vl_simd_vec32_u8)
 

Functions

struct VL_ALIGN_HINT (16) vl_simd_vec4_f32_
 4-element 32-bit float vector.
 
struct VL_ALIGN_HINT (32) vl_simd_vec8_f32_
 8-element 32-bit float vector.
 
VL_API const char * vlSIMDInit (void)
 Initializes the SIMD subsystem and selects the best available backend.
 

Variables

 vl_simd_vec4_f32
 
 vl_simd_vec8_f32
 
 vl_simd_vec4_i32
 
 vl_simd_vec8_i32
 
 vl_simd_vec8_i16
 
 vl_simd_vec16_u8
 
 vl_simd_vec32_u8
 
VL_API vl_simd_functions_t vlSIMDFunctions
 Global SIMD function table.
 

Detailed Description

Transparent runtime-selected SIMD abstraction layer.

VERITABLE LASAGNA ====—: A Data Structure and Algorithms library for C11. :—====

Copyright 2026 Jesse Walker, released under the MIT license. Git Repository: https://github.com/walkerje/veritable_lasagna

Provides a unified, architecture-agnostic interface to SIMD operations with automatic CPU capability detection and backend selection at initialization time.

Overview

This module abstracts away platform-specific SIMD intrinsics (SSE2, AVX2, NEON, etc.) behind a portable C API. The best available implementation is selected once at application startup via vlSIMDInit(), with graceful fallback to portable C implementations on all platforms.

Key design principles:

  • Zero runtime cost after init: All backend selection happens once; after that, function pointers are just table lookups.
  • Minimal global state: Backend selection is stored in a single global table, vlSIMDFunctions, set once at init.
  • Transparent API: Users call simple macros; backend is invisible.
  • Portable C always available: The abstraction itself is proven correct before optimizations layer on top.

Supported Architectures & Backends

x86 / x86-64

  • Portable C: Always available, unoptimized scalar fallback.
  • SSE2: Available on all x86-64 CPUs and most modern x86. Provides 128-bit operations on 4 float or 4 int32. Integer 32-bit multiply falls back to scalar.
  • AVX2: Intel Haswell (2013+), AMD Excavator (2015+). Extends SSE with 256-bit operations (8 float or 8 int32), true FMA, and better integer operations.

ARM / ARM64

  • NEON (ARMv7): 128-bit operations on 4 float or mixed-width integers. Division uses reciprocal approximation with one Newton-Raphson iteration (~11-12 bits accuracy).
  • NEON64 (ARMv8+): Enhanced NEON with true FMA and improved precision. Handles 8-wide operations via two 128-bit registers.

Selection priority (checked in order):

  • x86: AVX2 > SSE2 > Portable C
  • ARM64: NEON64 > Portable C
  • ARM32: NEON > Portable C
  • Other: Portable C

Usage Pattern

#include <vl/vl_simd.h>
int main() {
// Initialize once at startup
const char* backend = vlSIMDInit();
printf("Using SIMD backend: %s\n", backend); // e.g., "AVX2", "NEON64"
// Use transparently anywhere
vl_simd_vec4_f32 a = vlSIMDLoadVec4F32(data_ptr);
vl_simd_vec4_f32 b = vlSIMDLoadVec4F32(data_ptr2);
vl_simd_vec4_f32 result = vlSIMDAddVec4F32(a, b);
vlSIMDStoreVec4F32(output_ptr, result);
// Horizontal reductions
float sum = vlSIMDHsumVec4F32(result);
float max_val = vlSIMDHmaxVec4F32(result);
return 0;
}
const char * vlSIMDInit(void)
Initializes the SIMD subsystem and selects the best available backend.
Definition vl_simd.c:135
Transparent runtime-selected SIMD abstraction layer.
vl_simd_vec4_f32
Definition vl_simd.h:223

Vector Types

All vector types are struct-based with element arrays, enabling transparent interchange between backends. Alignment hints are provided for cache efficiency.

  • 4-wide float (F32): 4 × 32-bit floats, 16-byte aligned
  • 8-wide float (F32): 8 × 32-bit floats, 32-byte aligned (for AVX)
  • 4-wide int32 (I32): 4 × 32-bit signed integers, 16-byte aligned
  • 8-wide int16 (I16): 8 × 16-bit signed integers, 32-byte aligned
  • 32-wide uint8 (U8): 32 × 8-bit unsigned integers, 32-byte aligned

Operation Categories

Arithmetic (4-wide F32)

  • Basic: vlSIMDAddVec4F32, vlSIMDSubVec4F32, vlSIMDMulVec4F32, vlSIMDDivVec4F32
  • Advanced: vlSIMDFmaVec4F32 (fused multiply-add, hardware native when available)

Arithmetic (8-wide F32)

  • Basic: vlSIMDAddVec8F32, vlSIMDMulVec8F32
  • Advanced: vlSIMDFmaVec8F32

Horizontal Reductions (4-wide F32)

Reduce a vector to a single scalar by combining all lanes:

  • vlSIMDHsumVec4F32: Sum all 4 elements
  • vlSIMDHmaxVec4F32: Maximum element
  • vlSIMDHminVec4F32: Minimum element
  • vlSIMDHprodVec4F32: Product of all elements

Comparisons (4-wide F32)

Return element-wise masks (0xFFFFFFFF for true, 0x00000000 for false):

  • vlSIMDLtVec4F32: Less-than
  • vlSIMDGtVec4F32: Greater-than
  • vlSIMDEqVec4F32: Equality

Bitwise Operations (4-wide F32)

Treat float bits as integers:

  • vlSIMDAndVec4F32, vlSIMDOrVec4F32, vlSIMDXorVec4F32, vlSIMDNotVec4F32

Lane Operations (4-wide F32)

  • vlSIMDExtractLaneVec4F32: Extract single lane to scalar
  • vlSIMDBroadcastLaneVec4F32: Replicate single lane to all lanes

Integer Operations

  • I32: Load, store, add, multiply (4-wide)
  • I16: Load, store, add (8-wide)
  • U8: Load, store (32-wide)

Important Notes on Precision & Behavior

Division on NEON (ARMv7/ARMv8)

NEON does not have native division. The implementation uses reciprocal approximation with Newton-Raphson refinement:

recip = vrecpeq_f32(b)
recip = recip * vrecpsq_f32(b, recip) // One iteration
result = a * recip

This achieves ~11-12 bits of accuracy, sufficient for graphics but not for high-precision numerical work. Use the portable C backend or perform scalar division if higher precision is needed.

Integer Multiply on SSE2

SSE2 lacks 32-bit integer multiply, so vlSIMDMulVec4I32 falls back to scalar operations on this backend.

Comparison Results as Float Bits

Comparison operations (lt, gt, eq) return masks stored as float bit patterns:

  • True: 0xFFFFFFFF (all bits set)
  • False: 0x00000000

These can be used in bitwise operations or with FMA for blending.

Memory Alignment

Load/store operations use unaligned variants (_mm_loadu_ps, _mm256_loadu_ps, etc.) to accept arbitrary pointers. If data is guaranteed aligned, consider manual optimization to aligned variants for performance.

Thread Safety

Initialization: vlSIMDInit() is thread-safe. Subsequent calls return immediately and are safe to call from multiple threads.

Runtime use: vlSIMDFunctions is read-only after initialization. All threads can safely call SIMD operations without synchronization.

Performance Considerations

Load/Store Overhead

The struct-based design requires loading vectors from component arrays and storing back after operations. Modern compilers optimize these to single instructions when possible, but be aware of this pattern for hot loops.

Macro API

All operations are exposed as macros (e.g., vlSIMDAddVec4F32) that defer to function pointers. This incurs one indirect call per operation. For very tight inner loops, consider caching frequently used functions:

for (...) {
result = add_fn(a, b); // Marginally faster than the macro: avoids one table lookup per call
}
vl_simd_functions_t vlSIMDFunctions
Global SIMD function table.
Definition vl_simd.c:34
vl_simd_vec4_f32(* vl_simd_add_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32)
Definition vl_simd.h:314
vl_simd_add_vec4f32_fn add_vec4f32
Definition vl_simd.h:385

8-Wide Operations

8-wide operations on NEON/SSE2 are synthesized from two 128-bit registers.

See also
vlSIMDInit
vl_simd_functions_t

Data Structure Documentation

◆ vl_simd_functions_t

struct vl_simd_functions_t

Master SIMD function dispatch table.

Initialized by vlSIMDInit() to point to the best available backend for the target architecture. Thread-safe to read after initialization; do not modify.

Contains function pointers for:

  • Load/store operations (memory I/O)
  • Arithmetic (add, subtract, multiply, divide, FMA)
  • Comparisons (lt, gt, eq)
  • Bitwise operations (and, or, xor, not)
  • Horizontal reductions (sum, max, min, product)
  • Lane operations (extract, broadcast)
  • Integer operations (I32, I16, U8)
Note
Read-only after vlSIMDInit(). Modifying this after initialization will cause undefined behavior.
See also
vlSIMDInit
+ Collaboration diagram for vl_simd_functions_t:
Data Fields
vl_simd_add_vec4f32_fn add_vec4f32
vl_simd_add_vec4i32_fn add_vec4i32
vl_simd_add_vec8f32_fn add_vec8f32
vl_simd_add_vec8i16_fn add_vec8i16
vl_simd_bitwise_vec4f32_fn and_vec4f32
vl_simd_and_vec8f32_fn and_vec8f32
const char * backend_name Backend name string for logging/debugging (e.g., "AVX2", "NEON64").
vl_simd_broadcast_lane_vec4f32_fn broadcast_lane_vec4f32
vl_simd_div_vec4f32_fn div_vec4f32
vl_simd_cmp_vec4f32_fn eq_vec4f32
vl_simd_eq_vec8f32_fn eq_vec8f32
vl_simd_extract_lane_vec4f32_fn extract_lane_vec4f32
vl_simd_fma_vec4f32_fn fma_vec4f32
vl_simd_fma_vec8f32_fn fma_vec8f32
vl_simd_cmp_vec4f32_fn gt_vec4f32
vl_simd_gt_vec8f32_fn gt_vec8f32
vl_simd_hmax_vec4f32_fn hmax_vec4f32
vl_simd_hmin_vec4f32_fn hmin_vec4f32
vl_simd_hprod_vec4f32_fn hprod_vec4f32
vl_simd_hsum_vec4f32_fn hsum_vec4f32
vl_simd_load_vec32u8_fn load_vec32u8
vl_simd_load_vec4f32_fn load_vec4f32
vl_simd_load_vec4i32_fn load_vec4i32
vl_simd_load_vec8f32_fn load_vec8f32
vl_simd_load_vec8i16_fn load_vec8i16
vl_simd_cmp_vec4f32_fn lt_vec4f32
vl_simd_lt_vec8f32_fn lt_vec8f32
vl_simd_mul_vec4f32_fn mul_vec4f32
vl_simd_mul_vec4i32_fn mul_vec4i32
vl_simd_mul_vec8f32_fn mul_vec8f32
vl_simd_not_vec4f32_fn not_vec4f32
vl_simd_not_vec8f32_fn not_vec8f32
vl_simd_bitwise_vec4f32_fn or_vec4f32
vl_simd_or_vec8f32_fn or_vec8f32
vl_simd_splat_vec4f32_fn splat_vec4f32
vl_simd_splat_vec8f32_fn splat_vec8f32
vl_simd_store_vec32u8_fn store_vec32u8
vl_simd_store_vec4f32_fn store_vec4f32
vl_simd_store_vec4i32_fn store_vec4i32
vl_simd_store_vec8f32_fn store_vec8f32
vl_simd_store_vec8i16_fn store_vec8i16
vl_simd_sub_vec4f32_fn sub_vec4f32
vl_simd_sub_vec8f32_fn sub_vec8f32
vl_simd_bitwise_vec4f32_fn xor_vec4f32
vl_simd_xor_vec8f32_fn xor_vec8f32

Typedef Documentation

◆ vl_simd_add_vec4f32_fn

typedef vl_simd_vec4_f32(* vl_simd_add_vec4f32_fn) (vl_simd_vec4_f32, vl_simd_vec4_f32)

◆ vl_simd_add_vec4i32_fn

typedef vl_simd_vec4_i32(* vl_simd_add_vec4i32_fn) (vl_simd_vec4_i32, vl_simd_vec4_i32)

◆ vl_simd_add_vec8f32_fn

typedef vl_simd_vec8_f32(* vl_simd_add_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)

◆ vl_simd_add_vec8i16_fn

typedef vl_simd_vec8_i16(* vl_simd_add_vec8i16_fn) (vl_simd_vec8_i16, vl_simd_vec8_i16)

◆ vl_simd_and_vec8f32_fn

typedef vl_simd_vec8_f32(* vl_simd_and_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)

◆ vl_simd_bitwise_vec4f32_fn

typedef vl_simd_vec4_f32(* vl_simd_bitwise_vec4f32_fn) (vl_simd_vec4_f32, vl_simd_vec4_f32)

◆ vl_simd_broadcast_lane_vec4f32_fn

typedef vl_simd_vec4_f32(* vl_simd_broadcast_lane_vec4f32_fn) (vl_simd_vec4_f32, int)

◆ vl_simd_cmp_vec4f32_fn

typedef vl_simd_vec4_f32(* vl_simd_cmp_vec4f32_fn) (vl_simd_vec4_f32, vl_simd_vec4_f32)

◆ vl_simd_div_vec4f32_fn

typedef vl_simd_vec4_f32(* vl_simd_div_vec4f32_fn) (vl_simd_vec4_f32, vl_simd_vec4_f32)

◆ vl_simd_eq_vec8f32_fn

typedef vl_simd_vec8_f32(* vl_simd_eq_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)

◆ vl_simd_extract_lane_vec4f32_fn

typedef vl_float32_t(* vl_simd_extract_lane_vec4f32_fn) (vl_simd_vec4_f32, int)

◆ vl_simd_fma_vec4f32_fn

typedef vl_simd_vec4_f32(* vl_simd_fma_vec4f32_fn) (vl_simd_vec4_f32, vl_simd_vec4_f32, vl_simd_vec4_f32)

◆ vl_simd_fma_vec8f32_fn

typedef vl_simd_vec8_f32(* vl_simd_fma_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32, vl_simd_vec8_f32)

◆ vl_simd_gt_vec8f32_fn

typedef vl_simd_vec8_f32(* vl_simd_gt_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)

◆ vl_simd_hmax_vec4f32_fn

typedef vl_float32_t(* vl_simd_hmax_vec4f32_fn) (vl_simd_vec4_f32)

◆ vl_simd_hmin_vec4f32_fn

typedef vl_float32_t(* vl_simd_hmin_vec4f32_fn) (vl_simd_vec4_f32)

◆ vl_simd_hprod_vec4f32_fn

typedef vl_float32_t(* vl_simd_hprod_vec4f32_fn) (vl_simd_vec4_f32)

◆ vl_simd_hsum_vec4f32_fn

typedef vl_float32_t(* vl_simd_hsum_vec4f32_fn) (vl_simd_vec4_f32)

◆ vl_simd_load_vec32u8_fn

typedef vl_simd_vec32_u8(* vl_simd_load_vec32u8_fn) (const vl_uint8_t *)

◆ vl_simd_load_vec4f32_fn

typedef vl_simd_vec4_f32(* vl_simd_load_vec4f32_fn) (const vl_float32_t *)

◆ vl_simd_load_vec4i32_fn

typedef vl_simd_vec4_i32(* vl_simd_load_vec4i32_fn) (const vl_int32_t *)

◆ vl_simd_load_vec8f32_fn

typedef vl_simd_vec8_f32(* vl_simd_load_vec8f32_fn) (const vl_float32_t *)

◆ vl_simd_load_vec8i16_fn

typedef vl_simd_vec8_i16(* vl_simd_load_vec8i16_fn) (const vl_int16_t *)

◆ vl_simd_lt_vec8f32_fn

typedef vl_simd_vec8_f32(* vl_simd_lt_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)

◆ vl_simd_mul_vec4f32_fn

typedef vl_simd_vec4_f32(* vl_simd_mul_vec4f32_fn) (vl_simd_vec4_f32, vl_simd_vec4_f32)

◆ vl_simd_mul_vec4i32_fn

typedef vl_simd_vec4_i32(* vl_simd_mul_vec4i32_fn) (vl_simd_vec4_i32, vl_simd_vec4_i32)

◆ vl_simd_mul_vec8f32_fn

typedef vl_simd_vec8_f32(* vl_simd_mul_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)

◆ vl_simd_not_vec4f32_fn

typedef vl_simd_vec4_f32(* vl_simd_not_vec4f32_fn) (vl_simd_vec4_f32)

◆ vl_simd_not_vec8f32_fn

typedef vl_simd_vec8_f32(* vl_simd_not_vec8f32_fn) (vl_simd_vec8_f32)

◆ vl_simd_or_vec8f32_fn

typedef vl_simd_vec8_f32(* vl_simd_or_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)

◆ vl_simd_splat_vec4f32_fn

typedef vl_simd_vec4_f32(* vl_simd_splat_vec4f32_fn) (vl_float32_t)

◆ vl_simd_splat_vec8f32_fn

typedef vl_simd_vec8_f32(* vl_simd_splat_vec8f32_fn) (vl_float32_t)

◆ vl_simd_store_vec32u8_fn

typedef void(* vl_simd_store_vec32u8_fn) (vl_uint8_t *, vl_simd_vec32_u8)

◆ vl_simd_store_vec4f32_fn

typedef void(* vl_simd_store_vec4f32_fn) (vl_float32_t *, vl_simd_vec4_f32)

◆ vl_simd_store_vec4i32_fn

typedef void(* vl_simd_store_vec4i32_fn) (vl_int32_t *, vl_simd_vec4_i32)

◆ vl_simd_store_vec8f32_fn

typedef void(* vl_simd_store_vec8f32_fn) (vl_float32_t *, vl_simd_vec8_f32)

◆ vl_simd_store_vec8i16_fn

typedef void(* vl_simd_store_vec8i16_fn) (vl_int16_t *, vl_simd_vec8_i16)

◆ vl_simd_sub_vec4f32_fn

typedef vl_simd_vec4_f32(* vl_simd_sub_vec4f32_fn) (vl_simd_vec4_f32, vl_simd_vec4_f32)

◆ vl_simd_sub_vec8f32_fn

typedef vl_simd_vec8_f32(* vl_simd_sub_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)

◆ vl_simd_xor_vec8f32_fn

typedef vl_simd_vec8_f32(* vl_simd_xor_vec8f32_fn) (vl_simd_vec8_f32, vl_simd_vec8_f32)

Function Documentation

◆ VL_ALIGN_HINT() [1/2]

struct VL_ALIGN_HINT ( 16  )

4-element 32-bit float vector.

16-element 8-bit unsigned integer vector.

4-element 32-bit signed integer vector.

Represents a 128-bit SIMD vector on most architectures. Stored as a simple component array to enable transparent backend selection.

Alignment: 16 bytes (cache line friendly on modern CPUs).

See also
vlSIMDLoadVec4F32, vlSIMDStoreVec4F32

Used for integer math, bit manipulation, and fixed-point operations. Alignment: 16 bytes.

Behavior Note: Integer multiply (vlSIMDMulVec4I32) may be scalar on SSE2.

See also
vlSIMDLoadVec4I32, vlSIMDStoreVec4I32, vlSIMDMulVec4I32

Alignment: 16 bytes.

◆ VL_ALIGN_HINT() [2/2]

struct VL_ALIGN_HINT ( 32  )

8-element 32-bit float vector.

32-element 8-bit unsigned integer vector.

8-element 16-bit signed integer vector.

8-element 32-bit signed integer vector.

Represents a 256-bit SIMD vector (or two 128-bit vectors on ARM). Alignment: 32 bytes (AVX-friendly).

See also
vlSIMDLoadVec8F32, vlSIMDStoreVec8F32

Alignment: 32 bytes.

Used for compact integer storage (e.g., normals, audio samples). Alignment: 32 bytes (to support wider SIMD where applicable).

Used for bulk byte operations (e.g., image processing). Alignment: 32 bytes (AVX-friendly).

◆ vlSIMDInit()

VL_API const char * vlSIMDInit ( void  )

Initializes the SIMD subsystem and selects the best available backend.

Must be called once at application startup, before any SIMD operations. Thread-safe; subsequent calls return immediately and are safe from any thread.

Backend Selection Algorithm

  1. x86/x86-64: Check for AVX2 → SSE2 → fallback to Portable C
  2. ARM64: Use NEON64 (guaranteed available) → fallback to Portable C
  3. ARM32: Use NEON (if available) → fallback to Portable C
  4. Other: Use Portable C

CPU capability detection uses:

  • CPUID instruction on x86 (via MSVC intrinsics or GCC/Clang builtins)
  • Compile-time guarantees (ARM with -mfpu=neon)

Example

int main() {
const char* backend = vlSIMDInit();
printf("SIMD backend: %s\n", backend);
// SIMD operations now use best available backend
vl_simd_vec4_f32 v = vlSIMDLoadVec4F32(data);
// ...
return 0;
}
Returns
Pointer to a static string naming the selected backend. Examples: "SSE2", "AVX2", "NEON64", "NEON (ARMv7)", "Portable C". Pointer is valid for the lifetime of the program.
Note
Safe to call from any thread. Repeated calls are safe and return immediately on subsequent invocations.
See also
vlSIMDFunctions, vl_simd_functions_t
+ Here is the call graph for this function:

Variable Documentation

◆ vl_simd_vec16_u8

vl_simd_vec16_u8

◆ vl_simd_vec32_u8

vl_simd_vec32_u8

◆ vl_simd_vec4_f32

vl_simd_vec4_f32

◆ vl_simd_vec4_i32

vl_simd_vec4_i32

◆ vl_simd_vec8_f32

vl_simd_vec8_f32

◆ vl_simd_vec8_i16

vl_simd_vec8_i16

◆ vl_simd_vec8_i32

vl_simd_vec8_i32

◆ vlSIMDFunctions

VL_API vl_simd_functions_t vlSIMDFunctions
extern

Global SIMD function table.

Set by vlSIMDInit(). Thread-safe to read after initialization. Do not modify after initialization.

See also
vlSIMDInit, vl_simd_functions_t