|
typedef vl_simd_vec4_f32 (*vl_simd_load_vec4f32_fn)(const vl_float32_t *);
typedef void (*vl_simd_store_vec4f32_fn)(vl_float32_t *, vl_simd_vec4_f32);
typedef vl_simd_vec4_f32 (*vl_simd_splat_vec4f32_fn)(vl_float32_t);
typedef vl_simd_vec4_f32 (*vl_simd_add_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32);
typedef vl_simd_vec4_f32 (*vl_simd_sub_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32);
typedef vl_simd_vec4_f32 (*vl_simd_mul_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32);
typedef vl_simd_vec4_f32 (*vl_simd_div_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32);
typedef vl_simd_vec4_f32 (*vl_simd_fma_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32, vl_simd_vec4_f32);
typedef vl_float32_t (*vl_simd_hsum_vec4f32_fn)(vl_simd_vec4_f32);
typedef vl_simd_vec8_f32 (*vl_simd_load_vec8f32_fn)(const vl_float32_t *);
typedef void (*vl_simd_store_vec8f32_fn)(vl_float32_t *, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_add_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_mul_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_fma_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_splat_vec8f32_fn)(vl_float32_t);
typedef vl_simd_vec8_f32 (*vl_simd_sub_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_lt_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_gt_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_eq_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_and_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_or_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_xor_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32);
typedef vl_simd_vec8_f32 (*vl_simd_not_vec8f32_fn)(vl_simd_vec8_f32);
typedef vl_simd_vec4_f32 (*vl_simd_cmp_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32);
typedef vl_simd_vec4_f32 (*vl_simd_bitwise_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32);
typedef vl_simd_vec4_f32 (*vl_simd_not_vec4f32_fn)(vl_simd_vec4_f32);
typedef vl_float32_t (*vl_simd_hmax_vec4f32_fn)(vl_simd_vec4_f32);
typedef vl_float32_t (*vl_simd_hmin_vec4f32_fn)(vl_simd_vec4_f32);
typedef vl_float32_t (*vl_simd_hprod_vec4f32_fn)(vl_simd_vec4_f32);
typedef vl_float32_t (*vl_simd_extract_lane_vec4f32_fn)(vl_simd_vec4_f32, int);
typedef vl_simd_vec4_f32 (*vl_simd_broadcast_lane_vec4f32_fn)(vl_simd_vec4_f32, int);
typedef vl_simd_vec4_i32 (*vl_simd_load_vec4i32_fn)(const vl_int32_t *);
typedef void (*vl_simd_store_vec4i32_fn)(vl_int32_t *, vl_simd_vec4_i32);
typedef vl_simd_vec4_i32 (*vl_simd_add_vec4i32_fn)(vl_simd_vec4_i32, vl_simd_vec4_i32);
typedef vl_simd_vec4_i32 (*vl_simd_mul_vec4i32_fn)(vl_simd_vec4_i32, vl_simd_vec4_i32);
typedef vl_simd_vec8_i16 (*vl_simd_load_vec8i16_fn)(const vl_int16_t *);
typedef void (*vl_simd_store_vec8i16_fn)(vl_int16_t *, vl_simd_vec8_i16);
typedef vl_simd_vec8_i16 (*vl_simd_add_vec8i16_fn)(vl_simd_vec8_i16, vl_simd_vec8_i16);
typedef vl_simd_vec32_u8 (*vl_simd_load_vec32u8_fn)(const vl_uint8_t *);
typedef void (*vl_simd_store_vec32u8_fn)(vl_uint8_t *, vl_simd_vec32_u8);
Transparent runtime-selected SIMD abstraction layer.
Veritable Lasagna — a data structures and algorithms library for C11.
Copyright 2026 Jesse Walker, released under the MIT license. Git Repository: https://github.com/walkerje/veritable_lasagna
Provides a unified, architecture-agnostic interface to SIMD operations with automatic CPU capability detection and backend selection at initialization time.
Overview
This module abstracts away platform-specific SIMD intrinsics (SSE2, AVX2, NEON, etc.) behind a portable C API. The best available implementation is selected once at application startup via vlSIMDInit(), with graceful fallback to portable C implementations on all platforms.
Key design principles:
- Zero runtime cost after init: All backend selection happens once; after that, function pointers are just table lookups.
- No global state pollution: Selection is stored locally in vlSIMDFunctions.
- Transparent API: Users call simple macros; backend is invisible.
- Portable C always available: The abstraction itself is proven correct before optimizations layer on top.
Supported Architectures & Backends
x86 / x86-64
- Portable C: Always available, unoptimized scalar fallback.
- SSE2: Available on all x86-64 CPUs and most modern x86. Provides 128-bit operations on 4 float or 4 int32. Integer 32-bit multiply falls back to scalar.
- AVX2: Intel Haswell (2013+), AMD Excavator (2015+). Extends SSE with 256-bit operations (8 float or 8 int32), true FMA, and better integer operations.
ARM / ARM64
- NEON (ARMv7): 128-bit operations on 4 float or mixed-width integers. Division uses reciprocal approximation with one Newton-Raphson iteration (~11-12 bits accuracy).
- NEON64 (ARMv8+): Enhanced NEON with true FMA and improved precision. Handles 8-wide operations via two 128-bit registers.
Selection priority (checked in order):
- x86: AVX2 > SSE2 > Portable C
- ARM64: NEON64 > Portable C
- ARM32: NEON > Portable C
- Other: Portable C
Usage Pattern
int main() {
    const char *backend = vlSIMDInit();  /* select the best available backend once */
    printf("Using SIMD backend: %s\n", backend);
    vlSIMDStoreVec4F32(output_ptr, result);
    float sum = vlSIMDHsumVec4F32(result);
    float max_val = vlSIMDHmaxVec4F32(result);
    return 0;
}
const char * vlSIMDInit(void)
Initializes the SIMD subsystem and selects the best available backend.
Definition vl_simd.c:135
Transparent runtime-selected SIMD abstraction layer.
vl_simd_vec4_f32
Definition vl_simd.h:223
Vector Types
All vector types are struct-based with element arrays, enabling transparent interchange between backends. Alignment hints are provided for cache efficiency.
- 4-wide float (F32): 4 × 32-bit floats, 16-byte aligned
- 8-wide float (F32): 8 × 32-bit floats, 32-byte aligned (for AVX)
- 4-wide int32 (I32): 4 × 32-bit signed integers, 16-byte aligned
- 8-wide int16 (I16): 8 × 16-bit signed integers, 32-byte aligned
- 32-wide uint8 (U8): 32 × 8-bit unsigned integers, 32-byte aligned
Operation Categories
Arithmetic (4-wide F32)
- Basic: vlSIMDAddVec4F32, vlSIMDSubVec4F32, vlSIMDMulVec4F32, vlSIMDDivVec4F32
- Advanced: vlSIMDFmaVec4F32 (fused multiply-add, hardware native when available)
Arithmetic (8-wide F32)
- Basic: vlSIMDAddVec8F32, vlSIMDMulVec8F32
- Advanced: vlSIMDFmaVec8F32
Horizontal Reductions (4-wide F32)
Reduce a vector to a single scalar by combining all lanes:
- vlSIMDHsumVec4F32: Sum all 4 elements
- vlSIMDHmaxVec4F32: Maximum element
- vlSIMDHminVec4F32: Minimum element
- vlSIMDHprodVec4F32: Product of all elements
Comparisons (4-wide F32)
Return element-wise masks (0xFFFFFFFF for true, 0x00000000 for false):
- vlSIMDLtVec4F32: Less-than
- vlSIMDGtVec4F32: Greater-than
- vlSIMDEqVec4F32: Equality
Bitwise Operations (4-wide F32)
Treat float bits as integers:
- vlSIMDAndVec4F32, vlSIMDOrVec4F32, vlSIMDXorVec4F32, vlSIMDNotVec4F32
Lane Operations (4-wide F32)
- vlSIMDExtractLaneVec4F32: Extract single lane to scalar
- vlSIMDBroadcastLaneVec4F32: Replicate single lane to all lanes
Integer Operations
- I32: Load, store, add, multiply (4-wide)
- I16: Load, store, add (8-wide)
- U8: Load, store (32-wide)
Important Notes on Precision & Behavior
Division on NEON (ARMv7/ARMv8)
NEON does not have native division. The implementation uses reciprocal approximation with Newton-Raphson refinement:
recip = vrecpeq_f32(b)
recip = recip * vrecpsq_f32(b, recip)
result = a * recip
This achieves ~11-12 bits of accuracy, sufficient for graphics but not for high-precision numerical work. Use portable C or compute higher-precision divisions on CPU if needed.
Integer Multiply on SSE2
SSE2 lacks 32-bit integer multiply, so vlSIMDMulVec4I32 falls back to scalar operations on this backend.
Comparison Results as Float Bits
Comparison operations (lt, gt, eq) return masks stored as float bit patterns:
- True: 0xFFFFFFFF (all bits set)
- False: 0x00000000
These can be used in bitwise operations or with FMA for blending.
Memory Alignment
Load/store operations use unaligned variants (_mm_loadu_ps, _mm256_loadu_ps, etc.) to accept arbitrary pointers. If data is guaranteed aligned, consider manual optimization to aligned variants for performance.
Thread Safety
Initialization: vlSIMDInit() is thread-safe. Subsequent calls return immediately and are safe to call from multiple threads.
Runtime use: vlSIMDFunctions is read-only after initialization. All threads can safely call SIMD operations without synchronization.
Performance Considerations
Load/Store Overhead
The struct-based design requires loading vectors from component arrays and storing back after operations. Modern compilers optimize these to single instructions when possible, but be aware of this pattern in hot loops.
Macro API
All operations are exposed as macros (e.g., vlSIMDAddVec4F32) that defer to function pointers. This incurs one indirect call per operation. For very tight inner loops, consider caching frequently used functions:
vl_simd_add_vec4f32_fn add_fn = vlSIMDFunctions.add_vec4f32;
for (...) {
    result = add_fn(a, b);
}
vl_simd_functions_t vlSIMDFunctions
Global SIMD function table.
Definition vl_simd.c:34
vl_simd_vec4_f32(* vl_simd_add_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32)
Definition vl_simd.h:314
vl_simd_add_vec4f32_fn add_vec4f32
Definition vl_simd.h:385
8-Wide Operations
8-wide operations on NEON/SSE2 are synthesized from two 128-bit registers.
See also:
- vlSIMDInit
- vl_simd_functions_t