Veritable Lasagna
An Allocator & Data Structure Library for C.
Loading...
Searching...
No Matches
vl_simd.h
Go to the documentation of this file.
14#ifndef VL_SIMD_H
15#define VL_SIMD_H
16
202#include <vl/vl_memory.h>
203#include <vl/vl_numtypes.h>
204
205/* ============================================================================
206 * Vector Type Definitions
207 * ============================================================================
208 */
209
/**
 * \brief Four-lane vector of vl_float32_t components.
 *
 * Declared with VL_ALIGN_HINT(16), the project's structure alignment hint.
 */
typedef struct VL_ALIGN_HINT(16) vl_simd_vec4_f32_
{
    vl_float32_t components[4];
} vl_simd_vec4_f32;
224
/**
 * \brief Eight-lane vector of vl_float32_t components.
 *
 * Declared with VL_ALIGN_HINT(32), the project's structure alignment hint.
 */
typedef struct VL_ALIGN_HINT(32) vl_simd_vec8_f32_
{
    vl_float32_t components[8];
} vl_simd_vec8_f32;
237
/**
 * \brief Four-lane vector of vl_int32_t components.
 *
 * Declared with VL_ALIGN_HINT(16), the project's structure alignment hint.
 */
typedef struct VL_ALIGN_HINT(16) vl_simd_vec4_i32_
{
    vl_int32_t components[4];
} vl_simd_vec4_i32;
252
/**
 * \brief Eight-lane vector of vl_int32_t components.
 *
 * Declared with VL_ALIGN_HINT(32), the project's structure alignment hint.
 */
typedef struct VL_ALIGN_HINT(32) vl_simd_vec8_i32_
{
    vl_int32_t components[8];
} vl_simd_vec8_i32;
262
/* MSVC warning 4324 is silenced only for the duration of this definition
 * (push/pop keeps the suppression from leaking to includers).
 * NOTE(review): 4324 is MSVC's "structure was padded due to alignment
 * specifier"; the payload is 8 x vl_int16_t under a 32-byte hint, so padding
 * looks expected -- confirm the 32-byte hint is intentional. */
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4324)
#endif
/**
 * \brief Eight-lane vector of vl_int16_t components.
 *
 * Declared with VL_ALIGN_HINT(32), the project's structure alignment hint.
 */
typedef struct VL_ALIGN_HINT(32) vl_simd_vec8_i16_
{
    vl_int16_t components[8];
} vl_simd_vec8_i16;
#ifdef _MSC_VER
#pragma warning(pop)
#endif
280
/**
 * \brief Sixteen-lane vector of vl_uint8_t components.
 *
 * Declared with VL_ALIGN_HINT(16), the project's structure alignment hint.
 */
typedef struct VL_ALIGN_HINT(16) vl_simd_vec16_u8_
{
    vl_uint8_t components[16];
} vl_simd_vec16_u8;
290
/**
 * \brief Thirty-two-lane vector of vl_uint8_t components.
 *
 * Declared with VL_ALIGN_HINT(32), the project's structure alignment hint.
 */
typedef struct VL_ALIGN_HINT(32) vl_simd_vec32_u8_
{
    vl_uint8_t components[32];
} vl_simd_vec32_u8;
301
302/* ============================================================================
303 * Function Pointer Types for Runtime Dispatch
304 *
305 * Each operation is defined as a function pointer type. The global
306 * vlSIMDFunctions table is populated with concrete implementations at init
307 * time.
308 * ============================================================================
309 */
310
342typedef vl_simd_vec4_i32 (*vl_simd_load_vec4i32_fn)(const vl_int32_t*);
343typedef void (*vl_simd_store_vec4i32_fn)(vl_int32_t*, vl_simd_vec4_i32);
346typedef vl_simd_vec8_i16 (*vl_simd_load_vec8i16_fn)(const vl_int16_t*);
347typedef void (*vl_simd_store_vec8i16_fn)(vl_int16_t*, vl_simd_vec8_i16);
349typedef vl_simd_vec32_u8 (*vl_simd_load_vec32u8_fn)(const vl_uint8_t*);
350typedef void (*vl_simd_store_vec32u8_fn)(vl_uint8_t*, vl_simd_vec32_u8);
351
352/* ============================================================================
353 * Global Function Pointer Table
354 *
355 * This table is set once by vlSIMDInit() based on detected CPU capabilities.
356 * All threads read from it safely after initialization.
357 * ============================================================================
358 */
359
380typedef struct
381{
426
429 const char* backend_name;
431
441
442/* ============================================================================
443 * Initialization
444 * ============================================================================
445 */
446
489VL_API const char* vlSIMDInit(void);
490
491/* ============================================================================
492 * Inline API (User-Facing)
493 *
494 * All operations are exposed as inline functions that dereference
495 * vlSIMDFunctions. This provides a transparent, backend-agnostic interface.
496 * ============================================================================
497 */
498
499/* --- 4-Wide Float (F32) Load/Store --- */
500
510static inline vl_simd_vec4_f32 vlSIMDLoadVec4F32(const vl_float32_t* ptr) { return vlSIMDFunctions.load_vec4f32(ptr); }
511
520static inline void vlSIMDStoreVec4F32(vl_float32_t* ptr, vl_simd_vec4_f32 v) { vlSIMDFunctions.store_vec4f32(ptr, v); }
521
530static inline vl_simd_vec4_f32 vlSIMDSplatVec4F32(vl_float32_t scalar) { return vlSIMDFunctions.splat_vec4f32(scalar); }
531
532/* --- 4-Wide Float (F32) Arithmetic --- */
533
543static inline vl_simd_vec4_f32 vlSIMDAddVec4F32(vl_simd_vec4_f32 a, vl_simd_vec4_f32 b)
544{
545 return vlSIMDFunctions.add_vec4f32(a, b);
546}
547
555static inline vl_simd_vec4_f32 vlSIMDSubVec4F32(vl_simd_vec4_f32 a, vl_simd_vec4_f32 b)
556{
557 return vlSIMDFunctions.sub_vec4f32(a, b);
558}
559
567static inline vl_simd_vec4_f32 vlSIMDMulVec4F32(vl_simd_vec4_f32 a, vl_simd_vec4_f32 b)
568{
569 return vlSIMDFunctions.mul_vec4f32(a, b);
570}
571
584static inline vl_simd_vec4_f32 vlSIMDDivVec4F32(vl_simd_vec4_f32 a, vl_simd_vec4_f32 b)
585{
586 return vlSIMDFunctions.div_vec4f32(a, b);
587}
588
604static inline vl_simd_vec4_f32 vlSIMDFmaVec4F32(vl_simd_vec4_f32 a, vl_simd_vec4_f32 b, vl_simd_vec4_f32 c)
605{
606 return vlSIMDFunctions.fma_vec4f32(a, b, c);
607}
608
609/* --- 4-Wide Float (F32) Reductions --- */
610
621static inline vl_float32_t vlSIMDHsumVec4F32(vl_simd_vec4_f32 v) { return vlSIMDFunctions.hsum_vec4f32(v); }
622
629static inline vl_float32_t vlSIMDHmaxVec4F32(vl_simd_vec4_f32 v) { return vlSIMDFunctions.hmax_vec4f32(v); }
630
637static inline vl_float32_t vlSIMDHminVec4F32(vl_simd_vec4_f32 v) { return vlSIMDFunctions.hmin_vec4f32(v); }
638
645static inline vl_float32_t vlSIMDHprodVec4F32(vl_simd_vec4_f32 v) { return vlSIMDFunctions.hprod_vec4f32(v); }
646
647/* --- 4-Wide Float (F32) Comparisons --- */
648
667static inline vl_simd_vec4_f32 vlSIMDLtVec4F32(vl_simd_vec4_f32 a, vl_simd_vec4_f32 b)
668{
669 return vlSIMDFunctions.lt_vec4f32(a, b);
670}
671
679static inline vl_simd_vec4_f32 vlSIMDGtVec4F32(vl_simd_vec4_f32 a, vl_simd_vec4_f32 b)
680{
681 return vlSIMDFunctions.gt_vec4f32(a, b);
682}
683
694static inline vl_simd_vec4_f32 vlSIMDEqVec4F32(vl_simd_vec4_f32 a, vl_simd_vec4_f32 b)
695{
696 return vlSIMDFunctions.eq_vec4f32(a, b);
697}
698
699/* --- 4-Wide Float (F32) Bitwise Operations --- */
700
712static inline vl_simd_vec4_f32 vlSIMDAndVec4F32(vl_simd_vec4_f32 a, vl_simd_vec4_f32 b)
713{
714 return vlSIMDFunctions.and_vec4f32(a, b);
715}
716
724static inline vl_simd_vec4_f32 vlSIMDOrVec4F32(vl_simd_vec4_f32 a, vl_simd_vec4_f32 b)
725{
726 return vlSIMDFunctions.or_vec4f32(a, b);
727}
728
736static inline vl_simd_vec4_f32 vlSIMDXorVec4F32(vl_simd_vec4_f32 a, vl_simd_vec4_f32 b)
737{
738 return vlSIMDFunctions.xor_vec4f32(a, b);
739}
740
747static inline vl_simd_vec4_f32 vlSIMDNotVec4F32(vl_simd_vec4_f32 a) { return vlSIMDFunctions.not_vec4f32(a); }
748
749/* --- 4-Wide Float (F32) Lane Operations --- */
750
760static inline vl_float32_t vlSIMDExtractLaneVec4F32(vl_simd_vec4_f32 v, int lane)
761{
763}
764
774static inline vl_simd_vec4_f32 vlSIMDBroadcastLaneVec4F32(vl_simd_vec4_f32 v, int lane)
775{
777}
778
779/* --- 8-Wide Float (F32) Load/Store --- */
780
791static inline vl_simd_vec8_f32 vlSIMDLoadVec8F32(const vl_float32_t* ptr) { return vlSIMDFunctions.load_vec8f32(ptr); }
792
799static inline void vlSIMDStoreVec8F32(vl_float32_t* ptr, vl_simd_vec8_f32 v) { vlSIMDFunctions.store_vec8f32(ptr, v); }
800
801/* --- 8-Wide Float (F32) Arithmetic --- */
802
810static inline vl_simd_vec8_f32 vlSIMDAddVec8F32(vl_simd_vec8_f32 a, vl_simd_vec8_f32 b)
811{
812 return vlSIMDFunctions.add_vec8f32(a, b);
813}
814
822static inline vl_simd_vec8_f32 vlSIMDMulVec8F32(vl_simd_vec8_f32 a, vl_simd_vec8_f32 b)
823{
824 return vlSIMDFunctions.mul_vec8f32(a, b);
825}
826
835static inline vl_simd_vec8_f32 vlSIMDFmaVec8F32(vl_simd_vec8_f32 a, vl_simd_vec8_f32 b, vl_simd_vec8_f32 c)
836{
837 return vlSIMDFunctions.fma_vec8f32(a, b, c);
838}
839
840/* --- Integer Operations --- */
841
852static inline vl_simd_vec4_i32 vlSIMDLoadVec4I32(const vl_int32_t* ptr) { return vlSIMDFunctions.load_vec4i32(ptr); }
853
860static inline void vlSIMDStoreVec4I32(vl_int32_t* ptr, vl_simd_vec4_i32 v) { vlSIMDFunctions.store_vec4i32(ptr, v); }
861
869static inline vl_simd_vec4_i32 vlSIMDAddVec4I32(vl_simd_vec4_i32 a, vl_simd_vec4_i32 b)
870{
871 return vlSIMDFunctions.add_vec4i32(a, b);
872}
873
884static inline vl_simd_vec4_i32 vlSIMDMulVec4I32(vl_simd_vec4_i32 a, vl_simd_vec4_i32 b)
885{
886 return vlSIMDFunctions.mul_vec4i32(a, b);
887}
888
897static inline vl_simd_vec8_i16 vlSIMDLoadVec8I16(const vl_int16_t* ptr) { return vlSIMDFunctions.load_vec8i16(ptr); }
898
905static inline void vlSIMDStoreVec8I16(vl_int16_t* ptr, vl_simd_vec8_i16 v) { vlSIMDFunctions.store_vec8i16(ptr, v); }
906
914static inline vl_simd_vec8_i16 vlSIMDAddVec8I16(vl_simd_vec8_i16 a, vl_simd_vec8_i16 b)
915{
916 return vlSIMDFunctions.add_vec8i16(a, b);
917}
918
927static inline vl_simd_vec32_u8 vlSIMDLoadVec32U8(const vl_uint8_t* ptr) { return vlSIMDFunctions.load_vec32u8(ptr); }
928
935static inline void vlSIMDStoreVec32U8(vl_uint8_t* ptr, vl_simd_vec32_u8 v) { vlSIMDFunctions.store_vec32u8(ptr, v); }
936
943static inline vl_simd_vec8_f32 vlSIMDSplatVec8F32(vl_float32_t scalar) { return vlSIMDFunctions.splat_vec8f32(scalar); }
944
952static inline vl_simd_vec8_f32 vlSIMDSubVec8F32(vl_simd_vec8_f32 a, vl_simd_vec8_f32 b)
953{
954 return vlSIMDFunctions.sub_vec8f32(a, b);
955}
956
957/* --- 8-Wide Float (F32) Comparisons --- */
958
966static inline vl_simd_vec8_f32 vlSIMDLtVec8F32(vl_simd_vec8_f32 a, vl_simd_vec8_f32 b)
967{
968 return vlSIMDFunctions.lt_vec8f32(a, b);
969}
970
978static inline vl_simd_vec8_f32 vlSIMDGtVec8F32(vl_simd_vec8_f32 a, vl_simd_vec8_f32 b)
979{
980 return vlSIMDFunctions.gt_vec8f32(a, b);
981}
982
990static inline vl_simd_vec8_f32 vlSIMDEqVec8F32(vl_simd_vec8_f32 a, vl_simd_vec8_f32 b)
991{
992 return vlSIMDFunctions.eq_vec8f32(a, b);
993}
994
995/* --- 8-Wide Float (F32) Bitwise Operations --- */
996
1004static inline vl_simd_vec8_f32 vlSIMDAndVec8F32(vl_simd_vec8_f32 a, vl_simd_vec8_f32 b)
1005{
1006 return vlSIMDFunctions.and_vec8f32(a, b);
1007}
1008
1016static inline vl_simd_vec8_f32 vlSIMDOrVec8F32(vl_simd_vec8_f32 a, vl_simd_vec8_f32 b)
1017{
1018 return vlSIMDFunctions.or_vec8f32(a, b);
1019}
1020
1028static inline vl_simd_vec8_f32 vlSIMDXorVec8F32(vl_simd_vec8_f32 a, vl_simd_vec8_f32 b)
1029{
1030 return vlSIMDFunctions.xor_vec8f32(a, b);
1031}
1032
1039static inline vl_simd_vec8_f32 vlSIMDNotVec8F32(vl_simd_vec8_f32 a) { return vlSIMDFunctions.not_vec8f32(a); }
1040
1041#endif
#define VL_ALIGN_HINT(x)
Structure alignment hint.
Definition vl_memory.h:90
VL_F32_T vl_float32_t
32-bit floating point number type.
Definition vl_numtypes.h:173
vl_simd_hmin_vec4f32_fn hmin_vec4f32
Definition vl_simd.h:413
vl_float32_t(* vl_simd_hprod_vec4f32_fn)(vl_simd_vec4_f32)
Definition vl_simd.h:339
vl_simd_vec4_f32(* vl_simd_broadcast_lane_vec4f32_fn)(vl_simd_vec4_f32, int)
Definition vl_simd.h:341
vl_simd_bitwise_vec4f32_fn or_vec4f32
Definition vl_simd.h:409
vl_simd_lt_vec8f32_fn lt_vec8f32
Definition vl_simd.h:398
vl_simd_vec8_i32
Definition vl_simd.h:261
vl_simd_or_vec8f32_fn or_vec8f32
Definition vl_simd.h:402
vl_simd_hprod_vec4f32_fn hprod_vec4f32
Definition vl_simd.h:414
vl_simd_cmp_vec4f32_fn eq_vec4f32
Definition vl_simd.h:407
vl_simd_vec4_f32(* vl_simd_add_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32)
Definition vl_simd.h:314
vl_simd_bitwise_vec4f32_fn and_vec4f32
Definition vl_simd.h:408
vl_simd_vec4_f32(* vl_simd_div_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32)
Definition vl_simd.h:317
vl_simd_not_vec4f32_fn not_vec4f32
Definition vl_simd.h:411
vl_simd_vec32_u8(* vl_simd_load_vec32u8_fn)(const vl_uint8_t *)
Definition vl_simd.h:349
vl_simd_store_vec4i32_fn store_vec4i32
Definition vl_simd.h:418
vl_simd_vec8_f32(* vl_simd_splat_vec8f32_fn)(vl_float32_t)
Definition vl_simd.h:325
void(* vl_simd_store_vec4i32_fn)(vl_int32_t *, vl_simd_vec4_i32)
Definition vl_simd.h:343
vl_simd_fma_vec8f32_fn fma_vec8f32
Definition vl_simd.h:395
vl_simd_eq_vec8f32_fn eq_vec8f32
Definition vl_simd.h:400
vl_simd_vec4_f32(* vl_simd_not_vec4f32_fn)(vl_simd_vec4_f32)
Definition vl_simd.h:336
void(* vl_simd_store_vec4f32_fn)(vl_float32_t *, vl_simd_vec4_f32)
Definition vl_simd.h:312
vl_simd_store_vec8i16_fn store_vec8i16
Definition vl_simd.h:422
vl_simd_vec4_f32(* vl_simd_cmp_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32)
Definition vl_simd.h:334
vl_simd_vec4_f32(* vl_simd_load_vec4f32_fn)(const vl_float32_t *)
Definition vl_simd.h:311
vl_simd_vec4_i32(* vl_simd_mul_vec4i32_fn)(vl_simd_vec4_i32, vl_simd_vec4_i32)
Definition vl_simd.h:345
vl_simd_vec4_f32(* vl_simd_fma_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32, vl_simd_vec4_f32)
Definition vl_simd.h:318
void(* vl_simd_store_vec8f32_fn)(vl_float32_t *, vl_simd_vec8_f32)
Definition vl_simd.h:321
vl_float32_t(* vl_simd_hsum_vec4f32_fn)(vl_simd_vec4_f32)
Definition vl_simd.h:319
vl_simd_load_vec4f32_fn load_vec4f32
Definition vl_simd.h:382
vl_simd_sub_vec8f32_fn sub_vec8f32
Definition vl_simd.h:397
vl_float32_t(* vl_simd_hmax_vec4f32_fn)(vl_simd_vec4_f32)
Definition vl_simd.h:337
vl_float32_t(* vl_simd_hmin_vec4f32_fn)(vl_simd_vec4_f32)
Definition vl_simd.h:338
vl_simd_load_vec8i16_fn load_vec8i16
Definition vl_simd.h:421
vl_simd_vec8_f32(* vl_simd_add_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32)
Definition vl_simd.h:322
vl_simd_load_vec32u8_fn load_vec32u8
Definition vl_simd.h:424
void(* vl_simd_store_vec8i16_fn)(vl_int16_t *, vl_simd_vec8_i16)
Definition vl_simd.h:347
vl_simd_vec8_i16(* vl_simd_load_vec8i16_fn)(const vl_int16_t *)
Definition vl_simd.h:346
vl_simd_vec4_i32(* vl_simd_load_vec4i32_fn)(const vl_int32_t *)
Definition vl_simd.h:342
const char * backend_name
Backend name string for logging/debugging (e.g., "AVX2", "NEON64").
Definition vl_simd.h:429
vl_simd_vec8_f32(* vl_simd_gt_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32)
Definition vl_simd.h:328
vl_simd_vec8_f32(* vl_simd_sub_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32)
Definition vl_simd.h:326
vl_simd_broadcast_lane_vec4f32_fn broadcast_lane_vec4f32
Definition vl_simd.h:416
vl_simd_cmp_vec4f32_fn gt_vec4f32
Definition vl_simd.h:406
vl_simd_mul_vec8f32_fn mul_vec8f32
Definition vl_simd.h:394
vl_simd_store_vec8f32_fn store_vec8f32
Definition vl_simd.h:392
void(* vl_simd_store_vec32u8_fn)(vl_uint8_t *, vl_simd_vec32_u8)
Definition vl_simd.h:350
vl_simd_vec8_f32(* vl_simd_load_vec8f32_fn)(const vl_float32_t *)
Definition vl_simd.h:320
vl_simd_store_vec4f32_fn store_vec4f32
Definition vl_simd.h:383
vl_simd_vec8_f32(* vl_simd_xor_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32)
Definition vl_simd.h:332
vl_simd_add_vec8f32_fn add_vec8f32
Definition vl_simd.h:393
vl_simd_vec8_f32(* vl_simd_or_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32)
Definition vl_simd.h:331
vl_simd_vec4_f32
Definition vl_simd.h:223
vl_simd_and_vec8f32_fn and_vec8f32
Definition vl_simd.h:401
vl_simd_splat_vec4f32_fn splat_vec4f32
Definition vl_simd.h:384
vl_simd_load_vec8f32_fn load_vec8f32
Definition vl_simd.h:391
vl_simd_vec4_f32(* vl_simd_sub_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32)
Definition vl_simd.h:315
vl_simd_add_vec4i32_fn add_vec4i32
Definition vl_simd.h:419
vl_simd_load_vec4i32_fn load_vec4i32
Definition vl_simd.h:417
vl_float32_t(* vl_simd_extract_lane_vec4f32_fn)(vl_simd_vec4_f32, int)
Definition vl_simd.h:340
vl_simd_sub_vec4f32_fn sub_vec4f32
Definition vl_simd.h:386
vl_simd_hmax_vec4f32_fn hmax_vec4f32
Definition vl_simd.h:412
vl_simd_add_vec4f32_fn add_vec4f32
Definition vl_simd.h:385
vl_simd_vec8_i16(* vl_simd_add_vec8i16_fn)(vl_simd_vec8_i16, vl_simd_vec8_i16)
Definition vl_simd.h:348
vl_simd_fma_vec4f32_fn fma_vec4f32
Definition vl_simd.h:389
vl_simd_div_vec4f32_fn div_vec4f32
Definition vl_simd.h:388
vl_simd_vec4_i32
Definition vl_simd.h:251
vl_simd_bitwise_vec4f32_fn xor_vec4f32
Definition vl_simd.h:410
vl_simd_xor_vec8f32_fn xor_vec8f32
Definition vl_simd.h:403
vl_simd_vec16_u8
Definition vl_simd.h:289
vl_simd_vec8_f32(* vl_simd_not_vec8f32_fn)(vl_simd_vec8_f32)
Definition vl_simd.h:333
vl_simd_store_vec32u8_fn store_vec32u8
Definition vl_simd.h:425
vl_simd_vec8_i16
Definition vl_simd.h:276
vl_simd_vec8_f32(* vl_simd_mul_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32)
Definition vl_simd.h:323
vl_simd_vec32_u8
Definition vl_simd.h:300
vl_simd_add_vec8i16_fn add_vec8i16
Definition vl_simd.h:423
vl_simd_mul_vec4i32_fn mul_vec4i32
Definition vl_simd.h:420
VL_API const char * vlSIMDInit(void)
Initializes the SIMD subsystem and selects the best available backend.
Definition vl_simd.c:135
vl_simd_splat_vec8f32_fn splat_vec8f32
Definition vl_simd.h:396
vl_simd_hsum_vec4f32_fn hsum_vec4f32
Definition vl_simd.h:390
VL_API vl_simd_functions_t vlSIMDFunctions
Global SIMD function table.
Definition vl_simd.c:34
vl_simd_vec4_i32(* vl_simd_add_vec4i32_fn)(vl_simd_vec4_i32, vl_simd_vec4_i32)
Definition vl_simd.h:344
vl_simd_vec8_f32(* vl_simd_eq_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32)
Definition vl_simd.h:329
vl_simd_vec4_f32(* vl_simd_mul_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32)
Definition vl_simd.h:316
vl_simd_vec4_f32(* vl_simd_splat_vec4f32_fn)(vl_float32_t)
Definition vl_simd.h:313
vl_simd_vec8_f32
Definition vl_simd.h:236
vl_simd_vec8_f32(* vl_simd_lt_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32)
Definition vl_simd.h:327
vl_simd_vec8_f32(* vl_simd_fma_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32, vl_simd_vec8_f32)
Definition vl_simd.h:324
vl_simd_mul_vec4f32_fn mul_vec4f32
Definition vl_simd.h:387
vl_simd_vec8_f32(* vl_simd_and_vec8f32_fn)(vl_simd_vec8_f32, vl_simd_vec8_f32)
Definition vl_simd.h:330
vl_simd_gt_vec8f32_fn gt_vec8f32
Definition vl_simd.h:399
vl_simd_extract_lane_vec4f32_fn extract_lane_vec4f32
Definition vl_simd.h:415
vl_simd_cmp_vec4f32_fn lt_vec4f32
Definition vl_simd.h:405
vl_simd_not_vec8f32_fn not_vec8f32
Definition vl_simd.h:404
vl_simd_vec4_f32(* vl_simd_bitwise_vec4f32_fn)(vl_simd_vec4_f32, vl_simd_vec4_f32)
Definition vl_simd.h:335
Master SIMD function dispatch table.
Definition vl_simd.h:381