Veritable Lasagna
An Allocator & Data Structure Library for C.
Loading...
Searching...
No Matches
vl_half.h
Go to the documentation of this file.
1
14#ifndef VL_HALF_H
15#define VL_HALF_H
16
17#include <string.h> /* memcpy */
18#include <vl/vl_numtypes.h>
19
48typedef vl_uint16_t vl_half_t;
49
50/*=============================================================================
51 * Constants
52 *============================================================================*/
53
55#define VL_HALF_SIGN_MASK 0x8000
57#define VL_HALF_EXP_MASK 0x7C00
59#define VL_HALF_FRAC_MASK 0x03FF
60
62#define VL_HALF_EXP_SHIFT 10
64#define VL_HALF_EXP_BIAS 15
65
66/* Signed zeros */
67#define VL_HALF_POS_ZERO ((vl_half)0x0000)
68#define VL_HALF_NEG_ZERO ((vl_half)0x8000)
69
70/* Infinities */
71#define VL_HALF_POS_INF ((vl_half)0x7C00)
72#define VL_HALF_NEG_INF ((vl_half)0xFC00)
73
74/* NaNs */
75#define VL_HALF_QNAN ((vl_half)0x7E00) /* canonical quiet NaN */
76#define VL_HALF_SNAN ((vl_half)0x7D00) /* signaling NaN (payload = 0) */
77
78/* Ones */
79#define VL_HALF_ONE ((vl_half)0x3C00) /* 1.0 */
80#define VL_HALF_NEG_ONE ((vl_half)0xBC00) /* -1.0 */
81
82/* Largest finite values */
83#define VL_HALF_MAX ((vl_half)0x7BFF) /* +65504 */
84#define VL_HALF_MIN ((vl_half)0xFBFF) /* -65504 */
85
86/* Smallest normal values */
87#define VL_HALF_MIN_POS ((vl_half)0x0400) /* 2^-14 */
88#define VL_HALF_MIN_NEG ((vl_half)0x8400)
89
90/* Smallest subnormal values */
91#define VL_HALF_TRUE_MIN_POS ((vl_half)0x0001) /* 2^-24 */
92#define VL_HALF_TRUE_MIN_NEG ((vl_half)0x8001)
93
94/* Difference between 1.0 and the next representable value */
95#define VL_HALF_EPSILON ((vl_half)0x1400) /* 2^-10 */
96
97/*=============================================================================
98 * Bit extraction helpers
99 *============================================================================*/
100
105static inline vl_uint16_t vlHalfSign(vl_half_t h) { return (h & VL_HALF_SIGN_MASK) >> 15; }
106
111static inline vl_uint16_t vlHalfExp(vl_half_t h) { return (h & VL_HALF_EXP_MASK) >> VL_HALF_EXP_SHIFT; }
112
117static inline vl_uint16_t vlHalfFrac(vl_half_t h) { return h & VL_HALF_FRAC_MASK; }
118
119/*=============================================================================
120 * Classification helpers
121 *============================================================================*/
122
126static inline int vlHalfIsZero(vl_half_t h) { return (h & ~VL_HALF_SIGN_MASK) == 0; }
127
131static inline int vlHalfIsSubnormal(vl_half_t h) { return (vlHalfExp(h) == 0) && (vlHalfFrac(h) != 0); }
132
136static inline int vlHalfIsInf(vl_half_t h) { return (h & ~VL_HALF_SIGN_MASK) == VL_HALF_EXP_MASK; }
137
141static inline int vlHalfIsNaN(vl_half_t h) { return (vlHalfExp(h) == 31) && (vlHalfFrac(h) != 0); }
142
146static inline int vlHalfSignBit(vl_half_t h) { return (h & VL_HALF_SIGN_MASK) != 0; }
147
148/*=============================================================================
149 * Packing helper
150 *============================================================================*/
151
157static inline vl_half_t vlHalfPack(vl_uint16_t sign, vl_uint16_t exp, vl_uint16_t frac)
158{
159 return (vl_half_t)(((sign & 1) << 15) | ((exp & 0x1F) << VL_HALF_EXP_SHIFT) | (frac & VL_HALF_FRAC_MASK));
160}
161
162/*=============================================================================
163 * Bit-cast helpers
164 *============================================================================*/
165
166/*=============================================================================
167 * Conversion: float -> half
168 *============================================================================*/
169
180static inline vl_half_t vlHalfFromFloat(float x)
181{
182 vl_uint32_t f;
183 memcpy(&f, &x, sizeof f);
184
185 vl_uint32_t sign = (f >> 31) & 1;
186 vl_int32_t exp = (f >> 23) & 0xFF;
187 vl_uint32_t frac = f & 0x7FFFFF;
188
189 if (exp == 255)
190 {
191 if (frac == 0)
192 return vlHalfPack(sign, 31, 0);
193 return vlHalfPack(sign, 31, 1);
194 }
195
196 vl_int32_t e = exp - 127;
197
198 if (e > 15)
199 return vlHalfPack(sign, 31, 0);
200
201 if (e < -14)
202 {
203 if (e < -24)
204 return vlHalfPack(sign, 0, 0);
205
206 vl_uint32_t mant = frac | 0x800000;
207 vl_uint32_t shift = (vl_uint32_t)(-e - 1);
208 vl_uint32_t rshift = shift + 13;
209
210 vl_uint32_t frac16 = mant >> rshift;
211 vl_uint32_t rem = mant & ((1u << rshift) - 1);
212 vl_uint32_t half = 1u << (rshift - 1);
213
214 if (rem > half || (rem == half && (frac16 & 1)))
215 frac16++;
216
217 return vlHalfPack(sign, 0, frac16);
218 }
219
220 vl_uint32_t frac16 = frac >> 13;
221 vl_uint32_t rem = frac & 0x1FFF;
222
223 if (rem > 0x1000 || (rem == 0x1000 && (frac16 & 1)))
224 frac16++;
225
226 if (frac16 == 0x400)
227 {
228 frac16 = 0;
229 e++;
230 if (e > 15)
231 return vlHalfPack(sign, 31, 0);
232 }
233
234 return vlHalfPack(sign, (vl_uint16_t)(e + VL_HALF_EXP_BIAS), frac16);
235}
236
237/*=============================================================================
238 * Conversion: half -> float
239 *============================================================================*/
240
244static inline float vlHalfToFloat(vl_half_t h)
245{
246 const vl_uint32_t sign = vlHalfSign(h);
247 const vl_uint32_t exp = vlHalfExp(h);
248 vl_uint32_t frac = vlHalfFrac(h);
249
250 vl_uint32_t value;
251
252 if (exp == 0)
253 {
254 if (frac == 0)
255 {
256 value = sign << 31;
257 }
258 else
259 {
260 vl_int32_t e = -14;
261 while ((frac & 0x400) == 0)
262 {
263 frac <<= 1;
264 e--;
265 }
266 frac &= 0x3FF;
267 value = (sign << 31) | ((e + 127) << 23) | (frac << 13);
268 }
269 }
270 else if (exp == 31)
271 {
272 value = (sign << 31) | (255 << 23) | (frac << 13);
273 }
274 else
275 {
276 value = (sign << 31) | ((exp - VL_HALF_EXP_BIAS + 127) << 23) | (frac << 13);
277 }
278
279 vl_float32_t result = 0;
280 memcpy(&result, &value, sizeof result);
281 return result;
282}
283
284/*=============================================================================
285 * Arithmetic helpers (via widening)
286 *============================================================================*/
287
289static inline vl_half_t vlHalfAdd(vl_half_t a, vl_half_t b)
290{
291 return vlHalfFromFloat(vlHalfToFloat(a) + vlHalfToFloat(b));
292}
293
295static inline vl_half_t vlHalfSub(vl_half_t a, vl_half_t b)
296{
297 return vlHalfFromFloat(vlHalfToFloat(a) - vlHalfToFloat(b));
298}
299
301static inline vl_half_t vlHalfMul(vl_half_t a, vl_half_t b)
302{
303 return vlHalfFromFloat(vlHalfToFloat(a) * vlHalfToFloat(b));
304}
305
307static inline vl_half_t vlHalfDiv(vl_half_t a, vl_half_t b)
308{
309 return vlHalfFromFloat(vlHalfToFloat(a) / vlHalfToFloat(b));
310}
311
312#endif /* VL_HALF_H */
#define VL_HALF_EXP_SHIFT
Exponent field shift.
Definition vl_half.h:62
#define VL_HALF_EXP_MASK
Exponent field mask.
Definition vl_half.h:57
#define VL_HALF_SIGN_MASK
Sign bit mask.
Definition vl_half.h:55
#define VL_HALF_FRAC_MASK
Fraction (mantissa) mask.
Definition vl_half.h:59
vl_uint16_t vl_half_t
Raw half-precision floating-point bit pattern.
Definition vl_half.h:48
#define VL_HALF_EXP_BIAS
Exponent bias for binary16.
Definition vl_half.h:64
VL_F32_T vl_float32_t
32-bit floating point number type.
Definition vl_numtypes.h:173