Line | Count | Source |
1 | | #include <cstring> |
2 | | |
3 | | #include "zserio/FloatUtil.h" |
4 | | |
5 | | namespace zserio |
6 | | { |
7 | | |
// ---- half precision (float16) bit layout: 1 sign bit, 5 exponent bits, 10 significand bits ----

// position of the lowest exponent bit and of the sign bit inside the 16-bit pattern
static constexpr uint16_t FLOAT16_EXPONENT_BIT_POSITION{UINT16_C(10)};
static constexpr uint16_t FLOAT16_SIGN_BIT_POSITION{UINT16_C(15)};

// number of explicitly stored significand bits (all bits below the exponent field)
static constexpr uint16_t FLOAT16_SIGNIFICAND_NUM_BITS{FLOAT16_EXPONENT_BIT_POSITION};

// masks selecting the individual fields of the 16-bit pattern
static constexpr uint16_t FLOAT16_SIGNIFICAND_MASK{UINT16_C(0x03FF)};
static constexpr uint16_t FLOAT16_EXPONENT_MASK{UINT16_C(0x7C00)};
static constexpr uint16_t FLOAT16_SIGN_MASK{UINT16_C(0x8000)};

// exponent bias and the all-ones exponent field value used for infinity and NaN
static constexpr uint16_t FLOAT16_EXPONENT_BIAS{UINT16_C(15)};
static constexpr uint16_t FLOAT16_EXPONENT_INFINITY_NAN{UINT16_C(0x001F)};

// ---- single precision (float32) bit layout: 1 sign bit, 8 exponent bits, 23 significand bits ----

// position of the lowest exponent bit and of the sign bit inside the 32-bit pattern
static constexpr uint32_t FLOAT32_EXPONENT_BIT_POSITION{UINT32_C(23)};
static constexpr uint32_t FLOAT32_SIGN_BIT_POSITION{UINT32_C(31)};

// number of explicitly stored significand bits (all bits below the exponent field)
static constexpr uint32_t FLOAT32_SIGNIFICAND_NUM_BITS{FLOAT32_EXPONENT_BIT_POSITION};

// masks selecting the individual fields of the 32-bit pattern
static constexpr uint32_t FLOAT32_SIGNIFICAND_MASK{UINT32_C(0x007FFFFF)};
static constexpr uint32_t FLOAT32_EXPONENT_MASK{UINT32_C(0x7F800000)};
static constexpr uint32_t FLOAT32_SIGN_MASK{UINT32_C(0x80000000)};

// exponent bias and the all-ones exponent field value used for infinity and NaN
static constexpr uint32_t FLOAT32_EXPONENT_BIAS{UINT32_C(127)};
static constexpr uint32_t FLOAT32_EXPONENT_INFINITY_NAN{UINT32_C(0x00FF)};
31 | | |
32 | | float convertUInt16ToFloat(uint16_t float16Value) |
33 | 101 | { |
34 | | // decompose half precision float (float16) |
35 | 101 | const uint16_t sign16Shifted = (float16Value & FLOAT16_SIGN_MASK); |
36 | 101 | const uint16_t exponent16 = static_cast<uint16_t>( |
37 | 101 | static_cast<uint16_t>(float16Value & FLOAT16_EXPONENT_MASK) >> FLOAT16_EXPONENT_BIT_POSITION); |
38 | 101 | const uint16_t significand16 = (float16Value & FLOAT16_SIGNIFICAND_MASK); |
39 | | |
40 | | // calculate significand for single precision float (float32) |
41 | 101 | uint32_t significand32 = static_cast<uint32_t>(significand16) |
42 | 101 | << (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS); |
43 | | |
44 | | // calculate exponent for single precision float (float32) |
45 | 101 | uint32_t exponent32 = 0; |
46 | 101 | if (exponent16 == 0) |
47 | 4 | { |
48 | 4 | if (significand32 != 0) |
49 | 2 | { |
50 | | // subnormal (denormal) number will be normalized |
51 | 2 | exponent32 = 1 + FLOAT32_EXPONENT_BIAS - FLOAT16_EXPONENT_BIAS; // exp is initialized by -14 |
52 | | // shift significand until leading bit overflows into exponent bit |
53 | 13 | while ((significand32 & (FLOAT32_SIGNIFICAND_MASK + 1)) == 0) |
54 | 11 | { |
55 | 11 | exponent32--; |
56 | 11 | significand32 <<= 1U; |
57 | 11 | } |
58 | | // mask out overflowed leading bit from significand (normalized has implicit leading bit 1) |
59 | 2 | significand32 &= FLOAT32_SIGNIFICAND_MASK; |
60 | 2 | } |
61 | 4 | } |
62 | 97 | else if (exponent16 == FLOAT16_EXPONENT_INFINITY_NAN) |
63 | 4 | { |
64 | | // infinity or NaN |
65 | 4 | exponent32 = FLOAT32_EXPONENT_INFINITY_NAN; |
66 | 4 | } |
67 | 93 | else |
68 | 93 | { |
69 | | // normal number |
70 | 93 | exponent32 = static_cast<uint32_t>(exponent16) - FLOAT16_EXPONENT_BIAS + FLOAT32_EXPONENT_BIAS; |
71 | 93 | } |
72 | | |
73 | | // compose single precision float (float32) |
74 | 101 | const uint32_t sign32Shifted = static_cast<uint32_t>(sign16Shifted) |
75 | 101 | << (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION); |
76 | 101 | const uint32_t exponent32Shifted = exponent32 << FLOAT32_EXPONENT_BIT_POSITION; |
77 | 101 | const uint32_t float32Value = sign32Shifted | exponent32Shifted | significand32; |
78 | | |
79 | | // convert it to float |
80 | 101 | return convertUInt32ToFloat(float32Value); |
81 | 101 | } |
82 | | |
83 | | uint16_t convertFloatToUInt16(float float32) |
84 | 196 | { |
85 | 196 | const uint32_t float32Value = convertFloatToUInt32(float32); |
86 | | |
87 | | // decompose single precision float (float32) |
88 | 196 | const uint32_t sign32Shifted = (float32Value & FLOAT32_SIGN_MASK); |
89 | 196 | const uint32_t exponent32 = (float32Value & FLOAT32_EXPONENT_MASK) >> FLOAT32_EXPONENT_BIT_POSITION; |
90 | 196 | const uint32_t significand32 = (float32Value & FLOAT32_SIGNIFICAND_MASK); |
91 | | |
92 | | // calculate significand for half precision float (float16) |
93 | 196 | uint16_t significand16 = static_cast<uint16_t>( |
94 | 196 | (significand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS))); |
95 | | |
96 | | // calculate exponent for half precision float (float16) |
97 | 196 | bool needsRounding = false; |
98 | 196 | uint16_t exponent16 = 0; |
99 | 196 | if (exponent32 == 0) |
100 | 4 | { |
101 | 4 | if (significand32 != 0) |
102 | 2 | { |
103 | | // subnormal (denormal) number will be zero |
104 | 2 | significand16 = 0; |
105 | 2 | } |
106 | 4 | } |
107 | 192 | else if (exponent32 == FLOAT32_EXPONENT_INFINITY_NAN) |
108 | 4 | { |
109 | | // infinity or NaN |
110 | 4 | exponent16 = FLOAT16_EXPONENT_INFINITY_NAN; |
111 | 4 | } |
112 | 188 | else |
113 | 188 | { |
114 | | // normal number |
115 | 188 | const int16_t signedExponent16 = static_cast<int16_t>(static_cast<int32_t>(exponent32) - |
116 | 188 | static_cast<int32_t>(FLOAT32_EXPONENT_BIAS) + static_cast<int32_t>(FLOAT16_EXPONENT_BIAS)); |
117 | 188 | if (signedExponent16 > FLOAT16_EXPONENT_INFINITY_NAN) |
118 | 1 | { |
119 | | // exponent overflow, set infinity or NaN |
120 | 1 | exponent16 = FLOAT16_EXPONENT_INFINITY_NAN; |
121 | 1 | } |
122 | 187 | else if (signedExponent16 <= 0) |
123 | 3 | { |
124 | | // exponent underflow |
125 | 3 | if (signedExponent16 <= static_cast<int16_t>(-FLOAT16_SIGNIFICAND_NUM_BITS)) |
126 | 1 | { |
127 | | // too big underflow, set to zero |
128 | 1 | significand16 = 0; |
129 | 1 | } |
130 | 2 | else |
131 | 2 | { |
132 | | // we can still use subnormal numbers |
133 | 2 | const uint32_t fullSignificand32 = significand32 | (FLOAT32_SIGNIFICAND_MASK + 1); |
134 | 2 | const uint32_t significandShift = static_cast<uint32_t>(1 - signedExponent16); |
135 | 2 | significand16 = static_cast<uint16_t>(fullSignificand32 >> |
136 | 2 | (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS + significandShift)); |
137 | | |
138 | 2 | needsRounding = |
139 | 2 | ((fullSignificand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS + |
140 | 2 | significandShift - 1)) & |
141 | 2 | UINT32_C(1)) != 0; |
142 | 2 | } |
143 | 3 | } |
144 | 184 | else |
145 | 184 | { |
146 | | // exponent ok |
147 | 184 | exponent16 = static_cast<uint16_t>(signedExponent16); |
148 | 184 | needsRounding = |
149 | 184 | ((significand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS - 1)) & |
150 | 184 | UINT32_C(1)) != 0; |
151 | 184 | } |
152 | 188 | } |
153 | | |
154 | | // compose half precision float (float16) |
155 | 196 | const uint16_t sign16Shifted = |
156 | 196 | static_cast<uint16_t>(sign32Shifted >> (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION)); |
157 | 196 | const uint16_t exponent16Shifted = static_cast<uint16_t>(exponent16 << FLOAT16_EXPONENT_BIT_POSITION); |
158 | 196 | uint16_t float16Value = static_cast<uint16_t>(sign16Shifted | exponent16Shifted) | significand16; |
159 | | |
160 | | // check rounding |
161 | 196 | if (needsRounding) |
162 | 2 | { |
163 | 2 | ++float16Value; // might overflow to infinity |
164 | 2 | } |
165 | | |
166 | 196 | return float16Value; |
167 | 196 | } |
168 | | |
// Reinterprets the given 32-bit pattern as a float.
// The bytes are copied as they are, no numeric conversion takes place.
float convertUInt32ToFloat(uint32_t float32Value)
{
    float reinterpreted = 0.0F;
    static_cast<void>(std::memcpy(&reinterpreted, &float32Value, sizeof(float32Value)));
    return reinterpreted;
}
176 | | |
// Returns the raw 32-bit pattern of the given float.
// The bytes are copied as they are, no numeric conversion takes place.
uint32_t convertFloatToUInt32(float float32)
{
    uint32_t rawBits = 0;
    static_cast<void>(std::memcpy(&rawBits, &float32, sizeof(float32)));
    return rawBits;
}
184 | | |
// Reinterprets the given 64-bit pattern as a double.
// The bytes are copied as they are, no numeric conversion takes place.
double convertUInt64ToDouble(uint64_t float64Value)
{
    double reinterpreted = 0.0;
    static_cast<void>(std::memcpy(&reinterpreted, &float64Value, sizeof(float64Value)));
    return reinterpreted;
}
192 | | |
// Returns the raw 64-bit pattern of the given double.
// The bytes are copied as they are, no numeric conversion takes place.
uint64_t convertDoubleToUInt64(double float64)
{
    uint64_t rawBits = 0;
    static_cast<void>(std::memcpy(&rawBits, &float64, sizeof(float64)));
    return rawBits;
}
200 | | |
201 | | } // namespace zserio |