Coverage Report

Created: 2024-09-23 09:09

src/zserio/FloatUtil.cpp
Line
Count
Source
1
#include <cstring>
2
3
#include "zserio/FloatUtil.h"
4
5
namespace zserio
{

// IEEE 754 half precision (binary16) layout: 1 sign bit, 5 exponent bits, 10 significand bits.
static constexpr uint16_t FLOAT16_SIGN_MASK = UINT16_C(0x8000);
static constexpr uint16_t FLOAT16_EXPONENT_MASK = UINT16_C(0x7C00);
static constexpr uint16_t FLOAT16_SIGNIFICAND_MASK = UINT16_C(0x03FF);

static constexpr uint16_t FLOAT16_SIGN_BIT_POSITION = UINT16_C(15);
static constexpr uint16_t FLOAT16_EXPONENT_BIT_POSITION = UINT16_C(10);

static constexpr uint16_t FLOAT16_SIGNIFICAND_NUM_BITS = FLOAT16_EXPONENT_BIT_POSITION;

// All exponent bits set: the encoding reserved for infinity (significand == 0) and NaN (significand != 0).
static constexpr uint16_t FLOAT16_EXPONENT_INFINITY_NAN = UINT16_C(0x001F);
static constexpr uint16_t FLOAT16_EXPONENT_BIAS = UINT16_C(15);

// Most significant significand bit; set to keep a NaN a NaN when its whole payload is truncated away.
static constexpr uint16_t FLOAT16_QUIET_NAN_BIT = UINT16_C(0x0200);

// IEEE 754 single precision (binary32) layout: 1 sign bit, 8 exponent bits, 23 significand bits.
static constexpr uint32_t FLOAT32_SIGN_MASK = UINT32_C(0x80000000);
static constexpr uint32_t FLOAT32_EXPONENT_MASK = UINT32_C(0x7F800000);
static constexpr uint32_t FLOAT32_SIGNIFICAND_MASK = UINT32_C(0x007FFFFF);

static constexpr uint32_t FLOAT32_SIGN_BIT_POSITION = UINT32_C(31);
static constexpr uint32_t FLOAT32_EXPONENT_BIT_POSITION = UINT32_C(23);

static constexpr uint32_t FLOAT32_SIGNIFICAND_NUM_BITS = FLOAT32_EXPONENT_BIT_POSITION;

// All exponent bits set: the encoding reserved for infinity and NaN.
static constexpr uint32_t FLOAT32_EXPONENT_INFINITY_NAN = UINT32_C(0x00FF);
static constexpr uint32_t FLOAT32_EXPONENT_BIAS = UINT32_C(127);

/**
 * Reinterprets the bits of a 32-bit unsigned integer as a float.
 *
 * \param float32Value Bit pattern to reinterpret.
 *
 * \return Float whose object representation equals the given bit pattern.
 */
float convertUInt32ToFloat(uint32_t float32Value)
{
    static_assert(sizeof(float) == sizeof(uint32_t), "float must be exactly 32 bits wide");

    float convertedFloat = 0.0F;
    // memcpy is the strict-aliasing-safe way to copy the object representation
    (void)std::memcpy(&convertedFloat, &float32Value, sizeof(float));

    return convertedFloat;
}

/**
 * Returns the bits of a float as a 32-bit unsigned integer.
 *
 * \param float32 Float to reinterpret.
 *
 * \return Bit pattern equal to the object representation of the given float.
 */
uint32_t convertFloatToUInt32(float float32)
{
    static_assert(sizeof(float) == sizeof(uint32_t), "float must be exactly 32 bits wide");

    uint32_t float32Value = 0;
    (void)std::memcpy(&float32Value, &float32, sizeof(float));

    return float32Value;
}

/**
 * Reinterprets the bits of a 64-bit unsigned integer as a double.
 *
 * \param float64Value Bit pattern to reinterpret.
 *
 * \return Double whose object representation equals the given bit pattern.
 */
double convertUInt64ToDouble(uint64_t float64Value)
{
    static_assert(sizeof(double) == sizeof(uint64_t), "double must be exactly 64 bits wide");

    double convertedDouble = 0.0;
    (void)std::memcpy(&convertedDouble, &float64Value, sizeof(double));

    return convertedDouble;
}

/**
 * Returns the bits of a double as a 64-bit unsigned integer.
 *
 * \param float64 Double to reinterpret.
 *
 * \return Bit pattern equal to the object representation of the given double.
 */
uint64_t convertDoubleToUInt64(double float64)
{
    static_assert(sizeof(double) == sizeof(uint64_t), "double must be exactly 64 bits wide");

    uint64_t float64Value = 0;
    (void)std::memcpy(&float64Value, &float64, sizeof(double));

    return float64Value;
}

/**
 * Converts a half precision (binary16) bit pattern to a float.
 *
 * Every half precision value is exactly representable in single precision, so the conversion is
 * lossless: subnormal halves are normalized, infinities stay infinities and a NaN keeps its payload
 * in the upper significand bits.
 *
 * \param float16Value Half precision value stored in a 16-bit unsigned integer.
 *
 * \return Single precision float with the same value.
 */
float convertUInt16ToFloat(uint16_t float16Value)
{
    // decompose half precision float (float16)
    const uint16_t sign16Shifted = static_cast<uint16_t>(float16Value & FLOAT16_SIGN_MASK);
    const uint16_t exponent16 = static_cast<uint16_t>(
            static_cast<uint16_t>(float16Value & FLOAT16_EXPONENT_MASK) >> FLOAT16_EXPONENT_BIT_POSITION);
    const uint16_t significand16 = static_cast<uint16_t>(float16Value & FLOAT16_SIGNIFICAND_MASK);

    // calculate significand for single precision float (float32)
    uint32_t significand32 = static_cast<uint32_t>(significand16)
            << (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS);

    // calculate exponent for single precision float (float32)
    uint32_t exponent32 = 0;
    if (exponent16 == 0)
    {
        if (significand32 != 0)
        {
            // subnormal (denormal) number will be normalized;
            // start from the biased float32 exponent which corresponds to 2^-14
            exponent32 = 1 + FLOAT32_EXPONENT_BIAS - FLOAT16_EXPONENT_BIAS;
            // shift significand until leading bit overflows into exponent bit
            while ((significand32 & (FLOAT32_SIGNIFICAND_MASK + 1)) == 0)
            {
                exponent32--;
                significand32 <<= 1U;
            }
            // mask out overflowed leading bit from significand (normalized has implicit leading bit 1)
            significand32 &= FLOAT32_SIGNIFICAND_MASK;
        }
        // else: signed zero, exponent and significand both stay zero
    }
    else if (exponent16 == FLOAT16_EXPONENT_INFINITY_NAN)
    {
        // infinity or NaN
        exponent32 = FLOAT32_EXPONENT_INFINITY_NAN;
    }
    else
    {
        // normal number, just change the exponent bias
        exponent32 = static_cast<uint32_t>(exponent16) - FLOAT16_EXPONENT_BIAS + FLOAT32_EXPONENT_BIAS;
    }

    // compose single precision float (float32)
    const uint32_t sign32Shifted = static_cast<uint32_t>(sign16Shifted)
            << (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION);
    const uint32_t exponent32Shifted = exponent32 << FLOAT32_EXPONENT_BIT_POSITION;
    const uint32_t float32Value = sign32Shifted | exponent32Shifted | significand32;

    // convert it to float
    return convertUInt32ToFloat(float32Value);
}

/**
 * Converts a float to a half precision (binary16) bit pattern.
 *
 * Behavior by input class:
 * - zero and float32 subnormals become signed zero,
 * - infinity stays infinity,
 * - NaN stays NaN (the payload is truncated; if nothing of it survives, the quiet bit is set),
 * - finite values too large for half precision become signed infinity,
 * - values too small for a half precision subnormal become signed zero,
 * - everything else is converted with the dropped bits rounded half up (the highest dropped bit
 *   decides); the rounding carry may propagate into the exponent and even produce infinity.
 *
 * \param float32 Single precision float to convert.
 *
 * \return Half precision value stored in a 16-bit unsigned integer.
 */
uint16_t convertFloatToUInt16(float float32)
{
    const uint32_t float32Value = convertFloatToUInt32(float32);

    // number of significand bits which do not fit into half precision
    const uint32_t significandBitsDiff = FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS;

    // decompose single precision float (float32)
    const uint32_t sign32Shifted = (float32Value & FLOAT32_SIGN_MASK);
    const uint32_t exponent32 = (float32Value & FLOAT32_EXPONENT_MASK) >> FLOAT32_EXPONENT_BIT_POSITION;
    const uint32_t significand32 = (float32Value & FLOAT32_SIGNIFICAND_MASK);

    // calculate significand for half precision float (float16)
    uint16_t significand16 = static_cast<uint16_t>(significand32 >> significandBitsDiff);

    // calculate exponent for half precision float (float16)
    bool needsRounding = false;
    uint16_t exponent16 = 0;
    if (exponent32 == 0)
    {
        if (significand32 != 0)
        {
            // subnormal (denormal) number will be zero
            significand16 = 0;
        }
    }
    else if (exponent32 == FLOAT32_EXPONENT_INFINITY_NAN)
    {
        // infinity or NaN
        exponent16 = FLOAT16_EXPONENT_INFINITY_NAN;
        if (significand32 != 0 && significand16 == 0)
        {
            // NaN payload lived only in the truncated low bits; without this the result would be
            // encoded as infinity
            significand16 = FLOAT16_QUIET_NAN_BIT;
        }
    }
    else
    {
        // normal number
        const int32_t signedExponent16 = static_cast<int32_t>(exponent32) -
                static_cast<int32_t>(FLOAT32_EXPONENT_BIAS) + static_cast<int32_t>(FLOAT16_EXPONENT_BIAS);
        if (signedExponent16 >= static_cast<int32_t>(FLOAT16_EXPONENT_INFINITY_NAN))
        {
            // exponent overflow (the all-ones exponent is reserved, hence >=), set infinity;
            // the significand must be cleared, otherwise the result would be encoded as NaN
            exponent16 = FLOAT16_EXPONENT_INFINITY_NAN;
            significand16 = 0;
        }
        else if (signedExponent16 <= 0)
        {
            // exponent underflow
            if (signedExponent16 <= -static_cast<int32_t>(FLOAT16_SIGNIFICAND_NUM_BITS))
            {
                // too big underflow, set to zero
                significand16 = 0;
            }
            else
            {
                // we can still use subnormal numbers: make the implicit leading bit explicit and
                // shift it down into the significand
                const uint32_t fullSignificand32 = significand32 | (FLOAT32_SIGNIFICAND_MASK + 1);
                const uint32_t significandShift = static_cast<uint32_t>(1 - signedExponent16);
                significand16 =
                        static_cast<uint16_t>(fullSignificand32 >> (significandBitsDiff + significandShift));

                // highest dropped bit decides the rounding
                needsRounding =
                        ((fullSignificand32 >> (significandBitsDiff + significandShift - 1)) & UINT32_C(1)) !=
                        0;
            }
        }
        else
        {
            // exponent ok
            exponent16 = static_cast<uint16_t>(signedExponent16);
            // highest dropped bit decides the rounding
            needsRounding = ((significand32 >> (significandBitsDiff - 1)) & UINT32_C(1)) != 0;
        }
    }

    // compose half precision float (float16)
    const uint16_t sign16Shifted =
            static_cast<uint16_t>(sign32Shifted >> (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION));
    const uint16_t exponent16Shifted = static_cast<uint16_t>(exponent16 << FLOAT16_EXPONENT_BIT_POSITION);
    uint16_t float16Value = static_cast<uint16_t>(sign16Shifted | exponent16Shifted | significand16);

    // check rounding
    if (needsRounding)
    {
        // the carry may ripple from the significand into the exponent, which is the correct result
        // (next power of two or infinity)
        ++float16Value;
    }

    return float16Value;
}

} // namespace zserio