src/zserio/FloatUtil.cpp

Source
#include <cstring>

#include "zserio/FloatUtil.h"

namespace zserio
{

// ---- half precision float (float16): 1 sign bit, 5 exponent bits, 10 significand bits ----

// index of the sign bit and of the lowest exponent bit
static constexpr uint16_t FLOAT16_SIGN_BIT_POSITION = UINT16_C(15);
static constexpr uint16_t FLOAT16_EXPONENT_BIT_POSITION = UINT16_C(10);

// the significand occupies every bit below the exponent field
static constexpr uint16_t FLOAT16_SIGNIFICAND_NUM_BITS = FLOAT16_EXPONENT_BIT_POSITION;

// all-ones exponent field (used for infinity and NaN) and the exponent bias
static constexpr uint16_t FLOAT16_EXPONENT_INFINITY_NAN = UINT16_C(0x001F);
static constexpr uint16_t FLOAT16_EXPONENT_BIAS = UINT16_C(15);

// field masks derived from the layout above (0x8000, 0x7C00 and 0x03FF)
static constexpr uint16_t FLOAT16_SIGN_MASK = static_cast<uint16_t>(1U << FLOAT16_SIGN_BIT_POSITION);
static constexpr uint16_t FLOAT16_EXPONENT_MASK =
        static_cast<uint16_t>(FLOAT16_EXPONENT_INFINITY_NAN << FLOAT16_EXPONENT_BIT_POSITION);
static constexpr uint16_t FLOAT16_SIGNIFICAND_MASK =
        static_cast<uint16_t>((1U << FLOAT16_SIGNIFICAND_NUM_BITS) - 1U);

// ---- single precision float (float32): 1 sign bit, 8 exponent bits, 23 significand bits ----

// index of the sign bit and of the lowest exponent bit
static constexpr uint32_t FLOAT32_SIGN_BIT_POSITION = UINT32_C(31);
static constexpr uint32_t FLOAT32_EXPONENT_BIT_POSITION = UINT32_C(23);

// the significand occupies every bit below the exponent field
static constexpr uint32_t FLOAT32_SIGNIFICAND_NUM_BITS = FLOAT32_EXPONENT_BIT_POSITION;

// all-ones exponent field (used for infinity and NaN) and the exponent bias
static constexpr uint32_t FLOAT32_EXPONENT_INFINITY_NAN = UINT32_C(0x00FF);
static constexpr uint32_t FLOAT32_EXPONENT_BIAS = UINT32_C(127);

// field masks derived from the layout above (0x80000000, 0x7F800000 and 0x007FFFFF)
static constexpr uint32_t FLOAT32_SIGN_MASK = UINT32_C(1) << FLOAT32_SIGN_BIT_POSITION;
static constexpr uint32_t FLOAT32_EXPONENT_MASK = FLOAT32_EXPONENT_INFINITY_NAN
        << FLOAT32_EXPONENT_BIT_POSITION;
static constexpr uint32_t FLOAT32_SIGNIFICAND_MASK =
        (UINT32_C(1) << FLOAT32_SIGNIFICAND_NUM_BITS) - UINT32_C(1);

// Converts a half precision float (float16) stored in uint16_t to a single precision float.
//
// The float16 bit pattern is split into sign, exponent and significand, each field is widened to its
// float32 counterpart and the resulting 32-bit pattern is turned into a float by convertUInt32ToFloat.
// Zero stays zero, float16 subnormals are normalized, infinity/NaN keep the all-ones exponent.
float convertUInt16ToFloat(uint16_t float16Value)
{
    // number of additional significand bits float32 has compared to float16
    const uint32_t significandWidening = FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS;
    // first bit above the stored float32 significand (lowest exponent bit)
    const uint32_t leadingBit = FLOAT32_SIGNIFICAND_MASK + 1;

    // split the half precision value into its three fields
    const uint16_t signField16 = (float16Value & FLOAT16_SIGN_MASK);
    const uint16_t fraction16 = (float16Value & FLOAT16_SIGNIFICAND_MASK);
    const uint16_t biasedExponent16 = static_cast<uint16_t>(
            static_cast<uint16_t>(float16Value & FLOAT16_EXPONENT_MASK) >> FLOAT16_EXPONENT_BIT_POSITION);

    // the significand is left aligned within the wider float32 significand
    uint32_t fraction32 = static_cast<uint32_t>(fraction16) << significandWidening;

    // the exponent stays zero for (signed) zero
    uint32_t biasedExponent32 = 0;
    if (biasedExponent16 == FLOAT16_EXPONENT_INFINITY_NAN)
    {
        // infinity or NaN maps to the all-ones float32 exponent
        biasedExponent32 = FLOAT32_EXPONENT_INFINITY_NAN;
    }
    else if (biasedExponent16 != 0)
    {
        // normal number, only the bias changes
        biasedExponent32 =
                static_cast<uint32_t>(biasedExponent16) - FLOAT16_EXPONENT_BIAS + FLOAT32_EXPONENT_BIAS;
    }
    else if (fraction32 != 0)
    {
        // float16 subnormal is representable as float32 normal, start at the float16 minimum exponent (-14)
        biasedExponent32 = 1 + FLOAT32_EXPONENT_BIAS - FLOAT16_EXPONENT_BIAS;
        // move the highest set bit up to the leading bit position, lowering the exponent on each step
        for (; (fraction32 & leadingBit) == 0; fraction32 <<= 1U)
        {
            --biasedExponent32;
        }
        // the leading one is implicit in a normalized number, drop it from the stored significand
        fraction32 &= FLOAT32_SIGNIFICAND_MASK;
    }

    // put the float32 bit pattern together
    const uint32_t signField32 = static_cast<uint32_t>(signField16)
            << (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION);
    const uint32_t float32Value =
            signField32 | (biasedExponent32 << FLOAT32_EXPONENT_BIT_POSITION) | fraction32;

    return convertUInt32ToFloat(float32Value);
}

// Converts a single precision float to a half precision float (float16) stored in uint16_t.
//
// Result by input class:
//  - zero and float32 subnormals give (signed) zero,
//  - infinity stays infinity and NaN stays NaN (low bits of the NaN payload are dropped),
//  - finite values whose exponent does not fit into float16 give (signed) infinity,
//  - values below the float16 normal range are stored as float16 subnormals or flushed to (signed) zero,
//  - dropped significand bits are rounded half up, i.e. the highest dropped bit decides.
uint16_t convertFloatToUInt16(float float32)
{
    const uint32_t float32Value = convertFloatToUInt32(float32);

    // decompose single precision float (float32)
    const uint32_t sign32Shifted = (float32Value & FLOAT32_SIGN_MASK);
    const uint32_t exponent32 = (float32Value & FLOAT32_EXPONENT_MASK) >> FLOAT32_EXPONENT_BIT_POSITION;
    const uint32_t significand32 = (float32Value & FLOAT32_SIGNIFICAND_MASK);

    // calculate significand for half precision float (float16), keeps only the upper bits
    uint16_t significand16 = static_cast<uint16_t>(
            (significand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS)));

    // calculate exponent for half precision float (float16)
    bool needsRounding = false;
    uint16_t exponent16 = 0;
    if (exponent32 == 0)
    {
        // zero or float32 subnormal (denormal) number, both become zero
        significand16 = 0;
    }
    else if (exponent32 == FLOAT32_EXPONENT_INFINITY_NAN)
    {
        // infinity or NaN
        exponent16 = FLOAT16_EXPONENT_INFINITY_NAN;
        if (significand32 != 0 && significand16 == 0)
        {
            // NaN payload was only in the dropped low bits, zero significand would wrongly mean infinity,
            // set the most significant significand bit to keep the value NaN
            significand16 = static_cast<uint16_t>(UINT16_C(1) << (FLOAT16_SIGNIFICAND_NUM_BITS - 1));
        }
    }
    else
    {
        // normal number, rebase the exponent from float32 bias to float16 bias
        const int16_t signedExponent16 = static_cast<int16_t>(static_cast<int32_t>(exponent32) -
                static_cast<int32_t>(FLOAT32_EXPONENT_BIAS) + static_cast<int32_t>(FLOAT16_EXPONENT_BIAS));
        if (signedExponent16 >= static_cast<int16_t>(FLOAT16_EXPONENT_INFINITY_NAN))
        {
            // exponent overflow, the all-ones exponent is reserved so the finite value becomes infinity
            // (significand must be zero, otherwise the result would be NaN)
            exponent16 = FLOAT16_EXPONENT_INFINITY_NAN;
            significand16 = 0;
        }
        else if (signedExponent16 <= 0)
        {
            // exponent underflow
            if (signedExponent16 <= static_cast<int16_t>(-FLOAT16_SIGNIFICAND_NUM_BITS))
            {
                // too big underflow, set to zero
                // NOTE(review): this includes signedExponent16 == -10 (magnitudes in [2^-25, 2^-24)),
                // which are flushed to zero without rounding - confirm this is intended
                significand16 = 0;
            }
            else
            {
                // we can still use subnormal numbers, make the implicit leading bit explicit
                // and shift the significand right by the missing exponent range
                const uint32_t fullSignificand32 = significand32 | (FLOAT32_SIGNIFICAND_MASK + 1);
                const uint32_t significandShift = static_cast<uint32_t>(1 - signedExponent16);
                significand16 = static_cast<uint16_t>(fullSignificand32 >>
                        (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS + significandShift));

                // round up when the highest dropped bit is set
                needsRounding =
                        ((fullSignificand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS +
                                                       significandShift - 1)) &
                                UINT32_C(1)) != 0;
            }
        }
        else
        {
            // exponent ok, round up when the highest dropped bit is set
            exponent16 = static_cast<uint16_t>(signedExponent16);
            needsRounding =
                    ((significand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS - 1)) &
                            UINT32_C(1)) != 0;
        }
    }

    // compose half precision float (float16)
    const uint16_t sign16Shifted =
            static_cast<uint16_t>(sign32Shifted >> (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION));
    const uint16_t exponent16Shifted = static_cast<uint16_t>(exponent16 << FLOAT16_EXPONENT_BIT_POSITION);
    uint16_t float16Value = static_cast<uint16_t>(sign16Shifted | exponent16Shifted) | significand16;

    // check rounding
    if (needsRounding)
    {
        // carry from the significand goes into the exponent, so the largest normal might overflow to infinity;
        // exponent16 is at most 30 here, hence the carry can never reach the sign bit
        ++float16Value;
    }

    return float16Value;
}

// Returns the float whose object representation equals the given 32-bit pattern.
// The bytes are copied with memcpy, no numeric conversion takes place.
float convertUInt32ToFloat(uint32_t float32Value)
{
    float result{};
    static_cast<void>(std::memcpy(&result, &float32Value, sizeof(float32Value)));

    return result;
}

// Returns the object representation of the given float as a 32-bit pattern.
// The bytes are copied with memcpy, no numeric conversion takes place.
uint32_t convertFloatToUInt32(float float32)
{
    uint32_t bits{};
    static_cast<void>(std::memcpy(&bits, &float32, sizeof(float32)));

    return bits;
}

// Returns the double whose object representation equals the given 64-bit pattern.
// The bytes are copied with memcpy, no numeric conversion takes place.
double convertUInt64ToDouble(uint64_t float64Value)
{
    double result{};
    static_cast<void>(std::memcpy(&result, &float64Value, sizeof(float64Value)));

    return result;
}

// Returns the object representation of the given double as a 64-bit pattern.
// The bytes are copied with memcpy, no numeric conversion takes place.
uint64_t convertDoubleToUInt64(double float64)
{
    uint64_t bits{};
    static_cast<void>(std::memcpy(&bits, &float64, sizeof(float64)));

    return bits;
}

} // namespace zserio

Coverage Report

Created: 2024-09-23 09:09

Line	Count	Source
1		#include <cstring>
2
3		#include "zserio/FloatUtil.h"
4
5		namespace zserio
6		{
7
8		static constexpr uint16_t FLOAT16_SIGN_MASK = UINT16_C(0x8000);
9		static constexpr uint16_t FLOAT16_EXPONENT_MASK = UINT16_C(0x7C00);
10		static constexpr uint16_t FLOAT16_SIGNIFICAND_MASK = UINT16_C(0x03FF);
11
12		static constexpr uint16_t FLOAT16_SIGN_BIT_POSITION = UINT16_C(15);
13		static constexpr uint16_t FLOAT16_EXPONENT_BIT_POSITION = UINT16_C(10);
14
15		static constexpr uint16_t FLOAT16_SIGNIFICAND_NUM_BITS = FLOAT16_EXPONENT_BIT_POSITION;
16
17		static constexpr uint16_t FLOAT16_EXPONENT_INFINITY_NAN = UINT16_C(0x001F);
18		static constexpr uint16_t FLOAT16_EXPONENT_BIAS = UINT16_C(15);
19
20		static constexpr uint32_t FLOAT32_SIGN_MASK = UINT32_C(0x80000000);
21		static constexpr uint32_t FLOAT32_EXPONENT_MASK = UINT32_C(0x7F800000);
22		static constexpr uint32_t FLOAT32_SIGNIFICAND_MASK = UINT32_C(0x007FFFFF);
23
24		static constexpr uint32_t FLOAT32_SIGN_BIT_POSITION = UINT32_C(31);
25		static constexpr uint32_t FLOAT32_EXPONENT_BIT_POSITION = UINT32_C(23);
26
27		static constexpr uint32_t FLOAT32_SIGNIFICAND_NUM_BITS = FLOAT32_EXPONENT_BIT_POSITION;
28
29		static constexpr uint32_t FLOAT32_EXPONENT_INFINITY_NAN = UINT32_C(0x00FF);
30		static constexpr uint32_t FLOAT32_EXPONENT_BIAS = UINT32_C(127);
31
32		float convertUInt16ToFloat(uint16_t float16Value)
33	101	{
34		// decompose half precision float (float16)
35	101	const uint16_t sign16Shifted = (float16Value & FLOAT16_SIGN_MASK);
36	101	const uint16_t exponent16 = static_cast<uint16_t>(
37	101	static_cast<uint16_t>(float16Value & FLOAT16_EXPONENT_MASK) >> FLOAT16_EXPONENT_BIT_POSITION);
38	101	const uint16_t significand16 = (float16Value & FLOAT16_SIGNIFICAND_MASK);
39
40		// calculate significand for single precision float (float32)
41	101	uint32_t significand32 = static_cast<uint32_t>(significand16)
42	101	<< (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS);
43
44		// calculate exponent for single precision float (float32)
45	101	uint32_t exponent32 = 0;
46	101	if (exponent16 == 0)
47	4	{
48	4	if (significand32 != 0)
49	2	{
50		// subnormal (denormal) number will be normalized
51	2	exponent32 = 1 + FLOAT32_EXPONENT_BIAS - FLOAT16_EXPONENT_BIAS; // exp is initialized by -14
52		// shift significand until leading bit overflows into exponent bit
53	13	while ((significand32 & (FLOAT32_SIGNIFICAND_MASK + 1)) == 0)
54	11	{
55	11	exponent32--;
56	11	significand32 <<= 1U;
57	11	}
58		// mask out overflowed leading bit from significand (normalized has implicit leading bit 1)
59	2	significand32 &= FLOAT32_SIGNIFICAND_MASK;
60	2	}
61	4	}
62	97	else if (exponent16 == FLOAT16_EXPONENT_INFINITY_NAN)
63	4	{
64		// infinity or NaN
65	4	exponent32 = FLOAT32_EXPONENT_INFINITY_NAN;
66	4	}
67	93	else
68	93	{
69		// normal number
70	93	exponent32 = static_cast<uint32_t>(exponent16) - FLOAT16_EXPONENT_BIAS + FLOAT32_EXPONENT_BIAS;
71	93	}
72
73		// compose single precision float (float32)
74	101	const uint32_t sign32Shifted = static_cast<uint32_t>(sign16Shifted)
75	101	<< (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION);
76	101	const uint32_t exponent32Shifted = exponent32 << FLOAT32_EXPONENT_BIT_POSITION;
77	101	const uint32_t float32Value = sign32Shifted \| exponent32Shifted \| significand32;
78
79		// convert it to float
80	101	return convertUInt32ToFloat(float32Value);
81	101	}
82
83		uint16_t convertFloatToUInt16(float float32)
84	196	{
85	196	const uint32_t float32Value = convertFloatToUInt32(float32);
86
87		// decompose single precision float (float32)
88	196	const uint32_t sign32Shifted = (float32Value & FLOAT32_SIGN_MASK);
89	196	const uint32_t exponent32 = (float32Value & FLOAT32_EXPONENT_MASK) >> FLOAT32_EXPONENT_BIT_POSITION;
90	196	const uint32_t significand32 = (float32Value & FLOAT32_SIGNIFICAND_MASK);
91
92		// calculate significand for half precision float (float16)
93	196	uint16_t significand16 = static_cast<uint16_t>(
94	196	(significand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS)));
95
96		// calculate exponent for half precision float (float16)
97	196	bool needsRounding = false;
98	196	uint16_t exponent16 = 0;
99	196	if (exponent32 == 0)
100	4	{
101	4	if (significand32 != 0)
102	2	{
103		// subnormal (denormal) number will be zero
104	2	significand16 = 0;
105	2	}
106	4	}
107	192	else if (exponent32 == FLOAT32_EXPONENT_INFINITY_NAN)
108	4	{
109		// infinity or NaN
110	4	exponent16 = FLOAT16_EXPONENT_INFINITY_NAN;
111	4	}
112	188	else
113	188	{
114		// normal number
115	188	const int16_t signedExponent16 = static_cast<int16_t>(static_cast<int32_t>(exponent32) -
116	188	static_cast<int32_t>(FLOAT32_EXPONENT_BIAS) + static_cast<int32_t>(FLOAT16_EXPONENT_BIAS));
117	188	if (signedExponent16 > FLOAT16_EXPONENT_INFINITY_NAN)
118	1	{
119		// exponent overflow, set infinity or NaN
120	1	exponent16 = FLOAT16_EXPONENT_INFINITY_NAN;
121	1	}
122	187	else if (signedExponent16 <= 0)
123	3	{
124		// exponent underflow
125	3	if (signedExponent16 <= static_cast<int16_t>(-FLOAT16_SIGNIFICAND_NUM_BITS))
126	1	{
127		// too big underflow, set to zero
128	1	significand16 = 0;
129	1	}
130	2	else
131	2	{
132		// we can still use subnormal numbers
133	2	const uint32_t fullSignificand32 = significand32 \| (FLOAT32_SIGNIFICAND_MASK + 1);
134	2	const uint32_t significandShift = static_cast<uint32_t>(1 - signedExponent16);
135	2	significand16 = static_cast<uint16_t>(fullSignificand32 >>
136	2	(FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS + significandShift));
137
138	2	needsRounding =
139	2	((fullSignificand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS +
140	2	significandShift - 1)) &
141	2	UINT32_C(1)) != 0;
142	2	}
143	3	}
144	184	else
145	184	{
146		// exponent ok
147	184	exponent16 = static_cast<uint16_t>(signedExponent16);
148	184	needsRounding =
149	184	((significand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS - 1)) &
150	184	UINT32_C(1)) != 0;
151	184	}
152	188	}
153
154		// compose half precision float (float16)
155	196	const uint16_t sign16Shifted =
156	196	static_cast<uint16_t>(sign32Shifted >> (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION));
157	196	const uint16_t exponent16Shifted = static_cast<uint16_t>(exponent16 << FLOAT16_EXPONENT_BIT_POSITION);
158	196	uint16_t float16Value = static_cast<uint16_t>(sign16Shifted \| exponent16Shifted) \| significand16;
159
160		// check rounding
161	196	if (needsRounding)
162	2	{
163	2	++float16Value; // might overflow to infinity
164	2	}
165
166	196	return float16Value;
167	196	}
168
169		float convertUInt32ToFloat(uint32_t float32Value)
170	307	{
171	307	float convertedFloat = 0.0F;
172	307	(void)std::memcpy(&convertedFloat, &float32Value, sizeof(uint32_t));
173
174	307	return convertedFloat;
175	307	}
176
177		uint32_t convertFloatToUInt32(float float32)
178	585	{
179	585	uint32_t float32Value = 0;
180	585	(void)std::memcpy(&float32Value, &float32, sizeof(float));
181
182	585	return float32Value;
183	585	}
184
185		double convertUInt64ToDouble(uint64_t float64Value)
186	374	{
187	374	double convertedDouble = 0.0;
188	374	(void)std::memcpy(&convertedDouble, &float64Value, sizeof(uint64_t));
189
190	374	return convertedDouble;
191	374	}
192
193		uint64_t convertDoubleToUInt64(double float64)
194	742	{
195	742	uint64_t float64Value = 0;
196	742	(void)std::memcpy(&float64Value, &float64, sizeof(double));
197
198	742	return float64Value;
199	742	}
200
201		} // namespace zserio