| // Copyright 2022 The Tint Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "src/tint/number.h" |
| |
| #include <algorithm> |
| #include <cmath> |
| #include <cstring> |
| #include <ostream> |
| |
| #include "src/tint/debug.h" |
| |
| namespace tint { |
| |
| std::ostream& operator<<(std::ostream& out, ConversionFailure failure) { |
| switch (failure) { |
| case ConversionFailure::kExceedsPositiveLimit: |
| return out << "value exceeds positive limit for type"; |
| case ConversionFailure::kExceedsNegativeLimit: |
| return out << "value exceeds negative limit for type"; |
| } |
| return out << "<unknown>"; |
| } |
| |
| f16::type f16::Quantize(f16::type value) { |
| if (value > kHighestValue) { |
| return std::numeric_limits<f16::type>::infinity(); |
| } |
| if (value < kLowestValue) { |
| return -std::numeric_limits<f16::type>::infinity(); |
| } |
| // Below value must be within the finite range of a f16. |
| // Assert we use binary32 (i.e. float) as underlying type, which has 4 bytes. |
| static_assert(std::is_same<f16::type, float>()); |
| const uint32_t sign_mask = 0x80000000u; // Mask for the sign bit |
| const uint32_t exponent_mask = 0x7f800000u; // Mask for 8 exponent bits |
| |
| uint32_t u32; |
| memcpy(&u32, &value, 4); |
| |
| if ((u32 & ~sign_mask) == 0) { |
| return value; // +/- zero |
| } |
| if ((u32 & exponent_mask) == exponent_mask) { // exponent all 1's |
| return value; // inf or nan |
| } |
| |
| // We are now going to quantize a f32 number into subnormal f16 and store the result value back |
| // into a f32 variable. Notice that all subnormal f16 values are just normal f32 values. Below |
| // will show that we can do this quantization by just masking out 13 or more lowest mantissa |
| // bits of the original f32 number. |
| // |
| // Note: |
| // f32 has 1 sign bit, 8 exponent bits for biased exponent (i.e. unbiased exponent + 127), and |
| // 23 mantissa bits. Binary form: s_eeeeeeee_mmmmmmmmmmmmmmmmmmmmmmm |
| // f16 has 1 sign bit, 5 exponent bits for biased exponent (i.e. unbiased exponent + 15), and |
| // 10 mantissa bits. Binary form: s_eeeee_mmmmmmmmmm |
| // The largest finite f16 number has a biased exponent of 11110 in binary, or 30 decimal, and so |
| // a unbiased exponent of 30 - 15 = 15. |
| // The smallest finite f16 number has a biased exponent of 00001 in binary, or 1 decimal, and so |
| // a unbiased exponent of 1 - 15 = -14. |
| // |
| // We may follow the argument below: |
| // 1. All normal or subnormal f16 values, range from 0x1.p-24 to 0x1.ffcp15, are exactly |
| // representable by normal f32 number. |
| // 1.1. We can denote the set of all f32 number that are exact representation of finite f16 |
| // values by `R`. |
| // 1.2. We can do the quantization by mapping a normal f32 value v (in the f16 finite range) |
| // to a certain f32 number v' in the set R, which is the largest (by the meaning of absolute |
| // value) one among all values in R that are no larger than v. |
| // 2. We can decide whether a given normal f32 number v is in the set R, by looking at its |
| // mantissa bits and biased exponent `e`. Recall that biased exponent e is unbiased exponent + |
| // 127, and in the range of 1 to 254 for normal f32 number. |
| // 2.1. If e >= 143, i.e. abs(v) >= 2^16 > f16::kHighestValue = 0x1.ffcp15, v is larger than |
| // any finite f16 value and can not be in set R. 2.2. If 142 >= e >= 113, or |
| // f16::kHighestValue >= abs(v) >= f16::kSmallestValue = 2^-14, v falls in the range of normal |
| // f16 values. In this case, v is in the set R iff the lowest 13 mantissa bits are all 0. (See |
| // below for proof) |
| // 2.2.1. If we let v' be v with lowest 13 mantissa bits masked to 0, v' will be in set R |
| // and the largest one in set R that no larger than v. Such v' is the quantized value of v. |
| // 2.3. If 112 >= e >= 103, i.e. 2^-14 > abs(v) >= f16::kSmallestSubnormalValue = 2^-24, v |
| // falls in the range of subnormal f16 values. In this case, v is in the set R iff the lowest |
| // 126-e mantissa bits are all 0. Notice that 126-e is in range 14 to 23, inclusive. (See |
| // below for proof) |
| // 2.3.1. If we let v' be v with lowest 126-e mantissa bits masked to 0, v' will be in set R |
| // and the largest on in set R that no larger than v. Such v' is the quantized value of v. |
| // 2.4. If 2^-24 > abs(v) > 0, i.e. 103 > e, v is smaller than any finite f16 value and not |
| // equal to 0.0, thus can not be in set R. |
| // 2.5. If abs(v) = 0, v is in set R and is just +-0.0. |
| // |
| // Proof for 2.2: |
| // Any normal f16 number, in binary form, s_eeeee_mmmmmmmmmm, has value |
| // (s==0?1:-1)*(1+uint(mmmmm_mmmmm)*(2^-10))*2^(uint(eeeee)-15) |
| // in which unit(bbbbb) means interprete binary pattern "bbbbb" as unsigned binary number, |
| // and we have 1 <= uint(eeeee) <= 30. |
| // This value is equal to a normal f32 number with binary |
| // s_EEEEEEEE_mmmmmmmmmm0000000000000 |
| // where uint(EEEEEEEE) = uint(eeeee) + 112, so that unbiased exponent keep unchanged |
| // uint(EEEEEEEE) - 127 = uint(eeeee) - 15 |
| // and its value is |
| // (s==0?1:-1)*(1+uint(mmmmm_mmmmm_00000_00000_000)*(2^-23))*2^(uint(EEEEEEEE)-127) |
| // == (s==0?1:-1)*(1+uint(mmmmm_mmmmm)*(2^-10))*2^(uint(eeeee)-15) |
| // Notice that uint(EEEEEEEE) is in range [113, 142], showing that it is a normal f32 number. |
| // So we proof that any normal f16 number can be exactly representd by a normal f32 number |
| // with biased exponent in range [113,142] and the lowest 13 mantissa bits 0. |
| // On the other hand, since mantissa bits mmmmmmmmmm are arbitrary, the value of any f32 |
| // that has a biased exponent in range [113, 142] and lowest 13 mantissa bits zero is equal |
| // to a normal f16 value. Hence we proof 2.2. |
| // |
| // Proof for 2.3: |
| // Any subnormal f16 number has a binary form of s_00000_mmmmmmmmmm, and its value is |
| // (s==0?1:-1)*uint(mmmmmmmmmm)*(2^-10)*(2^-14) = (s==0?1:-1)*uint(mmmmmmmmmm)*(2^-24). |
| // We discuss on bit pattern of mantissa bits mmmmmmmmmm. |
| // Case 1: mantissa bits has no leading zero bit, s_00000_1mmmmmmmmm |
| // In this case the value is |
| // (s==0?1:-1)*uint(1mmmm_mmmmm)*(2^-10)*(2^-14) |
| // == (s==0?1:-1)*(uint(1_mmmmm_mmmm)*(2^-9))*(2^-15) |
| // == (s==0?1:-1)*(1+uint(mmmmm_mmmm)*(2^-9))*(2^-15) |
| // == (s==0?1:-1)*(1+uint(mmmmm_mmmm0_00000_00000_000)*(2^-23))*(2^-15) |
| // which is equal to the value of normal f32 number |
| // s_EEEEEEEE_mmmmm_mmmm0_00000_00000_000 |
| // where uint(EEEEEEEE) = -15 + 127 = 112. Hence we proof that any subnormal f16 number |
| // with no leading zero mantissa bit can be exactly represented by a f32 number with |
| // biased exponent 112 and the lowest 14 mantissa bits zero, and the value of any f32 |
| // number with biased exponent 112 and the lowest 14 mantissa bits zero are equal to a |
| // subnormal f16 number with no leading zero mantissa bit. |
| // Case 2: mantissa bits has 1 leading zero bit, s_00000_01mmmmmmmm |
| // In this case the value is |
| // (s==0?1:-1)*uint(01mmm_mmmmm)*(2^-10)*(2^-14) |
| // == (s==0?1:-1)*(uint(01_mmmmm_mmm)*(2^-8))*(2^-16) |
| // == (s==0?1:-1)*(1+uint(mmmmm_mmm)*(2^-8))*(2^-16) |
| // == (s==0?1:-1)*(1+uint(mmmmm_mmm00_00000_00000_000)*(2^-23))*(2^-16) |
| // which is equal to the value of normal f32 number |
| // s_EEEEEEEE_mmmmm_mmm00_00000_00000_000 |
| // where uint(EEEEEEEE) = -16 + 127 = 111. Hence we proof that any subnormal f16 number |
| // with 1 leading zero mantissa bit can be exactly represented by a f32 number with |
| // biased exponent 111 and the lowest 15 mantissa bits zero, and the value of any f32 |
| // number with biased exponent 111 and the lowest 15 mantissa bits zero are equal to a |
| // subnormal f16 number with 1 leading zero mantissa bit. |
| // Case 3 to case 8: ...... |
| // Case 9: mantissa bits has 8 leading zero bit, s_00000_000000001m |
| // In this case the value is |
| // (s==0?1:-1)*uint(00000_0001m)*(2^-10)*(2^-14) |
| // == (s==0?1:-1)*(uint(000000001_m)*(2^-1))*(2^-23) |
| // == (s==0?1:-1)*(1+uint(m)*(2^-1))*(2^-23) |
| // == (s==0?1:-1)*(1+uint(m0000_00000_00000_00000_000)*(2^-23))*(2^-23) |
| // which is equal to the value of normal f32 number |
| // s_EEEEEEEE_m0000_00000_00000_00000_000 |
| // where uint(EEEEEEEE) = -23 + 127 = 104. Hence we proof that any subnormal f16 number |
| // with 8 leading zero mantissa bit can be exactly represented by a f32 number with |
| // biased exponent 104 and the lowest 22 mantissa bits zero, and the value of any f32 |
| // number with biased exponent 104 and the lowest 22 mantissa bits zero are equal to a |
| // subnormal f16 number with 8 leading zero mantissa bit. |
| // Case 10: mantissa bits has 9 leading zero bit, s_00000_0000000001 |
| // In this case the value is just +-2^-24 = +-0x1.0p-24, |
| // the f32 number has biased exponent 103 and all 23 mantissa bits zero. |
| // Case 11: mantissa bits has 10 leading zero bit, s_00000_0000000000, just 0.0 |
| // Concluding all these case, we proof that any subnormal f16 number with N leading zero |
| // mantissa bit can be exactly represented by a f32 number with biased exponent 112-N and the |
| // lowest 14+N mantissa bits zero, and the value of any f32 number with biased exponent 112-N (= |
| // e) and the lowest 14+N (= 126-e) mantissa bits zero are equal to a subnormal f16 number with |
| // N leading zero mantissa bit. N is in range [0, 9], so the f32 number's biased exponent e is |
| // in range [103, 112], or unbiased exponent in [-24, -15]. |
| |
| float abs_value = std::fabs(value); |
| if (abs_value >= kSmallestValue) { |
| // Value falls in the normal f16 range, quantize it to a normal f16 value by masking out |
| // lowest 13 mantissa bits. |
| u32 = u32 & ~((1u << 13) - 1); |
| } else if (abs_value >= kSmallestSubnormalValue) { |
| // Value should be quantized to a subnormal f16 value. |
| |
| // Get the biased exponent `e` of f32 value, e.g. value 127 representing exponent 2^0. |
| uint32_t biased_exponent_original = (u32 & exponent_mask) >> 23; |
| // Since we ensure that kSmallestValue = 0x1f-14 > abs(value) >= kSmallestSubnormalValue = |
| // 0x1f-24, value will have a unbiased exponent in range -24 to -15 (inclusive), and the |
| // corresponding biased exponent in f32 is in range 103 to 112 (inclusive). |
| TINT_ASSERT(Semantic, |
| (103 <= biased_exponent_original) && (biased_exponent_original <= 112)); |
| |
| // As we have proved, masking out the lowest 126-e mantissa bits of input value will result |
| // in a valid subnormal f16 value, which is exactly the required quantization result. |
| uint32_t discard_bits = 126 - biased_exponent_original; // In range 14 to 23 (inclusive) |
| TINT_ASSERT(Semantic, (14 <= discard_bits) && (discard_bits <= 23)); |
| uint32_t discard_mask = (1u << discard_bits) - 1; |
| u32 = u32 & ~discard_mask; |
| } else { |
| // value is too small that it can't even be represented as subnormal f16 number. Quantize |
| // to zero. |
| return value > 0 ? 0.0 : -0.0; |
| } |
| memcpy(&value, &u32, 4); |
| return value; |
| } |
| |
| uint16_t f16::BitsRepresentation() const { |
| constexpr uint16_t f16_nan = 0x7e00u; |
| constexpr uint16_t f16_pos_inf = 0x7c00u; |
| constexpr uint16_t f16_neg_inf = 0xfc00u; |
| |
| // Assert we use binary32 (i.e. float) as underlying type, which has 4 bytes. |
| static_assert(std::is_same<f16::type, float>()); |
| |
| // The stored value in f16 object must be already quantized, so it should be either NaN, +/- |
| // Inf, or exactly representable by normal or subnormal f16. |
| |
| if (std::isnan(value)) { |
| return f16_nan; |
| } |
| |
| if (std::isinf(value)) { |
| return value > 0 ? f16_pos_inf : f16_neg_inf; |
| } |
| |
| // Now quantized_value must be a finite f16 exactly-representable value. |
| // The following table shows exponent cases for all finite f16 exactly-representable value. |
| // --------------------------------------------------------------------------- |
| // | Value category | Unbiased exp | F16 biased exp | F32 biased exp | |
| // |------------------|----------------|------------------|------------------| |
| // | +/- zero | \ | 0 | 0 | |
| // | Subnormal f16 | [-24, -15] | 0 | [103, 112] | |
| // | Normal f16 | [-14, 15] | [1, 30] | [113, 142] | |
| // --------------------------------------------------------------------------- |
| |
| constexpr uint32_t max_f32_biased_exp_for_f16_normal_number = 142; |
| constexpr uint32_t min_f32_biased_exp_for_f16_normal_number = 113; |
| constexpr uint32_t max_f32_biased_exp_for_f16_subnormal_number = 112; |
| constexpr uint32_t min_f32_biased_exp_for_f16_subnormal_number = 103; |
| |
| constexpr uint32_t f32_sign_mask = 0x80000000u; |
| constexpr uint32_t f32_exp_mask = 0x7f800000u; |
| constexpr uint32_t f32_mantissa_mask = 0x007fffffu; |
| constexpr uint32_t f32_mantissa_bis_number = 23; |
| constexpr uint32_t f32_exp_bias = 127; |
| |
| constexpr uint16_t f16_sign_mask = 0x8000u; |
| constexpr uint16_t f16_exp_mask = 0x7c00u; |
| constexpr uint16_t f16_mantissa_mask = 0x03ffu; |
| constexpr uint32_t f16_mantissa_bis_number = 10; |
| constexpr uint32_t f16_exp_bias = 15; |
| |
| uint32_t f32_bit_pattern; |
| memcpy(&f32_bit_pattern, &value, 4); |
| uint32_t f32_biased_exponent = (f32_bit_pattern & f32_exp_mask) >> f32_mantissa_bis_number; |
| uint32_t f32_mantissa = f32_bit_pattern & f32_mantissa_mask; |
| |
| uint16_t f16_sign_part = static_cast<uint16_t>((f32_bit_pattern & f32_sign_mask) >> 16); |
| TINT_ASSERT(Semantic, (f16_sign_part & ~f16_sign_mask) == 0); |
| |
| if ((f32_bit_pattern & ~f32_sign_mask) == 0) { |
| // +/- zero |
| return f16_sign_part; |
| } |
| |
| if ((min_f32_biased_exp_for_f16_normal_number <= f32_biased_exponent) && |
| (f32_biased_exponent <= max_f32_biased_exp_for_f16_normal_number)) { |
| // Normal f16 |
| uint32_t f16_biased_exponent = f32_biased_exponent - f32_exp_bias + f16_exp_bias; |
| uint16_t f16_exp_part = |
| static_cast<uint16_t>(f16_biased_exponent << f16_mantissa_bis_number); |
| uint16_t f16_mantissa_part = static_cast<uint16_t>( |
| f32_mantissa >> (f32_mantissa_bis_number - f16_mantissa_bis_number)); |
| |
| TINT_ASSERT(Semantic, (f16_exp_part & ~f16_exp_mask) == 0); |
| TINT_ASSERT(Semantic, (f16_mantissa_part & ~f16_mantissa_mask) == 0); |
| |
| return f16_sign_part | f16_exp_part | f16_mantissa_part; |
| } |
| |
| if ((min_f32_biased_exp_for_f16_subnormal_number <= f32_biased_exponent) && |
| (f32_biased_exponent <= max_f32_biased_exp_for_f16_subnormal_number)) { |
| // Subnormal f16 |
| // The resulting exp bits are always 0, and the mantissa bits should be handled specially. |
| uint16_t f16_exp_part = 0; |
| // The resulting subnormal f16 will have only 1 valid mantissa bit if the unbiased exponent |
| // of value is of the minimum, i.e. -24; and have all 10 mantissa bits valid if the unbiased |
| // exponent of value is of the maximum, i.e. -15. |
| uint32_t f16_valid_mantissa_bits = |
| f32_biased_exponent - min_f32_biased_exp_for_f16_subnormal_number + 1; |
| // The resulting f16 mantissa part comes from right-shifting the f32 mantissa bits with |
| // leading 1 added. |
| uint16_t f16_mantissa_part = |
| static_cast<uint16_t>((f32_mantissa | (f32_mantissa_mask + 1)) >> |
| (f32_mantissa_bis_number + 1 - f16_valid_mantissa_bits)); |
| |
| TINT_ASSERT(Semantic, (1 <= f16_valid_mantissa_bits) && |
| (f16_valid_mantissa_bits <= f16_mantissa_bis_number)); |
| TINT_ASSERT(Semantic, (f16_mantissa_part & ~((1u << f16_valid_mantissa_bits) - 1)) == 0); |
| TINT_ASSERT(Semantic, (f16_mantissa_part != 0)); |
| |
| return f16_sign_part | f16_exp_part | f16_mantissa_part; |
| } |
| |
| // Neither zero, subnormal f16 or normal f16, shall never hit. |
| tint::diag::List diag; |
| TINT_UNREACHABLE(Semantic, diag); |
| return f16_nan; |
| } |
| |
| } // namespace tint |