tint: Implement f16 value binary representation
This CL adds a method that returns the binary16 bit pattern for a
constructed Number<detail::NumberKindF16>. This is required for
generating SPIR-V operands.
Bug: tint:1473, tint:1502
Change-Id: Ia3680cdb5a0e64d31bfe2f48432cda3850c1f5a7
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/95240
Reviewed-by: Dan Sinclair <dsinclair@chromium.org>
Commit-Queue: Zhaoming Jiang <zhaoming.jiang@intel.com>
diff --git a/src/tint/number.cc b/src/tint/number.cc
index 17b005b..6371442 100644
--- a/src/tint/number.cc
+++ b/src/tint/number.cc
@@ -204,4 +204,108 @@
return value;
}
+uint16_t f16::BitsRepresentation() const {  // Encodes `value` as a binary16 bit pattern.
+ constexpr uint16_t f16_nan = 0x7e00u;
+ constexpr uint16_t f16_pos_inf = 0x7c00u;
+ constexpr uint16_t f16_neg_inf = 0xfc00u;
+
+ // Assert that f16::type is float (binary32); the memcpy below relies on it being 4 bytes.
+ static_assert(std::is_same<f16::type, float>());
+
+ // The value stored in an f16 object must already be quantized, so it is either NaN, +/- Inf,
+ // +/- zero, or exactly representable as a normal or subnormal f16.
+
+ if (std::isnan(value)) {
+ return f16_nan;  // Every NaN maps to the same pattern.
+ }
+
+ if (std::isinf(value)) {
+ return value > 0 ? f16_pos_inf : f16_neg_inf;
+ }
+
+ // From here on, value must be a finite value that is exactly representable as an f16.
+ // The following table shows the exponent cases for all such values.
+ // ---------------------------------------------------------------------------
+ // | Value category | Unbiased exp | F16 biased exp | F32 biased exp |
+ // |------------------|----------------|------------------|------------------|
+ // | +/- zero | \ | 0 | 0 |
+ // | Subnormal f16 | [-24, -15] | 0 | [103, 112] |
+ // | Normal f16 | [-14, 15] | [1, 30] | [113, 142] |
+ // ---------------------------------------------------------------------------
+
+ constexpr uint32_t max_f32_biased_exp_for_f16_normal_number = 142;
+ constexpr uint32_t min_f32_biased_exp_for_f16_normal_number = 113;
+ constexpr uint32_t max_f32_biased_exp_for_f16_subnormal_number = 112;
+ constexpr uint32_t min_f32_biased_exp_for_f16_subnormal_number = 103;
+
+ constexpr uint32_t f32_sign_mask = 0x80000000u;
+ constexpr uint32_t f32_exp_mask = 0x7f800000u;
+ constexpr uint32_t f32_mantissa_mask = 0x007fffffu;
+ constexpr uint32_t f32_mantissa_bis_number = 23;
+ constexpr uint32_t f32_exp_bias = 127;
+
+ constexpr uint16_t f16_sign_mask = 0x8000u;
+ constexpr uint16_t f16_exp_mask = 0x7c00u;
+ constexpr uint16_t f16_mantissa_mask = 0x03ffu;
+ constexpr uint32_t f16_mantissa_bis_number = 10;
+ constexpr uint32_t f16_exp_bias = 15;
+
+ uint32_t f32_bit_pattern;
+ memcpy(&f32_bit_pattern, &value, 4);  // Reinterpret the float's bytes as a uint32_t.
+ uint32_t f32_biased_exponent = (f32_bit_pattern & f32_exp_mask) >> f32_mantissa_bis_number;
+ uint32_t f32_mantissa = f32_bit_pattern & f32_mantissa_mask;
+
+ uint16_t f16_sign_part = static_cast<uint16_t>((f32_bit_pattern & f32_sign_mask) >> 16);
+ TINT_ASSERT(Semantic, (f16_sign_part & ~f16_sign_mask) == 0);
+
+ if ((f32_bit_pattern & ~f32_sign_mask) == 0) {
+ // +/- zero: only the sign bit can be set.
+ return f16_sign_part;
+ }
+
+ if ((min_f32_biased_exp_for_f16_normal_number <= f32_biased_exponent) &&
+ (f32_biased_exponent <= max_f32_biased_exp_for_f16_normal_number)) {
+ // Normal f16: re-bias the exponent and keep the top 10 bits of the f32 mantissa.
+ uint32_t f16_biased_exponent = f32_biased_exponent - f32_exp_bias + f16_exp_bias;
+ uint16_t f16_exp_part =
+ static_cast<uint16_t>(f16_biased_exponent << f16_mantissa_bis_number);
+ uint16_t f16_mantissa_part = static_cast<uint16_t>(
+ f32_mantissa >> (f32_mantissa_bis_number - f16_mantissa_bis_number));
+
+ TINT_ASSERT(Semantic, (f16_exp_part & ~f16_exp_mask) == 0);
+ TINT_ASSERT(Semantic, (f16_mantissa_part & ~f16_mantissa_mask) == 0);
+
+ return f16_sign_part | f16_exp_part | f16_mantissa_part;
+ }
+
+ if ((min_f32_biased_exp_for_f16_subnormal_number <= f32_biased_exponent) &&
+ (f32_biased_exponent <= max_f32_biased_exp_for_f16_subnormal_number)) {
+ // Subnormal f16
+ // The resulting exp bits are always 0, and the mantissa bits need special handling.
+ uint16_t f16_exp_part = 0;
+ // The resulting subnormal f16 has only 1 valid mantissa bit when the unbiased exponent of
+ // value is the minimum, i.e. -24, and has all 10 mantissa bits valid when the unbiased
+ // exponent of value is the maximum, i.e. -15.
+ uint32_t f16_valid_mantissa_bits =
+ f32_biased_exponent - min_f32_biased_exp_for_f16_subnormal_number + 1;
+ // The resulting f16 mantissa part comes from right-shifting the f32 mantissa bits with
+ // the implicit leading 1 added (f32_mantissa_mask + 1 is that leading 1, at bit 23).
+ uint16_t f16_mantissa_part =
+ static_cast<uint16_t>((f32_mantissa | (f32_mantissa_mask + 1)) >>
+ (f32_mantissa_bis_number + 1 - f16_valid_mantissa_bits));
+
+ TINT_ASSERT(Semantic, (1 <= f16_valid_mantissa_bits) &&
+ (f16_valid_mantissa_bits <= f16_mantissa_bis_number));
+ TINT_ASSERT(Semantic, (f16_mantissa_part & ~((1u << f16_valid_mantissa_bits) - 1)) == 0);
+ TINT_ASSERT(Semantic, (f16_mantissa_part != 0));
+
+ return f16_sign_part | f16_exp_part | f16_mantissa_part;
+ }
+
+ // Neither zero, subnormal f16 nor normal f16; this point should never be reached.
+ tint::diag::List diag;
+ TINT_UNREACHABLE(Semantic, diag);
+ return f16_nan;
+}
+
} // namespace tint
diff --git a/src/tint/number.h b/src/tint/number.h
index e130019..32cca59 100644
--- a/src/tint/number.h
+++ b/src/tint/number.h
@@ -186,6 +186,13 @@
return *this;
}
+ /// Get the binary16 bit pattern of this value as a uint16_t.
+ /// @returns the binary16 bit pattern, in type uint16_t, of the stored quantized f16 value. If
+ /// the value is NaN, the returned value will be 0x7e00u. If the value is positive infinity, the
+ /// returned value will be 0x7c00u. If the value is negative infinity, the returned value
+ /// will be 0xfc00u.
+ uint16_t BitsRepresentation() const;
+
/// @param value the input float32 value
/// @returns the float32 value quantized to the smaller float16 value, through truncation of the
/// mantissa bits (no rounding). If the float32 value is too large (positive or negative) to be
diff --git a/src/tint/number_test.cc b/src/tint/number_test.cc
index 81acc04..eeb31ed 100644
--- a/src/tint/number_test.cc
+++ b/src/tint/number_test.cc
@@ -217,83 +217,164 @@
EXPECT_EQ(CheckedConvert<f16>(AFloat(-kHighestF16Subnormal)), f16(-kHighestF16Subnormal));
}
-TEST(NumberTest, QuantizeF16) {
- constexpr float nan = std::numeric_limits<float>::quiet_NaN();
- constexpr float inf = std::numeric_limits<float>::infinity();
+// Test cases for f16 subnormal quantization and BitsRepresentation.
+// The ULP is based on float rather than double or f16, since f16 quantization takes a float as
+// input and f16::BitsRepresentation encodes the float value stored in the f16.
+constexpr float lowestPositiveNormalF16 = 0x1p-14;
+constexpr float lowestPositiveNormalF16PlusULP = 0x1.000002p-14;
+constexpr float lowestPositiveNormalF16MinusULP = 0x1.fffffep-15;
+constexpr float highestPositiveSubnormalF16 = 0x0.ffcp-14;
+constexpr float highestPositiveSubnormalF16PlusULP = 0x1.ff8002p-15;
+constexpr float highestPositiveSubnormalF16MinusULP = 0x1.ff7ffep-15;
+constexpr float lowestPositiveSubnormalF16 = 0x1.p-24;
+constexpr float lowestPositiveSubnormalF16PlusULP = 0x1.000002p-24;
+constexpr float lowestPositiveSubnormalF16MinusULP = 0x1.fffffep-25;
- EXPECT_EQ(f16(0.0), 0.0f);
- EXPECT_EQ(f16(1.0), 1.0f);
- EXPECT_EQ(f16(0.00006106496), 0.000061035156f);
- EXPECT_EQ(f16(1.0004883), 1.0f);
- EXPECT_EQ(f16(-8196), -8192.f);
- EXPECT_EQ(f16(65504.003), inf);
- EXPECT_EQ(f16(-65504.003), -inf);
- EXPECT_EQ(f16(inf), inf);
- EXPECT_EQ(f16(-inf), -inf);
- EXPECT_TRUE(std::isnan(f16(nan)));
+constexpr uint16_t lowestPositiveNormalF16Bits = 0x0400u;
+constexpr uint16_t highestPositiveSubnormalF16Bits = 0x03ffu;
+constexpr uint16_t lowestPositiveSubnormalF16Bits = 0x0001u;
- // Test for subnormal quantization.
- // The ULP is based on float rather than double or f16, since F16::Quantize take float as input.
- constexpr float lowestPositiveNormalF16 = 0x1p-14;
- constexpr float lowestPositiveNormalF16PlusULP = 0x1.000002p-14;
- constexpr float lowestPositiveNormalF16MinusULP = 0x1.fffffep-15;
- constexpr float highestPositiveSubnormalF16 = 0x0.ffcp-14;
- constexpr float highestPositiveSubnormalF16PlusULP = 0x1.ff8002p-15;
- constexpr float highestPositiveSubnormalF16MinusULP = 0x1.ff7ffep-15;
- constexpr float lowestPositiveSubnormalF16 = 0x1.p-24;
- constexpr float lowestPositiveSubnormalF16PlusULP = 0x1.000002p-24;
- constexpr float lowestPositiveSubnormalF16MinusULP = 0x1.fffffep-25;
+constexpr float highestNegativeNormalF16 = -lowestPositiveNormalF16;
+constexpr float highestNegativeNormalF16PlusULP = -lowestPositiveNormalF16MinusULP;
+constexpr float highestNegativeNormalF16MinusULP = -lowestPositiveNormalF16PlusULP;
+constexpr float lowestNegativeSubnormalF16 = -highestPositiveSubnormalF16;
+constexpr float lowestNegativeSubnormalF16PlusULP = -highestPositiveSubnormalF16MinusULP;
+constexpr float lowestNegativeSubnormalF16MinusULP = -highestPositiveSubnormalF16PlusULP;
+constexpr float highestNegativeSubnormalF16 = -lowestPositiveSubnormalF16;
+constexpr float highestNegativeSubnormalF16PlusULP = -lowestPositiveSubnormalF16MinusULP;
+constexpr float highestNegativeSubnormalF16MinusULP = -lowestPositiveSubnormalF16PlusULP;
- constexpr float highestNegativeNormalF16 = -lowestPositiveNormalF16;
- constexpr float highestNegativeNormalF16PlusULP = -lowestPositiveNormalF16MinusULP;
- constexpr float highestNegativeNormalF16MinusULP = -lowestPositiveNormalF16PlusULP;
- constexpr float lowestNegativeSubnormalF16 = -highestPositiveSubnormalF16;
- constexpr float lowestNegativeSubnormalF16PlusULP = -highestPositiveSubnormalF16MinusULP;
- constexpr float lowestNegativeSubnormalF16MinusULP = -highestPositiveSubnormalF16PlusULP;
- constexpr float highestNegativeSubnormalF16 = -lowestPositiveSubnormalF16;
- constexpr float highestNegativeSubnormalF16PlusULP = -lowestPositiveSubnormalF16MinusULP;
- constexpr float highestNegativeSubnormalF16MinusULP = -lowestPositiveSubnormalF16PlusULP;
+constexpr uint16_t highestNegativeNormalF16Bits = 0x8400u;
+constexpr uint16_t lowestNegativeSubnormalF16Bits = 0x83ffu;
+constexpr uint16_t highestNegativeSubnormalF16Bits = 0x8001u;
- // Value larger than or equal to lowest positive normal f16 will be quantized to normal f16.
- EXPECT_EQ(f16(lowestPositiveNormalF16PlusULP), lowestPositiveNormalF16);
- EXPECT_EQ(f16(lowestPositiveNormalF16), lowestPositiveNormalF16);
- // Positive value smaller than lowest positive normal f16 but not smaller than lowest positive
- // subnormal f16 will be quantized to subnormal f16 or zero.
- EXPECT_EQ(f16(lowestPositiveNormalF16MinusULP), highestPositiveSubnormalF16);
- EXPECT_EQ(f16(highestPositiveSubnormalF16PlusULP), highestPositiveSubnormalF16);
- EXPECT_EQ(f16(highestPositiveSubnormalF16), highestPositiveSubnormalF16);
- EXPECT_EQ(f16(highestPositiveSubnormalF16MinusULP), 0x0.ff8p-14);
- EXPECT_EQ(f16(lowestPositiveSubnormalF16PlusULP), lowestPositiveSubnormalF16);
- EXPECT_EQ(f16(lowestPositiveSubnormalF16), lowestPositiveSubnormalF16);
- // Positive value smaller than lowest positive subnormal f16 will be quantized to zero.
- EXPECT_EQ(f16(lowestPositiveSubnormalF16MinusULP), 0.0);
- // Test the mantissa discarding, the least significant mantissa bit is 0x1p-24 = 0x0.004p-14.
- EXPECT_EQ(f16(0x0.064p-14), 0x0.064p-14);
- EXPECT_EQ(f16(0x0.067fecp-14), 0x0.064p-14);
- EXPECT_EQ(f16(0x0.063ffep-14), 0x0.060p-14);
- EXPECT_EQ(f16(0x0.008p-14), 0x0.008p-14);
- EXPECT_EQ(f16(0x0.00bffep-14), 0x0.008p-14);
- EXPECT_EQ(f16(0x0.007ffep-14), 0x0.004p-14);
+constexpr float f32_nan = std::numeric_limits<float>::quiet_NaN();
+constexpr float f32_inf = std::numeric_limits<float>::infinity();
- // Vice versa for negative cases.
- EXPECT_EQ(f16(highestNegativeNormalF16MinusULP), highestNegativeNormalF16);
- EXPECT_EQ(f16(highestNegativeNormalF16), highestNegativeNormalF16);
- EXPECT_EQ(f16(highestNegativeNormalF16PlusULP), lowestNegativeSubnormalF16);
- EXPECT_EQ(f16(lowestNegativeSubnormalF16MinusULP), lowestNegativeSubnormalF16);
- EXPECT_EQ(f16(lowestNegativeSubnormalF16), lowestNegativeSubnormalF16);
- EXPECT_EQ(f16(lowestNegativeSubnormalF16PlusULP), -0x0.ff8p-14);
- EXPECT_EQ(f16(highestNegativeSubnormalF16MinusULP), highestNegativeSubnormalF16);
- EXPECT_EQ(f16(highestNegativeSubnormalF16), highestNegativeSubnormalF16);
- EXPECT_EQ(f16(highestNegativeSubnormalF16PlusULP), -0.0);
- // Test the mantissa discarding.
- EXPECT_EQ(f16(-0x0.064p-14), -0x0.064p-14);
- EXPECT_EQ(f16(-0x0.067fecp-14), -0x0.064p-14);
- EXPECT_EQ(f16(-0x0.063ffep-14), -0x0.060p-14);
- EXPECT_EQ(f16(-0x0.008p-14), -0x0.008p-14);
- EXPECT_EQ(f16(-0x0.00bffep-14), -0x0.008p-14);
- EXPECT_EQ(f16(-0x0.007ffep-14), -0x0.004p-14);
+struct F16TestCase {
+ float input_value;  // float passed to the f16 constructor
+ float quantized_value;  // expected value of the constructed f16
+ uint16_t f16_bit_pattern;  // expected result of BitsRepresentation() on the constructed f16
+};
+
+using NumberF16Test = testing::TestWithParam<F16TestCase>;
+
+TEST_P(NumberF16Test, QuantizeF16) {
+ float input_value = GetParam().input_value;
+ float quantized_value = GetParam().quantized_value;
+
+ std::stringstream ss;
+ ss << "input value = " << input_value << ", expected quantized value = " << quantized_value;
+ SCOPED_TRACE(ss.str());  // Identify the failing test case in the output.
+
+ if (std::isnan(quantized_value)) {  // NaN never compares equal, so EXPECT_EQ cannot be used.
+ EXPECT_TRUE(std::isnan(f16(input_value)));
+ } else {
+ EXPECT_EQ(f16(input_value), quantized_value);
+ }
}
+TEST_P(NumberF16Test, BitsRepresentation) {
+ float input_value = GetParam().input_value;
+ uint16_t representation = GetParam().f16_bit_pattern;
+
+ std::stringstream ss;
+ ss << "input value = " << input_value
+ << ", expected binary16 bits representation = " << std::hex << std::showbase
+ << representation;
+ SCOPED_TRACE(ss.str());  // Identify the failing test case in the output.
+
+ EXPECT_EQ(f16(input_value).BitsRepresentation(), representation);  // Bits of the built f16.
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ NumberF16Test,
+ NumberF16Test,
+ testing::ValuesIn(std::vector<F16TestCase>{
+ // NaN, Inf
+ {f32_inf, f32_inf, 0x7c00u},
+ {-f32_inf, -f32_inf, 0xfc00u},
+ {f32_nan, f32_nan, 0x7e00u},
+ {-f32_nan, -f32_nan, 0x7e00u},
+ // +/- zero
+ {+0.0f, 0.0f, 0x0000u},
+ {-0.0f, -0.0f, 0x8000u},
+ // Value in normal f16 range
+ {1.0f, 1.0f, 0x3c00u},
+ {-1.0f, -1.0f, 0xbc00u},
+ // 0.00006106496 quantized to 0.000061035156 = 0x1p-14
+ {0.00006106496f, 0.000061035156f, 0x0400u},
+ {-0.00006106496f, -0.000061035156f, 0x8400u},
+ // 1.0004883 quantized to 1.0 = 0x1p0
+ {1.0004883f, 1.0f, 0x3c00u},
+ {-1.0004883f, -1.0f, 0xbc00u},
+ // 8196.0 quantized to 8192.0 = 0x1p13
+ {8196.0f, 8192.f, 0x7000u},
+ {-8196.0f, -8192.f, 0xf000u},
+ // Value in subnormal f16 range
+ {0x0.034p-14f, 0x0.034p-14f, 0x000du},
+ {-0x0.034p-14f, -0x0.034p-14f, 0x800du},
+ {0x0.068p-14f, 0x0.068p-14f, 0x001au},
+ {-0x0.068p-14f, -0x0.068p-14f, 0x801au},
+ // 0x0.06b7p-14 quantized to 0x0.068p-14
+ {0x0.06b7p-14f, 0x0.068p-14f, 0x001au},
+ {-0x0.06b7p-14f, -0x0.068p-14, 0x801au},
+ // Value out of f16 range
+ {65504.003f, f32_inf, 0x7c00u},
+ {-65504.003f, -f32_inf, 0xfc00u},
+ {0x1.234p56f, f32_inf, 0x7c00u},
+ {-0x4.321p65f, -f32_inf, 0xfc00u},
+
+ // Test for subnormal quantization.
+ // Value larger than or equal to lowest positive normal f16 will be quantized to normal f16.
+ {lowestPositiveNormalF16PlusULP, lowestPositiveNormalF16, lowestPositiveNormalF16Bits},
+ {lowestPositiveNormalF16, lowestPositiveNormalF16, lowestPositiveNormalF16Bits},
+ // Positive value smaller than lowest positive normal f16 but not smaller than lowest
+ // positive subnormal f16 will be quantized to subnormal f16, discarding the extra
+ // mantissa bits.
+ {lowestPositiveNormalF16MinusULP, highestPositiveSubnormalF16,
+ highestPositiveSubnormalF16Bits},
+ {highestPositiveSubnormalF16PlusULP, highestPositiveSubnormalF16,
+ highestPositiveSubnormalF16Bits},
+ {highestPositiveSubnormalF16, highestPositiveSubnormalF16, highestPositiveSubnormalF16Bits},
+ {highestPositiveSubnormalF16MinusULP, 0x0.ff8p-14, 0x03feu},
+ {lowestPositiveSubnormalF16PlusULP, lowestPositiveSubnormalF16,
+ lowestPositiveSubnormalF16Bits},
+ {lowestPositiveSubnormalF16, lowestPositiveSubnormalF16, lowestPositiveSubnormalF16Bits},
+ // Positive value smaller than lowest positive subnormal f16 will be quantized to zero.
+ {lowestPositiveSubnormalF16MinusULP, 0.0, 0x0000u},
+ // Test the mantissa discarding; the least significant f16 mantissa bit is
+ // 0x1p-24 = 0x0.004p-14.
+ {0x0.064p-14f, 0x0.064p-14, 0x0019u},
+ {0x0.067fecp-14f, 0x0.064p-14, 0x0019u},
+ {0x0.063ffep-14f, 0x0.060p-14, 0x0018u},
+ {0x0.008p-14f, 0x0.008p-14, 0x0002u},
+ {0x0.00bffep-14f, 0x0.008p-14, 0x0002u},
+ {0x0.007ffep-14f, 0x0.004p-14, 0x0001u},
+
+ // Vice versa for negative cases.
+ {highestNegativeNormalF16MinusULP, highestNegativeNormalF16, highestNegativeNormalF16Bits},
+ {highestNegativeNormalF16, highestNegativeNormalF16, highestNegativeNormalF16Bits},
+ {highestNegativeNormalF16PlusULP, lowestNegativeSubnormalF16,
+ lowestNegativeSubnormalF16Bits},
+ {lowestNegativeSubnormalF16MinusULP, lowestNegativeSubnormalF16,
+ lowestNegativeSubnormalF16Bits},
+ {lowestNegativeSubnormalF16, lowestNegativeSubnormalF16, lowestNegativeSubnormalF16Bits},
+ {lowestNegativeSubnormalF16PlusULP, -0x0.ff8p-14, 0x83feu},
+ {highestNegativeSubnormalF16MinusULP, highestNegativeSubnormalF16,
+ highestNegativeSubnormalF16Bits},
+ {highestNegativeSubnormalF16, highestNegativeSubnormalF16, highestNegativeSubnormalF16Bits},
+ {highestNegativeSubnormalF16PlusULP, -0.0, 0x8000u},
+ // Test the mantissa discarding.
+ {-0x0.064p-14f, -0x0.064p-14, 0x8019u},
+ {-0x0.067fecp-14f, -0x0.064p-14, 0x8019u},
+ {-0x0.063ffep-14f, -0x0.060p-14, 0x8018u},
+ {-0x0.008p-14f, -0x0.008p-14, 0x8002u},
+ {-0x0.00bffep-14f, -0x0.008p-14, 0x8002u},
+ {-0x0.007ffep-14f, -0x0.004p-14, 0x8001u},
+ /////////////////////////////////////
+ }));
+
using BinaryCheckedCase = std::tuple<std::optional<AInt>, AInt, AInt>;
#undef OVERFLOW // corecrt_math.h :(