Replace MSVC __builtin_clz shims with a portable half_clz16 helper

MichealReed · MichealReed · commit f2b555da20b5 · 2025-04-10T16:44:27.000-05:00
diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake
@@ -38,7 +38,7 @@ include_directories(BEFORE PUBLIC
 
 
 # Optionally try to find an existing Dawn build.
-set(ENABLE_DAWN_FIND ON CACHE BOOL "Attempt to find an existing Dawn build" FORCE)
+set(ENABLE_DAWN_FIND OFF CACHE BOOL "Attempt to find an existing Dawn build" FORCE)
 set(DAWN_BUILD_FOUND OFF CACHE BOOL "Dawn build found" FORCE)
 
 if(ENABLE_DAWN_FIND)
diff --git a/numeric_types/half.hpp b/numeric_types/half.hpp
@@ -7,50 +7,18 @@
 #include <cstdint>
 #include <cstdio>
 
-#ifdef _MSC_VER
-#include <intrin.h>
-
-static inline uint32_t __builtin_clz(uint32_t value)
-{
-  unsigned long leading_zero = 0;
-  if (value == 0)
-  {
-    return 32;
+// Counts the leading zero bits of a 16-bit value; returns 16 when value is 0.
+static inline uint16_t half_clz16(uint16_t value) {
+  uint16_t count = 0;
+  // Start at the highest bit (0x8000)
+  for (uint16_t mask = 0x8000; mask; mask >>= 1) {
+      if (value & mask)
+          break;
+      ++count;
   }
-  _BitScanReverse(&leading_zero, value);
-  return 31 - leading_zero;
+  return count;
 }
 
-static inline uint16_t __builtin_clz(uint16_t value)
-{
-  return __builtin_clz(static_cast<uint32_t>(value)) - 16;
-}
-
-static inline uint64_t __builtin_clz(uint64_t value)
-{
-  unsigned long leading_zero = 0;
-  if (value == 0)
-  {
-    return 64;
-  }
-#if defined(_WIN64)
-  _BitScanReverse64(&leading_zero, value);
-  return 63 - leading_zero;
-#else
-  uint32_t high = static_cast<uint32_t>(value >> 32);
-  uint32_t low = static_cast<uint32_t>(value);
-  if (high != 0)
-  {
-    return __builtin_clz(high);
-  }
-  else
-  {
-    return 32 + __builtin_clz(low);
-  }
-#endif
-}
-#endif
-
 struct half;
 static inline half halfFromFloat(float f);
 static inline float halfToFloat(half h);
@@ -59,8 +27,7 @@ int testHalf();
 /**
  * Experimental implementation of half-precision 16-bit floating point numbers.
  */
-struct half
-{
+struct half {
   uint16_t data;
 
   // Default constructor
@@ -78,22 +45,19 @@ struct half
   operator uint16_t() const { return data; }
 
   // Overload assignment operator from uint16_t
-  half &operator=(uint16_t value)
-  {
+  half &operator=(uint16_t value) {
     data = value;
     return *this;
   }
 
   // Overload assignment operator from another half
-  half &operator=(const half &other)
-  {
+  half &operator=(const half &other) {
     data = other.data;
     return *this;
   }
 
   // Overload assignment operator from float
-  half &operator=(float value)
-  {
+  half &operator=(float value) {
     data = halfFromFloat(value);
     return *this;
   }
@@ -104,10 +68,8 @@ struct half
  *
  * Based on Mike Acton's half.c implementation.
  */
-half halfFromFloat(float f)
-{
-  union
-  {
+half halfFromFloat(float f) {
+  union {
     float f;
     uint32_t u;
   } floatUnion = {f};
@@ -146,8 +108,7 @@ half halfFromFloat(float f)
   const uint32_t floatMantissa = float32 & FLOAT_MANTISSA_MASK;
 
   // Check for NaN
-  if ((floatExpMasked == FLOAT_EXP_MASK) && (floatMantissa != 0))
-  {
+  if ((floatExpMasked == FLOAT_EXP_MASK) && (floatMantissa != 0)) {
     half result;
     result.data =
         HALF_EXP_MASK | (floatMantissa >> FLOAT_HALF_MANTISSA_POS_OFFSET);
@@ -227,8 +188,7 @@ half halfFromFloat(float f)
  *
  * Based on Mike Acton's half.c implementation.
  */
-float halfToFloat(half h)
-{
+float halfToFloat(half h) {
   // Constants for bit masks, shifts, and biases
   const uint16_t ONE = 0x0001;
   const uint16_t TWO = 0x0002;
@@ -273,7 +233,7 @@ float halfToFloat(half h)
   const uint32_t isNan = isExpFlagged && isMantissaNonZero;
 
   // Handling denormalized numbers
-  const uint16_t halfMantissaLeadingZeros = __builtin_clz(halfMantissa) - 16;
+  const uint16_t halfMantissaLeadingZeros = half_clz16(halfMantissa);
   const uint16_t halfDenormShiftAmount =
       halfMantissaLeadingZeros + HALF_FLOAT_DENORM_SA_OFFSET;
   const uint32_t halfFloatDenormMantissaShiftAmount =
@@ -309,8 +269,7 @@ float halfToFloat(half h)
   const uint32_t result = checkNanResult;
 
   // Reinterpret the uint32_t result as a float using a union
-  union
-  {
+  union {
     uint32_t u;
     float f;
   } floatUnion;
diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp
@@ -200,8 +200,8 @@ void testToCPUWithint8() {
 
   // Validate the copy.
   for (size_t i = 0; i < N; ++i) {
-    LOG(kDefLog, kInfo, "inputData[%zu] = %d", i, inputData[i]);
-    LOG(kDefLog, kInfo, "outputData[%zu] = %d", i, outputData[i]);
+    //LOG(kDefLog, kInfo, "inputData[%zu] = %d", i, inputData[i]);
+    //LOG(kDefLog, kInfo, "outputData[%zu] = %d", i, outputData[i]);
     assert(outputData[i] == inputData[i]);
   }
   LOG(kDefLog, kInfo, "testToCPUWithint8 passed.");
@@ -234,8 +234,8 @@ void testToCPUWithint16() {
 
   // Validate the copy.
   for (size_t i = 0; i < N; ++i) {
-    LOG(kDefLog, kInfo, "inputData[%zu] = %d", i, inputData[i]);
-    LOG(kDefLog, kInfo, "outputData[%zu] = %d", i, outputData[i]);
+    //LOG(kDefLog, kInfo, "inputData[%zu] = %d", i, inputData[i]);
+    //LOG(kDefLog, kInfo, "outputData[%zu] = %d", i, outputData[i]);
     assert(outputData[i] == inputData[i]);
   }
   LOG(kDefLog, kInfo, "testToCPUWithint16 passed.");
@@ -268,8 +268,8 @@ void testToCPUWithint() {
 
   // Validate the copy.
   for (size_t i = 0; i < N; ++i) {
-    LOG(kDefLog, kInfo, "inputData[%zu] = %d", i, inputData[i]);
-    LOG(kDefLog, kInfo, "outputData[%zu] = %d", i, outputData[i]);
+    //LOG(kDefLog, kInfo, "inputData[%zu] = %d", i, inputData[i]);
+    //LOG(kDefLog, kInfo, "outputData[%zu] = %d", i, outputData[i]);
     assert(outputData[i] == inputData[i]);
   }
   LOG(kDefLog, kInfo, "testToCPUWithint passed.");
@@ -328,8 +328,8 @@ void testToCPUWithUint8() {
 
   // Verify the output matches the input.
   for (size_t i = 0; i < N; ++i) {
-    LOG(kDefLog, kInfo, "inputData[%zu] = %u", i, inputData[i]);
-    LOG(kDefLog, kInfo, "outputData[%zu] = %u", i, outputData[i]);
+    //LOG(kDefLog, kInfo, "inputData[%zu] = %u", i, inputData[i]);
+    //LOG(kDefLog, kInfo, "outputData[%zu] = %u", i, outputData[i]);
     assert(outputData[i] == inputData[i]);
   }
   LOG(kDefLog, kInfo, "testToCPUWithUint8 passed.");
@@ -360,8 +360,8 @@ void testToCPUWithUint16() {
 
   // Verify the output matches the input.
   for (size_t i = 0; i < N; ++i) {
-    LOG(kDefLog, kInfo, "inputData[%zu] = %u", i, inputData[i]);
-    LOG(kDefLog, kInfo, "outputData[%zu] = %u", i, outputData[i]);
+    //LOG(kDefLog, kInfo, "inputData[%zu] = %u", i, inputData[i]);
+    //LOG(kDefLog, kInfo, "outputData[%zu] = %u", i, outputData[i]);
     assert(outputData[i] == inputData[i]);
   }
   LOG(kDefLog, kInfo, "testToCPUWithUint16 passed.");
@@ -392,8 +392,8 @@ void testToCPUWithUint32() {
 
   // Verify the output matches the input.
   for (size_t i = 0; i < N; ++i) {
-    LOG(kDefLog, kInfo, "inputData[%zu] = %u", i, inputData[i]);
-    LOG(kDefLog, kInfo, "outputData[%zu] = %u", i, outputData[i]);
+    //LOG(kDefLog, kInfo, "inputData[%zu] = %u", i, inputData[i]);
+    //LOG(kDefLog, kInfo, "outputData[%zu] = %u", i, outputData[i]);
     assert(outputData[i] == inputData[i]);
   }
   LOG(kDefLog, kInfo, "testToCPUWithUint32 passed.");
@@ -462,8 +462,8 @@ void testToCPUWithTensor() {
 
   // Verify the output matches the input.
   for (size_t i = 0; i < N; ++i) {
-    LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]);
-    LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
+    //LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]);
+    //LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
     assert(outputData[i] == inputData[i]);
   }
   LOG(kDefLog, kInfo, "testToCPUWithTensor passed.");
@@ -500,7 +500,7 @@ void testToCPUWithBuffer() {
 
   // Verify that the CPU output matches the original data.
   for (size_t i = 0; i < N; ++i) {
-    LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
+    //LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
     assert(outputData[i] == data[i]);
   }
   LOG(kDefLog, kInfo, "testToCPUWithBuffer passed.");
@@ -542,8 +542,8 @@ void testToCPUWithTensorSourceOffset() {
   for (size_t i = 0; i < copyCount; ++i) {
     float expected = inputData[sourceOffsetElements + i];
     float actual = cpuOutput[i];
-    LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
-    LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
+    //LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
+    //LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
     assert(expected == actual);
   }
   LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed.");
@@ -585,8 +585,8 @@ void testToCPUWithBufferSourceOffset() {
   for (size_t i = 0; i < copyCount; ++i) {
     float expected = inputData[sourceOffsetElements + i];
     float actual = cpuOutput[i];
-    LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
-    LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
+    //LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
+    //LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
     assert(expected == actual);
   }
   LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed.");