64-bit fixes

MichealReed · MichealReed · commit b99d6bf67924 · 2025-04-21T22:13:54.000-05:00
diff --git a/gpu.hpp b/gpu.hpp
@@ -1953,7 +1953,7 @@ inline void toCPU(Context &ctx, WGPUBuffer buffer, NumType dtype, void *output,
   case ki8: {
     size_t packedCount = (numElements + 3) / 4;
     std::vector<int32_t> tmp(packedCount);
-    toCPU(ctx, buffer, tmp.data(), packedCount * sizeof(int32_t), sourceOffset);
+    toCPU(ctx, buffer, tmp.data(), tmp.size() * sizeof(int32_t), sourceOffset);
     int8_t *dst = static_cast<int8_t *>(output);
     for (size_t i = 0; i < numElements; ++i) {
       size_t idx = i / 4;
@@ -2074,8 +2074,7 @@ inline void toGPU(Context &ctx, const half *data, WGPUBuffer buffer,
 
 // Overload for double: bit-pack each double into two 32‑bit unsigned integers.
 inline void toGPU(Context &ctx, const double *data, WGPUBuffer buffer,
-                  size_t size) {
-  size_t numElements = size / sizeof(double);
+                  size_t numElements) {
   std::vector<uint32_t> packed(numElements * 2);
   for (size_t i = 0; i < numElements; ++i) {
     uint64_t bits;
@@ -2089,23 +2088,22 @@ inline void toGPU(Context &ctx, const double *data, WGPUBuffer buffer,
 
 // Overload for int8_t: pack four 8‑bit ints into one 32‑bit integer.
 inline void toGPU(Context &ctx, const int8_t *data, WGPUBuffer buffer,
-                  size_t size) {
+                  size_t numElements) {
   // numElements is the count of int8_t values; four are packed per 32-bit word.
-  size_t numElements = size;
   size_t packedCount = (numElements + 3) / 4;
   std::vector<int32_t> packed(packedCount, 0);
   for (size_t i = 0; i < numElements; ++i) {
     size_t idx = i / 4;
     size_t shift = (i % 4) * 8;
     packed[idx] |= (static_cast<uint8_t>(data[i]) << shift);
+    //LOG(kDefLog, kInfo, "toGPU: %d %d %d", data[i], packed[idx], idx);
   }
   toGPU(ctx, packed.data(), buffer, packedCount * sizeof(int32_t));
 }
 
 // Overload for int16_t: pack two 16‑bit ints into one 32‑bit integer.
 inline void toGPU(Context &ctx, const int16_t *data, WGPUBuffer buffer,
-                  size_t size) {
-  size_t numElements = size / sizeof(int16_t);
+                  size_t numElements) {
   size_t packedCount = (numElements + 1) / 2;
   std::vector<int32_t> packed(packedCount, 0);
   for (size_t i = 0; i < numElements; ++i) {
@@ -2118,8 +2116,7 @@ inline void toGPU(Context &ctx, const int16_t *data, WGPUBuffer buffer,
 
 // Overload for int64_t: pack each 64‑bit int into two 32‑bit integers.
 inline void toGPU(Context &ctx, const int64_t *data, WGPUBuffer buffer,
-                  size_t size) {
-  size_t numElements = size / sizeof(int64_t);
+                  size_t numElements) {
   std::vector<int32_t> packed(numElements * 2);
   for (size_t i = 0; i < numElements; ++i) {
     int64_t val = data[i];
@@ -2131,8 +2128,7 @@ inline void toGPU(Context &ctx, const int64_t *data, WGPUBuffer buffer,
 
 // Overload for uint8_t: pack four 8‑bit uints into one 32‑bit unsigned integer.
 inline void toGPU(Context &ctx, const uint8_t *data, WGPUBuffer buffer,
-                  size_t size) {
-  size_t numElements = size; // sizeof(uint8_t)==1
+                  size_t numElements) {
   size_t packedCount = (numElements + 3) / 4;
   std::vector<uint32_t> packed(packedCount, 0);
   for (size_t i = 0; i < numElements; ++i) {
@@ -2146,8 +2142,7 @@ inline void toGPU(Context &ctx, const uint8_t *data, WGPUBuffer buffer,
 // Overload for uint16_t: pack two 16‑bit uints into one 32‑bit unsigned
 // integer.
 inline void toGPU(Context &ctx, const uint16_t *data, WGPUBuffer buffer,
-                  size_t size) {
-  size_t numElements = size / sizeof(uint16_t);
+                  size_t numElements) {
   size_t packedCount = (numElements + 1) / 2;
   std::vector<uint32_t> packed(packedCount, 0);
   for (size_t i = 0; i < numElements; ++i) {
@@ -2161,8 +2156,7 @@ inline void toGPU(Context &ctx, const uint16_t *data, WGPUBuffer buffer,
 // Overload for uint64_t: pack each 64‑bit uint into two 32‑bit unsigned
 // integers.
 inline void toGPU(Context &ctx, const uint64_t *data, WGPUBuffer buffer,
-                  size_t size) {
-  size_t numElements = size / sizeof(uint64_t);
+                  size_t numElements) {
   std::vector<uint32_t> packed(numElements * 2);
   for (size_t i = 0; i < numElements; ++i) {
     uint64_t val = data[i];
diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp
@@ -145,26 +145,26 @@ void testAddKernelInt8();
 int main() {
   LOG(kDefLog, kInfo, "Running GPU integration tests...");
   testAddKernelInt8();
-  // testCopyShaderPackedUnpack_int8();
-  // testToCPUUnpack();
-  // testToCPUWithTensor();
-  // testToCPUWithBuffer();
-  // testToCPUWithTensorSourceOffset();
-  // testToCPUWithBufferSourceOffset();
-  // testToCPUWithHalf();
-  // testToCPUWithFloat();
-  // testToCPUWithDouble();
-  // testToCPUWithint8();
-  // testToCPUWithint16();
-  // testToCPUWithint();
-  // testToCPUWithint64();
-  // testToCPUWithUint8();
-  // testToCPUWithUint16();
-  // testToCPUWithUint32();
-  // testToCPUWithUint64();
-  // testNumTypeSizes();
-  // stressTestToCPU();
-  // testHalf();
+  testCopyShaderPackedUnpack_int8();
+  testToCPUUnpack();
+  testToCPUWithTensor();
+  testToCPUWithBuffer();
+  testToCPUWithTensorSourceOffset();
+  testToCPUWithBufferSourceOffset();
+  testToCPUWithHalf();
+  testToCPUWithFloat();
+  testToCPUWithDouble();
+  testToCPUWithint8();
+  testToCPUWithint16();
+  testToCPUWithint();
+  testToCPUWithint64();
+  testToCPUWithUint8();
+  testToCPUWithUint16();
+  testToCPUWithUint32();
+  testToCPUWithUint64();
+  testNumTypeSizes();
+  stressTestToCPU();
+  testHalf();
   LOG(kDefLog, kInfo, "All tests passed.");
   return 0;
 }