
Commit 9745c77

typed toGPU for packing
1 parent a61dfc3 · commit 9745c77

File tree

1 file changed: +181 -73 lines changed


gpu.hpp

Lines changed: 181 additions & 73 deletions
@@ -2027,123 +2027,231 @@ inline void toGPU(Context &ctx, const void *data, WGPUBuffer buffer,
   wgpuQueueWriteBuffer(ctx.queue, buffer, 0, data, size);
 }

-/**
- * @brief Overload of the toGPU function to copy data from CPU memory to a GPU
- * taking a Tensor instance instead of a WGPUBuffer instance.
- * @param[in] ctx Context instance to manage the operation
- * @param[in] data Pointer to the CPU memory to copy from
- * @param[in] tensor Tensor instance representing the GPU buffer to copy to
- *
- * @code
- * toGPU(ctx, data, tensor);
- * @endcode
- */
-inline void toGPU(Context &ctx, const float *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                       tensor.data.size);
-}
-
-inline void toGPU(Context &ctx, const half *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                       tensor.data.size);
-}
-
-inline void toGPU(Context &ctx, const double *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                       tensor.data.size);
-}
-
-inline void toGPU(Context &ctx, const float *data, Tensor &tensor,
+// Overload for float: directly copy the float data.
+inline void toGPU(Context &ctx, const float *data, WGPUBuffer buffer,
                   size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
-}
-
-inline void toGPU(Context &ctx, const half *data, Tensor &tensor, size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+  toGPU(ctx, static_cast<const void *>(data), buffer, size);
 }

-inline void toGPU(Context &ctx, const double *data, Tensor &tensor,
+// Overload for half: directly copy the half data.
+inline void toGPU(Context &ctx, const half *data, WGPUBuffer buffer,
                   size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
-}
-
-inline void toGPU(Context &ctx, const uint8_t *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                       tensor.data.size);
+  toGPU(ctx, static_cast<const void *>(data), buffer, size);
 }

-inline void toGPU(Context &ctx, const uint16_t *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                       tensor.data.size);
+// Overload for double: pack each double into a float (losing precision).
+inline void toGPU(Context &ctx, const double *data, WGPUBuffer buffer,
+                  size_t size) {
+  // Number of doubles = size / sizeof(double)
+  size_t numElements = size / sizeof(double);
+  std::vector<float> packed(numElements);
+  for (size_t i = 0; i < numElements; ++i) {
+    packed[i] = static_cast<float>(data[i]);
+  }
+  toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(float));
 }

-inline void toGPU(Context &ctx, const uint32_t *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                       tensor.data.size);
+// Overload for int8_t: pack four 8-bit ints into one 32-bit integer.
+inline void toGPU(Context &ctx, const int8_t *data, WGPUBuffer buffer,
+                  size_t size) {
+  // Number of int8_t elements equals size (sizeof(int8_t)==1)
+  size_t numElements = size;
+  size_t packedCount = (numElements + 3) / 4;
+  std::vector<int32_t> packed(packedCount, 0);
+  for (size_t i = 0; i < numElements; ++i) {
+    size_t idx = i / 4;
+    size_t shift = (i % 4) * 8;
+    packed[idx] |= (static_cast<uint8_t>(data[i]) << shift);
+  }
+  toGPU(ctx, packed.data(), buffer, packedCount * sizeof(int32_t));
 }

-inline void toGPU(Context &ctx, const uint64_t *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                       tensor.data.size);
+// Overload for int16_t: pack two 16-bit ints into one 32-bit integer.
+inline void toGPU(Context &ctx, const int16_t *data, WGPUBuffer buffer,
+                  size_t size) {
+  size_t numElements = size / sizeof(int16_t);
+  size_t packedCount = (numElements + 1) / 2;
+  std::vector<int32_t> packed(packedCount, 0);
+  for (size_t i = 0; i < numElements; ++i) {
+    size_t idx = i / 2;
+    size_t shift = (i % 2) * 16;
+    packed[idx] |= (static_cast<uint16_t>(data[i]) << shift);
+  }
+  toGPU(ctx, packed.data(), buffer, packedCount * sizeof(int32_t));
 }

-inline void toGPU(Context &ctx, const uint8_t *data, Tensor &tensor,
+// Overload for int64_t: pack each 64-bit int into two 32-bit integers.
+inline void toGPU(Context &ctx, const int64_t *data, WGPUBuffer buffer,
                   size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+  size_t numElements = size / sizeof(int64_t);
+  std::vector<int32_t> packed(numElements * 2);
+  for (size_t i = 0; i < numElements; ++i) {
+    int64_t val = data[i];
+    packed[2 * i] = static_cast<int32_t>(val & 0xFFFFFFFF);
+    packed[2 * i + 1] = static_cast<int32_t>((val >> 32) & 0xFFFFFFFF);
+  }
+  toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(int32_t));
 }

-inline void toGPU(Context &ctx, const uint16_t *data, Tensor &tensor,
+// Overload for uint8_t: pack four 8-bit uints into one 32-bit unsigned integer.
+inline void toGPU(Context &ctx, const uint8_t *data, WGPUBuffer buffer,
                   size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+  size_t numElements = size; // sizeof(uint8_t)==1
+  size_t packedCount = (numElements + 3) / 4;
+  std::vector<uint32_t> packed(packedCount, 0);
+  for (size_t i = 0; i < numElements; ++i) {
+    size_t idx = i / 4;
+    size_t shift = (i % 4) * 8;
+    packed[idx] |= (static_cast<uint32_t>(data[i]) << shift);
+  }
+  toGPU(ctx, packed.data(), buffer, packedCount * sizeof(uint32_t));
 }

-inline void toGPU(Context &ctx, const uint32_t *data, Tensor &tensor,
+// Overload for uint16_t: pack two 16-bit uints into one 32-bit unsigned
+// integer.
+inline void toGPU(Context &ctx, const uint16_t *data, WGPUBuffer buffer,
                   size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+  size_t numElements = size / sizeof(uint16_t);
+  size_t packedCount = (numElements + 1) / 2;
+  std::vector<uint32_t> packed(packedCount, 0);
+  for (size_t i = 0; i < numElements; ++i) {
+    size_t idx = i / 2;
+    size_t shift = (i % 2) * 16;
+    packed[idx] |= (static_cast<uint32_t>(data[i]) << shift);
+  }
+  toGPU(ctx, packed.data(), buffer, packedCount * sizeof(uint32_t));
 }

-inline void toGPU(Context &ctx, const uint64_t *data, Tensor &tensor,
+// Overload for uint64_t: pack each 64-bit uint into two 32-bit unsigned
+// integers.
+inline void toGPU(Context &ctx, const uint64_t *data, WGPUBuffer buffer,
                   size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+  size_t numElements = size / sizeof(uint64_t);
+  std::vector<uint32_t> packed(numElements * 2);
+  for (size_t i = 0; i < numElements; ++i) {
+    uint64_t val = data[i];
+    packed[2 * i] = static_cast<uint32_t>(val & 0xFFFFFFFF);
+    packed[2 * i + 1] = static_cast<uint32_t>(val >> 32);
+  }
+  toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(uint32_t));
 }

-inline void toGPU(Context &ctx, const int8_t *data, Tensor &tensor) {
+/**
+ * @brief Overload of the toGPU function to copy data from CPU memory to a GPU
+ * taking a Tensor instance instead of a WGPUBuffer instance.
+ * @param[in] ctx Context instance to manage the operation
+ * @param[in] data Pointer to the CPU memory to copy from
+ * @param[in] tensor Tensor instance representing the GPU buffer to copy to
+ *
+ * @code
+ * toGPU(ctx, data, tensor);
+ * @endcode
+ */
+inline void toGPU(Context &ctx, const float *data, Tensor &tensor) {
   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
                        tensor.data.size);
 }

-inline void toGPU(Context &ctx, const int16_t *data, Tensor &tensor) {
+inline void toGPU(Context &ctx, const half *data, Tensor &tensor) {
   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
                        tensor.data.size);
 }

-inline void toGPU(Context &ctx, const int *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
+// Overload for double: pack each double into a float (losing precision)
+inline void toGPU(Context &ctx, const double *data, Tensor &tensor) {
+  size_t numElements = size(tensor.shape);
+  std::vector<float> packed(numElements);
+  for (size_t i = 0; i < numElements; ++i) {
+    packed[i] = static_cast<float>(data[i]);
+  }
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
                        tensor.data.size);
 }

-inline void toGPU(Context &ctx, const int64_t *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
+// Overload for int8_t: pack four 8-bit integers into one 32-bit integer
+inline void toGPU(Context &ctx, const int8_t *data, Tensor &tensor) {
+  size_t numElements = size(tensor.shape);
+  size_t packedCount = (numElements + 3) / 4;
+  std::vector<int32_t> packed(packedCount, 0);
+  for (size_t i = 0; i < numElements; ++i) {
+    size_t idx = i / 4;
+    size_t shift = (i % 4) * 8;
+    // Pack as unsigned then reinterpret (shader will unpack)
+    packed[idx] |= (static_cast<uint8_t>(data[i]) << shift);
+  }
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
                        tensor.data.size);
 }

-inline void toGPU(Context &ctx, const int8_t *data, Tensor &tensor,
-                  size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+// Overload for int16_t: pack two 16-bit integers into one 32-bit integer
+inline void toGPU(Context &ctx, const int16_t *data, Tensor &tensor) {
+  size_t numElements = size(tensor.shape);
+  size_t packedCount = (numElements + 1) / 2;
+  std::vector<int32_t> packed(packedCount, 0);
+  for (size_t i = 0; i < numElements; ++i) {
+    size_t idx = i / 2;
+    size_t shift = (i % 2) * 16;
+    packed[idx] |= (static_cast<uint16_t>(data[i]) << shift);
+  }
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
+                       tensor.data.size);
 }

-inline void toGPU(Context &ctx, const int16_t *data, Tensor &tensor,
-                  size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+// Overload for int64_t: pack each 64-bit integer into two 32-bit integers
+inline void toGPU(Context &ctx, const int64_t *data, Tensor &tensor) {
+  size_t numElements = size(tensor.shape);
+  std::vector<int32_t> packed(numElements * 2);
+  for (size_t i = 0; i < numElements; ++i) {
+    int64_t val = data[i];
+    packed[2 * i] = static_cast<int32_t>(val & 0xFFFFFFFF);
+    packed[2 * i + 1] = static_cast<int32_t>((val >> 32) & 0xFFFFFFFF);
+  }
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
+                       tensor.data.size);
 }

-inline void toGPU(Context &ctx, const int *data, Tensor &tensor, size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+// Overload for uint8_t: pack four 8-bit unsigned integers into one 32-bit
+// unsigned
+inline void toGPU(Context &ctx, const uint8_t *data, Tensor &tensor) {
+  size_t numElements = size(tensor.shape);
+  size_t packedCount = (numElements + 3) / 4;
+  std::vector<uint32_t> packed(packedCount, 0);
+  for (size_t i = 0; i < numElements; ++i) {
+    size_t idx = i / 4;
+    size_t shift = (i % 4) * 8;
+    packed[idx] |= (static_cast<uint32_t>(data[i]) << shift);
+  }
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
+                       tensor.data.size);
 }

-inline void toGPU(Context &ctx, const int64_t *data, Tensor &tensor,
-                  size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+// Overload for uint16_t: pack two 16-bit unsigned integers into one 32-bit
+// unsigned
+inline void toGPU(Context &ctx, const uint16_t *data, Tensor &tensor) {
+  size_t numElements = size(tensor.shape);
+  size_t packedCount = (numElements + 1) / 2;
+  std::vector<uint32_t> packed(packedCount, 0);
+  for (size_t i = 0; i < numElements; ++i) {
+    size_t idx = i / 2;
+    size_t shift = (i % 2) * 16;
+    packed[idx] |= (static_cast<uint32_t>(data[i]) << shift);
+  }
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
+                       tensor.data.size);
+}
+
+// Overload for uint64_t: pack each 64-bit unsigned integer into two 32-bit
+// unsigned
+inline void toGPU(Context &ctx, const uint64_t *data, Tensor &tensor) {
+  size_t numElements = size(tensor.shape);
+  std::vector<uint32_t> packed(numElements * 2);
+  for (size_t i = 0; i < numElements; ++i) {
+    uint64_t val = data[i];
+    packed[2 * i] = static_cast<uint32_t>(val & 0xFFFFFFFF);
+    packed[2 * i + 1] = static_cast<uint32_t>(val >> 32);
+  }
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
+                       tensor.data.size);
 }

 template <typename Params>
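With these typed overloads, callers pass their natural CPU-side element type and the conversion or bit-packing happens inside toGPU before the queue write. A minimal usage sketch follows; it is not part of this commit and assumes the library's usual createContext / createTensor / Shape / kf32 helpers as shown in the gpu.cpp README (the surrounding setup may differ in your build):

// usage_sketch.cpp -- illustrative only, not from gpu.hpp.
#include <array>
#include "gpu.hpp"

using namespace gpu;

int main() {
  Context ctx = createContext();

  // double input: the double overload converts each value to float before
  // writing, so a 4-element f32 tensor is the matching destination.
  std::array<double, 4> weights = {0.1, 0.2, 0.3, 0.4};
  Tensor w = createTensor(ctx, Shape{4}, kf32);
  toGPU(ctx, weights.data(), w);

  // float input: resolves to the float overload, a plain byte-for-byte copy.
  std::array<float, 4> bias = {1.0f, 2.0f, 3.0f, 4.0f};
  Tensor b = createTensor(ctx, Shape{4}, kf32);
  toGPU(ctx, bias.data(), b);

  return 0;
}

The integer overloads are called the same way; the difference is that they upload whole 32-bit words, so the destination buffer should be sized for the packed payload rather than for the raw 8- or 16-bit elements (a sizing sketch is given at the end of this page).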

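The byte layout produced by the integer packing can be verified without a GPU. The sketch below is not from the commit; it stores the words as uint32_t so all shifts stay well-defined, but the bit pattern is exactly what the int8_t overloads above write: element i lands in byte i % 4 of word i / 4, and 64-bit values are split into a low word followed by a high word.

// pack_layout_check.cpp -- standalone sketch mirroring the packing above.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Four int8_t values per 32-bit word, element i in byte (i % 4).
std::vector<uint32_t> packInt8(const std::vector<int8_t> &v) {
  std::vector<uint32_t> packed((v.size() + 3) / 4, 0);
  for (std::size_t i = 0; i < v.size(); ++i) {
    packed[i / 4] |= static_cast<uint32_t>(static_cast<uint8_t>(v[i]))
                     << ((i % 4) * 8);
  }
  return packed;
}

// Recover element i by shifting its byte back out and narrowing to int8_t --
// the same arithmetic a shader would apply to the packed words.
int8_t unpackInt8(const std::vector<uint32_t> &packed, std::size_t i) {
  uint32_t byte = (packed[i / 4] >> ((i % 4) * 8)) & 0xFFu;
  return static_cast<int8_t>(static_cast<uint8_t>(byte));
}

int main() {
  std::vector<int8_t> v = {-1, 0, 127, -128, 5};
  std::vector<uint32_t> packed = packInt8(v);
  assert(packed.size() == 2); // ceil(5 / 4) words
  for (std::size_t i = 0; i < v.size(); ++i) {
    assert(unpackInt8(packed, i) == v[i]);
  }

  // 64-bit values: low 32-bit word first, then the high word, matching the
  // int64_t / uint64_t overloads.
  uint64_t x = 0x0123456789ABCDEFULL;
  uint32_t lo = static_cast<uint32_t>(x & 0xFFFFFFFF);
  uint32_t hi = static_cast<uint32_t>(x >> 32);
  assert(((static_cast<uint64_t>(hi) << 32) | lo) == x);
  return 0;
}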
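Because the sub-32-bit overloads round the element count up to whole 32-bit words, the number of bytes written can differ from the raw input size. A small constexpr helper, again only a sketch and not part of gpu.hpp, makes the integer sizing explicit; note that double input is the one case that shrinks instead, since each 8-byte double becomes a single 4-byte float.

// packed_size_sketch.cpp -- illustrative sizing for the integer overloads.
#include <cstddef>

// Bytes written to the GPU for n source elements of width elemBytes
// (1, 2, 4, or 8). Sub-word types are rounded up to whole 32-bit words;
// 8-byte integers become two 32-bit words, so their byte count is unchanged.
constexpr std::size_t packedByteSize(std::size_t n, std::size_t elemBytes) {
  return elemBytes >= 4
             ? n * elemBytes
             : ((n + (4 / elemBytes) - 1) / (4 / elemBytes)) * 4;
}

static_assert(packedByteSize(5, 1) == 8, "5 x int8_t  -> two 32-bit words");
static_assert(packedByteSize(3, 2) == 8, "3 x int16_t -> two 32-bit words");
static_assert(packedByteSize(7, 8) == 56, "7 x int64_t -> fourteen 32-bit words");

int main() { return 0; }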