
Commit 9745c77

typed toGPU for packing
1 parent a61dfc3 · commit 9745c77

File tree

1 file changed: +181 -73 lines changed


gpu.hpp

Lines changed: 181 additions & 73 deletions
@@ -2027,123 +2027,231 @@ inline void toGPU(Context &ctx, const void *data, WGPUBuffer buffer,
   wgpuQueueWriteBuffer(ctx.queue, buffer, 0, data, size);
 }

-/**
- * @brief Overload of the toGPU function to copy data from CPU memory to a GPU
- * taking a Tensor instance instead of a WGPUBuffer instance.
- * @param[in] ctx Context instance to manage the operation
- * @param[in] data Pointer to the CPU memory to copy from
- * @param[in] tensor Tensor instance representing the GPU buffer to copy to
- *
- * @code
- * toGPU(ctx, data, tensor);
- * @endcode
- */
-inline void toGPU(Context &ctx, const float *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                       tensor.data.size);
-}
-
-inline void toGPU(Context &ctx, const half *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                       tensor.data.size);
-}
-
-inline void toGPU(Context &ctx, const double *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                       tensor.data.size);
-}
-
-inline void toGPU(Context &ctx, const float *data, Tensor &tensor,
+// Overload for float: directly copy the float data.
+inline void toGPU(Context &ctx, const float *data, WGPUBuffer buffer,
                   size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
-}
-
-inline void toGPU(Context &ctx, const half *data, Tensor &tensor, size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+  toGPU(ctx, static_cast<const void *>(data), buffer, size);
 }

-inline void toGPU(Context &ctx, const double *data, Tensor &tensor,
+// Overload for half: directly copy the half data.
+inline void toGPU(Context &ctx, const half *data, WGPUBuffer buffer,
                   size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
-}
-
-inline void toGPU(Context &ctx, const uint8_t *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                       tensor.data.size);
+  toGPU(ctx, static_cast<const void *>(data), buffer, size);
 }

-inline void toGPU(Context &ctx, const uint16_t *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                       tensor.data.size);
+// Overload for double: pack each double into a float (losing precision).
+inline void toGPU(Context &ctx, const double *data, WGPUBuffer buffer,
+                  size_t size) {
+  // Number of doubles = size / sizeof(double)
+  size_t numElements = size / sizeof(double);
+  std::vector<float> packed(numElements);
+  for (size_t i = 0; i < numElements; ++i) {
+    packed[i] = static_cast<float>(data[i]);
+  }
+  toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(float));
 }

-inline void toGPU(Context &ctx, const uint32_t *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                       tensor.data.size);
+// Overload for int8_t: pack four 8-bit ints into one 32-bit integer.
+inline void toGPU(Context &ctx, const int8_t *data, WGPUBuffer buffer,
+                  size_t size) {
+  // Number of int8_t elements equals size (sizeof(int8_t)==1)
+  size_t numElements = size;
+  size_t packedCount = (numElements + 3) / 4;
+  std::vector<int32_t> packed(packedCount, 0);
+  for (size_t i = 0; i < numElements; ++i) {
+    size_t idx = i / 4;
+    size_t shift = (i % 4) * 8;
+    packed[idx] |= (static_cast<uint8_t>(data[i]) << shift);
+  }
+  toGPU(ctx, packed.data(), buffer, packedCount * sizeof(int32_t));
 }

-inline void toGPU(Context &ctx, const uint64_t *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                       tensor.data.size);
+// Overload for int16_t: pack two 16-bit ints into one 32-bit integer.
+inline void toGPU(Context &ctx, const int16_t *data, WGPUBuffer buffer,
+                  size_t size) {
+  size_t numElements = size / sizeof(int16_t);
+  size_t packedCount = (numElements + 1) / 2;
+  std::vector<int32_t> packed(packedCount, 0);
+  for (size_t i = 0; i < numElements; ++i) {
+    size_t idx = i / 2;
+    size_t shift = (i % 2) * 16;
+    packed[idx] |= (static_cast<uint16_t>(data[i]) << shift);
+  }
+  toGPU(ctx, packed.data(), buffer, packedCount * sizeof(int32_t));
 }

-inline void toGPU(Context &ctx, const uint8_t *data, Tensor &tensor,
+// Overload for int64_t: pack each 64-bit int into two 32-bit integers.
+inline void toGPU(Context &ctx, const int64_t *data, WGPUBuffer buffer,
                   size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+  size_t numElements = size / sizeof(int64_t);
+  std::vector<int32_t> packed(numElements * 2);
+  for (size_t i = 0; i < numElements; ++i) {
+    int64_t val = data[i];
+    packed[2 * i] = static_cast<int32_t>(val & 0xFFFFFFFF);
+    packed[2 * i + 1] = static_cast<int32_t>((val >> 32) & 0xFFFFFFFF);
+  }
+  toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(int32_t));
 }

-inline void toGPU(Context &ctx, const uint16_t *data, Tensor &tensor,
+// Overload for uint8_t: pack four 8-bit uints into one 32-bit unsigned integer.
+inline void toGPU(Context &ctx, const uint8_t *data, WGPUBuffer buffer,
                   size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+  size_t numElements = size; // sizeof(uint8_t)==1
+  size_t packedCount = (numElements + 3) / 4;
+  std::vector<uint32_t> packed(packedCount, 0);
+  for (size_t i = 0; i < numElements; ++i) {
+    size_t idx = i / 4;
+    size_t shift = (i % 4) * 8;
+    packed[idx] |= (static_cast<uint32_t>(data[i]) << shift);
+  }
+  toGPU(ctx, packed.data(), buffer, packedCount * sizeof(uint32_t));
 }

-inline void toGPU(Context &ctx, const uint32_t *data, Tensor &tensor,
+// Overload for uint16_t: pack two 16-bit uints into one 32-bit unsigned
+// integer.
+inline void toGPU(Context &ctx, const uint16_t *data, WGPUBuffer buffer,
                   size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+  size_t numElements = size / sizeof(uint16_t);
+  size_t packedCount = (numElements + 1) / 2;
+  std::vector<uint32_t> packed(packedCount, 0);
+  for (size_t i = 0; i < numElements; ++i) {
+    size_t idx = i / 2;
+    size_t shift = (i % 2) * 16;
+    packed[idx] |= (static_cast<uint32_t>(data[i]) << shift);
+  }
+  toGPU(ctx, packed.data(), buffer, packedCount * sizeof(uint32_t));
 }

-inline void toGPU(Context &ctx, const uint64_t *data, Tensor &tensor,
+// Overload for uint64_t: pack each 64-bit uint into two 32-bit unsigned
+// integers.
+inline void toGPU(Context &ctx, const uint64_t *data, WGPUBuffer buffer,
                   size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+  size_t numElements = size / sizeof(uint64_t);
+  std::vector<uint32_t> packed(numElements * 2);
+  for (size_t i = 0; i < numElements; ++i) {
+    uint64_t val = data[i];
+    packed[2 * i] = static_cast<uint32_t>(val & 0xFFFFFFFF);
+    packed[2 * i + 1] = static_cast<uint32_t>(val >> 32);
+  }
+  toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(uint32_t));
 }

-inline void toGPU(Context &ctx, const int8_t *data, Tensor &tensor) {
+/**
+ * @brief Overload of the toGPU function to copy data from CPU memory to a GPU
+ * taking a Tensor instance instead of a WGPUBuffer instance.
+ * @param[in] ctx Context instance to manage the operation
+ * @param[in] data Pointer to the CPU memory to copy from
+ * @param[in] tensor Tensor instance representing the GPU buffer to copy to
+ *
+ * @code
+ * toGPU(ctx, data, tensor);
+ * @endcode
+ */
+inline void toGPU(Context &ctx, const float *data, Tensor &tensor) {
   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
                        tensor.data.size);
 }

-inline void toGPU(Context &ctx, const int16_t *data, Tensor &tensor) {
+inline void toGPU(Context &ctx, const half *data, Tensor &tensor) {
   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
                        tensor.data.size);
 }

-inline void toGPU(Context &ctx, const int *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
+// Overload for double: pack each double into a float (losing precision)
+inline void toGPU(Context &ctx, const double *data, Tensor &tensor) {
+  size_t numElements = size(tensor.shape);
+  std::vector<float> packed(numElements);
+  for (size_t i = 0; i < numElements; ++i) {
+    packed[i] = static_cast<float>(data[i]);
+  }
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
                        tensor.data.size);
 }

-inline void toGPU(Context &ctx, const int64_t *data, Tensor &tensor) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
+// Overload for int8_t: pack four 8-bit integers into one 32-bit integer
+inline void toGPU(Context &ctx, const int8_t *data, Tensor &tensor) {
+  size_t numElements = size(tensor.shape);
+  size_t packedCount = (numElements + 3) / 4;
+  std::vector<int32_t> packed(packedCount, 0);
+  for (size_t i = 0; i < numElements; ++i) {
+    size_t idx = i / 4;
+    size_t shift = (i % 4) * 8;
+    // Pack as unsigned then reinterpret (shader will unpack)
+    packed[idx] |= (static_cast<uint8_t>(data[i]) << shift);
+  }
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
                        tensor.data.size);
 }

-inline void toGPU(Context &ctx, const int8_t *data, Tensor &tensor,
-                  size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+// Overload for int16_t: pack two 16-bit integers into one 32-bit integer
+inline void toGPU(Context &ctx, const int16_t *data, Tensor &tensor) {
+  size_t numElements = size(tensor.shape);
+  size_t packedCount = (numElements + 1) / 2;
+  std::vector<int32_t> packed(packedCount, 0);
+  for (size_t i = 0; i < numElements; ++i) {
+    size_t idx = i / 2;
+    size_t shift = (i % 2) * 16;
+    packed[idx] |= (static_cast<uint16_t>(data[i]) << shift);
+  }
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
+                       tensor.data.size);
 }

-inline void toGPU(Context &ctx, const int16_t *data, Tensor &tensor,
-                  size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+// Overload for int64_t: pack each 64-bit integer into two 32-bit integers
+inline void toGPU(Context &ctx, const int64_t *data, Tensor &tensor) {
+  size_t numElements = size(tensor.shape);
+  std::vector<int32_t> packed(numElements * 2);
+  for (size_t i = 0; i < numElements; ++i) {
+    int64_t val = data[i];
+    packed[2 * i] = static_cast<int32_t>(val & 0xFFFFFFFF);
+    packed[2 * i + 1] = static_cast<int32_t>((val >> 32) & 0xFFFFFFFF);
+  }
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
+                       tensor.data.size);
 }

-inline void toGPU(Context &ctx, const int *data, Tensor &tensor, size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+// Overload for uint8_t: pack four 8-bit unsigned integers into one 32-bit
+// unsigned
+inline void toGPU(Context &ctx, const uint8_t *data, Tensor &tensor) {
+  size_t numElements = size(tensor.shape);
+  size_t packedCount = (numElements + 3) / 4;
+  std::vector<uint32_t> packed(packedCount, 0);
+  for (size_t i = 0; i < numElements; ++i) {
+    size_t idx = i / 4;
+    size_t shift = (i % 4) * 8;
+    packed[idx] |= (static_cast<uint32_t>(data[i]) << shift);
+  }
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
+                       tensor.data.size);
 }

-inline void toGPU(Context &ctx, const int64_t *data, Tensor &tensor,
-                  size_t size) {
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+// Overload for uint16_t: pack two 16-bit unsigned integers into one 32-bit
+// unsigned
+inline void toGPU(Context &ctx, const uint16_t *data, Tensor &tensor) {
+  size_t numElements = size(tensor.shape);
+  size_t packedCount = (numElements + 1) / 2;
+  std::vector<uint32_t> packed(packedCount, 0);
+  for (size_t i = 0; i < numElements; ++i) {
+    size_t idx = i / 2;
+    size_t shift = (i % 2) * 16;
+    packed[idx] |= (static_cast<uint32_t>(data[i]) << shift);
+  }
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
+                       tensor.data.size);
+}
+
+// Overload for uint64_t: pack each 64-bit unsigned integer into two 32-bit
+// unsigned
+inline void toGPU(Context &ctx, const uint64_t *data, Tensor &tensor) {
+  size_t numElements = size(tensor.shape);
+  std::vector<uint32_t> packed(numElements * 2);
+  for (size_t i = 0; i < numElements; ++i) {
+    uint64_t val = data[i];
+    packed[2 * i] = static_cast<uint32_t>(val & 0xFFFFFFFF);
+    packed[2 * i + 1] = static_cast<uint32_t>(val >> 32);
+  }
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
+                       tensor.data.size);
 }

 template <typename Params>
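With these typed overloads, callers pass their natural CPU-side element type and the conversion or bit-packing happens inside toGPU before the queue write. A minimal usage sketch follows; it is not part of this commit and assumes the library's usual createContext / createTensor / Shape / kf32 helpers as shown in the gpu.cpp README (the surrounding setup may differ in your build):

// usage_sketch.cpp -- illustrative only, not from gpu.hpp.
#include <array>
#include "gpu.hpp"

using namespace gpu;

int main() {
  Context ctx = createContext();

  // double input: the double overload converts each value to float before
  // writing, so a 4-element f32 tensor is the matching destination.
  std::array<double, 4> weights = {0.1, 0.2, 0.3, 0.4};
  Tensor w = createTensor(ctx, Shape{4}, kf32);
  toGPU(ctx, weights.data(), w);

  // float input: resolves to the float overload, a plain byte-for-byte copy.
  std::array<float, 4> bias = {1.0f, 2.0f, 3.0f, 4.0f};
  Tensor b = createTensor(ctx, Shape{4}, kf32);
  toGPU(ctx, bias.data(), b);

  return 0;
}

The integer overloads are called the same way; the difference is that they upload whole 32-bit words, so the destination buffer should be sized for the packed payload rather than for the raw 8- or 16-bit elements (a sizing sketch is given at the end of this page).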

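The byte layout produced by the integer packing can be verified without a GPU. The sketch below is not from the commit; it stores the words as uint32_t so all shifts stay well-defined, but the bit pattern is exactly what the int8_t overloads above write: element i lands in byte i % 4 of word i / 4, and 64-bit values are split into a low word followed by a high word.

// pack_layout_check.cpp -- standalone sketch mirroring the packing above.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Four int8_t values per 32-bit word, element i in byte (i % 4).
std::vector<uint32_t> packInt8(const std::vector<int8_t> &v) {
  std::vector<uint32_t> packed((v.size() + 3) / 4, 0);
  for (std::size_t i = 0; i < v.size(); ++i) {
    packed[i / 4] |= static_cast<uint32_t>(static_cast<uint8_t>(v[i]))
                     << ((i % 4) * 8);
  }
  return packed;
}

// Recover element i by shifting its byte back out and narrowing to int8_t --
// the same arithmetic a shader would apply to the packed words.
int8_t unpackInt8(const std::vector<uint32_t> &packed, std::size_t i) {
  uint32_t byte = (packed[i / 4] >> ((i % 4) * 8)) & 0xFFu;
  return static_cast<int8_t>(static_cast<uint8_t>(byte));
}

int main() {
  std::vector<int8_t> v = {-1, 0, 127, -128, 5};
  std::vector<uint32_t> packed = packInt8(v);
  assert(packed.size() == 2); // ceil(5 / 4) words
  for (std::size_t i = 0; i < v.size(); ++i) {
    assert(unpackInt8(packed, i) == v[i]);
  }

  // 64-bit values: low 32-bit word first, then the high word, matching the
  // int64_t / uint64_t overloads.
  uint64_t x = 0x0123456789ABCDEFULL;
  uint32_t lo = static_cast<uint32_t>(x & 0xFFFFFFFF);
  uint32_t hi = static_cast<uint32_t>(x >> 32);
  assert(((static_cast<uint64_t>(hi) << 32) | lo) == x);
  return 0;
}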
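Because the sub-32-bit overloads round the element count up to whole 32-bit words, the number of bytes written can differ from the raw input size. A small constexpr helper, again only a sketch and not part of gpu.hpp, makes the integer sizing explicit; note that double input is the one case that shrinks instead, since each 8-byte double becomes a single 4-byte float.

// packed_size_sketch.cpp -- illustrative sizing for the integer overloads.
#include <cstddef>

// Bytes written to the GPU for n source elements of width elemBytes
// (1, 2, 4, or 8). Sub-word types are rounded up to whole 32-bit words;
// 8-byte integers become two 32-bit words, so their byte count is unchanged.
constexpr std::size_t packedByteSize(std::size_t n, std::size_t elemBytes) {
  return elemBytes >= 4
             ? n * elemBytes
             : ((n + (4 / elemBytes) - 1) / (4 / elemBytes)) * 4;
}

static_assert(packedByteSize(5, 1) == 8, "5 x int8_t  -> two 32-bit words");
static_assert(packedByteSize(3, 2) == 8, "3 x int16_t -> two 32-bit words");
static_assert(packedByteSize(7, 8) == 56, "7 x int64_t -> fourteen 32-bit words");

int main() { return 0; }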