@@ -2027,123 +2027,231 @@ inline void toGPU(Context &ctx, const void *data, WGPUBuffer buffer,
  wgpuQueueWriteBuffer(ctx.queue, buffer, 0, data, size);
}

- /**
-  * @brief Overload of the toGPU function to copy data from CPU memory to a GPU
-  * taking a Tensor instance instead of a WGPUBuffer instance.
-  * @param[in] ctx Context instance to manage the operation
-  * @param[in] data Pointer to the CPU memory to copy from
-  * @param[in] tensor Tensor instance representing the GPU buffer to copy to
-  *
-  * @code
-  * toGPU(ctx, data, tensor);
-  * @endcode
-  */
- inline void toGPU(Context &ctx, const float *data, Tensor &tensor) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                        tensor.data.size);
- }
-
- inline void toGPU(Context &ctx, const half *data, Tensor &tensor) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                        tensor.data.size);
- }
-
- inline void toGPU(Context &ctx, const double *data, Tensor &tensor) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                        tensor.data.size);
- }
-
- inline void toGPU(Context &ctx, const float *data, Tensor &tensor,
+ // Overload for float: directly copy the float data.
+ inline void toGPU(Context &ctx, const float *data, WGPUBuffer buffer,
                  size_t size) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
- }
-
- inline void toGPU(Context &ctx, const half *data, Tensor &tensor, size_t size) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+   toGPU(ctx, static_cast<const void *>(data), buffer, size);
}
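As a usage sketch (not part of this change), the raw-buffer overloads can be fed a buffer created through the standard WebGPU C API. The snippet below assumes `Context` exposes a `device` handle alongside the `queue` used above:

    // Hypothetical usage: upload four floats into a new storage buffer.
    float host[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    WGPUBufferDescriptor desc = {};
    desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst;
    desc.size = sizeof(host);
    WGPUBuffer buf = wgpuDeviceCreateBuffer(ctx.device, &desc);
    toGPU(ctx, host, buf, sizeof(host));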

- inline void toGPU(Context &ctx, const double *data, Tensor &tensor,
+ // Overload for half: directly copy the half data.
+ inline void toGPU(Context &ctx, const half *data, WGPUBuffer buffer,
                  size_t size) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
- }
-
- inline void toGPU(Context &ctx, const uint8_t *data, Tensor &tensor) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                        tensor.data.size);
+   toGPU(ctx, static_cast<const void *>(data), buffer, size);
}

- inline void toGPU(Context &ctx, const uint16_t *data, Tensor &tensor) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                        tensor.data.size);
+ // Overload for double: pack each double into a float (losing precision).
+ inline void toGPU(Context &ctx, const double *data, WGPUBuffer buffer,
+                   size_t size) {
+   // Number of doubles = size / sizeof(double)
+   size_t numElements = size / sizeof(double);
+   std::vector<float> packed(numElements);
+   for (size_t i = 0; i < numElements; ++i) {
+     packed[i] = static_cast<float>(data[i]);
+   }
+   toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(float));
}
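A quick host-side sketch (editorial; `buf` stands in for a storage buffer created as in the earlier sketch) of what the double path implies for callers: `size` is the byte count of the double array, while only half as many bytes reach the GPU, and each value is narrowed exactly as the static_cast<float> above dictates.

    // Hypothetical call: 24 bytes of doubles become 12 bytes of floats on the GPU.
    double host[3] = {1.0, 3.141592653589793, 1e-50};
    toGPU(ctx, host, buf, sizeof(host));
    // Written floats: 1.0f, ~3.1415927f, and 0.0f (1e-50 underflows to zero).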

- inline void toGPU(Context &ctx, const uint32_t *data, Tensor &tensor) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                        tensor.data.size);
+ // Overload for int8_t: pack four 8-bit ints into one 32-bit integer.
+ inline void toGPU(Context &ctx, const int8_t *data, WGPUBuffer buffer,
+                   size_t size) {
+   // Number of int8_t elements equals size (sizeof(int8_t) == 1)
+   size_t numElements = size;
+   size_t packedCount = (numElements + 3) / 4;
+   std::vector<int32_t> packed(packedCount, 0);
+   for (size_t i = 0; i < numElements; ++i) {
+     size_t idx = i / 4;
+     size_t shift = (i % 4) * 8;
+     packed[idx] |= (static_cast<uint8_t>(data[i]) << shift);
+   }
+   toGPU(ctx, packed.data(), buffer, packedCount * sizeof(int32_t));
}
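To make the int8_t layout concrete (an illustrative sketch, not from this commit): element i occupies byte i % 4 of word i / 4, so the values round-trip with plain shifts and masks, which is also what a shader-side unpack would do.

    // Pack four int8 values the same way as the overload above, then recover them.
    int8_t vals[4] = {1, -2, 3, -4};
    uint32_t word = 0; // same bit pattern as the int32_t the overload stores
    for (size_t i = 0; i < 4; ++i) {
      word |= static_cast<uint32_t>(static_cast<uint8_t>(vals[i])) << (i * 8);
    }
    for (size_t i = 0; i < 4; ++i) {
      int8_t back = static_cast<int8_t>((word >> (i * 8)) & 0xFFu);
      // back == vals[i] on two's-complement targets: the sign survives the
      // unsigned pack and masked unpack.
    }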

- inline void toGPU(Context &ctx, const uint64_t *data, Tensor &tensor) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
-                        tensor.data.size);
+ // Overload for int16_t: pack two 16-bit ints into one 32-bit integer.
+ inline void toGPU(Context &ctx, const int16_t *data, WGPUBuffer buffer,
+                   size_t size) {
+   size_t numElements = size / sizeof(int16_t);
+   size_t packedCount = (numElements + 1) / 2;
+   std::vector<int32_t> packed(packedCount, 0);
+   for (size_t i = 0; i < numElements; ++i) {
+     size_t idx = i / 2;
+     size_t shift = (i % 2) * 16;
+     packed[idx] |= (static_cast<uint16_t>(data[i]) << shift);
+   }
+   toGPU(ctx, packed.data(), buffer, packedCount * sizeof(int32_t));
}

- inline void toGPU(Context &ctx, const uint8_t *data, Tensor &tensor,
+ // Overload for int64_t: pack each 64-bit int into two 32-bit integers.
+ inline void toGPU(Context &ctx, const int64_t *data, WGPUBuffer buffer,
                  size_t size) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+   size_t numElements = size / sizeof(int64_t);
+   std::vector<int32_t> packed(numElements * 2);
+   for (size_t i = 0; i < numElements; ++i) {
+     int64_t val = data[i];
+     packed[2 * i] = static_cast<int32_t>(val & 0xFFFFFFFF);
+     packed[2 * i + 1] = static_cast<int32_t>((val >> 32) & 0xFFFFFFFF);
+   }
+   toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(int32_t));
}
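The inverse of the 64-bit split, as a short sketch (editorial): the low word is stored first and the high word second, so a value is rebuilt by recombining the halves as unsigned 32-bit quantities.

    // Split one int64 the way the overload does, then reassemble it.
    int64_t value = -123456789012345LL;
    int32_t lo = static_cast<int32_t>(value & 0xFFFFFFFF);
    int32_t hi = static_cast<int32_t>((value >> 32) & 0xFFFFFFFF);
    uint64_t bits = (static_cast<uint64_t>(static_cast<uint32_t>(hi)) << 32) |
                    static_cast<uint32_t>(lo);
    int64_t rebuilt = static_cast<int64_t>(bits); // rebuilt == value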

- inline void toGPU(Context &ctx, const uint16_t *data, Tensor &tensor,
+ // Overload for uint8_t: pack four 8-bit uints into one 32-bit unsigned integer.
+ inline void toGPU(Context &ctx, const uint8_t *data, WGPUBuffer buffer,
                  size_t size) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+   size_t numElements = size; // sizeof(uint8_t) == 1
+   size_t packedCount = (numElements + 3) / 4;
+   std::vector<uint32_t> packed(packedCount, 0);
+   for (size_t i = 0; i < numElements; ++i) {
+     size_t idx = i / 4;
+     size_t shift = (i % 4) * 8;
+     packed[idx] |= (static_cast<uint32_t>(data[i]) << shift);
+   }
+   toGPU(ctx, packed.data(), buffer, packedCount * sizeof(uint32_t));
}

- inline void toGPU(Context &ctx, const uint32_t *data, Tensor &tensor,
+ // Overload for uint16_t: pack two 16-bit uints into one 32-bit unsigned
+ // integer.
+ inline void toGPU(Context &ctx, const uint16_t *data, WGPUBuffer buffer,
                  size_t size) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+   size_t numElements = size / sizeof(uint16_t);
+   size_t packedCount = (numElements + 1) / 2;
+   std::vector<uint32_t> packed(packedCount, 0);
+   for (size_t i = 0; i < numElements; ++i) {
+     size_t idx = i / 2;
+     size_t shift = (i % 2) * 16;
+     packed[idx] |= (static_cast<uint32_t>(data[i]) << shift);
+   }
+   toGPU(ctx, packed.data(), buffer, packedCount * sizeof(uint32_t));
}

- inline void toGPU(Context &ctx, const uint64_t *data, Tensor &tensor,
+ // Overload for uint64_t: pack each 64-bit uint into two 32-bit unsigned
+ // integers.
+ inline void toGPU(Context &ctx, const uint64_t *data, WGPUBuffer buffer,
                  size_t size) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+   size_t numElements = size / sizeof(uint64_t);
+   std::vector<uint32_t> packed(numElements * 2);
+   for (size_t i = 0; i < numElements; ++i) {
+     uint64_t val = data[i];
+     packed[2 * i] = static_cast<uint32_t>(val & 0xFFFFFFFF);
+     packed[2 * i + 1] = static_cast<uint32_t>(val >> 32);
+   }
+   toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(uint32_t));
}

- inline void toGPU(Context &ctx, const int8_t *data, Tensor &tensor) {
+ /**
+  * @brief Overload of the toGPU function to copy data from CPU memory to a GPU
+  * taking a Tensor instance instead of a WGPUBuffer instance.
+  * @param[in] ctx Context instance to manage the operation
+  * @param[in] data Pointer to the CPU memory to copy from
+  * @param[in] tensor Tensor instance representing the GPU buffer to copy to
+  *
+  * @code
+  * toGPU(ctx, data, tensor);
+  * @endcode
+  */
+ inline void toGPU(Context &ctx, const float *data, Tensor &tensor) {
  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
                       tensor.data.size);
}

- inline void toGPU(Context &ctx, const int16_t *data, Tensor &tensor) {
+ inline void toGPU(Context &ctx, const half *data, Tensor &tensor) {
  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
                       tensor.data.size);
}

- inline void toGPU(Context &ctx, const int *data, Tensor &tensor) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
+ // Overload for double: pack each double into a float (losing precision)
+ inline void toGPU(Context &ctx, const double *data, Tensor &tensor) {
+   size_t numElements = size(tensor.shape);
+   std::vector<float> packed(numElements);
+   for (size_t i = 0; i < numElements; ++i) {
+     packed[i] = static_cast<float>(data[i]);
+   }
+   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
                       tensor.data.size);
}

- inline void toGPU(Context &ctx, const int64_t *data, Tensor &tensor) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
+ // Overload for int8_t: pack four 8-bit integers into one 32-bit integer
+ inline void toGPU(Context &ctx, const int8_t *data, Tensor &tensor) {
+   size_t numElements = size(tensor.shape);
+   size_t packedCount = (numElements + 3) / 4;
+   std::vector<int32_t> packed(packedCount, 0);
+   for (size_t i = 0; i < numElements; ++i) {
+     size_t idx = i / 4;
+     size_t shift = (i % 4) * 8;
+     // Pack as unsigned then reinterpret (shader will unpack)
+     packed[idx] |= (static_cast<uint8_t>(data[i]) << shift);
+   }
+   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
                       tensor.data.size);
}

- inline void toGPU(Context &ctx, const int8_t *data, Tensor &tensor,
-                   size_t size) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+ // Overload for int16_t: pack two 16-bit integers into one 32-bit integer
+ inline void toGPU(Context &ctx, const int16_t *data, Tensor &tensor) {
+   size_t numElements = size(tensor.shape);
+   size_t packedCount = (numElements + 1) / 2;
+   std::vector<int32_t> packed(packedCount, 0);
+   for (size_t i = 0; i < numElements; ++i) {
+     size_t idx = i / 2;
+     size_t shift = (i % 2) * 16;
+     packed[idx] |= (static_cast<uint16_t>(data[i]) << shift);
+   }
+   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
+                        tensor.data.size);
}

- inline void toGPU(Context &ctx, const int16_t *data, Tensor &tensor,
-                   size_t size) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+ // Overload for int64_t: pack each 64-bit integer into two 32-bit integers
+ inline void toGPU(Context &ctx, const int64_t *data, Tensor &tensor) {
+   size_t numElements = size(tensor.shape);
+   std::vector<int32_t> packed(numElements * 2);
+   for (size_t i = 0; i < numElements; ++i) {
+     int64_t val = data[i];
+     packed[2 * i] = static_cast<int32_t>(val & 0xFFFFFFFF);
+     packed[2 * i + 1] = static_cast<int32_t>((val >> 32) & 0xFFFFFFFF);
+   }
+   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
+                        tensor.data.size);
}

- inline void toGPU(Context &ctx, const int *data, Tensor &tensor, size_t size) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+ // Overload for uint8_t: pack four 8-bit unsigned integers into one 32-bit
+ // unsigned integer
+ inline void toGPU(Context &ctx, const uint8_t *data, Tensor &tensor) {
+   size_t numElements = size(tensor.shape);
+   size_t packedCount = (numElements + 3) / 4;
+   std::vector<uint32_t> packed(packedCount, 0);
+   for (size_t i = 0; i < numElements; ++i) {
+     size_t idx = i / 4;
+     size_t shift = (i % 4) * 8;
+     packed[idx] |= (static_cast<uint32_t>(data[i]) << shift);
+   }
+   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
+                        tensor.data.size);
}

- inline void toGPU(Context &ctx, const int64_t *data, Tensor &tensor,
-                   size_t size) {
-   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size);
+ // Overload for uint16_t: pack two 16-bit unsigned integers into one 32-bit
+ // unsigned integer
+ inline void toGPU(Context &ctx, const uint16_t *data, Tensor &tensor) {
+   size_t numElements = size(tensor.shape);
+   size_t packedCount = (numElements + 1) / 2;
+   std::vector<uint32_t> packed(packedCount, 0);
+   for (size_t i = 0; i < numElements; ++i) {
+     size_t idx = i / 2;
+     size_t shift = (i % 2) * 16;
+     packed[idx] |= (static_cast<uint32_t>(data[i]) << shift);
+   }
+   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
+                        tensor.data.size);
+ }
+
+ // Overload for uint64_t: pack each 64-bit unsigned integer into two 32-bit
+ // unsigned integers
+ inline void toGPU(Context &ctx, const uint64_t *data, Tensor &tensor) {
+   size_t numElements = size(tensor.shape);
+   std::vector<uint32_t> packed(numElements * 2);
+   for (size_t i = 0; i < numElements; ++i) {
+     uint64_t val = data[i];
+     packed[2 * i] = static_cast<uint32_t>(val & 0xFFFFFFFF);
+     packed[2 * i + 1] = static_cast<uint32_t>(val >> 32);
+   }
+   wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
+                        tensor.data.size);
}
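Finally, a hypothetical end-to-end use of the Tensor overloads (a sketch assuming the library's createContext/createTensor helpers and the kf32 dtype, which is what the float packing above targets):

    // Upload double-precision host data into an f32 tensor.
    Context ctx = createContext();
    std::vector<double> host(256, 0.5);
    Tensor t = createTensor(ctx, Shape{256}, kf32); // 256 * sizeof(float) bytes on the GPU
    toGPU(ctx, host.data(), t); // narrows each double to float, writes tensor.data.size bytes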

template <typename Params>