@@ -760,13 +760,27 @@ inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype,
-// Overload for double: pack each double into a float (losing precision)
+// Overload for double: bit-pack each double into two 32-bit unsigned integers.
 inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype,
                            const double *data) {
-  assert(dtype == kf64); // unsupported: convert to kf32
+  assert(dtype == kf64);
   size_t numElements = size(shape);
-  std::vector<float> packed(numElements);
+  // Each double (8 bytes) is packed into 2 uint32_t values (2×4 bytes).
+  std::vector<uint32_t> packed(numElements * 2);
   for (size_t i = 0; i < numElements; ++i) {
-    packed[i] = static_cast<float>(data[i]);
+    uint64_t bits;
+    std::memcpy(&bits, &data[i], sizeof(double)); // Extract the raw bits.
+    packed[2 * i] = static_cast<uint32_t>(bits & 0xFFFFFFFF);
+    packed[2 * i + 1] = static_cast<uint32_t>(bits >> 32);
   }
-  return createTensor(ctx, shape, kf32, packed.data());
+  // Create a tensor via the core overload that takes a TensorPool and
+  // WGPUDevice, then upload the packed words.
+  Tensor tensor =
+      createTensor(ctx.pool, ctx.device, shape, kf64,
+                   WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
+                       WGPUBufferUsage_CopySrc);
+
+  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
+                       packed.size() * sizeof(uint32_t));
+
+  return tensor;
 }

 inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype,
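The packing above is lossless: each double's 64-bit pattern is split into a low and a high 32-bit word and reassembled verbatim on readback, unlike the old float down-conversion. A minimal standalone sketch of that invariant (note the header must provide <cstring> and <cstdint> for the std::memcpy and uint64_t uses above):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  double in = 3.141592653589793;
  uint64_t bits;
  std::memcpy(&bits, &in, sizeof(double)); // Raw 64-bit pattern of the double.
  uint32_t low = static_cast<uint32_t>(bits & 0xFFFFFFFF);
  uint32_t high = static_cast<uint32_t>(bits >> 32);
  uint64_t rebuilt = (static_cast<uint64_t>(high) << 32) | low;
  double out;
  std::memcpy(&out, &rebuilt, sizeof(double));
  assert(std::memcmp(&in, &out, sizeof(double)) == 0); // Bit-exact round trip.
  return 0;
}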
@@ -1792,13 +1806,22 @@ inline void toCPU(Context &ctx, Tensor &tensor, NumType dtype, void *output,
     toCPU(ctx, tensor, output, tensor.data.size, sourceOffset);
     break;

-  // For double, the tensor was created by packing doubles into floats.
+  // For kf64, reverse the bit-packing of doubles.
   case kf64: {
-    std::vector<float> tmp(numElements);
-    toCPU(ctx, tensor, tmp.data(), tmp.size() * sizeof(float), sourceOffset);
+    // Each double was packed into 2 uint32_t values.
+    std::vector<uint32_t> tmp(numElements * 2);
+    // Read back the packed words (4 bytes each).
+    toCPU(ctx, tensor, tmp.data(), tmp.size() * sizeof(uint32_t), sourceOffset);
     double *dst = static_cast<double *>(output);
     for (size_t i = 0; i < numElements; ++i) {
-      dst[i] = static_cast<double>(tmp[i]);
+      uint32_t low = tmp[2 * i];
+      uint32_t high = tmp[2 * i + 1];
+      // Reassemble the 64-bit raw representation.
+      uint64_t bits = (static_cast<uint64_t>(high) << 32) | low;
+      // Copy the raw bits back into a double.
+      double d;
+      std::memcpy(&d, &bits, sizeof(double));
+      dst[i] = d;
     }
     break;
   }
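Combined with the createTensor overload above, this gives a bit-exact host-to-GPU-to-host round trip, including values like 1e300 that a float cast would turn into infinity. A hypothetical usage sketch, assuming a gpu.cpp-style createContext() and Shape; the exact trailing toCPU parameters (sourceOffset, etc.) are assumptions here:

Context ctx = createContext();
std::vector<double> host = {1.0, -2.5, 1e300}; // 1e300 overflows a float.
Tensor t = createTensor(ctx, Shape{host.size()}, kf64, host.data());
std::vector<double> back(host.size());
toCPU(ctx, t, kf64, back.data(), /*sourceOffset=*/0); // Signature assumed.
// back now compares bit-equal to host.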
@@ -1905,13 +1928,22 @@ inline void toCPU(Context &ctx, WGPUBuffer buffer, NumType dtype, void *output,
     break;
   }

-  // For double, the buffer was written as floats.
+  // For kf64, reverse the bit-packing of doubles.
   case kf64: {
-    std::vector<float> tmp(numElements);
-    toCPU(ctx, buffer, tmp.data(), numElements * sizeof(float), sourceOffset);
+    // Each double was packed into 2 uint32_t values.
+    std::vector<uint32_t> tmp(numElements * 2);
+    // Read back the packed words (4 bytes each).
+    toCPU(ctx, buffer, tmp.data(), tmp.size() * sizeof(uint32_t), sourceOffset);
     double *dst = static_cast<double *>(output);
     for (size_t i = 0; i < numElements; ++i) {
-      dst[i] = static_cast<double>(tmp[i]);
+      uint32_t low = tmp[2 * i];
+      uint32_t high = tmp[2 * i + 1];
+      // Reassemble the 64-bit raw representation.
+      uint64_t bits = (static_cast<uint64_t>(high) << 32) | low;
+      // Copy the raw bits back into a double.
+      double d;
+      std::memcpy(&d, &bits, sizeof(double));
+      dst[i] = d;
     }
     break;
   }
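One convention worth making explicit: the low word is stored first. Host-side pack and unpack agree, so round trips are self-consistent, but any WGSL kernel reading these buffers must follow the same word order, and WGSL has no f64 type, so shader-side arithmetic on the values would need software emulation. A hypothetical shader-side declaration of the layout:

// Hypothetical WGSL binding matching the packed layout above.
const char *kPackedF64Decl = R"(
@group(0) @binding(0) var<storage, read> packed : array<vec2<u32>>;
// packed[i].x = low 32 bits, packed[i].y = high 32 bits of the i-th double.
)";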
@@ -2039,16 +2071,19 @@ inline void toGPU(Context &ctx, const half *data, WGPUBuffer buffer,
   toGPU(ctx, static_cast<const void *>(data), buffer, size);
 }

-// Overload for double: pack each double into a float (losing precision).
+// Overload for double: bit-pack each double into two 32-bit unsigned integers.
 inline void toGPU(Context &ctx, const double *data, WGPUBuffer buffer,
                   size_t size) {
-  // Number of doubles = size / sizeof(double)
   size_t numElements = size / sizeof(double);
-  std::vector<float> packed(numElements);
+  std::vector<uint32_t> packed(numElements * 2);
   for (size_t i = 0; i < numElements; ++i) {
-    packed[i] = static_cast<float>(data[i]);
+    uint64_t bits;
+    std::memcpy(&bits, &data[i],
+                sizeof(double)); // Reinterpret the double as raw bits.
+    packed[2 * i] = static_cast<uint32_t>(bits & 0xFFFFFFFF);
+    packed[2 * i + 1] = static_cast<uint32_t>(bits >> 32);
   }
-  toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(float));
+  toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(uint32_t));
 }

 // Overload for int8_t: pack four 8-bit ints into one 32-bit integer.
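The size argument remains the byte count of the source doubles, and the repacking does not change how many bytes reach the GPU, since numElements * 2 * sizeof(uint32_t) == numElements * sizeof(double). A hypothetical call site (ctx and buffer assumed in scope):

std::vector<double> host(1024, 0.0);
// Writes host.size() * sizeof(double) bytes, now as pairs of u32 words.
toGPU(ctx, host.data(), buffer, host.size() * sizeof(double));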
@@ -2157,15 +2192,19 @@ inline void toGPU(Context &ctx, const half *data, Tensor &tensor) {
                        tensor.data.size);
 }

-// Overload for double: pack each double into a float (losing precision)
+// Overload for double: bit-pack each double into two 32-bit unsigned integers.
 inline void toGPU(Context &ctx, const double *data, Tensor &tensor) {
-  size_t numElements = size(tensor.shape);
-  std::vector<float> packed(numElements);
+  size_t numElements = tensor.data.size / sizeof(double);
+  std::vector<uint32_t> packed(numElements * 2);
   for (size_t i = 0; i < numElements; ++i) {
-    packed[i] = static_cast<float>(data[i]);
-  }
-  wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
-                       tensor.data.size);
+    uint64_t bits;
+    std::memcpy(&bits, &data[i],
+                sizeof(double)); // Reinterpret the double as raw bits.
+    packed[2 * i] = static_cast<uint32_t>(bits & 0xFFFFFFFF);
+    packed[2 * i + 1] = static_cast<uint32_t>(bits >> 32);
+  }
+  toGPU(ctx, packed.data(), tensor.data.buffer,
+        packed.size() * sizeof(uint32_t));
 }

 // Overload for int8_t: pack four 8-bit integers into one 32-bit integer
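Unlike the removed code, which took the element count from size(tensor.shape), this version derives it from the buffer's byte size. The two agree for tensors created with kf64, where each logical element occupies sizeof(double) bytes (two packed u32 words) in the buffer; a defensive check one might add, assuming the Tensor fields used above:

assert(tensor.data.size == size(tensor.shape) * sizeof(double));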