11#include " gpu.hpp"
22#include < array>
33#include < cassert>
4+ #include < chrono>
45#include < cstdio>
56#include < cstring>
67#include < future>
78#include < vector>
89
910using namespace gpu ;
11+ using namespace std ::chrono;
12+
13+
14+ // Forward declarations: main() is defined before the test functions in this file, so each test it calls is declared here first.
15+ void testToCPUWithTensor ();
16+ void testToCPUWithBuffer ();
17+ void testToCPUWithTensorSourceOffset ();
18+ void testToCPUWithBufferSourceOffset ();
19+ void stressTestToCPU ();
20+
21+ int main () {
22+ LOG (kDefLog , kInfo , " Running GPU integration tests..." );
23+ testToCPUWithTensor ();
24+ testToCPUWithBuffer ();
25+ testToCPUWithTensorSourceOffset ();
26+ testToCPUWithBufferSourceOffset ();
27+ stressTestToCPU ();
28+ LOG (kDefLog , kInfo , " All tests passed." );
29+ return 0 ;
30+ }
31+
1032
1133// A simple WGSL copy kernel that copies input to output.
1234static const char *kCopyKernel = R"(
@@ -22,6 +44,7 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
2244}
2345)" ;
2446
47+
2548// Test using the overload that takes a Tensor.
2649void testToCPUWithTensor () {
2750 LOG (kDefLog , kInfo , " Running testToCPUWithTensor..." );
@@ -185,12 +208,49 @@ void testToCPUWithBufferSourceOffset() {
185208 LOG (kDefLog , kInfo , " testToCPUWithBufferSourceOffset passed." );
186209}
187210
188- int main () {
189- LOG (kDefLog , kInfo , " Running GPU integration tests..." );
190- testToCPUWithTensor ();
191- testToCPUWithBuffer ();
192- testToCPUWithTensorSourceOffset ();
193- testToCPUWithBufferSourceOffset ();
194- LOG (kDefLog , kInfo , " All tests passed." );
195- return 0 ;
196- }
211+ void stressTestToCPU () {
212+ LOG (kDefLog , kInfo , " Running stressTestToCPU for 2 seconds..." );
213+
214+ #ifdef USE_DAWN_API
215+ Context ctx = createContextByGpuIdx (0 );
216+ #else
217+ Context ctx = createContext ();
218+ #endif
219+
220+ constexpr size_t N = 1024 ;
221+ // Create a persistent tensor with some test data.
222+ std::vector<float > inputData (N, 0 .0f );
223+ for (size_t i = 0 ; i < N; ++i) {
224+ inputData[i] = static_cast <float >(i);
225+ }
226+ Tensor tensor = createTensor (ctx, Shape{N}, kf32, inputData.data ());
227+
228+ // Prepare to run for one second.
229+ auto startTime = high_resolution_clock::now ();
230+ std::vector<std::future<void >> futures;
231+ size_t opCount = 0 ;
232+ while (high_resolution_clock::now () - startTime < seconds (2 )) {
233+ // Allocate an output buffer (using a shared_ptr so it stays valid until the future completes)
234+ auto outputData = std::make_shared<std::vector<float >>(N, 0 .0f );
235+ // Use the tensor overload; we’re copying the entire tensor (destOffset = 0)
236+ LOG (kDefLog , kInfo , " Copying %zu bytes from GPU to CPU..." , N * sizeof (float ));
237+ // log count
238+ LOG (kDefLog , kInfo , " opCount = %zu" , opCount);
239+ auto fut = toCPUAsync (ctx, tensor, outputData->data (), N * sizeof (float ), 0 );
240+ futures.push_back (std::move (fut));
241+ ++opCount;
242+ }
243+
244+ // Wait for all submitted operations to complete.
245+ for (auto &f : futures) {
246+ wait (ctx, f);
247+ }
248+
249+ auto endTime = high_resolution_clock::now ();
250+ auto totalMs = duration_cast<milliseconds>(endTime - startTime).count ();
251+ double throughput = (opCount / (totalMs / 1000.0 ));
252+
253+ LOG (kDefLog , kInfo , " Stress test completed:\n "
254+ " %zu GPU to CPU operations in %lld ms\n "
255+ " Throughput: %.2f ops/sec" , opCount, totalMs, throughput);
256+ }
0 commit comments