Skip to content

Commit d1e320d

Browse files
Write benchmark to CSV file
1 parent decf573 commit d1e320d

File tree

5 files changed

+205
-119
lines changed

5 files changed

+205
-119
lines changed

GPU/GPUTracking/Base/GPUReconstructionCPU.cxx

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@
3535

3636
#include <atomic>
3737
#include <ctime>
38+
#include <iomanip>
39+
#include <iostream>
3840

3941
#ifndef _WIN32
4042
#include <unistd.h>
@@ -212,6 +214,38 @@ int32_t GPUReconstructionCPU::ExitDevice()
212214
return 0;
213215
}
214216

217+
namespace {
218+
void write_header(std::ostream& stream) {
219+
stream << "type,count,name,kernel (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n";
220+
}
221+
222+
struct Row {
223+
std::string type = "";
224+
std::string name = "";
225+
double kernel_time = -1.0;
226+
double cpu_time = -1.0;
227+
double total_time = -1.0;
228+
size_t memSize = 0;
229+
uint32_t count = 0;
230+
231+
void write(std::ostream& stream, uint32_t statNEvents) {
232+
double scale = 1000000.0 / statNEvents;
233+
stream << type << ",";
234+
if (count != 0) stream << count;
235+
stream << "," << name << "," << uint32_t(kernel_time * scale) << ",";
236+
if (cpu_time != -1.0) stream << uint32_t(cpu_time * scale);
237+
stream << ",";
238+
if (cpu_time != -1.0 && total_time != -1.0) stream << uint32_t(cpu_time / total_time *100) / 100.0;
239+
stream << ",";
240+
if (total_time != -1.0) stream << uint32_t(total_time * scale);
241+
stream << ",";
242+
if (memSize != 0 && count != 0) stream << uint32_t(memSize / kernel_time * 1e-6) * 1e-3 << "," << memSize / statNEvents << "," << memSize / statNEvents / count;
243+
else stream << ",,";
244+
stream << std::endl;
245+
}
246+
};
247+
}
248+
215249
int32_t GPUReconstructionCPU::RunChains()
216250
{
217251
mMemoryScalers->temporaryFactor = 1.;
@@ -264,6 +298,16 @@ int32_t GPUReconstructionCPU::RunChains()
264298
double kernelTotal = 0;
265299
std::vector<double> kernelStepTimes(gpudatatypes::N_RECO_STEPS, 0.);
266300

301+
std::ofstream benchmarkCSV;
302+
if (!GetProcessingSettings().timingCSV.empty()) {
303+
benchmarkCSV.open(GetProcessingSettings().timingCSV, std::ios::out | std::ios::app);
304+
if (!benchmarkCSV.is_open()) {
305+
GPUError("Could not open timing CSV file '%s' for writing", GetProcessingSettings().timingCSV.c_str());
306+
} else if (mNEventsProcessed == 1) {
307+
write_header(benchmarkCSV);
308+
}
309+
}
310+
267311
if (GetProcessingSettings().debugLevel >= 1) {
268312
for (uint32_t i = 0; i < mTimers.size(); i++) {
269313
double time = 0;
@@ -285,9 +329,16 @@ int32_t GPUReconstructionCPU::RunChains()
285329
kernelStepTimes[stepNum] += time;
286330
}
287331
char bandwidth[256] = "";
332+
Row task_row;
333+
task_row.type = 'K';
334+
task_row.name = mTimers[i]->name.c_str();
335+
task_row.kernel_time = time;
336+
task_row.count = mTimers[i]->count;
288337
if (mTimers[i]->memSize && mStatNEvents && time != 0.) {
338+
task_row.memSize = mTimers[i]->memSize;
289339
snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count);
290340
}
341+
if (benchmarkCSV.is_open()) task_row.write(benchmarkCSV, mStatNEvents);
291342
printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth);
292343
if (GetProcessingSettings().resetTimers) {
293344
mTimers[i]->count = 0;
@@ -298,14 +349,34 @@ int32_t GPUReconstructionCPU::RunChains()
298349
if (GetProcessingSettings().recoTaskTiming) {
299350
for (int32_t i = 0; i < gpudatatypes::N_RECO_STEPS; i++) {
300351
if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) {
352+
Row reco_step_row;
353+
reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (Tasks)";
354+
reco_step_row.kernel_time = kernelStepTimes[i];
355+
reco_step_row.cpu_time = mTimersRecoSteps[i].timerCPU;
356+
reco_step_row.total_time = mTimersRecoSteps[i].timerTotal.GetElapsedTime();
357+
if (benchmarkCSV.is_open()) reco_step_row.write(benchmarkCSV, mStatNEvents);
301358
printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", "Tasks",
302359
gpudatatypes::RECO_STEP_NAMES[i], kernelStepTimes[i] * 1000000 / mStatNEvents, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime());
303360
}
304361
if (mTimersRecoSteps[i].bytesToGPU) {
362+
Row reco_step_row;
363+
reco_step_row.type = 'D';
364+
reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to GPU)";
365+
reco_step_row.kernel_time = mTimersRecoSteps[i].timerToGPU.GetElapsedTime();
366+
reco_step_row.memSize = mTimersRecoSteps[i].bytesToGPU;
367+
reco_step_row.count = mTimersRecoSteps[i].countToGPU;
368+
if (benchmarkCSV.is_open()) reco_step_row.write(benchmarkCSV, mStatNEvents);
305369
printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToGPU, "DMA to GPU", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1000000 / mStatNEvents,
306370
mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToGPU / mStatNEvents, mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].countToGPU);
307371
}
308372
if (mTimersRecoSteps[i].bytesToHost) {
373+
Row reco_step_row;
374+
reco_step_row.type = 'D';
375+
reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to Host)";
376+
reco_step_row.kernel_time = mTimersRecoSteps[i].timerToHost.GetElapsedTime();
377+
reco_step_row.memSize = mTimersRecoSteps[i].bytesToHost;
378+
reco_step_row.count = mTimersRecoSteps[i].countToHost;
379+
if (benchmarkCSV.is_open()) reco_step_row.write(benchmarkCSV, mStatNEvents);
309380
printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents,
310381
mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost);
311382
}
@@ -321,13 +392,23 @@ int32_t GPUReconstructionCPU::RunChains()
321392
}
322393
for (int32_t i = 0; i < gpudatatypes::N_GENERAL_STEPS; i++) {
323394
if (mTimersGeneralSteps[i].GetElapsedTime() != 0.) {
395+
Row general_step_row;
396+
general_step_row.name = gpudatatypes::GENERAL_STEP_NAMES[i];
397+
general_step_row.kernel_time = mTimersGeneralSteps[i].GetElapsedTime();
398+
if (benchmarkCSV.is_open()) general_step_row.write(benchmarkCSV, mStatNEvents);
324399
printf("Execution Time: General Step : %50s Time: %'10.0f us\n", gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents);
325400
}
326401
}
402+
Row wall_row;
403+
wall_row.name = "Wall";
327404
if (GetProcessingSettings().debugLevel >= 1) {
405+
wall_row.kernel_time = kernelTotal;
328406
mStatKernelTime = kernelTotal * 1000000 / mStatNEvents;
329407
printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", mStatKernelTime, nEventReport.c_str());
330408
}
409+
wall_row.cpu_time = mStatCPUTime;
410+
wall_row.total_time = mStatWallTime * mStatNEvents / 1000000;
411+
if (benchmarkCSV.is_open()) wall_row.write(benchmarkCSV, mStatNEvents);
331412
printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", mStatWallTime, mStatCPUTime * 1000000 / mStatNEvents, mStatCPUTime / mTimerTotal.GetElapsedTime(), nEventReport.c_str());
332413
} else if (GetProcessingSettings().debugLevel >= 0) {
333414
GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str());

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,7 @@ AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent,
329329
AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)")
330330
AddOption(debugMask, uint32_t, (1 << 18) - 1, "debugMask", 0, "Mask for debug output dumps to file")
331331
AddOption(debugLogSuffix, std::string, "", "debugSuffix", 0, "Suffix for debug log files with --debug 6")
332+
AddOption(timingCSV, std::string, "", "", 0, "CSV filename to append the benchmark results. Verbosity determined by parameter --debug.")
332333
AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures")
333334
AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks")
334335
AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6 or deterministic compile flag set", def(1))

0 commit comments

Comments
 (0)