From f8514aa8d1e4352d8d7d538ff10ac5564ca8ea76 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 5 Nov 2024 17:26:44 +0100 Subject: [PATCH 01/14] Fix hybrid vertexer printouts --- .../GPU/ITStrackingGPU/TrackingKernels.h | 4 + .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 235 +----------------- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 14 +- .../ITSMFT/ITS/tracking/src/Vertexer.cxx | 8 +- 4 files changed, 26 insertions(+), 235 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 34e6165b9530f..834fdff0ac9ef 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -50,6 +50,10 @@ GPUg() void fitTrackSeedsKernel( #endif } // namespace gpu +template +void computeTrackletsInRofsHandler(const int startROF, + const int endROF); + void countCellsHandler(const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 3c6a307fc4ff6..b890a1dce05ee 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -37,235 +37,6 @@ void TrackerTraitsGPU::initialiseTimeFrame(const int iteration) template void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int, int) { - // if (!mTimeFrameGPU->getClusters().size()) { - // return; - // } - // const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f); - // gsl::span diamondSpan(&diamondVert, 1); - // std::vector threads(mTimeFrameGPU->getNChunks()); - - // for (int chunkId{0}; chunkId < mTimeFrameGPU->getNChunks(); ++chunkId) { - // int maxTracklets{static_cast(mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->clustersPerROfCapacity) * - // static_cast(mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxTrackletsPerCluster)}; - // int maxRofPerChunk{mTimeFrameGPU->mNrof / (int)mTimeFrameGPU->getNChunks()}; - // // Define workload - // auto doTrackReconstruction = [&, chunkId, maxRofPerChunk, iteration]() -> void { - // auto offset = chunkId * maxRofPerChunk; - // auto maxROF = offset + maxRofPerChunk; - // while (offset < maxROF) { - // auto rofs = mTimeFrameGPU->loadChunkData(chunkId, offset, maxROF); - // //////////////////// - // /// Tracklet finding - - // for (int iLayer{0}; iLayer < nLayers - 1; ++iLayer) { - // auto nclus = mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, iLayer); - // const float meanDeltaR{mTrkParams[iteration].LayerRadii[iLayer + 1] - mTrkParams[iteration].LayerRadii[iLayer]}; - // gpu::computeLayerTrackletsKernelMultipleRof<<getStream(chunkId).get()>>>( - // iLayer, // const int layerIndex, - // iteration, // const int iteration, - // offset, // const unsigned int startRofId, - // rofs, // const unsigned int rofSize, - // 0, // const unsigned int deltaRof, - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(iLayer), // const Cluster* clustersCurrentLayer, - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(iLayer + 1), // const Cluster* clustersNextLayer, - // mTimeFrameGPU->getDeviceROframesClusters(iLayer), // const int* roFrameClustersCurrentLayer, // Number of clusters on layer 0 per ROF - // mTimeFrameGPU->getDeviceROframesClusters(iLayer + 1), // const int* roFrameClustersNextLayer, // Number of clusters on layer 1 per ROF - // mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(iLayer + 1), // const int* indexTableNextLayer, - // mTimeFrameGPU->getDeviceUsedClusters(iLayer), // const int* usedClustersCurrentLayer, - // mTimeFrameGPU->getDeviceUsedClusters(iLayer + 1), // const int* usedClustersNextLayer, - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer), // Tracklet* tracklets, // output data - // mTimeFrameGPU->getDeviceVertices(), // const Vertex* vertices, - // mTimeFrameGPU->getDeviceROframesPV(), // const int* pvROFrame, - // mTimeFrameGPU->getPhiCut(iLayer), // const float phiCut, - // mTimeFrameGPU->getMinR(iLayer + 1), // const float minR, - // mTimeFrameGPU->getMaxR(iLayer + 1), // const float maxR, - // meanDeltaR, // const float meanDeltaR, - // mTimeFrameGPU->getPositionResolution(iLayer), // const float positionResolution, - // mTimeFrameGPU->getMSangle(iLayer), // const float mSAngle, - // mTimeFrameGPU->getDeviceTrackingParameters(), // const StaticTrackingParameters* trkPars, - // mTimeFrameGPU->getDeviceIndexTableUtils(), // const IndexTableUtils* utils - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->clustersPerROfCapacity, // const int clustersPerROfCapacity, - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxTrackletsPerCluster); // const int maxTrackletsPerCluster - - // // Remove empty tracklets due to striding. - // auto nulltracklet = o2::its::Tracklet{}; - // auto thrustTrackletsBegin = thrust::device_ptr(mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer)); - // auto thrustTrackletsEnd = thrust::device_ptr(mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer) + (int)rofs * maxTracklets); - // auto thrustTrackletsAfterEraseEnd = thrust::remove(THRUST_NAMESPACE::par.on(mTimeFrameGPU->getStream(chunkId).get()), - // thrustTrackletsBegin, - // thrustTrackletsEnd, - // nulltracklet); - // // Sort tracklets by first cluster index. - // thrust::sort(THRUST_NAMESPACE::par.on(mTimeFrameGPU->getStream(chunkId).get()), - // thrustTrackletsBegin, - // thrustTrackletsAfterEraseEnd, - // gpu::trackletSortIndexFunctor()); - - // // Remove duplicates. - // auto thrustTrackletsAfterUniqueEnd = thrust::unique(THRUST_NAMESPACE::par.on(mTimeFrameGPU->getStream(chunkId).get()), thrustTrackletsBegin, thrustTrackletsAfterEraseEnd); - - // discardResult(cudaStreamSynchronize(mTimeFrameGPU->getStream(chunkId).get())); - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer] = thrustTrackletsAfterUniqueEnd - thrustTrackletsBegin; - // // Compute tracklet lookup table. - // gpu::compileTrackletsLookupTableKernel<<getStream(chunkId).get()>>>(mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer), - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer]); - // discardResult(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(), // d_temp_storage - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes - // mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer), // d_in - // mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer), // d_out - // nclus, // num_items - // mTimeFrameGPU->getStream(chunkId).get())); - - // // Create tracklets labels, at the moment on the host - // if (mTimeFrameGPU->hasMCinformation()) { - // std::vector tracklets(mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer]); - // checkGPUError(cudaHostRegister(tracklets.data(), tracklets.size() * sizeof(o2::its::Tracklet), cudaHostRegisterDefault)); - // checkGPUError(cudaMemcpyAsync(tracklets.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer), tracklets.size() * sizeof(o2::its::Tracklet), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get())); - // for (auto& trk : tracklets) { - // MCCompLabel label; - // int currentId{mTimeFrameGPU->mClusters[iLayer][trk.firstClusterIndex].clusterId}; // This is not yet offsetted to the index of the first cluster of the chunk - // int nextId{mTimeFrameGPU->mClusters[iLayer + 1][trk.secondClusterIndex].clusterId}; // This is not yet offsetted to the index of the first cluster of the chunk - // for (auto& lab1 : mTimeFrameGPU->getClusterLabels(iLayer, currentId)) { - // for (auto& lab2 : mTimeFrameGPU->getClusterLabels(iLayer + 1, nextId)) { - // if (lab1 == lab2 && lab1.isValid()) { - // label = lab1; - // break; - // } - // } - // if (label.isValid()) { - // break; - // } - // } - // // TODO: implment label merging. - // // mTimeFrameGPU->getTrackletsLabel(iLayer).emplace_back(label); - // } - // checkGPUError(cudaHostUnregister(tracklets.data())); - // } - // } - - // //////////////// - // /// Cell finding - // for (int iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - // // Compute layer cells. - // gpu::computeLayerCellsKernel<<<10, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer + 1), - // mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer + 1), - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer], - // nullptr, - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer), - // mTimeFrameGPU->getDeviceTrackingParameters()); - - // // Compute number of found Cells - // checkGPUError(cub::DeviceReduce::Sum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(), // d_temp_storage - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer), // d_in - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells() + iLayer, // d_out - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer], // num_items - // mTimeFrameGPU->getStream(chunkId).get())); - // // Compute LUT - // discardResult(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(), // d_temp_storage - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer), // d_in - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer), // d_out - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer], // num_items - // mTimeFrameGPU->getStream(chunkId).get())); - - // gpu::computeLayerCellsKernel<<<10, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer + 1), - // mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer + 1), - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer], - // mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer), - // mTimeFrameGPU->getDeviceTrackingParameters()); - // } - // checkGPUError(cudaMemcpyAsync(mTimeFrameGPU->getHostNCells(chunkId).data(), - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells(), - // (nLayers - 2) * sizeof(int), - // cudaMemcpyDeviceToHost, - // mTimeFrameGPU->getStream(chunkId).get())); - - // // Create cells labels - // // TODO: make it work after fixing the tracklets labels - // if (mTimeFrameGPU->hasMCinformation()) { - // for (int iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - // std::vector cells(mTimeFrameGPU->getHostNCells(chunkId)[iLayer]); - // // Async with not registered memory? - // checkGPUError(cudaMemcpyAsync(cells.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer), mTimeFrameGPU->getHostNCells(chunkId)[iLayer] * sizeof(o2::its::Cell), cudaMemcpyDeviceToHost)); - // for (auto& cell : cells) { - // MCCompLabel currentLab{mTimeFrameGPU->getTrackletsLabel(iLayer)[cell.getFirstTrackletIndex()]}; - // MCCompLabel nextLab{mTimeFrameGPU->getTrackletsLabel(iLayer + 1)[cell.getSecondTrackletIndex()]}; - // mTimeFrameGPU->getCellsLabel(iLayer).emplace_back(currentLab == nextLab ? currentLab : MCCompLabel()); - // } - // } - // } - - // ///////////////////// - // /// Neighbour finding - // for (int iLayer{0}; iLayer < nLayers - 3; ++iLayer) { - // gpu::computeLayerCellNeighboursKernel<<<10, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer + 1), - // iLayer, - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer + 1), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer), - // nullptr, - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells(), - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxNeighboursSize); - - // // Compute Cell Neighbours LUT - // checkGPUError(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(), // d_temp_storage - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer), // d_in - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer), // d_out - // mTimeFrameGPU->getHostNCells(chunkId)[iLayer + 1], // num_items - // mTimeFrameGPU->getStream(chunkId).get())); - - // gpu::computeLayerCellNeighboursKernel<<<10, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer + 1), - // iLayer, - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer + 1), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeighbours(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells(), - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxNeighboursSize); - - // // if (!chunkId) { - // // gpu::printBufferLayerOnThread<<<1, 1, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(iLayer, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeighbours(iLayer), - // // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxNeighboursSize * rofs); - // // } - // } - // // Download cells into vectors - - // for (int iLevel{nLayers - 2}; iLevel >= mTrkParams[iteration].CellMinimumLevel(); --iLevel) { - // const int minimumLevel{iLevel - 1}; - // for (int iLayer{nLayers - 3}; iLayer >= minimumLevel; --iLayer) { - // // gpu::computeLayerRoadsKernel<<<1, 1, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(iLevel, // const int level, - // // iLayer, // const int layerIndex, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceArrayCells(), // const CellSeed** cells, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells(), // const int* nCells, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceArrayNeighboursCell(), // const int** neighbours, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceArrayNeighboursCellLUT(), // const int** neighboursLUT, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceRoads(), // Road* roads, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceRoadsLookupTables(iLayer)); // int* roadsLookupTable - // } - // } - - // // End of tracking for this chunk - // offset += rofs; - // } - // }; - // threads[chunkId] = std::thread(doTrackReconstruction); - // } - // for (auto& thread : threads) { - // thread.join(); - // } - - // mTimeFrameGPU->wipe(nLayers); } template @@ -314,6 +85,12 @@ template void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int iROFslice, int iVertex) { TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex); + + const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f); + gsl::span diamondSpan(&diamondVert, 1); + int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0}; + int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()}; + computeTrackletsInRofsHandler(startROF, endROF); } template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 73dcf3bcb4894..6fdc5839559d4 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -671,7 +671,6 @@ GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, template GPUg() void computeLayerTrackletsKernelMultipleRof( const int layerIndex, - const int iteration, const unsigned int startRofId, const unsigned int rofSize, const int maxRofs, @@ -803,6 +802,16 @@ GPUg() void removeDuplicateTrackletsEntriesLUTKernel( } // namespace gpu +template +void computeTrackletsInRofsHandler(const int startROF, + const int endROF) +{ + for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { + std::cout << "start: " << startROF << " end: " << endROF << std::endl; + // gpu::computeLayerTrackletsKernelMultipleRof(); + } +} + void countCellsHandler( const Cluster** sortedClusters, const Cluster** unsortedClusters, @@ -1032,4 +1041,5 @@ void trackSeedHandler(CellSeed* trackSeeds, gpuCheckError(cudaPeekAtLastError()); gpuCheckError(cudaDeviceSynchronize()); } -} // namespace o2::its +template void computeTrackletsInRofsHandler<7>(const int, const int); +} // namespace o2::its \ No newline at end of file diff --git a/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx b/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx index 4eaddc8385b8a..e87e2289b49e7 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx @@ -90,7 +90,7 @@ float Vertexer::clustersToVerticesHybrid(std::function logg auto timeVertexingIteration = evaluateTask( &Vertexer::findVerticesHybrid, "Hybrid Vertexer vertex finding", [](std::string) {}, iteration); - printEpilog(logger, true, nTracklets01, nTracklets12, mTimeFrame->getNLinesTotal(), mTimeFrame->getTotVertIteration().size(), timeInit, timeTracklet, timeSelection, timeVertexing); + printEpilog(logger, true, nTracklets01, nTracklets12, mTimeFrame->getNLinesTotal(), mTimeFrame->getTotVertIteration()[iteration], timeInitIteration, timeTrackletIteration, timeSelectionIteration, timeVertexingIteration); timeInit += timeInitIteration; timeTracklet += timeTrackletIteration; timeSelection += timeSelectionIteration; @@ -142,9 +142,9 @@ void Vertexer::printEpilog(std::function logger, const float initT, const float trackletT, const float selecT, const float vertexT) { float total = initT + trackletT + selecT + vertexT; - logger(fmt::format(" - {}Vertexer: found {} | {} tracklets in: {} ms", isHybrid ? "Hybrid" : "", trackletN01, trackletN12, trackletT)); - logger(fmt::format(" - {}Vertexer: selected {} tracklets in: {} ms", isHybrid ? "Hybrid" : "", selectedN, selecT)); - logger(fmt::format(" - {}Vertexer: found {} vertices in: {} ms", isHybrid ? "Hybrid" : "", vertexN, vertexT)); + logger(fmt::format(" - {}Vertexer: found {} | {} tracklets in: {} ms", isHybrid ? "Hybrid " : "", trackletN01, trackletN12, trackletT)); + logger(fmt::format(" - {}Vertexer: selected {} tracklets in: {} ms", isHybrid ? "Hybrid " : "", selectedN, selecT)); + logger(fmt::format(" - {}Vertexer: found {} vertices in: {} ms", isHybrid ? "Hybrid " : "", vertexN, vertexT)); // logger(fmt::format(" - Timeframe {} vertexing completed in: {} ms, using {} thread(s).", mTimeFrameCounter++, total, mTraits->getNThreads())); } From ea5473a5be19d6974e0bf23e1d31e4ed9854239b Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 5 Nov 2024 18:57:50 +0100 Subject: [PATCH 02/14] Move multiplicity mask to a vector --- .../include/ITSReconstruction/FastMultEst.h | 2 +- .../ITS/reconstruction/src/FastMultEst.cxx | 2 +- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 8 +- .../GPU/ITStrackingGPU/TrackingKernels.h | 5 +- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 11 + .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 8 +- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 245 +++++++++--------- .../tracking/include/ITStracking/TimeFrame.h | 9 +- .../ITS/tracking/src/TrackingInterface.cxx | 2 +- .../ITS/workflow/src/CookedTrackerSpec.cxx | 2 +- 10 files changed, 163 insertions(+), 131 deletions(-) diff --git a/Detectors/ITSMFT/ITS/reconstruction/include/ITSReconstruction/FastMultEst.h b/Detectors/ITSMFT/ITS/reconstruction/include/ITSReconstruction/FastMultEst.h index 457381862cc42..9e8299e89b404 100644 --- a/Detectors/ITSMFT/ITS/reconstruction/include/ITSReconstruction/FastMultEst.h +++ b/Detectors/ITSMFT/ITS/reconstruction/include/ITSReconstruction/FastMultEst.h @@ -45,7 +45,7 @@ struct FastMultEst { static uint32_t getCurrentRandomSeed(); int selectROFs(const gsl::span rofs, const gsl::span clus, - const gsl::span trig, std::vector& sel); + const gsl::span trig, std::vector& sel); void fillNClPerLayer(const gsl::span& clusters); float process(const std::array ncl) diff --git a/Detectors/ITSMFT/ITS/reconstruction/src/FastMultEst.cxx b/Detectors/ITSMFT/ITS/reconstruction/src/FastMultEst.cxx index a55fafdf60409..c547996c6f356 100644 --- a/Detectors/ITSMFT/ITS/reconstruction/src/FastMultEst.cxx +++ b/Detectors/ITSMFT/ITS/reconstruction/src/FastMultEst.cxx @@ -125,7 +125,7 @@ float FastMultEst::processNoiseImposed(const std::array ncl) } int FastMultEst::selectROFs(const gsl::span rofs, const gsl::span clus, - const gsl::span trig, std::vector& sel) + const gsl::span trig, std::vector& sel) { int nrof = rofs.size(), nsel = 0; const auto& multEstConf = FastMultEstConfig::Instance(); // parameters for mult estimation and cuts diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index ad8724f315ec8..19211112a3b0e 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -54,6 +54,8 @@ class TimeFrameGPU : public TimeFrame void loadTrackingFrameInfoDevice(const int); void loadUnsortedClustersDevice(const int); void loadClustersDevice(const int); + void loadMultiplicityCutMask(const int); + /// void loadTrackletsDevice(); void loadTrackletsLUTDevice(); void loadCellsDevice(); @@ -116,6 +118,7 @@ class TimeFrameGPU : public TimeFrame o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; } float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; } int* getDeviceNeighboursIndexTables(const int layer) { return mNeighboursIndexTablesDevice[layer]; } + bool* getDevicemMultMask() { return mMultMaskDevice; } void setDevicePropagator(const o2::base::PropagatorImpl*) override; @@ -147,6 +150,7 @@ class TimeFrameGPU : public TimeFrame int* mROFramesPVDevice; // Hybrid pref + bool* mMultMaskDevice; std::array mClustersDevice; std::array mUnsortedClustersDevice; const Cluster** mClustersDeviceArray; @@ -186,10 +190,6 @@ class TimeFrameGPU : public TimeFrame std::vector> mNVerticesInChunks; std::vector> mLabelsInChunks; - // Host memory used only in GPU tracking - std::vector mHostNTracklets; - std::vector mHostNCells; - // Temporary buffer for storing output tracks from GPU tracking std::vector mTrackITSExt; }; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 834fdff0ac9ef..24fb531fa3e1d 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -52,7 +52,10 @@ GPUg() void fitTrackSeedsKernel( template void computeTrackletsInRofsHandler(const int startROF, - const int endROF); + const int endROF, + const int vertexId, + const int nBlocks, + const int nThreads); void countCellsHandler(const Cluster** sortedClusters, const Cluster** unsortedClusters, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 67144ba2c98ea..1d58c491e2adb 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -146,6 +146,17 @@ void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } +template +void TimeFrameGPU::loadMultiplicityCutMask(const int iteration) +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask"); + if (!iteration) { + LOGP(info, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB); + allocMemAsync(reinterpret_cast(&mMultMaskDevice), mMultiplicityCutMask.size() * sizeof(bool), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mMultMaskDevice, mMultiplicityCutMask.data(), mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} template void TimeFrameGPU::loadTrackletsDevice() { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index b890a1dce05ee..eec86573b4090 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -32,6 +32,7 @@ void TrackerTraitsGPU::initialiseTimeFrame(const int iteration) mTimeFrameGPU->loadClustersDevice(iteration); mTimeFrameGPU->loadUnsortedClustersDevice(iteration); mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration); + mTimeFrameGPU->loadMultiplicityCutMask(iteration); } template @@ -84,13 +85,18 @@ int TrackerTraitsGPU::getTFNumberOfCells() const template void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int iROFslice, int iVertex) { + auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex); const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f); gsl::span diamondSpan(&diamondVert, 1); int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0}; int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()}; - computeTrackletsInRofsHandler(startROF, endROF); + computeTrackletsInRofsHandler(startROF, + endROF, + iVertex, + conf.nBlocks, + conf.nThreads); } template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 6fdc5839559d4..540cbce9c6d54 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -196,6 +196,10 @@ struct is_valid_pair { } }; +GPUd() void getPrimaryVertices(const int rof, + const uint8_t* mask, + const Vertex* vertices); + template GPUg() void fitTrackSeedsKernel( CellSeed* trackSeeds, @@ -394,6 +398,116 @@ GPUg() void computeLayerCellsKernel( } } +template +GPUg() void computeLayerTrackletsMultiROFKernel( + const int layerIndex, + const int startRofId, + const int rofSize, + const int vertexId, + // const int maxRofs, + // const Cluster* clustersCurrentLayer, // input data rof0 + // const Cluster* clustersNextLayer, // input data rof0-delta getNphiBins()}; + const int zBins{utils->getNzBins()}; + for (unsigned int iRof{blockIdx.x}; iRof < rofSize; iRof += gridDim.x) { + auto rof0 = iRof + startRofId; + auto nClustersCurrentLayerRof = o2::gpu::GPUCommonMath::Min(roFrameClustersCurrentLayer[rof0 + 1] - roFrameClustersCurrentLayer[rof0], (int)maxClustersPerRof); + // if (nClustersCurrentLayerRof > maxClustersPerRof) { + // printf("its-gpu-tracklet finder: on layer %d found more clusters per ROF (%d) than maximum set (%d), check the configuration!\n", layerIndex, nClustersCurrentLayerRof, maxClustersPerRof); + // } + auto* clustersCurrentLayerRof = clustersCurrentLayer + (roFrameClustersCurrentLayer[rof0] - roFrameClustersCurrentLayer[startRofId]); + auto nVerticesRof0 = nVertices[rof0 + 1] - nVertices[rof0]; + auto trackletsRof0 = tracklets + maxTrackletsPerCluster * maxClustersPerRof * iRof; + for (int currentClusterIndex = threadIdx.x; currentClusterIndex < nClustersCurrentLayerRof; currentClusterIndex += blockDim.x) { + unsigned int storedTracklets{0}; + const Cluster& currentCluster{clustersCurrentLayerRof[currentClusterIndex]}; + const int currentSortedIndex{roFrameClustersCurrentLayer[rof0] + currentClusterIndex}; + const int currentSortedIndexChunk{currentSortedIndex - roFrameClustersCurrentLayer[startRofId]}; + if (usedClustersLayer[currentSortedIndex]) { + continue; + } + + int minRof = (rof0 >= trkPars->DeltaROF) ? rof0 - trkPars->DeltaROF : 0; + int maxRof = (rof0 == maxRofs - trkPars->DeltaROF) ? rof0 : rof0 + trkPars->DeltaROF; // works with delta = {0, 1} + const float inverseR0{1.f / currentCluster.radius}; + + for (int iPrimaryVertex{0}; iPrimaryVertex < nVerticesRof0; iPrimaryVertex++) { + const auto& primaryVertex{vertices[nVertices[rof0] + iPrimaryVertex]}; + const float resolution{o2::gpu::GPUCommonMath::Sqrt(Sq(trkPars->PVres) / primaryVertex.getNContributors() + Sq(positionResolution))}; + const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0}; + const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate}; + const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; + const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution + const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * mSAngle))}; + + const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * trkPars->NSigmaCut, phiCut)}; + + if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { + continue; + } + int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; + if (phiBinsNum < 0) { + phiBinsNum += trkPars->PhiBins; + } + const int tableSize{phiBins * zBins + 1}; + for (int rof1{minRof}; rof1 <= maxRof; ++rof1) { + auto nClustersNext{roFrameClustersNextLayer[rof1 + 1] - roFrameClustersNextLayer[rof1]}; + if (!nClustersNext) { // number of clusters on next layer > 0 + continue; + } + for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { + int iPhiBin = (selectedBinsRect.y + iPhiCount) % trkPars->PhiBins; + const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; + const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; + const int firstRowClusterIndex = indexTablesNext[(rof1 - startRofId) * tableSize + firstBinIndex]; + const int maxRowClusterIndex = indexTablesNext[(rof1 - startRofId) * tableSize + maxBinIndex]; + for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { + if (iNextCluster >= nClustersNext) { + break; + } + auto nextClusterIndex{roFrameClustersNextLayer[rof1] - roFrameClustersNextLayer[startRofId] + iNextCluster}; + const Cluster& nextCluster{clustersNextLayer[nextClusterIndex]}; + if (usedClustersNextLayer[nextCluster.clusterId]) { + continue; + } + const float deltaPhi{o2::gpu::GPUCommonMath::Abs(currentCluster.phi - nextCluster.phi)}; + const float deltaZ{o2::gpu::GPUCommonMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)}; + + if ((deltaZ / sigmaZ < trkPars->NSigmaCut && (deltaPhi < phiCut || o2::gpu::GPUCommonMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut))) { + const float phi{o2::gpu::GPUCommonMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; + const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)}; + const unsigned int stride{currentClusterIndex * maxTrackletsPerCluster}; + if (storedTracklets < maxTrackletsPerCluster) { + new (trackletsRof0 + stride + storedTracklets) Tracklet{currentSortedIndexChunk, nextClusterIndex, tanL, phi, static_cast(rof0), static_cast(rof1)}; + } + // else { + // printf("its-gpu-tracklet-finder: on rof %d layer: %d: found more tracklets (%d) than maximum allowed per cluster. This is lossy!\n", rof0, layerIndex, storedTracklets); + // } + ++storedTracklets; + } + } + } + } + } + } + }*/ +} + ///////////////////////////////////////// // Debug Kernels ///////////////////////////////////////// @@ -668,119 +782,6 @@ GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, } } -template -GPUg() void computeLayerTrackletsKernelMultipleRof( - const int layerIndex, - const unsigned int startRofId, - const unsigned int rofSize, - const int maxRofs, - const Cluster* clustersCurrentLayer, // input data rof0 - const Cluster* clustersNextLayer, // input data rof0-delta * trkPars, - const IndexTableUtils* utils, - const unsigned int maxClustersPerRof = 5e2, - const unsigned int maxTrackletsPerCluster = 50) -{ - const int phiBins{utils->getNphiBins()}; - const int zBins{utils->getNzBins()}; - for (unsigned int iRof{blockIdx.x}; iRof < rofSize; iRof += gridDim.x) { - auto rof0 = iRof + startRofId; - auto nClustersCurrentLayerRof = o2::gpu::GPUCommonMath::Min(roFrameClustersCurrentLayer[rof0 + 1] - roFrameClustersCurrentLayer[rof0], (int)maxClustersPerRof); - // if (nClustersCurrentLayerRof > maxClustersPerRof) { - // printf("its-gpu-tracklet finder: on layer %d found more clusters per ROF (%d) than maximum set (%d), check the configuration!\n", layerIndex, nClustersCurrentLayerRof, maxClustersPerRof); - // } - auto* clustersCurrentLayerRof = clustersCurrentLayer + (roFrameClustersCurrentLayer[rof0] - roFrameClustersCurrentLayer[startRofId]); - auto nVerticesRof0 = nVertices[rof0 + 1] - nVertices[rof0]; - auto trackletsRof0 = tracklets + maxTrackletsPerCluster * maxClustersPerRof * iRof; - for (int currentClusterIndex = threadIdx.x; currentClusterIndex < nClustersCurrentLayerRof; currentClusterIndex += blockDim.x) { - unsigned int storedTracklets{0}; - const Cluster& currentCluster{clustersCurrentLayerRof[currentClusterIndex]}; - const int currentSortedIndex{roFrameClustersCurrentLayer[rof0] + currentClusterIndex}; - const int currentSortedIndexChunk{currentSortedIndex - roFrameClustersCurrentLayer[startRofId]}; - if (usedClustersLayer[currentSortedIndex]) { - continue; - } - - int minRof = (rof0 >= trkPars->DeltaROF) ? rof0 - trkPars->DeltaROF : 0; - int maxRof = (rof0 == maxRofs - trkPars->DeltaROF) ? rof0 : rof0 + trkPars->DeltaROF; // works with delta = {0, 1} - const float inverseR0{1.f / currentCluster.radius}; - - for (int iPrimaryVertex{0}; iPrimaryVertex < nVerticesRof0; iPrimaryVertex++) { - const auto& primaryVertex{vertices[nVertices[rof0] + iPrimaryVertex]}; - const float resolution{o2::gpu::GPUCommonMath::Sqrt(Sq(trkPars->PVres) / primaryVertex.getNContributors() + Sq(positionResolution))}; - const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0}; - const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate}; - const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; - const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution - const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * mSAngle))}; - - const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * trkPars->NSigmaCut, phiCut)}; - - if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { - continue; - } - int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; - if (phiBinsNum < 0) { - phiBinsNum += trkPars->PhiBins; - } - const int tableSize{phiBins * zBins + 1}; - for (int rof1{minRof}; rof1 <= maxRof; ++rof1) { - auto nClustersNext{roFrameClustersNextLayer[rof1 + 1] - roFrameClustersNextLayer[rof1]}; - if (!nClustersNext) { // number of clusters on next layer > 0 - continue; - } - for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { - int iPhiBin = (selectedBinsRect.y + iPhiCount) % trkPars->PhiBins; - const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; - const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; - const int firstRowClusterIndex = indexTablesNext[(rof1 - startRofId) * tableSize + firstBinIndex]; - const int maxRowClusterIndex = indexTablesNext[(rof1 - startRofId) * tableSize + maxBinIndex]; - for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { - if (iNextCluster >= nClustersNext) { - break; - } - auto nextClusterIndex{roFrameClustersNextLayer[rof1] - roFrameClustersNextLayer[startRofId] + iNextCluster}; - const Cluster& nextCluster{clustersNextLayer[nextClusterIndex]}; - if (usedClustersNextLayer[nextCluster.clusterId]) { - continue; - } - const float deltaPhi{o2::gpu::GPUCommonMath::Abs(currentCluster.phi - nextCluster.phi)}; - const float deltaZ{o2::gpu::GPUCommonMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)}; - - if ((deltaZ / sigmaZ < trkPars->NSigmaCut && (deltaPhi < phiCut || o2::gpu::GPUCommonMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut))) { - const float phi{o2::gpu::GPUCommonMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; - const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)}; - const unsigned int stride{currentClusterIndex * maxTrackletsPerCluster}; - if (storedTracklets < maxTrackletsPerCluster) { - new (trackletsRof0 + stride + storedTracklets) Tracklet{currentSortedIndexChunk, nextClusterIndex, tanL, phi, static_cast(rof0), static_cast(rof1)}; - } - // else { - // printf("its-gpu-tracklet-finder: on rof %d layer: %d: found more tracklets (%d) than maximum allowed per cluster. This is lossy!\n", rof0, layerIndex, storedTracklets); - // } - ++storedTracklets; - } - } - } - } - } - } - } -} - // Decrease LUT entries corresponding to duplicated tracklets. NB: duplicate tracklets are removed separately (see const Tracklets*). GPUg() void removeDuplicateTrackletsEntriesLUTKernel( int* trackletsLookUpTable, @@ -804,11 +805,16 @@ GPUg() void removeDuplicateTrackletsEntriesLUTKernel( template void computeTrackletsInRofsHandler(const int startROF, - const int endROF) + const int endROF, + const int vertexId, + const int nBlocks, + const int nThreads) { for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { - std::cout << "start: " << startROF << " end: " << endROF << std::endl; - // gpu::computeLayerTrackletsKernelMultipleRof(); + gpu::computeLayerTrackletsMultiROFKernel<<>>(iLayer, + startROF, + endROF - startROF, + vertexId); } } @@ -1041,5 +1047,10 @@ void trackSeedHandler(CellSeed* trackSeeds, gpuCheckError(cudaPeekAtLastError()); gpuCheckError(cudaDeviceSynchronize()); } -template void computeTrackletsInRofsHandler<7>(const int, const int); + +template void computeTrackletsInRofsHandler<7>(const int, // startROF + const int, // endROF + const int, // vertexId + const int, // nBlocks + const int); // nThreads; } // namespace o2::its \ No newline at end of file diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h index 906eb0fa5c21e..9f6355c19555d 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h @@ -209,8 +209,8 @@ class TimeFrame const unsigned long long& getRoadLabel(int i) const; bool isRoadFake(int i) const; - void setMultiplicityCutMask(const std::vector& cutMask) { mMultiplicityCutMask = cutMask; } - void setROFMask(const std::vector& rofMask) { mROFMask = rofMask; } + void setMultiplicityCutMask(const std::vector& cutMask) { mMultiplicityCutMask = cutMask; } + void setROFMask(const std::vector& rofMask) { mROFMask = rofMask; } void swapMasks() { mMultiplicityCutMask.swap(mROFMask); } int hasBogusClusters() const { return std::accumulate(mBogusClusters.begin(), mBogusClusters.end(), 0); } @@ -289,6 +289,7 @@ class TimeFrame std::vector> mTracks; std::vector> mCellsNeighbours; std::vector> mCellsLookupTable; + std::vector mMultiplicityCutMask; const o2::base::PropagatorImpl* mPropagatorDevice = nullptr; // Needed only for GPU protected: @@ -311,8 +312,8 @@ class TimeFrame std::vector mPhiCuts; std::vector mPositionResolution; std::vector mClusterSize; - std::vector mMultiplicityCutMask; - std::vector mROFMask; + + std::vector mROFMask; std::vector> mPValphaX; /// PV x and alpha for track propagation std::vector> mTrackletLabels; std::vector> mCellLabels; diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx index f00d87164d7d6..5b8a9bb1cb0f2 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx @@ -174,7 +174,7 @@ void ITSTrackingInterface::run(framework::ProcessingContext& pc) auto errorLogger = [&](std::string s) { LOG(error) << s; }; FastMultEst multEst; // mult estimator - std::vector processingMask, processUPCMask; + std::vector processingMask, processUPCMask; int cutVertexMult{0}, cutUPCVertex{0}, cutRandomMult = int(trackROFvec.size()) - multEst.selectROFs(trackROFvec, compClusters, physTriggers, processingMask); processUPCMask.resize(processingMask.size(), false); mTimeFrame->setMultiplicityCutMask(processingMask); diff --git a/Detectors/ITSMFT/ITS/workflow/src/CookedTrackerSpec.cxx b/Detectors/ITSMFT/ITS/workflow/src/CookedTrackerSpec.cxx index 01e649f982896..4a0470adcf07a 100644 --- a/Detectors/ITSMFT/ITS/workflow/src/CookedTrackerSpec.cxx +++ b/Detectors/ITSMFT/ITS/workflow/src/CookedTrackerSpec.cxx @@ -132,7 +132,7 @@ void CookedTrackerDPL::run(ProcessingContext& pc) const auto& multEstConf = FastMultEstConfig::Instance(); // parameters for mult estimation and cuts FastMultEst multEst; // mult estimator - std::vector processingMask; + std::vector processingMask; int cutVertexMult{0}, cutRandomMult = int(rofsinput.size()) - multEst.selectROFs(rofsinput, compClusters, physTriggers, processingMask); // auto processingMask_ephemeral = processingMask; From 3c3accfa30511a3c6b097defe20bb34b8369351a Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 5 Nov 2024 20:16:54 +0100 Subject: [PATCH 03/14] Add gpuSpan --- .../ITS/tracking/GPU/ITStrackingGPU/Utils.h | 18 ++++++++++++++++++ .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 18 +++++++++++++----- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h index 66244bf854b5f..ca90edc211d7c 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h @@ -29,6 +29,24 @@ struct gpuPair { T2 second; }; +template +struct gpuSpan { + GPUd() gpuSpan() : _data(nullptr), _size(0) {} + GPUd() gpuSpan(T* data, size_t size) : _data(data), _size(size) {} + GPUd() gpuSpan(const T* data, size_t size) : _data(data), _size(size) {} + GPUd() T& operator[](size_t idx) const { return _data[idx]; } + GPUd() size_t size() const { return _size; } + GPUd() bool empty() const { return _size == 0; } + GPUd() T& front() const { return _data[0]; } + GPUd() T& back() const { return _data[_size - 1]; } + GPUd() T* begin() const { return _data; } + GPUd() T* end() const { return _data + _size; } + + protected: + const T* _data; + size_t _size; +}; + namespace gpu { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 540cbce9c6d54..546da488a07b3 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -32,6 +32,7 @@ #include "ITStracking/IndexTableUtils.h" #include "ITStracking/MathUtils.h" #include "DataFormatsITS/TrackITS.h" +#include "ReconstructionDataFormats/Vertex.h" #include "ITStrackingGPU/TrackerTraitsGPU.h" #include "ITStrackingGPU/TrackingKernels.h" @@ -70,10 +71,9 @@ inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = } namespace o2::its - { using namespace constants::its2; - +using Vertex = o2::dataformats::Vertex>; namespace gpu { GPUd() bool fitTrack(TrackITSExt& track, @@ -196,9 +196,17 @@ struct is_valid_pair { } }; -GPUd() void getPrimaryVertices(const int rof, - const uint8_t* mask, - const Vertex* vertices); +GPUd() gpuSpan getPrimaryVertices(const int rof, + const int* roframesPV, + const int nRof, + const uint8_t* mask, + const Vertex* vertices) +{ + const int start = roframesPV[rof]; + const int stop_idx = rof >= nRof - 1 ? nRof : rof + 1; + size_t delta = mask[rof] ? roframesPV[stop_idx] - start : 0; // return empty span if Rof is excluded + return gpuSpan(&vertices[start], delta); +}; template GPUg() void fitTrackSeedsKernel( From 8eb99d79a174a0f6eb5ba4a9964feca4b78aee5b Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Fri, 15 Nov 2024 18:51:35 +0100 Subject: [PATCH 04/14] Debugging getSpan --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 17 +- .../GPU/ITStrackingGPU/TrackingKernels.h | 12 +- .../ITS/tracking/GPU/ITStrackingGPU/Utils.h | 49 +- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 38 +- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 15 +- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 451 +++++++----------- .../tracking/include/ITStracking/TimeFrame.h | 11 +- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 8 +- 8 files changed, 303 insertions(+), 298 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 19211112a3b0e..6b5d32dc1c17a 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -54,7 +54,10 @@ class TimeFrameGPU : public TimeFrame void loadTrackingFrameInfoDevice(const int); void loadUnsortedClustersDevice(const int); void loadClustersDevice(const int); + void loadROframeClustersDevice(const int iteration); void loadMultiplicityCutMask(const int); + void loadVertices(const int); + /// void loadTrackletsDevice(); void loadTrackletsLUTDevice(); @@ -95,7 +98,7 @@ class TimeFrameGPU : public TimeFrame std::vector>& getLabelsInChunks() { return mLabelsInChunks; } int getNAllocatedROFs() const { return mNrof; } // Allocated means maximum nROF for each chunk while populated is the number of loaded ones. StaticTrackingParameters* getDeviceTrackingParameters() { return mTrackingParamsDevice; } - Vertex* getDeviceVertices() { return mVerticesDevice; } + Vertex* getDeviceVertices() { return mPrimaryVerticesDevice; } int* getDeviceROFramesPV() { return mROFramesPVDevice; } unsigned char* getDeviceUsedClusters(const int); const o2::base::Propagator* getChainPropagator(); @@ -109,6 +112,7 @@ class TimeFrameGPU : public TimeFrame const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; } const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; } const Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; } + const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; } const Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; } const int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; } @@ -118,7 +122,7 @@ class TimeFrameGPU : public TimeFrame o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; } float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; } int* getDeviceNeighboursIndexTables(const int layer) { return mNeighboursIndexTablesDevice[layer]; } - bool* getDevicemMultMask() { return mMultMaskDevice; } + uint8_t* getDeviceMultCutMask() { return mMultMaskDevice; } void setDevicePropagator(const o2::base::PropagatorImpl*) override; @@ -144,17 +148,18 @@ class TimeFrameGPU : public TimeFrame // Device pointers StaticTrackingParameters* mTrackingParamsDevice; IndexTableUtils* mIndexTableUtilsDevice; - std::array mROFramesClustersDevice; std::array mUsedClustersDevice; - Vertex* mVerticesDevice; - int* mROFramesPVDevice; // Hybrid pref - bool* mMultMaskDevice; + uint8_t* mMultMaskDevice; + Vertex* mPrimaryVerticesDevice; + int* mROFramesPVDevice; std::array mClustersDevice; std::array mUnsortedClustersDevice; + std::array mROFramesClustersDevice; const Cluster** mClustersDeviceArray; const Cluster** mUnsortedClustersDeviceArray; + const int** mROFrameClustersDeviceArray; std::array mTrackletsDevice; const Tracklet** mTrackletsDeviceArray; const int** mTrackletsLUTDeviceArray; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 24fb531fa3e1d..5b70d571b9b52 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -51,9 +51,19 @@ GPUg() void fitTrackSeedsKernel( } // namespace gpu template -void computeTrackletsInRofsHandler(const int startROF, +void computeTrackletsInRofsHandler(const uint8_t* multMask, + const int startROF, const int endROF, + const int maxROF, + const int deltaROF, const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + const int** ROFClusters, + std::vector& radii, + std::vector& mulScatAng, const int nBlocks, const int nThreads); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h index ca90edc211d7c..cc45e24a8cbdb 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h @@ -29,26 +29,51 @@ struct gpuPair { T2 second; }; +namespace gpu +{ +// Poor man implementation of a span-like struct. It is very limited. template struct gpuSpan { + using value_type = T; + using ptr = T*; + using ref = T&; + GPUd() gpuSpan() : _data(nullptr), _size(0) {} - GPUd() gpuSpan(T* data, size_t size) : _data(data), _size(size) {} - GPUd() gpuSpan(const T* data, size_t size) : _data(data), _size(size) {} - GPUd() T& operator[](size_t idx) const { return _data[idx]; } - GPUd() size_t size() const { return _size; } + GPUd() gpuSpan(ptr data, std::size_t dim) : _data(data), _size(dim) {} + GPUd() ref operator[](std::size_t idx) const { return _data[idx]; } + GPUd() std::size_t size() const { return _size; } GPUd() bool empty() const { return _size == 0; } - GPUd() T& front() const { return _data[0]; } - GPUd() T& back() const { return _data[_size - 1]; } - GPUd() T* begin() const { return _data; } - GPUd() T* end() const { return _data + _size; } + GPUd() ref front() const { return _data[0]; } + GPUd() ref back() const { return _data[_size - 1]; } + GPUd() ptr begin() const { return _data; } + GPUd() ptr end() const { return _data + _size; } protected: - const T* _data; - size_t _size; + ptr _data; + std::size_t _size; }; -namespace gpu -{ +template +struct gpuSpan { + using value_type = T; + using ptr = const T*; + using ref = const T&; + + GPUd() gpuSpan() : _data(nullptr), _size(0) {} + GPUd() gpuSpan(ptr data, std::size_t dim) : _data(data), _size(dim) {} + GPUd() gpuSpan(const gpuSpan& other) : _data(other._data), _size(other._size) {} + GPUd() ref operator[](std::size_t idx) const { return _data[idx]; } + GPUd() std::size_t size() const { return _size; } + GPUd() bool empty() const { return _size == 0; } + GPUd() ref front() const { return _data[0]; } + GPUd() ref back() const { return _data[_size - 1]; } + GPUd() ptr begin() const { return _data; } + GPUd() ptr end() const { return _data + _size; } + + protected: + ptr _data; + std::size_t _size; +}; enum class Task { Tracker = 0, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 1d58c491e2adb..0db970b2361ab 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -128,6 +128,22 @@ void TimeFrameGPU::loadClustersDevice(const int iteration) } } +template +void TimeFrameGPU::loadROframeClustersDevice(const int iteration) +{ + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading ROframe clusters"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(info, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[iLayer]), mROFramesClusters[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], mROFramesClusters[iLayer].data(), mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); + } +} + template void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) { @@ -149,14 +165,30 @@ void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) template void TimeFrameGPU::loadMultiplicityCutMask(const int iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask"); if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask"); LOGP(info, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB); - allocMemAsync(reinterpret_cast(&mMultMaskDevice), mMultiplicityCutMask.size() * sizeof(bool), nullptr, getExtAllocator()); + allocMemAsync(reinterpret_cast(&mMultMaskDevice), mMultiplicityCutMask.size() * sizeof(uint8_t), nullptr, getExtAllocator()); checkGPUError(cudaMemcpyAsync(mMultMaskDevice, mMultiplicityCutMask.data(), mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } - STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } + +template +void TimeFrameGPU::loadVertices(const int iteration) +{ + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices"); + LOGP(info, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mROFramesPVDevice), mROFramesPV.size() * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mROFramesPVDevice, mROFramesPV.data(), mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + LOGP(info, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB); + allocMemAsync(reinterpret_cast(&mPrimaryVerticesDevice), mPrimaryVertices.size() * sizeof(Vertex), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mPrimaryVerticesDevice, mPrimaryVertices.data(), mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); + } +} + template void TimeFrameGPU::loadTrackletsDevice() { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index eec86573b4090..8fb3f628cacc0 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -33,6 +33,8 @@ void TrackerTraitsGPU::initialiseTimeFrame(const int iteration) mTimeFrameGPU->loadUnsortedClustersDevice(iteration); mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration); mTimeFrameGPU->loadMultiplicityCutMask(iteration); + mTimeFrameGPU->loadVertices(iteration); + mTimeFrameGPU->loadROframeClustersDevice(iteration); } template @@ -92,9 +94,20 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int gsl::span diamondSpan(&diamondVert, 1); int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0}; int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()}; - computeTrackletsInRofsHandler(startROF, + + computeTrackletsInRofsHandler(mTimeFrameGPU->getDeviceMultCutMask(), + startROF, endROF, + mTimeFrameGPU->getNrof(), + mTrkParams[iteration].DeltaROF, iVertex, + mTimeFrameGPU->getDeviceVertices(), + mTimeFrameGPU->getDeviceROFramesPV(), + mTimeFrameGPU->getPrimaryVerticesNum(), + mTimeFrameGPU->getDeviceArrayClusters(), + mTimeFrameGPU->getDeviceROframeClusters(), + mTrkParams[iteration].LayerRadii, + mTimeFrameGPU->getMSangles(), conf.nBlocks, conf.nThreads); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 546da488a07b3..5dedb5fd754ad 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -76,6 +76,7 @@ using namespace constants::its2; using Vertex = o2::dataformats::Vertex>; namespace gpu { + GPUd() bool fitTrack(TrackITSExt& track, int start, int end, @@ -127,7 +128,7 @@ GPUd() bool fitTrack(TrackITSExt& track, } nCl++; } - return o2::gpu::GPUCommonMath::Abs(track.getQ2Pt()) < maxQoverPt && track.getChi2() < chi2ndfcut * (nCl * 2 - 5); + return o2::gpu::CAMath::Abs(track.getQ2Pt()) < maxQoverPt && track.getChi2() < chi2ndfcut * (nCl * 2 - 5); } GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, @@ -146,7 +147,7 @@ GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, const float y3 = tf3.positionTrackingFrame[0]; const float z3 = tf3.positionTrackingFrame[1]; - const bool zeroField{o2::gpu::GPUCommonMath::Abs(bz) < o2::constants::math::Almost0}; + const bool zeroField{o2::gpu::CAMath::Abs(bz) < o2::constants::math::Almost0}; const float tgp = zeroField ? o2::gpu::CAMath::ATan2(y3 - y1, x3 - x1) : 1.f; const float crv = zeroField ? 1.f : math_utils::computeCurvature(x3, y3, x2, y2, x1, y1); const float snp = zeroField ? tgp / o2::gpu::CAMath::Sqrt(1.f + tgp * tgp) : crv * (x3 - math_utils::computeCurvatureCentreX(x3, y3, x2, y2, x1, y1)); @@ -196,18 +197,35 @@ struct is_valid_pair { } }; -GPUd() gpuSpan getPrimaryVertices(const int rof, - const int* roframesPV, - const int nRof, - const uint8_t* mask, - const Vertex* vertices) +GPUd() gpuSpan getPrimaryVertices(const int rof, + const int* roframesPV, + const int nRof, + const uint8_t* mask, + const Vertex* vertices) { - const int start = roframesPV[rof]; - const int stop_idx = rof >= nRof - 1 ? nRof : rof + 1; - size_t delta = mask[rof] ? roframesPV[stop_idx] - start : 0; // return empty span if Rof is excluded - return gpuSpan(&vertices[start], delta); + const int start_pv_id = roframesPV[rof]; + const int stop_rof = rof >= nRof - 1 ? nRof : rof + 1; + size_t delta = mask[rof] ? roframesPV[stop_rof] - start_pv_id : 0; // return empty span if Rof is excluded + return gpuSpan(&vertices[start_pv_id], delta); }; +GPUd() gpuSpan getClustersOnLayer(const int rof, + const int** roframesClus, + const int layer, + const int nRof, + const Cluster** clusters) +{ + const int start_clus_id{roframesClus[layer][rof]}; + const int stop_rof = rof >= nRof - 1 ? nRof : rof + 1; + const int delta = roframesClus[layer][stop_rof] - start_clus_id; + printf("\t\t\t r: %d nr: %d rfci: %d d: %d\n", rof, nRof, roframesClus[layer][rof], delta); + if (rof < 0 || rof >= nRof) { + return gpuSpan(); + } + + return gpuSpan(&(clusters[layer][start_clus_id]), delta); +} + template GPUg() void fitTrackSeedsKernel( CellSeed* trackSeeds, @@ -354,7 +372,7 @@ GPUg() void computeLayerCellsKernel( break; } const Tracklet& nextTracklet = tracklets[layer + 1][iNextTrackletIndex]; - const float deltaTanLambda{o2::gpu::GPUCommonMath::Abs(currentTracklet.tanLambda - nextTracklet.tanLambda)}; + const float deltaTanLambda{o2::gpu::CAMath::Abs(currentTracklet.tanLambda - nextTracklet.tanLambda)}; if (deltaTanLambda / cellDeltaTanLambdaSigma < nSigmaCut) { const int clusId[3]{ @@ -408,112 +426,125 @@ GPUg() void computeLayerCellsKernel( template GPUg() void computeLayerTrackletsMultiROFKernel( + const uint8_t* multMask, const int layerIndex, - const int startRofId, - const int rofSize, + const int startROF, + const int endROF, + const int maxRof, + const int deltaRof, + const Vertex* vertices, + const int* rofPV, + const int nVertices, const int vertexId, - // const int maxRofs, - // const Cluster* clustersCurrentLayer, // input data rof0 - // const Cluster* clustersNextLayer, // input data rof0-delta getNphiBins()}; - const int zBins{utils->getNzBins()}; - for (unsigned int iRof{blockIdx.x}; iRof < rofSize; iRof += gridDim.x) { - auto rof0 = iRof + startRofId; - auto nClustersCurrentLayerRof = o2::gpu::GPUCommonMath::Min(roFrameClustersCurrentLayer[rof0 + 1] - roFrameClustersCurrentLayer[rof0], (int)maxClustersPerRof); - // if (nClustersCurrentLayerRof > maxClustersPerRof) { - // printf("its-gpu-tracklet finder: on layer %d found more clusters per ROF (%d) than maximum set (%d), check the configuration!\n", layerIndex, nClustersCurrentLayerRof, maxClustersPerRof); - // } - auto* clustersCurrentLayerRof = clustersCurrentLayer + (roFrameClustersCurrentLayer[rof0] - roFrameClustersCurrentLayer[startRofId]); - auto nVerticesRof0 = nVertices[rof0 + 1] - nVertices[rof0]; - auto trackletsRof0 = tracklets + maxTrackletsPerCluster * maxClustersPerRof * iRof; - for (int currentClusterIndex = threadIdx.x; currentClusterIndex < nClustersCurrentLayerRof; currentClusterIndex += blockDim.x) { + // const int phiBins{utils->getNphiBins()}; + // const int zBins{utils->getNzBins()}; + for (unsigned int iRof{blockIdx.x}; iRof < endROF - startROF; iRof += gridDim.x) { + auto rof0 = iRof + startROF; + auto primaryVertices = getPrimaryVertices(rof0, rofPV, maxRof, multMask, vertices); + const auto startVtx{vertexId >= 0 ? vertexId : 0}; + const auto endVtx{vertexId >= 0 ? o2::gpu::CAMath::Min(vertexId + 1, static_cast(primaryVertices.size())) : static_cast(primaryVertices.size())}; + auto minRof = o2::gpu::CAMath::Max(startROF, static_cast(rof0 - deltaRof)); + auto maxRof = o2::gpu::CAMath::Min(endROF - 1, static_cast(rof0 + deltaRof)); + auto clustersCurrentLayer = getClustersOnLayer(iRof, ROFClusters, layerIndex, maxRof, clusters); + if (threadIdx.x == 0) { + // printf("> l: %d r: %d rc: %d s: %d e: %d \n", layerIndex, iRof, ROFClusters[layerIndex][iRof], clustersCurrentLayer.size(), clustersCurrentLayer.empty()); + } + if (clustersCurrentLayer.empty()) { + continue; + } + + for (int currentClusterIndex = threadIdx.x; currentClusterIndex < clustersCurrentLayer.size(); currentClusterIndex += blockDim.x) { unsigned int storedTracklets{0}; - const Cluster& currentCluster{clustersCurrentLayerRof[currentClusterIndex]}; - const int currentSortedIndex{roFrameClustersCurrentLayer[rof0] + currentClusterIndex}; - const int currentSortedIndexChunk{currentSortedIndex - roFrameClustersCurrentLayer[startRofId]}; - if (usedClustersLayer[currentSortedIndex]) { - continue; + auto currentCluster{clustersCurrentLayer[currentClusterIndex]}; + if (threadIdx.x == 0) { + printf("rof: %d has %zu clusters on layer %d\n", rof0, clustersCurrentLayer.size(), layerIndex); } - int minRof = (rof0 >= trkPars->DeltaROF) ? rof0 - trkPars->DeltaROF : 0; - int maxRof = (rof0 == maxRofs - trkPars->DeltaROF) ? rof0 : rof0 + trkPars->DeltaROF; // works with delta = {0, 1} - const float inverseR0{1.f / currentCluster.radius}; - - for (int iPrimaryVertex{0}; iPrimaryVertex < nVerticesRof0; iPrimaryVertex++) { - const auto& primaryVertex{vertices[nVertices[rof0] + iPrimaryVertex]}; - const float resolution{o2::gpu::GPUCommonMath::Sqrt(Sq(trkPars->PVres) / primaryVertex.getNContributors() + Sq(positionResolution))}; - const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0}; - const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate}; - const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; - const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution - const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * mSAngle))}; - - const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * trkPars->NSigmaCut, phiCut)}; - - if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { - continue; - } - int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; - if (phiBinsNum < 0) { - phiBinsNum += trkPars->PhiBins; - } - const int tableSize{phiBins * zBins + 1}; - for (int rof1{minRof}; rof1 <= maxRof; ++rof1) { - auto nClustersNext{roFrameClustersNextLayer[rof1 + 1] - roFrameClustersNextLayer[rof1]}; - if (!nClustersNext) { // number of clusters on next layer > 0 - continue; - } - for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { - int iPhiBin = (selectedBinsRect.y + iPhiCount) % trkPars->PhiBins; - const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; - const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; - const int firstRowClusterIndex = indexTablesNext[(rof1 - startRofId) * tableSize + firstBinIndex]; - const int maxRowClusterIndex = indexTablesNext[(rof1 - startRofId) * tableSize + maxBinIndex]; - for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { - if (iNextCluster >= nClustersNext) { - break; - } - auto nextClusterIndex{roFrameClustersNextLayer[rof1] - roFrameClustersNextLayer[startRofId] + iNextCluster}; - const Cluster& nextCluster{clustersNextLayer[nextClusterIndex]}; - if (usedClustersNextLayer[nextCluster.clusterId]) { - continue; - } - const float deltaPhi{o2::gpu::GPUCommonMath::Abs(currentCluster.phi - nextCluster.phi)}; - const float deltaZ{o2::gpu::GPUCommonMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)}; - - if ((deltaZ / sigmaZ < trkPars->NSigmaCut && (deltaPhi < phiCut || o2::gpu::GPUCommonMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut))) { - const float phi{o2::gpu::GPUCommonMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; - const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)}; - const unsigned int stride{currentClusterIndex * maxTrackletsPerCluster}; - if (storedTracklets < maxTrackletsPerCluster) { - new (trackletsRof0 + stride + storedTracklets) Tracklet{currentSortedIndexChunk, nextClusterIndex, tanL, phi, static_cast(rof0), static_cast(rof1)}; - } - // else { - // printf("its-gpu-tracklet-finder: on rof %d layer: %d: found more tracklets (%d) than maximum allowed per cluster. This is lossy!\n", rof0, layerIndex, storedTracklets); - // } - ++storedTracklets; - } - } - } - } - } + // const int currentSortedIndex{roFrameClustersCurrentLayer[rof0] + currentClusterIndex}; + // const int currentSortedIndexChunk{currentSortedIndex - roFrameClustersCurrentLayer[startROF]}; + // if (usedClustersLayer[currentSortedIndex]) { + // continue; + // } + // + // int minRof = (rof0 >= trkPars->DeltaROF) ? rof0 - trkPars->DeltaROF : 0; + // int maxRof = (rof0 == maxRofs - trkPars->DeltaROF) ? rof0 : rof0 + trkPars->DeltaROF; // works with delta = {0, 1} + // const float inverseR0{1.f / currentCluster.radius}; + // + // for (int iPrimaryVertex{0}; iPrimaryVertex < nVerticesRof0; iPrimaryVertex++) { + // const auto& primaryVertex{vertices[nVertices[rof0] + iPrimaryVertex]}; + // const float resolution{Sqrt(Sq(trkPars->PVres) / primaryVertex.getNContributors() + Sq(positionResolution))}; + // const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0}; + // const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate}; + // const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; + // const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution + // const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * mSAngle))}; + // + // const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * trkPars->NSigmaCut, phiCut)}; + // + // if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { + // continue; + // } + // int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; + // if (phiBinsNum < 0) { + // phiBinsNum += trkPars->PhiBins; + // } + // const int tableSize{phiBins * zBins + 1}; + // for (int rof1{minRof}; rof1 <= maxRof; ++rof1) { + // auto nClustersNext{roFrameClustersNextLayer[rof1 + 1] - roFrameClustersNextLayer[rof1]}; + // if (!nClustersNext) { // number of clusters on next layer > 0 + // continue; + // } + // for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { + // int iPhiBin = (selectedBinsRect.y + iPhiCount) % trkPars->PhiBins; + // const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; + // const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; + // const int firstRowClusterIndex = indexTablesNext[(rof1 - startROF) * tableSize + firstBinIndex]; + // const int maxRowClusterIndex = indexTablesNext[(rof1 - startROF) * tableSize + maxBinIndex]; + // for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { + // if (iNextCluster >= nClustersNext) { + // break; + // } + // auto nextClusterIndex{roFrameClustersNextLayer[rof1] - roFrameClustersNextLayer[startROF] + iNextCluster}; + // const Cluster& nextCluster{clustersNextLayer[nextClusterIndex]}; + // if (usedClustersNextLayer[nextCluster.clusterId]) { + // continue; + // } + // const float deltaPhi{o2::gpu::CAMath::Abs(currentCluster.phi - nextCluster.phi)}; + // const float deltaZ{o2::gpu::CAMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)}; + // + // if ((deltaZ / sigmaZ < trkPars->NSigmaCut && (deltaPhi < phiCut || o2::gpu::CAMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut))) { + // const float phi{ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; + // const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)}; + // const unsigned int stride{currentClusterIndex * maxTrackletsPerCluster}; + // if (storedTracklets < maxTrackletsPerCluster) { + // new (trackletsRof0 + stride + storedTracklets) Tracklet{currentSortedIndexChunk, nextClusterIndex, tanL, phi, static_cast(rof0), static_cast(rof1)}; + // } + // else { + // printf("its-gpu-tracklet-finder: on rof %d layer: %d: found more tracklets (%d) than maximum allowed per cluster. This is lossy!\n", rof0, layerIndex, storedTracklets); + // } + // ++storedTracklets; + // } + // } + // } + // } + // } + // */ } - }*/ + } } ///////////////////////////////////////// @@ -523,9 +554,9 @@ GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerInde const o2::its::IndexTableUtils& utils, const float z1, const float z2, float maxdeltaz, float maxdeltaphi) { - const float zRangeMin = o2::gpu::GPUCommonMath::Min(z1, z2) - maxdeltaz; + const float zRangeMin = o2::gpu::CAMath::Min(z1, z2) - maxdeltaz; const float phiRangeMin = currentCluster.phi - maxdeltaphi; - const float zRangeMax = o2::gpu::GPUCommonMath::Max(z1, z2) + maxdeltaz; + const float zRangeMax = o2::gpu::CAMath::Max(z1, z2) + maxdeltaz; const float phiRangeMax = currentCluster.phi + maxdeltaphi; if (zRangeMax < -LayersZCoordinate()[layerIndex + 1] || @@ -534,17 +565,12 @@ GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerInde return getEmptyBinsRect(); } - return int4{o2::gpu::GPUCommonMath::Max(0, utils.getZBinIndex(layerIndex + 1, zRangeMin)), + return int4{o2::gpu::CAMath::Max(0, utils.getZBinIndex(layerIndex + 1, zRangeMin)), utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)), - o2::gpu::GPUCommonMath::Min(ZBins - 1, utils.getZBinIndex(layerIndex + 1, zRangeMax)), + o2::gpu::CAMath::Min(ZBins - 1, utils.getZBinIndex(layerIndex + 1, zRangeMax)), utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))}; } -GPUhd() float Sq(float q) -{ - return q * q; -} - template GPUd() void pPointer(T* ptr) { @@ -559,7 +585,6 @@ GPUg() void printPointersKernel(std::tuple args) std::apply(print_all, args); } -// Functors to sort tracklets template struct trackletSortEmptyFunctor : public thrust::binary_function { GPUhd() bool operator()(const T& lhs, const T& rhs) const @@ -576,7 +601,6 @@ struct trackletSortIndexFunctor : public thrust::binary_function { } }; -// Print layer buffer GPUg() void printBufferLayerOnThread(const int layer, const int* v, unsigned int size, const int len = 150, const unsigned int tId = 0) { if (blockIdx.x * blockDim.x + threadIdx.x == tId) { @@ -590,7 +614,7 @@ GPUg() void printBufferLayerOnThread(const int layer, const int* v, unsigned int } } -GPUg() void printMatrixRow(const int row, int** mat, const unsigned int rowLength, const int len = 150, const unsigned int tId = 0) +GPUg() void printMatrixRow(const int row, const int** mat, const unsigned int rowLength, const int len = 150, const unsigned int tId = 0) { if (blockIdx.x * blockDim.x + threadIdx.x == tId) { for (int i{0}; i < rowLength; ++i) { @@ -616,52 +640,12 @@ GPUg() void printBufferPointersLayerOnThread(const int layer, void** v, unsigned } } -// Dump vertices GPUg() void printVertices(const Vertex* v, unsigned int size, const unsigned int tId = 0) { if (blockIdx.x * blockDim.x + threadIdx.x == tId) { - printf("vertices: "); + printf("vertices: \n"); for (int i{0}; i < size; ++i) { - printf("x=%f y=%f z=%f\n", v[i].getX(), v[i].getY(), v[i].getZ()); - } - } -} - -// Dump tracklets -GPUg() void printTracklets(const Tracklet* t, - const int offset, - const int startRof, - const int nrof, - const int* roFrameClustersCurrentLayer, // Number of clusters on layer 0 per ROF - const int* roFrameClustersNextLayer, // Number of clusters on layer 1 per ROF - const int maxClustersPerRof = 5e2, - const int maxTrackletsPerCluster = 50, - const unsigned int tId = 0) -{ - if (threadIdx.x == tId) { - auto offsetCurrent{roFrameClustersCurrentLayer[offset]}; - auto offsetNext{roFrameClustersNextLayer[offset]}; - auto offsetChunk{(startRof - offset) * maxClustersPerRof * maxTrackletsPerCluster}; - for (int i{offsetChunk}; i < offsetChunk + nrof * maxClustersPerRof * maxTrackletsPerCluster; ++i) { - if (t[i].firstClusterIndex != -1) { - t[i].dump(offsetCurrent, offsetNext); - } - } - } -} - -GPUg() void printTrackletsNotStrided(const Tracklet* t, - const int offset, - const int* roFrameClustersCurrentLayer, // Number of clusters on layer 0 per ROF - const int* roFrameClustersNextLayer, // Number of clusters on layer 1 per ROF - const int ntracklets, - const unsigned int tId = 0) -{ - if (threadIdx.x == tId) { - auto offsetCurrent{roFrameClustersCurrentLayer[offset]}; - auto offsetNext{roFrameClustersNextLayer[offset]}; - for (int i{0}; i < ntracklets; ++i) { - t[i].dump(offsetCurrent, offsetNext); + printf("\tx=%f y=%f z=%f\n", v[i].getX(), v[i].getY(), v[i].getZ()); } } } @@ -678,105 +662,6 @@ GPUg() void printNeighbours(const gpuPair* neighbours, } } -// Compute the tracklets for a given layer -template -GPUg() void computeLayerTrackletsKernelSingleRof( - const short rof0, - const short maxRofs, - const int layerIndex, - const Cluster* clustersCurrentLayer, // input data rof0 - const Cluster* clustersNextLayer, // input data rof0-delta * trkPars, - const IndexTableUtils* utils, - const unsigned int maxTrackletsPerCluster = 50) -{ - for (int currentClusterIndex = blockIdx.x * blockDim.x + threadIdx.x; currentClusterIndex < currentLayerClustersSize; currentClusterIndex += blockDim.x * gridDim.x) { - unsigned int storedTracklets{0}; - const Cluster& currentCluster{clustersCurrentLayer[currentClusterIndex]}; - const int currentSortedIndex{roFrameClusters[rof0] + currentClusterIndex}; - if (usedClustersLayer[currentSortedIndex]) { - continue; - } - short minRof = (rof0 >= trkPars->DeltaROF) ? rof0 - trkPars->DeltaROF : 0; - short maxRof = (rof0 == static_cast(maxRofs - trkPars->DeltaROF)) ? rof0 : rof0 + trkPars->DeltaROF; - const float inverseR0{1.f / currentCluster.radius}; - for (int iPrimaryVertex{0}; iPrimaryVertex < nVertices; iPrimaryVertex++) { - const auto& primaryVertex{vertices[iPrimaryVertex]}; - if (primaryVertex.getX() == 0.f && primaryVertex.getY() == 0.f && primaryVertex.getZ() == 0.f) { - continue; - } - const float resolution{o2::gpu::GPUCommonMath::Sqrt(Sq(trkPars->PVres) / primaryVertex.getNContributors() + Sq(positionResolution))}; - const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0}; - const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate}; - const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; - const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution - const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * mSAngle))}; - - const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * trkPars->NSigmaCut, phiCut)}; - if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { - continue; - } - int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; - if (phiBinsNum < 0) { - phiBinsNum += trkPars->PhiBins; - } - constexpr int tableSize{256 * 128 + 1}; // hardcoded for the time being - - for (short rof1{minRof}; rof1 <= maxRof; ++rof1) { - if (!(roFrameClustersNext[rof1 + 1] - roFrameClustersNext[rof1])) { // number of clusters on next layer > 0 - continue; - } - for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { - int iPhiBin = (selectedBinsRect.y + iPhiCount) % trkPars->PhiBins; - const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; - const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; - const int firstRowClusterIndex = indexTable[rof1 * tableSize + firstBinIndex]; - const int maxRowClusterIndex = indexTable[rof1 * tableSize + maxBinIndex]; - for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { - if (iNextCluster >= (roFrameClustersNext[rof1 + 1] - roFrameClustersNext[rof1])) { - break; - } - const Cluster& nextCluster{getPtrFromRuler(rof1, clustersNextLayer, roFrameClustersNext)[iNextCluster]}; - if (usedClustersNextLayer[nextCluster.clusterId]) { - continue; - } - const float deltaPhi{o2::gpu::GPUCommonMath::Abs(currentCluster.phi - nextCluster.phi)}; - const float deltaZ{o2::gpu::GPUCommonMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)}; - - if (deltaZ / sigmaZ < trkPars->NSigmaCut && (deltaPhi < phiCut || o2::gpu::GPUCommonMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut)) { - trackletsLookUpTable[currentSortedIndex]++; // Race-condition safe - const float phi{o2::gpu::GPUCommonMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; - const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)}; - const unsigned int stride{currentClusterIndex * maxTrackletsPerCluster}; - new (tracklets + stride + storedTracklets) Tracklet{currentSortedIndex, roFrameClustersNext[rof1] + iNextCluster, tanL, phi, rof0, rof1}; - ++storedTracklets; - } - } - } - } - } - // if (storedTracklets > maxTrackletsPerCluster) { - // printf("its-gpu-tracklet finder: found more tracklets per clusters (%d) than maximum set (%d), check the configuration!\n", maxTrackletsPerCluster, storedTracklets); - // } - } -} - template GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, int* trackletsLookUpTable, @@ -812,17 +697,41 @@ GPUg() void removeDuplicateTrackletsEntriesLUTKernel( } // namespace gpu template -void computeTrackletsInRofsHandler(const int startROF, +void computeTrackletsInRofsHandler(const uint8_t* multMask, + const int startROF, const int endROF, + const int maxROF, + const int deltaROF, const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + const int** ROFClusters, + std::vector& radii, + std::vector& mulScatAng, const int nBlocks, const int nThreads) { for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { - gpu::computeLayerTrackletsMultiROFKernel<<>>(iLayer, - startROF, - endROF - startROF, - vertexId); + const auto meanDeltaR = radii[iLayer + 1] - radii[iLayer]; + const auto mSAngle = mulScatAng[iLayer]; + // gpu::printMatrixRow<<<1, 1>>>(iLayer, ROFClusters, maxROF); + gpu::computeLayerTrackletsMultiROFKernel<<<1, 1>>>( + multMask, + iLayer, + startROF, + endROF, + maxROF, + deltaROF, + vertices, + rofPV, + nVertices, + vertexId, + clusters, + ROFClusters, + meanDeltaR, + mSAngle); } } @@ -986,8 +895,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, const int nThreads) { - gpu::computeLayerCellNeighboursKernel<<>>( + gpu::computeLayerCellNeighboursKernel<<>>( cellsLayersDevice, neighboursLUT, neighboursIndexTable, @@ -1056,9 +965,19 @@ void trackSeedHandler(CellSeed* trackSeeds, gpuCheckError(cudaDeviceSynchronize()); } -template void computeTrackletsInRofsHandler<7>(const int, // startROF - const int, // endROF - const int, // vertexId - const int, // nBlocks - const int); // nThreads; +template void computeTrackletsInRofsHandler<7>(const uint8_t* multMask, + const int startROF, + const int endROF, + const int maxROF, + const int deltaROF, + const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + const int** ROFClusters, + std::vector& radii, + std::vector& mulScatAng, + const int nBlocks, + const int nThreads); } // namespace o2::its \ No newline at end of file diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h index 9f6355c19555d..309ca2031a9b5 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h @@ -110,6 +110,7 @@ class TimeFrame float getMinR(int layer) const { return mMinR[layer]; } float getMaxR(int layer) const { return mMaxR[layer]; } float getMSangle(int layer) const { return mMSangles[layer]; } + std::vector& getMSangles() { return mMSangles; } float getPhiCut(int layer) const { return mPhiCuts[layer]; } float getPositionResolution(int layer) const { return mPositionResolution[layer]; } @@ -440,33 +441,33 @@ inline gsl::span TimeFrame::getClustersPerROFrange(int rofMin, in return gsl::span(); } int startIdx{mROFramesClusters[layerId][rofMin]}; // First cluster of rofMin - int endIdx{mROFramesClusters[layerId][std::min(rofMin + range, mNrof)]}; + int endIdx{mROFramesClusters[layerId][o2::gpu::CAMath::Min(rofMin + range, mNrof)]}; return {&mClusters[layerId][startIdx], static_cast::size_type>(endIdx - startIdx)}; } inline gsl::span TimeFrame::getROFramesClustersPerROFrange(int rofMin, int range, int layerId) const { - int chkdRange{std::min(range, mNrof - rofMin)}; + int chkdRange{o2::gpu::CAMath::Min(range, mNrof - rofMin)}; return {&mROFramesClusters[layerId][rofMin], static_cast::size_type>(chkdRange)}; } inline gsl::span TimeFrame::getNClustersROFrange(int rofMin, int range, int layerId) const { - int chkdRange{std::min(range, mNrof - rofMin)}; + int chkdRange{o2::gpu::CAMath::Min(range, mNrof - rofMin)}; return {&mNClustersPerROF[layerId][rofMin], static_cast::size_type>(chkdRange)}; } inline int TimeFrame::getTotalClustersPerROFrange(int rofMin, int range, int layerId) const { int startIdx{rofMin}; // First cluster of rofMin - int endIdx{std::min(rofMin + range, mNrof)}; + int endIdx{o2::gpu::CAMath::Min(rofMin + range, mNrof)}; return mROFramesClusters[layerId][endIdx] - mROFramesClusters[layerId][startIdx]; } inline gsl::span TimeFrame::getIndexTablePerROFrange(int rofMin, int range, int layerId) const { const int iTableSize{mIndexTableUtils.getNphiBins() * mIndexTableUtils.getNzBins() + 1}; - int chkdRange{std::min(range, mNrof - rofMin)}; + int chkdRange{o2::gpu::CAMath::Min(range, mNrof - rofMin)}; return {&mIndexTables[layerId][rofMin * iTableSize], static_cast::size_type>(chkdRange * iTableSize)}; } diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index da0abbae9dc1f..be43bcec2ac89 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -75,9 +75,9 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in for (int rof0{startROF}; rof0 < endROF; ++rof0) { gsl::span primaryVertices = mTrkParams[iteration].UseDiamond ? diamondSpan : tf->getPrimaryVertices(rof0); const int startVtx{iVertex >= 0 ? iVertex : 0}; - const int endVtx{iVertex >= 0 ? std::min(iVertex + 1, static_cast(primaryVertices.size())) : static_cast(primaryVertices.size())}; - int minRof = std::max(startROF, rof0 - mTrkParams[iteration].DeltaROF); - int maxRof = std::min(endROF - 1, rof0 + mTrkParams[iteration].DeltaROF); + const int endVtx{iVertex >= 0 ? o2::gpu::CAMath::Min(iVertex + 1, static_cast(primaryVertices.size())) : static_cast(primaryVertices.size())}; + int minRof = o2::gpu::CAMath::Max(startROF, rof0 - mTrkParams[iteration].DeltaROF); + int maxRof = o2::gpu::CAMath::Min(endROF - 1, rof0 + mTrkParams[iteration].DeltaROF); #pragma omp parallel for num_threads(mNThreads) for (int iLayer = 0; iLayer < mTrkParams[iteration].TrackletsPerRoad(); ++iLayer) { gsl::span layer0 = tf->getClustersOnLayer(rof0, iLayer); @@ -668,7 +668,7 @@ void TrackerTraits::findRoads(const int iteration) if (rofs[1] != INT_MAX) { track.setNextROFbit(); } - mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track); + mTimeFrame->getTracks(o2::gpu::CAMath::Min(rofs[0], rofs[1])).emplace_back(track); } } } From e24bc7e637800a093180e7040df06753211a5fb9 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 20 Nov 2024 18:20:31 +0100 Subject: [PATCH 05/14] Checkpointing --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 13 +- .../GPU/ITStrackingGPU/TrackingKernels.h | 12 +- .../ITS/tracking/GPU/ITStrackingGPU/Utils.h | 16 +- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 64 +++- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 16 +- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 295 ++++++++++-------- .../tracking/include/ITStracking/TimeFrame.h | 5 +- 7 files changed, 269 insertions(+), 152 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 6b5d32dc1c17a..8f2ba9dd6ea58 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -51,10 +51,14 @@ class TimeFrameGPU : public TimeFrame void initialise(const int, const TrackingParameters&, const int, IndexTableUtils* utils = nullptr, const TimeFrameGPUParameters* pars = nullptr); void initDevice(IndexTableUtils*, const TrackingParameters& trkParam, const TimeFrameGPUParameters&, const int, const int); void initDeviceSAFitting(); + void loadIndexTableUtils(const int); void loadTrackingFrameInfoDevice(const int); void loadUnsortedClustersDevice(const int); void loadClustersDevice(const int); - void loadROframeClustersDevice(const int iteration); + void loadClustersIndexTables(const int iteration); + void createUsedClustersDevice(const int); + void loadUsedClustersDevice(); + void loadROframeClustersDevice(const int); void loadMultiplicityCutMask(const int); void loadVertices(const int); @@ -112,6 +116,8 @@ class TimeFrameGPU : public TimeFrame const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; } const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; } const Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; } + const int** getDeviceArrayClustersIndexTables() const { return mClustersIndexTablesDeviceArray; } + const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; } const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; } const Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; } const int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } @@ -148,7 +154,6 @@ class TimeFrameGPU : public TimeFrame // Device pointers StaticTrackingParameters* mTrackingParamsDevice; IndexTableUtils* mIndexTableUtilsDevice; - std::array mUsedClustersDevice; // Hybrid pref uint8_t* mMultMaskDevice; @@ -156,9 +161,13 @@ class TimeFrameGPU : public TimeFrame int* mROFramesPVDevice; std::array mClustersDevice; std::array mUnsortedClustersDevice; + std::array mClustersIndexTablesDevice; + std::array mUsedClustersDevice; std::array mROFramesClustersDevice; const Cluster** mClustersDeviceArray; const Cluster** mUnsortedClustersDeviceArray; + const int** mClustersIndexTablesDeviceArray; + const unsigned char** mUsedClustersDeviceArray; const int** mROFrameClustersDeviceArray; std::array mTrackletsDevice; const Tracklet** mTrackletsDeviceArray; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 5b70d571b9b52..0496635f8898b 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -51,7 +51,8 @@ GPUg() void fitTrackSeedsKernel( } // namespace gpu template -void computeTrackletsInRofsHandler(const uint8_t* multMask, +void computeTrackletsInROFsHandler(const IndexTableUtils* utils, + const uint8_t* multMask, const int startROF, const int endROF, const int maxROF, @@ -62,6 +63,15 @@ void computeTrackletsInRofsHandler(const uint8_t* multMask, const int nVertices, const Cluster** clusters, const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minR, + std::vector& maxR, + std::vector& resolutions, std::vector& radii, std::vector& mulScatAng, const int nBlocks, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h index cc45e24a8cbdb..a88e51742e84a 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h @@ -39,9 +39,9 @@ struct gpuSpan { using ref = T&; GPUd() gpuSpan() : _data(nullptr), _size(0) {} - GPUd() gpuSpan(ptr data, std::size_t dim) : _data(data), _size(dim) {} - GPUd() ref operator[](std::size_t idx) const { return _data[idx]; } - GPUd() std::size_t size() const { return _size; } + GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {} + GPUd() ref operator[](unsigned int idx) const { return _data[idx]; } + GPUd() unsigned int size() const { return _size; } GPUd() bool empty() const { return _size == 0; } GPUd() ref front() const { return _data[0]; } GPUd() ref back() const { return _data[_size - 1]; } @@ -50,7 +50,7 @@ struct gpuSpan { protected: ptr _data; - std::size_t _size; + unsigned int _size; }; template @@ -60,10 +60,10 @@ struct gpuSpan { using ref = const T&; GPUd() gpuSpan() : _data(nullptr), _size(0) {} - GPUd() gpuSpan(ptr data, std::size_t dim) : _data(data), _size(dim) {} + GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {} GPUd() gpuSpan(const gpuSpan& other) : _data(other._data), _size(other._size) {} - GPUd() ref operator[](std::size_t idx) const { return _data[idx]; } - GPUd() std::size_t size() const { return _size; } + GPUd() ref operator[](unsigned int idx) const { return _data[idx]; } + GPUd() unsigned int size() const { return _size; } GPUd() bool empty() const { return _size == 0; } GPUd() ref front() const { return _data[0]; } GPUd() ref back() const { return _data[_size - 1]; } @@ -72,7 +72,7 @@ struct gpuSpan { protected: ptr _data; - std::size_t _size; + unsigned int _size; }; enum class Task { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 0db970b2361ab..0b30d7af99246 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -92,6 +92,19 @@ void TimeFrameGPU::setDevicePropagator(const o2::base::PropagatorImpl +void TimeFrameGPU::loadIndexTableUtils(const int iteration) +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading indextable utils"); + if (!iteration) { + LOGP(debug, "gpu-allocation: allocating IndexTableUtils buffer, for {} MB.", sizeof(IndexTableUtils) / MB); + allocMemAsync(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, getExtAllocator()); + } + LOGP(debug, "gpu-transfer: loading IndexTableUtils object, for {} MB.", sizeof(IndexTableUtils) / MB); + checkGPUError(cudaMemcpyAsync(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + template void TimeFrameGPU::loadUnsortedClustersDevice(const int iteration) { @@ -128,13 +141,56 @@ void TimeFrameGPU::loadClustersDevice(const int iteration) } } +template +void TimeFrameGPU::loadClustersIndexTables(const int iteration) +{ + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(info, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, mIndexTables[iLayer].size(), mIndexTables[iLayer].size() * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[iLayer]), mIndexTables[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], mIndexTables[iLayer].data(), mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); + } +} + +template +void TimeFrameGPU::createUsedClustersDevice(const int iteration) +{ + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: creating {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mUsedClusters[iLayer].size() * sizeof(unsigned char) / MB); + allocMemAsync(reinterpret_cast(&mUsedClustersDevice[iLayer]), mUsedClusters[iLayer].size() * sizeof(unsigned char), nullptr, getExtAllocator()); + checkGPUError(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); + } +} + +template +void TimeFrameGPU::loadUsedClustersDevice() +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(unsigned char) / MB); + checkGPUError(cudaMemcpyAsync(mUsedClustersDevice[iLayer], mUsedClusters[iLayer].data(), mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + template void TimeFrameGPU::loadROframeClustersDevice(const int iteration) { if (!iteration) { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading ROframe clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - LOGP(info, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB); + LOGP(debug, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB); allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[iLayer]), mROFramesClusters[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); checkGPUError(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], mROFramesClusters[iLayer].data(), mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } @@ -167,7 +223,7 @@ void TimeFrameGPU::loadMultiplicityCutMask(const int iteration) { if (!iteration) { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask"); - LOGP(info, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB); + LOGP(debug, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB); allocMemAsync(reinterpret_cast(&mMultMaskDevice), mMultiplicityCutMask.size() * sizeof(uint8_t), nullptr, getExtAllocator()); checkGPUError(cudaMemcpyAsync(mMultMaskDevice, mMultiplicityCutMask.data(), mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); @@ -179,10 +235,10 @@ void TimeFrameGPU::loadVertices(const int iteration) { if (!iteration) { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices"); - LOGP(info, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB); + LOGP(debug, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB); allocMemAsync(reinterpret_cast(&mROFramesPVDevice), mROFramesPV.size() * sizeof(int), nullptr, getExtAllocator()); checkGPUError(cudaMemcpyAsync(mROFramesPVDevice, mROFramesPV.data(), mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); - LOGP(info, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB); + LOGP(debug, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB); allocMemAsync(reinterpret_cast(&mPrimaryVerticesDevice), mPrimaryVertices.size() * sizeof(Vertex), nullptr, getExtAllocator()); checkGPUError(cudaMemcpyAsync(mPrimaryVerticesDevice, mPrimaryVertices.data(), mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 8fb3f628cacc0..b83caf5b5b849 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -31,10 +31,13 @@ void TrackerTraitsGPU::initialiseTimeFrame(const int iteration) mTimeFrameGPU->initialise(iteration, mTrkParams[iteration], nLayers); mTimeFrameGPU->loadClustersDevice(iteration); mTimeFrameGPU->loadUnsortedClustersDevice(iteration); + mTimeFrameGPU->loadClustersIndexTables(iteration); mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration); mTimeFrameGPU->loadMultiplicityCutMask(iteration); mTimeFrameGPU->loadVertices(iteration); mTimeFrameGPU->loadROframeClustersDevice(iteration); + mTimeFrameGPU->createUsedClustersDevice(iteration); + mTimeFrameGPU->loadIndexTableUtils(iteration); } template @@ -95,7 +98,8 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0}; int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()}; - computeTrackletsInRofsHandler(mTimeFrameGPU->getDeviceMultCutMask(), + computeTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), + mTimeFrameGPU->getDeviceMultCutMask(), startROF, endROF, mTimeFrameGPU->getNrof(), @@ -106,6 +110,15 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int mTimeFrameGPU->getPrimaryVerticesNum(), mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getDeviceROframeClusters(), + mTimeFrameGPU->getDeviceArrayUsedClusters(), + mTimeFrameGPU->getDeviceArrayClustersIndexTables(), + iteration, + mTrkParams[iteration].NSigmaCut, + mTimeFrameGPU->getPhiCuts(), + mTrkParams[iteration].PVres, + mTimeFrameGPU->getMinRs(), + mTimeFrameGPU->getMaxRs(), + mTimeFrameGPU->getPositionResolutions(), mTrkParams[iteration].LayerRadii, mTimeFrameGPU->getMSangles(), conf.nBlocks, @@ -324,6 +337,7 @@ void TrackerTraitsGPU::findRoads(const int iteration) mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track); } } + mTimeFrameGPU->loadUsedClustersDevice(); if (iteration == mTrkParams.size() - 1) { mTimeFrameGPU->unregisterHostMemory(0); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 5dedb5fd754ad..a215332513824 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -74,9 +74,36 @@ namespace o2::its { using namespace constants::its2; using Vertex = o2::dataformats::Vertex>; + +GPUd() float Sq(float v) +{ + return v * v; +} + namespace gpu { +GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, + const o2::its::IndexTableUtils& utils, + const float z1, const float z2, float maxdeltaz, float maxdeltaphi) +{ + const float zRangeMin = o2::gpu::CAMath::Min(z1, z2) - maxdeltaz; + const float phiRangeMin = (maxdeltaphi > constants::math::Pi) ? 0.f : currentCluster.phi - maxdeltaphi; + const float zRangeMax = o2::gpu::CAMath::Max(z1, z2) + maxdeltaz; + const float phiRangeMax = (maxdeltaphi > constants::math::Pi) ? constants::math::TwoPi : currentCluster.phi + maxdeltaphi; + + if (zRangeMax < -LayersZCoordinate()[layerIndex + 1] || + zRangeMin > LayersZCoordinate()[layerIndex + 1] || zRangeMin > zRangeMax) { + + return getEmptyBinsRect(); + } + + return int4{o2::gpu::CAMath::Max(0, utils.getZBinIndex(layerIndex + 1, zRangeMin)), + utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)), + o2::gpu::CAMath::Min(ZBins - 1, utils.getZBinIndex(layerIndex + 1, zRangeMax)), + utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))}; +} + GPUd() bool fitTrack(TrackITSExt& track, int start, int end, @@ -199,30 +226,28 @@ struct is_valid_pair { GPUd() gpuSpan getPrimaryVertices(const int rof, const int* roframesPV, - const int nRof, + const int nROF, const uint8_t* mask, const Vertex* vertices) { const int start_pv_id = roframesPV[rof]; - const int stop_rof = rof >= nRof - 1 ? nRof : rof + 1; - size_t delta = mask[rof] ? roframesPV[stop_rof] - start_pv_id : 0; // return empty span if Rof is excluded + const int stop_rof = rof >= nROF - 1 ? nROF : rof + 1; + size_t delta = mask[rof] ? roframesPV[stop_rof] - start_pv_id : 0; // return empty span if ROF is excluded return gpuSpan(&vertices[start_pv_id], delta); }; GPUd() gpuSpan getClustersOnLayer(const int rof, - const int** roframesClus, + const int totROFs, const int layer, - const int nRof, + const int** roframesClus, const Cluster** clusters) { - const int start_clus_id{roframesClus[layer][rof]}; - const int stop_rof = rof >= nRof - 1 ? nRof : rof + 1; - const int delta = roframesClus[layer][stop_rof] - start_clus_id; - printf("\t\t\t r: %d nr: %d rfci: %d d: %d\n", rof, nRof, roframesClus[layer][rof], delta); - if (rof < 0 || rof >= nRof) { + if (rof < 0 || rof >= totROFs) { return gpuSpan(); } - + const int start_clus_id{roframesClus[layer][rof]}; + const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1; + const unsigned int delta = roframesClus[layer][stop_rof] - start_clus_id; return gpuSpan(&(clusters[layer][start_clus_id]), delta); } @@ -424,45 +449,45 @@ GPUg() void computeLayerCellsKernel( } } -template +template GPUg() void computeLayerTrackletsMultiROFKernel( + const IndexTableUtils* utils, const uint8_t* multMask, const int layerIndex, const int startROF, const int endROF, - const int maxRof, - const int deltaRof, + const int totalROFs, + const int deltaROF, const Vertex* vertices, const int* rofPV, const int nVertices, const int vertexId, - const Cluster** clusters, // input data rof0 - const int** ROFClusters, // Number of clusters on layers per ROF - // const int* roFrameClustersNextLayer, // Number of clusters on layer 1 per ROF - // const int* indexTablesNext, // input data rof0-delta getNphiBins()}; - // const int zBins{utils->getNzBins()}; - for (unsigned int iRof{blockIdx.x}; iRof < endROF - startROF; iRof += gridDim.x) { - auto rof0 = iRof + startROF; - auto primaryVertices = getPrimaryVertices(rof0, rofPV, maxRof, multMask, vertices); + const int phiBins{utils->getNphiBins()}; + const int zBins{utils->getNzBins()}; + for (unsigned int iROF{blockIdx.x}; iROF < endROF - startROF; iROF += gridDim.x) { + const int rof0 = iROF + startROF; + auto primaryVertices = getPrimaryVertices(rof0, rofPV, totalROFs, multMask, vertices); const auto startVtx{vertexId >= 0 ? vertexId : 0}; const auto endVtx{vertexId >= 0 ? o2::gpu::CAMath::Min(vertexId + 1, static_cast(primaryVertices.size())) : static_cast(primaryVertices.size())}; - auto minRof = o2::gpu::CAMath::Max(startROF, static_cast(rof0 - deltaRof)); - auto maxRof = o2::gpu::CAMath::Min(endROF - 1, static_cast(rof0 + deltaRof)); - auto clustersCurrentLayer = getClustersOnLayer(iRof, ROFClusters, layerIndex, maxRof, clusters); - if (threadIdx.x == 0) { - // printf("> l: %d r: %d rc: %d s: %d e: %d \n", layerIndex, iRof, ROFClusters[layerIndex][iRof], clustersCurrentLayer.size(), clustersCurrentLayer.empty()); - } + auto minROF = o2::gpu::CAMath::Max(startROF, static_cast(rof0 - deltaROF)); + auto maxROF = o2::gpu::CAMath::Min(endROF - 1, static_cast(rof0 + deltaROF)); + auto clustersCurrentLayer = getClustersOnLayer(rof0, totalROFs, layerIndex, ROFClusters, clusters); if (clustersCurrentLayer.empty()) { continue; } @@ -470,79 +495,72 @@ GPUg() void computeLayerTrackletsMultiROFKernel( for (int currentClusterIndex = threadIdx.x; currentClusterIndex < clustersCurrentLayer.size(); currentClusterIndex += blockDim.x) { unsigned int storedTracklets{0}; auto currentCluster{clustersCurrentLayer[currentClusterIndex]}; - if (threadIdx.x == 0) { - printf("rof: %d has %zu clusters on layer %d\n", rof0, clustersCurrentLayer.size(), layerIndex); + const int currentSortedIndex{ROFClusters[layerIndex][rof0] + currentClusterIndex}; + if (usedClusters[layerIndex][currentSortedIndex]) { + continue; } - // const int currentSortedIndex{roFrameClustersCurrentLayer[rof0] + currentClusterIndex}; - // const int currentSortedIndexChunk{currentSortedIndex - roFrameClustersCurrentLayer[startROF]}; - // if (usedClustersLayer[currentSortedIndex]) { - // continue; - // } - // - // int minRof = (rof0 >= trkPars->DeltaROF) ? rof0 - trkPars->DeltaROF : 0; - // int maxRof = (rof0 == maxRofs - trkPars->DeltaROF) ? rof0 : rof0 + trkPars->DeltaROF; // works with delta = {0, 1} - // const float inverseR0{1.f / currentCluster.radius}; - // - // for (int iPrimaryVertex{0}; iPrimaryVertex < nVerticesRof0; iPrimaryVertex++) { - // const auto& primaryVertex{vertices[nVertices[rof0] + iPrimaryVertex]}; - // const float resolution{Sqrt(Sq(trkPars->PVres) / primaryVertex.getNContributors() + Sq(positionResolution))}; - // const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0}; - // const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate}; - // const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; - // const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution - // const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * mSAngle))}; - // - // const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * trkPars->NSigmaCut, phiCut)}; - // - // if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { - // continue; - // } - // int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; - // if (phiBinsNum < 0) { - // phiBinsNum += trkPars->PhiBins; - // } - // const int tableSize{phiBins * zBins + 1}; - // for (int rof1{minRof}; rof1 <= maxRof; ++rof1) { - // auto nClustersNext{roFrameClustersNextLayer[rof1 + 1] - roFrameClustersNextLayer[rof1]}; - // if (!nClustersNext) { // number of clusters on next layer > 0 - // continue; - // } - // for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { - // int iPhiBin = (selectedBinsRect.y + iPhiCount) % trkPars->PhiBins; - // const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; - // const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; - // const int firstRowClusterIndex = indexTablesNext[(rof1 - startROF) * tableSize + firstBinIndex]; - // const int maxRowClusterIndex = indexTablesNext[(rof1 - startROF) * tableSize + maxBinIndex]; - // for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { - // if (iNextCluster >= nClustersNext) { - // break; - // } - // auto nextClusterIndex{roFrameClustersNextLayer[rof1] - roFrameClustersNextLayer[startROF] + iNextCluster}; - // const Cluster& nextCluster{clustersNextLayer[nextClusterIndex]}; - // if (usedClustersNextLayer[nextCluster.clusterId]) { - // continue; - // } - // const float deltaPhi{o2::gpu::CAMath::Abs(currentCluster.phi - nextCluster.phi)}; - // const float deltaZ{o2::gpu::CAMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)}; - // - // if ((deltaZ / sigmaZ < trkPars->NSigmaCut && (deltaPhi < phiCut || o2::gpu::CAMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut))) { - // const float phi{ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; - // const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)}; - // const unsigned int stride{currentClusterIndex * maxTrackletsPerCluster}; - // if (storedTracklets < maxTrackletsPerCluster) { - // new (trackletsRof0 + stride + storedTracklets) Tracklet{currentSortedIndexChunk, nextClusterIndex, tanL, phi, static_cast(rof0), static_cast(rof1)}; - // } - // else { - // printf("its-gpu-tracklet-finder: on rof %d layer: %d: found more tracklets (%d) than maximum allowed per cluster. This is lossy!\n", rof0, layerIndex, storedTracklets); - // } - // ++storedTracklets; - // } - // } - // } - // } - // } - // */ + const float inverseR0{1.f / currentCluster.radius}; + for (int iV{startVtx}; iV < endVtx; ++iV) { + auto& primaryVertex{primaryVertices[iV]}; + if (primaryVertex.isFlagSet(2) && iteration != 3) { + continue; + } + const float resolution = o2::gpu::CAMath::Sqrt(Sq(resolutionPV) / primaryVertex.getNContributors() + Sq(positionResolution)); + const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0}; + const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate}; + const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; + const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution + const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * MSAngle))}; + const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * NSigmaCut, phiCut)}; + if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { + continue; + } + int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; + + if (phiBinsNum < 0) { + phiBinsNum += phiBins; + } + + const int tableSize{phiBins * zBins + 1}; + for (int rof1{minROF}; rof1 <= maxROF; ++rof1) { + auto clustersNextLayer = getClustersOnLayer(rof1, totalROFs, layerIndex + 1, ROFClusters, clusters); + if (clustersNextLayer.empty()) { + continue; + } + for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { + int iPhiBin = (selectedBinsRect.y + iPhiCount) % PhiBins; + const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; + const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; + const int firstRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + firstBinIndex]; + const int maxRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + maxBinIndex]; + for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { + if (iNextCluster >= clustersNextLayer.size()) { + break; + } + const Cluster& nextCluster{clustersNextLayer[iNextCluster]}; + if (usedClusters[layerIndex + 1][nextCluster.clusterId]) { + continue; + } + const float deltaPhi{o2::gpu::CAMath::Abs(currentCluster.phi - nextCluster.phi)}; + const float deltaZ{o2::gpu::CAMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + + currentCluster.zCoordinate - nextCluster.zCoordinate)}; + if (deltaZ / sigmaZ < NSigmaCut && (deltaPhi < phiCut || o2::gpu::CAMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut)) { + // if (layerIndex > 0) { + if constexpr (initRun) { + // trackletsLUT[currentSortedIndex]++; // we need l0 as well for usual exclusive sums. + } else { + // } + const float phi{o2::gpu::CAMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; + const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)}; + // tf->getTracklets()[layerIndex].emplace_back(currentSortedIndex, tf->getSortedIndex(rof1, layerIndex + 1, iNextCluster), tanL, phi, rof0, rof1); + } + ++storedTracklets; + } + } + } + } + } } } } @@ -550,26 +568,6 @@ GPUg() void computeLayerTrackletsMultiROFKernel( ///////////////////////////////////////// // Debug Kernels ///////////////////////////////////////// -GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, - const o2::its::IndexTableUtils& utils, - const float z1, const float z2, float maxdeltaz, float maxdeltaphi) -{ - const float zRangeMin = o2::gpu::CAMath::Min(z1, z2) - maxdeltaz; - const float phiRangeMin = currentCluster.phi - maxdeltaphi; - const float zRangeMax = o2::gpu::CAMath::Max(z1, z2) + maxdeltaz; - const float phiRangeMax = currentCluster.phi + maxdeltaphi; - - if (zRangeMax < -LayersZCoordinate()[layerIndex + 1] || - zRangeMin > LayersZCoordinate()[layerIndex + 1] || zRangeMin > zRangeMax) { - - return getEmptyBinsRect(); - } - - return int4{o2::gpu::CAMath::Max(0, utils.getZBinIndex(layerIndex + 1, zRangeMin)), - utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)), - o2::gpu::CAMath::Min(ZBins - 1, utils.getZBinIndex(layerIndex + 1, zRangeMax)), - utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))}; -} template GPUd() void pPointer(T* ptr) @@ -697,7 +695,8 @@ GPUg() void removeDuplicateTrackletsEntriesLUTKernel( } // namespace gpu template -void computeTrackletsInRofsHandler(const uint8_t* multMask, +void computeTrackletsInROFsHandler(const IndexTableUtils* utils, + const uint8_t* multMask, const int startROF, const int endROF, const int maxROF, @@ -708,16 +707,23 @@ void computeTrackletsInRofsHandler(const uint8_t* multMask, const int nVertices, const Cluster** clusters, const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minRs, + std::vector& maxRs, + std::vector& resolutions, std::vector& radii, std::vector& mulScatAng, const int nBlocks, const int nThreads) { for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { - const auto meanDeltaR = radii[iLayer + 1] - radii[iLayer]; - const auto mSAngle = mulScatAng[iLayer]; - // gpu::printMatrixRow<<<1, 1>>>(iLayer, ROFClusters, maxROF); - gpu::computeLayerTrackletsMultiROFKernel<<<1, 1>>>( + gpu::computeLayerTrackletsMultiROFKernel<<>>( + utils, multMask, iLayer, startROF, @@ -730,8 +736,17 @@ void computeTrackletsInRofsHandler(const uint8_t* multMask, vertexId, clusters, ROFClusters, - meanDeltaR, - mSAngle); + usedClusters, + clustersIndexTables, + iteration, + NSigmaCut, + phiCuts[iLayer], + resolutionPV, + minRs[iLayer + 1], + maxRs[iLayer + 1], + resolutions[iLayer], + radii[iLayer + 1] - radii[iLayer], + mulScatAng[iLayer]); } } @@ -965,7 +980,8 @@ void trackSeedHandler(CellSeed* trackSeeds, gpuCheckError(cudaDeviceSynchronize()); } -template void computeTrackletsInRofsHandler<7>(const uint8_t* multMask, +template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, + const uint8_t* multMask, const int startROF, const int endROF, const int maxROF, @@ -976,6 +992,15 @@ template void computeTrackletsInRofsHandler<7>(const uint8_t* multMask, const int nVertices, const Cluster** clusters, const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minRs, + std::vector& maxRs, + std::vector& resolutions, std::vector& radii, std::vector& mulScatAng, const int nBlocks, diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h index 309ca2031a9b5..fa4f33782d16a 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h @@ -106,13 +106,16 @@ class TimeFrame float getBeamX() const; float getBeamY() const; - + std::vector& getMinRs() { return mMinR; } + std::vector& getMaxRs() { return mMaxR; } float getMinR(int layer) const { return mMinR[layer]; } float getMaxR(int layer) const { return mMaxR[layer]; } float getMSangle(int layer) const { return mMSangles[layer]; } std::vector& getMSangles() { return mMSangles; } float getPhiCut(int layer) const { return mPhiCuts[layer]; } + std::vector& getPhiCuts() { return mPhiCuts; } float getPositionResolution(int layer) const { return mPositionResolution[layer]; } + std::vector& getPositionResolutions() { return mPositionResolution; } gsl::span getClustersOnLayer(int rofId, int layerId); gsl::span getClustersOnLayer(int rofId, int layerId) const; From 7e05816c60147b2a013b114993cc0fb511d932d2 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 26 Nov 2024 15:40:52 +0100 Subject: [PATCH 06/14] Fix access in tracklet finding --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 7 +- .../GPU/ITStrackingGPU/TrackingKernels.h | 5 +- .../ITS/tracking/GPU/cuda/CMakeLists.txt | 2 +- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 34 ++++++++-- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 2 + .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 67 ++++++++++--------- 6 files changed, 76 insertions(+), 41 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 8f2ba9dd6ea58..77d81a50ef0ac 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -63,6 +63,7 @@ class TimeFrameGPU : public TimeFrame void loadVertices(const int); /// + void createTrackletsLUTDevice(); void loadTrackletsDevice(); void loadTrackletsLUTDevice(); void loadCellsDevice(); @@ -120,7 +121,7 @@ class TimeFrameGPU : public TimeFrame const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; } const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; } const Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; } - const int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } + int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; } int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; } CellSeed** getDeviceArrayCells() const { return mCellsDeviceArray; } @@ -171,14 +172,14 @@ class TimeFrameGPU : public TimeFrame const int** mROFrameClustersDeviceArray; std::array mTrackletsDevice; const Tracklet** mTrackletsDeviceArray; - const int** mTrackletsLUTDeviceArray; - std::array mTrackletsLUTDevice; + std::array mTrackletsLUTDevice; std::array mCellsLUTDevice; std::array mNeighboursLUTDevice; int** mCellsLUTDeviceArray; int** mNeighboursCellDeviceArray; int** mNeighboursCellLUTDeviceArray; + int** mTrackletsLUTDeviceArray; std::array mCellsDevice; std::array mNeighboursIndexTablesDevice; CellSeed* mTrackSeedsDevice; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 0496635f8898b..352f45b927cb8 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -65,6 +65,7 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const int** ROFClusters, const unsigned char** usedClusters, const int** clustersIndexTables, + int** trackletsLUTs, const int iteration, const float NSigmaCut, std::vector& phiCuts, @@ -81,7 +82,7 @@ void countCellsHandler(const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, const Tracklet** tracklets, - const int** trackletsLUT, + int** trackletsLUT, const int nTracklets, const int layer, CellSeed* cells, @@ -98,7 +99,7 @@ void computeCellsHandler(const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, const Tracklet** tracklets, - const int** trackletsLUT, + int** trackletsLUT, const int nTracklets, const int layer, CellSeed* cells, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt index 3cdb107e07438..e2fc1f1388ad0 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt @@ -13,7 +13,7 @@ if(CUDA_ENABLED) find_package(CUDAToolkit) message(STATUS "Building ITS CUDA tracker") -# add_compile_options(-O0 -g -lineinfo -fPIC) +add_compile_options(-O0 -g -lineinfo -fPIC) # add_compile_definitions(ITS_MEASURE_GPU_TIME) o2_add_library(ITStrackingCUDA SOURCES ClusterLinesGPU.cu diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 0b30d7af99246..c8c080b1acb10 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -147,12 +147,12 @@ void TimeFrameGPU::loadClustersIndexTables(const int iteration) if (!iteration) { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - LOGP(info, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, mIndexTables[iLayer].size(), mIndexTables[iLayer].size() * sizeof(int) / MB); + LOGP(debug, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, mIndexTables[iLayer].size(), mIndexTables[iLayer].size() * sizeof(int) / MB); allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[iLayer]), mIndexTables[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], mIndexTables[iLayer].data(), mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), nullptr, getExtAllocator()); - checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } @@ -245,6 +245,32 @@ void TimeFrameGPU::loadVertices(const int iteration) } } +template +void TimeFrameGPU::createTrackletsLUTDevice() +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells LUTs"); + for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { + LOGP(debug, "gpu-transfer: creating tracklets LUT for {} elements on layer {}, for {} MB.", mClusters[iLayer].size() + 1, iLayer, (mClusters[iLayer].size() + 1) * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), (mClusters[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + +// template void TimeFrameGPU::createTrackletsBuffers() +// { +// START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); +// for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { +// mNTracklets[iLayer] = 0; +// checkGPUError(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost)); +// LOGP(debug, "gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {} MB.", mNTracklets[layer], iLayer, mNTracklets[iLayer] * sizeof(CellSeed) / MB); +// allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), nullptr, getExtAllocator()); +// } +// STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +// } + template void TimeFrameGPU::loadTrackletsDevice() { @@ -267,11 +293,11 @@ void TimeFrameGPU::loadTrackletsLUTDevice() START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", mTrackletsLookupTable[iLayer].size(), iLayer, mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), mTrackletsLookupTable[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); + // allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), mTrackletsLookupTable[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); checkGPUError(cudaHostRegister(mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer], mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice)); } - allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator()); + // allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator()); checkGPUError(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 2) * sizeof(int*), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice)); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index b83caf5b5b849..53ed26513218e 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -92,6 +92,7 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int { auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex); + mTimeFrameGPU->createTrackletsLUTDevice(); const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f); gsl::span diamondSpan(&diamondVert, 1); @@ -112,6 +113,7 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int mTimeFrameGPU->getDeviceROframeClusters(), mTimeFrameGPU->getDeviceArrayUsedClusters(), mTimeFrameGPU->getDeviceArrayClustersIndexTables(), + mTimeFrameGPU->getDeviceArrayTrackletsLUT(), iteration, mTrkParams[iteration].NSigmaCut, mTimeFrameGPU->getPhiCuts(), diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index a215332513824..2e0348c0ab0e2 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -370,7 +370,7 @@ GPUg() void computeLayerCellsKernel( const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, const Tracklet** tracklets, - const int** trackletsLUT, + int** trackletsLUT, const int nTrackletsCurrent, const int layer, CellSeed* cells, @@ -467,7 +467,7 @@ GPUg() void computeLayerTrackletsMultiROFKernel( const unsigned char** usedClusters, // Used clusters const int** indexTables, // input data rof0-delta 0) { if constexpr (initRun) { - // trackletsLUT[currentSortedIndex]++; // we need l0 as well for usual exclusive sums. + trackletsLUT[layerIndex][currentSortedIndex]++; // we need l0 as well for usual exclusive sums. } else { // } const float phi{o2::gpu::CAMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; @@ -612,7 +612,7 @@ GPUg() void printBufferLayerOnThread(const int layer, const int* v, unsigned int } } -GPUg() void printMatrixRow(const int row, const int** mat, const unsigned int rowLength, const int len = 150, const unsigned int tId = 0) +GPUg() void printMatrixRow(const int row, const int** mat, const unsigned int rowLength, const int len = 256 * 128 + 1, const unsigned int tId = 0) { if (blockIdx.x * blockDim.x + threadIdx.x == tId) { for (int i{0}; i < rowLength; ++i) { @@ -709,6 +709,7 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const int** ROFClusters, const unsigned char** usedClusters, const int** clustersIndexTables, + int** trackletsLUTs, const int iteration, const float NSigmaCut, std::vector& phiCuts, @@ -722,31 +723,34 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const int nThreads) { for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { - gpu::computeLayerTrackletsMultiROFKernel<<>>( - utils, - multMask, - iLayer, - startROF, - endROF, - maxROF, - deltaROF, - vertices, - rofPV, - nVertices, - vertexId, - clusters, - ROFClusters, - usedClusters, - clustersIndexTables, - iteration, - NSigmaCut, - phiCuts[iLayer], - resolutionPV, - minRs[iLayer + 1], - maxRs[iLayer + 1], - resolutions[iLayer], - radii[iLayer + 1] - radii[iLayer], - mulScatAng[iLayer]); + // gpu::computeLayerTrackletsMultiROFKernel<<<1, 1>>>( + // utils, + // multMask, + // iLayer, + // startROF, + // endROF, + // maxROF, + // deltaROF, + // vertices, + // rofPV, + // nVertices, + // vertexId, + // clusters, + // ROFClusters, + // usedClusters, + // clustersIndexTables, + // trackletsLUTs, + // iteration, + // NSigmaCut, + // phiCuts[iLayer], + // resolutionPV, + // minRs[iLayer + 1], + // maxRs[iLayer + 1], + // resolutions[iLayer], + // radii[iLayer + 1] - radii[iLayer], + // mulScatAng[iLayer]); + gpuCheckError(cudaPeekAtLastError()); + gpuCheckError(cudaDeviceSynchronize()); } } @@ -755,7 +759,7 @@ void countCellsHandler( const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, const Tracklet** tracklets, - const int** trackletsLUT, + int** trackletsLUT, const int nTracklets, const int layer, CellSeed* cells, @@ -806,7 +810,7 @@ void computeCellsHandler( const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, const Tracklet** tracklets, - const int** trackletsLUT, + int** trackletsLUT, const int nTracklets, const int layer, CellSeed* cells, @@ -994,6 +998,7 @@ template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, const int** ROFClusters, const unsigned char** usedClusters, const int** clustersIndexTables, + int** trackletsLUTs, const int iteration, const float NSigmaCut, std::vector& phiCuts, From a03b5be6ff9f6a7faedd1a9ac7ff70e7b5c57154 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 26 Nov 2024 18:10:40 +0100 Subject: [PATCH 07/14] Fix tracklet LUTs issue --- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 10 ++-- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 59 ++++++++++--------- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 6 +- 3 files changed, 39 insertions(+), 36 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index c8c080b1acb10..2176def7f197f 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -292,14 +292,12 @@ void TimeFrameGPU::loadTrackletsLUTDevice() { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", mTrackletsLookupTable[iLayer].size(), iLayer, mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB); - // allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), mTrackletsLookupTable[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); + LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", mTrackletsLookupTable[iLayer].size(), iLayer + 1, mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB); checkGPUError(cudaHostRegister(mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer], mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice)); + checkGPUError(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer + 1], mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice)); } - // allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 2) * sizeof(int*), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice)); + checkGPUError(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice)); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 2e0348c0ab0e2..92a2b505f288b 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -386,8 +386,8 @@ GPUg() void computeLayerCellsKernel( for (int iCurrentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentTrackletIndex < nTrackletsCurrent; iCurrentTrackletIndex += blockDim.x * gridDim.x) { const Tracklet& currentTracklet = tracklets[layer][iCurrentTrackletIndex]; const int nextLayerClusterIndex{currentTracklet.secondClusterIndex}; - const int nextLayerFirstTrackletIndex{trackletsLUT[layer][nextLayerClusterIndex]}; - const int nextLayerLastTrackletIndex{trackletsLUT[layer][nextLayerClusterIndex + 1]}; + const int nextLayerFirstTrackletIndex{trackletsLUT[layer + 1][nextLayerClusterIndex]}; + const int nextLayerLastTrackletIndex{trackletsLUT[layer + 1][nextLayerClusterIndex + 1]}; if (nextLayerFirstTrackletIndex == nextLayerLastTrackletIndex) { continue; } @@ -612,7 +612,7 @@ GPUg() void printBufferLayerOnThread(const int layer, const int* v, unsigned int } } -GPUg() void printMatrixRow(const int row, const int** mat, const unsigned int rowLength, const int len = 256 * 128 + 1, const unsigned int tId = 0) +GPUg() void printMatrixRow(const int row, int** mat, const unsigned int rowLength, const int len = 256 * 128 + 1, const unsigned int tId = 0) { if (blockIdx.x * blockDim.x + threadIdx.x == tId) { for (int i{0}; i < rowLength; ++i) { @@ -723,34 +723,35 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const int nThreads) { for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { - // gpu::computeLayerTrackletsMultiROFKernel<<<1, 1>>>( - // utils, - // multMask, - // iLayer, - // startROF, - // endROF, - // maxROF, - // deltaROF, - // vertices, - // rofPV, - // nVertices, - // vertexId, - // clusters, - // ROFClusters, - // usedClusters, - // clustersIndexTables, - // trackletsLUTs, - // iteration, - // NSigmaCut, - // phiCuts[iLayer], - // resolutionPV, - // minRs[iLayer + 1], - // maxRs[iLayer + 1], - // resolutions[iLayer], - // radii[iLayer + 1] - radii[iLayer], - // mulScatAng[iLayer]); + gpu::computeLayerTrackletsMultiROFKernel<<<1, 1>>>( + utils, + multMask, + iLayer, + startROF, + endROF, + maxROF, + deltaROF, + vertices, + rofPV, + nVertices, + vertexId, + clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + trackletsLUTs, + iteration, + NSigmaCut, + phiCuts[iLayer], + resolutionPV, + minRs[iLayer + 1], + maxRs[iLayer + 1], + resolutions[iLayer], + radii[iLayer + 1] - radii[iLayer], + mulScatAng[iLayer]); gpuCheckError(cudaPeekAtLastError()); gpuCheckError(cudaDeviceSynchronize()); + gpu::printMatrixRow<<<1, 1>>>(iLayer, trackletsLUTs, 3000); } } diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index be43bcec2ac89..f4a3c75a9dc55 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -201,7 +201,11 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in if (!tf->checkMemory(mTrkParams[iteration].MaxMemory)) { return; } - + for (auto& l : tf->getTrackletsLookupTable()) { + for (auto& t : l) { + std::cout << t << "\t"; + } + } #pragma omp parallel for num_threads(mNThreads) for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) { /// Sort tracklets From 3bc9c9ea4489a8f629d8411db5a35c8e58345c9b Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 27 Nov 2024 17:39:01 +0100 Subject: [PATCH 08/14] Debugging small discrepancies --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 11 ++++++ .../GPU/ITStrackingGPU/TrackingKernels.h | 1 + .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 1 + .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 35 ++++++++++++++++- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 38 +++++++++++++++---- 5 files changed, 77 insertions(+), 9 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 77d81a50ef0ac..32d0b183a8cb8 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -118,6 +118,7 @@ class TimeFrameGPU : public TimeFrame const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; } const Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; } const int** getDeviceArrayClustersIndexTables() const { return mClustersIndexTablesDeviceArray; } + std::vector getClusterSizes(); const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; } const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; } const Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; } @@ -230,6 +231,16 @@ inline int TimeFrameGPU::getNClustersInRofSpan(const int rofIdstart, co { return static_cast(mROFramesClusters[layerId][(rofIdstart + rofSpanSize) < mROFramesClusters.size() ? rofIdstart + rofSpanSize : mROFramesClusters.size() - 1] - mROFramesClusters[layerId][rofIdstart]); } + +template +inline std::vector TimeFrameGPU::getClusterSizes() +{ + std::vector sizes(mUnsortedClusters.size()); + std::transform(mUnsortedClusters.begin(), mUnsortedClusters.end(), sizes.begin(), + [](const auto& v) { return static_cast(v.size()); }); + return sizes; +} + } // namespace gpu } // namespace its } // namespace o2 diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 352f45b927cb8..491b656fcd0e9 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -62,6 +62,7 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const int* rofPV, const int nVertices, const Cluster** clusters, + std::vector nClusters, const int** ROFClusters, const unsigned char** usedClusters, const int** clustersIndexTables, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 53ed26513218e..6d4e341c3cf57 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -110,6 +110,7 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int mTimeFrameGPU->getDeviceROFramesPV(), mTimeFrameGPU->getPrimaryVerticesNum(), mTimeFrameGPU->getDeviceArrayClusters(), + mTimeFrameGPU->getClusterSizes(), mTimeFrameGPU->getDeviceROframeClusters(), mTimeFrameGPU->getDeviceArrayUsedClusters(), mTimeFrameGPU->getDeviceArrayClustersIndexTables(), diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 92a2b505f288b..55bf406b5317e 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -534,7 +534,13 @@ GPUg() void computeLayerTrackletsMultiROFKernel( const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; const int firstRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + firstBinIndex]; const int maxRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + maxBinIndex]; + if (currentClusterIndex == 0 && layerIndex == 1 && rof0 == 81 && threadIdx.x == 0) { + printf("GPU: rof0: %d rof1: %d nclus0: %d nclus1: %d vertId: %d fbi: %d, mbi: %d, frci: %d, mrci: %d \n", rof0, rof1, clustersCurrentLayer.size(), clustersNextLayer.size(), iV, firstBinIndex, maxBinIndex, firstRowClusterIndex, maxRowClusterIndex); + } for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { + if (currentClusterIndex == 0 && layerIndex == 1 && rof0 == 81 && threadIdx.x == 0) { + printf("\ttesting clId: %d ...\n", iNextCluster); + } if (iNextCluster >= clustersNextLayer.size()) { break; } @@ -612,7 +618,7 @@ GPUg() void printBufferLayerOnThread(const int layer, const int* v, unsigned int } } -GPUg() void printMatrixRow(const int row, int** mat, const unsigned int rowLength, const int len = 256 * 128 + 1, const unsigned int tId = 0) +GPUg() void printMatrixRow(const int row, int** mat, const unsigned int rowLength, const int len = 150, const unsigned int tId = 0) { if (blockIdx.x * blockDim.x + threadIdx.x == tId) { for (int i{0}; i < rowLength; ++i) { @@ -660,6 +666,28 @@ GPUg() void printNeighbours(const gpuPair* neighbours, } } +GPUg() void printTrackletsLUTPerROF(const int layerId, + const int** ROFClusters, + int** luts, + const int tId = 0) +{ + if (blockIdx.x * blockDim.x + threadIdx.x == tId) { + for (auto rofId{0}; rofId < 2304; ++rofId) { + int nClus = ROFClusters[layerId][rofId + 1] - ROFClusters[layerId][rofId]; + if (!nClus) { + continue; + } + printf("rof: %d (%d) ==> ", rofId, nClus); + + for (int iC{0}; iC < nClus; ++iC) { + int nT = luts[layerId][ROFClusters[layerId][rofId] + iC]; + printf("%d\t", nT); + } + printf("\n"); + } + } +} + template GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, int* trackletsLookUpTable, @@ -706,6 +734,7 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const int* rofPV, const int nVertices, const Cluster** clusters, + std::vector nClusters, const int** ROFClusters, const unsigned char** usedClusters, const int** clustersIndexTables, @@ -751,8 +780,9 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, mulScatAng[iLayer]); gpuCheckError(cudaPeekAtLastError()); gpuCheckError(cudaDeviceSynchronize()); - gpu::printMatrixRow<<<1, 1>>>(iLayer, trackletsLUTs, 3000); + // gpu::printMatrixRow<<<1, 1>>>(iLayer, trackletsLUTs, nClusters[iLayer]); } + // gpu::printTrackletsLUTPerROF<<<1, 1>>>(1, ROFClusters, trackletsLUTs); } void countCellsHandler( @@ -996,6 +1026,7 @@ template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, const int* rofPV, const int nVertices, const Cluster** clusters, + std::vector nClusters, const int** ROFClusters, const unsigned char** usedClusters, const int** clustersIndexTables, diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index f4a3c75a9dc55..d722ed645c8b7 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -145,9 +145,13 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in } const int firstRowClusterIndex = tf->getIndexTable(rof1, iLayer + 1)[firstBinIndex]; const int maxRowClusterIndex = tf->getIndexTable(rof1, iLayer + 1)[maxBinIndex]; - + if (iCluster == 0 && iLayer == 1 && rof0 == 81) { + printf("CPU: rof0: %d rof1: %d nclus0: %d nclus1: %d vertId: %d fbi: %d, mbi: %d, frci: %d, mrci: %d \n", rof0, rof1, layer0.size(), layer1.size(), iV, firstBinIndex, maxBinIndex, firstRowClusterIndex, maxRowClusterIndex); + } for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { - + if (iCluster == 0 && iLayer == 1 && rof0 == 81) { + printf("\ttesting clId: %d ...\n", iNextCluster); + } if (iNextCluster >= (int)layer1.size()) { break; } @@ -201,11 +205,31 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in if (!tf->checkMemory(mTrkParams[iteration].MaxMemory)) { return; } - for (auto& l : tf->getTrackletsLookupTable()) { - for (auto& t : l) { - std::cout << t << "\t"; - } - } + + // for (auto iLayer{0}; iLayer < tf->getTrackletsLookupTable().size(); ++iLayer) { + // auto lut = tf->getTrackletsLookupTable()[iLayer]; + // for (unsigned int iC{0}; iC < lut.size(); ++iC) { + // if (!(iC % 150)) { + // printf("\n row %d: ===> %d/%d\t", iLayer, iC, (int)lut.size()); + // } + // printf("%d\t", lut[iC]); + // } + // } + + // for (auto rofId{0}; rofId < 2304; ++rofId) { + // int nClus = tf->getClustersOnLayer(rofId, 1).size(); + // if (!nClus) { + // continue; + // } + // printf("rof: %d (%d) ==> ", rofId, nClus); + + // for (int iC{0}; iC < nClus; ++iC) { + // int nT = tf->getTrackletsLookupTable()[0][tf->getSortedIndex(rofId, 1, iC)]; + // printf("%d\t", nT); + // } + // printf("\n"); + // } + #pragma omp parallel for num_threads(mNThreads) for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) { /// Sort tracklets From 2bdeefe8dc70a3cfaabef9d0331c938ff506646e Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 28 Nov 2024 11:10:40 +0100 Subject: [PATCH 09/14] Fix bad PhiBins pick --- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 16 ++++++++-------- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 7 ------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 55bf406b5317e..01c37ea15504f 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -529,18 +529,18 @@ GPUg() void computeLayerTrackletsMultiROFKernel( continue; } for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { - int iPhiBin = (selectedBinsRect.y + iPhiCount) % PhiBins; + int iPhiBin = (selectedBinsRect.y + iPhiCount) % phiBins; const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; const int firstRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + firstBinIndex]; const int maxRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + maxBinIndex]; - if (currentClusterIndex == 0 && layerIndex == 1 && rof0 == 81 && threadIdx.x == 0) { - printf("GPU: rof0: %d rof1: %d nclus0: %d nclus1: %d vertId: %d fbi: %d, mbi: %d, frci: %d, mrci: %d \n", rof0, rof1, clustersCurrentLayer.size(), clustersNextLayer.size(), iV, firstBinIndex, maxBinIndex, firstRowClusterIndex, maxRowClusterIndex); - } + // if (currentClusterIndex == 0 && layerIndex == 1 && rof0 == 81 && threadIdx.x == 0) { + // printf("GPU: pb: %d ipc: %d ipb: %d sbr.x: %d sbr.y: %d sbr.z: %d sbr.w: %d fbi: %d, mbi: %d, frci: %d, mrci: %d \n", phiBins, iPhiCount, iPhiBin, selectedBinsRect.x, selectedBinsRect.y, selectedBinsRect.z, selectedBinsRect.w, firstBinIndex, maxBinIndex, firstRowClusterIndex, maxRowClusterIndex); + // } for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { - if (currentClusterIndex == 0 && layerIndex == 1 && rof0 == 81 && threadIdx.x == 0) { - printf("\ttesting clId: %d ...\n", iNextCluster); - } + // if (currentClusterIndex == 0 && layerIndex == 1 && rof0 == 81 && threadIdx.x == 0) { + // printf("\ttesting clId: %d ...\n", iNextCluster); + // } if (iNextCluster >= clustersNextLayer.size()) { break; } @@ -782,7 +782,7 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, gpuCheckError(cudaDeviceSynchronize()); // gpu::printMatrixRow<<<1, 1>>>(iLayer, trackletsLUTs, nClusters[iLayer]); } - // gpu::printTrackletsLUTPerROF<<<1, 1>>>(1, ROFClusters, trackletsLUTs); + gpu::printTrackletsLUTPerROF<<<1, 1>>>(1, ROFClusters, trackletsLUTs); } void countCellsHandler( diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index d722ed645c8b7..eb1fe96026d6c 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -128,7 +128,6 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in if (layer1.empty()) { continue; } - for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { int iPhiBin = (selectedBinsRect.y + iPhiCount) % mTrkParams[iteration].PhiBins; const int firstBinIndex{tf->mIndexTableUtils.getBinIndex(selectedBinsRect.x, iPhiBin)}; @@ -145,13 +144,7 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in } const int firstRowClusterIndex = tf->getIndexTable(rof1, iLayer + 1)[firstBinIndex]; const int maxRowClusterIndex = tf->getIndexTable(rof1, iLayer + 1)[maxBinIndex]; - if (iCluster == 0 && iLayer == 1 && rof0 == 81) { - printf("CPU: rof0: %d rof1: %d nclus0: %d nclus1: %d vertId: %d fbi: %d, mbi: %d, frci: %d, mrci: %d \n", rof0, rof1, layer0.size(), layer1.size(), iV, firstBinIndex, maxBinIndex, firstRowClusterIndex, maxRowClusterIndex); - } for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { - if (iCluster == 0 && iLayer == 1 && rof0 == 81) { - printf("\ttesting clId: %d ...\n", iNextCluster); - } if (iNextCluster >= (int)layer1.size()) { break; } From 4613f73660c3cc02099c813f324bffbc284018e1 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 28 Nov 2024 18:52:34 +0100 Subject: [PATCH 10/14] Add tracklet counting --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 3 + .../GPU/ITStrackingGPU/TrackingKernels.h | 31 ++++ .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 22 +-- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 56 +++--- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 173 ++++++++++++++---- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 4 + 6 files changed, 215 insertions(+), 74 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 32d0b183a8cb8..528b555bd509a 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -72,6 +72,7 @@ class TimeFrameGPU : public TimeFrame void loadTrackSeedsChi2Device(); void loadRoadsDevice(); void loadTrackSeedsDevice(std::vector&); + void createTrackletsBuffers(); void createCellsBuffers(const int); void createCellsDevice(); void createCellsLUTDevice(); @@ -139,6 +140,7 @@ class TimeFrameGPU : public TimeFrame gsl::span getHostNCells(const int chunkId); // Host-available device getters + gsl::span getDeviceTrackletsLUTs() { return mTrackletsLUTDevice; } gsl::span getDeviceCellLUTs() { return mCellsLUTDevice; } gsl::span getDeviceCells() { return mCellsDevice; } gsl::span getNCellsDevice() { return mNCells; } @@ -151,6 +153,7 @@ class TimeFrameGPU : public TimeFrame StaticTrackingParameters mStaticTrackingParams; // Host-available device buffer sizes + std::array mNTracklets; std::array mNCells; // Device pointers diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 491b656fcd0e9..dc1d2eceb4494 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -50,6 +50,36 @@ GPUg() void fitTrackSeedsKernel( #endif } // namespace gpu +template +void countTrackletsInROFsHandler(const IndexTableUtils* utils, + const uint8_t* multMask, + const int startROF, + const int endROF, + const int maxROF, + const int deltaROF, + const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + std::vector nClusters, + const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + int** trackletsLUTs, + gsl::span trackletsLUTsHost, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minR, + std::vector& maxR, + std::vector& resolutions, + std::vector& radii, + std::vector& mulScatAng, + const int nBlocks, + const int nThreads); + template void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const uint8_t* multMask, @@ -66,6 +96,7 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const int** ROFClusters, const unsigned char** usedClusters, const int** clustersIndexTables, + Tracklet* tracklets, int** trackletsLUTs, const int iteration, const float NSigmaCut, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 2176def7f197f..d7c04ad3b9358 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -259,17 +259,17 @@ void TimeFrameGPU::createTrackletsLUTDevice() STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } -// template void TimeFrameGPU::createTrackletsBuffers() -// { -// START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); -// for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { -// mNTracklets[iLayer] = 0; -// checkGPUError(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost)); -// LOGP(debug, "gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {} MB.", mNTracklets[layer], iLayer, mNTracklets[iLayer] * sizeof(CellSeed) / MB); -// allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), nullptr, getExtAllocator()); -// } -// STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); -// } +template void TimeFrameGPU::createTrackletsBuffers() +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); + for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { + mNTracklets[iLayer] = 0; + checkGPUError(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost)); + LOGP(info, "gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / MB); + allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), nullptr, getExtAllocator()); + } + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} template void TimeFrameGPU::loadTrackletsDevice() diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 6d4e341c3cf57..9b3edddbd7d10 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -99,33 +99,35 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0}; int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()}; - computeTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), - mTimeFrameGPU->getDeviceMultCutMask(), - startROF, - endROF, - mTimeFrameGPU->getNrof(), - mTrkParams[iteration].DeltaROF, - iVertex, - mTimeFrameGPU->getDeviceVertices(), - mTimeFrameGPU->getDeviceROFramesPV(), - mTimeFrameGPU->getPrimaryVerticesNum(), - mTimeFrameGPU->getDeviceArrayClusters(), - mTimeFrameGPU->getClusterSizes(), - mTimeFrameGPU->getDeviceROframeClusters(), - mTimeFrameGPU->getDeviceArrayUsedClusters(), - mTimeFrameGPU->getDeviceArrayClustersIndexTables(), - mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - iteration, - mTrkParams[iteration].NSigmaCut, - mTimeFrameGPU->getPhiCuts(), - mTrkParams[iteration].PVres, - mTimeFrameGPU->getMinRs(), - mTimeFrameGPU->getMaxRs(), - mTimeFrameGPU->getPositionResolutions(), - mTrkParams[iteration].LayerRadii, - mTimeFrameGPU->getMSangles(), - conf.nBlocks, - conf.nThreads); + countTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), + mTimeFrameGPU->getDeviceMultCutMask(), + startROF, + endROF, + mTimeFrameGPU->getNrof(), + mTrkParams[iteration].DeltaROF, + iVertex, + mTimeFrameGPU->getDeviceVertices(), + mTimeFrameGPU->getDeviceROFramesPV(), + mTimeFrameGPU->getPrimaryVerticesNum(), + mTimeFrameGPU->getDeviceArrayClusters(), + mTimeFrameGPU->getClusterSizes(), + mTimeFrameGPU->getDeviceROframeClusters(), + mTimeFrameGPU->getDeviceArrayUsedClusters(), + mTimeFrameGPU->getDeviceArrayClustersIndexTables(), + mTimeFrameGPU->getDeviceArrayTrackletsLUT(), + mTimeFrameGPU->getDeviceTrackletsLUTs(), + iteration, + mTrkParams[iteration].NSigmaCut, + mTimeFrameGPU->getPhiCuts(), + mTrkParams[iteration].PVres, + mTimeFrameGPU->getMinRs(), + mTimeFrameGPU->getMaxRs(), + mTimeFrameGPU->getPositionResolutions(), + mTrkParams[iteration].LayerRadii, + mTimeFrameGPU->getMSangles(), + conf.nBlocks, + conf.nThreads); + mTimeFrameGPU->createTrackletsBuffers(); } template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 01c37ea15504f..f9f761718d1d0 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -466,7 +466,7 @@ GPUg() void computeLayerTrackletsMultiROFKernel( const int** ROFClusters, // Number of clusters on layers per ROF const unsigned char** usedClusters, // Used clusters const int** indexTables, // input data rof0-delta = clustersNextLayer.size()) { break; } @@ -722,6 +716,85 @@ GPUg() void removeDuplicateTrackletsEntriesLUTKernel( } // namespace gpu +template +void countTrackletsInROFsHandler(const IndexTableUtils* utils, + const uint8_t* multMask, + const int startROF, + const int endROF, + const int maxROF, + const int deltaROF, + const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + std::vector nClusters, + const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + int** trackletsLUTs, + gsl::span trackletsLUTsHost, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minRs, + std::vector& maxRs, + std::vector& resolutions, + std::vector& radii, + std::vector& mulScatAng, + const int nBlocks, + const int nThreads) +{ + for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { + gpu::computeLayerTrackletsMultiROFKernel<<>>( + utils, + multMask, + iLayer, + startROF, + endROF, + maxROF, + deltaROF, + vertices, + rofPV, + nVertices, + vertexId, + clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + nullptr, + trackletsLUTs, + iteration, + NSigmaCut, + phiCuts[iLayer], + resolutionPV, + minRs[iLayer + 1], + maxRs[iLayer + 1], + resolutions[iLayer], + radii[iLayer + 1] - radii[iLayer], + mulScatAng[iLayer]); + // gpuCheckError(cudaPeekAtLastError()); + // gpuCheckError(cudaDeviceSynchronize()); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + gpuCheckError(cudaFree(d_temp_storage)); + } +} + template void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const uint8_t* multMask, @@ -738,6 +811,7 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const int** ROFClusters, const unsigned char** usedClusters, const int** clustersIndexTables, + Tracklet* tracklets, int** trackletsLUTs, const int iteration, const float NSigmaCut, @@ -752,37 +826,35 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const int nThreads) { for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { - gpu::computeLayerTrackletsMultiROFKernel<<<1, 1>>>( - utils, - multMask, - iLayer, - startROF, - endROF, - maxROF, - deltaROF, - vertices, - rofPV, - nVertices, - vertexId, - clusters, - ROFClusters, - usedClusters, - clustersIndexTables, - trackletsLUTs, - iteration, - NSigmaCut, - phiCuts[iLayer], - resolutionPV, - minRs[iLayer + 1], - maxRs[iLayer + 1], - resolutions[iLayer], - radii[iLayer + 1] - radii[iLayer], - mulScatAng[iLayer]); + gpu::computeLayerTrackletsMultiROFKernel<<>>(utils, + multMask, + iLayer, + startROF, + endROF, + maxROF, + deltaROF, + vertices, + rofPV, + nVertices, + vertexId, + clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + tracklets, + trackletsLUTs, + iteration, + NSigmaCut, + phiCuts[iLayer], + resolutionPV, + minRs[iLayer + 1], + maxRs[iLayer + 1], + resolutions[iLayer], + radii[iLayer + 1] - radii[iLayer], + mulScatAng[iLayer]); gpuCheckError(cudaPeekAtLastError()); gpuCheckError(cudaDeviceSynchronize()); - // gpu::printMatrixRow<<<1, 1>>>(iLayer, trackletsLUTs, nClusters[iLayer]); } - gpu::printTrackletsLUTPerROF<<<1, 1>>>(1, ROFClusters, trackletsLUTs); } void countCellsHandler( @@ -832,7 +904,6 @@ void countCellsHandler( cellsLUTsHost, // d_out nTracklets + 1, // num_items 0)); // NOLINT: this is the offset of the sum, not a pointer - // gpu::printBufferLayerOnThread<<<1, 1>>>(layer, cellsLUTsHost, nTracklets + 1); gpuCheckError(cudaFree(d_temp_storage)); } @@ -1015,6 +1086,35 @@ void trackSeedHandler(CellSeed* trackSeeds, gpuCheckError(cudaDeviceSynchronize()); } +template void countTrackletsInROFsHandler<7>(const IndexTableUtils* utils, + const uint8_t* multMask, + const int startROF, + const int endROF, + const int maxROF, + const int deltaROF, + const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + std::vector nClusters, + const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + int** trackletsLUTs, + gsl::span trackletsLUTsHost, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minRs, + std::vector& maxRs, + std::vector& resolutions, + std::vector& radii, + std::vector& mulScatAng, + const int nBlocks, + const int nThreads); + template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, const uint8_t* multMask, const int startROF, @@ -1030,6 +1130,7 @@ template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, const int** ROFClusters, const unsigned char** usedClusters, const int** clustersIndexTables, + Tracklet* tracklets, int** trackletsLUTs, const int iteration, const float NSigmaCut, diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index eb1fe96026d6c..8230a55e5ce29 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -199,6 +199,10 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in return; } + for (auto iLayer{0}; iLayer < tf->getTracklets().size(); ++iLayer) { + std::cout << "tracklets layer " << iLayer << ": " << tf->getTracklets()[iLayer].size() << std::endl; + } + // for (auto iLayer{0}; iLayer < tf->getTrackletsLookupTable().size(); ++iLayer) { // auto lut = tf->getTrackletsLookupTable()[iLayer]; // for (unsigned int iC{0}; iC < lut.size(); ++iC) { From 1bd7e9f6ea756bf93913ecf22d714b49d0e22c18 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Fri, 29 Nov 2024 11:05:59 +0100 Subject: [PATCH 11/14] Fix indices for used clusters --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 2 +- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 21 ++++++++++++------- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 2 +- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 6 ++---- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 12 +++++++++++ 5 files changed, 29 insertions(+), 14 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 528b555bd509a..f04215ec9400f 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -63,7 +63,7 @@ class TimeFrameGPU : public TimeFrame void loadVertices(const int); /// - void createTrackletsLUTDevice(); + void createTrackletsLUTDevice(const int); void loadTrackletsDevice(); void loadTrackletsLUTDevice(); void loadCellsDevice(); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index d7c04ad3b9358..8a5f5e6d76dbf 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -176,7 +176,7 @@ void TimeFrameGPU::createUsedClustersDevice(const int iteration) template void TimeFrameGPU::loadUsedClustersDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading used clusters flags"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(unsigned char) / MB); checkGPUError(cudaMemcpyAsync(mUsedClustersDevice[iLayer], mUsedClusters[iLayer].data(), mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get())); @@ -246,20 +246,25 @@ void TimeFrameGPU::loadVertices(const int iteration) } template -void TimeFrameGPU::createTrackletsLUTDevice() +void TimeFrameGPU::createTrackletsLUTDevice(const int iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells LUTs"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating tracklets LUTs"); for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { - LOGP(debug, "gpu-transfer: creating tracklets LUT for {} elements on layer {}, for {} MB.", mClusters[iLayer].size() + 1, iLayer, (mClusters[iLayer].size() + 1) * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), (mClusters[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); + if (!iteration) { + LOGP(debug, "gpu-transfer: creating tracklets LUT for {} elements on layer {}, for {} MB.", mClusters[iLayer].size() + 1, iLayer, (mClusters[iLayer].size() + 1) * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), (mClusters[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); + } checkGPUError(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator()); - checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + if (!iteration) { + allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } -template void TimeFrameGPU::createTrackletsBuffers() +template +void TimeFrameGPU::createTrackletsBuffers() { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 9b3edddbd7d10..6345f5435c795 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -92,7 +92,7 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int { auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex); - mTimeFrameGPU->createTrackletsLUTDevice(); + mTimeFrameGPU->createTrackletsLUTDevice(iteration); const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f); gsl::span diamondSpan(&diamondVert, 1); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index f9f761718d1d0..3fd53df686456 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -472,8 +472,8 @@ GPUg() void computeLayerTrackletsMultiROFKernel( const float NSigmaCut, const float phiCut, const float resolutionPV, - const float maxR, const float minR, + const float maxR, const float positionResolution, const float meanDeltaR = -666.f, const float MSAngle = -666.f) @@ -496,7 +496,7 @@ GPUg() void computeLayerTrackletsMultiROFKernel( unsigned int storedTracklets{0}; auto currentCluster{clustersCurrentLayer[currentClusterIndex]}; const int currentSortedIndex{ROFClusters[layerIndex][rof0] + currentClusterIndex}; - if (usedClusters[layerIndex][currentSortedIndex]) { + if (usedClusters[layerIndex][currentCluster.clusterId]) { continue; } @@ -774,8 +774,6 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, resolutions[iLayer], radii[iLayer + 1] - radii[iLayer], mulScatAng[iLayer]); - // gpuCheckError(cudaPeekAtLastError()); - // gpuCheckError(cudaDeviceSynchronize()); void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index 8230a55e5ce29..faeb0ecddfcc7 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -193,6 +193,18 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in } } } + // if (rof0 == 81) { + // printf("CPU layer: %d -> %f %f %f %f %f %f %f %f\n", + // iLayer, + // mTrkParams[iteration].NSigmaCut, + // tf->getPhiCut(iLayer), + // mTrkParams[iteration].PVres, + // tf->getMinR(iLayer + 1), + // tf->getMaxR(iLayer + 1), + // tf->getPositionResolution(iLayer), + // meanDeltaR, + // tf->getMSangle(iLayer)); + // } } } if (!tf->checkMemory(mTrkParams[iteration].MaxMemory)) { From 208ea84ffb0c80b12f8cec74cbbe560b9fc70336 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Fri, 29 Nov 2024 14:49:00 +0100 Subject: [PATCH 12/14] Add tracklet writing on the buffer --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 5 ++- .../GPU/ITStrackingGPU/TrackingKernels.h | 6 +-- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 9 ++-- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 30 ++++++++++++- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 42 +++++++++---------- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 6 +-- 6 files changed, 62 insertions(+), 36 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index f04215ec9400f..e288fbae98396 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -122,7 +122,7 @@ class TimeFrameGPU : public TimeFrame std::vector getClusterSizes(); const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; } const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; } - const Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; } + Tracklet** getDeviceArrayTracklets() { return mTrackletsDeviceArray; } int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; } int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; } @@ -142,6 +142,7 @@ class TimeFrameGPU : public TimeFrame // Host-available device getters gsl::span getDeviceTrackletsLUTs() { return mTrackletsLUTDevice; } gsl::span getDeviceCellLUTs() { return mCellsLUTDevice; } + gsl::span getDeviceTracklet() { return mTrackletsDevice; } gsl::span getDeviceCells() { return mCellsDevice; } gsl::span getNCellsDevice() { return mNCells; } @@ -175,7 +176,7 @@ class TimeFrameGPU : public TimeFrame const unsigned char** mUsedClustersDeviceArray; const int** mROFrameClustersDeviceArray; std::array mTrackletsDevice; - const Tracklet** mTrackletsDeviceArray; + Tracklet** mTrackletsDeviceArray; std::array mTrackletsLUTDevice; std::array mCellsLUTDevice; std::array mNeighboursLUTDevice; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index dc1d2eceb4494..473f338aa4200 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -96,7 +96,7 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const int** ROFClusters, const unsigned char** usedClusters, const int** clustersIndexTables, - Tracklet* tracklets, + Tracklet** tracklets, int** trackletsLUTs, const int iteration, const float NSigmaCut, @@ -113,7 +113,7 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, void countCellsHandler(const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, - const Tracklet** tracklets, + Tracklet** tracklets, int** trackletsLUT, const int nTracklets, const int layer, @@ -130,7 +130,7 @@ void countCellsHandler(const Cluster** sortedClusters, void computeCellsHandler(const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, - const Tracklet** tracklets, + Tracklet** tracklets, int** trackletsLUT, const int nTracklets, const int layer, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 8a5f5e6d76dbf..129060758ae2d 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -270,9 +270,12 @@ void TimeFrameGPU::createTrackletsBuffers() for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { mNTracklets[iLayer] = 0; checkGPUError(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost)); - LOGP(info, "gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / MB); + LOGP(debug, "gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / MB); allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), nullptr, getExtAllocator()); } + allocMemAsync(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -282,13 +285,9 @@ void TimeFrameGPU::loadTrackletsDevice() START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets"); for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} tracklets on layer {}, for {} MB.", mTracklets[iLayer].size(), iLayer, mTracklets[iLayer].size() * sizeof(Tracklet) / MB); - allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mTracklets[iLayer].size() * sizeof(Tracklet), nullptr, getExtAllocator()); checkGPUError(cudaHostRegister(mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mTrackletsDevice[iLayer], mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 6345f5435c795..d4ef4322cd43f 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -115,7 +115,7 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int mTimeFrameGPU->getDeviceArrayUsedClusters(), mTimeFrameGPU->getDeviceArrayClustersIndexTables(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getDeviceTrackletsLUTs(), + mTimeFrameGPU->getDeviceTrackletsLUTs(), // Required for the exclusive sums iteration, mTrkParams[iteration].NSigmaCut, mTimeFrameGPU->getPhiCuts(), @@ -128,6 +128,34 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int conf.nBlocks, conf.nThreads); mTimeFrameGPU->createTrackletsBuffers(); + computeTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), + mTimeFrameGPU->getDeviceMultCutMask(), + startROF, + endROF, + mTimeFrameGPU->getNrof(), + mTrkParams[iteration].DeltaROF, + iVertex, + mTimeFrameGPU->getDeviceVertices(), + mTimeFrameGPU->getDeviceROFramesPV(), + mTimeFrameGPU->getPrimaryVerticesNum(), + mTimeFrameGPU->getDeviceArrayClusters(), + mTimeFrameGPU->getClusterSizes(), + mTimeFrameGPU->getDeviceROframeClusters(), + mTimeFrameGPU->getDeviceArrayUsedClusters(), + mTimeFrameGPU->getDeviceArrayClustersIndexTables(), + mTimeFrameGPU->getDeviceArrayTracklets(), + mTimeFrameGPU->getDeviceArrayTrackletsLUT(), + iteration, + mTrkParams[iteration].NSigmaCut, + mTimeFrameGPU->getPhiCuts(), + mTrkParams[iteration].PVres, + mTimeFrameGPU->getMinRs(), + mTimeFrameGPU->getMaxRs(), + mTimeFrameGPU->getPositionResolutions(), + mTrkParams[iteration].LayerRadii, + mTimeFrameGPU->getMSangles(), + conf.nBlocks, + conf.nThreads); } template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 3fd53df686456..c1e754b3c5b45 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -369,7 +369,7 @@ GPUg() void computeLayerCellsKernel( const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, - const Tracklet** tracklets, + Tracklet** tracklets, int** trackletsLUT, const int nTrackletsCurrent, const int layer, @@ -462,11 +462,11 @@ GPUg() void computeLayerTrackletsMultiROFKernel( const int* rofPV, const int nVertices, const int vertexId, - const Cluster** clusters, // input data rof0 + const Cluster** clusters, // Input data rof0 const int** ROFClusters, // Number of clusters on layers per ROF const unsigned char** usedClusters, // Used clusters - const int** indexTables, // input data rof0-delta getNphiBins()}; const int zBins{utils->getNzBins()}; for (unsigned int iROF{blockIdx.x}; iROF < endROF - startROF; iROF += gridDim.x) { - const int rof0 = iROF + startROF; + const short rof0 = iROF + startROF; auto primaryVertices = getPrimaryVertices(rof0, rofPV, totalROFs, multMask, vertices); const auto startVtx{vertexId >= 0 ? vertexId : 0}; const auto endVtx{vertexId >= 0 ? o2::gpu::CAMath::Min(vertexId + 1, static_cast(primaryVertices.size())) : static_cast(primaryVertices.size())}; - auto minROF = o2::gpu::CAMath::Max(startROF, static_cast(rof0 - deltaROF)); - auto maxROF = o2::gpu::CAMath::Min(endROF - 1, static_cast(rof0 + deltaROF)); + const short minROF = o2::gpu::CAMath::Max(startROF, static_cast(rof0 - deltaROF)); + const short maxROF = o2::gpu::CAMath::Min(endROF - 1, static_cast(rof0 + deltaROF)); auto clustersCurrentLayer = getClustersOnLayer(rof0, totalROFs, layerIndex, ROFClusters, clusters); if (clustersCurrentLayer.empty()) { continue; @@ -523,7 +523,7 @@ GPUg() void computeLayerTrackletsMultiROFKernel( } const int tableSize{phiBins * zBins + 1}; - for (int rof1{minROF}; rof1 <= maxROF; ++rof1) { + for (short rof1{minROF}; rof1 <= maxROF; ++rof1) { auto clustersNextLayer = getClustersOnLayer(rof1, totalROFs, layerIndex + 1, ROFClusters, clusters); if (clustersNextLayer.empty()) { continue; @@ -534,26 +534,24 @@ GPUg() void computeLayerTrackletsMultiROFKernel( const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; const int firstRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + firstBinIndex]; const int maxRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + maxBinIndex]; - for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { - if (iNextCluster >= clustersNextLayer.size()) { + for (int nextClusterIndex{firstRowClusterIndex}; nextClusterIndex < maxRowClusterIndex; ++nextClusterIndex) { + if (nextClusterIndex >= clustersNextLayer.size()) { break; } - const Cluster& nextCluster{clustersNextLayer[iNextCluster]}; + const Cluster& nextCluster{clustersNextLayer[nextClusterIndex]}; if (usedClusters[layerIndex + 1][nextCluster.clusterId]) { continue; } const float deltaPhi{o2::gpu::CAMath::Abs(currentCluster.phi - nextCluster.phi)}; - const float deltaZ{o2::gpu::CAMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + - currentCluster.zCoordinate - nextCluster.zCoordinate)}; + const float deltaZ{o2::gpu::CAMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)}; + const int nextSortedIndex{ROFClusters[layerIndex + 1][rof1] + nextClusterIndex}; if (deltaZ / sigmaZ < NSigmaCut && (deltaPhi < phiCut || o2::gpu::CAMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut)) { - // if (layerIndex > 0) { if constexpr (initRun) { trackletsLUT[layerIndex][currentSortedIndex]++; // we need l0 as well for usual exclusive sums. } else { - // } const float phi{o2::gpu::CAMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)}; - // tf->getTracklets()[layerIndex].emplace_back(currentSortedIndex, tf->getSortedIndex(rof1, layerIndex + 1, iNextCluster), tanL, phi, rof0, rof1); + new (tracklets[layerIndex] + trackletsLUT[layerIndex][currentSortedIndex] + storedTracklets) Tracklet{currentSortedIndex, nextSortedIndex, tanL, phi, rof0, rof1}; } ++storedTracklets; } @@ -809,7 +807,7 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const int** ROFClusters, const unsigned char** usedClusters, const int** clustersIndexTables, - Tracklet* tracklets, + Tracklet** tracklets, int** trackletsLUTs, const int iteration, const float NSigmaCut, @@ -859,7 +857,7 @@ void countCellsHandler( const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, - const Tracklet** tracklets, + Tracklet** tracklets, int** trackletsLUT, const int nTracklets, const int layer, @@ -909,7 +907,7 @@ void computeCellsHandler( const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, - const Tracklet** tracklets, + Tracklet** tracklets, int** trackletsLUT, const int nTracklets, const int layer, @@ -1128,7 +1126,7 @@ template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, const int** ROFClusters, const unsigned char** usedClusters, const int** clustersIndexTables, - Tracklet* tracklets, + Tracklet** tracklets, int** trackletsLUTs, const int iteration, const float NSigmaCut, diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index faeb0ecddfcc7..510d819776147 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -211,9 +211,9 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in return; } - for (auto iLayer{0}; iLayer < tf->getTracklets().size(); ++iLayer) { - std::cout << "tracklets layer " << iLayer << ": " << tf->getTracklets()[iLayer].size() << std::endl; - } + // for (auto iLayer{0}; iLayer < tf->getTracklets().size(); ++iLayer) { + // std::cout << "tracklets layer " << iLayer << ": " << tf->getTracklets()[iLayer].size() << std::endl; + // } // for (auto iLayer{0}; iLayer < tf->getTrackletsLookupTable().size(); ++iLayer) { // auto lut = tf->getTrackletsLookupTable()[iLayer]; From ff486d4c5b68d39204390ee2e1937d20aa9cb9dd Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Mon, 2 Dec 2024 18:59:02 +0100 Subject: [PATCH 13/14] tracklets on gpu --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 3 +- .../GPU/ITStrackingGPU/TrackingKernels.h | 3 ++ .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 7 --- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 32 ++++++------ .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 49 ++++++++++++++++--- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 42 +--------------- 6 files changed, 66 insertions(+), 70 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index e288fbae98396..4b9256253cc4e 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -138,13 +138,14 @@ class TimeFrameGPU : public TimeFrame // Host-specific getters gsl::span getHostNTracklets(const int chunkId); gsl::span getHostNCells(const int chunkId); + gsl::span getNTracklets() { return mNTracklets; } + gsl::span getNCells() { return mNCells; } // Host-available device getters gsl::span getDeviceTrackletsLUTs() { return mTrackletsLUTDevice; } gsl::span getDeviceCellLUTs() { return mCellsLUTDevice; } gsl::span getDeviceTracklet() { return mTrackletsDevice; } gsl::span getDeviceCells() { return mCellsDevice; } - gsl::span getNCellsDevice() { return mNCells; } private: void allocMemAsync(void**, size_t, Stream*, bool); // Abstract owned and unowned memory allocations diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 473f338aa4200..54bdae302e643 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -97,7 +97,10 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const unsigned char** usedClusters, const int** clustersIndexTables, Tracklet** tracklets, + gsl::span spanTracklets, + gsl::span nTracklets, int** trackletsLUTs, + gsl::span trackletsLUTsHost, const int iteration, const float NSigmaCut, std::vector& phiCuts, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 129060758ae2d..66180a9d14d95 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -489,13 +489,6 @@ void TimeFrameGPU::unregisterRest() LOGP(debug, "unregistering rest of the host memory..."); checkGPUError(cudaHostUnregister(mCellsDevice.data())); checkGPUError(cudaHostUnregister(mTrackletsDevice.data())); - checkGPUError(cudaHostUnregister(mTrackletsLUTDevice.data())); - for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { - if (iLayer < nLayers - 2) { - checkGPUError(cudaHostUnregister(mTrackletsLookupTable[iLayer].data())); - } - checkGPUError(cudaHostUnregister(mTracklets[iLayer].data())); - } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index d4ef4322cd43f..8db849daa49c3 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -144,7 +144,10 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int mTimeFrameGPU->getDeviceArrayUsedClusters(), mTimeFrameGPU->getDeviceArrayClustersIndexTables(), mTimeFrameGPU->getDeviceArrayTracklets(), + mTimeFrameGPU->getDeviceTracklet(), + mTimeFrameGPU->getNTracklets(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), + mTimeFrameGPU->getDeviceTrackletsLUTs(), iteration, mTrkParams[iteration].NSigmaCut, mTimeFrameGPU->getPhiCuts(), @@ -161,25 +164,22 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int template void TrackerTraitsGPU::computeCellsHybrid(const int iteration) { - mTimeFrameGPU->loadTrackletsDevice(); - mTimeFrameGPU->loadTrackletsLUTDevice(); mTimeFrameGPU->createCellsLUTDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - // #pragma omp parallel for num_threads(nLayers) for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) { - if (mTimeFrameGPU->getTracklets()[iLayer + 1].empty() || - mTimeFrameGPU->getTracklets()[iLayer].empty()) { + if (!mTimeFrameGPU->getNTracklets()[iLayer + 1] || !mTimeFrameGPU->getNTracklets()[iLayer]) { + LOGP(info, "continuing here"); continue; } - - const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getTracklets()[iLayer].size())}; + LOGP(info, "+> {}", mTimeFrameGPU->getNTracklets()[iLayer]); + const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getNTracklets()[iLayer])}; countCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getDeviceArrayUnsortedClusters(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), mTimeFrameGPU->getDeviceArrayTracklets(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getTracklets()[iLayer].size(), + mTimeFrameGPU->getNTracklets()[iLayer], iLayer, nullptr, mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -196,7 +196,7 @@ void TrackerTraitsGPU::computeCellsHybrid(const int iteration) mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), mTimeFrameGPU->getDeviceArrayTracklets(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getTracklets()[iLayer].size(), + mTimeFrameGPU->getNTracklets()[iLayer], iLayer, mTimeFrameGPU->getDeviceCells()[iLayer], mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -220,7 +220,7 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); std::vector>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1); for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { - const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCellsDevice()[iLayer + 1])}; + const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer + 1])}; mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear(); mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].resize(nextLayerCellsNum, 0); @@ -283,7 +283,7 @@ void TrackerTraitsGPU::findRoads(const int iteration) std::vector lastCellId, updatedCellId; std::vector lastCellSeed, updatedCellSeed; - processNeighbours(startLayer, startLevel, mTimeFrame->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId); + processNeighbours(startLayer, startLevel, mTimeFrameGPU->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId); int level = startLevel; for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { @@ -337,8 +337,8 @@ void TrackerTraitsGPU::findRoads(const int iteration) if (track.getClusterIndex(iLayer) == UnusedIndex) { continue; } - nShared += int(mTimeFrame->isClusterUsed(iLayer, track.getClusterIndex(iLayer))); - isFirstShared |= !iLayer && mTimeFrame->isClusterUsed(iLayer, track.getClusterIndex(iLayer)); + nShared += int(mTimeFrameGPU->isClusterUsed(iLayer, track.getClusterIndex(iLayer))); + isFirstShared |= !iLayer && mTimeFrameGPU->isClusterUsed(iLayer, track.getClusterIndex(iLayer)); } if (nShared > mTrkParams[0].ClusterSharing) { @@ -350,8 +350,8 @@ void TrackerTraitsGPU::findRoads(const int iteration) if (track.getClusterIndex(iLayer) == UnusedIndex) { continue; } - mTimeFrame->markUsedCluster(iLayer, track.getClusterIndex(iLayer)); - int currentROF = mTimeFrame->getClusterROF(iLayer, track.getClusterIndex(iLayer)); + mTimeFrameGPU->markUsedCluster(iLayer, track.getClusterIndex(iLayer)); + int currentROF = mTimeFrameGPU->getClusterROF(iLayer, track.getClusterIndex(iLayer)); for (int iR{0}; iR < 3; ++iR) { if (rofs[iR] == INT_MAX) { rofs[iR] = currentROF; @@ -367,7 +367,7 @@ void TrackerTraitsGPU::findRoads(const int iteration) if (rofs[1] != INT_MAX) { track.setNextROFbit(); } - mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track); + mTimeFrameGPU->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track); } } mTimeFrameGPU->loadUsedClustersDevice(); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index c1e754b3c5b45..00c26a67bcb51 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -192,6 +192,17 @@ GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, 0.f, 0.f, 0.f, 0.f, sg2q2pt}); } +// auto sort_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); }; +// auto equal_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; }; + +struct sort_tracklets { + GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); } +}; + +struct equal_tracklets { + GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; } +}; + template struct pair_to_first : public thrust::unary_function, T1> { GPUhd() int operator()(const gpuPair& a) const @@ -686,10 +697,7 @@ GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, const int nTracklets) { for (int currentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; currentTrackletIndex < nTracklets; currentTrackletIndex += blockDim.x * gridDim.x) { - auto& tracklet{tracklets[currentTrackletIndex]}; - if (tracklet.firstClusterIndex >= 0) { - atomicAdd(trackletsLookUpTable + tracklet.firstClusterIndex, 1); - } + atomicAdd(&trackletsLookUpTable[tracklets[currentTrackletIndex].firstClusterIndex], 1); } } @@ -808,7 +816,10 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const unsigned char** usedClusters, const int** clustersIndexTables, Tracklet** tracklets, + gsl::span spanTracklets, + gsl::span nTracklets, int** trackletsLUTs, + gsl::span trackletsLUTsHost, const int iteration, const float NSigmaCut, std::vector& phiCuts, @@ -848,8 +859,31 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, resolutions[iLayer], radii[iLayer + 1] - radii[iLayer], mulScatAng[iLayer]); - gpuCheckError(cudaPeekAtLastError()); - gpuCheckError(cudaDeviceSynchronize()); + thrust::device_ptr tracklets_ptr(spanTracklets[iLayer]); + thrust::sort(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets()); + auto unique_end = thrust::unique(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets()); + nTracklets[iLayer] = unique_end - tracklets_ptr; + LOGP(info, "=> {} {}", nTracklets[iLayer], unique_end - tracklets_ptr); + if (iLayer > 0) { + gpuCheckError(cudaMemset(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int))); + gpu::compileTrackletsLookupTableKernel<<>>(spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + gpuCheckError(cudaFree(d_temp_storage)); + } } } @@ -1127,7 +1161,10 @@ template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, const unsigned char** usedClusters, const int** clustersIndexTables, Tracklet** tracklets, + gsl::span spanTracklets, + gsl::span nTracklets, int** trackletsLUTs, + gsl::span trackletsLUTsHost, const int iteration, const float NSigmaCut, std::vector& phiCuts, diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index 510d819776147..cfeb2cbc73a8b 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -193,56 +193,17 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in } } } - // if (rof0 == 81) { - // printf("CPU layer: %d -> %f %f %f %f %f %f %f %f\n", - // iLayer, - // mTrkParams[iteration].NSigmaCut, - // tf->getPhiCut(iLayer), - // mTrkParams[iteration].PVres, - // tf->getMinR(iLayer + 1), - // tf->getMaxR(iLayer + 1), - // tf->getPositionResolution(iLayer), - // meanDeltaR, - // tf->getMSangle(iLayer)); - // } } } if (!tf->checkMemory(mTrkParams[iteration].MaxMemory)) { return; } - // for (auto iLayer{0}; iLayer < tf->getTracklets().size(); ++iLayer) { - // std::cout << "tracklets layer " << iLayer << ": " << tf->getTracklets()[iLayer].size() << std::endl; - // } - - // for (auto iLayer{0}; iLayer < tf->getTrackletsLookupTable().size(); ++iLayer) { - // auto lut = tf->getTrackletsLookupTable()[iLayer]; - // for (unsigned int iC{0}; iC < lut.size(); ++iC) { - // if (!(iC % 150)) { - // printf("\n row %d: ===> %d/%d\t", iLayer, iC, (int)lut.size()); - // } - // printf("%d\t", lut[iC]); - // } - // } - - // for (auto rofId{0}; rofId < 2304; ++rofId) { - // int nClus = tf->getClustersOnLayer(rofId, 1).size(); - // if (!nClus) { - // continue; - // } - // printf("rof: %d (%d) ==> ", rofId, nClus); - - // for (int iC{0}; iC < nClus; ++iC) { - // int nT = tf->getTrackletsLookupTable()[0][tf->getSortedIndex(rofId, 1, iC)]; - // printf("%d\t", nT); - // } - // printf("\n"); - // } - #pragma omp parallel for num_threads(mNThreads) for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) { /// Sort tracklets auto& trkl{tf->getTracklets()[iLayer + 1]}; + auto oldsize{trkl.size()}; std::sort(trkl.begin(), trkl.end(), [](const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); }); @@ -265,6 +226,7 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in /// Compute LUT std::exclusive_scan(lut.begin(), lut.end(), lut.begin(), 0); lut.push_back(trkl.size()); + LOGP(info, "CPU layer {} -> old size: {} - new size: {}", iLayer, oldsize, trkl.size()); } /// Layer 0 is done outside the loop std::sort(tf->getTracklets()[0].begin(), tf->getTracklets()[0].end(), [](const Tracklet& a, const Tracklet& b) { From afaf9fc05467ad1ac04073897c80f933f97f7dd7 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 3 Dec 2024 10:31:09 +0100 Subject: [PATCH 14/14] Tracklet finder on GPU --- .../ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h | 2 -- .../ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt | 2 +- .../ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 14 +++++++------- .../ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx | 6 ++---- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 1 - .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 2 -- 6 files changed, 10 insertions(+), 17 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 4b9256253cc4e..37f392ebbd3a7 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -136,8 +136,6 @@ class TimeFrameGPU : public TimeFrame void setDevicePropagator(const o2::base::PropagatorImpl*) override; // Host-specific getters - gsl::span getHostNTracklets(const int chunkId); - gsl::span getHostNCells(const int chunkId); gsl::span getNTracklets() { return mNTracklets; } gsl::span getNCells() { return mNCells; } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt index e2fc1f1388ad0..3cdb107e07438 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt @@ -13,7 +13,7 @@ if(CUDA_ENABLED) find_package(CUDAToolkit) message(STATUS "Building ITS CUDA tracker") -add_compile_options(-O0 -g -lineinfo -fPIC) +# add_compile_options(-O0 -g -lineinfo -fPIC) # add_compile_definitions(ITS_MEASURE_GPU_TIME) o2_add_library(ITStrackingCUDA SOURCES ClusterLinesGPU.cu diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 66180a9d14d95..4bd15c0203d81 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -341,9 +341,9 @@ void TimeFrameGPU::createCellsLUTDevice() { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells LUTs"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mTracklets[iLayer].size() + 1, iLayer, (mTracklets[iLayer].size() + 1) * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mCellsLUTDevice[iLayer]), (mTracklets[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); - checkGPUError(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mTracklets[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); + LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mNTracklets[iLayer] + 1, iLayer, (mNTracklets[iLayer] + 1) * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator()); checkGPUError(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); @@ -355,7 +355,7 @@ void TimeFrameGPU::createCellsBuffers(const int layer) { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); mNCells[layer] = 0; - checkGPUError(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mTracklets[layer].size(), sizeof(int), cudaMemcpyDeviceToHost)); + checkGPUError(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost)); LOGP(debug, "gpu-transfer: creating cell buffer for {} elements on layer {}, for {} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / MB); allocMemAsync(reinterpret_cast(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), nullptr, getExtAllocator()); @@ -446,9 +446,9 @@ void TimeFrameGPU::downloadCellsLUTDevice() { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading cell luts"); for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { - LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mTracklets[iLayer + 1].size() + 1)); - mCellsLookupTable[iLayer].resize(mTracklets[iLayer + 1].size() + 1); - checkGPUError(cudaMemcpyAsync(mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mTracklets[iLayer + 1].size() + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mNTracklets[iLayer + 1] + 1)); + mCellsLookupTable[iLayer].resize(mNTracklets[iLayer + 1] + 1); + checkGPUError(cudaMemcpyAsync(mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 8db849daa49c3..ae86507e46325 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -76,7 +76,7 @@ int TrackerTraitsGPU::getTFNumberOfClusters() const template int TrackerTraitsGPU::getTFNumberOfTracklets() const { - return mTimeFrameGPU->getNumberOfTracklets(); + return std::accumulate(mTimeFrameGPU->getNTracklets().begin(), mTimeFrameGPU->getNTracklets().end(), 0); } template @@ -91,7 +91,7 @@ template void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int iROFslice, int iVertex) { auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex); + // TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex); mTimeFrameGPU->createTrackletsLUTDevice(iteration); const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f); @@ -169,10 +169,8 @@ void TrackerTraitsGPU::computeCellsHybrid(const int iteration) for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) { if (!mTimeFrameGPU->getNTracklets()[iLayer + 1] || !mTimeFrameGPU->getNTracklets()[iLayer]) { - LOGP(info, "continuing here"); continue; } - LOGP(info, "+> {}", mTimeFrameGPU->getNTracklets()[iLayer]); const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getNTracklets()[iLayer])}; countCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getDeviceArrayUnsortedClusters(), diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 00c26a67bcb51..229827611c077 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -863,7 +863,6 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, thrust::sort(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets()); auto unique_end = thrust::unique(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets()); nTracklets[iLayer] = unique_end - tracklets_ptr; - LOGP(info, "=> {} {}", nTracklets[iLayer], unique_end - tracklets_ptr); if (iLayer > 0) { gpuCheckError(cudaMemset(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int))); gpu::compileTrackletsLookupTableKernel<<>>(spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]); diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index cfeb2cbc73a8b..409b20ea23235 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -203,7 +203,6 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) { /// Sort tracklets auto& trkl{tf->getTracklets()[iLayer + 1]}; - auto oldsize{trkl.size()}; std::sort(trkl.begin(), trkl.end(), [](const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); }); @@ -226,7 +225,6 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in /// Compute LUT std::exclusive_scan(lut.begin(), lut.end(), lut.begin(), 0); lut.push_back(trkl.size()); - LOGP(info, "CPU layer {} -> old size: {} - new size: {}", iLayer, oldsize, trkl.size()); } /// Layer 0 is done outside the loop std::sort(tf->getTracklets()[0].begin(), tf->getTracklets()[0].end(), [](const Tracklet& a, const Tracklet& b) {