diff --git a/quest/include/environment.h b/quest/include/environment.h index 04f24bfe..f5630813 100644 --- a/quest/include/environment.h +++ b/quest/include/environment.h @@ -83,6 +83,14 @@ int isQuESTEnvInit(); QuESTEnv getQuESTEnv(); +/** @notyetdoced + * GPU thread per block control + * This is somehow probably the best pre-existing place for this. It only really applies to GPU, because for + * OpenMP the user can just export OMP_NUM_THREADS or call omp_set_num_threads. + */ +int getQuESTNumGpuThreadsPerBlock(); +void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock); + // end de-mangler #ifdef __cplusplus diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp index 54149189..3d10a689 100644 --- a/quest/src/api/environment.cpp +++ b/quest/src/api/environment.cpp @@ -492,22 +492,34 @@ void reportQuESTEnv() { void getEnvironmentString(char str[200]) { validate_envIsInit(__func__); - QuESTEnv env = getQuESTEnv(); - int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1; - int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled(); - int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible(); + int cuQuantum = globalEnvPtr->isGpuAccelerated && gpu_isCuQuantumCompiled(); + int gpuDirect = globalEnvPtr->isGpuAccelerated && gpu_isDirectGpuCommPossible(); snprintf(str, 200, "CUDA=%d OpenMP=%d MPI=%d threads=%d ranks=%d cuQuantum=%d gpuDirect=%d", - env.isGpuAccelerated, - env.isMultithreaded, - env.isDistributed, + globalEnvPtr->isGpuAccelerated, + globalEnvPtr->isMultithreaded, + globalEnvPtr->isDistributed, numThreads, - env.numNodes, + globalEnvPtr->numNodes, cuQuantum, gpuDirect); } +int getQuESTNumGpuThreadsPerBlock() { + validate_envIsInit(__func__); + + return globalEnvPtr->isGpuAccelerated? gpu_getNumThreadsPerBlock() : 0; +} + +void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock) { + validate_envIsInit(__func__); + + // just rely on the internal function to throw an error if there's no GPU support compiled + gpu_setNumThreadsPerBlock(newThreadsPerBlock); + return; +} + // end de-mangler } diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index c11ec224..ad8e303a 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -79,9 +79,7 @@ int cpu_getAvailableNumThreads() { #if COMPILE_OPENMP int n = -1; - #pragma omp parallel shared(n) - #pragma omp single - n = omp_get_num_threads(); + n = omp_get_max_threads(); return n; #else diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp index c7db834b..588779c4 100644 --- a/quest/src/gpu/gpu_config.cpp +++ b/quest/src/gpu/gpu_config.cpp @@ -41,6 +41,7 @@ #include "quest/src/gpu/cuda_to_hip.hpp" #endif +int numThreadsPerBlock = 128; /* @@ -330,6 +331,24 @@ qindex gpu_getMaxNumConcurrentThreads() { * ENVIRONMENT MANAGEMENT */ +int gpu_getNumThreadsPerBlock() { +#if COMPILE_CUDA + return numThreadsPerBlock; +#else + error_gpuQueriedButGpuNotCompiled(); + return -1; +#endif +} + +void gpu_setNumThreadsPerBlock(const int newThreadsPerBlock) { +#if COMPILE_CUDA + numThreadsPerBlock = newThreadsPerBlock; +#else + error_gpuQueriedButGpuNotCompiled(); +#endif + return; +} + std::array getBoundGpuUuid() { #if COMPILE_CUDA diff --git a/quest/src/gpu/gpu_config.hpp b/quest/src/gpu/gpu_config.hpp index 1b3be629..0787e127 100644 --- a/quest/src/gpu/gpu_config.hpp +++ b/quest/src/gpu/gpu_config.hpp @@ -19,7 +19,6 @@ #include "quest/include/channels.h" - /* * CUDA ERROR HANDLING */ @@ -65,6 +64,10 @@ qindex gpu_getMaxNumConcurrentThreads(); * ENVIRONMENT MANAGEMENT */ +int gpu_getNumThreadsPerBlock(); + +void gpu_setNumThreadsPerBlock(const int newThreadsPerBlock); + void gpu_bindLocalGPUsToNodes(); bool gpu_areAnyNodesBoundToSameGpu(); @@ -76,7 +79,6 @@ void gpu_initCuQuantum(); void gpu_finalizeCuQuantum(); - /* * MEMORY MANAGEMENT */ @@ -122,4 +124,4 @@ size_t gpu_getCacheMemoryInBytes(); -#endif // GPU_CONFIG_HPP \ No newline at end of file +#endif // GPU_CONFIG_HPP diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 30377850..74ee13f6 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -42,23 +42,19 @@ * THREAD MANAGEMENT */ - -const int NUM_THREADS_PER_BLOCK = 128; - - __forceinline__ __device__ qindex getThreadInd() { return blockIdx.x*blockDim.x + threadIdx.x; } -__host__ qindex getNumBlocks(qindex numThreads) { +__host__ qindex getNumBlocks(qindex numThreads, const int numThreadsPerBlock) { /// @todo /// improve this with cudaOccupancyMaxPotentialBlockSize(), /// making it function specific // CUDA ceil - return ceil(numThreads / static_cast(NUM_THREADS_PER_BLOCK)); + return ceil(numThreads / static_cast(numThreadsPerBlock)); } diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 527b568e..0da26554 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -66,7 +66,6 @@ using std::vector; - /* * GETTERS */ @@ -141,13 +140,14 @@ qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, SmallList qubits, SmallList #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / powerOf2(qubits.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex sendInd = getSubBufferSendInd(qureg); devints sortedQubits = getDevInts(util_getSorted(qubits)); qindex qubitStateMask = util_getBitMask(qubits, qubitStates); - kernel_statevec_packAmpsIntoBuffer <<>> ( + kernel_statevec_packAmpsIntoBuffer <<>> ( getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + sendInd, numThreads, getPtr(sortedQubits), qubits.size(), qubitStateMask ); @@ -169,10 +169,11 @@ qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 8; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex sendInd = getSubBufferSendInd(qureg); - kernel_statevec_packPairSummedAmpsIntoBuffer <<>> ( + kernel_statevec_packPairSummedAmpsIntoBuffer <<>> ( getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + sendInd, numThreads, qubit1, qubit2, qubit3, bit2 ); @@ -208,12 +209,13 @@ void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, SmallList ctrls, SmallList ctrlS #elif COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode / powerOf2(2 + ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ2, targ1})); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1}); - kernel_statevec_anyCtrlSwap_subA <<>> ( + kernel_statevec_anyCtrlSwap_subA <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2 ); @@ -232,13 +234,14 @@ void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, SmallList ctrls, SmallList ctrlS #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); devints sortedCtrls = getDevInts(util_getSorted(ctrls)); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); - kernel_statevec_anyCtrlSwap_subB <<>> ( + kernel_statevec_anyCtrlSwap_subB <<>> ( getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask ); @@ -257,13 +260,14 @@ void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, SmallList ctrls, SmallList ctrlS #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / powerOf2(1 + ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ})); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState}); - kernel_statevec_anyCtrlSwap_subC <<>> ( + kernel_statevec_anyCtrlSwap_subC <<>> ( getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask ); @@ -300,14 +304,15 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, SmallList ctrls, Sma #elif COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1); - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ})); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); auto [m00, m01, m10, m11] = getFlattenedGpuQcompMatrix<2>(matr.elems); // explicit template for MSVC, grr! - kernel_statevec_anyCtrlOneTargDenseMatr_subA <<>> ( + kernel_statevec_anyCtrlOneTargDenseMatr_subA <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ, m00, m01, m10, m11 @@ -327,13 +332,14 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, SmallList ctrls, Sma #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); devints sortedCtrls = getDevInts(util_getSorted(ctrls)); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); - kernel_statevec_anyCtrlOneTargDenseMatr_subB <<>> ( + kernel_statevec_anyCtrlOneTargDenseMatr_subB <<>> ( getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, getGpuQcomp(fac0), getGpuQcomp(fac1) @@ -370,7 +376,8 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, SmallList ctrls, Smal #elif COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 2); - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ1,targ2})); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1,targ2}, {0,0}); @@ -378,7 +385,7 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, SmallList ctrls, Smal // unpack matrix elems which are more efficiently accessed by kernels as args than shared mem (... maybe...) auto m = getFlattenedGpuQcompMatrix<4>(matr.elems); // explicit template for MSVC, grr! - kernel_statevec_anyCtrlTwoTargDenseMatr_sub <<>> ( + kernel_statevec_anyCtrlTwoTargDenseMatr_sub <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2, m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], @@ -455,7 +462,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, SmallList ctrls, Smal if constexpr (NumTargs != -1) { // when NumTargs <= 5, each thread has a private array stored in the registers, - // enabling rapid IO. Given NUM_THREADS_PER_BLOCK = 128, the maximum size of + // enabling rapid IO. Given numThreadsPerBlock = 128, the maximum size of // this array per-block is 16 * 128 * 2^5 B = 64 KiB which exceeds shared // memory capacity, but does NOT exceed maximum register capacity. @@ -465,11 +472,12 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, SmallList ctrls, Smal /// global memory) and greatly sabotage performance on some GPUs. qindex numThreads = numBatches; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); kernel_statevec_anyCtrlFewTargDenseMatr - <<>> ( + <<>> ( ampsPtr, numThreads, qubitsPtr, nCtrls, qubitStateMask, targsPtr, matrPtr @@ -488,6 +496,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, SmallList ctrls, Smal // where we assign one-block per multiprocessor because we are anyway memory- // bandwidth bound (so we don't expect many interweaved blocks per MP). qindex numThreads = gpu_getMaxNumConcurrentThreads(); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); // use strictly 2^# threads to maintain precondition of all kernels if (!isPowerOf2(numThreads)) @@ -499,15 +508,15 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, SmallList ctrls, Smal // evenly distribute the batches between threads, and the threads unevenly between blocks qindex numBatchesPerThread = numBatches / numThreads; // divides evenly - qindex numBlocks = getNumBlocks(numThreads); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); // expand the cache if necessary - qindex numKernelInvocations = numBlocks * NUM_THREADS_PER_BLOCK; + qindex numKernelInvocations = numBlocks * numThreadsPerBlock; qcomp* cache = gpu_getCacheOfSize(powerOf2(targs.size()), numKernelInvocations); kernel_statevec_anyCtrlManyTargDenseMatr - <<>> ( + <<>> ( getGpuQcompPtr(cache), ampsPtr, numThreads, numBatchesPerThread, qubitsPtr, nCtrls, qubitStateMask, @@ -569,13 +578,14 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, SmallList ctrls, Small /// efficient (because of improved parallelisation granularity) qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); devints deviceCtrls = getDevInts(util_getSorted(ctrls)); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); auto elems = getGpuQcompArray<2>(matr.elems); // explicit template for MSVC, grr! - kernel_statevec_anyCtrlOneTargDiagMatr_sub <<>> ( + kernel_statevec_anyCtrlOneTargDiagMatr_sub <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ, elems[0], elems[1] ); @@ -639,13 +649,14 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, SmallList ctrls, Small /// efficient (because of improved parallelisation granularity) qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); devints deviceCtrls = getDevInts(util_getSorted(ctrls)); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); auto elems = getGpuQcompArray<4>(matr.elems); // explicit template for MSVC, grr! - kernel_statevec_anyCtrlTwoTargDiagMatr_sub <<>> ( + kernel_statevec_anyCtrlTwoTargDiagMatr_sub <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ1, targ2, elems[0], elems[1], elems[2], elems[3] @@ -707,13 +718,14 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, SmallList ctrls, Small /// efficient (because of improved parallelisation granularity) qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); devints deviceTargs = getDevInts(targs); devints deviceCtrls = getDevInts(util_getSorted(ctrls)); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); - kernel_statevec_anyCtrlAnyTargDiagMatr_sub <<>> ( + kernel_statevec_anyCtrlAnyTargDiagMatr_sub <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(), getGpuQcompPtr(util_getGpuMemPtr(matr)), getGpuQcomp(exponent) @@ -764,11 +776,12 @@ void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); kernel_densmatr_allTargDiagMatr_sub - <<>> ( + <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getGpuQcompPtr(util_getGpuMemPtr(matr)), matr.numElems, getGpuQcomp(exponent) ); @@ -826,8 +839,9 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, SmallList ctrl // faster than when giving threads many pair-amps to modify, due to memory movements qindex numThreads = (qureg.numAmpsPerNode / powerOf2(ctrls.size())) / 2; // divides evenly - qindex numBlocks = getNumBlocks(numThreads); - kernel_statevector_anyCtrlPauliTensorOrGadget_subA <<>> ( + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); + kernel_statevector_anyCtrlPauliTensorOrGadget_subA <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(deviceQubits), ctrls.size(), qubitStateMask, getPtr(deviceTargs), deviceTargs.size(), @@ -848,7 +862,8 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, SmallList ctrl #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); qcomp powI = util_getPowerOfI(y.size()); @@ -858,7 +873,7 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, SmallList ctrl devints sortedCtrls = getDevInts(util_getSorted(ctrls)); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); - kernel_statevector_anyCtrlPauliTensorOrGadget_subB <<>> ( + kernel_statevector_anyCtrlPauliTensorOrGadget_subB <<>> ( getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, maskXY, maskYZ, bufferMaskXY, @@ -889,13 +904,14 @@ void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, SmallList ctr #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); devints sortedCtrls = getDevInts(util_getSorted(ctrls)); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); qindex targMask = util_getBitMask(targs); - kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <<>> ( + kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, targMask, getGpuQcomp(fac0), getGpuQcomp(fac1) @@ -922,7 +938,8 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector coeffs #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = outQureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); // extract amp ptrs from qureg list vector ptrs; @@ -934,7 +951,7 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector coeffs devgpuqcompptrs devQuregAmps = ptrs; devcomps devCoeffs = coeffs; - kernel_statevec_setQuregToWeightedSum_sub <<>> ( + kernel_statevec_setQuregToWeightedSum_sub <<>> ( getGpuQcompPtr(outQureg.gpuAmps), numThreads, getPtr(devCoeffs), getPtr(devQuregAmps), inQuregs.size() ); @@ -962,9 +979,10 @@ void gpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qur #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = outQureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); - kernel_densmatr_mixQureg_subB <<>> ( + kernel_densmatr_mixQureg_subB <<>> ( outProb, getGpuQcompPtr(outQureg.gpuAmps), inProb, getGpuQcompPtr(inQureg.gpuAmps), numThreads, inQureg.numAmps ); @@ -980,9 +998,10 @@ void gpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb) { #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = outQureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); - kernel_densmatr_mixQureg_subC <<>> ( + kernel_densmatr_mixQureg_subC <<>> ( outProb, getGpuQcompPtr(outQureg.gpuAmps), inProb, getGpuQcompPtr(outQureg.gpuCommBuffer), numThreads, outQureg.rank, powerOf2(outQureg.numQubits), outQureg.logNumAmpsPerNode ); @@ -1012,12 +1031,13 @@ void gpu_densmatr_oneQubitDephasing_subA(Qureg qureg, int ketQubit, qreal prob) #elif COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode / 4; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); auto fac = util_getOneQubitDephasingFactor(prob); int braQubit = util_getBraQubit(ketQubit, qureg); - kernel_densmatr_oneQubitDephasing_subA <<>> ( + kernel_densmatr_oneQubitDephasing_subA <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, fac ); @@ -1038,12 +1058,13 @@ void gpu_densmatr_oneQubitDephasing_subB(Qureg qureg, int ketQubit, qreal prob) #elif COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode / 2; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); auto fac = util_getOneQubitDephasingFactor(prob); int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); - kernel_densmatr_oneQubitDephasing_subB <<>> ( + kernel_densmatr_oneQubitDephasing_subB <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braBit, fac ); @@ -1083,13 +1104,14 @@ void gpu_densmatr_twoQubitDephasing_subB(Qureg qureg, int ketQubitA, int ketQubi #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); auto term = util_getTwoQubitDephasingTerm(prob); int braQubitA = util_getBraQubit(ketQubitA, qureg); int braQubitB = util_getBraQubit(ketQubitB, qureg); - kernel_densmatr_twoQubitDephasing_subB <<>> ( + kernel_densmatr_twoQubitDephasing_subB <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, // numAmps, not numCols ketQubitA, ketQubitB, braQubitA, braQubitB, term ); @@ -1111,12 +1133,13 @@ void gpu_densmatr_oneQubitDepolarising_subA(Qureg qureg, int ketQubit, qreal pro #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 4; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); int braQubit = util_getBraQubit(ketQubit, qureg); auto factors = util_getOneQubitDepolarisingFactors(prob); - kernel_densmatr_oneQubitDepolarising_subA <<>> ( + kernel_densmatr_oneQubitDepolarising_subA <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3 ); @@ -1131,13 +1154,14 @@ void gpu_densmatr_oneQubitDepolarising_subB(Qureg qureg, int ketQubit, qreal pro #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 2; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); auto factors = util_getOneQubitDepolarisingFactors(prob); - kernel_densmatr_oneQubitDepolarising_subB <<>> ( + kernel_densmatr_oneQubitDepolarising_subB <<>> ( getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, ketQubit, braBit, factors.c1, factors.c2, factors.c3 ); @@ -1159,13 +1183,14 @@ void gpu_densmatr_twoQubitDepolarising_subA(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); int braQb1 = util_getBraQubit(ketQb1, qureg); int braQb2 = util_getBraQubit(ketQb2, qureg); auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3; - kernel_densmatr_twoQubitDepolarising_subA <<>> ( + kernel_densmatr_twoQubitDepolarising_subA <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braQb2, c3 ); @@ -1181,7 +1206,8 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 16; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); int braQb1 = util_getBraQubit(ketQb1, qureg); int braQb2 = util_getBraQubit(ketQb2, qureg); @@ -1190,7 +1216,7 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2, // each kernel invocation sums all 4 amps together, so adjusts c1 qreal altc1 = factors.c1 - factors.c2; - kernel_densmatr_twoQubitDepolarising_subB <<>> ( + kernel_densmatr_twoQubitDepolarising_subB <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braQb2, altc1, factors.c2 ); @@ -1206,13 +1232,14 @@ void gpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); int braQb1 = util_getBraQubit(ketQb1, qureg); int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg); auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3; - kernel_densmatr_twoQubitDepolarising_subC <<>> ( + kernel_densmatr_twoQubitDepolarising_subC <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braBit2, c3 ); @@ -1228,14 +1255,15 @@ void gpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 8; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex offset = getBufferRecvInd(); int braQb1 = util_getBraQubit(ketQb1, qureg); int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg); auto factors = util_getTwoQubitDepolarisingFactors(prob); - kernel_densmatr_twoQubitDepolarising_subD <<>> ( + kernel_densmatr_twoQubitDepolarising_subD <<>> ( getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + offset, numThreads, ketQb1, ketQb2, braQb1, braBit2, factors.c1, factors.c2 ); @@ -1251,7 +1279,8 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg); int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg); @@ -1260,7 +1289,7 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2, qreal fac0 = 1 + factors.c3; qreal fac1 = factors.c1 - fac0; - kernel_densmatr_twoQubitDepolarising_subE <<>> ( + kernel_densmatr_twoQubitDepolarising_subE <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braBit1, braBit2, fac0, fac1 ); @@ -1276,14 +1305,15 @@ void gpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 4; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex offset = getBufferRecvInd(); int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg); int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg); auto c2 = util_getTwoQubitDepolarisingFactors(prob).c2; - kernel_densmatr_twoQubitDepolarising_subF <<>> ( + kernel_densmatr_twoQubitDepolarising_subF <<>> ( getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + offset, numThreads, ketQb1, ketQb2, braBit1, braBit2, c2 ); @@ -1305,12 +1335,13 @@ void gpu_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int ketQubit, qreal pI, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 4; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); int braQubit = util_getBraQubit(ketQubit, qureg); auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ); - kernel_densmatr_oneQubitPauliChannel_subA <<>> ( + kernel_densmatr_oneQubitPauliChannel_subA <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3, factors.c4 ); @@ -1326,13 +1357,14 @@ void gpu_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int ketQubit, qreal pI, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 2; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ); - kernel_densmatr_oneQubitPauliChannel_subB <<>> ( + kernel_densmatr_oneQubitPauliChannel_subB <<>> ( getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, ketQubit, braBit, factors.c1, factors.c2, factors.c3, factors.c4 ); @@ -1354,12 +1386,13 @@ void gpu_densmatr_oneQubitDamping_subA(Qureg qureg, int ketQubit, qreal prob) { #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 4; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); int braQubit = util_getBraQubit(ketQubit, qureg); auto factors = util_getOneQubitDampingFactors(prob); - kernel_densmatr_oneQubitDamping_subA <<>> ( + kernel_densmatr_oneQubitDamping_subA <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, prob, factors.c1, factors.c2 ); @@ -1375,11 +1408,12 @@ void gpu_densmatr_oneQubitDamping_subB(Qureg qureg, int qubit, qreal prob) { #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 2; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); auto c2 = util_getOneQubitDampingFactors(prob).c2; - kernel_densmatr_oneQubitDamping_subB <<>> ( + kernel_densmatr_oneQubitDamping_subB <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, qubit, c2 ); @@ -1394,12 +1428,13 @@ void gpu_densmatr_oneQubitDamping_subC(Qureg qureg, int ketQubit, qreal prob) { #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 2; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); auto braBit = util_getRankBitOfBraQubit(ketQubit, qureg); auto c1 = util_getOneQubitDampingFactors(prob).c1; - kernel_densmatr_oneQubitDamping_subC <<>> ( + kernel_densmatr_oneQubitDamping_subC <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braBit, c1 ); @@ -1414,10 +1449,11 @@ void gpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) { #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 2; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); - kernel_densmatr_oneQubitDamping_subD <<>> ( + kernel_densmatr_oneQubitDamping_subD <<>> ( getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, qubit, prob ); @@ -1442,13 +1478,14 @@ void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, SmallList targ #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = outQureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); devints devTargs = getDevInts(targs); devints devPairTargs = getDevInts(pairTargs); devints devAllTargs = getDevInts(util_getSorted(targs, pairTargs)); - kernel_densmatr_partialTrace_sub <<>> ( + kernel_densmatr_partialTrace_sub <<>> ( getGpuQcompPtr(inQureg.gpuAmps), getGpuQcompPtr(outQureg.gpuAmps), numThreads, getPtr(devTargs), getPtr(devPairTargs), getPtr(devAllTargs), targs.size() ); @@ -1562,13 +1599,14 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu #if COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); // allocate exponentially-big temporary memory (error if failed) devints devQubits = getDevInts(qubits); devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws - kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( + kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( getPtr(devProbs), getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(devQubits), devQubits.size() ); @@ -1596,7 +1634,8 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu // we decouple numColsPerNode and numThreads for clarity // (and in case parallelisation granularity ever changes); qindex numThreads = powerOf2(qureg.logNumColsPerNode); - qindex numBlocks = getNumBlocks(numThreads); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg); qindex numAmpsPerCol = powerOf2(qureg.numQubits); @@ -1605,7 +1644,7 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu devints devQubits = getDevInts(qubits); devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws - kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( + kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( getPtr(devProbs), getGpuQcompPtr(qureg.gpuAmps), numThreads, firstDiagInd, numAmpsPerCol, qureg.rank, qureg.logNumAmpsPerNode,