Conversation
Greptile Summary

This PR adds PyTorch symmetric memory backend support as an alternative to nvfuser's native CUDA VMM implementation. The implementation allows users to choose between Native (default), PyTorchNccl, PyTorchNvshmem, or PyTorchCuda backends via the `NVFUSER_ENABLE=symmetric_memory_backend(...)` option.

Key changes:
Issues found:
Confidence Score: 3/5
Important Files Changed
Flowchart

%%{init: {'theme': 'neutral'}}%%
flowchart TD
Start[SymmetricTensor::allocate] --> CheckBackend{getSymmetricMemoryBackend}
CheckBackend -->|Native| CheckVMM[Check VMM Support]
CheckBackend -->|PyTorch*| PyTorchPath[PyTorch Backend Path]
PyTorchPath --> InitBackend[ensurePyTorchSymmMemBackend]
InitBackend --> SetBackend[set_backend NCCL/NVSHMEM/CUDA]
SetBackend --> SetGroupInfo[set_group_info with Communicator]
SetGroupInfo --> ComputeStrides[Compute row-major strides]
ComputeStrides --> CheckNCCL{Backend == PyTorchNccl?}
CheckNCCL -->|Yes| AllocNoGroup[empty_strided_p2p with nullopt group]
CheckNCCL -->|No| AllocWithGroup[empty_strided_p2p with nvfuser_symm group]
AllocNoGroup --> Rendezvous[c10d::symmetric_memory::rendezvous]
AllocWithGroup --> Rendezvous
Rendezvous --> CacheHandle[Cache handle by data_ptr]
CacheHandle --> ReturnTensor[Return tensor]
CheckVMM --> NativeAlloc[Native CUDA VMM allocation]
NativeAlloc --> ReturnTensor
ReturnTensor --> UserCode[User constructs SymmetricTensor]
UserCode --> Constructor[SymmetricTensor constructor]
Constructor --> LookupCache{Check handle cache}
LookupCache -->|Found| MovePyHandle[Move py_symm_handle_ from cache]
LookupCache -->|Not found| NativeSetup[Native IPC setup]
MovePyHandle --> RemoteAccess[Remote tensor access via PyTorch]
NativeSetup --> RemoteAccess
Last reviewed commit: 14816aa
| #!/bin/bash | ||
|
|
||
| export CC=clang-20 | ||
| export CXX=clang++-20 | ||
| export LDFLAGS="-fuse-ld=mold" | ||
|
|
||
| export NVFUSER_BUILD_ENABLE_PCH | ||
|
|
||
| export UCC_HOME="/opt/hpcx/ucc" | ||
| export UCC_DIR="/opt/hpcx/ucc/lib/cmake/ucc" | ||
| export UCX_HOME="/opt/hpcx/ucx" | ||
| export UCX_DIR="/opt/hpcx/ucx/lib/cmake/ucx" | ||
|
|
||
| # export TORCH_CUDA_ARCH_LIST="9.0" | ||
|
|
||
| export NVFUSER_BUILD_WITH_UCC=1 | ||
| export NVFUSER_BUILD_INSTALL_DIR=$BUILD_DIRECTORY/nvfuser | ||
| export NVFUSER_BUILD_DIR=$BUILD_DIRECTORY | ||
|
|
||
| # Enable debug mode, leave empty for non-debug compilation | ||
| export NVFUSER_BUILD_BUILD_TYPE=Debug | ||
| export RUN_CMAKE="" | ||
|
|
||
| pip install -v -e ./python --no-build-isolation |
There was a problem hiding this comment.
This appears to be a personal build script with hardcoded paths (/opt/hpcx/), specific compiler versions (clang-20), and debug settings. It should not be committed to the repository.
| #!/bin/bash | |
| export CC=clang-20 | |
| export CXX=clang++-20 | |
| export LDFLAGS="-fuse-ld=mold" | |
| export NVFUSER_BUILD_ENABLE_PCH | |
| export UCC_HOME="/opt/hpcx/ucc" | |
| export UCC_DIR="/opt/hpcx/ucc/lib/cmake/ucc" | |
| export UCX_HOME="/opt/hpcx/ucx" | |
| export UCX_DIR="/opt/hpcx/ucx/lib/cmake/ucx" | |
| # export TORCH_CUDA_ARCH_LIST="9.0" | |
| export NVFUSER_BUILD_WITH_UCC=1 | |
| export NVFUSER_BUILD_INSTALL_DIR=$BUILD_DIRECTORY/nvfuser | |
| export NVFUSER_BUILD_DIR=$BUILD_DIRECTORY | |
| # Enable debug mode, leave empty for non-debug compilation | |
| export NVFUSER_BUILD_BUILD_TYPE=Debug | |
| export RUN_CMAKE="" | |
| pip install -v -e ./python --no-build-isolation | |
| # Remove this file - it should not be part of the PR |
| // TEST_F(SymmetricTensorTest, PyTorchBackend_RemoteAccessCorrectness) { | ||
| // if (communicator_->size() == 1) { | ||
| // GTEST_SKIP() << "Skipping test for single device"; | ||
| // } | ||
| // SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); | ||
| // if (backend == SymmetricMemoryBackend::Native) { | ||
| // GTEST_SKIP() | ||
| // << "PyTorch backend not selected; set NVFUSER_ENABLE=symmetric_memory_backend(pytorch_nccl) to run"; | ||
| // } | ||
|
|
||
| // const int64_t rank = communicator_->deviceId(); | ||
| // const int64_t world_size = communicator_->size(); | ||
|
|
||
| // at::Tensor local_tensor = SymmetricTensor::allocate( | ||
| // {256, 512}, at::ScalarType::Float, communicator_->device()); | ||
| // SymmetricTensor sym_tensor(local_tensor); | ||
|
|
||
| // EXPECT_TRUE(local_tensor.is_cuda()); | ||
| // EXPECT_EQ(local_tensor.numel(), 256 * 512); | ||
|
|
||
| // float local_value = static_cast<float>(rank + 200); | ||
| // local_tensor.fill_(local_value); | ||
|
|
||
| // sym_tensor.setupRemoteHandles(); | ||
|
|
||
| // for (int64_t peer_rank = 0; peer_rank < world_size; ++peer_rank) { | ||
| // void* peer_ptr = sym_tensor.remoteTensor(peer_rank).data_ptr(); | ||
| // EXPECT_NE(peer_ptr, nullptr); | ||
|
|
||
| // float peer_value; | ||
| // NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy( | ||
| // &peer_value, peer_ptr, sizeof(float), cudaMemcpyDeviceToHost)); | ||
|
|
||
| // float expected_value = static_cast<float>(peer_rank + 200); | ||
| // EXPECT_FLOAT_EQ(peer_value, expected_value) | ||
| // << "Rank " << rank << " reading from rank " << peer_rank | ||
| // << " (PyTorch backend)"; | ||
| // } | ||
| // } |
There was a problem hiding this comment.
Large block of commented-out test code. Either remove it or uncomment and enable it for testing the PyTorch backend path.
Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!
No description provided.