diff --git a/ci/blackbox.sh b/ci/blackbox.sh
index e89e0b40a2..8a771ea445 100755
--- a/ci/blackbox.sh
+++ b/ci/blackbox.sh
@@ -19,7 +19,7 @@ ROOT_DIR=$SCRIPT_DIR/..
 show_usage()
 {
     echo "Vortex BlackBox Test Driver v1.0"
-    echo "Usage: $0 [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=#name] [--app=#app] [--args=#args] [--debug=#level] [--scope] [--perf=#class] [--log=logfile] [--nohup] [--help]]"
+    echo "Usage: $0 [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=#name] [--app=#app] [--args=#args] [--debug=#level] [--np=#num of MPI processes] [--scope] [--perf=#class] [--log=logfile] [--nohup] [--help]]"
 }
 
 show_help()
@@ -47,6 +47,7 @@ DEFAULTS() {
     DEBUG_LEVEL=0
     SCOPE=0
     HAS_ARGS=0
+    HAS_NP=0
     PERF_CLASS=0
     CONFIGS="$CONFIGS"
     TEMPBUILD=0
@@ -69,6 +70,7 @@ parse_args() {
             --debug=*) DEBUG=1; DEBUG_LEVEL=${i#*=} ;;
             --scope) SCOPE=1; ;;
             --args=*) HAS_ARGS=1; ARGS=${i#*=} ;;
+            --np=*) HAS_NP=1; NP=${i#*=} ;;
             --log=*) LOGFILE=${i#*=} ;;
             --nohup) TEMPBUILD=1 ;;
             --help) show_help; exit 0 ;;
@@ -123,6 +125,7 @@ run_app() {
     [ $DEBUG -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "DEBUG=1")
     [ $TEMPBUILD -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "VORTEX_RT_PATH=\"$TEMPDIR\"")
     [ $HAS_ARGS -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "OPTS=\"$ARGS\"")
+    [ $HAS_NP -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "MPI=1 NP=$NP")
     cmd_opts=$(add_option "$cmd_opts" "make -C \"$APP_PATH\" run-$DRIVER")
     [ $DEBUG -ne 0 ] && cmd_opts=$(add_option "$cmd_opts" "> $LOGFILE 2>&1")
     echo "Running: $cmd_opts"
diff --git a/miscs/apptainer/vortex.def b/miscs/apptainer/vortex.def
index cd32138928..ecb57cce29 100644
--- a/miscs/apptainer/vortex.def
+++ b/miscs/apptainer/vortex.def
@@ -45,7 +45,8 @@ From: ubuntu:22.04
         openjdk-11-jre-zero libtheora0 libavcodec58 libcairo-gobject2 \
         ca-certificates-java libchromaprint1 software-properties-common perl-modules bzip2 \
         unzip zlib1g-dev libtinfo5 g++ usbutils pciutils gawk bison \
         gcc make tar python3.9 locales zstd uuid-dev ccache \
-        libboost-filesystem1.74.0 libboost-program-options1.74.0 libboost-system1.74.0 libboost-chrono1.74.0 libboost-thread1.74.0 environment-modules || true
+        libboost-filesystem1.74.0 libboost-program-options1.74.0 libboost-system1.74.0 libboost-chrono1.74.0 libboost-thread1.74.0 \
+        environment-modules openmpi-bin libopenmpi-dev || true
 
     ln -s /usr/bin/python3 /usr/bin/python
diff --git a/tests/regression/Makefile b/tests/regression/Makefile
index be3ccc9636..0bd35fe3c9 100644
--- a/tests/regression/Makefile
+++ b/tests/regression/Makefile
@@ -7,15 +7,23 @@ all:
 	$(MAKE) -C dogfood
 	$(MAKE) -C dropout
 	$(MAKE) -C dotproduct
+	$(MAKE) -C mpi_dotproduct
+	$(MAKE) -C mpi_put_dotproduct
 	$(MAKE) -C mstress
 	$(MAKE) -C io_addr
 	$(MAKE) -C printf
 	$(MAKE) -C diverge
+	$(MAKE) -C mpi_diverge
 	$(MAKE) -C sort
 	$(MAKE) -C fence
 	$(MAKE) -C vecadd
+	$(MAKE) -C mpi_vecadd
 	$(MAKE) -C sgemm
+	$(MAKE) -C mpi_sgemm
+	$(MAKE) -C mpi_blocked_sgemm
 	$(MAKE) -C conv3
+	$(MAKE) -C mpi_conv3
+	$(MAKE) -C mpi_neighbor_a2a_conv3
 	$(MAKE) -C relu
 	$(MAKE) -C sgemv
 	$(MAKE) -C sgemm2
@@ -28,15 +36,23 @@ run-simx:
 	$(MAKE) -C dogfood run-simx
 	$(MAKE) -C dropout run-simx
 	$(MAKE) -C dotproduct run-simx
+	$(MAKE) -C mpi_dotproduct run-simx
+	$(MAKE) -C mpi_put_dotproduct run-simx
 	$(MAKE) -C mstress run-simx
 	$(MAKE) -C io_addr run-simx
 	$(MAKE) -C printf run-simx
 	$(MAKE) -C diverge run-simx
+	$(MAKE) -C mpi_diverge run-simx
 	$(MAKE) -C sort run-simx
 	$(MAKE) -C fence run-simx
 	$(MAKE) -C vecadd run-simx
+	$(MAKE) -C mpi_vecadd run-simx
 	$(MAKE) -C sgemm run-simx
+	$(MAKE) -C mpi_sgemm run-simx
+	$(MAKE) -C mpi_blocked_sgemm run-simx
 	$(MAKE) -C conv3 run-simx
+	$(MAKE) -C mpi_conv3 run-simx
+	$(MAKE) -C mpi_neighbor_a2a_conv3 run-simx
 	$(MAKE) -C relu run-simx
 	$(MAKE) -C sgemv run-simx
 	$(MAKE) -C sgemm2 run-simx
@@ -70,15 +86,23 @@ clean:
 	$(MAKE) -C dogfood clean
 	$(MAKE) -C dropout clean
 	$(MAKE) -C dotproduct clean
+	$(MAKE) -C mpi_dotproduct clean
+	$(MAKE) -C mpi_put_dotproduct clean
 	$(MAKE) -C mstress clean
 	$(MAKE) -C io_addr clean
 	$(MAKE) -C printf clean
 	$(MAKE) -C diverge clean
+	$(MAKE) -C mpi_diverge clean
 	$(MAKE) -C sort clean
 	$(MAKE) -C fence clean
 	$(MAKE) -C vecadd clean
+	$(MAKE) -C mpi_vecadd clean
 	$(MAKE) -C sgemm clean
+	$(MAKE) -C mpi_sgemm clean
+	$(MAKE) -C mpi_blocked_sgemm clean
 	$(MAKE) -C conv3 clean
+	$(MAKE) -C mpi_neighbor_a2a_conv3 clean
+	$(MAKE) -C mpi_conv3 clean
 	$(MAKE) -C relu clean
 	$(MAKE) -C sgemv clean
 	$(MAKE) -C sgemm2 clean
diff --git a/tests/regression/common.mk b/tests/regression/common.mk
index dea17512b6..ae8cc9bae2 100644
--- a/tests/regression/common.mk
+++ b/tests/regression/common.mk
@@ -65,6 +65,13 @@ CXXFLAGS += $(CONFIGS)
 
 LDFLAGS += -L$(VORTEX_RT_PATH) -lvortex
 
+ifdef MPI
+  NP ?= 1
+  MPIRUN = mpirun --allow-run-as-root --oversubscribe -np $(NP)
+else
+  MPIRUN =
+endif
+
 # Debugging
 ifdef DEBUG
 CXXFLAGS += -g -O0
@@ -99,7 +106,7 @@ $(PROJECT): $(SRCS)
 	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
 
 run-simx: $(PROJECT) kernel.vxbin
-	LD_LIBRARY_PATH=$(VORTEX_RT_PATH):$(LD_LIBRARY_PATH) VORTEX_DRIVER=simx ./$(PROJECT) $(OPTS)
+	LD_LIBRARY_PATH=$(VORTEX_RT_PATH):$(LD_LIBRARY_PATH) VORTEX_DRIVER=simx $(MPIRUN) ./$(PROJECT) $(OPTS)
 
 run-rtlsim: $(PROJECT) kernel.vxbin
 	LD_LIBRARY_PATH=$(VORTEX_RT_PATH):$(LD_LIBRARY_PATH) VORTEX_DRIVER=rtlsim ./$(PROJECT) $(OPTS)
diff --git a/tests/regression/mpi_blocked_sgemm/Makefile b/tests/regression/mpi_blocked_sgemm/Makefile
new file mode 100644
index 0000000000..52cf20b5eb
--- /dev/null
+++ b/tests/regression/mpi_blocked_sgemm/Makefile
@@ -0,0 +1,21 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk + +PROJECT := mpi_blocked_sgemm + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n64 + +MPI ?= 0 +NP ?= 1 + +ifdef MPI + CXX = mpic++ +endif + +include ../common.mk diff --git a/tests/regression/mpi_blocked_sgemm/common.h b/tests/regression/mpi_blocked_sgemm/common.h new file mode 100644 index 0000000000..b58ec5f5e1 --- /dev/null +++ b/tests/regression/mpi_blocked_sgemm/common.h @@ -0,0 +1,16 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#ifndef TYPE +#define TYPE float +#endif + +typedef struct { + uint32_t grid_dim[2]; + uint32_t size; + uint64_t A_addr; + uint64_t B_addr; + uint64_t C_addr; +} kernel_arg_t; + +#endif diff --git a/tests/regression/mpi_blocked_sgemm/kernel.cpp b/tests/regression/mpi_blocked_sgemm/kernel.cpp new file mode 100644 index 0000000000..5b67113ec1 --- /dev/null +++ b/tests/regression/mpi_blocked_sgemm/kernel.cpp @@ -0,0 +1,24 @@ +#include +#include "common.h" + +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto A = reinterpret_cast(arg->A_addr); + auto B = reinterpret_cast(arg->B_addr); + auto C = reinterpret_cast(arg->C_addr); + auto size = arg->size; + + int col = blockIdx.x; + int row = blockIdx.y; + + TYPE sum(0); + for (int e = 0; e < size; ++e) { + sum += A[row * size + e] * B[e * size + col]; + } + + C[row * size + col] = sum; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + return vx_spawn_threads(2, arg->grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mpi_blocked_sgemm/main.cpp b/tests/regression/mpi_blocked_sgemm/main.cpp new file mode 100644 index 0000000000..8124839f8f --- /dev/null +++ b/tests/regression/mpi_blocked_sgemm/main.cpp @@ -0,0 +1,386 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" +#include + +#define FLOAT_ULP 6 + +#define RT_CHECK(_expr) \ + do { \ + int 
_ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +template +class Comparator {}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "integer"; + } + static int generate() { + return rand(); + } + static bool compare(int a, int b, int index, int errors) { + if (a != b) { + if (errors < 100) { + printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a); + } + return false; + } + return true; + } +}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "float"; + } + static float generate() { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union fi_t { float f; int32_t i; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; + auto d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP) { + if (errors < 100) { + printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a); + } + return false; + } + return true; + } +}; + +static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) { + for (uint32_t row = 0; row < height; ++row) { + for (uint32_t col = 0; col < width; ++col) { + TYPE sum(0); + for (uint32_t e = 0; e < width; ++e) { + sum += A[row * width + e] * B[e * width + col]; + } + out[row * width + col] = sum; + } + } +} + +const char* kernel_file = "kernel.vxbin"; +uint32_t size = 32; + +vx_device_h device = nullptr; +vx_buffer_h A_buffer = nullptr; +vx_buffer_h B_buffer = nullptr; +vx_buffer_h C_buffer = nullptr; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; +kernel_arg_t kernel_arg = {}; + +static void show_usage() { + std::cout << "Vortex Test." 
<< std::endl; + std::cout << "Usage: [-k: kernel] [-n size] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:k:h")) != -1) { + switch (c) { + case 'n': + size = atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'h': + show_usage(); + exit(0); + break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_mem_free(A_buffer); + vx_mem_free(B_buffer); + vx_mem_free(C_buffer); + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + + + +//// +int main(int argc, char *argv[]) { + + MPI_Init(&argc, &argv); + + int rank, world_size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + if (rank == 0) + parse_args(argc, argv); + + MPI_Bcast(&size, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD); + + // =============================== + // Create 2D Cartesian Topology + // =============================== + + int dims[2]; + dims[0] = dims[1] = std::sqrt(world_size); + + if (dims[0] * dims[1] != world_size) { + if (rank == 0) + std::cout << "World size must be perfect square\n"; + MPI_Finalize(); + return -1; + } + + int periods[2] = {1, 1}; + MPI_Comm grid_comm; + + MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &grid_comm); + + int coords[2]; + MPI_Cart_coords(grid_comm, rank, 2, coords); + + int row = coords[0]; + int col = coords[1]; + + uint32_t block_size = size / dims[0]; + uint32_t block_elems = block_size * block_size; + uint32_t block_bytes = block_elems * sizeof(TYPE); + + std::vector A_block(block_elems); + std::vector B_block(block_elems); + std::vector C_block(block_elems, 0); + + std::vector A_full, B_full; + + if (rank == 0) { + std::srand(50); + A_full.resize(size * size); + B_full.resize(size * size); + for (uint32_t i = 0; i < size*size; i++) { + A_full[i] = Comparator::generate(); + B_full[i] = Comparator::generate(); + } + } + + // 
================================== + // Manual block distribution + // ================================== + + for (int r = 0; r < world_size; r++) { + + int ccoords[2]; + MPI_Cart_coords(grid_comm, r, 2, ccoords); + + int r_row = ccoords[0]; + int r_col = ccoords[1]; + + if (rank == 0) { + + std::vector temp(block_elems); + + for (uint32_t i = 0; i < block_size; i++) + for (uint32_t j = 0; j < block_size; j++) + temp[i*block_size+j] = + A_full[(r_row*block_size+i)*size + (r_col*block_size+j)]; + + if (r == 0) + A_block = temp; + else + MPI_Send(temp.data(), block_elems, MPI_FLOAT, r, 0, MPI_COMM_WORLD); + + for (uint32_t i = 0; i < block_size; i++) + for (uint32_t j = 0; j < block_size; j++) + temp[i*block_size+j] = + B_full[(r_row*block_size+i)*size + (r_col*block_size+j)]; + + if (r == 0) + B_block = temp; + else + MPI_Send(temp.data(), block_elems, MPI_FLOAT, r, 1, MPI_COMM_WORLD); + } + + if (rank == r && rank != 0) { + MPI_Recv(A_block.data(), block_elems, MPI_FLOAT, 0, 0, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(B_block.data(), block_elems, MPI_FLOAT, 0, 1, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + } + + // ============================== + // Cannon Initial Alignment + // ============================== + + int src, dst; + + MPI_Cart_shift(grid_comm, 1, -row, &src, &dst); + MPI_Sendrecv_replace(A_block.data(), block_elems, + MPI_FLOAT, dst, 0, src, 0, + grid_comm, MPI_STATUS_IGNORE); + + MPI_Cart_shift(grid_comm, 0, -col, &src, &dst); + MPI_Sendrecv_replace(B_block.data(), block_elems, + MPI_FLOAT, dst, 1, src, 1, + grid_comm, MPI_STATUS_IGNORE); + + // ============================== + // Main Cannon Loop + // ============================== + + for (int k = 0; k < dims[0]; k++) { + + // Vortex compute block multiply + vx_device_h device; + vx_buffer_h A_buffer, B_buffer, C_buffer; + vx_buffer_h krnl_buffer, args_buffer; + kernel_arg_t kernel_arg = {}; + + RT_CHECK(vx_dev_open(&device)); + RT_CHECK(vx_mem_alloc(device, block_bytes, VX_MEM_READ, 
&A_buffer)); + RT_CHECK(vx_mem_alloc(device, block_bytes, VX_MEM_READ, &B_buffer)); + RT_CHECK(vx_mem_alloc(device, block_bytes, VX_MEM_WRITE, &C_buffer)); + + RT_CHECK(vx_mem_address(A_buffer, &kernel_arg.A_addr)); + RT_CHECK(vx_mem_address(B_buffer, &kernel_arg.B_addr)); + RT_CHECK(vx_mem_address(C_buffer, &kernel_arg.C_addr)); + + kernel_arg.grid_dim[0] = block_size; + kernel_arg.grid_dim[1] = block_size; + kernel_arg.size = block_size; + + RT_CHECK(vx_copy_to_dev(A_buffer, A_block.data(), 0, block_bytes)); + RT_CHECK(vx_copy_to_dev(B_buffer, B_block.data(), 0, block_bytes)); + + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); + RT_CHECK(vx_upload_bytes(device, &kernel_arg, + sizeof(kernel_arg_t), &args_buffer)); + + RT_CHECK(vx_start(device, krnl_buffer, args_buffer)); + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + std::vector temp(block_elems); + RT_CHECK(vx_copy_from_dev(temp.data(), C_buffer, 0, block_bytes)); + + for (uint32_t i = 0; i < block_elems; i++) + C_block[i] += temp[i]; + + vx_dev_close(device); + + // shift left A + MPI_Cart_shift(grid_comm, 1, -1, &src, &dst); + MPI_Sendrecv_replace(A_block.data(), block_elems, + MPI_FLOAT, dst, 2, src, 2, + grid_comm, MPI_STATUS_IGNORE); + + // shift up B + MPI_Cart_shift(grid_comm, 0, -1, &src, &dst); + MPI_Sendrecv_replace(B_block.data(), block_elems, + MPI_FLOAT, dst, 3, src, 3, + grid_comm, MPI_STATUS_IGNORE); + } +// ============================== +// Gather C blocks to rank 0 +// ============================== + +std::vector C_full; + +if (rank == 0) + C_full.resize(size * size); + +if (rank == 0) { + + // place own block + for (uint32_t i = 0; i < block_size; i++) + for (uint32_t j = 0; j < block_size; j++) + C_full[i*size + j] = C_block[i*block_size + j]; + + // receive others + for (int r = 1; r < world_size; r++) { + + int coords_r[2]; + MPI_Cart_coords(grid_comm, r, 2, coords_r); + + std::vector temp(block_elems); + + MPI_Recv(temp.data(), block_elems, MPI_FLOAT, + r, 99, 
MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + int r_row = coords_r[0]; + int r_col = coords_r[1]; + + for (uint32_t i = 0; i < block_size; i++) + for (uint32_t j = 0; j < block_size; j++) + C_full[(r_row*block_size+i)*size + + (r_col*block_size+j)] + = temp[i*block_size+j]; + } + +} else { + + MPI_Send(C_block.data(), block_elems, + MPI_FLOAT, 0, 99, MPI_COMM_WORLD); +} + + +if (rank == 0) { + + std::vector ref(size * size); + + matmul_cpu(ref.data(), + A_full.data(), + B_full.data(), + size, + size); + + int errors = 0; + + for (uint32_t i = 0; i < size*size; i++) { + if (!Comparator::compare(C_full[i], + ref[i], + i, + errors)) + errors++; + } + + if (errors) + std::cout << "FAILED with " + << errors << " errors\n"; + else + std::cout << "PASSED\n"; +} + + + + MPI_Finalize(); + return 0; +} \ No newline at end of file diff --git a/tests/regression/mpi_conv3/Makefile b/tests/regression/mpi_conv3/Makefile new file mode 100644 index 0000000000..479776a731 --- /dev/null +++ b/tests/regression/mpi_conv3/Makefile @@ -0,0 +1,21 @@ +ROOT_DIR := $(realpath ../../..) 
+include $(ROOT_DIR)/config.mk + +PROJECT := mpi_conv3 + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n64 + +MPI ?= 0 +NP ?= 1 + +ifdef MPI + CXX = mpic++ +endif + +include ../common.mk diff --git a/tests/regression/mpi_conv3/common.h b/tests/regression/mpi_conv3/common.h new file mode 100644 index 0000000000..25abebd076 --- /dev/null +++ b/tests/regression/mpi_conv3/common.h @@ -0,0 +1,17 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#ifndef TYPE +#define TYPE float +#endif + +typedef struct { + uint32_t grid_dim[2]; + uint32_t width; + uint64_t I_addr; + uint64_t W_addr; + uint64_t O_addr; + bool use_lmem; +} kernel_arg_t; + +#endif diff --git a/tests/regression/mpi_conv3/kernel.cpp b/tests/regression/mpi_conv3/kernel.cpp new file mode 100644 index 0000000000..27d86fd6dd --- /dev/null +++ b/tests/regression/mpi_conv3/kernel.cpp @@ -0,0 +1,47 @@ +#include +#include "common.h" + +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto I = reinterpret_cast(arg->I_addr); + auto W = reinterpret_cast(arg->use_lmem ? 
__local_mem(0) : (void*)arg->W_addr); + auto O = reinterpret_cast(arg->O_addr); + auto width = arg->width; + + int col = blockIdx.x; + int row = blockIdx.y; + + // Adjust for padded borders + int paddedWidth = width + 2; + int paddedX = col + 1; + int paddedY = row + 1; + + // Compute 3x3 convolution sum + float sum = 0.0f; + + sum += I[(paddedY - 1) * paddedWidth + (paddedX - 1)] * W[0]; // Top-left + sum += I[(paddedY - 1) * paddedWidth + paddedX] * W[1]; // Top-center + sum += I[(paddedY - 1) * paddedWidth + (paddedX + 1)] * W[2]; // Top-right + + sum += I[paddedY * paddedWidth + (paddedX - 1)] * W[3]; // Middle-left + sum += I[paddedY * paddedWidth + paddedX] * W[4]; // Center + sum += I[paddedY * paddedWidth + (paddedX + 1)] * W[5]; // Middle-right + + sum += I[(paddedY + 1) * paddedWidth + (paddedX - 1)] * W[6]; // Bottom-left + sum += I[(paddedY + 1) * paddedWidth + paddedX] * W[7]; // Bottom-center + sum += I[(paddedY + 1) * paddedWidth + (paddedX + 1)] * W[8]; // Bottom-right + + O[row * width + col] = sum; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + if (arg->use_lmem) { + // populate local memory + auto W = reinterpret_cast(arg->W_addr); + auto L = reinterpret_cast(__local_mem(0)); + for (int i = 0; i < (3*3); ++i) { + L[i] = W[i]; + } + } + return vx_spawn_threads(2, arg->grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mpi_conv3/main.cpp b/tests/regression/mpi_conv3/main.cpp new file mode 100644 index 0000000000..351b38f1be --- /dev/null +++ b/tests/regression/mpi_conv3/main.cpp @@ -0,0 +1,212 @@ +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define FLOAT_ULP 6 + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + MPI_Abort(MPI_COMM_WORLD, -1); \ + } while(false) + +template +class Comparator {}; + +template <> +class 
Comparator { +public: + static float generate() { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union { float f; int i; } fa, fb; + fa.f = a; fb.f = b; + int d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP && errors < 10) { + printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a); + return false; + } + return d <= FLOAT_ULP; + } +}; + +static void convolution_cpu(float *O, float *I, float *W, int width, int height) { + int paddedWidth = width + 2; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + int paddedY = y + 1; + int paddedX = x + 1; + float sum = 0.0f; + for (int ky = -1; ky <= 1; ++ky) + for (int kx = -1; kx <= 1; ++kx) + sum += I[(paddedY+ky)*paddedWidth + (paddedX+kx)] + * W[(ky+1)*3 + (kx+1)]; + O[y*width+x] = sum; + } + } +} + +const char* kernel_file = "kernel.vxbin"; +int size = 32; + +vx_device_h device = nullptr; +vx_buffer_h I_buf=nullptr, W_buf=nullptr, O_buf=nullptr; +vx_buffer_h krnl_buf=nullptr, args_buf=nullptr; +kernel_arg_t kernel_arg = {}; + +void cleanup() { + if (device) { + vx_mem_free(I_buf); + vx_mem_free(W_buf); + vx_mem_free(O_buf); + vx_mem_free(krnl_buf); + vx_mem_free(args_buf); + vx_dev_close(device); + } +} + +int main(int argc, char** argv) { + + MPI_Init(&argc,&argv); + + int rank, world_size; + MPI_Comm_rank(MPI_COMM_WORLD,&rank); + MPI_Comm_size(MPI_COMM_WORLD,&world_size); + +if (rank == 0) { + for (int i = 1; i < argc; i++) { + if (strncmp(argv[i], "-n", 2) == 0) { + if (strlen(argv[i]) > 2) { + // case: -n64 + size = atoi(&argv[i][2]); + } else if (i + 1 < argc) { + // case: -n 64 + size = atoi(argv[i + 1]); + } + } + } +} + + MPI_Bcast(&size,1,MPI_INT,0,MPI_COMM_WORLD); + + srand(50); + + int rows_per_rank = (size + world_size - 1)/world_size; + int start = rank*rows_per_rank; + int end = std::min(start+rows_per_rank,size); + int local_rows = end-start; + + int padded_width = size+2; + + std::vector full_I, full_W, full_O; 
+ + if(rank==0){ + full_I.resize((size+2)*(size+2)); + full_W.resize(9); + full_O.resize(size*size); + + for(int y=-1;y=0&&x=0&&y::generate() : 0; + + for(int i=0;i<9;i++) + full_W[i] = Comparator::generate(); + } + + std::vector local_I((local_rows+2)*padded_width); + std::vector local_O(local_rows*size); + std::vector local_W(9); + + if(rank==0) + local_W = full_W; + + MPI_Bcast(local_W.data(),9,MPI_FLOAT,0,MPI_COMM_WORLD); + + std::vector sendcounts(world_size), displs(world_size); + + for(int r=0;r recvcounts(world_size), recvdispls(world_size); + + for(int r=0;r ref(size*size); + convolution_cpu(ref.data(),full_I.data(),full_W.data(),size,size); + + int errors=0; + for(int i=0;i::compare(full_O[i],ref[i],i,errors)) + errors++; + + std::cout<<(errors?"FAILED\n":"PASSED\n"); + } + + cleanup(); + MPI_Finalize(); + return 0; +} \ No newline at end of file diff --git a/tests/regression/mpi_diverge/Makefile b/tests/regression/mpi_diverge/Makefile new file mode 100644 index 0000000000..0d99385b72 --- /dev/null +++ b/tests/regression/mpi_diverge/Makefile @@ -0,0 +1,21 @@ +ROOT_DIR := $(realpath ../../..) 
+include $(ROOT_DIR)/config.mk + +PROJECT := mpi_diverge + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n64 + +MPI ?= 0 +NP ?= 1 + +ifdef MPI + CXX = mpic++ +endif + +include ../common.mk diff --git a/tests/regression/mpi_diverge/common.h b/tests/regression/mpi_diverge/common.h new file mode 100644 index 0000000000..edf054657e --- /dev/null +++ b/tests/regression/mpi_diverge/common.h @@ -0,0 +1,10 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +typedef struct { + uint32_t num_points; + uint64_t src_addr; + uint64_t dst_addr; +} kernel_arg_t; + +#endif \ No newline at end of file diff --git a/tests/regression/mpi_diverge/kernel.cpp b/tests/regression/mpi_diverge/kernel.cpp new file mode 100644 index 0000000000..522a31b232 --- /dev/null +++ b/tests/regression/mpi_diverge/kernel.cpp @@ -0,0 +1,99 @@ +#include +#include +#include +#include "common.h" + +// Parallel Selection sort + +struct key_t { + uint32_t user = 0; +}; + +static __attribute__((noinline)) void hacker(key_t* key, uint32_t task_id) { + key->user = task_id; +} + +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + int32_t* src_ptr = (int32_t*)arg->src_addr; + int32_t* dst_ptr = (int32_t*)arg->dst_addr; + + uint32_t task_id = blockIdx.x; + + int value = src_ptr[task_id]; + + key_t key; + uint32_t samples = arg->num_points; + while (samples--) { + hacker(&key, task_id); + if ((key.user & 0x1) == 0) { + value += 1; + } + } + + // none taken + if (task_id >= 0x7fffffff) { + value = 0; + } else { + value += 2; + } + + // diverge + if (task_id > 1) { + if (task_id > 2) { + value += 6; + } else { + value += 5; + } + } else { + if (task_id > 0) { + value += 4; + } else { + value += 3; + } + } + + // all taken + if (task_id >= 0) { + value += 7; + } else { + value = 0; + } + + // loop + for (int i = 0, n = task_id; i < n; ++i) { + value += src_ptr[i]; + } + + // switch + switch (task_id) { + case 0: + value += 1; + break; + 
case 1: + value -= 1; + break; + case 2: + value *= 3; + break; + case 3: + value *= 5; + break; + default: + //assert(task_id < arg->num_points); + break; + } + + // select + value += (task_id >= 0) ? ((task_id > 5) ? src_ptr[0] : task_id) : ((task_id < 5) ? src_ptr[1] : -task_id); + + // min/max + value += std::min(src_ptr[task_id], value); + value += std::max(src_ptr[task_id], value); + + dst_ptr[task_id] = value; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + return vx_spawn_threads(1, &arg->num_points, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mpi_diverge/main.cpp b/tests/regression/mpi_diverge/main.cpp new file mode 100644 index 0000000000..542dee019f --- /dev/null +++ b/tests/regression/mpi_diverge/main.cpp @@ -0,0 +1,207 @@ +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +const char* kernel_file = "kernel.vxbin"; +uint32_t count = 0; + +vx_device_h device = nullptr; +vx_buffer_h src_buffer = nullptr; +vx_buffer_h dst_buffer = nullptr; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; +kernel_arg_t kernel_arg = {}; + +void cleanup() { + if (device) { + vx_mem_free(src_buffer); + vx_mem_free(dst_buffer); + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + +void gen_src_data(std::vector& src_data, uint32_t size) { + src_data.resize(size); + for (uint32_t i = 0; i < size; ++i) { + int value = std::rand(); + src_data[i] = value; + //std::cout << std::dec << i << ": value=0x" << std::hex << value << std::endl; + } +} + +void gen_ref_data(std::vector& ref_data, const std::vector& src_data, uint32_t size) { + ref_data.resize(size); + for (int i = 0; i < (int)size; ++i) { + int value = src_data.at(i); + + 
uint32_t samples = size; + while (samples--) { + if ((i & 0x1) == 0) { + value += 1; + } + } + + // none taken + if (i >= 0x7fffffff) { + value = 0; + } else { + value += 2; + } + + // diverge + if (i > 1) { + if (i > 2) { + value += 6; + } else { + value += 5; + } + } else { + if (i > 0) { + value += 4; + } else { + value += 3; + } + } + + // all taken + if (i >= 0) { + value += 7; + } else { + value = 0; + } + + // loop + for (int j = 0, n = i; j < n; ++j) { + value += src_data.at(j); + } + + // switch + switch (i) { + case 0: + value += 1; + break; + case 1: + value -= 1; + break; + case 2: + value *= 3; + break; + case 3: + value *= 5; + break; + default: + assert(i < (int)size); + break; + } + + // select + value += (i >= 0) ? ((i > 5) ? src_data.at(0) : i) : ((i < 5) ? src_data.at(1) : -i); + + // min/max + value += std::min(src_data.at(i), value); + value += std::max(src_data.at(i), value); + + ref_data[i] = value; + } +} + +int main(int argc, char** argv) { + MPI_Init(&argc, &argv); + + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // parse args + int c; + while ((c = getopt(argc, argv, "n:k:h")) != -1) { + switch (c) { + case 'n': count = atoi(optarg); break; + case 'k': kernel_file = optarg; break; + case 'h': if(rank==0) std::cout<<"Usage: -n count -k kernel\n"; MPI_Finalize(); return 0; + default: if(rank==0) std::cout<<"Invalid\n"; MPI_Finalize(); return -1; + } + } + if(count==0) count=1; + + std::srand(50); + + // open device + RT_CHECK(vx_dev_open(&device)); + + uint64_t cores, warps, threads; + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &cores)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &warps)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &threads)); + + uint32_t total_threads = cores*warps*threads; + uint32_t num_points = count*total_threads; + uint32_t buf_size = num_points*sizeof(int32_t); + + // allocate full buffers on each rank + kernel_arg.num_points = num_points; + 
std::vector full_src(num_points); + std::vector full_dst(num_points); + + if(rank==0) { + for(uint32_t i=0;i h_ref(num_points); + gen_ref_data(h_ref, full_src, num_points); + + int errors=0; + for(uint32_t i=0;i +#include "common.h" + +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto src0_ptr = reinterpret_cast(arg->src0_addr); + auto src1_ptr = reinterpret_cast(arg->src1_addr); + auto dst_ptr = reinterpret_cast(arg->dst_addr); + auto num_points = arg->num_points; + + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int cacheIndex = threadIdx.x; + + auto cache = reinterpret_cast(__local_mem(blockDim.x * sizeof(TYPE))); + + float temp = 0; + while (tid < num_points){ + temp += src0_ptr[tid] * src1_ptr[tid]; + tid += blockDim.x * gridDim.x; + } + + // set the cache values + cache[cacheIndex] = temp; + + __syncthreads(); + + int i = blockDim.x/2; + while (i != 0){ + if (cacheIndex < i) + cache[cacheIndex] += cache[cacheIndex + i]; + __syncthreads(); + i /= 2; + } + + if (cacheIndex == 0) + dst_ptr[blockIdx.x] = cache[0]; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + return vx_spawn_threads(1, arg->grid_dim, arg->block_dim, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mpi_dotproduct/main.cpp b/tests/regression/mpi_dotproduct/main.cpp new file mode 100644 index 0000000000..717946e31c --- /dev/null +++ b/tests/regression/mpi_dotproduct/main.cpp @@ -0,0 +1,194 @@ +#include +#include +#include +#include +#include +#include "common.h" + +#define FLOAT_ULP 6 + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + MPI_Abort(MPI_COMM_WORLD, -1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +template +class Comparator {}; + +template <> +class Comparator { +public: + static float generate(uint32_t idx) { + return static_cast(rand()) / 
RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union { float f; int i; } fa, fb; + fa.f = a; fb.f = b; + int d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP && errors < 100) { + printf("*** error: expected=%f, actual=%f\n", b, a); + return false; + } + return d <= FLOAT_ULP; + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.vxbin"; +uint32_t size = 16; + +vx_device_h device = nullptr; +vx_buffer_h src0_buffer = nullptr; +vx_buffer_h src1_buffer = nullptr; +vx_buffer_h dst_buffer = nullptr; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; +kernel_arg_t kernel_arg = {}; + +void cleanup() { + if (device) { + vx_mem_free(src0_buffer); + vx_mem_free(src1_buffer); + vx_mem_free(dst_buffer); + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:k:h")) != -1) { + switch (c) { + case 'n': size = atoi(optarg); break; + case 'k': kernel_file = optarg; break; + case 'h': exit(0); + default: exit(-1); + } + } +} + +/////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char* argv[]) { + + MPI_Init(&argc, &argv); + + int rank, world_size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + if (rank == 0) parse_args(argc, argv); + MPI_Bcast(&size, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD); + + std::srand(50); + + // balanced partitioning + uint32_t base = size / world_size; + uint32_t rem = size % world_size; + + uint32_t local_size = (rank < rem) ? 
base + 1 : base; + uint32_t start = rank * base + std::min(rank, (int)rem); + + // full input on rank 0 + std::vector full_src0, full_src1; + if (rank == 0) { + full_src0.resize(size); + full_src1.resize(size); + for (uint32_t i = 0; i < size; i++) { + full_src0[i] = Comparator::generate(i); + full_src1[i] = Comparator::generate(i); + } + } + + // Scatter setup + std::vector counts(world_size), displs(world_size); + for (int r = 0; r < world_size; r++) { + counts[r] = (r < rem) ? base + 1 : base; + displs[r] = r * base + std::min(r, (int)rem); + } + + std::vector h_src0(local_size); + std::vector h_src1(local_size); + + MPI_Scatterv(full_src0.data(), counts.data(), displs.data(), MPI_FLOAT, + h_src0.data(), local_size, MPI_FLOAT, 0, MPI_COMM_WORLD); + + MPI_Scatterv(full_src1.data(), counts.data(), displs.data(), MPI_FLOAT, + h_src1.data(), local_size, MPI_FLOAT, 0, MPI_COMM_WORLD); + + // Open device + RT_CHECK(vx_dev_open(&device)); + + const uint32_t threadsPerBlock = 8; + const uint32_t blocksPerGrid = + (local_size + threadsPerBlock - 1) / threadsPerBlock; + + uint32_t buf_size = local_size * sizeof(TYPE); + uint32_t dst_buf_size = blocksPerGrid * sizeof(TYPE); + + kernel_arg.num_points = local_size; + kernel_arg.block_dim[0] = threadsPerBlock; + kernel_arg.grid_dim[0] = blocksPerGrid; + + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer)); + RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr)); + + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer)); + RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr)); + + RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_WRITE, &dst_buffer)); + RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr)); + + RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size)); + RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size)); + + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); + RT_CHECK(vx_upload_bytes(device, &kernel_arg, + 
sizeof(kernel_arg_t), &args_buffer)); + + RT_CHECK(vx_start(device, krnl_buffer, args_buffer)); + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + std::vector h_dst(blocksPerGrid); + RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, dst_buf_size)); + + // Local reduction of block outputs + TYPE local_sum = 0; + for (uint32_t i = 0; i < blocksPerGrid; i++) + local_sum += h_dst[i]; + + TYPE global_sum = 0; + MPI_Reduce(&local_sum, &global_sum, + 1, MPI_FLOAT, MPI_SUM, + 0, MPI_COMM_WORLD); + + // Final verification + if (rank == 0) { + + TYPE ref = 0; + for (uint32_t i = 0; i < size; i++) + ref += full_src0[i] * full_src1[i]; + + int errors = 0; + if (!Comparator::compare(global_sum, ref, 0, errors)) + errors++; + + if (errors) + std::cout << "FAILED!\n"; + else + std::cout << "PASSED!\n"; + } + + cleanup(); + MPI_Finalize(); + return 0; +} \ No newline at end of file diff --git a/tests/regression/mpi_neighbor_a2a_conv3/Makefile b/tests/regression/mpi_neighbor_a2a_conv3/Makefile new file mode 100644 index 0000000000..ac99098f9c --- /dev/null +++ b/tests/regression/mpi_neighbor_a2a_conv3/Makefile @@ -0,0 +1,21 @@ +ROOT_DIR := $(realpath ../../..) 
+include $(ROOT_DIR)/config.mk + +PROJECT := mpi_neighbor_a2a_conv3 + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n64 + +MPI ?= 0 +NP ?= 1 + +ifdef MPI + CXX = mpic++ +endif + +include ../common.mk diff --git a/tests/regression/mpi_neighbor_a2a_conv3/common.h b/tests/regression/mpi_neighbor_a2a_conv3/common.h new file mode 100644 index 0000000000..25abebd076 --- /dev/null +++ b/tests/regression/mpi_neighbor_a2a_conv3/common.h @@ -0,0 +1,17 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#ifndef TYPE +#define TYPE float +#endif + +typedef struct { + uint32_t grid_dim[2]; + uint32_t width; + uint64_t I_addr; + uint64_t W_addr; + uint64_t O_addr; + bool use_lmem; +} kernel_arg_t; + +#endif diff --git a/tests/regression/mpi_neighbor_a2a_conv3/kernel.cpp b/tests/regression/mpi_neighbor_a2a_conv3/kernel.cpp new file mode 100644 index 0000000000..27d86fd6dd --- /dev/null +++ b/tests/regression/mpi_neighbor_a2a_conv3/kernel.cpp @@ -0,0 +1,47 @@ +#include +#include "common.h" + +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto I = reinterpret_cast(arg->I_addr); + auto W = reinterpret_cast(arg->use_lmem ? 
__local_mem(0) : (void*)arg->W_addr); + auto O = reinterpret_cast(arg->O_addr); + auto width = arg->width; + + int col = blockIdx.x; + int row = blockIdx.y; + + // Adjust for padded borders + int paddedWidth = width + 2; + int paddedX = col + 1; + int paddedY = row + 1; + + // Compute 3x3 convolution sum + float sum = 0.0f; + + sum += I[(paddedY - 1) * paddedWidth + (paddedX - 1)] * W[0]; // Top-left + sum += I[(paddedY - 1) * paddedWidth + paddedX] * W[1]; // Top-center + sum += I[(paddedY - 1) * paddedWidth + (paddedX + 1)] * W[2]; // Top-right + + sum += I[paddedY * paddedWidth + (paddedX - 1)] * W[3]; // Middle-left + sum += I[paddedY * paddedWidth + paddedX] * W[4]; // Center + sum += I[paddedY * paddedWidth + (paddedX + 1)] * W[5]; // Middle-right + + sum += I[(paddedY + 1) * paddedWidth + (paddedX - 1)] * W[6]; // Bottom-left + sum += I[(paddedY + 1) * paddedWidth + paddedX] * W[7]; // Bottom-center + sum += I[(paddedY + 1) * paddedWidth + (paddedX + 1)] * W[8]; // Bottom-right + + O[row * width + col] = sum; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + if (arg->use_lmem) { + // populate local memory + auto W = reinterpret_cast(arg->W_addr); + auto L = reinterpret_cast(__local_mem(0)); + for (int i = 0; i < (3*3); ++i) { + L[i] = W[i]; + } + } + return vx_spawn_threads(2, arg->grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mpi_neighbor_a2a_conv3/main.cpp b/tests/regression/mpi_neighbor_a2a_conv3/main.cpp new file mode 100644 index 0000000000..3e458d3744 --- /dev/null +++ b/tests/regression/mpi_neighbor_a2a_conv3/main.cpp @@ -0,0 +1,280 @@ +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define FLOAT_ULP 6 + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + MPI_Abort(MPI_COMM_WORLD, -1); \ + } while(false) + +template +class 
Comparator {}; + +template <> +class Comparator { +public: + static float generate() { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union { float f; int i; } fa, fb; + fa.f = a; fb.f = b; + int d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP && errors < 10) { + printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a); + return false; + } + return d <= FLOAT_ULP; + } +}; + +static void convolution_cpu(float *O, float *I, float *W, int width, int height) { + int paddedWidth = width + 2; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + int paddedY = y + 1; + int paddedX = x + 1; + float sum = 0.0f; + for (int ky = -1; ky <= 1; ++ky) + for (int kx = -1; kx <= 1; ++kx) + sum += I[(paddedY+ky)*paddedWidth + (paddedX+kx)] + * W[(ky+1)*3 + (kx+1)]; + O[y*width+x] = sum; + } + } +} + +const char* kernel_file = "kernel.vxbin"; +int size = 32; + +vx_device_h device = nullptr; +vx_buffer_h I_buf=nullptr, W_buf=nullptr, O_buf=nullptr; +vx_buffer_h krnl_buf=nullptr, args_buf=nullptr; +kernel_arg_t kernel_arg = {}; + +void cleanup() { + if (device) { + vx_mem_free(I_buf); + vx_mem_free(W_buf); + vx_mem_free(O_buf); + vx_mem_free(krnl_buf); + vx_mem_free(args_buf); + vx_dev_close(device); + } +} + + +int main(int argc, char** argv) { + + MPI_Init(&argc,&argv); + + int rank, world_size; + MPI_Comm_rank(MPI_COMM_WORLD,&rank); + MPI_Comm_size(MPI_COMM_WORLD,&world_size); + + if (rank == 0) { + for (int i = 1; i < argc; i++) { + if (strncmp(argv[i], "-n", 2) == 0) { + if (strlen(argv[i]) > 2) + size = atoi(&argv[i][2]); + else if (i+1 < argc) + size = atoi(argv[i+1]); + } + } + } + + MPI_Bcast(&size,1,MPI_INT,0,MPI_COMM_WORLD); + + srand(50); + + // ========================================== + // 1D Cartesian Topology + // ========================================== + + int dims[1] = {world_size}; + int periods[1] = {0}; + MPI_Comm cart_comm; + + 
MPI_Cart_create(MPI_COMM_WORLD,1,dims,periods,0,&cart_comm); + + int up, down; + MPI_Cart_shift(cart_comm,0,1,&up,&down); + + int rows_per_rank = (size + world_size - 1)/world_size; + int start = rank*rows_per_rank; + int end = std::min(start+rows_per_rank,size); + int local_rows = end-start; + + int padded_width = size+2; + + // ========================================== + // Root initializes full image + kernel + // ========================================== + + std::vector full_I, full_W, full_O; + + if(rank==0){ + full_I.resize((size+2)*(size+2)); + full_W.resize(9); + full_O.resize(size*size); + + for(int y=-1;y=0&&x=0&&y::generate() : 0; + + for(int i=0;i<9;i++) + full_W[i] = Comparator::generate(); + } + + // ========================================== + // Distribute only interior rows + // ========================================== + + std::vector local_I((local_rows+2)*padded_width,0); + std::vector local_O(local_rows*size); + std::vector local_W(9); + + if(rank==0) + local_W = full_W; + + MPI_Bcast(local_W.data(),9,MPI_FLOAT,0,MPI_COMM_WORLD); + + // Scatter only interior rows (no halos) + std::vector sendcounts(world_size), displs(world_size); + + for(int r=0;r sendbuf(2*padded_width); + std::vector recvbuf(2*padded_width); + + // Pack top and bottom interior rows + memcpy(sendbuf.data(), + local_I.data()+padded_width, + padded_width*sizeof(float)); + + memcpy(sendbuf.data()+padded_width, + local_I.data()+local_rows*padded_width, + padded_width*sizeof(float)); + + MPI_Neighbor_alltoall(sendbuf.data(), + padded_width, + MPI_FLOAT, + recvbuf.data(), + padded_width, + MPI_FLOAT, + cart_comm); + + // Unpack halos + if(up != MPI_PROC_NULL) + memcpy(local_I.data(), + recvbuf.data(), + padded_width*sizeof(float)); + + if(down != MPI_PROC_NULL) + memcpy(local_I.data()+(local_rows+1)*padded_width, + recvbuf.data()+padded_width, + padded_width*sizeof(float)); + + // ========================================== + // Vortex Execution (unchanged) + // 
========================================== + + RT_CHECK(vx_dev_open(&device)); + + kernel_arg.width=size; + kernel_arg.grid_dim[0]=size; + kernel_arg.grid_dim[1]=local_rows; + kernel_arg.use_lmem=false; + + RT_CHECK(vx_mem_alloc(device,local_I.size()*sizeof(float),VX_MEM_READ,&I_buf)); + RT_CHECK(vx_mem_alloc(device,9*sizeof(float),VX_MEM_READ,&W_buf)); + RT_CHECK(vx_mem_alloc(device,local_O.size()*sizeof(float),VX_MEM_WRITE,&O_buf)); + + RT_CHECK(vx_mem_address(I_buf,&kernel_arg.I_addr)); + RT_CHECK(vx_mem_address(W_buf,&kernel_arg.W_addr)); + RT_CHECK(vx_mem_address(O_buf,&kernel_arg.O_addr)); + + RT_CHECK(vx_copy_to_dev(I_buf,local_I.data(),0,local_I.size()*sizeof(float))); + RT_CHECK(vx_copy_to_dev(W_buf,local_W.data(),0,9*sizeof(float))); + + RT_CHECK(vx_upload_kernel_file(device,kernel_file,&krnl_buf)); + RT_CHECK(vx_upload_bytes(device,&kernel_arg,sizeof(kernel_arg),&args_buf)); + + RT_CHECK(vx_start(device,krnl_buf,args_buf)); + RT_CHECK(vx_ready_wait(device,VX_MAX_TIMEOUT)); + + RT_CHECK(vx_copy_from_dev(local_O.data(),O_buf,0,local_O.size()*sizeof(float))); + + // ========================================== + // Gather results + // ========================= ================= + + std::vector recvcounts(world_size), recvdispls(world_size); + + for(int r=0;r ref(size*size); + convolution_cpu(ref.data(),full_I.data(),full_W.data(),size,size); + + int errors=0; + for(int i=0;i::compare(full_O[i],ref[i],i,errors)) + errors++; + + std::cout<<(errors?"FAILED\n":"PASSED\n"); + } + + cleanup(); + MPI_Finalize(); + return 0; +} \ No newline at end of file diff --git a/tests/regression/mpi_put_dotproduct/Makefile b/tests/regression/mpi_put_dotproduct/Makefile new file mode 100644 index 0000000000..4e96ff00b6 --- /dev/null +++ b/tests/regression/mpi_put_dotproduct/Makefile @@ -0,0 +1,21 @@ +ROOT_DIR := $(realpath ../../..) 
+include $(ROOT_DIR)/config.mk + +PROJECT := mpi_put_dotproduct + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n64 + +MPI ?= 0 +NP ?= 1 + +ifdef MPI + CXX = mpic++ +endif + +include ../common.mk diff --git a/tests/regression/mpi_put_dotproduct/common.h b/tests/regression/mpi_put_dotproduct/common.h new file mode 100644 index 0000000000..f5dd2c9e89 --- /dev/null +++ b/tests/regression/mpi_put_dotproduct/common.h @@ -0,0 +1,17 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#ifndef TYPE +#define TYPE float +#endif + +typedef struct { + uint32_t grid_dim[1]; + uint32_t block_dim[1]; + uint32_t num_points; + uint64_t src0_addr; + uint64_t src1_addr; + uint64_t dst_addr; +} kernel_arg_t; + +#endif diff --git a/tests/regression/mpi_put_dotproduct/kernel.cpp b/tests/regression/mpi_put_dotproduct/kernel.cpp new file mode 100644 index 0000000000..65ad1afd34 --- /dev/null +++ b/tests/regression/mpi_put_dotproduct/kernel.cpp @@ -0,0 +1,41 @@ +#include +#include "common.h" + +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto src0_ptr = reinterpret_cast(arg->src0_addr); + auto src1_ptr = reinterpret_cast(arg->src1_addr); + auto dst_ptr = reinterpret_cast(arg->dst_addr); + auto num_points = arg->num_points; + + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int cacheIndex = threadIdx.x; + + auto cache = reinterpret_cast(__local_mem(blockDim.x * sizeof(TYPE))); + + float temp = 0; + while (tid < num_points){ + temp += src0_ptr[tid] * src1_ptr[tid]; + tid += blockDim.x * gridDim.x; + } + + // set the cache values + cache[cacheIndex] = temp; + + __syncthreads(); + + int i = blockDim.x/2; + while (i != 0){ + if (cacheIndex < i) + cache[cacheIndex] += cache[cacheIndex + i]; + __syncthreads(); + i /= 2; + } + + if (cacheIndex == 0) + dst_ptr[blockIdx.x] = cache[0]; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + return vx_spawn_threads(1, 
arg->grid_dim, arg->block_dim, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mpi_put_dotproduct/main.cpp b/tests/regression/mpi_put_dotproduct/main.cpp new file mode 100644 index 0000000000..fe6b363e4c --- /dev/null +++ b/tests/regression/mpi_put_dotproduct/main.cpp @@ -0,0 +1,229 @@ +#include +#include +#include +#include +#include +#include "common.h" + +#define FLOAT_ULP 6 + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + MPI_Abort(MPI_COMM_WORLD, -1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +template +class Comparator {}; + +template <> +class Comparator { +public: + static float generate(uint32_t idx) { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union { float f; int i; } fa, fb; + fa.f = a; fb.f = b; + int d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP && errors < 100) { + printf("*** error: expected=%f, actual=%f\n", b, a); + return false; + } + return d <= FLOAT_ULP; + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.vxbin"; +uint32_t size = 16; + +vx_device_h device = nullptr; +vx_buffer_h src0_buffer = nullptr; +vx_buffer_h src1_buffer = nullptr; +vx_buffer_h dst_buffer = nullptr; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; +kernel_arg_t kernel_arg = {}; + +void cleanup() { + if (device) { + vx_mem_free(src0_buffer); + vx_mem_free(src1_buffer); + vx_mem_free(dst_buffer); + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:k:h")) != -1) { + switch (c) { + case 'n': size = atoi(optarg); break; + case 'k': kernel_file = optarg; break; + case 'h': exit(0); 
+ default: exit(-1); + } + } +} + + +int main(int argc, char* argv[]) { + + MPI_Init(&argc, &argv); + + int rank, world_size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + if (rank == 0) parse_args(argc, argv); + MPI_Bcast(&size, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD); + + std::srand(50); + + uint32_t base = size / world_size; + uint32_t rem = size % world_size; + + uint32_t local_size = (rank < rem) ? base + 1 : base; + uint32_t start = rank * base + std::min(rank, (int)rem); + + std::vector full_src0, full_src1; + if (rank == 0) { + full_src0.resize(size); + full_src1.resize(size); + for (uint32_t i = 0; i < size; i++) { + full_src0[i] = Comparator::generate(i); + full_src1[i] = Comparator::generate(i); + } + } + + std::vector counts(world_size), displs(world_size); + for (int r = 0; r < world_size; r++) { + counts[r] = (r < rem) ? base + 1 : base; + displs[r] = r * base + std::min(r, (int)rem); + } + + std::vector h_src0(local_size); + std::vector h_src1(local_size); + + MPI_Scatterv(full_src0.data(), counts.data(), displs.data(), MPI_FLOAT, + h_src0.data(), local_size, MPI_FLOAT, 0, MPI_COMM_WORLD); + + MPI_Scatterv(full_src1.data(), counts.data(), displs.data(), MPI_FLOAT, + h_src1.data(), local_size, MPI_FLOAT, 0, MPI_COMM_WORLD); + + // ================================ + // VORTEX EXECUTION (unchanged) + // ================================ + + RT_CHECK(vx_dev_open(&device)); + + const uint32_t threadsPerBlock = 8; + const uint32_t blocksPerGrid = + (local_size + threadsPerBlock - 1) / threadsPerBlock; + + uint32_t buf_size = local_size * sizeof(TYPE); + uint32_t dst_buf_size = blocksPerGrid * sizeof(TYPE); + + kernel_arg.num_points = local_size; + kernel_arg.block_dim[0] = threadsPerBlock; + kernel_arg.grid_dim[0] = blocksPerGrid; + + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer)); + RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr)); + + RT_CHECK(vx_mem_alloc(device, buf_size, 
VX_MEM_READ, &src1_buffer)); + RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr)); + + RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_WRITE, &dst_buffer)); + RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr)); + + RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size)); + RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size)); + + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); + RT_CHECK(vx_upload_bytes(device, &kernel_arg, + sizeof(kernel_arg_t), &args_buffer)); + + RT_CHECK(vx_start(device, krnl_buffer, args_buffer)); + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + std::vector h_dst(blocksPerGrid); + RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, dst_buf_size)); + + TYPE local_sum = 0; + for (uint32_t i = 0; i < blocksPerGrid; i++) + local_sum += h_dst[i]; + + // ================================ + // RMA SECTION (MPI_Put) + // ================================ + + std::vector window_buf; + if (rank == 0) + window_buf.resize(world_size); + + MPI_Win win; + MPI_Win_create(rank == 0 ? window_buf.data() : nullptr, + rank == 0 ? 
world_size * sizeof(TYPE) : 0, + sizeof(TYPE), + MPI_INFO_NULL, + MPI_COMM_WORLD, + &win); + + MPI_Win_fence(0, win); + + // Each rank writes its local_sum into slot [rank] on rank 0 + MPI_Put(&local_sum, + 1, + MPI_FLOAT, + 0, + rank, + 1, + MPI_FLOAT, + win); + + MPI_Win_fence(0, win); + + TYPE global_sum = 0; + + if (rank == 0) { + for (int r = 0; r < world_size; r++) + global_sum += window_buf[r]; + } + + MPI_Win_free(&win); + + // ================================ + // Verification + // ================================ + + if (rank == 0) { + + TYPE ref = 0; + for (uint32_t i = 0; i < size; i++) + ref += full_src0[i] * full_src1[i]; + + int errors = 0; + if (!Comparator::compare(global_sum, ref, 0, errors)) + errors++; + + if (errors) + std::cout << "FAILED!\n"; + else + std::cout << "PASSED!\n"; + } + + cleanup(); + MPI_Finalize(); + return 0; +} \ No newline at end of file diff --git a/tests/regression/mpi_sgemm/Makefile b/tests/regression/mpi_sgemm/Makefile new file mode 100644 index 0000000000..73f93674e0 --- /dev/null +++ b/tests/regression/mpi_sgemm/Makefile @@ -0,0 +1,21 @@ +ROOT_DIR := $(realpath ../../..) 
+include $(ROOT_DIR)/config.mk + +PROJECT := mpi_sgemm + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n64 + +MPI ?= 0 +NP ?= 1 + +ifdef MPI + CXX = mpic++ +endif + +include ../common.mk \ No newline at end of file diff --git a/tests/regression/mpi_sgemm/common.h b/tests/regression/mpi_sgemm/common.h new file mode 100644 index 0000000000..b58ec5f5e1 --- /dev/null +++ b/tests/regression/mpi_sgemm/common.h @@ -0,0 +1,16 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#ifndef TYPE +#define TYPE float +#endif + +typedef struct { + uint32_t grid_dim[2]; + uint32_t size; + uint64_t A_addr; + uint64_t B_addr; + uint64_t C_addr; +} kernel_arg_t; + +#endif diff --git a/tests/regression/mpi_sgemm/kernel.cpp b/tests/regression/mpi_sgemm/kernel.cpp new file mode 100644 index 0000000000..5b67113ec1 --- /dev/null +++ b/tests/regression/mpi_sgemm/kernel.cpp @@ -0,0 +1,24 @@ +#include +#include "common.h" + +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto A = reinterpret_cast(arg->A_addr); + auto B = reinterpret_cast(arg->B_addr); + auto C = reinterpret_cast(arg->C_addr); + auto size = arg->size; + + int col = blockIdx.x; + int row = blockIdx.y; + + TYPE sum(0); + for (int e = 0; e < size; ++e) { + sum += A[row * size + e] * B[e * size + col]; + } + + C[row * size + col] = sum; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + return vx_spawn_threads(2, arg->grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mpi_sgemm/main.cpp b/tests/regression/mpi_sgemm/main.cpp new file mode 100644 index 0000000000..ad353563cd --- /dev/null +++ b/tests/regression/mpi_sgemm/main.cpp @@ -0,0 +1,289 @@ +#include +#include +#include +#include +#include +#include +#include +#include "common.h" +#include + +#define FLOAT_ULP 6 + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: 
'%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +template +class Comparator {}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "integer"; + } + static int generate() { + return rand(); + } + static bool compare(int a, int b, int index, int errors) { + if (a != b) { + if (errors < 100) { + printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a); + } + return false; + } + return true; + } +}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "float"; + } + static float generate() { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union fi_t { float f; int32_t i; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; + auto d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP) { + if (errors < 100) { + printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a); + } + return false; + } + return true; + } +}; + +static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) { + for (uint32_t row = 0; row < height; ++row) { + for (uint32_t col = 0; col < width; ++col) { + TYPE sum(0); + for (uint32_t e = 0; e < width; ++e) { + sum += A[row * width + e] * B[e * width + col]; + } + out[row * width + col] = sum; + } + } +} + +const char* kernel_file = "kernel.vxbin"; +uint32_t size = 32; + +vx_device_h device = nullptr; +vx_buffer_h A_buffer = nullptr; +vx_buffer_h B_buffer = nullptr; +vx_buffer_h C_buffer = nullptr; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; +kernel_arg_t kernel_arg = {}; + +static void show_usage() { + std::cout << "Vortex Test." 
<< std::endl; + std::cout << "Usage: [-k: kernel] [-n size] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:k:h")) != -1) { + switch (c) { + case 'n': + size = atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'h': + show_usage(); + exit(0); + break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_mem_free(A_buffer); + vx_mem_free(B_buffer); + vx_mem_free(C_buffer); + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + + + +//// + + + + +int main(int argc, char *argv[]) { + + MPI_Init(&argc, &argv); + + int rank, world_size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + if (rank == 0) + parse_args(argc, argv); + + // Broadcast matrix size + MPI_Bcast(&size, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD); + + uint32_t size_sq = size * size; + + // Compute row chunk (ceil division) + uint32_t rows_per_rank = (size + world_size - 1) / world_size; + uint32_t row_start = rank * rows_per_rank; + uint32_t row_end = std::min(row_start + rows_per_rank, size); + uint32_t local_rows = row_end - row_start; + + uint32_t local_elems = local_rows * size; + uint32_t local_buf_size = local_elems * sizeof(TYPE); + uint32_t full_buf_size = size_sq * sizeof(TYPE); + + std::vector h_A_full; + std::vector h_B(size_sq); + std::vector h_C_full; + + // Only rank 0 initializes + if (rank == 0) { + std::srand(50); + h_A_full.resize(size_sq); + h_C_full.resize(size_sq); + + for (uint32_t i = 0; i < size_sq; ++i) { + h_A_full[i] = Comparator::generate(); + h_B[i] = Comparator::generate(); + } + } + + // Allocate local A and C + std::vector h_A_local(local_elems); + std::vector h_C_local(local_elems); + + // Prepare scatter metadata + std::vector sendcounts(world_size); + std::vector displs(world_size); + + for (int i = 0; i < world_size; i++) { + uint32_t rs = i * rows_per_rank; + 
uint32_t re = std::min(rs + rows_per_rank, size); + sendcounts[i] = (re - rs) * size; + displs[i] = rs * size; + } + + // Scatter rows of A + MPI_Scatterv( + h_A_full.data(), + sendcounts.data(), + displs.data(), + MPI_FLOAT, + h_A_local.data(), + local_elems, + MPI_FLOAT, + 0, + MPI_COMM_WORLD); + + // Broadcast full B + MPI_Bcast( + h_B.data(), + size_sq, + MPI_FLOAT, + 0, + MPI_COMM_WORLD); + + // ============================ + // VORTEX EXECUTION PER RANK + // ============================ + + vx_device_h device; + vx_buffer_h A_buffer, B_buffer, C_buffer; + vx_buffer_h krnl_buffer, args_buffer; + kernel_arg_t kernel_arg = {}; + + RT_CHECK(vx_dev_open(&device)); + + RT_CHECK(vx_mem_alloc(device, local_buf_size, VX_MEM_READ, &A_buffer)); + RT_CHECK(vx_mem_alloc(device, full_buf_size, VX_MEM_READ, &B_buffer)); + RT_CHECK(vx_mem_alloc(device, local_buf_size, VX_MEM_WRITE, &C_buffer)); + + RT_CHECK(vx_mem_address(A_buffer, &kernel_arg.A_addr)); + RT_CHECK(vx_mem_address(B_buffer, &kernel_arg.B_addr)); + RT_CHECK(vx_mem_address(C_buffer, &kernel_arg.C_addr)); + + kernel_arg.grid_dim[0] = size; + kernel_arg.grid_dim[1] = local_rows; + kernel_arg.size = size; + + RT_CHECK(vx_copy_to_dev(A_buffer, h_A_local.data(), 0, local_buf_size)); + RT_CHECK(vx_copy_to_dev(B_buffer, h_B.data(), 0, full_buf_size)); + + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer)); + + RT_CHECK(vx_start(device, krnl_buffer, args_buffer)); + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + RT_CHECK(vx_copy_from_dev(h_C_local.data(), C_buffer, 0, local_buf_size)); + + // ============================ + // GATHER RESULTS + // ============================ + + MPI_Gatherv( + h_C_local.data(), + local_elems, + MPI_FLOAT, + h_C_full.data(), + sendcounts.data(), + displs.data(), + MPI_FLOAT, + 0, + MPI_COMM_WORLD); + + // ============================ + // VERIFY (Rank 0) + // 
============================ + + if (rank == 0) { + std::vector h_ref(size_sq); + matmul_cpu(h_ref.data(), h_A_full.data(), h_B.data(), size, size); + + int errors = 0; + for (uint32_t i = 0; i < size_sq; ++i) { + if (!Comparator::compare(h_C_full[i], h_ref[i], i, errors)) + errors++; + } + + if (errors) + std::cout << "FAILED\n"; + else + std::cout << "PASSED\n"; + } + + vx_dev_close(device); + MPI_Finalize(); + return 0; +} \ No newline at end of file diff --git a/tests/regression/mpi_vecadd/Makefile b/tests/regression/mpi_vecadd/Makefile new file mode 100644 index 0000000000..7fb9732810 --- /dev/null +++ b/tests/regression/mpi_vecadd/Makefile @@ -0,0 +1,21 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := mpi_vecadd + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n64 + +MPI ?= 0 +NP ?= 1 + +ifdef MPI + CXX = mpic++ +endif + +include ../common.mk \ No newline at end of file diff --git a/tests/regression/mpi_vecadd/README.md b/tests/regression/mpi_vecadd/README.md new file mode 100644 index 0000000000..c03a26787e --- /dev/null +++ b/tests/regression/mpi_vecadd/README.md @@ -0,0 +1,91 @@ +## MPI With SIMX + + +### Usage + +``` +Apptainer> ./ci/blackbox.sh --cores=2 --app=mpi_vecadd --driver=simx --np=4 --args="-n5000" +CONFIGS=-DNUM_CORES=2 +Running: CONFIGS="-DNUM_CORES=2" make -C ./ci/../runtime/simx > /dev/null +Running: OPTS="-n5000" NP=4 make -C "./ci/../tests/regression/mpi_vecadd" run-simx +make: Entering directory '/home/vortex/build/tests/regression/mpi_vecadd' +LD_LIBRARY_PATH=/home/vortex/build/runtime:/opt/boost-1.66/lib:/opt/openssl-1.1/lib::/.singularity.d/libs VORTEX_DRIVER=simx mpirun --allow-run-as-root --oversubscribe -np 4 ./mpi_vecadd -n5000 +rank = 3, world_size = 4 +rank = 0, world_size = 4 +rank = 1, world_size = 4 +rank = 2, world_size = 4 +Rank: 3- Upload kernel binary +Rank: 0- Upload kernel binary +Rank: 1- Upload kernel 
binary +Rank: 2- Upload kernel binary +PERF: core0: instrs=22440, cycles=59003, IPC=0.380320 +PERF: core1: instrs=22440, cycles=58635, IPC=0.382707 +PERF: instrs=44880, cycles=59003, IPC=0.760639 +PERF: core0: instrs=22440, cycles=59003, IPC=0.380320 +PERF: core1: instrs=22440, cycles=58635, IPC=0.382707 +PERF: instrs=44880, cycles=59003, IPC=0.760639 +PERF: core0: instrs=22440, cycles=59003, IPC=0.380320 +PERF: core1: instrs=22440, cycles=58635, IPC=0.382707 +PERF: instrs=44880, cycles=59003, IPC=0.760639 +PASSED! +PERF: core0: instrs=22440, cycles=59003, IPC=0.380320 +PERF: core1: instrs=22440, cycles=58635, IPC=0.382707 +PERF: instrs=44880, cycles=59003, IPC=0.760639 +make: Leaving directory '/home/vortex/build/tests/regression/mpi_vecadd' +Apptainer> +``` + + +### High-Level Summary of main.cpp + +#### MPI Setup + +Calls MPI_Init, gets the rank (MPI_Comm_rank) and world size (MPI_Comm_size). + +Each MPI rank prints its rank and total world_size. + +#### Argument Parsing + +Reads -n from the command line (number of elements in the vector). + +Rank 0 parses this value, then broadcasts it to all ranks with MPI_Bcast(&size, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD). + +This ensures every rank sees the same problem size. + +#### Data Partitioning + +Total work = size elements. + +Each rank computes its chunk: + +``` + // Compute local chunk + uint32_t chunk = (size + world_size - 1) / world_size; // ceil div + uint32_t start = rank * chunk; + uint32_t end = std::min(start + chunk, size); + uint32_t num_points = end - start; +``` + + +So if size=50 and np=8, each rank gets about 6–7 elements. + +#### Kernel Upload + Execution + +Each rank loads the Vortex kernel binary (mpi_vecadd) into its own Vortex instance. + +That’s why you see “Upload kernel binary” printed for every rank, not just once. + +Then each rank launches the kernel for its assigned portion of the data. 
+
+#### Performance Reporting
+
+After the kernel finishes, each rank prints Vortex perf stats (instrs, cycles, IPC).
+
+These numbers are per rank's Vortex instance, not shared across ranks.
+
+
+#### Verification
+
+Rank 0 gathers the partial results with MPI_Gatherv and validates the full output vector (checks that the vector addition is correct).
+
+Finally, the ranks finalize (MPI_Finalize).
\ No newline at end of file
diff --git a/tests/regression/mpi_vecadd/common.h b/tests/regression/mpi_vecadd/common.h
new file mode 100644
index 0000000000..b511332c11
--- /dev/null
+++ b/tests/regression/mpi_vecadd/common.h
@@ -0,0 +1,15 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#ifndef TYPE
+#define TYPE float
+#endif
+
+// Kernel arguments shared between host (main.cpp) and device (kernel.cpp).
+typedef struct {
+  uint32_t num_points;
+  uint64_t src0_addr;
+  uint64_t src1_addr;
+  uint64_t dst_addr;
+} kernel_arg_t;
+
+#endif
diff --git a/tests/regression/mpi_vecadd/kernel.cpp b/tests/regression/mpi_vecadd/kernel.cpp
new file mode 100644
index 0000000000..7774c970a7
--- /dev/null
+++ b/tests/regression/mpi_vecadd/kernel.cpp
@@ -0,0 +1,15 @@
+#include <vx_spawn.h>
+#include "common.h"
+
+// One device thread per output element: dst[i] = src0[i] + src1[i].
+void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
+  auto src0_ptr = reinterpret_cast<TYPE*>(arg->src0_addr);
+  auto src1_ptr = reinterpret_cast<TYPE*>(arg->src1_addr);
+  auto dst_ptr  = reinterpret_cast<TYPE*>(arg->dst_addr);
+
+  dst_ptr[blockIdx.x] = src0_ptr[blockIdx.x] + src1_ptr[blockIdx.x];
+}
+
+int main() {
+  kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
+  return vx_spawn_threads(1, &arg->num_points, nullptr, (vx_kernel_func_cb)kernel_body, arg);
+}
diff --git a/tests/regression/mpi_vecadd/main.cpp b/tests/regression/mpi_vecadd/main.cpp
new file mode 100644
index 0000000000..3bcd415192
--- /dev/null
+++ b/tests/regression/mpi_vecadd/main.cpp
@@ -0,0 +1,186 @@
+#include <iostream>
+#include <vector>
+#include <unistd.h>
+#include <mpi.h>
+#include <vortex.h>
+#include "common.h"
+
+#define FLOAT_ULP 6
+
+// Abort the whole MPI job (not just this rank) on any Vortex API failure.
+#define RT_CHECK(_expr)                                       \
+  do {                                                        \
+    int _ret = _expr;                                         \
+    if (0 == _ret)                                            \
+      break;                                                  \
+    printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);  \
cleanup();                                                \
+    MPI_Abort(MPI_COMM_WORLD, -1);                            \
+  } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+class Comparator {};
+
+// Integer comparison: exact equality; prints at most 100 mismatches.
+template <>
+class Comparator<int> {
+public:
+  static const char* type_str() { return "integer"; }
+  static int generate(uint32_t idx) { return rand(); }
+  static bool compare(int a, int b, int index, int errors) {
+    if (a != b && errors < 100) {
+      printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
+      return false;
+    }
+    return a == b;
+  }
+};
+
+// Float comparison: tolerate up to FLOAT_ULP units-in-the-last-place.
+template <>
+class Comparator<float> {
+public:
+  static const char* type_str() { return "float"; }
+  static float generate(uint32_t idx) { return static_cast<float>(rand()) / RAND_MAX; }
+  static bool compare(float a, float b, int index, int errors) {
+    union { float f; int i; } fa, fb;
+    fa.f = a; fb.f = b;
+    int d = std::abs(fa.i - fb.i);
+    if (d > FLOAT_ULP && errors < 100) {
+      printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a);
+      return false;
+    }
+    return d <= FLOAT_ULP;
+  }
+};
+
+const char* kernel_file = "kernel.vxbin";
+uint32_t size = 16;
+
+vx_device_h device = nullptr;
+vx_buffer_h src0_buffer = nullptr;
+vx_buffer_h src1_buffer = nullptr;
+vx_buffer_h dst_buffer = nullptr;
+vx_buffer_h krnl_buffer = nullptr;
+vx_buffer_h args_buffer = nullptr;
+kernel_arg_t kernel_arg = {};
+
+// Release all device buffers and close the device (safe if never opened).
+void cleanup() {
+  if (device) {
+    vx_mem_free(src0_buffer);
+    vx_mem_free(src1_buffer);
+    vx_mem_free(dst_buffer);
+    vx_mem_free(krnl_buffer);
+    vx_mem_free(args_buffer);
+    vx_dev_close(device);
+  }
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:k:h")) != -1) {
+    switch (c) {
+    case 'n': size = atoi(optarg); break;
+    case 'k': kernel_file = optarg; break;
+    case 'h': std::cout << "Usage: [-k kernel] [-n size] [-h help]\n"; exit(0);
+    default:  std::cout << "Usage: [-k kernel] [-n size] [-h help]\n"; exit(-1);
+    }
+  }
+}
+
+int main(int argc, char* argv[]) {
+  MPI_Init(&argc, &argv);
+  int rank,
world_size;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+  std::cout << "rank = " << rank << ", world_size = " << world_size << "\n";
+  if (rank == 0) parse_args(argc, argv);
+  MPI_Bcast(&size, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
+
+  // Rank 0 generates full input arrays
+  std::vector<TYPE> full_src0, full_src1;
+  if (rank == 0) {
+    std::srand(50);
+    full_src0.resize(size);
+    full_src1.resize(size);
+    for (uint32_t i = 0; i < size; i++) {
+      full_src0[i] = Comparator<TYPE>::generate(i);
+      full_src1[i] = Comparator<TYPE>::generate(i);
+    }
+  }
+
+  // Compute local chunk
+  uint32_t chunk = (size + world_size - 1) / world_size; // ceil div
+  uint32_t start = rank * chunk;
+  uint32_t end = std::min(start + chunk, size);
+  uint32_t num_points = end - start;
+
+  // Local buffers
+  std::vector<TYPE> h_src0(num_points);
+  std::vector<TYPE> h_src1(num_points);
+  std::vector<TYPE> h_dst(num_points);
+
+  // Scatter inputs
+  std::vector<int> recvcounts(world_size), displs(world_size);
+  for (int i = 0; i < world_size; i++) {
+    int s = i * chunk;
+    int e = std::min(s + chunk, size);
+    recvcounts[i] = e - s;
+    displs[i] = s;
+  }
+
+  // NOTE(review): MPI_FLOAT assumes TYPE == float; must change if TYPE does.
+  MPI_Scatterv(full_src0.data(), recvcounts.data(), displs.data(), MPI_FLOAT,
+               h_src0.data(), num_points, MPI_FLOAT, 0, MPI_COMM_WORLD);
+
+  MPI_Scatterv(full_src1.data(), recvcounts.data(), displs.data(), MPI_FLOAT,
+               h_src1.data(), num_points, MPI_FLOAT, 0, MPI_COMM_WORLD);
+
+  // Open device
+  RT_CHECK(vx_dev_open(&device));
+
+  uint32_t buf_size = num_points * sizeof(TYPE);
+  kernel_arg.num_points = num_points;
+
+  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer));
+  RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));
+  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer));
+  RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));
+  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
+  RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
+
RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size));
+  RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size));
+
+  std::cout << "Rank: " << rank << "- Upload kernel binary" << std::endl;
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
+  RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
+
+  // Run kernel
+  RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
+  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
+  RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
+
+  // Gather results
+  std::vector<TYPE> full_dst;
+  if (rank == 0) full_dst.resize(size);
+
+  // NOTE(review): MPI_FLOAT assumes TYPE == float; must change if TYPE does.
+  MPI_Gatherv(h_dst.data(), num_points, MPI_FLOAT,
+              full_dst.data(), recvcounts.data(), displs.data(), MPI_FLOAT,
+              0, MPI_COMM_WORLD);
+
+  // Verify (rank 0)
+  if (rank == 0) {
+    int errors = 0;
+    for (uint32_t i = 0; i < size; i++) {
+      auto ref = full_src0[i] + full_src1[i];
+      auto cur = full_dst[i];
+      if (!Comparator<TYPE>::compare(cur, ref, i, errors)) errors++;
+    }
+    if (errors) std::cout << "Found " << errors << " errors!\nFAILED!\n";
+    else std::cout << "PASSED!\n";
+  }
+
+  cleanup();
+  MPI_Finalize();
+  return 0;
+}