diff --git a/ci/blackbox.sh b/ci/blackbox.sh
index e89e0b40a2..8a771ea445 100755
--- a/ci/blackbox.sh
+++ b/ci/blackbox.sh
@@ -19,7 +19,7 @@ ROOT_DIR=$SCRIPT_DIR/..
 show_usage()
 {
     echo "Vortex BlackBox Test Driver v1.0"
-    echo "Usage: $0 [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=#name] [--app=#app] [--args=#args] [--debug=#level] [--scope] [--perf=#class] [--log=logfile] [--nohup] [--help]]"
+    echo "Usage: $0 [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=#name] [--app=#app] [--args=#args] [--debug=#level] [--np=#num of MPI processes] [--scope] [--perf=#class] [--log=logfile] [--nohup] [--help]]"
 }
 
 show_help()
@@ -47,6 +47,7 @@ DEFAULTS() {
     DEBUG_LEVEL=0
     SCOPE=0
     HAS_ARGS=0
+    HAS_NP=0
     PERF_CLASS=0
     CONFIGS="$CONFIGS"
     TEMPBUILD=0
@@ -69,6 +70,7 @@ parse_args() {
             --debug=*) DEBUG=1; DEBUG_LEVEL=${i#*=} ;;
             --scope) SCOPE=1; ;;
             --args=*) HAS_ARGS=1; ARGS=${i#*=} ;;
+            --np=*) HAS_NP=1; NP=${i#*=} ;;
             --log=*) LOGFILE=${i#*=} ;;
             --nohup) TEMPBUILD=1 ;;
             --help) show_help; exit 0 ;;
@@ -123,6 +125,7 @@ run_app() {
     [ $DEBUG -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "DEBUG=1")
     [ $TEMPBUILD -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "VORTEX_RT_PATH=\"$TEMPDIR\"")
     [ $HAS_ARGS -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "OPTS=\"$ARGS\"")
+    [ $HAS_NP -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "MPI=1 NP=$NP")
     cmd_opts=$(add_option "$cmd_opts" "make -C \"$APP_PATH\" run-$DRIVER")
     [ $DEBUG -ne 0 ] && cmd_opts=$(add_option "$cmd_opts" "> $LOGFILE 2>&1")
     echo "Running: $cmd_opts"
diff --git a/miscs/apptainer/vortex.def b/miscs/apptainer/vortex.def
index cd32138928..ecb57cce29 100644
--- a/miscs/apptainer/vortex.def
+++ b/miscs/apptainer/vortex.def
@@ -45,7 +45,8 @@ From: ubuntu:22.04
         openjdk-11-jre-zero libtheora0 libavcodec58 libcairo-gobject2 \
         ca-certificates-java libchromaprint1 software-properties-common perl-modules bzip2 \
         unzip zlib1g-dev libtinfo5 g++ usbutils pciutils gawk bison \
         gcc make tar python3.9 locales zstd uuid-dev ccache \
-        libboost-filesystem1.74.0 libboost-program-options1.74.0 libboost-system1.74.0 libboost-chrono1.74.0 libboost-thread1.74.0 environment-modules || true
+        libboost-filesystem1.74.0 libboost-program-options1.74.0 libboost-system1.74.0 libboost-chrono1.74.0 libboost-thread1.74.0 \
+        environment-modules openmpi-bin libopenmpi-dev || true
 
     ln -s /usr/bin/python3 /usr/bin/python
diff --git a/tests/regression/Makefile b/tests/regression/Makefile
index be3ccc9636..0bd35fe3c9 100644
--- a/tests/regression/Makefile
+++ b/tests/regression/Makefile
@@ -7,15 +7,23 @@ all:
 	$(MAKE) -C dogfood
 	$(MAKE) -C dropout
 	$(MAKE) -C dotproduct
+	$(MAKE) -C mpi_dotproduct
+	$(MAKE) -C mpi_put_dotproduct
 	$(MAKE) -C mstress
 	$(MAKE) -C io_addr
 	$(MAKE) -C printf
 	$(MAKE) -C diverge
+	$(MAKE) -C mpi_diverge
 	$(MAKE) -C sort
 	$(MAKE) -C fence
 	$(MAKE) -C vecadd
+	$(MAKE) -C mpi_vecadd
 	$(MAKE) -C sgemm
+	$(MAKE) -C mpi_sgemm
+	$(MAKE) -C mpi_blocked_sgemm
 	$(MAKE) -C conv3
+	$(MAKE) -C mpi_conv3
+	$(MAKE) -C mpi_neighbor_a2a_conv3
 	$(MAKE) -C relu
 	$(MAKE) -C sgemv
 	$(MAKE) -C sgemm2
@@ -28,15 +36,23 @@ run-simx:
 	$(MAKE) -C dogfood run-simx
 	$(MAKE) -C dropout run-simx
 	$(MAKE) -C dotproduct run-simx
+	$(MAKE) -C mpi_dotproduct run-simx
+	$(MAKE) -C mpi_put_dotproduct run-simx
 	$(MAKE) -C mstress run-simx
 	$(MAKE) -C io_addr run-simx
 	$(MAKE) -C printf run-simx
 	$(MAKE) -C diverge run-simx
+	$(MAKE) -C mpi_diverge run-simx
 	$(MAKE) -C sort run-simx
 	$(MAKE) -C fence run-simx
 	$(MAKE) -C vecadd run-simx
+	$(MAKE) -C mpi_vecadd run-simx
 	$(MAKE) -C sgemm run-simx
+	$(MAKE) -C mpi_sgemm run-simx
+	$(MAKE) -C mpi_blocked_sgemm run-simx
 	$(MAKE) -C conv3 run-simx
+	$(MAKE) -C mpi_conv3 run-simx
+	$(MAKE) -C mpi_neighbor_a2a_conv3 run-simx
 	$(MAKE) -C relu run-simx
 	$(MAKE) -C sgemv run-simx
 	$(MAKE) -C sgemm2 run-simx
@@ -70,15 +86,23 @@ clean:
 	$(MAKE) -C dogfood clean
 	$(MAKE) -C dropout clean
 	$(MAKE) -C dotproduct clean
+	$(MAKE) -C mpi_dotproduct clean
+	$(MAKE) -C mpi_put_dotproduct clean
 	$(MAKE) -C mstress clean
 	$(MAKE) -C io_addr clean
 	$(MAKE) -C printf clean
 	$(MAKE) -C diverge clean
+	$(MAKE) -C mpi_diverge clean
 	$(MAKE) -C sort clean
 	$(MAKE) -C fence clean
 	$(MAKE) -C vecadd clean
+	$(MAKE) -C mpi_vecadd clean
 	$(MAKE) -C sgemm clean
+	$(MAKE) -C mpi_sgemm clean
+	$(MAKE) -C mpi_blocked_sgemm clean
 	$(MAKE) -C conv3 clean
+	$(MAKE) -C mpi_neighbor_a2a_conv3 clean
+	$(MAKE) -C mpi_conv3 clean
 	$(MAKE) -C relu clean
 	$(MAKE) -C sgemv clean
 	$(MAKE) -C sgemm2 clean
diff --git a/tests/regression/common.mk b/tests/regression/common.mk
index dea17512b6..ae8cc9bae2 100644
--- a/tests/regression/common.mk
+++ b/tests/regression/common.mk
@@ -65,6 +65,13 @@ CXXFLAGS += $(CONFIGS)
 
 LDFLAGS += -L$(VORTEX_RT_PATH) -lvortex
 
+ifdef MPI
+  NP ?= 1
+  MPIRUN = mpirun --allow-run-as-root --oversubscribe -np $(NP)
+else
+  MPIRUN =
+endif
+
 # Debugging
 ifdef DEBUG
 CXXFLAGS += -g -O0
@@ -99,7 +106,7 @@ $(PROJECT): $(SRCS)
 	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
 
 run-simx: $(PROJECT) kernel.vxbin
-	LD_LIBRARY_PATH=$(VORTEX_RT_PATH):$(LD_LIBRARY_PATH) VORTEX_DRIVER=simx ./$(PROJECT) $(OPTS)
+	LD_LIBRARY_PATH=$(VORTEX_RT_PATH):$(LD_LIBRARY_PATH) VORTEX_DRIVER=simx $(MPIRUN) ./$(PROJECT) $(OPTS)
 
 run-rtlsim: $(PROJECT) kernel.vxbin
 	LD_LIBRARY_PATH=$(VORTEX_RT_PATH):$(LD_LIBRARY_PATH) VORTEX_DRIVER=rtlsim ./$(PROJECT) $(OPTS)
diff --git a/tests/regression/mpi_blocked_sgemm/Makefile b/tests/regression/mpi_blocked_sgemm/Makefile
new file mode 100644
index 0000000000..52cf20b5eb
--- /dev/null
+++ b/tests/regression/mpi_blocked_sgemm/Makefile
@@ -0,0 +1,21 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk + +PROJECT := mpi_blocked_sgemm + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n64 + +MPI ?= 0 +NP ?= 1 + +ifdef MPI + CXX = mpic++ +endif + +include ../common.mk diff --git a/tests/regression/mpi_blocked_sgemm/common.h b/tests/regression/mpi_blocked_sgemm/common.h new file mode 100644 index 0000000000..b58ec5f5e1 --- /dev/null +++ b/tests/regression/mpi_blocked_sgemm/common.h @@ -0,0 +1,16 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#ifndef TYPE +#define TYPE float +#endif + +typedef struct { + uint32_t grid_dim[2]; + uint32_t size; + uint64_t A_addr; + uint64_t B_addr; + uint64_t C_addr; +} kernel_arg_t; + +#endif diff --git a/tests/regression/mpi_blocked_sgemm/kernel.cpp b/tests/regression/mpi_blocked_sgemm/kernel.cpp new file mode 100644 index 0000000000..5b67113ec1 --- /dev/null +++ b/tests/regression/mpi_blocked_sgemm/kernel.cpp @@ -0,0 +1,24 @@ +#include +#include "common.h" + +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto A = reinterpret_cast(arg->A_addr); + auto B = reinterpret_cast(arg->B_addr); + auto C = reinterpret_cast(arg->C_addr); + auto size = arg->size; + + int col = blockIdx.x; + int row = blockIdx.y; + + TYPE sum(0); + for (int e = 0; e < size; ++e) { + sum += A[row * size + e] * B[e * size + col]; + } + + C[row * size + col] = sum; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + return vx_spawn_threads(2, arg->grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mpi_blocked_sgemm/main.cpp b/tests/regression/mpi_blocked_sgemm/main.cpp new file mode 100644 index 0000000000..8124839f8f --- /dev/null +++ b/tests/regression/mpi_blocked_sgemm/main.cpp @@ -0,0 +1,386 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" +#include + +#define FLOAT_ULP 6 + +#define RT_CHECK(_expr) \ + do { \ + int 
_ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +template +class Comparator {}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "integer"; + } + static int generate() { + return rand(); + } + static bool compare(int a, int b, int index, int errors) { + if (a != b) { + if (errors < 100) { + printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a); + } + return false; + } + return true; + } +}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "float"; + } + static float generate() { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union fi_t { float f; int32_t i; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; + auto d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP) { + if (errors < 100) { + printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a); + } + return false; + } + return true; + } +}; + +static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) { + for (uint32_t row = 0; row < height; ++row) { + for (uint32_t col = 0; col < width; ++col) { + TYPE sum(0); + for (uint32_t e = 0; e < width; ++e) { + sum += A[row * width + e] * B[e * width + col]; + } + out[row * width + col] = sum; + } + } +} + +const char* kernel_file = "kernel.vxbin"; +uint32_t size = 32; + +vx_device_h device = nullptr; +vx_buffer_h A_buffer = nullptr; +vx_buffer_h B_buffer = nullptr; +vx_buffer_h C_buffer = nullptr; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; +kernel_arg_t kernel_arg = {}; + +static void show_usage() { + std::cout << "Vortex Test." 
<< std::endl; + std::cout << "Usage: [-k: kernel] [-n size] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:k:h")) != -1) { + switch (c) { + case 'n': + size = atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'h': + show_usage(); + exit(0); + break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_mem_free(A_buffer); + vx_mem_free(B_buffer); + vx_mem_free(C_buffer); + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + + + +//// +int main(int argc, char *argv[]) { + + MPI_Init(&argc, &argv); + + int rank, world_size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + if (rank == 0) + parse_args(argc, argv); + + MPI_Bcast(&size, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD); + + // =============================== + // Create 2D Cartesian Topology + // =============================== + + int dims[2]; + dims[0] = dims[1] = std::sqrt(world_size); + + if (dims[0] * dims[1] != world_size) { + if (rank == 0) + std::cout << "World size must be perfect square\n"; + MPI_Finalize(); + return -1; + } + + int periods[2] = {1, 1}; + MPI_Comm grid_comm; + + MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &grid_comm); + + int coords[2]; + MPI_Cart_coords(grid_comm, rank, 2, coords); + + int row = coords[0]; + int col = coords[1]; + + uint32_t block_size = size / dims[0]; + uint32_t block_elems = block_size * block_size; + uint32_t block_bytes = block_elems * sizeof(TYPE); + + std::vector A_block(block_elems); + std::vector B_block(block_elems); + std::vector C_block(block_elems, 0); + + std::vector A_full, B_full; + + if (rank == 0) { + std::srand(50); + A_full.resize(size * size); + B_full.resize(size * size); + for (uint32_t i = 0; i < size*size; i++) { + A_full[i] = Comparator::generate(); + B_full[i] = Comparator::generate(); + } + } + + // 
================================== + // Manual block distribution + // ================================== + + for (int r = 0; r < world_size; r++) { + + int ccoords[2]; + MPI_Cart_coords(grid_comm, r, 2, ccoords); + + int r_row = ccoords[0]; + int r_col = ccoords[1]; + + if (rank == 0) { + + std::vector temp(block_elems); + + for (uint32_t i = 0; i < block_size; i++) + for (uint32_t j = 0; j < block_size; j++) + temp[i*block_size+j] = + A_full[(r_row*block_size+i)*size + (r_col*block_size+j)]; + + if (r == 0) + A_block = temp; + else + MPI_Send(temp.data(), block_elems, MPI_FLOAT, r, 0, MPI_COMM_WORLD); + + for (uint32_t i = 0; i < block_size; i++) + for (uint32_t j = 0; j < block_size; j++) + temp[i*block_size+j] = + B_full[(r_row*block_size+i)*size + (r_col*block_size+j)]; + + if (r == 0) + B_block = temp; + else + MPI_Send(temp.data(), block_elems, MPI_FLOAT, r, 1, MPI_COMM_WORLD); + } + + if (rank == r && rank != 0) { + MPI_Recv(A_block.data(), block_elems, MPI_FLOAT, 0, 0, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(B_block.data(), block_elems, MPI_FLOAT, 0, 1, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + } + + // ============================== + // Cannon Initial Alignment + // ============================== + + int src, dst; + + MPI_Cart_shift(grid_comm, 1, -row, &src, &dst); + MPI_Sendrecv_replace(A_block.data(), block_elems, + MPI_FLOAT, dst, 0, src, 0, + grid_comm, MPI_STATUS_IGNORE); + + MPI_Cart_shift(grid_comm, 0, -col, &src, &dst); + MPI_Sendrecv_replace(B_block.data(), block_elems, + MPI_FLOAT, dst, 1, src, 1, + grid_comm, MPI_STATUS_IGNORE); + + // ============================== + // Main Cannon Loop + // ============================== + + for (int k = 0; k < dims[0]; k++) { + + // Vortex compute block multiply + vx_device_h device; + vx_buffer_h A_buffer, B_buffer, C_buffer; + vx_buffer_h krnl_buffer, args_buffer; + kernel_arg_t kernel_arg = {}; + + RT_CHECK(vx_dev_open(&device)); + RT_CHECK(vx_mem_alloc(device, block_bytes, VX_MEM_READ, 
&A_buffer)); + RT_CHECK(vx_mem_alloc(device, block_bytes, VX_MEM_READ, &B_buffer)); + RT_CHECK(vx_mem_alloc(device, block_bytes, VX_MEM_WRITE, &C_buffer)); + + RT_CHECK(vx_mem_address(A_buffer, &kernel_arg.A_addr)); + RT_CHECK(vx_mem_address(B_buffer, &kernel_arg.B_addr)); + RT_CHECK(vx_mem_address(C_buffer, &kernel_arg.C_addr)); + + kernel_arg.grid_dim[0] = block_size; + kernel_arg.grid_dim[1] = block_size; + kernel_arg.size = block_size; + + RT_CHECK(vx_copy_to_dev(A_buffer, A_block.data(), 0, block_bytes)); + RT_CHECK(vx_copy_to_dev(B_buffer, B_block.data(), 0, block_bytes)); + + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); + RT_CHECK(vx_upload_bytes(device, &kernel_arg, + sizeof(kernel_arg_t), &args_buffer)); + + RT_CHECK(vx_start(device, krnl_buffer, args_buffer)); + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + std::vector temp(block_elems); + RT_CHECK(vx_copy_from_dev(temp.data(), C_buffer, 0, block_bytes)); + + for (uint32_t i = 0; i < block_elems; i++) + C_block[i] += temp[i]; + + vx_dev_close(device); + + // shift left A + MPI_Cart_shift(grid_comm, 1, -1, &src, &dst); + MPI_Sendrecv_replace(A_block.data(), block_elems, + MPI_FLOAT, dst, 2, src, 2, + grid_comm, MPI_STATUS_IGNORE); + + // shift up B + MPI_Cart_shift(grid_comm, 0, -1, &src, &dst); + MPI_Sendrecv_replace(B_block.data(), block_elems, + MPI_FLOAT, dst, 3, src, 3, + grid_comm, MPI_STATUS_IGNORE); + } +// ============================== +// Gather C blocks to rank 0 +// ============================== + +std::vector C_full; + +if (rank == 0) + C_full.resize(size * size); + +if (rank == 0) { + + // place own block + for (uint32_t i = 0; i < block_size; i++) + for (uint32_t j = 0; j < block_size; j++) + C_full[i*size + j] = C_block[i*block_size + j]; + + // receive others + for (int r = 1; r < world_size; r++) { + + int coords_r[2]; + MPI_Cart_coords(grid_comm, r, 2, coords_r); + + std::vector temp(block_elems); + + MPI_Recv(temp.data(), block_elems, MPI_FLOAT, + r, 99, 
MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + int r_row = coords_r[0]; + int r_col = coords_r[1]; + + for (uint32_t i = 0; i < block_size; i++) + for (uint32_t j = 0; j < block_size; j++) + C_full[(r_row*block_size+i)*size + + (r_col*block_size+j)] + = temp[i*block_size+j]; + } + +} else { + + MPI_Send(C_block.data(), block_elems, + MPI_FLOAT, 0, 99, MPI_COMM_WORLD); +} + + +if (rank == 0) { + + std::vector ref(size * size); + + matmul_cpu(ref.data(), + A_full.data(), + B_full.data(), + size, + size); + + int errors = 0; + + for (uint32_t i = 0; i < size*size; i++) { + if (!Comparator::compare(C_full[i], + ref[i], + i, + errors)) + errors++; + } + + if (errors) + std::cout << "FAILED with " + << errors << " errors\n"; + else + std::cout << "PASSED\n"; +} + + + + MPI_Finalize(); + return 0; +} \ No newline at end of file diff --git a/tests/regression/mpi_conv3/Makefile b/tests/regression/mpi_conv3/Makefile new file mode 100644 index 0000000000..479776a731 --- /dev/null +++ b/tests/regression/mpi_conv3/Makefile @@ -0,0 +1,21 @@ +ROOT_DIR := $(realpath ../../..) 
+include $(ROOT_DIR)/config.mk + +PROJECT := mpi_conv3 + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n64 + +MPI ?= 0 +NP ?= 1 + +ifdef MPI + CXX = mpic++ +endif + +include ../common.mk diff --git a/tests/regression/mpi_conv3/common.h b/tests/regression/mpi_conv3/common.h new file mode 100644 index 0000000000..25abebd076 --- /dev/null +++ b/tests/regression/mpi_conv3/common.h @@ -0,0 +1,17 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#ifndef TYPE +#define TYPE float +#endif + +typedef struct { + uint32_t grid_dim[2]; + uint32_t width; + uint64_t I_addr; + uint64_t W_addr; + uint64_t O_addr; + bool use_lmem; +} kernel_arg_t; + +#endif diff --git a/tests/regression/mpi_conv3/kernel.cpp b/tests/regression/mpi_conv3/kernel.cpp new file mode 100644 index 0000000000..27d86fd6dd --- /dev/null +++ b/tests/regression/mpi_conv3/kernel.cpp @@ -0,0 +1,47 @@ +#include +#include "common.h" + +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto I = reinterpret_cast(arg->I_addr); + auto W = reinterpret_cast(arg->use_lmem ? 
__local_mem(0) : (void*)arg->W_addr); + auto O = reinterpret_cast(arg->O_addr); + auto width = arg->width; + + int col = blockIdx.x; + int row = blockIdx.y; + + // Adjust for padded borders + int paddedWidth = width + 2; + int paddedX = col + 1; + int paddedY = row + 1; + + // Compute 3x3 convolution sum + float sum = 0.0f; + + sum += I[(paddedY - 1) * paddedWidth + (paddedX - 1)] * W[0]; // Top-left + sum += I[(paddedY - 1) * paddedWidth + paddedX] * W[1]; // Top-center + sum += I[(paddedY - 1) * paddedWidth + (paddedX + 1)] * W[2]; // Top-right + + sum += I[paddedY * paddedWidth + (paddedX - 1)] * W[3]; // Middle-left + sum += I[paddedY * paddedWidth + paddedX] * W[4]; // Center + sum += I[paddedY * paddedWidth + (paddedX + 1)] * W[5]; // Middle-right + + sum += I[(paddedY + 1) * paddedWidth + (paddedX - 1)] * W[6]; // Bottom-left + sum += I[(paddedY + 1) * paddedWidth + paddedX] * W[7]; // Bottom-center + sum += I[(paddedY + 1) * paddedWidth + (paddedX + 1)] * W[8]; // Bottom-right + + O[row * width + col] = sum; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + if (arg->use_lmem) { + // populate local memory + auto W = reinterpret_cast(arg->W_addr); + auto L = reinterpret_cast(__local_mem(0)); + for (int i = 0; i < (3*3); ++i) { + L[i] = W[i]; + } + } + return vx_spawn_threads(2, arg->grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mpi_conv3/main.cpp b/tests/regression/mpi_conv3/main.cpp new file mode 100644 index 0000000000..351b38f1be --- /dev/null +++ b/tests/regression/mpi_conv3/main.cpp @@ -0,0 +1,212 @@ +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define FLOAT_ULP 6 + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + MPI_Abort(MPI_COMM_WORLD, -1); \ + } while(false) + +template +class Comparator {}; + +template <> +class 
Comparator { +public: + static float generate() { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union { float f; int i; } fa, fb; + fa.f = a; fb.f = b; + int d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP && errors < 10) { + printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a); + return false; + } + return d <= FLOAT_ULP; + } +}; + +static void convolution_cpu(float *O, float *I, float *W, int width, int height) { + int paddedWidth = width + 2; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + int paddedY = y + 1; + int paddedX = x + 1; + float sum = 0.0f; + for (int ky = -1; ky <= 1; ++ky) + for (int kx = -1; kx <= 1; ++kx) + sum += I[(paddedY+ky)*paddedWidth + (paddedX+kx)] + * W[(ky+1)*3 + (kx+1)]; + O[y*width+x] = sum; + } + } +} + +const char* kernel_file = "kernel.vxbin"; +int size = 32; + +vx_device_h device = nullptr; +vx_buffer_h I_buf=nullptr, W_buf=nullptr, O_buf=nullptr; +vx_buffer_h krnl_buf=nullptr, args_buf=nullptr; +kernel_arg_t kernel_arg = {}; + +void cleanup() { + if (device) { + vx_mem_free(I_buf); + vx_mem_free(W_buf); + vx_mem_free(O_buf); + vx_mem_free(krnl_buf); + vx_mem_free(args_buf); + vx_dev_close(device); + } +} + +int main(int argc, char** argv) { + + MPI_Init(&argc,&argv); + + int rank, world_size; + MPI_Comm_rank(MPI_COMM_WORLD,&rank); + MPI_Comm_size(MPI_COMM_WORLD,&world_size); + +if (rank == 0) { + for (int i = 1; i < argc; i++) { + if (strncmp(argv[i], "-n", 2) == 0) { + if (strlen(argv[i]) > 2) { + // case: -n64 + size = atoi(&argv[i][2]); + } else if (i + 1 < argc) { + // case: -n 64 + size = atoi(argv[i + 1]); + } + } + } +} + + MPI_Bcast(&size,1,MPI_INT,0,MPI_COMM_WORLD); + + srand(50); + + int rows_per_rank = (size + world_size - 1)/world_size; + int start = rank*rows_per_rank; + int end = std::min(start+rows_per_rank,size); + int local_rows = end-start; + + int padded_width = size+2; + + std::vector full_I, full_W, full_O; 
+ + if(rank==0){ + full_I.resize((size+2)*(size+2)); + full_W.resize(9); + full_O.resize(size*size); + + for(int y=-1;y=0&&x=0&&y::generate() : 0; + + for(int i=0;i<9;i++) + full_W[i] = Comparator::generate(); + } + + std::vector local_I((local_rows+2)*padded_width); + std::vector local_O(local_rows*size); + std::vector local_W(9); + + if(rank==0) + local_W = full_W; + + MPI_Bcast(local_W.data(),9,MPI_FLOAT,0,MPI_COMM_WORLD); + + std::vector sendcounts(world_size), displs(world_size); + + for(int r=0;r recvcounts(world_size), recvdispls(world_size); + + for(int r=0;r ref(size*size); + convolution_cpu(ref.data(),full_I.data(),full_W.data(),size,size); + + int errors=0; + for(int i=0;i::compare(full_O[i],ref[i],i,errors)) + errors++; + + std::cout<<(errors?"FAILED\n":"PASSED\n"); + } + + cleanup(); + MPI_Finalize(); + return 0; +} \ No newline at end of file diff --git a/tests/regression/mpi_diverge/Makefile b/tests/regression/mpi_diverge/Makefile new file mode 100644 index 0000000000..0d99385b72 --- /dev/null +++ b/tests/regression/mpi_diverge/Makefile @@ -0,0 +1,21 @@ +ROOT_DIR := $(realpath ../../..) 
+include $(ROOT_DIR)/config.mk + +PROJECT := mpi_diverge + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n64 + +MPI ?= 0 +NP ?= 1 + +ifdef MPI + CXX = mpic++ +endif + +include ../common.mk diff --git a/tests/regression/mpi_diverge/common.h b/tests/regression/mpi_diverge/common.h new file mode 100644 index 0000000000..edf054657e --- /dev/null +++ b/tests/regression/mpi_diverge/common.h @@ -0,0 +1,10 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +typedef struct { + uint32_t num_points; + uint64_t src_addr; + uint64_t dst_addr; +} kernel_arg_t; + +#endif \ No newline at end of file diff --git a/tests/regression/mpi_diverge/kernel.cpp b/tests/regression/mpi_diverge/kernel.cpp new file mode 100644 index 0000000000..522a31b232 --- /dev/null +++ b/tests/regression/mpi_diverge/kernel.cpp @@ -0,0 +1,99 @@ +#include +#include +#include +#include "common.h" + +// Parallel Selection sort + +struct key_t { + uint32_t user = 0; +}; + +static __attribute__((noinline)) void hacker(key_t* key, uint32_t task_id) { + key->user = task_id; +} + +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + int32_t* src_ptr = (int32_t*)arg->src_addr; + int32_t* dst_ptr = (int32_t*)arg->dst_addr; + + uint32_t task_id = blockIdx.x; + + int value = src_ptr[task_id]; + + key_t key; + uint32_t samples = arg->num_points; + while (samples--) { + hacker(&key, task_id); + if ((key.user & 0x1) == 0) { + value += 1; + } + } + + // none taken + if (task_id >= 0x7fffffff) { + value = 0; + } else { + value += 2; + } + + // diverge + if (task_id > 1) { + if (task_id > 2) { + value += 6; + } else { + value += 5; + } + } else { + if (task_id > 0) { + value += 4; + } else { + value += 3; + } + } + + // all taken + if (task_id >= 0) { + value += 7; + } else { + value = 0; + } + + // loop + for (int i = 0, n = task_id; i < n; ++i) { + value += src_ptr[i]; + } + + // switch + switch (task_id) { + case 0: + value += 1; + break; + 
case 1: + value -= 1; + break; + case 2: + value *= 3; + break; + case 3: + value *= 5; + break; + default: + //assert(task_id < arg->num_points); + break; + } + + // select + value += (task_id >= 0) ? ((task_id > 5) ? src_ptr[0] : task_id) : ((task_id < 5) ? src_ptr[1] : -task_id); + + // min/max + value += std::min(src_ptr[task_id], value); + value += std::max(src_ptr[task_id], value); + + dst_ptr[task_id] = value; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + return vx_spawn_threads(1, &arg->num_points, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mpi_diverge/main.cpp b/tests/regression/mpi_diverge/main.cpp new file mode 100644 index 0000000000..542dee019f --- /dev/null +++ b/tests/regression/mpi_diverge/main.cpp @@ -0,0 +1,207 @@ +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +const char* kernel_file = "kernel.vxbin"; +uint32_t count = 0; + +vx_device_h device = nullptr; +vx_buffer_h src_buffer = nullptr; +vx_buffer_h dst_buffer = nullptr; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; +kernel_arg_t kernel_arg = {}; + +void cleanup() { + if (device) { + vx_mem_free(src_buffer); + vx_mem_free(dst_buffer); + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + +void gen_src_data(std::vector& src_data, uint32_t size) { + src_data.resize(size); + for (uint32_t i = 0; i < size; ++i) { + int value = std::rand(); + src_data[i] = value; + //std::cout << std::dec << i << ": value=0x" << std::hex << value << std::endl; + } +} + +void gen_ref_data(std::vector& ref_data, const std::vector& src_data, uint32_t size) { + ref_data.resize(size); + for (int i = 0; i < (int)size; ++i) { + int value = src_data.at(i); + + 
uint32_t samples = size; + while (samples--) { + if ((i & 0x1) == 0) { + value += 1; + } + } + + // none taken + if (i >= 0x7fffffff) { + value = 0; + } else { + value += 2; + } + + // diverge + if (i > 1) { + if (i > 2) { + value += 6; + } else { + value += 5; + } + } else { + if (i > 0) { + value += 4; + } else { + value += 3; + } + } + + // all taken + if (i >= 0) { + value += 7; + } else { + value = 0; + } + + // loop + for (int j = 0, n = i; j < n; ++j) { + value += src_data.at(j); + } + + // switch + switch (i) { + case 0: + value += 1; + break; + case 1: + value -= 1; + break; + case 2: + value *= 3; + break; + case 3: + value *= 5; + break; + default: + assert(i < (int)size); + break; + } + + // select + value += (i >= 0) ? ((i > 5) ? src_data.at(0) : i) : ((i < 5) ? src_data.at(1) : -i); + + // min/max + value += std::min(src_data.at(i), value); + value += std::max(src_data.at(i), value); + + ref_data[i] = value; + } +} + +int main(int argc, char** argv) { + MPI_Init(&argc, &argv); + + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // parse args + int c; + while ((c = getopt(argc, argv, "n:k:h")) != -1) { + switch (c) { + case 'n': count = atoi(optarg); break; + case 'k': kernel_file = optarg; break; + case 'h': if(rank==0) std::cout<<"Usage: -n count -k kernel\n"; MPI_Finalize(); return 0; + default: if(rank==0) std::cout<<"Invalid\n"; MPI_Finalize(); return -1; + } + } + if(count==0) count=1; + + std::srand(50); + + // open device + RT_CHECK(vx_dev_open(&device)); + + uint64_t cores, warps, threads; + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &cores)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &warps)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &threads)); + + uint32_t total_threads = cores*warps*threads; + uint32_t num_points = count*total_threads; + uint32_t buf_size = num_points*sizeof(int32_t); + + // allocate full buffers on each rank + kernel_arg.num_points = num_points; + 
std::vector full_src(num_points); + std::vector full_dst(num_points); + + if(rank==0) { + for(uint32_t i=0;i h_ref(num_points); + gen_ref_data(h_ref, full_src, num_points); + + int errors=0; + for(uint32_t i=0;i +#include "common.h" + +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto src0_ptr = reinterpret_cast(arg->src0_addr); + auto src1_ptr = reinterpret_cast(arg->src1_addr); + auto dst_ptr = reinterpret_cast(arg->dst_addr); + auto num_points = arg->num_points; + + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int cacheIndex = threadIdx.x; + + auto cache = reinterpret_cast(__local_mem(blockDim.x * sizeof(TYPE))); + + float temp = 0; + while (tid < num_points){ + temp += src0_ptr[tid] * src1_ptr[tid]; + tid += blockDim.x * gridDim.x; + } + + // set the cache values + cache[cacheIndex] = temp; + + __syncthreads(); + + int i = blockDim.x/2; + while (i != 0){ + if (cacheIndex < i) + cache[cacheIndex] += cache[cacheIndex + i]; + __syncthreads(); + i /= 2; + } + + if (cacheIndex == 0) + dst_ptr[blockIdx.x] = cache[0]; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + return vx_spawn_threads(1, arg->grid_dim, arg->block_dim, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mpi_dotproduct/main.cpp b/tests/regression/mpi_dotproduct/main.cpp new file mode 100644 index 0000000000..717946e31c --- /dev/null +++ b/tests/regression/mpi_dotproduct/main.cpp @@ -0,0 +1,194 @@ +#include +#include +#include +#include +#include +#include "common.h" + +#define FLOAT_ULP 6 + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + MPI_Abort(MPI_COMM_WORLD, -1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +template +class Comparator {}; + +template <> +class Comparator { +public: + static float generate(uint32_t idx) { + return static_cast(rand()) / 
RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union { float f; int i; } fa, fb; + fa.f = a; fb.f = b; + int d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP && errors < 100) { + printf("*** error: expected=%f, actual=%f\n", b, a); + return false; + } + return d <= FLOAT_ULP; + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.vxbin"; +uint32_t size = 16; + +vx_device_h device = nullptr; +vx_buffer_h src0_buffer = nullptr; +vx_buffer_h src1_buffer = nullptr; +vx_buffer_h dst_buffer = nullptr; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; +kernel_arg_t kernel_arg = {}; + +void cleanup() { + if (device) { + vx_mem_free(src0_buffer); + vx_mem_free(src1_buffer); + vx_mem_free(dst_buffer); + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:k:h")) != -1) { + switch (c) { + case 'n': size = atoi(optarg); break; + case 'k': kernel_file = optarg; break; + case 'h': exit(0); + default: exit(-1); + } + } +} + +/////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char* argv[]) { + + MPI_Init(&argc, &argv); + + int rank, world_size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + if (rank == 0) parse_args(argc, argv); + MPI_Bcast(&size, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD); + + std::srand(50); + + // balanced partitioning + uint32_t base = size / world_size; + uint32_t rem = size % world_size; + + uint32_t local_size = (rank < rem) ? 
base + 1 : base; + uint32_t start = rank * base + std::min(rank, (int)rem); + + // full input on rank 0 + std::vector full_src0, full_src1; + if (rank == 0) { + full_src0.resize(size); + full_src1.resize(size); + for (uint32_t i = 0; i < size; i++) { + full_src0[i] = Comparator::generate(i); + full_src1[i] = Comparator::generate(i); + } + } + + // Scatter setup + std::vector counts(world_size), displs(world_size); + for (int r = 0; r < world_size; r++) { + counts[r] = (r < rem) ? base + 1 : base; + displs[r] = r * base + std::min(r, (int)rem); + } + + std::vector h_src0(local_size); + std::vector h_src1(local_size); + + MPI_Scatterv(full_src0.data(), counts.data(), displs.data(), MPI_FLOAT, + h_src0.data(), local_size, MPI_FLOAT, 0, MPI_COMM_WORLD); + + MPI_Scatterv(full_src1.data(), counts.data(), displs.data(), MPI_FLOAT, + h_src1.data(), local_size, MPI_FLOAT, 0, MPI_COMM_WORLD); + + // Open device + RT_CHECK(vx_dev_open(&device)); + + const uint32_t threadsPerBlock = 8; + const uint32_t blocksPerGrid = + (local_size + threadsPerBlock - 1) / threadsPerBlock; + + uint32_t buf_size = local_size * sizeof(TYPE); + uint32_t dst_buf_size = blocksPerGrid * sizeof(TYPE); + + kernel_arg.num_points = local_size; + kernel_arg.block_dim[0] = threadsPerBlock; + kernel_arg.grid_dim[0] = blocksPerGrid; + + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer)); + RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr)); + + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer)); + RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr)); + + RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_WRITE, &dst_buffer)); + RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr)); + + RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size)); + RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size)); + + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); + RT_CHECK(vx_upload_bytes(device, &kernel_arg, + 
sizeof(kernel_arg_t), &args_buffer)); + + RT_CHECK(vx_start(device, krnl_buffer, args_buffer)); + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + std::vector h_dst(blocksPerGrid); + RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, dst_buf_size)); + + // Local reduction of block outputs + TYPE local_sum = 0; + for (uint32_t i = 0; i < blocksPerGrid; i++) + local_sum += h_dst[i]; + + TYPE global_sum = 0; + MPI_Reduce(&local_sum, &global_sum, + 1, MPI_FLOAT, MPI_SUM, + 0, MPI_COMM_WORLD); + + // Final verification + if (rank == 0) { + + TYPE ref = 0; + for (uint32_t i = 0; i < size; i++) + ref += full_src0[i] * full_src1[i]; + + int errors = 0; + if (!Comparator::compare(global_sum, ref, 0, errors)) + errors++; + + if (errors) + std::cout << "FAILED!\n"; + else + std::cout << "PASSED!\n"; + } + + cleanup(); + MPI_Finalize(); + return 0; +} \ No newline at end of file diff --git a/tests/regression/mpi_neighbor_a2a_conv3/Makefile b/tests/regression/mpi_neighbor_a2a_conv3/Makefile new file mode 100644 index 0000000000..ac99098f9c --- /dev/null +++ b/tests/regression/mpi_neighbor_a2a_conv3/Makefile @@ -0,0 +1,21 @@ +ROOT_DIR := $(realpath ../../..) 
+include $(ROOT_DIR)/config.mk + +PROJECT := mpi_neighbor_a2a_conv3 + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n64 + +MPI ?= 0 +NP ?= 1 + +ifdef MPI + CXX = mpic++ +endif + +include ../common.mk diff --git a/tests/regression/mpi_neighbor_a2a_conv3/common.h b/tests/regression/mpi_neighbor_a2a_conv3/common.h new file mode 100644 index 0000000000..25abebd076 --- /dev/null +++ b/tests/regression/mpi_neighbor_a2a_conv3/common.h @@ -0,0 +1,17 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#ifndef TYPE +#define TYPE float +#endif + +typedef struct { + uint32_t grid_dim[2]; + uint32_t width; + uint64_t I_addr; + uint64_t W_addr; + uint64_t O_addr; + bool use_lmem; +} kernel_arg_t; + +#endif diff --git a/tests/regression/mpi_neighbor_a2a_conv3/kernel.cpp b/tests/regression/mpi_neighbor_a2a_conv3/kernel.cpp new file mode 100644 index 0000000000..27d86fd6dd --- /dev/null +++ b/tests/regression/mpi_neighbor_a2a_conv3/kernel.cpp @@ -0,0 +1,47 @@ +#include +#include "common.h" + +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto I = reinterpret_cast(arg->I_addr); + auto W = reinterpret_cast(arg->use_lmem ? 
__local_mem(0) : (void*)arg->W_addr); + auto O = reinterpret_cast(arg->O_addr); + auto width = arg->width; + + int col = blockIdx.x; + int row = blockIdx.y; + + // Adjust for padded borders + int paddedWidth = width + 2; + int paddedX = col + 1; + int paddedY = row + 1; + + // Compute 3x3 convolution sum + float sum = 0.0f; + + sum += I[(paddedY - 1) * paddedWidth + (paddedX - 1)] * W[0]; // Top-left + sum += I[(paddedY - 1) * paddedWidth + paddedX] * W[1]; // Top-center + sum += I[(paddedY - 1) * paddedWidth + (paddedX + 1)] * W[2]; // Top-right + + sum += I[paddedY * paddedWidth + (paddedX - 1)] * W[3]; // Middle-left + sum += I[paddedY * paddedWidth + paddedX] * W[4]; // Center + sum += I[paddedY * paddedWidth + (paddedX + 1)] * W[5]; // Middle-right + + sum += I[(paddedY + 1) * paddedWidth + (paddedX - 1)] * W[6]; // Bottom-left + sum += I[(paddedY + 1) * paddedWidth + paddedX] * W[7]; // Bottom-center + sum += I[(paddedY + 1) * paddedWidth + (paddedX + 1)] * W[8]; // Bottom-right + + O[row * width + col] = sum; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + if (arg->use_lmem) { + // populate local memory + auto W = reinterpret_cast(arg->W_addr); + auto L = reinterpret_cast(__local_mem(0)); + for (int i = 0; i < (3*3); ++i) { + L[i] = W[i]; + } + } + return vx_spawn_threads(2, arg->grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mpi_neighbor_a2a_conv3/main.cpp b/tests/regression/mpi_neighbor_a2a_conv3/main.cpp new file mode 100644 index 0000000000..3e458d3744 --- /dev/null +++ b/tests/regression/mpi_neighbor_a2a_conv3/main.cpp @@ -0,0 +1,280 @@ +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define FLOAT_ULP 6 + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + MPI_Abort(MPI_COMM_WORLD, -1); \ + } while(false) + +template +class 
Comparator {}; + +template <> +class Comparator { +public: + static float generate() { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union { float f; int i; } fa, fb; + fa.f = a; fb.f = b; + int d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP && errors < 10) { + printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a); + return false; + } + return d <= FLOAT_ULP; + } +}; + +static void convolution_cpu(float *O, float *I, float *W, int width, int height) { + int paddedWidth = width + 2; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + int paddedY = y + 1; + int paddedX = x + 1; + float sum = 0.0f; + for (int ky = -1; ky <= 1; ++ky) + for (int kx = -1; kx <= 1; ++kx) + sum += I[(paddedY+ky)*paddedWidth + (paddedX+kx)] + * W[(ky+1)*3 + (kx+1)]; + O[y*width+x] = sum; + } + } +} + +const char* kernel_file = "kernel.vxbin"; +int size = 32; + +vx_device_h device = nullptr; +vx_buffer_h I_buf=nullptr, W_buf=nullptr, O_buf=nullptr; +vx_buffer_h krnl_buf=nullptr, args_buf=nullptr; +kernel_arg_t kernel_arg = {}; + +void cleanup() { + if (device) { + vx_mem_free(I_buf); + vx_mem_free(W_buf); + vx_mem_free(O_buf); + vx_mem_free(krnl_buf); + vx_mem_free(args_buf); + vx_dev_close(device); + } +} + + +int main(int argc, char** argv) { + + MPI_Init(&argc,&argv); + + int rank, world_size; + MPI_Comm_rank(MPI_COMM_WORLD,&rank); + MPI_Comm_size(MPI_COMM_WORLD,&world_size); + + if (rank == 0) { + for (int i = 1; i < argc; i++) { + if (strncmp(argv[i], "-n", 2) == 0) { + if (strlen(argv[i]) > 2) + size = atoi(&argv[i][2]); + else if (i+1 < argc) + size = atoi(argv[i+1]); + } + } + } + + MPI_Bcast(&size,1,MPI_INT,0,MPI_COMM_WORLD); + + srand(50); + + // ========================================== + // 1D Cartesian Topology + // ========================================== + + int dims[1] = {world_size}; + int periods[1] = {0}; + MPI_Comm cart_comm; + + 
MPI_Cart_create(MPI_COMM_WORLD,1,dims,periods,0,&cart_comm); + + int up, down; + MPI_Cart_shift(cart_comm,0,1,&up,&down); + + int rows_per_rank = (size + world_size - 1)/world_size; + int start = rank*rows_per_rank; + int end = std::min(start+rows_per_rank,size); + int local_rows = end-start; + + int padded_width = size+2; + + // ========================================== + // Root initializes full image + kernel + // ========================================== + + std::vector full_I, full_W, full_O; + + if(rank==0){ + full_I.resize((size+2)*(size+2)); + full_W.resize(9); + full_O.resize(size*size); + + for(int y=-1;y=0&&x=0&&y::generate() : 0; + + for(int i=0;i<9;i++) + full_W[i] = Comparator::generate(); + } + + // ========================================== + // Distribute only interior rows + // ========================================== + + std::vector local_I((local_rows+2)*padded_width,0); + std::vector local_O(local_rows*size); + std::vector local_W(9); + + if(rank==0) + local_W = full_W; + + MPI_Bcast(local_W.data(),9,MPI_FLOAT,0,MPI_COMM_WORLD); + + // Scatter only interior rows (no halos) + std::vector sendcounts(world_size), displs(world_size); + + for(int r=0;r sendbuf(2*padded_width); + std::vector recvbuf(2*padded_width); + + // Pack top and bottom interior rows + memcpy(sendbuf.data(), + local_I.data()+padded_width, + padded_width*sizeof(float)); + + memcpy(sendbuf.data()+padded_width, + local_I.data()+local_rows*padded_width, + padded_width*sizeof(float)); + + MPI_Neighbor_alltoall(sendbuf.data(), + padded_width, + MPI_FLOAT, + recvbuf.data(), + padded_width, + MPI_FLOAT, + cart_comm); + + // Unpack halos + if(up != MPI_PROC_NULL) + memcpy(local_I.data(), + recvbuf.data(), + padded_width*sizeof(float)); + + if(down != MPI_PROC_NULL) + memcpy(local_I.data()+(local_rows+1)*padded_width, + recvbuf.data()+padded_width, + padded_width*sizeof(float)); + + // ========================================== + // Vortex Execution (unchanged) + // 
========================================== + + RT_CHECK(vx_dev_open(&device)); + + kernel_arg.width=size; + kernel_arg.grid_dim[0]=size; + kernel_arg.grid_dim[1]=local_rows; + kernel_arg.use_lmem=false; + + RT_CHECK(vx_mem_alloc(device,local_I.size()*sizeof(float),VX_MEM_READ,&I_buf)); + RT_CHECK(vx_mem_alloc(device,9*sizeof(float),VX_MEM_READ,&W_buf)); + RT_CHECK(vx_mem_alloc(device,local_O.size()*sizeof(float),VX_MEM_WRITE,&O_buf)); + + RT_CHECK(vx_mem_address(I_buf,&kernel_arg.I_addr)); + RT_CHECK(vx_mem_address(W_buf,&kernel_arg.W_addr)); + RT_CHECK(vx_mem_address(O_buf,&kernel_arg.O_addr)); + + RT_CHECK(vx_copy_to_dev(I_buf,local_I.data(),0,local_I.size()*sizeof(float))); + RT_CHECK(vx_copy_to_dev(W_buf,local_W.data(),0,9*sizeof(float))); + + RT_CHECK(vx_upload_kernel_file(device,kernel_file,&krnl_buf)); + RT_CHECK(vx_upload_bytes(device,&kernel_arg,sizeof(kernel_arg),&args_buf)); + + RT_CHECK(vx_start(device,krnl_buf,args_buf)); + RT_CHECK(vx_ready_wait(device,VX_MAX_TIMEOUT)); + + RT_CHECK(vx_copy_from_dev(local_O.data(),O_buf,0,local_O.size()*sizeof(float))); + + // ========================================== + // Gather results + // ========================= ================= + + std::vector recvcounts(world_size), recvdispls(world_size); + + for(int r=0;r ref(size*size); + convolution_cpu(ref.data(),full_I.data(),full_W.data(),size,size); + + int errors=0; + for(int i=0;i::compare(full_O[i],ref[i],i,errors)) + errors++; + + std::cout<<(errors?"FAILED\n":"PASSED\n"); + } + + cleanup(); + MPI_Finalize(); + return 0; +} \ No newline at end of file diff --git a/tests/regression/mpi_put_dotproduct/Makefile b/tests/regression/mpi_put_dotproduct/Makefile new file mode 100644 index 0000000000..4e96ff00b6 --- /dev/null +++ b/tests/regression/mpi_put_dotproduct/Makefile @@ -0,0 +1,21 @@ +ROOT_DIR := $(realpath ../../..) 
+include $(ROOT_DIR)/config.mk + +PROJECT := mpi_put_dotproduct + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n64 + +MPI ?= 0 +NP ?= 1 + +ifdef MPI + CXX = mpic++ +endif + +include ../common.mk diff --git a/tests/regression/mpi_put_dotproduct/common.h b/tests/regression/mpi_put_dotproduct/common.h new file mode 100644 index 0000000000..f5dd2c9e89 --- /dev/null +++ b/tests/regression/mpi_put_dotproduct/common.h @@ -0,0 +1,17 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#ifndef TYPE +#define TYPE float +#endif + +typedef struct { + uint32_t grid_dim[1]; + uint32_t block_dim[1]; + uint32_t num_points; + uint64_t src0_addr; + uint64_t src1_addr; + uint64_t dst_addr; +} kernel_arg_t; + +#endif diff --git a/tests/regression/mpi_put_dotproduct/kernel.cpp b/tests/regression/mpi_put_dotproduct/kernel.cpp new file mode 100644 index 0000000000..65ad1afd34 --- /dev/null +++ b/tests/regression/mpi_put_dotproduct/kernel.cpp @@ -0,0 +1,41 @@ +#include +#include "common.h" + +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto src0_ptr = reinterpret_cast(arg->src0_addr); + auto src1_ptr = reinterpret_cast(arg->src1_addr); + auto dst_ptr = reinterpret_cast(arg->dst_addr); + auto num_points = arg->num_points; + + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int cacheIndex = threadIdx.x; + + auto cache = reinterpret_cast(__local_mem(blockDim.x * sizeof(TYPE))); + + float temp = 0; + while (tid < num_points){ + temp += src0_ptr[tid] * src1_ptr[tid]; + tid += blockDim.x * gridDim.x; + } + + // set the cache values + cache[cacheIndex] = temp; + + __syncthreads(); + + int i = blockDim.x/2; + while (i != 0){ + if (cacheIndex < i) + cache[cacheIndex] += cache[cacheIndex + i]; + __syncthreads(); + i /= 2; + } + + if (cacheIndex == 0) + dst_ptr[blockIdx.x] = cache[0]; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + return vx_spawn_threads(1, 
arg->grid_dim, arg->block_dim, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mpi_put_dotproduct/main.cpp b/tests/regression/mpi_put_dotproduct/main.cpp new file mode 100644 index 0000000000..fe6b363e4c --- /dev/null +++ b/tests/regression/mpi_put_dotproduct/main.cpp @@ -0,0 +1,229 @@ +#include +#include +#include +#include +#include +#include "common.h" + +#define FLOAT_ULP 6 + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + MPI_Abort(MPI_COMM_WORLD, -1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +template +class Comparator {}; + +template <> +class Comparator { +public: + static float generate(uint32_t idx) { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union { float f; int i; } fa, fb; + fa.f = a; fb.f = b; + int d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP && errors < 100) { + printf("*** error: expected=%f, actual=%f\n", b, a); + return false; + } + return d <= FLOAT_ULP; + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.vxbin"; +uint32_t size = 16; + +vx_device_h device = nullptr; +vx_buffer_h src0_buffer = nullptr; +vx_buffer_h src1_buffer = nullptr; +vx_buffer_h dst_buffer = nullptr; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; +kernel_arg_t kernel_arg = {}; + +void cleanup() { + if (device) { + vx_mem_free(src0_buffer); + vx_mem_free(src1_buffer); + vx_mem_free(dst_buffer); + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:k:h")) != -1) { + switch (c) { + case 'n': size = atoi(optarg); break; + case 'k': kernel_file = optarg; break; + case 'h': exit(0); 
+ default: exit(-1); + } + } +} + + +int main(int argc, char* argv[]) { + + MPI_Init(&argc, &argv); + + int rank, world_size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + if (rank == 0) parse_args(argc, argv); + MPI_Bcast(&size, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD); + + std::srand(50); + + uint32_t base = size / world_size; + uint32_t rem = size % world_size; + + uint32_t local_size = (rank < rem) ? base + 1 : base; + uint32_t start = rank * base + std::min(rank, (int)rem); + + std::vector full_src0, full_src1; + if (rank == 0) { + full_src0.resize(size); + full_src1.resize(size); + for (uint32_t i = 0; i < size; i++) { + full_src0[i] = Comparator::generate(i); + full_src1[i] = Comparator::generate(i); + } + } + + std::vector counts(world_size), displs(world_size); + for (int r = 0; r < world_size; r++) { + counts[r] = (r < rem) ? base + 1 : base; + displs[r] = r * base + std::min(r, (int)rem); + } + + std::vector h_src0(local_size); + std::vector h_src1(local_size); + + MPI_Scatterv(full_src0.data(), counts.data(), displs.data(), MPI_FLOAT, + h_src0.data(), local_size, MPI_FLOAT, 0, MPI_COMM_WORLD); + + MPI_Scatterv(full_src1.data(), counts.data(), displs.data(), MPI_FLOAT, + h_src1.data(), local_size, MPI_FLOAT, 0, MPI_COMM_WORLD); + + // ================================ + // VORTEX EXECUTION (unchanged) + // ================================ + + RT_CHECK(vx_dev_open(&device)); + + const uint32_t threadsPerBlock = 8; + const uint32_t blocksPerGrid = + (local_size + threadsPerBlock - 1) / threadsPerBlock; + + uint32_t buf_size = local_size * sizeof(TYPE); + uint32_t dst_buf_size = blocksPerGrid * sizeof(TYPE); + + kernel_arg.num_points = local_size; + kernel_arg.block_dim[0] = threadsPerBlock; + kernel_arg.grid_dim[0] = blocksPerGrid; + + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer)); + RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr)); + + RT_CHECK(vx_mem_alloc(device, buf_size, 
VX_MEM_READ, &src1_buffer)); + RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr)); + + RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_WRITE, &dst_buffer)); + RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr)); + + RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size)); + RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size)); + + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); + RT_CHECK(vx_upload_bytes(device, &kernel_arg, + sizeof(kernel_arg_t), &args_buffer)); + + RT_CHECK(vx_start(device, krnl_buffer, args_buffer)); + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + std::vector h_dst(blocksPerGrid); + RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, dst_buf_size)); + + TYPE local_sum = 0; + for (uint32_t i = 0; i < blocksPerGrid; i++) + local_sum += h_dst[i]; + + // ================================ + // RMA SECTION (MPI_Put) + // ================================ + + std::vector window_buf; + if (rank == 0) + window_buf.resize(world_size); + + MPI_Win win; + MPI_Win_create(rank == 0 ? window_buf.data() : nullptr, + rank == 0 ? 
world_size * sizeof(TYPE) : 0, + sizeof(TYPE), + MPI_INFO_NULL, + MPI_COMM_WORLD, + &win); + + MPI_Win_fence(0, win); + + // Each rank writes its local_sum into slot [rank] on rank 0 + MPI_Put(&local_sum, + 1, + MPI_FLOAT, + 0, + rank, + 1, + MPI_FLOAT, + win); + + MPI_Win_fence(0, win); + + TYPE global_sum = 0; + + if (rank == 0) { + for (int r = 0; r < world_size; r++) + global_sum += window_buf[r]; + } + + MPI_Win_free(&win); + + // ================================ + // Verification + // ================================ + + if (rank == 0) { + + TYPE ref = 0; + for (uint32_t i = 0; i < size; i++) + ref += full_src0[i] * full_src1[i]; + + int errors = 0; + if (!Comparator::compare(global_sum, ref, 0, errors)) + errors++; + + if (errors) + std::cout << "FAILED!\n"; + else + std::cout << "PASSED!\n"; + } + + cleanup(); + MPI_Finalize(); + return 0; +} \ No newline at end of file diff --git a/tests/regression/mpi_sgemm/Makefile b/tests/regression/mpi_sgemm/Makefile new file mode 100644 index 0000000000..73f93674e0 --- /dev/null +++ b/tests/regression/mpi_sgemm/Makefile @@ -0,0 +1,21 @@ +ROOT_DIR := $(realpath ../../..) 
+include $(ROOT_DIR)/config.mk + +PROJECT := mpi_sgemm + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n64 + +MPI ?= 0 +NP ?= 1 + +ifdef MPI + CXX = mpic++ +endif + +include ../common.mk \ No newline at end of file diff --git a/tests/regression/mpi_sgemm/common.h b/tests/regression/mpi_sgemm/common.h new file mode 100644 index 0000000000..b58ec5f5e1 --- /dev/null +++ b/tests/regression/mpi_sgemm/common.h @@ -0,0 +1,16 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#ifndef TYPE +#define TYPE float +#endif + +typedef struct { + uint32_t grid_dim[2]; + uint32_t size; + uint64_t A_addr; + uint64_t B_addr; + uint64_t C_addr; +} kernel_arg_t; + +#endif diff --git a/tests/regression/mpi_sgemm/kernel.cpp b/tests/regression/mpi_sgemm/kernel.cpp new file mode 100644 index 0000000000..5b67113ec1 --- /dev/null +++ b/tests/regression/mpi_sgemm/kernel.cpp @@ -0,0 +1,24 @@ +#include +#include "common.h" + +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto A = reinterpret_cast(arg->A_addr); + auto B = reinterpret_cast(arg->B_addr); + auto C = reinterpret_cast(arg->C_addr); + auto size = arg->size; + + int col = blockIdx.x; + int row = blockIdx.y; + + TYPE sum(0); + for (int e = 0; e < size; ++e) { + sum += A[row * size + e] * B[e * size + col]; + } + + C[row * size + col] = sum; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + return vx_spawn_threads(2, arg->grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mpi_sgemm/main.cpp b/tests/regression/mpi_sgemm/main.cpp new file mode 100644 index 0000000000..ad353563cd --- /dev/null +++ b/tests/regression/mpi_sgemm/main.cpp @@ -0,0 +1,289 @@ +#include +#include +#include +#include +#include +#include +#include +#include "common.h" +#include + +#define FLOAT_ULP 6 + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: 
'%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +template +class Comparator {}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "integer"; + } + static int generate() { + return rand(); + } + static bool compare(int a, int b, int index, int errors) { + if (a != b) { + if (errors < 100) { + printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a); + } + return false; + } + return true; + } +}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "float"; + } + static float generate() { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union fi_t { float f; int32_t i; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; + auto d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP) { + if (errors < 100) { + printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a); + } + return false; + } + return true; + } +}; + +static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) { + for (uint32_t row = 0; row < height; ++row) { + for (uint32_t col = 0; col < width; ++col) { + TYPE sum(0); + for (uint32_t e = 0; e < width; ++e) { + sum += A[row * width + e] * B[e * width + col]; + } + out[row * width + col] = sum; + } + } +} + +const char* kernel_file = "kernel.vxbin"; +uint32_t size = 32; + +vx_device_h device = nullptr; +vx_buffer_h A_buffer = nullptr; +vx_buffer_h B_buffer = nullptr; +vx_buffer_h C_buffer = nullptr; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; +kernel_arg_t kernel_arg = {}; + +static void show_usage() { + std::cout << "Vortex Test." 
<< std::endl; + std::cout << "Usage: [-k: kernel] [-n size] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:k:h")) != -1) { + switch (c) { + case 'n': + size = atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'h': + show_usage(); + exit(0); + break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_mem_free(A_buffer); + vx_mem_free(B_buffer); + vx_mem_free(C_buffer); + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + + + +//// + + + + +int main(int argc, char *argv[]) { + + MPI_Init(&argc, &argv); + + int rank, world_size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + if (rank == 0) + parse_args(argc, argv); + + // Broadcast matrix size + MPI_Bcast(&size, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD); + + uint32_t size_sq = size * size; + + // Compute row chunk (ceil division) + uint32_t rows_per_rank = (size + world_size - 1) / world_size; + uint32_t row_start = rank * rows_per_rank; + uint32_t row_end = std::min(row_start + rows_per_rank, size); + uint32_t local_rows = row_end - row_start; + + uint32_t local_elems = local_rows * size; + uint32_t local_buf_size = local_elems * sizeof(TYPE); + uint32_t full_buf_size = size_sq * sizeof(TYPE); + + std::vector h_A_full; + std::vector h_B(size_sq); + std::vector h_C_full; + + // Only rank 0 initializes + if (rank == 0) { + std::srand(50); + h_A_full.resize(size_sq); + h_C_full.resize(size_sq); + + for (uint32_t i = 0; i < size_sq; ++i) { + h_A_full[i] = Comparator::generate(); + h_B[i] = Comparator::generate(); + } + } + + // Allocate local A and C + std::vector h_A_local(local_elems); + std::vector h_C_local(local_elems); + + // Prepare scatter metadata + std::vector sendcounts(world_size); + std::vector displs(world_size); + + for (int i = 0; i < world_size; i++) { + uint32_t rs = i * rows_per_rank; + 
uint32_t re = std::min(rs + rows_per_rank, size); + sendcounts[i] = (re - rs) * size; + displs[i] = rs * size; + } + + // Scatter rows of A + MPI_Scatterv( + h_A_full.data(), + sendcounts.data(), + displs.data(), + MPI_FLOAT, + h_A_local.data(), + local_elems, + MPI_FLOAT, + 0, + MPI_COMM_WORLD); + + // Broadcast full B + MPI_Bcast( + h_B.data(), + size_sq, + MPI_FLOAT, + 0, + MPI_COMM_WORLD); + + // ============================ + // VORTEX EXECUTION PER RANK + // ============================ + + vx_device_h device; + vx_buffer_h A_buffer, B_buffer, C_buffer; + vx_buffer_h krnl_buffer, args_buffer; + kernel_arg_t kernel_arg = {}; + + RT_CHECK(vx_dev_open(&device)); + + RT_CHECK(vx_mem_alloc(device, local_buf_size, VX_MEM_READ, &A_buffer)); + RT_CHECK(vx_mem_alloc(device, full_buf_size, VX_MEM_READ, &B_buffer)); + RT_CHECK(vx_mem_alloc(device, local_buf_size, VX_MEM_WRITE, &C_buffer)); + + RT_CHECK(vx_mem_address(A_buffer, &kernel_arg.A_addr)); + RT_CHECK(vx_mem_address(B_buffer, &kernel_arg.B_addr)); + RT_CHECK(vx_mem_address(C_buffer, &kernel_arg.C_addr)); + + kernel_arg.grid_dim[0] = size; + kernel_arg.grid_dim[1] = local_rows; + kernel_arg.size = size; + + RT_CHECK(vx_copy_to_dev(A_buffer, h_A_local.data(), 0, local_buf_size)); + RT_CHECK(vx_copy_to_dev(B_buffer, h_B.data(), 0, full_buf_size)); + + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer)); + + RT_CHECK(vx_start(device, krnl_buffer, args_buffer)); + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + RT_CHECK(vx_copy_from_dev(h_C_local.data(), C_buffer, 0, local_buf_size)); + + // ============================ + // GATHER RESULTS + // ============================ + + MPI_Gatherv( + h_C_local.data(), + local_elems, + MPI_FLOAT, + h_C_full.data(), + sendcounts.data(), + displs.data(), + MPI_FLOAT, + 0, + MPI_COMM_WORLD); + + // ============================ + // VERIFY (Rank 0) + // 
============================ + + if (rank == 0) { + std::vector h_ref(size_sq); + matmul_cpu(h_ref.data(), h_A_full.data(), h_B.data(), size, size); + + int errors = 0; + for (uint32_t i = 0; i < size_sq; ++i) { + if (!Comparator::compare(h_C_full[i], h_ref[i], i, errors)) + errors++; + } + + if (errors) + std::cout << "FAILED\n"; + else + std::cout << "PASSED\n"; + } + + vx_dev_close(device); + MPI_Finalize(); + return 0; +} \ No newline at end of file diff --git a/tests/regression/mpi_vecadd/Makefile b/tests/regression/mpi_vecadd/Makefile new file mode 100644 index 0000000000..7fb9732810 --- /dev/null +++ b/tests/regression/mpi_vecadd/Makefile @@ -0,0 +1,21 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := mpi_vecadd + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n64 + +MPI ?= 0 +NP ?= 1 + +ifdef MPI + CXX = mpic++ +endif + +include ../common.mk \ No newline at end of file diff --git a/tests/regression/mpi_vecadd/README.md b/tests/regression/mpi_vecadd/README.md new file mode 100644 index 0000000000..c03a26787e --- /dev/null +++ b/tests/regression/mpi_vecadd/README.md @@ -0,0 +1,91 @@ +## MPI With SIMX + + +### Usage + +``` +Apptainer> ./ci/blackbox.sh --cores=2 --app=mpi_vecadd --driver=simx --np=4 --args="-n5000" +CONFIGS=-DNUM_CORES=2 +Running: CONFIGS="-DNUM_CORES=2" make -C ./ci/../runtime/simx > /dev/null +Running: OPTS="-n5000" NP=4 make -C "./ci/../tests/regression/mpi_vecadd" run-simx +make: Entering directory '/home/vortex/build/tests/regression/mpi_vecadd' +LD_LIBRARY_PATH=/home/vortex/build/runtime:/opt/boost-1.66/lib:/opt/openssl-1.1/lib::/.singularity.d/libs VORTEX_DRIVER=simx mpirun --allow-run-as-root --oversubscribe -np 4 ./mpi_vecadd -n5000 +rank = 3, world_size = 4 +rank = 0, world_size = 4 +rank = 1, world_size = 4 +rank = 2, world_size = 4 +Rank: 3- Upload kernel binary +Rank: 0- Upload kernel binary +Rank: 1- Upload kernel 
binary +Rank: 2- Upload kernel binary +PERF: core0: instrs=22440, cycles=59003, IPC=0.380320 +PERF: core1: instrs=22440, cycles=58635, IPC=0.382707 +PERF: instrs=44880, cycles=59003, IPC=0.760639 +PERF: core0: instrs=22440, cycles=59003, IPC=0.380320 +PERF: core1: instrs=22440, cycles=58635, IPC=0.382707 +PERF: instrs=44880, cycles=59003, IPC=0.760639 +PERF: core0: instrs=22440, cycles=59003, IPC=0.380320 +PERF: core1: instrs=22440, cycles=58635, IPC=0.382707 +PERF: instrs=44880, cycles=59003, IPC=0.760639 +PASSED! +PERF: core0: instrs=22440, cycles=59003, IPC=0.380320 +PERF: core1: instrs=22440, cycles=58635, IPC=0.382707 +PERF: instrs=44880, cycles=59003, IPC=0.760639 +make: Leaving directory '/home/vortex/build/tests/regression/mpi_vecadd' +Apptainer> +``` + + +### High-Level Summary of main.cpp + +#### MPI Setup + +Calls MPI_Init, gets the rank (MPI_Comm_rank) and world size (MPI_Comm_size). + +Each MPI rank prints its rank and total world_size. + +#### Argument Parsing + +Reads -n from the command line (number of elements in the vector). + +Rank 0 parses this value, then broadcasts it to all ranks with MPI_Bcast(&size, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD). + +This ensures every rank sees the same problem size. + +#### Data Partitioning + +Total work = size elements. + +Each rank computes its chunk: + +``` + // Compute local chunk + uint32_t chunk = (size + world_size - 1) / world_size; // ceil div + uint32_t start = rank * chunk; + uint32_t end = std::min(start + chunk, size); + uint32_t num_points = end - start; +``` + + +So if size=50 and np=8, each rank gets about 6–7 elements. + +#### Kernel Upload + Execution + +Each rank loads the Vortex kernel binary (mpi_vecadd) into its own Vortex instance. + +That’s why you see “Upload kernel binary” printed for every rank, not just once. + +Then each rank launches the kernel for its assigned portion of the data. 
+
+#### Performance Reporting
+
+After the kernel finishes, each rank prints Vortex perf stats (instrs, cycles, IPC).
+
+These numbers are per rank's Vortex instance, not shared across ranks.
+
+
+#### Verification
+
+Rank 0 gathers the partial results with MPI_Gatherv and validates the full output vector (checks that the vector addition is correct).
+
+Finally, the ranks finalize (MPI_Finalize).
\ No newline at end of file
diff --git a/tests/regression/mpi_vecadd/common.h b/tests/regression/mpi_vecadd/common.h
new file mode 100644
index 0000000000..b511332c11
--- /dev/null
+++ b/tests/regression/mpi_vecadd/common.h
@@ -0,0 +1,15 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#ifndef TYPE
+#define TYPE float
+#endif
+
+// Kernel arguments shared between host (main.cpp) and device (kernel.cpp).
+typedef struct {
+  uint32_t num_points;
+  uint64_t src0_addr;
+  uint64_t src1_addr;
+  uint64_t dst_addr;
+} kernel_arg_t;
+
+#endif
diff --git a/tests/regression/mpi_vecadd/kernel.cpp b/tests/regression/mpi_vecadd/kernel.cpp
new file mode 100644
index 0000000000..7774c970a7
--- /dev/null
+++ b/tests/regression/mpi_vecadd/kernel.cpp
@@ -0,0 +1,15 @@
+#include <vx_spawn.h>
+#include "common.h"
+
+// One device thread per output element: dst[i] = src0[i] + src1[i].
+void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
+  auto src0_ptr = reinterpret_cast<TYPE*>(arg->src0_addr);
+  auto src1_ptr = reinterpret_cast<TYPE*>(arg->src1_addr);
+  auto dst_ptr  = reinterpret_cast<TYPE*>(arg->dst_addr);
+
+  dst_ptr[blockIdx.x] = src0_ptr[blockIdx.x] + src1_ptr[blockIdx.x];
+}
+
+int main() {
+  kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
+  return vx_spawn_threads(1, &arg->num_points, nullptr, (vx_kernel_func_cb)kernel_body, arg);
+}
diff --git a/tests/regression/mpi_vecadd/main.cpp b/tests/regression/mpi_vecadd/main.cpp
new file mode 100644
index 0000000000..3bcd415192
--- /dev/null
+++ b/tests/regression/mpi_vecadd/main.cpp
@@ -0,0 +1,186 @@
+#include <iostream>
+#include <vector>
+#include <unistd.h>
+#include <mpi.h>
+#include <vortex.h>
+#include "common.h"
+
+#define FLOAT_ULP 6
+
+// Abort the whole MPI job (not just this rank) on any Vortex API failure.
+#define RT_CHECK(_expr)                                       \
+  do {                                                        \
+    int _ret = _expr;                                         \
+    if (0 == _ret)                                            \
+      break;                                                  \
+    printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);  \
cleanup();                                                \
+    MPI_Abort(MPI_COMM_WORLD, -1);                            \
+  } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+class Comparator {};
+
+// Integer comparison: exact equality; prints at most 100 mismatches.
+template <>
+class Comparator<int> {
+public:
+  static const char* type_str() { return "integer"; }
+  static int generate(uint32_t idx) { return rand(); }
+  static bool compare(int a, int b, int index, int errors) {
+    if (a != b && errors < 100) {
+      printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
+      return false;
+    }
+    return a == b;
+  }
+};
+
+// Float comparison: tolerate up to FLOAT_ULP units-in-the-last-place.
+template <>
+class Comparator<float> {
+public:
+  static const char* type_str() { return "float"; }
+  static float generate(uint32_t idx) { return static_cast<float>(rand()) / RAND_MAX; }
+  static bool compare(float a, float b, int index, int errors) {
+    union { float f; int i; } fa, fb;
+    fa.f = a; fb.f = b;
+    int d = std::abs(fa.i - fb.i);
+    if (d > FLOAT_ULP && errors < 100) {
+      printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a);
+      return false;
+    }
+    return d <= FLOAT_ULP;
+  }
+};
+
+const char* kernel_file = "kernel.vxbin";
+uint32_t size = 16;
+
+vx_device_h device = nullptr;
+vx_buffer_h src0_buffer = nullptr;
+vx_buffer_h src1_buffer = nullptr;
+vx_buffer_h dst_buffer = nullptr;
+vx_buffer_h krnl_buffer = nullptr;
+vx_buffer_h args_buffer = nullptr;
+kernel_arg_t kernel_arg = {};
+
+// Release all device buffers and close the device (safe if never opened).
+void cleanup() {
+  if (device) {
+    vx_mem_free(src0_buffer);
+    vx_mem_free(src1_buffer);
+    vx_mem_free(dst_buffer);
+    vx_mem_free(krnl_buffer);
+    vx_mem_free(args_buffer);
+    vx_dev_close(device);
+  }
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:k:h")) != -1) {
+    switch (c) {
+    case 'n': size = atoi(optarg); break;
+    case 'k': kernel_file = optarg; break;
+    case 'h': std::cout << "Usage: [-k kernel] [-n size] [-h help]\n"; exit(0);
+    default:  std::cout << "Usage: [-k kernel] [-n size] [-h help]\n"; exit(-1);
+    }
+  }
+}
+
+int main(int argc, char* argv[]) {
+  MPI_Init(&argc, &argv);
+  int rank,
world_size;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+  std::cout << "rank = " << rank << ", world_size = " << world_size << "\n";
+  if (rank == 0) parse_args(argc, argv);
+  MPI_Bcast(&size, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
+
+  // Rank 0 generates full input arrays
+  std::vector<TYPE> full_src0, full_src1;
+  if (rank == 0) {
+    std::srand(50);
+    full_src0.resize(size);
+    full_src1.resize(size);
+    for (uint32_t i = 0; i < size; i++) {
+      full_src0[i] = Comparator<TYPE>::generate(i);
+      full_src1[i] = Comparator<TYPE>::generate(i);
+    }
+  }
+
+  // Compute local chunk
+  uint32_t chunk = (size + world_size - 1) / world_size; // ceil div
+  uint32_t start = rank * chunk;
+  uint32_t end = std::min(start + chunk, size);
+  uint32_t num_points = end - start;
+
+  // Local buffers
+  std::vector<TYPE> h_src0(num_points);
+  std::vector<TYPE> h_src1(num_points);
+  std::vector<TYPE> h_dst(num_points);
+
+  // Scatter inputs
+  std::vector<int> recvcounts(world_size), displs(world_size);
+  for (int i = 0; i < world_size; i++) {
+    int s = i * chunk;
+    int e = std::min(s + chunk, size);
+    recvcounts[i] = e - s;
+    displs[i] = s;
+  }
+
+  // NOTE(review): MPI_FLOAT assumes TYPE == float; must change if TYPE does.
+  MPI_Scatterv(full_src0.data(), recvcounts.data(), displs.data(), MPI_FLOAT,
+               h_src0.data(), num_points, MPI_FLOAT, 0, MPI_COMM_WORLD);
+
+  MPI_Scatterv(full_src1.data(), recvcounts.data(), displs.data(), MPI_FLOAT,
+               h_src1.data(), num_points, MPI_FLOAT, 0, MPI_COMM_WORLD);
+
+  // Open device
+  RT_CHECK(vx_dev_open(&device));
+
+  uint32_t buf_size = num_points * sizeof(TYPE);
+  kernel_arg.num_points = num_points;
+
+  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer));
+  RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));
+  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer));
+  RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));
+  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
+  RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
+
RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size));
+  RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size));
+
+  std::cout << "Rank: " << rank << "- Upload kernel binary" << std::endl;
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
+  RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
+
+  // Run kernel
+  RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
+  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
+  RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
+
+  // Gather results
+  std::vector<TYPE> full_dst;
+  if (rank == 0) full_dst.resize(size);
+
+  // NOTE(review): MPI_FLOAT assumes TYPE == float; must change if TYPE does.
+  MPI_Gatherv(h_dst.data(), num_points, MPI_FLOAT,
+              full_dst.data(), recvcounts.data(), displs.data(), MPI_FLOAT,
+              0, MPI_COMM_WORLD);
+
+  // Verify (rank 0)
+  if (rank == 0) {
+    int errors = 0;
+    for (uint32_t i = 0; i < size; i++) {
+      auto ref = full_src0[i] + full_src1[i];
+      auto cur = full_dst[i];
+      if (!Comparator<TYPE>::compare(cur, ref, i, errors)) errors++;
+    }
+    if (errors) std::cout << "Found " << errors << " errors!\nFAILED!\n";
+    else std::cout << "PASSED!\n";
+  }
+
+  cleanup();
+  MPI_Finalize();
+  return 0;
+}