From d7aa97e7436ed2caf7c43adf2142e7bd50c9452d Mon Sep 17 00:00:00 2001
From: "OReilly, Ossian" <ooreilly@amd.com>
Date: Wed, 13 May 2026 20:05:34 -0500
Subject: [PATCH] Add preliminary 2-D support. Note: performance is currently
 1.2x slower than the 3-D version for the same size.

---
 hip/hipBlockCopy.h             |  33 ++-
 hip/hipCompress.cpp            | 129 ++++++---
 hip/hipCompress.h              |   5 +-
 hip/hipWaveletRLE2D.h          | 473 +++++++++++++++++++++++++++++++++
 makefile                       |   4 +
 tests/profile_2d_vs_3d.cpp     | 121 +++++++++
 tests/test_compress_2d_hip.cpp | 354 ++++++++++++++++++++++++
 7 files changed, 1069 insertions(+), 50 deletions(-)
 create mode 100644 hip/hipWaveletRLE2D.h
 create mode 100644 tests/profile_2d_vs_3d.cpp
 create mode 100644 tests/test_compress_2d_hip.cpp
diff --git a/hip/hipBlockCopy.h b/hip/hipBlockCopy.h
index 189a59c..93ce222 100644
--- a/hip/hipBlockCopy.h
+++ b/hip/hipBlockCopy.h
@@ -82,18 +82,20 @@ __global__ void copyToWaveletKernelOpt(
         }
     }
 
-    // ---- Phase 3: Store ZPB planes ----
+    // ---- Phase 3: Store ZPB planes (skip planes beyond wnz) ----
     if constexpr (DO_COPY) {
         uint32_t dst_byte = (gx + gy * wnx) * (uint32_t)sizeof(float);
 
         #pragma unroll
         for (int dz = 0; dz < BCOPY_ZPB; ++dz) {
-            auto plane_rsrc = __builtin_amdgcn_make_buffer_rsrc(
-                d_dst + (long)(z_start + dz) * wnx * wny,
-                0, -1, 0x00027000);
-            auto vi = __builtin_bit_cast(bcopy_int4_vec, regs[dz]);
-            __builtin_amdgcn_raw_buffer_store_b128(
-                vi, plane_rsrc, dst_byte, 0, SLC);
+            if (z_start + dz < wnz) {
+                auto plane_rsrc = __builtin_amdgcn_make_buffer_rsrc(
+                    d_dst + (long)(z_start + dz) * wnx * wny,
+                    0, -1, 0x00027000);
+                auto vi = __builtin_bit_cast(bcopy_int4_vec, regs[dz]);
+                __builtin_amdgcn_raw_buffer_store_b128(
+                    vi, plane_rsrc, dst_byte, 0, SLC);
+            }
         }
     }
 
@@ -174,15 +176,20 @@ __global__ void copyFromWaveletKernelOpt(
     uint32_t src_byte = (gx + gy * wnx) * (uint32_t)sizeof(float);
     long wav_plane = (long)wnx * wny;
 
+    constexpr bcopy_float4_vec zero_vec_from = {0.0f, 0.0f, 0.0f, 0.0f};
     bcopy_float4_vec regs[BCOPY_ZPB];
     #pragma unroll
     for (int dz = 0; dz < BCOPY_ZPB; ++dz) {
-        auto plane_rsrc = __builtin_amdgcn_make_buffer_rsrc(
-            const_cast<float*>(d_src + (long)(z_start + dz) * wav_plane),
-            0, -1, 0x00027000);
-        regs[dz] = __builtin_bit_cast(bcopy_float4_vec,
-            __builtin_amdgcn_raw_buffer_load_b128(
-                plane_rsrc, src_byte, 0, SLC));
+        if (z_start + dz < wnz) {
+            auto plane_rsrc = __builtin_amdgcn_make_buffer_rsrc(
+                const_cast<float*>(d_src + (long)(z_start + dz) * wav_plane),
+                0, -1, 0x00027000);
+            regs[dz] = __builtin_bit_cast(bcopy_float4_vec,
+                __builtin_amdgcn_raw_buffer_load_b128(
+                    plane_rsrc, src_byte, 0, SLC));
+        } else {
+            regs[dz] = zero_vec_from;
+        }
     }
 
     uint32_t dst_byte = ((y0 + gy) * ldimx + x0 + gx) * (uint32_t)sizeof(float);
diff --git a/hip/hipCompress.cpp b/hip/hipCompress.cpp
index 770be8a..29d001b 100644
--- a/hip/hipCompress.cpp
+++ b/hip/hipCompress.cpp
@@ -7,6 +7,7 @@
 #define DS79_INCLUDE_REG32
 #include "hipWaveletRLE.h"
 #include "hipWaveletRLEInverse.h"
+#include "hipWaveletRLE2D.h"
 #include "hipBlockCopy.h"
 
 #include <cmath>
@@ -67,20 +68,37 @@ hipError_t hipCompressCreatePlan(hipCompressPlan** plan, int nx, int ny, int nz,
 
     if (nx <= 0 || ny <= 0 || nz <= 0)
         PLAN_ERROR(p, HIP_COMPRESS_ERROR_INVALID_DIMENSIONS, hipErrorInvalidValue);
-    if (nx % 32 != 0 || ny % 32 != 0 || nz % 32 != 0)
-        PLAN_ERROR(p, HIP_COMPRESS_ERROR_NOT_MULTIPLE_OF_32, hipErrorInvalidValue);
+
+    bool is_2d = (nz == 1);
+
+    if (is_2d) {
+        if (nx % 32 != 0 || ny % 32 != 0)
+            PLAN_ERROR(p, HIP_COMPRESS_ERROR_NOT_MULTIPLE_OF_32, hipErrorInvalidValue);
+    } else {
+        if (nx % 32 != 0 || ny % 32 != 0 || nz % 32 != 0)
+            PLAN_ERROR(p, HIP_COMPRESS_ERROR_NOT_MULTIPLE_OF_32, hipErrorInvalidValue);
+    }
+
     if ((long)nx * (long)ny * (long)sizeof(float) > (1L << 32))
         PLAN_ERROR(p, HIP_COMPRESS_ERROR_PLANE_TOO_LARGE, hipErrorInvalidValue);
 
     p->kernel = kernel;
     p->nx = nx; p->ny = ny; p->nz = nz;
-    p->num_blocks = (nx / 32) * (ny / 32) * (nz / 32);
+    p->is_2d = is_2d;
     p->aux_stream = aux_stream;
     p->compress_pending = false;
     p->last_error = HIP_COMPRESS_ERROR_HIP_RUNTIME;
 
+    if (is_2d) {
+        p->num_blocks = (nx / 32) * (ny / 32);
+        p->scratch_slot_stride = WRLE2D_SLOT_BYTES;
+    } else {
+        p->num_blocks = (nx / 32) * (ny / 32) * (nz / 32);
+        p->scratch_slot_stride = 4L * WRLE_LDS_BYTES;
+    }
+
     int nb = p->num_blocks;
-    long scratch_size = (long)nb * 4L * WRLE_LDS_BYTES;
+    long scratch_size = (long)nb * (long)p->scratch_slot_stride;
 
     HIPCHECK_PLAN(p, hipMalloc(&p->d_scratch, scratch_size));
     HIPCHECK_PLAN(p, hipMalloc(&p->d_mulfac, sizeof(float)));
@@ -95,13 +113,15 @@ hipError_t hipCompressCreatePlan(hipCompressPlan** plan, int nx, int ny, int nz,
 
     HIPCHECK_PLAN(p, hipMalloc(&p->d_rms, sizeof(double)));
 
-    p->max_copy_blocks = (nx / 32) * (ny / 32) * (nz / BCOPY_ZPB);
+    int nz_for_copy = is_2d ? 1 : nz;
+    p->max_copy_blocks = (nx / 32) * (ny / 32)
+                       * ((nz_for_copy + BCOPY_ZPB - 1) / BCOPY_ZPB);
     HIPCHECK_PLAN(p, hipMalloc(&p->d_partial_sums, p->max_copy_blocks * sizeof(double)));
 
     HIPCHECK_PLAN(p, hipEventCreateWithFlags(&p->ready_event, hipEventDisableTiming));
     HIPCHECK_PLAN(p, hipHostMalloc(&p->h_staging, 2 * sizeof(size_t)));
 
-    // JIT warmup: exclusive_scan on aux_stream (no user stream available yet)
+    // JIT warmup: exclusive_scan on aux_stream
     err = rocprim::exclusive_scan(
         p->d_scan_temp, p->scan_temp_bytes,
         p->d_block_sizes, p->d_block_offsets, (size_t)0, (size_t)1,
@@ -149,7 +169,7 @@ hipError_t hipCompress(
         PLAN_ERROR(plan, HIP_COMPRESS_ERROR_INVALID_SCALE, hipErrorInvalidValue);
 
     const int nx = plan->nx, ny = plan->ny, nz = plan->nz;
-    const int ldimx = nx, ldimxy = nx * ny;
+    const int ldimx = nx;
     const int nb = plan->num_blocks;
     const int num_mulfacs = 1;
     const int hdr_size = hipCompressHeaderSize(nb, num_mulfacs);
@@ -157,17 +177,27 @@ hipError_t hipCompress(
     hipStream_t aux = plan->aux_stream;
 
     // 1. Fused wavelet + quantize + RLE → scratch (user_stream)
-    dim3 grid((nx + 31) / 32, (ny + 31) / 32, (nz + 31) / 32);
-    if (plan->kernel == HIP_COMPRESS_KERNEL_SEGRLE) {
-        waveletSegRLEFusedKernel<<<grid, dim3(256), 0, s>>>(
+    if (plan->is_2d) {
+        int nbx = nx / 32, nby = ny / 32;
+        dim3 grid((nbx + WRLE2D_TILES_PER_WG - 1) / WRLE2D_TILES_PER_WG, nby);
+        waveletRLE2DFusedKernel<<<grid, dim3(256), 0, s>>>(
             d_input, plan->d_scratch, plan->d_block_sizes,
-            scale, ldimx, ldimxy,
+            scale, ldimx, nbx,
             d_rms, plan->d_mulfac);
     } else {
-        waveletRLEFusedKernel<<<grid, dim3(256), 0, s>>>(
-            d_input, plan->d_scratch, plan->d_block_sizes,
-            scale, ldimx, ldimxy,
-            d_rms, plan->d_mulfac);
+        const int ldimxy = nx * ny;
+        dim3 grid((nx + 31) / 32, (ny + 31) / 32, (nz + 31) / 32);
+        if (plan->kernel == HIP_COMPRESS_KERNEL_SEGRLE) {
+            waveletSegRLEFusedKernel<<<grid, dim3(256), 0, s>>>(
+                d_input, plan->d_scratch, plan->d_block_sizes,
+                scale, ldimx, ldimxy,
+                d_rms, plan->d_mulfac);
+        } else {
+            waveletRLEFusedKernel<<<grid, dim3(256), 0, s>>>(
+                d_input, plan->d_scratch, plan->d_block_sizes,
+                scale, ldimx, ldimxy,
+                d_rms, plan->d_mulfac);
+        }
     }
 
     // 2. Exclusive scan for compaction offsets (user_stream)
@@ -181,10 +211,18 @@ hipError_t hipCompress(
     HIPCHECK_PLAN(plan, hipStreamWaitEvent(aux, plan->ready_event, 0));
 
     // 4. Compact + write header on aux_stream
-    wrleCompactKernel<<<nb, 256, 0, aux>>>(
-        plan->d_scratch, d_output + hdr_size,
-        plan->d_block_sizes, plan->d_block_offsets,
-        d_output, nb, num_mulfacs, plan->d_mulfac);
+    if (plan->is_2d) {
+        wrle2DCompactKernel<<<nb, 256, 0, aux>>>(
+            plan->d_scratch, d_output + hdr_size,
+            plan->d_block_sizes, plan->d_block_offsets,
+            d_output, nb, num_mulfacs, plan->d_mulfac,
+            plan->scratch_slot_stride);
+    } else {
+        wrleCompactKernel<<<nb, 256, 0, aux>>>(
+            plan->d_scratch, d_output + hdr_size,
+            plan->d_block_sizes, plan->d_block_offsets,
+            d_output, nb, num_mulfacs, plan->d_mulfac);
+    }
 
     // 5. Async readback on aux_stream
     HIPCHECK_PLAN(plan, hipMemcpyAsync(&plan->h_staging[0], plan->d_block_offsets + (nb - 1),
@@ -241,14 +279,19 @@ hipError_t hipCopyToWaveletLayout(
         PLAN_ERROR(plan, HIP_COMPRESS_ERROR_NULL_INPUT, hipErrorInvalidValue);
     if (!d_dst && !d_rms_out)
         PLAN_ERROR(plan, HIP_COMPRESS_ERROR_BOTH_OUTPUTS_NULL, hipErrorInvalidValue);
-    if (ex < 32 || ey < 32 || ez < 32)
-        PLAN_ERROR(plan, HIP_COMPRESS_ERROR_WINDOW_TOO_SMALL, hipErrorInvalidValue);
+    if (plan->is_2d) {
+        if (ex < 32 || ey < 32 || ez != 1)
+            PLAN_ERROR(plan, HIP_COMPRESS_ERROR_WINDOW_TOO_SMALL, hipErrorInvalidValue);
+    } else {
+        if (ex < 32 || ey < 32 || ez < 32)
+            PLAN_ERROR(plan, HIP_COMPRESS_ERROR_WINDOW_TOO_SMALL, hipErrorInvalidValue);
+    }
     if ((long)ldimxy * (long)sizeof(float) > (1L << 32))
         PLAN_ERROR(plan, HIP_COMPRESS_ERROR_PLANE_TOO_LARGE, hipErrorInvalidValue);
 
     int wnx = hipCompressWaveletDim(ex);
     int wny = hipCompressWaveletDim(ey);
-    int wnz = hipCompressWaveletDim(ez);
+    int wnz = plan->is_2d ? 1 : hipCompressWaveletDim(ez);
 
     if (wnx != plan->nx || wny != plan->ny || wnz != plan->nz)
         PLAN_ERROR(plan, HIP_COMPRESS_ERROR_EXTRACTION_DIMS_MISMATCH, hipErrorInvalidValue);
@@ -258,7 +301,7 @@ hipError_t hipCopyToWaveletLayout(
     bool do_rms  = (d_rms_out != nullptr);
     long total_samples = (long)ex * ey * ez;
 
-    dim3 grid(wnx / 32, wny / 32, wnz / BCOPY_ZPB);
+    dim3 grid(wnx / 32, wny / 32, (wnz + BCOPY_ZPB - 1) / BCOPY_ZPB);
     int total_blocks = grid.x * grid.y * grid.z;
 
     if (do_copy && do_rms) {
@@ -307,19 +350,24 @@ hipError_t hipCopyFromWaveletLayout(
         PLAN_ERROR(plan, HIP_COMPRESS_ERROR_NULL_INPUT, hipErrorInvalidValue);
     if (!d_dst)
         PLAN_ERROR(plan, HIP_COMPRESS_ERROR_NULL_OUTPUT, hipErrorInvalidValue);
-    if (ex < 32 || ey < 32 || ez < 32)
-        PLAN_ERROR(plan, HIP_COMPRESS_ERROR_WINDOW_TOO_SMALL, hipErrorInvalidValue);
+    if (plan->is_2d) {
+        if (ex < 32 || ey < 32 || ez != 1)
+            PLAN_ERROR(plan, HIP_COMPRESS_ERROR_WINDOW_TOO_SMALL, hipErrorInvalidValue);
+    } else {
+        if (ex < 32 || ey < 32 || ez < 32)
+            PLAN_ERROR(plan, HIP_COMPRESS_ERROR_WINDOW_TOO_SMALL, hipErrorInvalidValue);
+    }
     if ((long)ldimxy * (long)sizeof(float) > (1L << 32))
         PLAN_ERROR(plan, HIP_COMPRESS_ERROR_PLANE_TOO_LARGE, hipErrorInvalidValue);
 
     int wnx = hipCompressWaveletDim(ex);
     int wny = hipCompressWaveletDim(ey);
-    int wnz = hipCompressWaveletDim(ez);
+    int wnz = plan->is_2d ? 1 : hipCompressWaveletDim(ez);
 
     if (wnx != plan->nx || wny != plan->ny || wnz != plan->nz)
         PLAN_ERROR(plan, HIP_COMPRESS_ERROR_EXTRACTION_DIMS_MISMATCH, hipErrorInvalidValue);
 
-    dim3 grid(wnx / 32, wny / 32, wnz / BCOPY_ZPB);
+    dim3 grid(wnx / 32, wny / 32, (wnz + BCOPY_ZPB - 1) / BCOPY_ZPB);
     copyFromWaveletKernelOpt<<<grid, 256, 0, user_stream>>>(
         d_src, wnx, wny, wnz,
         d_dst, ldimx, ldimxy, x0, y0, z0, ex, ey, ez);
@@ -344,17 +392,26 @@ hipError_t hipDecompress(
         PLAN_ERROR(plan, HIP_COMPRESS_ERROR_NULL_OUTPUT, hipErrorInvalidValue);
 
     const int nx = plan->nx, ny = plan->ny, nz = plan->nz;
-    const int ldimx = nx, ldimxy = nx * ny;
+    const int ldimx = nx;
 
-    dim3 grid((nx + 31) / 32, (ny + 31) / 32, (nz + 31) / 32);
-    if (plan->kernel == HIP_COMPRESS_KERNEL_SEGRLE) {
-        waveletSegRLEInverseFusedKernel<<<grid, dim3(256), 0, user_stream>>>(
+    if (plan->is_2d) {
+        int nbx = nx / 32, nby = ny / 32;
+        dim3 grid((nbx + WRLE2D_TILES_PER_WG - 1) / WRLE2D_TILES_PER_WG, nby);
+        waveletRLE2DInverseFusedKernel<<<grid, dim3(256), 0, user_stream>>>(
             d_input, nullptr, nullptr,
-            d_output, 0.0f, ldimx, ldimxy, 1);
+            d_output, 0.0f, ldimx, nbx, 1);
     } else {
-        waveletRLEInverseFusedKernel<<<grid, dim3(256), 0, user_stream>>>(
-            d_input, nullptr, nullptr,
-            d_output, 0.0f, ldimx, ldimxy, 1);
+        const int ldimxy = nx * ny;
+        dim3 grid((nx + 31) / 32, (ny + 31) / 32, (nz + 31) / 32);
+        if (plan->kernel == HIP_COMPRESS_KERNEL_SEGRLE) {
+            waveletSegRLEInverseFusedKernel<<<grid, dim3(256), 0, user_stream>>>(
+                d_input, nullptr, nullptr,
+                d_output, 0.0f, ldimx, ldimxy, 1);
+        } else {
+            waveletRLEInverseFusedKernel<<<grid, dim3(256), 0, user_stream>>>(
+                d_input, nullptr, nullptr,
+                d_output, 0.0f, ldimx, ldimxy, 1);
+        }
     }
 
     hipError_t launch_err = hipGetLastError();
@@ -372,6 +429,6 @@ hipError_t hipCompressMaxOutputSize(const hipCompressPlan* plan, size_t* size)
     }
     plan->last_error = HIP_COMPRESS_SUCCESS;
     int hdr_size = hipCompressHeaderSize(plan->num_blocks, 1);
-    *size = (size_t)hdr_size + (size_t)plan->num_blocks * 4 * WRLE_LDS_BYTES;
+    *size = (size_t)hdr_size + (size_t)plan->num_blocks * plan->scratch_slot_stride;
     return hipSuccess;
 }
diff --git a/hip/hipCompress.h b/hip/hipCompress.h
index 11af240..1c15d1c 100644
--- a/hip/hipCompress.h
+++ b/hip/hipCompress.h
@@ -38,6 +38,8 @@ struct hipCompressPlan {
     hipCompressKernel kernel;
     int nx, ny, nz;
     int num_blocks;
+    bool is_2d;             // true when nz == 1
+    size_t scratch_slot_stride;  // per-block scratch slot size
 
     unsigned char* d_scratch;
     float* d_mulfac;
@@ -89,7 +91,8 @@ hipError_t hipCompressDestroyPlan(hipCompressPlan* plan);
 // Zero-fills the padding band (ex..wnx-1, etc.).
 // RMS is computed over the ex*ey*ez extraction window only.
 //
-// ex,ey,ez >= 32. Wavelet dims are derived internally: wnx = round32(ex), etc.
+// 3D: ex,ey,ez >= 32. Wavelet dims: wnx = round32(ex), etc.
+// 2D (nz=1): ex,ey >= 32, ez must be 1, wnz = 1.
 // Must equal plan dimensions exactly.
 //
 // d_dst may be NULL (RMS-only mode). d_rms_out may be NULL (copy-only mode).
diff --git a/hip/hipWaveletRLE2D.h b/hip/hipWaveletRLE2D.h
new file mode 100644
index 0000000..9e9230e
--- /dev/null
+++ b/hip/hipWaveletRLE2D.h
@@ -0,0 +1,473 @@
+// Copyright (C) 2025 Advanced Micro Devices, Inc.
+// Use of this source code is governed by an MIT-style license that can be
+// found in the LICENSE file or at https://opensource.org/licenses/MIT.
+
+#ifndef HIPWAVELET_RLE_2D_H
+#define HIPWAVELET_RLE_2D_H
+
+// Fused 2D wavelet + quantize + RLE encode/decode kernels.
+//
+// Each workgroup processes up to 32 x-adjacent 32x32 tiles in 4 batches of 8.
+// Per batch:  load 8 tiles → Y-transform (LDS) → transposed readback →
+//             X-transform (registers) → x-line RLE encode (scan+compact).
+//
+// Each thread encodes one 32-element x-row as a single RLE line.
+// Per-block layout: [32B x-line metadata] [RLE data].
+// 32 KB LDS reused across phases. Occupancy 2 on gfx942.
+
+#include <hip/hip_runtime.h>
+#include <rocprim/block/block_scan.hpp>
+#include <rocprim/device/device_scan.hpp>
+#include "ds79.h"
+#include "Run_Length_Escape_Codes.hxx"
+#include "hipRLEDecode.h"
+
+static constexpr int WRLE2D_TILES_PER_WG = 32;
+static constexpr int WRLE2D_BATCH        = 8;
+static constexpr int WRLE2D_META_BYTES   = 32;
+static constexpr int WRLE2D_SLOT_BYTES   = 8192;
+
+// RLE encode 32 scalar values from a float[32] array.
+// WRITE=false: count-only. WRITE=true: writes encoded bytes.
+template<bool WRITE>
+__device__ __forceinline__
+int wrle_xline(const float* values, float scale, unsigned char* dst)
+{
+    int bp = 0, rle = 0;
+    #pragma unroll
+    for (int i = 0; i < 32; ++i) {
+        float fval = scale * values[i];
+        int ival = (int)fval;
+        if (ival == 0) { ++rle; } else {
+            if constexpr (!WRITE) {
+                bool r1 = (rle == 1), rs = (rle >= 2) & (rle < 256);
+                int rb = (r1 ? 1 : (rs ? 2 : 4)) & -(rle > 0);
+                bp += rb;
+            } else {
+                if (rle == 1) {
+                    dst[bp] = 0; bp += 1;
+                } else if (rle >= 2 && rle < 256) {
+                    dst[bp] = (unsigned char)(RLESC1 & 0xFF);
+                    dst[bp+1] = (unsigned char)rle; bp += 2;
+                } else if (rle >= 256) {
+                    dst[bp] = (unsigned char)(RLESC3 & 0xFF);
+                    dst[bp+1] = (unsigned char)(rle & 0xFF);
+                    dst[bp+2] = (unsigned char)((rle >> 8) & 0xFF);
+                    dst[bp+3] = (unsigned char)((rle >> 16) & 0xFF); bp += 4;
+                }
+            }
+            rle = 0;
+            bool ib = (ival>VLESC2)&(ival<RLESC3), i16 = (ival>=-32768)&(ival<=32767);
+            bool i24 = (ival>=-8388608)&(ival<=8388607), f32 = !ib&!i16&!i24;
+            if constexpr (!WRITE) {
+                bp += ib ? 1 : i16 ? 3 : i24 ? 4 : 5;
+            } else {
+                unsigned u; __builtin_memcpy(&u, &fval, 4);
+                unsigned pay = f32 ? u : (unsigned)ival;
+                if (ib) {
+                    dst[bp] = (unsigned char)(signed char)ival;
+                } else if (i16) {
+                    dst[bp] = (unsigned char)(signed char)VLESC2;
+                    dst[bp+1] = (unsigned char)(pay & 0xFF);
+                    dst[bp+2] = (unsigned char)((pay >> 8) & 0xFF);
+                } else if (i24) {
+                    dst[bp] = (unsigned char)(signed char)VLESC3;
+                    dst[bp+1] = (unsigned char)(pay & 0xFF);
+                    dst[bp+2] = (unsigned char)((pay >> 8) & 0xFF);
+                    dst[bp+3] = (unsigned char)((pay >> 16) & 0xFF);
+                } else {
+                    dst[bp] = (unsigned char)(signed char)VLESC4;
+                    dst[bp+1] = (unsigned char)(pay & 0xFF);
+                    dst[bp+2] = (unsigned char)((pay >> 8) & 0xFF);
+                    dst[bp+3] = (unsigned char)((pay >> 16) & 0xFF);
+                    dst[bp+4] = (unsigned char)((pay >> 24) & 0xFF);
+                }
+                bp += ib ? 1 : i16 ? 3 : i24 ? 4 : 5;
+            }
+        }
+    }
+    if constexpr (!WRITE) {
+        bool r1=(rle==1),rs=(rle>=2)&(rle<256);
+        int rb=(r1?1:(rs?2:4))&-(rle>0);
+        bp+=rb;
+    } else {
+        if (rle == 1) {
+            dst[bp] = 0; bp += 1;
+        } else if (rle >= 2 && rle < 256) {
+            dst[bp] = (unsigned char)(RLESC1 & 0xFF);
+            dst[bp+1] = (unsigned char)rle; bp += 2;
+        } else if (rle >= 256) {
+            dst[bp] = (unsigned char)(RLESC3 & 0xFF);
+            dst[bp+1] = (unsigned char)(rle & 0xFF);
+            dst[bp+2] = (unsigned char)((rle >> 8) & 0xFF);
+            dst[bp+3] = (unsigned char)((rle >> 16) & 0xFF); bp += 4;
+        }
+    }
+    return bp;
+}
+
+// ---------------------------------------------------------------------------
+// Forward: 2D wavelet + quantize + RLE encode.
+// Grid: (ceil(nbx/32), nby) where nbx = nx/32, nby = ny/32.
+// ---------------------------------------------------------------------------
+__launch_bounds__(256, 2)
+__global__ void waveletRLE2DFusedKernel(
+    const float* __restrict__ input,
+    unsigned char* __restrict__ output,
+    size_t* __restrict__ block_sizes,
+    float scale,
+    int ldimx,
+    int nbx,
+    const double* __restrict__ d_rms,
+    float* __restrict__ d_mulfac_out)
+{
+    constexpr int NTHREADS = 256;
+    constexpr int SLC      = 2;
+    using BlockScan = rocprim::block_scan<int, NTHREADS>;
+
+    __shared__ union {
+        float wavelet[WRLE2D_BATCH * 1024];
+        struct {
+            typename BlockScan::storage_type scan;
+            int block_base_off[WRLE2D_BATCH];
+        };
+        unsigned char compact[WRLE_LDS_BYTES];
+    } lds;
+
+    int tid = threadIdx.x;
+
+    float mulfac;
+    if (d_rms != nullptr) {
+        float rms = (float)*d_rms;
+        float product = rms * scale;
+        mulfac = (product > 0.0f && __builtin_isfinite(1.0f / product))
+                 ? (1.0f / product) : 1.0f;
+        if (tid == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+            if (d_mulfac_out) *d_mulfac_out = mulfac;
+        }
+    } else {
+        mulfac = scale;
+    }
+
+    int wg_tile_x0 = blockIdx.x * WRLE2D_TILES_PER_WG;
+
+    for (int batch = 0; batch < 4; ++batch) {
+        int batch_tile_x0 = wg_tile_x0 + batch * WRLE2D_BATCH;
+
+        // ---- Load 8 tiles → LDS ----
+        int xg = tid % 8, yr = tid / 8;
+        for (int b = 0; b < WRLE2D_BATCH; ++b) {
+            int tile_bx = batch_tile_x0 + b;
+            int gx = tile_bx * 32 + xg * 4;
+            int gy = blockIdx.y * 32 + yr;
+            int x0 = xg * 4;
+            if (tile_bx < nbx) {
+                uint32_t byte_off = (gx + gy * ldimx) * (uint32_t)sizeof(float);
+                auto rsrc = __builtin_amdgcn_make_buffer_rsrc(
+                    const_cast<float*>(input), 0, -1, 0x00027000);
+                auto v = __builtin_bit_cast(
+                    __attribute__((__vector_size__(4 * sizeof(float)))) float,
+                    __builtin_amdgcn_raw_buffer_load_b128(rsrc, byte_off, 0, SLC));
+                lds.wavelet[b * 1024 + (x0+0) * 32 + (yr ^ (x0+0))] = v[0];
+                lds.wavelet[b * 1024 + (x0+1) * 32 + (yr ^ (x0+1))] = v[1];
+                lds.wavelet[b * 1024 + (x0+2) * 32 + (yr ^ (x0+2))] = v[2];
+                lds.wavelet[b * 1024 + (x0+3) * 32 + (yr ^ (x0+3))] = v[3];
+            } else {
+                lds.wavelet[b * 1024 + (x0+0) * 32 + (yr ^ (x0+0))] = 0.0f;
+                lds.wavelet[b * 1024 + (x0+1) * 32 + (yr ^ (x0+1))] = 0.0f;
+                lds.wavelet[b * 1024 + (x0+2) * 32 + (yr ^ (x0+2))] = 0.0f;
+                lds.wavelet[b * 1024 + (x0+3) * 32 + (yr ^ (x0+3))] = 0.0f;
+            }
+        }
+        __syncthreads();
+
+        // ---- Y-transform in LDS ----
+        {
+            int blk = tid / 32, pos = tid % 32;
+            float line[32];
+            for (int y = 0; y < 32; y++)
+                line[y] = lds.wavelet[blk * 1024 + pos * 32 + (y ^ pos)];
+            ds79_forward_reg32(line);
+            for (int y = 0; y < 32; y++)
+                lds.wavelet[blk * 1024 + pos * 32 + (y ^ pos)] = line[y];
+        }
+        __syncthreads();
+
+        // ---- Transposed readback + X-transform in registers ----
+        int blk = tid / 32;
+        int row = tid % 32;
+        int tile_bx = batch_tile_x0 + blk;
+        bool active = (tile_bx < nbx);
+
+        float xline[32];
+        if (active) {
+            for (int x = 0; x < 32; x++)
+                xline[x] = lds.wavelet[blk * 1024 + x * 32 + (row ^ x)];
+            ds79_forward_reg32(xline);
+        } else {
+            for (int x = 0; x < 32; x++)
+                xline[x] = 0.0f;
+        }
+        __syncthreads();
+
+        // ---- RLE encode ----
+        int my_count = active ? wrle_xline<false>(xline, mulfac, nullptr) : 0;
+
+        int global_bid = tile_bx + blockIdx.y * nbx;
+        // Write per-line metadata to global scratch
+        if (active) {
+            unsigned char* slot = output + (size_t)global_bid * WRLE2D_SLOT_BYTES;
+            slot[row] = (unsigned char)my_count;
+        }
+
+        int my_offset, pass_total;
+        BlockScan().exclusive_scan(my_count, my_offset, 0, pass_total, lds.scan);
+        __syncthreads();
+
+        // Store per-block base offsets (inside union, will be clobbered by compact)
+        if (tid % 32 == 0)
+            lds.block_base_off[tid / 32] = my_offset;
+        __syncthreads();
+
+        // Cache block boundaries in registers before compact clobbers them
+        int blk_start = lds.block_base_off[blk];
+        bool next_blk_active = (blk < WRLE2D_BATCH - 1)
+                             && (batch_tile_x0 + blk + 1 < nbx);
+        int blk_end = next_blk_active
+                    ? lds.block_base_off[blk + 1] : pass_total;
+        int blk_rle_bytes = blk_end - blk_start;
+        __syncthreads();
+
+        // Write encoded data to LDS compact buffer (clobbers block_base_off)
+        if (active)
+            wrle_xline<true>(xline, mulfac, lds.compact + my_offset);
+        __syncthreads();
+
+        // Copy per-block data from LDS → global scratch slot
+        if (active) {
+            unsigned char* slot = output + (size_t)global_bid * WRLE2D_SLOT_BYTES;
+            for (int i = row; i < blk_rle_bytes; i += 32) {
+                slot[WRLE2D_META_BYTES + i] = lds.compact[blk_start + i];
+            }
+            if (row == 0)
+                block_sizes[global_bid] = WRLE2D_META_BYTES + blk_rle_bytes;
+        }
+        __syncthreads();
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Inverse: RLE decode + dequantize + inverse wavelet 2D.
+// Grid: (ceil(nbx/32), nby).
+//
+// Phase-separated design with 2 macro-passes of 2 batches each.
+// Halves persistent register array (decoded[2][32] = 64 VGPRs vs 128)
+// to reduce scratch spilling while keeping phase-separation benefits.
+// ---------------------------------------------------------------------------
+__launch_bounds__(256, 2)
+__global__ void waveletRLE2DInverseFusedKernel(
+    const unsigned char* __restrict__ input,
+    const size_t* __restrict__ block_sizes,
+    const size_t* __restrict__ block_offsets,
+    float* __restrict__ output,
+    float inv_scale,
+    int ldimx,
+    int nbx,
+    int header_size)
+{
+    constexpr int NTHREADS = 256;
+    constexpr int SLC      = 2;
+    constexpr int MACRO_BATCHES = 2;
+    using BlockScan = rocprim::block_scan<int, NTHREADS>;
+    using f4vec = __attribute__((__vector_size__(4 * sizeof(float)))) float;
+
+    __shared__ union {
+        float wavelet[WRLE2D_BATCH * 1024];
+        struct {
+            typename BlockScan::storage_type scan;
+            int block_base_off[WRLE2D_BATCH];
+        };
+    } lds;
+
+    int tid = threadIdx.x;
+    int wg_tile_x0 = blockIdx.x * WRLE2D_TILES_PER_WG;
+    int blk = tid / 32;
+    int row = tid % 32;
+
+    float actual_inv_scale;
+    const unsigned char* data_base;
+
+    if (header_size != 0) {
+        const int* hdr = (const int*)input;
+        int num_blocks = hdr[0];
+        int num_mulfacs = hdr[1];
+        const float* mulfacs = (const float*)(input + 8 + 8 * num_blocks);
+        actual_inv_scale = 1.0f / mulfacs[0];
+        data_base = input + 8 + 8 * num_blocks + 4 * num_mulfacs;
+    } else {
+        actual_inv_scale = inv_scale;
+        data_base = input;
+    }
+
+    for (int mp = 0; mp < 4 / MACRO_BATCHES; ++mp) {
+
+        float decoded[MACRO_BATCHES][32];
+
+        // ---- Decode + X-transform (MACRO_BATCHES passes) ----
+        for (int b = 0; b < MACRO_BATCHES; ++b) {
+            int batch = mp * MACRO_BATCHES + b;
+            int batch_tile_x0 = wg_tile_x0 + batch * WRLE2D_BATCH;
+            int tile_bx = batch_tile_x0 + blk;
+            int global_bid = tile_bx + blockIdx.y * nbx;
+            bool active = (tile_bx < nbx);
+
+            int my_bytes = 0;
+            const unsigned char* rle_data = nullptr;
+
+            if (active) {
+                const unsigned char* block_in;
+                if (header_size != 0) {
+                    block_in = data_base + ((const size_t*)(input + 8))[global_bid];
+                } else if (block_offsets != nullptr) {
+                    block_in = data_base + block_offsets[global_bid];
+                } else {
+                    block_in = data_base + (size_t)global_bid * WRLE2D_SLOT_BYTES;
+                }
+                rle_data = block_in + WRLE2D_META_BYTES;
+                my_bytes = (int)block_in[row];
+            }
+
+            int my_offset, pass_total;
+            BlockScan().exclusive_scan(my_bytes, my_offset, 0, pass_total, lds.scan);
+            __syncthreads();
+
+            if (tid % 32 == 0)
+                lds.block_base_off[tid / 32] = my_offset;
+            __syncthreads();
+
+            if (active) {
+                int blk_base = lds.block_base_off[blk];
+                wrle_zline_decode(rle_data + my_offset - blk_base,
+                                  my_bytes, actual_inv_scale, decoded[b]);
+                us79_inverse_reg32(decoded[b]);
+            } else {
+                #pragma unroll
+                for (int x = 0; x < 32; x++) decoded[b][x] = 0.0f;
+            }
+        }
+
+        __syncthreads();
+
+        // ---- Y-transform + store (MACRO_BATCHES passes) ----
+        for (int b = 0; b < MACRO_BATCHES; ++b) {
+            int batch = mp * MACRO_BATCHES + b;
+            int batch_tile_x0 = wg_tile_x0 + batch * WRLE2D_BATCH;
+
+            for (int x = 0; x < 32; x++)
+                lds.wavelet[blk * 1024 + x * 32 + (row ^ x)] = decoded[b][x];
+            __syncthreads();
+
+            {
+                int pl  = tid / 32;
+                int pos = tid % 32;
+                float line[32];
+                for (int y = 0; y < 32; y++)
+                    line[y] = lds.wavelet[pl * 1024 + pos * 32 + (y ^ pos)];
+                us79_inverse_reg32(line);
+                for (int y = 0; y < 32; y++)
+                    lds.wavelet[pl * 1024 + pos * 32 + (y ^ pos)] = line[y];
+            }
+            __syncthreads();
+
+            f4vec store_regs[WRLE2D_BATCH];
+            {
+                int xg = tid % 8, yr = tid / 8;
+                int x0 = xg * 4;
+                #pragma unroll
+                for (int s = 0; s < WRLE2D_BATCH; ++s) {
+                    store_regs[s][0] = lds.wavelet[s * 1024 + (x0+0) * 32 + (yr ^ (x0+0))];
+                    store_regs[s][1] = lds.wavelet[s * 1024 + (x0+1) * 32 + (yr ^ (x0+1))];
+                    store_regs[s][2] = lds.wavelet[s * 1024 + (x0+2) * 32 + (yr ^ (x0+2))];
+                    store_regs[s][3] = lds.wavelet[s * 1024 + (x0+3) * 32 + (yr ^ (x0+3))];
+                }
+            }
+            __syncthreads();
+
+            {
+                int xg = tid % 8, yr = tid / 8;
+                #pragma unroll
+                for (int s = 0; s < WRLE2D_BATCH; ++s) {
+                    int tb = batch_tile_x0 + s;
+                    if (tb >= nbx) continue;
+                    int gx = tb * 32 + xg * 4;
+                    int gy = blockIdx.y * 32 + yr;
+                    uint32_t byte_off = (gx + gy * ldimx) * (uint32_t)sizeof(float);
+                    auto rsrc = __builtin_amdgcn_make_buffer_rsrc(
+                        output, 0, -1, 0x00027000);
+                    auto vi = __builtin_bit_cast(
+                        __attribute__((__vector_size__(4 * sizeof(int)))) int, store_regs[s]);
+                    __builtin_amdgcn_raw_buffer_store_b128(vi, rsrc, byte_off, 0, SLC);
+                }
+            }
+            __syncthreads();
+        }
+    }
+}
+
+// Output compaction for 2D blocks. Same as wrleCompactKernel but with
+// configurable slot stride.
+__global__ void wrle2DCompactKernel(
+    const unsigned char* __restrict__ src,
+    unsigned char* __restrict__ dst,
+    const size_t* __restrict__ block_sizes,
+    const size_t* __restrict__ offsets,
+    unsigned char* __restrict__ hdr,
+    int num_blocks,
+    int num_mulfacs,
+    const float* __restrict__ d_mulfac,
+    size_t slot_stride)
+{
+    int bid = blockIdx.x;
+    int tid = threadIdx.x;
+    size_t size = block_sizes[bid];
+    size_t dst_off = offsets[bid];
+    size_t src_off = (size_t)bid * slot_stride;
+
+    if (hdr != nullptr && tid == 0) {
+        ((size_t*)(hdr + 8))[bid] = offsets[bid];
+        if (bid == 0) {
+            ((int*)hdr)[0] = num_blocks;
+            ((int*)hdr)[1] = num_mulfacs;
+            float* mf_dst = (float*)(hdr + 8 + 8 * num_blocks);
+            for (int i = 0; i < num_mulfacs; ++i)
+                mf_dst[i] = d_mulfac[i];
+        }
+    }
+
+    for (size_t i = tid * 4; i < size; i += blockDim.x * 4) {
+        unsigned val;
+        __builtin_memcpy(&val, src + src_off + i, 4);
+        size_t remain = size - i;
+        if (remain >= 4) {
+            __builtin_memcpy(dst + dst_off + i, &val, 4);
+        } else {
+            for (size_t b = 0; b < remain; ++b)
+                dst[dst_off + i + b] = (unsigned char)(val >> (b * 8));
+        }
+    }
+}
+
+inline int hipCompressHeaderSize2D(int num_blocks, int num_mulfacs)
+{
+    return 8 + 8 * num_blocks + 4 * num_mulfacs;
+}
+
+inline hipError_t hipWaveletRLE2DCompactScanTempSize(int nblocks, size_t* scan_temp_bytes)
+{
+    return rocprim::exclusive_scan(
+        nullptr, *scan_temp_bytes,
+        (size_t*)nullptr, (size_t*)nullptr, (size_t)0, (size_t)nblocks,
+        rocprim::plus<size_t>());
+}
+
+#endif // HIPWAVELET_RLE_2D_H
diff --git a/makefile b/makefile
index 4c45e77..02659bb 100644
--- a/makefile
+++ b/makefile
@@ -113,6 +113,10 @@ test_wavelet_rle_fused_hip: tests/test_wavelet_rle_fused_hip.cpp hip/hipWaveletR
 test_compress_api_hip: tests/test_compress_api_hip.cpp hip/hipCompress.cpp hip/hipCompress.h hip/hipBlockCopy.h hip/hipWaveletRLE.h hip/hipWaveletRLEInverse.h hip/ds79.h hip/us79_reg32.inc hip/ds79_reg32.inc libcvxcompress.$(LIB_EXT) | $(BUILDDIR)
 	$(HIPCC) $(HIPCFLAGS) --offload-arch=$(HIP_ARCH) -mllvm -unroll-threshold=10000 -I. -Ihip -Itests tests/test_compress_api_hip.cpp hip/hipCompress.cpp -L. -lcvxcompress -lm -o $(BUILDDIR)/test_compress_api_hip
 
+# 2D compression test
+test_compress_2d_hip: tests/test_compress_2d_hip.cpp hip/hipCompress.cpp hip/hipCompress.h hip/hipBlockCopy.h hip/hipWaveletRLE.h hip/hipWaveletRLEInverse.h hip/hipWaveletRLE2D.h hip/ds79.h hip/us79_reg32.inc hip/ds79_reg32.inc | $(BUILDDIR)
+	$(HIPCC) $(HIPCFLAGS) --offload-arch=$(HIP_ARCH) -mllvm -unroll-threshold=10000 -I. -Ihip -Itests tests/test_compress_2d_hip.cpp hip/hipCompress.cpp -lm -o $(BUILDDIR)/test_compress_2d_hip
+
 # Async pipeline example (for profiling)
 example_async_pipeline: tests/example_async_pipeline.cpp hip/hipCompress.cpp hip/hipCompress.h hip/hipBlockCopy.h hip/hipWaveletRLE.h hip/hipWaveletRLEInverse.h hip/ds79.h hip/us79_reg32.inc hip/ds79_reg32.inc | $(BUILDDIR)
 	$(HIPCC) $(HIPCFLAGS) --offload-arch=$(HIP_ARCH) -mllvm -unroll-threshold=10000 -I. -Ihip -Itests tests/example_async_pipeline.cpp hip/hipCompress.cpp -lm -o $(BUILDDIR)/example_async_pipeline
diff --git a/tests/profile_2d_vs_3d.cpp b/tests/profile_2d_vs_3d.cpp
new file mode 100644
index 0000000..3098909
--- /dev/null
+++ b/tests/profile_2d_vs_3d.cpp
@@ -0,0 +1,121 @@
+// Isolated decompress-only benchmark for profiling 2D vs 3D inverse kernels.
+// Compresses once, then runs decompress N times for profiler capture.
+
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include <hip/hip_runtime.h>
+#include "hipCompress.h"
+
+#define HIPCHECK(cmd) do { \
+    hipError_t e = (cmd); \
+    if (e != hipSuccess) { \
+        fprintf(stderr, "HIP error %s at %s:%d\n", hipGetErrorString(e), __FILE__, __LINE__); \
+        exit(1); \
+    } \
+} while(0)
+
+__global__ void initSin2D(float* data, int nx, int ny, float kx, float ky)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= nx * ny) return;
+    int iy = idx / nx, ix = idx % nx;
+    data[idx] = sinf(kx * (float)ix / nx) * sinf(ky * (float)iy / ny);
+}
+
+__global__ void initSin3D(float* data, int nx, int ny, int nz, float kx, float ky, float kz)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= nx * ny * nz) return;
+    int iz = idx / (nx * ny);
+    int rem = idx - iz * nx * ny;
+    int iy = rem / nx, ix = rem % nx;
+    data[idx] = sinf(kx * (float)ix / nx) * sinf(ky * (float)iy / ny) * sinf(kz * (float)iz / nz);
+}
+
+int main(int argc, char** argv)
+{
+    int iters = 20;
+    if (argc > 1) iters = atoi(argv[1]);
+
+    const float scale = 5e-2f;
+
+    // ---- 3D: 256^3 ----
+    {
+        const int N = 256;
+        long total = (long)N * N * N;
+        hipCompressPlan* plan = nullptr;
+        HIPCHECK(hipCompressCreatePlan(&plan, N, N, N, 0));
+
+        float *d_in, *d_out;
+        unsigned char* d_comp;
+        HIPCHECK(hipMalloc(&d_in, total * sizeof(float)));
+        HIPCHECK(hipMalloc(&d_out, total * sizeof(float)));
+        size_t comp_sz; HIPCHECK(hipCompressMaxOutputSize(plan, &comp_sz));
+        HIPCHECK(hipMalloc(&d_comp, comp_sz));
+
+        initSin3D<<<(total+255)/256, 256>>>(d_in, N, N, N, 20.f, 20.f, 20.f);
+        HIPCHECK(hipDeviceSynchronize());
+
+        long len; float cr;
+        HIPCHECK(hipCompress(scale, nullptr, d_in, d_comp, plan, 0));
+        HIPCHECK(hipCompressSynchronize(plan, &len, &cr));
+        printf("3D 256^3: CR=%.1f, %ld bytes\n", cr, len);
+
+        // warmup
+        for (int i = 0; i < 5; i++) {
+            HIPCHECK(hipDecompress(d_comp, d_out, plan, 0));
+            HIPCHECK(hipDeviceSynchronize());
+        }
+
+        // timed region for profiler
+        printf("3D decompress: %d iterations\n", iters);
+        for (int i = 0; i < iters; i++) {
+            HIPCHECK(hipDecompress(d_comp, d_out, plan, 0));
+            HIPCHECK(hipDeviceSynchronize());
+        }
+
+        hipFree(d_in); hipFree(d_out); hipFree(d_comp);
+        hipCompressDestroyPlan(plan);
+    }
+
+    // ---- 2D: 4064x4064 (similar total floats) ----
+    {
+        const int NX = 4064, NY = 4064;
+        long total = (long)NX * NY;
+        hipCompressPlan* plan = nullptr;
+        HIPCHECK(hipCompressCreatePlan(&plan, NX, NY, 1, 0));
+
+        float *d_in, *d_out;
+        unsigned char* d_comp;
+        HIPCHECK(hipMalloc(&d_in, total * sizeof(float)));
+        HIPCHECK(hipMalloc(&d_out, total * sizeof(float)));
+        size_t comp_sz; HIPCHECK(hipCompressMaxOutputSize(plan, &comp_sz));
+        HIPCHECK(hipMalloc(&d_comp, comp_sz));
+
+        initSin2D<<<(total+255)/256, 256>>>(d_in, NX, NY, 20.f, 20.f);
+        HIPCHECK(hipDeviceSynchronize());
+
+        long len; float cr;
+        HIPCHECK(hipCompress(scale, nullptr, d_in, d_comp, plan, 0));
+        HIPCHECK(hipCompressSynchronize(plan, &len, &cr));
+        printf("2D %dx%d: CR=%.1f, %ld bytes\n", NX, NY, cr, len);
+
+        for (int i = 0; i < 5; i++) {
+            HIPCHECK(hipDecompress(d_comp, d_out, plan, 0));
+            HIPCHECK(hipDeviceSynchronize());
+        }
+
+        printf("2D decompress: %d iterations\n", iters);
+        for (int i = 0; i < iters; i++) {
+            HIPCHECK(hipDecompress(d_comp, d_out, plan, 0));
+            HIPCHECK(hipDeviceSynchronize());
+        }
+
+        hipFree(d_in); hipFree(d_out); hipFree(d_comp);
+        hipCompressDestroyPlan(plan);
+    }
+
+    printf("Done.\n");
+    return 0;
+}
diff --git a/tests/test_compress_2d_hip.cpp b/tests/test_compress_2d_hip.cpp
new file mode 100644
index 0000000..0fd39bd
--- /dev/null
+++ b/tests/test_compress_2d_hip.cpp
@@ -0,0 +1,354 @@
+// 2D compression round-trip test.
+// Tests hipCompress API with nz=1 (2D mode):
+//   1. Plan lifecycle with nz=1
+//   2. Compress/decompress round-trip on a 2D sinusoidal field
+//   3. Copy-to/from wavelet layout with ez=1
+//   4. Various 2D grid sizes (including non-32-aligned extraction)
+
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include <cstring>
+#include <vector>
+#include <hip/hip_runtime.h>
+#include "hipCompress.h"
+
+#define HIPCHECK(cmd) do { \
+    hipError_t e = (cmd); \
+    if (e != hipSuccess) { \
+        fprintf(stderr, "HIP error %s at %s:%d\n", hipGetErrorString(e), __FILE__, __LINE__); \
+        exit(1); \
+    } \
+} while(0)
+
+__global__ void initSin2DKernel(float* data, int nx, int ny, float kx, float ky)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int total = nx * ny;
+    if (idx >= total) return;
+    int iy = idx / nx;
+    int ix = idx - iy * nx;
+    float x = (float)ix / (float)nx;
+    float y = (float)iy / (float)ny;
+    data[idx] = sinf(kx * x) * sinf(ky * y);
+}
+
+static float hostRMS(const float* data, int n)
+{
+    double sum = 0.0;
+    for (int i = 0; i < n; ++i) sum += (double)data[i] * (double)data[i];
+    return (float)sqrt(sum / n);
+}
+
+static float maxAbsError(const float* a, const float* b, int n)
+{
+    float mx = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        float d = fabsf(a[i] - b[i]);
+        if (d > mx) mx = d;
+    }
+    return mx;
+}
+
+static hipError_t deviceRMS2D(const float* d_input, int nx, int ny,
+                               double* d_rms, hipCompressPlan* plan,
+                               hipStream_t s = 0)
+{
+    return hipCopyToWaveletLayout(
+        d_input, nx, nx,
+        0, 0, 0, nx, ny, 1,
+        nullptr, d_rms, plan, s);
+}
+
+static hipError_t compressWithAutoRMS2D(float scale, const float* d_input,
+                                         unsigned char* d_output, long* len,
+                                         float* cr, hipCompressPlan* plan,
+                                         hipStream_t s = 0)
+{
+    int nx = plan->nx, ny = plan->ny;
+    hipError_t err = deviceRMS2D(d_input, nx, ny, plan->d_rms, plan, s);
+    if (err != hipSuccess) return err;
+    err = hipCompress(scale, plan->d_rms, d_input, d_output, plan, s);
+    if (err != hipSuccess) return err;
+    return hipCompressSynchronize(plan, len, cr);
+}
+
+static bool test_plan_lifecycle_2d()
+{
+    printf("Test 1: 2D plan lifecycle\n");
+    hipCompressPlan* plan = nullptr;
+
+    hipError_t err = hipCompressCreatePlan(&plan, 128, 128, 1, 0);
+    if (err != hipSuccess || !plan) {
+        printf("  FAIL: create 128x128 2D plan: %s\n", hipGetErrorString(err));
+        return false;
+    }
+    if (!plan->is_2d) { printf("  FAIL: is_2d not set\n"); hipCompressDestroyPlan(plan); return false; }
+    if (plan->num_blocks != 4 * 4) {
+        printf("  FAIL: num_blocks=%d expected=%d\n", plan->num_blocks, 16);
+        hipCompressDestroyPlan(plan); return false;
+    }
+    printf("  128x128: num_blocks=%d: PASS\n", plan->num_blocks);
+
+    size_t max_sz = 0;
+    HIPCHECK(hipCompressMaxOutputSize(plan, &max_sz));
+    printf("  MaxOutputSize=%zu\n", max_sz);
+    hipCompressDestroyPlan(plan);
+
+    // Reject nx not multiple of 32
+    plan = nullptr;
+    err = hipCompressCreatePlan(&plan, 100, 128, 1, 0);
+    if (err == hipSuccess) { printf("  FAIL: should reject nx=100\n"); hipCompressDestroyPlan(plan); return false; }
+    printf("  reject non-32-multiple: PASS\n");
+
+    // 256x256
+    plan = nullptr;
+    err = hipCompressCreatePlan(&plan, 256, 256, 1, 0);
+    if (err != hipSuccess) { printf("  FAIL: create 256x256\n"); return false; }
+    if (plan->num_blocks != 8 * 8) { printf("  FAIL: num_blocks\n"); hipCompressDestroyPlan(plan); return false; }
+    printf("  256x256: num_blocks=%d: PASS\n", plan->num_blocks);
+    hipCompressDestroyPlan(plan);
+
+    return true;
+}
+
+static bool test_round_trip_2d()
+{
+    printf("Test 2: 2D compress/decompress round-trip\n");
+    const int NX = 128, NY = 128, total = NX * NY;
+    const float scale = 5e-2f;
+
+    hipCompressPlan* plan = nullptr;
+    HIPCHECK(hipCompressCreatePlan(&plan, NX, NY, 1, 0));
+
+    float* d_input = nullptr;
+    float* d_output = nullptr;
+    unsigned char* d_compressed = nullptr;
+    HIPCHECK(hipMalloc(&d_input, total * sizeof(float)));
+    HIPCHECK(hipMalloc(&d_output, total * sizeof(float)));
+
+    size_t comp_size = 0;
+    HIPCHECK(hipCompressMaxOutputSize(plan, &comp_size));
+    HIPCHECK(hipMalloc(&d_compressed, comp_size));
+
+    int threads = 256, blocks = (total + threads - 1) / threads;
+    initSin2DKernel<<<blocks, threads>>>(d_input, NX, NY, 20.0f, 20.0f);
+    HIPCHECK(hipDeviceSynchronize());
+
+    long compressed_length = 0;
+    float cr = 0;
+    HIPCHECK(compressWithAutoRMS2D(scale, d_input, d_compressed, &compressed_length, &cr, plan));
+    if (cr <= 1.0f || compressed_length <= 0) {
+        printf("  FAIL: cr=%.2f compressed_length=%ld\n", cr, compressed_length);
+        return false;
+    }
+    printf("  compress: CR=%.2f, %ld bytes\n", cr, compressed_length);
+
+    HIPCHECK(hipDecompress(d_compressed, d_output, plan, 0));
+
+    std::vector<float> h_in(total), h_out(total);
+    HIPCHECK(hipMemcpy(h_in.data(), d_input, total * sizeof(float), hipMemcpyDeviceToHost));
+    HIPCHECK(hipMemcpy(h_out.data(), d_output, total * sizeof(float), hipMemcpyDeviceToHost));
+
+    float rms = hostRMS(h_in.data(), total);
+    float max_err = maxAbsError(h_in.data(), h_out.data(), total);
+    float rel_err = max_err / rms;
+    printf("  decompress: max_err=%.6e, rms=%.6e, rel_max_err=%.6e\n", max_err, rms, rel_err);
+
+    bool pass = (max_err < rms) && (cr > 1.0f);
+    printf("  round-trip: %s\n", pass ? "PASS" : "FAIL");
+
+    hipFree(d_input); hipFree(d_output); hipFree(d_compressed);
+    hipCompressDestroyPlan(plan);
+    return pass;
+}
+
+static bool test_round_trip_2d_large()
+{
+    printf("Test 3: 2D round-trip 512x512\n");
+    const int NX = 512, NY = 512, total = NX * NY;
+    const float scale = 5e-2f;
+
+    hipCompressPlan* plan = nullptr;
+    HIPCHECK(hipCompressCreatePlan(&plan, NX, NY, 1, 0));
+
+    float* d_input = nullptr;
+    float* d_output = nullptr;
+    unsigned char* d_compressed = nullptr;
+    HIPCHECK(hipMalloc(&d_input, total * sizeof(float)));
+    HIPCHECK(hipMalloc(&d_output, total * sizeof(float)));
+
+    size_t comp_size = 0;
+    HIPCHECK(hipCompressMaxOutputSize(plan, &comp_size));
+    HIPCHECK(hipMalloc(&d_compressed, comp_size));
+
+    int threads = 256, blocks = (total + threads - 1) / threads;
+    initSin2DKernel<<<blocks, threads>>>(d_input, NX, NY, 40.0f, 40.0f);
+    HIPCHECK(hipDeviceSynchronize());
+
+    long compressed_length = 0;
+    float cr = 0;
+    HIPCHECK(compressWithAutoRMS2D(scale, d_input, d_compressed, &compressed_length, &cr, plan));
+    printf("  compress: CR=%.2f, %ld bytes\n", cr, compressed_length);
+
+    HIPCHECK(hipDecompress(d_compressed, d_output, plan, 0));
+
+    std::vector<float> h_in(total), h_out(total);
+    HIPCHECK(hipMemcpy(h_in.data(), d_input, total * sizeof(float), hipMemcpyDeviceToHost));
+    HIPCHECK(hipMemcpy(h_out.data(), d_output, total * sizeof(float), hipMemcpyDeviceToHost));
+
+    float rms = hostRMS(h_in.data(), total);
+    float max_err = maxAbsError(h_in.data(), h_out.data(), total);
+    printf("  decompress: max_err=%.6e, rms=%.6e, rel_max_err=%.6e\n", max_err, rms, max_err / rms);
+
+    bool pass = (max_err < rms) && (cr > 1.0f);
+    printf("  round-trip: %s\n", pass ? "PASS" : "FAIL");
+
+    hipFree(d_input); hipFree(d_output); hipFree(d_compressed);
+    hipCompressDestroyPlan(plan);
+    return pass;
+}
+
+static bool test_copy_to_from_2d()
+{
+    printf("Test 4: 2D copy-to/copy-from wavelet layout\n");
+    const int EX = 100, EY = 80;
+    const int WNX = ((EX + 31) / 32) * 32;   // 128
+    const int WNY = ((EY + 31) / 32) * 32;   // 96
+    const int src_total = EX * EY;
+    const int wav_total = WNX * WNY;
+
+    hipCompressPlan* plan = nullptr;
+    HIPCHECK(hipCompressCreatePlan(&plan, WNX, WNY, 1, 0));
+
+    float* d_src = nullptr;
+    float* d_wav = nullptr;
+    float* d_dst = nullptr;
+    double* d_rms = nullptr;
+    HIPCHECK(hipMalloc(&d_src, EX * EY * sizeof(float)));
+    HIPCHECK(hipMalloc(&d_wav, wav_total * sizeof(float)));
+    HIPCHECK(hipMalloc(&d_dst, EX * EY * sizeof(float)));
+    HIPCHECK(hipMalloc(&d_rms, sizeof(double)));
+
+    int threads = 256, blocks = (src_total + threads - 1) / threads;
+    initSin2DKernel<<<blocks, threads>>>(d_src, EX, EY, 10.0f, 10.0f);
+    HIPCHECK(hipDeviceSynchronize());
+
+    HIPCHECK(hipCopyToWaveletLayout(
+        d_src, EX, EX,
+        0, 0, 0, EX, EY, 1,
+        d_wav, d_rms, plan, 0));
+
+    HIPCHECK(hipCopyFromWaveletLayout(
+        d_wav, d_dst, EX, EX,
+        0, 0, 0, EX, EY, 1,
+        plan, 0));
+    HIPCHECK(hipDeviceSynchronize());
+
+    std::vector<float> h_src(src_total), h_dst(src_total);
+    HIPCHECK(hipMemcpy(h_src.data(), d_src, src_total * sizeof(float), hipMemcpyDeviceToHost));
+    HIPCHECK(hipMemcpy(h_dst.data(), d_dst, src_total * sizeof(float), hipMemcpyDeviceToHost));
+
+    float max_err = maxAbsError(h_src.data(), h_dst.data(), src_total);
+    printf("  copy round-trip max_err=%.6e\n", max_err);
+
+    double h_rms;
+    HIPCHECK(hipMemcpy(&h_rms, d_rms, sizeof(double), hipMemcpyDeviceToHost));
+    float cpu_rms = hostRMS(h_src.data(), src_total);
+    float rms_err = fabsf((float)h_rms - cpu_rms);
+    printf("  GPU RMS=%.6e, CPU RMS=%.6e, diff=%.6e\n", (float)h_rms, cpu_rms, rms_err);
+
+    bool pass = (max_err < 1e-6f) && (rms_err < 1e-4f);
+    printf("  copy-to/from 2D: %s\n", pass ? "PASS" : "FAIL");
+
+    hipFree(d_src); hipFree(d_wav); hipFree(d_dst); hipFree(d_rms);
+    hipCompressDestroyPlan(plan);
+    return pass;
+}
+
+static bool test_round_trip_with_copy_2d()
+{
+    printf("Test 5: 2D full pipeline (copy-to → compress → decompress → copy-from)\n");
+    const int EX = 100, EY = 80;
+    const int WNX = ((EX + 31) / 32) * 32;
+    const int WNY = ((EY + 31) / 32) * 32;
+    const float scale = 5e-2f;
+
+    hipCompressPlan* plan = nullptr;
+    HIPCHECK(hipCompressCreatePlan(&plan, WNX, WNY, 1, 0));
+
+    float* d_src = nullptr;
+    float* d_wav = nullptr;
+    float* d_dec = nullptr;
+    float* d_dst = nullptr;
+    unsigned char* d_comp = nullptr;
+    HIPCHECK(hipMalloc(&d_src, EX * EY * sizeof(float)));
+    HIPCHECK(hipMalloc(&d_wav, WNX * WNY * sizeof(float)));
+    HIPCHECK(hipMalloc(&d_dec, WNX * WNY * sizeof(float)));
+    HIPCHECK(hipMalloc(&d_dst, EX * EY * sizeof(float)));
+    size_t comp_size = 0;
+    HIPCHECK(hipCompressMaxOutputSize(plan, &comp_size));
+    HIPCHECK(hipMalloc(&d_comp, comp_size));
+
+    int threads = 256, blocks = (EX * EY + threads - 1) / threads;
+    initSin2DKernel<<<blocks, threads>>>(d_src, EX, EY, 15.0f, 15.0f);
+    HIPCHECK(hipDeviceSynchronize());
+
+    // Copy to wavelet layout + RMS
+    HIPCHECK(hipCopyToWaveletLayout(
+        d_src, EX, EX, 0, 0, 0, EX, EY, 1,
+        d_wav, plan->d_rms, plan, 0));
+
+    // Compress
+    long compressed_length = 0;
+    float cr = 0;
+    hipError_t err = hipCompress(scale, plan->d_rms, d_wav, d_comp, plan, 0);
+    HIPCHECK(err);
+    HIPCHECK(hipCompressSynchronize(plan, &compressed_length, &cr));
+    printf("  compress: CR=%.2f, %ld bytes\n", cr, compressed_length);
+
+    // Decompress
+    HIPCHECK(hipDecompress(d_comp, d_dec, plan, 0));
+
+    // Copy from wavelet layout
+    HIPCHECK(hipCopyFromWaveletLayout(
+        d_dec, d_dst, EX, EX, 0, 0, 0, EX, EY, 1, plan, 0));
+    HIPCHECK(hipDeviceSynchronize());
+
+    std::vector<float> h_src(EX * EY), h_dst(EX * EY);
+    HIPCHECK(hipMemcpy(h_src.data(), d_src, EX * EY * sizeof(float), hipMemcpyDeviceToHost));
+    HIPCHECK(hipMemcpy(h_dst.data(), d_dst, EX * EY * sizeof(float), hipMemcpyDeviceToHost));
+
+    float rms = hostRMS(h_src.data(), EX * EY);
+    float max_err = maxAbsError(h_src.data(), h_dst.data(), EX * EY);
+    printf("  max_err=%.6e, rms=%.6e, rel=%.6e\n", max_err, rms, max_err / rms);
+
+    bool pass = (max_err < rms) && (cr > 1.0f);
+    printf("  full pipeline 2D: %s\n", pass ? "PASS" : "FAIL");
+
+    hipFree(d_src); hipFree(d_wav); hipFree(d_dec); hipFree(d_dst); hipFree(d_comp);
+    hipCompressDestroyPlan(plan);
+    return pass;
+}
+
+int main()
+{
+    printf("=== 2D Compression Tests ===\n\n");
+    fflush(stdout);
+
+    int passed = 0, total = 0;
+
+    total++; if (test_plan_lifecycle_2d()) passed++;
+    printf("\n");
+    total++; if (test_round_trip_2d()) passed++;
+    printf("\n");
+    total++; if (test_round_trip_2d_large()) passed++;
+    printf("\n");
+    total++; if (test_copy_to_from_2d()) passed++;
+    printf("\n");
+    total++; if (test_round_trip_with_copy_2d()) passed++;
+    printf("\n");
+
+    printf("=== Results: %d/%d passed ===\n", passed, total);
+    return (passed == total) ? 0 : 1;
+}