From d7aa97e7436ed2caf7c43adf2142e7bd50c9452d Mon Sep 17 00:00:00 2001 From: "OReilly, Ossian" Date: Wed, 13 May 2026 20:05:34 -0500 Subject: [PATCH] Add preliminary 2-D support. Note: performance is currently 1.2x slower than the 3-D version for the same size. --- hip/hipBlockCopy.h | 33 ++- hip/hipCompress.cpp | 129 ++++++--- hip/hipCompress.h | 5 +- hip/hipWaveletRLE2D.h | 473 +++++++++++++++++++++++++++++++++ makefile | 4 + tests/profile_2d_vs_3d.cpp | 121 +++++++++ tests/test_compress_2d_hip.cpp | 354 ++++++++++++++++++++++++ 7 files changed, 1069 insertions(+), 50 deletions(-) create mode 100644 hip/hipWaveletRLE2D.h create mode 100644 tests/profile_2d_vs_3d.cpp create mode 100644 tests/test_compress_2d_hip.cpp diff --git a/hip/hipBlockCopy.h b/hip/hipBlockCopy.h index 189a59c..93ce222 100644 --- a/hip/hipBlockCopy.h +++ b/hip/hipBlockCopy.h @@ -82,18 +82,20 @@ __global__ void copyToWaveletKernelOpt( } } - // ---- Phase 3: Store ZPB planes ---- + // ---- Phase 3: Store ZPB planes (skip planes beyond wnz) ---- if constexpr (DO_COPY) { uint32_t dst_byte = (gx + gy * wnx) * (uint32_t)sizeof(float); #pragma unroll for (int dz = 0; dz < BCOPY_ZPB; ++dz) { - auto plane_rsrc = __builtin_amdgcn_make_buffer_rsrc( - d_dst + (long)(z_start + dz) * wnx * wny, - 0, -1, 0x00027000); - auto vi = __builtin_bit_cast(bcopy_int4_vec, regs[dz]); - __builtin_amdgcn_raw_buffer_store_b128( - vi, plane_rsrc, dst_byte, 0, SLC); + if (z_start + dz < wnz) { + auto plane_rsrc = __builtin_amdgcn_make_buffer_rsrc( + d_dst + (long)(z_start + dz) * wnx * wny, + 0, -1, 0x00027000); + auto vi = __builtin_bit_cast(bcopy_int4_vec, regs[dz]); + __builtin_amdgcn_raw_buffer_store_b128( + vi, plane_rsrc, dst_byte, 0, SLC); + } } } @@ -174,15 +176,20 @@ __global__ void copyFromWaveletKernelOpt( uint32_t src_byte = (gx + gy * wnx) * (uint32_t)sizeof(float); long wav_plane = (long)wnx * wny; + constexpr bcopy_float4_vec zero_vec_from = {0.0f, 0.0f, 0.0f, 0.0f}; bcopy_float4_vec regs[BCOPY_ZPB]; #pragma unroll for (int dz = 0; dz < BCOPY_ZPB; ++dz) { - auto plane_rsrc = __builtin_amdgcn_make_buffer_rsrc( - const_cast(d_src + (long)(z_start + dz) * wav_plane), - 0, -1, 0x00027000); - regs[dz] = __builtin_bit_cast(bcopy_float4_vec, - __builtin_amdgcn_raw_buffer_load_b128( - plane_rsrc, src_byte, 0, SLC)); + if (z_start + dz < wnz) { + auto plane_rsrc = __builtin_amdgcn_make_buffer_rsrc( + const_cast(d_src + (long)(z_start + dz) * wav_plane), + 0, -1, 0x00027000); + regs[dz] = __builtin_bit_cast(bcopy_float4_vec, + __builtin_amdgcn_raw_buffer_load_b128( + plane_rsrc, src_byte, 0, SLC)); + } else { + regs[dz] = zero_vec_from; + } } uint32_t dst_byte = ((y0 + gy) * ldimx + x0 + gx) * (uint32_t)sizeof(float); diff --git a/hip/hipCompress.cpp b/hip/hipCompress.cpp index 770be8a..29d001b 100644 --- a/hip/hipCompress.cpp +++ b/hip/hipCompress.cpp @@ -7,6 +7,7 @@ #define DS79_INCLUDE_REG32 #include "hipWaveletRLE.h" #include "hipWaveletRLEInverse.h" +#include "hipWaveletRLE2D.h" #include "hipBlockCopy.h" #include @@ -67,20 +68,37 @@ hipError_t hipCompressCreatePlan(hipCompressPlan** plan, int nx, int ny, int nz, if (nx <= 0 || ny <= 0 || nz <= 0) PLAN_ERROR(p, HIP_COMPRESS_ERROR_INVALID_DIMENSIONS, hipErrorInvalidValue); - if (nx % 32 != 0 || ny % 32 != 0 || nz % 32 != 0) - PLAN_ERROR(p, HIP_COMPRESS_ERROR_NOT_MULTIPLE_OF_32, hipErrorInvalidValue); + + bool is_2d = (nz == 1); + + if (is_2d) { + if (nx % 32 != 0 || ny % 32 != 0) + PLAN_ERROR(p, HIP_COMPRESS_ERROR_NOT_MULTIPLE_OF_32, hipErrorInvalidValue); + } else { + if (nx % 32 != 0 || ny % 32 != 0 || nz % 32 != 0) + PLAN_ERROR(p, HIP_COMPRESS_ERROR_NOT_MULTIPLE_OF_32, hipErrorInvalidValue); + } + if ((long)nx * (long)ny * (long)sizeof(float) > (1L << 32)) PLAN_ERROR(p, HIP_COMPRESS_ERROR_PLANE_TOO_LARGE, hipErrorInvalidValue); p->kernel = kernel; p->nx = nx; p->ny = ny; p->nz = nz; - p->num_blocks = (nx / 32) * (ny / 32) * (nz / 32); + p->is_2d = is_2d; p->aux_stream = aux_stream; p->compress_pending = false; p->last_error = HIP_COMPRESS_ERROR_HIP_RUNTIME; + if (is_2d) { + p->num_blocks = (nx / 32) * (ny / 32); + p->scratch_slot_stride = WRLE2D_SLOT_BYTES; + } else { + p->num_blocks = (nx / 32) * (ny / 32) * (nz / 32); + p->scratch_slot_stride = 4L * WRLE_LDS_BYTES; + } + int nb = p->num_blocks; - long scratch_size = (long)nb * 4L * WRLE_LDS_BYTES; + long scratch_size = (long)nb * (long)p->scratch_slot_stride; HIPCHECK_PLAN(p, hipMalloc(&p->d_scratch, scratch_size)); HIPCHECK_PLAN(p, hipMalloc(&p->d_mulfac, sizeof(float))); @@ -95,13 +113,15 @@ hipError_t hipCompressCreatePlan(hipCompressPlan** plan, int nx, int ny, int nz, HIPCHECK_PLAN(p, hipMalloc(&p->d_rms, sizeof(double))); - p->max_copy_blocks = (nx / 32) * (ny / 32) * (nz / BCOPY_ZPB); + int nz_for_copy = is_2d ? 1 : nz; + p->max_copy_blocks = (nx / 32) * (ny / 32) + * ((nz_for_copy + BCOPY_ZPB - 1) / BCOPY_ZPB); HIPCHECK_PLAN(p, hipMalloc(&p->d_partial_sums, p->max_copy_blocks * sizeof(double))); HIPCHECK_PLAN(p, hipEventCreateWithFlags(&p->ready_event, hipEventDisableTiming)); HIPCHECK_PLAN(p, hipHostMalloc(&p->h_staging, 2 * sizeof(size_t))); - // JIT warmup: exclusive_scan on aux_stream (no user stream available yet) + // JIT warmup: exclusive_scan on aux_stream err = rocprim::exclusive_scan( p->d_scan_temp, p->scan_temp_bytes, p->d_block_sizes, p->d_block_offsets, (size_t)0, (size_t)1, @@ -149,7 +169,7 @@ hipError_t hipCompress( PLAN_ERROR(plan, HIP_COMPRESS_ERROR_INVALID_SCALE, hipErrorInvalidValue); const int nx = plan->nx, ny = plan->ny, nz = plan->nz; - const int ldimx = nx, ldimxy = nx * ny; + const int ldimx = nx; const int nb = plan->num_blocks; const int num_mulfacs = 1; const int hdr_size = hipCompressHeaderSize(nb, num_mulfacs); @@ -157,17 +177,27 @@ hipError_t hipCompress( hipStream_t aux = plan->aux_stream; // 1. Fused wavelet + quantize + RLE → scratch (user_stream) - dim3 grid((nx + 31) / 32, (ny + 31) / 32, (nz + 31) / 32); - if (plan->kernel == HIP_COMPRESS_KERNEL_SEGRLE) { - waveletSegRLEFusedKernel<<>>( + if (plan->is_2d) { + int nbx = nx / 32, nby = ny / 32; + dim3 grid((nbx + WRLE2D_TILES_PER_WG - 1) / WRLE2D_TILES_PER_WG, nby); + waveletRLE2DFusedKernel<<>>( d_input, plan->d_scratch, plan->d_block_sizes, - scale, ldimx, ldimxy, + scale, ldimx, nbx, d_rms, plan->d_mulfac); } else { - waveletRLEFusedKernel<<>>( - d_input, plan->d_scratch, plan->d_block_sizes, - scale, ldimx, ldimxy, - d_rms, plan->d_mulfac); + const int ldimxy = nx * ny; + dim3 grid((nx + 31) / 32, (ny + 31) / 32, (nz + 31) / 32); + if (plan->kernel == HIP_COMPRESS_KERNEL_SEGRLE) { + waveletSegRLEFusedKernel<<>>( + d_input, plan->d_scratch, plan->d_block_sizes, + scale, ldimx, ldimxy, + d_rms, plan->d_mulfac); + } else { + waveletRLEFusedKernel<<>>( + d_input, plan->d_scratch, plan->d_block_sizes, + scale, ldimx, ldimxy, + d_rms, plan->d_mulfac); + } } // 2. Exclusive scan for compaction offsets (user_stream) @@ -181,10 +211,18 @@ hipError_t hipCompress( HIPCHECK_PLAN(plan, hipStreamWaitEvent(aux, plan->ready_event, 0)); // 4. Compact + write header on aux_stream - wrleCompactKernel<<>>( - plan->d_scratch, d_output + hdr_size, - plan->d_block_sizes, plan->d_block_offsets, - d_output, nb, num_mulfacs, plan->d_mulfac); + if (plan->is_2d) { + wrle2DCompactKernel<<>>( + plan->d_scratch, d_output + hdr_size, + plan->d_block_sizes, plan->d_block_offsets, + d_output, nb, num_mulfacs, plan->d_mulfac, + plan->scratch_slot_stride); + } else { + wrleCompactKernel<<>>( + plan->d_scratch, d_output + hdr_size, + plan->d_block_sizes, plan->d_block_offsets, + d_output, nb, num_mulfacs, plan->d_mulfac); + } // 5. Async readback on aux_stream HIPCHECK_PLAN(plan, hipMemcpyAsync(&plan->h_staging[0], plan->d_block_offsets + (nb - 1), @@ -241,14 +279,19 @@ hipError_t hipCopyToWaveletLayout( PLAN_ERROR(plan, HIP_COMPRESS_ERROR_NULL_INPUT, hipErrorInvalidValue); if (!d_dst && !d_rms_out) PLAN_ERROR(plan, HIP_COMPRESS_ERROR_BOTH_OUTPUTS_NULL, hipErrorInvalidValue); - if (ex < 32 || ey < 32 || ez < 32) - PLAN_ERROR(plan, HIP_COMPRESS_ERROR_WINDOW_TOO_SMALL, hipErrorInvalidValue); + if (plan->is_2d) { + if (ex < 32 || ey < 32 || ez != 1) + PLAN_ERROR(plan, HIP_COMPRESS_ERROR_WINDOW_TOO_SMALL, hipErrorInvalidValue); + } else { + if (ex < 32 || ey < 32 || ez < 32) + PLAN_ERROR(plan, HIP_COMPRESS_ERROR_WINDOW_TOO_SMALL, hipErrorInvalidValue); + } if ((long)ldimxy * (long)sizeof(float) > (1L << 32)) PLAN_ERROR(plan, HIP_COMPRESS_ERROR_PLANE_TOO_LARGE, hipErrorInvalidValue); int wnx = hipCompressWaveletDim(ex); int wny = hipCompressWaveletDim(ey); - int wnz = hipCompressWaveletDim(ez); + int wnz = plan->is_2d ? 1 : hipCompressWaveletDim(ez); if (wnx != plan->nx || wny != plan->ny || wnz != plan->nz) PLAN_ERROR(plan, HIP_COMPRESS_ERROR_EXTRACTION_DIMS_MISMATCH, hipErrorInvalidValue); @@ -258,7 +301,7 @@ hipError_t hipCopyToWaveletLayout( bool do_rms = (d_rms_out != nullptr); long total_samples = (long)ex * ey * ez; - dim3 grid(wnx / 32, wny / 32, wnz / BCOPY_ZPB); + dim3 grid(wnx / 32, wny / 32, (wnz + BCOPY_ZPB - 1) / BCOPY_ZPB); int total_blocks = grid.x * grid.y * grid.z; if (do_copy && do_rms) { @@ -307,19 +350,24 @@ hipError_t hipCopyFromWaveletLayout( PLAN_ERROR(plan, HIP_COMPRESS_ERROR_NULL_INPUT, hipErrorInvalidValue); if (!d_dst) PLAN_ERROR(plan, HIP_COMPRESS_ERROR_NULL_OUTPUT, hipErrorInvalidValue); - if (ex < 32 || ey < 32 || ez < 32) - PLAN_ERROR(plan, HIP_COMPRESS_ERROR_WINDOW_TOO_SMALL, hipErrorInvalidValue); + if (plan->is_2d) { + if (ex < 32 || ey < 32 || ez != 1) + PLAN_ERROR(plan, HIP_COMPRESS_ERROR_WINDOW_TOO_SMALL, hipErrorInvalidValue); + } else { + if (ex < 32 || ey < 32 || ez < 32) + PLAN_ERROR(plan, HIP_COMPRESS_ERROR_WINDOW_TOO_SMALL, hipErrorInvalidValue); + } if ((long)ldimxy * (long)sizeof(float) > (1L << 32)) PLAN_ERROR(plan, HIP_COMPRESS_ERROR_PLANE_TOO_LARGE, hipErrorInvalidValue); int wnx = hipCompressWaveletDim(ex); int wny = hipCompressWaveletDim(ey); - int wnz = hipCompressWaveletDim(ez); + int wnz = plan->is_2d ? 1 : hipCompressWaveletDim(ez); if (wnx != plan->nx || wny != plan->ny || wnz != plan->nz) PLAN_ERROR(plan, HIP_COMPRESS_ERROR_EXTRACTION_DIMS_MISMATCH, hipErrorInvalidValue); - dim3 grid(wnx / 32, wny / 32, wnz / BCOPY_ZPB); + dim3 grid(wnx / 32, wny / 32, (wnz + BCOPY_ZPB - 1) / BCOPY_ZPB); copyFromWaveletKernelOpt<<>>( d_src, wnx, wny, wnz, d_dst, ldimx, ldimxy, x0, y0, z0, ex, ey, ez); @@ -344,17 +392,26 @@ hipError_t hipDecompress( PLAN_ERROR(plan, HIP_COMPRESS_ERROR_NULL_OUTPUT, hipErrorInvalidValue); const int nx = plan->nx, ny = plan->ny, nz = plan->nz; - const int ldimx = nx, ldimxy = nx * ny; + const int ldimx = nx; - dim3 grid((nx + 31) / 32, (ny + 31) / 32, (nz + 31) / 32); - if (plan->kernel == HIP_COMPRESS_KERNEL_SEGRLE) { - waveletSegRLEInverseFusedKernel<<>>( + if (plan->is_2d) { + int nbx = nx / 32, nby = ny / 32; + dim3 grid((nbx + WRLE2D_TILES_PER_WG - 1) / WRLE2D_TILES_PER_WG, nby); + waveletRLE2DInverseFusedKernel<<>>( d_input, nullptr, nullptr, - d_output, 0.0f, ldimx, ldimxy, 1); + d_output, 0.0f, ldimx, nbx, 1); } else { - waveletRLEInverseFusedKernel<<>>( - d_input, nullptr, nullptr, - d_output, 0.0f, ldimx, ldimxy, 1); + const int ldimxy = nx * ny; + dim3 grid((nx + 31) / 32, (ny + 31) / 32, (nz + 31) / 32); + if (plan->kernel == HIP_COMPRESS_KERNEL_SEGRLE) { + waveletSegRLEInverseFusedKernel<<>>( + d_input, nullptr, nullptr, + d_output, 0.0f, ldimx, ldimxy, 1); + } else { + waveletRLEInverseFusedKernel<<>>( + d_input, nullptr, nullptr, + d_output, 0.0f, ldimx, ldimxy, 1); + } } hipError_t launch_err = hipGetLastError(); @@ -372,6 +429,6 @@ hipError_t hipCompressMaxOutputSize(const hipCompressPlan* plan, size_t* size) } plan->last_error = HIP_COMPRESS_SUCCESS; int hdr_size = hipCompressHeaderSize(plan->num_blocks, 1); - *size = (size_t)hdr_size + (size_t)plan->num_blocks * 4 * WRLE_LDS_BYTES; + *size = (size_t)hdr_size + (size_t)plan->num_blocks * plan->scratch_slot_stride; return hipSuccess; } diff --git a/hip/hipCompress.h b/hip/hipCompress.h index 11af240..1c15d1c 100644 --- a/hip/hipCompress.h +++ b/hip/hipCompress.h @@ -38,6 +38,8 @@ struct hipCompressPlan { hipCompressKernel kernel; int nx, ny, nz; int num_blocks; + bool is_2d; // true when nz == 1 + size_t scratch_slot_stride; // per-block scratch slot size unsigned char* d_scratch; float* d_mulfac; @@ -89,7 +91,8 @@ hipError_t hipCompressDestroyPlan(hipCompressPlan* plan); // Zero-fills the padding band (ex..wnx-1, etc.). // RMS is computed over the ex*ey*ez extraction window only. // -// ex,ey,ez >= 32. Wavelet dims are derived internally: wnx = round32(ex), etc. +// 3D: ex,ey,ez >= 32. Wavelet dims: wnx = round32(ex), etc. +// 2D (nz=1): ex,ey >= 32, ez must be 1, wnz = 1. // Must equal plan dimensions exactly. // // d_dst may be NULL (RMS-only mode). d_rms_out may be NULL (copy-only mode). diff --git a/hip/hipWaveletRLE2D.h b/hip/hipWaveletRLE2D.h new file mode 100644 index 0000000..9e9230e --- /dev/null +++ b/hip/hipWaveletRLE2D.h @@ -0,0 +1,473 @@ +// Copyright (C) 2025 Advanced Micro Devices, Inc. +// Use of this source code is governed by an MIT-style license that can be +// found in the LICENSE file or at https://opensource.org/licenses/MIT. + +#ifndef HIPWAVELET_RLE_2D_H +#define HIPWAVELET_RLE_2D_H + +// Fused 2D wavelet + quantize + RLE encode/decode kernels. +// +// Each workgroup processes up to 32 x-adjacent 32x32 tiles in 4 batches of 8. +// Per batch: load 8 tiles → Y-transform (LDS) → transposed readback → +// X-transform (registers) → x-line RLE encode (scan+compact). +// +// Each thread encodes one 32-element x-row as a single RLE line. +// Per-block layout: [32B x-line metadata] [RLE data]. +// 32 KB LDS reused across phases. Occupancy 2 on gfx942. + +#include +#include +#include +#include "ds79.h" +#include "Run_Length_Escape_Codes.hxx" +#include "hipRLEDecode.h" + +static constexpr int WRLE2D_TILES_PER_WG = 32; +static constexpr int WRLE2D_BATCH = 8; +static constexpr int WRLE2D_META_BYTES = 32; +static constexpr int WRLE2D_SLOT_BYTES = 8192; + +// RLE encode 32 scalar values from a float[32] array. +// WRITE=false: count-only. WRITE=true: writes encoded bytes. +template +__device__ __forceinline__ +int wrle_xline(const float* values, float scale, unsigned char* dst) +{ + int bp = 0, rle = 0; + #pragma unroll + for (int i = 0; i < 32; ++i) { + float fval = scale * values[i]; + int ival = (int)fval; + if (ival == 0) { ++rle; } else { + if constexpr (!WRITE) { + bool r1 = (rle == 1), rs = (rle >= 2) & (rle < 256); + int rb = (r1 ? 1 : (rs ? 2 : 4)) & -(rle > 0); + bp += rb; + } else { + if (rle == 1) { + dst[bp] = 0; bp += 1; + } else if (rle >= 2 && rle < 256) { + dst[bp] = (unsigned char)(RLESC1 & 0xFF); + dst[bp+1] = (unsigned char)rle; bp += 2; + } else if (rle >= 256) { + dst[bp] = (unsigned char)(RLESC3 & 0xFF); + dst[bp+1] = (unsigned char)(rle & 0xFF); + dst[bp+2] = (unsigned char)((rle >> 8) & 0xFF); + dst[bp+3] = (unsigned char)((rle >> 16) & 0xFF); bp += 4; + } + } + rle = 0; + bool ib = (ival>VLESC2)&(ival=-32768)&(ival<=32767); + bool i24 = (ival>=-8388608)&(ival<=8388607), f32 = !ib&!i16&!i24; + if constexpr (!WRITE) { + bp += ib ? 1 : i16 ? 3 : i24 ? 4 : 5; + } else { + unsigned u; __builtin_memcpy(&u, &fval, 4); + unsigned pay = f32 ? u : (unsigned)ival; + if (ib) { + dst[bp] = (unsigned char)(signed char)ival; + } else if (i16) { + dst[bp] = (unsigned char)(signed char)VLESC2; + dst[bp+1] = (unsigned char)(pay & 0xFF); + dst[bp+2] = (unsigned char)((pay >> 8) & 0xFF); + } else if (i24) { + dst[bp] = (unsigned char)(signed char)VLESC3; + dst[bp+1] = (unsigned char)(pay & 0xFF); + dst[bp+2] = (unsigned char)((pay >> 8) & 0xFF); + dst[bp+3] = (unsigned char)((pay >> 16) & 0xFF); + } else { + dst[bp] = (unsigned char)(signed char)VLESC4; + dst[bp+1] = (unsigned char)(pay & 0xFF); + dst[bp+2] = (unsigned char)((pay >> 8) & 0xFF); + dst[bp+3] = (unsigned char)((pay >> 16) & 0xFF); + dst[bp+4] = (unsigned char)((pay >> 24) & 0xFF); + } + bp += ib ? 1 : i16 ? 3 : i24 ? 4 : 5; + } + } + } + if constexpr (!WRITE) { + bool r1=(rle==1),rs=(rle>=2)&(rle<256); + int rb=(r1?1:(rs?2:4))&-(rle>0); + bp+=rb; + } else { + if (rle == 1) { + dst[bp] = 0; bp += 1; + } else if (rle >= 2 && rle < 256) { + dst[bp] = (unsigned char)(RLESC1 & 0xFF); + dst[bp+1] = (unsigned char)rle; bp += 2; + } else if (rle >= 256) { + dst[bp] = (unsigned char)(RLESC3 & 0xFF); + dst[bp+1] = (unsigned char)(rle & 0xFF); + dst[bp+2] = (unsigned char)((rle >> 8) & 0xFF); + dst[bp+3] = (unsigned char)((rle >> 16) & 0xFF); bp += 4; + } + } + return bp; +} + +// --------------------------------------------------------------------------- +// Forward: 2D wavelet + quantize + RLE encode. +// Grid: (ceil(nbx/32), nby) where nbx = nx/32, nby = ny/32. +// --------------------------------------------------------------------------- +__launch_bounds__(256, 2) +__global__ void waveletRLE2DFusedKernel( + const float* __restrict__ input, + unsigned char* __restrict__ output, + size_t* __restrict__ block_sizes, + float scale, + int ldimx, + int nbx, + const double* __restrict__ d_rms, + float* __restrict__ d_mulfac_out) +{ + constexpr int NTHREADS = 256; + constexpr int SLC = 2; + using BlockScan = rocprim::block_scan; + + __shared__ union { + float wavelet[WRLE2D_BATCH * 1024]; + struct { + typename BlockScan::storage_type scan; + int block_base_off[WRLE2D_BATCH]; + }; + unsigned char compact[WRLE_LDS_BYTES]; + } lds; + + int tid = threadIdx.x; + + float mulfac; + if (d_rms != nullptr) { + float rms = (float)*d_rms; + float product = rms * scale; + mulfac = (product > 0.0f && __builtin_isfinite(1.0f / product)) + ? (1.0f / product) : 1.0f; + if (tid == 0 && blockIdx.x == 0 && blockIdx.y == 0) { + if (d_mulfac_out) *d_mulfac_out = mulfac; + } + } else { + mulfac = scale; + } + + int wg_tile_x0 = blockIdx.x * WRLE2D_TILES_PER_WG; + + for (int batch = 0; batch < 4; ++batch) { + int batch_tile_x0 = wg_tile_x0 + batch * WRLE2D_BATCH; + + // ---- Load 8 tiles → LDS ---- + int xg = tid % 8, yr = tid / 8; + for (int b = 0; b < WRLE2D_BATCH; ++b) { + int tile_bx = batch_tile_x0 + b; + int gx = tile_bx * 32 + xg * 4; + int gy = blockIdx.y * 32 + yr; + int x0 = xg * 4; + if (tile_bx < nbx) { + uint32_t byte_off = (gx + gy * ldimx) * (uint32_t)sizeof(float); + auto rsrc = __builtin_amdgcn_make_buffer_rsrc( + const_cast(input), 0, -1, 0x00027000); + auto v = __builtin_bit_cast( + __attribute__((__vector_size__(4 * sizeof(float)))) float, + __builtin_amdgcn_raw_buffer_load_b128(rsrc, byte_off, 0, SLC)); + lds.wavelet[b * 1024 + (x0+0) * 32 + (yr ^ (x0+0))] = v[0]; + lds.wavelet[b * 1024 + (x0+1) * 32 + (yr ^ (x0+1))] = v[1]; + lds.wavelet[b * 1024 + (x0+2) * 32 + (yr ^ (x0+2))] = v[2]; + lds.wavelet[b * 1024 + (x0+3) * 32 + (yr ^ (x0+3))] = v[3]; + } else { + lds.wavelet[b * 1024 + (x0+0) * 32 + (yr ^ (x0+0))] = 0.0f; + lds.wavelet[b * 1024 + (x0+1) * 32 + (yr ^ (x0+1))] = 0.0f; + lds.wavelet[b * 1024 + (x0+2) * 32 + (yr ^ (x0+2))] = 0.0f; + lds.wavelet[b * 1024 + (x0+3) * 32 + (yr ^ (x0+3))] = 0.0f; + } + } + __syncthreads(); + + // ---- Y-transform in LDS ---- + { + int blk = tid / 32, pos = tid % 32; + float line[32]; + for (int y = 0; y < 32; y++) + line[y] = lds.wavelet[blk * 1024 + pos * 32 + (y ^ pos)]; + ds79_forward_reg32(line); + for (int y = 0; y < 32; y++) + lds.wavelet[blk * 1024 + pos * 32 + (y ^ pos)] = line[y]; + } + __syncthreads(); + + // ---- Transposed readback + X-transform in registers ---- + int blk = tid / 32; + int row = tid % 32; + int tile_bx = batch_tile_x0 + blk; + bool active = (tile_bx < nbx); + + float xline[32]; + if (active) { + for (int x = 0; x < 32; x++) + xline[x] = lds.wavelet[blk * 1024 + x * 32 + (row ^ x)]; + ds79_forward_reg32(xline); + } else { + for (int x = 0; x < 32; x++) + xline[x] = 0.0f; + } + __syncthreads(); + + // ---- RLE encode ---- + int my_count = active ? wrle_xline(xline, mulfac, nullptr) : 0; + + int global_bid = tile_bx + blockIdx.y * nbx; + // Write per-line metadata to global scratch + if (active) { + unsigned char* slot = output + (size_t)global_bid * WRLE2D_SLOT_BYTES; + slot[row] = (unsigned char)my_count; + } + + int my_offset, pass_total; + BlockScan().exclusive_scan(my_count, my_offset, 0, pass_total, lds.scan); + __syncthreads(); + + // Store per-block base offsets (inside union, will be clobbered by compact) + if (tid % 32 == 0) + lds.block_base_off[tid / 32] = my_offset; + __syncthreads(); + + // Cache block boundaries in registers before compact clobbers them + int blk_start = lds.block_base_off[blk]; + bool next_blk_active = (blk < WRLE2D_BATCH - 1) + && (batch_tile_x0 + blk + 1 < nbx); + int blk_end = next_blk_active + ? lds.block_base_off[blk + 1] : pass_total; + int blk_rle_bytes = blk_end - blk_start; + __syncthreads(); + + // Write encoded data to LDS compact buffer (clobbers block_base_off) + if (active) + wrle_xline(xline, mulfac, lds.compact + my_offset); + __syncthreads(); + + // Copy per-block data from LDS → global scratch slot + if (active) { + unsigned char* slot = output + (size_t)global_bid * WRLE2D_SLOT_BYTES; + for (int i = row; i < blk_rle_bytes; i += 32) { + slot[WRLE2D_META_BYTES + i] = lds.compact[blk_start + i]; + } + if (row == 0) + block_sizes[global_bid] = WRLE2D_META_BYTES + blk_rle_bytes; + } + __syncthreads(); + } +} + +// --------------------------------------------------------------------------- +// Inverse: RLE decode + dequantize + inverse wavelet 2D. +// Grid: (ceil(nbx/32), nby). +// +// Phase-separated design with 2 macro-passes of 2 batches each. +// Halves persistent register array (decoded[2][32] = 64 VGPRs vs 128) +// to reduce scratch spilling while keeping phase-separation benefits. +// --------------------------------------------------------------------------- +__launch_bounds__(256, 2) +__global__ void waveletRLE2DInverseFusedKernel( + const unsigned char* __restrict__ input, + const size_t* __restrict__ block_sizes, + const size_t* __restrict__ block_offsets, + float* __restrict__ output, + float inv_scale, + int ldimx, + int nbx, + int header_size) +{ + constexpr int NTHREADS = 256; + constexpr int SLC = 2; + constexpr int MACRO_BATCHES = 2; + using BlockScan = rocprim::block_scan; + using f4vec = __attribute__((__vector_size__(4 * sizeof(float)))) float; + + __shared__ union { + float wavelet[WRLE2D_BATCH * 1024]; + struct { + typename BlockScan::storage_type scan; + int block_base_off[WRLE2D_BATCH]; + }; + } lds; + + int tid = threadIdx.x; + int wg_tile_x0 = blockIdx.x * WRLE2D_TILES_PER_WG; + int blk = tid / 32; + int row = tid % 32; + + float actual_inv_scale; + const unsigned char* data_base; + + if (header_size != 0) { + const int* hdr = (const int*)input; + int num_blocks = hdr[0]; + int num_mulfacs = hdr[1]; + const float* mulfacs = (const float*)(input + 8 + 8 * num_blocks); + actual_inv_scale = 1.0f / mulfacs[0]; + data_base = input + 8 + 8 * num_blocks + 4 * num_mulfacs; + } else { + actual_inv_scale = inv_scale; + data_base = input; + } + + for (int mp = 0; mp < 4 / MACRO_BATCHES; ++mp) { + + float decoded[MACRO_BATCHES][32]; + + // ---- Decode + X-transform (MACRO_BATCHES passes) ---- + for (int b = 0; b < MACRO_BATCHES; ++b) { + int batch = mp * MACRO_BATCHES + b; + int batch_tile_x0 = wg_tile_x0 + batch * WRLE2D_BATCH; + int tile_bx = batch_tile_x0 + blk; + int global_bid = tile_bx + blockIdx.y * nbx; + bool active = (tile_bx < nbx); + + int my_bytes = 0; + const unsigned char* rle_data = nullptr; + + if (active) { + const unsigned char* block_in; + if (header_size != 0) { + block_in = data_base + ((const size_t*)(input + 8))[global_bid]; + } else if (block_offsets != nullptr) { + block_in = data_base + block_offsets[global_bid]; + } else { + block_in = data_base + (size_t)global_bid * WRLE2D_SLOT_BYTES; + } + rle_data = block_in + WRLE2D_META_BYTES; + my_bytes = (int)block_in[row]; + } + + int my_offset, pass_total; + BlockScan().exclusive_scan(my_bytes, my_offset, 0, pass_total, lds.scan); + __syncthreads(); + + if (tid % 32 == 0) + lds.block_base_off[tid / 32] = my_offset; + __syncthreads(); + + if (active) { + int blk_base = lds.block_base_off[blk]; + wrle_zline_decode(rle_data + my_offset - blk_base, + my_bytes, actual_inv_scale, decoded[b]); + us79_inverse_reg32(decoded[b]); + } else { + #pragma unroll + for (int x = 0; x < 32; x++) decoded[b][x] = 0.0f; + } + } + + __syncthreads(); + + // ---- Y-transform + store (MACRO_BATCHES passes) ---- + for (int b = 0; b < MACRO_BATCHES; ++b) { + int batch = mp * MACRO_BATCHES + b; + int batch_tile_x0 = wg_tile_x0 + batch * WRLE2D_BATCH; + + for (int x = 0; x < 32; x++) + lds.wavelet[blk * 1024 + x * 32 + (row ^ x)] = decoded[b][x]; + __syncthreads(); + + { + int pl = tid / 32; + int pos = tid % 32; + float line[32]; + for (int y = 0; y < 32; y++) + line[y] = lds.wavelet[pl * 1024 + pos * 32 + (y ^ pos)]; + us79_inverse_reg32(line); + for (int y = 0; y < 32; y++) + lds.wavelet[pl * 1024 + pos * 32 + (y ^ pos)] = line[y]; + } + __syncthreads(); + + f4vec store_regs[WRLE2D_BATCH]; + { + int xg = tid % 8, yr = tid / 8; + int x0 = xg * 4; + #pragma unroll + for (int s = 0; s < WRLE2D_BATCH; ++s) { + store_regs[s][0] = lds.wavelet[s * 1024 + (x0+0) * 32 + (yr ^ (x0+0))]; + store_regs[s][1] = lds.wavelet[s * 1024 + (x0+1) * 32 + (yr ^ (x0+1))]; + store_regs[s][2] = lds.wavelet[s * 1024 + (x0+2) * 32 + (yr ^ (x0+2))]; + store_regs[s][3] = lds.wavelet[s * 1024 + (x0+3) * 32 + (yr ^ (x0+3))]; + } + } + __syncthreads(); + + { + int xg = tid % 8, yr = tid / 8; + #pragma unroll + for (int s = 0; s < WRLE2D_BATCH; ++s) { + int tb = batch_tile_x0 + s; + if (tb >= nbx) continue; + int gx = tb * 32 + xg * 4; + int gy = blockIdx.y * 32 + yr; + uint32_t byte_off = (gx + gy * ldimx) * (uint32_t)sizeof(float); + auto rsrc = __builtin_amdgcn_make_buffer_rsrc( + output, 0, -1, 0x00027000); + auto vi = __builtin_bit_cast( + __attribute__((__vector_size__(4 * sizeof(int)))) int, store_regs[s]); + __builtin_amdgcn_raw_buffer_store_b128(vi, rsrc, byte_off, 0, SLC); + } + } + __syncthreads(); + } + } +} + +// Output compaction for 2D blocks. Same as wrleCompactKernel but with +// configurable slot stride. +__global__ void wrle2DCompactKernel( + const unsigned char* __restrict__ src, + unsigned char* __restrict__ dst, + const size_t* __restrict__ block_sizes, + const size_t* __restrict__ offsets, + unsigned char* __restrict__ hdr, + int num_blocks, + int num_mulfacs, + const float* __restrict__ d_mulfac, + size_t slot_stride) +{ + int bid = blockIdx.x; + int tid = threadIdx.x; + size_t size = block_sizes[bid]; + size_t dst_off = offsets[bid]; + size_t src_off = (size_t)bid * slot_stride; + + if (hdr != nullptr && tid == 0) { + ((size_t*)(hdr + 8))[bid] = offsets[bid]; + if (bid == 0) { + ((int*)hdr)[0] = num_blocks; + ((int*)hdr)[1] = num_mulfacs; + float* mf_dst = (float*)(hdr + 8 + 8 * num_blocks); + for (int i = 0; i < num_mulfacs; ++i) + mf_dst[i] = d_mulfac[i]; + } + } + + for (size_t i = tid * 4; i < size; i += blockDim.x * 4) { + unsigned val; + __builtin_memcpy(&val, src + src_off + i, 4); + size_t remain = size - i; + if (remain >= 4) { + __builtin_memcpy(dst + dst_off + i, &val, 4); + } else { + for (size_t b = 0; b < remain; ++b) + dst[dst_off + i + b] = (unsigned char)(val >> (b * 8)); + } + } +} + +inline int hipCompressHeaderSize2D(int num_blocks, int num_mulfacs) +{ + return 8 + 8 * num_blocks + 4 * num_mulfacs; +} + +inline hipError_t hipWaveletRLE2DCompactScanTempSize(int nblocks, size_t* scan_temp_bytes) +{ + return rocprim::exclusive_scan( + nullptr, *scan_temp_bytes, + (size_t*)nullptr, (size_t*)nullptr, (size_t)0, (size_t)nblocks, + rocprim::plus()); +} + +#endif // HIPWAVELET_RLE_2D_H diff --git a/makefile b/makefile index 4c45e77..02659bb 100644 --- a/makefile +++ b/makefile @@ -113,6 +113,10 @@ test_wavelet_rle_fused_hip: tests/test_wavelet_rle_fused_hip.cpp hip/hipWaveletR test_compress_api_hip: tests/test_compress_api_hip.cpp hip/hipCompress.cpp hip/hipCompress.h hip/hipBlockCopy.h hip/hipWaveletRLE.h hip/hipWaveletRLEInverse.h hip/ds79.h hip/us79_reg32.inc hip/ds79_reg32.inc libcvxcompress.$(LIB_EXT) | $(BUILDDIR) $(HIPCC) $(HIPCFLAGS) --offload-arch=$(HIP_ARCH) -mllvm -unroll-threshold=10000 -I. -Ihip -Itests tests/test_compress_api_hip.cpp hip/hipCompress.cpp -L. -lcvxcompress -lm -o $(BUILDDIR)/test_compress_api_hip +# 2D compression test +test_compress_2d_hip: tests/test_compress_2d_hip.cpp hip/hipCompress.cpp hip/hipCompress.h hip/hipBlockCopy.h hip/hipWaveletRLE.h hip/hipWaveletRLEInverse.h hip/hipWaveletRLE2D.h hip/ds79.h hip/us79_reg32.inc hip/ds79_reg32.inc | $(BUILDDIR) + $(HIPCC) $(HIPCFLAGS) --offload-arch=$(HIP_ARCH) -mllvm -unroll-threshold=10000 -I. -Ihip -Itests tests/test_compress_2d_hip.cpp hip/hipCompress.cpp -lm -o $(BUILDDIR)/test_compress_2d_hip + # Async pipeline example (for profiling) example_async_pipeline: tests/example_async_pipeline.cpp hip/hipCompress.cpp hip/hipCompress.h hip/hipBlockCopy.h hip/hipWaveletRLE.h hip/hipWaveletRLEInverse.h hip/ds79.h hip/us79_reg32.inc hip/ds79_reg32.inc | $(BUILDDIR) $(HIPCC) $(HIPCFLAGS) --offload-arch=$(HIP_ARCH) -mllvm -unroll-threshold=10000 -I. -Ihip -Itests tests/example_async_pipeline.cpp hip/hipCompress.cpp -lm -o $(BUILDDIR)/example_async_pipeline diff --git a/tests/profile_2d_vs_3d.cpp b/tests/profile_2d_vs_3d.cpp new file mode 100644 index 0000000..3098909 --- /dev/null +++ b/tests/profile_2d_vs_3d.cpp @@ -0,0 +1,121 @@ +// Isolated decompress-only benchmark for profiling 2D vs 3D inverse kernels. +// Compresses once, then runs decompress N times for profiler capture. + +#include +#include +#include +#include +#include "hipCompress.h" + +#define HIPCHECK(cmd) do { \ + hipError_t e = (cmd); \ + if (e != hipSuccess) { \ + fprintf(stderr, "HIP error %s at %s:%d\n", hipGetErrorString(e), __FILE__, __LINE__); \ + exit(1); \ + } \ +} while(0) + +__global__ void initSin2D(float* data, int nx, int ny, float kx, float ky) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= nx * ny) return; + int iy = idx / nx, ix = idx % nx; + data[idx] = sinf(kx * (float)ix / nx) * sinf(ky * (float)iy / ny); +} + +__global__ void initSin3D(float* data, int nx, int ny, int nz, float kx, float ky, float kz) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= nx * ny * nz) return; + int iz = idx / (nx * ny); + int rem = idx - iz * nx * ny; + int iy = rem / nx, ix = rem % nx; + data[idx] = sinf(kx * (float)ix / nx) * sinf(ky * (float)iy / ny) * sinf(kz * (float)iz / nz); +} + +int main(int argc, char** argv) +{ + int iters = 20; + if (argc > 1) iters = atoi(argv[1]); + + const float scale = 5e-2f; + + // ---- 3D: 256^3 ---- + { + const int N = 256; + long total = (long)N * N * N; + hipCompressPlan* plan = nullptr; + HIPCHECK(hipCompressCreatePlan(&plan, N, N, N, 0)); + + float *d_in, *d_out; + unsigned char* d_comp; + HIPCHECK(hipMalloc(&d_in, total * sizeof(float))); + HIPCHECK(hipMalloc(&d_out, total * sizeof(float))); + size_t comp_sz; HIPCHECK(hipCompressMaxOutputSize(plan, &comp_sz)); + HIPCHECK(hipMalloc(&d_comp, comp_sz)); + + initSin3D<<<(total+255)/256, 256>>>(d_in, N, N, N, 20.f, 20.f, 20.f); + HIPCHECK(hipDeviceSynchronize()); + + long len; float cr; + HIPCHECK(hipCompress(scale, nullptr, d_in, d_comp, plan, 0)); + HIPCHECK(hipCompressSynchronize(plan, &len, &cr)); + printf("3D 256^3: CR=%.1f, %ld bytes\n", cr, len); + + // warmup + for (int i = 0; i < 5; i++) { + HIPCHECK(hipDecompress(d_comp, d_out, plan, 0)); + HIPCHECK(hipDeviceSynchronize()); + } + + // timed region for profiler + printf("3D decompress: %d iterations\n", iters); + for (int i = 0; i < iters; i++) { + HIPCHECK(hipDecompress(d_comp, d_out, plan, 0)); + HIPCHECK(hipDeviceSynchronize()); + } + + hipFree(d_in); hipFree(d_out); hipFree(d_comp); + hipCompressDestroyPlan(plan); + } + + // ---- 2D: 4064x4064 (similar total floats) ---- + { + const int NX = 4064, NY = 4064; + long total = (long)NX * NY; + hipCompressPlan* plan = nullptr; + HIPCHECK(hipCompressCreatePlan(&plan, NX, NY, 1, 0)); + + float *d_in, *d_out; + unsigned char* d_comp; + HIPCHECK(hipMalloc(&d_in, total * sizeof(float))); + HIPCHECK(hipMalloc(&d_out, total * sizeof(float))); + size_t comp_sz; HIPCHECK(hipCompressMaxOutputSize(plan, &comp_sz)); + HIPCHECK(hipMalloc(&d_comp, comp_sz)); + + initSin2D<<<(total+255)/256, 256>>>(d_in, NX, NY, 20.f, 20.f); + HIPCHECK(hipDeviceSynchronize()); + + long len; float cr; + HIPCHECK(hipCompress(scale, nullptr, d_in, d_comp, plan, 0)); + HIPCHECK(hipCompressSynchronize(plan, &len, &cr)); + printf("2D %dx%d: CR=%.1f, %ld bytes\n", NX, NY, cr, len); + + for (int i = 0; i < 5; i++) { + HIPCHECK(hipDecompress(d_comp, d_out, plan, 0)); + HIPCHECK(hipDeviceSynchronize()); + } + + printf("2D decompress: %d iterations\n", iters); + for (int i = 0; i < iters; i++) { + HIPCHECK(hipDecompress(d_comp, d_out, plan, 0)); + HIPCHECK(hipDeviceSynchronize()); + } + + hipFree(d_in); hipFree(d_out); hipFree(d_comp); + hipCompressDestroyPlan(plan); + } + + printf("Done.\n"); + return 0; +} diff --git a/tests/test_compress_2d_hip.cpp b/tests/test_compress_2d_hip.cpp new file mode 100644 index 0000000..0fd39bd --- /dev/null +++ b/tests/test_compress_2d_hip.cpp @@ -0,0 +1,354 @@ +// 2D compression round-trip test. +// Tests hipCompress API with nz=1 (2D mode): +// 1. Plan lifecycle with nz=1 +// 2. Compress/decompress round-trip on a 2D sinusoidal field +// 3. Copy-to/from wavelet layout with ez=1 +// 4. Various 2D grid sizes (including non-32-aligned extraction) + +#include +#include +#include +#include +#include +#include +#include "hipCompress.h" + +#define HIPCHECK(cmd) do { \ + hipError_t e = (cmd); \ + if (e != hipSuccess) { \ + fprintf(stderr, "HIP error %s at %s:%d\n", hipGetErrorString(e), __FILE__, __LINE__); \ + exit(1); \ + } \ +} while(0) + +__global__ void initSin2DKernel(float* data, int nx, int ny, float kx, float ky) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int total = nx * ny; + if (idx >= total) return; + int iy = idx / nx; + int ix = idx - iy * nx; + float x = (float)ix / (float)nx; + float y = (float)iy / (float)ny; + data[idx] = sinf(kx * x) * sinf(ky * y); +} + +static float hostRMS(const float* data, int n) +{ + double sum = 0.0; + for (int i = 0; i < n; ++i) sum += (double)data[i] * (double)data[i]; + return (float)sqrt(sum / n); +} + +static float maxAbsError(const float* a, const float* b, int n) +{ + float mx = 0.0f; + for (int i = 0; i < n; ++i) { + float d = fabsf(a[i] - b[i]); + if (d > mx) mx = d; + } + return mx; +} + +static hipError_t deviceRMS2D(const float* d_input, int nx, int ny, + double* d_rms, hipCompressPlan* plan, + hipStream_t s = 0) +{ + return hipCopyToWaveletLayout( + d_input, nx, nx, + 0, 0, 0, nx, ny, 1, + nullptr, d_rms, plan, s); +} + +static hipError_t compressWithAutoRMS2D(float scale, const float* d_input, + unsigned char* d_output, long* len, + float* cr, hipCompressPlan* plan, + hipStream_t s = 0) +{ + int nx = plan->nx, ny = plan->ny; + hipError_t err = deviceRMS2D(d_input, nx, ny, plan->d_rms, plan, s); + if (err != hipSuccess) return err; + err = hipCompress(scale, plan->d_rms, d_input, d_output, plan, s); + if (err != hipSuccess) return err; + return hipCompressSynchronize(plan, len, cr); +} + +static bool test_plan_lifecycle_2d() +{ + printf("Test 1: 2D plan lifecycle\n"); + hipCompressPlan* plan = nullptr; + + hipError_t err = hipCompressCreatePlan(&plan, 128, 128, 1, 0); + if (err != hipSuccess || !plan) { + printf(" FAIL: create 128x128 2D plan: %s\n", hipGetErrorString(err)); + return false; + } + if (!plan->is_2d) { printf(" FAIL: is_2d not set\n"); hipCompressDestroyPlan(plan); return false; } + if (plan->num_blocks != 4 * 4) { + printf(" FAIL: num_blocks=%d expected=%d\n", plan->num_blocks, 16); + hipCompressDestroyPlan(plan); return false; + } + printf(" 128x128: num_blocks=%d: PASS\n", plan->num_blocks); + + size_t max_sz = 0; + HIPCHECK(hipCompressMaxOutputSize(plan, &max_sz)); + printf(" MaxOutputSize=%zu\n", max_sz); + hipCompressDestroyPlan(plan); + + // Reject nx not multiple of 32 + plan = nullptr; + err = hipCompressCreatePlan(&plan, 100, 128, 1, 0); + if (err == hipSuccess) { printf(" FAIL: should reject nx=100\n"); hipCompressDestroyPlan(plan); return false; } + printf(" reject non-32-multiple: PASS\n"); + + // 256x256 + plan = nullptr; + err = hipCompressCreatePlan(&plan, 256, 256, 1, 0); + if (err != hipSuccess) { printf(" FAIL: create 256x256\n"); return false; } + if (plan->num_blocks != 8 * 8) { printf(" FAIL: num_blocks\n"); hipCompressDestroyPlan(plan); return false; } + printf(" 256x256: num_blocks=%d: PASS\n", plan->num_blocks); + hipCompressDestroyPlan(plan); + + return true; +} + +static bool test_round_trip_2d() +{ + printf("Test 2: 2D compress/decompress round-trip\n"); + const int NX = 128, NY = 128, total = NX * NY; + const float scale = 5e-2f; + + hipCompressPlan* plan = nullptr; + HIPCHECK(hipCompressCreatePlan(&plan, NX, NY, 1, 0)); + + float* d_input = nullptr; + float* d_output = nullptr; + unsigned char* d_compressed = nullptr; + HIPCHECK(hipMalloc(&d_input, total * sizeof(float))); + HIPCHECK(hipMalloc(&d_output, total * sizeof(float))); + + size_t comp_size = 0; + HIPCHECK(hipCompressMaxOutputSize(plan, &comp_size)); + HIPCHECK(hipMalloc(&d_compressed, comp_size)); + + int threads = 256, blocks = (total + threads - 1) / threads; + initSin2DKernel<<>>(d_input, NX, NY, 20.0f, 20.0f); + HIPCHECK(hipDeviceSynchronize()); + + long compressed_length = 0; + float cr = 0; + HIPCHECK(compressWithAutoRMS2D(scale, d_input, d_compressed, &compressed_length, &cr, plan)); + if (cr <= 1.0f || compressed_length <= 0) { + printf(" FAIL: cr=%.2f compressed_length=%ld\n", cr, compressed_length); + return false; + } + printf(" compress: CR=%.2f, %ld bytes\n", cr, compressed_length); + + HIPCHECK(hipDecompress(d_compressed, d_output, plan, 0)); + + std::vector h_in(total), h_out(total); + HIPCHECK(hipMemcpy(h_in.data(), d_input, total * sizeof(float), hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(h_out.data(), d_output, total * sizeof(float), hipMemcpyDeviceToHost)); + + float rms = hostRMS(h_in.data(), total); + float max_err = maxAbsError(h_in.data(), h_out.data(), total); + float rel_err = max_err / rms; + printf(" decompress: max_err=%.6e, rms=%.6e, rel_max_err=%.6e\n", max_err, rms, rel_err); + + bool pass = (max_err < rms) && (cr > 1.0f); + printf(" round-trip: %s\n", pass ? "PASS" : "FAIL"); + + hipFree(d_input); hipFree(d_output); hipFree(d_compressed); + hipCompressDestroyPlan(plan); + return pass; +} + +static bool test_round_trip_2d_large() +{ + printf("Test 3: 2D round-trip 512x512\n"); + const int NX = 512, NY = 512, total = NX * NY; + const float scale = 5e-2f; + + hipCompressPlan* plan = nullptr; + HIPCHECK(hipCompressCreatePlan(&plan, NX, NY, 1, 0)); + + float* d_input = nullptr; + float* d_output = nullptr; + unsigned char* d_compressed = nullptr; + HIPCHECK(hipMalloc(&d_input, total * sizeof(float))); + HIPCHECK(hipMalloc(&d_output, total * sizeof(float))); + + size_t comp_size = 0; + HIPCHECK(hipCompressMaxOutputSize(plan, &comp_size)); + HIPCHECK(hipMalloc(&d_compressed, comp_size)); + + int threads = 256, blocks = (total + threads - 1) / threads; + initSin2DKernel<<>>(d_input, NX, NY, 40.0f, 40.0f); + HIPCHECK(hipDeviceSynchronize()); + + long compressed_length = 0; + float cr = 0; + HIPCHECK(compressWithAutoRMS2D(scale, d_input, d_compressed, &compressed_length, &cr, plan)); + printf(" compress: CR=%.2f, %ld bytes\n", cr, compressed_length); + + HIPCHECK(hipDecompress(d_compressed, d_output, plan, 0)); + + std::vector h_in(total), h_out(total); + HIPCHECK(hipMemcpy(h_in.data(), d_input, total * sizeof(float), hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(h_out.data(), d_output, total * sizeof(float), hipMemcpyDeviceToHost)); + + float rms = hostRMS(h_in.data(), total); + float max_err = maxAbsError(h_in.data(), h_out.data(), total); + printf(" decompress: max_err=%.6e, rms=%.6e, rel_max_err=%.6e\n", max_err, rms, max_err / rms); + + bool pass = (max_err < rms) && (cr > 1.0f); + printf(" round-trip: %s\n", pass ? "PASS" : "FAIL"); + + hipFree(d_input); hipFree(d_output); hipFree(d_compressed); + hipCompressDestroyPlan(plan); + return pass; +} + +static bool test_copy_to_from_2d() +{ + printf("Test 4: 2D copy-to/copy-from wavelet layout\n"); + const int EX = 100, EY = 80; + const int WNX = ((EX + 31) / 32) * 32; // 128 + const int WNY = ((EY + 31) / 32) * 32; // 96 + const int src_total = EX * EY; + const int wav_total = WNX * WNY; + + hipCompressPlan* plan = nullptr; + HIPCHECK(hipCompressCreatePlan(&plan, WNX, WNY, 1, 0)); + + float* d_src = nullptr; + float* d_wav = nullptr; + float* d_dst = nullptr; + double* d_rms = nullptr; + HIPCHECK(hipMalloc(&d_src, EX * EY * sizeof(float))); + HIPCHECK(hipMalloc(&d_wav, wav_total * sizeof(float))); + HIPCHECK(hipMalloc(&d_dst, EX * EY * sizeof(float))); + HIPCHECK(hipMalloc(&d_rms, sizeof(double))); + + int threads = 256, blocks = (src_total + threads - 1) / threads; + initSin2DKernel<<>>(d_src, EX, EY, 10.0f, 10.0f); + HIPCHECK(hipDeviceSynchronize()); + + HIPCHECK(hipCopyToWaveletLayout( + d_src, EX, EX, + 0, 0, 0, EX, EY, 1, + d_wav, d_rms, plan, 0)); + + HIPCHECK(hipCopyFromWaveletLayout( + d_wav, d_dst, EX, EX, + 0, 0, 0, EX, EY, 1, + plan, 0)); + HIPCHECK(hipDeviceSynchronize()); + + std::vector h_src(src_total), h_dst(src_total); + HIPCHECK(hipMemcpy(h_src.data(), d_src, src_total * sizeof(float), hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(h_dst.data(), d_dst, src_total * sizeof(float), hipMemcpyDeviceToHost)); + + float max_err = maxAbsError(h_src.data(), h_dst.data(), src_total); + printf(" copy round-trip max_err=%.6e\n", max_err); + + double h_rms; + HIPCHECK(hipMemcpy(&h_rms, d_rms, sizeof(double), hipMemcpyDeviceToHost)); + float cpu_rms = hostRMS(h_src.data(), src_total); + float rms_err = fabsf((float)h_rms - cpu_rms); + printf(" GPU RMS=%.6e, CPU RMS=%.6e, diff=%.6e\n", (float)h_rms, cpu_rms, rms_err); + + bool pass = (max_err < 1e-6f) && (rms_err < 1e-4f); + printf(" copy-to/from 2D: %s\n", pass ? "PASS" : "FAIL"); + + hipFree(d_src); hipFree(d_wav); hipFree(d_dst); hipFree(d_rms); + hipCompressDestroyPlan(plan); + return pass; +} + +static bool test_round_trip_with_copy_2d() +{ + printf("Test 5: 2D full pipeline (copy-to → compress → decompress → copy-from)\n"); + const int EX = 100, EY = 80; + const int WNX = ((EX + 31) / 32) * 32; + const int WNY = ((EY + 31) / 32) * 32; + const float scale = 5e-2f; + + hipCompressPlan* plan = nullptr; + HIPCHECK(hipCompressCreatePlan(&plan, WNX, WNY, 1, 0)); + + float* d_src = nullptr; + float* d_wav = nullptr; + float* d_dec = nullptr; + float* d_dst = nullptr; + unsigned char* d_comp = nullptr; + HIPCHECK(hipMalloc(&d_src, EX * EY * sizeof(float))); + HIPCHECK(hipMalloc(&d_wav, WNX * WNY * sizeof(float))); + HIPCHECK(hipMalloc(&d_dec, WNX * WNY * sizeof(float))); + HIPCHECK(hipMalloc(&d_dst, EX * EY * sizeof(float))); + size_t comp_size = 0; + HIPCHECK(hipCompressMaxOutputSize(plan, &comp_size)); + HIPCHECK(hipMalloc(&d_comp, comp_size)); + + int threads = 256, blocks = (EX * EY + threads - 1) / threads; + initSin2DKernel<<>>(d_src, EX, EY, 15.0f, 15.0f); + HIPCHECK(hipDeviceSynchronize()); + + // Copy to wavelet layout + RMS + HIPCHECK(hipCopyToWaveletLayout( + d_src, EX, EX, 0, 0, 0, EX, EY, 1, + d_wav, plan->d_rms, plan, 0)); + + // Compress + long compressed_length = 0; + float cr = 0; + hipError_t err = hipCompress(scale, plan->d_rms, d_wav, d_comp, plan, 0); + HIPCHECK(err); + HIPCHECK(hipCompressSynchronize(plan, &compressed_length, &cr)); + printf(" compress: CR=%.2f, %ld bytes\n", cr, compressed_length); + + // Decompress + HIPCHECK(hipDecompress(d_comp, d_dec, plan, 0)); + + // Copy from wavelet layout + HIPCHECK(hipCopyFromWaveletLayout( + d_dec, d_dst, EX, EX, 0, 0, 0, EX, EY, 1, plan, 0)); + HIPCHECK(hipDeviceSynchronize()); + + std::vector h_src(EX * EY), h_dst(EX * EY); + HIPCHECK(hipMemcpy(h_src.data(), d_src, EX * EY * sizeof(float), hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(h_dst.data(), d_dst, EX * EY * sizeof(float), hipMemcpyDeviceToHost)); + + float rms = hostRMS(h_src.data(), EX * EY); + float max_err = maxAbsError(h_src.data(), h_dst.data(), EX * EY); + printf(" max_err=%.6e, rms=%.6e, rel=%.6e\n", max_err, rms, max_err / rms); + + bool pass = (max_err < rms) && (cr > 1.0f); + printf(" full pipeline 2D: %s\n", pass ? "PASS" : "FAIL"); + + hipFree(d_src); hipFree(d_wav); hipFree(d_dec); hipFree(d_dst); hipFree(d_comp); + hipCompressDestroyPlan(plan); + return pass; +} + +int main() +{ + printf("=== 2D Compression Tests ===\n\n"); + fflush(stdout); + + int passed = 0, total = 0; + + total++; if (test_plan_lifecycle_2d()) passed++; + printf("\n"); + total++; if (test_round_trip_2d()) passed++; + printf("\n"); + total++; if (test_round_trip_2d_large()) passed++; + printf("\n"); + total++; if (test_copy_to_from_2d()) passed++; + printf("\n"); + total++; if (test_round_trip_with_copy_2d()) passed++; + printf("\n"); + + printf("=== Results: %d/%d passed ===\n", passed, total); + return (passed == total) ? 0 : 1; +}