From 51b1ced1db89207fedded6057d01b5842964de5f Mon Sep 17 00:00:00 2001
From: "Randy J. Spaulding" <rspauldi@amd.com>
Date: Sun, 28 Jul 2024 06:05:45 +0000
Subject: [PATCH 01/10] checkpoint

---
 src/CMakeLists.txt                          |   5 +
 src/include/miopen/pooling/solvers.hpp      |  61 ++++
 src/kernels/MIOpenPoolingBwd.cl             | 322 ++++++++++++++++++++
 src/kernels/MIOpenPoolingBwdND.cl           | 247 +++++++++++++++
 src/kernels/MIOpenPoolingForwardNaive.cl    |   4 +-
 src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp | 245 +++++++++++++++
 src/ocl/pooling_ocl.cpp                     |   5 +
 src/solver.cpp                              |   7 +
 src/solver/pooling/backwardCk2d.cpp         | 308 +++++++++++++++++++
 src/solver/pooling/backwardCkNd.cpp         | 273 +++++++++++++++++
 src/solver/pooling/forwardCk2d.cpp          | 267 ++++++++++++++++
 src/solver/pooling/forwardCkNd.cpp          | 264 ++++++++++++++++
 src/solver/pooling/forwardNaive.cpp         | 143 ++++++---
 test/gtest/poolingFwdNdNaive.cpp            | 217 +++++++++++++
 test/pooling2d.hpp                          |  32 +-
 test/pooling3d.cpp                          |  26 +-
 test/pooling3d.hpp                          |  62 ++++
 17 files changed, 2405 insertions(+), 83 deletions(-)
 create mode 100644 src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp
 create mode 100644 src/solver/pooling/backwardCk2d.cpp
 create mode 100644 src/solver/pooling/backwardCkNd.cpp
 create mode 100644 src/solver/pooling/forwardCk2d.cpp
 create mode 100644 src/solver/pooling/forwardCkNd.cpp
 create mode 100644 test/gtest/poolingFwdNdNaive.cpp
 create mode 100644 test/pooling3d.hpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 14e7c954b1..150e5cb76a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -289,8 +289,12 @@ set( MIOpen_Source
     solver/pooling/forward2d.cpp
     solver/pooling/forwardNaive.cpp
     solver/pooling/forwardNd.cpp
+    solver/pooling/forwardCk2d.cpp
+    solver/pooling/forwardCkNd.cpp
     solver/pooling/backward2d.cpp
     solver/pooling/backwardNd.cpp
+    solver/pooling/backwardCk2d.cpp
+    solver/pooling/backwardCkNd.cpp
     solver/reduce/forward_argmax.cpp
     solver/reduce/forward_argmin.cpp
     solver/reduce/forward_max.cpp
@@ -495,6 +499,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/MIOpenPoolingBwd.cl
         kernels/MIOpenPoolingBwdND.cl
         kernels/MIOpenPoolingForwardNaive.cl
+        kernels/MIOpenPoolingFwdNDNhwcNaive.cpp
         kernels/MIOpenPoolingND.cl
         kernels/MIOpenConv1x1S.cl
         kernels/MIOpenConv1x1J1.cl
diff --git a/src/include/miopen/pooling/solvers.hpp b/src/include/miopen/pooling/solvers.hpp
index 0d0e35755a..ab86a52aae 100644
--- a/src/include/miopen/pooling/solvers.hpp
+++ b/src/include/miopen/pooling/solvers.hpp
@@ -66,6 +66,30 @@ struct PoolingForwardNd final : PoolingSolver
                                  const miopen::pooling::ProblemDescription& problem) const override;
 };
 
+struct PoolingForwardCk2d final : PoolingSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<PoolingForwardCk2d>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::pooling::ProblemDescription& problem) const override;
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::pooling::ProblemDescription& problem) const override;
+    std::size_t GetWorkspaceSize(const ExecutionContext& context,
+                                 const miopen::pooling::ProblemDescription& problem) const override;
+};
+
+struct PoolingForwardCkNd final : PoolingSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<PoolingForwardCkNd>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::pooling::ProblemDescription& problem) const override;
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::pooling::ProblemDescription& problem) const override;
+    std::size_t GetWorkspaceSize(const ExecutionContext& context,
+                                 const miopen::pooling::ProblemDescription& problem) const override;
+};
+
 struct PoolingForwardNaive final : PoolingSolver
 {
     const std::string& SolverDbId() const override { return GetSolverDbId<PoolingForwardNaive>(); }
@@ -79,6 +103,19 @@ struct PoolingForwardNaive final : PoolingSolver
                                  const miopen::pooling::ProblemDescription& problem) const override;
 };
 
+struct PoolingForwardNdNhwcNaive final : PoolingSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<PoolingForwardNdNhwcNaive>(); }
+    bool IsDynamic() const override { return true; }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::pooling::ProblemDescription& problem) const override;
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::pooling::ProblemDescription& problem) const override;
+    std::size_t GetWorkspaceSize(const ExecutionContext& context,
+                                 const miopen::pooling::ProblemDescription& problem) const override;
+};
+
 template <class Inner>
 struct PoolingFwdNCHWTransposingSolver : TransposingSolver<PoolingFwdNCHWTransposingSolver<Inner>,
                                                            PoolingSolver,
@@ -145,6 +182,18 @@ struct PoolingBackward2d final : PoolingSolver
                                  const miopen::pooling::ProblemDescription& problem) const override;
 };
 
+struct PoolingBackwardCk2d final : PoolingSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<PoolingBackwardCk2d>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::pooling::ProblemDescription& problem) const override;
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::pooling::ProblemDescription& problem) const override;
+    std::size_t GetWorkspaceSize(const ExecutionContext& context,
+                                 const miopen::pooling::ProblemDescription& problem) const override;
+};
+
 struct PoolingBackwardNd final : PoolingSolver
 {
     const std::string& SolverDbId() const override { return GetSolverDbId<PoolingBackwardNd>(); }
@@ -157,6 +206,18 @@ struct PoolingBackwardNd final : PoolingSolver
                                  const miopen::pooling::ProblemDescription& problem) const override;
 };
 
+struct PoolingBackwardCkNd final : PoolingSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<PoolingBackwardCkNd>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::pooling::ProblemDescription& problem) const override;
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::pooling::ProblemDescription& problem) const override;
+    std::size_t GetWorkspaceSize(const ExecutionContext& context,
+                                 const miopen::pooling::ProblemDescription& problem) const override;
+};
+
 template <class Inner>
 struct PoolingBwdNCHWTransposingSolver : TransposingSolver<PoolingBwdNCHWTransposingSolver<Inner>,
                                                            PoolingSolver,
diff --git a/src/kernels/MIOpenPoolingBwd.cl b/src/kernels/MIOpenPoolingBwd.cl
index 6c88bebadf..062c5ed337 100644
--- a/src/kernels/MIOpenPoolingBwd.cl
+++ b/src/kernels/MIOpenPoolingBwd.cl
@@ -218,6 +218,168 @@ mloPoolingAveBwd(const __global _FLOAT* top_diff,
         }
     }
 }
+
+__attribute__((reqd_work_group_size(MLO_POOLBWD_GROUP_SZ0,
+                                    MLO_POOLBWD_GROUP_SZ1,
+                                    MLO_POOLBWD_GROUP_SZ2))) __kernel void
+mloPoolingAveBwdNhwc(const __global _FLOAT* top_diff,
+                 __global _FLOAT* bot_diff,
+                 int mlo_pad1,
+                 int mlo_pad0,
+                 int mlo_n_outputs,
+                 int mlo_bot_height,
+                 int mlo_bot_width,
+                 int mlo_top_height,
+                 int mlo_top_width,
+                 int mlo_botdf_batch_str,
+                 int mlo_botdf_channel_str,
+                 int mlo_botdf_str,
+                 int mlo_topdf_batch_str,
+                 int mlo_topdf_channel_str,
+                 int mlo_topdf_str)
+{
+    __local _FLOAT lcl_top_diff[MLO_POOLBWD_LCL_DATA_WIDTH * MLO_POOLBWD_LCL_DATA_HEIGHT];
+
+    int x       = get_group_id(0) * MLO_POOLBWD_GROUP_SZ0 * MLO_POOLBWD_N_HORIZ_OUT_PIX;
+    int y       = get_group_id(1) * MLO_POOLBWD_GROUP_SZ1 * MLO_POOLBWD_N_VERT_OUT_PIX;
+    int lcl_id0 = get_local_id(0);
+    int lcl_id1 = get_local_id(1);
+    //		int lcl_id = (lcl_id1 << MLO_POOLBWD_GROUP_LG2SZ1) + lcl_id0;
+    int ob = get_global_id(2); // outputs * batch_sz
+    int b  = ob / mlo_n_outputs;
+    int o  = ob - b * mlo_n_outputs;
+
+    int top_x   = (x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0) < 0
+                      ? 0
+                      : (x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0) / MLO_POOLING_STRIDE0 + 1;
+    int top_y   = (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) < 0
+                      ? 0
+                      : (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) / MLO_POOLING_STRIDE1 + 1;
+    int top_off = b * mlo_topdf_batch_str + o * mlo_topdf_channel_str;
+
+    _FLOAT_ACCUM res[MLO_POOLBWD_N_VERT_OUT_PIX][MLO_POOLBWD_N_HORIZ_OUT_PIX];
+    for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++)
+    {
+        for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++)
+        {
+            res[k][l] = (_FLOAT_ACCUM)0;
+        }
+    }
+
+    // load tile
+    for(int tj = lcl_id1; tj < MLO_POOLBWD_LCL_DATA_HEIGHT; tj += MLO_POOLBWD_GROUP_SZ1)
+    {
+        int top_y_act = top_y + tj;
+        int top_y_off = top_y_act * mlo_topdf_str;
+
+        int lcl_off_v = tj * MLO_POOLBWD_LCL_DATA_WIDTH;
+
+        bool invisibleY = (top_y_act >= mlo_top_height);
+
+        for(int ti = lcl_id0; ti < MLO_POOLBWD_LCL_DATA_WIDTH; ti += MLO_POOLBWD_GROUP_SZ0)
+        {
+
+            int top_x_act = top_x + ti;
+
+            bool invisibleX = (top_x_act >= mlo_top_width);
+
+            int top_diff_off = (invisibleX || invisibleY) ? 0 : top_off + top_y_off + top_x_act;
+
+            _FLOAT top_val = top_diff[top_diff_off];
+
+            top_val = (invisibleX || invisibleY) ? 0 : top_val;
+
+            lcl_top_diff[lcl_off_v + ti] = top_val;
+#if 0
+				if (lcl_id1==0&&o==0&&b==0)
+				{
+				  printf("K:in: %d %d %d   %f\n", top_off + top_y_off + top_x_act, top_y_act, top_x_act, top_val);
+				}
+#endif
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int bot_y = (y + lcl_id1 * MLO_POOLBWD_N_VERT_OUT_PIX);
+    int bot_x = (x + lcl_id0 * MLO_POOLBWD_N_HORIZ_OUT_PIX);
+
+    for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++)
+    {
+
+        int h          = bot_y + k + mlo_pad1;
+        int top_hstart = (h < MLO_POOLING_KERNEL_SZ1)
+                             ? 0
+                             : (h - MLO_POOLING_KERNEL_SZ1) / MLO_POOLING_STRIDE1 + 1;
+        int top_hend   = min(h / MLO_POOLING_STRIDE1 + 1, mlo_top_height);
+
+        for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++)
+        {
+
+            int w          = bot_x + l + mlo_pad0;
+            int top_wstart = (w < MLO_POOLING_KERNEL_SZ0)
+                                 ? 0
+                                 : (w - MLO_POOLING_KERNEL_SZ0) / MLO_POOLING_STRIDE0 + 1;
+            int top_wend   = min(w / MLO_POOLING_STRIDE0 + 1, mlo_top_width);
+
+            for(int top_h = top_hstart; top_h < top_hend; ++top_h)
+            {
+                int hstart = top_h * MLO_POOLING_STRIDE1 - mlo_pad1;
+                int hend   = min(hstart + MLO_POOLING_KERNEL_SZ1, mlo_bot_height);
+                hstart     = max(hstart, 0);
+
+                for(int top_w = top_wstart; top_w < top_wend; ++top_w)
+                {
+                    // figure out the pooling size
+                    int wstart = top_w * MLO_POOLING_STRIDE0 - mlo_pad0;
+                    int wend   = min(wstart + MLO_POOLING_KERNEL_SZ0, mlo_bot_width);
+                    wstart     = max(wstart, 0);
+                    int pool_size =
+#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE
+                        MLO_POOLING_KERNEL_SZ0 * MLO_POOLING_KERNEL_SZ1;
+                    (void)wend;
+                    (void)hend;
+#else
+                        (hend - hstart) * (wend - wstart);
+#endif
+                    pool_size     = (pool_size == 0) ? 1 : pool_size;
+                    int lcl_top_h = top_h - top_y;
+                    int lcl_top_w = top_w - top_x;
+                    _FLOAT_ACCUM add_val =
+                        CVT_FLOAT2ACCUM(
+                            lcl_top_diff[lcl_top_h * MLO_POOLBWD_LCL_DATA_WIDTH + lcl_top_w]) /
+                        CVT_INTEGRAL2ACCUM(pool_size);
+                    res[k][l] += add_val;
+#if 0
+				if (bot_x+l==6&&bot_y+k==0&&o==3&&b==0)
+				{
+				  printf("K:com: %d %d %d %d %d %d   %10.8f %10.8f %10.8f %d\n", k,l,top_h, top_w, lcl_top_h, lcl_top_w, res[k][l], add_val, lcl_top_diff[lcl_top_h *  MLO_POOLBWD_LCL_DATA_WIDTH + lcl_top_w], pool_size);
+				}
+#endif
+                }
+            }
+        }
+    }
+
+    int bot_off =
+        b * mlo_botdf_batch_str + o * mlo_botdf_channel_str + bot_y * mlo_botdf_str + bot_x;
+    for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++)
+    {
+        for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++)
+        {
+            if(bot_y + k < mlo_bot_height && bot_x + l < mlo_bot_width)
+            {
+                bot_diff[bot_off + k * mlo_botdf_str + l] = CVT_ACCUM2FLOAT(res[k][l]);
+#if 0
+					if (lcl_id0==0&&lcl_id1==0&&o==0&&b==0)
+					{
+						printf("K:out: %d %d %d  %f\n", bot_off + k * mlo_botdf_str +l, k, l, bot_diff[bot_off + k * mlo_botdf_str +l]);
+					}
+#endif
+            }
+        }
+    }
+}
 #endif // AVERAGE_OPS
 
 #if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX
@@ -379,4 +541,164 @@ mloPoolingMaxBwd(const __global _FLOAT* top_df,
         }
     }
 }
+
+__attribute__((reqd_work_group_size(MLO_POOLBWD_GROUP_SZ0,
+                                    MLO_POOLBWD_GROUP_SZ1,
+                                    MLO_POOLBWD_GROUP_SZ2))) __kernel void
+mloPoolingMaxBwdNhwc(const __global _FLOAT* top_df,
+                 __global _FLOAT* bot_df,
+                 __global index_t* mask,
+                 int mlo_pad1,
+                 int mlo_pad0,
+                 int mlo_n_outputs,
+                 int mlo_bot_height,
+                 int mlo_bot_width,
+                 int mlo_top_height,
+                 int mlo_top_width,
+                 int mlo_botdf_batch_str,
+                 int mlo_botdf_channel_str,
+                 int mlo_botdf_str,
+                 int mlo_topdf_batch_str,
+                 int mlo_topdf_channel_str,
+                 int mlo_topdf_str)
+{
+    __local _FLOAT lcl_top_df[MLO_POOLBWD_LCL_DATA_WIDTH * MLO_POOLBWD_LCL_DATA_HEIGHT];
+    __local index_t lcl_mask[MLO_POOLBWD_LCL_DATA_WIDTH * MLO_POOLBWD_LCL_DATA_HEIGHT];
+
+    int gid0    = get_group_id(0);
+    int gid1    = get_group_id(1);
+    int x       = gid0 * MLO_POOLBWD_GROUP_SZ0 * MLO_POOLBWD_N_HORIZ_OUT_PIX;
+    int y       = gid1 * MLO_POOLBWD_GROUP_SZ1 * MLO_POOLBWD_N_VERT_OUT_PIX;
+    int lcl_id0 = get_local_id(0);
+    int lcl_id1 = get_local_id(1);
+    int ob      = get_global_id(2); // outputs * batch_sz
+    int b       = ob / mlo_n_outputs;
+    int o       = ob - b * mlo_n_outputs;
+
+    int top_x      = (x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0) < 0
+                         ? 0
+                         : (x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0) / MLO_POOLING_STRIDE0 + 1;
+    int top_y      = (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) < 0
+                         ? 0
+                         : (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) / MLO_POOLING_STRIDE1 + 1;
+    int top_df_off = b * mlo_topdf_batch_str + o * mlo_topdf_channel_str;
+
+    _FLOAT res[MLO_POOLBWD_N_VERT_OUT_PIX][MLO_POOLBWD_N_HORIZ_OUT_PIX];
+    _FLOAT top_df_val;
+    index_t mask_val;
+    // load tiles
+    // top df and top
+    for(int tj = lcl_id1; tj < MLO_POOLBWD_LCL_DATA_HEIGHT; tj += MLO_POOLBWD_GROUP_SZ1)
+    {
+        int top_y_act    = top_y + tj;
+        int top_df_y_off = top_y_act * mlo_topdf_str;
+
+        int lcl_off_v = tj * MLO_POOLBWD_LCL_DATA_WIDTH;
+
+        bool visibleY = (top_y_act < mlo_top_height);
+
+        for(int ti = lcl_id0; ti < MLO_POOLBWD_LCL_DATA_WIDTH; ti += MLO_POOLBWD_GROUP_SZ0)
+        {
+            mask_val      = MLO_POOLING_INDEX_MAX;
+            int top_x_act = top_x + ti;
+            int lcl_idx   = lcl_off_v + ti;
+
+            bool visible = visibleY && (top_x_act < mlo_top_width);
+            if(visible)
+            {
+                int idx = top_df_off + top_df_y_off + top_x_act;
+
+                top_df_val = top_df[idx];
+                mask_val   = mask[idx];
+                // top_df_val *= visible;
+
+                lcl_top_df[lcl_idx] = top_df_val;
+            }
+            lcl_mask[lcl_idx] = mask_val;
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    _FLOAT add_val;
+    int bt_y = (y + lcl_id1 * MLO_POOLBWD_N_VERT_OUT_PIX);
+    int bt_x = (x + lcl_id0 * MLO_POOLBWD_N_HORIZ_OUT_PIX);
+
+    for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++)
+    {
+        int b_y = bt_y + k;
+
+        // top most top y that can be influenced by this bot y
+        int tt_y1 =
+            (b_y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1 + MLO_POOLING_STRIDE1) / MLO_POOLING_STRIDE1;
+        int tt_y = max(0, tt_y1);
+
+        for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++)
+        {
+            int b_x = bt_x + l;
+            // left most top x that can be influenced by this bot x
+            int lt_x1 = (b_x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0 + MLO_POOLING_STRIDE0) /
+                        MLO_POOLING_STRIDE0;
+            int lt_x = max(0, lt_x1);
+
+            // find and sum up all tops that have been influenced by particular bot
+            res[k][l] = 0;
+
+            for(int th = tt_y; th < tt_y + (MLO_POOLING_KERNEL_SZ1 + MLO_POOLING_STRIDE1 - 1) /
+                                               MLO_POOLING_STRIDE1;
+                ++th)
+            {
+                __attribute__((opencl_unroll_hint(2))) for(int tw = lt_x;
+                                                           tw < lt_x + (MLO_POOLING_KERNEL_SZ0 +
+                                                                        MLO_POOLING_STRIDE0 - 1) /
+                                                                           MLO_POOLING_STRIDE0;
+                                                           ++tw)
+                {
+                    int lcl_th = th - top_y;
+                    int lcl_tw = tw - top_x;
+#if USE_IMG_INDEX
+                    index_t img_idx = b_x + b_y * mlo_bot_width;
+#else
+                    int filter_x   = b_x - tw * MLO_POOLING_STRIDE0 + mlo_pad0;
+                    int filter_y   = b_y - th * MLO_POOLING_STRIDE1 + mlo_pad1;
+                    int filter_idx = filter_x + filter_y * MLO_POOLING_KERNEL_SZ0;
+#endif
+                    bool visible = (lcl_th < MLO_POOLBWD_LCL_DATA_HEIGHT) &&
+                                   (lcl_tw < MLO_POOLBWD_LCL_DATA_WIDTH);
+                    int lcl_idx = visible ? (lcl_th * MLO_POOLBWD_LCL_DATA_WIDTH + lcl_tw) : 0;
+
+                    bool match = visible &&
+#if USE_IMG_INDEX
+                                 (img_idx == lcl_mask[lcl_idx])
+#else
+                                 (filter_idx == lcl_mask[lcl_idx]) && (filter_x >= 0) &&
+                                 (filter_y >= 0)
+#endif
+                        ;
+
+                    //_FLOAT add_val = lcl_top_df[lcl_idx] * match;
+                    //_FLOAT add_val = match ? lcl_top_df[lcl_idx] : (_FLOAT)0;
+                    if(match)
+                    {
+                        add_val = lcl_top_df[lcl_idx];
+                        res[k][l] += add_val;
+                    }
+                }
+            }
+        }
+    }
+
+    int bot_df_off =
+        b * mlo_botdf_batch_str + o * mlo_botdf_channel_str + bt_y * mlo_botdf_str + bt_x;
+    for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++)
+    {
+        for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++)
+        {
+            if((bt_y + k) < mlo_bot_height && (bt_x + l) < mlo_bot_width)
+            {
+                bot_df[bot_df_off + k * mlo_botdf_str + l] = res[k][l];
+            }
+        }
+    }
+}
+
 #endif // MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX
diff --git a/src/kernels/MIOpenPoolingBwdND.cl b/src/kernels/MIOpenPoolingBwdND.cl
index 7daacd24ab..4e9b4da04e 100644
--- a/src/kernels/MIOpenPoolingBwdND.cl
+++ b/src/kernels/MIOpenPoolingBwdND.cl
@@ -147,6 +147,120 @@ mloPoolingNDMaxBwd(const __global _FLOAT* top_df,
     }
 }
 
+__attribute__((reqd_work_group_size(MLO_POOLING_GROUP_SZ0, 1, 1))) __kernel void
+mloPoolingNDMaxBwdNhwc(const __global _FLOAT* top_df,
+                   __global _FLOAT* bot_df,
+                   __global index_t* mask,
+                   const uint pad_d,
+                   const uint pad_h,
+                   const uint pad_w,
+                   const uint batch,
+                   const uint chal,
+                   const uint bot_d,
+                   const uint bot_h,
+                   const uint bot_w,
+                   const uint top_d,
+                   const uint top_h,
+                   const uint top_w,
+                   const uint bot_str_b,
+                   const uint bot_str_c,
+                   const uint bot_str_d,
+                   const uint bot_str_h,
+                   const uint top_str_b,
+                   const uint top_str_c,
+                   const uint top_str_d,
+                   const uint top_str_h,
+                   const uint total_work)
+{
+
+    int bot_blk_w = (bot_w + PIX_W_PER_WORK - 1) / PIX_W_PER_WORK;
+    int bot_blk_h = (bot_h + PIX_H_PER_WORK - 1) / PIX_H_PER_WORK;
+    int bot_blk_d = (bot_d + PIX_D_PER_WORK - 1) / PIX_D_PER_WORK;
+
+    bot_blk_w = max(bot_blk_w, 1);
+    bot_blk_h = max(bot_blk_h, 1);
+    bot_blk_d = max(bot_blk_d, 1);
+
+    for(uint gid = get_global_id(0); gid < total_work; gid += MAX_ACTIV_WORKITEM)
+    {
+        int b_id = gid / chal / bot_blk_w / bot_blk_h / bot_blk_d;
+        int c_id = (gid / bot_blk_w / bot_blk_h / bot_blk_d) % chal;
+
+        int bot_d_id = ((gid / bot_blk_w / bot_blk_h) % bot_blk_d) * PIX_D_PER_WORK;
+        int bot_h_id = ((gid / bot_blk_w) % bot_blk_h) * PIX_H_PER_WORK;
+        int bot_w_id = (gid % bot_blk_w) * PIX_W_PER_WORK;
+
+        int top_d_start =
+            bot_d_id + pad_d < KERNEL_SZ_D ? 0 : (bot_d_id + pad_d - KERNEL_SZ_D) / STRIDE_D + 1;
+        int top_h_start =
+            bot_h_id + pad_h < KERNEL_SZ_H ? 0 : (bot_h_id + pad_h - KERNEL_SZ_H) / STRIDE_H + 1;
+        int top_w_start =
+            bot_w_id + pad_w < KERNEL_SZ_W ? 0 : (bot_w_id + pad_w - KERNEL_SZ_W) / STRIDE_W + 1;
+
+        int top_d_end = (bot_d_id + PIX_D_PER_WORK - 1 + pad_d) / STRIDE_D + 1;
+        int top_h_end = (bot_h_id + PIX_H_PER_WORK - 1 + pad_h) / STRIDE_H + 1;
+        int top_w_end = (bot_w_id + PIX_W_PER_WORK - 1 + pad_w) / STRIDE_W + 1;
+
+        top_d_end = min(top_d_end, (int)top_d);
+        top_h_end = min(top_h_end, (int)top_h);
+        top_w_end = min(top_w_end, (int)top_w);
+
+        _FLOAT bot_data[PIX_D_PER_WORK][PIX_H_PER_WORK][PIX_W_PER_WORK] = {0};
+
+        for(int h = top_d_start; h < top_d_end; ++h)
+        {
+            for(int j = top_h_start; j < top_h_end; ++j)
+            {
+                for(int i = top_w_start; i < top_w_end; ++i)
+                {
+                    uint top_gbl_off =
+                        b_id * top_str_b + c_id * top_str_c + h * top_str_d + j * top_str_h + i;
+
+                    _FLOAT top_val   = b_id < batch ? top_df[top_gbl_off] : 0;
+                    index_t mask_idx = b_id < batch ? mask[top_gbl_off] : MLO_POOLING_INDEX_MAX;
+
+                    uint mask_d_id = mask_idx / bot_h / bot_w;
+                    uint mask_h_id = (mask_idx / bot_w) % bot_h;
+                    uint mask_w_id = mask_idx % bot_w;
+
+                    if(mask_d_id >= bot_d_id && mask_h_id >= bot_h_id && mask_w_id >= bot_w_id &&
+                       mask_d_id < bot_d_id + PIX_D_PER_WORK &&
+                       mask_h_id < bot_h_id + PIX_H_PER_WORK &&
+                       mask_w_id < bot_w_id + PIX_W_PER_WORK)
+                    {
+                        mask_d_id -= bot_d_id;
+                        mask_h_id -= bot_h_id;
+                        mask_w_id -= bot_w_id;
+
+                        bot_data[mask_d_id][mask_h_id][mask_w_id] += top_val;
+                    }
+                }
+            }
+        }
+
+        uint bot_off = b_id * bot_str_b + c_id * bot_str_c + bot_d_id * bot_str_d +
+                       bot_h_id * bot_str_h + bot_w_id;
+
+        for(uint m = 0; m < PIX_D_PER_WORK; m++)
+        {
+            for(uint k = 0; k < PIX_H_PER_WORK; k++)
+            {
+                for(uint l = 0; l < PIX_W_PER_WORK; l++)
+                {
+
+                    if(bot_d_id + m < bot_d && bot_h_id + k < bot_h && bot_w_id + l < bot_w &&
+                       b_id < batch)
+                    {
+                        uint bot_idx = bot_off + m * bot_str_d + k * bot_str_h + l;
+
+                        bot_df[bot_idx] = bot_data[m][k][l];
+                    }
+                }
+            }
+        }
+    }
+}
+
 #elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE || MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE
 
 __attribute__((reqd_work_group_size(MLO_POOLING_GROUP_SZ0, 1, 1))) __kernel void
@@ -281,4 +395,137 @@ mloPoolingNDAveBwd(const __global _FLOAT* top_df,
         }
     }
 }
+
+__attribute__((reqd_work_group_size(MLO_POOLING_GROUP_SZ0, 1, 1))) __kernel void
+mloPoolingNDAveBwdNhwc(const __global _FLOAT* top_df,
+                   __global _FLOAT* bot_df,
+                   const uint pad_d,
+                   const uint pad_h,
+                   const uint pad_w,
+                   const uint batch,
+                   const uint chal,
+                   const uint bot_d,
+                   const uint bot_h,
+                   const uint bot_w,
+                   const uint top_d,
+                   const uint top_h,
+                   const uint top_w,
+                   const uint bot_str_b,
+                   const uint bot_str_c,
+                   const uint bot_str_d,
+                   const uint bot_str_h,
+                   const uint top_str_b,
+                   const uint top_str_c,
+                   const uint top_str_d,
+                   const uint top_str_h,
+                   const uint total_work)
+{
+
+    int bot_blk_w = (bot_w + PIX_W_PER_WORK - 1) / PIX_W_PER_WORK;
+    int bot_blk_h = (bot_h + PIX_H_PER_WORK - 1) / PIX_H_PER_WORK;
+    int bot_blk_d = (bot_d + PIX_D_PER_WORK - 1) / PIX_D_PER_WORK;
+
+    bot_blk_w = max(bot_blk_w, 1);
+    bot_blk_h = max(bot_blk_h, 1);
+    bot_blk_d = max(bot_blk_d, 1);
+
+    for(uint gid = get_global_id(0); gid < total_work; gid += MAX_ACTIV_WORKITEM)
+    {
+        int b_id = gid / chal / bot_blk_w / bot_blk_h / bot_blk_d;
+        int c_id = (gid / bot_blk_w / bot_blk_h / bot_blk_d) % chal;
+
+        int bot_d_id = ((gid / bot_blk_w / bot_blk_h) % bot_blk_d) * PIX_D_PER_WORK;
+        int bot_h_id = ((gid / bot_blk_w) % bot_blk_h) * PIX_H_PER_WORK;
+        int bot_w_id = (gid % bot_blk_w) * PIX_W_PER_WORK;
+
+        int top_d_start =
+            bot_d_id + pad_d < KERNEL_SZ_D ? 0 : (bot_d_id + pad_d - KERNEL_SZ_D) / STRIDE_D + 1;
+        int top_h_start =
+            bot_h_id + pad_h < KERNEL_SZ_H ? 0 : (bot_h_id + pad_h - KERNEL_SZ_H) / STRIDE_H + 1;
+        int top_w_start =
+            bot_w_id + pad_w < KERNEL_SZ_W ? 0 : (bot_w_id + pad_w - KERNEL_SZ_W) / STRIDE_W + 1;
+
+        int top_d_end = (bot_d_id + PIX_D_PER_WORK - 1 + pad_d) / STRIDE_D + 1;
+        int top_h_end = (bot_h_id + PIX_H_PER_WORK - 1 + pad_h) / STRIDE_H + 1;
+        int top_w_end = (bot_w_id + PIX_W_PER_WORK - 1 + pad_w) / STRIDE_W + 1;
+
+        top_d_end = min(top_d_end, (int)top_d);
+        top_h_end = min(top_h_end, (int)top_h);
+        top_w_end = min(top_w_end, (int)top_w);
+
+        _FLOAT_ACCUM bot_data[PIX_D_PER_WORK][PIX_H_PER_WORK][PIX_W_PER_WORK] = {0};
+
+        for(int h = top_d_start; h < top_d_end; ++h)
+        {
+            int dstart = h * STRIDE_D - pad_d;
+            int dend   = min((dstart + KERNEL_SZ_D), (int)bot_d);
+            dstart     = max(dstart, 0);
+
+            for(int j = top_h_start; j < top_h_end; ++j)
+            {
+                int hstart = j * STRIDE_H - pad_h;
+                int hend   = min((hstart + KERNEL_SZ_H), (int)bot_h);
+                hstart     = max(hstart, 0);
+
+                for(int i = top_w_start; i < top_w_end; ++i)
+                {
+                    int wstart = i * STRIDE_W - pad_w;
+                    int wend   = min((wstart + KERNEL_SZ_W), (int)bot_w);
+                    wstart     = max(wstart, 0);
+
+                    uint pool_size =
+#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE
+                        KERNEL_SZ_W * KERNEL_SZ_H * KERNEL_SZ_D;
+#else
+                        (dend - dstart) * (hend - hstart) * (wend - wstart);
+#endif
+                    pool_size = (pool_size == 0) ? 1 : pool_size;
+
+                    uint top_gbl_off =
+                        b_id * top_str_b + c_id * top_str_c + h * top_str_d + j * top_str_h + i;
+                    _FLOAT_ACCUM add_val =
+                        b_id < batch ? CVT_FLOAT2ACCUM(top_df[top_gbl_off]) : CVT_FP32_2ACCUM(0.0f);
+                    add_val /= CVT_INTEGRAL2ACCUM(pool_size);
+
+                    for(int m = dstart; m < dend; ++m)
+                    {
+                        for(int k = hstart; k < hend; ++k)
+                        {
+                            for(int l = wstart; l < wend; ++l)
+                            {
+                                if(m >= bot_d_id && m < PIX_D_PER_WORK + bot_d_id &&
+                                   k >= bot_h_id && k < PIX_H_PER_WORK + bot_h_id &&
+                                   l >= bot_w_id && l < PIX_W_PER_WORK + bot_w_id && b_id < batch)
+                                {
+                                    bot_data[m - bot_d_id][k - bot_h_id][l - bot_w_id] += add_val;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        uint bot_off = b_id * bot_str_b + c_id * bot_str_c + bot_d_id * bot_str_d +
+                       bot_h_id * bot_str_h + bot_w_id;
+
+        for(uint m = 0; m < PIX_D_PER_WORK; m++)
+        {
+            for(uint k = 0; k < PIX_H_PER_WORK; k++)
+            {
+                for(uint l = 0; l < PIX_W_PER_WORK; l++)
+                {
+
+                    if(bot_d_id + m < bot_d && bot_h_id + k < bot_h && bot_w_id + l < bot_w &&
+                       b_id < batch)
+                    {
+                        uint bot_idx = bot_off + m * bot_str_d + k * bot_str_h + l;
+
+                        bot_df[bot_idx] = CVT_ACCUM2FLOAT(bot_data[m][k][l]);
+                    }
+                }
+            }
+        }
+    }
+}
 #endif
diff --git a/src/kernels/MIOpenPoolingForwardNaive.cl b/src/kernels/MIOpenPoolingForwardNaive.cl
index 20e0949967..f839193297 100644
--- a/src/kernels/MIOpenPoolingForwardNaive.cl
+++ b/src/kernels/MIOpenPoolingForwardNaive.cl
@@ -145,7 +145,7 @@ __kernel void mloPoolingForwardNaive(const __global _FLOAT* bot_ptr,
             uint h_save          = 0;
             uint w_save          = 0;
 #endif
-        for(uint d = dstart; d < dend; ++d)
+        for(size_t d = dstart; d < dend; ++d)
         {
             for(uint h = hstart; h < hend; ++h)
             {
@@ -153,7 +153,7 @@ __kernel void mloPoolingForwardNaive(const __global _FLOAT* bot_ptr,
                 {
                     const size_t bot_index = b * bot_n_stride             //
                                              + o * bot_c_stride           //
-                                             + (size_t)(d * bot_d_stride) //
+                                             + d * bot_d_stride //
                                              + (size_t)(h * bot_h_stride) //
                                              + (size_t)(w * bot_w_stride);
 #if AVERAGE_OPS
diff --git a/src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp
new file mode 100644
index 0000000000..bc17af8922
--- /dev/null
+++ b/src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp
@@ -0,0 +1,245 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+#include "pooling_functions.h"
+
+#include <algorithm>
+
+#if(MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE) || (MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE)
+#define AVERAGE_OPS 1
+#else
+#define AVERAGE_OPS 0
+#endif
+
+// Let's use extended-precision accumulator only in FP16 pooling and only for averaging.
+// For all other ops and datatypes, use native accumulator, i.e. treate FLOAT_ACCUM as FLOAT.
+#if !(AVERAGE_OPS && MIOPEN_USE_FP16)
+#define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 1
+#endif
+#include "float_types.h"
+
+#ifndef MLO_POOLING_IS2D_KERNEL
+#error "MLO_POOLING_IS2D_KERNEL must be defined"
+#endif
+
+#if AVERAGE_OPS
+#define ARG_UNUSED_FOR_AVERAGE __attribute__((__unused__))
+#else
+#define ARG_UNUSED_FOR_AVERAGE
+#endif
+
+#if MLO_POOLING_IS2D_KERNEL
+#define ARG_UNUSED_FOR_2D __attribute__((__unused__))
+#else
+#define ARG_UNUSED_FOR_2D
+#endif
+
+// Out N, D, H are encoded into the block indices x, y, z
+// Requires all lens, strides, pads to be in DHW[NC] order. The code is
+// cleaner and more performant this way.
+// No 2D-only optimization.
+template <typename TI, typename TO>
+__device__ void poolingFwdNDNhwcNaive(const TI* in_data,
+                                    TO* out_data,
+                                    ARG_UNUSED_FOR_AVERAGE index_t* mask_ptr,
+                                    ARG_UNUSED_FOR_AVERAGE int save_index,
+                                    ARG_UNUSED_FOR_AVERAGE int index_mode,
+                                    std::vector<uint32_t> filter_lens,
+                                    std::vector<uint32_t> filter_strides,
+                                    std::vector<uint32_t> filter_pads,
+                                    uint32_t all_n,
+                                    uint32_t all_c,
+                                    std::vector<uint32_t> lens,
+                                    std::vector<size_t> strides,
+                                    std::vector<uint32_t> out_lens,
+                                    std::vector<size_t> out_strides,
+                                    ARG_UNUSED_FOR_AVERAGE std::vector<size_t> mask_strides)
+{
+    constexpr uint32_t D_IDX = 0;
+    constexpr uint32_t H_IDX = 1;
+    constexpr uint32_t W_IDX = 2;
+    constexpr uint32_t N_IDX = 3;
+    constexpr uint32_t C_IDX = 4;
+
+    const uint32_t b = blockIdx.x;  // out N
+    if(!(b < all_n))
+        return;
+
+    const uint32_t k = blockIdx.y;  // out D
+    if(!(k < out_lens[D_IDX]))
+        return;
+
+    const uint32_t j = blockIdx.z;  // out H
+    if(!(j < out_lens[H_IDX]))
+        return;
+
+    for(uint32_t i = 0; i < out_lens[W_IDX]; ++i)  // out W
+    {
+        for(uint32_t o = 0; o < all_c ++o)  // out C
+        {
+            const auto int_dstart   = static_cast<int64_t>(k * filter_strides[D_IDX]) - static_cast<int64_t>(filter_pads[D_IDX]);
+            const auto int_hstart   = static_cast<int>(j * filter_strides[H_IDX]) - static_cast<int>(filter_pads[H_IDX]);
+            const auto int_wstart        = static_cast<int>(i * filter_strides[W_IDX]) - static_cast<int>(filter_pads[W_IDX]);
+            const auto dend           = static_cast<size_t>(min(int_dstart + static_cast<int64_t>(filter_lens[D_IDX]), static_cast<int64_t>(out_lens[D_IDX])));
+            const auto hend             = static_cast<uint32_t>(min(int_hstart + static_cast<int>(filter_lens[H_IDX]), static_cast<int>(out_lens[H_IDX])));
+            const auto wend             = static_cast<uint32_t>(min(int_wstart + static_cast<int>(filter_lens[W_IDX]), static_cast<int>(out_lens[W_IDX])));
+            const auto dstart         = static_cast<size_t>(max(int_dstart, 0));
+            const auto hstart         = static_cast<uint32_t>(max(int_hstart, 0));
+            const auto wstart           = static_cast<uint32_t>(max(int_wstart, 0));
+
+#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE
+        uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+        pool_size       = (pool_size == 0) ? 1 : pool_size;
+#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE
+        const uint32_t pool_size = filter_lens[D_IDX] * filter_lens[H_IDX] * filter_lens[W_IDX];
+#endif
+
+#if AVERAGE_OPS
+        _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0);
+#else // MAX
+            _FLOAT_ACCUM res     = (_FLOAT_ACCUM)(-MAX_VAL_ACCUM);
+            bool found           = false; // May remain false if bot contains only NaNs/-INFs.
+            uint32_t d_save          = 0;
+            uint32_t h_save          = 0;
+            uint32_t w_save          = 0;
+#endif
+        for(size_t d = dstart; d < dend; ++d)
+        {
+            for(uint32_t h = hstart; h < hend; ++h)
+            {
+                for(uint32_t w = wstart; w < wend; ++w)
+                {
+                    const size_t in_index = b * strides[N_IDX] +           //
+                                             o * strides[C_IDX] +           //
+                                             d * strides[D_IDX] + //
+                                             static_cast<size_t>(h * strides[H_IDX]) + //
+                                             static_cast<size_t>(w * strides[W_IDX]);
+#if AVERAGE_OPS
+                    res += in_data[in_index];
+#else // MAX
+                        if(static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index] > res))
+                        {
+                            res = in_data[in_index];
+                            if(save_index)
+                            {
+                                found  = true;
+                                d_save = d;
+                                h_save = h;
+                                w_save = w;
+                            }
+                        }
+#endif
+                }
+            }
+        }
+
+#if AVERAGE_OPS
+        res *= CVT_FP32_2ACCUM(1.f) / static_cast<_FLOAT_ACCUM>(pool_size);
+#else // MAX
+            if(save_index)
+            {
+                index_t res_index = 0;
+
+                /// Preventing overflow during computation of res_index:
+                /// If Index is shorter than uint, then let's perform computation in 32-bit
+                /// domain and then convert to narrower Index. That would reduce the probability of
+                /// overflow. If Index is wider then 32 bits, then it seems like it is better to
+                /// convert to Index type before multiplication. However this is not actually
+                /// necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in
+                /// 32 bits and then convert.
+
+                if(found)
+                {
+                    if(index_mode == 1)
+                        res_index = (index_t)(d_save * lens[H_IDX] * lens[W_IDX] //
+                                              + h_save * lens[W_IDX]       //
+                                              + w_save);
+                    else
+                        res_index = (index_t)(                                                    //
+                            ((d_save - k * filter_strides[D_IDX] + filter_pads[D_IDX]) * filter_lens[W_IDX] * filter_lens[H_IDX]) //
+                            + ((h_save - j * filter_strides[H_IDX] + filter_pads[H_IDX]) * filter_lens[W_IDX])          //
+                            + (w_save - i * filter_strides[W_IDX] + filter_pads[W_IDX])                       //
+                        );
+                }
+
+                const size_t mask_index = b * mask_strides[N_IDX]             //
+                                          + o * mask_strides[C_IDX]           //
+                                          + (size_t)(k * mask_strides[D_IDX]) //
+                                          + (size_t)(j * mask_strides[H_IDX]) //
+                                          + (size_t)(i * mask_strides[W_IDX]);
+                mask_ptr[mask_index] = res_index;
+            }
+#endif
+        const size_t out_index = out_strides[N_IDX]             //
+                                 + o * out_strides[C_IDX]           //
+                                 + (size_t)(k * out_strides[D_IDX]) //
+                                 + (size_t)(j * out_strides[H_IDX]) //
+                                 + (size_t)(i * out_strides]W_IDX]);
+
+        out_data[out_index] = (_FLOAT)res;
+    }
+}
+}
+
+extern "C" __global__ void mloPoolingForwardNDNhwcNaive(const INPUT_TYPE* __restrict__ in_data,
+                                     OUTPUT_TYPE* out_data,
+                                     ARG_UNUSED_FOR_AVERAGE index_t* mask_ptr,
+                                     ARG_UNUSED_FOR_AVERAGE int save_index,
+                                     ARG_UNUSED_FOR_AVERAGE int index_mode,
+                                     std::vector<uint32_t> filter_lens,
+                                     std::vector<uint32_t> filter_strides,
+                                     std::vector<uint32_t> filter_pads,
+                                     uint32_t all_n,
+                                     uint32_t all_c,
+                                     std::vector<uint32_t> lens,
+                                     std::vector<size_t> strides,
+                                     std::vector<uint32_t> out_lens,
+                                     std::vector<size_t> out_strides,
+                                     ARG_UNUSED_FOR_AVERAGE std::vector<size_t> mask_strides)
+{
+    poolingFwdNDNhwcNaive<INPUT_TYPE, OUTPUT_TYPE>(
+        in_data,
+        out_data,
+        mask_ptr,
+        save_index,
+        index_mode,
+        filter_lens,
+        filter_strides,
+        filter_pads,
+        all_n,
+        all_c,
+        lens,
+        strides,
+        out_lens,
+        out_strides,
+        mask_strides
+    );
+}
diff --git a/src/ocl/pooling_ocl.cpp b/src/ocl/pooling_ocl.cpp
index 9881c1596f..9a2258908f 100644
--- a/src/ocl/pooling_ocl.cpp
+++ b/src/ocl/pooling_ocl.cpp
@@ -42,6 +42,9 @@ static auto PoolingForwardSolvers()
     return solver::SolverContainer<solver::pooling::PoolingForward2d,
                                    solver::pooling::PoolingForwardNd,
                                    solver::pooling::PoolingForwardNaive,
+                                   solver::pooling::PoolingForwardNdNhwcNaive,
+                                   solver::pooling::PoolingForwardCk2d,
+                                   solver::pooling::PoolingForwardCkNd,
                                    solver::pooling::TransposedPoolingFwd2d,
                                    solver::pooling::TransposedPoolingFwdNd>{};
 }
@@ -50,6 +53,8 @@ static auto PoolingBackwardSolvers()
 {
     return solver::SolverContainer<solver::pooling::PoolingBackward2d,
                                    solver::pooling::PoolingBackwardNd,
+                                   solver::pooling::PoolingBackwardCk2d,
+                                   solver::pooling::PoolingBackwardCkNd,
                                    solver::pooling::TransposedPoolingBwd2d,
                                    solver::pooling::TransposedPoolingBwdNd>{};
 }
diff --git a/src/solver.cpp b/src/solver.cpp
index e468d38d0a..0088a10cb1 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -561,12 +561,18 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
     Register(registry, ++id, Primitive::Pooling, pooling::PoolingForward2d{}.SolverDbId());
     Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNd{}.SolverDbId());
 
+    Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardCk2d{}.SolverDbId());
+    Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardCkNd{}.SolverDbId());
+
     Register(registry, ++id, Primitive::Pooling, pooling::TransposedPoolingFwd2d{}.SolverDbId());
     Register(registry, ++id, Primitive::Pooling, pooling::TransposedPoolingFwdNd{}.SolverDbId());
 
     Register(registry, ++id, Primitive::Pooling, pooling::PoolingBackward2d{}.SolverDbId());
     Register(registry, ++id, Primitive::Pooling, pooling::PoolingBackwardNd{}.SolverDbId());
 
+    Register(registry, ++id, Primitive::Pooling, pooling::PoolingBackwardCk2d{}.SolverDbId());
+    Register(registry, ++id, Primitive::Pooling, pooling::PoolingBackwardCkNd{}.SolverDbId());
+
     RegisterWithSolver(registry,
                        ++id,
                        conv::ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC{},
@@ -594,6 +600,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
              fusion::ConvCKIgemmFwdBiasActivFused{}.SolverDbId(),
              miopenConvolutionAlgoImplicitGEMM);
     Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNaive{}.SolverDbId());
+    Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNdNhwcNaive{}.SolverDbId());
     RegisterWithSolver(registry,
                        ++id,
                        conv::ConvHipImplicitGemmGroupFwdXdlops{},
diff --git a/src/solver/pooling/backwardCk2d.cpp b/src/solver/pooling/backwardCk2d.cpp
new file mode 100644
index 0000000000..d1ee157190
--- /dev/null
+++ b/src/solver/pooling/backwardCk2d.cpp
@@ -0,0 +1,308 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/pooling/solvers.hpp>
+
+#include <miopen/pooling/invoke_params.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/pooling.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/target_properties.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace pooling {
+
+namespace {
+
+struct kernel_params
+{
+    int kernel_size_h;
+    int kernel_size_w;
+    int kernel_stride_h;
+    int kernel_stride_w;
+    int out_pix_tile0;
+    int out_pix_tile1;
+    std::size_t batch_sz;
+    std::size_t n_inputs;
+    std::size_t in_height;
+    std::size_t in_width;
+    std::size_t grp_tile0;
+    std::size_t grp_tile1;
+
+    kernel_params(const miopen::pooling::ProblemDescription& problem)
+    {
+        const auto& pd = problem.GetPooling();
+
+        kernel_size_w   = pd.lens[1];
+        kernel_size_h   = pd.lens[0];
+        kernel_stride_w = pd.strides[1];
+        kernel_stride_h = pd.strides[0];
+
+        std::tie(batch_sz, n_inputs, in_height, in_width) =
+            miopen::tien<4>(problem.GetXDesc().GetLengths(), 1);
+
+        out_pix_tile0 = 1;
+        out_pix_tile1 = 1;
+        if(pd.GetMode() == miopenPoolingMax)
+        {
+            out_pix_tile0 = in_width > 8 && in_width <= 24 ? 4 : 1;
+            out_pix_tile1 = in_width <= 24 ? 1 : (in_width > 64 && in_width <= 96 ? 4 : 8);
+        }
+
+        grp_tile0 = 8;
+        grp_tile1 = 8;
+        if(pd.GetMode() == miopenPoolingMax)
+        {
+            grp_tile0 = in_width <= 8     ? 8  //
+                        : in_width <= 16  ? 4  //
+                        : in_width <= 24  ? 8  //
+                        : in_width <= 32  ? 32 //
+                        : in_width <= 64  ? 8  //
+                        : in_width <= 96  ? 16 //
+                        : in_width <= 128 ? 16
+                                          : 32;
+            grp_tile1 = in_width <= 8     ? 8  //
+                        : in_width <= 16  ? 16 //
+                        : in_width <= 24  ? 8  //
+                        : in_width <= 32  ? 4  //
+                        : in_width <= 64  ? 8  //
+                        : in_width <= 96  ? 4  //
+                        : in_width <= 128 ? 16
+                                          : 4;
+        }
+    }
+};
+
+std::size_t sizeof_kernel_FLOAT(const miopen::pooling::ProblemDescription& problem)
+{
+    const auto datatype = problem.GetXDesc().GetType();
+    return get_data_size(datatype);
+}
+
+std::size_t sizeof_kernel_index_t(const miopen::pooling::ProblemDescription& problem)
+{
+    return get_data_size(problem.GetPooling().GetIndexType());
+}
+
+inline std::size_t RoundUpToMultiple(std::size_t v, std::size_t m)
+{
+    assert(m > 0);
+    return ((v + m - 1) / m) * m;
+}
+
+// Compute amount of local memory required for holding the arrays defined
+// in the "mloPoolingAveBwd" and "mloPoolingMaxBwd" kernels.
+std::size_t sizeof_local_memory(const miopen::pooling::ProblemDescription& problem)
+{
+    const kernel_params kp(problem);
+
+    // aliases to ease programming
+    const auto& MLO_POOLING_KERNEL_SZ0      = kp.kernel_size_w;
+    const auto& MLO_POOLING_KERNEL_SZ1      = kp.kernel_size_h;
+    const auto& MLO_POOLBWD_N_HORIZ_OUT_PIX = kp.out_pix_tile0;
+    const auto& MLO_POOLBWD_N_VERT_OUT_PIX  = kp.out_pix_tile1;
+    const auto& MLO_POOLING_STRIDE0         = kp.kernel_stride_w;
+    const auto& MLO_POOLING_STRIDE1         = kp.kernel_stride_h;
+    const auto& MLO_POOLBWD_GROUP_SZ0       = kp.grp_tile0;
+    const auto& MLO_POOLBWD_GROUP_SZ1       = kp.grp_tile1;
+
+    const auto MLO_POOLBWD_LCL_DATA_WIDTH =
+        (static_cast<std::size_t>(MLO_POOLBWD_GROUP_SZ0) * MLO_POOLBWD_N_HORIZ_OUT_PIX +
+         MLO_POOLING_KERNEL_SZ0 + MLO_POOLING_STRIDE0 - 2) /
+        MLO_POOLING_STRIDE0;
+    const auto MLO_POOLBWD_LCL_DATA_HEIGHT =
+        (static_cast<std::size_t>(MLO_POOLBWD_GROUP_SZ1) * MLO_POOLBWD_N_VERT_OUT_PIX +
+         MLO_POOLING_KERNEL_SZ1 + MLO_POOLING_STRIDE1 - 2) /
+        MLO_POOLING_STRIDE1;
+
+    std::size_t rv   = 0;
+    const auto nelem = MLO_POOLBWD_LCL_DATA_WIDTH * MLO_POOLBWD_LCL_DATA_HEIGHT;
+    if(problem.GetPooling().GetMode() == miopenPoolingMax)
+    {
+        const auto sizeof_lcl_top_df = sizeof_kernel_FLOAT(problem) * nelem;
+        const auto sizeof_lcl_mask   = sizeof_kernel_index_t(problem) * nelem;
+        /// \anchor alignment_of_arrays_in_gpu_memory
+        /// The total amount of memory calculated here is slightly less than the amount calculated
+        /// by the compiler. As a result, the check here may pass, while then the compiler might
+        /// refuse to build the kernel. The most likely reason for the difference is padding (due to
+        /// alignment requirements). We don't know exactly how the compiler takes alignment into
+        /// account, but what can we do is applying an alignment that imposes a slightly tighter
+        /// constraints than the compiler. So far, 16-byte (4xDWORD) alignment works well.
+        rv = RoundUpToMultiple(sizeof_lcl_top_df, 16) + RoundUpToMultiple(sizeof_lcl_mask, 16);
+    }
+    else
+    {
+        const auto sizeof_lcl_top_diff = sizeof_kernel_FLOAT(problem) * nelem;
+        rv                             = RoundUpToMultiple(sizeof_lcl_top_diff, 16);
+    }
+    MIOPEN_LOG_T(rv);
+    return rv;
+}
+
+} // namespace
+
+bool PoolingBackwardCk2d::IsApplicable(const ExecutionContext&,
+                                     const miopen::pooling::ProblemDescription& problem) const
+{
+    return problem.GetDirection() == miopen::pooling::Direction::Backward &&
+           (problem.GetPooling().GetMode() == miopenPoolingMax ||
+            problem.GetPooling().GetMode() == miopenPoolingAverage ||
+            problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) &&
+           problem.GetXDesc().GetNumDims() == 4 && problem.GetXDesc().GetLayout("NCHW") == "NHWC" &&
+           problem.GetYDesc().GetLayout("NCHW") == "NHWC" &&
+           sizeof_local_memory(problem) <= TargetProperties::GetMaxLocalMemorySize();
+}
+
+ConvSolution
+PoolingBackwardCk2d::GetSolution(const ExecutionContext&,
+                               const miopen::pooling::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    const kernel_params kp(problem);
+
+    {
+        auto kernel = KernelInfo{};
+
+        kernel.kernel_file = "MIOpenPoolingBwd.cl";
+// TODO: backwardCk2d kernel
+        if(problem.GetPooling().GetMode() == miopenPoolingMax)
+        {
+            kernel.kernel_name = "mloPoolingMaxBwd";
+        }
+        else if(problem.GetPooling().GetMode() == miopenPoolingAverage ||
+                problem.GetPooling().GetMode() == miopenPoolingAverageInclusive)
+        {
+            kernel.kernel_name = "mloPoolingAveBwd";
+        }
+
+        const int pooling_method = (problem.GetPooling().GetMode() == miopenPoolingMax)
+                                       ? MLO_POOLING_OP_MAX
+                                       : ((problem.GetPooling().GetMode() == miopenPoolingAverage)
+                                              ? MLO_POOLING_OP_AVE
+                                              : MLO_POOLING_OP_AVE_INCLUSIVE);
+
+        const int g_wk_width  = ((kp.in_width + kp.grp_tile0 * kp.out_pix_tile0 - 1) /
+                                (kp.grp_tile0 * kp.out_pix_tile0));
+        const int g_wk_height = ((kp.in_height + kp.grp_tile1 * kp.out_pix_tile1 - 1) /
+                                 (kp.grp_tile1 * kp.out_pix_tile1));
+
+        const auto build_params =
+            KernelBuildParameters{
+                {"MLO_POOLING_OP_ID", pooling_method},
+                {"MLO_POOLING_KERNEL_SZ1", kp.kernel_size_h},
+                {"MLO_POOLING_STRIDE1", kp.kernel_stride_h},
+                {"MLO_POOLING_KERNEL_SZ0", kp.kernel_size_w},
+                {"MLO_POOLING_STRIDE0", kp.kernel_stride_w},
+                {"MLO_POOLBWD_N_HORIZ_OUT_PIX", kp.out_pix_tile0},
+                {"MLO_POOLBWD_N_VERT_OUT_PIX", kp.out_pix_tile1},
+                {"MLO_POOLBWD_GROUP_SZ0", kp.grp_tile0},
+                {"MLO_POOLBWD_GROUP_SZ1", kp.grp_tile1},
+                {"MLO_POOLING_INDEX_TYPE",
+                 get_pooling_index_type_name(problem.GetPooling().GetIndexType())},
+                {"MLO_POOLING_INDEX_MAX",
+                 get_pooling_index_type_max_name(problem.GetPooling().GetIndexType())},
+                {"USE_IMG_INDEX",
+                 problem.GetPooling().GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexImage
+                     ? 1
+                     : 0},
+            }
+            << GetDataTypeKBP(problem.GetXDesc().GetType());
+
+        kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{});
+
+        kernel.l_wk = {kp.grp_tile0, kp.grp_tile1, 1};
+        kernel.g_wk = {
+            g_wk_width * kp.grp_tile0, g_wk_height * kp.grp_tile1, kp.n_inputs * kp.batch_sz};
+
+        result.construction_params.push_back(kernel);
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::pooling::BwdInvokeParams>();
+
+            if(params.pooling.GetMode() == miopenPoolingMax)
+            {
+                kernel(params.dy,
+                       params.dx,
+                       params.workspace,
+                       static_cast<int>(params.pooling.pads[0]),
+                       static_cast<int>(params.pooling.pads[1]),
+                       static_cast<int>(params.dyDesc.GetLengths()[1]),
+                       static_cast<int>(params.dxDesc.GetLengths()[2]),
+                       static_cast<int>(params.dxDesc.GetLengths()[3]),
+                       static_cast<int>(params.dyDesc.GetLengths()[2]),
+                       static_cast<int>(params.dyDesc.GetLengths()[3]),
+                       static_cast<int>(params.dxDesc.GetStrides()[0]),
+                       static_cast<int>(params.dxDesc.GetStrides()[1]),
+                       static_cast<int>(params.dxDesc.GetStrides()[2]),
+                       static_cast<int>(params.dyDesc.GetStrides()[0]),
+                       static_cast<int>(params.dyDesc.GetStrides()[1]),
+                       static_cast<int>(params.dyDesc.GetStrides()[2]));
+            }
+            else
+            {
+                kernel(params.dy,
+                       params.dx,
+                       static_cast<int>(params.pooling.pads[0]),
+                       static_cast<int>(params.pooling.pads[1]),
+                       static_cast<int>(params.dyDesc.GetLengths()[1]),
+                       static_cast<int>(params.dxDesc.GetLengths()[2]),
+                       static_cast<int>(params.dxDesc.GetLengths()[3]),
+                       static_cast<int>(params.dyDesc.GetLengths()[2]),
+                       static_cast<int>(params.dyDesc.GetLengths()[3]),
+                       static_cast<int>(params.dxDesc.GetStrides()[0]),
+                       static_cast<int>(params.dxDesc.GetStrides()[1]),
+                       static_cast<int>(params.dxDesc.GetStrides()[2]),
+                       static_cast<int>(params.dyDesc.GetStrides()[0]),
+                       static_cast<int>(params.dyDesc.GetStrides()[1]),
+                       static_cast<int>(params.dyDesc.GetStrides()[2]));
+            }
+        };
+    };
+
+    return result;
+}
+
+std::size_t
+PoolingBackwardCk2d::GetWorkspaceSize(const ExecutionContext&,
+                                    const miopen::pooling::ProblemDescription& problem) const
+{
+    if(problem.GetPooling().GetMode() != miopenPoolingMax)
+        return 0;
+    return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType());
+}
+
+} // namespace pooling
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/pooling/backwardCkNd.cpp b/src/solver/pooling/backwardCkNd.cpp
new file mode 100644
index 0000000000..b8cdbb0286
--- /dev/null
+++ b/src/solver/pooling/backwardCkNd.cpp
@@ -0,0 +1,273 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/pooling/solvers.hpp>
+
+#include <miopen/pooling/invoke_params.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/pooling.hpp>
+#include <miopen/kernel_build_params.hpp>
+
+#define WORKAROUND_ISSUE_MIFIN_80 1 // https://github.com/ROCm/MIFin/issues/80
+
+namespace miopen {
+
+namespace solver {
+
+namespace pooling {
+
+bool PoolingBackwardCkNd::IsApplicable(const ExecutionContext&,
+                                     const miopen::pooling::ProblemDescription& problem) const
+{
+    return problem.GetDirection() == miopen::pooling::Direction::Backward          //
+           && problem.GetXDesc().GetType() == problem.GetYDesc().GetType()         //
+           && (problem.GetXDesc().GetType() == miopenFloat                         //
+               || problem.GetXDesc().GetType() == miopenHalf)                      //
+           && (problem.GetPooling().GetMode() == miopenPoolingMax                  //
+               || problem.GetPooling().GetMode() == miopenPoolingAverage           //
+               || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) //
+           && (                                                                    //
+                  (problem.GetXDesc().GetNumDims() == 5                            //
+                   && problem.GetXDesc().GetLayout("NCDHW") == "NDHWC"             //
+                   && problem.GetYDesc().GetLayout("NCDHW") == "NDHWC")            //
+                  ||                                                               //
+                  (problem.GetXDesc().GetNumDims() == 4                            //
+                   && problem.GetXDesc().GetLayout("NCHW") == "NHWC"               //
+                   && problem.GetYDesc().GetLayout("NCHW") == "NHWC")              //
+                  )                                                                //
+           /// \todo This solver does not support workspace index mask mode yet.
+           && !(problem.GetPooling().GetMode() == miopenPoolingMax //
+                && problem.GetPooling().GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexMask);
+}
+
+ConvSolution
+PoolingBackwardCkNd::GetSolution(const ExecutionContext&,
+                               const miopen::pooling::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto kernel        = KernelInfo{};
+    kernel.kernel_file = "MIOpenPoolingBwdND.cl";
+    kernel.kernel_name = "mloPoolingND";
+// TODO: backwardCkNd kernel
+    if(problem.GetPooling().GetMode() == miopenPoolingMax)
+    {
+        kernel.kernel_name += "MaxBwd";
+    }
+    else if(problem.GetPooling().GetMode() == miopenPoolingAverage ||
+            problem.GetPooling().GetMode() == miopenPoolingAverageInclusive)
+    {
+        kernel.kernel_name += "AveBwd";
+    }
+
+    const auto& bot = problem.GetXDesc();
+    const auto& top = problem.GetYDesc();
+
+    std::size_t batch_sz, n_inputs, in_height, in_width;
+    std::tie(batch_sz, n_inputs, in_height, in_width) = miopen::tien<4>(bot.GetLengths(), 1);
+
+    const int pooling_method = (problem.GetPooling().GetMode() == miopenPoolingMax)
+                                   ? MLO_POOLING_OP_MAX
+                                   : ((problem.GetPooling().GetMode() == miopenPoolingAverage)
+                                          ? MLO_POOLING_OP_AVE
+                                          : MLO_POOLING_OP_AVE_INCLUSIVE);
+
+    int pix_w_per_work = 1;
+    int pix_h_per_work = 4;
+    int pix_d_per_work = 2;
+
+    int batch = top.GetLengths()[0];
+    int chal  = top.GetLengths()[1];
+
+    const bool is2d = (bot.GetNumDims() == 4);
+
+    int bot_d = is2d ? 1 : *(bot.GetLengths().rbegin() + 2);
+    int bot_h = *(bot.GetLengths().rbegin() + 1);
+    int bot_w = *(bot.GetLengths().rbegin());
+
+    int pix_blk_w = std::max((bot_w + pix_w_per_work - 1) / pix_w_per_work, 1);
+    int pix_blk_h = std::max((bot_h + pix_h_per_work - 1) / pix_h_per_work, 1);
+    int pix_blk_d = std::max((bot_d + pix_d_per_work - 1) / pix_d_per_work, 1);
+
+    int max_activ_workitem = 65536;
+    int total_work         = batch * chal * pix_blk_w * pix_blk_h * pix_blk_d;
+    int activ_work         = std::min(total_work, max_activ_workitem);
+
+#if WORKAROUND_ISSUE_MIFIN_80
+    const std::size_t wavesize = 64;
+#else
+    const std::size_t wavesize = context.GetStream().GetWavefrontWidth();
+#endif
+    size_t grp_num = (activ_work + wavesize - 1) / wavesize;
+
+    auto strides = problem.GetPooling().strides;
+    auto lens    = problem.GetPooling().lens;
+    auto pads    = problem.GetPooling().pads;
+
+    if(is2d)
+    {
+        strides.push_back(strides[1]);
+        strides[1] = strides[0];
+        lens.push_back(lens[1]);
+        lens[1] = lens[0];
+        lens[0] = 1;
+        pads.push_back(pads[1]);
+        pads[1] = pads[0];
+        pads[0] = 0;
+    }
+
+    bool territory_overlap = false;
+    for(std::size_t i = 0; i < strides.size(); i++)
+        territory_overlap |= (strides[i] < lens[i]);
+
+    const auto build_params =
+        KernelBuildParameters{
+            {"MLO_POOLING_OP_ID", pooling_method},
+            {"MAX_ACTIV_WORKITEM", max_activ_workitem},
+            {"MLO_POOLING_GROUP_SZ0", wavesize},
+            {"MLO_POOLING_GROUP_SZ1", 1},
+            {"MLO_POOLING_GROUP_SZ2", 1},
+            {"PIX_W_PER_WORK", pix_w_per_work},
+            {"PIX_H_PER_WORK", pix_h_per_work},
+            {"PIX_D_PER_WORK", pix_d_per_work},
+            {"KERNEL_SZ_D", lens[0]},
+            {"KERNEL_SZ_H", lens[1]},
+            {"KERNEL_SZ_W", lens[2]},
+            {"STRIDE_D", strides[0]},
+            {"STRIDE_H", strides[1]},
+            {"STRIDE_W", strides[2]},
+            {"TERRITORY_OVERLAP", static_cast<int>(territory_overlap)},
+            {"MLO_POOLING_INDEX_TYPE",
+             get_pooling_index_type_name(problem.GetPooling().GetIndexType())},
+            {"MLO_POOLING_INDEX_MAX",
+             get_pooling_index_type_max_name(problem.GetPooling().GetIndexType())},
+        }
+        << GetDataTypeKBP(problem.GetDYDesc().GetType());
+
+    kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{});
+
+    kernel.l_wk = {wavesize, 1, 1};
+    kernel.g_wk = {wavesize * grp_num, 1, 1};
+
+    result.construction_params.push_back(kernel);
+
+    const auto top_d = is2d ? 1 : *(top.GetLengths().rbegin() + 2);
+    const auto top_h = *(top.GetLengths().rbegin() + 1);
+    const auto top_w = *(top.GetLengths().rbegin());
+
+    auto unpackStrides = [is2d](const auto& strides) {
+        return std::make_tuple(strides[0], // N stride
+                               strides[1], // C stride
+                               strides[2], // D stride. Same as H_stride in 3D converted from 2D.
+                               is2d        //
+                                   ? strides[2] // 2D H stride
+                                   : strides[3] // 3D H stride
+        );
+    };
+
+    std::size_t bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride;
+    std::size_t top_n_stride, top_c_stride, top_d_stride, top_h_stride;
+    std::tie(bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride) =
+        unpackStrides(bot.GetStrides());
+    std::tie(top_n_stride, top_c_stride, top_d_stride, top_h_stride) =
+        unpackStrides(top.GetStrides());
+
+    result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::pooling::BwdInvokeParams>();
+
+            if(params.pooling.GetMode() == miopenPoolingMax)
+            {
+                kernel(params.dy,
+                       params.dx,
+                       params.workspace,
+                       static_cast<unsigned>(pads[0]),
+                       static_cast<unsigned>(pads[1]),
+                       static_cast<unsigned>(pads[2]),
+                       static_cast<unsigned>(batch),
+                       static_cast<unsigned>(chal),
+                       static_cast<unsigned>(bot_d),
+                       static_cast<unsigned>(bot_h),
+                       static_cast<unsigned>(bot_w),
+                       static_cast<unsigned>(top_d),
+                       static_cast<unsigned>(top_h),
+                       static_cast<unsigned>(top_w),
+                       static_cast<unsigned>(bot_n_stride),
+                       static_cast<unsigned>(bot_c_stride),
+                       static_cast<unsigned>(bot_d_stride),
+                       static_cast<unsigned>(bot_h_stride),
+                       static_cast<unsigned>(top_n_stride),
+                       static_cast<unsigned>(top_c_stride),
+                       static_cast<unsigned>(top_d_stride),
+                       static_cast<unsigned>(top_h_stride),
+                       static_cast<unsigned>(total_work));
+            }
+            else
+            {
+                kernel(params.dy,
+                       params.dx,
+                       static_cast<unsigned>(pads[0]),
+                       static_cast<unsigned>(pads[1]),
+                       static_cast<unsigned>(pads[2]),
+                       static_cast<unsigned>(batch),
+                       static_cast<unsigned>(chal),
+                       static_cast<unsigned>(bot_d),
+                       static_cast<unsigned>(bot_h),
+                       static_cast<unsigned>(bot_w),
+                       static_cast<unsigned>(top_d),
+                       static_cast<unsigned>(top_h),
+                       static_cast<unsigned>(top_w),
+                       static_cast<unsigned>(bot_n_stride),
+                       static_cast<unsigned>(bot_c_stride),
+                       static_cast<unsigned>(bot_d_stride),
+                       static_cast<unsigned>(bot_h_stride),
+                       static_cast<unsigned>(top_n_stride),
+                       static_cast<unsigned>(top_c_stride),
+                       static_cast<unsigned>(top_d_stride),
+                       static_cast<unsigned>(top_h_stride),
+                       static_cast<unsigned>(total_work));
+            }
+        };
+    };
+
+    return result;
+}
+
+std::size_t
+PoolingBackwardCkNd::GetWorkspaceSize(const ExecutionContext&,
+                                    const miopen::pooling::ProblemDescription& problem) const
+{
+    if(problem.GetPooling().GetMode() != miopenPoolingMax)
+        return 0;
+    return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType());
+}
+
+} // namespace pooling
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/pooling/forwardCk2d.cpp b/src/solver/pooling/forwardCk2d.cpp
new file mode 100644
index 0000000000..d66da12a33
--- /dev/null
+++ b/src/solver/pooling/forwardCk2d.cpp
@@ -0,0 +1,267 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/pooling/solvers.hpp>
+
+#include <miopen/pooling/invoke_params.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/pooling.hpp>
+#include <miopen/kernel_build_params.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace pooling {
+
+namespace {
+
+struct kernel_params
+{
+    int kernel_size_h;
+    int kernel_size_w;
+    int kernel_stride_h;
+    int kernel_stride_w;
+    int out_height;
+    int out_width;
+    int out_pix_tile0;
+    int out_pix_tile1;
+
+    kernel_params(const miopen::pooling::ProblemDescription& p)
+    {
+        const auto& pd  = p.GetPooling();
+        const auto& yd  = p.GetYDesc();
+        kernel_size_h   = pd.lens[0];
+        kernel_size_w   = pd.lens[1];
+        kernel_stride_h = pd.strides[0];
+        kernel_stride_w = pd.strides[1];
+        out_height      = yd.GetLengths()[2];
+        out_width       = yd.GetLengths()[3];
+        out_pix_tile0   = 1;
+        out_pix_tile1   = out_height <= 8    ? 1 //
+                          : out_height <= 32 ? 4 //
+                                             : 8;
+        if(out_height > 16 && out_height % 32 > 16)
+            out_pix_tile1 = std::min(16, std::max(1, prePow2(out_pix_tile1 * kernel_stride_h)));
+    }
+};
+
+std::size_t sizeof_kernel_FLOAT(const miopen::pooling::ProblemDescription& problem)
+{
+    const auto datatype = problem.GetXDesc().GetType();
+    return get_data_size(datatype);
+}
+
+std::size_t sizeof_kernel_FLOAT_ACCUM(const miopen::pooling::ProblemDescription& problem)
+{
+    const auto datatype = problem.GetXDesc().GetType();
+    if(datatype == miopenHalf)
+        return get_data_size(miopenFloat); // mixed precision
+    return get_data_size(datatype);
+}
+
+inline std::size_t RoundUpToMultiple(std::size_t v, std::size_t m)
+{
+    assert(m > 0);
+    return ((v + m - 1) / m) * m;
+}
+
+// Compute amount of private memory required for holding the arrays defined
+// in the "mloPoolingG" kernel:
+//
+// #define MLO_BOT_DATA_SZ0
+//     ((MLO_POOLING_N_HORIZ_OUT_PIX - 1) * MLO_POOLING_STRIDE0 + MLO_POOLING_KERNEL_SZ0)
+//
+// #define MLO_BOT_DATA_SZ1
+//    ((MLO_POOLING_N_VERT_OUT_PIX - 1) * MLO_POOLING_STRIDE1 + MLO_POOLING_KERNEL_SZ1)
+//
+// _FLOAT bot_data[MLO_BOT_DATA_SZ1][MLO_BOT_DATA_SZ0];
+// _FLOAT_ACCUM res[MLO_POOLING_N_VERT_OUT_PIX][MLO_POOLING_N_HORIZ_OUT_PIX];
+//
+std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& problem)
+{
+    const kernel_params kp(problem);
+
+    // aliases to ease programming
+    const auto& MLO_POOLING_KERNEL_SZ1      = kp.kernel_size_h;
+    const auto& MLO_POOLING_STRIDE1         = kp.kernel_stride_h;
+    const auto& MLO_POOLING_KERNEL_SZ0      = kp.kernel_size_w;
+    const auto& MLO_POOLING_STRIDE0         = kp.kernel_stride_w;
+    const auto& MLO_POOLING_N_HORIZ_OUT_PIX = kp.out_pix_tile0;
+    const auto& MLO_POOLING_N_VERT_OUT_PIX  = kp.out_pix_tile1;
+
+    const auto MLO_BOT_DATA_SZ0 =
+        (static_cast<std::size_t>(MLO_POOLING_N_HORIZ_OUT_PIX) - 1) * MLO_POOLING_STRIDE0 +
+        MLO_POOLING_KERNEL_SZ0;
+    const auto MLO_BOT_DATA_SZ1 =
+        (static_cast<std::size_t>(MLO_POOLING_N_VERT_OUT_PIX) - 1) * MLO_POOLING_STRIDE1 +
+        MLO_POOLING_KERNEL_SZ1;
+
+    const auto sizeof_bot_data = sizeof_kernel_FLOAT(problem) * MLO_BOT_DATA_SZ1 * MLO_BOT_DATA_SZ0;
+    const auto sizeof_res      = sizeof_kernel_FLOAT_ACCUM(problem) * MLO_POOLING_N_VERT_OUT_PIX *
+                            MLO_POOLING_N_HORIZ_OUT_PIX;
+
+    MIOPEN_LOG_T("sizeof_bot_data " << sizeof_bot_data << "sizeof_res" << sizeof_res);
+
+    /// \ref alignment_of_arrays_in_gpu_memory
+    return RoundUpToMultiple(sizeof_bot_data, 16) + RoundUpToMultiple(sizeof_res, 16);
+}
+
+} // namespace
+
+bool PoolingForwardCk2d::IsApplicable(const ExecutionContext& context,
+                                    const miopen::pooling::ProblemDescription& problem) const
+{
+    return problem.GetDirection() == miopen::pooling::Direction::Forward &&
+           problem.GetXDesc().GetNumDims() == 4 &&
+           problem.GetXDesc().GetType() == problem.GetYDesc().GetType() &&
+           (problem.GetXDesc().GetType() == miopenFloat ||
+            problem.GetXDesc().GetType() == miopenHalf) &&
+           problem.GetXDesc().GetLayout("NCHW") == "NHWC" &&
+           problem.GetYDesc().GetLayout("NCHW") == "NHWC" &&
+           sizeof_private_memory(problem) <=
+               TargetProperties::GetMaxWaveScratchSize() / context.GetStream().GetWavefrontWidth();
+}
+
+ConvSolution PoolingForwardCk2d::GetSolution(const ExecutionContext&,
+                                           const miopen::pooling::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    {
+        auto kernel = KernelInfo{};
+
+        kernel.kernel_file = "MIOpenPooling.cl";
+        kernel.kernel_name = "mloPoolingG";
+// TODO: forwardCk2d kernel
+        const kernel_params kp(problem);
+
+        int batch_sz, n_outputs;
+        std::tie(batch_sz, n_outputs, std::ignore, std::ignore) =
+            miopen::tien<4>(problem.GetYDesc().GetLengths(), 1);
+
+        const auto& pool_d   = problem.GetPooling();
+        const auto wsp_index = pool_d.GetWorkspaceIndexMode();
+
+        int grp_tile0 = kp.out_width <= 8 ? 8 : (kp.out_width % 32 <= 16 ? 16 : 32);
+        int grp_tile1 = kp.out_height <= 8    ? 8
+                        : kp.out_height < 16  ? 16
+                        : kp.out_height <= 32 ? 32
+                        : kp.out_height <= 64 ? 64
+                                              : 128;
+        grp_tile1 /= kp.out_pix_tile1;
+        while(grp_tile0 * grp_tile1 > 256 && grp_tile0 > 1)
+            grp_tile0 >>= 1;
+
+        int pooling_method =
+            (pool_d.GetMode() == miopenPoolingMax)
+                ? MLO_POOLING_OP_MAX
+                : ((pool_d.GetMode() == miopenPoolingAverage) ? MLO_POOLING_OP_AVE
+                                                              : MLO_POOLING_OP_AVE_INCLUSIVE);
+
+        auto build_params = KernelBuildParameters{
+            {"MLO_POOLING_OP_ID", pooling_method},
+            {"MLO_POOLING_KERNEL_SZ1", kp.kernel_size_h},
+            {"MLO_POOLING_STRIDE1", kp.kernel_stride_h},
+            {"MLO_POOLING_KERNEL_SZ0", kp.kernel_size_w},
+            {"MLO_POOLING_STRIDE0", kp.kernel_stride_w},
+            {"MLO_POOLING_N_HORIZ_OUT_PIX", kp.out_pix_tile0},
+            {"MLO_POOLING_N_VERT_OUT_PIX", kp.out_pix_tile1},
+            {"MLO_POOLING_GROUP_SZ0", grp_tile0},
+            {"MLO_POOLING_GROUP_SZ1", grp_tile1},
+            {"MLO_POOLING_INDEX_TYPE", get_pooling_index_type_name(pool_d.GetIndexType())},
+            {"MLO_POOLING_INDEX_MAX", get_pooling_index_type_max_name(pool_d.GetIndexType())},
+        };
+
+        if(problem.SaveIndex())
+        {
+            build_params << KernelBuildParameters{
+                {"MLO_POOLING_SAVE_INDEX"},
+                {"USE_IMG_INDEX", (wsp_index == miopenPoolingWorkspaceIndexImage ? 1 : 0)},
+            };
+        }
+
+        build_params << GetDataTypeKBP(problem.GetXDesc().GetType());
+
+        kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{});
+
+        kernel.l_wk.push_back(grp_tile0);
+        kernel.l_wk.push_back(grp_tile1);
+        kernel.l_wk.push_back(1);
+
+        int g_wk_width =
+            ((kp.out_width + grp_tile0 * kp.out_pix_tile0 - 1) / (grp_tile0 * kp.out_pix_tile0));
+        int g_wk_height =
+            ((kp.out_height + grp_tile1 * kp.out_pix_tile1 - 1) / (grp_tile1 * kp.out_pix_tile1));
+
+        kernel.g_wk.push_back(static_cast<std::size_t>(g_wk_width) * grp_tile0);
+        kernel.g_wk.push_back(static_cast<std::size_t>(g_wk_height) * grp_tile1);
+        kernel.g_wk.push_back(static_cast<std::size_t>(n_outputs) * batch_sz);
+
+        result.construction_params.push_back(kernel);
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::pooling::FwdInvokeParams>();
+
+            kernel(params.x,
+                   params.y,
+                   params.workspace,
+                   static_cast<int>(params.pooling.pads[0]),
+                   static_cast<int>(params.pooling.pads[1]),
+                   static_cast<int>(params.xDesc.GetLengths()[1]),
+                   static_cast<int>(params.xDesc.GetLengths()[2]),
+                   static_cast<int>(params.xDesc.GetLengths()[3]),
+                   static_cast<int>(params.yDesc.GetLengths()[2]),
+                   static_cast<int>(params.yDesc.GetLengths()[3]),
+                   static_cast<int>(params.xDesc.GetStrides()[0]),
+                   static_cast<int>(params.xDesc.GetStrides()[1]),
+                   static_cast<int>(params.xDesc.GetStrides()[2]),
+                   static_cast<int>(params.yDesc.GetStrides()[0]),
+                   static_cast<int>(params.yDesc.GetStrides()[1]),
+                   static_cast<int>(params.yDesc.GetStrides()[2]));
+        };
+    };
+
+    return result;
+}
+
+std::size_t
+PoolingForwardCk2d::GetWorkspaceSize(const ExecutionContext&,
+                                   const miopen::pooling::ProblemDescription& problem) const
+{
+    if(problem.GetPooling().GetMode() != miopenPoolingMax)
+        return 0;
+    return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType());
+}
+
+} // namespace pooling
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/pooling/forwardCkNd.cpp b/src/solver/pooling/forwardCkNd.cpp
new file mode 100644
index 0000000000..a4f2f781c3
--- /dev/null
+++ b/src/solver/pooling/forwardCkNd.cpp
@@ -0,0 +1,264 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/pooling/solvers.hpp>
+
+#include <miopen/pooling/invoke_params.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/pooling.hpp>
+#include <miopen/kernel_build_params.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace pooling {
+
+namespace {
+
+constexpr int top_w_per_work = 1;
+constexpr int top_h_per_work = 4;
+constexpr int top_d_per_work = 2;
+
+struct kernel_params
+{
+    uint32_t stride_d;
+    uint32_t stride_h;
+    uint32_t stride_w;
+    uint32_t kernel_sz_d;
+    uint32_t kernel_sz_h;
+    uint32_t kernel_sz_w;
+
+    kernel_params(const miopen::pooling::ProblemDescription& p)
+    {
+        const auto& pd = p.GetPooling();
+        stride_d       = pd.strides[0];
+        stride_h       = pd.strides[1];
+        stride_w       = pd.strides[2];
+        kernel_sz_d    = pd.lens[0];
+        kernel_sz_h    = pd.lens[1];
+        kernel_sz_w    = pd.lens[2];
+    }
+};
+
+std::size_t sizeof_kernel_FLOAT(const miopen::pooling::ProblemDescription& problem)
+{
+    const auto datatype = problem.GetXDesc().GetType();
+    return get_data_size(datatype);
+}
+
+inline std::size_t RoundUpToMultiple(std::size_t v, std::size_t m)
+{
+    assert(m > 0);
+    return ((v + m - 1) / m) * m;
+}
+
+// Compute amount of private memory required for holding the arrays defined
+// in the "mloPoolingNDFwd" kernel:
+//
+// #define BOT_TILE_W ((TOP_W_PER_WORK - 1) * STRIDE_W + KERNEL_SZ_W)
+// #define BOT_TILE_H ((TOP_H_PER_WORK - 1) * STRIDE_H + KERNEL_SZ_H)
+// #define BOT_TILE_D ((TOP_D_PER_WORK - 1) * STRIDE_D + KERNEL_SZ_D)
+//
+// _FLOAT bot_data[BOT_TILE_D][BOT_TILE_H][BOT_TILE_W];
+//
+std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& problem)
+{
+    const kernel_params kp(problem);
+
+    const std::size_t bot_tile_w = ((top_w_per_work - 1) * kp.stride_w + kp.kernel_sz_w);
+    const std::size_t bot_tile_h = ((top_h_per_work - 1) * kp.stride_h + kp.kernel_sz_h);
+    const std::size_t bot_tile_d = ((top_d_per_work - 1) * kp.stride_d + kp.kernel_sz_d);
+
+    const auto sizeof_bot_data =
+        sizeof_kernel_FLOAT(problem) * bot_tile_d * bot_tile_h * bot_tile_w;
+    MIOPEN_LOG_T("sizeof_bot_data " << sizeof_bot_data);
+
+    /// \ref alignment_of_arrays_in_gpu_memory
+    return RoundUpToMultiple(sizeof_bot_data, 16);
+}
+
+} // namespace
+
+bool PoolingForwardCkNd::IsApplicable(const ExecutionContext& context,
+                                    const miopen::pooling::ProblemDescription& problem) const
+{
+
+    return problem.GetDirection() == miopen::pooling::Direction::Forward                      //
+           && problem.GetXDesc().GetNumDims() == 5                                            //
+           && problem.GetXDesc().GetLayout("NCDHW") == "NDHWC"                                //
+           && problem.GetYDesc().GetLayout("NCDHW") == "NDHWC"                                //
+           && problem.GetXDesc().GetType() == problem.GetYDesc().GetType()                    //
+           && (problem.GetXDesc().GetType() == miopenFloat                                    //
+               || problem.GetXDesc().GetType() == miopenHalf)                                 //
+           && (problem.GetPooling().GetMode() == miopenPoolingMax                             //
+               || problem.GetPooling().GetMode() == miopenPoolingAverage                      //
+               || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive)            //
+           && sizeof_private_memory(problem) <= TargetProperties::GetMaxWaveScratchSize()     //
+                                                    / context.GetStream().GetWavefrontWidth() //
+           /// \todo This solver does not support workspace index mask mode yet.
+           &&
+           !(problem.GetPooling().GetMode() == miopenPoolingMax                                 //
+             && problem.GetPooling().GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexMask //
+             && problem.SaveIndex() == true);
+}
+
+ConvSolution PoolingForwardCkNd::GetSolution(const ExecutionContext&,
+                                           const miopen::pooling::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    const int batch = problem.GetXDesc().GetLengths()[0];
+    const int chal  = problem.GetXDesc().GetLengths()[1];
+
+    const kernel_params kp(problem);
+
+    const int top_d = *(problem.GetYDesc().GetLengths().rbegin() + 2);
+    const int top_h = *(problem.GetYDesc().GetLengths().rbegin() + 1);
+    const int top_w = *(problem.GetYDesc().GetLengths().rbegin());
+
+    const int top_blk_w = std::max((top_w + top_w_per_work - 1) / top_w_per_work, 1);
+    const int top_blk_h = std::max((top_h + top_h_per_work - 1) / top_h_per_work, 1);
+    const int top_blk_d = std::max((top_d + top_d_per_work - 1) / top_d_per_work, 1);
+
+    const int max_activ_workitem = 65536;
+    const int total_work         = batch * chal * top_blk_w * top_blk_h * top_blk_d;
+    const int activ_work         = std::min(total_work, max_activ_workitem);
+
+    {
+        auto kernel = KernelInfo{};
+
+        kernel.kernel_file = "MIOpenPoolingND.cl";
+        kernel.kernel_name = "mloPoolingNDFwd";
+// TODO: forwardCkNd kernel
+        int pooling_method = (problem.GetPooling().mode == miopenPoolingMax)
+                                 ? MLO_POOLING_OP_MAX
+                                 : ((problem.GetPooling().mode == miopenPoolingAverage)
+                                        ? MLO_POOLING_OP_AVE
+                                        : MLO_POOLING_OP_AVE_INCLUSIVE);
+
+        const size_t lcl_work = 64;
+        const size_t grp_num  = (activ_work + lcl_work - 1) / lcl_work;
+
+        auto build_params = KernelBuildParameters{
+            {"MLO_POOLING_OP_ID", static_cast<long long>(pooling_method)},
+            {"MAX_ACTIV_WORKITEM", static_cast<unsigned>(max_activ_workitem)},
+            {"MLO_POOLING_GROUP_SZ0", static_cast<long long>(lcl_work)},
+            {"MLO_POOLING_GROUP_SZ1", 1},
+            {"MLO_POOLING_GROUP_SZ2", 1},
+            {"TOP_W_PER_WORK", top_w_per_work},
+            {"TOP_H_PER_WORK", top_h_per_work},
+            {"TOP_D_PER_WORK", top_d_per_work},
+            {"KERNEL_SZ_D", kp.kernel_sz_d},
+            {"KERNEL_SZ_H", kp.kernel_sz_h},
+            {"KERNEL_SZ_W", kp.kernel_sz_w},
+            {"STRIDE_D", kp.stride_d},
+            {"STRIDE_H", kp.stride_h},
+            {"STRIDE_W", kp.stride_w},
+            {"MLO_POOLING_INDEX_TYPE",
+             get_pooling_index_type_name(problem.GetPooling().GetIndexType())},
+            {"MLO_POOLING_INDEX_MAX",
+             get_pooling_index_type_max_name(problem.GetPooling().GetIndexType())},
+        };
+
+        if(problem.SaveIndex())
+        {
+            build_params << KernelBuildParameters{
+                {"MLO_POOLING_SAVE_INDEX"},
+            };
+        }
+
+        build_params << GetDataTypeKBP(problem.GetXDesc().GetType());
+
+        kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{});
+
+        kernel.l_wk = {lcl_work, 1, 1};
+        kernel.g_wk = {lcl_work * grp_num, 1, 1};
+
+        result.construction_params.push_back(kernel);
+    }
+
+    result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::pooling::FwdInvokeParams>();
+
+            const int batch_ = params.xDesc.GetLengths()[0];
+            const int chal_  = params.xDesc.GetLengths()[1];
+
+            const int top_d_ = *(params.yDesc.GetLengths().rbegin() + 2);
+            const int top_h_ = *(params.yDesc.GetLengths().rbegin() + 1);
+            const int top_w_ = *(params.yDesc.GetLengths().rbegin());
+
+            const int top_blk_w_ = std::max((top_w_ + top_w_per_work - 1) / top_w_per_work, 1);
+            const int top_blk_h_ = std::max((top_h_ + top_h_per_work - 1) / top_h_per_work, 1);
+            const int top_blk_d_ = std::max((top_d_ + top_d_per_work - 1) / top_d_per_work, 1);
+
+            const int total_work_ = batch_ * chal_ * top_blk_w_ * top_blk_h_ * top_blk_d_;
+
+            kernel(params.x,
+                   params.y,
+                   params.workspace,
+                   static_cast<unsigned>(params.pooling.pads[0]),
+                   static_cast<unsigned>(params.pooling.pads[1]),
+                   static_cast<unsigned>(params.pooling.pads[2]),
+                   static_cast<unsigned>(batch_),
+                   static_cast<unsigned>(chal_),
+                   static_cast<unsigned>(params.xDesc.GetLengths()[2]),
+                   static_cast<unsigned>(params.xDesc.GetLengths()[3]),
+                   static_cast<unsigned>(params.xDesc.GetLengths()[4]),
+                   static_cast<unsigned>(top_d_),
+                   static_cast<unsigned>(top_h_),
+                   static_cast<unsigned>(top_w_),
+                   static_cast<unsigned>(params.xDesc.GetStrides()[0]),
+                   static_cast<unsigned>(params.xDesc.GetStrides()[1]),
+                   static_cast<unsigned>(params.xDesc.GetStrides()[2]),
+                   static_cast<unsigned>(params.xDesc.GetStrides()[3]),
+                   static_cast<unsigned>(params.yDesc.GetStrides()[0]),
+                   static_cast<unsigned>(params.yDesc.GetStrides()[1]),
+                   static_cast<unsigned>(params.yDesc.GetStrides()[2]),
+                   static_cast<unsigned>(params.yDesc.GetStrides()[3]),
+                   static_cast<unsigned>(total_work_));
+        };
+    };
+
+    return result;
+}
+
+std::size_t
+PoolingForwardCkNd::GetWorkspaceSize(const ExecutionContext&,
+                                   const miopen::pooling::ProblemDescription& problem) const
+{
+    if(problem.GetPooling().GetMode() != miopenPoolingMax)
+        return 0;
+    return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType());
+}
+
+} // namespace pooling
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/pooling/forwardNaive.cpp b/src/solver/pooling/forwardNaive.cpp
index 43406cd508..86d24a03de 100644
--- a/src/solver/pooling/forwardNaive.cpp
+++ b/src/solver/pooling/forwardNaive.cpp
@@ -75,14 +75,15 @@ bool PoolingForwardNaive::IsApplicable(const ExecutionContext&,
            && (problem.GetPooling().GetMode() == miopenPoolingMax                  //
                || problem.GetPooling().GetMode() == miopenPoolingAverage           //
                || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) //
+           && (problem.GetXDesc().GetLayout("NCDHW") == problem.GetYDesc().GetLayout("NCDHW")) //
            && (                                                                    //
                   (problem.GetXDesc().GetNumDims() == 5                            //
-                   && problem.GetXDesc().GetLayout("NCDHW") == "NCDHW"             //
-                   && problem.GetYDesc().GetLayout("NCDHW") == "NCDHW")            //
+                   && (problem.GetXDesc().GetLayout("NCDHW") == "NCDHW"            //
+                   || problem.GetXDesc().GetLayout("NCDHW") == "NDHWC"))           //
                   ||                                                               //
                   (problem.GetXDesc().GetNumDims() == 4                            //
-                   && problem.GetXDesc().GetLayout("NCHW") == "NCHW"               //
-                   && problem.GetYDesc().GetLayout("NCHW") == "NCHW")              //
+                   && (problem.GetXDesc().GetLayout("NCHW") == "NCHW"              //
+                   || problem.GetYDesc().GetLayout("NCHW") == "NHWC"))             //
               );
 }
 
@@ -95,6 +96,7 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context,
     const auto bot  = problem.GetXDesc();
     const auto top  = problem.GetYDesc();
     const bool is2d = (bot.GetNumDims() == 4);
+    const bool isTranspose = problem.GetXDesc().GetLayout("NCHW")[1] == 'C';
 
     // To compact code:
     const auto& pooling = problem.GetPooling();
@@ -208,8 +210,16 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context,
     {
         auto kernel = KernelInfo{};
 
-        kernel.kernel_file = "MIOpenPoolingForwardNaive.cl";
-        kernel.kernel_name = "mloPoolingForwardNaive";
+        if(isTranspose)
+        {
+            kernel.kernel_file = "MIOpenPoolingFwdNDNhwcNaive.cpp";
+            kernel.kernel_name = "mloPoolingForwardNDNhwcNaive";
+        }
+        else
+        {
+            kernel.kernel_file = "MIOpenPoolingForwardNaive.cl";
+            kernel.kernel_name = "mloPoolingForwardNaive";
+        }
 
         auto build_params = KernelBuildParameters{
             {"MLO_POOLING_OP_ID", pooling_method}, // We need this at compile time in order to
@@ -218,7 +228,10 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context,
             {"MLO_POOLING_IS2D_KERNEL", static_cast<int>(is2d_kernel)},
         };
         build_params << GetDataTypeKBP(bot.GetType());
-        kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{});
+        if(isTranspose)
+            kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+        else
+            kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{});
 
         // [Informative] The total number of kernels required to cover the whole
         // forward pooling problem space is 3*4*2*2 = 48. The solver is dynamic.
@@ -237,50 +250,80 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context,
         result.construction_params.push_back(kernel);
     }
 
-    result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
-        return [=](const Handle& handle, const AnyInvokeParams& raw_params) {
-            decltype(auto) kernel = handle.Run(kernels.front());
-            decltype(auto) params = raw_params.CastTo<miopen::pooling::FwdInvokeParams>();
-
-            kernel(params.x,
-                   params.y,
-                   params.workspace,
-                   save_index,
-                   index_mode,
-                   filter_d,
-                   filter_h,
-                   filter_w,
-                   filter_d_stride,
-                   filter_h_stride,
-                   filter_w_stride,
-                   filter_d_pad,
-                   filter_h_pad,
-                   filter_w_pad,
-                   all_n,
-                   all_c,
-                   bot_d,
-                   bot_h,
-                   bot_w,
-                   bot_n_stride,
-                   bot_c_stride,
-                   bot_d_stride,
-                   bot_h_stride,
-                   bot_w_stride,
-                   top_d,
-                   top_h,
-                   top_w,
-                   top_n_stride,
-                   top_c_stride,
-                   top_d_stride,
-                   top_h_stride,
-                   top_w_stride,
-                   mask_n_stride,
-                   mask_c_stride,
-                   mask_d_stride,
-                   mask_h_stride,
-                   mask_w_stride);
+    if(isTranspose)
+    {
+        result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
+            return [=](const Handle& handle, const AnyInvokeParams& raw_params) {
+                decltype(auto) kernel = handle.Run(kernels.front());
+                decltype(auto) params = raw_params.CastTo<miopen::pooling::FwdInvokeParams>();
+
+                // NOTE: kernel 'mloPoolingForwardNDNhwcNaive' expects indices in DHW[NC] order
+                kernel(params.x,
+                    params.y,
+                    params.workspace,
+                    save_index,
+                    index_mode,
+                    std::vector<uint32_t>{filter_d, filter_h, filter_w},
+                    std::vector<uint32_t>{filter_d_stride, filter_h_stride, filter_w_stride},
+                    std::vector<uint32_t>{filter_d_pad, filter_h_pad, filter_w_pad},
+                    all_n,
+                    all_c,
+                    std::vector<uint32_t>{bot_d, bot_h, bot_w},
+                    std::vector<size_t>{bot_d_stride, bot_h_stride, bot_w_stride, bot_n_stride, bot_c_stride},
+                    std::vector<uint32_t>{top_d, top_h, top_w},
+                    std::vector<size_t>{top_d_stride, top_h_stride, top_w_stride, top_n_stride, top_c_stride},
+                    std::vector<size_t>{mask_d_stride, mask_h_stride, mask_w_stride, mask_n_stride, mask_c_stride});
+            };
+        };
+    }
+    else
+    {
+        result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
+            return [=](const Handle& handle, const AnyInvokeParams& raw_params) {
+                decltype(auto) kernel = handle.Run(kernels.front());
+                decltype(auto) params = raw_params.CastTo<miopen::pooling::FwdInvokeParams>();
+
+                kernel(params.x,
+                    params.y,
+                    params.workspace,
+                    save_index,
+                    index_mode,
+                    filter_d,
+                    filter_h,
+                    filter_w,
+                    filter_d_stride,
+                    filter_h_stride,
+                    filter_w_stride,
+                    filter_d_pad,
+                    filter_h_pad,
+                    filter_w_pad,
+                    all_n,
+                    all_c,
+                    bot_h,
+                    bot_d,  // TODO RJS: broke it
+                    bot_w,
+                    bot_n_stride,
+                    bot_c_stride,
+                    bot_d_stride,
+                    bot_h_stride,
+                    bot_w_stride,
+                    top_d,
+                    top_h,
+                    top_w,
+                    top_n_stride,
+                    top_c_stride,
+                    top_d_stride,
+                    top_h_stride,
+                    top_w_stride,
+                    mask_n_stride,
+                    mask_c_stride,
+                    mask_d_stride,
+                    mask_h_stride,
+                    mask_w_stride);
+            };
         };
-    };
+    }
+
     return result;
 }
 
diff --git a/test/gtest/poolingFwdNdNaive.cpp b/test/gtest/poolingFwdNdNaive.cpp
new file mode 100644
index 0000000000..3e11396fff
--- /dev/null
+++ b/test/gtest/poolingFwdNdNaive.cpp
@@ -0,0 +1,217 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <gtest/gtest.h>
+#include <miopen/env.hpp>
+#include "get_handle.hpp"
+#include "test_env.hpp"
+
+#include "pooling2d.hpp"
+
+#include "tensor_holder.hpp"
+
+MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL)
+MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLAGS_ARGS)
+
+namespace env = miopen::env;
+
+namespace {
+template <typename T>
+struct tensor_data
+{
+    static std::vector<int> get_layout_lengths(int n, int c, std::vector<int>& dims)
+    {
+        auto ret = std::vector<int>{n, c};
+        ret.insert(ret.end(), dims.cbegin(), dims.cend());
+
+        return ret;
+    }
+
+    static std::vector<int>
+    get_strides(std::vector<int>& lens, int dims, miopenTensorLayout_t tensor_layout)
+    {
+        std::vector<int> strides;
+        std::string layout_default = miopen::tensor_layout_get_default(dims + 2);
+        std::string layout_string  = miopen::TensorDescriptor::GetLayoutStr(tensor_layout);
+
+        miopen::tensor_layout_to_strides(lens, layout_default, layout_string, strides);
+
+        constexpr int min_stride_multiplier = 1;
+        constexpr int max_stride_multiplier = 4;
+
+        auto c = prng::gen_A_to_B(min_stride_multiplier, max_stride_multiplier);
+        for(auto& v : strides)
+        {
+            // cppcheck-suppress useStlAlgorithm
+            v = v * c;
+        }
+
+        return strides;
+    }
+
+    static miopenTensorDescriptor_t init_tensor_descriptor(miopenDataType_t type,
+                                                           const std::vector<int>& lens,
+                                                           const std::vector<int>& strides)
+    {
+        miopenTensorDescriptor_t desc;
+
+        EXPECT_TRUE(miopenCreateTensorDescriptor(&desc) == miopenStatusSuccess);
+        EXPECT_TRUE(
+            miopenSetTensorDescriptor(desc, type, lens.size(), lens.data(), strides.data()) ==
+            miopenStatusSuccess);
+
+        return desc;
+    }
+
+    layout_data(int _n, std::vector<int> _dims, int _c, miopenTensorLayout_t _tensor_layout)
+    {
+        auto lens    = get_layout_lengths(_n, _c, _dims);
+        auto strides = get_strides(lens, _dims.size(), _tensor_layout);
+        descriptor   = miopen::TensorDescriptor{miopen_type<T>{}, lens, strides};
+        host         = tensor<T>{lens, strides}.generate(gen_value<T>);
+    }
+
+    ~layout_data() {}
+
+    void read_gpu_data(miopen::Handle& handle, const miopen::Allocator::ManageDataPtr& ddata)
+    {
+        check      = tensor<T>{descriptor.GetLengths(), descriptor.GetStrides()};
+        check.data = handle.Read<T>(ddata, check.data.size());
+    }
+
+    tensor<T> check{};
+    tensor<T> host;
+    miopen::TensorDescriptor descriptor;
+};
+
+}
+
+namespace pooling_tests {
+
+class PoolingFwdNdNaive : public testing::TestWithParam<std::vector<std::string>>
+{
+};
+
+static bool SkipTest(void) { return env::disabled(MIOPEN_TEST_ALL); }
+
+// void GetArgs(const std::string& param, std::vector<std::string>& tokens)
+// {
+//     std::stringstream ss(param);
+//     std::istream_iterator<std::string> begin(ss);
+//     std::istream_iterator<std::string> end;
+//     while(begin != end)
+//         tokens.push_back(*begin++);
+// }
+
+void Run2dDriver(miopenDataType_t prec)
+{
+
+    std::vector<std::string> params;
+    switch(prec)
+    {
+    case miopenFloat: params = Pooling2dFloat::GetParam(); break;
+    case miopenHalf: params = WidePooling2dHalf::GetParam(); break;
+    case miopenBFloat16:
+    case miopenInt8:
+    case miopenFloat8:
+    case miopenBFloat8:
+    case miopenInt32:
+    case miopenInt64:
+    case miopenDouble:
+        FAIL()
+            << "miopenBFloat16, miopenInt8, miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8 "
+               "data type not supported by "
+               "immed_conv2d_codecov test";
+
+    default: params = Pooling2dFloat::GetParam();
+    }
+
+    for(const auto& test_value : params)
+    {
+        std::vector<std::string> tokens;
+        GetArgs(test_value, tokens);
+        std::vector<const char*> ptrs;
+
+        std::transform(tokens.begin(), tokens.end(), std::back_inserter(ptrs), [](const auto& str) {
+            return str.data();
+        });
+
+        testing::internal::CaptureStderr();
+        test_drive<pooling2d_driver>(ptrs.size(), ptrs.data());
+        auto capture = testing::internal::GetCapturedStderr();
+        std::cout << capture;
+    }
+};
+
+bool IsTestSupportedForDevice(const miopen::Handle& handle) { return true; }
+
+std::vector<std::string> GetTestCases(const std::string precision)
+{
+    const auto& flag_arg = env::value(MIOPEN_TEST_FLAGS_ARGS);
+
+    const std::vector<std::string> test_cases = {
+        // clang-format off
+    {"test_pooling2d " + precision + " --all --dataset 2 --limit 0 "+flag_arg}
+        // clang-format on
+    };
+
+    return test_cases;
+}
+
+} // namespace pooling_tests
+using namespace pooling_tests;
+
+/*
+TEST_P(Pooling2dFloat, FloatTest_pooling2d_wide)
+{
+    const auto& handle = get_handle();
+    if(IsTestSupportedForDevice(handle) && !SkipTest() && IsTestRunWith("--float"))
+    {
+        Run2dDriver(miopenFloat);
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+*/
+
+TEST_P(WidePooling2dHalf, HalfTest_pooling2d_wide)
+{
+    const auto& handle = get_handle();
+    if(IsTestSupportedForDevice(handle) && !SkipTest() && IsTestRunWith("--half"))
+    {
+        Run2dDriver(miopenHalf);
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+// INSTANTIATE_TEST_SUITE_P(Pooling2D, Pooling2dFloat, testing::Values(GetTestCases("--float")));
+
+INSTANTIATE_TEST_SUITE_P(Pooling2D, WidePooling2dHalf, testing::Values(GetTestCases("--half")));
diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp
index 128e81cce2..399592a1f4 100644
--- a/test/pooling2d.hpp
+++ b/test/pooling2d.hpp
@@ -29,12 +29,12 @@
 #define WORKAROUND_ISSUE_1670 1
 #define TEST_GET_INPUT_TENSOR 0
 
-template <class T>
-struct pooling2d_driver : pooling_driver<T>
+struct pooling2d_shapes
 {
-private:
+public:
     using U = typename std::vector<int>;
-    std::vector<U> get_2d_pooling_input_shapes()
+
+    static std::vector<U> get_2d_pooling_input_shapes()
     {
         return {{1, 19, 1024, 2048},
                 {10, 3, 32, 32},
@@ -57,13 +57,33 @@ struct pooling2d_driver : pooling_driver<T>
     }
 
     // Dataset 1 is intended for testing of asymmetric configs.
-    std::vector<U> get_2d_pooling_input_shapes_minimal() { return {{1, 4, 4, 4}}; }
+    static std::vector<U> get_2d_pooling_input_shapes_minimal() { return {{1, 4, 4, 4}}; }
 
     // Dataset 2 is intended for testing of configs with wide window.
-    std::vector<U> get_2d_pooling_input_shapes_wide()
+    static std::vector<U> get_2d_pooling_input_shapes_wide()
     {
         return {{1, 3, 255, 255}, {2, 3, 227, 227}, {1, 7, 127, 127}, {1, 1, 410, 400}};
     }
+};
+
+template <class T>
+struct pooling2d_driver : pooling_driver<T>
+{
+private:
+    using U = typename std::vector<int>;
+    std::vector<U> get_2d_pooling_input_shapes()
+    {
+        return pooling2d_shapes::get_2d_pooling_input_shapes();
+    }
+
+    // Dataset 1 is intended for testing of asymmetric configs.
+    std::vector<U> get_2d_pooling_input_shapes_minimal() { return pooling2d_shapes::get_2d_pooling_input_shapes_minimal(); }
+
+    // Dataset 2 is intended for testing of configs with wide window.
+    std::vector<U> get_2d_pooling_input_shapes_wide()
+    {
+        return pooling2d_shapes::get_2d_pooling_input_shapes_wide();
+    }
 
 public:
     pooling2d_driver() : pooling_driver<T>()
diff --git a/test/pooling3d.cpp b/test/pooling3d.cpp
index 90b37d5c75..966b3f5303 100644
--- a/test/pooling3d.cpp
+++ b/test/pooling3d.cpp
@@ -25,30 +25,6 @@
  *******************************************************************************/
 
 #include "pooling_common.hpp"
-
-template <class T>
-struct pooling3d_driver : pooling_driver<T>
-{
-    std::vector<std::vector<int>> get_3d_pooling_input_shapes()
-    {
-        return {{16, 64, 3, 4, 4},
-                {16, 32, 4, 9, 9},
-                {8, 512, 3, 14, 14},
-                {8, 512, 4, 28, 28},
-                {16, 64, 56, 56, 56},
-                {4, 3, 4, 227, 227},
-                {4, 4, 4, 161, 700}};
-    }
-
-    pooling3d_driver() : pooling_driver<T>()
-    {
-        this->add(
-            this->in_shape, "input", this->generate_data_limited(get_3d_pooling_input_shapes(), 4));
-        this->add(this->lens, "lens", this->generate_data({{2, 2, 2}, {3, 3, 3}}));
-        this->add(this->strides, "strides", this->generate_data({{2, 2, 2}, {1, 1, 1}}));
-        this->add(this->pads, "pads", this->generate_data({{0, 0, 0}, {1, 1, 1}}));
-        this->add(this->wsidx, "wsidx", this->generate_data({1}));
-    }
-};
+#include "pooling3d.hpp"
 
 int main(int argc, const char* argv[]) { test_drive<pooling3d_driver>(argc, argv); }
diff --git a/test/pooling3d.hpp b/test/pooling3d.hpp
new file mode 100644
index 0000000000..c6d80e8e98
--- /dev/null
+++ b/test/pooling3d.hpp
@@ -0,0 +1,62 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2019 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "pooling_common.hpp"
+
+struct pooling3d_shapes
+{
+public:
+    using U = typename std::vector<int>;
+
+    static std::vector<U> get_3d_pooling_input_shapes()
+    {
+        return {{16, 64, 3, 4, 4},
+                {16, 32, 4, 9, 9},
+                {8, 512, 3, 14, 14},
+                {8, 512, 4, 28, 28},
+                {16, 64, 56, 56, 56},
+                {4, 3, 4, 227, 227},
+                {4, 4, 4, 161, 700}};
+    }
+};
+
+template <class T>
+struct pooling3d_driver : pooling_driver<T>
+{
+    std::vector<std::vector<int>> get_3d_pooling_input_shapes()
+    {
+        return pooling3d_shapes::get_3d_pooling_input_shapes();
+    }
+    pooling3d_driver() : pooling_driver<T>()
+    {
+        this->add(
+            this->in_shape, "input", this->generate_data_limited(get_3d_pooling_input_shapes(), 4));
+        this->add(this->lens, "lens", this->generate_data({{2, 2, 2}, {3, 3, 3}}));
+        this->add(this->strides, "strides", this->generate_data({{2, 2, 2}, {1, 1, 1}}));
+        this->add(this->pads, "pads", this->generate_data({{0, 0, 0}, {1, 1, 1}}));
+        this->add(this->wsidx, "wsidx", this->generate_data({1}));
+    }
+};

From 1f1ed93cf1c5f5fdd652022287e33b454b8e4dda Mon Sep 17 00:00:00 2001
From: "Randy J. Spaulding" <rspauldi@amd.com>
Date: Tue, 20 Aug 2024 19:25:58 +0000
Subject: [PATCH 02/10] checkpoint

---
 include/miopen/miopen.h                       |   2 +-
 src/CMakeLists.txt                            |  14 +-
 src/comgr.cpp                                 |   3 +
 src/hipoc/hipoc_program.cpp                   |   1 +
 .../miopen/conv/problem_description.hpp       |  52 +--
 src/include/miopen/pooling.hpp                |   3 +-
 src/include/miopen/pooling/invoke_params.hpp  |   1 +
 src/include/miopen/pooling/solvers.hpp        |   5 +-
 .../miopen/problem_description_layout.hpp     | 102 +++++
 src/include/miopen/tensor.hpp                 |  63 ++-
 .../MIOpenPoolingForwardNDNhwcNaive.cpp       | 394 ++++++++++++++++++
 src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp   | 245 -----------
 src/ocl/pooling_ocl.cpp                       |   6 +-
 src/pooling.cpp                               |   9 +-
 src/pooling/problem_description.cpp           |   8 +-
 src/solver.cpp                                |   2 +-
 src/solver/pooling/forward2d.cpp              |  39 +-
 src/solver/pooling/forwardCk2d.cpp            |   1 +
 src/solver/pooling/forwardNaive.cpp           | 204 +++++----
 src/solver/pooling/forwardNdNhwcNaive.cpp     | 345 +++++++++++++++
 src/tensor.cpp                                |  13 +
 test/CMakeLists.txt                           |   2 +
 test/gtest/ex1.cpp                            |  15 +
 test/gtest/layout_transpose.cpp               |  47 ++-
 test/gtest/poolingFwdNdNaive.cpp              | 152 ++++---
 test/pooling2d.hpp                            |   3 +-
 test/pooling3d.hpp                            |   2 +
 test/pooling_common.hpp                       | 290 ++++++++++---
 28 files changed, 1477 insertions(+), 546 deletions(-)
 create mode 100644 src/include/miopen/problem_description_layout.hpp
 create mode 100644 src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
 delete mode 100644 src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp
 create mode 100644 src/solver/pooling/forwardNdNhwcNaive.cpp
 create mode 100644 test/gtest/ex1.cpp

diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index 0f2c2a5cb0..c69c9957c8 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -396,7 +396,7 @@ typedef enum
     miopenTensorCHWNc4 = 5, /*!< CHWNc4 memory layout (Partially supported) */
     miopenTensorCHWNc8 = 6, /*!< CHWNc8 memory layout (Partially supported) */
     miopenTensorNCDHW  = 7, /*!< NCDHW memory layout (Fully supported) */
-    miopenTensorNDHWC  = 8, /*!< NCDHW memory layout (Fully supported) */
+    miopenTensorNDHWC  = 8, /*!< NDHWC memory layout (Fully supported) */
 } miopenTensorLayout_t;
 
 /*! @ingroup pooling
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 150e5cb76a..3b51f60f05 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -288,6 +288,7 @@ set( MIOpen_Source
     solver/mha/mha_solver_forward.cpp
     solver/pooling/forward2d.cpp
     solver/pooling/forwardNaive.cpp
+    solver/pooling/forwardNdNhwcNaive.cpp
     solver/pooling/forwardNd.cpp
     solver/pooling/forwardCk2d.cpp
     solver/pooling/forwardCkNd.cpp
@@ -451,6 +452,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/miopen_type_traits.hpp
         kernels/miopen_utility.hpp
         kernels/neuron.inc
+        kernels/pooling_functions.h
         kernels/rocm_version.inc
         kernels/stride_array.hpp
         kernels/utilities.inc
@@ -499,7 +501,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/MIOpenPoolingBwd.cl
         kernels/MIOpenPoolingBwdND.cl
         kernels/MIOpenPoolingForwardNaive.cl
-        kernels/MIOpenPoolingFwdNDNhwcNaive.cpp
+        kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
         kernels/MIOpenPoolingND.cl
         kernels/MIOpenConv1x1S.cl
         kernels/MIOpenConv1x1J1.cl
@@ -592,6 +594,16 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/xform_bidirect_winograd_out.s
         kernels/UniversalTranspose.cl)
 
+        # TEMPCODE RJS
+    set(MIOPEN_KERNELS
+        kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
+    )
+
+    set(MIOPEN_KERNEL_INCLUDES
+    kernels/bfloat16_dev.hpp
+    kernels/float_types.h
+    kernels/pooling_functions.h
+    )
     # Kernels in development lists.
     # Should be ALWAYS empty in develop branch (at the time of PR merge)
     # Intention: to speed up kernel development rebuild time
diff --git a/src/comgr.cpp b/src/comgr.cpp
index aa53b71bb5..0fec8fa8ea 100644
--- a/src/comgr.cpp
+++ b/src/comgr.cpp
@@ -887,6 +887,7 @@ class HiprtcProgram
         const auto log = GetLog(false);
         if(!log.empty())
             MIOPEN_LOG_I(log);
+        std::cout << "************** HipRTC compile log: '" << log << "'" << std::endl; // TEMPCODE RJS
     }
 
     void GetCode(std::vector<char>& bytes) const
@@ -988,7 +989,9 @@ void BuildHip(const std::string& name,
             opts.push_back("-std=c++17");
 
         HiprtcProgram prog(name, text);
+        try{    // TEMPCODE RJS
         prog.Compile(opts);
+        } catch(Error& ex) { std::cout << __FUNCTION__ << " : Exception calling prog.Compile!: " << ex.text << std::endl; throw(ex); }
         prog.GetCode(binary);
     }
     catch(Error& ex)
diff --git a/src/hipoc/hipoc_program.cpp b/src/hipoc/hipoc_program.cpp
index ec090455e8..fa151bd3ed 100644
--- a/src/hipoc/hipoc_program.cpp
+++ b/src/hipoc/hipoc_program.cpp
@@ -275,6 +275,7 @@ void HIPOCProgramImpl::BuildCodeObjectInMemory(const std::string& params,
 #endif
         if(filename.extension() == ".cpp")
         {
+            std::cout << "Compling HIP: '" << filename << "'" << std::endl; // TEMPCODE RJS
             hiprtc::BuildHip(filename.string(), src, params, target, binary);
         }
         else if(filename.extension() == ".s")
diff --git a/src/include/miopen/conv/problem_description.hpp b/src/include/miopen/conv/problem_description.hpp
index 9447d7a5ca..dcbcac27f3 100644
--- a/src/include/miopen/conv/problem_description.hpp
+++ b/src/include/miopen/conv/problem_description.hpp
@@ -32,6 +32,7 @@
 #include <miopen/scalar.hpp>
 
 #include <miopen/problem_description_base.hpp>
+#include <miopen/problem_description_layout.hpp>
 #include <miopen/tensor.hpp>
 #include <miopen/convolution.hpp>
 
@@ -136,7 +137,7 @@ namespace conv {
 MIOPEN_INTERNALS_EXPORT miopenAlphaBetaCase_t ClassifyAlphaBeta(const Scalar& alpha,
                                                                 const Scalar& beta);
 
-struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase
+struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionWeightsBase
 #if MIOPEN_ENABLE_SQLITE
     ,
                                                     SQLiteSerializable<ProblemDescription>
@@ -153,13 +154,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase
                        int bias_            = 0,
                        const Scalar& alpha_ = Scalar(1.0),
                        const Scalar& beta_  = Scalar(0.0))
-        : in(in_),
-          weights(weights_),
-          out(out_),
+        : ProblemDescriptionWeightsBase(in_, weights_, out_),
           conv(conv_),
-          in_layout(ComputeInLayout()),
-          weights_layout(ComputeWeightsLayout()),
-          out_layout(ComputeOutLayout()),
           direction(direction_),
           bias(bias_),
           alpha(alpha_),
@@ -443,49 +439,7 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase
     void SetupFloats(ExecutionContext& ctx) const;
 
 private:
-    std::string ComputeInLayout() const
-    {
-        if(GetSpatialDims() == 2)
-        {
-            return in.GetLayout(in.GetLayout_str());
-        }
-        else
-        {
-            return in.GetLayout("NCDHW");
-        }
-    }
-
-    std::string ComputeOutLayout() const
-    {
-        if(GetSpatialDims() == 2)
-        {
-            return out.GetLayout(out.GetLayout_str());
-        }
-        else
-        {
-            return out.GetLayout("NCDHW");
-        }
-    }
-
-    std::string ComputeWeightsLayout() const
-    {
-        if(GetSpatialDims() == 2)
-        {
-            return weights.GetLayout(weights.GetLayout_str());
-        }
-        else
-        {
-            return weights.GetLayout("NCDHW");
-        }
-    }
-
-    TensorDescriptor in;
-    TensorDescriptor weights;
-    TensorDescriptor out;
     ConvolutionDescriptor conv;
-    std::string in_layout;
-    std::string weights_layout;
-    std::string out_layout;
     Direction direction                   = Direction::Forward;
     int bias                              = 0;
     Scalar alpha                          = Scalar(1.0);
diff --git a/src/include/miopen/pooling.hpp b/src/include/miopen/pooling.hpp
index 0ab5ffa1c7..2670c3b09c 100644
--- a/src/include/miopen/pooling.hpp
+++ b/src/include/miopen/pooling.hpp
@@ -146,7 +146,8 @@ struct MIOPEN_EXPORT PoolingDescriptor : miopenPoolingDescriptor
                            Data_t y,
                            bool save_index,
                            Data_t workSpace,
-                           size_t workSpaceSize) const;
+                           size_t workSpaceSize,
+                           Data_t junk = nullptr) const;  // TEMPCODE RJS
 
     miopenStatus_t Backward(Handle& handle,
                             const void* alpha,
diff --git a/src/include/miopen/pooling/invoke_params.hpp b/src/include/miopen/pooling/invoke_params.hpp
index 2d55786c21..2140c0a1fb 100644
--- a/src/include/miopen/pooling/invoke_params.hpp
+++ b/src/include/miopen/pooling/invoke_params.hpp
@@ -45,6 +45,7 @@ struct FwdInvokeParams : public miopen::InvokeParams
     Data_t y                   = nullptr;
     Data_t workspace           = nullptr;
     std::size_t workspace_size = 0;
+    Data_t junk                   = nullptr;    // TEMPCODE RJS
 
     std::size_t GetWorkspaceSize() const { return workspace_size; }
     Data_t GetWorkspace() const { return workspace; }
diff --git a/src/include/miopen/pooling/solvers.hpp b/src/include/miopen/pooling/solvers.hpp
index ab86a52aae..d8836fcdc0 100644
--- a/src/include/miopen/pooling/solvers.hpp
+++ b/src/include/miopen/pooling/solvers.hpp
@@ -103,10 +103,9 @@ struct PoolingForwardNaive final : PoolingSolver
                                  const miopen::pooling::ProblemDescription& problem) const override;
 };
 
-struct PoolingForwardNdNhwcNaive final : PoolingSolver
+struct PoolingForwardNDNhwcNaive final : PoolingSolver
 {
-    const std::string& SolverDbId() const override { return GetSolverDbId<PoolingForwardNdNhwcNaive>(); }
-    bool IsDynamic() const override { return true; }
+    const std::string& SolverDbId() const override { return GetSolverDbId<PoolingForwardNDNhwcNaive>(); }
 
     bool IsApplicable(const ExecutionContext& context,
                       const miopen::pooling::ProblemDescription& problem) const override;
diff --git a/src/include/miopen/problem_description_layout.hpp b/src/include/miopen/problem_description_layout.hpp
new file mode 100644
index 0000000000..aa20c4058c
--- /dev/null
+++ b/src/include/miopen/problem_description_layout.hpp
@@ -0,0 +1,102 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <miopen/miopen.h>
+#include <miopen/names.hpp>
+#include <miopen/problem_description_base.hpp>
+#include <miopen/tensor.hpp>
+
+#include <string>
+
+namespace miopen {
+
+struct ProblemDescriptionLayoutBase : ProblemDescriptionBase
+{
+    ProblemDescriptionLayoutBase()                              = default;
+    ProblemDescriptionLayoutBase(const ProblemDescriptionLayoutBase&) = default;
+ProblemDescriptionLayoutBase(const TensorDescriptor& in_, // x for Forward, y for Backward*
+                            const TensorDescriptor& out_ // y for Forward, x for Backward*
+                       )
+    : ProblemDescriptionBase(),
+      in(in_),
+      out(out_),
+          in_layout(ComputeInLayout()),
+          out_layout(ComputeOutLayout())
+    {}
+    virtual ~ProblemDescriptionLayoutBase()                     = default;
+
+    ProblemDescriptionLayoutBase& operator=(const ProblemDescriptionLayoutBase&) = default;
+
+    [[nodiscard]] virtual NetworkConfig MakeNetworkConfig() const = 0;
+
+protected:
+    TensorDescriptor in;
+    TensorDescriptor out;
+    std::string in_layout;
+    std::string out_layout;
+
+    std::string ComputeInLayout() const
+    {
+        return in.GetLayout(in.GetLayout_str());
+    }
+
+    std::string ComputeOutLayout() const
+    {
+        return out.GetLayout(out.GetLayout_str());
+    }
+};
+
+struct ProblemDescriptionWeightsBase : ProblemDescriptionLayoutBase
+{
+    ProblemDescriptionWeightsBase()                              = default;
+    ProblemDescriptionWeightsBase(const ProblemDescriptionWeightsBase&) = default;
+    ProblemDescriptionWeightsBase(const TensorDescriptor& in_, // x for Forward, y for Backward*
+                       const TensorDescriptor& weights_,
+                       const TensorDescriptor& out_ // y for Forward, x for Backward*
+                       )
+        : ProblemDescriptionLayoutBase(in_, out_),
+          weights(weights_),
+          weights_layout(ComputeWeightsLayout())
+    {}
+    virtual ~ProblemDescriptionWeightsBase()                     = default;
+
+    ProblemDescriptionWeightsBase& operator=(const ProblemDescriptionWeightsBase&) = default;
+
+    [[nodiscard]] virtual NetworkConfig MakeNetworkConfig() const = 0;
+
+protected:
+    TensorDescriptor weights;
+    std::string weights_layout;
+
+    std::string ComputeWeightsLayout() const
+    {
+        return weights.GetLayout(weights.GetLayout_str());
+    }
+};
+
+} // namespace miopen
diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp
index f4d2b2dca7..30a197e503 100644
--- a/src/include/miopen/tensor.hpp
+++ b/src/include/miopen/tensor.hpp
@@ -40,6 +40,7 @@
 #include <algorithm>
 #include <cassert>
 #include <numeric>
+#include <strstream>
 #include <vector>
 #include <optional>
 
@@ -162,6 +163,10 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor
     TensorDescriptor(miopenDataType_t t,
                      const std::vector<int>& lens_in,
                      const std::vector<int>& strides_in);
+    TensorDescriptor(miopenDataType_t t,
+                     miopenTensorLayout_t layout_in,
+                     const std::vector<int>& lens_in,
+                     const std::vector<int>& strides_in);
     TensorDescriptor(miopenDataType_t t,
                      const std::initializer_list<std::size_t>& lens_in,
                      const std::initializer_list<std::size_t>& strides_in);
@@ -207,6 +212,7 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor
     miopenTensorLayout_t GetLayout_t() const;
     static std::string GetLayoutStr(miopenTensorLayout_t layout);
     std::string GetLayout_str() const;
+    bool IsDefaultLayout() const;
 
     std::size_t GetVectorLength() const;
     std::optional<miopenDataType_t> GetCastType() const;
@@ -259,8 +265,10 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor
         {
             if(labels.size() != strides.size())
             {
-                MIOPEN_THROW(
-                    "Invalid labels size. Layout labels size must be equavalent to stride size");
+                std::ostringstream oss;
+                oss << "Invalid labels size. labels='" << labels << "', strides size=" << strides.size()
+                    << ". Layout labels size must be equivalent to stride size";
+                MIOPEN_THROW(oss.str().c_str());
             }
 
             // Copy construct the result string from labels. This allocates the space at one go
@@ -276,7 +284,7 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor
             if(base_label.size() != strides.size())
             {
                 MIOPEN_THROW(
-                    "Invalid labels size. Layout labels size must be equavalent to stride size");
+                    "Invalid labels size. Layout labels size must be equivalent to stride size");
             }
             auto result = base_label;
             auto p      = find_permutation(lens, strides);
@@ -292,7 +300,29 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor
     friend void from_json(const nlohmann::json& j, TensorDescriptor& descriptor);
 
 protected:
-    static miopenTensorLayout_t GetDefaultLayout() { return miopenTensorNCHW; };
+    static miopenTensorLayout_t GetDefaultLayout(unsigned spatial_dims = 2)
+    {
+        switch (spatial_dims)
+        {
+            case 2: return miopenTensorNCHW;
+            case 3: return miopenTensorNCDHW;
+            default:
+                MIOPEN_THROW(miopenStatusBadParm, "Spatial dimension count must be 2 or 3.");
+        }
+    };
+
+    static bool IsDefaultLayout(miopenTensorLayout_t layout, unsigned spatial_dims = 2)
+    {
+        switch (spatial_dims)
+        {
+            case 2:
+            case 3:
+                return layout == GetDefaultLayout();
+            default:
+                MIOPEN_THROW(miopenStatusBadParm, "Spatial dimension count must be 2 or 3.");
+        }
+    }
+
 
 private:
     TensorDescriptor(miopenDataType_t t,
@@ -328,11 +358,34 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor
 
 template <class TElement>
 constexpr auto GetNCDHW(unsigned spatial_dims, const std::vector<TElement>& data)
+{
+    if(spatial_dims == 3)
+    {
+        if(data.size() == 5)        // NCDHW
+            return miopen::tien<5>(data, 1);
+        else if(data.size() == 3)   //   DHW
+            return std::make_tuple(static_cast<TElement>(1), static_cast<TElement>(1), data[0], data[1], data[2]);
+        else
+            MIOPEN_THROW("Invalid data length; must be 5 or 3 with 3 spatial dimensions");
+    }
+    else
+    {
+        if(data.size() == 4)        // NCHW
+            return std::make_tuple(data[0], data[1], static_cast<TElement>(1), data[2], data[3]);
+        else if(data.size() == 2)   //   HW
+            return std::make_tuple(static_cast<TElement>(1), static_cast<TElement>(1), static_cast<TElement>(1), data[0], data[1]);
+        else
+            MIOPEN_THROW("Invalid data length; must be 4 or 2 with 2 spatial dimensions");
+    }
+}
+
+template <class TElement>
+constexpr auto GetNDHWC(unsigned spatial_dims, const std::vector<TElement>& data)
 {
     if(spatial_dims == 3)
         return miopen::tien<5>(data, 1);
     else
-        return std::make_tuple(data[0], data[1], static_cast<TElement>(1), data[2], data[3]);
+        return std::make_tuple(data[0], static_cast<TElement>(1), data[1], data[2], data[3]);
 }
 
 } // namespace miopen
diff --git a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
new file mode 100644
index 0000000000..970104b3b1
--- /dev/null
+++ b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
@@ -0,0 +1,394 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+//#define TEMPCODE RJS
+#ifdef TEMPCODE
+#define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 0
+
+#define MLO_POOLING_OP_ID MLO_POOLING_OP_AVE
+
+#define MLO_POOLING_INDEX_TYPE int
+#define MLO_POOLING_IS2D_KERNEL 0
+#define INPUT_TYPE _FLOAT
+#define OUTPUT_TYPE _FLOAT
+// #define TI INPUT_TYPE
+// #define TO OUTPUT_TYPE
+#define CVT_FP32_2ACCUM(x) (x)
+#endif
+
+// #define _FLOAT float
+// #define _FLOAT_ACCUM _FLOAT
+
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+#ifdef TEMPCODE
+#include "float_types.h"
+#endif
+#include "pooling_functions.h"
+
+#if(MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE) || (MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE)
+#define AVERAGE_OPS 1
+#else
+#define AVERAGE_OPS 0
+#endif
+
+// Let's use extended-precision accumulator only in FP16 pooling and only for averaging.
+// For all other ops and datatypes, use native accumulator, i.e. treate FLOAT_ACCUM as FLOAT.
+#ifndef TEMPCODE
+#if !(AVERAGE_OPS && MIOPEN_USE_FP16)
+#define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 1
+// #else
+// #define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 0
+#endif
+
+#include "float_types.h"
+#endif // TEMPCODE
+
+#if AVERAGE_OPS
+#define ARG_UNUSED_FOR_AVERAGE __attribute__((__unused__))
+#else
+#define ARG_UNUSED_FOR_AVERAGE
+#endif
+
+#define doUU 0
+#if doUU
+#define UU  __attribute__((__unused__))
+#else
+#define UU
+#endif
+
+#define doUU1 1
+#if doUU1
+#define UU1  __attribute__((__unused__))
+#else
+#define UU1
+#endif
+
+// Out N, D, H are encoded into the block indices x, y, z
+// No 2D-only optimization.
+template <typename TI, typename TO>
+__device__ void poolingForwardNDNhwcNaive(UU1 const TI* __restrict__ bot_ptr,
+                                    TO* __restrict__ top_ptr,
+                                    UU1 TO* __restrict__ junk_ptr,  // TEMPCODE RJS
+                                    UU1 ARG_UNUSED_FOR_AVERAGE index_t* __restrict__ mask_ptr,
+                                    UU1 ARG_UNUSED_FOR_AVERAGE int save_index,
+                                    UU1 ARG_UNUSED_FOR_AVERAGE int index_mode,
+                                    UU uint32_t filter_d,
+                                    UU uint32_t filter_h,
+                                    UU uint32_t filter_w,
+                                    UU uint32_t filter_d_stride,
+                                    UU uint32_t filter_h_stride,
+                                    UU uint32_t filter_w_stride,
+                                    UU uint32_t filter_d_pad,
+                                    UU uint32_t filter_h_pad,
+                                    UU uint32_t filter_w_pad,
+                                    uint32_t all_n,
+                                    UU uint32_t all_c, // TEMPCODE RJS
+                                    UU uint32_t bot_d,
+                                    UU uint32_t bot_h,
+                                    UU uint32_t bot_w,
+                                    UU size_t bot_n_stride,
+                                    UU uint32_t bot_c_stride,
+                                    UU size_t bot_d_stride,
+                                    UU uint32_t bot_h_stride,
+                                    UU uint32_t bot_w_stride,
+                                    uint32_t top_d,
+                                    uint32_t top_h,
+                                    uint32_t top_w,
+                                    size_t top_n_stride,
+                                    uint32_t top_c_stride,
+                                    size_t top_d_stride,
+                                    uint32_t top_h_stride,
+                                    uint32_t top_w_stride,
+                                    UU ARG_UNUSED_FOR_AVERAGE size_t mask_n_stride,
+                                    UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_c_stride,
+                                    UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_d_stride,
+                                    UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_h_stride,
+                                    UU ARG_UNUSED_FOR_AVERAGE size_t mask_w_stride)
+{
+    const uint32_t nn = blockIdx.x / top_d;                          // N=slow index
+    if(!(nn < all_n))
+        return;
+
+    const uint32_t td = blockIdx.x % top_d;                          // top D=fast index
+    if(td >= top_d)
+        return;
+
+    const uint32_t th = blockIdx.y;  // top H
+    // const uint32_t j = (gridDim.y == 1) ? threadIdx.y : blockIdx.y;  // top H
+    if(th >= top_h)
+        return;
+
+    const uint32_t tw = blockIdx.z % top_w;  // top W=fast index
+    if(tw >= top_w)
+        return;
+
+    if(nn == 0 && td == 0 && th == 0 && tw == 0)
+    {
+        int idx = 0;
+        top_ptr[idx++] = gridDim.x;
+        top_ptr[idx++] = gridDim.y;
+        top_ptr[idx++] = gridDim.z;
+        top_ptr[idx++] = -9;
+
+        top_ptr[idx++] = blockDim.x;
+        top_ptr[idx++] = blockDim.y;
+        top_ptr[idx++] = blockDim.z;
+        top_ptr[idx++] = -8;
+
+        top_ptr[idx++] = filter_d;
+        top_ptr[idx++] = filter_h;
+        top_ptr[idx++] = filter_w;
+        top_ptr[idx++] = -7;
+
+        top_ptr[idx++] = filter_d_stride;
+        top_ptr[idx++] = filter_h_stride;
+        top_ptr[idx++] = filter_w_stride;
+        top_ptr[idx++] = -6;
+
+        top_ptr[idx++] = filter_d_pad;
+        top_ptr[idx++] = filter_h_pad;
+        top_ptr[idx++] = filter_w_pad;
+        top_ptr[idx++] = -5;
+
+        top_ptr[idx++] = all_n;
+        top_ptr[idx++] = all_c;
+        top_ptr[idx++] = bot_n_stride;
+        top_ptr[idx++] = bot_c_stride;
+
+        top_ptr[idx++] = top_n_stride;
+        top_ptr[idx++] = top_c_stride;
+        #if AVERAGE_OPS
+        top_ptr[idx++] = -4;
+        top_ptr[idx++] = -4;
+        #else
+        top_ptr[idx++] = mask_n_stride;
+        top_ptr[idx++] = mask_c_stride;
+        #endif
+
+        top_ptr[idx++] = bot_d;
+        top_ptr[idx++] = bot_h;
+        top_ptr[idx++] = bot_w;
+        top_ptr[idx++] = -3;
+
+        top_ptr[idx++] = bot_d_stride;
+        top_ptr[idx++] = bot_h_stride;
+        top_ptr[idx++] = bot_w_stride;
+        top_ptr[idx++] = -2;
+
+        top_ptr[idx++] = top_d;
+        top_ptr[idx++] = top_h;
+        top_ptr[idx++] = top_w;
+        top_ptr[idx++] = -1;
+    
+        top_ptr[idx++] = top_d_stride;
+        top_ptr[idx++] = top_h_stride;
+        top_ptr[idx++] = top_w_stride;
+        top_ptr[idx++] = -9;
+
+        #if AVERAGE_OPS
+        top_ptr[idx++] = -8;
+        top_ptr[idx++] = -8;
+        top_ptr[idx++] = -8;
+        #else
+        top_ptr[idx++] = mask_d_stride;
+        top_ptr[idx++] = mask_h_stride;
+        top_ptr[idx++] = mask_w_stride;
+        #endif
+        top_ptr[idx++] = -7;
+    }
+
+    uint32_t cc = 0;
+    // const auto c_base = (blockDim.x == all_c) ? 0 : (blockIdx.z / top_w) * blockDim.x;
+            size_t top_index = 64
+                    + nn * top_n_stride             // TEMPCODE RJS
+                    + cc * top_c_stride           //
+                    + (size_t)(td * top_d_stride) //
+                    + (size_t)(th * top_h_stride) //
+                    + (size_t)(tw * top_w_stride);
+
+        top_ptr[top_index] = top_index;
+
+//     const auto int_dstart   = static_cast<int64_t>(td * filter_d_stride) - static_cast<int64_t>(filter_d_pad);
+//     const auto dend           = static_cast<size_t>(min(int_dstart + static_cast<int64_t>(filter_d), static_cast<int64_t>(bot_d)));
+//     const auto dstart         = static_cast<size_t>(max(int_dstart, 0));
+
+//     const auto int_hstart   = static_cast<int>(th * filter_h_stride) - static_cast<int>(filter_h_pad);
+//     const auto hend             = static_cast<uint32_t>(min(int_hstart + static_cast<int>(filter_h), static_cast<int>(bot_h)));
+//     const auto hstart         = static_cast<uint32_t>(max(int_hstart, 0));
+
+//     const auto int_wstart        = static_cast<int>(tw * filter_w_stride) - static_cast<int>(filter_w_pad);
+//     const auto wend             = static_cast<uint32_t>(min(int_wstart + static_cast<int>(filter_w), static_cast<int>(bot_w)));
+//     const auto wstart           = static_cast<uint32_t>(max(int_wstart, 0));
+//     out_ptr[0] = 1.2;
+//     for(uint32_t cc = 0; cc < 1; ++cc)  // top C loop
+//     {
+//         if(cc >= all_c)   return;
+//         {
+//             size_t top_index = nn * top_n_stride             // TEMPCODE RJS
+//                     + cc * top_c_stride           //
+//                     + (size_t)(td * top_d_stride) //
+//                     + (size_t)(th * top_h_stride) //
+//                     + (size_t)(tw * top_w_stride);
+//         top_index = 1;
+
+//         junk_ptr[top_index] = (_FLOAT)1.1;
+//         if(j != 1)
+//         }
+// #if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE
+//         uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+//         pool_size       = (pool_size == 0) ? 1 : pool_size;
+// #elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE
+//         const uint32_t pool_size = filter_d * filter_h * filter_w;
+// #endif
+
+// #if AVERAGE_OPS
+//         _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0);
+// #else // MAX
+//         _FLOAT_ACCUM res     = (_FLOAT_ACCUM)(-MAX_VAL_ACCUM);
+//         bool found           = false; // May remain false if bot contains only NaNs/-INFs.
+//         uint32_t d_save          = 0;
+//         uint32_t h_save          = 0;
+//         uint32_t w_save          = 0;
+// #endif
+//         for(size_t bd = dstart; bd < dend; ++bd)
+//         {
+//             for(uint32_t bh = hstart; bh < hend; ++bh)
+//             {
+//                 for(uint32_t bw = wstart; bw < wend; ++bw)
+//                 {
+//                     const size_t bot_index = nn * bot_n_stride +           //
+//                                             cc * bot_c_stride +           //
+//                                             bd * bot_d_stride + //
+//                                             static_cast<size_t>(bh * bot_h_stride) + //
+//                                             static_cast<size_t>(bw * bot_w_stride);
+// #if AVERAGE_OPS
+//                     res += static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]);
+// #else // MAX
+//                     if(static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]) > res)
+//                     {
+//                         res = bot_ptr[bot_index];
+//                         if(save_index)
+//                         {
+//                             found  = true;
+//                             d_save = bd;
+//                             h_save = bh;
+//                             w_save = bw;
+//                         }
+//                     }
+// #endif
+//                 }
+//             }
+//         }
+
+// #if AVERAGE_OPS
+//         res *= CVT_FP32_2ACCUM(1.f) / static_cast<_FLOAT_ACCUM>(pool_size);
+// #else // MAX
+// res *= 1.0; // TEMPCODE RJS fix UNUSED
+//         if(save_index)
+//         {
+//             index_t res_index = 0;
+
+//             / Preventing overflow during computation of res_index:
+//             / If Index is shorter than uint, then let's perform computation in 32-bit
+//             / domain and then convert to narrower Index. That would reduce the probability of
+//             / overflow. If Index is wider then 32 bits, then it seems like it is better to
+//             / convert to Index type before multiplication. However this is not actually
+//             / necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in
+//             / 32 bits and then convert.
+
+//             if(found)
+//             {
+//                 if(index_mode == 1)
+//                     res_index = (index_t)(d_save * bot_h * bot_w //
+//                                             + h_save * bot_w       //
+//                                             + w_save);
+//                 else
+//                     res_index = (index_t)(                                                    //
+//                         ((d_save - td * filter_d_stride + filter_d_pad) * filter_h * filter_w) //
+//                         + ((h_save - th * filter_h_stride + filter_h_pad) * filter_w)          //
+//                         + (w_save - tw * filter_w_stride + filter_w_pad)                       //
+//                     );
+//             }
+
+//             const size_t mask_index = nn * mask_n_stride             //
+//                                         + cc * mask_c_stride           //
+//                                         + (size_t)(td * mask_d_stride) //
+//                                         + (size_t)(tw * mask_h_stride) //
+//                                         + (size_t)(th * mask_w_stride);
+//             mask_ptr[mask_index] = res_index;
+//         }
+// #endif
+//         size_t top_index = nn * top_n_stride             //
+//                                 + cc * top_c_stride           //
+//                                 + (size_t)(td * top_d_stride) //
+//                                 + (size_t)(th * top_h_stride) //
+//                                 + (size_t)(tw * top_w_stride);
+//         top_index = 1;
+//         junk_ptr[top_index] = (_FLOAT)res;    // TEMPCODE RJS
+//     }
+}
+
+extern "C" __global__ void mloPoolingForwardNDNhwcNaive(
+                                    const INPUT_TYPE* __restrict__ bot_ptr,
+                                    OUTPUT_TYPE* __restrict__ top_ptr,
+                                    OUTPUT_TYPE* __restrict__ junk_ptr,    // TEMPCODE RJS
+                                    index_t* __restrict__ mask_ptr,
+                                    int save_index,
+                                    int index_mode,
+                                    uint32_t filter_d, uint32_t filter_h, uint32_t filter_w,
+                                    uint32_t filter_d_stride, uint32_t filter_h_stride, uint32_t filter_w_stride,
+                                    uint32_t filter_d_pad, uint32_t filter_h_pad, uint32_t filter_w_pad,
+                                    uint32_t all_n,
+                                    uint32_t all_c,
+                                    uint32_t bot_d, uint32_t bot_h, uint32_t bot_w,
+                                    size_t bot_n_stride, size_t bot_c_stride, uint32_t bot_d_stride, uint32_t bot_h_stride, uint32_t bot_w_stride,
+                                    uint32_t top_d, uint32_t top_h, uint32_t top_w,
+                                    size_t top_n_stride, size_t top_c_stride, uint32_t top_d_stride, uint32_t top_h_stride, uint32_t top_w_stride,
+                                    size_t mask_n_stride, uint32_t mask_c_stride, uint32_t mask_d_stride, uint32_t mask_h_stride, size_t mask_w_stride)
+{
+    poolingForwardNDNhwcNaive<INPUT_TYPE, OUTPUT_TYPE>(
+        bot_ptr,
+        top_ptr,
+        junk_ptr,
+        mask_ptr,
+        save_index,
+        index_mode,
+        filter_d, filter_h, filter_w,
+        filter_d_stride, filter_h_stride, filter_w_stride,
+        filter_d_pad, filter_h_pad, filter_w_pad,
+        all_n,
+        all_c,
+        bot_d, bot_h, bot_w,
+        bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride, bot_w_stride,
+        top_d, top_h, top_w,
+        top_n_stride, top_c_stride, top_d_stride, top_h_stride, top_w_stride,
+        mask_n_stride, mask_c_stride, mask_d_stride, mask_h_stride, mask_w_stride
+    );
+}
diff --git a/src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp
deleted file mode 100644
index bc17af8922..0000000000
--- a/src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2023 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-
-#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
-#include <hip/hip_fp16.h>
-#include <hip/hip_runtime.h>
-#endif
-
-#include "pooling_functions.h"
-
-#include <algorithm>
-
-#if(MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE) || (MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE)
-#define AVERAGE_OPS 1
-#else
-#define AVERAGE_OPS 0
-#endif
-
-// Let's use extended-precision accumulator only in FP16 pooling and only for averaging.
-// For all other ops and datatypes, use native accumulator, i.e. treate FLOAT_ACCUM as FLOAT.
-#if !(AVERAGE_OPS && MIOPEN_USE_FP16)
-#define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 1
-#endif
-#include "float_types.h"
-
-#ifndef MLO_POOLING_IS2D_KERNEL
-#error "MLO_POOLING_IS2D_KERNEL must be defined"
-#endif
-
-#if AVERAGE_OPS
-#define ARG_UNUSED_FOR_AVERAGE __attribute__((__unused__))
-#else
-#define ARG_UNUSED_FOR_AVERAGE
-#endif
-
-#if MLO_POOLING_IS2D_KERNEL
-#define ARG_UNUSED_FOR_2D __attribute__((__unused__))
-#else
-#define ARG_UNUSED_FOR_2D
-#endif
-
-// Out N, D, H are encoded into the block indices x, y, z
-// Requires all lens, strides, pads to be in DHW[NC] order. The code is
-// cleaner and more performant this way.
-// No 2D-only optimization.
-template <typename TI, typename TO>
-__device__ void poolingFwdNDNhwcNaive(const TI* in_data,
-                                    TO* out_data,
-                                    ARG_UNUSED_FOR_AVERAGE index_t* mask_ptr,
-                                    ARG_UNUSED_FOR_AVERAGE int save_index,
-                                    ARG_UNUSED_FOR_AVERAGE int index_mode,
-                                    std::vector<uint32_t> filter_lens,
-                                    std::vector<uint32_t> filter_strides,
-                                    std::vector<uint32_t> filter_pads,
-                                    uint32_t all_n,
-                                    uint32_t all_c,
-                                    std::vector<uint32_t> lens,
-                                    std::vector<size_t> strides,
-                                    std::vector<uint32_t> out_lens,
-                                    std::vector<size_t> out_strides,
-                                    ARG_UNUSED_FOR_AVERAGE std::vector<size_t> mask_strides)
-{
-    constexpr uint32_t D_IDX = 0;
-    constexpr uint32_t H_IDX = 1;
-    constexpr uint32_t W_IDX = 2;
-    constexpr uint32_t N_IDX = 3;
-    constexpr uint32_t C_IDX = 4;
-
-    const uint32_t b = blockIdx.x;  // out N
-    if(!(b < all_n))
-        return;
-
-    const uint32_t k = blockIdx.y;  // out D
-    if(!(k < out_lens[D_IDX]))
-        return;
-
-    const uint32_t j = blockIdx.z;  // out H
-    if(!(j < out_lens[H_IDX]))
-        return;
-
-    for(uint32_t i = 0; i < out_lens[W_IDX]; ++i)  // out W
-    {
-        for(uint32_t o = 0; o < all_c ++o)  // out C
-        {
-            const auto int_dstart   = static_cast<int64_t>(k * filter_strides[D_IDX]) - static_cast<int64_t>(filter_pads[D_IDX]);
-            const auto int_hstart   = static_cast<int>(j * filter_strides[H_IDX]) - static_cast<int>(filter_pads[H_IDX]);
-            const auto int_wstart        = static_cast<int>(i * filter_strides[W_IDX]) - static_cast<int>(filter_pads[W_IDX]);
-            const auto dend           = static_cast<size_t>(min(int_dstart + static_cast<int64_t>(filter_lens[D_IDX]), static_cast<int64_t>(out_lens[D_IDX])));
-            const auto hend             = static_cast<uint32_t>(min(int_hstart + static_cast<int>(filter_lens[H_IDX]), static_cast<int>(out_lens[H_IDX])));
-            const auto wend             = static_cast<uint32_t>(min(int_wstart + static_cast<int>(filter_lens[W_IDX]), static_cast<int>(out_lens[W_IDX])));
-            const auto dstart         = static_cast<size_t>(max(int_dstart, 0));
-            const auto hstart         = static_cast<uint32_t>(max(int_hstart, 0));
-            const auto wstart           = static_cast<uint32_t>(max(int_wstart, 0));
-
-#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE
-        uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
-        pool_size       = (pool_size == 0) ? 1 : pool_size;
-#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE
-        const uint32_t pool_size = filter_lens[D_IDX] * filter_lens[H_IDX] * filter_lens[W_IDX];
-#endif
-
-#if AVERAGE_OPS
-        _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0);
-#else // MAX
-            _FLOAT_ACCUM res     = (_FLOAT_ACCUM)(-MAX_VAL_ACCUM);
-            bool found           = false; // May remain false if bot contains only NaNs/-INFs.
-            uint32_t d_save          = 0;
-            uint32_t h_save          = 0;
-            uint32_t w_save          = 0;
-#endif
-        for(size_t d = dstart; d < dend; ++d)
-        {
-            for(uint32_t h = hstart; h < hend; ++h)
-            {
-                for(uint32_t w = wstart; w < wend; ++w)
-                {
-                    const size_t in_index = b * strides[N_IDX] +           //
-                                             o * strides[C_IDX] +           //
-                                             d * strides[D_IDX] + //
-                                             static_cast<size_t>(h * strides[H_IDX]) + //
-                                             static_cast<size_t>(w * strides[W_IDX]);
-#if AVERAGE_OPS
-                    res += in_data[in_index];
-#else // MAX
-                        if(static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index] > res))
-                        {
-                            res = in_data[in_index];
-                            if(save_index)
-                            {
-                                found  = true;
-                                d_save = d;
-                                h_save = h;
-                                w_save = w;
-                            }
-                        }
-#endif
-                }
-            }
-        }
-
-#if AVERAGE_OPS
-        res *= CVT_FP32_2ACCUM(1.f) / static_cast<_FLOAT_ACCUM>(pool_size);
-#else // MAX
-            if(save_index)
-            {
-                index_t res_index = 0;
-
-                /// Preventing overflow during computation of res_index:
-                /// If Index is shorter than uint, then let's perform computation in 32-bit
-                /// domain and then convert to narrower Index. That would reduce the probability of
-                /// overflow. If Index is wider then 32 bits, then it seems like it is better to
-                /// convert to Index type before multiplication. However this is not actually
-                /// necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in
-                /// 32 bits and then convert.
-
-                if(found)
-                {
-                    if(index_mode == 1)
-                        res_index = (index_t)(d_save * lens[H_IDX] * lens[W_IDX] //
-                                              + h_save * lens[W_IDX]       //
-                                              + w_save);
-                    else
-                        res_index = (index_t)(                                                    //
-                            ((d_save - k * filter_strides[D_IDX] + filter_pads[D_IDX]) * filter_lens[W_IDX] * filter_lens[H_IDX]) //
-                            + ((h_save - j * filter_strides[H_IDX] + filter_pads[H_IDX]) * filter_lens[W_IDX])          //
-                            + (w_save - i * filter_strides[W_IDX] + filter_pads[W_IDX])                       //
-                        );
-                }
-
-                const size_t mask_index = b * mask_strides[N_IDX]             //
-                                          + o * mask_strides[C_IDX]           //
-                                          + (size_t)(k * mask_strides[D_IDX]) //
-                                          + (size_t)(j * mask_strides[H_IDX]) //
-                                          + (size_t)(i * mask_strides[W_IDX]);
-                mask_ptr[mask_index] = res_index;
-            }
-#endif
-        const size_t out_index = out_strides[N_IDX]             //
-                                 + o * out_strides[C_IDX]           //
-                                 + (size_t)(k * out_strides[D_IDX]) //
-                                 + (size_t)(j * out_strides[H_IDX]) //
-                                 + (size_t)(i * out_strides]W_IDX]);
-
-        out_data[out_index] = (_FLOAT)res;
-    }
-}
-}
-
-extern "C" __global__ void mloPoolingForwardNDNhwcNaive(const INPUT_TYPE* __restrict__ in_data,
-                                     OUTPUT_TYPE* out_data,
-                                     ARG_UNUSED_FOR_AVERAGE index_t* mask_ptr,
-                                     ARG_UNUSED_FOR_AVERAGE int save_index,
-                                     ARG_UNUSED_FOR_AVERAGE int index_mode,
-                                     std::vector<uint32_t> filter_lens,
-                                     std::vector<uint32_t> filter_strides,
-                                     std::vector<uint32_t> filter_pads,
-                                     uint32_t all_n,
-                                     uint32_t all_c,
-                                     std::vector<uint32_t> lens,
-                                     std::vector<size_t> strides,
-                                     std::vector<uint32_t> out_lens,
-                                     std::vector<size_t> out_strides,
-                                     ARG_UNUSED_FOR_AVERAGE std::vector<size_t> mask_strides)
-{
-    poolingFwdNDNhwcNaive<INPUT_TYPE, OUTPUT_TYPE>(
-        in_data,
-        out_data,
-        mask_ptr,
-        save_index,
-        index_mode,
-        filter_lens,
-        filter_strides,
-        filter_pads,
-        all_n,
-        all_c,
-        lens,
-        strides,
-        out_lens,
-        out_strides,
-        mask_strides
-    );
-}
diff --git a/src/ocl/pooling_ocl.cpp b/src/ocl/pooling_ocl.cpp
index 9a2258908f..1b22cd91b7 100644
--- a/src/ocl/pooling_ocl.cpp
+++ b/src/ocl/pooling_ocl.cpp
@@ -42,7 +42,7 @@ static auto PoolingForwardSolvers()
     return solver::SolverContainer<solver::pooling::PoolingForward2d,
                                    solver::pooling::PoolingForwardNd,
                                    solver::pooling::PoolingForwardNaive,
-                                   solver::pooling::PoolingForwardNdNhwcNaive,
+                                   solver::pooling::PoolingForwardNDNhwcNaive,
                                    solver::pooling::PoolingForwardCk2d,
                                    solver::pooling::PoolingForwardCkNd,
                                    solver::pooling::TransposedPoolingFwd2d,
@@ -68,7 +68,8 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle,
                                           Data_t y,
                                           bool save_index,
                                           Data_t workSpace,
-                                          size_t workSpaceSize) const
+                                          size_t workSpaceSize,
+                                          Data_t junk) const    // TEMPCODE RJS
 {
 
     if(!float_equal(*(static_cast<const float*>(alpha)), 1.0) ||
@@ -131,6 +132,7 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle,
         tmp.y              = y;
         tmp.workspace      = workSpace;
         tmp.workspace_size = workSpaceSize;
+        tmp.junk = junk;    // TEMPCODE RJS
         return tmp;
     }();
 
diff --git a/src/pooling.cpp b/src/pooling.cpp
index a65cb3c0ab..000e297f3c 100644
--- a/src/pooling.cpp
+++ b/src/pooling.cpp
@@ -217,13 +217,14 @@ TensorDescriptor PoolingDescriptor::GetForwardOutputTensor(const TensorDescripto
 {
     std::vector<int> out_dim(xDesc.GetNumDims());
     GetForwardOutputDimNd(xDesc, xDesc.GetNumDims(), out_dim.data());
+    auto layout_str = xDesc.GetLayout_str();
+    auto layout = xDesc.GetLayout_t();
+    auto lengths_layout = miopen::tensor_layout_get_default(xDesc.GetNumDims());
 
-    const std::string default_layout = tensor_layout_get_default(xDesc.GetNumDims());
-    const std::string in_layout      = xDesc.GetLayout(default_layout);
     std::vector<int> out_strides;
-    tensor_layout_to_strides(out_dim, default_layout, in_layout, out_strides);
+    tensor_layout_to_strides(out_dim, lengths_layout, layout_str, out_strides);
 
-    return {xDesc.GetType(), out_dim, out_strides};
+    return {xDesc.GetType(), layout, out_dim, out_strides};
 }
 
 std::size_t PoolingDescriptor::GetWorkSpaceSize(const TensorDescriptor& yDesc) const
diff --git a/src/pooling/problem_description.cpp b/src/pooling/problem_description.cpp
index 8e171a4ac0..ad36abfb08 100644
--- a/src/pooling/problem_description.cpp
+++ b/src/pooling/problem_description.cpp
@@ -82,8 +82,14 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const
         ss << "_dxs" << get_vect_config(dxDesc.GetStrides());
         ss << "_dyd" << get_vect_config(dyDesc.GetLengths());
         ss << "_dys" << get_vect_config(dyDesc.GetStrides());
+    }   // TEMPCODE RJS
+    std::cout << "\n************** xDesc layout: " << xDesc.GetLayout_str() << " *************************" << std::endl;
+    if(!xDesc.IsDefaultLayout())
+    {
+    std::cout <<   "               xDesc layout is not default! " << " *************************\n";
     }
-
+    ss << "_l" << (xDesc.IsDefaultLayout() ? 0 : 1);
+std::cout << "               " << ss.str() << std::endl;
     return NetworkConfig{ss.str()};
 }
 
diff --git a/src/solver.cpp b/src/solver.cpp
index 0088a10cb1..28ca3751a8 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -600,7 +600,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
              fusion::ConvCKIgemmFwdBiasActivFused{}.SolverDbId(),
              miopenConvolutionAlgoImplicitGEMM);
     Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNaive{}.SolverDbId());
-    Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNdNhwcNaive{}.SolverDbId());
+    Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNDNhwcNaive{}.SolverDbId());
     RegisterWithSolver(registry,
                        ++id,
                        conv::ConvHipImplicitGemmGroupFwdXdlops{},
diff --git a/src/solver/pooling/forward2d.cpp b/src/solver/pooling/forward2d.cpp
index 87fd0e851f..d3fe890ecd 100644
--- a/src/solver/pooling/forward2d.cpp
+++ b/src/solver/pooling/forward2d.cpp
@@ -135,7 +135,8 @@ std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& pro
 bool PoolingForward2d::IsApplicable(const ExecutionContext& context,
                                     const miopen::pooling::ProblemDescription& problem) const
 {
-    return problem.GetDirection() == miopen::pooling::Direction::Forward &&
+    bool app =
+    problem.GetDirection() == miopen::pooling::Direction::Forward &&
            problem.GetXDesc().GetNumDims() == 4 &&
            problem.GetXDesc().GetType() == problem.GetYDesc().GetType() &&
            (problem.GetXDesc().GetType() == miopenFloat ||
@@ -144,6 +145,23 @@ bool PoolingForward2d::IsApplicable(const ExecutionContext& context,
            problem.GetYDesc().GetLayout("NCHW") == "NCHW" &&
            sizeof_private_memory(problem) <=
                TargetProperties::GetMaxWaveScratchSize() / context.GetStream().GetWavefrontWidth();
+
+// TEMPCODE RJS
+    std::cout << "%%%%%%%%%% PoolingForward2d::IsApplicable: " << app << " " <<  problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout("NCHW") << std::endl;
+    return false;
+               return app;
+}
+
+#include <iomanip>  // TEMPCODE RJS
+namespace {
+    template<typename T>
+    void printVec(std::string name, const std::vector<T>& vec)
+    {
+        return;
+        std::cout << "Vector Printing: " << std::setw(20) << name << "[" << vec.size() << "]: ";
+        for(auto i : vec)    std::cout << std::setw(8) << i;
+        std::cout << std::endl;
+    }
 }
 
 ConvSolution PoolingForward2d::GetSolution(const ExecutionContext&,
@@ -182,6 +200,25 @@ ConvSolution PoolingForward2d::GetSolution(const ExecutionContext&,
                 : ((pool_d.GetMode() == miopenPoolingAverage) ? MLO_POOLING_OP_AVE
                                                               : MLO_POOLING_OP_AVE_INCLUSIVE);
 
+
+    // TEMPCODE RJS
+        const auto bot  = problem.GetXDesc();
+    const auto top  = problem.GetYDesc();
+    const auto& pooling = problem.GetPooling();
+    const auto& lengths = pooling.GetLengths();
+    const auto& strides = pooling.GetStrides();
+    const auto& pads    = pooling.GetPads();
+
+    std::cout << "======================================================================" << std::endl;
+    printVec("bot lengths", bot.GetLengths());
+    printVec("bot strides", bot.GetStrides());
+    printVec("top lengths", top.GetLengths());
+    printVec("top strides", top.GetStrides());
+    printVec("pool lengths", lengths);
+    printVec("pool strides", strides);
+    printVec("pool pads", pads);
+    std::cout << "======================================================================" << std::endl;
+
         auto build_params = KernelBuildParameters{
             {"MLO_POOLING_OP_ID", pooling_method},
             {"MLO_POOLING_KERNEL_SZ1", kp.kernel_size_h},
diff --git a/src/solver/pooling/forwardCk2d.cpp b/src/solver/pooling/forwardCk2d.cpp
index d66da12a33..9681f7f563 100644
--- a/src/solver/pooling/forwardCk2d.cpp
+++ b/src/solver/pooling/forwardCk2d.cpp
@@ -135,6 +135,7 @@ std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& pro
 bool PoolingForwardCk2d::IsApplicable(const ExecutionContext& context,
                                     const miopen::pooling::ProblemDescription& problem) const
 {
+    return false;
     return problem.GetDirection() == miopen::pooling::Direction::Forward &&
            problem.GetXDesc().GetNumDims() == 4 &&
            problem.GetXDesc().GetType() == problem.GetYDesc().GetType() &&
diff --git a/src/solver/pooling/forwardNaive.cpp b/src/solver/pooling/forwardNaive.cpp
index 86d24a03de..faf4fac836 100644
--- a/src/solver/pooling/forwardNaive.cpp
+++ b/src/solver/pooling/forwardNaive.cpp
@@ -68,23 +68,35 @@ inline uint32_t RoundUpNearestPower2Positive(uint32_t v)
 bool PoolingForwardNaive::IsApplicable(const ExecutionContext&,
                                        const miopen::pooling::ProblemDescription& problem) const
 {
-    return problem.GetDirection() == miopen::pooling::Direction::Forward           //
-           && problem.GetXDesc().GetType() == problem.GetYDesc().GetType()         //
-           && (problem.GetXDesc().GetType() == miopenFloat                         //
-               || problem.GetXDesc().GetType() == miopenHalf)                      //
-           && (problem.GetPooling().GetMode() == miopenPoolingMax                  //
-               || problem.GetPooling().GetMode() == miopenPoolingAverage           //
-               || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) //
-           && (problem.GetXDesc().GetLayout("NCDHW") == problem.GetYDesc().GetLayout("NCDHW")) //
-           && (                                                                    //
-                  (problem.GetXDesc().GetNumDims() == 5                            //
-                   && (problem.GetXDesc().GetLayout("NCDHW") == "NCDHW"            //
-                   || problem.GetXDesc().GetLayout("NCDHW") == "NDHWC"))           //
-                  ||                                                               //
-                  (problem.GetXDesc().GetNumDims() == 4                            //
-                   && (problem.GetXDesc().GetLayout("NCHW") == "NCHW"              //
-                   || problem.GetYDesc().GetLayout("NCHW") == "NHWC"))             //
-              );
+    auto x_type = problem.GetXDesc().GetType();
+    auto y_type = problem.GetYDesc().GetType();
+    std::vector<miopenDataType_t> types {miopenFloat, miopenHalf};
+
+    auto mode = problem.GetPooling().GetMode();
+    std::vector<miopenPoolingMode_t> modes {miopenPoolingMax, miopenPoolingAverage, miopenPoolingAverageInclusive};
+
+    auto x_layout = problem.GetXDesc().GetLayout_str();
+    auto y_layout = problem.GetYDesc().GetLayout_str();
+    std::vector<std::string> layouts {"NCHW", "NCDHW"};
+
+    return (problem.GetDirection() == miopen::pooling::Direction::Forward)          //
+        && (x_type == y_type)                                                       //
+        && (std::find(types.cbegin(), types.cend(), x_type) != types.cend())        //
+        && (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend())          //)
+        && (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end());
+}
+
+
+#include <iomanip>  // TEMPCODE RJS
+namespace {
+    template<typename T>
+    void printVec(std::string name, const std::vector<T>& vec)
+    {
+         return;
+       std::cout << "Vector Printing: " << std::setw(20) << name << "[" << vec.size() << "]: ";
+        for(auto i : vec)    std::cout << std::setw(8) << i;
+        std::cout << std::endl;
+    }
 }
 
 ConvSolution
@@ -96,7 +108,7 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context,
     const auto bot  = problem.GetXDesc();
     const auto top  = problem.GetYDesc();
     const bool is2d = (bot.GetNumDims() == 4);
-    const bool isTranspose = problem.GetXDesc().GetLayout("NCHW")[1] == 'C';
+    const bool isTranspose = problem.GetXDesc().GetLayout_str()[1] != 'C';
 
     // To compact code:
     const auto& pooling = problem.GetPooling();
@@ -124,6 +136,17 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context,
     const auto index_mode = pooling.GetWorkspaceIndexMode();
     const auto index_type = pooling.GetIndexType();
 
+    // TEMPCODE RJS
+    std::cout << "======================================================================" << std::endl;
+    printVec("bot lengths", bot.GetLengths());
+    printVec("bot strides", bot.GetStrides());
+    printVec("top lengths", top.GetLengths());
+    printVec("top strides", top.GetStrides());
+    printVec("pool lengths", lengths);
+    printVec("pool strides", strides);
+    printVec("pool pads", pads);
+    std::cout << "======================================================================" << std::endl;
+
     /// \anchor multiply_dims_overflow_assumption
     ///
     /// Preventing overflow during dimension-related computations:
@@ -160,7 +183,8 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context,
     const size_t mask_c_stride   = static_cast<size_t>(mask_d_stride) * top_d;
     const size_t mask_n_stride   = mask_c_stride * all_c;
 
-    /// About optimal grid size. The simplest way is to map the problem onto grid is 1:1 mapping of
+    /// About optimal grid size:
+    /// NC[D]HW: The simplest way is to map the problem onto grid is 1:1 mapping of
     /// N,C and top.D onto grid dimensions.
     ///
     /// However, this would waste 1 dimension of grid for 2D convolutions, i.e. the grid size would
@@ -169,6 +193,10 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context,
     /// access memory in a scattered way, which would affect performance again. Current design
     /// choice is using separate 2D and 3D kernels (via build-time parameter) and N*C*H grid for 2D.
     ///
+    /// N[D]HWC: top N, D, H are mapped directly onto grid dimensions
+    ///
+    ///  
+    ///
     /// \anchor naive_pooling_max_grid_size
     /// * Assumption: Max grid size is >= 2^32-1 (4G-1) i.e. std::max<unint32_t>.
     ///   Currently this limitation is valid for both ROCm HIP and OCL runtimes.
@@ -197,9 +225,10 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context,
 
     const auto is2d_kernel = (top_d == 1); // For 2D + optimize for 3D where the 1st dim is 1.
     const auto g0          = RoundUpNearestPower2Positive(all_n);
-    const auto g1          = RoundUpNearestPower2Positive(all_c);
-    const auto g2          = RoundUpNearestPower2Positive(is2d_kernel ? top_h : top_d);
+    const auto g1          = RoundUpNearestPower2Positive(isTranspose ? top_d : all_c);
+    const auto g2          = RoundUpNearestPower2Positive(isTranspose || is2d_kernel ? top_h : top_d);
 
+    // TODO RJS: finish NHWC grid
     auto work_left = wavesize / 1;
     const auto w0  = (g0 < work_left) ? g0 : work_left;
     work_left /= w0;
@@ -210,16 +239,8 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context,
     {
         auto kernel = KernelInfo{};
 
-        if(isTranspose)
-        {
-            kernel.kernel_file = "MIOpenPoolingFwdNDNhwcNaive.cpp";
-            kernel.kernel_name = "mloPoolingForwardNDNhwcNaive";
-        }
-        else
-        {
-            kernel.kernel_file = "MIOpenPoolingForwardNaive.cl";
-            kernel.kernel_name = "mloPoolingForwardNaive";
-        }
+        kernel.kernel_file = "MIOpenPoolingForwardNaive.cl";
+        kernel.kernel_name = "mloPoolingForwardNaive";
 
         auto build_params = KernelBuildParameters{
             {"MLO_POOLING_OP_ID", pooling_method}, // We need this at compile time in order to
@@ -227,11 +248,9 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context,
             {"MLO_POOLING_INDEX_TYPE", get_pooling_index_type_name(index_type)},
             {"MLO_POOLING_IS2D_KERNEL", static_cast<int>(is2d_kernel)},
         };
+
         build_params << GetDataTypeKBP(bot.GetType());
-        if(isTranspose)
-            kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
-        else
-            kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{});
+        kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{});
 
         // [Informative] The total number of kernels required to cover the whole
         // forward pooling problem space is 3*4*2*2 = 48. The solver is dynamic.
@@ -250,79 +269,50 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context,
         result.construction_params.push_back(kernel);
     }
 
-    if(isTranspose)
-    {
-        result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
-            return [=](const Handle& handle, const AnyInvokeParams& raw_params) {
-                decltype(auto) kernel = handle.Run(kernels.front());
-                decltype(auto) params = raw_params.CastTo<miopen::pooling::FwdInvokeParams>();
-
-                // NOTE: kernel 'mloPoolingForwardNDNhwcNaive' expects indices in DHW[NC] order
-                kernel(params.x,
-                    params.y,
-                    params.workspace,
-                    save_index,
-                    index_mode,
-                    std::vector<uint32_t>{filter_d, filter_h, filter_w},
-                    std::vector<uint32_t>{filter_d_stride, filter_h_stride, filter_w_stride},
-                    std::vector<uint32_t>{filter_d_pad, filter_h_pad, filter_w_pad},
-                    all_n,
-                    all_c,
-                    std::vector<uint32_t>{bot_d, bot_h, bot_w},
-                    std::vector<size_t>{bot_d_stride, bot_h_stride, bot_w_stride, bot_n_stride, bot_c_stride},
-                    std::vector<uint32_t>{top_d, top_h, top_w},
-                    std::vector<size_t>{top_d_stride, top_h_stride, top_w_stride, top_n_stride, top_c_stride},
-                    std::vector<size_t>{mask_d_stride, mask_h_stride, mask_w_stride, mask_n_stride, mask_c_stride});
-            };
+    result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::pooling::FwdInvokeParams>();
+
+            kernel(params.x,
+                params.y,
+                params.workspace,
+                save_index,
+                index_mode,
+                filter_d,
+                filter_h,
+                filter_w,
+                filter_d_stride,
+                filter_h_stride,
+                filter_w_stride,
+                filter_d_pad,
+                filter_h_pad,
+                filter_w_pad,
+                all_n,
+                all_c,
+                bot_d,  // TEMPCODE RJS: have not broke it
+                bot_h,
+                bot_w,
+                bot_n_stride,
+                bot_c_stride,
+                bot_d_stride,
+                bot_h_stride,
+                bot_w_stride,
+                top_d,
+                top_h,
+                top_w,
+                top_n_stride,
+                top_c_stride,
+                top_d_stride,
+                top_h_stride,
+                top_w_stride,
+                mask_n_stride,
+                mask_c_stride,
+                mask_d_stride,
+                mask_h_stride,
+                mask_w_stride);
         };
-    }
-    else
-    {
-        result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
-            return [=](const Handle& handle, const AnyInvokeParams& raw_params) {
-                decltype(auto) kernel = handle.Run(kernels.front());
-                decltype(auto) params = raw_params.CastTo<miopen::pooling::FwdInvokeParams>();
-
-                kernel(params.x,
-                    params.y,
-                    params.workspace,
-                    save_index,
-                    index_mode,
-                    filter_d,
-                    filter_h,
-                    filter_w,
-                    filter_d_stride,
-                    filter_h_stride,
-                    filter_w_stride,
-                    filter_d_pad,
-                    filter_h_pad,
-                    filter_w_pad,
-                    all_n,
-                    all_c,
-                    bot_h,
-                    bot_d,  // TODO RJS: broke it
-                    bot_w,
-                    bot_n_stride,
-                    bot_c_stride,
-                    bot_d_stride,
-                    bot_h_stride,
-                    bot_w_stride,
-                    top_d,
-                    top_h,
-                    top_w,
-                    top_n_stride,
-                    top_c_stride,
-                    top_d_stride,
-                    top_h_stride,
-                    top_w_stride,
-                    mask_n_stride,
-                    mask_c_stride,
-                    mask_d_stride,
-                    mask_h_stride,
-                    mask_w_stride);
-            };
-        };
-    }
+    };
 
     return result;
 }
diff --git a/src/solver/pooling/forwardNdNhwcNaive.cpp b/src/solver/pooling/forwardNdNhwcNaive.cpp
new file mode 100644
index 0000000000..c946fd8da7
--- /dev/null
+++ b/src/solver/pooling/forwardNdNhwcNaive.cpp
@@ -0,0 +1,345 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/datatype.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/pooling.hpp>
+#include <miopen/pooling/invoke_params.hpp>
+#include <miopen/pooling/solvers.hpp>
+
+#define WORKAROUND_ISSUE_MIFIN_80 1 // https://github.com/ROCm/MIFin/issues/80
+
+namespace miopen {
+
+namespace solver {
+
+namespace pooling {
+
+namespace {
+
+#if !MIOPEN_NDEBUG && !WORKAROUND_ISSUE_MIFIN_80
+template <typename T>
+bool IsPower2(T v)
+{
+    return (v != 0) && ((v & (v - 1)) == 0);
+}
+#endif
+
+template <typename T>
+T RoundUpNearestPower2Positive(T v) = delete;
+
+inline uint32_t RoundUpNearestPower2Positive(uint32_t v)
+{
+    assert(v > 0);
+    --v;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    return std::max(++v, 1U); // Shut clang-tidy.
+}
+
+} // namespace
+
+bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&,
+                                       const miopen::pooling::ProblemDescription& problem) const
+{
+    auto x_type = problem.GetXDesc().GetType();
+    auto y_type = problem.GetYDesc().GetType();
+    std::vector<miopenDataType_t> types {miopenFloat, miopenHalf};
+
+    auto mode = problem.GetPooling().GetMode();
+    std::vector<miopenPoolingMode_t> modes {miopenPoolingMax, miopenPoolingAverage, miopenPoolingAverageInclusive};
+
+    auto x_layout = problem.GetXDesc().GetLayout_str();
+    auto y_layout = problem.GetYDesc().GetLayout_str();
+    std::vector<std::string> layouts {"NHWC", "NDHWC"};
+
+    bool app = (problem.GetDirection() == miopen::pooling::Direction::Forward)          //
+        && (x_type == y_type)                                                       //
+        && (x_layout == y_layout)                                                   //
+        && (std::find(types.cbegin(), types.cend(), x_type) != types.cend())        //
+        && (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend())          //)
+        && (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end());
+
+    std::cout << "%%%%%%%%%% PoolingForwardNDNhwcNaive::IsApplicable: " << app << " " <<  problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout("NCHW")
+     << "  " << problem.GetYDesc().GetLayout_str() << "->" << problem.GetYDesc().GetLayout("NCHW")
+       << "  "  << (problem.GetDirection() == miopen::pooling::Direction::Forward)
+        << (x_type == y_type)
+        << (x_layout == y_layout) << (std::find(types.cbegin(), types.cend(), x_type) != types.cend())
+        << (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend()) << (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()) << std::endl;
+    return app;
+}
+
+#include <iomanip>  // TEMPCODE RJS
+namespace {
+    template<typename T>
+    void printVec(std::string name, const std::vector<T>& vec)
+    {
+         return;
+      std::cout << "Vector Printing: " << std::setw(20) << name << "[" << vec.size() << "]: ";
+        for(auto i : vec)    std::cout << std::setw(8) << i;
+        std::cout << std::endl;
+    }
+}
+
+ConvSolution
+PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context,
+                                 const miopen::pooling::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto input_dtype  = miopen::GetDataType(problem.GetXDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetYDesc().GetType());
+
+    const auto bot  = problem.GetXDesc();
+    const auto top  = problem.GetYDesc();
+    const bool is2d = (bot.GetNumDims() == 4);
+    const bool is_transpose = problem.GetXDesc().GetLayout_str()[1] != 'C';
+    if(!is_transpose)
+    {
+        MIOPEN_THROW("Tried to run NHWC solver on NCHW data");
+    }
+
+    // To compact code:
+    const auto& pooling = problem.GetPooling();
+    const auto& lengths = pooling.GetLengths();
+    const auto& strides = pooling.GetStrides();
+    const auto& pads    = pooling.GetPads();
+
+    // This also deduces 3D (DHW) parameters from 2D (HW) descriptor.
+    const uint32_t filter_w        = lengths[is2d ? 1 : 2];
+    const uint32_t filter_h        = lengths[is2d ? 0 : 1];
+    const uint32_t filter_d        = is2d ? 1 : lengths[0];
+    const uint32_t filter_w_stride = strides[is2d ? 1 : 2];
+    const uint32_t filter_h_stride = strides[is2d ? 0 : 1];
+    const uint32_t filter_d_stride = is2d ? (filter_h_stride * filter_d) : strides[0];
+    const uint32_t filter_w_pad    = pads[is2d ? 1 : 2];
+    const uint32_t filter_h_pad    = pads[is2d ? 0 : 1];
+    const uint32_t filter_d_pad    = is2d ? 0 : pads[0];
+
+    const int pooling_method = (pooling.GetMode() == miopenPoolingMax) ? MLO_POOLING_OP_MAX
+                               : (pooling.GetMode() == miopenPoolingAverage)
+                                   ? MLO_POOLING_OP_AVE
+                                   : MLO_POOLING_OP_AVE_INCLUSIVE;
+
+    const auto save_index = problem.SaveIndex();
+    const auto index_mode = pooling.GetWorkspaceIndexMode();
+    const auto index_type = pooling.GetIndexType();
+
+    /// \anchor multiply_dims_overflow_assumption
+    ///
+    /// Preventing overflow during dimension-related computations:
+    /// Let's assume that multiplication of three dims always fits into 32 bits (unsigned).
+    /// Then let's use size_t when we need to multiply more than three dims.
+    /// For example, in NCDHW layout, the N and C strides are results of multiplication
+    /// of >= 3 dims, so we have to use size_t for storing them.
+    ///
+    /// We need to pay special attention to muls of D stride with some other dims.
+    /// The D stride is a result of 2 muls. Therefore (d_stride * dim) does
+    /// not require widening to size_t prior mul, but (d_stride * dim * dim)
+    /// requires it because the total number of muls is 4.
+
+    // TEMPCODE RJS
+    std::cout << "======================================================================" << std::endl;
+    printVec("bot lengths", bot.GetLengths());
+    printVec("bot strides", bot.GetStrides());
+    printVec("top lengths", top.GetLengths());
+    printVec("top strides", top.GetStrides());
+    printVec("pool lengths", lengths);
+    printVec("pool strides", strides);
+    printVec("pool pads", pads);
+    std::cout << "======================================================================" << std::endl;
+
+    const auto spatial_dim = is2d ? 2U : 3U;
+
+    size_t all_n, all_c, bot_d, bot_h, bot_w;
+    std::tie(all_n, all_c, bot_d, bot_h, bot_w) = miopen::GetNCDHW(spatial_dim, bot.GetLengths());
+
+    size_t bot_n_stride, bot_d_stride;
+    size_t bot_h_stride, bot_w_stride, bot_c_stride;
+    std::tie(bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride, bot_w_stride) =
+        miopen::GetNCDHW(spatial_dim, bot.GetStrides());
+
+    size_t a1, a2, top_d, top_h, top_w;
+    std::tie(a1, a2, top_d, top_h, top_w) =
+        miopen::GetNCDHW(spatial_dim, top.GetLengths());
+    std::cout << "GetSol: top_lens " << a1 << " " << a2 << " " << top_d << " " << top_h << " " << top_w << std::endl;
+
+    size_t top_n_stride, top_d_stride;
+    size_t top_h_stride, top_w_stride, top_c_stride;
+    std::tie(top_n_stride, top_c_stride, top_d_stride, top_h_stride, top_w_stride) =
+        miopen::GetNCDHW(spatial_dim, top.GetStrides());
+    // TEMPCODE RJS
+    std::cout << "GetSol: top_strides " << top_n_stride << " " << top_c_stride << " "
+    << top_d_stride << " " << top_h_stride << " " << top_w_stride << std::endl;
+    // Mask data is always NCDHW layout
+    const uint32_t mask_w_stride = 1;
+    const uint32_t mask_h_stride = mask_w_stride * top_w;
+    const uint32_t mask_d_stride = mask_h_stride * top_h;
+    const size_t mask_c_stride   = static_cast<size_t>(mask_d_stride) * top_d;
+    const size_t mask_n_stride   = mask_c_stride * all_c;
+
+    /// About optimal grid size:
+    /// top D, H, and W are mapped directly onto grid dimensions, except in very small problems
+    /// when they are combined into workgroup items in an attempt to improve overlapping and coalescense.
+    /// N seems to be generally small, so we'll multiply it into the 'D' dimension.
+    ///
+    /// \anchor naive_pooling_max_grid_size
+    /// * Assumption: Max grid size is >= 2^32-1 (4G-1) i.e. std::max<uint32_t>.
+    ///   However, assume the product of two dimensions is always <= 2^30.
+    ///   Currently this limitation is valid for both ROCm HIP and OCL runtimes.
+    ///
+    /// Selecting the optimal workgroup size is an interesting problem.
+    /// We'll first map c into the workgroup up to the maximum 1024 items. For large C, the
+    /// extra are mapped into the grid dimensions.
+    /// For small C, w and h are mapped into the workgroup dimensions as needed, in that
+    /// order, up to a maximum of 128 workitems (favoring more active blocks over more threads).
+    /// We do permit a partial workgroup when it is not an exact multiple of the wavefront size.
+    /// As said above, remaining H and W are mapped onto the grid dimensions.
+    ///
+    /// The workgroup size does not have the restrictions imposed by synchronization between
+    /// workitems because the kernel does not require synchronization.
+
+    std::ignore = context;
+    constexpr uint32_t LARGE_C_MAX_ITEMS = 512;
+    constexpr uint32_t SMALL_C_MAX_ITEMS = 128;
+
+    auto nd_ = all_n * top_d;
+    auto h_  = top_h;
+    auto w_  = top_w;
+    auto c_  = all_c;
+
+    uint32_t l1 = 1U;
+    uint32_t l2 = 1U;
+
+    if(c_ > LARGE_C_MAX_ITEMS)
+    {
+        auto c2 = c_ / LARGE_C_MAX_ITEMS + 1;
+        c_ = LARGE_C_MAX_ITEMS;
+        w_ *= c2;
+    }
+    // else if(c_ <= SMALL_C_MAX_ITEMS / 2)
+    // {
+    //     if(c_ * w_ <= SMALL_C_MAX_ITEMS)
+    //     {
+    //         std::swap(l2, w_);
+
+    //         if(c_ * w_ * h_ <= SMALL_C_MAX_ITEMS)
+    //         {
+    //             std::swap(l1, h_);
+    //         }
+    //     }
+    // }
+
+    const auto g0 = nd_;
+    const auto g1 = h_;
+    const auto g2 = w_;
+    const auto l0 = c_;
+
+    {
+        auto kernel = KernelInfo{};
+
+        kernel.kernel_file = "MIOpenPoolingForwardNDNhwcNaive.cpp";
+        kernel.kernel_name = "mloPoolingForwardNDNhwcNaive";
+
+        auto build_params = KernelBuildParameters{
+            {"MLO_POOLING_OP_ID", pooling_method}, // We need this at compile time in order to
+                                                   // engage mixed precision only when necessary.
+            {"MLO_POOLING_INDEX_TYPE", get_pooling_index_type_name(index_type)},
+            {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype},
+            {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}
+        };
+
+        build_params << GetDataTypeKBP(bot.GetType());
+        kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+        // [Informative] The total number of kernels required to cover the whole
+        // forward pooling problem space is 3*4*2*2*2 = 96. The solver is dynamic.
+        // * 3: the number of supported operations
+        // * 4: the number of supported index types
+        // * 2: the number of supported data types
+        // * 2: layout (NCHW vs NHWC)
+        // * 2: 2D and 3D kernels (optimization)
+
+        kernel.g_wk.clear();
+        kernel.g_wk.push_back(g0);
+        kernel.g_wk.push_back(g1);
+        kernel.g_wk.push_back(g2);
+        kernel.l_wk.clear();
+        kernel.l_wk.push_back(l0);
+        kernel.l_wk.push_back(l1);
+        kernel.l_wk.push_back(l2);
+
+        // TEMPCODE RJS
+        std::cout << "Kernel dims: " << kernel.g_wk.size() << " " << kernel.g_wk[0] << " " << kernel.g_wk[1] << " " << kernel.g_wk[2]
+        << " | " << kernel.l_wk.size() << " " << kernel.l_wk[0] << " " << kernel.l_wk[1] << " " << kernel.l_wk[2] << std::endl;
+        result.construction_params.push_back(kernel);
+    }
+
+    result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::pooling::FwdInvokeParams>();
+
+            kernel(
+                params.x,
+                params.y,
+                params.junk,   // TEMPCODE RJS
+                params.workspace,
+                save_index,
+                index_mode,
+                filter_d, filter_h, filter_w,
+                filter_d_stride, filter_h_stride, filter_w_stride,
+                filter_d_pad, filter_h_pad, filter_w_pad,
+                all_n,
+                all_c,
+                bot_d, bot_h, bot_w,
+                bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride, bot_w_stride,
+                top_d, top_h, top_w,
+                top_n_stride, top_c_stride, top_d_stride, top_h_stride, top_w_stride,
+                mask_n_stride, mask_c_stride, mask_d_stride, mask_h_stride, mask_w_stride);
+        };
+    };
+
+    return result;
+}
+
+std::size_t
+PoolingForwardNDNhwcNaive::GetWorkspaceSize(const ExecutionContext&,
+                                      const miopen::pooling::ProblemDescription& problem) const
+{
+    if(problem.GetPooling().GetMode() != miopenPoolingMax || !problem.SaveIndex())
+        return 0;
+    return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType());
+}
+
+} // namespace pooling
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/tensor.cpp b/src/tensor.cpp
index 7ec4c4e581..f189dcf8da 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -153,6 +153,14 @@ TensorDescriptor::TensorDescriptor(miopenDataType_t t,
 {
 }
 
+TensorDescriptor::TensorDescriptor(miopenDataType_t t,
+                                  miopenTensorLayout_t layout_in,
+                                    const std::vector<int>& lens_in,
+                     const std::vector<int>& strides_in)
+    : TensorDescriptor(t, layout_in, ConvertLengthsOrThrow(lens_in, "Lengths must be > 0"), ConvertLengthsOrThrow(strides_in, "Strides must be > 0"))
+{
+}
+
 TensorDescriptor::TensorDescriptor(miopenDataType_t t,
                                    miopenTensorLayout_t layout_in,
                                    const std::initializer_list<std::size_t>& lens_in)
@@ -446,6 +454,11 @@ std::string TensorDescriptor::GetLayoutStr(miopenTensorLayout_t tensorLayout)
 
 std::string TensorDescriptor::GetLayout_str() const { return GetLayoutStr(this->tensorLayout); }
 
+bool TensorDescriptor::IsDefaultLayout() const
+{
+    return IsDefaultLayout(tensorLayout, lens.size() - 2);
+}
+
 std::size_t TensorDescriptor::GetVectorLength() const { return this->vector_length; }
 
 std::size_t TensorDescriptor::GetIndex(std::initializer_list<int> l) const
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 434fdfe5df..0211790493 100755
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -192,6 +192,8 @@ message(STATUS "MIOPEN_TEST_COMPOSABLEKERNEL ${MIOPEN_TEST_COMPOSABLEKERNEL}")
 message(STATUS "MIOPEN_TEST_DISCRETE ${MIOPEN_TEST_DISCRETE}")
 message(STATUS "MIOPEN_TEST_DBSYNC ${MIOPEN_TEST_DBSYNC}")
 message(STATUS "CODECOV_TEST ${CODECOV_TEST}")
+message(STATUS "CMAKE_CTEST_COMMAND ${CMAKE_CTEST_COMMAND}")
+message(STATUS "CMAKE_CFG_INTDIR ${CMAKE_CFG_INTDIR}")
 
 if(MIOPEN_TEST_DRIVER_ITER_MODE)
     add_definitions(-DMIOPEN_TEST_DRIVER_MODE=2)
diff --git a/test/gtest/ex1.cpp b/test/gtest/ex1.cpp
new file mode 100644
index 0000000000..4b1ab5e2b1
--- /dev/null
+++ b/test/gtest/ex1.cpp
@@ -0,0 +1,15 @@
+#include <gtest/gtest.h>
+
+struct paramType { std::string value; };
+
+class MyFixture : public testing::TestWithParam<paramType> {};
+class FixtureA : public MyFixture {};
+class FixtureB : public MyFixture {};
+
+TEST_P(FixtureA, TestNameA0) { auto& myParam = GetParam(); EXPECT_GT(myParam.value.size(), 0ULL); }
+TEST_P(FixtureB, TestNameA0) { auto& myParam = GetParam(); EXPECT_GT(myParam.value.size(), 0ULL); }
+
+INSTANTIATE_TEST_SUITE_P(PIN0, FixtureA, testing::Values(paramType{"v00"}, paramType{"v01"}));
+INSTANTIATE_TEST_SUITE_P(PIN1, FixtureA, testing::Values(paramType{"v10"}, paramType{"v11"}, paramType{"v12"}));
+INSTANTIATE_TEST_SUITE_P(PIN2, FixtureB, testing::Values(paramType{"v00"}, paramType{"v11"}));
+
diff --git a/test/gtest/layout_transpose.cpp b/test/gtest/layout_transpose.cpp
index 54d6cc21a7..1161eff2b9 100644
--- a/test/gtest/layout_transpose.cpp
+++ b/test/gtest/layout_transpose.cpp
@@ -274,7 +274,52 @@ struct LayoutTransposeTest_2D : public ::testing::TestWithParam<std::tuple<uint3
                 miopen::tensor_layout_to_strides(
                     tensor_len, layout_default, layout_string, tensor_strides);
 
-                auto t_src     = tensor<T>{tensor_len, tensor_strides}.generate(gen_value<T>);
+                auto t_src     = tensor<T>{tensor_len, tensor_strides};
+        bool printing = true; // in_dim[0]==8 && in_dim[1]==8;
+        if(printing)
+        {
+            auto inlen = t_src.desc.GetLengths();
+            auto instr = t_src.desc.GetStrides();
+            std::cout << "CPU in : ";
+            for(auto dim : inlen) std::cout << std::setw(4) << dim;
+            std::cout << " | ";
+            for(auto str : instr) std::cout << std::setw(4) << str;
+            std::cout << std::endl;
+
+            for(int nn = 0; nn < inlen[0]; ++nn) {
+                for(int cc = 0; cc < inlen[1]; ++cc) {
+            for(int hh = 0; hh < inlen[2]; ++hh) {
+                for(int ww = 0; ww < inlen[3]; ++ww) {
+                    std::cout << std::setw(11) << std::setprecision(5) << t_src(nn * instr[0] + cc * instr[1] + hh * instr[2] + ww * instr[3]) << "  ";
+                }
+            std::cout << std::endl;
+            }
+            }
+            }
+        }
+
+                t_src.generate(gen_value<T>);
+        if(printing)
+        {
+            auto inlen = t_src.desc.GetLengths();
+            auto instr = t_src.desc.GetStrides();
+            std::cout << "CPU in : ";
+            for(auto dim : inlen) std::cout << std::setw(4) << dim;
+            std::cout << " | ";
+            for(auto str : instr) std::cout << std::setw(4) << str;
+            std::cout << std::endl;
+
+            for(int nn = 0; nn < inlen[0]; ++nn) {
+                for(int cc = 0; cc < inlen[1]; ++cc) {
+            for(int hh = 0; hh < inlen[2]; ++hh) {
+                for(int ww = 0; ww < inlen[3]; ++ww) {
+                    std::cout << std::setw(11) << std::setprecision(5) << t_src(nn * instr[0] + cc * instr[1] + hh * instr[2] + ww * instr[3]) << "  ";
+                }
+            std::cout << std::endl;
+            }
+            }
+            }
+        }
                 auto t_dst     = tensor<T>{tensor_len, tensor_strides};
                 auto t_dst_gpu = tensor<T>{tensor_len, tensor_strides};
 
diff --git a/test/gtest/poolingFwdNdNaive.cpp b/test/gtest/poolingFwdNdNaive.cpp
index 3e11396fff..0b7682ccc3 100644
--- a/test/gtest/poolingFwdNdNaive.cpp
+++ b/test/gtest/poolingFwdNdNaive.cpp
@@ -24,6 +24,8 @@
  *
  *******************************************************************************/
 
+#ifndef POOLING_GTEST_BUILD
+
 #include <gtest/gtest.h>
 #include <miopen/env.hpp>
 #include "get_handle.hpp"
@@ -32,6 +34,7 @@
 #include "pooling2d.hpp"
 
 #include "tensor_holder.hpp"
+#include "miopen/tensor_layout.hpp"
 
 MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL)
 MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLAGS_ARGS)
@@ -39,8 +42,9 @@ MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLAGS_ARGS)
 namespace env = miopen::env;
 
 namespace {
+
 template <typename T>
-struct tensor_data
+struct layout_data
 {
     static std::vector<int> get_layout_lengths(int n, int c, std::vector<int>& dims)
     {
@@ -109,62 +113,24 @@ struct tensor_data
 
 }
 
-namespace pooling_tests {
+class PoolingFwd : public testing::TestWithParam<std::vector<std::string>> {};
+class PoolingFwdFloat : public PoolingFwd {};
+class PoolingFwdHalf : public PoolingFwd {};
 
-class PoolingFwdNdNaive : public testing::TestWithParam<std::vector<std::string>>
-{
-};
+void Run2dDriver(miopenDataType_t prec);
 
-static bool SkipTest(void) { return env::disabled(MIOPEN_TEST_ALL); }
+namespace {
 
-// void GetArgs(const std::string& param, std::vector<std::string>& tokens)
-// {
-//     std::stringstream ss(param);
-//     std::istream_iterator<std::string> begin(ss);
-//     std::istream_iterator<std::string> end;
-//     while(begin != end)
-//         tokens.push_back(*begin++);
-// }
+static bool SkipTest(void) { return env::disabled(MIOPEN_TEST_ALL); }
 
-void Run2dDriver(miopenDataType_t prec)
+void GetArgs(const std::string& param, std::vector<std::string>& tokens)
 {
-
-    std::vector<std::string> params;
-    switch(prec)
-    {
-    case miopenFloat: params = Pooling2dFloat::GetParam(); break;
-    case miopenHalf: params = WidePooling2dHalf::GetParam(); break;
-    case miopenBFloat16:
-    case miopenInt8:
-    case miopenFloat8:
-    case miopenBFloat8:
-    case miopenInt32:
-    case miopenInt64:
-    case miopenDouble:
-        FAIL()
-            << "miopenBFloat16, miopenInt8, miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8 "
-               "data type not supported by "
-               "immed_conv2d_codecov test";
-
-    default: params = Pooling2dFloat::GetParam();
-    }
-
-    for(const auto& test_value : params)
-    {
-        std::vector<std::string> tokens;
-        GetArgs(test_value, tokens);
-        std::vector<const char*> ptrs;
-
-        std::transform(tokens.begin(), tokens.end(), std::back_inserter(ptrs), [](const auto& str) {
-            return str.data();
-        });
-
-        testing::internal::CaptureStderr();
-        test_drive<pooling2d_driver>(ptrs.size(), ptrs.data());
-        auto capture = testing::internal::GetCapturedStderr();
-        std::cout << capture;
-    }
-};
+    std::stringstream ss(param);
+    std::istream_iterator<std::string> begin(ss);
+    std::istream_iterator<std::string> end;
+    while(begin != end)
+        tokens.push_back(*begin++);
+}
 
 bool IsTestSupportedForDevice(const miopen::Handle& handle) { return true; }
 
@@ -174,21 +140,24 @@ std::vector<std::string> GetTestCases(const std::string precision)
 
     const std::vector<std::string> test_cases = {
         // clang-format off
-    {"test_pooling2d " + precision + " --all --dataset 2 --limit 0 "+flag_arg}
+    {"test_pooling2d " + precision + " --all --dataset 1 --limit 0 " + flag_arg}
         // clang-format on
     };
 
     return test_cases;
 }
-
 } // namespace pooling_tests
-using namespace pooling_tests;
+// using namespace pooling_tests;
 
-/*
-TEST_P(Pooling2dFloat, FloatTest_pooling2d_wide)
+TEST_P(PoolingFwdFloat, NNT)    // NDNaiveTranspose
 {
     const auto& handle = get_handle();
-    if(IsTestSupportedForDevice(handle) && !SkipTest() && IsTestRunWith("--float"))
+    if(!IsTestSupportedForDevice(handle))   std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl;
+    if(SkipTest())                          std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl;
+    // if(!IsTestRunWith("--float"))           std::cout << "WOULD SKIP BECAUSE NOT FLOAT!" << std::endl;
+        // Run2dDriver(miopenFloat);   return; // TEMPCODE RJS
+    //  && IsTestRunWith("--float")
+    if(IsTestSupportedForDevice(handle) && !SkipTest())
     {
         Run2dDriver(miopenFloat);
     }
@@ -197,12 +166,15 @@ TEST_P(Pooling2dFloat, FloatTest_pooling2d_wide)
         GTEST_SKIP();
     }
 };
-*/
 
-TEST_P(WidePooling2dHalf, HalfTest_pooling2d_wide)
+TEST_P(PoolingFwdHalf, NNT)
 {
     const auto& handle = get_handle();
-    if(IsTestSupportedForDevice(handle) && !SkipTest() && IsTestRunWith("--half"))
+    if(!IsTestSupportedForDevice(handle))   std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl;
+    if(SkipTest())                          std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl;
+    // if(!IsTestRunWith("--half"))           std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl;
+
+    if(IsTestSupportedForDevice(handle) && !SkipTest()) //  && IsTestRunWith("--half") TEMPCODE RJS
     {
         Run2dDriver(miopenHalf);
     }
@@ -212,6 +184,60 @@ TEST_P(WidePooling2dHalf, HalfTest_pooling2d_wide)
     }
 };
 
-// INSTANTIATE_TEST_SUITE_P(Pooling2D, Pooling2dFloat, testing::Values(GetTestCases("--float")));
+void Run2dDriver(miopenDataType_t prec)
+{
+    auto cases = GetTestCases("--float");
+       std::cerr << " Cases: " << cases.size() << std::endl;
+    for(const auto& test_value : cases)
+    {
+        std::cerr << "      : " << test_value << std::endl;    // TEMPCODE RJS
+    }
+ 
+    std::vector<std::string> params;
+    switch(prec)
+    {
+    case miopenFloat: params = PoolingFwdFloat_NNT_Test::GetParam(); break;
+    case miopenHalf: params = PoolingFwdHalf_NNT_Test::GetParam(); break;
+    case miopenBFloat16:
+    case miopenInt8:
+    case miopenInt32:
+    case miopenDouble:
+    case miopenFloat8:
+    case miopenBFloat8:
+    case miopenInt64:
+        FAIL()
+            << "miopenBFloat16, miopenInt8, miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8, miopenInt64 "
+               "data type not supported by "
+               "poolingFwdNdNaive test";
+
+    default: params = PoolingFwdFloat_NNT_Test::GetParam();
+    }
+
+    std::cerr << "Params: " << params.size() << std::endl;
+    for(const auto& test_value : params)
+    {
+        std::cerr << "      : " << test_value << std::endl;    // TEMPCODE RJS
+    }
+    for(const auto& test_value : params)
+    {
+        std::cerr << "Testing: " << test_value << std::endl;    // TEMPCODE RJS
+        std::vector<std::string> tokens;
+        GetArgs(test_value, tokens);
+        std::vector<const char*> ptrs;
+
+        std::transform(tokens.begin(), tokens.end(), std::back_inserter(ptrs), [](const auto& str) {
+            return str.data();
+        });
+
+        testing::internal::CaptureStderr();
+        test_drive<pooling2d_driver>(ptrs.size(), ptrs.data());
+        auto capture = testing::internal::GetCapturedStderr();
+        std::cout << capture;
+    }
+}
+
+INSTANTIATE_TEST_SUITE_P(Float, PoolingFwdFloat, testing::Values(GetTestCases("--float")));
+
+INSTANTIATE_TEST_SUITE_P(Half, PoolingFwdHalf, testing::Values(GetTestCases("--half")));
 
-INSTANTIATE_TEST_SUITE_P(Pooling2D, WidePooling2dHalf, testing::Values(GetTestCases("--half")));
+#endif
diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp
index 399592a1f4..a9ca1446b5 100644
--- a/test/pooling2d.hpp
+++ b/test/pooling2d.hpp
@@ -57,7 +57,7 @@ struct pooling2d_shapes
     }
 
     // Dataset 1 is intended for testing of asymmetric configs.
-    static std::vector<U> get_2d_pooling_input_shapes_minimal() { return {{1, 4, 4, 4}}; }
+    static std::vector<U> get_2d_pooling_input_shapes_minimal() { return {{1, 1, 8, 8}}; }
 
     // Dataset 2 is intended for testing of configs with wide window.
     static std::vector<U> get_2d_pooling_input_shapes_wide()
@@ -123,5 +123,6 @@ struct pooling2d_driver : pooling_driver<T>
             {{0, 0}}}));
         // clang-format on
         this->add(this->wsidx, "wsidx", this->generate_data({0, 1}));
+        this->add(this->layout, "layout", this->generate_data({miopenTensorNHWC})); // , miopenTensorNHWC
     }
 };
diff --git a/test/pooling3d.hpp b/test/pooling3d.hpp
index c6d80e8e98..d2a64540c6 100644
--- a/test/pooling3d.hpp
+++ b/test/pooling3d.hpp
@@ -50,6 +50,7 @@ struct pooling3d_driver : pooling_driver<T>
     {
         return pooling3d_shapes::get_3d_pooling_input_shapes();
     }
+
     pooling3d_driver() : pooling_driver<T>()
     {
         this->add(
@@ -58,5 +59,6 @@ struct pooling3d_driver : pooling_driver<T>
         this->add(this->strides, "strides", this->generate_data({{2, 2, 2}, {1, 1, 1}}));
         this->add(this->pads, "pads", this->generate_data({{0, 0, 0}, {1, 1, 1}}));
         this->add(this->wsidx, "wsidx", this->generate_data({1}));
+        this->add(this->layout, "layout", this->generate_data({0, 1}));
     }
 };
diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp
index 231b635a63..69c19478ae 100644
--- a/test/pooling_common.hpp
+++ b/test/pooling_common.hpp
@@ -60,7 +60,17 @@ static int num_uint64_case = 0;
 // NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables)
 static int num_uint64_case_imgidx = 0;
 
-static inline void print(const miopen::PoolingDescriptor& filter)
+namespace {
+
+constexpr int RAND_INTEGER_MAX = 1200;
+constexpr int RAND_INTEGER_MIN = -880;
+
+template <typename T>
+auto gen_value =
+    [](auto... is) { return static_cast<T>(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX)) / 10; };
+}
+
+static inline void print(const miopen::PoolingDescriptor& filter, bool is_default_layout)
 {
     std::cout << "Pooling: ";
     if(filter.GetMode() == miopenPoolingAverage)
@@ -70,6 +80,7 @@ static inline void print(const miopen::PoolingDescriptor& filter)
     else
         std::cout << "Max";
     std::cout << std::endl;
+    std::cout << "Layout: " << (is_default_layout ? "default" : "transposed") << std::endl;  // TEMPCODE RJS
     std::cout << "Lengths: ";
     miopen::LogRange(std::cout, filter.GetLengths(), ", ") << std::endl;
     std::cout << "Pads: ";
@@ -84,13 +95,24 @@ tensor<T> get_output_tensor(const miopen::PoolingDescriptor& filter, const tenso
     return tensor<T>{filter.GetForwardOutputTensor(input.desc)};
 }
 
+template <class T>
+tensor<T> get_big_output_tensor(const miopen::PoolingDescriptor& filter, const tensor<T>& input)
+{
+    auto desc = filter.GetForwardOutputTensor(input.desc);
+    auto lens = desc.GetLengths();
+    lens[0] *= 10;
+    auto big = miopen::TensorDescriptor{desc.GetType(), input.desc.GetLayout_t(), lens, desc.GetStrides()};
+    std::cout << "get_big_output_tensor: " << input.desc.GetLayout_str() << " " << desc.GetLayout_str() << std::endl;
+    return tensor<T>{big};
+}
+
 template <class T>
 struct pooling_operators
 {
     miopen::PoolingDescriptor filter;
     pooling_operators(miopen::PoolingDescriptor f) : filter(f) {}
 
-    double start() const
+    double initialize() const
     {
         if(filter.GetMode() == miopenPoolingMax)
             return std::numeric_limits<T>::lowest();
@@ -111,7 +133,7 @@ struct pooling_operators
         }
     }
 
-    double final(double x, double y)
+    double finalize(double x, double y)
     {
         if(filter.GetMode() == miopenPoolingMax)
             return (x);
@@ -120,6 +142,8 @@ struct pooling_operators
     }
 };
 
+#include <iomanip>
+
 template <int SptDim>
 struct verify_forward_pooling
 {
@@ -127,22 +151,56 @@ struct verify_forward_pooling
     tensor<T>
     cpu(const tensor<T>& input, const miopen::PoolingDescriptor& filter, std::vector<Index>&) const
     {
+        const bool is_default_layout = input.desc.IsDefaultLayout();
+        const int sptl_dim_offset = 2; // is_default_layout ? 2 : 1; TEMPCODE RJS
+        const int chan_dim_offset = 1; // is_default_layout ? 1 : SptDim + 1;
+
         auto out = get_output_tensor(filter, input);
 
         std::array<int, SptDim> in_dim{};
-        std::copy_n(input.desc.GetLengths().begin() + 2, SptDim, in_dim.begin());
+        std::copy_n(input.desc.GetLengths().begin() + sptl_dim_offset, SptDim, in_dim.begin());
         std::array<int, SptDim> strides{};
         std::copy_n(filter.GetStrides().begin(), SptDim, strides.begin());
         std::array<int, SptDim> pads{};
         std::copy_n(filter.GetPads().begin(), SptDim, pads.begin());
         std::array<int, SptDim> kers{};
         std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin());
-        auto op = pooling_operators<T>{filter};
+        auto pooler = pooling_operators<T>{filter};
+
+        // TEMPCODE RJS print input tensor
+        bool printing = true; // in_dim[0]==8 && in_dim[1]==8;
+        if(printing)
+        {
+            auto inlen = input.desc.GetLengths();
+            auto instr = input.desc.GetStrides();
+            std::cout << "CPU in : m" << filter.GetMode() << " t" << input.desc.GetType() << " | ";
+            for(auto dim : inlen) std::cout << std::setw(4) << dim;
+            std::cout << " | ";
+            for(auto str : instr) std::cout << std::setw(4) << str;
+            std::cout << " | ";
+            for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str;
+            std::cout << " | ";
+            for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str;
+            std::cout << " | ";
+            for(auto str : filter.GetPads()) std::cout << std::setw(4) << str;
+            std::cout << std::endl;
+
+            for(int nn = 0; nn < inlen[0]; ++nn) {
+                for(int cc = 0; cc < inlen[1]; ++cc) {
+                    for(int hh = 0; hh < inlen[2]; ++hh) {
+                        for(int ww = 0; ww < inlen[3]; ++ww) {
+                            std::cout << std::setw(11) << std::setprecision(5) << input.data[input.desc.GetIndex(nn, cc, hh, ww)] << "  ";
+                        }
+                        std::cout << std::endl;
+                    }
+                }
+            }
+        }
 
         int b_n = out.desc.GetLengths()[0];
-        int k_n = out.desc.GetLengths()[1];
+        int k_n = out.desc.GetLengths()[chan_dim_offset];
         std::array<int, SptDim> out_spatial_len{};
-        std::copy_n(out.desc.GetLengths().begin() + 2, SptDim, out_spatial_len.begin());
+        std::copy_n(out.desc.GetLengths().begin() + sptl_dim_offset, SptDim, out_spatial_len.begin());
 
         auto par_ford_out =
             miopen::unpacker(miopen::prepender(par_ford, b_n, k_n))(out_spatial_len);
@@ -167,27 +225,53 @@ struct verify_forward_pooling
                     ? std::accumulate(kers.begin(), kers.end(), 1, std::multiplies<int>())
                     : std::accumulate(win_sz.begin(), win_sz.end(), 1, std::multiplies<int>());
 
-            double acc = op.start();
+            double acc = pooler.initialize();
             miopen::unpacker(ford)(win_sz)([&](auto... in_spatial_id_pack) {
                 auto in_spatial_id = make_array(in_spatial_id_pack...);
                 std::array<std::size_t, SptDim + 2> idx{};
                 idx[0] = o;
-                idx[1] = w;
+                idx[chan_dim_offset] = w;
 
                 bool in_cmp_idx = true;
                 for(int i = 0; i < SptDim; ++i)
                 {
-                    idx[i + 2] = start_idx[i] + in_spatial_id[i];
-                    in_cmp_idx &= (in_dim[i] > idx[i + 2]);
+                    idx[i + sptl_dim_offset] = start_idx[i] + in_spatial_id[i];
+                    in_cmp_idx &= (in_dim[i] > idx[i + sptl_dim_offset]);
                 }
 
                 if(in_cmp_idx)
                 {
-                    acc = op(acc, input(idx));
+                    acc = pooler(acc, input(idx));
                 }
             });
-            out(o, w, out_spatial_id_pack...) = T(op.final(acc, pool_size));
+            out(o, w, out_spatial_id_pack...) = T(pooler.finalize(acc, pool_size));
         });
+        if(printing)
+        {
+            std::cout << "CPU out: ";
+            auto outlen = out.desc.GetLengths();
+            auto outstr = out.desc.GetStrides();
+            for(auto dim : outlen) std::cout << std::setw(4) << dim;
+            std::cout << " | ";
+            for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str;
+            std::cout << " | ";
+            for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str;
+            std::cout << " | ";
+            for(auto str : filter.GetPads()) std::cout << std::setw(4) << str;
+            std::cout << std::endl;
+
+            for(int nn = 0; nn < outlen[0]; ++nn) {
+                for(int cc = 0; cc < outlen[1]; ++cc) {
+            for(int hh = 0; hh < outlen[2]; ++hh) {
+                for(int ww = 0; ww < outlen[3]; ++ww) {
+                    std::cout << std::setw(11) << std::setprecision(5) << out.data[nn * outstr[0] + cc * outstr[1] + hh * outstr[2] + ww * outstr[3]] << "  ";
+                }
+            std::cout << std::endl;
+            }
+            }
+            }
+        }   // print output tensor
+
         return out;
     }
 
@@ -197,11 +281,16 @@ struct verify_forward_pooling
                   std::vector<Index>& indices) const
     {
         auto&& handle = get_handle();
-        auto out      = get_output_tensor(filter, input);
+        auto out      = get_big_output_tensor(filter, input);   // TEMPCODE RJS
+
         indices.resize(out.data.size(), 0);
 
         auto in_dev  = handle.Write(input.data);
         auto out_dev = handle.Create<T>(out.data.size());
+        auto junk_dev = handle.Create<T>(out.data.size());  // 
+        std::cout << "gpu Sizes: in:" << input.data.size() << " out: " << out.data.size() << " " << out.desc.GetLayout_str()
+        << " " << out.desc.GetLengths()[0]  << " " << out.desc.GetLengths()[1] << " " << out.desc.GetLengths()[2] << " " << out.desc.GetLengths()[3]
+        << " " << out.desc.GetStrides()[0]  << " " << out.desc.GetStrides()[1] << " " << out.desc.GetStrides()[2] << " " << out.desc.GetStrides()[3] << std::endl;
         Workspace wspace{};
         wspace.Write(indices);
 
@@ -215,10 +304,38 @@ struct verify_forward_pooling
                        out_dev.get(),
                        true,
                        wspace.ptr(),
-                       wspace.size());
+                       wspace.size(),
+                       junk_dev.get()); // TEMPCODE RJS
 
         indices  = wspace.Read<std::vector<Index>>();
         out.data = handle.Read<T>(out_dev, out.data.size());
+        if(true)
+        {
+            std::cout << "GPU out: ";
+            auto outlen = out.desc.GetLengths();
+            for(auto dim : outlen) std::cout << std::setw(4) << dim;
+            std::cout << " | ";
+            auto outstr = out.desc.GetStrides();
+            for(auto dim : outstr) std::cout << std::setw(4) << dim;
+            std::cout << " | ";
+            for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str;
+            std::cout << " | ";
+            for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str;
+            std::cout << " | ";
+            for(auto str : filter.GetPads()) std::cout << std::setw(4) << str;
+            std::cout << std::endl;
+
+            for(int nn = 0; nn < outlen[0]; ++nn) {
+                for(int cc = 0; cc < outlen[1]; ++cc) {
+                    for(int hh = 0; hh < outlen[2]; ++hh) {
+                        for(int ww = 0; ww < outlen[3]; ++ww) {
+                            std::cout << std::setw(11) << std::setprecision(5) << out.data[out.desc.GetIndex(nn, cc, hh, ww)] << "  ";
+                        }
+                        std::cout << std::endl;
+                    }
+                }
+            }
+        }   // print output tensor
         return out;
     }
 
@@ -229,7 +346,7 @@ struct verify_forward_pooling
               const std::vector<Index>&) const
     {
         std::cout << "Forward ";
-        print(filter);
+        print(filter, input.desc.IsDefaultLayout());
         std::cout << "Input tensor: " << input.desc.ToString() << std::endl;
         std::cout << "Output tensor: " << filter.GetForwardOutputTensor(input.desc).ToString()
                   << std::endl;
@@ -248,7 +365,12 @@ struct verify_backward_pooling
                   bool use_global_index,
                   bool verify_index) const
     {
+        const bool is_default_layout = input.desc.IsDefaultLayout();
+        const int sptl_dim_offset = is_default_layout ? 2 : 1;
+        const int chan_dim_offset = is_default_layout ? 1 : SptDim + 1;
+
         auto dinput = input;
+
         std::vector<double> din_vec(input.desc.GetElementSpace(), 0.0);
         CHECK(dout.desc == out.desc);
         std::array<int, SptDim + 2> in_dim{};
@@ -264,9 +386,9 @@ struct verify_backward_pooling
         auto ford_ker = miopen::unpacker(ford)(kers);
 
         int out_n = out.desc.GetLengths()[0];
-        int out_c = out.desc.GetLengths()[1];
+        int out_c = out.desc.GetLengths()[chan_dim_offset];
         std::array<int, SptDim> out_spatial_len{};
-        std::copy_n(out.desc.GetLengths().begin() + 2, SptDim, out_spatial_len.begin());
+        std::copy_n(out.desc.GetLengths().begin() + sptl_dim_offset, SptDim, out_spatial_len.begin());
         auto ford_out = miopen::unpacker(ford)(out_spatial_len);
 
         par_ford(out_n, out_c)([&](int o, int w) {
@@ -281,12 +403,12 @@ struct verify_backward_pooling
                         for(int i = 0; i < SptDim; i++)
                         {
                             std::size_t mx_idx_dim = mx_idx;
-                            mx_idx_dim /= std::accumulate(in_dim.begin() + i + 3,
+                            mx_idx_dim /= std::accumulate(in_dim.begin() + sptl_dim_offset + i + 1,
                                                           in_dim.end(),
                                                           1ULL,
                                                           std::multiplies<std::size_t>());
-                            mx_idx_dim %= in_dim[i + 2];
-                            idx[i + 2] = mx_idx_dim;
+                            mx_idx_dim %= in_dim[i + sptl_dim_offset];
+                            idx[i + sptl_dim_offset] = mx_idx_dim;
                         }
                     }
                     else
@@ -441,7 +563,7 @@ struct verify_backward_pooling
               bool) const
     {
         std::cout << "Backward ";
-        print(filter);
+        print(filter, input.desc.IsDefaultLayout());
         std::cout << "Input tensor: " << input.desc.ToString() << std::endl;
         std::cout << "Output tensor: " << out.desc.ToString() << std::endl;
     }
@@ -462,6 +584,7 @@ struct pooling_driver : test_driver
 #endif
     int verify_indices{};
     int wsidx{};
+    miopenTensorLayout_t layout{};
     std::unordered_map<std::string, miopenIndexType_t> index_type_lookup = {
         {miopen::ToUpper("miopenIndexUint8"), miopenIndexUint8},
         {miopen::ToUpper("miopenIndexUint16"), miopenIndexUint16},
@@ -505,49 +628,92 @@ struct pooling_driver : test_driver
         add(verify_indices, "verify_indices", generate_data({1}));
     }
 
-    template <class Index, int SptDim>
+    template <class Index, int SptlDim>
     void run_impl()
     {
         std::vector<Index> indices{};
-        auto input = tensor<T>{in_shape}.generate(
-            tensor_elem_gen_integer{miopen_type<T>{} == miopenHalf ? 5 : 17});
-        auto out  = verify(verify_forward_pooling<SptDim>{}, input, filter, indices);
-        auto dout = out.first;
-        dout.generate(tensor_elem_gen_integer{2503});
-        verify(verify_backward_pooling<SptDim>{},
-               input,
-               dout,
-               out.first,
-               filter,
-               indices,
-               wsidx != 0,
-               static_cast<bool>(this->verify_indices));
+        auto input = tensor<T>{layout, in_shape};
+        for(auto& v : input.data)   v = gen_value<T>();
+
+        // TEMPCODE RJS print input tensor
+        bool printing = true; // in_dim[0]==8 && in_dim[1]==8;
+        if(printing)
+        {
+            auto inlen = input.desc.GetLengths();
+            auto instr = input.desc.GetStrides();
+            std::cout << "CPU GEN : " << input.desc.GetLayout_str() << "(" << inlen.size() << ") | " << input.data.size() << " | " << input.desc.GetElementSpace() << " | ";
+            for(auto dim : inlen) std::cout << std::setw(4) << dim;
+            std::cout << " | ";
+            for(auto str : instr) std::cout << std::setw(4) << str;
+            std::cout << " | ";
+            for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str;
+            std::cout << " | ";
+            for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str;
+            std::cout << " | ";
+            for(auto str : filter.GetPads()) std::cout << std::setw(4) << str;
+            std::cout << std::endl;
+
+            for(int nn = 0; nn < inlen[0]; ++nn) {
+                for(int cc = 0; cc < inlen[1]; ++cc) {
+                    for(int hh = 0; hh < inlen[2]; ++hh) {
+                        for(int ww = 0; ww < inlen[3]; ++ww) {// nn * instr[0] + cc * instr[1] + hh * instr[2] + ww * instr[3]
+                            std::cout << std::setw(11) << std::setprecision(5) << input.data[input.desc.GetIndex(nn, cc, hh, ww)] << "  ";
+                        }
+                    std::cout << std::endl;
+                    }
+                }
+            }
+        }
+
+        auto out  = verify(verify_forward_pooling<SptlDim>{},
+            input,
+            filter,
+            indices);
+
+        // auto dout = out.first;
+        // dout.generate(tensor_elem_gen_integer{2503});
+        // verify(verify_backward_pooling<SptlDim>{},   // TEMPCODE RJS no backward
+        //        input,
+        //        dout,
+        //        out.first,
+        //        filter,
+        //        indices,
+        //        wsidx != 0,
+        //        static_cast<bool>(this->verify_indices));
     }
 
     void run()
     {
+        if(miopen::ToUpper(mode) == "MAX") return;  // TEMPCODE RJS skip all except max, do max only
+
         auto idx_typ = index_type_lookup.at(miopen::ToUpper(index_type));
         auto idx_sz  = sizeof(uint8_t);
-        int spt_dim  = in_shape.size() - 2;
+        int sptl_dim  = in_shape.size() - 2;
         const bool skip_many_configs_with_non_int8_index =
             (dataset_id == 0) && full_set; // Otherwise the default dataset takes too much time.
         const bool wide_dataset = (dataset_id == 2) && full_set;
 
+        // Input dimensions to the driver are always NCHW-style
+        const bool is_default_layout = !(layout == miopenTensorNHWC || layout == miopenTensorNDHWC);
+        std::cout << "################## pooling_driver run layout=" << (int)layout << " is_default=" << (int)is_default_layout << std::endl;
+
         filter = miopen::PoolingDescriptor
         {
             mode_lookup.at(miopen::ToUpper(mode)),
 #if TEST_PADDING_MODE == 1
-                pmode_lookup.at(miopen::ToUpper(pmode)),
+            pmode_lookup.at(miopen::ToUpper(pmode)),
 #else
-                miopenPaddingDefault,
+            miopenPaddingDefault,
 #endif
-                lens, strides, pads
+            lens,
+            strides,
+            pads
         };
 
         filter.SetIndexType(idx_typ);
         filter.SetWorkspaceIndexMode(miopenPoolingWorkspaceIndexMode_t(wsidx));
 
-        if(wsidx == 0 && spt_dim == 3 && filter.GetMode() == miopenPoolingMax && full_set)
+        if(wsidx == 0 && sptl_dim == 3 && filter.GetMode() == miopenPoolingMax && full_set)
         {
             show_command();
             std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented "
@@ -556,7 +722,7 @@ struct pooling_driver : test_driver
             return;
         }
 
-        if(wsidx == 0 && spt_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset)
+        if(wsidx == 0 && sptl_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset)
         {
             show_command();
             std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented "
@@ -587,12 +753,12 @@ struct pooling_driver : test_driver
         /// the "full test" is ran. See:
         /// \ref max_pooling_index_max_restriction
         case miopenIndexUint8: {
-            if((spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) && full_set &&
+            if((sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) && full_set &&
                filter.GetMode() == miopenPoolingMax)
             {
                 show_command();
                 std::cout << "Warning: Config skipped: uint8 index is too small "
-                             "(spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) "
+                             "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) "
                              "&& filter.GetMode() == miopenPoolingMax"
                           << std::endl;
                 return;
@@ -600,12 +766,12 @@ struct pooling_driver : test_driver
             break;
         }
         case miopenIndexUint16: {
-            if((spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) && full_set &&
+            if((sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) && full_set &&
                filter.GetMode() == miopenPoolingMax)
             {
                 show_command();
                 std::cout << "Warning: Config skipped: uint16 index is too small "
-                             "(spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) "
+                             "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) "
                              "&& filter.GetMode() == miopenPoolingMax"
                           << std::endl;
                 return;
@@ -675,12 +841,12 @@ struct pooling_driver : test_driver
                 }
                 else
                 {
-                    if(num_uint64_case_imgidx > 5 && spt_dim == 2)
+                    if(num_uint64_case_imgidx > 5 && sptl_dim == 2)
                     {
                         show_command();
                         std::cout << "Warning: Config skipped to speed up testing of the "
                                      "default dataset (wsidx != 0) && (num_uint64_case_imgidx > 5 "
-                                     "&& spt_dim == 2)"
+                                     "&& sptl_dim == 2)"
                                   << std::endl;
                         return;
                     }
@@ -692,18 +858,18 @@ struct pooling_driver : test_driver
         }
         }
 
-        auto input_desc = miopen::TensorDescriptor(this->type, in_shape);
+        auto input_desc = miopen::TensorDescriptor(this->type, layout, in_shape);
 
-        if(spt_dim != 2 && spt_dim != 3)
+        if(sptl_dim != 2 && sptl_dim != 3)
         {
             show_command();
             std::cout << "Warning: Config skipped becuse it is not supported " //
-                         "(spt_dim != 2 && spt_dim != 3)"
+                         "(sptl_dim != 2 && sptl_dim != 3)"
                       << std::endl;
             return;
         }
 
-        for(int i = 0; i < spt_dim; i++)
+        for(int i = 0; i < sptl_dim; i++)
         {
             if(lens[i] > (input_desc.GetLengths()[i + 2] + static_cast<uint64_t>(2) * pads[i]))
             {
@@ -733,9 +899,13 @@ struct pooling_driver : test_driver
             }
         }
 
-        std::vector<int> in_dim(input_desc.GetLengths().begin() + 2, input_desc.GetLengths().end());
-        std::vector<int> out_dim(spt_dim);
+        int sptl_index = is_default_layout ? 2 : 1;
+
+        std::vector<int> in_dim(input_desc.GetLengths().begin() + sptl_index,
+            input_desc.GetLengths().begin() + sptl_index + sptl_dim);
+        std::vector<int> out_dim(sptl_dim);
         std::vector<int> ker_dim(filter.GetLengths().begin(), filter.GetLengths().end());
+
 #if TEST_PADDING_MODE == 1
         if(filter.pmode == miopenPaddingSame)
         {
@@ -743,7 +913,7 @@ struct pooling_driver : test_driver
                    return i == 0;
                }))
                 return;
-            for(int i = 0; i < spt_dim; i++)
+            for(int i = 0; i < sptl_dim; i++)
             {
                 filter.pads[i] =
                     ((in_dim[i] % filter.GetStrides()[i] == 0)
@@ -763,7 +933,7 @@ struct pooling_driver : test_driver
                    return i == 0;
                }))
                 return;
-            for(int i = 0; i < spt_dim; i++)
+            for(int i = 0; i < sptl_dim; i++)
             {
                 filter.pads[i] = 0;
 
@@ -778,7 +948,7 @@ struct pooling_driver : test_driver
         switch(filter.GetIndexType())
         {
         case miopenIndexUint8: {
-            if(spt_dim == 3)
+            if(sptl_dim == 3)
             {
                 run_impl<uint8_t, 3>();
             }
@@ -789,7 +959,7 @@ struct pooling_driver : test_driver
             break;
         }
         case miopenIndexUint16: {
-            if(spt_dim == 3)
+            if(sptl_dim == 3)
             {
                 run_impl<uint16_t, 3>();
             }
@@ -800,7 +970,7 @@ struct pooling_driver : test_driver
             break;
         }
         case miopenIndexUint32: {
-            if(spt_dim == 3)
+            if(sptl_dim == 3)
             {
                 run_impl<uint32_t, 3>();
             }
@@ -811,7 +981,7 @@ struct pooling_driver : test_driver
             break;
         }
         case miopenIndexUint64: {
-            if(spt_dim == 3)
+            if(sptl_dim == 3)
             {
                 run_impl<uint64_t, 3>();
             }

From 04ab85c450470ee5576fa8aa3ab0de6e8864180d Mon Sep 17 00:00:00 2001
From: "Randy J. Spaulding" <rspauldi@amd.com>
Date: Wed, 21 Aug 2024 09:49:16 +0000
Subject: [PATCH 03/10] naive kernel works

---
 .../MIOpenPoolingForwardNDNhwcNaive.cpp       | 401 +++++++++---------
 src/solver/pooling/forwardNdNhwcNaive.cpp     |  19 +-
 test/pooling_common.hpp                       |  42 +-
 3 files changed, 235 insertions(+), 227 deletions(-)

diff --git a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
index 970104b3b1..60aa2aac23 100644
--- a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
+++ b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
@@ -39,8 +39,8 @@
 #define CVT_FP32_2ACCUM(x) (x)
 #endif
 
-// #define _FLOAT float
-// #define _FLOAT_ACCUM _FLOAT
+#define _FLOAT float
+#define _FLOAT_ACCUM _FLOAT
 
 #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
 #include <hip/hip_fp16.h>
@@ -83,7 +83,7 @@
 #define UU
 #endif
 
-#define doUU1 1
+#define doUU1 0
 #if doUU1
 #define UU1  __attribute__((__unused__))
 #else
@@ -132,227 +132,220 @@ __device__ void poolingForwardNDNhwcNaive(UU1 const TI* __restrict__ bot_ptr,
                                     UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_h_stride,
                                     UU ARG_UNUSED_FOR_AVERAGE size_t mask_w_stride)
 {
-    const uint32_t nn = blockIdx.x / top_d;                          // N=slow index
-    if(!(nn < all_n))
-        return;
-
-    const uint32_t td = blockIdx.x % top_d;                          // top D=fast index
-    if(td >= top_d)
-        return;
-
-    const uint32_t th = blockIdx.y;  // top H
-    // const uint32_t j = (gridDim.y == 1) ? threadIdx.y : blockIdx.y;  // top H
-    if(th >= top_h)
-        return;
-
-    const uint32_t tw = blockIdx.z % top_w;  // top W=fast index
-    if(tw >= top_w)
-        return;
 
+    auto log_ptr = junk_ptr;
     if(nn == 0 && td == 0 && th == 0 && tw == 0)
     {
         int idx = 0;
-        top_ptr[idx++] = gridDim.x;
-        top_ptr[idx++] = gridDim.y;
-        top_ptr[idx++] = gridDim.z;
-        top_ptr[idx++] = -9;
-
-        top_ptr[idx++] = blockDim.x;
-        top_ptr[idx++] = blockDim.y;
-        top_ptr[idx++] = blockDim.z;
-        top_ptr[idx++] = -8;
-
-        top_ptr[idx++] = filter_d;
-        top_ptr[idx++] = filter_h;
-        top_ptr[idx++] = filter_w;
-        top_ptr[idx++] = -7;
-
-        top_ptr[idx++] = filter_d_stride;
-        top_ptr[idx++] = filter_h_stride;
-        top_ptr[idx++] = filter_w_stride;
-        top_ptr[idx++] = -6;
-
-        top_ptr[idx++] = filter_d_pad;
-        top_ptr[idx++] = filter_h_pad;
-        top_ptr[idx++] = filter_w_pad;
-        top_ptr[idx++] = -5;
-
-        top_ptr[idx++] = all_n;
-        top_ptr[idx++] = all_c;
-        top_ptr[idx++] = bot_n_stride;
-        top_ptr[idx++] = bot_c_stride;
-
-        top_ptr[idx++] = top_n_stride;
-        top_ptr[idx++] = top_c_stride;
+        log_ptr[idx++] = gridDim.x;
+        log_ptr[idx++] = gridDim.y;
+        log_ptr[idx++] = gridDim.z;
+        log_ptr[idx++] = -9;
+
+        log_ptr[idx++] = blockDim.x;
+        log_ptr[idx++] = blockDim.y;
+        log_ptr[idx++] = blockDim.z;
+        log_ptr[idx++] = -8;
+
+        log_ptr[idx++] = filter_d;
+        log_ptr[idx++] = filter_h;
+        log_ptr[idx++] = filter_w;
+        log_ptr[idx++] = -7;
+
+        log_ptr[idx++] = filter_d_stride;
+        log_ptr[idx++] = filter_h_stride;
+        log_ptr[idx++] = filter_w_stride;
+        log_ptr[idx++] = -6;
+
+        log_ptr[idx++] = filter_d_pad;
+        log_ptr[idx++] = filter_h_pad;
+        log_ptr[idx++] = filter_w_pad;
+        log_ptr[idx++] = -5;
+
+        log_ptr[idx++] = all_n;
+        log_ptr[idx++] = all_c;
+        log_ptr[idx++] = bot_n_stride;
+        log_ptr[idx++] = bot_c_stride;
+
+        log_ptr[idx++] = top_n_stride;
+        log_ptr[idx++] = top_c_stride;
         #if AVERAGE_OPS
-        top_ptr[idx++] = -4;
-        top_ptr[idx++] = -4;
+        log_ptr[idx++] = -4;
+        log_ptr[idx++] = -4;
         #else
-        top_ptr[idx++] = mask_n_stride;
-        top_ptr[idx++] = mask_c_stride;
+        log_ptr[idx++] = mask_n_stride;
+        log_ptr[idx++] = mask_c_stride;
         #endif
 
-        top_ptr[idx++] = bot_d;
-        top_ptr[idx++] = bot_h;
-        top_ptr[idx++] = bot_w;
-        top_ptr[idx++] = -3;
+        log_ptr[idx++] = bot_d;
+        log_ptr[idx++] = bot_h;
+        log_ptr[idx++] = bot_w;
+        log_ptr[idx++] = -3;
 
-        top_ptr[idx++] = bot_d_stride;
-        top_ptr[idx++] = bot_h_stride;
-        top_ptr[idx++] = bot_w_stride;
-        top_ptr[idx++] = -2;
+        log_ptr[idx++] = bot_d_stride;
+        log_ptr[idx++] = bot_h_stride;
+        log_ptr[idx++] = bot_w_stride;
+        log_ptr[idx++] = -2;
 
-        top_ptr[idx++] = top_d;
-        top_ptr[idx++] = top_h;
-        top_ptr[idx++] = top_w;
-        top_ptr[idx++] = -1;
+        log_ptr[idx++] = top_d;
+        log_ptr[idx++] = top_h;
+        log_ptr[idx++] = top_w;
+        log_ptr[idx++] = -1;
     
-        top_ptr[idx++] = top_d_stride;
-        top_ptr[idx++] = top_h_stride;
-        top_ptr[idx++] = top_w_stride;
-        top_ptr[idx++] = -9;
+        log_ptr[idx++] = top_d_stride;
+        log_ptr[idx++] = top_h_stride;
+        log_ptr[idx++] = top_w_stride;
+        log_ptr[idx++] = -9;
 
         #if AVERAGE_OPS
-        top_ptr[idx++] = -8;
-        top_ptr[idx++] = -8;
-        top_ptr[idx++] = -8;
+        log_ptr[idx++] = -8;
+        log_ptr[idx++] = -8;
+        log_ptr[idx++] = -8;
         #else
-        top_ptr[idx++] = mask_d_stride;
-        top_ptr[idx++] = mask_h_stride;
-        top_ptr[idx++] = mask_w_stride;
+        log_ptr[idx++] = mask_d_stride;
+        log_ptr[idx++] = mask_h_stride;
+        log_ptr[idx++] = mask_w_stride;
         #endif
-        top_ptr[idx++] = -7;
+        log_ptr[idx++] = -7;
     }
+    const uint32_t nn = blockIdx.x / top_d;                          // N=slow index
+    if(nn >= all_n)
+        return;
+
+    const uint32_t td = blockIdx.x % top_d;                          // top D=fast index
+    if(td >= top_d)
+        return;
+
+    const uint32_t th = blockIdx.y;  // top H
+    const uint32_t j = (gridDim.y == 1) ? threadIdx.y : blockIdx.y;  // top H
+    if(th >= top_h)
+        return;
 
+    const uint32_t tw = blockIdx.z % top_w;  // top W=fast index
+    if(tw >= top_w)
+        return;
+if(true) {
     uint32_t cc = 0;
-    // const auto c_base = (blockDim.x == all_c) ? 0 : (blockIdx.z / top_w) * blockDim.x;
-            size_t top_index = 64
-                    + nn * top_n_stride             // TEMPCODE RJS
+            size_t top_index = 
+                    nn * top_n_stride             // TEMPCODE RJS
                     + cc * top_c_stride           //
                     + (size_t)(td * top_d_stride) //
                     + (size_t)(th * top_h_stride) //
                     + (size_t)(tw * top_w_stride);
 
-        top_ptr[top_index] = top_index;
-
-//     const auto int_dstart   = static_cast<int64_t>(td * filter_d_stride) - static_cast<int64_t>(filter_d_pad);
-//     const auto dend           = static_cast<size_t>(min(int_dstart + static_cast<int64_t>(filter_d), static_cast<int64_t>(bot_d)));
-//     const auto dstart         = static_cast<size_t>(max(int_dstart, 0));
-
-//     const auto int_hstart   = static_cast<int>(th * filter_h_stride) - static_cast<int>(filter_h_pad);
-//     const auto hend             = static_cast<uint32_t>(min(int_hstart + static_cast<int>(filter_h), static_cast<int>(bot_h)));
-//     const auto hstart         = static_cast<uint32_t>(max(int_hstart, 0));
-
-//     const auto int_wstart        = static_cast<int>(tw * filter_w_stride) - static_cast<int>(filter_w_pad);
-//     const auto wend             = static_cast<uint32_t>(min(int_wstart + static_cast<int>(filter_w), static_cast<int>(bot_w)));
-//     const auto wstart           = static_cast<uint32_t>(max(int_wstart, 0));
-//     out_ptr[0] = 1.2;
-//     for(uint32_t cc = 0; cc < 1; ++cc)  // top C loop
-//     {
-//         if(cc >= all_c)   return;
-//         {
-//             size_t top_index = nn * top_n_stride             // TEMPCODE RJS
-//                     + cc * top_c_stride           //
-//                     + (size_t)(td * top_d_stride) //
-//                     + (size_t)(th * top_h_stride) //
-//                     + (size_t)(tw * top_w_stride);
-//         top_index = 1;
-
-//         junk_ptr[top_index] = (_FLOAT)1.1;
-//         if(j != 1)
-//         }
-// #if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE
-//         uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
-//         pool_size       = (pool_size == 0) ? 1 : pool_size;
-// #elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE
-//         const uint32_t pool_size = filter_d * filter_h * filter_w;
-// #endif
-
-// #if AVERAGE_OPS
-//         _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0);
-// #else // MAX
-//         _FLOAT_ACCUM res     = (_FLOAT_ACCUM)(-MAX_VAL_ACCUM);
-//         bool found           = false; // May remain false if bot contains only NaNs/-INFs.
-//         uint32_t d_save          = 0;
-//         uint32_t h_save          = 0;
-//         uint32_t w_save          = 0;
-// #endif
-//         for(size_t bd = dstart; bd < dend; ++bd)
-//         {
-//             for(uint32_t bh = hstart; bh < hend; ++bh)
-//             {
-//                 for(uint32_t bw = wstart; bw < wend; ++bw)
-//                 {
-//                     const size_t bot_index = nn * bot_n_stride +           //
-//                                             cc * bot_c_stride +           //
-//                                             bd * bot_d_stride + //
-//                                             static_cast<size_t>(bh * bot_h_stride) + //
-//                                             static_cast<size_t>(bw * bot_w_stride);
-// #if AVERAGE_OPS
-//                     res += static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]);
-// #else // MAX
-//                     if(static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]) > res)
-//                     {
-//                         res = bot_ptr[bot_index];
-//                         if(save_index)
-//                         {
-//                             found  = true;
-//                             d_save = bd;
-//                             h_save = bh;
-//                             w_save = bw;
-//                         }
-//                     }
-// #endif
-//                 }
-//             }
-//         }
-
-// #if AVERAGE_OPS
-//         res *= CVT_FP32_2ACCUM(1.f) / static_cast<_FLOAT_ACCUM>(pool_size);
-// #else // MAX
-// res *= 1.0; // TEMPCODE RJS fix UNUSED
-//         if(save_index)
-//         {
-//             index_t res_index = 0;
-
-//             / Preventing overflow during computation of res_index:
-//             / If Index is shorter than uint, then let's perform computation in 32-bit
-//             / domain and then convert to narrower Index. That would reduce the probability of
-//             / overflow. If Index is wider then 32 bits, then it seems like it is better to
-//             / convert to Index type before multiplication. However this is not actually
-//             / necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in
-//             / 32 bits and then convert.
-
-//             if(found)
-//             {
-//                 if(index_mode == 1)
-//                     res_index = (index_t)(d_save * bot_h * bot_w //
-//                                             + h_save * bot_w       //
-//                                             + w_save);
-//                 else
-//                     res_index = (index_t)(                                                    //
-//                         ((d_save - td * filter_d_stride + filter_d_pad) * filter_h * filter_w) //
-//                         + ((h_save - th * filter_h_stride + filter_h_pad) * filter_w)          //
-//                         + (w_save - tw * filter_w_stride + filter_w_pad)                       //
-//                     );
-//             }
-
-//             const size_t mask_index = nn * mask_n_stride             //
-//                                         + cc * mask_c_stride           //
-//                                         + (size_t)(td * mask_d_stride) //
-//                                         + (size_t)(tw * mask_h_stride) //
-//                                         + (size_t)(th * mask_w_stride);
-//             mask_ptr[mask_index] = res_index;
-//         }
-// #endif
-//         size_t top_index = nn * top_n_stride             //
-//                                 + cc * top_c_stride           //
-//                                 + (size_t)(td * top_d_stride) //
-//                                 + (size_t)(th * top_h_stride) //
-//                                 + (size_t)(tw * top_w_stride);
-//         top_index = 1;
-//         junk_ptr[top_index] = (_FLOAT)res;    // TEMPCODE RJS
-//     }
+        junk_ptr[top_index] = top_index;
+}
+if(true) {  // TEMPCODE RJS
+    const auto int_dstart   = static_cast<int64_t>(td * filter_d_stride) - static_cast<int64_t>(filter_d_pad);
+    const auto dend           = static_cast<size_t>(min(int_dstart + static_cast<int64_t>(filter_d), static_cast<int64_t>(bot_d)));
+    const auto dstart         = static_cast<size_t>(max(int_dstart, 0));
+
+    const auto int_hstart   = static_cast<int>(th * filter_h_stride) - static_cast<int>(filter_h_pad);
+    const auto hend             = static_cast<uint32_t>(min(int_hstart + static_cast<int>(filter_h), static_cast<int>(bot_h)));
+    const auto hstart         = static_cast<uint32_t>(max(int_hstart, 0));
+
+    const auto int_wstart        = static_cast<int>(tw * filter_w_stride) - static_cast<int>(filter_w_pad);
+    const auto wend             = static_cast<uint32_t>(min(int_wstart + static_cast<int>(filter_w), static_cast<int>(bot_w)));
+    const auto wstart           = static_cast<uint32_t>(max(int_wstart, 0));
+    // const auto c_base = (blockDim.x == all_c) ? 0 : (blockIdx.z / top_w) * blockDim.x;
+
+    for(uint32_t cc = 0; cc < 1; ++cc)  // top C loop
+    {
+        if(cc >= all_c)   return;
+
+#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE
+        uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+        pool_size       = (pool_size == 0) ? 1 : pool_size;
+#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE
+        const uint32_t pool_size = filter_d * filter_h * filter_w;
+#endif
+
+#if AVERAGE_OPS
+        _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0);
+#else // MAX
+        _FLOAT_ACCUM res     = (_FLOAT_ACCUM)(-MAX_VAL_ACCUM);
+        bool found           = false; // May remain false if bot contains only NaNs/-INFs.
+        uint32_t d_save          = 0;
+        uint32_t h_save          = 0;
+        uint32_t w_save          = 0;
+#endif
+        for(size_t bd = dstart; bd < dend; ++bd)
+        {
+            for(uint32_t bh = hstart; bh < hend; ++bh)
+            {
+                for(uint32_t bw = wstart; bw < wend; ++bw)
+                {
+                    const size_t bot_index = nn * bot_n_stride +           //
+                                            cc * bot_c_stride +           //
+                                            bd * bot_d_stride + //
+                                            static_cast<size_t>(bh * bot_h_stride) + //
+                                            static_cast<size_t>(bw * bot_w_stride);
+#if AVERAGE_OPS
+                    res += static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]);
+#else // MAX
+                    if(static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]) > res)
+                    {
+                        res = bot_ptr[bot_index];
+                        if(save_index)
+                        {
+                            found  = true;
+                            d_save = bd;
+                            h_save = bh;
+                            w_save = bw;
+                        }
+                    }
+#endif
+                }
+            }
+        }
+
+#if AVERAGE_OPS
+        res *= CVT_FP32_2ACCUM(1.f) / static_cast<_FLOAT_ACCUM>(pool_size);
+#else // MAX
+res *= 1.0; // TEMPCODE RJS fix UNUSED
+        if(save_index)
+        {
+            index_t res_index = 0;
+
+            // / Preventing overflow during computation of res_index:
+            // / If Index is shorter than uint, then let's perform computation in 32-bit
+            // / domain and then convert to narrower Index. That would reduce the probability of
+            // / overflow. If Index is wider then 32 bits, then it seems like it is better to
+            // / convert to Index type before multiplication. However this is not actually
+            // / necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in
+            // / 32 bits and then convert.
+
+            if(found)
+            {
+                if(index_mode == 1)
+                    res_index = (index_t)(d_save * bot_h * bot_w //
+                                            + h_save * bot_w       //
+                                            + w_save);
+                else
+                    res_index = (index_t)(                                                    //
+                        ((d_save - td * filter_d_stride + filter_d_pad) * filter_h * filter_w) //
+                        + ((h_save - th * filter_h_stride + filter_h_pad) * filter_w)          //
+                        + (w_save - tw * filter_w_stride + filter_w_pad)                       //
+                    );
+            }
+
+            const size_t mask_index = nn * mask_n_stride             //
+                                        + cc * mask_c_stride           //
+                                        + (size_t)(td * mask_d_stride) //
+                                        + (size_t)(tw * mask_h_stride) //
+                                        + (size_t)(th * mask_w_stride);
+            mask_ptr[mask_index] = res_index;
+        }
+#endif
+        size_t top_index = nn * top_n_stride             //
+                                + cc * top_c_stride           //
+                                + (size_t)(td * top_d_stride) //
+                                + (size_t)(th * top_h_stride) //
+                                + (size_t)(tw * top_w_stride);
+
+        top_ptr[top_index] = (_FLOAT)res;    // TEMPCODE RJS
+    }
+} // TEMPCODE
 }
 
 extern "C" __global__ void mloPoolingForwardNDNhwcNaive(
@@ -368,10 +361,10 @@ extern "C" __global__ void mloPoolingForwardNDNhwcNaive(
                                     uint32_t all_n,
                                     uint32_t all_c,
                                     uint32_t bot_d, uint32_t bot_h, uint32_t bot_w,
-                                    size_t bot_n_stride, size_t bot_c_stride, uint32_t bot_d_stride, uint32_t bot_h_stride, uint32_t bot_w_stride,
+                                    size_t bot_n_stride, uint32_t bot_c_stride, size_t bot_d_stride, uint32_t bot_h_stride, uint32_t bot_w_stride,
                                     uint32_t top_d, uint32_t top_h, uint32_t top_w,
-                                    size_t top_n_stride, size_t top_c_stride, uint32_t top_d_stride, uint32_t top_h_stride, uint32_t top_w_stride,
-                                    size_t mask_n_stride, uint32_t mask_c_stride, uint32_t mask_d_stride, uint32_t mask_h_stride, size_t mask_w_stride)
+                                    size_t top_n_stride, uint32_t top_c_stride, size_t top_d_stride, uint32_t top_h_stride, uint32_t top_w_stride,
+                                    size_t mask_n_stride, size_t mask_c_stride, uint32_t mask_d_stride, uint32_t mask_h_stride, uint32_t mask_w_stride)
 {
     poolingForwardNDNhwcNaive<INPUT_TYPE, OUTPUT_TYPE>(
         bot_ptr,
diff --git a/src/solver/pooling/forwardNdNhwcNaive.cpp b/src/solver/pooling/forwardNdNhwcNaive.cpp
index c946fd8da7..70e4d998cf 100644
--- a/src/solver/pooling/forwardNdNhwcNaive.cpp
+++ b/src/solver/pooling/forwardNdNhwcNaive.cpp
@@ -177,21 +177,24 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context,
 
     const auto spatial_dim = is2d ? 2U : 3U;
 
-    size_t all_n, all_c, bot_d, bot_h, bot_w;
+    uint32_t all_n, all_c, bot_d, bot_h, bot_w;
     std::tie(all_n, all_c, bot_d, bot_h, bot_w) = miopen::GetNCDHW(spatial_dim, bot.GetLengths());
+    std::cout << "GetSol: bot_lens " << all_n << " " << all_c << " " << bot_d << " " << bot_h << " " << bot_w << std::endl;
 
     size_t bot_n_stride, bot_d_stride;
-    size_t bot_h_stride, bot_w_stride, bot_c_stride;
+    uint32_t bot_h_stride, bot_w_stride, bot_c_stride;
     std::tie(bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride, bot_w_stride) =
         miopen::GetNCDHW(spatial_dim, bot.GetStrides());
+std::cout << "GetSol: bot_strides " << bot_n_stride << " " << bot_c_stride << " " << bot_d_stride
+<< " " << bot_h_stride << " " << bot_w_stride  << std::endl;
 
-    size_t a1, a2, top_d, top_h, top_w;
-    std::tie(a1, a2, top_d, top_h, top_w) =
+    uint32_t top_d, top_h, top_w;
+    std::tie(std::ignore, std::ignore, top_d, top_h, top_w) =
         miopen::GetNCDHW(spatial_dim, top.GetLengths());
-    std::cout << "GetSol: top_lens " << a1 << " " << a2 << " " << top_d << " " << top_h << " " << top_w << std::endl;
+    std::cout << "GetSol: top_lens " << top_d << " " << top_h << " " << top_w << std::endl;
 
     size_t top_n_stride, top_d_stride;
-    size_t top_h_stride, top_w_stride, top_c_stride;
+    uint32_t top_h_stride, top_w_stride, top_c_stride;
     std::tie(top_n_stride, top_c_stride, top_d_stride, top_h_stride, top_w_stride) =
         miopen::GetNCDHW(spatial_dim, top.GetStrides());
     // TEMPCODE RJS
@@ -296,8 +299,8 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context,
         kernel.l_wk.push_back(l2);
 
         // TEMPCODE RJS
-        std::cout << "Kernel dims: " << kernel.g_wk.size() << " " << kernel.g_wk[0] << " " << kernel.g_wk[1] << " " << kernel.g_wk[2]
-        << " | " << kernel.l_wk.size() << " " << kernel.l_wk[0] << " " << kernel.l_wk[1] << " " << kernel.l_wk[2] << std::endl;
+        std::cout << "Kernel dims: g[" << kernel.g_wk.size() << "] " << kernel.g_wk[0] << " " << kernel.g_wk[1] << " " << kernel.g_wk[2]
+        << " | l[" << kernel.l_wk.size() << "] " << kernel.l_wk[0] << " " << kernel.l_wk[1] << " " << kernel.l_wk[2] << std::endl;
         result.construction_params.push_back(kernel);
     }
 
diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp
index 69c19478ae..6bbd949f78 100644
--- a/test/pooling_common.hpp
+++ b/test/pooling_common.hpp
@@ -151,7 +151,7 @@ struct verify_forward_pooling
     tensor<T>
     cpu(const tensor<T>& input, const miopen::PoolingDescriptor& filter, std::vector<Index>&) const
     {
-        const bool is_default_layout = input.desc.IsDefaultLayout();
+        // const bool is_default_layout = input.desc.IsDefaultLayout();
         const int sptl_dim_offset = 2; // is_default_layout ? 2 : 1; TEMPCODE RJS
         const int chan_dim_offset = 1; // is_default_layout ? 1 : SptDim + 1;
 
@@ -281,16 +281,14 @@ struct verify_forward_pooling
                   std::vector<Index>& indices) const
     {
         auto&& handle = get_handle();
-        auto out      = get_big_output_tensor(filter, input);   // TEMPCODE RJS
+        auto out      = get_output_tensor(filter, input);
+        auto junk      = get_big_output_tensor(filter, input);   // TEMPCODE RJS
 
         indices.resize(out.data.size(), 0);
 
         auto in_dev  = handle.Write(input.data);
         auto out_dev = handle.Create<T>(out.data.size());
-        auto junk_dev = handle.Create<T>(out.data.size());  // 
-        std::cout << "gpu Sizes: in:" << input.data.size() << " out: " << out.data.size() << " " << out.desc.GetLayout_str()
-        << " " << out.desc.GetLengths()[0]  << " " << out.desc.GetLengths()[1] << " " << out.desc.GetLengths()[2] << " " << out.desc.GetLengths()[3]
-        << " " << out.desc.GetStrides()[0]  << " " << out.desc.GetStrides()[1] << " " << out.desc.GetStrides()[2] << " " << out.desc.GetStrides()[3] << std::endl;
+        auto junk_dev = handle.Create<T>(junk.data.size());  // 
         Workspace wspace{};
         wspace.Write(indices);
 
@@ -308,8 +306,9 @@ struct verify_forward_pooling
                        junk_dev.get()); // TEMPCODE RJS
 
         indices  = wspace.Read<std::vector<Index>>();
-        out.data = handle.Read<T>(out_dev, out.data.size());
-        if(true)
+        handle.ReadTo(out.data.data(), out_dev, out.data.size() * sizeof(T));
+        handle.ReadTo(junk.data.data(), junk_dev, junk.data.size() * sizeof(T));
+        if(false)
         {
             std::cout << "GPU out: ";
             auto outlen = out.desc.GetLengths();
@@ -335,6 +334,18 @@ struct verify_forward_pooling
                     }
                 }
             }
+            if(false){
+            std::cout << "GPU out (4-cols): " << std::endl;
+            for(int idx = 0; idx < 160; ++idx) {
+                std::cout << std::setw(11) << std::setprecision(5) << out.data[idx] << "  ";
+                if((idx % 4) == 3)  std::cout <<std::endl;
+            }
+            std::cout << "GPU junk: " << std::endl;
+            for(int idx = 0; idx < 160; ++idx) {
+                std::cout << std::setw(11) << std::setprecision(5) << junk.data[idx] << "  ";
+                if((idx % 4) == 3)  std::cout <<std::endl;
+            }
+            }
         }   // print output tensor
         return out;
     }
@@ -610,13 +621,14 @@ struct pooling_driver : test_driver
     {
         add(index_type,
             "index_type",
-            generate_multi_data<const char*>( //
-                {{"miopenIndexUint8",
-                  "miopenIndexUint16",
-                  "miopenIndexUint32",
-                  "miopenIndexUint64"},                     //
-                 {"miopenIndexUint8", "miopenIndexUint32"}, //
-                 {"miopenIndexUint32"}}                     //
+            generate_data({"miopenIndexUint32",}    // TEMPCODE RJS
+            // generate_multi_data<const char*>( //
+            //     {{"miopenIndexUint8",
+            //       "miopenIndexUint16",
+            //       "miopenIndexUint32",
+            //       "miopenIndexUint64"},                     //
+            //      {"miopenIndexUint8", "miopenIndexUint32"}, //
+            //      {"miopenIndexUint32"}}                     //
                 ));
         add(mode,
             "mode",

From 7dd833335cd36efc1f39d048e17d541086da4e91 Mon Sep 17 00:00:00 2001
From: "Randy J. Spaulding" <rspauldi@amd.com>
Date: Mon, 26 Aug 2024 09:31:51 +0000
Subject: [PATCH 04/10] cp

---
 src/CMakeLists.txt                            |   1 +
 .../miopen/pooling/poolingNdNhwcArgs.hpp      |  19 +
 .../MIOpenPoolingForwardNDNhwcNaive.cpp       | 326 +++++++++---------
 src/pooling/problem_description.cpp           |   6 +-
 src/solver/pooling/forward2d.cpp              |  38 +-
 src/solver/pooling/forwardNdNhwcNaive.cpp     | 143 ++++----
 test/gtest/poolingFwdNdNaive.cpp              |   8 +-
 test/pooling2d.hpp                            |   6 +-
 test/pooling_common.hpp                       | 122 ++++---
 9 files changed, 358 insertions(+), 311 deletions(-)
 create mode 100644 src/include/miopen/pooling/poolingNdNhwcArgs.hpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 3b51f60f05..1bd81f9e87 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -603,6 +603,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
     kernels/bfloat16_dev.hpp
     kernels/float_types.h
     kernels/pooling_functions.h
+    include/miopen/pooling/poolingNdNhwcArgs.hpp
     )
     # Kernels in development lists.
     # Should be ALWAYS empty in develop branch (at the time of PR merge)
diff --git a/src/include/miopen/pooling/poolingNdNhwcArgs.hpp b/src/include/miopen/pooling/poolingNdNhwcArgs.hpp
new file mode 100644
index 0000000000..9fd15bd4bd
--- /dev/null
+++ b/src/include/miopen/pooling/poolingNdNhwcArgs.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+// #include <stdint.h>
+
+using BIGONE = uint32_t; // TEMPCODE RJS 
+
+struct poolingNdNhwcArgs
+{
+    uint32_t filter_d; uint32_t filter_h; uint32_t filter_w;
+    uint32_t filter_d_stride; uint32_t filter_h_stride; uint32_t filter_w_stride;
+    uint32_t filter_d_pad; uint32_t filter_h_pad; uint32_t filter_w_pad;
+    uint32_t all_n;
+    uint32_t all_c;
+    uint32_t bot_d; uint32_t bot_h; uint32_t bot_w;
+    BIGONE bot_n_stride; uint32_t bot_c_stride; BIGONE bot_d_stride; uint32_t bot_h_stride; uint32_t bot_w_stride;
+    uint32_t top_d; uint32_t top_h; uint32_t top_w;
+    BIGONE top_n_stride; uint32_t top_c_stride; BIGONE top_d_stride; uint32_t top_h_stride; uint32_t top_w_stride;
+    BIGONE mask_n_stride; BIGONE mask_c_stride; uint32_t mask_d_stride; uint32_t mask_h_stride; uint32_t mask_w_stride;
+};
diff --git a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
index 60aa2aac23..a6aa93aade 100644
--- a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
+++ b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
@@ -51,6 +51,7 @@
 #include "float_types.h"
 #endif
 #include "pooling_functions.h"
+#include "poolingNdNhwcArgs.hpp"
 
 #if(MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE) || (MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE)
 #define AVERAGE_OPS 1
@@ -76,125 +77,127 @@
 #define ARG_UNUSED_FOR_AVERAGE
 #endif
 
-#define doUU 0
-#if doUU
-#define UU  __attribute__((__unused__))
-#else
-#define UU
-#endif
-
-#define doUU1 0
-#if doUU1
-#define UU1  __attribute__((__unused__))
-#else
-#define UU1
-#endif
-
 // Out N, D, H are encoded into the block indices x, y, z
 // No 2D-only optimization.
 template <typename TI, typename TO>
-__device__ void poolingForwardNDNhwcNaive(UU1 const TI* __restrict__ bot_ptr,
+__device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
                                     TO* __restrict__ top_ptr,
-                                    UU1 TO* __restrict__ junk_ptr,  // TEMPCODE RJS
-                                    UU1 ARG_UNUSED_FOR_AVERAGE index_t* __restrict__ mask_ptr,
-                                    UU1 ARG_UNUSED_FOR_AVERAGE int save_index,
-                                    UU1 ARG_UNUSED_FOR_AVERAGE int index_mode,
-                                    UU uint32_t filter_d,
-                                    UU uint32_t filter_h,
-                                    UU uint32_t filter_w,
-                                    UU uint32_t filter_d_stride,
-                                    UU uint32_t filter_h_stride,
-                                    UU uint32_t filter_w_stride,
-                                    UU uint32_t filter_d_pad,
-                                    UU uint32_t filter_h_pad,
-                                    UU uint32_t filter_w_pad,
-                                    uint32_t all_n,
-                                    UU uint32_t all_c, // TEMPCODE RJS
-                                    UU uint32_t bot_d,
-                                    UU uint32_t bot_h,
-                                    UU uint32_t bot_w,
-                                    UU size_t bot_n_stride,
-                                    UU uint32_t bot_c_stride,
-                                    UU size_t bot_d_stride,
-                                    UU uint32_t bot_h_stride,
-                                    UU uint32_t bot_w_stride,
-                                    uint32_t top_d,
-                                    uint32_t top_h,
-                                    uint32_t top_w,
-                                    size_t top_n_stride,
-                                    uint32_t top_c_stride,
-                                    size_t top_d_stride,
-                                    uint32_t top_h_stride,
-                                    uint32_t top_w_stride,
-                                    UU ARG_UNUSED_FOR_AVERAGE size_t mask_n_stride,
-                                    UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_c_stride,
-                                    UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_d_stride,
-                                    UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_h_stride,
-                                    UU ARG_UNUSED_FOR_AVERAGE size_t mask_w_stride)
+                                    TO* __restrict__ junk_ptr,  // TEMPCODE RJS
+                                    ARG_UNUSED_FOR_AVERAGE index_t* __restrict__ mask_ptr,
+                                    ARG_UNUSED_FOR_AVERAGE int save_index,
+                                    ARG_UNUSED_FOR_AVERAGE int index_mode,
+                                    poolingNdNhwcArgs args
+                                    // UU uint32_t filter_d,
+                                    // UU uint32_t filter_h,
+                                    // UU uint32_t filter_w,
+                                    // UU uint32_t filter_d_stride,
+                                    // UU uint32_t filter_h_stride,
+                                    // UU uint32_t filter_w_stride,
+                                    // UU uint32_t filter_d_pad,
+                                    // UU uint32_t filter_h_pad,
+                                    // UU uint32_t filter_w_pad,
+                                    // uint32_t all_n,
+                                    // UU uint32_t all_c, // TEMPCODE RJS
+                                    // UU uint32_t bot_d,
+                                    // UU uint32_t bot_h,
+                                    // UU uint32_t bot_w,
+                                    // UU BIGONE bot_n_stride,
+                                    // UU uint32_t bot_c_stride,
+                                    // UU BIGONE bot_d_stride,
+                                    // UU uint32_t bot_h_stride,
+                                    // UU uint32_t bot_w_stride,
+                                    // uint32_t top_d,
+                                    // uint32_t top_h,
+                                    // uint32_t top_w,
+                                    // BIGONE top_n_stride,
+                                    // uint32_t top_c_stride,
+                                    // BIGONE top_d_stride,
+                                    // uint32_t top_h_stride,
+                                    // uint32_t top_w_stride,
+                                    // UU ARG_UNUSED_FOR_AVERAGE BIGONE mask_n_stride,
+                                    // UU ARG_UNUSED_FOR_AVERAGE BIGONE mask_c_stride,
+                                    // UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_d_stride,
+                                    // UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_h_stride,
+                                    // UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_w_stride
+)
 {
+    const uint32_t nn = blockIdx.x / args.top_d;                          // N=slow index
+    const uint32_t td = blockIdx.x % args.top_d;                          // top D=fast index
+    const uint32_t th = blockIdx.y;  // top H
+    const uint32_t tw = blockIdx.z % args.all_c;  // top W=fast index
+    const auto c_base = (blockIdx.z / args.all_c) * blockDim.x;
+    if(blockDim.x > args.all_c)
+    {
+        // // TODO: h, w, or both may be encoded into threadIdx
+        // if(top_h > 1 && blockDim.y == 1)    
+    }
 
     auto log_ptr = junk_ptr;
-    if(nn == 0 && td == 0 && th == 0 && tw == 0)
+    if(blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && threadIdx.x == 0 &&  threadIdx.y == 0 &&  threadIdx.z == 0)
     {
+        for(int i = 0; i < 320; ++i)
+        {
+            junk_ptr[i] = (_FLOAT)1.11111;
+        }
         int idx = 0;
-        log_ptr[idx++] = gridDim.x;
-        log_ptr[idx++] = gridDim.y;
-        log_ptr[idx++] = gridDim.z;
+        log_ptr[idx++] = gridDim.x;     // ND
+        log_ptr[idx++] = gridDim.y;     // H
+        log_ptr[idx++] = gridDim.z;     // W (*C overflow)
         log_ptr[idx++] = -9;
 
-        log_ptr[idx++] = blockDim.x;
-        log_ptr[idx++] = blockDim.y;
-        log_ptr[idx++] = blockDim.z;
+        log_ptr[idx++] = blockDim.x;    // C
+        log_ptr[idx++] = blockDim.y;    // small-C H
+        log_ptr[idx++] = blockDim.z;    // small-C W
         log_ptr[idx++] = -8;
 
-        log_ptr[idx++] = filter_d;
-        log_ptr[idx++] = filter_h;
-        log_ptr[idx++] = filter_w;
+        log_ptr[idx++] = args.filter_d;
+        log_ptr[idx++] = args.filter_h;
+        log_ptr[idx++] = args.filter_w;
         log_ptr[idx++] = -7;
 
-        log_ptr[idx++] = filter_d_stride;
-        log_ptr[idx++] = filter_h_stride;
-        log_ptr[idx++] = filter_w_stride;
+        log_ptr[idx++] = args.filter_d_stride;
+        log_ptr[idx++] = args.filter_h_stride;
+        log_ptr[idx++] = args.filter_w_stride;
         log_ptr[idx++] = -6;
 
-        log_ptr[idx++] = filter_d_pad;
-        log_ptr[idx++] = filter_h_pad;
-        log_ptr[idx++] = filter_w_pad;
+        log_ptr[idx++] = args.filter_d_pad;
+        log_ptr[idx++] = args.filter_h_pad;
+        log_ptr[idx++] = args.filter_w_pad;
         log_ptr[idx++] = -5;
 
-        log_ptr[idx++] = all_n;
-        log_ptr[idx++] = all_c;
-        log_ptr[idx++] = bot_n_stride;
-        log_ptr[idx++] = bot_c_stride;
+        log_ptr[idx++] = args.all_n;
+        log_ptr[idx++] = args.all_c;
+        log_ptr[idx++] = args.bot_n_stride;
+        log_ptr[idx++] = args.bot_c_stride;
 
-        log_ptr[idx++] = top_n_stride;
-        log_ptr[idx++] = top_c_stride;
+        log_ptr[idx++] = args.top_n_stride;
+        log_ptr[idx++] = args.top_c_stride;
         #if AVERAGE_OPS
         log_ptr[idx++] = -4;
         log_ptr[idx++] = -4;
         #else
-        log_ptr[idx++] = mask_n_stride;
-        log_ptr[idx++] = mask_c_stride;
+        log_ptr[idx++] = args.mask_n_stride;
+        log_ptr[idx++] = args.mask_c_stride;
         #endif
 
-        log_ptr[idx++] = bot_d;
-        log_ptr[idx++] = bot_h;
-        log_ptr[idx++] = bot_w;
+        log_ptr[idx++] = args.bot_d;
+        log_ptr[idx++] = args.bot_h;
+        log_ptr[idx++] = args.bot_w;
         log_ptr[idx++] = -3;
 
-        log_ptr[idx++] = bot_d_stride;
-        log_ptr[idx++] = bot_h_stride;
-        log_ptr[idx++] = bot_w_stride;
+        log_ptr[idx++] = args.bot_d_stride;
+        log_ptr[idx++] = args.bot_h_stride;
+        log_ptr[idx++] = args.bot_w_stride;
         log_ptr[idx++] = -2;
 
-        log_ptr[idx++] = top_d;
-        log_ptr[idx++] = top_h;
-        log_ptr[idx++] = top_w;
+        log_ptr[idx++] = args.top_d;
+        log_ptr[idx++] = args.top_h;
+        log_ptr[idx++] = args.top_w;
         log_ptr[idx++] = -1;
     
-        log_ptr[idx++] = top_d_stride;
-        log_ptr[idx++] = top_h_stride;
-        log_ptr[idx++] = top_w_stride;
+        log_ptr[idx++] = args.top_d_stride;
+        log_ptr[idx++] = args.top_h_stride;
+        log_ptr[idx++] = args.top_w_stride;
         log_ptr[idx++] = -9;
 
         #if AVERAGE_OPS
@@ -202,62 +205,58 @@ __device__ void poolingForwardNDNhwcNaive(UU1 const TI* __restrict__ bot_ptr,
         log_ptr[idx++] = -8;
         log_ptr[idx++] = -8;
         #else
-        log_ptr[idx++] = mask_d_stride;
-        log_ptr[idx++] = mask_h_stride;
-        log_ptr[idx++] = mask_w_stride;
+        log_ptr[idx++] = args.mask_d_stride;
+        log_ptr[idx++] = args.mask_h_stride;
+        log_ptr[idx++] = args.mask_w_stride;
         #endif
         log_ptr[idx++] = -7;
+        while(idx < 64) log_ptr[idx++] = (_FLOAT)0;
     }
-    const uint32_t nn = blockIdx.x / top_d;                          // N=slow index
-    if(nn >= all_n)
-        return;
 
-    const uint32_t td = blockIdx.x % top_d;                          // top D=fast index
-    if(td >= top_d)
-        return;
+    // if(nn >= args.all_n)
+    //     return;
 
-    const uint32_t th = blockIdx.y;  // top H
-    const uint32_t j = (gridDim.y == 1) ? threadIdx.y : blockIdx.y;  // top H
-    if(th >= top_h)
-        return;
+    // if(td >= args.top_d)
+    //     return;
+
+    // if(th >= args.top_h)
+    //     return;
+
+    // if(tw >= args.top_w)
+    //     return;
 
-    const uint32_t tw = blockIdx.z % top_w;  // top W=fast index
-    if(tw >= top_w)
-        return;
-if(true) {
-    uint32_t cc = 0;
-            size_t top_index = 
-                    nn * top_n_stride             // TEMPCODE RJS
-                    + cc * top_c_stride           //
-                    + (size_t)(td * top_d_stride) //
-                    + (size_t)(th * top_h_stride) //
-                    + (size_t)(tw * top_w_stride);
-
-        junk_ptr[top_index] = top_index;
-}
 if(true) {  // TEMPCODE RJS
-    const auto int_dstart   = static_cast<int64_t>(td * filter_d_stride) - static_cast<int64_t>(filter_d_pad);
-    const auto dend           = static_cast<size_t>(min(int_dstart + static_cast<int64_t>(filter_d), static_cast<int64_t>(bot_d)));
+    const auto int_dstart   = static_cast<int64_t>(td * args.filter_d_stride) - static_cast<int64_t>(args.filter_d_pad);
+    const auto dend           = static_cast<size_t>(min(int_dstart + static_cast<int64_t>(args.filter_d), static_cast<int64_t>(args.bot_d)));
     const auto dstart         = static_cast<size_t>(max(int_dstart, 0));
 
-    const auto int_hstart   = static_cast<int>(th * filter_h_stride) - static_cast<int>(filter_h_pad);
-    const auto hend             = static_cast<uint32_t>(min(int_hstart + static_cast<int>(filter_h), static_cast<int>(bot_h)));
+    const auto int_hstart   = static_cast<int>(th * args.filter_h_stride) - static_cast<int>(args.filter_h_pad);
+    const auto hend             = static_cast<uint32_t>(min(int_hstart + static_cast<int>(args.filter_h), static_cast<int>(args.bot_h)));
     const auto hstart         = static_cast<uint32_t>(max(int_hstart, 0));
 
-    const auto int_wstart        = static_cast<int>(tw * filter_w_stride) - static_cast<int>(filter_w_pad);
-    const auto wend             = static_cast<uint32_t>(min(int_wstart + static_cast<int>(filter_w), static_cast<int>(bot_w)));
+    const auto int_wstart        = static_cast<int>(tw * args.filter_w_stride) - static_cast<int>(args.filter_w_pad);
+    const auto wend             = static_cast<uint32_t>(min(int_wstart + static_cast<int>(args.filter_w), static_cast<int>(args.bot_w)));
     const auto wstart           = static_cast<uint32_t>(max(int_wstart, 0));
-    // const auto c_base = (blockDim.x == all_c) ? 0 : (blockIdx.z / top_w) * blockDim.x;
 
-    for(uint32_t cc = 0; cc < 1; ++cc)  // top C loop
-    {
-        if(cc >= all_c)   return;
+    uint32_t cc = c_base + threadIdx.x;
+    // if(cc > args.all_c) return;
+
+    size_t top_index = 
+            nn * args.top_n_stride             // TEMPCODE RJS
+            + cc * args.top_c_stride           //
+            + (size_t)(td * args.top_d_stride) //
+            + (size_t)(th * args.top_h_stride) //
+            + (size_t)(tw * args.top_w_stride);
+if(true) {
+        top_ptr[top_index] = (TO)-1.11111;
+        junk_ptr[64 + top_index] = top_index;
+}
 
 #if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE
         uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
         pool_size       = (pool_size == 0) ? 1 : pool_size;
 #elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE
-        const uint32_t pool_size = filter_d * filter_h * filter_w;
+        const uint32_t pool_size = args.filter_d * args.filter_h * args.filter_w;
 #endif
 
 #if AVERAGE_OPS
@@ -275,11 +274,11 @@ if(true) {  // TEMPCODE RJS
             {
                 for(uint32_t bw = wstart; bw < wend; ++bw)
                 {
-                    const size_t bot_index = nn * bot_n_stride +           //
-                                            cc * bot_c_stride +           //
-                                            bd * bot_d_stride + //
-                                            static_cast<size_t>(bh * bot_h_stride) + //
-                                            static_cast<size_t>(bw * bot_w_stride);
+                    const size_t bot_index = nn * args.bot_n_stride +           //
+                                            cc * args.bot_c_stride +           //
+                                            bd * args.bot_d_stride + //
+                                            static_cast<size_t>(bh * args.bot_h_stride) + //
+                                            static_cast<size_t>(bw * args.bot_w_stride);
 #if AVERAGE_OPS
                     res += static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]);
 #else // MAX
@@ -318,33 +317,35 @@ res *= 1.0; // TEMPCODE RJS fix UNUSED
             if(found)
             {
                 if(index_mode == 1)
-                    res_index = (index_t)(d_save * bot_h * bot_w //
-                                            + h_save * bot_w       //
+                    res_index = (index_t)(d_save * args.bot_h * args.bot_w //
+                                            + h_save * args.bot_w       //
                                             + w_save);
                 else
                     res_index = (index_t)(                                                    //
-                        ((d_save - td * filter_d_stride + filter_d_pad) * filter_h * filter_w) //
-                        + ((h_save - th * filter_h_stride + filter_h_pad) * filter_w)          //
-                        + (w_save - tw * filter_w_stride + filter_w_pad)                       //
+                        ((d_save - td * args.filter_d_stride + args.filter_d_pad) * args.filter_h * args.filter_w) //
+                        + ((h_save - th * args.filter_h_stride + args.filter_h_pad) * args.filter_w)          //
+                        + (w_save - tw * args.filter_w_stride + args.filter_w_pad)                       //
                     );
             }
 
-            const size_t mask_index = nn * mask_n_stride             //
-                                        + cc * mask_c_stride           //
-                                        + (size_t)(td * mask_d_stride) //
-                                        + (size_t)(tw * mask_h_stride) //
-                                        + (size_t)(th * mask_w_stride);
+            const size_t mask_index = nn * args.mask_n_stride             //
+                                        + cc * args.mask_c_stride           //
+                                        + (size_t)(td * args.mask_d_stride) //
+                                        + (size_t)(tw * args.mask_h_stride) //
+                                        + (size_t)(th * args.mask_w_stride);
             mask_ptr[mask_index] = res_index;
         }
 #endif
-        size_t top_index = nn * top_n_stride             //
-                                + cc * top_c_stride           //
-                                + (size_t)(td * top_d_stride) //
-                                + (size_t)(th * top_h_stride) //
-                                + (size_t)(tw * top_w_stride);
+        // top_index = nn * args.top_n_stride             //
+        //                         + cc * args.top_c_stride           //
+        //                         + (size_t)(td * args.top_d_stride) //
+        //                         + (size_t)(th * args.top_h_stride) //
+        //                         + (size_t)(tw * args.top_w_stride);
 
         top_ptr[top_index] = (_FLOAT)res;    // TEMPCODE RJS
-    }
+        top_ptr[top_index] = (_FLOAT)1.11111;    // TEMPCODE RJS
+
+        cc += blockDim.x;
 } // TEMPCODE
 }
 
@@ -355,16 +356,8 @@ extern "C" __global__ void mloPoolingForwardNDNhwcNaive(
                                     index_t* __restrict__ mask_ptr,
                                     int save_index,
                                     int index_mode,
-                                    uint32_t filter_d, uint32_t filter_h, uint32_t filter_w,
-                                    uint32_t filter_d_stride, uint32_t filter_h_stride, uint32_t filter_w_stride,
-                                    uint32_t filter_d_pad, uint32_t filter_h_pad, uint32_t filter_w_pad,
-                                    uint32_t all_n,
-                                    uint32_t all_c,
-                                    uint32_t bot_d, uint32_t bot_h, uint32_t bot_w,
-                                    size_t bot_n_stride, uint32_t bot_c_stride, size_t bot_d_stride, uint32_t bot_h_stride, uint32_t bot_w_stride,
-                                    uint32_t top_d, uint32_t top_h, uint32_t top_w,
-                                    size_t top_n_stride, uint32_t top_c_stride, size_t top_d_stride, uint32_t top_h_stride, uint32_t top_w_stride,
-                                    size_t mask_n_stride, size_t mask_c_stride, uint32_t mask_d_stride, uint32_t mask_h_stride, uint32_t mask_w_stride)
+poolingNdNhwcArgs args
+)
 {
     poolingForwardNDNhwcNaive<INPUT_TYPE, OUTPUT_TYPE>(
         bot_ptr,
@@ -373,15 +366,16 @@ extern "C" __global__ void mloPoolingForwardNDNhwcNaive(
         mask_ptr,
         save_index,
         index_mode,
-        filter_d, filter_h, filter_w,
-        filter_d_stride, filter_h_stride, filter_w_stride,
-        filter_d_pad, filter_h_pad, filter_w_pad,
-        all_n,
-        all_c,
-        bot_d, bot_h, bot_w,
-        bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride, bot_w_stride,
-        top_d, top_h, top_w,
-        top_n_stride, top_c_stride, top_d_stride, top_h_stride, top_w_stride,
-        mask_n_stride, mask_c_stride, mask_d_stride, mask_h_stride, mask_w_stride
+        args
+        // args.filter_d, args.filter_h, args.filter_w,
+        // args.filter_d_stride, args.filter_h_stride, args.filter_w_stride,
+        // args.filter_d_pad, args.filter_h_pad, args.filter_w_pad,
+        // args.all_n,
+        // args.all_c,
+        // args.bot_d, args.bot_h, args.bot_w,
+        // args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride,
+        // args.top_d, args.top_h, args.top_w,
+        // args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride, args.top_w_stride,
+        // args.mask_n_stride, args.mask_c_stride, args.mask_d_stride, args.mask_h_stride, args.mask_w_stride
     );
 }
diff --git a/src/pooling/problem_description.cpp b/src/pooling/problem_description.cpp
index ad36abfb08..804bc2bf84 100644
--- a/src/pooling/problem_description.cpp
+++ b/src/pooling/problem_description.cpp
@@ -83,11 +83,7 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const
         ss << "_dyd" << get_vect_config(dyDesc.GetLengths());
         ss << "_dys" << get_vect_config(dyDesc.GetStrides());
     }   // TEMPCODE RJS
-    std::cout << "\n************** xDesc layout: " << xDesc.GetLayout_str() << " *************************" << std::endl;
-    if(!xDesc.IsDefaultLayout())
-    {
-    std::cout <<   "               xDesc layout is not default! " << " *************************\n";
-    }
+    std::cout << "\n************** xDesc layout: " << xDesc.GetLayout_str() << (xDesc.IsDefaultLayout() ? "" : " (not default)") << " *************************" << std::endl;
     ss << "_l" << (xDesc.IsDefaultLayout() ? 0 : 1);
 std::cout << "               " << ss.str() << std::endl;
     return NetworkConfig{ss.str()};
diff --git a/src/solver/pooling/forward2d.cpp b/src/solver/pooling/forward2d.cpp
index d3fe890ecd..f1e1cd6e52 100644
--- a/src/solver/pooling/forward2d.cpp
+++ b/src/solver/pooling/forward2d.cpp
@@ -148,19 +148,18 @@ bool PoolingForward2d::IsApplicable(const ExecutionContext& context,
 
 // TEMPCODE RJS
     std::cout << "%%%%%%%%%% PoolingForward2d::IsApplicable: " << app << " " <<  problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout("NCHW") << std::endl;
-    return false;
                return app;
 }
 
 #include <iomanip>  // TEMPCODE RJS
 namespace {
     template<typename T>
-    void printVec(std::string name, const std::vector<T>& vec)
+    std::ostream& printVec(std::string name, const std::vector<T>& vec)
     {
-        return;
+        return std::cout;
         std::cout << "Vector Printing: " << std::setw(20) << name << "[" << vec.size() << "]: ";
         for(auto i : vec)    std::cout << std::setw(8) << i;
-        std::cout << std::endl;
+        return std::cout;
     }
 }
 
@@ -202,22 +201,21 @@ ConvSolution PoolingForward2d::GetSolution(const ExecutionContext&,
 
 
     // TEMPCODE RJS
-        const auto bot  = problem.GetXDesc();
-    const auto top  = problem.GetYDesc();
-    const auto& pooling = problem.GetPooling();
-    const auto& lengths = pooling.GetLengths();
-    const auto& strides = pooling.GetStrides();
-    const auto& pads    = pooling.GetPads();
-
-    std::cout << "======================================================================" << std::endl;
-    printVec("bot lengths", bot.GetLengths());
-    printVec("bot strides", bot.GetStrides());
-    printVec("top lengths", top.GetLengths());
-    printVec("top strides", top.GetStrides());
-    printVec("pool lengths", lengths);
-    printVec("pool strides", strides);
-    printVec("pool pads", pads);
-    std::cout << "======================================================================" << std::endl;
+    // const auto bot  = problem.GetXDesc();
+    // const auto top  = problem.GetYDesc();
+    // const auto& pooling = problem.GetPooling();
+    // const auto& lengths = pooling.GetLengths();
+    // const auto& strides = pooling.GetStrides();
+    // const auto& pads    = pooling.GetPads();
+
+    // std::cout << "PoolingForward2d GetSolution: " << std::endl;
+    // printVec("   bot lengths", bot.GetLengths()) <<
+    // printVec("   bot strides", bot.GetStrides()) << std::endl;
+    // printVec("   top lengths", top.GetLengths()) <<
+    // printVec("   top strides", top.GetStrides()) << std::endl;
+    // printVec("   pool lengths", lengths) <<
+    // printVec("   pool strides", strides) <<
+    // printVec("   pool pads", pads) << std::endl;
 
         auto build_params = KernelBuildParameters{
             {"MLO_POOLING_OP_ID", pooling_method},
diff --git a/src/solver/pooling/forwardNdNhwcNaive.cpp b/src/solver/pooling/forwardNdNhwcNaive.cpp
index 70e4d998cf..77b5a6ea2d 100644
--- a/src/solver/pooling/forwardNdNhwcNaive.cpp
+++ b/src/solver/pooling/forwardNdNhwcNaive.cpp
@@ -30,6 +30,8 @@
 #include <miopen/pooling/invoke_params.hpp>
 #include <miopen/pooling/solvers.hpp>
 
+#include <miopen/pooling/poolingNdNhwcArgs.hpp>
+
 #define WORKAROUND_ISSUE_MIFIN_80 1 // https://github.com/ROCm/MIFin/issues/80
 
 namespace miopen {
@@ -98,7 +100,7 @@ bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&,
 #include <iomanip>  // TEMPCODE RJS
 namespace {
     template<typename T>
-    void printVec(std::string name, const std::vector<T>& vec)
+    void printVec(std::string name, std::vector<T> vec)
     {
          return;
       std::cout << "Vector Printing: " << std::setw(20) << name << "[" << vec.size() << "]: ";
@@ -112,6 +114,7 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context,
                                  const miopen::pooling::ProblemDescription& problem) const
 {
     auto result = ConvSolution{miopenStatusSuccess};
+    poolingNdNhwcArgs args; 
 
     auto input_dtype  = miopen::GetDataType(problem.GetXDesc().GetType());
     auto output_dtype = miopen::GetDataType(problem.GetYDesc().GetType());
@@ -132,15 +135,34 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context,
     const auto& pads    = pooling.GetPads();
 
     // This also deduces 3D (DHW) parameters from 2D (HW) descriptor.
-    const uint32_t filter_w        = lengths[is2d ? 1 : 2];
-    const uint32_t filter_h        = lengths[is2d ? 0 : 1];
-    const uint32_t filter_d        = is2d ? 1 : lengths[0];
-    const uint32_t filter_w_stride = strides[is2d ? 1 : 2];
-    const uint32_t filter_h_stride = strides[is2d ? 0 : 1];
-    const uint32_t filter_d_stride = is2d ? (filter_h_stride * filter_d) : strides[0];
-    const uint32_t filter_w_pad    = pads[is2d ? 1 : 2];
-    const uint32_t filter_h_pad    = pads[is2d ? 0 : 1];
-    const uint32_t filter_d_pad    = is2d ? 0 : pads[0];
+    uint32_t idx = 0;
+    args.filter_d        = is2d ? 1 : lengths[idx++];
+     args.filter_h        = lengths[idx++];
+     args.filter_w        = lengths[idx++];
+
+    idx = 0;
+     args.filter_d_stride = is2d ? (strides[0]) : strides[idx++];
+     args.filter_h_stride = strides[idx++];
+     args.filter_w_stride = strides[idx++];
+
+    idx = 0;
+    args.filter_d_pad    = is2d ? 0 : pads[idx++];
+    args.filter_h_pad    = pads[idx++];
+    args.filter_w_pad    = pads[idx++];
+    // uint32_t idx = 0;
+    // const uint32_t filter_d        = is2d ? 1 : lengths[idx++];
+    // const uint32_t filter_h        = lengths[idx++];
+    // const uint32_t filter_w        = lengths[idx++];
+
+    // idx = 0;
+    // const uint32_t filter_d_stride = is2d ? (strides[0]) : strides[idx++];
+    // const uint32_t filter_h_stride = strides[idx++];
+    // const uint32_t filter_w_stride = strides[idx++];
+
+    // idx = 0;
+    // const uint32_t filter_d_pad    = is2d ? 0 : pads[idx++];
+    // const uint32_t filter_h_pad    = pads[idx++];
+    // const uint32_t filter_w_pad    = pads[idx++];
 
     const int pooling_method = (pooling.GetMode() == miopenPoolingMax) ? MLO_POOLING_OP_MAX
                                : (pooling.GetMode() == miopenPoolingAverage)
@@ -165,7 +187,7 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context,
     /// requires it because the total number of muls is 4.
 
     // TEMPCODE RJS
-    std::cout << "======================================================================" << std::endl;
+    printVec("======================================================================", std::vector<int>{});
     printVec("bot lengths", bot.GetLengths());
     printVec("bot strides", bot.GetStrides());
     printVec("top lengths", top.GetLengths());
@@ -173,39 +195,35 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context,
     printVec("pool lengths", lengths);
     printVec("pool strides", strides);
     printVec("pool pads", pads);
-    std::cout << "======================================================================" << std::endl;
+    printVec("======================================================================", std::vector<int>{});
 
     const auto spatial_dim = is2d ? 2U : 3U;
 
-    uint32_t all_n, all_c, bot_d, bot_h, bot_w;
-    std::tie(all_n, all_c, bot_d, bot_h, bot_w) = miopen::GetNCDHW(spatial_dim, bot.GetLengths());
-    std::cout << "GetSol: bot_lens " << all_n << " " << all_c << " " << bot_d << " " << bot_h << " " << bot_w << std::endl;
+    // uint32_t all_n, all_c, bot_d, bot_h, bot_w;
+    std::tie(args.all_n, args.all_c, args.bot_d, args.bot_h, args.bot_w) = miopen::GetNCDHW(spatial_dim, bot.GetLengths());
+std::cout << "GetSol: bot_lens " << args.all_n << " " << args.all_c << " " << args.bot_d << " " << args.bot_h << " " << args.bot_w << std::endl;
 
-    size_t bot_n_stride, bot_d_stride;
-    uint32_t bot_h_stride, bot_w_stride, bot_c_stride;
-    std::tie(bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride, bot_w_stride) =
+    std::tie(args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride) =
         miopen::GetNCDHW(spatial_dim, bot.GetStrides());
-std::cout << "GetSol: bot_strides " << bot_n_stride << " " << bot_c_stride << " " << bot_d_stride
-<< " " << bot_h_stride << " " << bot_w_stride  << std::endl;
+std::cout << "GetSol: bot_strides " << args.bot_n_stride << " " << args.bot_c_stride << " " << args.bot_d_stride
+<< " " << args.bot_h_stride << " " << args.bot_w_stride  << std::endl;
 
-    uint32_t top_d, top_h, top_w;
-    std::tie(std::ignore, std::ignore, top_d, top_h, top_w) =
+    std::tie(std::ignore, std::ignore, args.top_d, args.top_h, args.top_w) =
         miopen::GetNCDHW(spatial_dim, top.GetLengths());
-    std::cout << "GetSol: top_lens " << top_d << " " << top_h << " " << top_w << std::endl;
+std::cout << "GetSol: top_lens " << args.top_d << " " << args.top_h << " " << args.top_w << std::endl;
 
-    size_t top_n_stride, top_d_stride;
-    uint32_t top_h_stride, top_w_stride, top_c_stride;
-    std::tie(top_n_stride, top_c_stride, top_d_stride, top_h_stride, top_w_stride) =
+    std::tie(args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride,args. top_w_stride) =
         miopen::GetNCDHW(spatial_dim, top.GetStrides());
     // TEMPCODE RJS
-    std::cout << "GetSol: top_strides " << top_n_stride << " " << top_c_stride << " "
-    << top_d_stride << " " << top_h_stride << " " << top_w_stride << std::endl;
+std::cout << "GetSol: top_strides " << args.top_n_stride << " " << args.top_c_stride << " "
+    << args.top_d_stride << " " << args.top_h_stride << " " << args.top_w_stride << std::endl;
+
     // Mask data is always NCDHW layout
-    const uint32_t mask_w_stride = 1;
-    const uint32_t mask_h_stride = mask_w_stride * top_w;
-    const uint32_t mask_d_stride = mask_h_stride * top_h;
-    const size_t mask_c_stride   = static_cast<size_t>(mask_d_stride) * top_d;
-    const size_t mask_n_stride   = mask_c_stride * all_c;
+    args.mask_w_stride = 1;
+    args.mask_h_stride = args.mask_w_stride * args.top_w;
+    args.mask_d_stride = args.mask_h_stride * args.top_h;
+    args.mask_c_stride   = static_cast<BIGONE>(args.mask_d_stride) * args.top_d;
+    args.mask_n_stride   = args.mask_c_stride * args.all_c;
 
     /// About optimal grid size:
     /// top D, H, and W are mapped directly onto grid dimensions, except in very small problems
@@ -229,34 +247,40 @@ std::cout << "GetSol: bot_strides " << bot_n_stride << " " << bot_c_stride << "
     /// workitems because the kernel does not require synchronization.
 
     std::ignore = context;
-    constexpr uint32_t LARGE_C_MAX_ITEMS = 512;
+    constexpr uint32_t MAX_THREADS       = 512;
+    constexpr uint32_t LARGE_C_MAX_ITEMS = MAX_THREADS;
     constexpr uint32_t SMALL_C_MAX_ITEMS = 128;
 
-    auto nd_ = all_n * top_d;
-    auto h_  = top_h;
-    auto w_  = top_w;
-    auto c_  = all_c;
+    auto nd_ = args.all_n * args.top_d;
+    auto h_  = args.top_h;
+    auto w_  = args.top_w;
+    auto c_  = args.all_c;
+std::cout << "nd_ " << nd_ << " h_ " << h_ << " w_ " << w_ << " c_ " << c_ << std::endl;
 
     uint32_t l1 = 1U;
     uint32_t l2 = 1U;
 
     if(c_ > LARGE_C_MAX_ITEMS)
     {
-        auto c2 = c_ / LARGE_C_MAX_ITEMS + 1;
+        auto c2 = (c_ + LARGE_C_MAX_ITEMS - 1) / LARGE_C_MAX_ITEMS;
         c_ = LARGE_C_MAX_ITEMS;
         w_ *= c2;
     }
     // else if(c_ <= SMALL_C_MAX_ITEMS / 2)
     // {
-    //     if(c_ * w_ <= SMALL_C_MAX_ITEMS)
+    //     if(c_ * w_ <= MAX_THREADS)
     //     {
     //         std::swap(l2, w_);
-
-    //         if(c_ * w_ * h_ <= SMALL_C_MAX_ITEMS)
+    //
+    //         if(c_ * w_ * h_ <= MAX_THREADS)
     //         {
     //             std::swap(l1, h_);
     //         }
     //     }
+    //     else if(c_ * h_ <= MAX_THREADS)
+    //     {
+    //         std::swap(l1, h_);
+    //     }
     // }
 
     const auto g0 = nd_;
@@ -289,18 +313,20 @@ std::cout << "GetSol: bot_strides " << bot_n_stride << " " << bot_c_stride << "
         // * 2: layout (NCHW vs NHWC)
         // * 2: 2D and 3D kernels (optimization)
 
-        kernel.g_wk.clear();
-        kernel.g_wk.push_back(g0);
-        kernel.g_wk.push_back(g1);
-        kernel.g_wk.push_back(g2);
+        // l1 = 11;
+        // l2 = 11;
         kernel.l_wk.clear();
         kernel.l_wk.push_back(l0);
         kernel.l_wk.push_back(l1);
         kernel.l_wk.push_back(l2);
+        kernel.g_wk.clear();
+        kernel.g_wk.push_back(g0 * l0);
+        kernel.g_wk.push_back(g1 * l1);
+        kernel.g_wk.push_back(g2 * l2);
 
         // TEMPCODE RJS
-        std::cout << "Kernel dims: g[" << kernel.g_wk.size() << "] " << kernel.g_wk[0] << " " << kernel.g_wk[1] << " " << kernel.g_wk[2]
-        << " | l[" << kernel.l_wk.size() << "] " << kernel.l_wk[0] << " " << kernel.l_wk[1] << " " << kernel.l_wk[2] << std::endl;
+std::cout << "Kernel dims: g[" << kernel.g_wk.size() << "] " << kernel.g_wk[0] << " " << kernel.g_wk[1] << " " << kernel.g_wk[2]
+<< " | l[" << kernel.l_wk.size() << "] " << kernel.l_wk[0] << " " << kernel.l_wk[1] << " " << kernel.l_wk[2] << std::endl;
         result.construction_params.push_back(kernel);
     }
 
@@ -316,16 +342,17 @@ std::cout << "GetSol: bot_strides " << bot_n_stride << " " << bot_c_stride << "
                 params.workspace,
                 save_index,
                 index_mode,
-                filter_d, filter_h, filter_w,
-                filter_d_stride, filter_h_stride, filter_w_stride,
-                filter_d_pad, filter_h_pad, filter_w_pad,
-                all_n,
-                all_c,
-                bot_d, bot_h, bot_w,
-                bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride, bot_w_stride,
-                top_d, top_h, top_w,
-                top_n_stride, top_c_stride, top_d_stride, top_h_stride, top_w_stride,
-                mask_n_stride, mask_c_stride, mask_d_stride, mask_h_stride, mask_w_stride);
+                args.filter_d, args.filter_h, args.filter_w,
+                args.filter_d_stride, args.filter_h_stride, args.filter_w_stride,
+                args.filter_d_pad, args.filter_h_pad, args.filter_w_pad,
+                args.all_n,
+                args.all_c,
+                args.bot_d, args.bot_h, args.bot_w,
+                args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride,
+                args.top_d, args.top_h, args.top_w,
+                args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride, args.top_w_stride,
+                args.mask_n_stride, args.mask_c_stride, args.mask_d_stride, args.mask_h_stride, args.mask_w_stride
+            );
         };
     };
 
diff --git a/test/gtest/poolingFwdNdNaive.cpp b/test/gtest/poolingFwdNdNaive.cpp
index 0b7682ccc3..abf9cb0509 100644
--- a/test/gtest/poolingFwdNdNaive.cpp
+++ b/test/gtest/poolingFwdNdNaive.cpp
@@ -103,7 +103,7 @@ struct layout_data
     void read_gpu_data(miopen::Handle& handle, const miopen::Allocator::ManageDataPtr& ddata)
     {
         check      = tensor<T>{descriptor.GetLengths(), descriptor.GetStrides()};
-        check.data = handle.Read<T>(ddata, check.data.size());
+        handle.ReadTo(check.data.data(), ddata, check.data.size());
     }
 
     tensor<T> check{};
@@ -140,7 +140,7 @@ std::vector<std::string> GetTestCases(const std::string precision)
 
     const std::vector<std::string> test_cases = {
         // clang-format off
-    {"test_pooling2d " + precision + " --all --dataset 1 --limit 0 " + flag_arg}
+    {"test_pooling2d " + precision + " --all --dataset 1 --limit 0 " + flag_arg}    // TEMPCODE RJS DATASET
         // clang-format on
     };
 
@@ -154,7 +154,7 @@ TEST_P(PoolingFwdFloat, NNT)    // NDNaiveTranspose
     const auto& handle = get_handle();
     if(!IsTestSupportedForDevice(handle))   std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl;
     if(SkipTest())                          std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl;
-    // if(!IsTestRunWith("--float"))           std::cout << "WOULD SKIP BECAUSE NOT FLOAT!" << std::endl;
+    if(!IsTestRunWith("--float"))           std::cout << "WOULD SKIP BECAUSE NOT FLOAT!" << std::endl;
         // Run2dDriver(miopenFloat);   return; // TEMPCODE RJS
     //  && IsTestRunWith("--float")
     if(IsTestSupportedForDevice(handle) && !SkipTest())
@@ -172,7 +172,7 @@ TEST_P(PoolingFwdHalf, NNT)
     const auto& handle = get_handle();
     if(!IsTestSupportedForDevice(handle))   std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl;
     if(SkipTest())                          std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl;
-    // if(!IsTestRunWith("--half"))           std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl;
+    if(!IsTestRunWith("--half"))           std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl;
 
     if(IsTestSupportedForDevice(handle) && !SkipTest()) //  && IsTestRunWith("--half") TEMPCODE RJS
     {
diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp
index a9ca1446b5..fb1556bc1a 100644
--- a/test/pooling2d.hpp
+++ b/test/pooling2d.hpp
@@ -57,7 +57,7 @@ struct pooling2d_shapes
     }
 
     // Dataset 1 is intended for testing of asymmetric configs.
-    static std::vector<U> get_2d_pooling_input_shapes_minimal() { return {{1, 1, 8, 8}}; }
+    static std::vector<U> get_2d_pooling_input_shapes_minimal() { return {{1, 2, 8, 8}}; } // {1, 1, 8, 8}, 
 
     // Dataset 2 is intended for testing of configs with wide window.
     static std::vector<U> get_2d_pooling_input_shapes_wide()
@@ -110,7 +110,7 @@ struct pooling2d_driver : pooling_driver<T>
         this->add(this->strides,
                   "strides",
                   this->template generate_multi_data<U>({{{2, 2}, {1, 1}},                 //
-                                                         {{1, 1}, {2, 1}, {1, 2}, {2, 2}}, //
+                                                         {{2, 2}, {2, 1}}, // , {1, 2}, {1, 1}
                                                          {{1, 1}}}));
         // clang-format off
         this->add(this->pads, "pads", this->template generate_multi_data<U>({
@@ -123,6 +123,6 @@ struct pooling2d_driver : pooling_driver<T>
             {{0, 0}}}));
         // clang-format on
         this->add(this->wsidx, "wsidx", this->generate_data({0, 1}));
-        this->add(this->layout, "layout", this->generate_data({miopenTensorNHWC})); // , miopenTensorNHWC
+        this->add(this->layout, "layout", this->generate_data({miopenTensorNHWC})); // , miopenTensorNCHW
     }
 };
diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp
index 6bbd949f78..8c84180663 100644
--- a/test/pooling_common.hpp
+++ b/test/pooling_common.hpp
@@ -102,7 +102,6 @@ tensor<T> get_big_output_tensor(const miopen::PoolingDescriptor& filter, const t
     auto lens = desc.GetLengths();
     lens[0] *= 10;
     auto big = miopen::TensorDescriptor{desc.GetType(), input.desc.GetLayout_t(), lens, desc.GetStrides()};
-    std::cout << "get_big_output_tensor: " << input.desc.GetLayout_str() << " " << desc.GetLayout_str() << std::endl;
     return tensor<T>{big};
 }
 
@@ -143,6 +142,9 @@ struct pooling_operators
 };
 
 #include <iomanip>
+#define MAX_PRINT 16    // TEMPCODE RJS
+#define GPU_JUNK 240
+#define GPU_4COL false
 
 template <int SptDim>
 struct verify_forward_pooling
@@ -168,35 +170,8 @@ struct verify_forward_pooling
         auto pooler = pooling_operators<T>{filter};
 
         // TEMPCODE RJS print input tensor
-        bool printing = true; // in_dim[0]==8 && in_dim[1]==8;
-        if(printing)
-        {
-            auto inlen = input.desc.GetLengths();
-            auto instr = input.desc.GetStrides();
-            std::cout << "CPU in : m" << filter.GetMode() << " t" << input.desc.GetType() << " | ";
-            for(auto dim : inlen) std::cout << std::setw(4) << dim;
-            std::cout << " | ";
-            for(auto str : instr) std::cout << std::setw(4) << str;
-            std::cout << " | ";
-            for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str;
-            std::cout << " | ";
-            for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str;
-            std::cout << " | ";
-            for(auto str : filter.GetPads()) std::cout << std::setw(4) << str;
-            std::cout << std::endl;
-
-            for(int nn = 0; nn < inlen[0]; ++nn) {
-                for(int cc = 0; cc < inlen[1]; ++cc) {
-                    for(int hh = 0; hh < inlen[2]; ++hh) {
-                        for(int ww = 0; ww < inlen[3]; ++ww) {
-                            std::cout << std::setw(11) << std::setprecision(5) << input.data[input.desc.GetIndex(nn, cc, hh, ww)] << "  ";
-                        }
-                        std::cout << std::endl;
-                    }
-                }
-            }
-        }
-
+        bool printing = in_dim[0]<=MAX_PRINT && in_dim[1]<=MAX_PRINT;
+        if(in_dim.size() > 2) printing &= in_dim[2]<=MAX_PRINT;
         int b_n = out.desc.GetLengths()[0];
         int k_n = out.desc.GetLengths()[chan_dim_offset];
         std::array<int, SptDim> out_spatial_len{};
@@ -262,6 +237,7 @@ struct verify_forward_pooling
 
             for(int nn = 0; nn < outlen[0]; ++nn) {
                 for(int cc = 0; cc < outlen[1]; ++cc) {
+            std::cout << "n=" << nn << " c=" << cc <<std::endl;
             for(int hh = 0; hh < outlen[2]; ++hh) {
                 for(int ww = 0; ww < outlen[3]; ++ww) {
                     std::cout << std::setw(11) << std::setprecision(5) << out.data[nn * outstr[0] + cc * outstr[1] + hh * outstr[2] + ww * outstr[3]] << "  ";
@@ -271,6 +247,33 @@ struct verify_forward_pooling
             }
             }
         }   // print output tensor
+        if(false && printing)
+        {
+            auto inlen = input.desc.GetLengths();
+            auto instr = input.desc.GetStrides();
+            std::cout << "CPU in : m" << filter.GetMode() << " t" << input.desc.GetType() << " | ";
+            for(auto dim : inlen) std::cout << std::setw(4) << dim;
+            std::cout << " | ";
+            for(auto str : instr) std::cout << std::setw(4) << str;
+            std::cout << " | ";
+            for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str;
+            std::cout << " | ";
+            for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str;
+            std::cout << " | ";
+            for(auto str : filter.GetPads()) std::cout << std::setw(4) << str;
+            std::cout << std::endl;
+
+            for(int nn = 0; nn < inlen[0]; ++nn) {
+                for(int cc = 0; cc < inlen[1]; ++cc) {
+                    for(int hh = 0; hh < inlen[2]; ++hh) {
+                        for(int ww = 0; ww < inlen[3]; ++ww) {
+                            std::cout << std::setw(11) << std::setprecision(5) << input.data[input.desc.GetIndex(nn, cc, hh, ww)] << "  ";
+                        }
+                        std::cout << std::endl;
+                    }
+                }
+            }
+        }
 
         return out;
     }
@@ -307,9 +310,26 @@ struct verify_forward_pooling
 
         indices  = wspace.Read<std::vector<Index>>();
         handle.ReadTo(out.data.data(), out_dev, out.data.size() * sizeof(T));
-        handle.ReadTo(junk.data.data(), junk_dev, junk.data.size() * sizeof(T));
-        if(false)
+        bool printing = input.desc.GetLengths()[2] <= MAX_PRINT && input.desc.GetLengths()[3] <= MAX_PRINT;
+        if(input.desc.GetLengths().size() > 4) printing &= input.desc.GetLengths()[4] <= MAX_PRINT;
+        if(printing)
         {
+            if(GPU_4COL){
+                std::cout << "GPU (8-cols): " << std::endl;
+                for(int idx = 0; idx < GPU_JUNK; ++idx) {
+                    std::cout << std::setw(11) << std::setprecision(5) << out.data[idx] << "  ";
+                    if((idx % 8) == 7)  std::cout <<std::endl;
+                }
+            }
+            if(GPU_JUNK > 0){
+        handle.ReadTo(junk.data.data(), junk_dev, junk.data.size() * sizeof(T));
+                std::cout << "GPU junk: " << std::endl;
+                for(int idx = 0; idx < GPU_JUNK; ++idx) {
+                    std::cout << std::setw(11) << std::setprecision(5) << junk.data[idx] << "  ";
+                    if((idx % 4) == 3)  std::cout <<std::endl;
+                }
+            }
+
             std::cout << "GPU out: ";
             auto outlen = out.desc.GetLengths();
             for(auto dim : outlen) std::cout << std::setw(4) << dim;
@@ -323,29 +343,20 @@ struct verify_forward_pooling
             std::cout << " | ";
             for(auto str : filter.GetPads()) std::cout << std::setw(4) << str;
             std::cout << std::endl;
-
             for(int nn = 0; nn < outlen[0]; ++nn) {
                 for(int cc = 0; cc < outlen[1]; ++cc) {
+                    std::cout << "n=" << nn << " c=" << cc <<std::endl;
                     for(int hh = 0; hh < outlen[2]; ++hh) {
                         for(int ww = 0; ww < outlen[3]; ++ww) {
-                            std::cout << std::setw(11) << std::setprecision(5) << out.data[out.desc.GetIndex(nn, cc, hh, ww)] << "  ";
+                            std::cout << std::setw(11) << std::setprecision(5) << out.data[
+                                nn * outstr[0] + cc * outstr[1] + hh * outstr[2] + ww * outstr[2]
+                                // out.desc.GetIndex(nn, cc, hh, ww)
+                                ] << "  ";
                         }
                         std::cout << std::endl;
                     }
                 }
             }
-            if(false){
-            std::cout << "GPU out (4-cols): " << std::endl;
-            for(int idx = 0; idx < 160; ++idx) {
-                std::cout << std::setw(11) << std::setprecision(5) << out.data[idx] << "  ";
-                if((idx % 4) == 3)  std::cout <<std::endl;
-            }
-            std::cout << "GPU junk: " << std::endl;
-            for(int idx = 0; idx < 160; ++idx) {
-                std::cout << std::setw(11) << std::setprecision(5) << junk.data[idx] << "  ";
-                if((idx % 4) == 3)  std::cout <<std::endl;
-            }
-            }
         }   // print output tensor
         return out;
     }
@@ -621,14 +632,14 @@ struct pooling_driver : test_driver
     {
         add(index_type,
             "index_type",
-            generate_data({"miopenIndexUint32",}    // TEMPCODE RJS
-            // generate_multi_data<const char*>( //
-            //     {{"miopenIndexUint8",
-            //       "miopenIndexUint16",
-            //       "miopenIndexUint32",
-            //       "miopenIndexUint64"},                     //
-            //      {"miopenIndexUint8", "miopenIndexUint32"}, //
-            //      {"miopenIndexUint32"}}                     //
+            // generate_data({"miopenIndexUint32",}    // TEMPCODE RJS
+            generate_multi_data<const char*>( //
+                {{"miopenIndexUint8",
+                  "miopenIndexUint16",
+                  "miopenIndexUint32",
+                  "miopenIndexUint64"},                     //
+                 {"miopenIndexUint8", "miopenIndexUint32"}, //
+                 {"miopenIndexUint32"}}                     //
                 ));
         add(mode,
             "mode",
@@ -648,7 +659,8 @@ struct pooling_driver : test_driver
         for(auto& v : input.data)   v = gen_value<T>();
 
         // TEMPCODE RJS print input tensor
-        bool printing = true; // in_dim[0]==8 && in_dim[1]==8;
+        bool printing = in_shape[0]<=MAX_PRINT && in_shape[1]<=MAX_PRINT;
+        if (in_shape.size() > 2) printing &= in_shape[2]<=MAX_PRINT;
         if(printing)
         {
             auto inlen = input.desc.GetLengths();

From ea745efd340e083fee8e9aef65f500567a3c81e3 Mon Sep 17 00:00:00 2001
From: "Randy J. Spaulding" <rspauldi@amd.com>
Date: Thu, 29 Aug 2024 22:38:35 -0700
Subject: [PATCH 05/10] fwd works!

---
 src/include/miopen/pooling.hpp                |    2 +
 src/include/miopen/tensor.hpp                 |   25 +-
 .../MIOpenPoolingForwardNDNhwcNaive.cpp       |  142 +-
 src/kernels/pooling_functions.h               |    2 +-
 src/pooling.cpp                               |    5 +
 src/solver/pooling/forwardNdNhwcNaive.cpp     |    4 +-
 test/gtest/poolingFwdNdNaive.cpp              |    5 +-
 test/gtest/pooling_testing.hpp                | 1149 +++++++++++++++++
 test/pooling2d.hpp                            |   45 +-
 test/pooling_common.hpp                       |   22 +-
 10 files changed, 1268 insertions(+), 133 deletions(-)
 create mode 100644 test/gtest/pooling_testing.hpp

diff --git a/src/include/miopen/pooling.hpp b/src/include/miopen/pooling.hpp
index 2670c3b09c..9cf2bae82f 100644
--- a/src/include/miopen/pooling.hpp
+++ b/src/include/miopen/pooling.hpp
@@ -113,6 +113,8 @@ struct MIOPEN_EXPORT PoolingDescriptor : miopenPoolingDescriptor
     void SetWorkspaceIndexMode(miopenPoolingWorkspaceIndexMode_t workspace_index);
 
     miopenPoolingMode_t GetMode() const;
+    
+    bool ModeIsAveraging() const;
 
     miopenPaddingMode_t GetPaddingMode() const;
 
diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp
index 30a197e503..ec368dbb83 100644
--- a/src/include/miopen/tensor.hpp
+++ b/src/include/miopen/tensor.hpp
@@ -293,6 +293,18 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor
         }
     }
 
+    static bool IsDefaultLayout(miopenTensorLayout_t layout, unsigned spatial_dims = 2)
+    {
+        switch (spatial_dims)
+        {
+            case 2:
+            case 3:
+                return layout == GetDefaultLayout();
+            default:
+                MIOPEN_THROW(miopenStatusBadParm, "Spatial dimension count must be 2 or 3.");
+        }
+    }
+
     friend MIOPEN_INTERNALS_EXPORT std::ostream& operator<<(std::ostream& stream,
                                                             const TensorDescriptor& t);
 
@@ -311,19 +323,6 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor
         }
     };
 
-    static bool IsDefaultLayout(miopenTensorLayout_t layout, unsigned spatial_dims = 2)
-    {
-        switch (spatial_dims)
-        {
-            case 2:
-            case 3:
-                return layout == GetDefaultLayout();
-            default:
-                MIOPEN_THROW(miopenStatusBadParm, "Spatial dimension count must be 2 or 3.");
-        }
-    }
-
-
 private:
     TensorDescriptor(miopenDataType_t t,
                      miopenTensorLayout_t layout_in,
diff --git a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
index a6aa93aade..b194b6d44c 100644
--- a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
+++ b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
@@ -53,6 +53,8 @@
 #include "pooling_functions.h"
 #include "poolingNdNhwcArgs.hpp"
 
+// TODO: add ability to decode network string into pooling descriptor or similar for targeted debugging
+
 #if(MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE) || (MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE)
 #define AVERAGE_OPS 1
 #else
@@ -87,45 +89,13 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
                                     ARG_UNUSED_FOR_AVERAGE int save_index,
                                     ARG_UNUSED_FOR_AVERAGE int index_mode,
                                     poolingNdNhwcArgs args
-                                    // UU uint32_t filter_d,
-                                    // UU uint32_t filter_h,
-                                    // UU uint32_t filter_w,
-                                    // UU uint32_t filter_d_stride,
-                                    // UU uint32_t filter_h_stride,
-                                    // UU uint32_t filter_w_stride,
-                                    // UU uint32_t filter_d_pad,
-                                    // UU uint32_t filter_h_pad,
-                                    // UU uint32_t filter_w_pad,
-                                    // uint32_t all_n,
-                                    // UU uint32_t all_c, // TEMPCODE RJS
-                                    // UU uint32_t bot_d,
-                                    // UU uint32_t bot_h,
-                                    // UU uint32_t bot_w,
-                                    // UU BIGONE bot_n_stride,
-                                    // UU uint32_t bot_c_stride,
-                                    // UU BIGONE bot_d_stride,
-                                    // UU uint32_t bot_h_stride,
-                                    // UU uint32_t bot_w_stride,
-                                    // uint32_t top_d,
-                                    // uint32_t top_h,
-                                    // uint32_t top_w,
-                                    // BIGONE top_n_stride,
-                                    // uint32_t top_c_stride,
-                                    // BIGONE top_d_stride,
-                                    // uint32_t top_h_stride,
-                                    // uint32_t top_w_stride,
-                                    // UU ARG_UNUSED_FOR_AVERAGE BIGONE mask_n_stride,
-                                    // UU ARG_UNUSED_FOR_AVERAGE BIGONE mask_c_stride,
-                                    // UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_d_stride,
-                                    // UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_h_stride,
-                                    // UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_w_stride
 )
 {
     const uint32_t nn = blockIdx.x / args.top_d;                          // N=slow index
+    const auto c_base = (blockIdx.z / args.top_w) * blockDim.x;
     const uint32_t td = blockIdx.x % args.top_d;                          // top D=fast index
     const uint32_t th = blockIdx.y;  // top H
-    const uint32_t tw = blockIdx.z % args.all_c;  // top W=fast index
-    const auto c_base = (blockIdx.z / args.all_c) * blockDim.x;
+    const uint32_t tw = blockIdx.z % args.top_w;  // top W=fast index
     if(blockDim.x > args.all_c)
     {
         // // TODO: h, w, or both may be encoded into threadIdx
@@ -210,25 +180,17 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
         log_ptr[idx++] = args.mask_w_stride;
         #endif
         log_ptr[idx++] = -7;
-        while(idx < 64) log_ptr[idx++] = (_FLOAT)0;
     }
 
-    // if(nn >= args.all_n)
-    //     return;
-
-    // if(td >= args.top_d)
-    //     return;
-
-    // if(th >= args.top_h)
-    //     return;
-
-    // if(tw >= args.top_w)
-    //     return;
+    if(nn >= args.all_n) return;
+    if(td >= args.top_d) return;
+    if(th >= args.top_h) return;
+    if(tw >= args.top_w) return;
 
 if(true) {  // TEMPCODE RJS
     const auto int_dstart   = static_cast<int64_t>(td * args.filter_d_stride) - static_cast<int64_t>(args.filter_d_pad);
-    const auto dend           = static_cast<size_t>(min(int_dstart + static_cast<int64_t>(args.filter_d), static_cast<int64_t>(args.bot_d)));
-    const auto dstart         = static_cast<size_t>(max(int_dstart, 0));
+    const auto dend         = static_cast<size_t>(min(int_dstart + static_cast<int64_t>(args.filter_d), static_cast<int64_t>(args.bot_d)));
+    const auto dstart       = static_cast<size_t>(max(int_dstart, 0));
 
     const auto int_hstart   = static_cast<int>(th * args.filter_h_stride) - static_cast<int>(args.filter_h_pad);
     const auto hend             = static_cast<uint32_t>(min(int_hstart + static_cast<int>(args.filter_h), static_cast<int>(args.bot_h)));
@@ -239,7 +201,7 @@ if(true) {  // TEMPCODE RJS
     const auto wstart           = static_cast<uint32_t>(max(int_wstart, 0));
 
     uint32_t cc = c_base + threadIdx.x;
-    // if(cc > args.all_c) return;
+    if(cc >= args.all_c) return;
 
     size_t top_index = 
             nn * args.top_n_stride             // TEMPCODE RJS
@@ -247,9 +209,23 @@ if(true) {  // TEMPCODE RJS
             + (size_t)(td * args.top_d_stride) //
             + (size_t)(th * args.top_h_stride) //
             + (size_t)(tw * args.top_w_stride);
+    size_t junk_idx = 64 + 4 * th;
 if(true) {
-        top_ptr[top_index] = (TO)-1.11111;
-        junk_ptr[64 + top_index] = top_index;
+    if(nn == 0 && cc == 0 && td == 0 && tw < 8 && th == 0)
+    {
+        size_t bot_ncd = static_cast<size_t>(nn * args.bot_n_stride + cc * args.bot_c_stride + dstart * args.bot_d_stride);
+            size_t bot_ncdh = bot_ncd + hstart * args.bot_h_stride;
+                size_t bot_index = bot_ncdh + wstart * args.bot_w_stride;
+
+        junk_ptr[junk_idx++] = top_index;
+        junk_ptr[junk_idx++] = bot_index;
+        junk_ptr[junk_idx++] = dstart;
+        junk_ptr[junk_idx++] = dend;
+        junk_ptr[junk_idx++] = hstart;
+        junk_ptr[junk_idx++] = hend;
+        junk_ptr[junk_idx++] = wstart;
+        junk_ptr[junk_idx++] = wend;
+    }
 }
 
 #if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE
@@ -267,18 +243,18 @@ if(true) {
         uint32_t d_save          = 0;
         uint32_t h_save          = 0;
         uint32_t w_save          = 0;
+        uint32_t saved_index     = 0;
 #endif
+
+        size_t bot_ncd = static_cast<size_t>(nn * args.bot_n_stride + cc * args.bot_c_stride + dstart * args.bot_d_stride);
         for(size_t bd = dstart; bd < dend; ++bd)
         {
+            size_t bot_ncdh = bot_ncd + hstart * args.bot_h_stride;
             for(uint32_t bh = hstart; bh < hend; ++bh)
             {
+                size_t bot_index = bot_ncdh + wstart * args.bot_w_stride;
                 for(uint32_t bw = wstart; bw < wend; ++bw)
                 {
-                    const size_t bot_index = nn * args.bot_n_stride +           //
-                                            cc * args.bot_c_stride +           //
-                                            bd * args.bot_d_stride + //
-                                            static_cast<size_t>(bh * args.bot_h_stride) + //
-                                            static_cast<size_t>(bw * args.bot_w_stride);
 #if AVERAGE_OPS
                     res += static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]);
 #else // MAX
@@ -291,20 +267,34 @@ if(true) {
                             d_save = bd;
                             h_save = bh;
                             w_save = bw;
+                            saved_index = bot_index;
                         }
                     }
+    if(top_index == 1662 || (nn == 0 && cc == 0 && td == 0 && tw == 2 && th == 0))
+    {
+        junk_ptr[junk_idx++] = nn;
+        junk_ptr[junk_idx++] = cc;
+        junk_ptr[junk_idx++] = th;
+        junk_ptr[junk_idx++] = tw;
+        junk_ptr[junk_idx++] = bot_ptr[bot_index];
+        junk_ptr[junk_idx++] = bot_index;
+        junk_ptr[junk_idx++] = res;
+        junk_ptr[junk_idx++] = saved_index;
+    }
 #endif
+                    bot_index += args.bot_w_stride;
                 }
+                bot_ncdh += args.bot_h_stride;
             }
+            bot_ncd += args.bot_d_stride;
         }
 
 #if AVERAGE_OPS
         res *= CVT_FP32_2ACCUM(1.f) / static_cast<_FLOAT_ACCUM>(pool_size);
 #else // MAX
-res *= 1.0; // TEMPCODE RJS fix UNUSED
         if(save_index)
         {
-            index_t res_index = 0;
+            index_t res_index = 5150;
 
             // / Preventing overflow during computation of res_index:
             // / If Index is shorter than uint, then let's perform computation in 32-bit
@@ -316,10 +306,14 @@ res *= 1.0; // TEMPCODE RJS fix UNUSED
 
             if(found)
             {
-                if(index_mode == 1)
-                    res_index = (index_t)(d_save * args.bot_h * args.bot_w //
-                                            + h_save * args.bot_w       //
-                                            + w_save);
+                if(index_mode == 1) // TEMPCODE RJS
+                    res_index = saved_index;
+                    // res_index = (index_t)(              //
+                    //     nn * args.bot_n_stride          //
+                    //     + cc * args.bot_c_stride        //
+                    //     + d_save * args.bot_d_stride    //
+                    //     + h_save * args.bot_h_stride    //
+                    //     + w_save * args.bot_w_stride);
                 else
                     res_index = (index_t)(                                                    //
                         ((d_save - td * args.filter_d_stride + args.filter_d_pad) * args.filter_h * args.filter_w) //
@@ -331,21 +325,13 @@ res *= 1.0; // TEMPCODE RJS fix UNUSED
             const size_t mask_index = nn * args.mask_n_stride             //
                                         + cc * args.mask_c_stride           //
                                         + (size_t)(td * args.mask_d_stride) //
-                                        + (size_t)(tw * args.mask_h_stride) //
-                                        + (size_t)(th * args.mask_w_stride);
+                                        + (size_t)(th * args.mask_h_stride) //
+                                        + (size_t)(tw * args.mask_w_stride);
             mask_ptr[mask_index] = res_index;
         }
 #endif
-        // top_index = nn * args.top_n_stride             //
-        //                         + cc * args.top_c_stride           //
-        //                         + (size_t)(td * args.top_d_stride) //
-        //                         + (size_t)(th * args.top_h_stride) //
-        //                         + (size_t)(tw * args.top_w_stride);
-
-        top_ptr[top_index] = (_FLOAT)res;    // TEMPCODE RJS
-        top_ptr[top_index] = (_FLOAT)1.11111;    // TEMPCODE RJS
 
-        cc += blockDim.x;
+        top_ptr[top_index] = (_FLOAT)res;
 } // TEMPCODE
 }
 
@@ -367,15 +353,5 @@ poolingNdNhwcArgs args
         save_index,
         index_mode,
         args
-        // args.filter_d, args.filter_h, args.filter_w,
-        // args.filter_d_stride, args.filter_h_stride, args.filter_w_stride,
-        // args.filter_d_pad, args.filter_h_pad, args.filter_w_pad,
-        // args.all_n,
-        // args.all_c,
-        // args.bot_d, args.bot_h, args.bot_w,
-        // args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride,
-        // args.top_d, args.top_h, args.top_w,
-        // args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride, args.top_w_stride,
-        // args.mask_n_stride, args.mask_c_stride, args.mask_d_stride, args.mask_h_stride, args.mask_w_stride
     );
 }
diff --git a/src/kernels/pooling_functions.h b/src/kernels/pooling_functions.h
index c4821b10fb..6f53a2daab 100644
--- a/src/kernels/pooling_functions.h
+++ b/src/kernels/pooling_functions.h
@@ -40,7 +40,7 @@ typedef MLO_POOLING_INDEX_TYPE index_t;
 #define MLO_POOLING_OP_AVE_INCLUSIVE 3
 
 #ifndef MLO_POOLING_OP_ID
-#define MLO_POOLING_OP_ID 0
+#define MLO_POOLING_OP_ID 1
 #endif
 
 #endif // GUARD_POOLING_FUNCTIONS_H
diff --git a/src/pooling.cpp b/src/pooling.cpp
index 000e297f3c..e0d9112bd2 100644
--- a/src/pooling.cpp
+++ b/src/pooling.cpp
@@ -96,6 +96,11 @@ miopenPoolingWorkspaceIndexMode_t PoolingDescriptor::GetWorkspaceIndexMode() con
 
 miopenPoolingMode_t PoolingDescriptor::GetMode() const { return mode; }
 
+bool PoolingDescriptor::ModeIsAveraging() const
+{
+    return mode == miopenPoolingAverage || mode == miopenPoolingAverageInclusive;
+}
+
 miopenPaddingMode_t PoolingDescriptor::GetPaddingMode() const { return (pmode); }
 
 const std::vector<int>& PoolingDescriptor::GetLengths() const { return lens; }
diff --git a/src/solver/pooling/forwardNdNhwcNaive.cpp b/src/solver/pooling/forwardNdNhwcNaive.cpp
index 77b5a6ea2d..62b65249df 100644
--- a/src/solver/pooling/forwardNdNhwcNaive.cpp
+++ b/src/solver/pooling/forwardNdNhwcNaive.cpp
@@ -137,8 +137,8 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context,
     // This also deduces 3D (DHW) parameters from 2D (HW) descriptor.
     uint32_t idx = 0;
     args.filter_d        = is2d ? 1 : lengths[idx++];
-     args.filter_h        = lengths[idx++];
-     args.filter_w        = lengths[idx++];
+    args.filter_h        = lengths[idx++];
+    args.filter_w        = lengths[idx++];
 
     idx = 0;
      args.filter_d_stride = is2d ? (strides[0]) : strides[idx++];
diff --git a/test/gtest/poolingFwdNdNaive.cpp b/test/gtest/poolingFwdNdNaive.cpp
index abf9cb0509..3a27b3c786 100644
--- a/test/gtest/poolingFwdNdNaive.cpp
+++ b/test/gtest/poolingFwdNdNaive.cpp
@@ -31,6 +31,7 @@
 #include "get_handle.hpp"
 #include "test_env.hpp"
 
+#include "pooling_testing.hpp"
 #include "pooling2d.hpp"
 
 #include "tensor_holder.hpp"
@@ -140,7 +141,7 @@ std::vector<std::string> GetTestCases(const std::string precision)
 
     const std::vector<std::string> test_cases = {
         // clang-format off
-    {"test_pooling2d " + precision + " --all --dataset 1 --limit 0 " + flag_arg}    // TEMPCODE RJS DATASET
+    {"test_pooling2d " + precision + " --all --dataset 0 --limit 0 " + flag_arg}    // TEMPCODE RJS DATASET
         // clang-format on
     };
 
@@ -237,7 +238,7 @@ void Run2dDriver(miopenDataType_t prec)
 }
 
 INSTANTIATE_TEST_SUITE_P(Float, PoolingFwdFloat, testing::Values(GetTestCases("--float")));
-
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwdHalf);
 INSTANTIATE_TEST_SUITE_P(Half, PoolingFwdHalf, testing::Values(GetTestCases("--half")));
 
 #endif
diff --git a/test/gtest/pooling_testing.hpp b/test/gtest/pooling_testing.hpp
new file mode 100644
index 0000000000..f114be8764
--- /dev/null
+++ b/test/gtest/pooling_testing.hpp
@@ -0,0 +1,1149 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2019 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+// TODO: I've hijacked pooling_common here. This is a temporary workaround until
+// all pooling tests have been converted to gtest. This work has not been planned yet.
+#ifndef GUARD_MIOPEN_TEST_POOLING_COMMON_HPP
+#define GUARD_MIOPEN_TEST_POOLING_COMMON_HPP
+
+#include <gtest/gtest.h>
+#include <array>
+#include <iostream>
+#include <iterator>
+#include <strstream>
+#include <limits>
+#include <memory>
+#include <miopen/logger.hpp>
+#include <miopen/miopen.h>
+#include <miopen/pooling.hpp>
+#include <miopen/stringutils.hpp>
+#include <miopen/tensor.hpp>
+#include <utility>
+
+// #include "network_data.hpp"
+#include "driver.hpp"
+#include "get_handle.hpp"
+#include "tensor_holder.hpp"
+#include "verify.hpp"
+#include "cpu_conv.hpp"
+#include "workspace.hpp"
+
+#define TEST_PADDING_MODE 0
+
+namespace {
+int num_all_case = 0;
+int num_uint16_case = 0;
+int num_uint32_case = 0;
+int num_uint32_case_imgidx = 0;
+int num_uint64_case = 0;
+int num_uint64_case_imgidx = 0;
+constexpr int max_typed_cases = 5;
+constexpr int MAX_ALL_CASES = 0;
+
+constexpr int RAND_INTEGER_MAX = 12000;
+constexpr int RAND_INTEGER_MIN = -8800;
+
+template <typename T>
+auto gen_value =
+    [](auto... is) { return static_cast<T>(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX)) / 100; };
+}
+
+static inline void print(std::ostringstream& oss, const miopen::PoolingDescriptor& filter, bool is_default_layout)
+{
+    oss << "Pooling: ";
+    if(filter.GetMode() == miopenPoolingAverage)
+        oss << "Average";
+    else if(filter.GetMode() == miopenPoolingAverageInclusive)
+        oss << "AverageInclusive";
+    else
+        oss << "Max";
+    oss << std::endl;
+    oss << "Layout: " << (is_default_layout ? "default" : "transposed") << std::endl;  // TEMPCODE RJS
+    oss << "Lengths: ";
+    miopen::LogRange(oss, filter.GetLengths(), ", ") << std::endl;
+    oss << "Pads: ";
+    miopen::LogRange(oss, filter.GetPads(), ", ") << std::endl;
+    oss << "Strides: ";
+    miopen::LogRange(oss, filter.GetStrides(), ", ") << std::endl;
+}
+
+template <class T>
+tensor<T> get_output_tensor(const miopen::PoolingDescriptor& filter, const tensor<T>& input)
+{
+    return tensor<T>{filter.GetForwardOutputTensor(input.desc)};
+}
+
+template <class T>
+tensor<T> get_big_output_tensor(const miopen::PoolingDescriptor& filter, const tensor<T>& input)
+{
+    auto desc = filter.GetForwardOutputTensor(input.desc);
+    auto lens = desc.GetLengths();
+    if(desc.GetNumBytes() > 1000)
+        lens[0] *= 2;
+    else
+        lens[0] *= 10;
+    auto big = miopen::TensorDescriptor{desc.GetType(), input.desc.GetLayout_t(), lens, desc.GetStrides()};
+    return tensor<T>{big};
+}
+
+template <class T>
+struct pooling_operators
+{
+    miopen::PoolingDescriptor filter;
+    pooling_operators(miopen::PoolingDescriptor f) : filter(f) {}
+
+    double initialize() const
+    {
+        if(filter.GetMode() == miopenPoolingMax)
+            return std::numeric_limits<T>::lowest();
+        else
+            return 0.0;
+    }
+
+    double operator()(double x, double y) const
+    {
+        if(filter.GetMode() == miopenPoolingMax)
+        {
+            double m = std::max(x, y);
+            return (m);
+        }
+        else
+        {
+            return x + y;
+        }
+    }
+
+    double finalize(double x, double y)
+    {
+        if(filter.GetMode() == miopenPoolingMax)
+            return (x);
+        else
+            return x / y;
+    }
+};
+
+#include <algorithm>
+#include <iomanip>
+#define MAX_PRINTING 128    // TEMPCODE RJS
+#define MAX_PRINT 12    // TEMPCODE RJS
+#define MAX_NCD 2
+#define GPU_JUNK 160
+#define PRINT_CPU_IN 0
+#define PRINT_GPU_OUT 0
+#define GPU_4COL false
+#define PIPE std::cout
+
+namespace {
+    template<typename T>
+    std::vector<T> ClampNCS(const std::vector<T>& lens, std::vector<T>& strides)
+    {
+        std::vector<T> out;
+        bool is2d = lens.size() == 4;
+        constexpr T max_ncd = MAX_NCD;
+        constexpr T max_print = MAX_PRINT;
+        int idx = 0;
+
+        out.push_back(std::min(lens[idx++], max_ncd));
+        out.push_back(std::min(lens[idx++], max_ncd));
+        if(is2d)
+            out.push_back(1);
+        else
+            out.push_back(std::min(lens[idx++], max_ncd));
+
+        out.push_back(std::min(lens[idx++], max_print));
+        out.push_back(std::min(lens[idx++], max_print));
+
+        if(is2d)
+           strides.insert(strides.begin() + 2, {strides[2]});
+
+        return out;
+    }
+
+    template<typename T, typename S>
+    std::ostream& printClamped(std::ostream& oss, const std::vector<T>& out, const std::vector<S>& outlen, const std::vector<S>& outstr, int mode = 0)
+    {
+        auto ostr = outstr;
+        auto olen = ClampNCS(outlen, ostr);
+
+        for(int nn = 0; nn < olen[0]; ++nn) {
+            for(int cc = 0; cc < olen[1]; ++cc) {
+                for(int dd = 0; dd < olen[2]; ++dd) {
+                    oss << "n= " << nn << " c= " << cc << " d= " << dd << std::endl;
+                    for(int hh = 0; hh < olen[3]; ++hh) {
+                        for(int ww = 0; ww < olen[4]; ++ww) {
+switch(mode) {
+    case 0:
+                            oss << std::setw(11) << std::setprecision(5) << out[nn * ostr[0] + cc * ostr[1] + dd * ostr[2] + hh * ostr[3] + ww * ostr[4]] << "  ";
+                            break;
+    case 1:
+                            oss << std::setw(11) << std::setprecision(5) << nn * ostr[0] + cc * ostr[1] + dd * ostr[2] + hh * ostr[3] + ww * ostr[4] << "  ";
+                            break;
+}    
+                        }
+                        oss << std::endl;
+                    }
+                }
+            }
+        }
+        return oss;
+    }
+}
+
+    template<typename T, typename S>
+    std::ostream& printClamped(std::ostream& oss, const tensor<T>& out, const std::vector<S>& outlen, const std::vector<S>& outstr, int mode = 0)
+    {
+        return printClamped<T, S>(oss, out.data, outlen, outstr, mode);
+    }
+
+template <int SptDim>
+struct verify_forward_pooling
+{
+    template <class T, class Index>
+    tensor<T>
+    cpu(const tensor<T>& input, const miopen::PoolingDescriptor& filter, std::vector<Index>&) const
+    {
+        auto out = get_output_tensor(filter, input);
+
+        std::array<int, SptDim> in_dim{};
+        std::copy_n(input.desc.GetLengths().begin() + 2, SptDim, in_dim.begin());
+        std::array<int, SptDim> strides{};
+        std::copy_n(filter.GetStrides().begin(), SptDim, strides.begin());
+        std::array<int, SptDim> pads{};
+        std::copy_n(filter.GetPads().begin(), SptDim, pads.begin());
+        std::array<int, SptDim> kers{};
+        std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin());
+        auto pooler = pooling_operators<T>{filter};
+
+        // TEMPCODE RJS print input tensor
+        bool printing = in_dim[0] <= MAX_PRINTING && in_dim[1] <= MAX_PRINTING;
+        if(in_dim.size() > 2) printing &= in_dim[2] <= MAX_PRINTING;
+
+        int b_n = out.desc.GetLengths()[0];
+        int k_n = out.desc.GetLengths()[1];
+        std::array<int, SptDim> out_spatial_len{};
+        std::copy_n(out.desc.GetLengths().begin() + 2, SptDim, out_spatial_len.begin());
+
+        auto par_ford_out =
+            miopen::unpacker(miopen::prepender(par_ford, b_n, k_n))(out_spatial_len);
+
+        par_ford_out([&](int o, int w, auto... out_spatial_id_pack) {
+            auto out_spatial_id = make_array(out_spatial_id_pack...);
+
+            std::array<int, SptDim> start_idx{};
+            std::array<int, SptDim> win_sz{};
+            for(int i = 0; i < SptDim; ++i)
+            {
+                start_idx[i] = out_spatial_id[i] * strides[i] - pads[i];
+                int end_idx  = start_idx[i] + kers[i];
+                end_idx      = std::min(end_idx, in_dim[i]);
+                start_idx[i] = std::max(start_idx[i], 0);
+                win_sz[i]    = end_idx - start_idx[i];
+                win_sz[i]    = std::max(win_sz[i], 1);
+            }
+
+            int pool_size =
+                filter.GetMode() == miopenPoolingAverageInclusive
+                    ? std::accumulate(kers.begin(), kers.end(), 1, std::multiplies<int>())
+                    : std::accumulate(win_sz.begin(), win_sz.end(), 1, std::multiplies<int>());
+
+            double acc = pooler.initialize();
+            miopen::unpacker(ford)(win_sz)([&](auto... in_spatial_id_pack) {
+                auto in_spatial_id = make_array(in_spatial_id_pack...);
+                std::array<std::size_t, SptDim + 2> idx{};
+                idx[0] = o;
+                idx[1] = w;
+
+                bool in_cmp_idx = true;
+                for(int i = 0; i < SptDim; ++i)
+                {
+                    idx[i + 2] = start_idx[i] + in_spatial_id[i];
+                    in_cmp_idx &= (in_dim[i] > idx[i + 2]);
+                }
+
+                if(in_cmp_idx)
+                {
+                    acc = pooler(acc, input(idx));
+                }
+            });
+            out(o, w, out_spatial_id_pack...) = T(pooler.finalize(acc, pool_size));
+        });
+
+        if(printing)
+        {
+            PIPE << "CPU out: ";
+            auto outlen = out.desc.GetLengths();
+            for(auto dim : outlen)              PIPE << std::setw(4) << dim << " "; PIPE << " | ";
+            auto outstr = out.desc.GetStrides();
+            for(auto dim : outstr)              PIPE << std::setw(4) << dim << " "; PIPE << " | ";
+            for(auto str : filter.GetLengths()) PIPE << std::setw(4) << str << " "; PIPE << " | ";
+            for(auto str : filter.GetStrides()) PIPE << std::setw(4) << str << " "; PIPE << " | ";
+            for(auto str : filter.GetPads())    PIPE << std::setw(4) << str << " ";
+            PIPE << std::endl;
+
+            printClamped(PIPE, out, outlen, outstr);
+            // printClamped(PIPE, out, outlen, outstr, 1);
+        }   // print output tensor
+        if(PRINT_CPU_IN && printing)
+        {
+            auto inlen = input.desc.GetLengths();
+            auto instr = input.desc.GetStrides();
+            PIPE << "CPU in : m" << filter.GetMode() << " t" << input.desc.GetType() << " | ";
+            for(auto dim : inlen)               PIPE << std::setw(4) << dim << " "; PIPE << " | ";
+            for(auto str : instr)               PIPE << std::setw(4) << str << " "; PIPE << " | ";
+            for(auto str : filter.GetLengths()) PIPE << std::setw(4) << str << " "; PIPE << " | ";
+            for(auto str : filter.GetStrides()) PIPE << std::setw(4) << str << " "; PIPE << " | ";
+            for(auto str : filter.GetPads())    PIPE << std::setw(4) << str << " ";
+            PIPE << std::endl;
+
+            printClamped(PIPE, input, inlen, instr);
+            // printClamped(PIPE, input, inlen, instr, 1);
+        }
+
+        return out;
+    }
+
+    template <class T, class Index>
+    tensor<T>
+    cpu_naive(const tensor<T>& input, const miopen::PoolingDescriptor& filter, std::vector<Index>&) const
+    {
+        auto out = get_output_tensor(filter, input);
+
+        std::array<int, SptDim> in_dim{};
+        std::copy_n(input.desc.GetLengths().begin() + 2, SptDim, in_dim.begin());
+        std::array<int, SptDim> strides{};
+        std::copy_n(filter.GetStrides().begin(), SptDim, strides.begin());
+        std::array<int, SptDim> pads{};
+        std::copy_n(filter.GetPads().begin(), SptDim, pads.begin());
+        std::array<int, SptDim> kers{};
+        std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin());
+        auto pooler = pooling_operators<T>{filter};
+
+        // TEMPCODE RJS print input tensor
+        bool printing = in_dim[0] <= MAX_PRINTING && in_dim[1] <= MAX_PRINTING;
+        if(in_dim.size() > 2) printing &= in_dim[2] <= MAX_PRINTING;
+
+        auto flens = filter.GetLengths();
+        auto fstrs = filter.GetStrides();
+        auto ilens = input.desc.GetLengths();
+        auto istrs = input.desc.GetStrides();
+        auto olens  = out.desc.GetLengths();
+        auto ostrs  = out.desc.GetStrides();
+
+        for(int n = 0; n < olens[0]; ++n)
+        {
+            for(int c = 0; c < olens[1]; ++c)
+            {
+                for(int h = 0; h < olens[2]; ++h)
+                {
+                    for(int w = 0; w < olens[3]; ++w)
+                    {
+                        int hstart = h * fstrs[0];
+                        int hend = hstart + flens[0];
+                        hend = min(hend, ilens[2]);
+
+                        int wstart = w * fstrs[1];
+                        int wend = wstart + flens[1];
+                        wend = min(wend, ilens[3]);
+                        double res = -10000.0;
+                        for(int fh = hstart; fh < hend; ++fh)
+                        {
+                            for(int fw = wstart; fw < wend; ++fw)
+                            {
+                                double val = input[n * istrs[0] + c * istrs[1] + fh * istrs[2] + fw * istrs[3]];
+                                if(val > res)
+                                    res = val;
+                            }
+                        }
+                        out[n * ostrs[0] + c * ostrs[1] + h * ostrs[2] + w * ostrs[3]] = res;
+                    }
+                }
+            }
+        }
+
+        if(printing)
+        {
+            PIPE << "CPU nve: ";
+            for(auto dim : olens)              PIPE << std::setw(4) << dim << " "; PIPE << " | ";
+            for(auto dim : ostrs)              PIPE << std::setw(4) << dim << " "; PIPE << " | ";
+            for(auto str : filter.GetLengths()) PIPE << std::setw(4) << str << " "; PIPE << " | ";
+            for(auto str : filter.GetStrides()) PIPE << std::setw(4) << str << " "; PIPE<< " | ";
+            for(auto str : filter.GetPads())    PIPE << std::setw(4) << str << " ";
+            PIPE << std::endl;
+
+            printClamped(PIPE, out, olens, ostrs);
+        }
+
+        return out;
+    }
+
+    template <class T, class Index>
+    tensor<T> gpu(const tensor<T>& input,
+                  const miopen::PoolingDescriptor& filter,
+                  std::vector<Index>& indices) const
+    {
+        auto&& handle = get_handle();
+        auto out      = get_output_tensor(filter, input);
+        auto junk      = get_big_output_tensor(filter, input);   // TEMPCODE RJS
+
+        indices.resize(out.data.size(), 0);
+
+        auto in_dev  = handle.Write(input.data);
+        auto out_dev = handle.Create<T>(out.GetSize());
+        auto junk_dev = handle.Create<T>(junk.GetSize());  // 
+        Workspace wspace{};
+        wspace.Write(indices);
+
+        float alpha = 1, beta = 0;
+        filter.Forward(handle,
+                       &alpha,
+                       input.desc,
+                       in_dev.get(),
+                       &beta,
+                       out.desc,
+                       out_dev.get(),
+                       true,
+                       wspace.ptr(),
+                       wspace.size(),
+                       junk_dev.get()); // TEMPCODE RJS
+
+        handle.ReadTo(out.data.data(), out_dev, out.GetDataByteSize());
+        wspace.ReadTo(indices);
+        bool printing = input.desc.GetLengths()[2] <= MAX_PRINTING && input.desc.GetLengths()[3] <= MAX_PRINTING;
+        if(input.desc.GetLengths().size() > 4) printing &= input.desc.GetLengths()[4] <= MAX_PRINTING;
+std::cout << (printing ? "printing output from GPU..." : "skipping GPU print.") << std::endl;
+        if(PRINT_GPU_OUT && printing)
+        {
+#if GPU_JUNK > 0
+        handle.ReadTo(junk.data.data(), junk_dev, junk.GetDataByteSize());
+            if(GPU_4COL){
+                PIPE<< "GPU (8-cols): " << std::endl;
+                for(int idx = 0; idx < GPU_JUNK; ++idx) {
+                    PIPE << std::setw(11) << std::setprecision(5) << out.data[idx] << "  ";
+                    if((idx % 8) == 7)  PIPE <<std::endl;
+                }
+            }
+                PIPE << "GPU junk: " << std::endl;
+                for(int idx = 0; idx < GPU_JUNK; ++idx) {
+                    PIPE << std::setw(11) << std::setprecision(5) << junk.data[idx] << "  ";
+                    if((idx % 4) == 3)  PIPE <<std::endl;
+                }
+#endif
+
+            auto outlen = out.desc.GetLengths();
+            auto outstr = out.desc.GetStrides();
+
+            if(false){
+                std::vector<size_t> olen = outlen;
+                if(olen.size() == 4)    olen.insert(olen.begin() + 2, 1);
+                // Mask data is always NCDHW layout
+                size_t mask_w_stride = 1;
+                size_t mask_h_stride = mask_w_stride * olen[4];
+                size_t mask_d_stride = mask_h_stride * olen[3];
+                size_t mask_c_stride   = mask_d_stride * olen[2];
+                size_t mask_n_stride   = mask_c_stride * olen[1];
+                std::vector<size_t> mask_str{mask_n_stride, mask_c_stride, mask_d_stride,mask_h_stride, mask_w_stride};
+                std::cout << "GPU indices: " << mask_n_stride << " " << mask_c_stride << " " << mask_d_stride << " " << mask_h_stride << " " << mask_w_stride << std::endl;
+                printClamped(std::cout, indices, olen, mask_str);
+            }
+
+            PIPE << "GPU out: ";
+            for(auto dim : outlen)              PIPE << std::setw(4) << dim << " "; PIPE << " | ";
+            for(auto dim : outstr)              PIPE << std::setw(4) << dim << " "; PIPE << " | ";
+            for(auto str : filter.GetLengths()) PIPE << std::setw(4) << str << " "; PIPE << " | ";
+            for(auto str : filter.GetStrides()) PIPE << std::setw(4) << str << " "; PIPE<< " | ";
+            for(auto str : filter.GetPads())    PIPE << std::setw(4) << str << " ";
+            PIPE << std::endl;
+
+            printClamped(PIPE, out, outlen, outstr);
+            // printClamped(std::cout, out, outlen, outstr, 1);
+        }   // print output tensor
+
+        return out;
+    }
+
+    template <class T, class Index>
+    void fail(float,
+              const tensor<T>& input,
+              const miopen::PoolingDescriptor& filter,
+              const std::vector<Index>&) const
+    {
+        std::ostringstream oss;
+        oss << "Forward ";
+        print(oss, filter, input.desc.IsDefaultLayout());
+        oss << "Input tensor: " << input.desc.ToString() << std::endl;
+        oss << "Output tensor: " << filter.GetForwardOutputTensor(input.desc).ToString()
+                  << std::endl;
+        GTEST_FAIL() << oss.str();
+    }
+};
+
+template <int SptDim>
+struct verify_backward_pooling
+{
+    template <class T, class Index>
+    tensor<T> cpu(const tensor<T>& input,
+                  const tensor<T>& dout,
+                  const tensor<T>& out,
+                  const miopen::PoolingDescriptor& filter,
+                  const std::vector<Index>& indices,
+                  bool use_global_index,
+                  bool verify_index) const
+    {
+        const int sptl_dim_offset = 2;
+        const int chan_dim_offset = 1;
+
+        auto dinput = input;
+
+        std::vector<double> din_vec(input.desc.GetElementSpace(), 0.0);
+        CHECK(dout.desc == out.desc);
+        std::array<int, SptDim + 2> in_dim{};
+        std::copy_n(input.desc.GetLengths().begin(), SptDim + 2, in_dim.begin());
+        std::array<int, SptDim + 2> in_str{};
+        std::copy_n(input.desc.GetStrides().begin(), SptDim + 2, in_str.begin());
+        std::array<int, SptDim> strides{};
+        std::copy_n(filter.GetStrides().begin(), SptDim, strides.begin());
+        std::array<int, SptDim> pads{};
+        std::copy_n(filter.GetPads().begin(), SptDim, pads.begin());
+        std::array<int, SptDim> kers{};
+        std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin());
+        auto ford_ker = miopen::unpacker(ford)(kers);
+
+        int out_n = out.desc.GetLengths()[0];
+        int out_c = out.desc.GetLengths()[chan_dim_offset];
+        std::array<int, SptDim> out_spatial_len{};
+        std::copy_n(out.desc.GetLengths().begin() + sptl_dim_offset, SptDim, out_spatial_len.begin());
+        auto ford_out = miopen::unpacker(ford)(out_spatial_len);
+
+        par_ford(out_n, out_c)([&](int o, int w) {
+            if(filter.GetMode() == miopenPoolingMax)
+            {
+                ford_out([&](auto... out_spatial_id_pack) {
+                    auto mx_idx = indices.at(dout.desc.GetIndex(o, w, out_spatial_id_pack...));
+                    std::array<std::size_t, SptDim + 2> idx{};
+                    bool in_cmp_idx = true;
+                    if(use_global_index)
+                    {
+                        for(int i = 0; i < SptDim; i++)
+                        {
+                            std::size_t mx_idx_dim = mx_idx;
+                            mx_idx_dim /= std::accumulate(in_dim.begin() + sptl_dim_offset + i + 1,
+                                                          in_dim.end(),
+                                                          1ULL,
+                                                          std::multiplies<std::size_t>());
+                            mx_idx_dim %= in_dim[i + sptl_dim_offset];
+                            idx[i + sptl_dim_offset] = mx_idx_dim;
+                        }
+                    }
+                    else
+                    {
+                        auto out_spatial_id = make_array(out_spatial_id_pack...);
+
+                        for(int i = 0; i < SptDim; i++)
+                        {
+                            int mx_idx_dim = mx_idx;
+                            mx_idx_dim /= std::accumulate(
+                                kers.begin() + i + 1, kers.end(), 1, std::multiplies<int>());
+                            mx_idx_dim %= kers[i];
+
+                            mx_idx_dim += (out_spatial_id[i] * strides[i] - pads[i]);
+                            in_cmp_idx &= (in_dim[i + 2] > mx_idx_dim && mx_idx_dim >= 0);
+
+                            idx[i + 2] = std::size_t(mx_idx_dim);
+                        }
+                    }
+
+                    if(in_cmp_idx)
+                    {
+                        idx[0] = o;
+                        idx[1] = w;
+                        if(verify_index)
+                        {
+                            CHECK(
+                                miopen::float_equal(input(idx), out(o, w, out_spatial_id_pack...)));
+                        }
+                        std::size_t din_idx = 0;
+                        for(int i = 0; i < SptDim + 2; i++)
+                        {
+                            din_idx += idx[i] * in_str[i];
+                        }
+                        din_vec.at(din_idx) += dout(o, w, out_spatial_id_pack...);
+                    }
+                });
+            }
+            else
+            {
+                ford_out([&](auto... out_spatial_id_pack) {
+                    auto out_spatial_id = make_array(out_spatial_id_pack...);
+
+                    std::array<int, SptDim> start_idx{};
+                    std::array<int, SptDim> win_sz{};
+                    for(int i = 0; i < SptDim; ++i)
+                    {
+                        start_idx[i] = out_spatial_id[i] * strides[i] - pads[i];
+                        int end_idx  = start_idx[i] + kers[i];
+                        end_idx      = std::min(end_idx, in_dim[i + 2]);
+                        win_sz[i]    = end_idx - std::max(start_idx[i], 0);
+                        win_sz[i]    = std::max(win_sz[i], 1);
+                    }
+
+                    int pool_size =
+                        filter.GetMode() == miopenPoolingAverageInclusive
+                            ? std::accumulate(kers.begin(), kers.end(), 1, std::multiplies<int>())
+                            : std::accumulate(
+                                  win_sz.begin(), win_sz.end(), 1, std::multiplies<int>());
+
+                    ford_ker([&](auto... ker_id_pack) {
+                        auto ker_id = make_array(ker_id_pack...);
+
+                        bool in_cmp_idx = true;
+                        std::array<int, SptDim + 2> in_idx{};
+                        in_idx[0] = o;
+                        in_idx[1] = w;
+                        for(int i = 0; i < SptDim; ++i)
+                        {
+                            in_idx[i + 2] = start_idx[i] + ker_id[i];
+                            in_cmp_idx &= (in_dim[i + 2] > in_idx[i + 2] && in_idx[i + 2] >= 0);
+                        }
+
+                        if(in_cmp_idx)
+                        {
+                            std::size_t din_idx = 0;
+                            for(int i = 0; i < SptDim + 2; i++)
+                            {
+                                din_idx += in_idx[i] * in_str[i];
+                            }
+
+                            din_vec.at(din_idx) +=
+                                static_cast<double>(dout(o, w, out_spatial_id_pack...)) / pool_size;
+                        }
+                    });
+                });
+            }
+        });
+
+        miopen::unpacker(ford)(in_dim)([&](auto... in_id_pack) {
+            auto in_id          = make_array(in_id_pack...);
+            std::size_t din_idx = 0;
+            for(int i = 0; i < SptDim + 2; i++)
+            {
+                din_idx += in_id[i] * in_str[i];
+            }
+            dinput(in_id_pack...) = din_vec.at(din_idx);
+        });
+        return dinput;
+    }
+
+    template <class T, class Index>
+    tensor<T> gpu(const tensor<T>& input,
+                  const tensor<T>& dout,
+                  const tensor<T>& out,
+                  const miopen::PoolingDescriptor& filter,
+                  const std::vector<Index>& indices,
+                  bool,
+                  bool) const
+    {
+        auto&& handle = get_handle();
+        auto dinput   = input;
+
+        auto in_dev   = handle.Write(input.data);
+        auto dout_dev = handle.Write(dout.data);
+        auto out_dev  = handle.Write(out.data);
+        auto din_dev  = handle.Create<T>(dinput.data.size());
+
+        Workspace wspace{};
+        wspace.Write(indices);
+
+        float alpha = 1, beta = 0;
+        filter.Backward(handle,
+                        &alpha,
+                        // y
+                        out.desc,
+                        out_dev.get(),
+                        // dy
+                        dout.desc,
+                        dout_dev.get(),
+                        // x
+                        input.desc,
+                        in_dev.get(),
+                        &beta,
+                        // dx
+                        dinput.desc,
+                        din_dev.get(),
+                        wspace.ptr());
+
+        dinput.data = handle.Read<T>(din_dev, dinput.data.size());
+        return dinput;
+    }
+
+    template <class T, class Index>
+    void fail(float,
+              const tensor<T>& input,
+              const tensor<T>&,
+              const tensor<T>& out,
+              const miopen::PoolingDescriptor& filter,
+              const std::vector<Index>&,
+              bool,
+              bool) const
+    {
+        std::cout << "Backward ";
+        print(filter, input.desc.IsDefaultLayout());
+        std::cout << "Input tensor: " << input.desc.ToString() << std::endl;
+        std::cout << "Output tensor: " << out.desc.ToString() << std::endl;
+    }
+};
+
+template <class T>
+struct pooling_driver : test_driver
+{
+    miopen::PoolingDescriptor filter;
+    std::vector<int> in_shape;
+    std::vector<int> lens;
+    std::vector<int> pads;
+    std::vector<int> strides;
+    std::string index_type;
+    std::string mode_str;
+#if TEST_PADDING_MODE == 1
+    std::string pmode;
+#endif
+    int verify_indices{};
+    miopenPoolingWorkspaceIndexMode_t wsidx{};
+    miopenTensorLayout_t layout{};
+    std::unordered_map<std::string, miopenIndexType_t> index_type_lookup = {
+        {miopen::ToUpper("miopenIndexUint8"), miopenIndexUint8},
+        {miopen::ToUpper("miopenIndexUint16"), miopenIndexUint16},
+        {miopen::ToUpper("miopenIndexUint32"), miopenIndexUint32},
+        {miopen::ToUpper("miopenIndexUint64"), miopenIndexUint64},
+    };
+    std::unordered_map<std::string, miopenPoolingMode_t> mode_lookup = {
+        {"MAX", miopenPoolingMax},
+        {"MIOPENPOOLINGMAX", miopenPoolingMax},
+        {"AVERAGE", miopenPoolingAverage},
+        {"MIOPENPOOLINGAVERAGE", miopenPoolingAverage},
+        {"AVERAGEINCLUSIVE", miopenPoolingAverageInclusive},
+        {"MIOPENPOOLINGAVERAGEINCLUSIVE", miopenPoolingAverageInclusive},
+    };
+#if TEST_PADDING_MODE == 1
+    std::unordered_map<std::string, miopenPaddingMode_t> pmode_lookup = {
+        {"DEFAULT", miopenPaddingDefault},
+        {"SAME", miopenPaddingSame},
+        {"VALID", miopenPaddingValid},
+    };
+#endif
+    pooling_driver()
+    {
+        add(index_type,
+            "index_type",
+            generate_data({"miopenIndexUint32"}    // TEMPCODE RJS
+            // generate_multi_data<const char*>( //
+            //     {{"miopenIndexUint8",
+            //       "miopenIndexUint16",
+            //       "miopenIndexUint32",
+            //       "miopenIndexUint64"},                     //
+            //      {"miopenIndexUint8", "miopenIndexUint32"}, //
+            //      {"miopenIndexUint32"}}                     //
+                ));
+        add(mode_str,
+            "mode_str",
+            generate_data(
+                {"miopenPoolingMax", "miopenPoolingAverage", "miopenPoolingAverageInclusive"})); // 
+#if TEST_PADDING_MODE == 1
+        add(pmode, "pmode", generate_data({"default", "same", "valid"}));
+#endif
+        add(verify_indices, "verify_indices", generate_data({1}));
+    }
+
+    template <class Index, int SptlDim>
+    void run_impl()
+    {
+        std::vector<Index> indices{};
+        auto input = tensor<T>{layout, in_shape};
+        for(auto& v : input.data)   v = gen_value<T>();
+
+        // TEMPCODE RJS print input tensor
+        // bool printing = in_shape[0] <= MAX_PRINTING && in_shape[1] <= MAX_PRINTING;
+        // if (in_shape.size() > 2) printing &= in_shape[2] <= MAX_PRINTING;
+        // if(printing)
+        // {
+        //     auto inlen = input.desc.GetLengths();
+        //     auto instr = input.desc.GetStrides();
+        //     std::cout << "CPU GEN : " << input.desc.GetLayout_str() << "(" << inlen.size() << ") | " << input.data.size() << " | " << input.desc.GetElementSpace() << " | ";
+        //     for(auto dim : inlen) std::cout << std::setw(4) << dim;
+        //     std::cout << " | ";
+        //     for(auto str : instr) std::cout << std::setw(4) << str;
+        //     std::cout << " | ";
+        //     for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str;
+        //     std::cout << " | ";
+        //     for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str;
+        //     std::cout << " | ";
+        //     for(auto str : filter.GetPads()) std::cout << std::setw(4) << str;
+        //     std::cout << std::endl;
+
+        //     for(int nn = 0; nn < inlen[0]; ++nn) {
+        //         for(int cc = 0; cc < inlen[1]; ++cc) {
+        //             for(int hh = 0; hh < inlen[2]; ++hh) {
+        //                 for(int ww = 0; ww < inlen[3]; ++ww) {// nn * instr[0] + cc * instr[1] + hh * instr[2] + ww * instr[3]
+        //                     std::cout << std::setw(11) << std::setprecision(5) << input.data[input.desc.GetIndex(nn, cc, hh, ww)] << "  ";
+        //                 }
+        //             std::cout << std::endl;
+        //             }
+        //         }
+        //     }
+        // }
+
+        auto out  = verify(verify_forward_pooling<SptlDim>{},
+            input,
+            filter,
+            indices);
+#ifdef BACKWARD
+        auto dout = out.first;
+        dout.generate(tensor_elem_gen_integer{2503});
+        verify(verify_backward_pooling<SptlDim>{},   // TEMPCODE RJS no backward
+               input,
+               dout,
+               out.first,
+               filter,
+               indices,
+               wsidx != 0,
+               static_cast<bool>(this->verify_indices));
+#endif
+    }
+
+    void run()
+    {
+        std::cout << "\n############   Run # " << std::setw(6) << num_all_case++ << " : ";
+        if(MAX_ALL_CASES && num_all_case > MAX_ALL_CASES)
+        {
+            std::cout << " skipped due to MAX_ALL_CASES=" << MAX_ALL_CASES << " : ";
+            show_command();
+            return;
+        }
+        else if(this->dry_run)
+        {
+            std::cout << " skipped due to dry_run : ";
+            show_command();
+            return;
+        }
+        else
+        {
+            show_command();
+        }
+
+        int sptl_dim = static_cast<int>(in_shape.size()) - 2;
+        if(sptl_dim != 2 && sptl_dim != 3)
+        {
+            std::cout << "Warning: Config skipped due to invalid dimensions. 'in_shape' must be in NCHW or NCDHW form."
+                      << std::endl;
+            return;
+        }
+
+        // To simplify launching, input dimensions to the driver are always default layout. Desire to
+        // test non-default layouts is communicated exclusively via 'layout'.
+        // const bool is_default_layout = miopen::TensorDescriptor::IsDefaultLayout(layout); // TEMPCODE RJS
+
+        auto mode = mode_lookup.at(miopen::ToUpper(mode_str));
+        if(mode != miopenPoolingMax) return;  // TEMPCODE RJS skip all except max, do max only
+
+        auto pad_mode = miopenPaddingDefault;
+#if TEST_PADDING_MODE
+        pad_mode = pmode_lookup.at(miopen::ToUpper(pmode));
+#endif
+
+        auto idx_typ = index_type_lookup.at(miopen::ToUpper(index_type));
+        auto idx_sz  = sizeof(uint8_t);
+        const bool skip_many_configs_with_non_int8_index =
+            (dataset_id == 0) && !full_set; // Otherwise the default dataset takes too much time.
+        const bool wide_dataset = (dataset_id == 2) && full_set;
+
+        filter = miopen::PoolingDescriptor
+        {
+            mode,
+            pad_mode,
+            lens,
+            strides,
+            pads
+        };
+
+        filter.SetIndexType(idx_typ);
+        filter.SetWorkspaceIndexMode(miopenPoolingWorkspaceIndexMode_t(wsidx));
+        bool mask_idx = filter.GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexMask;
+
+        if(mask_idx && sptl_dim == 3 && filter.GetMode() == miopenPoolingMax)
+        {
+            std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented "
+                         "yet in 3D max pooling solvers."
+                      << std::endl;
+            return;
+        }
+
+        if(mask_idx && sptl_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset)
+        {
+            std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented "
+                         "yet in 2D max backward solvers that support wide pooling window."
+                      << std::endl;
+            return;
+        }
+
+        if(mask_idx && filter.ModeIsAveraging())
+        {
+            std::cout << "Warning: Config skipped. Workspace index modes are irrelevant for "
+                         "Average pooling. "
+                         "In order to optimize performance of full tests, we "
+                         "skip average pooling configs when (wsidx == 0). "
+                         "Please make sure that dataset includes counterparts with (wsidx == 1)."
+                      << std::endl;
+            return;
+        }
+
+        switch(idx_typ)
+        {
+        /// The "index is too small" limitation is an approximation
+        /// of the real limitation, and therefore applied only when
+        /// the "full test" is ran. See:
+        /// \ref max_pooling_index_max_restriction
+        case miopenIndexUint8: {
+            if(full_set && (sptl_dim == 3 || (mask_idx && sptl_dim == 2)) &&
+               filter.GetMode() == miopenPoolingMax)
+            {
+                std::cout << "Warning: Config skipped: uint8 index is too small "
+                             "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) "
+                             "&& filter.GetMode() == miopenPoolingMax"
+                          << std::endl;
+                return;
+            }
+            break;
+        }
+        case miopenIndexUint16: {
+            if(full_set && (sptl_dim == 3 || (!mask_idx && sptl_dim == 2)) &&
+               filter.GetMode() == miopenPoolingMax)
+            {
+                std::cout << "Warning: Config skipped: uint16 index is too small "
+                             "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) "
+                             "&& filter.GetMode() == miopenPoolingMax"
+                          << std::endl;
+                return;
+            }
+            if(skip_many_configs_with_non_int8_index)
+            {
+                // test_pooling_test --all limit uint16 cases
+                if(num_uint16_case >= max_typed_cases)
+                {
+                    std::cout << "Warning: Config skipped for the default dataset to speed "
+                                 "up testing (num_uint16_case > 5)"
+                              << std::endl;
+                    return;
+                }
+                ++num_uint16_case;
+            }
+            idx_sz = sizeof(uint16_t);
+            break;
+        }
+        case miopenIndexUint32: {
+            if(skip_many_configs_with_non_int8_index)
+            {
+                // test_pooling_test --all limit uint32 cases
+                if(mask_idx)
+                {
+                    if(num_uint32_case >= max_typed_cases)
+                    {
+                        std::cout << "Warning: Config skipped for the default dataset to speed up "
+                                     "testing (wsidx == 0 && num_uint32_case > 5)"
+                                  << std::endl;
+                        return;
+                    }
+                    ++num_uint32_case;
+                }
+                else
+                {
+                    if(num_uint32_case_imgidx >= max_typed_cases)
+                    {
+                        std::cout << "Warning: Config skipped for the default dataset to speed up "
+                                     "testing (wsidx != 0 && num_uint32_case_imgidx > 5)"
+                                  << std::endl;
+                        return;
+                    }
+                    ++num_uint32_case_imgidx;
+                }
+            }
+            idx_sz = sizeof(uint32_t);
+            break;
+        }
+        case miopenIndexUint64: {
+            if(skip_many_configs_with_non_int8_index)
+            {
+                if(mask_idx)
+                {
+                    if(num_uint64_case >= max_typed_cases)
+                    {
+                        std::cout << "Warning: Config skipped for the default dataset to speed up "
+                                     "testing (wsidx == 0) && (num_uint64_case > 5)"
+                                  << std::endl;
+                        return;
+                    }
+                    ++num_uint64_case;
+                }
+                else
+                {
+                    if(num_uint64_case_imgidx >= max_typed_cases && sptl_dim == 2)
+                    {
+                        std::cout << "Warning: Config skipped to speed up testing of the "
+                                     "default dataset (wsidx != 0) && (num_uint64_case_imgidx > 5 "
+                                     "&& sptl_dim == 2)"
+                                  << std::endl;
+                        return;
+                    }
+                    ++num_uint64_case_imgidx;
+                }
+            }
+            idx_sz = sizeof(uint64_t);
+            break;
+        }
+        }
+
+        auto input_desc = miopen::TensorDescriptor(this->type, layout, in_shape);
+
+        for(int i = 0; i < sptl_dim; i++)
+        {
+            if(lens[i] > (input_desc.GetLengths()[i + 2] + static_cast<uint64_t>(2) * pads[i]))
+            {
+                std::cout << "Warning: Config skipped becuse it is invalid "
+                             "(lens[i] > (input_desc.GetLengths()[i + 2] + 2 * pads[i]))"
+                          << std::endl;
+                return;
+            }
+        }
+
+        if(full_set)
+        {
+            auto output_desc = filter.GetForwardOutputTensor(input_desc);
+            size_t total_mem =
+                3 * input_desc.GetNumBytes() + output_desc.GetNumBytes() +
+                idx_sz * output_desc.GetElementSize(); // estimate based on backward pass
+
+            size_t device_mem = get_handle().GetGlobalMemorySize();
+            if(total_mem >= device_mem)
+            {
+                std::cout << "Config skipped because it requires " << total_mem
+                          << " Bytes to write all necessary tensors to GPU. GPU has " << device_mem
+                          << " Bytes of memory." << std::endl;
+                return;
+            }
+        }
+
+        std::vector<int> in_dim(input_desc.GetLengths().begin(),
+            input_desc.GetLengths().begin() + sptl_dim);
+        std::vector<int> out_dim(sptl_dim);
+        std::vector<int> ker_dim(filter.GetLengths().begin(), filter.GetLengths().end());
+
+#if TEST_PADDING_MODE == 1
+        if(filter.pmode == miopenPaddingSame)
+        {
+            if(std::any_of(filter.GetStrides().begin(), filter.GetStrides().end(), [](int i) {
+                   return i == 0;
+               }))
+                return;
+            for(int i = 0; i < sptl_dim; i++)
+            {
+                filter.pads[i] =
+                    ((in_dim[i] % filter.GetStrides()[i] == 0)
+                         ? (std::max((ker_dim[i] - filter.GetStrides()[i]), 0))
+                         : (std::max((ker_dim[i] - (in_dim[i] % filter.GetStrides()[i])), 0))) /
+                    2;
+
+                out_dim[i] = std::ceil(static_cast<double>(in_dim[i]) / filter.strides[i]);
+            }
+
+            if(std::any_of(out_dim.begin(), out_dim.end(), [](int i) { return i <= 0; }))
+                return;
+        }
+        else if(filter.pmode == miopenPaddingValid)
+        {
+            if(std::any_of(filter.GetStrides().begin(), filter.GetStrides().end(), [](int i) {
+                   return i == 0;
+               }))
+                return;
+            for(int i = 0; i < sptl_dim; i++)
+            {
+                filter.pads[i] = 0;
+
+                out_dim[i] = std::ceil(static_cast<double>(in_dim[i] - filter.lens[i] + 1) /
+                                       filter.strides[i]);
+            }
+
+            if(std::any_of(out_dim.begin(), out_dim.end(), [](int i) { return i <= 0; }))
+                return;
+        }
+#endif
+        switch(filter.GetIndexType())
+        {
+        case miopenIndexUint8: {
+            if(sptl_dim == 3)
+            {
+                run_impl<uint8_t, 3>();
+            }
+            else
+            {
+                run_impl<uint8_t, 2>();
+            }
+            break;
+        }
+        case miopenIndexUint16: {
+            if(sptl_dim == 3)
+            {
+                run_impl<uint16_t, 3>();
+            }
+            else
+            {
+                run_impl<uint16_t, 2>();
+            }
+            break;
+        }
+        case miopenIndexUint32: {
+            if(sptl_dim == 3)
+            {
+                run_impl<uint32_t, 3>();
+            }
+            else
+            {
+                run_impl<uint32_t, 2>();
+            }
+            break;
+        }
+        case miopenIndexUint64: {
+            if(sptl_dim == 3)
+            {
+                run_impl<uint64_t, 3>();
+            }
+            else
+            {
+                run_impl<uint64_t, 2>();
+            }
+            break;
+        }
+        }
+    }
+};
+
+#endif
diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp
index fb1556bc1a..8472b23375 100644
--- a/test/pooling2d.hpp
+++ b/test/pooling2d.hpp
@@ -24,6 +24,7 @@
  *
  *******************************************************************************/
 
+#include "gtest/pooling_testing.hpp"
 #include "pooling_common.hpp"
 
 #define WORKAROUND_ISSUE_1670 1
@@ -36,9 +37,11 @@ struct pooling2d_shapes
 
     static std::vector<U> get_2d_pooling_input_shapes()
     {
-        return {{1, 19, 1024, 2048},
-                {10, 3, 32, 32},
+        return {
+                {1, 831, 64, 128},
                 {5, 32, 8, 8},
+                {10, 3, 32, 32},
+                {1, 19, 1024, 2048},
                 {2, 1024, 12, 12},
                 {4, 3, 231, 231},
                 {8, 3, 227, 227},
@@ -47,13 +50,13 @@ struct pooling2d_shapes
                 {2, 160, 7, 7},
                 {1, 192, 256, 512},
                 {2, 192, 28, 28},
-                {1, 832, 64, 128},
                 {1, 256, 56, 56},
                 {4, 3, 224, 224},
                 {2, 64, 112, 112},
                 {2, 608, 4, 4},
                 {1, 2048, 11, 11},
-                {1, 16, 4096, 4096}};
+                {1, 16, 4096, 4096}
+        };
     }
 
     // Dataset 1 is intended for testing of asymmetric configs.
@@ -88,6 +91,23 @@ struct pooling2d_driver : pooling_driver<T>
 public:
     pooling2d_driver() : pooling_driver<T>()
     {
+        // clang-format off
+        this->add(this->pads, "pads", this->template generate_multi_data<U>({
+            {{0, 0}, {1, 1}}, //
+#if WORKAROUND_ISSUE_1670
+            {{0, 0}}, //
+#else
+            {{0, 0}, {0, 1}, {1, 0}, {1, 1}}, //
+#endif
+            {{0, 0}}}));
+        // clang-format on
+        this->add(this->layout, "layout", this->generate_data({miopenTensorNHWC})); // , miopenTensorNCHW
+        this->add(this->wsidx, "wsidx", this->generate_data({miopenPoolingWorkspaceIndexMask, miopenPoolingWorkspaceIndexImage}));
+        this->add(this->strides,
+                  "strides",
+                  this->template generate_multi_data<U>({{{2, 2}, {1, 1}},                 //
+                                                         {{2, 2}, {2, 1}}, // , {1, 2}, {1, 1}
+                                                         {{1, 1}}}));
 #if TEST_GET_INPUT_TENSOR
         std::set<U> in_dim_set = get_inputs(this->batch_factor);
         std::vector<U> in_dim_vec(in_dim_set.begin(), in_dim_set.end());
@@ -107,22 +127,5 @@ struct pooling2d_driver : pooling_driver<T>
                       {{{2, 2}, {3, 3}},         //
                        {{2, 2}, {1, 2}, {2, 1}}, //
                        {{35, 35}, {100, 100}, {255, 255}, {410, 400}}}));
-        this->add(this->strides,
-                  "strides",
-                  this->template generate_multi_data<U>({{{2, 2}, {1, 1}},                 //
-                                                         {{2, 2}, {2, 1}}, // , {1, 2}, {1, 1}
-                                                         {{1, 1}}}));
-        // clang-format off
-        this->add(this->pads, "pads", this->template generate_multi_data<U>({
-            {{0, 0}, {1, 1}}, //
-#if WORKAROUND_ISSUE_1670
-            {{0, 0}}, //
-#else
-            {{0, 0}, {0, 1}, {1, 0}, {1, 1}}, //
-#endif
-            {{0, 0}}}));
-        // clang-format on
-        this->add(this->wsidx, "wsidx", this->generate_data({0, 1}));
-        this->add(this->layout, "layout", this->generate_data({miopenTensorNHWC})); // , miopenTensorNCHW
     }
 };
diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp
index 8c84180663..956c529533 100644
--- a/test/pooling_common.hpp
+++ b/test/pooling_common.hpp
@@ -143,7 +143,7 @@ struct pooling_operators
 
 #include <iomanip>
 #define MAX_PRINT 16    // TEMPCODE RJS
-#define GPU_JUNK 240
+#define GPU_JUNK 0
 #define GPU_4COL false
 
 template <int SptDim>
@@ -349,7 +349,7 @@ struct verify_forward_pooling
                     for(int hh = 0; hh < outlen[2]; ++hh) {
                         for(int ww = 0; ww < outlen[3]; ++ww) {
                             std::cout << std::setw(11) << std::setprecision(5) << out.data[
-                                nn * outstr[0] + cc * outstr[1] + hh * outstr[2] + ww * outstr[2]
+                                nn * outstr[0] + cc * outstr[1] + hh * outstr[2] + ww * outstr[3]
                                 // out.desc.GetIndex(nn, cc, hh, ww)
                                 ] << "  ";
                         }
@@ -632,19 +632,19 @@ struct pooling_driver : test_driver
     {
         add(index_type,
             "index_type",
-            // generate_data({"miopenIndexUint32",}    // TEMPCODE RJS
-            generate_multi_data<const char*>( //
-                {{"miopenIndexUint8",
-                  "miopenIndexUint16",
-                  "miopenIndexUint32",
-                  "miopenIndexUint64"},                     //
-                 {"miopenIndexUint8", "miopenIndexUint32"}, //
-                 {"miopenIndexUint32"}}                     //
+            generate_data({"miopenIndexUint32"}    // TEMPCODE RJS
+            // generate_multi_data<const char*>( //
+            //     {{"miopenIndexUint8",
+            //       "miopenIndexUint16",
+            //       "miopenIndexUint32",
+            //       "miopenIndexUint64"},                     //
+            //      {"miopenIndexUint8", "miopenIndexUint32"}, //
+            //      {"miopenIndexUint32"}}                     //
                 ));
         add(mode,
             "mode",
             generate_data(
-                {"miopenPoolingMax", "miopenPoolingAverage", "miopenPoolingAverageInclusive"}));
+                {"miopenPoolingMax", "miopenPoolingAverage"})); // , "miopenPoolingAverageInclusive"
 #if TEST_PADDING_MODE == 1
         add(pmode, "pmode", generate_data({"default", "same", "valid"}));
 #endif

From 2acd21486a1305afb3fed740610f04e81125313b Mon Sep 17 00:00:00 2001
From: "Randy J. Spaulding" <rspauldi@amd.com>
Date: Thu, 5 Sep 2024 22:17:42 -0700
Subject: [PATCH 06/10] benchmarking

---
 src/CMakeLists.txt                            |   6 +
 .../MIOpenPoolingForwardNDNhwcNaive.cpp       | 226 +++++++-------
 src/ocl/pooling_ocl.cpp                       |   6 +-
 src/pooling/problem_description.cpp           |   4 +-
 src/solver/pooling/forward2d.cpp              |  23 +-
 src/solver/pooling/forwardNdNhwcNaive.cpp     |  99 +++---
 ...ngFwdNdNaive.cpp => poolingFwd2dNaive.cpp} | 114 ++++---
 test/gtest/poolingFwd3dNaive.cpp              | 283 ++++++++++++++++++
 test/gtest/pooling_testing.hpp                | 192 ++++++++----
 test/pooling2d.hpp                            | 102 ++++---
 test/pooling3d.hpp                            |   7 +-
 11 files changed, 731 insertions(+), 331 deletions(-)
 rename test/gtest/{poolingFwdNdNaive.cpp => poolingFwd2dNaive.cpp} (66%)
 create mode 100644 test/gtest/poolingFwd3dNaive.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1bd81f9e87..f536de3b10 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -597,11 +597,17 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         # TEMPCODE RJS
     set(MIOPEN_KERNELS
         kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
+        kernels/MIOpenPooling.cl
+        kernels/MIOpenPoolingBwd.cl
+        kernels/MIOpenPoolingBwdND.cl
+        kernels/MIOpenPoolingForwardNaive.cl
+        kernels/MIOpenPoolingND.cl
     )
 
     set(MIOPEN_KERNEL_INCLUDES
     kernels/bfloat16_dev.hpp
     kernels/float_types.h
+    kernels/miopen_cstdint.hpp
     kernels/pooling_functions.h
     include/miopen/pooling/poolingNdNhwcArgs.hpp
     )
diff --git a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
index b194b6d44c..5c197e3b8e 100644
--- a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
+++ b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
@@ -39,9 +39,6 @@
 #define CVT_FP32_2ACCUM(x) (x)
 #endif
 
-#define _FLOAT float
-#define _FLOAT_ACCUM _FLOAT
-
 #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
 #include <hip/hip_fp16.h>
 #include <hip/hip_runtime.h>
@@ -71,6 +68,22 @@
 #endif
 
 #include "float_types.h"
+#include "miopen_cstdint.hpp"
+
+#if MIOPEN_USE_INT8 == 1
+    #define _FLOAT char
+    #if !AVERAGE_OPS
+        #ifndef FLT_MAX
+        #define MAX_VAL 127 /* max value */
+        #else
+        #define MAX_VAL FLT_MAX
+        #endif
+    #endif
+#else
+    #define _FLOAT float
+#endif
+#define _FLOAT_ACCUM _FLOAT
+
 #endif // TEMPCODE
 
 #if AVERAGE_OPS
@@ -91,16 +104,31 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
                                     poolingNdNhwcArgs args
 )
 {
-    const uint32_t nn = blockIdx.x / args.top_d;                          // N=slow index
-    const auto c_base = (blockIdx.z / args.top_w) * blockDim.x;
-    const uint32_t td = blockIdx.x % args.top_d;                          // top D=fast index
-    const uint32_t th = blockIdx.y;  // top H
-    const uint32_t tw = blockIdx.z % args.top_w;  // top W=fast index
-    if(blockDim.x > args.all_c)
-    {
-        // // TODO: h, w, or both may be encoded into threadIdx
-        // if(top_h > 1 && blockDim.y == 1)    
-    }
+    // naming: caps=count, lowercase=index, <canonical>_<modified>
+    const uint32_t nd = blockIdx.x;
+    const uint32_t h_ = blockIdx.y;
+    const uint32_t w_c = blockIdx.z;
+    const uint32_t w_ = w_c % args.top_w;                               // CAN w=fast index
+
+    const uint32_t C_WH = blockDim.x;
+    const uint32_t _H = blockDim.y;
+    const uint32_t _W = blockDim.z;
+
+    const uint32_t c  = threadIdx.x;
+    const uint32_t _h = threadIdx.y;
+    const uint32_t _w = threadIdx.z;
+
+    const uint32_t nn = nd / args.top_d;                                // n=slow index
+    const uint32_t cc = (w_c / args.top_w) * C_WH + c;                  // c=slow index (lg-C)
+    const uint32_t td = nd % args.top_d;                                // top d=fast index
+    const uint32_t th = h_ * _H + _h;                                   // top h: blockIdx is slow (sm-C)
+    const uint32_t tw = w_ * _W + _w;                                   // top w: blockIdx is slow (sm-C)
+
+    if(nn >= args.all_n) return;
+    if(td >= args.top_d) return;
+    if(th >= args.top_h) return;
+    if(tw >= args.top_w) return;
+    if(cc >= args.all_c) return;
 
     auto log_ptr = junk_ptr;
     if(blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && threadIdx.x == 0 &&  threadIdx.y == 0 &&  threadIdx.z == 0)
@@ -182,12 +210,6 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
         log_ptr[idx++] = -7;
     }
 
-    if(nn >= args.all_n) return;
-    if(td >= args.top_d) return;
-    if(th >= args.top_h) return;
-    if(tw >= args.top_w) return;
-
-if(true) {  // TEMPCODE RJS
     const auto int_dstart   = static_cast<int64_t>(td * args.filter_d_stride) - static_cast<int64_t>(args.filter_d_pad);
     const auto dend         = static_cast<size_t>(min(int_dstart + static_cast<int64_t>(args.filter_d), static_cast<int64_t>(args.bot_d)));
     const auto dstart       = static_cast<size_t>(max(int_dstart, 0));
@@ -200,17 +222,15 @@ if(true) {  // TEMPCODE RJS
     const auto wend             = static_cast<uint32_t>(min(int_wstart + static_cast<int>(args.filter_w), static_cast<int>(args.bot_w)));
     const auto wstart           = static_cast<uint32_t>(max(int_wstart, 0));
 
-    uint32_t cc = c_base + threadIdx.x;
-    if(cc >= args.all_c) return;
-
     size_t top_index = 
-            nn * args.top_n_stride             // TEMPCODE RJS
-            + cc * args.top_c_stride           //
-            + (size_t)(td * args.top_d_stride) //
-            + (size_t)(th * args.top_h_stride) //
-            + (size_t)(tw * args.top_w_stride);
+            nn * args.top_n_stride +            //
+            cc * args.top_c_stride +            //
+            td * args.top_d_stride +            //
+            th * args.top_h_stride +            //
+            tw * args.top_w_stride;
+
+#if false
     size_t junk_idx = 64 + 4 * th;
-if(true) {
     if(nn == 0 && cc == 0 && td == 0 && tw < 8 && th == 0)
     {
         size_t bot_ncd = static_cast<size_t>(nn * args.bot_n_stride + cc * args.bot_c_stride + dstart * args.bot_d_stride);
@@ -226,113 +246,101 @@ if(true) {
         junk_ptr[junk_idx++] = wstart;
         junk_ptr[junk_idx++] = wend;
     }
-}
+#endif
 
 #if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE
-        uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
-        pool_size       = (pool_size == 0) ? 1 : pool_size;
+    uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+    pool_size       = (pool_size == 0) ? 1 : pool_size;
 #elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE
-        const uint32_t pool_size = args.filter_d * args.filter_h * args.filter_w;
+    const uint32_t pool_size = args.filter_d * args.filter_h * args.filter_w;
 #endif
 
 #if AVERAGE_OPS
-        _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0);
+    _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0);
 #else // MAX
-        _FLOAT_ACCUM res     = (_FLOAT_ACCUM)(-MAX_VAL_ACCUM);
-        bool found           = false; // May remain false if bot contains only NaNs/-INFs.
-        uint32_t d_save          = 0;
-        uint32_t h_save          = 0;
-        uint32_t w_save          = 0;
-        uint32_t saved_index     = 0;
+    _FLOAT_ACCUM res     = (_FLOAT_ACCUM)(-MAX_VAL_ACCUM);
+    bool found           = false; // May remain false if bot contains only NaNs/-INFs.
+    uint32_t d_save          = 0;
+    uint32_t h_save          = 0;
+    uint32_t w_save          = 0;
+    uint32_t saved_index     = 0;
 #endif
 
-        size_t bot_ncd = static_cast<size_t>(nn * args.bot_n_stride + cc * args.bot_c_stride + dstart * args.bot_d_stride);
-        for(size_t bd = dstart; bd < dend; ++bd)
+    size_t bot_ncd = static_cast<size_t>(nn * args.bot_n_stride + cc * args.bot_c_stride + dstart * args.bot_d_stride);
+    for(size_t bd = dstart; bd < dend; ++bd)
+    {
+        size_t bot_ncdh = bot_ncd + hstart * args.bot_h_stride;
+        for(uint32_t bh = hstart; bh < hend; ++bh)
         {
-            size_t bot_ncdh = bot_ncd + hstart * args.bot_h_stride;
-            for(uint32_t bh = hstart; bh < hend; ++bh)
+            size_t bot_index = bot_ncdh + wstart * args.bot_w_stride;
+            for(uint32_t bw = wstart; bw < wend; ++bw)
             {
-                size_t bot_index = bot_ncdh + wstart * args.bot_w_stride;
-                for(uint32_t bw = wstart; bw < wend; ++bw)
-                {
 #if AVERAGE_OPS
-                    res += static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]);
+                res += static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]);
 #else // MAX
-                    if(static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]) > res)
+                if(static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]) > res)
+                {
+                    res = bot_ptr[bot_index];
+                    if(save_index)
                     {
-                        res = bot_ptr[bot_index];
-                        if(save_index)
-                        {
-                            found  = true;
-                            d_save = bd;
-                            h_save = bh;
-                            w_save = bw;
-                            saved_index = bot_index;
-                        }
+                        found  = true;
+                        d_save = bd;
+                        h_save = bh;
+                        w_save = bw;
+                        saved_index = bot_index;
                     }
-    if(top_index == 1662 || (nn == 0 && cc == 0 && td == 0 && tw == 2 && th == 0))
-    {
-        junk_ptr[junk_idx++] = nn;
-        junk_ptr[junk_idx++] = cc;
-        junk_ptr[junk_idx++] = th;
-        junk_ptr[junk_idx++] = tw;
-        junk_ptr[junk_idx++] = bot_ptr[bot_index];
-        junk_ptr[junk_idx++] = bot_index;
-        junk_ptr[junk_idx++] = res;
-        junk_ptr[junk_idx++] = saved_index;
-    }
-#endif
-                    bot_index += args.bot_w_stride;
                 }
-                bot_ncdh += args.bot_h_stride;
+#endif
+                bot_index += args.bot_w_stride;
             }
-            bot_ncd += args.bot_d_stride;
+            bot_ncdh += args.bot_h_stride;
         }
+        bot_ncd += args.bot_d_stride;
+    }
 
 #if AVERAGE_OPS
-        res *= CVT_FP32_2ACCUM(1.f) / static_cast<_FLOAT_ACCUM>(pool_size);
+    res /= static_cast<_FLOAT_ACCUM>(pool_size);
 #else // MAX
-        if(save_index)
-        {
-            index_t res_index = 5150;
+    if(save_index)
+    {
+        index_t res_index = 5150;
 
-            // / Preventing overflow during computation of res_index:
-            // / If Index is shorter than uint, then let's perform computation in 32-bit
-            // / domain and then convert to narrower Index. That would reduce the probability of
-            // / overflow. If Index is wider then 32 bits, then it seems like it is better to
-            // / convert to Index type before multiplication. However this is not actually
-            // / necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in
-            // / 32 bits and then convert.
+        // / Preventing overflow during computation of res_index:
+        // / If Index is shorter than uint, then let's perform computation in 32-bit
+        // / domain and then convert to narrower Index. That would reduce the probability of
+        // / overflow. If Index is wider then 32 bits, then it seems like it is better to
+        // / convert to Index type before multiplication. However this is not actually
+        // / necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in
+        // / 32 bits and then convert.
 
-            if(found)
-            {
-                if(index_mode == 1) // TEMPCODE RJS
-                    res_index = saved_index;
-                    // res_index = (index_t)(              //
-                    //     nn * args.bot_n_stride          //
-                    //     + cc * args.bot_c_stride        //
-                    //     + d_save * args.bot_d_stride    //
-                    //     + h_save * args.bot_h_stride    //
-                    //     + w_save * args.bot_w_stride);
-                else
-                    res_index = (index_t)(                                                    //
-                        ((d_save - td * args.filter_d_stride + args.filter_d_pad) * args.filter_h * args.filter_w) //
-                        + ((h_save - th * args.filter_h_stride + args.filter_h_pad) * args.filter_w)          //
-                        + (w_save - tw * args.filter_w_stride + args.filter_w_pad)                       //
-                    );
-            }
-
-            const size_t mask_index = nn * args.mask_n_stride             //
-                                        + cc * args.mask_c_stride           //
-                                        + (size_t)(td * args.mask_d_stride) //
-                                        + (size_t)(th * args.mask_h_stride) //
-                                        + (size_t)(tw * args.mask_w_stride);
-            mask_ptr[mask_index] = res_index;
+        if(found)
+        {
+            if(index_mode == 1) // TEMPCODE RJS
+                res_index = saved_index;
+                // res_index = (index_t)(              //
+                //     nn * args.bot_n_stride          //
+                //     + cc * args.bot_c_stride        //
+                //     + d_save * args.bot_d_stride    //
+                //     + h_save * args.bot_h_stride    //
+                //     + w_save * args.bot_w_stride);
+            else
+                res_index = (index_t)(                                                    //
+                    ((d_save - td * args.filter_d_stride + args.filter_d_pad) * args.filter_h * args.filter_w) //
+                    + ((h_save - th * args.filter_h_stride + args.filter_h_pad) * args.filter_w)          //
+                    + (w_save - tw * args.filter_w_stride + args.filter_w_pad)                       //
+                );
         }
+
+        const size_t mask_index = nn * args.mask_n_stride             //
+                                    + cc * args.mask_c_stride           //
+                                    + (size_t)(td * args.mask_d_stride) //
+                                    + (size_t)(th * args.mask_h_stride) //
+                                    + (size_t)(tw * args.mask_w_stride);
+        mask_ptr[mask_index] = res_index;
+    }
 #endif
 
-        top_ptr[top_index] = (_FLOAT)res;
-} // TEMPCODE
+    top_ptr[top_index] = (_FLOAT)res;
 }
 
 extern "C" __global__ void mloPoolingForwardNDNhwcNaive(
diff --git a/src/ocl/pooling_ocl.cpp b/src/ocl/pooling_ocl.cpp
index 1b22cd91b7..9a7e166081 100644
--- a/src/ocl/pooling_ocl.cpp
+++ b/src/ocl/pooling_ocl.cpp
@@ -95,7 +95,7 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle,
     auto index_max = get_index_max(GetIndexType());
 
     /// \anchor max_pooling_index_max_restriction
-    /// For kernel implementation max pooling backward pass,
+    /// For kernel implementation max pooling forward pass,
     /// "index_max" means ghost, and thus should not be reached.
     if(mode == miopenPoolingMax && save_index)
     {
@@ -108,13 +108,13 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle,
                                             1,
                                             std::multiplies<int>())))
         {
-            MIOPEN_THROW("Index range not enough for max pooling bwd");
+            MIOPEN_THROW("Index range not enough for max pooling fwd");
         }
 
         if(workSpace == nullptr)
         {
             throw std::invalid_argument("workSpace cannot be NULL in Forward Pooling MAX mode when "
-                                        "backward pass is requested");
+                                        "forward pass is requested");
         }
     }
 
diff --git a/src/pooling/problem_description.cpp b/src/pooling/problem_description.cpp
index 804bc2bf84..7807d5425b 100644
--- a/src/pooling/problem_description.cpp
+++ b/src/pooling/problem_description.cpp
@@ -83,8 +83,8 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const
         ss << "_dyd" << get_vect_config(dyDesc.GetLengths());
         ss << "_dys" << get_vect_config(dyDesc.GetStrides());
     }   // TEMPCODE RJS
-    std::cout << "\n************** xDesc layout: " << xDesc.GetLayout_str() << (xDesc.IsDefaultLayout() ? "" : " (not default)") << " *************************" << std::endl;
-    ss << "_l" << (xDesc.IsDefaultLayout() ? 0 : 1);
+    std::cout << "\n************** xDesc layout: " << xDesc.GetLayout_str() << " (" << static_cast<int>(xDesc.GetLayout_t()) << (xDesc.IsDefaultLayout() ? ")" : " not default)") << " *************************" << std::endl;
+    ss << "_l" << static_cast<int>(xDesc.GetLayout_t());
 std::cout << "               " << ss.str() << std::endl;
     return NetworkConfig{ss.str()};
 }
diff --git a/src/solver/pooling/forward2d.cpp b/src/solver/pooling/forward2d.cpp
index f1e1cd6e52..aa8fdb2f6b 100644
--- a/src/solver/pooling/forward2d.cpp
+++ b/src/solver/pooling/forward2d.cpp
@@ -135,19 +135,26 @@ std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& pro
 bool PoolingForward2d::IsApplicable(const ExecutionContext& context,
                                     const miopen::pooling::ProblemDescription& problem) const
 {
+    auto x_type = problem.GetXDesc().GetType();
+    auto y_type = problem.GetYDesc().GetType();
+    std::vector<miopenDataType_t> types {miopenFloat, miopenHalf};    // TEMPCODE RJS fix types , miopenInt8, miopenFloat8
+
+    auto x_layout = problem.GetXDesc().GetLayout_str();
+    auto y_layout = problem.GetYDesc().GetLayout_str();
+    std::vector<std::string> layouts {"NCHW"};
+
     bool app =
-    problem.GetDirection() == miopen::pooling::Direction::Forward &&
-           problem.GetXDesc().GetNumDims() == 4 &&
-           problem.GetXDesc().GetType() == problem.GetYDesc().GetType() &&
-           (problem.GetXDesc().GetType() == miopenFloat ||
-            problem.GetXDesc().GetType() == miopenHalf) &&
-           problem.GetXDesc().GetLayout("NCHW") == "NCHW" &&
-           problem.GetYDesc().GetLayout("NCHW") == "NCHW" &&
+        problem.GetDirection() == miopen::pooling::Direction::Forward &&
+            problem.GetXDesc().GetNumDims() == 4 &&
+            (x_type == y_type) &&                                                    //
+            (x_layout == y_layout) &&                                                //
+            (std::find(types.cbegin(), types.cend(), x_type) != types.cend()) &&    //
+            (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()) && //
            sizeof_private_memory(problem) <=
                TargetProperties::GetMaxWaveScratchSize() / context.GetStream().GetWavefrontWidth();
 
 // TEMPCODE RJS
-    std::cout << "%%%%%%%%%% PoolingForward2d::IsApplicable: " << app << " " <<  problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout("NCHW") << std::endl;
+    std::cout << "%%%%%%%%%% PoolingForward2d::IsApplicable: " << app << " " <<  problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout(x_layout) << std::endl;
                return app;
 }
 
diff --git a/src/solver/pooling/forwardNdNhwcNaive.cpp b/src/solver/pooling/forwardNdNhwcNaive.cpp
index 62b65249df..6e28b98645 100644
--- a/src/solver/pooling/forwardNdNhwcNaive.cpp
+++ b/src/solver/pooling/forwardNdNhwcNaive.cpp
@@ -72,7 +72,7 @@ bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&,
 {
     auto x_type = problem.GetXDesc().GetType();
     auto y_type = problem.GetYDesc().GetType();
-    std::vector<miopenDataType_t> types {miopenFloat, miopenHalf};
+    std::vector<miopenDataType_t> types {miopenFloat, miopenHalf, miopenInt8, miopenFloat8}; // , miopenBFloat16
 
     auto mode = problem.GetPooling().GetMode();
     std::vector<miopenPoolingMode_t> modes {miopenPoolingMax, miopenPoolingAverage, miopenPoolingAverageInclusive};
@@ -88,12 +88,15 @@ bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&,
         && (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend())          //)
         && (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end());
 
-    std::cout << "%%%%%%%%%% PoolingForwardNDNhwcNaive::IsApplicable: " << app << " " <<  problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout("NCHW")
-     << "  " << problem.GetYDesc().GetLayout_str() << "->" << problem.GetYDesc().GetLayout("NCHW")
+    // TODO RJS check grid size
+
+    std::cout << "%%%%%%%%%% PoolingForwardNDNhwcNaive::IsApplicable: " << app << " " <<  problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout(x_layout)
+     << "  " << problem.GetYDesc().GetLayout_str() << "->" << problem.GetYDesc().GetLayout(y_layout)
        << "  "  << (problem.GetDirection() == miopen::pooling::Direction::Forward)
         << (x_type == y_type)
         << (x_layout == y_layout) << (std::find(types.cbegin(), types.cend(), x_type) != types.cend())
         << (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend()) << (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()) << std::endl;
+
     return app;
 }
 
@@ -141,29 +144,16 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context,
     args.filter_w        = lengths[idx++];
 
     idx = 0;
-     args.filter_d_stride = is2d ? (strides[0]) : strides[idx++];
-     args.filter_h_stride = strides[idx++];
-     args.filter_w_stride = strides[idx++];
+    args.filter_d_stride = is2d ? (strides[0]) : strides[idx++];
+    args.filter_h_stride = strides[idx++];
+    args.filter_w_stride = strides[idx++];
 
     idx = 0;
     args.filter_d_pad    = is2d ? 0 : pads[idx++];
     args.filter_h_pad    = pads[idx++];
     args.filter_w_pad    = pads[idx++];
-    // uint32_t idx = 0;
-    // const uint32_t filter_d        = is2d ? 1 : lengths[idx++];
-    // const uint32_t filter_h        = lengths[idx++];
-    // const uint32_t filter_w        = lengths[idx++];
-
-    // idx = 0;
-    // const uint32_t filter_d_stride = is2d ? (strides[0]) : strides[idx++];
-    // const uint32_t filter_h_stride = strides[idx++];
-    // const uint32_t filter_w_stride = strides[idx++];
-
-    // idx = 0;
-    // const uint32_t filter_d_pad    = is2d ? 0 : pads[idx++];
-    // const uint32_t filter_h_pad    = pads[idx++];
-    // const uint32_t filter_w_pad    = pads[idx++];
 
+    // TODO RJS move pooling_method to shared code
     const int pooling_method = (pooling.GetMode() == miopenPoolingMax) ? MLO_POOLING_OP_MAX
                                : (pooling.GetMode() == miopenPoolingAverage)
                                    ? MLO_POOLING_OP_AVE
@@ -199,24 +189,16 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context,
 
     const auto spatial_dim = is2d ? 2U : 3U;
 
-    // uint32_t all_n, all_c, bot_d, bot_h, bot_w;
     std::tie(args.all_n, args.all_c, args.bot_d, args.bot_h, args.bot_w) = miopen::GetNCDHW(spatial_dim, bot.GetLengths());
-std::cout << "GetSol: bot_lens " << args.all_n << " " << args.all_c << " " << args.bot_d << " " << args.bot_h << " " << args.bot_w << std::endl;
 
     std::tie(args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride) =
         miopen::GetNCDHW(spatial_dim, bot.GetStrides());
-std::cout << "GetSol: bot_strides " << args.bot_n_stride << " " << args.bot_c_stride << " " << args.bot_d_stride
-<< " " << args.bot_h_stride << " " << args.bot_w_stride  << std::endl;
 
     std::tie(std::ignore, std::ignore, args.top_d, args.top_h, args.top_w) =
         miopen::GetNCDHW(spatial_dim, top.GetLengths());
-std::cout << "GetSol: top_lens " << args.top_d << " " << args.top_h << " " << args.top_w << std::endl;
 
     std::tie(args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride,args. top_w_stride) =
         miopen::GetNCDHW(spatial_dim, top.GetStrides());
-    // TEMPCODE RJS
-std::cout << "GetSol: top_strides " << args.top_n_stride << " " << args.top_c_stride << " "
-    << args.top_d_stride << " " << args.top_h_stride << " " << args.top_w_stride << std::endl;
 
     // Mask data is always NCDHW layout
     args.mask_w_stride = 1;
@@ -236,12 +218,14 @@ std::cout << "GetSol: top_strides " << args.top_n_stride << " " << args.top_c_st
     ///   Currently this limitation is valid for both ROCm HIP and OCL runtimes.
     ///
     /// Selecting the optimal workgroup size is an interesting problem.
-    /// We'll first map c into the workgroup up to the maximum 1024 items. For large C, the
-    /// extra are mapped into the grid dimensions.
-    /// For small C, w and h are mapped into the workgroup dimensions as needed, in that
-    /// order, up to a maximum of 128 workitems (favoring more active blocks over more threads).
-    /// We do permit a partial workgroup when it is not an exact multiple of the wavefront size.
-    /// As said above, remaining H and W are mapped onto the grid dimensions.
+    /// We'll first map N * D to blockIdx.x. H and W are canonically mapped into
+    /// blockIdx.y and z, respectively. C, being the fastest index, is mapped
+    /// into threadIdx.x up to the maximum items. For larger C, the remainder are
+    /// mapped into blockIdx.z.
+    ///
+    /// For small C, we favor more waves over more blocks. W/H are mapped into threadIdx.z/y,
+    /// in that order, fractionally in powers of 2 if possible, up to a maximum
+    /// of 256 workitems. Finally, any remaining W/H are then mapped onto blockIdx.z/y.
     ///
     /// The workgroup size does not have the restrictions imposed by synchronization between
     /// workitems because the kernel does not require synchronization.
@@ -249,14 +233,14 @@ std::cout << "GetSol: top_strides " << args.top_n_stride << " " << args.top_c_st
     std::ignore = context;
     constexpr uint32_t MAX_THREADS       = 512;
     constexpr uint32_t LARGE_C_MAX_ITEMS = MAX_THREADS;
-    constexpr uint32_t SMALL_C_MAX_ITEMS = 128;
+    constexpr uint32_t SMALL_C_TGT_ITEMS = 256;
 
     auto nd_ = args.all_n * args.top_d;
     auto h_  = args.top_h;
     auto w_  = args.top_w;
     auto c_  = args.all_c;
-std::cout << "nd_ " << nd_ << " h_ " << h_ << " w_ " << w_ << " c_ " << c_ << std::endl;
 
+    // These are hip-style indexes (not OCL)
     uint32_t l1 = 1U;
     uint32_t l2 = 1U;
 
@@ -266,22 +250,30 @@ std::cout << "nd_ " << nd_ << " h_ " << h_ << " w_ " << w_ << " c_ " << c_ << st
         c_ = LARGE_C_MAX_ITEMS;
         w_ *= c2;
     }
-    // else if(c_ <= SMALL_C_MAX_ITEMS / 2)
-    // {
-    //     if(c_ * w_ <= MAX_THREADS)
-    //     {
-    //         std::swap(l2, w_);
-    //
-    //         if(c_ * w_ * h_ <= MAX_THREADS)
-    //         {
-    //             std::swap(l1, h_);
-    //         }
-    //     }
-    //     else if(c_ * h_ <= MAX_THREADS)
-    //     {
-    //         std::swap(l1, h_);
-    //     }
-    // }
+    else if(c_ <= SMALL_C_TGT_ITEMS / 2)    // Small C, remap H and W to increase occupancy
+    {
+        if(c_ * w_ < SMALL_C_TGT_ITEMS)
+        {
+            std::swap(l2, w_);              // full w mapped to threads
+        }
+
+        while(w_ > 2 && ((c_ * l2) < SMALL_C_TGT_ITEMS))
+        {
+            w_ = (w_ + 1) / 2;              // partial w mapped to threads (rounddown-safe)
+            l2 *= 2;
+        }
+
+        if(c_ * l2 * h_ < SMALL_C_TGT_ITEMS)
+        {
+            std::swap(l1, h_);              // full h mapped to threads
+        }
+
+        while(h_ > 2 && ((c_ * l1 * l2) < SMALL_C_TGT_ITEMS))
+        {
+            h_ = (h_ + 1 ) / 2;             // partial h mapped to threads (rounddown-safe)
+            l1 *= 2;
+        }
+    }
 
     const auto g0 = nd_;
     const auto g1 = h_;
@@ -313,8 +305,7 @@ std::cout << "nd_ " << nd_ << " h_ " << h_ << " w_ " << w_ << " c_ " << c_ << st
         // * 2: layout (NCHW vs NHWC)
         // * 2: 2D and 3D kernels (optimization)
 
-        // l1 = 11;
-        // l2 = 11;
+        // KernelInfo uses OCL-style indexes
         kernel.l_wk.clear();
         kernel.l_wk.push_back(l0);
         kernel.l_wk.push_back(l1);
diff --git a/test/gtest/poolingFwdNdNaive.cpp b/test/gtest/poolingFwd2dNaive.cpp
similarity index 66%
rename from test/gtest/poolingFwdNdNaive.cpp
rename to test/gtest/poolingFwd2dNaive.cpp
index 3a27b3c786..45a51525e6 100644
--- a/test/gtest/poolingFwdNdNaive.cpp
+++ b/test/gtest/poolingFwd2dNaive.cpp
@@ -114,9 +114,12 @@ struct layout_data
 
 }
 
-class PoolingFwd : public testing::TestWithParam<std::vector<std::string>> {};
-class PoolingFwdFloat : public PoolingFwd {};
-class PoolingFwdHalf : public PoolingFwd {};
+class PoolingFwd2d : public testing::TestWithParam<std::vector<std::string>> {};
+class PoolingFwd2dInt8 : public PoolingFwd2d {};
+class PoolingFwd2dFloat : public PoolingFwd2d {};
+class PoolingFwd2dHalf : public PoolingFwd2d {};
+class PoolingFwd2dBF16 : public PoolingFwd2d {};
+class PoolingFwd2dF8 : public PoolingFwd2d {};
 
 void Run2dDriver(miopenDataType_t prec);
 
@@ -135,13 +138,15 @@ void GetArgs(const std::string& param, std::vector<std::string>& tokens)
 
 bool IsTestSupportedForDevice(const miopen::Handle& handle) { return true; }
 
-std::vector<std::string> GetTestCases(const std::string precision)
+std::vector<std::string> Get2dTestCases(const std::string precision)
 {
     const auto& flag_arg = env::value(MIOPEN_TEST_FLAGS_ARGS);
 
     const std::vector<std::string> test_cases = {
         // clang-format off
-    {"test_pooling2d " + precision + " --all --dataset 0 --limit 0 " + flag_arg}    // TEMPCODE RJS DATASET
+    {"test_pooling2d " + precision + " --all --dataset 0 --limit 0 " + flag_arg},   // TEMPCODE RJS DATASET
+    // {"test_pooling2d " + precision + " --all --dataset 1 --limit 0 " + flag_arg},   // TEMPCODE RJS DATASET
+    // {"test_pooling2d " + precision + " --all --dataset 2 --limit 0 " + flag_arg}    // TEMPCODE RJS DATASET
         // clang-format on
     };
 
@@ -150,68 +155,83 @@ std::vector<std::string> GetTestCases(const std::string precision)
 } // namespace pooling_tests
 // using namespace pooling_tests;
 
-TEST_P(PoolingFwdFloat, NNT)    // NDNaiveTranspose
+TEST_P(PoolingFwd2dInt8, NNT)
+{
+    if(!IsTestRunWith("--int8"))           std::cout << "WOULD SKIP BECAUSE NOT INT8!" << std::endl;
+
+    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--int8") TEMPCODE RJS
+        GTEST_SKIP();
+
+    Run2dDriver(miopenInt8);
+};
+
+TEST_P(PoolingFwd2dFloat, NNT)
 {
-    const auto& handle = get_handle();
-    if(!IsTestSupportedForDevice(handle))   std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl;
-    if(SkipTest())                          std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl;
     if(!IsTestRunWith("--float"))           std::cout << "WOULD SKIP BECAUSE NOT FLOAT!" << std::endl;
-        // Run2dDriver(miopenFloat);   return; // TEMPCODE RJS
-    //  && IsTestRunWith("--float")
-    if(IsTestSupportedForDevice(handle) && !SkipTest())
-    {
-        Run2dDriver(miopenFloat);
-    }
-    else
-    {
+
+    if(SkipTest() || !IsTestSupportedForDevice(get_handle()))
         GTEST_SKIP();
-    }
+
+    Run2dDriver(miopenFloat);
 };
 
-TEST_P(PoolingFwdHalf, NNT)
+TEST_P(PoolingFwd2dHalf, NNT)
 {
-    const auto& handle = get_handle();
-    if(!IsTestSupportedForDevice(handle))   std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl;
-    if(SkipTest())                          std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl;
     if(!IsTestRunWith("--half"))           std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl;
 
-    if(IsTestSupportedForDevice(handle) && !SkipTest()) //  && IsTestRunWith("--half") TEMPCODE RJS
-    {
-        Run2dDriver(miopenHalf);
-    }
-    else
-    {
+    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--half") TEMPCODE RJS
         GTEST_SKIP();
-    }
+
+    Run2dDriver(miopenHalf);
+};
+
+TEST_P(PoolingFwd2dBF16, NNT)
+{
+    if(!IsTestRunWith("--bfloat16"))           std::cout << "WOULD SKIP BECAUSE NOT BFLOAT16!" << std::endl;
+
+    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--bfloat16") TEMPCODE RJS
+        GTEST_SKIP();
+
+    Run2dDriver(miopenBFloat16);
+};
+
+TEST_P(PoolingFwd2dF8, NNT)
+{
+    if(!IsTestRunWith("--float8"))           std::cout << "WOULD SKIP BECAUSE NOT FLOAT8!" << std::endl;
+
+    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--float8") TEMPCODE RJS
+        GTEST_SKIP();
+
+    Run2dDriver(miopenFloat8);
 };
 
 void Run2dDriver(miopenDataType_t prec)
 {
-    auto cases = GetTestCases("--float");
-       std::cerr << " Cases: " << cases.size() << std::endl;
-    for(const auto& test_value : cases)
-    {
-        std::cerr << "      : " << test_value << std::endl;    // TEMPCODE RJS
-    }
+    auto cases = Get2dTestCases("--float");
+    // std::cerr << " Cases: " << cases.size() << std::endl;    // TEMPCODE RJS
+    // for(const auto& test_value : cases)
+    // {
+    //     std::cerr << "      : " << test_value << std::endl;    // TEMPCODE RJS
+    // }
  
     std::vector<std::string> params;
     switch(prec)
     {
-    case miopenFloat: params = PoolingFwdFloat_NNT_Test::GetParam(); break;
-    case miopenHalf: params = PoolingFwdHalf_NNT_Test::GetParam(); break;
-    case miopenBFloat16:
-    case miopenInt8:
+    case miopenFloat: params = PoolingFwd2dFloat_NNT_Test::GetParam(); break;
+    case miopenHalf: params = PoolingFwd2dHalf_NNT_Test::GetParam(); break;
+    case miopenBFloat16: params = PoolingFwd2dBF16_NNT_Test::GetParam(); break;
+    case miopenInt8: params = PoolingFwd2dInt8_NNT_Test::GetParam(); break;
+    case miopenFloat8: params = PoolingFwd2dF8_NNT_Test::GetParam(); break;
     case miopenInt32:
     case miopenDouble:
-    case miopenFloat8:
     case miopenBFloat8:
     case miopenInt64:
         FAIL()
             << "miopenBFloat16, miopenInt8, miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8, miopenInt64 "
                "data type not supported by "
-               "poolingFwdNdNaive test";
+               "poolingFwd2dNaive test";
 
-    default: params = PoolingFwdFloat_NNT_Test::GetParam();
+    default: params = PoolingFwd2dFloat_NNT_Test::GetParam();
     }
 
     std::cerr << "Params: " << params.size() << std::endl;
@@ -237,8 +257,12 @@ void Run2dDriver(miopenDataType_t prec)
     }
 }
 
-INSTANTIATE_TEST_SUITE_P(Float, PoolingFwdFloat, testing::Values(GetTestCases("--float")));
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwdHalf);
-INSTANTIATE_TEST_SUITE_P(Half, PoolingFwdHalf, testing::Values(GetTestCases("--half")));
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dBF16);
+
+//INSTANTIATE_TEST_SUITE_P(BF16, PoolingFwd2dBF16, testing::Values(Get2dTestCases("--bfloat16")));
+INSTANTIATE_TEST_SUITE_P(Int8, PoolingFwd2dInt8, testing::Values(Get2dTestCases("--int8")));
+INSTANTIATE_TEST_SUITE_P(Float, PoolingFwd2dFloat, testing::Values(Get2dTestCases("--float")));
+INSTANTIATE_TEST_SUITE_P(Half, PoolingFwd2dHalf, testing::Values(Get2dTestCases("--half")));
+INSTANTIATE_TEST_SUITE_P(F8, PoolingFwd2dF8, testing::Values(Get2dTestCases("--float8")));
 
 #endif
diff --git a/test/gtest/poolingFwd3dNaive.cpp b/test/gtest/poolingFwd3dNaive.cpp
new file mode 100644
index 0000000000..a32432f1e6
--- /dev/null
+++ b/test/gtest/poolingFwd3dNaive.cpp
@@ -0,0 +1,283 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef POOLING_GTEST_BUILD
+
+#include <gtest/gtest.h>
+#include <miopen/env.hpp>
+#include "get_handle.hpp"
+#include "test_env.hpp"
+
+#include "pooling_testing.hpp"
+#include "pooling3d.hpp"
+
+#include "tensor_holder.hpp"
+#include "miopen/tensor_layout.hpp"
+
+MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL)
+MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLAGS_ARGS)
+
+namespace env = miopen::env;
+
+namespace {
+
+template <typename T>
+struct layout_data
+{
+    static std::vector<int> get_layout_lengths(int n, int c, std::vector<int>& dims)
+    {
+        auto ret = std::vector<int>{n, c};
+        ret.insert(ret.end(), dims.cbegin(), dims.cend());
+
+        return ret;
+    }
+
+    static std::vector<int>
+    get_strides(std::vector<int>& lens, int dims, miopenTensorLayout_t tensor_layout)
+    {
+        std::vector<int> strides;
+        std::string layout_default = miopen::tensor_layout_get_default(dims + 2);
+        std::string layout_string  = miopen::TensorDescriptor::GetLayoutStr(tensor_layout);
+
+std::cout << "get_strides: dims=" << dims << " lens=" << lens.size() << " " << layout_default << " " << layout_string << std::endl;
+        miopen::tensor_layout_to_strides(lens, layout_default, layout_string, strides);
+
+        constexpr int min_stride_multiplier = 1;
+        constexpr int max_stride_multiplier = 4;
+
+        auto c = prng::gen_A_to_B(min_stride_multiplier, max_stride_multiplier);
+        for(auto& v : strides)
+        {
+            // cppcheck-suppress useStlAlgorithm
+            v = v * c;
+        }
+
+        return strides;
+    }
+
+    static miopenTensorDescriptor_t init_tensor_descriptor(miopenDataType_t type,
+                                                           const std::vector<int>& lens,
+                                                           const std::vector<int>& strides)
+    {
+        miopenTensorDescriptor_t desc;
+
+        EXPECT_TRUE(miopenCreateTensorDescriptor(&desc) == miopenStatusSuccess);
+        EXPECT_TRUE(
+            miopenSetTensorDescriptor(desc, type, lens.size(), lens.data(), strides.data()) ==
+            miopenStatusSuccess);
+
+        return desc;
+    }
+
+    layout_data(int _n, std::vector<int> _dims, int _c, miopenTensorLayout_t _tensor_layout)
+    {
+        auto lens    = get_layout_lengths(_n, _c, _dims);
+        auto strides = get_strides(lens, _dims.size(), _tensor_layout);
+        descriptor   = miopen::TensorDescriptor{miopen_type<T>{}, lens, strides};
+        host         = tensor<T>{lens, strides}.generate(gen_value<T>);
+    }
+
+    ~layout_data() {}
+
+    void read_gpu_data(miopen::Handle& handle, const miopen::Allocator::ManageDataPtr& ddata)
+    {
+        check      = tensor<T>{descriptor.GetLengths(), descriptor.GetStrides()};
+        handle.ReadTo(check.data.data(), ddata, check.data.size());
+    }
+
+    tensor<T> check{};
+    tensor<T> host;
+    miopen::TensorDescriptor descriptor;
+};
+
+}
+
+class PoolingFwd3d : public testing::TestWithParam<std::vector<std::string>> {};
+class PoolingFwd3dFloat : public PoolingFwd3d {};
+class PoolingFwd3dHalf : public PoolingFwd3d {};
+class PoolingFwd3dBF16 : public PoolingFwd3d {};
+class PoolingFwd3dInt8 : public PoolingFwd3d {};
+class PoolingFwd3dF8 : public PoolingFwd3d {};
+
+void Run3dDriver(miopenDataType_t prec);
+
+namespace {
+
+static bool SkipTest(void) { return env::disabled(MIOPEN_TEST_ALL); }
+
+void GetArgs(const std::string& param, std::vector<std::string>& tokens)
+{
+    std::stringstream ss(param);
+    std::istream_iterator<std::string> begin(ss);
+    std::istream_iterator<std::string> end;
+    while(begin != end)
+        tokens.push_back(*begin++);
+}
+
+bool IsTestSupportedForDevice(const miopen::Handle& handle) { return true; }
+
+std::vector<std::string> Get3dTestCases(const std::string precision)
+{
+    const auto& flag_arg = env::value(MIOPEN_TEST_FLAGS_ARGS);
+
+    const std::vector<std::string> test_cases = {
+        // clang-format off
+    {"test_pooling3d " + precision + " --all --dataset 0 --limit 0 " + flag_arg},    // TEMPCODE RJS DATASET
+    // {"test_pooling3d " + precision + " --all --dataset 1 --limit 0 " + flag_arg},  // TEMPCODE RJS DATASET
+    // {"test_pooling3d " + precision + " --all --dataset 2 --limit 0 " + flag_arg}    // TEMPCODE RJS DATASET
+        // clang-format on
+    };
+
+    return test_cases;
+}
+} // namespace pooling_tests
+// using namespace pooling_tests;
+
+TEST_P(PoolingFwd3dFloat, NNT)    // NDNaiveTranspose
+{
+    const auto& handle = get_handle();
+    if(!IsTestSupportedForDevice(handle))   std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl;
+    if(SkipTest())                          std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl;
+    if(!IsTestRunWith("--float"))           std::cout << "WOULD SKIP BECAUSE NOT FLOAT!" << std::endl;
+        // Run3dDriver(miopenFloat);   return; // TEMPCODE RJS
+    //  && IsTestRunWith("--float")
+    if(IsTestSupportedForDevice(handle) && !SkipTest())
+    {
+        Run3dDriver(miopenFloat);
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+TEST_P(PoolingFwd3dHalf, NNT)
+{
+    const auto& handle = get_handle();
+    if(!IsTestSupportedForDevice(handle))   std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl;
+    if(SkipTest())                          std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl;
+    if(!IsTestRunWith("--half"))           std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl;
+
+    if(IsTestSupportedForDevice(handle) && !SkipTest()) //  && IsTestRunWith("--half") TEMPCODE RJS
+    {
+        Run3dDriver(miopenHalf);
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+TEST_P(PoolingFwd3dBF16, NNT)
+{
+    if(!IsTestRunWith("--bfloat16"))           std::cout << "WOULD SKIP BECAUSE NOT BFLOAT16!" << std::endl;
+
+    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--bfloat16") TEMPCODE RJS
+        GTEST_SKIP();
+
+    Run3dDriver(miopenBFloat16);
+};
+
+TEST_P(PoolingFwd3dInt8, NNT)
+{
+    if(!IsTestRunWith("--int8"))           std::cout << "WOULD SKIP BECAUSE NOT INT8!" << std::endl;
+
+    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--int8") TEMPCODE RJS
+        GTEST_SKIP();
+
+    Run3dDriver(miopenInt8);
+};
+
+TEST_P(PoolingFwd3dF8, NNT)
+{
+    if(!IsTestRunWith("--float8"))           std::cout << "WOULD SKIP BECAUSE NOT FLOAT8!" << std::endl;
+
+    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--float8") TEMPCODE RJS
+        GTEST_SKIP();
+
+    Run3dDriver(miopenFloat8);
+};
+
+void Run3dDriver(miopenDataType_t prec)
+{
+    auto cases = Get3dTestCases("--float");
+       std::cerr << " Cases: " << cases.size() << std::endl;
+    for(const auto& test_value : cases)
+    {
+        std::cerr << "      : " << test_value << std::endl;    // TEMPCODE RJS
+    }
+ 
+    std::vector<std::string> params;
+    switch(prec)
+    {
+    case miopenFloat: params = PoolingFwd3dFloat_NNT_Test::GetParam(); break;
+    case miopenHalf: params = PoolingFwd3dHalf_NNT_Test::GetParam(); break;
+    case miopenBFloat16: params = PoolingFwd3dBF16_NNT_Test::GetParam(); break;
+    case miopenInt8: params = PoolingFwd3dInt8_NNT_Test::GetParam(); break;
+    case miopenFloat8: params = PoolingFwd3dF8_NNT_Test::GetParam(); break;
+    case miopenInt32:
+    case miopenDouble:
+    case miopenBFloat8:
+    case miopenInt64:
+        FAIL()
+            << "miopenBFloat16, miopenInt8, miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8, miopenInt64 "
+               "data type not supported by "
+               "poolingFwdNdNaive test";
+
+    default: params = PoolingFwd3dFloat_NNT_Test::GetParam();
+    }
+
+    for(const auto& test_value : params)
+    {
+        std::cerr << "Running Test: " << test_value << std::endl;    // TEMPCODE RJS
+        std::vector<std::string> tokens;
+        GetArgs(test_value, tokens);
+        std::vector<const char*> ptrs;
+
+        std::transform(tokens.begin(), tokens.end(), std::back_inserter(ptrs), [](const auto& str) {
+            return str.data();
+        });
+
+        testing::internal::CaptureStderr();
+        test_drive<pooling3d_driver>(ptrs.size(), ptrs.data());
+        auto capture = testing::internal::GetCapturedStderr();
+        std::cout << capture;
+    }
+}
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dBF16);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dFloat);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dHalf);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dInt8);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dF8);
+
+// INSTANTIATE_TEST_SUITE_P(BF16, PoolingFwd3dBF16, testing::Values(Get3dTestCases("--bfloat16")));
+INSTANTIATE_TEST_SUITE_P(Float, PoolingFwd3dFloat, testing::Values(Get3dTestCases("--float")));
+INSTANTIATE_TEST_SUITE_P(Half, PoolingFwd3dHalf, testing::Values(Get3dTestCases("--half")));
+INSTANTIATE_TEST_SUITE_P(Int8, PoolingFwd3dInt8, testing::Values(Get3dTestCases("--int8")));
+INSTANTIATE_TEST_SUITE_P(F8, PoolingFwd3dF8, testing::Values(Get3dTestCases("--float8")));
+
+#endif
diff --git a/test/gtest/pooling_testing.hpp b/test/gtest/pooling_testing.hpp
index f114be8764..990759c620 100644
--- a/test/gtest/pooling_testing.hpp
+++ b/test/gtest/pooling_testing.hpp
@@ -29,6 +29,18 @@
 #ifndef GUARD_MIOPEN_TEST_POOLING_COMMON_HPP
 #define GUARD_MIOPEN_TEST_POOLING_COMMON_HPP
 
+#define DATASET "0"
+
+#include <chrono>
+#include <iomanip>
+namespace {using sc = std::chrono::steady_clock;}
+#undef tomillis
+#define tomillis(__DUR) (0.001 * std::chrono::duration_cast<std::chrono::microseconds>(__DUR).count())
+#undef coutms
+#define coutms(__TOK, __TP) (std::cout << "ms[" << std::setw(16) << __TOK << "]: "          \
+    << std::setw(12) << std::fixed << std::setprecision(3) << tomillis(sc::now() - __TP) << std::endl)
+
+
 #include <gtest/gtest.h>
 #include <array>
 #include <iostream>
@@ -36,6 +48,7 @@
 #include <strstream>
 #include <limits>
 #include <memory>
+#include <miopen/datatype.hpp>
 #include <miopen/logger.hpp>
 #include <miopen/miopen.h>
 #include <miopen/pooling.hpp>
@@ -68,7 +81,7 @@ constexpr int RAND_INTEGER_MIN = -8800;
 
 template <typename T>
 auto gen_value =
-    [](auto... is) { return static_cast<T>(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX)) / 100; };
+    [](auto... is) { return static_cast<T>(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX) / 100); };
 }
 
 static inline void print(std::ostringstream& oss, const miopen::PoolingDescriptor& filter, bool is_default_layout)
@@ -97,7 +110,7 @@ tensor<T> get_output_tensor(const miopen::PoolingDescriptor& filter, const tenso
 }
 
 template <class T>
-tensor<T> get_big_output_tensor(const miopen::PoolingDescriptor& filter, const tensor<T>& input)
+tensor<float> get_big_output_tensor(const miopen::PoolingDescriptor& filter, const tensor<T>& input)
 {
     auto desc = filter.GetForwardOutputTensor(input.desc);
     auto lens = desc.GetLengths();
@@ -105,8 +118,8 @@ tensor<T> get_big_output_tensor(const miopen::PoolingDescriptor& filter, const t
         lens[0] *= 2;
     else
         lens[0] *= 10;
-    auto big = miopen::TensorDescriptor{desc.GetType(), input.desc.GetLayout_t(), lens, desc.GetStrides()};
-    return tensor<T>{big};
+    auto big = miopen::TensorDescriptor{miopenFloat, input.desc.GetLayout_t(), lens, desc.GetStrides()};
+    return tensor<float>{big};
 }
 
 template <class T>
@@ -150,7 +163,7 @@ struct pooling_operators
 #define MAX_PRINTING 128    // TEMPCODE RJS
 #define MAX_PRINT 12    // TEMPCODE RJS
 #define MAX_NCD 2
-#define GPU_JUNK 160
+#define GPU_JUNK 0 // 160
 #define PRINT_CPU_IN 0
 #define PRINT_GPU_OUT 0
 #define GPU_4COL false
@@ -194,12 +207,17 @@ namespace {
                     oss << "n= " << nn << " c= " << cc << " d= " << dd << std::endl;
                     for(int hh = 0; hh < olen[3]; ++hh) {
                         for(int ww = 0; ww < olen[4]; ++ww) {
+                            auto index = nn * ostr[0] + cc * ostr[1] + dd * ostr[2] + hh * ostr[3] + ww * ostr[4];
 switch(mode) {
     case 0:
-                            oss << std::setw(11) << std::setprecision(5) << out[nn * ostr[0] + cc * ostr[1] + dd * ostr[2] + hh * ostr[3] + ww * ostr[4]] << "  ";
+    if(std::is_same<T, char>::value || std::is_same<T, int8_t>::value) {
+                            oss << std::setw(11) << std::setprecision(5) << (int16_t)out[index] << "  ";
+    } else {
+                            oss << std::setw(11) << std::setprecision(5) << out[index] << "  ";
+    }
                             break;
     case 1:
-                            oss << std::setw(11) << std::setprecision(5) << nn * ostr[0] + cc * ostr[1] + dd * ostr[2] + hh * ostr[3] + ww * ostr[4] << "  ";
+                            oss << std::setw(11) << std::setprecision(5) << index << "  ";
                             break;
 }    
                         }
@@ -225,6 +243,7 @@ struct verify_forward_pooling
     tensor<T>
     cpu(const tensor<T>& input, const miopen::PoolingDescriptor& filter, std::vector<Index>&) const
     {
+auto st = sc::now();
         auto out = get_output_tensor(filter, input);
 
         std::array<int, SptDim> in_dim{};
@@ -291,7 +310,7 @@ struct verify_forward_pooling
             out(o, w, out_spatial_id_pack...) = T(pooler.finalize(acc, pool_size));
         });
 
-        if(printing)
+        if(PRINT_GPU_OUT && printing)
         {
             PIPE << "CPU out: ";
             auto outlen = out.desc.GetLengths();
@@ -321,7 +340,7 @@ struct verify_forward_pooling
             printClamped(PIPE, input, inlen, instr);
             // printClamped(PIPE, input, inlen, instr, 1);
         }
-
+coutms("cpu", st);
         return out;
     }
 
@@ -404,6 +423,8 @@ struct verify_forward_pooling
                   const miopen::PoolingDescriptor& filter,
                   std::vector<Index>& indices) const
     {
+auto st = sc::now();
+auto write = sc::now();
         auto&& handle = get_handle();
         auto out      = get_output_tensor(filter, input);
         auto junk      = get_big_output_tensor(filter, input);   // TEMPCODE RJS
@@ -412,10 +433,10 @@ struct verify_forward_pooling
 
         auto in_dev  = handle.Write(input.data);
         auto out_dev = handle.Create<T>(out.GetSize());
-        auto junk_dev = handle.Create<T>(junk.GetSize());  // 
+        auto junk_dev = handle.Create<float>(junk.GetSize());  //
         Workspace wspace{};
         wspace.Write(indices);
-
+coutms("GPUwrite", write);
         float alpha = 1, beta = 0;
         filter.Forward(handle,
                        &alpha,
@@ -428,7 +449,21 @@ struct verify_forward_pooling
                        wspace.ptr(),
                        wspace.size(),
                        junk_dev.get()); // TEMPCODE RJS
-
+handle.Finish(); // TEMPCODE RJS
+coutms("gpu1", st);
+        filter.Forward(handle,
+                       &alpha,
+                       input.desc,
+                       in_dev.get(),
+                       &beta,
+                       out.desc,
+                       out_dev.get(),
+                       true,
+                       wspace.ptr(),
+                       wspace.size(),
+                       junk_dev.get()); // TEMPCODE RJS
+handle.Finish(); // TEMPCODE RJS
+auto read = sc::now();
         handle.ReadTo(out.data.data(), out_dev, out.GetDataByteSize());
         wspace.ReadTo(indices);
         bool printing = input.desc.GetLengths()[2] <= MAX_PRINTING && input.desc.GetLengths()[3] <= MAX_PRINTING;
@@ -437,7 +472,7 @@ std::cout << (printing ? "printing output from GPU..." : "skipping GPU print.")
         if(PRINT_GPU_OUT && printing)
         {
 #if GPU_JUNK > 0
-        handle.ReadTo(junk.data.data(), junk_dev, junk.GetDataByteSize());
+            handle.ReadTo(junk.data.data(), junk_dev, junk.GetDataByteSize());
             if(GPU_4COL){
                 PIPE<< "GPU (8-cols): " << std::endl;
                 for(int idx = 0; idx < GPU_JUNK; ++idx) {
@@ -445,11 +480,11 @@ std::cout << (printing ? "printing output from GPU..." : "skipping GPU print.")
                     if((idx % 8) == 7)  PIPE <<std::endl;
                 }
             }
-                PIPE << "GPU junk: " << std::endl;
-                for(int idx = 0; idx < GPU_JUNK; ++idx) {
-                    PIPE << std::setw(11) << std::setprecision(5) << junk.data[idx] << "  ";
-                    if((idx % 4) == 3)  PIPE <<std::endl;
-                }
+            PIPE << "GPU junk: " << std::endl;
+            for(int idx = 0; idx < GPU_JUNK; ++idx) {
+                PIPE << std::setw(11) << std::setprecision(5) << junk.data[idx] << "  ";
+                if((idx % 4) == 3)  PIPE <<std::endl;
+            }
 #endif
 
             auto outlen = out.desc.GetLengths();
@@ -480,7 +515,8 @@ std::cout << (printing ? "printing output from GPU..." : "skipping GPU print.")
             printClamped(PIPE, out, outlen, outstr);
             // printClamped(std::cout, out, outlen, outstr, 1);
         }   // print output tensor
-
+coutms("GPUread", read);
+coutms("gpu", st);
         return out;
     }
 
@@ -535,6 +571,7 @@ struct verify_backward_pooling
         int out_c = out.desc.GetLengths()[chan_dim_offset];
         std::array<int, SptDim> out_spatial_len{};
         std::copy_n(out.desc.GetLengths().begin() + sptl_dim_offset, SptDim, out_spatial_len.begin());
+
         auto ford_out = miopen::unpacker(ford)(out_spatial_len);
 
         par_ford(out_n, out_c)([&](int o, int w) {
@@ -653,6 +690,7 @@ struct verify_backward_pooling
             }
             dinput(in_id_pack...) = din_vec.at(din_idx);
         });
+
         return dinput;
     }
 
@@ -756,25 +794,28 @@ struct pooling_driver : test_driver
     {
         add(index_type,
             "index_type",
-            generate_data({"miopenIndexUint32"}    // TEMPCODE RJS
-            // generate_multi_data<const char*>( //
-            //     {{"miopenIndexUint8",
-            //       "miopenIndexUint16",
-            //       "miopenIndexUint32",
-            //       "miopenIndexUint64"},                     //
-            //      {"miopenIndexUint8", "miopenIndexUint32"}, //
-            //      {"miopenIndexUint32"}}                     //
+            // generate_data({"miopenIndexUint32"}    // TEMPCODE RJS
+            generate_multi_data<const char*>( //
+                {{"miopenIndexUint32",
+                  "miopenIndexUint8"
+                  ,
+                  "miopenIndexUint16",
+                  "miopenIndexUint64"
+                  },                     //
+                 {"miopenIndexUint8", "miopenIndexUint32"}, //
+                 {"miopenIndexUint32"}}                     //
                 ));
         add(mode_str,
             "mode_str",
             generate_data(
-                {"miopenPoolingMax", "miopenPoolingAverage", "miopenPoolingAverageInclusive"})); // 
+                {"miopenPoolingMax", "miopenPoolingAverage"})); // , "miopenPoolingAverageInclusive"
 #if TEST_PADDING_MODE == 1
         add(pmode, "pmode", generate_data({"default", "same", "valid"}));
 #endif
         add(verify_indices, "verify_indices", generate_data({1}));
     }
 
+
     template <class Index, int SptlDim>
     void run_impl()
     {
@@ -812,11 +853,12 @@ struct pooling_driver : test_driver
         //         }
         //     }
         // }
-
+auto st = sc::now();
         auto out  = verify(verify_forward_pooling<SptlDim>{},
             input,
             filter,
             indices);
+            coutms("verifyfwd", st);
 #ifdef BACKWARD
         auto dout = out.first;
         dout.generate(tensor_elem_gen_integer{2503});
@@ -831,24 +873,42 @@ struct pooling_driver : test_driver
 #endif
     }
 
+
+#define RETURN coutms("Run", st); return
+
     void run()
     {
-        std::cout << "\n############   Run # " << std::setw(6) << num_all_case++ << " : ";
-        if(MAX_ALL_CASES && num_all_case > MAX_ALL_CASES)
-        {
-            std::cout << " skipped due to MAX_ALL_CASES=" << MAX_ALL_CASES << " : ";
-            show_command();
-            return;
-        }
-        else if(this->dry_run)
-        {
-            std::cout << " skipped due to dry_run : ";
-            show_command();
-            return;
-        }
-        else
+auto st = sc::now();
+        const bool is_default_layout = miopen::TensorDescriptor::IsDefaultLayout(layout); // TEMPCODE RJS
+
         {
+            bool skip = false;
+            std::ostringstream oss;
+
+            if(MAX_ALL_CASES && num_all_case > MAX_ALL_CASES)
+            {
+                skip = true;
+                oss << " : skipped due to MAX_ALL_CASES=" << MAX_ALL_CASES;
+            }
+            if(this->dry_run)
+            {
+                skip = true;
+                oss << " : skipped due to dry_run";
+            }
+            if(is_default_layout && (this->type == miopenInt8 || this->type == miopenFloat8))
+            {
+                skip = true;
+                oss << " : skipped, no solvers for datatype " << this->type << " and default layouts";
+            }
+
+            std::cout << "\n############   Run " << (skip ? " " : "#") << " " << std::setw(6) << num_all_case++ << " : ";
             show_command();
+
+            if(skip)
+            {
+                std::cout << "-- " << oss.str() << std::endl;
+                RETURN;
+            }
         }
 
         int sptl_dim = static_cast<int>(in_shape.size()) - 2;
@@ -856,15 +916,13 @@ struct pooling_driver : test_driver
         {
             std::cout << "Warning: Config skipped due to invalid dimensions. 'in_shape' must be in NCHW or NCDHW form."
                       << std::endl;
-            return;
+            RETURN;
         }
 
         // To simplify launching, input dimensions to the driver are always default layout. Desire to
         // test non-default layouts is communicated exclusively via 'layout'.
-        // const bool is_default_layout = miopen::TensorDescriptor::IsDefaultLayout(layout); // TEMPCODE RJS
 
         auto mode = mode_lookup.at(miopen::ToUpper(mode_str));
-        if(mode != miopenPoolingMax) return;  // TEMPCODE RJS skip all except max, do max only
 
         auto pad_mode = miopenPaddingDefault;
 #if TEST_PADDING_MODE
@@ -895,7 +953,7 @@ struct pooling_driver : test_driver
             std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented "
                          "yet in 3D max pooling solvers."
                       << std::endl;
-            return;
+            RETURN;
         }
 
         if(mask_idx && sptl_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset)
@@ -903,7 +961,7 @@ struct pooling_driver : test_driver
             std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented "
                          "yet in 2D max backward solvers that support wide pooling window."
                       << std::endl;
-            return;
+            RETURN;
         }
 
         if(mask_idx && filter.ModeIsAveraging())
@@ -914,7 +972,24 @@ struct pooling_driver : test_driver
                          "skip average pooling configs when (wsidx == 0). "
                          "Please make sure that dataset includes counterparts with (wsidx == 1)."
                       << std::endl;
-            return;
+            RETURN;
+        }
+
+        // index size filter
+        if(filter.GetMode() == miopenPoolingMax)
+        {
+            auto index_max = miopen::get_index_max(filter.GetIndexType());
+            auto index_needed = mask_idx ?
+                std::accumulate(lens.begin(), lens.end(), 1, std::multiplies<int>()) :
+                std::accumulate(in_shape.begin() + 2, in_shape.end(), 1, std::multiplies<int>());
+
+            if(index_max <= index_needed)
+            {
+                std::cout << "Warning: Config skipped: index mode " << filter.GetWorkspaceIndexMode()
+                    << " type " << filter.GetIndexType() << " is too small. max="
+                    << index_max << ", needed=" << index_needed << std::endl;
+                RETURN;
+            }
         }
 
         switch(idx_typ)
@@ -931,7 +1006,7 @@ struct pooling_driver : test_driver
                              "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) "
                              "&& filter.GetMode() == miopenPoolingMax"
                           << std::endl;
-                return;
+                RETURN;
             }
             break;
         }
@@ -943,7 +1018,7 @@ struct pooling_driver : test_driver
                              "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) "
                              "&& filter.GetMode() == miopenPoolingMax"
                           << std::endl;
-                return;
+                RETURN;
             }
             if(skip_many_configs_with_non_int8_index)
             {
@@ -953,7 +1028,7 @@ struct pooling_driver : test_driver
                     std::cout << "Warning: Config skipped for the default dataset to speed "
                                  "up testing (num_uint16_case > 5)"
                               << std::endl;
-                    return;
+                    RETURN;
                 }
                 ++num_uint16_case;
             }
@@ -971,7 +1046,7 @@ struct pooling_driver : test_driver
                         std::cout << "Warning: Config skipped for the default dataset to speed up "
                                      "testing (wsidx == 0 && num_uint32_case > 5)"
                                   << std::endl;
-                        return;
+                        RETURN;
                     }
                     ++num_uint32_case;
                 }
@@ -982,7 +1057,7 @@ struct pooling_driver : test_driver
                         std::cout << "Warning: Config skipped for the default dataset to speed up "
                                      "testing (wsidx != 0 && num_uint32_case_imgidx > 5)"
                                   << std::endl;
-                        return;
+                        RETURN;
                     }
                     ++num_uint32_case_imgidx;
                 }
@@ -1000,7 +1075,7 @@ struct pooling_driver : test_driver
                         std::cout << "Warning: Config skipped for the default dataset to speed up "
                                      "testing (wsidx == 0) && (num_uint64_case > 5)"
                                   << std::endl;
-                        return;
+                        RETURN;
                     }
                     ++num_uint64_case;
                 }
@@ -1012,7 +1087,7 @@ struct pooling_driver : test_driver
                                      "default dataset (wsidx != 0) && (num_uint64_case_imgidx > 5 "
                                      "&& sptl_dim == 2)"
                                   << std::endl;
-                        return;
+                        RETURN;
                     }
                     ++num_uint64_case_imgidx;
                 }
@@ -1031,7 +1106,7 @@ struct pooling_driver : test_driver
                 std::cout << "Warning: Config skipped becuse it is invalid "
                              "(lens[i] > (input_desc.GetLengths()[i + 2] + 2 * pads[i]))"
                           << std::endl;
-                return;
+                RETURN;
             }
         }
 
@@ -1048,7 +1123,7 @@ struct pooling_driver : test_driver
                 std::cout << "Config skipped because it requires " << total_mem
                           << " Bytes to write all necessary tensors to GPU. GPU has " << device_mem
                           << " Bytes of memory." << std::endl;
-                return;
+                RETURN;
             }
         }
 
@@ -1143,6 +1218,7 @@ struct pooling_driver : test_driver
             break;
         }
         }
+coutms("Run", st);
     }
 };
 
diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp
index 8472b23375..b30b9ce36f 100644
--- a/test/pooling2d.hpp
+++ b/test/pooling2d.hpp
@@ -38,29 +38,33 @@ struct pooling2d_shapes
     static std::vector<U> get_2d_pooling_input_shapes()
     {
         return {
-                {1, 831, 64, 128},
                 {5, 32, 8, 8},
-                {10, 3, 32, 32},
-                {1, 19, 1024, 2048},
-                {2, 1024, 12, 12},
-                {4, 3, 231, 231},
-                {8, 3, 227, 227},
-                {1, 384, 13, 13},
-                {1, 96, 27, 27},
-                {2, 160, 7, 7},
-                {1, 192, 256, 512},
-                {2, 192, 28, 28},
-                {1, 256, 56, 56},
-                {4, 3, 224, 224},
-                {2, 64, 112, 112},
-                {2, 608, 4, 4},
-                {1, 2048, 11, 11},
-                {1, 16, 4096, 4096}
+                {16, 1, 4096, 4096},
+                {1, 16, 4096, 4096},
+                {1, 1024, 512, 512},
+                {16, 1024, 128, 128}
+                // ,
+                // {1, 832, 64, 128},
+                // {10, 3, 32, 32},
+                // {1, 19, 1024, 2048},
+                // {2, 1024, 12, 12},
+                // {4, 3, 231, 231},
+                // {8, 3, 227, 227},
+                // {1, 384, 13, 13},
+                // {1, 96, 27, 27},
+                // {2, 160, 7, 7},
+                // {1, 192, 256, 512},
+                // {2, 192, 28, 28},
+                // {1, 256, 56, 56},
+                // {4, 3, 224, 224},
+                // {2, 64, 112, 112},
+                // {2, 608, 4, 4},
+                // {1, 2048, 11, 11}
         };
     }
 
     // Dataset 1 is intended for testing of asymmetric configs.
-    static std::vector<U> get_2d_pooling_input_shapes_minimal() { return {{1, 2, 8, 8}}; } // {1, 1, 8, 8}, 
+    static std::vector<U> get_2d_pooling_input_shapes_minimal() { return {{1, 2, 8, 8}, {10, 3, 32, 32}}; } // {1, 1, 8, 8}, 
 
     // Dataset 2 is intended for testing of configs with wide window.
     static std::vector<U> get_2d_pooling_input_shapes_wide()
@@ -91,41 +95,43 @@ struct pooling2d_driver : pooling_driver<T>
 public:
     pooling2d_driver() : pooling_driver<T>()
     {
+#if TEST_GET_INPUT_TENSOR
+        std::set<U> in_dim_set = get_inputs(this->batch_factor);
+        std::vector<U> in_dim_vec(in_dim_set.begin(), in_dim_set.end());
+        this->add(this->in_shape, "input", this->generate_data(in_dim_vec, {16, 32, 8, 8}));
+#else
+        this->add(this->in_shape, "input", this->template generate_multi_data_limited<U>({
+                get_2d_pooling_input_shapes(),
+                get_2d_pooling_input_shapes_minimal(),
+                get_2d_pooling_input_shapes_wide()
+            }, 9
+        ));
+#endif
+        this->add(this->lens, "lens", this->template generate_multi_data<U>({
+                {{2, 2}, {3, 3}},         //
+                {{2, 2}, {1, 2}, {2, 1}}, //
+                {{35, 35}, {100, 100}, {255, 255}, {410, 400}}
+            }
+        ));
+        this->add(this->strides, "strides", this->template generate_multi_data<U>({
+                {{2, 2}, {1, 1}},               //
+                {{2, 2}, {2, 1}, {1, 2}},       //
+                {{1, 1}}
+            }
+        ));
         // clang-format off
         this->add(this->pads, "pads", this->template generate_multi_data<U>({
-            {{0, 0}, {1, 1}}, //
+                {{0, 0}, {1, 1}}, //
 #if WORKAROUND_ISSUE_1670
-            {{0, 0}}, //
+                {{0, 0}}, //
 #else
-            {{0, 0}, {0, 1}, {1, 0}, {1, 1}}, //
+                {{0, 0}, {0, 1}, {1, 0}, {1, 1}}, //
 #endif
-            {{0, 0}}}));
+                {{0, 0}}
+            }
+        ));
         // clang-format on
-        this->add(this->layout, "layout", this->generate_data({miopenTensorNHWC})); // , miopenTensorNCHW
         this->add(this->wsidx, "wsidx", this->generate_data({miopenPoolingWorkspaceIndexMask, miopenPoolingWorkspaceIndexImage}));
-        this->add(this->strides,
-                  "strides",
-                  this->template generate_multi_data<U>({{{2, 2}, {1, 1}},                 //
-                                                         {{2, 2}, {2, 1}}, // , {1, 2}, {1, 1}
-                                                         {{1, 1}}}));
-#if TEST_GET_INPUT_TENSOR
-        std::set<U> in_dim_set = get_inputs(this->batch_factor);
-        std::vector<U> in_dim_vec(in_dim_set.begin(), in_dim_set.end());
-        this->add(this->in_shape, "input", this->generate_data(in_dim_vec, {16, 32, 8, 8}));
-#else
-        this->add(
-            this->in_shape,
-            "input",
-            this->template generate_multi_data_limited<U>({get_2d_pooling_input_shapes(),
-                                                           get_2d_pooling_input_shapes_minimal(),
-                                                           get_2d_pooling_input_shapes_wide()},
-                                                          9));
-#endif
-        this->add(this->lens,
-                  "lens",
-                  this->template generate_multi_data<U>(
-                      {{{2, 2}, {3, 3}},         //
-                       {{2, 2}, {1, 2}, {2, 1}}, //
-                       {{35, 35}, {100, 100}, {255, 255}, {410, 400}}}));
-    }
+        this->add(this->layout, "layout", this->generate_data({miopenTensorNCHW, miopenTensorNHWC}));
+   }
 };
diff --git a/test/pooling3d.hpp b/test/pooling3d.hpp
index d2a64540c6..6792a5cc84 100644
--- a/test/pooling3d.hpp
+++ b/test/pooling3d.hpp
@@ -53,12 +53,11 @@ struct pooling3d_driver : pooling_driver<T>
 
     pooling3d_driver() : pooling_driver<T>()
     {
-        this->add(
-            this->in_shape, "input", this->generate_data_limited(get_3d_pooling_input_shapes(), 4));
+        this->add(this->in_shape, "input", this->generate_data_limited(get_3d_pooling_input_shapes(), 4));
         this->add(this->lens, "lens", this->generate_data({{2, 2, 2}, {3, 3, 3}}));
         this->add(this->strides, "strides", this->generate_data({{2, 2, 2}, {1, 1, 1}}));
         this->add(this->pads, "pads", this->generate_data({{0, 0, 0}, {1, 1, 1}}));
-        this->add(this->wsidx, "wsidx", this->generate_data({1}));
-        this->add(this->layout, "layout", this->generate_data({0, 1}));
+        this->add(this->wsidx, "wsidx", this->generate_data({miopenPoolingWorkspaceIndexImage}));
+        this->add(this->layout, "layout", this->generate_data({miopenTensorNCDHW, miopenTensorNDHWC}));
     }
 };

From c5a4b11d406aa5bd2351a35476c7f70a41af685e Mon Sep 17 00:00:00 2001
From: "Randy J. Spaulding" <rspauldi@amd.com>
Date: Thu, 19 Sep 2024 16:25:26 -0700
Subject: [PATCH 07/10] fwd naive done

---
 src/CMakeLists.txt                            |  13 +-
 .../MIOpenPoolingForwardNDNhwcNaive.cpp       |  80 +++---
 src/pooling/problem_description.cpp           |   1 +
 src/solver/pooling/forwardNdNhwcNaive.cpp     |   4 +-
 test/gtest/poolingFwd2dNaive.cpp              |   6 +-
 test/gtest/pooling_testing.hpp                | 239 ++++++++++--------
 test/pooling2d.hpp                            |  39 +--
 test/pooling3d.hpp                            |   6 +-
 8 files changed, 219 insertions(+), 169 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f536de3b10..df8c2f9614 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -26,6 +26,11 @@
 
 cmake_policy(SET CMP0057 NEW)
 
+include_directories(SYSTEM
+    /opt/rocm/include/gtest
+    /opt/rocm/include
+)
+
 include(ExportHeader)
 if(MIOPEN_ENABLE_SQLITE)
     add_subdirectory(sqlite)
@@ -594,7 +599,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/xform_bidirect_winograd_out.s
         kernels/UniversalTranspose.cl)
 
-        # TEMPCODE RJS
+# TEMPCODE KERNELS
     set(MIOPEN_KERNELS
         kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
         kernels/MIOpenPooling.cl
@@ -788,6 +793,12 @@ else()
         )
 endif()
 
+# TEMPCODE LIB_INC
+include_directories(SYSTEM
+    /opt/rocm/include/gtest/
+    /opt/rocm/include/
+)
+
 rocm_set_soversion(MIOpen ${MIOpen_SOVERSION})
 
 clang_tidy_check(MIOpen)
diff --git a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
index 5c197e3b8e..be6a548091 100644
--- a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
+++ b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
@@ -25,22 +25,19 @@
  *******************************************************************************/
 
 //#define TEMPCODE RJS
-#ifdef TEMPCODE
-#define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 0
+// #ifdef TEMPCODE
+// #define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 0
 
-#define MLO_POOLING_OP_ID MLO_POOLING_OP_AVE
+// #define MLO_POOLING_OP_ID MLO_POOLING_OP_AVE
 
-#define MLO_POOLING_INDEX_TYPE int
-#define MLO_POOLING_IS2D_KERNEL 0
-#define INPUT_TYPE _FLOAT
-#define OUTPUT_TYPE _FLOAT
-// #define TI INPUT_TYPE
-// #define TO OUTPUT_TYPE
-#define CVT_FP32_2ACCUM(x) (x)
-#endif
+// #define INPUT_TYPE _FLOAT
+// #define OUTPUT_TYPE _FLOAT
+// // #define TI INPUT_TYPE
+// // #define TO OUTPUT_TYPE
+// #define CVT_FP32_2ACCUM(x) (x)
+// #endif
 
 #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
-#include <hip/hip_fp16.h>
 #include <hip/hip_runtime.h>
 #endif
 
@@ -70,8 +67,11 @@
 #include "float_types.h"
 #include "miopen_cstdint.hpp"
 
+// This implementation is extremely memory-bound, so float type is used for all calculations
+#define _FLOAT          float
+#define _FLOAT_ACCUM    float
+
 #if MIOPEN_USE_INT8 == 1
-    #define _FLOAT char
     #if !AVERAGE_OPS
         #ifndef FLT_MAX
         #define MAX_VAL 127 /* max value */
@@ -79,10 +79,14 @@
         #define MAX_VAL FLT_MAX
         #endif
     #endif
+#endif
+#if MIOPEN_USE_BFP16
+    #define NATIVE_CAST(_x)     (_FLOAT)bfloat16_to_float(_x)
+    #define NATIVE_UNCAST(_x)   (_FLOAT)float_to_bfloat16(_x)
 #else
-    #define _FLOAT float
+    #define NATIVE_CAST(_x)     (_FLOAT)(_x)
+    #define NATIVE_UNCAST(_x)   (_FLOAT)(_x)
 #endif
-#define _FLOAT_ACCUM _FLOAT
 
 #endif // TEMPCODE
 
@@ -94,10 +98,10 @@
 
 // Out N, D, H are encoded into the block indices x, y, z
 // No 2D-only optimization.
-template <typename TI, typename TO>
+template <typename TI>
 __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
-                                    TO* __restrict__ top_ptr,
-                                    TO* __restrict__ junk_ptr,  // TEMPCODE RJS
+                                    TI* __restrict__ top_ptr,
+                                    float* __restrict__ junk_ptr,  // TEMPCODE RJS
                                     ARG_UNUSED_FOR_AVERAGE index_t* __restrict__ mask_ptr,
                                     ARG_UNUSED_FOR_AVERAGE int save_index,
                                     ARG_UNUSED_FOR_AVERAGE int index_mode,
@@ -130,14 +134,14 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
     if(tw >= args.top_w) return;
     if(cc >= args.all_c) return;
 
+    for(int i = 4; i < 320; ++i)
+    {
+        junk_ptr[i] = 1.11111;
+    }
     auto log_ptr = junk_ptr;
     if(blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && threadIdx.x == 0 &&  threadIdx.y == 0 &&  threadIdx.z == 0)
     {
-        for(int i = 0; i < 320; ++i)
-        {
-            junk_ptr[i] = (_FLOAT)1.11111;
-        }
-        int idx = 0;
+        int idx = 8;
         log_ptr[idx++] = gridDim.x;     // ND
         log_ptr[idx++] = gridDim.y;     // H
         log_ptr[idx++] = gridDim.z;     // W (*C overflow)
@@ -229,10 +233,17 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
             th * args.top_h_stride +            //
             tw * args.top_w_stride;
 
-#if false
-    size_t junk_idx = 64 + 4 * th;
+#if true
+    size_t junk_idx = 64;
+    int bot_idx = 0;
+    junk_ptr[junk_idx++] = NATIVE_CAST(bot_ptr[bot_idx++]);
+    junk_ptr[junk_idx++] = NATIVE_CAST(bot_ptr[bot_idx++]);
+    junk_ptr[junk_idx++] = NATIVE_CAST(bot_ptr[bot_idx++]);
+    junk_ptr[junk_idx++] = NATIVE_CAST(bot_ptr[bot_idx++]);
+    junk_idx += 4 * th;
     if(nn == 0 && cc == 0 && td == 0 && tw < 8 && th == 0)
     {
+
         size_t bot_ncd = static_cast<size_t>(nn * args.bot_n_stride + cc * args.bot_c_stride + dstart * args.bot_d_stride);
             size_t bot_ncdh = bot_ncd + hstart * args.bot_h_stride;
                 size_t bot_index = bot_ncdh + wstart * args.bot_w_stride;
@@ -258,7 +269,7 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
 #if AVERAGE_OPS
     _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0);
 #else // MAX
-    _FLOAT_ACCUM res     = (_FLOAT_ACCUM)(-MAX_VAL_ACCUM);
+    _FLOAT_ACCUM res     = (_FLOAT_ACCUM)NATIVE_CAST(-MAX_VAL_ACCUM);
     bool found           = false; // May remain false if bot contains only NaNs/-INFs.
     uint32_t d_save          = 0;
     uint32_t h_save          = 0;
@@ -276,11 +287,12 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
             for(uint32_t bw = wstart; bw < wend; ++bw)
             {
 #if AVERAGE_OPS
-                res += static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]);
+                res += static_cast<_FLOAT_ACCUM>(NATIVE_CAST(bot_ptr[bot_index]));
 #else // MAX
-                if(static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]) > res)
+                auto val = static_cast<_FLOAT_ACCUM>(NATIVE_CAST(bot_ptr[bot_index]));
+                if(val > res)
                 {
-                    res = bot_ptr[bot_index];
+                    res = val;
                     if(save_index)
                     {
                         found  = true;
@@ -340,20 +352,20 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
     }
 #endif
 
-    top_ptr[top_index] = (_FLOAT)res;
+    top_ptr[top_index] = NATIVE_UNCAST(res);
 }
 
 extern "C" __global__ void mloPoolingForwardNDNhwcNaive(
                                     const INPUT_TYPE* __restrict__ bot_ptr,
-                                    OUTPUT_TYPE* __restrict__ top_ptr,
-                                    OUTPUT_TYPE* __restrict__ junk_ptr,    // TEMPCODE RJS
+                                    INPUT_TYPE* __restrict__ top_ptr,
+                                    float* __restrict__ junk_ptr,    // TEMPCODE RJS
                                     index_t* __restrict__ mask_ptr,
                                     int save_index,
                                     int index_mode,
-poolingNdNhwcArgs args
+                                    poolingNdNhwcArgs args
 )
 {
-    poolingForwardNDNhwcNaive<INPUT_TYPE, OUTPUT_TYPE>(
+    poolingForwardNDNhwcNaive<INPUT_TYPE>(
         bot_ptr,
         top_ptr,
         junk_ptr,
diff --git a/src/pooling/problem_description.cpp b/src/pooling/problem_description.cpp
index 7807d5425b..a96101be3a 100644
--- a/src/pooling/problem_description.cpp
+++ b/src/pooling/problem_description.cpp
@@ -60,6 +60,7 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const
                                                            : MLO_POOLING_OP_AVE_INCLUSIVE);
 
     ss << "m" + std::to_string(pooling_method);
+
     ss << "_dt" << xDesc.GetType();
     if(const auto ct = xDesc.GetCastType())
         ss << "_dct" << GetDataTypeName(*ct);
diff --git a/src/solver/pooling/forwardNdNhwcNaive.cpp b/src/solver/pooling/forwardNdNhwcNaive.cpp
index 6e28b98645..d31df3358d 100644
--- a/src/solver/pooling/forwardNdNhwcNaive.cpp
+++ b/src/solver/pooling/forwardNdNhwcNaive.cpp
@@ -72,7 +72,7 @@ bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&,
 {
     auto x_type = problem.GetXDesc().GetType();
     auto y_type = problem.GetYDesc().GetType();
-    std::vector<miopenDataType_t> types {miopenFloat, miopenHalf, miopenInt8, miopenFloat8}; // , miopenBFloat16
+    std::vector<miopenDataType_t> types {miopenFloat, miopenHalf, miopenInt8, miopenFloat8, miopenBFloat16}; // 
 
     auto mode = problem.GetPooling().GetMode();
     std::vector<miopenPoolingMode_t> modes {miopenPoolingMax, miopenPoolingAverage, miopenPoolingAverageInclusive};
@@ -81,7 +81,7 @@ bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&,
     auto y_layout = problem.GetYDesc().GetLayout_str();
     std::vector<std::string> layouts {"NHWC", "NDHWC"};
 
-    bool app = (problem.GetDirection() == miopen::pooling::Direction::Forward)          //
+    bool app = (problem.GetDirection() == miopen::pooling::Direction::Forward)      //
         && (x_type == y_type)                                                       //
         && (x_layout == y_layout)                                                   //
         && (std::find(types.cbegin(), types.cend(), x_type) != types.cend())        //
diff --git a/test/gtest/poolingFwd2dNaive.cpp b/test/gtest/poolingFwd2dNaive.cpp
index 45a51525e6..828fc556cb 100644
--- a/test/gtest/poolingFwd2dNaive.cpp
+++ b/test/gtest/poolingFwd2dNaive.cpp
@@ -258,8 +258,12 @@ void Run2dDriver(miopenDataType_t prec)
 }
 
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dBF16);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dInt8);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dFloat);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dHalf);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dF8);
 
-//INSTANTIATE_TEST_SUITE_P(BF16, PoolingFwd2dBF16, testing::Values(Get2dTestCases("--bfloat16")));
+INSTANTIATE_TEST_SUITE_P(BF16, PoolingFwd2dBF16, testing::Values(Get2dTestCases("--bfloat16")));
 INSTANTIATE_TEST_SUITE_P(Int8, PoolingFwd2dInt8, testing::Values(Get2dTestCases("--int8")));
 INSTANTIATE_TEST_SUITE_P(Float, PoolingFwd2dFloat, testing::Values(Get2dTestCases("--float")));
 INSTANTIATE_TEST_SUITE_P(Half, PoolingFwd2dHalf, testing::Values(Get2dTestCases("--half")));
diff --git a/test/gtest/pooling_testing.hpp b/test/gtest/pooling_testing.hpp
index 990759c620..2fdb4ded19 100644
--- a/test/gtest/pooling_testing.hpp
+++ b/test/gtest/pooling_testing.hpp
@@ -114,12 +114,14 @@ tensor<float> get_big_output_tensor(const miopen::PoolingDescriptor& filter, con
 {
     auto desc = filter.GetForwardOutputTensor(input.desc);
     auto lens = desc.GetLengths();
-    if(desc.GetNumBytes() > 1000)
+    if(desc.GetElementSize() > 1024)
         lens[0] *= 2;
     else
-        lens[0] *= 10;
-    auto big = miopen::TensorDescriptor{miopenFloat, input.desc.GetLayout_t(), lens, desc.GetStrides()};
-    return tensor<float>{big};
+        lens[0] *= (2047 / desc.GetElementSize()) + 1 ;
+    auto dbig = miopen::TensorDescriptor{miopenFloat, input.desc.GetLayout_t(), lens, desc.GetStrides()};
+    auto big = tensor<float>{dbig};
+    for (auto& v : big)  v = -2.2222f;
+    return big;
 }
 
 template <class T>
@@ -163,7 +165,7 @@ struct pooling_operators
 #define MAX_PRINTING 128    // TEMPCODE RJS
 #define MAX_PRINT 12    // TEMPCODE RJS
 #define MAX_NCD 2
-#define GPU_JUNK 0 // 160
+#define GPU_JUNK 0 // typ. 160, reasonable max is 320
 #define PRINT_CPU_IN 0
 #define PRINT_GPU_OUT 0
 #define GPU_4COL false
@@ -340,7 +342,7 @@ auto st = sc::now();
             printClamped(PIPE, input, inlen, instr);
             // printClamped(PIPE, input, inlen, instr, 1);
         }
-coutms("cpu", st);
+coutms("f.cpu", st);
         return out;
     }
 
@@ -427,16 +429,24 @@ auto st = sc::now();
 auto write = sc::now();
         auto&& handle = get_handle();
         auto out      = get_output_tensor(filter, input);
-        auto junk      = get_big_output_tensor(filter, input);   // TEMPCODE RJS
+        auto junk     = get_big_output_tensor(filter, input);   // TEMPCODE RJS
+
+#if GPU_JUNK > 0
+            PIPE << "GPU junk init: " << std::endl;
+            for(int idx = 0; idx < GPU_JUNK; ++idx) {
+                PIPE << std::setw(11) << std::setprecision(5) << junk.data[idx] << "  ";
+                if((idx % 4) == 3)  PIPE <<std::endl;
+            }
+#endif
 
         indices.resize(out.data.size(), 0);
 
         auto in_dev  = handle.Write(input.data);
         auto out_dev = handle.Create<T>(out.GetSize());
-        auto junk_dev = handle.Create<float>(junk.GetSize());  //
+        auto junk_dev = handle.Write(junk.data);  // TEMPCODE RJS
         Workspace wspace{};
         wspace.Write(indices);
-coutms("GPUwrite", write);
+coutms("f.GPUwrite", write);
         float alpha = 1, beta = 0;
         filter.Forward(handle,
                        &alpha,
@@ -450,7 +460,7 @@ coutms("GPUwrite", write);
                        wspace.size(),
                        junk_dev.get()); // TEMPCODE RJS
 handle.Finish(); // TEMPCODE RJS
-coutms("gpu1", st);
+coutms("f.gpu1", st);
         filter.Forward(handle,
                        &alpha,
                        input.desc,
@@ -515,8 +525,8 @@ std::cout << (printing ? "printing output from GPU..." : "skipping GPU print.")
             printClamped(PIPE, out, outlen, outstr);
             // printClamped(std::cout, out, outlen, outstr, 1);
         }   // print output tensor
-coutms("GPUread", read);
-coutms("gpu", st);
+coutms("f.GPUread", read);
+coutms("f.gpu", st);
         return out;
     }
 
@@ -548,6 +558,7 @@ struct verify_backward_pooling
                   bool use_global_index,
                   bool verify_index) const
     {
+auto st = sc::now();
         const int sptl_dim_offset = 2;
         const int chan_dim_offset = 1;
 
@@ -616,7 +627,7 @@ struct verify_backward_pooling
                     {
                         idx[0] = o;
                         idx[1] = w;
-                        if(verify_index)
+                        if(false && verify_index)
                         {
                             CHECK(
                                 miopen::float_equal(input(idx), out(o, w, out_spatial_id_pack...)));
@@ -691,7 +702,8 @@ struct verify_backward_pooling
             dinput(in_id_pack...) = din_vec.at(din_idx);
         });
 
-        return dinput;
+    coutms("b.cpu", st);
+    return dinput;
     }
 
     template <class T, class Index>
@@ -703,6 +715,8 @@ struct verify_backward_pooling
                   bool,
                   bool) const
     {
+auto st = sc::now();
+auto write = sc::now();
         auto&& handle = get_handle();
         auto dinput   = input;
 
@@ -713,6 +727,7 @@ struct verify_backward_pooling
 
         Workspace wspace{};
         wspace.Write(indices);
+coutms("b.GPUwrite", write);
 
         float alpha = 1, beta = 0;
         filter.Backward(handle,
@@ -731,8 +746,29 @@ struct verify_backward_pooling
                         dinput.desc,
                         din_dev.get(),
                         wspace.ptr());
-
+handle.Finish(); // TEMPCODE RJS
+coutms("b.gpu1", st);
+        filter.Backward(handle,
+                        &alpha,
+                        // y
+                        out.desc,
+                        out_dev.get(),
+                        // dy
+                        dout.desc,
+                        dout_dev.get(),
+                        // x
+                        input.desc,
+                        in_dev.get(),
+                        &beta,
+                        // dx
+                        dinput.desc,
+                        din_dev.get(),
+                        wspace.ptr());
+handle.Finish(); // TEMPCODE RJS
+auto read = sc::now();
         dinput.data = handle.Read<T>(din_dev, dinput.data.size());
+coutms("b.GPUread", read);
+coutms("b.gpu", st);
         return dinput;
     }
 
@@ -746,10 +782,12 @@ struct verify_backward_pooling
               bool,
               bool) const
     {
-        std::cout << "Backward ";
-        print(filter, input.desc.IsDefaultLayout());
-        std::cout << "Input tensor: " << input.desc.ToString() << std::endl;
-        std::cout << "Output tensor: " << out.desc.ToString() << std::endl;
+        std::ostringstream oss;
+        oss << "Backward ";
+        print(oss, filter, input.desc.IsDefaultLayout());
+        oss << "Input tensor: " << input.desc.ToString() << std::endl;
+        oss << "Output tensor: " << out.desc.ToString() << std::endl;
+        GTEST_FAIL() << oss.str();
     }
 };
 
@@ -794,7 +832,7 @@ struct pooling_driver : test_driver
     {
         add(index_type,
             "index_type",
-            // generate_data({"miopenIndexUint32"}    // TEMPCODE RJS
+            // generate_data({"miopenIndexUint32"}    // TEMPCODE RUN
             generate_multi_data<const char*>( //
                 {{"miopenIndexUint32",
                   "miopenIndexUint8"
@@ -808,13 +846,15 @@ struct pooling_driver : test_driver
         add(mode_str,
             "mode_str",
             generate_data(
-                {"miopenPoolingMax", "miopenPoolingAverage"})); // , "miopenPoolingAverageInclusive"
+                {"miopenPoolingMax", "miopenPoolingAverage", "miopenPoolingAverageInclusive"})); // TEMPCODE RUN
 #if TEST_PADDING_MODE == 1
         add(pmode, "pmode", generate_data({"default", "same", "valid"}));
 #endif
         add(verify_indices, "verify_indices", generate_data({1}));
     }
 
+#define FORWARD
+#define BACKWARD
 
     template <class Index, int SptlDim>
     void run_impl()
@@ -823,46 +863,20 @@ struct pooling_driver : test_driver
         auto input = tensor<T>{layout, in_shape};
         for(auto& v : input.data)   v = gen_value<T>();
 
-        // TEMPCODE RJS print input tensor
-        // bool printing = in_shape[0] <= MAX_PRINTING && in_shape[1] <= MAX_PRINTING;
-        // if (in_shape.size() > 2) printing &= in_shape[2] <= MAX_PRINTING;
-        // if(printing)
-        // {
-        //     auto inlen = input.desc.GetLengths();
-        //     auto instr = input.desc.GetStrides();
-        //     std::cout << "CPU GEN : " << input.desc.GetLayout_str() << "(" << inlen.size() << ") | " << input.data.size() << " | " << input.desc.GetElementSpace() << " | ";
-        //     for(auto dim : inlen) std::cout << std::setw(4) << dim;
-        //     std::cout << " | ";
-        //     for(auto str : instr) std::cout << std::setw(4) << str;
-        //     std::cout << " | ";
-        //     for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str;
-        //     std::cout << " | ";
-        //     for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str;
-        //     std::cout << " | ";
-        //     for(auto str : filter.GetPads()) std::cout << std::setw(4) << str;
-        //     std::cout << std::endl;
-
-        //     for(int nn = 0; nn < inlen[0]; ++nn) {
-        //         for(int cc = 0; cc < inlen[1]; ++cc) {
-        //             for(int hh = 0; hh < inlen[2]; ++hh) {
-        //                 for(int ww = 0; ww < inlen[3]; ++ww) {// nn * instr[0] + cc * instr[1] + hh * instr[2] + ww * instr[3]
-        //                     std::cout << std::setw(11) << std::setprecision(5) << input.data[input.desc.GetIndex(nn, cc, hh, ww)] << "  ";
-        //                 }
-        //             std::cout << std::endl;
-        //             }
-        //         }
-        //     }
-        // }
 auto st = sc::now();
+#ifdef FORWARD
         auto out  = verify(verify_forward_pooling<SptlDim>{},
             input,
             filter,
             indices);
-            coutms("verifyfwd", st);
-#ifdef BACKWARD
+coutms("f.verify", st);
+#endif
+#ifdef BACKWARD    // TEMPCODE RJS no backward
+        if(!std::is_same<T, float>::value && !std::is_same<T, half>::value) return;
+auto bst = sc::now();
         auto dout = out.first;
         dout.generate(tensor_elem_gen_integer{2503});
-        verify(verify_backward_pooling<SptlDim>{},   // TEMPCODE RJS no backward
+        verify(verify_backward_pooling<SptlDim>{},
                input,
                dout,
                out.first,
@@ -870,53 +884,53 @@ auto st = sc::now();
                indices,
                wsidx != 0,
                static_cast<bool>(this->verify_indices));
+coutms("b.verify", bst);
 #endif
     }
 
+#define CHECK_SKIP  \
+if(skip)        \
+{               \
+    std::cout << "\n############ RunSkip # " << std::setw(7) << num_all_case++ << " : ";    \
+    show_command(); \
+    std::cout << "-- " << oss.str() << std::endl;   \
+coutms("skipRun", st); return;   \
+}
 
-#define RETURN coutms("Run", st); return
+#define SKIP_RUN  skip = true; CHECK_SKIP
 
     void run()
     {
 auto st = sc::now();
         const bool is_default_layout = miopen::TensorDescriptor::IsDefaultLayout(layout); // TEMPCODE RJS
 
-        {
-            bool skip = false;
-            std::ostringstream oss;
-
-            if(MAX_ALL_CASES && num_all_case > MAX_ALL_CASES)
-            {
-                skip = true;
-                oss << " : skipped due to MAX_ALL_CASES=" << MAX_ALL_CASES;
-            }
-            if(this->dry_run)
-            {
-                skip = true;
-                oss << " : skipped due to dry_run";
-            }
-            if(is_default_layout && (this->type == miopenInt8 || this->type == miopenFloat8))
-            {
-                skip = true;
-                oss << " : skipped, no solvers for datatype " << this->type << " and default layouts";
-            }
-
-            std::cout << "\n############   Run " << (skip ? " " : "#") << " " << std::setw(6) << num_all_case++ << " : ";
-            show_command();
+        bool skip = false;
+        std::ostringstream oss;
 
-            if(skip)
-            {
-                std::cout << "-- " << oss.str() << std::endl;
-                RETURN;
-            }
+        if(MAX_ALL_CASES && num_all_case > MAX_ALL_CASES)
+        {
+            skip = true;
+            oss << " : skipped due to MAX_ALL_CASES=" << MAX_ALL_CASES;
+        }
+        if(this->dry_run)
+        {
+            skip = true;
+            oss << " : skipped due to dry_run";
+        }
+        if(is_default_layout && (this->type != miopenFloat && this->type != miopenHalf))
+        {
+            skip = true;
+            oss << " : skipped, no solvers for datatype " << this->type << " and default layouts";
         }
 
+        CHECK_SKIP;
+
         int sptl_dim = static_cast<int>(in_shape.size()) - 2;
         if(sptl_dim != 2 && sptl_dim != 3)
         {
-            std::cout << "Warning: Config skipped due to invalid dimensions. 'in_shape' must be in NCHW or NCDHW form."
+            oss << "Warning: Config skipped due to invalid dimensions. 'in_shape' must be in NCHW or NCDHW form."
                       << std::endl;
-            RETURN;
+            SKIP_RUN;
         }
 
         // To simplify launching, input dimensions to the driver are always default layout. Desire to
@@ -950,29 +964,29 @@ auto st = sc::now();
 
         if(mask_idx && sptl_dim == 3 && filter.GetMode() == miopenPoolingMax)
         {
-            std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented "
+            oss << "Warning: Config skipped. Workspace index mask mode is not implemented "
                          "yet in 3D max pooling solvers."
                       << std::endl;
-            RETURN;
+            SKIP_RUN;
         }
 
         if(mask_idx && sptl_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset)
         {
-            std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented "
+            oss << "Warning: Config skipped. Workspace index mask mode is not implemented "
                          "yet in 2D max backward solvers that support wide pooling window."
                       << std::endl;
-            RETURN;
+            SKIP_RUN;
         }
 
         if(mask_idx && filter.ModeIsAveraging())
         {
-            std::cout << "Warning: Config skipped. Workspace index modes are irrelevant for "
+            oss << "Warning: Config skipped. Workspace index modes are irrelevant for "
                          "Average pooling. "
                          "In order to optimize performance of full tests, we "
                          "skip average pooling configs when (wsidx == 0). "
                          "Please make sure that dataset includes counterparts with (wsidx == 1)."
                       << std::endl;
-            RETURN;
+            SKIP_RUN;
         }
 
         // index size filter
@@ -985,10 +999,10 @@ auto st = sc::now();
 
             if(index_max <= index_needed)
             {
-                std::cout << "Warning: Config skipped: index mode " << filter.GetWorkspaceIndexMode()
+                oss << "Warning: Config skipped: index mode " << filter.GetWorkspaceIndexMode()
                     << " type " << filter.GetIndexType() << " is too small. max="
                     << index_max << ", needed=" << index_needed << std::endl;
-                RETURN;
+                SKIP_RUN;
             }
         }
 
@@ -1002,11 +1016,11 @@ auto st = sc::now();
             if(full_set && (sptl_dim == 3 || (mask_idx && sptl_dim == 2)) &&
                filter.GetMode() == miopenPoolingMax)
             {
-                std::cout << "Warning: Config skipped: uint8 index is too small "
+                oss << "Warning: Config skipped: uint8 index is too small "
                              "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) "
                              "&& filter.GetMode() == miopenPoolingMax"
                           << std::endl;
-                RETURN;
+                SKIP_RUN;
             }
             break;
         }
@@ -1014,21 +1028,21 @@ auto st = sc::now();
             if(full_set && (sptl_dim == 3 || (!mask_idx && sptl_dim == 2)) &&
                filter.GetMode() == miopenPoolingMax)
             {
-                std::cout << "Warning: Config skipped: uint16 index is too small "
+                oss << "Warning: Config skipped: uint16 index is too small "
                              "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) "
                              "&& filter.GetMode() == miopenPoolingMax"
                           << std::endl;
-                RETURN;
+                SKIP_RUN;
             }
             if(skip_many_configs_with_non_int8_index)
             {
                 // test_pooling_test --all limit uint16 cases
                 if(num_uint16_case >= max_typed_cases)
                 {
-                    std::cout << "Warning: Config skipped for the default dataset to speed "
+                    oss << "Warning: Config skipped for the default dataset to speed "
                                  "up testing (num_uint16_case > 5)"
                               << std::endl;
-                    RETURN;
+                    SKIP_RUN;
                 }
                 ++num_uint16_case;
             }
@@ -1043,10 +1057,10 @@ auto st = sc::now();
                 {
                     if(num_uint32_case >= max_typed_cases)
                     {
-                        std::cout << "Warning: Config skipped for the default dataset to speed up "
+                        oss << "Warning: Config skipped for the default dataset to speed up "
                                      "testing (wsidx == 0 && num_uint32_case > 5)"
                                   << std::endl;
-                        RETURN;
+                        SKIP_RUN;
                     }
                     ++num_uint32_case;
                 }
@@ -1054,10 +1068,10 @@ auto st = sc::now();
                 {
                     if(num_uint32_case_imgidx >= max_typed_cases)
                     {
-                        std::cout << "Warning: Config skipped for the default dataset to speed up "
+                        oss << "Warning: Config skipped for the default dataset to speed up "
                                      "testing (wsidx != 0 && num_uint32_case_imgidx > 5)"
                                   << std::endl;
-                        RETURN;
+                        SKIP_RUN;
                     }
                     ++num_uint32_case_imgidx;
                 }
@@ -1072,10 +1086,10 @@ auto st = sc::now();
                 {
                     if(num_uint64_case >= max_typed_cases)
                     {
-                        std::cout << "Warning: Config skipped for the default dataset to speed up "
+                        oss << "Warning: Config skipped for the default dataset to speed up "
                                      "testing (wsidx == 0) && (num_uint64_case > 5)"
                                   << std::endl;
-                        RETURN;
+                        SKIP_RUN;
                     }
                     ++num_uint64_case;
                 }
@@ -1083,11 +1097,11 @@ auto st = sc::now();
                 {
                     if(num_uint64_case_imgidx >= max_typed_cases && sptl_dim == 2)
                     {
-                        std::cout << "Warning: Config skipped to speed up testing of the "
+                        oss << "Warning: Config skipped to speed up testing of the "
                                      "default dataset (wsidx != 0) && (num_uint64_case_imgidx > 5 "
                                      "&& sptl_dim == 2)"
                                   << std::endl;
-                        RETURN;
+                        SKIP_RUN;
                     }
                     ++num_uint64_case_imgidx;
                 }
@@ -1103,10 +1117,10 @@ auto st = sc::now();
         {
             if(lens[i] > (input_desc.GetLengths()[i + 2] + static_cast<uint64_t>(2) * pads[i]))
             {
-                std::cout << "Warning: Config skipped becuse it is invalid "
+                oss << "Warning: Config skipped becuse it is invalid "
                              "(lens[i] > (input_desc.GetLengths()[i + 2] + 2 * pads[i]))"
                           << std::endl;
-                RETURN;
+                SKIP_RUN;
             }
         }
 
@@ -1120,13 +1134,18 @@ auto st = sc::now();
             size_t device_mem = get_handle().GetGlobalMemorySize();
             if(total_mem >= device_mem)
             {
-                std::cout << "Config skipped because it requires " << total_mem
+                oss << "Config skipped because it requires " << total_mem
                           << " Bytes to write all necessary tensors to GPU. GPU has " << device_mem
                           << " Bytes of memory." << std::endl;
-                RETURN;
+                SKIP_RUN;
             }
         }
 
+        CHECK_SKIP;
+
+        std::cout << "\n############     Run # " << std::setw(7) << num_all_case++ << " : ";
+        show_command();
+
         std::vector<int> in_dim(input_desc.GetLengths().begin(),
             input_desc.GetLengths().begin() + sptl_dim);
         std::vector<int> out_dim(sptl_dim);
diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp
index b30b9ce36f..b3acb61e07 100644
--- a/test/pooling2d.hpp
+++ b/test/pooling2d.hpp
@@ -38,33 +38,34 @@ struct pooling2d_shapes
     static std::vector<U> get_2d_pooling_input_shapes()
     {
         return {
-                {5, 32, 8, 8},
+                {5, 32, 8, 8}   // TEMPCODE RUN
+                ,
                 {16, 1, 4096, 4096},
                 {1, 16, 4096, 4096},
                 {1, 1024, 512, 512},
                 {16, 1024, 128, 128}
-                // ,
-                // {1, 832, 64, 128},
-                // {10, 3, 32, 32},
-                // {1, 19, 1024, 2048},
-                // {2, 1024, 12, 12},
-                // {4, 3, 231, 231},
-                // {8, 3, 227, 227},
-                // {1, 384, 13, 13},
-                // {1, 96, 27, 27},
-                // {2, 160, 7, 7},
-                // {1, 192, 256, 512},
-                // {2, 192, 28, 28},
-                // {1, 256, 56, 56},
-                // {4, 3, 224, 224},
-                // {2, 64, 112, 112},
-                // {2, 608, 4, 4},
-                // {1, 2048, 11, 11}
+                ,
+                {1, 832, 64, 128},
+                {10, 3, 32, 32},
+                {1, 19, 1024, 2048},
+                {2, 1024, 12, 12},
+                {4, 3, 231, 231},
+                {8, 3, 227, 227},
+                {1, 384, 13, 13},
+                {1, 96, 27, 27},
+                {2, 160, 7, 7},
+                {1, 192, 256, 512},
+                {2, 192, 28, 28},
+                {1, 256, 56, 56},
+                {4, 3, 224, 224},
+                {2, 64, 112, 112},
+                {2, 608, 4, 4},
+                {1, 2048, 11, 11}
         };
     }
 
     // Dataset 1 is intended for testing of asymmetric configs.
-    static std::vector<U> get_2d_pooling_input_shapes_minimal() { return {{1, 2, 8, 8}, {10, 3, 32, 32}}; } // {1, 1, 8, 8}, 
+    static std::vector<U> get_2d_pooling_input_shapes_minimal() { return {{1, 1, 8, 8}, {10, 3, 32, 32}}; }
 
     // Dataset 2 is intended for testing of configs with wide window.
     static std::vector<U> get_2d_pooling_input_shapes_wide()
diff --git a/test/pooling3d.hpp b/test/pooling3d.hpp
index 6792a5cc84..341cc34660 100644
--- a/test/pooling3d.hpp
+++ b/test/pooling3d.hpp
@@ -33,13 +33,15 @@ struct pooling3d_shapes
 
     static std::vector<U> get_3d_pooling_input_shapes()
     {
-        return {{16, 64, 3, 4, 4},
+        return {{16, 64, 3, 4, 4}   // TEMPCODE RUN
+        ,
                 {16, 32, 4, 9, 9},
                 {8, 512, 3, 14, 14},
                 {8, 512, 4, 28, 28},
                 {16, 64, 56, 56, 56},
                 {4, 3, 4, 227, 227},
-                {4, 4, 4, 161, 700}};
+                {4, 4, 4, 161, 700}
+                };
     }
 };
 

From e04542086e1e4bf4f1aacf8e9c5c18ab80ae9cb0 Mon Sep 17 00:00:00 2001
From: "Randy J. Spaulding" <rspauldi@amd.com>
Date: Tue, 8 Oct 2024 06:40:29 -0700
Subject: [PATCH 08/10] pr chkpt, pads1,1 broken

---
 src/CMakeLists.txt                            |  26 +-
 src/comgr.cpp                                 |   9 +-
 src/hipoc/hipoc_program.cpp                   |   1 -
 src/include/miopen/kernel_info.hpp            |   3 +
 src/include/miopen/pooling.hpp                |   3 +-
 src/include/miopen/pooling/invoke_params.hpp  |   1 -
 .../miopen/pooling/poolingNdNhwcArgs.hpp      |  50 +-
 .../MIOpenPoolingForwardNDNhwcNaive.cpp       | 180 +------
 src/ocl/pooling_ocl.cpp                       |   4 +-
 src/pooling/problem_description.cpp           |   5 +-
 src/solver.cpp                                |  13 +
 src/solver/pooling/forward2d.cpp              |  36 +-
 src/solver/pooling/forwardCk2d.cpp            |   2 +-
 src/solver/pooling/forwardNaive.cpp           |  26 +-
 src/solver/pooling/forwardNdNhwcNaive.cpp     |  92 +---
 test/gtest/layout_transpose.cpp               |  44 +-
 test/gtest/poolingFwd2dNaive.cpp              |  86 ++--
 test/gtest/poolingFwd3dNaive.cpp              |  46 +-
 test/gtest/pooling_testing.hpp                | 454 +++---------------
 test/pooling2d.hpp                            |   2 +-
 test/pooling_common.hpp                       | 187 +-------
 21 files changed, 251 insertions(+), 1019 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index df8c2f9614..359604bfd3 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -458,6 +458,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/miopen_utility.hpp
         kernels/neuron.inc
         kernels/pooling_functions.h
+        include/miopen/pooling/poolingNdNhwcArgs.hpp
         kernels/rocm_version.inc
         kernels/stride_array.hpp
         kernels/utilities.inc
@@ -599,24 +600,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/xform_bidirect_winograd_out.s
         kernels/UniversalTranspose.cl)
 
-# TEMPCODE KERNELS
-    set(MIOPEN_KERNELS
-        kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
-        kernels/MIOpenPooling.cl
-        kernels/MIOpenPoolingBwd.cl
-        kernels/MIOpenPoolingBwdND.cl
-        kernels/MIOpenPoolingForwardNaive.cl
-        kernels/MIOpenPoolingND.cl
-    )
-
-    set(MIOPEN_KERNEL_INCLUDES
-    kernels/bfloat16_dev.hpp
-    kernels/float_types.h
-    kernels/miopen_cstdint.hpp
-    kernels/pooling_functions.h
-    include/miopen/pooling/poolingNdNhwcArgs.hpp
-    )
-    # Kernels in development lists.
+        # Kernels in development lists.
     # Should be ALWAYS empty in develop branch (at the time of PR merge)
     # Intention: to speed up kernel development rebuild time
     set(MIOPEN_DEVELOPMENT_KERNELS)
@@ -793,12 +777,6 @@ else()
         )
 endif()
 
-# TEMPCODE LIB_INC
-include_directories(SYSTEM
-    /opt/rocm/include/gtest/
-    /opt/rocm/include/
-)
-
 rocm_set_soversion(MIOpen ${MIOpen_SOVERSION})
 
 clang_tidy_check(MIOpen)
diff --git a/src/comgr.cpp b/src/comgr.cpp
index 0fec8fa8ea..2c533f9623 100644
--- a/src/comgr.cpp
+++ b/src/comgr.cpp
@@ -887,7 +887,6 @@ class HiprtcProgram
         const auto log = GetLog(false);
         if(!log.empty())
             MIOPEN_LOG_I(log);
-        std::cout << "************** HipRTC compile log: '" << log << "'" << std::endl; // TEMPCODE RJS
     }
 
     void GetCode(std::vector<char>& bytes) const
@@ -990,9 +989,11 @@ void BuildHip(const std::string& name,
 
         HiprtcProgram prog(name, text);
         try{    // TEMPCODE RJS
-        prog.Compile(opts);
-        } catch(Error& ex) { std::cout << __FUNCTION__ << " : Exception calling prog.Compile!: " << ex.text << std::endl; throw(ex); }
-        prog.GetCode(binary);
+
+  std::cout << "Compling HIP: '" << name << "'" << std::endl; // TEMPCODE RJS
+                  prog.Compile(opts);
+         } catch(Error& ex) { std::cout << __FUNCTION__ << " : Exception calling prog.Compile!: " << ex.text << std::endl; throw(ex); }
+                prog.GetCode(binary);
     }
     catch(Error& ex)
     {
diff --git a/src/hipoc/hipoc_program.cpp b/src/hipoc/hipoc_program.cpp
index fa151bd3ed..ec090455e8 100644
--- a/src/hipoc/hipoc_program.cpp
+++ b/src/hipoc/hipoc_program.cpp
@@ -275,7 +275,6 @@ void HIPOCProgramImpl::BuildCodeObjectInMemory(const std::string& params,
 #endif
         if(filename.extension() == ".cpp")
         {
-            std::cout << "Compling HIP: '" << filename << "'" << std::endl; // TEMPCODE RJS
             hiprtc::BuildHip(filename.string(), src, params, target, binary);
         }
         else if(filename.extension() == ".s")
diff --git a/src/include/miopen/kernel_info.hpp b/src/include/miopen/kernel_info.hpp
index d2571afa32..d824fe5c78 100644
--- a/src/include/miopen/kernel_info.hpp
+++ b/src/include/miopen/kernel_info.hpp
@@ -48,6 +48,9 @@ struct KernelInfo
     fs::path kernel_file;
     std::string kernel_name;
     friend std::ostream& operator<<(std::ostream& os, const KernelInfo& k);
+
+    /// configures the working set using hip-style indices
+    void ConfigureHip(size_t l0, size_t l1, size_t l2, size_t g0, size_t g1, size_t g2);
 };
 
 std::vector<Program> PrecompileKernels(const Handle& h,
diff --git a/src/include/miopen/pooling.hpp b/src/include/miopen/pooling.hpp
index 9cf2bae82f..26805f6df8 100644
--- a/src/include/miopen/pooling.hpp
+++ b/src/include/miopen/pooling.hpp
@@ -148,8 +148,7 @@ struct MIOPEN_EXPORT PoolingDescriptor : miopenPoolingDescriptor
                            Data_t y,
                            bool save_index,
                            Data_t workSpace,
-                           size_t workSpaceSize,
-                           Data_t junk = nullptr) const;  // TEMPCODE RJS
+                           size_t workSpaceSize) const;
 
     miopenStatus_t Backward(Handle& handle,
                             const void* alpha,
diff --git a/src/include/miopen/pooling/invoke_params.hpp b/src/include/miopen/pooling/invoke_params.hpp
index 2140c0a1fb..2d55786c21 100644
--- a/src/include/miopen/pooling/invoke_params.hpp
+++ b/src/include/miopen/pooling/invoke_params.hpp
@@ -45,7 +45,6 @@ struct FwdInvokeParams : public miopen::InvokeParams
     Data_t y                   = nullptr;
     Data_t workspace           = nullptr;
     std::size_t workspace_size = 0;
-    Data_t junk                   = nullptr;    // TEMPCODE RJS
 
     std::size_t GetWorkspaceSize() const { return workspace_size; }
     Data_t GetWorkspace() const { return workspace; }
diff --git a/src/include/miopen/pooling/poolingNdNhwcArgs.hpp b/src/include/miopen/pooling/poolingNdNhwcArgs.hpp
index 9fd15bd4bd..5084f28b4f 100644
--- a/src/include/miopen/pooling/poolingNdNhwcArgs.hpp
+++ b/src/include/miopen/pooling/poolingNdNhwcArgs.hpp
@@ -1,19 +1,45 @@
 #pragma once
 
-// #include <stdint.h>
-
-using BIGONE = uint32_t; // TEMPCODE RJS 
-
 struct poolingNdNhwcArgs
 {
-    uint32_t filter_d; uint32_t filter_h; uint32_t filter_w;
-    uint32_t filter_d_stride; uint32_t filter_h_stride; uint32_t filter_w_stride;
-    uint32_t filter_d_pad; uint32_t filter_h_pad; uint32_t filter_w_pad;
+    uint32_t filter_d;
+    uint32_t filter_h;
+    uint32_t filter_w;
+
+    uint32_t filter_d_stride;
+    uint32_t filter_h_stride;
+    uint32_t filter_w_stride;
+
+    uint32_t filter_d_pad;
+    uint32_t filter_h_pad;
+    uint32_t filter_w_pad;
+
     uint32_t all_n;
     uint32_t all_c;
-    uint32_t bot_d; uint32_t bot_h; uint32_t bot_w;
-    BIGONE bot_n_stride; uint32_t bot_c_stride; BIGONE bot_d_stride; uint32_t bot_h_stride; uint32_t bot_w_stride;
-    uint32_t top_d; uint32_t top_h; uint32_t top_w;
-    BIGONE top_n_stride; uint32_t top_c_stride; BIGONE top_d_stride; uint32_t top_h_stride; uint32_t top_w_stride;
-    BIGONE mask_n_stride; BIGONE mask_c_stride; uint32_t mask_d_stride; uint32_t mask_h_stride; uint32_t mask_w_stride;
+
+    uint32_t bot_d;
+    uint32_t bot_h;
+    uint32_t bot_w;
+
+    uint32_t bot_n_stride;
+    uint32_t bot_c_stride;
+    uint32_t bot_d_stride;
+    uint32_t bot_h_stride;
+    uint32_t bot_w_stride;
+
+    uint32_t top_d;
+    uint32_t top_h;
+    uint32_t top_w;
+
+    uint32_t top_n_stride;
+    uint32_t top_c_stride;
+    uint32_t top_d_stride;
+    uint32_t top_h_stride;
+    uint32_t top_w_stride;
+
+    uint32_t mask_n_stride;
+    uint32_t mask_c_stride;
+    uint32_t mask_d_stride;
+    uint32_t mask_h_stride;
+    uint32_t mask_w_stride;
 };
diff --git a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
index be6a548091..c502cf625f 100644
--- a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
+++ b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp
@@ -24,26 +24,10 @@
  *
  *******************************************************************************/
 
-//#define TEMPCODE RJS
-// #ifdef TEMPCODE
-// #define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 0
-
-// #define MLO_POOLING_OP_ID MLO_POOLING_OP_AVE
-
-// #define INPUT_TYPE _FLOAT
-// #define OUTPUT_TYPE _FLOAT
-// // #define TI INPUT_TYPE
-// // #define TO OUTPUT_TYPE
-// #define CVT_FP32_2ACCUM(x) (x)
-// #endif
-
 #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
 #include <hip/hip_runtime.h>
 #endif
 
-#ifdef TEMPCODE
-#include "float_types.h"
-#endif
 #include "pooling_functions.h"
 #include "poolingNdNhwcArgs.hpp"
 
@@ -57,7 +41,6 @@
 
 // Let's use extended-precision accumulator only in FP16 pooling and only for averaging.
 // For all other ops and datatypes, use native accumulator, i.e. treate FLOAT_ACCUM as FLOAT.
-#ifndef TEMPCODE
 #if !(AVERAGE_OPS && MIOPEN_USE_FP16)
 #define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 1
 // #else
@@ -88,8 +71,6 @@
     #define NATIVE_UNCAST(_x)   (_FLOAT)(_x)
 #endif
 
-#endif // TEMPCODE
-
 #if AVERAGE_OPS
 #define ARG_UNUSED_FOR_AVERAGE __attribute__((__unused__))
 #else
@@ -101,7 +82,6 @@
 template <typename TI>
 __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
                                     TI* __restrict__ top_ptr,
-                                    float* __restrict__ junk_ptr,  // TEMPCODE RJS
                                     ARG_UNUSED_FOR_AVERAGE index_t* __restrict__ mask_ptr,
                                     ARG_UNUSED_FOR_AVERAGE int save_index,
                                     ARG_UNUSED_FOR_AVERAGE int index_mode,
@@ -112,7 +92,7 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
     const uint32_t nd = blockIdx.x;
     const uint32_t h_ = blockIdx.y;
     const uint32_t w_c = blockIdx.z;
-    const uint32_t w_ = w_c % args.top_w;                               // CAN w=fast index
+    const uint32_t w_ = w_c % args.top_w;                   // CAN w=fast index
 
     const uint32_t C_WH = blockDim.x;
     const uint32_t _H = blockDim.y;
@@ -122,11 +102,11 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
     const uint32_t _h = threadIdx.y;
     const uint32_t _w = threadIdx.z;
 
-    const uint32_t nn = nd / args.top_d;                                // n=slow index
-    const uint32_t cc = (w_c / args.top_w) * C_WH + c;                  // c=slow index (lg-C)
-    const uint32_t td = nd % args.top_d;                                // top d=fast index
-    const uint32_t th = h_ * _H + _h;                                   // top h: blockIdx is slow (sm-C)
-    const uint32_t tw = w_ * _W + _w;                                   // top w: blockIdx is slow (sm-C)
+    const uint32_t nn = nd / args.top_d;                    // n=slow index
+    const uint32_t cc = (w_c / args.top_w) * C_WH + c;      // c=slow index (lg-C)
+    const uint32_t td = nd % args.top_d;                    // top d=fast index
+    const uint32_t th = h_ * _H + _h;                       // top h: blockIdx is slow (sm-C)
+    const uint32_t tw = w_ * _W + _w;                       // top w: blockIdx is slow (sm-C)
 
     if(nn >= args.all_n) return;
     if(td >= args.top_d) return;
@@ -134,96 +114,16 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
     if(tw >= args.top_w) return;
     if(cc >= args.all_c) return;
 
-    for(int i = 4; i < 320; ++i)
-    {
-        junk_ptr[i] = 1.11111;
-    }
-    auto log_ptr = junk_ptr;
-    if(blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && threadIdx.x == 0 &&  threadIdx.y == 0 &&  threadIdx.z == 0)
-    {
-        int idx = 8;
-        log_ptr[idx++] = gridDim.x;     // ND
-        log_ptr[idx++] = gridDim.y;     // H
-        log_ptr[idx++] = gridDim.z;     // W (*C overflow)
-        log_ptr[idx++] = -9;
-
-        log_ptr[idx++] = blockDim.x;    // C
-        log_ptr[idx++] = blockDim.y;    // small-C H
-        log_ptr[idx++] = blockDim.z;    // small-C W
-        log_ptr[idx++] = -8;
-
-        log_ptr[idx++] = args.filter_d;
-        log_ptr[idx++] = args.filter_h;
-        log_ptr[idx++] = args.filter_w;
-        log_ptr[idx++] = -7;
-
-        log_ptr[idx++] = args.filter_d_stride;
-        log_ptr[idx++] = args.filter_h_stride;
-        log_ptr[idx++] = args.filter_w_stride;
-        log_ptr[idx++] = -6;
-
-        log_ptr[idx++] = args.filter_d_pad;
-        log_ptr[idx++] = args.filter_h_pad;
-        log_ptr[idx++] = args.filter_w_pad;
-        log_ptr[idx++] = -5;
-
-        log_ptr[idx++] = args.all_n;
-        log_ptr[idx++] = args.all_c;
-        log_ptr[idx++] = args.bot_n_stride;
-        log_ptr[idx++] = args.bot_c_stride;
-
-        log_ptr[idx++] = args.top_n_stride;
-        log_ptr[idx++] = args.top_c_stride;
-        #if AVERAGE_OPS
-        log_ptr[idx++] = -4;
-        log_ptr[idx++] = -4;
-        #else
-        log_ptr[idx++] = args.mask_n_stride;
-        log_ptr[idx++] = args.mask_c_stride;
-        #endif
-
-        log_ptr[idx++] = args.bot_d;
-        log_ptr[idx++] = args.bot_h;
-        log_ptr[idx++] = args.bot_w;
-        log_ptr[idx++] = -3;
-
-        log_ptr[idx++] = args.bot_d_stride;
-        log_ptr[idx++] = args.bot_h_stride;
-        log_ptr[idx++] = args.bot_w_stride;
-        log_ptr[idx++] = -2;
-
-        log_ptr[idx++] = args.top_d;
-        log_ptr[idx++] = args.top_h;
-        log_ptr[idx++] = args.top_w;
-        log_ptr[idx++] = -1;
-    
-        log_ptr[idx++] = args.top_d_stride;
-        log_ptr[idx++] = args.top_h_stride;
-        log_ptr[idx++] = args.top_w_stride;
-        log_ptr[idx++] = -9;
-
-        #if AVERAGE_OPS
-        log_ptr[idx++] = -8;
-        log_ptr[idx++] = -8;
-        log_ptr[idx++] = -8;
-        #else
-        log_ptr[idx++] = args.mask_d_stride;
-        log_ptr[idx++] = args.mask_h_stride;
-        log_ptr[idx++] = args.mask_w_stride;
-        #endif
-        log_ptr[idx++] = -7;
-    }
-
     const auto int_dstart   = static_cast<int64_t>(td * args.filter_d_stride) - static_cast<int64_t>(args.filter_d_pad);
-    const auto dend         = static_cast<size_t>(min(int_dstart + static_cast<int64_t>(args.filter_d), static_cast<int64_t>(args.bot_d)));
+    /* const */ auto dend         = static_cast<size_t>(min(int_dstart + static_cast<int64_t>(args.filter_d), static_cast<int64_t>(args.bot_d)));
     const auto dstart       = static_cast<size_t>(max(int_dstart, 0));
 
     const auto int_hstart   = static_cast<int>(th * args.filter_h_stride) - static_cast<int>(args.filter_h_pad);
-    const auto hend             = static_cast<uint32_t>(min(int_hstart + static_cast<int>(args.filter_h), static_cast<int>(args.bot_h)));
+    /* const */ auto hend             = static_cast<uint32_t>(min(int_hstart + static_cast<int>(args.filter_h), static_cast<int>(args.bot_h)));
     const auto hstart         = static_cast<uint32_t>(max(int_hstart, 0));
 
     const auto int_wstart        = static_cast<int>(tw * args.filter_w_stride) - static_cast<int>(args.filter_w_pad);
-    const auto wend             = static_cast<uint32_t>(min(int_wstart + static_cast<int>(args.filter_w), static_cast<int>(args.bot_w)));
+    /* const */ auto wend             = static_cast<uint32_t>(min(int_wstart + static_cast<int>(args.filter_w), static_cast<int>(args.bot_w)));
     const auto wstart           = static_cast<uint32_t>(max(int_wstart, 0));
 
     size_t top_index = 
@@ -233,32 +133,6 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
             th * args.top_h_stride +            //
             tw * args.top_w_stride;
 
-#if true
-    size_t junk_idx = 64;
-    int bot_idx = 0;
-    junk_ptr[junk_idx++] = NATIVE_CAST(bot_ptr[bot_idx++]);
-    junk_ptr[junk_idx++] = NATIVE_CAST(bot_ptr[bot_idx++]);
-    junk_ptr[junk_idx++] = NATIVE_CAST(bot_ptr[bot_idx++]);
-    junk_ptr[junk_idx++] = NATIVE_CAST(bot_ptr[bot_idx++]);
-    junk_idx += 4 * th;
-    if(nn == 0 && cc == 0 && td == 0 && tw < 8 && th == 0)
-    {
-
-        size_t bot_ncd = static_cast<size_t>(nn * args.bot_n_stride + cc * args.bot_c_stride + dstart * args.bot_d_stride);
-            size_t bot_ncdh = bot_ncd + hstart * args.bot_h_stride;
-                size_t bot_index = bot_ncdh + wstart * args.bot_w_stride;
-
-        junk_ptr[junk_idx++] = top_index;
-        junk_ptr[junk_idx++] = bot_index;
-        junk_ptr[junk_idx++] = dstart;
-        junk_ptr[junk_idx++] = dend;
-        junk_ptr[junk_idx++] = hstart;
-        junk_ptr[junk_idx++] = hend;
-        junk_ptr[junk_idx++] = wstart;
-        junk_ptr[junk_idx++] = wend;
-    }
-#endif
-
 #if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE
     uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
     pool_size       = (pool_size == 0) ? 1 : pool_size;
@@ -315,35 +189,27 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
 #else // MAX
     if(save_index)
     {
-        index_t res_index = 5150;
+        index_t res_index = saved_index;
 
-        // / Preventing overflow during computation of res_index:
-        // / If Index is shorter than uint, then let's perform computation in 32-bit
-        // / domain and then convert to narrower Index. That would reduce the probability of
-        // / overflow. If Index is wider then 32 bits, then it seems like it is better to
-        // / convert to Index type before multiplication. However this is not actually
-        // / necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in
-        // / 32 bits and then convert.
+        /// Preventing overflow during computation of res_index:
+        /// If Index is shorter than uint, then let's perform computation in 32-bit
+        /// domain and then convert to narrower Index. That would reduce the probability of
+        /// overflow. If Index is wider then 32 bits, then it seems like it is better to
+        /// convert to Index type before multiplication. However this is not actually
+        /// necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in
+        /// 32 bits and then convert.
 
         if(found)
         {
-            if(index_mode == 1) // TEMPCODE RJS
-                res_index = saved_index;
-                // res_index = (index_t)(              //
-                //     nn * args.bot_n_stride          //
-                //     + cc * args.bot_c_stride        //
-                //     + d_save * args.bot_d_stride    //
-                //     + h_save * args.bot_h_stride    //
-                //     + w_save * args.bot_w_stride);
-            else
+            if(index_mode == 0)
                 res_index = (index_t)(                                                    //
-                    ((d_save - td * args.filter_d_stride + args.filter_d_pad) * args.filter_h * args.filter_w) //
-                    + ((h_save - th * args.filter_h_stride + args.filter_h_pad) * args.filter_w)          //
-                    + (w_save - tw * args.filter_w_stride + args.filter_w_pad)                       //
+                    ((d_save - td * args.filter_d_stride + args.filter_d_pad) * args.filter_h * args.filter_w) + //
+                    ((h_save - th * args.filter_h_stride + args.filter_h_pad) * args.filter_w) +         //
+                    (w_save - tw * args.filter_w_stride + args.filter_w_pad)                       //
                 );
         }
 
-        const size_t mask_index = nn * args.mask_n_stride             //
+        const size_t mask_index = nn * args.mask_n_stride               //
                                     + cc * args.mask_c_stride           //
                                     + (size_t)(td * args.mask_d_stride) //
                                     + (size_t)(th * args.mask_h_stride) //
@@ -358,7 +224,6 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr,
 extern "C" __global__ void mloPoolingForwardNDNhwcNaive(
                                     const INPUT_TYPE* __restrict__ bot_ptr,
                                     INPUT_TYPE* __restrict__ top_ptr,
-                                    float* __restrict__ junk_ptr,    // TEMPCODE RJS
                                     index_t* __restrict__ mask_ptr,
                                     int save_index,
                                     int index_mode,
@@ -368,7 +233,6 @@ extern "C" __global__ void mloPoolingForwardNDNhwcNaive(
     poolingForwardNDNhwcNaive<INPUT_TYPE>(
         bot_ptr,
         top_ptr,
-        junk_ptr,
         mask_ptr,
         save_index,
         index_mode,
diff --git a/src/ocl/pooling_ocl.cpp b/src/ocl/pooling_ocl.cpp
index 9a7e166081..6b79def0b0 100644
--- a/src/ocl/pooling_ocl.cpp
+++ b/src/ocl/pooling_ocl.cpp
@@ -68,8 +68,7 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle,
                                           Data_t y,
                                           bool save_index,
                                           Data_t workSpace,
-                                          size_t workSpaceSize,
-                                          Data_t junk) const    // TEMPCODE RJS
+                                          size_t workSpaceSize) const
 {
 
     if(!float_equal(*(static_cast<const float*>(alpha)), 1.0) ||
@@ -132,7 +131,6 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle,
         tmp.y              = y;
         tmp.workspace      = workSpace;
         tmp.workspace_size = workSpaceSize;
-        tmp.junk = junk;    // TEMPCODE RJS
         return tmp;
     }();
 
diff --git a/src/pooling/problem_description.cpp b/src/pooling/problem_description.cpp
index a96101be3a..df21b8cd93 100644
--- a/src/pooling/problem_description.cpp
+++ b/src/pooling/problem_description.cpp
@@ -83,10 +83,9 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const
         ss << "_dxs" << get_vect_config(dxDesc.GetStrides());
         ss << "_dyd" << get_vect_config(dyDesc.GetLengths());
         ss << "_dys" << get_vect_config(dyDesc.GetStrides());
-    }   // TEMPCODE RJS
-    std::cout << "\n************** xDesc layout: " << xDesc.GetLayout_str() << " (" << static_cast<int>(xDesc.GetLayout_t()) << (xDesc.IsDefaultLayout() ? ")" : " not default)") << " *************************" << std::endl;
+    }
     ss << "_l" << static_cast<int>(xDesc.GetLayout_t());
-std::cout << "               " << ss.str() << std::endl;
+
     return NetworkConfig{ss.str()};
 }
 
diff --git a/src/solver.cpp b/src/solver.cpp
index 28ca3751a8..c9607df252 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -66,6 +66,19 @@ std::ostream& operator<<(std::ostream& os, const KernelInfo& k)
     return os << "} '" << k.comp_options << '\'';
 }
 
+void KernelInfo::ConfigureHip(size_t l0, size_t l1, size_t l2, size_t g0, size_t g1, size_t g2) 
+{
+    l_wk.clear();
+    l_wk.push_back(l0);
+    l_wk.push_back(l1);
+    l_wk.push_back(l2);
+
+    g_wk.clear();
+    g_wk.push_back(g0 * l0);
+    g_wk.push_back(g1 * l1);
+    g_wk.push_back(g2 * l2);
+}
+
 std::vector<Program>
 PrecompileKernels(const Handle& h, const std::vector<KernelInfo>& kernels, bool force_attach_binary)
 {
diff --git a/src/solver/pooling/forward2d.cpp b/src/solver/pooling/forward2d.cpp
index aa8fdb2f6b..b757c07d54 100644
--- a/src/solver/pooling/forward2d.cpp
+++ b/src/solver/pooling/forward2d.cpp
@@ -137,7 +137,7 @@ bool PoolingForward2d::IsApplicable(const ExecutionContext& context,
 {
     auto x_type = problem.GetXDesc().GetType();
     auto y_type = problem.GetYDesc().GetType();
-    std::vector<miopenDataType_t> types {miopenFloat, miopenHalf};    // TEMPCODE RJS fix types , miopenInt8, miopenFloat8
+    std::vector<miopenDataType_t> types {miopenFloat, miopenHalf};    // TRJS TODO fix types , miopenInt8, miopenFloat8
 
     auto x_layout = problem.GetXDesc().GetLayout_str();
     auto y_layout = problem.GetYDesc().GetLayout_str();
@@ -153,21 +153,7 @@ bool PoolingForward2d::IsApplicable(const ExecutionContext& context,
            sizeof_private_memory(problem) <=
                TargetProperties::GetMaxWaveScratchSize() / context.GetStream().GetWavefrontWidth();
 
-// TEMPCODE RJS
-    std::cout << "%%%%%%%%%% PoolingForward2d::IsApplicable: " << app << " " <<  problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout(x_layout) << std::endl;
-               return app;
-}
-
-#include <iomanip>  // TEMPCODE RJS
-namespace {
-    template<typename T>
-    std::ostream& printVec(std::string name, const std::vector<T>& vec)
-    {
-        return std::cout;
-        std::cout << "Vector Printing: " << std::setw(20) << name << "[" << vec.size() << "]: ";
-        for(auto i : vec)    std::cout << std::setw(8) << i;
-        return std::cout;
-    }
+    return app;
 }
 
 ConvSolution PoolingForward2d::GetSolution(const ExecutionContext&,
@@ -206,24 +192,6 @@ ConvSolution PoolingForward2d::GetSolution(const ExecutionContext&,
                 : ((pool_d.GetMode() == miopenPoolingAverage) ? MLO_POOLING_OP_AVE
                                                               : MLO_POOLING_OP_AVE_INCLUSIVE);
 
-
-    // TEMPCODE RJS
-    // const auto bot  = problem.GetXDesc();
-    // const auto top  = problem.GetYDesc();
-    // const auto& pooling = problem.GetPooling();
-    // const auto& lengths = pooling.GetLengths();
-    // const auto& strides = pooling.GetStrides();
-    // const auto& pads    = pooling.GetPads();
-
-    // std::cout << "PoolingForward2d GetSolution: " << std::endl;
-    // printVec("   bot lengths", bot.GetLengths()) <<
-    // printVec("   bot strides", bot.GetStrides()) << std::endl;
-    // printVec("   top lengths", top.GetLengths()) <<
-    // printVec("   top strides", top.GetStrides()) << std::endl;
-    // printVec("   pool lengths", lengths) <<
-    // printVec("   pool strides", strides) <<
-    // printVec("   pool pads", pads) << std::endl;
-
         auto build_params = KernelBuildParameters{
             {"MLO_POOLING_OP_ID", pooling_method},
             {"MLO_POOLING_KERNEL_SZ1", kp.kernel_size_h},
diff --git a/src/solver/pooling/forwardCk2d.cpp b/src/solver/pooling/forwardCk2d.cpp
index 9681f7f563..7ac0c15170 100644
--- a/src/solver/pooling/forwardCk2d.cpp
+++ b/src/solver/pooling/forwardCk2d.cpp
@@ -135,7 +135,7 @@ std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& pro
 bool PoolingForwardCk2d::IsApplicable(const ExecutionContext& context,
                                     const miopen::pooling::ProblemDescription& problem) const
 {
-    return false;
+    return false;   // TRJS not active yet
     return problem.GetDirection() == miopen::pooling::Direction::Forward &&
            problem.GetXDesc().GetNumDims() == 4 &&
            problem.GetXDesc().GetType() == problem.GetYDesc().GetType() &&
diff --git a/src/solver/pooling/forwardNaive.cpp b/src/solver/pooling/forwardNaive.cpp
index faf4fac836..9039d49836 100644
--- a/src/solver/pooling/forwardNaive.cpp
+++ b/src/solver/pooling/forwardNaive.cpp
@@ -86,19 +86,6 @@ bool PoolingForwardNaive::IsApplicable(const ExecutionContext&,
         && (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end());
 }
 
-
-#include <iomanip>  // TEMPCODE RJS
-namespace {
-    template<typename T>
-    void printVec(std::string name, const std::vector<T>& vec)
-    {
-         return;
-       std::cout << "Vector Printing: " << std::setw(20) << name << "[" << vec.size() << "]: ";
-        for(auto i : vec)    std::cout << std::setw(8) << i;
-        std::cout << std::endl;
-    }
-}
-
 ConvSolution
 PoolingForwardNaive::GetSolution(const ExecutionContext& context,
                                  const miopen::pooling::ProblemDescription& problem) const
@@ -136,17 +123,6 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context,
     const auto index_mode = pooling.GetWorkspaceIndexMode();
     const auto index_type = pooling.GetIndexType();
 
-    // TEMPCODE RJS
-    std::cout << "======================================================================" << std::endl;
-    printVec("bot lengths", bot.GetLengths());
-    printVec("bot strides", bot.GetStrides());
-    printVec("top lengths", top.GetLengths());
-    printVec("top strides", top.GetStrides());
-    printVec("pool lengths", lengths);
-    printVec("pool strides", strides);
-    printVec("pool pads", pads);
-    std::cout << "======================================================================" << std::endl;
-
     /// \anchor multiply_dims_overflow_assumption
     ///
     /// Preventing overflow during dimension-related computations:
@@ -290,7 +266,7 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context,
                 filter_w_pad,
                 all_n,
                 all_c,
-                bot_d,  // TEMPCODE RJS: have not broke it
+                bot_d,
                 bot_h,
                 bot_w,
                 bot_n_stride,
diff --git a/src/solver/pooling/forwardNdNhwcNaive.cpp b/src/solver/pooling/forwardNdNhwcNaive.cpp
index d31df3358d..86555d9560 100644
--- a/src/solver/pooling/forwardNdNhwcNaive.cpp
+++ b/src/solver/pooling/forwardNdNhwcNaive.cpp
@@ -50,21 +50,6 @@ bool IsPower2(T v)
 }
 #endif
 
-template <typename T>
-T RoundUpNearestPower2Positive(T v) = delete;
-
-inline uint32_t RoundUpNearestPower2Positive(uint32_t v)
-{
-    assert(v > 0);
-    --v;
-    v |= v >> 1;
-    v |= v >> 2;
-    v |= v >> 4;
-    v |= v >> 8;
-    v |= v >> 16;
-    return std::max(++v, 1U); // Shut clang-tidy.
-}
-
 } // namespace
 
 bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&,
@@ -88,30 +73,9 @@ bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&,
         && (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend())          //)
         && (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end());
 
-    // TODO RJS check grid size
-
-    std::cout << "%%%%%%%%%% PoolingForwardNDNhwcNaive::IsApplicable: " << app << " " <<  problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout(x_layout)
-     << "  " << problem.GetYDesc().GetLayout_str() << "->" << problem.GetYDesc().GetLayout(y_layout)
-       << "  "  << (problem.GetDirection() == miopen::pooling::Direction::Forward)
-        << (x_type == y_type)
-        << (x_layout == y_layout) << (std::find(types.cbegin(), types.cend(), x_type) != types.cend())
-        << (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend()) << (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()) << std::endl;
-
     return app;
 }
 
-#include <iomanip>  // TEMPCODE RJS
-namespace {
-    template<typename T>
-    void printVec(std::string name, std::vector<T> vec)
-    {
-         return;
-      std::cout << "Vector Printing: " << std::setw(20) << name << "[" << vec.size() << "]: ";
-        for(auto i : vec)    std::cout << std::setw(8) << i;
-        std::cout << std::endl;
-    }
-}
-
 ConvSolution
 PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context,
                                  const miopen::pooling::ProblemDescription& problem) const
@@ -176,17 +140,6 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context,
     /// not require widening to size_t prior mul, but (d_stride * dim * dim)
     /// requires it because the total number of muls is 4.
 
-    // TEMPCODE RJS
-    printVec("======================================================================", std::vector<int>{});
-    printVec("bot lengths", bot.GetLengths());
-    printVec("bot strides", bot.GetStrides());
-    printVec("top lengths", top.GetLengths());
-    printVec("top strides", top.GetStrides());
-    printVec("pool lengths", lengths);
-    printVec("pool strides", strides);
-    printVec("pool pads", pads);
-    printVec("======================================================================", std::vector<int>{});
-
     const auto spatial_dim = is2d ? 2U : 3U;
 
     std::tie(args.all_n, args.all_c, args.bot_d, args.bot_h, args.bot_w) = miopen::GetNCDHW(spatial_dim, bot.GetLengths());
@@ -204,7 +157,7 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context,
     args.mask_w_stride = 1;
     args.mask_h_stride = args.mask_w_stride * args.top_w;
     args.mask_d_stride = args.mask_h_stride * args.top_h;
-    args.mask_c_stride   = static_cast<BIGONE>(args.mask_d_stride) * args.top_d;
+    args.mask_c_stride   = args.mask_d_stride * args.top_d;
     args.mask_n_stride   = args.mask_c_stride * args.all_c;
 
     /// About optimal grid size:
@@ -305,19 +258,16 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context,
         // * 2: layout (NCHW vs NHWC)
         // * 2: 2D and 3D kernels (optimization)
 
+        kernel.ConfigureHip(l0, l1, l2, g0, g1, g2);
         // KernelInfo uses OCL-style indexes
-        kernel.l_wk.clear();
-        kernel.l_wk.push_back(l0);
-        kernel.l_wk.push_back(l1);
-        kernel.l_wk.push_back(l2);
-        kernel.g_wk.clear();
-        kernel.g_wk.push_back(g0 * l0);
-        kernel.g_wk.push_back(g1 * l1);
-        kernel.g_wk.push_back(g2 * l2);
-
-        // TEMPCODE RJS
-std::cout << "Kernel dims: g[" << kernel.g_wk.size() << "] " << kernel.g_wk[0] << " " << kernel.g_wk[1] << " " << kernel.g_wk[2]
-<< " | l[" << kernel.l_wk.size() << "] " << kernel.l_wk[0] << " " << kernel.l_wk[1] << " " << kernel.l_wk[2] << std::endl;
+        // kernel.l_wk.clear();
+        // kernel.l_wk.push_back(l0);
+        // kernel.l_wk.push_back(l1);
+        // kernel.l_wk.push_back(l2);
+        // kernel.g_wk.clear();
+        // kernel.g_wk.push_back(g0 * l0);
+        // kernel.g_wk.push_back(g1 * l1);
+        // kernel.g_wk.push_back(g2 * l2);
         result.construction_params.push_back(kernel);
     }
 
@@ -329,20 +279,20 @@ std::cout << "Kernel dims: g[" << kernel.g_wk.size() << "] " << kernel.g_wk[0] <
             kernel(
                 params.x,
                 params.y,
-                params.junk,   // TEMPCODE RJS
                 params.workspace,
                 save_index,
                 index_mode,
-                args.filter_d, args.filter_h, args.filter_w,
-                args.filter_d_stride, args.filter_h_stride, args.filter_w_stride,
-                args.filter_d_pad, args.filter_h_pad, args.filter_w_pad,
-                args.all_n,
-                args.all_c,
-                args.bot_d, args.bot_h, args.bot_w,
-                args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride,
-                args.top_d, args.top_h, args.top_w,
-                args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride, args.top_w_stride,
-                args.mask_n_stride, args.mask_c_stride, args.mask_d_stride, args.mask_h_stride, args.mask_w_stride
+                args
+                // args.filter_d, args.filter_h, args.filter_w,
+                // args.filter_d_stride, args.filter_h_stride, args.filter_w_stride,
+                // args.filter_d_pad, args.filter_h_pad, args.filter_w_pad,
+                // args.all_n,
+                // args.all_c,
+                // args.bot_d, args.bot_h, args.bot_w,
+                // args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride,
+                // args.top_d, args.top_h, args.top_w,
+                // args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride, args.top_w_stride,
+                // args.mask_n_stride, args.mask_c_stride, args.mask_d_stride, args.mask_h_stride, args.mask_w_stride
             );
         };
     };
diff --git a/test/gtest/layout_transpose.cpp b/test/gtest/layout_transpose.cpp
index 1161eff2b9..bab82000d7 100644
--- a/test/gtest/layout_transpose.cpp
+++ b/test/gtest/layout_transpose.cpp
@@ -275,51 +275,9 @@ struct LayoutTransposeTest_2D : public ::testing::TestWithParam<std::tuple<uint3
                     tensor_len, layout_default, layout_string, tensor_strides);
 
                 auto t_src     = tensor<T>{tensor_len, tensor_strides};
-        bool printing = true; // in_dim[0]==8 && in_dim[1]==8;
-        if(printing)
-        {
-            auto inlen = t_src.desc.GetLengths();
-            auto instr = t_src.desc.GetStrides();
-            std::cout << "CPU in : ";
-            for(auto dim : inlen) std::cout << std::setw(4) << dim;
-            std::cout << " | ";
-            for(auto str : instr) std::cout << std::setw(4) << str;
-            std::cout << std::endl;
-
-            for(int nn = 0; nn < inlen[0]; ++nn) {
-                for(int cc = 0; cc < inlen[1]; ++cc) {
-            for(int hh = 0; hh < inlen[2]; ++hh) {
-                for(int ww = 0; ww < inlen[3]; ++ww) {
-                    std::cout << std::setw(11) << std::setprecision(5) << t_src(nn * instr[0] + cc * instr[1] + hh * instr[2] + ww * instr[3]) << "  ";
-                }
-            std::cout << std::endl;
-            }
-            }
-            }
-        }
 
                 t_src.generate(gen_value<T>);
-        if(printing)
-        {
-            auto inlen = t_src.desc.GetLengths();
-            auto instr = t_src.desc.GetStrides();
-            std::cout << "CPU in : ";
-            for(auto dim : inlen) std::cout << std::setw(4) << dim;
-            std::cout << " | ";
-            for(auto str : instr) std::cout << std::setw(4) << str;
-            std::cout << std::endl;
-
-            for(int nn = 0; nn < inlen[0]; ++nn) {
-                for(int cc = 0; cc < inlen[1]; ++cc) {
-            for(int hh = 0; hh < inlen[2]; ++hh) {
-                for(int ww = 0; ww < inlen[3]; ++ww) {
-                    std::cout << std::setw(11) << std::setprecision(5) << t_src(nn * instr[0] + cc * instr[1] + hh * instr[2] + ww * instr[3]) << "  ";
-                }
-            std::cout << std::endl;
-            }
-            }
-            }
-        }
+
                 auto t_dst     = tensor<T>{tensor_len, tensor_strides};
                 auto t_dst_gpu = tensor<T>{tensor_len, tensor_strides};
 
diff --git a/test/gtest/poolingFwd2dNaive.cpp b/test/gtest/poolingFwd2dNaive.cpp
index 828fc556cb..d09c35302a 100644
--- a/test/gtest/poolingFwd2dNaive.cpp
+++ b/test/gtest/poolingFwd2dNaive.cpp
@@ -114,12 +114,12 @@ struct layout_data
 
 }
 
-class PoolingFwd2d : public testing::TestWithParam<std::vector<std::string>> {};
-class PoolingFwd2dInt8 : public PoolingFwd2d {};
-class PoolingFwd2dFloat : public PoolingFwd2d {};
-class PoolingFwd2dHalf : public PoolingFwd2d {};
-class PoolingFwd2dBF16 : public PoolingFwd2d {};
-class PoolingFwd2dF8 : public PoolingFwd2d {};
+class Pooling2d : public testing::TestWithParam<std::vector<std::string>> {};
+class Pooling2dInt8 : public Pooling2d {};
+class Pooling2dFloat : public Pooling2d {};
+class Pooling2dHalf : public Pooling2d {};
+class Pooling2dBF16 : public Pooling2d {};
+class Pooling2dF8 : public Pooling2d {};
 
 void Run2dDriver(miopenDataType_t prec);
 
@@ -144,62 +144,61 @@ std::vector<std::string> Get2dTestCases(const std::string precision)
 
     const std::vector<std::string> test_cases = {
         // clang-format off
-    {"test_pooling2d " + precision + " --all --dataset 0 --limit 0 " + flag_arg},   // TEMPCODE RJS DATASET
-    // {"test_pooling2d " + precision + " --all --dataset 1 --limit 0 " + flag_arg},   // TEMPCODE RJS DATASET
-    // {"test_pooling2d " + precision + " --all --dataset 2 --limit 0 " + flag_arg}    // TEMPCODE RJS DATASET
+    {"test_pooling2d " + precision + " --all --dataset 0 --limit 0 " + flag_arg},
+    {"test_pooling2d " + precision + " --all --dataset 1 --limit 0 " + flag_arg},
+    {"test_pooling2d " + precision + " --all --dataset 2 --limit 0 " + flag_arg}
         // clang-format on
     };
 
     return test_cases;
 }
 } // namespace pooling_tests
-// using namespace pooling_tests;
 
-TEST_P(PoolingFwd2dInt8, NNT)
+TEST_P(Pooling2dInt8, NNT)
 {
     if(!IsTestRunWith("--int8"))           std::cout << "WOULD SKIP BECAUSE NOT INT8!" << std::endl;
 
-    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--int8") TEMPCODE RJS
+    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--int8") TRJS
         GTEST_SKIP();
 
     Run2dDriver(miopenInt8);
 };
 
-TEST_P(PoolingFwd2dFloat, NNT)
+TEST_P(Pooling2dFloat, NNT)
 {
     if(!IsTestRunWith("--float"))           std::cout << "WOULD SKIP BECAUSE NOT FLOAT!" << std::endl;
 
-    if(SkipTest() || !IsTestSupportedForDevice(get_handle()))
+    if(SkipTest() || !IsTestSupportedForDevice(get_handle())) //  && IsTestRunWith("--float") TRJS
         GTEST_SKIP();
 
     Run2dDriver(miopenFloat);
 };
 
-TEST_P(PoolingFwd2dHalf, NNT)
+TEST_P(Pooling2dHalf, NNT)
 {
     if(!IsTestRunWith("--half"))           std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl;
 
-    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--half") TEMPCODE RJS
+    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--half") TRJS
         GTEST_SKIP();
 
     Run2dDriver(miopenHalf);
 };
 
-TEST_P(PoolingFwd2dBF16, NNT)
+TEST_P(Pooling2dBF16, NNT)
 {
-    if(!IsTestRunWith("--bfloat16"))           std::cout << "WOULD SKIP BECAUSE NOT BFLOAT16!" << std::endl;
+    if(!IsTestRunWith("--bfloat16"))            std::cout << "WOULD SKIP BECAUSE NOT BFLOAT16!" << std::endl;
 
-    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--bfloat16") TEMPCODE RJS
+    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--bfloat16") TRJS
         GTEST_SKIP();
 
     Run2dDriver(miopenBFloat16);
 };
 
-TEST_P(PoolingFwd2dF8, NNT)
+TEST_P(Pooling2dF8, NNT)
 {
-    if(!IsTestRunWith("--float8"))           std::cout << "WOULD SKIP BECAUSE NOT FLOAT8!" << std::endl;
+    if(!IsTestRunWith("--float8"))              std::cout << "WOULD SKIP BECAUSE NOT FLOAT8!" << std::endl;
 
-    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--float8") TEMPCODE RJS
+    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--float8") TRJS
         GTEST_SKIP();
 
     Run2dDriver(miopenFloat8);
@@ -208,40 +207,31 @@ TEST_P(PoolingFwd2dF8, NNT)
 void Run2dDriver(miopenDataType_t prec)
 {
     auto cases = Get2dTestCases("--float");
-    // std::cerr << " Cases: " << cases.size() << std::endl;    // TEMPCODE RJS
-    // for(const auto& test_value : cases)
-    // {
-    //     std::cerr << "      : " << test_value << std::endl;    // TEMPCODE RJS
-    // }
  
     std::vector<std::string> params;
     switch(prec)
     {
-    case miopenFloat: params = PoolingFwd2dFloat_NNT_Test::GetParam(); break;
-    case miopenHalf: params = PoolingFwd2dHalf_NNT_Test::GetParam(); break;
-    case miopenBFloat16: params = PoolingFwd2dBF16_NNT_Test::GetParam(); break;
-    case miopenInt8: params = PoolingFwd2dInt8_NNT_Test::GetParam(); break;
-    case miopenFloat8: params = PoolingFwd2dF8_NNT_Test::GetParam(); break;
+    case miopenFloat: params = Pooling2dFloat_NNT_Test::GetParam(); break;
+    case miopenHalf: params = Pooling2dHalf_NNT_Test::GetParam(); break;
+    case miopenBFloat16: params = Pooling2dBF16_NNT_Test::GetParam(); break;
+    case miopenInt8: params = Pooling2dInt8_NNT_Test::GetParam(); break;
+    case miopenFloat8: params = Pooling2dF8_NNT_Test::GetParam(); break;
     case miopenInt32:
     case miopenDouble:
     case miopenBFloat8:
     case miopenInt64:
         FAIL()
-            << "miopenBFloat16, miopenInt8, miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8, miopenInt64 "
-               "data type not supported by "
+            << "miopenInt32, miopenDouble, miopenBFloat8, miopenInt64 "
+               "data types not supported by "
                "poolingFwd2dNaive test";
 
-    default: params = PoolingFwd2dFloat_NNT_Test::GetParam();
+    default: params = Pooling2dFloat_NNT_Test::GetParam();
     }
 
     std::cerr << "Params: " << params.size() << std::endl;
+
     for(const auto& test_value : params)
     {
-        std::cerr << "      : " << test_value << std::endl;    // TEMPCODE RJS
-    }
-    for(const auto& test_value : params)
-    {
-        std::cerr << "Testing: " << test_value << std::endl;    // TEMPCODE RJS
         std::vector<std::string> tokens;
         GetArgs(test_value, tokens);
         std::vector<const char*> ptrs;
@@ -257,16 +247,10 @@ void Run2dDriver(miopenDataType_t prec)
     }
 }
 
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dBF16);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dInt8);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dFloat);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dHalf);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dF8);
-
-INSTANTIATE_TEST_SUITE_P(BF16, PoolingFwd2dBF16, testing::Values(Get2dTestCases("--bfloat16")));
-INSTANTIATE_TEST_SUITE_P(Int8, PoolingFwd2dInt8, testing::Values(Get2dTestCases("--int8")));
-INSTANTIATE_TEST_SUITE_P(Float, PoolingFwd2dFloat, testing::Values(Get2dTestCases("--float")));
-INSTANTIATE_TEST_SUITE_P(Half, PoolingFwd2dHalf, testing::Values(Get2dTestCases("--half")));
-INSTANTIATE_TEST_SUITE_P(F8, PoolingFwd2dF8, testing::Values(Get2dTestCases("--float8")));
+INSTANTIATE_TEST_SUITE_P(Full, Pooling2dBF16, testing::Values(Get2dTestCases("--bfloat16")));
+INSTANTIATE_TEST_SUITE_P(Full, Pooling2dInt8, testing::Values(Get2dTestCases("--int8")));
+INSTANTIATE_TEST_SUITE_P(Full, Pooling2dFloat, testing::Values(Get2dTestCases("--float")));
+INSTANTIATE_TEST_SUITE_P(Full, Pooling2dHalf, testing::Values(Get2dTestCases("--half")));
+INSTANTIATE_TEST_SUITE_P(Full, Pooling2dF8, testing::Values(Get2dTestCases("--float8")));
 
 #endif
diff --git a/test/gtest/poolingFwd3dNaive.cpp b/test/gtest/poolingFwd3dNaive.cpp
index a32432f1e6..d3f8975b15 100644
--- a/test/gtest/poolingFwd3dNaive.cpp
+++ b/test/gtest/poolingFwd3dNaive.cpp
@@ -145,26 +145,22 @@ std::vector<std::string> Get3dTestCases(const std::string precision)
 
     const std::vector<std::string> test_cases = {
         // clang-format off
-    {"test_pooling3d " + precision + " --all --dataset 0 --limit 0 " + flag_arg},    // TEMPCODE RJS DATASET
-    // {"test_pooling3d " + precision + " --all --dataset 1 --limit 0 " + flag_arg},  // TEMPCODE RJS DATASET
-    // {"test_pooling3d " + precision + " --all --dataset 2 --limit 0 " + flag_arg}    // TEMPCODE RJS DATASET
+    {"test_pooling3d " + precision + " --all --dataset 0 --limit 0 " + flag_arg},
+    {"test_pooling3d " + precision + " --all --dataset 1 --limit 0 " + flag_arg},
+    {"test_pooling3d " + precision + " --all --dataset 2 --limit 0 " + flag_arg}
         // clang-format on
     };
 
     return test_cases;
 }
 } // namespace pooling_tests
-// using namespace pooling_tests;
 
-TEST_P(PoolingFwd3dFloat, NNT)    // NDNaiveTranspose
+TEST_P(PoolingFwd3dFloat, NNT)    // NNT=NdNaiveTranspose
 {
     const auto& handle = get_handle();
-    if(!IsTestSupportedForDevice(handle))   std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl;
-    if(SkipTest())                          std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl;
     if(!IsTestRunWith("--float"))           std::cout << "WOULD SKIP BECAUSE NOT FLOAT!" << std::endl;
-        // Run3dDriver(miopenFloat);   return; // TEMPCODE RJS
-    //  && IsTestRunWith("--float")
-    if(IsTestSupportedForDevice(handle) && !SkipTest())
+
+    if(IsTestSupportedForDevice(handle) && !SkipTest()) //  && IsTestRunWith("--float") TRJS
     {
         Run3dDriver(miopenFloat);
     }
@@ -177,11 +173,9 @@ TEST_P(PoolingFwd3dFloat, NNT)    // NDNaiveTranspose
 TEST_P(PoolingFwd3dHalf, NNT)
 {
     const auto& handle = get_handle();
-    if(!IsTestSupportedForDevice(handle))   std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl;
-    if(SkipTest())                          std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl;
-    if(!IsTestRunWith("--half"))           std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl;
+    if(!IsTestRunWith("--half"))            std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl;
 
-    if(IsTestSupportedForDevice(handle) && !SkipTest()) //  && IsTestRunWith("--half") TEMPCODE RJS
+    if(IsTestSupportedForDevice(handle) && !SkipTest()) //  && IsTestRunWith("--half") TRJS
     {
         Run3dDriver(miopenHalf);
     }
@@ -195,7 +189,7 @@ TEST_P(PoolingFwd3dBF16, NNT)
 {
     if(!IsTestRunWith("--bfloat16"))           std::cout << "WOULD SKIP BECAUSE NOT BFLOAT16!" << std::endl;
 
-    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--bfloat16") TEMPCODE RJS
+    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--bfloat16") TRJS
         GTEST_SKIP();
 
     Run3dDriver(miopenBFloat16);
@@ -205,7 +199,7 @@ TEST_P(PoolingFwd3dInt8, NNT)
 {
     if(!IsTestRunWith("--int8"))           std::cout << "WOULD SKIP BECAUSE NOT INT8!" << std::endl;
 
-    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--int8") TEMPCODE RJS
+    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--int8") TRJS
         GTEST_SKIP();
 
     Run3dDriver(miopenInt8);
@@ -215,7 +209,7 @@ TEST_P(PoolingFwd3dF8, NNT)
 {
     if(!IsTestRunWith("--float8"))           std::cout << "WOULD SKIP BECAUSE NOT FLOAT8!" << std::endl;
 
-    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--float8") TEMPCODE RJS
+    if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) //  && IsTestRunWith("--float8") TRJS
         GTEST_SKIP();
 
     Run3dDriver(miopenFloat8);
@@ -224,11 +218,6 @@ TEST_P(PoolingFwd3dF8, NNT)
 void Run3dDriver(miopenDataType_t prec)
 {
     auto cases = Get3dTestCases("--float");
-       std::cerr << " Cases: " << cases.size() << std::endl;
-    for(const auto& test_value : cases)
-    {
-        std::cerr << "      : " << test_value << std::endl;    // TEMPCODE RJS
-    }
  
     std::vector<std::string> params;
     switch(prec)
@@ -243,8 +232,8 @@ void Run3dDriver(miopenDataType_t prec)
     case miopenBFloat8:
     case miopenInt64:
         FAIL()
-            << "miopenBFloat16, miopenInt8, miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8, miopenInt64 "
-               "data type not supported by "
+            << "miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8, miopenInt64 "
+               "data types not supported by "
                "poolingFwdNdNaive test";
 
     default: params = PoolingFwd3dFloat_NNT_Test::GetParam();
@@ -252,7 +241,6 @@ void Run3dDriver(miopenDataType_t prec)
 
     for(const auto& test_value : params)
     {
-        std::cerr << "Running Test: " << test_value << std::endl;    // TEMPCODE RJS
         std::vector<std::string> tokens;
         GetArgs(test_value, tokens);
         std::vector<const char*> ptrs;
@@ -268,13 +256,7 @@ void Run3dDriver(miopenDataType_t prec)
     }
 }
 
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dBF16);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dFloat);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dHalf);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dInt8);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dF8);
-
-// INSTANTIATE_TEST_SUITE_P(BF16, PoolingFwd3dBF16, testing::Values(Get3dTestCases("--bfloat16")));
+INSTANTIATE_TEST_SUITE_P(BF16, PoolingFwd3dBF16, testing::Values(Get3dTestCases("--bfloat16")));
 INSTANTIATE_TEST_SUITE_P(Float, PoolingFwd3dFloat, testing::Values(Get3dTestCases("--float")));
 INSTANTIATE_TEST_SUITE_P(Half, PoolingFwd3dHalf, testing::Values(Get3dTestCases("--half")));
 INSTANTIATE_TEST_SUITE_P(Int8, PoolingFwd3dInt8, testing::Values(Get3dTestCases("--int8")));
diff --git a/test/gtest/pooling_testing.hpp b/test/gtest/pooling_testing.hpp
index 2fdb4ded19..eacd9ebeb6 100644
--- a/test/gtest/pooling_testing.hpp
+++ b/test/gtest/pooling_testing.hpp
@@ -36,10 +36,10 @@
 namespace {using sc = std::chrono::steady_clock;}
 #undef tomillis
 #define tomillis(__DUR) (0.001 * std::chrono::duration_cast<std::chrono::microseconds>(__DUR).count())
+#undef mstocout
+#define mstocout(__TP) std::setw(15) << std::fixed << std::setprecision(3) << tomillis(sc::now() - __TP)
 #undef coutms
-#define coutms(__TOK, __TP) (std::cout << "ms[" << std::setw(16) << __TOK << "]: "          \
-    << std::setw(12) << std::fixed << std::setprecision(3) << tomillis(sc::now() - __TP) << std::endl)
-
+#define coutms(__TOK, __TP) (std::cout << "ms[" << std::setw(16) << __TOK << "]: " << mstocout(__TP) << std::endl)
 
 #include <gtest/gtest.h>
 #include <array>
@@ -56,13 +56,12 @@ namespace {using sc = std::chrono::steady_clock;}
 #include <miopen/tensor.hpp>
 #include <utility>
 
-// #include "network_data.hpp"
-#include "driver.hpp"
-#include "get_handle.hpp"
-#include "tensor_holder.hpp"
-#include "verify.hpp"
-#include "cpu_conv.hpp"
-#include "workspace.hpp"
+#include "../driver.hpp"
+#include "../get_handle.hpp"
+#include "../tensor_holder.hpp"
+#include "../verify.hpp"
+#include "../cpu_conv.hpp"
+#include "../workspace.hpp"
 
 #define TEST_PADDING_MODE 0
 
@@ -75,13 +74,17 @@ int num_uint64_case = 0;
 int num_uint64_case_imgidx = 0;
 constexpr int max_typed_cases = 5;
 constexpr int MAX_ALL_CASES = 0;
+auto __start = sc::now();
 
 constexpr int RAND_INTEGER_MAX = 12000;
 constexpr int RAND_INTEGER_MIN = -8800;
 
 template <typename T>
 auto gen_value =
-    [](auto... is) { return static_cast<T>(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX) / 100); };
+    [](auto... is) { return static_cast<T>(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX)) / 100; };
+
+auto gen_start =
+    [](auto... is) { return prng::gen_0_to_B(1ULL << 28); };
 }
 
 static inline void print(std::ostringstream& oss, const miopen::PoolingDescriptor& filter, bool is_default_layout)
@@ -94,7 +97,7 @@ static inline void print(std::ostringstream& oss, const miopen::PoolingDescripto
     else
         oss << "Max";
     oss << std::endl;
-    oss << "Layout: " << (is_default_layout ? "default" : "transposed") << std::endl;  // TEMPCODE RJS
+    oss << "Layout: " << (is_default_layout ? "default" : "transposed") << std::endl;
     oss << "Lengths: ";
     miopen::LogRange(oss, filter.GetLengths(), ", ") << std::endl;
     oss << "Pads: ";
@@ -160,84 +163,6 @@ struct pooling_operators
     }
 };
 
-#include <algorithm>
-#include <iomanip>
-#define MAX_PRINTING 128    // TEMPCODE RJS
-#define MAX_PRINT 12    // TEMPCODE RJS
-#define MAX_NCD 2
-#define GPU_JUNK 0 // typ. 160, reasonable max is 320
-#define PRINT_CPU_IN 0
-#define PRINT_GPU_OUT 0
-#define GPU_4COL false
-#define PIPE std::cout
-
-namespace {
-    template<typename T>
-    std::vector<T> ClampNCS(const std::vector<T>& lens, std::vector<T>& strides)
-    {
-        std::vector<T> out;
-        bool is2d = lens.size() == 4;
-        constexpr T max_ncd = MAX_NCD;
-        constexpr T max_print = MAX_PRINT;
-        int idx = 0;
-
-        out.push_back(std::min(lens[idx++], max_ncd));
-        out.push_back(std::min(lens[idx++], max_ncd));
-        if(is2d)
-            out.push_back(1);
-        else
-            out.push_back(std::min(lens[idx++], max_ncd));
-
-        out.push_back(std::min(lens[idx++], max_print));
-        out.push_back(std::min(lens[idx++], max_print));
-
-        if(is2d)
-           strides.insert(strides.begin() + 2, {strides[2]});
-
-        return out;
-    }
-
-    template<typename T, typename S>
-    std::ostream& printClamped(std::ostream& oss, const std::vector<T>& out, const std::vector<S>& outlen, const std::vector<S>& outstr, int mode = 0)
-    {
-        auto ostr = outstr;
-        auto olen = ClampNCS(outlen, ostr);
-
-        for(int nn = 0; nn < olen[0]; ++nn) {
-            for(int cc = 0; cc < olen[1]; ++cc) {
-                for(int dd = 0; dd < olen[2]; ++dd) {
-                    oss << "n= " << nn << " c= " << cc << " d= " << dd << std::endl;
-                    for(int hh = 0; hh < olen[3]; ++hh) {
-                        for(int ww = 0; ww < olen[4]; ++ww) {
-                            auto index = nn * ostr[0] + cc * ostr[1] + dd * ostr[2] + hh * ostr[3] + ww * ostr[4];
-switch(mode) {
-    case 0:
-    if(std::is_same<T, char>::value || std::is_same<T, int8_t>::value) {
-                            oss << std::setw(11) << std::setprecision(5) << (int16_t)out[index] << "  ";
-    } else {
-                            oss << std::setw(11) << std::setprecision(5) << out[index] << "  ";
-    }
-                            break;
-    case 1:
-                            oss << std::setw(11) << std::setprecision(5) << index << "  ";
-                            break;
-}    
-                        }
-                        oss << std::endl;
-                    }
-                }
-            }
-        }
-        return oss;
-    }
-}
-
-    template<typename T, typename S>
-    std::ostream& printClamped(std::ostream& oss, const tensor<T>& out, const std::vector<S>& outlen, const std::vector<S>& outstr, int mode = 0)
-    {
-        return printClamped<T, S>(oss, out.data, outlen, outstr, mode);
-    }
-
 template <int SptDim>
 struct verify_forward_pooling
 {
@@ -245,7 +170,6 @@ struct verify_forward_pooling
     tensor<T>
     cpu(const tensor<T>& input, const miopen::PoolingDescriptor& filter, std::vector<Index>&) const
     {
-auto st = sc::now();
         auto out = get_output_tensor(filter, input);
 
         std::array<int, SptDim> in_dim{};
@@ -258,10 +182,6 @@ auto st = sc::now();
         std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin());
         auto pooler = pooling_operators<T>{filter};
 
-        // TEMPCODE RJS print input tensor
-        bool printing = in_dim[0] <= MAX_PRINTING && in_dim[1] <= MAX_PRINTING;
-        if(in_dim.size() > 2) printing &= in_dim[2] <= MAX_PRINTING;
-
         int b_n = out.desc.GetLengths()[0];
         int k_n = out.desc.GetLengths()[1];
         std::array<int, SptDim> out_spatial_len{};
@@ -312,111 +232,6 @@ auto st = sc::now();
             out(o, w, out_spatial_id_pack...) = T(pooler.finalize(acc, pool_size));
         });
 
-        if(PRINT_GPU_OUT && printing)
-        {
-            PIPE << "CPU out: ";
-            auto outlen = out.desc.GetLengths();
-            for(auto dim : outlen)              PIPE << std::setw(4) << dim << " "; PIPE << " | ";
-            auto outstr = out.desc.GetStrides();
-            for(auto dim : outstr)              PIPE << std::setw(4) << dim << " "; PIPE << " | ";
-            for(auto str : filter.GetLengths()) PIPE << std::setw(4) << str << " "; PIPE << " | ";
-            for(auto str : filter.GetStrides()) PIPE << std::setw(4) << str << " "; PIPE << " | ";
-            for(auto str : filter.GetPads())    PIPE << std::setw(4) << str << " ";
-            PIPE << std::endl;
-
-            printClamped(PIPE, out, outlen, outstr);
-            // printClamped(PIPE, out, outlen, outstr, 1);
-        }   // print output tensor
-        if(PRINT_CPU_IN && printing)
-        {
-            auto inlen = input.desc.GetLengths();
-            auto instr = input.desc.GetStrides();
-            PIPE << "CPU in : m" << filter.GetMode() << " t" << input.desc.GetType() << " | ";
-            for(auto dim : inlen)               PIPE << std::setw(4) << dim << " "; PIPE << " | ";
-            for(auto str : instr)               PIPE << std::setw(4) << str << " "; PIPE << " | ";
-            for(auto str : filter.GetLengths()) PIPE << std::setw(4) << str << " "; PIPE << " | ";
-            for(auto str : filter.GetStrides()) PIPE << std::setw(4) << str << " "; PIPE << " | ";
-            for(auto str : filter.GetPads())    PIPE << std::setw(4) << str << " ";
-            PIPE << std::endl;
-
-            printClamped(PIPE, input, inlen, instr);
-            // printClamped(PIPE, input, inlen, instr, 1);
-        }
-coutms("f.cpu", st);
-        return out;
-    }
-
-    template <class T, class Index>
-    tensor<T>
-    cpu_naive(const tensor<T>& input, const miopen::PoolingDescriptor& filter, std::vector<Index>&) const
-    {
-        auto out = get_output_tensor(filter, input);
-
-        std::array<int, SptDim> in_dim{};
-        std::copy_n(input.desc.GetLengths().begin() + 2, SptDim, in_dim.begin());
-        std::array<int, SptDim> strides{};
-        std::copy_n(filter.GetStrides().begin(), SptDim, strides.begin());
-        std::array<int, SptDim> pads{};
-        std::copy_n(filter.GetPads().begin(), SptDim, pads.begin());
-        std::array<int, SptDim> kers{};
-        std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin());
-        auto pooler = pooling_operators<T>{filter};
-
-        // TEMPCODE RJS print input tensor
-        bool printing = in_dim[0] <= MAX_PRINTING && in_dim[1] <= MAX_PRINTING;
-        if(in_dim.size() > 2) printing &= in_dim[2] <= MAX_PRINTING;
-
-        auto flens = filter.GetLengths();
-        auto fstrs = filter.GetStrides();
-        auto ilens = input.desc.GetLengths();
-        auto istrs = input.desc.GetStrides();
-        auto olens  = out.desc.GetLengths();
-        auto ostrs  = out.desc.GetStrides();
-
-        for(int n = 0; n < olens[0]; ++n)
-        {
-            for(int c = 0; c < olens[1]; ++c)
-            {
-                for(int h = 0; h < olens[2]; ++h)
-                {
-                    for(int w = 0; w < olens[3]; ++w)
-                    {
-                        int hstart = h * fstrs[0];
-                        int hend = hstart + flens[0];
-                        hend = min(hend, ilens[2]);
-
-                        int wstart = w * fstrs[1];
-                        int wend = wstart + flens[1];
-                        wend = min(wend, ilens[3]);
-                        double res = -10000.0;
-                        for(int fh = hstart; fh < hend; ++fh)
-                        {
-                            for(int fw = wstart; fw < wend; ++fw)
-                            {
-                                double val = input[n * istrs[0] + c * istrs[1] + fh * istrs[2] + fw * istrs[3]];
-                                if(val > res)
-                                    res = val;
-                            }
-                        }
-                        out[n * ostrs[0] + c * ostrs[1] + h * ostrs[2] + w * ostrs[3]] = res;
-                    }
-                }
-            }
-        }
-
-        if(printing)
-        {
-            PIPE << "CPU nve: ";
-            for(auto dim : olens)              PIPE << std::setw(4) << dim << " "; PIPE << " | ";
-            for(auto dim : ostrs)              PIPE << std::setw(4) << dim << " "; PIPE << " | ";
-            for(auto str : filter.GetLengths()) PIPE << std::setw(4) << str << " "; PIPE << " | ";
-            for(auto str : filter.GetStrides()) PIPE << std::setw(4) << str << " "; PIPE<< " | ";
-            for(auto str : filter.GetPads())    PIPE << std::setw(4) << str << " ";
-            PIPE << std::endl;
-
-            printClamped(PIPE, out, olens, ostrs);
-        }
-
         return out;
     }
 
@@ -425,28 +240,16 @@ coutms("f.cpu", st);
                   const miopen::PoolingDescriptor& filter,
                   std::vector<Index>& indices) const
     {
-auto st = sc::now();
-auto write = sc::now();
         auto&& handle = get_handle();
         auto out      = get_output_tensor(filter, input);
-        auto junk     = get_big_output_tensor(filter, input);   // TEMPCODE RJS
-
-#if GPU_JUNK > 0
-            PIPE << "GPU junk init: " << std::endl;
-            for(int idx = 0; idx < GPU_JUNK; ++idx) {
-                PIPE << std::setw(11) << std::setprecision(5) << junk.data[idx] << "  ";
-                if((idx % 4) == 3)  PIPE <<std::endl;
-            }
-#endif
 
         indices.resize(out.data.size(), 0);
 
         auto in_dev  = handle.Write(input.data);
         auto out_dev = handle.Create<T>(out.GetSize());
-        auto junk_dev = handle.Write(junk.data);  // TEMPCODE RJS
         Workspace wspace{};
         wspace.Write(indices);
-coutms("f.GPUwrite", write);
+
         float alpha = 1, beta = 0;
         filter.Forward(handle,
                        &alpha,
@@ -457,76 +260,10 @@ coutms("f.GPUwrite", write);
                        out_dev.get(),
                        true,
                        wspace.ptr(),
-                       wspace.size(),
-                       junk_dev.get()); // TEMPCODE RJS
-handle.Finish(); // TEMPCODE RJS
-coutms("f.gpu1", st);
-        filter.Forward(handle,
-                       &alpha,
-                       input.desc,
-                       in_dev.get(),
-                       &beta,
-                       out.desc,
-                       out_dev.get(),
-                       true,
-                       wspace.ptr(),
-                       wspace.size(),
-                       junk_dev.get()); // TEMPCODE RJS
-handle.Finish(); // TEMPCODE RJS
-auto read = sc::now();
+                       wspace.size());
         handle.ReadTo(out.data.data(), out_dev, out.GetDataByteSize());
         wspace.ReadTo(indices);
-        bool printing = input.desc.GetLengths()[2] <= MAX_PRINTING && input.desc.GetLengths()[3] <= MAX_PRINTING;
-        if(input.desc.GetLengths().size() > 4) printing &= input.desc.GetLengths()[4] <= MAX_PRINTING;
-std::cout << (printing ? "printing output from GPU..." : "skipping GPU print.") << std::endl;
-        if(PRINT_GPU_OUT && printing)
-        {
-#if GPU_JUNK > 0
-            handle.ReadTo(junk.data.data(), junk_dev, junk.GetDataByteSize());
-            if(GPU_4COL){
-                PIPE<< "GPU (8-cols): " << std::endl;
-                for(int idx = 0; idx < GPU_JUNK; ++idx) {
-                    PIPE << std::setw(11) << std::setprecision(5) << out.data[idx] << "  ";
-                    if((idx % 8) == 7)  PIPE <<std::endl;
-                }
-            }
-            PIPE << "GPU junk: " << std::endl;
-            for(int idx = 0; idx < GPU_JUNK; ++idx) {
-                PIPE << std::setw(11) << std::setprecision(5) << junk.data[idx] << "  ";
-                if((idx % 4) == 3)  PIPE <<std::endl;
-            }
-#endif
-
-            auto outlen = out.desc.GetLengths();
-            auto outstr = out.desc.GetStrides();
-
-            if(false){
-                std::vector<size_t> olen = outlen;
-                if(olen.size() == 4)    olen.insert(olen.begin() + 2, 1);
-                // Mask data is always NCDHW layout
-                size_t mask_w_stride = 1;
-                size_t mask_h_stride = mask_w_stride * olen[4];
-                size_t mask_d_stride = mask_h_stride * olen[3];
-                size_t mask_c_stride   = mask_d_stride * olen[2];
-                size_t mask_n_stride   = mask_c_stride * olen[1];
-                std::vector<size_t> mask_str{mask_n_stride, mask_c_stride, mask_d_stride,mask_h_stride, mask_w_stride};
-                std::cout << "GPU indices: " << mask_n_stride << " " << mask_c_stride << " " << mask_d_stride << " " << mask_h_stride << " " << mask_w_stride << std::endl;
-                printClamped(std::cout, indices, olen, mask_str);
-            }
 
-            PIPE << "GPU out: ";
-            for(auto dim : outlen)              PIPE << std::setw(4) << dim << " "; PIPE << " | ";
-            for(auto dim : outstr)              PIPE << std::setw(4) << dim << " "; PIPE << " | ";
-            for(auto str : filter.GetLengths()) PIPE << std::setw(4) << str << " "; PIPE << " | ";
-            for(auto str : filter.GetStrides()) PIPE << std::setw(4) << str << " "; PIPE<< " | ";
-            for(auto str : filter.GetPads())    PIPE << std::setw(4) << str << " ";
-            PIPE << std::endl;
-
-            printClamped(PIPE, out, outlen, outstr);
-            // printClamped(std::cout, out, outlen, outstr, 1);
-        }   // print output tensor
-coutms("f.GPUread", read);
-coutms("f.gpu", st);
         return out;
     }
 
@@ -558,11 +295,11 @@ struct verify_backward_pooling
                   bool use_global_index,
                   bool verify_index) const
     {
-auto st = sc::now();
         const int sptl_dim_offset = 2;
         const int chan_dim_offset = 1;
 
         auto dinput = input;
+        return dinput; // TRJS
 
         std::vector<double> din_vec(input.desc.GetElementSpace(), 0.0);
         CHECK(dout.desc == out.desc);
@@ -702,7 +439,6 @@ auto st = sc::now();
             dinput(in_id_pack...) = din_vec.at(din_idx);
         });
 
-    coutms("b.cpu", st);
     return dinput;
     }
 
@@ -715,10 +451,9 @@ auto st = sc::now();
                   bool,
                   bool) const
     {
-auto st = sc::now();
-auto write = sc::now();
         auto&& handle = get_handle();
         auto dinput   = input;
+        return dinput; // TRJS
 
         auto in_dev   = handle.Write(input.data);
         auto dout_dev = handle.Write(dout.data);
@@ -727,7 +462,6 @@ auto write = sc::now();
 
         Workspace wspace{};
         wspace.Write(indices);
-coutms("b.GPUwrite", write);
 
         float alpha = 1, beta = 0;
         filter.Backward(handle,
@@ -746,29 +480,9 @@ coutms("b.GPUwrite", write);
                         dinput.desc,
                         din_dev.get(),
                         wspace.ptr());
-handle.Finish(); // TEMPCODE RJS
-coutms("b.gpu1", st);
-        filter.Backward(handle,
-                        &alpha,
-                        // y
-                        out.desc,
-                        out_dev.get(),
-                        // dy
-                        dout.desc,
-                        dout_dev.get(),
-                        // x
-                        input.desc,
-                        in_dev.get(),
-                        &beta,
-                        // dx
-                        dinput.desc,
-                        din_dev.get(),
-                        wspace.ptr());
-handle.Finish(); // TEMPCODE RJS
-auto read = sc::now();
-        dinput.data = handle.Read<T>(din_dev, dinput.data.size());
-coutms("b.GPUread", read);
-coutms("b.gpu", st);
+
+        handle.ReadTo(dinput.data.data(), din_dev, dinput.data.size());
+
         return dinput;
     }
 
@@ -807,6 +521,33 @@ struct pooling_driver : test_driver
     int verify_indices{};
     miopenPoolingWorkspaceIndexMode_t wsidx{};
     miopenTensorLayout_t layout{};
+
+    static void randomize_tensor(tensor<T>& in)
+    {
+        static tensor<T> random_data{{1}};
+        static tensor<int> starts{{1}};
+        static size_t start_idx = 0;
+
+        const auto size = in.GetSize();
+        const auto ran_size = size > 2 ? (3 * size) / 2 : 3;
+        if (random_data.GetSize() < ran_size)
+        {
+            random_data = tensor<T>{{ran_size}}.generate(tensor_elem_gen_integer{2503});
+        }
+        if (starts.GetSize() == 1)
+        {
+            starts = tensor<size_t>{{1 << 20}}.generate(gen_start);
+        }
+
+        const auto r_start = starts[start_idx++] % (random_data.GetSize() / 3);
+        if (start_idx >= starts.GetSize()) start_idx = 0;
+
+        std::cout << "randomizing " << std::setw(9) << size << " elems from " << std::setw(9) << r_start << " (" << start_idx << ")"
+        // << "(" << std::setw(8) << prng::gen_0_to_B(size / 2)  << std::setw(8) << prng::gen_0_to_B(size / 2)  << std::setw(8) << prng::gen_0_to_B(size / 2)  << std::setw(8) << prng::gen_0_to_B(size / 2) << ")" 
+        << std::endl;
+        in.data.assign(random_data.begin() + r_start, random_data.begin() + r_start + size);
+    }
+
     std::unordered_map<std::string, miopenIndexType_t> index_type_lookup = {
         {miopen::ToUpper("miopenIndexUint8"), miopenIndexUint8},
         {miopen::ToUpper("miopenIndexUint16"), miopenIndexUint16},
@@ -832,7 +573,7 @@ struct pooling_driver : test_driver
     {
         add(index_type,
             "index_type",
-            // generate_data({"miopenIndexUint32"}    // TEMPCODE RUN
+            // generate_data({"miopenIndexUint32"}    // TEMPCODE RJS RUN
             generate_multi_data<const char*>( //
                 {{"miopenIndexUint32",
                   "miopenIndexUint8"
@@ -846,63 +587,55 @@ struct pooling_driver : test_driver
         add(mode_str,
             "mode_str",
             generate_data(
-                {"miopenPoolingMax", "miopenPoolingAverage", "miopenPoolingAverageInclusive"})); // TEMPCODE RUN
+                {"miopenPoolingMax", "miopenPoolingAverage", "miopenPoolingAverageInclusive"}));
 #if TEST_PADDING_MODE == 1
         add(pmode, "pmode", generate_data({"default", "same", "valid"}));
 #endif
         add(verify_indices, "verify_indices", generate_data({1}));
     }
 
-#define FORWARD
-#define BACKWARD
-
     template <class Index, int SptlDim>
     void run_impl()
     {
         std::vector<Index> indices{};
+auto gst = sc::now();
         auto input = tensor<T>{layout, in_shape};
-        for(auto& v : input.data)   v = gen_value<T>();
-
-auto st = sc::now();
-#ifdef FORWARD
+        randomize_tensor(input);
+coutms("gen", gst);
+auto vst = sc::now();
         auto out  = verify(verify_forward_pooling<SptlDim>{},
             input,
             filter,
             indices);
-coutms("f.verify", st);
-#endif
-#ifdef BACKWARD    // TEMPCODE RJS no backward
+coutms("verify", vst);
         if(!std::is_same<T, float>::value && !std::is_same<T, half>::value) return;
-auto bst = sc::now();
-        auto dout = out.first;
-        dout.generate(tensor_elem_gen_integer{2503});
-        verify(verify_backward_pooling<SptlDim>{},
-               input,
-               dout,
-               out.first,
-               filter,
-               indices,
-               wsidx != 0,
-               static_cast<bool>(this->verify_indices));
-coutms("b.verify", bst);
-#endif
+
+        // auto dout = out.first;
+        // dout.generate(tensor_elem_gen_integer{2503});
+        // verify(verify_backward_pooling<SptlDim>{},   // TRJS
+        //        input,
+        //        dout,
+        //        out.first,
+        //        filter,
+        //        indices,
+        //        wsidx != 0,
+        //        static_cast<bool>(this->verify_indices));
     }
 
 #define CHECK_SKIP  \
 if(skip)        \
 {               \
-    std::cout << "\n############ RunSkip # " << std::setw(7) << num_all_case++ << " : ";    \
+    std::cout << "\nSkipping run # " << std::setw(7) << num_all_case++ << " @ET=" << mstocout(__start) << " : ";    \
     show_command(); \
     std::cout << "-- " << oss.str() << std::endl;   \
-coutms("skipRun", st); return;   \
+    return; \
 }
 
 #define SKIP_RUN  skip = true; CHECK_SKIP
 
     void run()
     {
-auto st = sc::now();
-        const bool is_default_layout = miopen::TensorDescriptor::IsDefaultLayout(layout); // TEMPCODE RJS
+        const bool is_default_layout = miopen::TensorDescriptor::IsDefaultLayout(layout);
 
         bool skip = false;
         std::ostringstream oss;
@@ -928,8 +661,7 @@ auto st = sc::now();
         int sptl_dim = static_cast<int>(in_shape.size()) - 2;
         if(sptl_dim != 2 && sptl_dim != 3)
         {
-            oss << "Warning: Config skipped due to invalid dimensions. 'in_shape' must be in NCHW or NCDHW form."
-                      << std::endl;
+            oss << "Warning: Config skipped due to invalid dimensions. 'in_shape' must be in NCHW or NCDHW format." << std::endl;
             SKIP_RUN;
         }
 
@@ -1117,7 +849,7 @@ auto st = sc::now();
         {
             if(lens[i] > (input_desc.GetLengths()[i + 2] + static_cast<uint64_t>(2) * pads[i]))
             {
-                oss << "Warning: Config skipped becuse it is invalid "
+                oss << "Warning: Config skipped because it is invalid "
                              "(lens[i] > (input_desc.GetLengths()[i + 2] + 2 * pads[i]))"
                           << std::endl;
                 SKIP_RUN;
@@ -1143,7 +875,7 @@ auto st = sc::now();
 
         CHECK_SKIP;
 
-        std::cout << "\n############     Run # " << std::setw(7) << num_all_case++ << " : ";
+        std::cout << "\nRun # " << std::setw(7) << num_all_case++ << " @ET=" << mstocout(__start) << " : ";
         show_command();
 
         std::vector<int> in_dim(input_desc.GetLengths().begin(),
@@ -1151,45 +883,6 @@ auto st = sc::now();
         std::vector<int> out_dim(sptl_dim);
         std::vector<int> ker_dim(filter.GetLengths().begin(), filter.GetLengths().end());
 
-#if TEST_PADDING_MODE == 1
-        if(filter.pmode == miopenPaddingSame)
-        {
-            if(std::any_of(filter.GetStrides().begin(), filter.GetStrides().end(), [](int i) {
-                   return i == 0;
-               }))
-                return;
-            for(int i = 0; i < sptl_dim; i++)
-            {
-                filter.pads[i] =
-                    ((in_dim[i] % filter.GetStrides()[i] == 0)
-                         ? (std::max((ker_dim[i] - filter.GetStrides()[i]), 0))
-                         : (std::max((ker_dim[i] - (in_dim[i] % filter.GetStrides()[i])), 0))) /
-                    2;
-
-                out_dim[i] = std::ceil(static_cast<double>(in_dim[i]) / filter.strides[i]);
-            }
-
-            if(std::any_of(out_dim.begin(), out_dim.end(), [](int i) { return i <= 0; }))
-                return;
-        }
-        else if(filter.pmode == miopenPaddingValid)
-        {
-            if(std::any_of(filter.GetStrides().begin(), filter.GetStrides().end(), [](int i) {
-                   return i == 0;
-               }))
-                return;
-            for(int i = 0; i < sptl_dim; i++)
-            {
-                filter.pads[i] = 0;
-
-                out_dim[i] = std::ceil(static_cast<double>(in_dim[i] - filter.lens[i] + 1) /
-                                       filter.strides[i]);
-            }
-
-            if(std::any_of(out_dim.begin(), out_dim.end(), [](int i) { return i <= 0; }))
-                return;
-        }
-#endif
         switch(filter.GetIndexType())
         {
         case miopenIndexUint8: {
@@ -1237,7 +930,6 @@ auto st = sc::now();
             break;
         }
         }
-coutms("Run", st);
     }
 };
 
diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp
index b3acb61e07..546b40876f 100644
--- a/test/pooling2d.hpp
+++ b/test/pooling2d.hpp
@@ -24,7 +24,7 @@
  *
  *******************************************************************************/
 
-#include "gtest/pooling_testing.hpp"
+// #include "gtest/pooling_testing.hpp"
 #include "pooling_common.hpp"
 
 #define WORKAROUND_ISSUE_1670 1
diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp
index 956c529533..f42b6da6aa 100644
--- a/test/pooling_common.hpp
+++ b/test/pooling_common.hpp
@@ -80,7 +80,7 @@ static inline void print(const miopen::PoolingDescriptor& filter, bool is_defaul
     else
         std::cout << "Max";
     std::cout << std::endl;
-    std::cout << "Layout: " << (is_default_layout ? "default" : "transposed") << std::endl;  // TEMPCODE RJS
+    std::cout << "Layout: " << (is_default_layout ? "default" : "transposed") << std::endl;
     std::cout << "Lengths: ";
     miopen::LogRange(std::cout, filter.GetLengths(), ", ") << std::endl;
     std::cout << "Pads: ";
@@ -141,11 +141,6 @@ struct pooling_operators
     }
 };
 
-#include <iomanip>
-#define MAX_PRINT 16    // TEMPCODE RJS
-#define GPU_JUNK 0
-#define GPU_4COL false
-
 template <int SptDim>
 struct verify_forward_pooling
 {
@@ -154,8 +149,8 @@ struct verify_forward_pooling
     cpu(const tensor<T>& input, const miopen::PoolingDescriptor& filter, std::vector<Index>&) const
     {
         // const bool is_default_layout = input.desc.IsDefaultLayout();
-        const int sptl_dim_offset = 2; // is_default_layout ? 2 : 1; TEMPCODE RJS
-        const int chan_dim_offset = 1; // is_default_layout ? 1 : SptDim + 1;
+        const int sptl_dim_offset = 2;
+        const int chan_dim_offset = 1;
 
         auto out = get_output_tensor(filter, input);
 
@@ -169,9 +164,6 @@ struct verify_forward_pooling
         std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin());
         auto pooler = pooling_operators<T>{filter};
 
-        // TEMPCODE RJS print input tensor
-        bool printing = in_dim[0]<=MAX_PRINT && in_dim[1]<=MAX_PRINT;
-        if(in_dim.size() > 2) printing &= in_dim[2]<=MAX_PRINT;
         int b_n = out.desc.GetLengths()[0];
         int k_n = out.desc.GetLengths()[chan_dim_offset];
         std::array<int, SptDim> out_spatial_len{};
@@ -221,59 +213,6 @@ struct verify_forward_pooling
             });
             out(o, w, out_spatial_id_pack...) = T(pooler.finalize(acc, pool_size));
         });
-        if(printing)
-        {
-            std::cout << "CPU out: ";
-            auto outlen = out.desc.GetLengths();
-            auto outstr = out.desc.GetStrides();
-            for(auto dim : outlen) std::cout << std::setw(4) << dim;
-            std::cout << " | ";
-            for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str;
-            std::cout << " | ";
-            for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str;
-            std::cout << " | ";
-            for(auto str : filter.GetPads()) std::cout << std::setw(4) << str;
-            std::cout << std::endl;
-
-            for(int nn = 0; nn < outlen[0]; ++nn) {
-                for(int cc = 0; cc < outlen[1]; ++cc) {
-            std::cout << "n=" << nn << " c=" << cc <<std::endl;
-            for(int hh = 0; hh < outlen[2]; ++hh) {
-                for(int ww = 0; ww < outlen[3]; ++ww) {
-                    std::cout << std::setw(11) << std::setprecision(5) << out.data[nn * outstr[0] + cc * outstr[1] + hh * outstr[2] + ww * outstr[3]] << "  ";
-                }
-            std::cout << std::endl;
-            }
-            }
-            }
-        }   // print output tensor
-        if(false && printing)
-        {
-            auto inlen = input.desc.GetLengths();
-            auto instr = input.desc.GetStrides();
-            std::cout << "CPU in : m" << filter.GetMode() << " t" << input.desc.GetType() << " | ";
-            for(auto dim : inlen) std::cout << std::setw(4) << dim;
-            std::cout << " | ";
-            for(auto str : instr) std::cout << std::setw(4) << str;
-            std::cout << " | ";
-            for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str;
-            std::cout << " | ";
-            for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str;
-            std::cout << " | ";
-            for(auto str : filter.GetPads()) std::cout << std::setw(4) << str;
-            std::cout << std::endl;
-
-            for(int nn = 0; nn < inlen[0]; ++nn) {
-                for(int cc = 0; cc < inlen[1]; ++cc) {
-                    for(int hh = 0; hh < inlen[2]; ++hh) {
-                        for(int ww = 0; ww < inlen[3]; ++ww) {
-                            std::cout << std::setw(11) << std::setprecision(5) << input.data[input.desc.GetIndex(nn, cc, hh, ww)] << "  ";
-                        }
-                        std::cout << std::endl;
-                    }
-                }
-            }
-        }
 
         return out;
     }
@@ -285,13 +224,11 @@ struct verify_forward_pooling
     {
         auto&& handle = get_handle();
         auto out      = get_output_tensor(filter, input);
-        auto junk      = get_big_output_tensor(filter, input);   // TEMPCODE RJS
 
         indices.resize(out.data.size(), 0);
 
         auto in_dev  = handle.Write(input.data);
         auto out_dev = handle.Create<T>(out.data.size());
-        auto junk_dev = handle.Create<T>(junk.data.size());  // 
         Workspace wspace{};
         wspace.Write(indices);
 
@@ -305,59 +242,11 @@ struct verify_forward_pooling
                        out_dev.get(),
                        true,
                        wspace.ptr(),
-                       wspace.size(),
-                       junk_dev.get()); // TEMPCODE RJS
+                       wspace.size());
 
         indices  = wspace.Read<std::vector<Index>>();
         handle.ReadTo(out.data.data(), out_dev, out.data.size() * sizeof(T));
-        bool printing = input.desc.GetLengths()[2] <= MAX_PRINT && input.desc.GetLengths()[3] <= MAX_PRINT;
-        if(input.desc.GetLengths().size() > 4) printing &= input.desc.GetLengths()[4] <= MAX_PRINT;
-        if(printing)
-        {
-            if(GPU_4COL){
-                std::cout << "GPU (8-cols): " << std::endl;
-                for(int idx = 0; idx < GPU_JUNK; ++idx) {
-                    std::cout << std::setw(11) << std::setprecision(5) << out.data[idx] << "  ";
-                    if((idx % 8) == 7)  std::cout <<std::endl;
-                }
-            }
-            if(GPU_JUNK > 0){
-        handle.ReadTo(junk.data.data(), junk_dev, junk.data.size() * sizeof(T));
-                std::cout << "GPU junk: " << std::endl;
-                for(int idx = 0; idx < GPU_JUNK; ++idx) {
-                    std::cout << std::setw(11) << std::setprecision(5) << junk.data[idx] << "  ";
-                    if((idx % 4) == 3)  std::cout <<std::endl;
-                }
-            }
 
-            std::cout << "GPU out: ";
-            auto outlen = out.desc.GetLengths();
-            for(auto dim : outlen) std::cout << std::setw(4) << dim;
-            std::cout << " | ";
-            auto outstr = out.desc.GetStrides();
-            for(auto dim : outstr) std::cout << std::setw(4) << dim;
-            std::cout << " | ";
-            for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str;
-            std::cout << " | ";
-            for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str;
-            std::cout << " | ";
-            for(auto str : filter.GetPads()) std::cout << std::setw(4) << str;
-            std::cout << std::endl;
-            for(int nn = 0; nn < outlen[0]; ++nn) {
-                for(int cc = 0; cc < outlen[1]; ++cc) {
-                    std::cout << "n=" << nn << " c=" << cc <<std::endl;
-                    for(int hh = 0; hh < outlen[2]; ++hh) {
-                        for(int ww = 0; ww < outlen[3]; ++ww) {
-                            std::cout << std::setw(11) << std::setprecision(5) << out.data[
-                                nn * outstr[0] + cc * outstr[1] + hh * outstr[2] + ww * outstr[3]
-                                // out.desc.GetIndex(nn, cc, hh, ww)
-                                ] << "  ";
-                        }
-                        std::cout << std::endl;
-                    }
-                }
-            }
-        }   // print output tensor
         return out;
     }
 
@@ -632,19 +521,18 @@ struct pooling_driver : test_driver
     {
         add(index_type,
             "index_type",
-            generate_data({"miopenIndexUint32"}    // TEMPCODE RJS
-            // generate_multi_data<const char*>( //
-            //     {{"miopenIndexUint8",
-            //       "miopenIndexUint16",
-            //       "miopenIndexUint32",
-            //       "miopenIndexUint64"},                     //
-            //      {"miopenIndexUint8", "miopenIndexUint32"}, //
-            //      {"miopenIndexUint32"}}                     //
+            generate_multi_data<const char*>( //
+                {{"miopenIndexUint8",
+                  "miopenIndexUint16",
+                  "miopenIndexUint32",
+                  "miopenIndexUint64"},                     //
+                 {"miopenIndexUint8", "miopenIndexUint32"}, //
+                 {"miopenIndexUint32"}}                     //
                 ));
         add(mode,
             "mode",
             generate_data(
-                {"miopenPoolingMax", "miopenPoolingAverage"})); // , "miopenPoolingAverageInclusive"
+                {"miopenPoolingMax", "miopenPoolingAverage", "miopenPoolingAverageInclusive"}));
 #if TEST_PADDING_MODE == 1
         add(pmode, "pmode", generate_data({"default", "same", "valid"}));
 #endif
@@ -656,60 +544,16 @@ struct pooling_driver : test_driver
     {
         std::vector<Index> indices{};
         auto input = tensor<T>{layout, in_shape};
-        for(auto& v : input.data)   v = gen_value<T>();
-
-        // TEMPCODE RJS print input tensor
-        bool printing = in_shape[0]<=MAX_PRINT && in_shape[1]<=MAX_PRINT;
-        if (in_shape.size() > 2) printing &= in_shape[2]<=MAX_PRINT;
-        if(printing)
-        {
-            auto inlen = input.desc.GetLengths();
-            auto instr = input.desc.GetStrides();
-            std::cout << "CPU GEN : " << input.desc.GetLayout_str() << "(" << inlen.size() << ") | " << input.data.size() << " | " << input.desc.GetElementSpace() << " | ";
-            for(auto dim : inlen) std::cout << std::setw(4) << dim;
-            std::cout << " | ";
-            for(auto str : instr) std::cout << std::setw(4) << str;
-            std::cout << " | ";
-            for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str;
-            std::cout << " | ";
-            for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str;
-            std::cout << " | ";
-            for(auto str : filter.GetPads()) std::cout << std::setw(4) << str;
-            std::cout << std::endl;
-
-            for(int nn = 0; nn < inlen[0]; ++nn) {
-                for(int cc = 0; cc < inlen[1]; ++cc) {
-                    for(int hh = 0; hh < inlen[2]; ++hh) {
-                        for(int ww = 0; ww < inlen[3]; ++ww) {// nn * instr[0] + cc * instr[1] + hh * instr[2] + ww * instr[3]
-                            std::cout << std::setw(11) << std::setprecision(5) << input.data[input.desc.GetIndex(nn, cc, hh, ww)] << "  ";
-                        }
-                    std::cout << std::endl;
-                    }
-                }
-            }
-        }
+        for(auto& v : input.data)   v = gen_value<T>(); // TODO RJS use generate
 
         auto out  = verify(verify_forward_pooling<SptlDim>{},
             input,
             filter,
             indices);
-
-        // auto dout = out.first;
-        // dout.generate(tensor_elem_gen_integer{2503});
-        // verify(verify_backward_pooling<SptlDim>{},   // TEMPCODE RJS no backward
-        //        input,
-        //        dout,
-        //        out.first,
-        //        filter,
-        //        indices,
-        //        wsidx != 0,
-        //        static_cast<bool>(this->verify_indices));
     }
 
     void run()
     {
-        if(miopen::ToUpper(mode) == "MAX") return;  // TEMPCODE RJS skip all except max, do max only
-
         auto idx_typ = index_type_lookup.at(miopen::ToUpper(index_type));
         auto idx_sz  = sizeof(uint8_t);
         int sptl_dim  = in_shape.size() - 2;
@@ -719,7 +563,6 @@ struct pooling_driver : test_driver
 
         // Input dimensions to the driver are always NCHW-style
         const bool is_default_layout = !(layout == miopenTensorNHWC || layout == miopenTensorNDHWC);
-        std::cout << "################## pooling_driver run layout=" << (int)layout << " is_default=" << (int)is_default_layout << std::endl;
 
         filter = miopen::PoolingDescriptor
         {
@@ -887,7 +730,7 @@ struct pooling_driver : test_driver
         if(sptl_dim != 2 && sptl_dim != 3)
         {
             show_command();
-            std::cout << "Warning: Config skipped becuse it is not supported " //
+            std::cout << "Warning: Config skipped because it is not supported " //
                          "(sptl_dim != 2 && sptl_dim != 3)"
                       << std::endl;
             return;
@@ -898,7 +741,7 @@ struct pooling_driver : test_driver
             if(lens[i] > (input_desc.GetLengths()[i + 2] + static_cast<uint64_t>(2) * pads[i]))
             {
                 show_command();
-                std::cout << "Warning: Config skipped becuse it is invalid "
+                std::cout << "Warning: Config skipped because it is invalid "
                              "(lens[i] > (input_desc.GetLengths()[i + 2] + 2 * pads[i]))"
                           << std::endl;
                 return;

From 98b6dafee3c8fe263bf599a156a0e0f1a29cfc8f Mon Sep 17 00:00:00 2001
From: "Randy J. Spaulding" <rspauldi@amd.com>
Date: Tue, 8 Oct 2024 07:44:42 -0700
Subject: [PATCH 09/10] cleanup

---
 src/CMakeLists.txt                            |  7 +----
 .../miopen/problem_description_layout.hpp     |  4 +--
 src/include/miopen/tensor.hpp                 |  2 +-
 src/kernels/pooling_functions.h               |  2 +-
 src/solver/pooling/forwardNaive.cpp           |  5 ++--
 test/gtest/ex1.cpp                            |  2 +-
 test/gtest/layout_transpose.cpp               |  5 +---
 test/gtest/pooling_testing.hpp                | 10 +++----
 test/pooling2d.hpp                            | 11 +++----
 test/pooling3d.hpp                            | 18 ++++++------
 test/pooling_common.hpp                       | 29 ++++++++++---------
 11 files changed, 42 insertions(+), 53 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 359604bfd3..5855cf20e7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -26,11 +26,6 @@
 
 cmake_policy(SET CMP0057 NEW)
 
-include_directories(SYSTEM
-    /opt/rocm/include/gtest
-    /opt/rocm/include
-)
-
 include(ExportHeader)
 if(MIOPEN_ENABLE_SQLITE)
     add_subdirectory(sqlite)
@@ -600,7 +595,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/xform_bidirect_winograd_out.s
         kernels/UniversalTranspose.cl)
 
-        # Kernels in development lists.
+    # Kernels in development lists.
     # Should be ALWAYS empty in develop branch (at the time of PR merge)
     # Intention: to speed up kernel development rebuild time
     set(MIOPEN_DEVELOPMENT_KERNELS)
diff --git a/src/include/miopen/problem_description_layout.hpp b/src/include/miopen/problem_description_layout.hpp
index aa20c4058c..780086afe6 100644
--- a/src/include/miopen/problem_description_layout.hpp
+++ b/src/include/miopen/problem_description_layout.hpp
@@ -39,8 +39,8 @@ struct ProblemDescriptionLayoutBase : ProblemDescriptionBase
 {
     ProblemDescriptionLayoutBase()                              = default;
     ProblemDescriptionLayoutBase(const ProblemDescriptionLayoutBase&) = default;
-ProblemDescriptionLayoutBase(const TensorDescriptor& in_, // x for Forward, y for Backward*
-                            const TensorDescriptor& out_ // y for Forward, x for Backward*
+    ProblemDescriptionLayoutBase(const TensorDescriptor& in_, // x for Forward, y for Backward*
+                                const TensorDescriptor& out_ // y for Forward, x for Backward*
                        )
     : ProblemDescriptionBase(),
       in(in_),
diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp
index ec368dbb83..3059622374 100644
--- a/src/include/miopen/tensor.hpp
+++ b/src/include/miopen/tensor.hpp
@@ -265,7 +265,7 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor
         {
             if(labels.size() != strides.size())
             {
-                std::ostringstream oss;
+                std::ostringstream oss; // TODO TRJS check this print
                 oss << "Invalid labels size. labels='" << labels << "', strides size=" << strides.size()
                     << ". Layout labels size must be equivalent to stride size";
                 MIOPEN_THROW(oss.str().c_str());
diff --git a/src/kernels/pooling_functions.h b/src/kernels/pooling_functions.h
index 6f53a2daab..aaaa431f53 100644
--- a/src/kernels/pooling_functions.h
+++ b/src/kernels/pooling_functions.h
@@ -39,7 +39,7 @@ typedef MLO_POOLING_INDEX_TYPE index_t;
 #define MLO_POOLING_OP_STC 2
 #define MLO_POOLING_OP_AVE_INCLUSIVE 3
 
-#ifndef MLO_POOLING_OP_ID
+#ifndef MLO_POOLING_OP_ID   // TODO TRJS check: default used to be 0
 #define MLO_POOLING_OP_ID 1
 #endif
 
diff --git a/src/solver/pooling/forwardNaive.cpp b/src/solver/pooling/forwardNaive.cpp
index 9039d49836..5bc4835ff5 100644
--- a/src/solver/pooling/forwardNaive.cpp
+++ b/src/solver/pooling/forwardNaive.cpp
@@ -82,7 +82,7 @@ bool PoolingForwardNaive::IsApplicable(const ExecutionContext&,
     return (problem.GetDirection() == miopen::pooling::Direction::Forward)          //
         && (x_type == y_type)                                                       //
         && (std::find(types.cbegin(), types.cend(), x_type) != types.cend())        //
-        && (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend())          //)
+        && (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend())          //
         && (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end());
 }
 
@@ -95,7 +95,7 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context,
     const auto bot  = problem.GetXDesc();
     const auto top  = problem.GetYDesc();
     const bool is2d = (bot.GetNumDims() == 4);
-    const bool isTranspose = problem.GetXDesc().GetLayout_str()[1] != 'C';
+    const bool isTranspose = problem.GetXDesc().GetLayout_str()[1] != 'C';  // TODO TRJS create member func
 
     // To compact code:
     const auto& pooling = problem.GetPooling();
@@ -204,7 +204,6 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context,
     const auto g1          = RoundUpNearestPower2Positive(isTranspose ? top_d : all_c);
     const auto g2          = RoundUpNearestPower2Positive(isTranspose || is2d_kernel ? top_h : top_d);
 
-    // TODO RJS: finish NHWC grid
     auto work_left = wavesize / 1;
     const auto w0  = (g0 < work_left) ? g0 : work_left;
     work_left /= w0;
diff --git a/test/gtest/ex1.cpp b/test/gtest/ex1.cpp
index 4b1ab5e2b1..324a3055cb 100644
--- a/test/gtest/ex1.cpp
+++ b/test/gtest/ex1.cpp
@@ -1,5 +1,5 @@
 #include <gtest/gtest.h>
-
+// TODO TRJS delete file
 struct paramType { std::string value; };
 
 class MyFixture : public testing::TestWithParam<paramType> {};
diff --git a/test/gtest/layout_transpose.cpp b/test/gtest/layout_transpose.cpp
index bab82000d7..54d6cc21a7 100644
--- a/test/gtest/layout_transpose.cpp
+++ b/test/gtest/layout_transpose.cpp
@@ -274,10 +274,7 @@ struct LayoutTransposeTest_2D : public ::testing::TestWithParam<std::tuple<uint3
                 miopen::tensor_layout_to_strides(
                     tensor_len, layout_default, layout_string, tensor_strides);
 
-                auto t_src     = tensor<T>{tensor_len, tensor_strides};
-
-                t_src.generate(gen_value<T>);
-
+                auto t_src     = tensor<T>{tensor_len, tensor_strides}.generate(gen_value<T>);
                 auto t_dst     = tensor<T>{tensor_len, tensor_strides};
                 auto t_dst_gpu = tensor<T>{tensor_len, tensor_strides};
 
diff --git a/test/gtest/pooling_testing.hpp b/test/gtest/pooling_testing.hpp
index eacd9ebeb6..8e94032708 100644
--- a/test/gtest/pooling_testing.hpp
+++ b/test/gtest/pooling_testing.hpp
@@ -525,7 +525,7 @@ struct pooling_driver : test_driver
     static void randomize_tensor(tensor<T>& in)
     {
         static tensor<T> random_data{{1}};
-        static tensor<int> starts{{1}};
+        static tensor<int> starts{std::vector<size_t>{1}};
         static size_t start_idx = 0;
 
         const auto size = in.GetSize();
@@ -534,22 +534,22 @@ struct pooling_driver : test_driver
         {
             random_data = tensor<T>{{ran_size}}.generate(tensor_elem_gen_integer{2503});
         }
-        if (starts.GetSize() == 1)
+        if (starts.GetSize() == 1)  // TODO TRJS is there a cleaner way to initialize starts?
         {
-            starts = tensor<size_t>{{1 << 20}}.generate(gen_start);
+            starts = tensor<int>{std::vector<size_t>{1 << 20}}.generate(gen_start);
         }
 
         const auto r_start = starts[start_idx++] % (random_data.GetSize() / 3);
         if (start_idx >= starts.GetSize()) start_idx = 0;
 
-        std::cout << "randomizing " << std::setw(9) << size << " elems from " << std::setw(9) << r_start << " (" << start_idx << ")"
+        std::cout << "randomizing " << std::setw(9) << size << " elems from " << std::setw(9) << r_start << " (" << start_idx << ")"    // TRJS
         // << "(" << std::setw(8) << prng::gen_0_to_B(size / 2)  << std::setw(8) << prng::gen_0_to_B(size / 2)  << std::setw(8) << prng::gen_0_to_B(size / 2)  << std::setw(8) << prng::gen_0_to_B(size / 2) << ")" 
         << std::endl;
         in.data.assign(random_data.begin() + r_start, random_data.begin() + r_start + size);
     }
 
     std::unordered_map<std::string, miopenIndexType_t> index_type_lookup = {
-        {miopen::ToUpper("miopenIndexUint8"), miopenIndexUint8},
+        {miopen::ToUpper("miopenIndexUint8"),  miopenIndexUint8},
         {miopen::ToUpper("miopenIndexUint16"), miopenIndexUint16},
         {miopen::ToUpper("miopenIndexUint32"), miopenIndexUint32},
         {miopen::ToUpper("miopenIndexUint64"), miopenIndexUint64},
diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp
index 546b40876f..1bd44a364b 100644
--- a/test/pooling2d.hpp
+++ b/test/pooling2d.hpp
@@ -24,7 +24,6 @@
  *
  *******************************************************************************/
 
-// #include "gtest/pooling_testing.hpp"
 #include "pooling_common.hpp"
 
 #define WORKAROUND_ISSUE_1670 1
@@ -38,13 +37,11 @@ struct pooling2d_shapes
     static std::vector<U> get_2d_pooling_input_shapes()
     {
         return {
-                {5, 32, 8, 8}   // TEMPCODE RUN
-                ,
+                {5, 32, 8, 8},
                 {16, 1, 4096, 4096},
                 {1, 16, 4096, 4096},
                 {1, 1024, 512, 512},
-                {16, 1024, 128, 128}
-                ,
+                {16, 1024, 128, 128},
                 {1, 832, 64, 128},
                 {10, 3, 32, 32},
                 {1, 19, 1024, 2048},
@@ -65,7 +62,7 @@ struct pooling2d_shapes
     }
 
     // Dataset 1 is intended for testing of asymmetric configs.
-    static std::vector<U> get_2d_pooling_input_shapes_minimal() { return {{1, 1, 8, 8}, {10, 3, 32, 32}}; }
+    static std::vector<U> get_2d_pooling_input_shapes_minimal() { return {{1, 4, 4, 4}, {10, 3, 32, 32}}; }
 
     // Dataset 2 is intended for testing of configs with wide window.
     static std::vector<U> get_2d_pooling_input_shapes_wide()
@@ -96,6 +93,7 @@ struct pooling2d_driver : pooling_driver<T>
 public:
     pooling2d_driver() : pooling_driver<T>()
     {
+        // clang-format off
 #if TEST_GET_INPUT_TENSOR
         std::set<U> in_dim_set = get_inputs(this->batch_factor);
         std::vector<U> in_dim_vec(in_dim_set.begin(), in_dim_set.end());
@@ -120,7 +118,6 @@ struct pooling2d_driver : pooling_driver<T>
                 {{1, 1}}
             }
         ));
-        // clang-format off
         this->add(this->pads, "pads", this->template generate_multi_data<U>({
                 {{0, 0}, {1, 1}}, //
 #if WORKAROUND_ISSUE_1670
diff --git a/test/pooling3d.hpp b/test/pooling3d.hpp
index 341cc34660..ffaaf4a9ee 100644
--- a/test/pooling3d.hpp
+++ b/test/pooling3d.hpp
@@ -33,15 +33,15 @@ struct pooling3d_shapes
 
     static std::vector<U> get_3d_pooling_input_shapes()
     {
-        return {{16, 64, 3, 4, 4}   // TEMPCODE RUN
-        ,
-                {16, 32, 4, 9, 9},
-                {8, 512, 3, 14, 14},
-                {8, 512, 4, 28, 28},
-                {16, 64, 56, 56, 56},
-                {4, 3, 4, 227, 227},
-                {4, 4, 4, 161, 700}
-                };
+        return {
+            {16, 64, 3, 4, 4},
+            {16, 32, 4, 9, 9},
+            {8, 512, 3, 14, 14},
+            {8, 512, 4, 28, 28},
+            {16, 64, 56, 56, 56},
+            {4, 3, 4, 227, 227},
+            {4, 4, 4, 161, 700}
+        };
     }
 };
 
diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp
index f42b6da6aa..0156f781a3 100644
--- a/test/pooling_common.hpp
+++ b/test/pooling_common.hpp
@@ -62,15 +62,15 @@ static int num_uint64_case_imgidx = 0;
 
 namespace {
 
-constexpr int RAND_INTEGER_MAX = 1200;
-constexpr int RAND_INTEGER_MIN = -880;
+constexpr int RAND_INTEGER_MAX = 12000;
+constexpr int RAND_INTEGER_MIN = -8800;
 
 template <typename T>
 auto gen_value =
-    [](auto... is) { return static_cast<T>(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX)) / 10; };
+    [](auto... is) { return static_cast<T>(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX)) / 100; };
 }
 
-static inline void print(const miopen::PoolingDescriptor& filter, bool is_default_layout)
+static inline void print(const miopen::PoolingDescriptor& filter)
 {
     std::cout << "Pooling: ";
     if(filter.GetMode() == miopenPoolingAverage)
@@ -80,7 +80,6 @@ static inline void print(const miopen::PoolingDescriptor& filter, bool is_defaul
     else
         std::cout << "Max";
     std::cout << std::endl;
-    std::cout << "Layout: " << (is_default_layout ? "default" : "transposed") << std::endl;
     std::cout << "Lengths: ";
     miopen::LogRange(std::cout, filter.GetLengths(), ", ") << std::endl;
     std::cout << "Pads: ";
@@ -148,7 +147,6 @@ struct verify_forward_pooling
     tensor<T>
     cpu(const tensor<T>& input, const miopen::PoolingDescriptor& filter, std::vector<Index>&) const
     {
-        // const bool is_default_layout = input.desc.IsDefaultLayout();
         const int sptl_dim_offset = 2;
         const int chan_dim_offset = 1;
 
@@ -276,10 +274,6 @@ struct verify_backward_pooling
                   bool use_global_index,
                   bool verify_index) const
     {
-        const bool is_default_layout = input.desc.IsDefaultLayout();
-        const int sptl_dim_offset = is_default_layout ? 2 : 1;
-        const int chan_dim_offset = is_default_layout ? 1 : SptDim + 1;
-
         auto dinput = input;
 
         std::vector<double> din_vec(input.desc.GetElementSpace(), 0.0);
@@ -550,6 +544,16 @@ struct pooling_driver : test_driver
             input,
             filter,
             indices);
+        auto dout = out.first;
+        dout.generate(tensor_elem_gen_integer{2503});
+        verify(verify_backward_pooling<SptDim>{},
+               input,
+               dout,
+               out.first,
+               filter,
+               indices,
+               wsidx != 0,
+               static_cast<bool>(this->verify_indices));
     }
 
     void run()
@@ -561,9 +565,6 @@ struct pooling_driver : test_driver
             (dataset_id == 0) && full_set; // Otherwise the default dataset takes too much time.
         const bool wide_dataset = (dataset_id == 2) && full_set;
 
-        // Input dimensions to the driver are always NCHW-style
-        const bool is_default_layout = !(layout == miopenTensorNHWC || layout == miopenTensorNDHWC);
-
         filter = miopen::PoolingDescriptor
         {
             mode_lookup.at(miopen::ToUpper(mode)),
@@ -766,7 +767,7 @@ struct pooling_driver : test_driver
             }
         }
 
-        int sptl_index = is_default_layout ? 2 : 1;
+        constexpr int sptl_index = 2;
 
         std::vector<int> in_dim(input_desc.GetLengths().begin() + sptl_index,
             input_desc.GetLengths().begin() + sptl_index + sptl_dim);

From 964bc57f209c065ad5c8a524f59475b2f45476b7 Mon Sep 17 00:00:00 2001
From: "Randy J. Spaulding" <rspauldi@amd.com>
Date: Tue, 8 Oct 2024 08:13:52 -0700
Subject: [PATCH 10/10] fix build

---
 test/pooling_common.hpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp
index 0156f781a3..c9f787b3ed 100644
--- a/test/pooling_common.hpp
+++ b/test/pooling_common.hpp
@@ -255,7 +255,7 @@ struct verify_forward_pooling
               const std::vector<Index>&) const
     {
         std::cout << "Forward ";
-        print(filter, input.desc.IsDefaultLayout());
+        print(filter);
         std::cout << "Input tensor: " << input.desc.ToString() << std::endl;
         std::cout << "Output tensor: " << filter.GetForwardOutputTensor(input.desc).ToString()
                   << std::endl;
@@ -276,6 +276,9 @@ struct verify_backward_pooling
     {
         auto dinput = input;
 
+        constexpr int sptl_dim_offset = 2;
+        constexpr int chan_dim_offset = 1;
+
         std::vector<double> din_vec(input.desc.GetElementSpace(), 0.0);
         CHECK(dout.desc == out.desc);
         std::array<int, SptDim + 2> in_dim{};
@@ -468,7 +471,7 @@ struct verify_backward_pooling
               bool) const
     {
         std::cout << "Backward ";
-        print(filter, input.desc.IsDefaultLayout());
+        print(filter);
         std::cout << "Input tensor: " << input.desc.ToString() << std::endl;
         std::cout << "Output tensor: " << out.desc.ToString() << std::endl;
     }
@@ -546,7 +549,7 @@ struct pooling_driver : test_driver
             indices);
         auto dout = out.first;
         dout.generate(tensor_elem_gen_integer{2503});
-        verify(verify_backward_pooling<SptDim>{},
+        verify(verify_backward_pooling<SptlDim>{},
                input,
                dout,
                out.first,