diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 0f2c2a5cb0..c69c9957c8 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -396,7 +396,7 @@ typedef enum miopenTensorCHWNc4 = 5, /*!< CHWNc4 memory layout (Partially supported) */ miopenTensorCHWNc8 = 6, /*!< CHWNc8 memory layout (Partially supported) */ miopenTensorNCDHW = 7, /*!< NCDHW memory layout (Fully supported) */ - miopenTensorNDHWC = 8, /*!< NCDHW memory layout (Fully supported) */ + miopenTensorNDHWC = 8, /*!< NDHWC memory layout (Fully supported) */ } miopenTensorLayout_t; /*! @ingroup pooling diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 14e7c954b1..5855cf20e7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -288,9 +288,14 @@ set( MIOpen_Source solver/mha/mha_solver_forward.cpp solver/pooling/forward2d.cpp solver/pooling/forwardNaive.cpp + solver/pooling/forwardNdNhwcNaive.cpp solver/pooling/forwardNd.cpp + solver/pooling/forwardCk2d.cpp + solver/pooling/forwardCkNd.cpp solver/pooling/backward2d.cpp solver/pooling/backwardNd.cpp + solver/pooling/backwardCk2d.cpp + solver/pooling/backwardCkNd.cpp solver/reduce/forward_argmax.cpp solver/reduce/forward_argmin.cpp solver/reduce/forward_max.cpp @@ -447,6 +452,8 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/miopen_type_traits.hpp kernels/miopen_utility.hpp kernels/neuron.inc + kernels/pooling_functions.h + include/miopen/pooling/poolingNdNhwcArgs.hpp kernels/rocm_version.inc kernels/stride_array.hpp kernels/utilities.inc @@ -495,6 +502,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenPoolingBwd.cl kernels/MIOpenPoolingBwdND.cl kernels/MIOpenPoolingForwardNaive.cl + kernels/MIOpenPoolingForwardNDNhwcNaive.cpp kernels/MIOpenPoolingND.cl kernels/MIOpenConv1x1S.cl kernels/MIOpenConv1x1J1.cl diff --git a/src/comgr.cpp b/src/comgr.cpp index aa53b71bb5..2c533f9623 100644 --- a/src/comgr.cpp +++ b/src/comgr.cpp 
@@ -988,8 +988,12 @@ void BuildHip(const std::string& name, opts.push_back("-std=c++17"); HiprtcProgram prog(name, text); - prog.Compile(opts); - prog.GetCode(binary); + try{ // TEMPCODE RJS + + std::cout << "Compiling HIP: '" << name << "'" << std::endl; // TEMPCODE RJS + prog.Compile(opts); + } catch(Error& ex) { std::cout << __FUNCTION__ << " : Exception calling prog.Compile!: " << ex.text << std::endl; throw(ex); } + prog.GetCode(binary); } catch(Error& ex) { diff --git a/src/include/miopen/conv/problem_description.hpp b/src/include/miopen/conv/problem_description.hpp index 9447d7a5ca..dcbcac27f3 100644 --- a/src/include/miopen/conv/problem_description.hpp +++ b/src/include/miopen/conv/problem_description.hpp @@ -32,6 +32,7 @@ #include #include +#include #include #include @@ -136,7 +137,7 @@ namespace conv { MIOPEN_INTERNALS_EXPORT miopenAlphaBetaCase_t ClassifyAlphaBeta(const Scalar& alpha, const Scalar& beta); -struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase +struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionWeightsBase #if MIOPEN_ENABLE_SQLITE , SQLiteSerializable @@ -153,13 +154,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase int bias_ = 0, const Scalar& alpha_ = Scalar(1.0), const Scalar& beta_ = Scalar(0.0)) - : in(in_), - weights(weights_), - out(out_), + : ProblemDescriptionWeightsBase(in_, weights_, out_), conv(conv_), - in_layout(ComputeInLayout()), - weights_layout(ComputeWeightsLayout()), - out_layout(ComputeOutLayout()), direction(direction_), bias(bias_), alpha(alpha_), @@ -443,49 +439,7 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase void SetupFloats(ExecutionContext& ctx) const; private: - std::string ComputeInLayout() const - { - if(GetSpatialDims() == 2) - { - return in.GetLayout(in.GetLayout_str()); - } - else - { - return in.GetLayout("NCDHW"); - } - } - - std::string ComputeOutLayout() const - { - if(GetSpatialDims() == 2) - { 
return out.GetLayout(out.GetLayout_str()); - } - else - { - return out.GetLayout("NCDHW"); - } - } - - std::string ComputeWeightsLayout() const - { - if(GetSpatialDims() == 2) - { - return weights.GetLayout(weights.GetLayout_str()); - } - else - { - return weights.GetLayout("NCDHW"); - } - } - - TensorDescriptor in; - TensorDescriptor weights; - TensorDescriptor out; ConvolutionDescriptor conv; - std::string in_layout; - std::string weights_layout; - std::string out_layout; Direction direction = Direction::Forward; int bias = 0; Scalar alpha = Scalar(1.0); diff --git a/src/include/miopen/kernel_info.hpp b/src/include/miopen/kernel_info.hpp index d2571afa32..d824fe5c78 100644 --- a/src/include/miopen/kernel_info.hpp +++ b/src/include/miopen/kernel_info.hpp @@ -48,6 +48,9 @@ struct KernelInfo fs::path kernel_file; std::string kernel_name; friend std::ostream& operator<<(std::ostream& os, const KernelInfo& k); + + /// configures the working set using hip-style indices + void ConfigureHip(size_t l0, size_t l1, size_t l2, size_t g0, size_t g1, size_t g2); }; std::vector PrecompileKernels(const Handle& h, diff --git a/src/include/miopen/pooling.hpp b/src/include/miopen/pooling.hpp index 0ab5ffa1c7..26805f6df8 100644 --- a/src/include/miopen/pooling.hpp +++ b/src/include/miopen/pooling.hpp @@ -113,6 +113,8 @@ struct MIOPEN_EXPORT PoolingDescriptor : miopenPoolingDescriptor void SetWorkspaceIndexMode(miopenPoolingWorkspaceIndexMode_t workspace_index); miopenPoolingMode_t GetMode() const; + + bool ModeIsAveraging() const; miopenPaddingMode_t GetPaddingMode() const; diff --git a/src/include/miopen/pooling/poolingNdNhwcArgs.hpp b/src/include/miopen/pooling/poolingNdNhwcArgs.hpp new file mode 100644 index 0000000000..5084f28b4f --- /dev/null +++ b/src/include/miopen/pooling/poolingNdNhwcArgs.hpp @@ -0,0 +1,47 @@ +#pragma once + +#include <cstdint> + +struct poolingNdNhwcArgs +{ + uint32_t filter_d; + uint32_t filter_h; + uint32_t filter_w; + + uint32_t filter_d_stride; + uint32_t filter_h_stride; + 
uint32_t filter_w_stride; + + uint32_t filter_d_pad; + uint32_t filter_h_pad; + uint32_t filter_w_pad; + + uint32_t all_n; + uint32_t all_c; + + uint32_t bot_d; + uint32_t bot_h; + uint32_t bot_w; + + uint32_t bot_n_stride; + uint32_t bot_c_stride; + uint32_t bot_d_stride; + uint32_t bot_h_stride; + uint32_t bot_w_stride; + + uint32_t top_d; + uint32_t top_h; + uint32_t top_w; + + uint32_t top_n_stride; + uint32_t top_c_stride; + uint32_t top_d_stride; + uint32_t top_h_stride; + uint32_t top_w_stride; + + uint32_t mask_n_stride; + uint32_t mask_c_stride; + uint32_t mask_d_stride; + uint32_t mask_h_stride; + uint32_t mask_w_stride; +}; diff --git a/src/include/miopen/pooling/solvers.hpp b/src/include/miopen/pooling/solvers.hpp index 0d0e35755a..d8836fcdc0 100644 --- a/src/include/miopen/pooling/solvers.hpp +++ b/src/include/miopen/pooling/solvers.hpp @@ -66,6 +66,30 @@ struct PoolingForwardNd final : PoolingSolver const miopen::pooling::ProblemDescription& problem) const override; }; +struct PoolingForwardCk2d final : PoolingSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; +}; + +struct PoolingForwardCkNd final : PoolingSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const 
miopen::pooling::ProblemDescription& problem) const override; +}; + struct PoolingForwardNaive final : PoolingSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } @@ -79,6 +103,18 @@ struct PoolingForwardNaive final : PoolingSolver const miopen::pooling::ProblemDescription& problem) const override; }; +struct PoolingForwardNDNhwcNaive final : PoolingSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; +}; + template struct PoolingFwdNCHWTransposingSolver : TransposingSolver, PoolingSolver, @@ -145,6 +181,18 @@ struct PoolingBackward2d final : PoolingSolver const miopen::pooling::ProblemDescription& problem) const override; }; +struct PoolingBackwardCk2d final : PoolingSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; +}; + struct PoolingBackwardNd final : PoolingSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } @@ -157,6 +205,18 @@ struct PoolingBackwardNd final : PoolingSolver const miopen::pooling::ProblemDescription& problem) const override; }; +struct PoolingBackwardCkNd final : PoolingSolver +{ + const std::string& SolverDbId() const override { return 
GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; +}; + template struct PoolingBwdNCHWTransposingSolver : TransposingSolver, PoolingSolver, diff --git a/src/include/miopen/problem_description_layout.hpp b/src/include/miopen/problem_description_layout.hpp new file mode 100644 index 0000000000..780086afe6 --- /dev/null +++ b/src/include/miopen/problem_description_layout.hpp @@ -0,0 +1,102 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include +#include +#include + +#include + +namespace miopen { + +struct ProblemDescriptionLayoutBase : ProblemDescriptionBase +{ + ProblemDescriptionLayoutBase() = default; + ProblemDescriptionLayoutBase(const ProblemDescriptionLayoutBase&) = default; + ProblemDescriptionLayoutBase(const TensorDescriptor& in_, // x for Forward, y for Backward* + const TensorDescriptor& out_ // y for Forward, x for Backward* + ) + : ProblemDescriptionBase(), + in(in_), + out(out_), + in_layout(ComputeInLayout()), + out_layout(ComputeOutLayout()) + {} + virtual ~ProblemDescriptionLayoutBase() = default; + + ProblemDescriptionLayoutBase& operator=(const ProblemDescriptionLayoutBase&) = default; + + [[nodiscard]] virtual NetworkConfig MakeNetworkConfig() const = 0; + +protected: + TensorDescriptor in; + TensorDescriptor out; + std::string in_layout; + std::string out_layout; + + std::string ComputeInLayout() const + { + return in.GetLayout(in.GetLayout_str()); + } + + std::string ComputeOutLayout() const + { + return out.GetLayout(out.GetLayout_str()); + } +}; + +struct ProblemDescriptionWeightsBase : ProblemDescriptionLayoutBase +{ + ProblemDescriptionWeightsBase() = default; + ProblemDescriptionWeightsBase(const ProblemDescriptionWeightsBase&) = default; + ProblemDescriptionWeightsBase(const TensorDescriptor& in_, // x for Forward, y for Backward* + const TensorDescriptor& weights_, + const TensorDescriptor& out_ // y for Forward, x for Backward* + ) + : ProblemDescriptionLayoutBase(in_, out_), + weights(weights_), + weights_layout(ComputeWeightsLayout()) + {} + virtual 
~ProblemDescriptionWeightsBase() = default; + + ProblemDescriptionWeightsBase& operator=(const ProblemDescriptionWeightsBase&) = default; + + [[nodiscard]] virtual NetworkConfig MakeNetworkConfig() const = 0; + +protected: + TensorDescriptor weights; + std::string weights_layout; + + std::string ComputeWeightsLayout() const + { + return weights.GetLayout(weights.GetLayout_str()); + } +}; + +} // namespace miopen diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp index f4d2b2dca7..3059622374 100644 --- a/src/include/miopen/tensor.hpp +++ b/src/include/miopen/tensor.hpp @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -162,6 +163,10 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor TensorDescriptor(miopenDataType_t t, const std::vector& lens_in, const std::vector& strides_in); + TensorDescriptor(miopenDataType_t t, + miopenTensorLayout_t layout_in, + const std::vector& lens_in, + const std::vector& strides_in); TensorDescriptor(miopenDataType_t t, const std::initializer_list& lens_in, const std::initializer_list& strides_in); @@ -207,6 +212,7 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor miopenTensorLayout_t GetLayout_t() const; static std::string GetLayoutStr(miopenTensorLayout_t layout); std::string GetLayout_str() const; + bool IsDefaultLayout() const; std::size_t GetVectorLength() const; std::optional GetCastType() const; @@ -259,8 +265,10 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor { if(labels.size() != strides.size()) { - MIOPEN_THROW( - "Invalid labels size. Layout labels size must be equavalent to stride size"); + std::ostringstream oss; // TODO TRJS check this print + oss << "Invalid labels size. labels='" << labels << "', strides size=" << strides.size() + << ". Layout labels size must be equivalent to stride size"; + MIOPEN_THROW(oss.str().c_str()); } // Copy construct the result string from labels. 
This allocates the space at one go @@ -276,7 +284,7 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor if(base_label.size() != strides.size()) { MIOPEN_THROW( - "Invalid labels size. Layout labels size must be equavalent to stride size"); + "Invalid labels size. Layout labels size must be equivalent to stride size"); } auto result = base_label; auto p = find_permutation(lens, strides); @@ -285,6 +293,18 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor } } + + static bool IsDefaultLayout(miopenTensorLayout_t layout, unsigned spatial_dims = 2) + { + switch (spatial_dims) + { + case 2: + case 3: + return layout == GetDefaultLayout(spatial_dims); + default: + MIOPEN_THROW(miopenStatusBadParm, "Spatial dimension count must be 2 or 3."); + } + } + friend MIOPEN_INTERNALS_EXPORT std::ostream& operator<<(std::ostream& stream, const TensorDescriptor& t); @@ -292,7 +312,16 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor friend void from_json(const nlohmann::json& j, TensorDescriptor& descriptor); protected: - static miopenTensorLayout_t GetDefaultLayout() { return miopenTensorNCHW; }; + static miopenTensorLayout_t GetDefaultLayout(unsigned spatial_dims = 2) + { + switch (spatial_dims) + { + case 2: return miopenTensorNCHW; + case 3: return miopenTensorNCDHW; + default: + MIOPEN_THROW(miopenStatusBadParm, "Spatial dimension count must be 2 or 3."); + } + }; private: TensorDescriptor(miopenDataType_t t, @@ -328,11 +357,34 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor template constexpr auto GetNCDHW(unsigned spatial_dims, const std::vector& data) +{ + if(spatial_dims == 3) + { + if(data.size() == 5) // NCDHW + return miopen::tien<5>(data, 1); + else if(data.size() == 3) // DHW + return std::make_tuple(static_cast(1), static_cast(1), data[0], data[1], data[2]); + else + MIOPEN_THROW("Invalid data length; must be 5 or 3 with 3 spatial dimensions"); + } + else + { + if(data.size() == 
4) // NCHW + return std::make_tuple(data[0], data[1], static_cast(1), data[2], data[3]); + else if(data.size() == 2) // HW + return std::make_tuple(static_cast(1), static_cast(1), static_cast(1), data[0], data[1]); + else + MIOPEN_THROW("Invalid data length; must be 4 or 2 with 2 spatial dimensions"); + } +} + +template +constexpr auto GetNDHWC(unsigned spatial_dims, const std::vector& data) { if(spatial_dims == 3) return miopen::tien<5>(data, 1); else - return std::make_tuple(data[0], data[1], static_cast(1), data[2], data[3]); + return std::make_tuple(data[0], static_cast(1), data[1], data[2], data[3]); } } // namespace miopen diff --git a/src/kernels/MIOpenPoolingBwd.cl b/src/kernels/MIOpenPoolingBwd.cl index 6c88bebadf..062c5ed337 100644 --- a/src/kernels/MIOpenPoolingBwd.cl +++ b/src/kernels/MIOpenPoolingBwd.cl @@ -218,6 +218,168 @@ mloPoolingAveBwd(const __global _FLOAT* top_diff, } } } + +__attribute__((reqd_work_group_size(MLO_POOLBWD_GROUP_SZ0, + MLO_POOLBWD_GROUP_SZ1, + MLO_POOLBWD_GROUP_SZ2))) __kernel void +mloPoolingAveBwdNhwc(const __global _FLOAT* top_diff, + __global _FLOAT* bot_diff, + int mlo_pad1, + int mlo_pad0, + int mlo_n_outputs, + int mlo_bot_height, + int mlo_bot_width, + int mlo_top_height, + int mlo_top_width, + int mlo_botdf_batch_str, + int mlo_botdf_channel_str, + int mlo_botdf_str, + int mlo_topdf_batch_str, + int mlo_topdf_channel_str, + int mlo_topdf_str) +{ + __local _FLOAT lcl_top_diff[MLO_POOLBWD_LCL_DATA_WIDTH * MLO_POOLBWD_LCL_DATA_HEIGHT]; + + int x = get_group_id(0) * MLO_POOLBWD_GROUP_SZ0 * MLO_POOLBWD_N_HORIZ_OUT_PIX; + int y = get_group_id(1) * MLO_POOLBWD_GROUP_SZ1 * MLO_POOLBWD_N_VERT_OUT_PIX; + int lcl_id0 = get_local_id(0); + int lcl_id1 = get_local_id(1); + // int lcl_id = (lcl_id1 << MLO_POOLBWD_GROUP_LG2SZ1) + lcl_id0; + int ob = get_global_id(2); // outputs * batch_sz + int b = ob / mlo_n_outputs; + int o = ob - b * mlo_n_outputs; + + int top_x = (x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0) < 0 + ? 
0 + : (x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0) / MLO_POOLING_STRIDE0 + 1; + int top_y = (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) < 0 + ? 0 + : (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) / MLO_POOLING_STRIDE1 + 1; + int top_off = b * mlo_topdf_batch_str + o * mlo_topdf_channel_str; + + _FLOAT_ACCUM res[MLO_POOLBWD_N_VERT_OUT_PIX][MLO_POOLBWD_N_HORIZ_OUT_PIX]; + for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) + { + for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) + { + res[k][l] = (_FLOAT_ACCUM)0; + } + } + + // load tile + for(int tj = lcl_id1; tj < MLO_POOLBWD_LCL_DATA_HEIGHT; tj += MLO_POOLBWD_GROUP_SZ1) + { + int top_y_act = top_y + tj; + int top_y_off = top_y_act * mlo_topdf_str; + + int lcl_off_v = tj * MLO_POOLBWD_LCL_DATA_WIDTH; + + bool invisibleY = (top_y_act >= mlo_top_height); + + for(int ti = lcl_id0; ti < MLO_POOLBWD_LCL_DATA_WIDTH; ti += MLO_POOLBWD_GROUP_SZ0) + { + + int top_x_act = top_x + ti; + + bool invisibleX = (top_x_act >= mlo_top_width); + + int top_diff_off = (invisibleX || invisibleY) ? 0 : top_off + top_y_off + top_x_act; + + _FLOAT top_val = top_diff[top_diff_off]; + + top_val = (invisibleX || invisibleY) ? 0 : top_val; + + lcl_top_diff[lcl_off_v + ti] = top_val; +#if 0 + if (lcl_id1==0&&o==0&&b==0) + { + printf("K:in: %d %d %d %f\n", top_off + top_y_off + top_x_act, top_y_act, top_x_act, top_val); + } +#endif + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + int bot_y = (y + lcl_id1 * MLO_POOLBWD_N_VERT_OUT_PIX); + int bot_x = (x + lcl_id0 * MLO_POOLBWD_N_HORIZ_OUT_PIX); + + for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) + { + + int h = bot_y + k + mlo_pad1; + int top_hstart = (h < MLO_POOLING_KERNEL_SZ1) + ? 0 + : (h - MLO_POOLING_KERNEL_SZ1) / MLO_POOLING_STRIDE1 + 1; + int top_hend = min(h / MLO_POOLING_STRIDE1 + 1, mlo_top_height); + + for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) + { + + int w = bot_x + l + mlo_pad0; + int top_wstart = (w < MLO_POOLING_KERNEL_SZ0) + ? 
0 + : (w - MLO_POOLING_KERNEL_SZ0) / MLO_POOLING_STRIDE0 + 1; + int top_wend = min(w / MLO_POOLING_STRIDE0 + 1, mlo_top_width); + + for(int top_h = top_hstart; top_h < top_hend; ++top_h) + { + int hstart = top_h * MLO_POOLING_STRIDE1 - mlo_pad1; + int hend = min(hstart + MLO_POOLING_KERNEL_SZ1, mlo_bot_height); + hstart = max(hstart, 0); + + for(int top_w = top_wstart; top_w < top_wend; ++top_w) + { + // figure out the pooling size + int wstart = top_w * MLO_POOLING_STRIDE0 - mlo_pad0; + int wend = min(wstart + MLO_POOLING_KERNEL_SZ0, mlo_bot_width); + wstart = max(wstart, 0); + int pool_size = +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE + MLO_POOLING_KERNEL_SZ0 * MLO_POOLING_KERNEL_SZ1; + (void)wend; + (void)hend; +#else + (hend - hstart) * (wend - wstart); +#endif + pool_size = (pool_size == 0) ? 1 : pool_size; + int lcl_top_h = top_h - top_y; + int lcl_top_w = top_w - top_x; + _FLOAT_ACCUM add_val = + CVT_FLOAT2ACCUM( + lcl_top_diff[lcl_top_h * MLO_POOLBWD_LCL_DATA_WIDTH + lcl_top_w]) / + CVT_INTEGRAL2ACCUM(pool_size); + res[k][l] += add_val; +#if 0 + if (bot_x+l==6&&bot_y+k==0&&o==3&&b==0) + { + printf("K:com: %d %d %d %d %d %d %10.8f %10.8f %10.8f %d\n", k,l,top_h, top_w, lcl_top_h, lcl_top_w, res[k][l], add_val, lcl_top_diff[lcl_top_h * MLO_POOLBWD_LCL_DATA_WIDTH + lcl_top_w], pool_size); + } +#endif + } + } + } + } + + int bot_off = + b * mlo_botdf_batch_str + o * mlo_botdf_channel_str + bot_y * mlo_botdf_str + bot_x; + for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) + { + for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) + { + if(bot_y + k < mlo_bot_height && bot_x + l < mlo_bot_width) + { + bot_diff[bot_off + k * mlo_botdf_str + l] = CVT_ACCUM2FLOAT(res[k][l]); +#if 0 + if (lcl_id0==0&&lcl_id1==0&&o==0&&b==0) + { + printf("K:out: %d %d %d %f\n", bot_off + k * mlo_botdf_str +l, k, l, bot_diff[bot_off + k * mlo_botdf_str +l]); + } +#endif + } + } + } +} #endif // AVERAGE_OPS #if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX @@ -379,4 +541,164 @@ 
mloPoolingMaxBwd(const __global _FLOAT* top_df, } } } + +__attribute__((reqd_work_group_size(MLO_POOLBWD_GROUP_SZ0, + MLO_POOLBWD_GROUP_SZ1, + MLO_POOLBWD_GROUP_SZ2))) __kernel void +mloPoolingMaxBwdNhwc(const __global _FLOAT* top_df, + __global _FLOAT* bot_df, + __global index_t* mask, + int mlo_pad1, + int mlo_pad0, + int mlo_n_outputs, + int mlo_bot_height, + int mlo_bot_width, + int mlo_top_height, + int mlo_top_width, + int mlo_botdf_batch_str, + int mlo_botdf_channel_str, + int mlo_botdf_str, + int mlo_topdf_batch_str, + int mlo_topdf_channel_str, + int mlo_topdf_str) +{ + __local _FLOAT lcl_top_df[MLO_POOLBWD_LCL_DATA_WIDTH * MLO_POOLBWD_LCL_DATA_HEIGHT]; + __local index_t lcl_mask[MLO_POOLBWD_LCL_DATA_WIDTH * MLO_POOLBWD_LCL_DATA_HEIGHT]; + + int gid0 = get_group_id(0); + int gid1 = get_group_id(1); + int x = gid0 * MLO_POOLBWD_GROUP_SZ0 * MLO_POOLBWD_N_HORIZ_OUT_PIX; + int y = gid1 * MLO_POOLBWD_GROUP_SZ1 * MLO_POOLBWD_N_VERT_OUT_PIX; + int lcl_id0 = get_local_id(0); + int lcl_id1 = get_local_id(1); + int ob = get_global_id(2); // outputs * batch_sz + int b = ob / mlo_n_outputs; + int o = ob - b * mlo_n_outputs; + + int top_x = (x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0) < 0 + ? 0 + : (x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0) / MLO_POOLING_STRIDE0 + 1; + int top_y = (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) < 0 + ? 
0 + : (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) / MLO_POOLING_STRIDE1 + 1; + int top_df_off = b * mlo_topdf_batch_str + o * mlo_topdf_channel_str; + + _FLOAT res[MLO_POOLBWD_N_VERT_OUT_PIX][MLO_POOLBWD_N_HORIZ_OUT_PIX]; + _FLOAT top_df_val; + index_t mask_val; + // load tiles + // top df and top + for(int tj = lcl_id1; tj < MLO_POOLBWD_LCL_DATA_HEIGHT; tj += MLO_POOLBWD_GROUP_SZ1) + { + int top_y_act = top_y + tj; + int top_df_y_off = top_y_act * mlo_topdf_str; + + int lcl_off_v = tj * MLO_POOLBWD_LCL_DATA_WIDTH; + + bool visibleY = (top_y_act < mlo_top_height); + + for(int ti = lcl_id0; ti < MLO_POOLBWD_LCL_DATA_WIDTH; ti += MLO_POOLBWD_GROUP_SZ0) + { + mask_val = MLO_POOLING_INDEX_MAX; + int top_x_act = top_x + ti; + int lcl_idx = lcl_off_v + ti; + + bool visible = visibleY && (top_x_act < mlo_top_width); + if(visible) + { + int idx = top_df_off + top_df_y_off + top_x_act; + + top_df_val = top_df[idx]; + mask_val = mask[idx]; + // top_df_val *= visible; + + lcl_top_df[lcl_idx] = top_df_val; + } + lcl_mask[lcl_idx] = mask_val; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + _FLOAT add_val; + int bt_y = (y + lcl_id1 * MLO_POOLBWD_N_VERT_OUT_PIX); + int bt_x = (x + lcl_id0 * MLO_POOLBWD_N_HORIZ_OUT_PIX); + + for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) + { + int b_y = bt_y + k; + + // top most top y that can be influenced by this bot y + int tt_y1 = + (b_y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1 + MLO_POOLING_STRIDE1) / MLO_POOLING_STRIDE1; + int tt_y = max(0, tt_y1); + + for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) + { + int b_x = bt_x + l; + // left most top x that can be influenced by this bot x + int lt_x1 = (b_x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0 + MLO_POOLING_STRIDE0) / + MLO_POOLING_STRIDE0; + int lt_x = max(0, lt_x1); + + // find and sum up all tops that have been influenced by particular bot + res[k][l] = 0; + + for(int th = tt_y; th < tt_y + (MLO_POOLING_KERNEL_SZ1 + MLO_POOLING_STRIDE1 - 1) / + MLO_POOLING_STRIDE1; + ++th) + { + 
__attribute__((opencl_unroll_hint(2))) for(int tw = lt_x; + tw < lt_x + (MLO_POOLING_KERNEL_SZ0 + + MLO_POOLING_STRIDE0 - 1) / + MLO_POOLING_STRIDE0; + ++tw) + { + int lcl_th = th - top_y; + int lcl_tw = tw - top_x; +#if USE_IMG_INDEX + index_t img_idx = b_x + b_y * mlo_bot_width; +#else + int filter_x = b_x - tw * MLO_POOLING_STRIDE0 + mlo_pad0; + int filter_y = b_y - th * MLO_POOLING_STRIDE1 + mlo_pad1; + int filter_idx = filter_x + filter_y * MLO_POOLING_KERNEL_SZ0; +#endif + bool visible = (lcl_th < MLO_POOLBWD_LCL_DATA_HEIGHT) && + (lcl_tw < MLO_POOLBWD_LCL_DATA_WIDTH); + int lcl_idx = visible ? (lcl_th * MLO_POOLBWD_LCL_DATA_WIDTH + lcl_tw) : 0; + + bool match = visible && +#if USE_IMG_INDEX + (img_idx == lcl_mask[lcl_idx]) +#else + (filter_idx == lcl_mask[lcl_idx]) && (filter_x >= 0) && + (filter_y >= 0) +#endif + ; + + //_FLOAT add_val = lcl_top_df[lcl_idx] * match; + //_FLOAT add_val = match ? lcl_top_df[lcl_idx] : (_FLOAT)0; + if(match) + { + add_val = lcl_top_df[lcl_idx]; + res[k][l] += add_val; + } + } + } + } + } + + int bot_df_off = + b * mlo_botdf_batch_str + o * mlo_botdf_channel_str + bt_y * mlo_botdf_str + bt_x; + for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) + { + for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) + { + if((bt_y + k) < mlo_bot_height && (bt_x + l) < mlo_bot_width) + { + bot_df[bot_df_off + k * mlo_botdf_str + l] = res[k][l]; + } + } + } +} + #endif // MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX diff --git a/src/kernels/MIOpenPoolingBwdND.cl b/src/kernels/MIOpenPoolingBwdND.cl index 7daacd24ab..4e9b4da04e 100644 --- a/src/kernels/MIOpenPoolingBwdND.cl +++ b/src/kernels/MIOpenPoolingBwdND.cl @@ -147,6 +147,120 @@ mloPoolingNDMaxBwd(const __global _FLOAT* top_df, } } +__attribute__((reqd_work_group_size(MLO_POOLING_GROUP_SZ0, 1, 1))) __kernel void +mloPoolingNDMaxBwdNhwc(const __global _FLOAT* top_df, + __global _FLOAT* bot_df, + __global index_t* mask, + const uint pad_d, + const uint pad_h, + const uint pad_w, + const uint 
batch, + const uint chal, + const uint bot_d, + const uint bot_h, + const uint bot_w, + const uint top_d, + const uint top_h, + const uint top_w, + const uint bot_str_b, + const uint bot_str_c, + const uint bot_str_d, + const uint bot_str_h, + const uint top_str_b, + const uint top_str_c, + const uint top_str_d, + const uint top_str_h, + const uint total_work) +{ + + int bot_blk_w = (bot_w + PIX_W_PER_WORK - 1) / PIX_W_PER_WORK; + int bot_blk_h = (bot_h + PIX_H_PER_WORK - 1) / PIX_H_PER_WORK; + int bot_blk_d = (bot_d + PIX_D_PER_WORK - 1) / PIX_D_PER_WORK; + + bot_blk_w = max(bot_blk_w, 1); + bot_blk_h = max(bot_blk_h, 1); + bot_blk_d = max(bot_blk_d, 1); + + for(uint gid = get_global_id(0); gid < total_work; gid += MAX_ACTIV_WORKITEM) + { + int b_id = gid / chal / bot_blk_w / bot_blk_h / bot_blk_d; + int c_id = (gid / bot_blk_w / bot_blk_h / bot_blk_d) % chal; + + int bot_d_id = ((gid / bot_blk_w / bot_blk_h) % bot_blk_d) * PIX_D_PER_WORK; + int bot_h_id = ((gid / bot_blk_w) % bot_blk_h) * PIX_H_PER_WORK; + int bot_w_id = (gid % bot_blk_w) * PIX_W_PER_WORK; + + int top_d_start = + bot_d_id + pad_d < KERNEL_SZ_D ? 0 : (bot_d_id + pad_d - KERNEL_SZ_D) / STRIDE_D + 1; + int top_h_start = + bot_h_id + pad_h < KERNEL_SZ_H ? 0 : (bot_h_id + pad_h - KERNEL_SZ_H) / STRIDE_H + 1; + int top_w_start = + bot_w_id + pad_w < KERNEL_SZ_W ? 
0 : (bot_w_id + pad_w - KERNEL_SZ_W) / STRIDE_W + 1; + + int top_d_end = (bot_d_id + PIX_D_PER_WORK - 1 + pad_d) / STRIDE_D + 1; + int top_h_end = (bot_h_id + PIX_H_PER_WORK - 1 + pad_h) / STRIDE_H + 1; + int top_w_end = (bot_w_id + PIX_W_PER_WORK - 1 + pad_w) / STRIDE_W + 1; + + top_d_end = min(top_d_end, (int)top_d); + top_h_end = min(top_h_end, (int)top_h); + top_w_end = min(top_w_end, (int)top_w); + + _FLOAT bot_data[PIX_D_PER_WORK][PIX_H_PER_WORK][PIX_W_PER_WORK] = {0}; + + for(int h = top_d_start; h < top_d_end; ++h) + { + for(int j = top_h_start; j < top_h_end; ++j) + { + for(int i = top_w_start; i < top_w_end; ++i) + { + uint top_gbl_off = + b_id * top_str_b + c_id * top_str_c + h * top_str_d + j * top_str_h + i; + + _FLOAT top_val = b_id < batch ? top_df[top_gbl_off] : 0; + index_t mask_idx = b_id < batch ? mask[top_gbl_off] : MLO_POOLING_INDEX_MAX; + + uint mask_d_id = mask_idx / bot_h / bot_w; + uint mask_h_id = (mask_idx / bot_w) % bot_h; + uint mask_w_id = mask_idx % bot_w; + + if(mask_d_id >= bot_d_id && mask_h_id >= bot_h_id && mask_w_id >= bot_w_id && + mask_d_id < bot_d_id + PIX_D_PER_WORK && + mask_h_id < bot_h_id + PIX_H_PER_WORK && + mask_w_id < bot_w_id + PIX_W_PER_WORK) + { + mask_d_id -= bot_d_id; + mask_h_id -= bot_h_id; + mask_w_id -= bot_w_id; + + bot_data[mask_d_id][mask_h_id][mask_w_id] += top_val; + } + } + } + } + + uint bot_off = b_id * bot_str_b + c_id * bot_str_c + bot_d_id * bot_str_d + + bot_h_id * bot_str_h + bot_w_id; + + for(uint m = 0; m < PIX_D_PER_WORK; m++) + { + for(uint k = 0; k < PIX_H_PER_WORK; k++) + { + for(uint l = 0; l < PIX_W_PER_WORK; l++) + { + + if(bot_d_id + m < bot_d && bot_h_id + k < bot_h && bot_w_id + l < bot_w && + b_id < batch) + { + uint bot_idx = bot_off + m * bot_str_d + k * bot_str_h + l; + + bot_df[bot_idx] = bot_data[m][k][l]; + } + } + } + } + } +} + #elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE || MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE 
__attribute__((reqd_work_group_size(MLO_POOLING_GROUP_SZ0, 1, 1))) __kernel void @@ -281,4 +395,137 @@ mloPoolingNDAveBwd(const __global _FLOAT* top_df, } } } + +__attribute__((reqd_work_group_size(MLO_POOLING_GROUP_SZ0, 1, 1))) __kernel void +mloPoolingNDAveBwdNhwc(const __global _FLOAT* top_df, + __global _FLOAT* bot_df, + const uint pad_d, + const uint pad_h, + const uint pad_w, + const uint batch, + const uint chal, + const uint bot_d, + const uint bot_h, + const uint bot_w, + const uint top_d, + const uint top_h, + const uint top_w, + const uint bot_str_b, + const uint bot_str_c, + const uint bot_str_d, + const uint bot_str_h, + const uint top_str_b, + const uint top_str_c, + const uint top_str_d, + const uint top_str_h, + const uint total_work) +{ + + int bot_blk_w = (bot_w + PIX_W_PER_WORK - 1) / PIX_W_PER_WORK; + int bot_blk_h = (bot_h + PIX_H_PER_WORK - 1) / PIX_H_PER_WORK; + int bot_blk_d = (bot_d + PIX_D_PER_WORK - 1) / PIX_D_PER_WORK; + + bot_blk_w = max(bot_blk_w, 1); + bot_blk_h = max(bot_blk_h, 1); + bot_blk_d = max(bot_blk_d, 1); + + for(uint gid = get_global_id(0); gid < total_work; gid += MAX_ACTIV_WORKITEM) + { + int b_id = gid / chal / bot_blk_w / bot_blk_h / bot_blk_d; + int c_id = (gid / bot_blk_w / bot_blk_h / bot_blk_d) % chal; + + int bot_d_id = ((gid / bot_blk_w / bot_blk_h) % bot_blk_d) * PIX_D_PER_WORK; + int bot_h_id = ((gid / bot_blk_w) % bot_blk_h) * PIX_H_PER_WORK; + int bot_w_id = (gid % bot_blk_w) * PIX_W_PER_WORK; + + int top_d_start = + bot_d_id + pad_d < KERNEL_SZ_D ? 0 : (bot_d_id + pad_d - KERNEL_SZ_D) / STRIDE_D + 1; + int top_h_start = + bot_h_id + pad_h < KERNEL_SZ_H ? 0 : (bot_h_id + pad_h - KERNEL_SZ_H) / STRIDE_H + 1; + int top_w_start = + bot_w_id + pad_w < KERNEL_SZ_W ? 
0 : (bot_w_id + pad_w - KERNEL_SZ_W) / STRIDE_W + 1; + + int top_d_end = (bot_d_id + PIX_D_PER_WORK - 1 + pad_d) / STRIDE_D + 1; + int top_h_end = (bot_h_id + PIX_H_PER_WORK - 1 + pad_h) / STRIDE_H + 1; + int top_w_end = (bot_w_id + PIX_W_PER_WORK - 1 + pad_w) / STRIDE_W + 1; + + top_d_end = min(top_d_end, (int)top_d); + top_h_end = min(top_h_end, (int)top_h); + top_w_end = min(top_w_end, (int)top_w); + + _FLOAT_ACCUM bot_data[PIX_D_PER_WORK][PIX_H_PER_WORK][PIX_W_PER_WORK] = {0}; + + for(int h = top_d_start; h < top_d_end; ++h) + { + int dstart = h * STRIDE_D - pad_d; + int dend = min((dstart + KERNEL_SZ_D), (int)bot_d); + dstart = max(dstart, 0); + + for(int j = top_h_start; j < top_h_end; ++j) + { + int hstart = j * STRIDE_H - pad_h; + int hend = min((hstart + KERNEL_SZ_H), (int)bot_h); + hstart = max(hstart, 0); + + for(int i = top_w_start; i < top_w_end; ++i) + { + int wstart = i * STRIDE_W - pad_w; + int wend = min((wstart + KERNEL_SZ_W), (int)bot_w); + wstart = max(wstart, 0); + + uint pool_size = +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE + KERNEL_SZ_W * KERNEL_SZ_H * KERNEL_SZ_D; +#else + (dend - dstart) * (hend - hstart) * (wend - wstart); +#endif + pool_size = (pool_size == 0) ? 1 : pool_size; + + uint top_gbl_off = + b_id * top_str_b + c_id * top_str_c + h * top_str_d + j * top_str_h + i; + _FLOAT_ACCUM add_val = + b_id < batch ? 
CVT_FLOAT2ACCUM(top_df[top_gbl_off]) : CVT_FP32_2ACCUM(0.0f); + add_val /= CVT_INTEGRAL2ACCUM(pool_size); + + for(int m = dstart; m < dend; ++m) + { + for(int k = hstart; k < hend; ++k) + { + for(int l = wstart; l < wend; ++l) + { + if(m >= bot_d_id && m < PIX_D_PER_WORK + bot_d_id && + k >= bot_h_id && k < PIX_H_PER_WORK + bot_h_id && + l >= bot_w_id && l < PIX_W_PER_WORK + bot_w_id && b_id < batch) + { + bot_data[m - bot_d_id][k - bot_h_id][l - bot_w_id] += add_val; + } + } + } + } + } + } + } + + uint bot_off = b_id * bot_str_b + c_id * bot_str_c + bot_d_id * bot_str_d + + bot_h_id * bot_str_h + bot_w_id; + + for(uint m = 0; m < PIX_D_PER_WORK; m++) + { + for(uint k = 0; k < PIX_H_PER_WORK; k++) + { + for(uint l = 0; l < PIX_W_PER_WORK; l++) + { + + if(bot_d_id + m < bot_d && bot_h_id + k < bot_h && bot_w_id + l < bot_w && + b_id < batch) + { + uint bot_idx = bot_off + m * bot_str_d + k * bot_str_h + l; + + bot_df[bot_idx] = CVT_ACCUM2FLOAT(bot_data[m][k][l]); + } + } + } + } + } +} #endif diff --git a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp new file mode 100644 index 0000000000..c502cf625f --- /dev/null +++ b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp @@ -0,0 +1,241 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include <hip/hip_runtime.h> +#endif + +#include "pooling_functions.h" +#include "poolingNdNhwcArgs.hpp" + +// TODO: add ability to decode network string into pooling descriptor or similar for targeted debugging + +#if(MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE) || (MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE) +#define AVERAGE_OPS 1 +#else +#define AVERAGE_OPS 0 +#endif + +// Let's use extended-precision accumulator only in FP16 pooling and only for averaging. +// For all other ops and datatypes, use native accumulator, i.e. treat FLOAT_ACCUM as FLOAT.
+#if !(AVERAGE_OPS && MIOPEN_USE_FP16) +#define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 1 +// #else +// #define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 0 +#endif + +#include "float_types.h" +#include "miopen_cstdint.hpp" + +// This implementation is extremely memory-bound, so float type is used for all calculations +#define _FLOAT float +#define _FLOAT_ACCUM float + +#if MIOPEN_USE_INT8 == 1 + #if !AVERAGE_OPS + #ifndef FLT_MAX + #define MAX_VAL 127 /* max value */ + #else + #define MAX_VAL FLT_MAX + #endif + #endif +#endif +#if MIOPEN_USE_BFP16 + #define NATIVE_CAST(_x) (_FLOAT)bfloat16_to_float(_x) + #define NATIVE_UNCAST(_x) (_FLOAT)float_to_bfloat16(_x) +#else + #define NATIVE_CAST(_x) (_FLOAT)(_x) + #define NATIVE_UNCAST(_x) (_FLOAT)(_x) +#endif + +#if AVERAGE_OPS +#define ARG_UNUSED_FOR_AVERAGE __attribute__((__unused__)) +#else +#define ARG_UNUSED_FOR_AVERAGE +#endif + +// Out N, D, H are encoded into the block indices x, y, z +// No 2D-only optimization. +template +__device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, + TI* __restrict__ top_ptr, + ARG_UNUSED_FOR_AVERAGE index_t* __restrict__ mask_ptr, + ARG_UNUSED_FOR_AVERAGE int save_index, + ARG_UNUSED_FOR_AVERAGE int index_mode, + poolingNdNhwcArgs args +) +{ + // naming: caps=count, lowercase=index, _ + const uint32_t nd = blockIdx.x; + const uint32_t h_ = blockIdx.y; + const uint32_t w_c = blockIdx.z; + const uint32_t w_ = w_c % args.top_w; // CAN w=fast index + + const uint32_t C_WH = blockDim.x; + const uint32_t _H = blockDim.y; + const uint32_t _W = blockDim.z; + + const uint32_t c = threadIdx.x; + const uint32_t _h = threadIdx.y; + const uint32_t _w = threadIdx.z; + + const uint32_t nn = nd / args.top_d; // n=slow index + const uint32_t cc = (w_c / args.top_w) * C_WH + c; // c=slow index (lg-C) + const uint32_t td = nd % args.top_d; // top d=fast index + const uint32_t th = h_ * _H + _h; // top h: blockIdx is slow (sm-C) + const uint32_t tw = w_ * _W + _w; // top w: blockIdx is slow (sm-C) + 
+ if(nn >= args.all_n) return; + if(td >= args.top_d) return; + if(th >= args.top_h) return; + if(tw >= args.top_w) return; + if(cc >= args.all_c) return; + + const auto int_dstart = static_cast(td * args.filter_d_stride) - static_cast(args.filter_d_pad); + /* const */ auto dend = static_cast(min(int_dstart + static_cast(args.filter_d), static_cast(args.bot_d))); + const auto dstart = static_cast(max(int_dstart, 0)); + + const auto int_hstart = static_cast(th * args.filter_h_stride) - static_cast(args.filter_h_pad); + /* const */ auto hend = static_cast(min(int_hstart + static_cast(args.filter_h), static_cast(args.bot_h))); + const auto hstart = static_cast(max(int_hstart, 0)); + + const auto int_wstart = static_cast(tw * args.filter_w_stride) - static_cast(args.filter_w_pad); + /* const */ auto wend = static_cast(min(int_wstart + static_cast(args.filter_w), static_cast(args.bot_w))); + const auto wstart = static_cast(max(int_wstart, 0)); + + size_t top_index = + nn * args.top_n_stride + // + cc * args.top_c_stride + // + td * args.top_d_stride + // + th * args.top_h_stride + // + tw * args.top_w_stride; + +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE + uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + pool_size = (pool_size == 0) ? 1 : pool_size; +#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE + const uint32_t pool_size = args.filter_d * args.filter_h * args.filter_w; +#endif + +#if AVERAGE_OPS + _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0); +#else // MAX + _FLOAT_ACCUM res = (_FLOAT_ACCUM)NATIVE_CAST(-MAX_VAL_ACCUM); + bool found = false; // May remain false if bot contains only NaNs/-INFs. 
+ uint32_t d_save = 0; + uint32_t h_save = 0; + uint32_t w_save = 0; + uint32_t saved_index = 0; +#endif + + size_t bot_ncd = static_cast(nn * args.bot_n_stride + cc * args.bot_c_stride + dstart * args.bot_d_stride); + for(size_t bd = dstart; bd < dend; ++bd) + { + size_t bot_ncdh = bot_ncd + hstart * args.bot_h_stride; + for(uint32_t bh = hstart; bh < hend; ++bh) + { + size_t bot_index = bot_ncdh + wstart * args.bot_w_stride; + for(uint32_t bw = wstart; bw < wend; ++bw) + { +#if AVERAGE_OPS + res += static_cast<_FLOAT_ACCUM>(NATIVE_CAST(bot_ptr[bot_index])); +#else // MAX + auto val = static_cast<_FLOAT_ACCUM>(NATIVE_CAST(bot_ptr[bot_index])); + if(val > res) + { + res = val; + if(save_index) + { + found = true; + d_save = bd; + h_save = bh; + w_save = bw; + saved_index = bot_index; + } + } +#endif + bot_index += args.bot_w_stride; + } + bot_ncdh += args.bot_h_stride; + } + bot_ncd += args.bot_d_stride; + } + +#if AVERAGE_OPS + res /= static_cast<_FLOAT_ACCUM>(pool_size); +#else // MAX + if(save_index) + { + index_t res_index = saved_index; + + /// Preventing overflow during computation of res_index: + /// If Index is shorter than uint, then let's perform computation in 32-bit + /// domain and then convert to narrower Index. That would reduce the probability of + /// overflow. If Index is wider then 32 bits, then it seems like it is better to + /// convert to Index type before multiplication. However this is not actually + /// necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in + /// 32 bits and then convert. 
+ + if(found) + { + if(index_mode == 0) + res_index = (index_t)( // + ((d_save - td * args.filter_d_stride + args.filter_d_pad) * args.filter_h * args.filter_w) + // + ((h_save - th * args.filter_h_stride + args.filter_h_pad) * args.filter_w) + // + (w_save - tw * args.filter_w_stride + args.filter_w_pad) // + ); + } + + const size_t mask_index = nn * args.mask_n_stride // + + cc * args.mask_c_stride // + + (size_t)(td * args.mask_d_stride) // + + (size_t)(th * args.mask_h_stride) // + + (size_t)(tw * args.mask_w_stride); + mask_ptr[mask_index] = res_index; + } +#endif + + top_ptr[top_index] = NATIVE_UNCAST(res); +} + +extern "C" __global__ void mloPoolingForwardNDNhwcNaive( + const INPUT_TYPE* __restrict__ bot_ptr, + INPUT_TYPE* __restrict__ top_ptr, + index_t* __restrict__ mask_ptr, + int save_index, + int index_mode, + poolingNdNhwcArgs args +) +{ + poolingForwardNDNhwcNaive( + bot_ptr, + top_ptr, + mask_ptr, + save_index, + index_mode, + args + ); +} diff --git a/src/kernels/MIOpenPoolingForwardNaive.cl b/src/kernels/MIOpenPoolingForwardNaive.cl index 20e0949967..f839193297 100644 --- a/src/kernels/MIOpenPoolingForwardNaive.cl +++ b/src/kernels/MIOpenPoolingForwardNaive.cl @@ -145,7 +145,7 @@ __kernel void mloPoolingForwardNaive(const __global _FLOAT* bot_ptr, uint h_save = 0; uint w_save = 0; #endif - for(uint d = dstart; d < dend; ++d) + for(size_t d = dstart; d < dend; ++d) { for(uint h = hstart; h < hend; ++h) { @@ -153,7 +153,7 @@ __kernel void mloPoolingForwardNaive(const __global _FLOAT* bot_ptr, { const size_t bot_index = b * bot_n_stride // + o * bot_c_stride // - + (size_t)(d * bot_d_stride) // + + d * bot_d_stride // + (size_t)(h * bot_h_stride) // + (size_t)(w * bot_w_stride); #if AVERAGE_OPS diff --git a/src/kernels/pooling_functions.h b/src/kernels/pooling_functions.h index c4821b10fb..aaaa431f53 100644 --- a/src/kernels/pooling_functions.h +++ b/src/kernels/pooling_functions.h @@ -39,8 +39,8 @@ typedef MLO_POOLING_INDEX_TYPE index_t; #define 
MLO_POOLING_OP_STC 2 #define MLO_POOLING_OP_AVE_INCLUSIVE 3 -#ifndef MLO_POOLING_OP_ID -#define MLO_POOLING_OP_ID 0 +#ifndef MLO_POOLING_OP_ID // TODO TRJS check: default used to be 0 +#define MLO_POOLING_OP_ID 1 #endif #endif // GUARD_POOLING_FUNCTIONS_H diff --git a/src/ocl/pooling_ocl.cpp b/src/ocl/pooling_ocl.cpp index 9881c1596f..6b79def0b0 100644 --- a/src/ocl/pooling_ocl.cpp +++ b/src/ocl/pooling_ocl.cpp @@ -42,6 +42,9 @@ static auto PoolingForwardSolvers() return solver::SolverContainer{}; } @@ -50,6 +53,8 @@ static auto PoolingBackwardSolvers() { return solver::SolverContainer{}; } @@ -89,7 +94,7 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle, auto index_max = get_index_max(GetIndexType()); /// \anchor max_pooling_index_max_restriction - /// For kernel implementation max pooling backward pass, + /// For kernel implementation max pooling forward pass, /// "index_max" means ghost, and thus should not be reached. if(mode == miopenPoolingMax && save_index) { @@ -102,13 +107,13 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle, 1, std::multiplies()))) { - MIOPEN_THROW("Index range not enough for max pooling bwd"); + MIOPEN_THROW("Index range not enough for max pooling fwd"); } if(workSpace == nullptr) { throw std::invalid_argument("workSpace cannot be NULL in Forward Pooling MAX mode when " - "backward pass is requested"); + "forward pass is requested"); } } diff --git a/src/pooling.cpp b/src/pooling.cpp index a65cb3c0ab..e0d9112bd2 100644 --- a/src/pooling.cpp +++ b/src/pooling.cpp @@ -96,6 +96,11 @@ miopenPoolingWorkspaceIndexMode_t PoolingDescriptor::GetWorkspaceIndexMode() con miopenPoolingMode_t PoolingDescriptor::GetMode() const { return mode; } +bool PoolingDescriptor::ModeIsAveraging() const +{ + return mode == miopenPoolingAverage || mode == miopenPoolingAverageInclusive; +} + miopenPaddingMode_t PoolingDescriptor::GetPaddingMode() const { return (pmode); } const std::vector& PoolingDescriptor::GetLengths() const { return lens; } 
@@ -217,13 +222,14 @@ TensorDescriptor PoolingDescriptor::GetForwardOutputTensor(const TensorDescripto { std::vector out_dim(xDesc.GetNumDims()); GetForwardOutputDimNd(xDesc, xDesc.GetNumDims(), out_dim.data()); + auto layout_str = xDesc.GetLayout_str(); + auto layout = xDesc.GetLayout_t(); + auto lengths_layout = miopen::tensor_layout_get_default(xDesc.GetNumDims()); - const std::string default_layout = tensor_layout_get_default(xDesc.GetNumDims()); - const std::string in_layout = xDesc.GetLayout(default_layout); std::vector out_strides; - tensor_layout_to_strides(out_dim, default_layout, in_layout, out_strides); + tensor_layout_to_strides(out_dim, lengths_layout, layout_str, out_strides); - return {xDesc.GetType(), out_dim, out_strides}; + return {xDesc.GetType(), layout, out_dim, out_strides}; } std::size_t PoolingDescriptor::GetWorkSpaceSize(const TensorDescriptor& yDesc) const diff --git a/src/pooling/problem_description.cpp b/src/pooling/problem_description.cpp index 8e171a4ac0..df21b8cd93 100644 --- a/src/pooling/problem_description.cpp +++ b/src/pooling/problem_description.cpp @@ -60,6 +60,7 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const : MLO_POOLING_OP_AVE_INCLUSIVE); ss << "m" + std::to_string(pooling_method); + ss << "_dt" << xDesc.GetType(); if(const auto ct = xDesc.GetCastType()) ss << "_dct" << GetDataTypeName(*ct); @@ -83,6 +84,7 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const ss << "_dyd" << get_vect_config(dyDesc.GetLengths()); ss << "_dys" << get_vect_config(dyDesc.GetStrides()); } + ss << "_l" << static_cast(xDesc.GetLayout_t()); return NetworkConfig{ss.str()}; } diff --git a/src/solver.cpp b/src/solver.cpp index e468d38d0a..c9607df252 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -66,6 +66,19 @@ std::ostream& operator<<(std::ostream& os, const KernelInfo& k) return os << "} '" << k.comp_options << '\''; } +void KernelInfo::ConfigureHip(size_t l0, size_t l1, size_t l2, size_t g0, size_t g1, size_t g2) +{ + 
l_wk.clear(); + l_wk.push_back(l0); + l_wk.push_back(l1); + l_wk.push_back(l2); + + g_wk.clear(); + g_wk.push_back(g0 * l0); + g_wk.push_back(g1 * l1); + g_wk.push_back(g2 * l2); +} + std::vector PrecompileKernels(const Handle& h, const std::vector& kernels, bool force_attach_binary) { @@ -561,12 +574,18 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Pooling, pooling::PoolingForward2d{}.SolverDbId()); Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNd{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardCk2d{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardCkNd{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::TransposedPoolingFwd2d{}.SolverDbId()); Register(registry, ++id, Primitive::Pooling, pooling::TransposedPoolingFwdNd{}.SolverDbId()); Register(registry, ++id, Primitive::Pooling, pooling::PoolingBackward2d{}.SolverDbId()); Register(registry, ++id, Primitive::Pooling, pooling::PoolingBackwardNd{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::PoolingBackwardCk2d{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::PoolingBackwardCkNd{}.SolverDbId()); + RegisterWithSolver(registry, ++id, conv::ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC{}, @@ -594,6 +613,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) fusion::ConvCKIgemmFwdBiasActivFused{}.SolverDbId(), miopenConvolutionAlgoImplicitGEMM); Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNaive{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNDNhwcNaive{}.SolverDbId()); RegisterWithSolver(registry, ++id, conv::ConvHipImplicitGemmGroupFwdXdlops{}, diff --git a/src/solver/pooling/backwardCk2d.cpp b/src/solver/pooling/backwardCk2d.cpp new file mode 100644 index 0000000000..d1ee157190 --- /dev/null +++ 
b/src/solver/pooling/backwardCk2d.cpp @@ -0,0 +1,308 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace pooling { + +namespace { + +struct kernel_params +{ + int kernel_size_h; + int kernel_size_w; + int kernel_stride_h; + int kernel_stride_w; + int out_pix_tile0; + int out_pix_tile1; + std::size_t batch_sz; + std::size_t n_inputs; + std::size_t in_height; + std::size_t in_width; + std::size_t grp_tile0; + std::size_t grp_tile1; + + kernel_params(const miopen::pooling::ProblemDescription& problem) + { + const auto& pd = problem.GetPooling(); + + kernel_size_w = pd.lens[1]; + kernel_size_h = pd.lens[0]; + kernel_stride_w = pd.strides[1]; + kernel_stride_h = pd.strides[0]; + + std::tie(batch_sz, n_inputs, in_height, in_width) = + miopen::tien<4>(problem.GetXDesc().GetLengths(), 1); + + out_pix_tile0 = 1; + out_pix_tile1 = 1; + if(pd.GetMode() == miopenPoolingMax) + { + out_pix_tile0 = in_width > 8 && in_width <= 24 ? 4 : 1; + out_pix_tile1 = in_width <= 24 ? 1 : (in_width > 64 && in_width <= 96 ? 4 : 8); + } + + grp_tile0 = 8; + grp_tile1 = 8; + if(pd.GetMode() == miopenPoolingMax) + { + grp_tile0 = in_width <= 8 ? 8 // + : in_width <= 16 ? 4 // + : in_width <= 24 ? 8 // + : in_width <= 32 ? 32 // + : in_width <= 64 ? 8 // + : in_width <= 96 ? 16 // + : in_width <= 128 ? 16 + : 32; + grp_tile1 = in_width <= 8 ? 8 // + : in_width <= 16 ? 16 // + : in_width <= 24 ? 8 // + : in_width <= 32 ? 4 // + : in_width <= 64 ? 8 // + : in_width <= 96 ? 4 // + : in_width <= 128 ? 
16 + : 4; + } + } +}; + +std::size_t sizeof_kernel_FLOAT(const miopen::pooling::ProblemDescription& problem) +{ + const auto datatype = problem.GetXDesc().GetType(); + return get_data_size(datatype); +} + +std::size_t sizeof_kernel_index_t(const miopen::pooling::ProblemDescription& problem) +{ + return get_data_size(problem.GetPooling().GetIndexType()); +} + +inline std::size_t RoundUpToMultiple(std::size_t v, std::size_t m) +{ + assert(m > 0); + return ((v + m - 1) / m) * m; +} + +// Compute amount of local memory required for holding the arrays defined +// in the "mloPoolingAveBwd" and "mloPoolingMaxBwd" kernels. +std::size_t sizeof_local_memory(const miopen::pooling::ProblemDescription& problem) +{ + const kernel_params kp(problem); + + // aliases to ease programming + const auto& MLO_POOLING_KERNEL_SZ0 = kp.kernel_size_w; + const auto& MLO_POOLING_KERNEL_SZ1 = kp.kernel_size_h; + const auto& MLO_POOLBWD_N_HORIZ_OUT_PIX = kp.out_pix_tile0; + const auto& MLO_POOLBWD_N_VERT_OUT_PIX = kp.out_pix_tile1; + const auto& MLO_POOLING_STRIDE0 = kp.kernel_stride_w; + const auto& MLO_POOLING_STRIDE1 = kp.kernel_stride_h; + const auto& MLO_POOLBWD_GROUP_SZ0 = kp.grp_tile0; + const auto& MLO_POOLBWD_GROUP_SZ1 = kp.grp_tile1; + + const auto MLO_POOLBWD_LCL_DATA_WIDTH = + (static_cast(MLO_POOLBWD_GROUP_SZ0) * MLO_POOLBWD_N_HORIZ_OUT_PIX + + MLO_POOLING_KERNEL_SZ0 + MLO_POOLING_STRIDE0 - 2) / + MLO_POOLING_STRIDE0; + const auto MLO_POOLBWD_LCL_DATA_HEIGHT = + (static_cast(MLO_POOLBWD_GROUP_SZ1) * MLO_POOLBWD_N_VERT_OUT_PIX + + MLO_POOLING_KERNEL_SZ1 + MLO_POOLING_STRIDE1 - 2) / + MLO_POOLING_STRIDE1; + + std::size_t rv = 0; + const auto nelem = MLO_POOLBWD_LCL_DATA_WIDTH * MLO_POOLBWD_LCL_DATA_HEIGHT; + if(problem.GetPooling().GetMode() == miopenPoolingMax) + { + const auto sizeof_lcl_top_df = sizeof_kernel_FLOAT(problem) * nelem; + const auto sizeof_lcl_mask = sizeof_kernel_index_t(problem) * nelem; + /// \anchor alignment_of_arrays_in_gpu_memory + /// The total amount of 
memory calculated here is slightly less than the amount calculated + /// by the compiler. As a result, the check here may pass, while then the compiler might + /// refuse to build the kernel. The most likely reason for the difference is padding (due to + /// alignment requirements). We don't know exactly how the compiler takes alignment into + /// account, but what can we do is applying an alignment that imposes a slightly tighter + /// constraints than the compiler. So far, 16-byte (4xDWORD) alignment works well. + rv = RoundUpToMultiple(sizeof_lcl_top_df, 16) + RoundUpToMultiple(sizeof_lcl_mask, 16); + } + else + { + const auto sizeof_lcl_top_diff = sizeof_kernel_FLOAT(problem) * nelem; + rv = RoundUpToMultiple(sizeof_lcl_top_diff, 16); + } + MIOPEN_LOG_T(rv); + return rv; +} + +} // namespace + +bool PoolingBackwardCk2d::IsApplicable(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + return problem.GetDirection() == miopen::pooling::Direction::Backward && + (problem.GetPooling().GetMode() == miopenPoolingMax || + problem.GetPooling().GetMode() == miopenPoolingAverage || + problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) && + problem.GetXDesc().GetNumDims() == 4 && problem.GetXDesc().GetLayout("NCHW") == "NHWC" && + problem.GetYDesc().GetLayout("NCHW") == "NHWC" && + sizeof_local_memory(problem) <= TargetProperties::GetMaxLocalMemorySize(); +} + +ConvSolution +PoolingBackwardCk2d::GetSolution(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const kernel_params kp(problem); + + { + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenPoolingBwd.cl"; +// TODO: backwardCk2d kernel + if(problem.GetPooling().GetMode() == miopenPoolingMax) + { + kernel.kernel_name = "mloPoolingMaxBwd"; + } + else if(problem.GetPooling().GetMode() == miopenPoolingAverage || + problem.GetPooling().GetMode() == 
miopenPoolingAverageInclusive) + { + kernel.kernel_name = "mloPoolingAveBwd"; + } + + const int pooling_method = (problem.GetPooling().GetMode() == miopenPoolingMax) + ? MLO_POOLING_OP_MAX + : ((problem.GetPooling().GetMode() == miopenPoolingAverage) + ? MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE); + + const int g_wk_width = ((kp.in_width + kp.grp_tile0 * kp.out_pix_tile0 - 1) / + (kp.grp_tile0 * kp.out_pix_tile0)); + const int g_wk_height = ((kp.in_height + kp.grp_tile1 * kp.out_pix_tile1 - 1) / + (kp.grp_tile1 * kp.out_pix_tile1)); + + const auto build_params = + KernelBuildParameters{ + {"MLO_POOLING_OP_ID", pooling_method}, + {"MLO_POOLING_KERNEL_SZ1", kp.kernel_size_h}, + {"MLO_POOLING_STRIDE1", kp.kernel_stride_h}, + {"MLO_POOLING_KERNEL_SZ0", kp.kernel_size_w}, + {"MLO_POOLING_STRIDE0", kp.kernel_stride_w}, + {"MLO_POOLBWD_N_HORIZ_OUT_PIX", kp.out_pix_tile0}, + {"MLO_POOLBWD_N_VERT_OUT_PIX", kp.out_pix_tile1}, + {"MLO_POOLBWD_GROUP_SZ0", kp.grp_tile0}, + {"MLO_POOLBWD_GROUP_SZ1", kp.grp_tile1}, + {"MLO_POOLING_INDEX_TYPE", + get_pooling_index_type_name(problem.GetPooling().GetIndexType())}, + {"MLO_POOLING_INDEX_MAX", + get_pooling_index_type_max_name(problem.GetPooling().GetIndexType())}, + {"USE_IMG_INDEX", + problem.GetPooling().GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexImage + ? 
1 + : 0}, + } + << GetDataTypeKBP(problem.GetXDesc().GetType()); + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + + kernel.l_wk = {kp.grp_tile0, kp.grp_tile1, 1}; + kernel.g_wk = { + g_wk_width * kp.grp_tile0, g_wk_height * kp.grp_tile1, kp.n_inputs * kp.batch_sz}; + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + if(params.pooling.GetMode() == miopenPoolingMax) + { + kernel(params.dy, + params.dx, + params.workspace, + static_cast(params.pooling.pads[0]), + static_cast(params.pooling.pads[1]), + static_cast(params.dyDesc.GetLengths()[1]), + static_cast(params.dxDesc.GetLengths()[2]), + static_cast(params.dxDesc.GetLengths()[3]), + static_cast(params.dyDesc.GetLengths()[2]), + static_cast(params.dyDesc.GetLengths()[3]), + static_cast(params.dxDesc.GetStrides()[0]), + static_cast(params.dxDesc.GetStrides()[1]), + static_cast(params.dxDesc.GetStrides()[2]), + static_cast(params.dyDesc.GetStrides()[0]), + static_cast(params.dyDesc.GetStrides()[1]), + static_cast(params.dyDesc.GetStrides()[2])); + } + else + { + kernel(params.dy, + params.dx, + static_cast(params.pooling.pads[0]), + static_cast(params.pooling.pads[1]), + static_cast(params.dyDesc.GetLengths()[1]), + static_cast(params.dxDesc.GetLengths()[2]), + static_cast(params.dxDesc.GetLengths()[3]), + static_cast(params.dyDesc.GetLengths()[2]), + static_cast(params.dyDesc.GetLengths()[3]), + static_cast(params.dxDesc.GetStrides()[0]), + static_cast(params.dxDesc.GetStrides()[1]), + static_cast(params.dxDesc.GetStrides()[2]), + static_cast(params.dyDesc.GetStrides()[0]), + static_cast(params.dyDesc.GetStrides()[1]), + static_cast(params.dyDesc.GetStrides()[2])); + } + }; + }; + + return result; +} + +std::size_t 
+PoolingBackwardCk2d::GetWorkspaceSize(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + if(problem.GetPooling().GetMode() != miopenPoolingMax) + return 0; + return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType()); +} + +} // namespace pooling + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/pooling/backwardCkNd.cpp b/src/solver/pooling/backwardCkNd.cpp new file mode 100644 index 0000000000..b8cdbb0286 --- /dev/null +++ b/src/solver/pooling/backwardCkNd.cpp @@ -0,0 +1,273 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +#include +#include +#include +#include + +#define WORKAROUND_ISSUE_MIFIN_80 1 // https://github.com/ROCm/MIFin/issues/80 + +namespace miopen { + +namespace solver { + +namespace pooling { + +bool PoolingBackwardCkNd::IsApplicable(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + return problem.GetDirection() == miopen::pooling::Direction::Backward // + && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // + && (problem.GetXDesc().GetType() == miopenFloat // + || problem.GetXDesc().GetType() == miopenHalf) // + && (problem.GetPooling().GetMode() == miopenPoolingMax // + || problem.GetPooling().GetMode() == miopenPoolingAverage // + || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) // + && ( // + (problem.GetXDesc().GetNumDims() == 5 // + && problem.GetXDesc().GetLayout("NCDHW") == "NDHWC" // + && problem.GetYDesc().GetLayout("NCDHW") == "NDHWC") // + || // + (problem.GetXDesc().GetNumDims() == 4 // + && problem.GetXDesc().GetLayout("NCHW") == "NHWC" // + && problem.GetYDesc().GetLayout("NCHW") == "NHWC") // + ) // + /// \todo This solver does not support workspace index mask mode yet. 
+ && !(problem.GetPooling().GetMode() == miopenPoolingMax // + && problem.GetPooling().GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexMask); +} + +ConvSolution +PoolingBackwardCkNd::GetSolution(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenPoolingBwdND.cl"; + kernel.kernel_name = "mloPoolingND"; +// TODO: backwardCkNd kernel + if(problem.GetPooling().GetMode() == miopenPoolingMax) + { + kernel.kernel_name += "MaxBwd"; + } + else if(problem.GetPooling().GetMode() == miopenPoolingAverage || + problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) + { + kernel.kernel_name += "AveBwd"; + } + + const auto& bot = problem.GetXDesc(); + const auto& top = problem.GetYDesc(); + + std::size_t batch_sz, n_inputs, in_height, in_width; + std::tie(batch_sz, n_inputs, in_height, in_width) = miopen::tien<4>(bot.GetLengths(), 1); + + const int pooling_method = (problem.GetPooling().GetMode() == miopenPoolingMax) + ? MLO_POOLING_OP_MAX + : ((problem.GetPooling().GetMode() == miopenPoolingAverage) + ? MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE); + + int pix_w_per_work = 1; + int pix_h_per_work = 4; + int pix_d_per_work = 2; + + int batch = top.GetLengths()[0]; + int chal = top.GetLengths()[1]; + + const bool is2d = (bot.GetNumDims() == 4); + + int bot_d = is2d ? 
1 : *(bot.GetLengths().rbegin() + 2);
+    int bot_h = *(bot.GetLengths().rbegin() + 1);
+    int bot_w = *(bot.GetLengths().rbegin());
+
+    // One work-item covers a pix_{w,h,d}_per_work tile; round the image up to whole tiles.
+    int pix_blk_w = std::max((bot_w + pix_w_per_work - 1) / pix_w_per_work, 1);
+    int pix_blk_h = std::max((bot_h + pix_h_per_work - 1) / pix_h_per_work, 1);
+    int pix_blk_d = std::max((bot_d + pix_d_per_work - 1) / pix_d_per_work, 1);
+
+    int max_activ_workitem = 65536;
+    int total_work         = batch * chal * pix_blk_w * pix_blk_h * pix_blk_d;
+    int activ_work         = std::min(total_work, max_activ_workitem);
+
+#if WORKAROUND_ISSUE_MIFIN_80
+    const std::size_t wavesize = 64;
+#else
+    // NOTE(review): this branch does not compile as-is -- 'context' is undeclared because
+    // GetSolution's ExecutionContext parameter is unnamed. Name the parameter before
+    // disabling the workaround.
+    const std::size_t wavesize = context.GetStream().GetWavefrontWidth();
+#endif
+    size_t grp_num = (activ_work + wavesize - 1) / wavesize;
+
+    auto strides = problem.GetPooling().strides;
+    auto lens    = problem.GetPooling().lens;
+    auto pads    = problem.GetPooling().pads;
+
+    // Promote 2D parameters to 3D form {D,H,W} with a unit depth dimension.
+    if(is2d)
+    {
+        strides.push_back(strides[1]);
+        strides[1] = strides[0];
+        // NOTE(review): strides[0] still holds the H stride here (unlike lens[0]/pads[0],
+        // which are reset for the unit D dim). Harmless while top_d == 1, but
+        // strides[0] = 1 would be cleaner -- confirm against the kernel's D loop.
+        lens.push_back(lens[1]);
+        lens[1] = lens[0];
+        lens[0] = 1;
+        pads.push_back(pads[1]);
+        pads[1] = pads[0];
+        pads[0] = 0;
+    }
+
+    // Windows overlap whenever any stride is smaller than the window length.
+    bool territory_overlap = false;
+    for(std::size_t i = 0; i < strides.size(); i++)
+        territory_overlap |= (strides[i] < lens[i]);
+
+    const auto build_params =
+        KernelBuildParameters{
+            {"MLO_POOLING_OP_ID", pooling_method},
+            {"MAX_ACTIV_WORKITEM", max_activ_workitem},
+            {"MLO_POOLING_GROUP_SZ0", wavesize},
+            {"MLO_POOLING_GROUP_SZ1", 1},
+            {"MLO_POOLING_GROUP_SZ2", 1},
+            {"PIX_W_PER_WORK", pix_w_per_work},
+            {"PIX_H_PER_WORK", pix_h_per_work},
+            {"PIX_D_PER_WORK", pix_d_per_work},
+            {"KERNEL_SZ_D", lens[0]},
+            {"KERNEL_SZ_H", lens[1]},
+            {"KERNEL_SZ_W", lens[2]},
+            {"STRIDE_D", strides[0]},
+            {"STRIDE_H", strides[1]},
+            {"STRIDE_W", strides[2]},
+            {"TERRITORY_OVERLAP", static_cast<int>(territory_overlap)},
+            {"MLO_POOLING_INDEX_TYPE",
+             get_pooling_index_type_name(problem.GetPooling().GetIndexType())},
+            {"MLO_POOLING_INDEX_MAX",
+             get_pooling_index_type_max_name(problem.GetPooling().GetIndexType())},
+        }
+        // NOTE(review): keyed on dY's type; IsApplicable only checks X/Y types -- confirm dY matches.
+        << GetDataTypeKBP(problem.GetDYDesc().GetType());
+
+    kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{});
+
+    kernel.l_wk = {wavesize, 1, 1};
+    kernel.g_wk = {wavesize * grp_num, 1, 1};
+
+    result.construction_params.push_back(kernel);
+
+    const auto top_d = is2d ? 1 : *(top.GetLengths().rbegin() + 2);
+    const auto top_h = *(top.GetLengths().rbegin() + 1);
+    const auto top_w = *(top.GetLengths().rbegin());
+
+    // N/C/D/H strides in the 3D-kernel view; for 2D the H stride doubles as the
+    // D stride of the unit depth dimension.
+    auto unpackStrides = [is2d](const auto& strides) {
+        return std::make_tuple(strides[0], // N stride
+                               strides[1], // C stride
+                               strides[2], // D stride. Same as H_stride in 3D converted from 2D.
+                               is2d             //
+                                   ? strides[2] // 2D H stride
+                                   : strides[3] // 3D H stride
+        );
+    };
+
+    std::size_t bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride;
+    std::size_t top_n_stride, top_c_stride, top_d_stride, top_h_stride;
+    std::tie(bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride) =
+        unpackStrides(bot.GetStrides());
+    std::tie(top_n_stride, top_c_stride, top_d_stride, top_h_stride) =
+        unpackStrides(top.GetStrides());
+
+    result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::pooling::BwdInvokeParams>();
+
+            // NOTE(review): cast targets below were reconstructed (lost in transit); they
+            // must match the mloPoolingND*Bwd signatures in MIOpenPoolingBwdND.cl.
+            if(params.pooling.GetMode() == miopenPoolingMax)
+            {
+                // Max pooling replays the saved window indices from the workspace.
+                kernel(params.dy,
+                       params.dx,
+                       params.workspace,
+                       static_cast<unsigned>(pads[0]),
+                       static_cast<unsigned>(pads[1]),
+                       static_cast<unsigned>(pads[2]),
+                       static_cast<unsigned>(batch),
+                       static_cast<unsigned>(chal),
+                       static_cast<unsigned>(bot_d),
+                       static_cast<unsigned>(bot_h),
+                       static_cast<unsigned>(bot_w),
+                       static_cast<unsigned>(top_d),
+                       static_cast<unsigned>(top_h),
+                       static_cast<unsigned>(top_w),
+                       static_cast<unsigned>(bot_n_stride),
+                       static_cast<unsigned>(bot_c_stride),
+                       static_cast<unsigned>(bot_d_stride),
+                       static_cast<unsigned>(bot_h_stride),
+                       static_cast<unsigned>(top_n_stride),
+                       static_cast<unsigned>(top_c_stride),
+                       static_cast<unsigned>(top_d_stride),
+                       static_cast<unsigned>(top_h_stride),
+                       static_cast<unsigned>(total_work));
+            }
+            else
+            {
+                // Average modes need no workspace.
+                kernel(params.dy,
+                       params.dx,
+                       static_cast<unsigned>(pads[0]),
static_cast<unsigned>(pads[1]),
+                       static_cast<unsigned>(pads[2]),
+                       static_cast<unsigned>(batch),
+                       static_cast<unsigned>(chal),
+                       static_cast<unsigned>(bot_d),
+                       static_cast<unsigned>(bot_h),
+                       static_cast<unsigned>(bot_w),
+                       static_cast<unsigned>(top_d),
+                       static_cast<unsigned>(top_h),
+                       static_cast<unsigned>(top_w),
+                       static_cast<unsigned>(bot_n_stride),
+                       static_cast<unsigned>(bot_c_stride),
+                       static_cast<unsigned>(bot_d_stride),
+                       static_cast<unsigned>(bot_h_stride),
+                       static_cast<unsigned>(top_n_stride),
+                       static_cast<unsigned>(top_c_stride),
+                       static_cast<unsigned>(top_d_stride),
+                       static_cast<unsigned>(top_h_stride),
+                       static_cast<unsigned>(total_work));
+            }
+        };
+    };
+
+    return result;
+}
+
+/// Max pooling stores one saved index per output element in the workspace;
+/// average modes need no workspace.
+std::size_t
+PoolingBackwardCkNd::GetWorkspaceSize(const ExecutionContext&,
+                                      const miopen::pooling::ProblemDescription& problem) const
+{
+    if(problem.GetPooling().GetMode() != miopenPoolingMax)
+        return 0;
+    return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType());
+}
+
+} // namespace pooling
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/pooling/forward2d.cpp b/src/solver/pooling/forward2d.cpp
index 87fd0e851f..b757c07d54 100644
--- a/src/solver/pooling/forward2d.cpp
+++ b/src/solver/pooling/forward2d.cpp
@@ -135,15 +135,25 @@ std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& pro
 bool PoolingForward2d::IsApplicable(const ExecutionContext& context,
                                     const miopen::pooling::ProblemDescription& problem) const
 {
-    return problem.GetDirection() == miopen::pooling::Direction::Forward &&
-           problem.GetXDesc().GetNumDims() == 4 &&
-           problem.GetXDesc().GetType() == problem.GetYDesc().GetType() &&
-           (problem.GetXDesc().GetType() == miopenFloat ||
-            problem.GetXDesc().GetType() == miopenHalf) &&
-           problem.GetXDesc().GetLayout("NCHW") == "NCHW" &&
-           problem.GetYDesc().GetLayout("NCHW") == "NCHW" &&
+    auto x_type = problem.GetXDesc().GetType();
+    auto y_type = problem.GetYDesc().GetType();
+    std::vector<miopenDataType_t> types{miopenFloat, miopenHalf}; // TODO(TRJS): add miopenInt8, miopenFloat8
+
+    auto x_layout = problem.GetXDesc().GetLayout_str();
+    auto y_layout = problem.GetYDesc().GetLayout_str();
+    std::vector<std::string> layouts{"NCHW"};
+
+    bool app =
+        problem.GetDirection() == miopen::pooling::Direction::Forward &&
+        problem.GetXDesc().GetNumDims() == 4 &&
+        (x_type == y_type) &&                                                        //
+        (x_layout == y_layout) &&                                                    //
+        (std::find(types.cbegin(), types.cend(), x_type) != types.cend()) &&         //
+        (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.cend()) && //
            sizeof_private_memory(problem) <=
                TargetProperties::GetMaxWaveScratchSize() / context.GetStream().GetWavefrontWidth();
+
+    return app;
 }
 
 ConvSolution PoolingForward2d::GetSolution(const ExecutionContext&,
diff --git a/src/solver/pooling/forwardCk2d.cpp b/src/solver/pooling/forwardCk2d.cpp
new file mode 100644
index 0000000000..7ac0c15170
--- /dev/null
+++ b/src/solver/pooling/forwardCk2d.cpp
@@ -0,0 +1,268 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include + +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace pooling { + +namespace { + +struct kernel_params +{ + int kernel_size_h; + int kernel_size_w; + int kernel_stride_h; + int kernel_stride_w; + int out_height; + int out_width; + int out_pix_tile0; + int out_pix_tile1; + + kernel_params(const miopen::pooling::ProblemDescription& p) + { + const auto& pd = p.GetPooling(); + const auto& yd = p.GetYDesc(); + kernel_size_h = pd.lens[0]; + kernel_size_w = pd.lens[1]; + kernel_stride_h = pd.strides[0]; + kernel_stride_w = pd.strides[1]; + out_height = yd.GetLengths()[2]; + out_width = yd.GetLengths()[3]; + out_pix_tile0 = 1; + out_pix_tile1 = out_height <= 8 ? 1 // + : out_height <= 32 ? 
4 // + : 8; + if(out_height > 16 && out_height % 32 > 16) + out_pix_tile1 = std::min(16, std::max(1, prePow2(out_pix_tile1 * kernel_stride_h))); + } +}; + +std::size_t sizeof_kernel_FLOAT(const miopen::pooling::ProblemDescription& problem) +{ + const auto datatype = problem.GetXDesc().GetType(); + return get_data_size(datatype); +} + +std::size_t sizeof_kernel_FLOAT_ACCUM(const miopen::pooling::ProblemDescription& problem) +{ + const auto datatype = problem.GetXDesc().GetType(); + if(datatype == miopenHalf) + return get_data_size(miopenFloat); // mixed precision + return get_data_size(datatype); +} + +inline std::size_t RoundUpToMultiple(std::size_t v, std::size_t m) +{ + assert(m > 0); + return ((v + m - 1) / m) * m; +} + +// Compute amount of private memory required for holding the arrays defined +// in the "mloPoolingG" kernel: +// +// #define MLO_BOT_DATA_SZ0 +// ((MLO_POOLING_N_HORIZ_OUT_PIX - 1) * MLO_POOLING_STRIDE0 + MLO_POOLING_KERNEL_SZ0) +// +// #define MLO_BOT_DATA_SZ1 +// ((MLO_POOLING_N_VERT_OUT_PIX - 1) * MLO_POOLING_STRIDE1 + MLO_POOLING_KERNEL_SZ1) +// +// _FLOAT bot_data[MLO_BOT_DATA_SZ1][MLO_BOT_DATA_SZ0]; +// _FLOAT_ACCUM res[MLO_POOLING_N_VERT_OUT_PIX][MLO_POOLING_N_HORIZ_OUT_PIX]; +// +std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& problem) +{ + const kernel_params kp(problem); + + // aliases to ease programming + const auto& MLO_POOLING_KERNEL_SZ1 = kp.kernel_size_h; + const auto& MLO_POOLING_STRIDE1 = kp.kernel_stride_h; + const auto& MLO_POOLING_KERNEL_SZ0 = kp.kernel_size_w; + const auto& MLO_POOLING_STRIDE0 = kp.kernel_stride_w; + const auto& MLO_POOLING_N_HORIZ_OUT_PIX = kp.out_pix_tile0; + const auto& MLO_POOLING_N_VERT_OUT_PIX = kp.out_pix_tile1; + + const auto MLO_BOT_DATA_SZ0 = + (static_cast(MLO_POOLING_N_HORIZ_OUT_PIX) - 1) * MLO_POOLING_STRIDE0 + + MLO_POOLING_KERNEL_SZ0; + const auto MLO_BOT_DATA_SZ1 = + (static_cast(MLO_POOLING_N_VERT_OUT_PIX) - 1) * MLO_POOLING_STRIDE1 + + MLO_POOLING_KERNEL_SZ1; + 
+ const auto sizeof_bot_data = sizeof_kernel_FLOAT(problem) * MLO_BOT_DATA_SZ1 * MLO_BOT_DATA_SZ0; + const auto sizeof_res = sizeof_kernel_FLOAT_ACCUM(problem) * MLO_POOLING_N_VERT_OUT_PIX * + MLO_POOLING_N_HORIZ_OUT_PIX; + + MIOPEN_LOG_T("sizeof_bot_data " << sizeof_bot_data << "sizeof_res" << sizeof_res); + + /// \ref alignment_of_arrays_in_gpu_memory + return RoundUpToMultiple(sizeof_bot_data, 16) + RoundUpToMultiple(sizeof_res, 16); +} + +} // namespace + +bool PoolingForwardCk2d::IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const +{ + return false; // TRJS not active yet + return problem.GetDirection() == miopen::pooling::Direction::Forward && + problem.GetXDesc().GetNumDims() == 4 && + problem.GetXDesc().GetType() == problem.GetYDesc().GetType() && + (problem.GetXDesc().GetType() == miopenFloat || + problem.GetXDesc().GetType() == miopenHalf) && + problem.GetXDesc().GetLayout("NCHW") == "NHWC" && + problem.GetYDesc().GetLayout("NCHW") == "NHWC" && + sizeof_private_memory(problem) <= + TargetProperties::GetMaxWaveScratchSize() / context.GetStream().GetWavefrontWidth(); +} + +ConvSolution PoolingForwardCk2d::GetSolution(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + { + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenPooling.cl"; + kernel.kernel_name = "mloPoolingG"; +// TODO: forwardCk2d kernel + const kernel_params kp(problem); + + int batch_sz, n_outputs; + std::tie(batch_sz, n_outputs, std::ignore, std::ignore) = + miopen::tien<4>(problem.GetYDesc().GetLengths(), 1); + + const auto& pool_d = problem.GetPooling(); + const auto wsp_index = pool_d.GetWorkspaceIndexMode(); + + int grp_tile0 = kp.out_width <= 8 ? 8 : (kp.out_width % 32 <= 16 ? 16 : 32); + int grp_tile1 = kp.out_height <= 8 ? 8 + : kp.out_height < 16 ? 16 + : kp.out_height <= 32 ? 32 + : kp.out_height <= 64 ? 
64 + : 128; + grp_tile1 /= kp.out_pix_tile1; + while(grp_tile0 * grp_tile1 > 256 && grp_tile0 > 1) + grp_tile0 >>= 1; + + int pooling_method = + (pool_d.GetMode() == miopenPoolingMax) + ? MLO_POOLING_OP_MAX + : ((pool_d.GetMode() == miopenPoolingAverage) ? MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE); + + auto build_params = KernelBuildParameters{ + {"MLO_POOLING_OP_ID", pooling_method}, + {"MLO_POOLING_KERNEL_SZ1", kp.kernel_size_h}, + {"MLO_POOLING_STRIDE1", kp.kernel_stride_h}, + {"MLO_POOLING_KERNEL_SZ0", kp.kernel_size_w}, + {"MLO_POOLING_STRIDE0", kp.kernel_stride_w}, + {"MLO_POOLING_N_HORIZ_OUT_PIX", kp.out_pix_tile0}, + {"MLO_POOLING_N_VERT_OUT_PIX", kp.out_pix_tile1}, + {"MLO_POOLING_GROUP_SZ0", grp_tile0}, + {"MLO_POOLING_GROUP_SZ1", grp_tile1}, + {"MLO_POOLING_INDEX_TYPE", get_pooling_index_type_name(pool_d.GetIndexType())}, + {"MLO_POOLING_INDEX_MAX", get_pooling_index_type_max_name(pool_d.GetIndexType())}, + }; + + if(problem.SaveIndex()) + { + build_params << KernelBuildParameters{ + {"MLO_POOLING_SAVE_INDEX"}, + {"USE_IMG_INDEX", (wsp_index == miopenPoolingWorkspaceIndexImage ? 
1 : 0)},
+            };
+        }
+
+        build_params << GetDataTypeKBP(problem.GetXDesc().GetType());
+
+        kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{});
+
+        kernel.l_wk.push_back(grp_tile0);
+        kernel.l_wk.push_back(grp_tile1);
+        kernel.l_wk.push_back(1);
+
+        // Grid covers the output in tiles of out_pix_tile0 x out_pix_tile1 pixels per
+        // work-item, with one grid slice per (n, c) pair.
+        int g_wk_width =
+            ((kp.out_width + grp_tile0 * kp.out_pix_tile0 - 1) / (grp_tile0 * kp.out_pix_tile0));
+        int g_wk_height =
+            ((kp.out_height + grp_tile1 * kp.out_pix_tile1 - 1) / (grp_tile1 * kp.out_pix_tile1));
+
+        kernel.g_wk.push_back(static_cast<std::size_t>(g_wk_width) * grp_tile0);
+        kernel.g_wk.push_back(static_cast<std::size_t>(g_wk_height) * grp_tile1);
+        kernel.g_wk.push_back(static_cast<std::size_t>(n_outputs) * batch_sz);
+
+        result.construction_params.push_back(kernel);
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::pooling::FwdInvokeParams>();
+
+            // NOTE(review): cast targets below were reconstructed (lost in transit); they
+            // must match the mloPoolingG signature in MIOpenPooling.cl -- confirm int vs unsigned.
+            kernel(params.x,
+                   params.y,
+                   params.workspace,
+                   static_cast<int>(params.pooling.pads[0]),
+                   static_cast<int>(params.pooling.pads[1]),
+                   static_cast<int>(params.xDesc.GetLengths()[1]),
+                   static_cast<int>(params.xDesc.GetLengths()[2]),
+                   static_cast<int>(params.xDesc.GetLengths()[3]),
+                   static_cast<int>(params.yDesc.GetLengths()[2]),
+                   static_cast<int>(params.yDesc.GetLengths()[3]),
+                   static_cast<int>(params.xDesc.GetStrides()[0]),
+                   static_cast<int>(params.xDesc.GetStrides()[1]),
+                   static_cast<int>(params.xDesc.GetStrides()[2]),
+                   static_cast<int>(params.yDesc.GetStrides()[0]),
+                   static_cast<int>(params.yDesc.GetStrides()[1]),
+                   static_cast<int>(params.yDesc.GetStrides()[2]));
+        };
+    };
+
+    return result;
+}
+
+/// Max pooling stores one saved index per output element in the workspace;
+/// average modes need no workspace.
+std::size_t
+PoolingForwardCk2d::GetWorkspaceSize(const ExecutionContext&,
+                                     const miopen::pooling::ProblemDescription& problem) const
+{
+    if(problem.GetPooling().GetMode() != miopenPoolingMax)
+        return 0;
+    return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType());
+}
+
+} // namespace pooling
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/pooling/forwardCkNd.cpp b/src/solver/pooling/forwardCkNd.cpp new file mode 100644 index 0000000000..a4f2f781c3 --- /dev/null +++ b/src/solver/pooling/forwardCkNd.cpp @@ -0,0 +1,264 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace pooling { + +namespace { + +constexpr int top_w_per_work = 1; +constexpr int top_h_per_work = 4; +constexpr int top_d_per_work = 2; + +struct kernel_params +{ + uint32_t stride_d; + uint32_t stride_h; + uint32_t stride_w; + uint32_t kernel_sz_d; + uint32_t kernel_sz_h; + uint32_t kernel_sz_w; + + kernel_params(const miopen::pooling::ProblemDescription& p) + { + const auto& pd = p.GetPooling(); + stride_d = pd.strides[0]; + stride_h = pd.strides[1]; + stride_w = pd.strides[2]; + kernel_sz_d = pd.lens[0]; + kernel_sz_h = pd.lens[1]; + kernel_sz_w = pd.lens[2]; + } +}; + +std::size_t sizeof_kernel_FLOAT(const miopen::pooling::ProblemDescription& problem) +{ + const auto datatype = problem.GetXDesc().GetType(); + return get_data_size(datatype); +} + +inline std::size_t RoundUpToMultiple(std::size_t v, std::size_t m) +{ + assert(m > 0); + return ((v + m - 1) / m) * m; +} + +// Compute amount of private memory required for holding the arrays defined +// in the "mloPoolingNDFwd" kernel: +// +// #define BOT_TILE_W ((TOP_W_PER_WORK - 1) * STRIDE_W + KERNEL_SZ_W) +// #define BOT_TILE_H ((TOP_H_PER_WORK - 1) * STRIDE_H + KERNEL_SZ_H) +// #define BOT_TILE_D ((TOP_D_PER_WORK - 1) * STRIDE_D + KERNEL_SZ_D) +// +// _FLOAT bot_data[BOT_TILE_D][BOT_TILE_H][BOT_TILE_W]; +// +std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& problem) +{ + const kernel_params kp(problem); + + const std::size_t bot_tile_w = ((top_w_per_work - 1) * kp.stride_w + kp.kernel_sz_w); + const std::size_t bot_tile_h = ((top_h_per_work - 1) * kp.stride_h + kp.kernel_sz_h); + const std::size_t bot_tile_d = ((top_d_per_work - 1) * kp.stride_d + kp.kernel_sz_d); + + const auto sizeof_bot_data = + sizeof_kernel_FLOAT(problem) * bot_tile_d * bot_tile_h * bot_tile_w; + 
MIOPEN_LOG_T("sizeof_bot_data " << sizeof_bot_data); + + /// \ref alignment_of_arrays_in_gpu_memory + return RoundUpToMultiple(sizeof_bot_data, 16); +} + +} // namespace + +bool PoolingForwardCkNd::IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const +{ + + return problem.GetDirection() == miopen::pooling::Direction::Forward // + && problem.GetXDesc().GetNumDims() == 5 // + && problem.GetXDesc().GetLayout("NCDHW") == "NDHWC" // + && problem.GetYDesc().GetLayout("NCDHW") == "NDHWC" // + && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // + && (problem.GetXDesc().GetType() == miopenFloat // + || problem.GetXDesc().GetType() == miopenHalf) // + && (problem.GetPooling().GetMode() == miopenPoolingMax // + || problem.GetPooling().GetMode() == miopenPoolingAverage // + || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) // + && sizeof_private_memory(problem) <= TargetProperties::GetMaxWaveScratchSize() // + / context.GetStream().GetWavefrontWidth() // + /// \todo This solver does not support workspace index mask mode yet. 
+ && + !(problem.GetPooling().GetMode() == miopenPoolingMax // + && problem.GetPooling().GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexMask // + && problem.SaveIndex() == true); +} + +ConvSolution PoolingForwardCkNd::GetSolution(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const int batch = problem.GetXDesc().GetLengths()[0]; + const int chal = problem.GetXDesc().GetLengths()[1]; + + const kernel_params kp(problem); + + const int top_d = *(problem.GetYDesc().GetLengths().rbegin() + 2); + const int top_h = *(problem.GetYDesc().GetLengths().rbegin() + 1); + const int top_w = *(problem.GetYDesc().GetLengths().rbegin()); + + const int top_blk_w = std::max((top_w + top_w_per_work - 1) / top_w_per_work, 1); + const int top_blk_h = std::max((top_h + top_h_per_work - 1) / top_h_per_work, 1); + const int top_blk_d = std::max((top_d + top_d_per_work - 1) / top_d_per_work, 1); + + const int max_activ_workitem = 65536; + const int total_work = batch * chal * top_blk_w * top_blk_h * top_blk_d; + const int activ_work = std::min(total_work, max_activ_workitem); + + { + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenPoolingND.cl"; + kernel.kernel_name = "mloPoolingNDFwd"; +// TODO: forwardCkNd kernel + int pooling_method = (problem.GetPooling().mode == miopenPoolingMax) + ? MLO_POOLING_OP_MAX + : ((problem.GetPooling().mode == miopenPoolingAverage) + ? 
MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE); + + const size_t lcl_work = 64; + const size_t grp_num = (activ_work + lcl_work - 1) / lcl_work; + + auto build_params = KernelBuildParameters{ + {"MLO_POOLING_OP_ID", static_cast(pooling_method)}, + {"MAX_ACTIV_WORKITEM", static_cast(max_activ_workitem)}, + {"MLO_POOLING_GROUP_SZ0", static_cast(lcl_work)}, + {"MLO_POOLING_GROUP_SZ1", 1}, + {"MLO_POOLING_GROUP_SZ2", 1}, + {"TOP_W_PER_WORK", top_w_per_work}, + {"TOP_H_PER_WORK", top_h_per_work}, + {"TOP_D_PER_WORK", top_d_per_work}, + {"KERNEL_SZ_D", kp.kernel_sz_d}, + {"KERNEL_SZ_H", kp.kernel_sz_h}, + {"KERNEL_SZ_W", kp.kernel_sz_w}, + {"STRIDE_D", kp.stride_d}, + {"STRIDE_H", kp.stride_h}, + {"STRIDE_W", kp.stride_w}, + {"MLO_POOLING_INDEX_TYPE", + get_pooling_index_type_name(problem.GetPooling().GetIndexType())}, + {"MLO_POOLING_INDEX_MAX", + get_pooling_index_type_max_name(problem.GetPooling().GetIndexType())}, + }; + + if(problem.SaveIndex()) + { + build_params << KernelBuildParameters{ + {"MLO_POOLING_SAVE_INDEX"}, + }; + } + + build_params << GetDataTypeKBP(problem.GetXDesc().GetType()); + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + + kernel.l_wk = {lcl_work, 1, 1}; + kernel.g_wk = {lcl_work * grp_num, 1, 1}; + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + const int batch_ = params.xDesc.GetLengths()[0]; + const int chal_ = params.xDesc.GetLengths()[1]; + + const int top_d_ = *(params.yDesc.GetLengths().rbegin() + 2); + const int top_h_ = *(params.yDesc.GetLengths().rbegin() + 1); + const int top_w_ = *(params.yDesc.GetLengths().rbegin()); + + const int top_blk_w_ = std::max((top_w_ + top_w_per_work - 1) / top_w_per_work, 1); + const int top_blk_h_ = std::max((top_h_ + top_h_per_work - 1) 
/ top_h_per_work, 1); + const int top_blk_d_ = std::max((top_d_ + top_d_per_work - 1) / top_d_per_work, 1); + + const int total_work_ = batch_ * chal_ * top_blk_w_ * top_blk_h_ * top_blk_d_; + + kernel(params.x, + params.y, + params.workspace, + static_cast(params.pooling.pads[0]), + static_cast(params.pooling.pads[1]), + static_cast(params.pooling.pads[2]), + static_cast(batch_), + static_cast(chal_), + static_cast(params.xDesc.GetLengths()[2]), + static_cast(params.xDesc.GetLengths()[3]), + static_cast(params.xDesc.GetLengths()[4]), + static_cast(top_d_), + static_cast(top_h_), + static_cast(top_w_), + static_cast(params.xDesc.GetStrides()[0]), + static_cast(params.xDesc.GetStrides()[1]), + static_cast(params.xDesc.GetStrides()[2]), + static_cast(params.xDesc.GetStrides()[3]), + static_cast(params.yDesc.GetStrides()[0]), + static_cast(params.yDesc.GetStrides()[1]), + static_cast(params.yDesc.GetStrides()[2]), + static_cast(params.yDesc.GetStrides()[3]), + static_cast(total_work_)); + }; + }; + + return result; +} + +std::size_t +PoolingForwardCkNd::GetWorkspaceSize(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + if(problem.GetPooling().GetMode() != miopenPoolingMax) + return 0; + return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType()); +} + +} // namespace pooling + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/pooling/forwardNaive.cpp b/src/solver/pooling/forwardNaive.cpp index 43406cd508..5bc4835ff5 100644 --- a/src/solver/pooling/forwardNaive.cpp +++ b/src/solver/pooling/forwardNaive.cpp @@ -68,22 +68,22 @@ inline uint32_t RoundUpNearestPower2Positive(uint32_t v) bool PoolingForwardNaive::IsApplicable(const ExecutionContext&, const miopen::pooling::ProblemDescription& problem) const { - return problem.GetDirection() == miopen::pooling::Direction::Forward // - && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // - && 
(problem.GetXDesc().GetType() == miopenFloat // - || problem.GetXDesc().GetType() == miopenHalf) // - && (problem.GetPooling().GetMode() == miopenPoolingMax // - || problem.GetPooling().GetMode() == miopenPoolingAverage // - || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) // - && ( // - (problem.GetXDesc().GetNumDims() == 5 // - && problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" // - && problem.GetYDesc().GetLayout("NCDHW") == "NCDHW") // - || // - (problem.GetXDesc().GetNumDims() == 4 // - && problem.GetXDesc().GetLayout("NCHW") == "NCHW" // - && problem.GetYDesc().GetLayout("NCHW") == "NCHW") // - ); + auto x_type = problem.GetXDesc().GetType(); + auto y_type = problem.GetYDesc().GetType(); + std::vector types {miopenFloat, miopenHalf}; + + auto mode = problem.GetPooling().GetMode(); + std::vector modes {miopenPoolingMax, miopenPoolingAverage, miopenPoolingAverageInclusive}; + + auto x_layout = problem.GetXDesc().GetLayout_str(); + auto y_layout = problem.GetYDesc().GetLayout_str(); + std::vector layouts {"NCHW", "NCDHW"}; + + return (problem.GetDirection() == miopen::pooling::Direction::Forward) // + && (x_type == y_type) // + && (std::find(types.cbegin(), types.cend(), x_type) != types.cend()) // + && (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend()) // + && (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()); } ConvSolution @@ -95,6 +95,7 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, const auto bot = problem.GetXDesc(); const auto top = problem.GetYDesc(); const bool is2d = (bot.GetNumDims() == 4); + const bool isTranspose = problem.GetXDesc().GetLayout_str()[1] != 'C'; // TODO TRJS create member func // To compact code: const auto& pooling = problem.GetPooling(); @@ -158,7 +159,8 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, const size_t mask_c_stride = static_cast(mask_d_stride) * top_d; const size_t mask_n_stride = mask_c_stride * all_c; - /// About optimal 
grid size. The simplest way is to map the problem onto grid is 1:1 mapping of + /// About optimal grid size: + /// NC[D]HW: The simplest way is to map the problem onto grid is 1:1 mapping of /// N,C and top.D onto grid dimensions. /// /// However, this would waste 1 dimension of grid for 2D convolutions, i.e. the grid size would @@ -167,6 +169,10 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, /// access memory in a scattered way, which would affect performance again. Current design /// choice is using separate 2D and 3D kernels (via build-time parameter) and N*C*H grid for 2D. /// + /// N[D]HWC: top N, D, H are mapped directly onto grid dimensions + /// + /// + /// /// \anchor naive_pooling_max_grid_size /// * Assumption: Max grid size is >= 2^32-1 (4G-1) i.e. std::max. /// Currently this limitation is valid for both ROCm HIP and OCL runtimes. @@ -195,8 +201,8 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, const auto is2d_kernel = (top_d == 1); // For 2D + optimize for 3D where the 1st dim is 1. const auto g0 = RoundUpNearestPower2Positive(all_n); - const auto g1 = RoundUpNearestPower2Positive(all_c); - const auto g2 = RoundUpNearestPower2Positive(is2d_kernel ? top_h : top_d); + const auto g1 = RoundUpNearestPower2Positive(isTranspose ? top_d : all_c); + const auto g2 = RoundUpNearestPower2Positive(isTranspose || is2d_kernel ? top_h : top_d); auto work_left = wavesize / 1; const auto w0 = (g0 < work_left) ? 
g0 : work_left; @@ -217,6 +223,7 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, {"MLO_POOLING_INDEX_TYPE", get_pooling_index_type_name(index_type)}, {"MLO_POOLING_IS2D_KERNEL", static_cast(is2d_kernel)}, }; + build_params << GetDataTypeKBP(bot.GetType()); kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); @@ -243,44 +250,45 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, decltype(auto) params = raw_params.CastTo(); kernel(params.x, - params.y, - params.workspace, - save_index, - index_mode, - filter_d, - filter_h, - filter_w, - filter_d_stride, - filter_h_stride, - filter_w_stride, - filter_d_pad, - filter_h_pad, - filter_w_pad, - all_n, - all_c, - bot_d, - bot_h, - bot_w, - bot_n_stride, - bot_c_stride, - bot_d_stride, - bot_h_stride, - bot_w_stride, - top_d, - top_h, - top_w, - top_n_stride, - top_c_stride, - top_d_stride, - top_h_stride, - top_w_stride, - mask_n_stride, - mask_c_stride, - mask_d_stride, - mask_h_stride, - mask_w_stride); + params.y, + params.workspace, + save_index, + index_mode, + filter_d, + filter_h, + filter_w, + filter_d_stride, + filter_h_stride, + filter_w_stride, + filter_d_pad, + filter_h_pad, + filter_w_pad, + all_n, + all_c, + bot_d, + bot_h, + bot_w, + bot_n_stride, + bot_c_stride, + bot_d_stride, + bot_h_stride, + bot_w_stride, + top_d, + top_h, + top_w, + top_n_stride, + top_c_stride, + top_d_stride, + top_h_stride, + top_w_stride, + mask_n_stride, + mask_c_stride, + mask_d_stride, + mask_h_stride, + mask_w_stride); }; }; + return result; } diff --git a/src/solver/pooling/forwardNdNhwcNaive.cpp b/src/solver/pooling/forwardNdNhwcNaive.cpp new file mode 100644 index 0000000000..86555d9560 --- /dev/null +++ b/src/solver/pooling/forwardNdNhwcNaive.cpp @@ -0,0 +1,316 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +#include + +#define WORKAROUND_ISSUE_MIFIN_80 1 // https://github.com/ROCm/MIFin/issues/80 + +namespace miopen { + +namespace solver { + +namespace pooling { + +namespace { + +#if !MIOPEN_NDEBUG && !WORKAROUND_ISSUE_MIFIN_80 +template +bool IsPower2(T v) +{ + return (v != 0) && ((v & (v - 1)) == 0); +} +#endif + +} // namespace + +bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + auto x_type = problem.GetXDesc().GetType(); + auto y_type = problem.GetYDesc().GetType(); + std::vector types {miopenFloat, miopenHalf, miopenInt8, miopenFloat8, miopenBFloat16}; // + + auto mode = problem.GetPooling().GetMode(); + std::vector modes {miopenPoolingMax, miopenPoolingAverage, miopenPoolingAverageInclusive}; + + auto x_layout = problem.GetXDesc().GetLayout_str(); + auto y_layout = problem.GetYDesc().GetLayout_str(); + std::vector layouts {"NHWC", "NDHWC"}; + + bool app = (problem.GetDirection() == miopen::pooling::Direction::Forward) // + && (x_type == y_type) // + && (x_layout == y_layout) // + && (std::find(types.cbegin(), types.cend(), x_type) != types.cend()) // + && (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend()) //) + && (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()); + + return app; +} + +ConvSolution +PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + poolingNdNhwcArgs args; + + auto input_dtype = miopen::GetDataType(problem.GetXDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetYDesc().GetType()); + + const auto bot = problem.GetXDesc(); + const auto top = problem.GetYDesc(); + const bool is2d = (bot.GetNumDims() == 4); + const bool is_transpose = 
problem.GetXDesc().GetLayout_str()[1] != 'C'; + if(!is_transpose) + { + MIOPEN_THROW("Tried to run NHWC solver on NCHW data"); + } + + // To compact code: + const auto& pooling = problem.GetPooling(); + const auto& lengths = pooling.GetLengths(); + const auto& strides = pooling.GetStrides(); + const auto& pads = pooling.GetPads(); + + // This also deduces 3D (DHW) parameters from 2D (HW) descriptor. + uint32_t idx = 0; + args.filter_d = is2d ? 1 : lengths[idx++]; + args.filter_h = lengths[idx++]; + args.filter_w = lengths[idx++]; + + idx = 0; + args.filter_d_stride = is2d ? (strides[0]) : strides[idx++]; + args.filter_h_stride = strides[idx++]; + args.filter_w_stride = strides[idx++]; + + idx = 0; + args.filter_d_pad = is2d ? 0 : pads[idx++]; + args.filter_h_pad = pads[idx++]; + args.filter_w_pad = pads[idx++]; + + // TODO RJS move pooling_method to shared code + const int pooling_method = (pooling.GetMode() == miopenPoolingMax) ? MLO_POOLING_OP_MAX + : (pooling.GetMode() == miopenPoolingAverage) + ? MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE; + + const auto save_index = problem.SaveIndex(); + const auto index_mode = pooling.GetWorkspaceIndexMode(); + const auto index_type = pooling.GetIndexType(); + + /// \anchor multiply_dims_overflow_assumption + /// + /// Preventing overflow during dimension-related computations: + /// Let's assume that multiplication of three dims always fits into 32 bits (unsigned). + /// Then let's use size_t when we need to multiply more than three dims. + /// For example, in NCDHW layout, the N and C strides are results of multiplication + /// of >= 3 dims, so we have to use size_t for storing them. + /// + /// We need to pay special attention to muls of D stride with some other dims. + /// The D stride is a result of 2 muls. Therefore (d_stride * dim) does + /// not require widening to size_t prior mul, but (d_stride * dim * dim) + /// requires it because the total number of muls is 4. + + const auto spatial_dim = is2d ? 
2U : 3U; + + std::tie(args.all_n, args.all_c, args.bot_d, args.bot_h, args.bot_w) = miopen::GetNCDHW(spatial_dim, bot.GetLengths()); + + std::tie(args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride) = + miopen::GetNCDHW(spatial_dim, bot.GetStrides()); + + std::tie(std::ignore, std::ignore, args.top_d, args.top_h, args.top_w) = + miopen::GetNCDHW(spatial_dim, top.GetLengths()); + + std::tie(args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride,args. top_w_stride) = + miopen::GetNCDHW(spatial_dim, top.GetStrides()); + + // Mask data is always NCDHW layout + args.mask_w_stride = 1; + args.mask_h_stride = args.mask_w_stride * args.top_w; + args.mask_d_stride = args.mask_h_stride * args.top_h; + args.mask_c_stride = args.mask_d_stride * args.top_d; + args.mask_n_stride = args.mask_c_stride * args.all_c; + + /// About optimal grid size: + /// top D, H, and W are mapped directly onto grid dimensions, except in very small problems + /// when they are combined into workgroup items in an attempt to improve overlapping and coalescense. + /// N seems to be generally small, so we'll multiply it into the 'D' dimension. + /// + /// \anchor naive_pooling_max_grid_size + /// * Assumption: Max grid size is >= 2^32-1 (4G-1) i.e. std::max. + /// However, assume the product of two dimensions is always <= 2^30. + /// Currently this limitation is valid for both ROCm HIP and OCL runtimes. + /// + /// Selecting the optimal workgroup size is an interesting problem. + /// We'll first map N * D to blockIdx.x. H and W are canonically mapped into + /// blockIdx.y and z, respectively. C, being the fastest index, is mapped + /// into threadIdx.x up to the maximum items. For larger C, the remainder are + /// mapped into blockIdx.z. + /// + /// For small C, we favor more waves over more blocks. W/H are mapped into threadIdx.z/y, + /// in that order, fractionally in powers of 2 if possible, up to a maximum + /// of 256 workitems. 
Finally, any remaining W/H are then mapped onto blockIdx.z/y. + /// + /// The workgroup size does not have the restrictions imposed by synchronization between + /// workitems because the kernel does not require synchronization. + + std::ignore = context; + constexpr uint32_t MAX_THREADS = 512; + constexpr uint32_t LARGE_C_MAX_ITEMS = MAX_THREADS; + constexpr uint32_t SMALL_C_TGT_ITEMS = 256; + + auto nd_ = args.all_n * args.top_d; + auto h_ = args.top_h; + auto w_ = args.top_w; + auto c_ = args.all_c; + + // These are hip-style indexes (not OCL) + uint32_t l1 = 1U; + uint32_t l2 = 1U; + + if(c_ > LARGE_C_MAX_ITEMS) + { + auto c2 = (c_ + LARGE_C_MAX_ITEMS - 1) / LARGE_C_MAX_ITEMS; + c_ = LARGE_C_MAX_ITEMS; + w_ *= c2; + } + else if(c_ <= SMALL_C_TGT_ITEMS / 2) // Small C, remap H and W to increase occupancy + { + if(c_ * w_ < SMALL_C_TGT_ITEMS) + { + std::swap(l2, w_); // full w mapped to threads + } + + while(w_ > 2 && ((c_ * l2) < SMALL_C_TGT_ITEMS)) + { + w_ = (w_ + 1) / 2; // partial w mapped to threads (rounddown-safe) + l2 *= 2; + } + + if(c_ * l2 * h_ < SMALL_C_TGT_ITEMS) + { + std::swap(l1, h_); // full h mapped to threads + } + + while(h_ > 2 && ((c_ * l1 * l2) < SMALL_C_TGT_ITEMS)) + { + h_ = (h_ + 1 ) / 2; // partial h mapped to threads (rounddown-safe) + l1 *= 2; + } + } + + const auto g0 = nd_; + const auto g1 = h_; + const auto g2 = w_; + const auto l0 = c_; + + { + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenPoolingForwardNDNhwcNaive.cpp"; + kernel.kernel_name = "mloPoolingForwardNDNhwcNaive"; + + auto build_params = KernelBuildParameters{ + {"MLO_POOLING_OP_ID", pooling_method}, // We need this at compile time in order to + // engage mixed precision only when necessary. + {"MLO_POOLING_INDEX_TYPE", get_pooling_index_type_name(index_type)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype} + }; + + build_params << GetDataTypeKBP(bot.GetType()); + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + // [Informative] The total number of kernels required to cover the whole + // forward pooling problem space is 3*4*2*2*2 = 96. The solver is dynamic. + // * 3: the number of supported operations + // * 4: the number of supported index types + // * 2: the number of supported data types + // * 2: layout (NCHW vs NHWC) + // * 2: 2D and 3D kernels (optimization) + + kernel.ConfigureHip(l0, l1, l2, g0, g1, g2); + // KernelInfo uses OCL-style indexes + // kernel.l_wk.clear(); + // kernel.l_wk.push_back(l0); + // kernel.l_wk.push_back(l1); + // kernel.l_wk.push_back(l2); + // kernel.g_wk.clear(); + // kernel.g_wk.push_back(g0 * l0); + // kernel.g_wk.push_back(g1 * l1); + // kernel.g_wk.push_back(g2 * l2); + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + kernel( + params.x, + params.y, + params.workspace, + save_index, + index_mode, + args + // args.filter_d, args.filter_h, args.filter_w, + // args.filter_d_stride, args.filter_h_stride, args.filter_w_stride, + // args.filter_d_pad, args.filter_h_pad, args.filter_w_pad, + // args.all_n, + // args.all_c, + // args.bot_d, args.bot_h, args.bot_w, + // args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride, + // args.top_d, args.top_h, args.top_w, + // args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride, args.top_w_stride, + // args.mask_n_stride, args.mask_c_stride, args.mask_d_stride, args.mask_h_stride, args.mask_w_stride + ); + }; + }; + + return result; +} + +std::size_t +PoolingForwardNDNhwcNaive::GetWorkspaceSize(const ExecutionContext&, + const 
miopen::pooling::ProblemDescription& problem) const +{ + if(problem.GetPooling().GetMode() != miopenPoolingMax || !problem.SaveIndex()) + return 0; + return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType()); +} + +} // namespace pooling + +} // namespace solver + +} // namespace miopen diff --git a/src/tensor.cpp b/src/tensor.cpp index 7ec4c4e581..f189dcf8da 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -153,6 +153,14 @@ TensorDescriptor::TensorDescriptor(miopenDataType_t t, { } +TensorDescriptor::TensorDescriptor(miopenDataType_t t, + miopenTensorLayout_t layout_in, + const std::vector& lens_in, + const std::vector& strides_in) + : TensorDescriptor(t, layout_in, ConvertLengthsOrThrow(lens_in, "Lengths must be > 0"), ConvertLengthsOrThrow(strides_in, "Strides must be > 0")) +{ +} + TensorDescriptor::TensorDescriptor(miopenDataType_t t, miopenTensorLayout_t layout_in, const std::initializer_list& lens_in) @@ -446,6 +454,11 @@ std::string TensorDescriptor::GetLayoutStr(miopenTensorLayout_t tensorLayout) std::string TensorDescriptor::GetLayout_str() const { return GetLayoutStr(this->tensorLayout); } +bool TensorDescriptor::IsDefaultLayout() const +{ + return IsDefaultLayout(tensorLayout, lens.size() - 2); +} + std::size_t TensorDescriptor::GetVectorLength() const { return this->vector_length; } std::size_t TensorDescriptor::GetIndex(std::initializer_list l) const diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 434fdfe5df..0211790493 100755 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -192,6 +192,8 @@ message(STATUS "MIOPEN_TEST_COMPOSABLEKERNEL ${MIOPEN_TEST_COMPOSABLEKERNEL}") message(STATUS "MIOPEN_TEST_DISCRETE ${MIOPEN_TEST_DISCRETE}") message(STATUS "MIOPEN_TEST_DBSYNC ${MIOPEN_TEST_DBSYNC}") message(STATUS "CODECOV_TEST ${CODECOV_TEST}") +message(STATUS "CMAKE_CTEST_COMMAND ${CMAKE_CTEST_COMMAND}") +message(STATUS "CMAKE_CFG_INTDIR ${CMAKE_CFG_INTDIR}") if(MIOPEN_TEST_DRIVER_ITER_MODE) 
add_definitions(-DMIOPEN_TEST_DRIVER_MODE=2) diff --git a/test/gtest/ex1.cpp b/test/gtest/ex1.cpp new file mode 100644 index 0000000000..324a3055cb --- /dev/null +++ b/test/gtest/ex1.cpp @@ -0,0 +1,15 @@ +#include +// TODO TRJS delete file +struct paramType { std::string value; }; + +class MyFixture : public testing::TestWithParam {}; +class FixtureA : public MyFixture {}; +class FixtureB : public MyFixture {}; + +TEST_P(FixtureA, TestNameA0) { auto& myParam = GetParam(); EXPECT_GT(myParam.value.size(), 0ULL); } +TEST_P(FixtureB, TestNameA0) { auto& myParam = GetParam(); EXPECT_GT(myParam.value.size(), 0ULL); } + +INSTANTIATE_TEST_SUITE_P(PIN0, FixtureA, testing::Values(paramType{"v00"}, paramType{"v01"})); +INSTANTIATE_TEST_SUITE_P(PIN1, FixtureA, testing::Values(paramType{"v10"}, paramType{"v11"}, paramType{"v12"})); +INSTANTIATE_TEST_SUITE_P(PIN2, FixtureB, testing::Values(paramType{"v00"}, paramType{"v11"})); + diff --git a/test/gtest/poolingFwd2dNaive.cpp b/test/gtest/poolingFwd2dNaive.cpp new file mode 100644 index 0000000000..d09c35302a --- /dev/null +++ b/test/gtest/poolingFwd2dNaive.cpp @@ -0,0 +1,256 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#ifndef POOLING_GTEST_BUILD + +#include +#include +#include "get_handle.hpp" +#include "test_env.hpp" + +#include "pooling_testing.hpp" +#include "pooling2d.hpp" + +#include "tensor_holder.hpp" +#include "miopen/tensor_layout.hpp" + +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLAGS_ARGS) + +namespace env = miopen::env; + +namespace { + +template +struct layout_data +{ + static std::vector get_layout_lengths(int n, int c, std::vector& dims) + { + auto ret = std::vector{n, c}; + ret.insert(ret.end(), dims.cbegin(), dims.cend()); + + return ret; + } + + static std::vector + get_strides(std::vector& lens, int dims, miopenTensorLayout_t tensor_layout) + { + std::vector strides; + std::string layout_default = miopen::tensor_layout_get_default(dims + 2); + std::string layout_string = miopen::TensorDescriptor::GetLayoutStr(tensor_layout); + + miopen::tensor_layout_to_strides(lens, layout_default, layout_string, strides); + + constexpr int min_stride_multiplier = 1; + constexpr int max_stride_multiplier = 4; + + auto c = prng::gen_A_to_B(min_stride_multiplier, max_stride_multiplier); + for(auto& v : strides) + { + // cppcheck-suppress useStlAlgorithm + v = v * c; + } + + return strides; + } + + static miopenTensorDescriptor_t init_tensor_descriptor(miopenDataType_t type, + const std::vector& lens, + const std::vector& strides) + { + miopenTensorDescriptor_t 
desc; + + EXPECT_TRUE(miopenCreateTensorDescriptor(&desc) == miopenStatusSuccess); + EXPECT_TRUE( + miopenSetTensorDescriptor(desc, type, lens.size(), lens.data(), strides.data()) == + miopenStatusSuccess); + + return desc; + } + + layout_data(int _n, std::vector _dims, int _c, miopenTensorLayout_t _tensor_layout) + { + auto lens = get_layout_lengths(_n, _c, _dims); + auto strides = get_strides(lens, _dims.size(), _tensor_layout); + descriptor = miopen::TensorDescriptor{miopen_type{}, lens, strides}; + host = tensor{lens, strides}.generate(gen_value); + } + + ~layout_data() {} + + void read_gpu_data(miopen::Handle& handle, const miopen::Allocator::ManageDataPtr& ddata) + { + check = tensor{descriptor.GetLengths(), descriptor.GetStrides()}; + handle.ReadTo(check.data.data(), ddata, check.data.size()); + } + + tensor check{}; + tensor host; + miopen::TensorDescriptor descriptor; +}; + +} + +class Pooling2d : public testing::TestWithParam> {}; +class Pooling2dInt8 : public Pooling2d {}; +class Pooling2dFloat : public Pooling2d {}; +class Pooling2dHalf : public Pooling2d {}; +class Pooling2dBF16 : public Pooling2d {}; +class Pooling2dF8 : public Pooling2d {}; + +void Run2dDriver(miopenDataType_t prec); + +namespace { + +static bool SkipTest(void) { return env::disabled(MIOPEN_TEST_ALL); } + +void GetArgs(const std::string& param, std::vector& tokens) +{ + std::stringstream ss(param); + std::istream_iterator begin(ss); + std::istream_iterator end; + while(begin != end) + tokens.push_back(*begin++); +} + +bool IsTestSupportedForDevice(const miopen::Handle& handle) { return true; } + +std::vector Get2dTestCases(const std::string precision) +{ + const auto& flag_arg = env::value(MIOPEN_TEST_FLAGS_ARGS); + + const std::vector test_cases = { + // clang-format off + {"test_pooling2d " + precision + " --all --dataset 0 --limit 0 " + flag_arg}, + {"test_pooling2d " + precision + " --all --dataset 1 --limit 0 " + flag_arg}, + {"test_pooling2d " + precision + " --all --dataset 2 
--limit 0 " + flag_arg} + // clang-format on + }; + + return test_cases; +} +} // namespace pooling_tests + +TEST_P(Pooling2dInt8, NNT) +{ + if(!IsTestRunWith("--int8")) std::cout << "WOULD SKIP BECAUSE NOT INT8!" << std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--int8") TRJS + GTEST_SKIP(); + + Run2dDriver(miopenInt8); +}; + +TEST_P(Pooling2dFloat, NNT) +{ + if(!IsTestRunWith("--float")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT!" << std::endl; + + if(SkipTest() || !IsTestSupportedForDevice(get_handle())) // && IsTestRunWith("--float") TRJS + GTEST_SKIP(); + + Run2dDriver(miopenFloat); +}; + +TEST_P(Pooling2dHalf, NNT) +{ + if(!IsTestRunWith("--half")) std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--half") TRJS + GTEST_SKIP(); + + Run2dDriver(miopenHalf); +}; + +TEST_P(Pooling2dBF16, NNT) +{ + if(!IsTestRunWith("--bfloat16")) std::cout << "WOULD SKIP BECAUSE NOT BFLOAT16!" << std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--bfloat16") TRJS + GTEST_SKIP(); + + Run2dDriver(miopenBFloat16); +}; + +TEST_P(Pooling2dF8, NNT) +{ + if(!IsTestRunWith("--float8")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT8!" 
<< std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--float8") TRJS + GTEST_SKIP(); + + Run2dDriver(miopenFloat8); +}; + +void Run2dDriver(miopenDataType_t prec) +{ + auto cases = Get2dTestCases("--float"); + + std::vector params; + switch(prec) + { + case miopenFloat: params = Pooling2dFloat_NNT_Test::GetParam(); break; + case miopenHalf: params = Pooling2dHalf_NNT_Test::GetParam(); break; + case miopenBFloat16: params = Pooling2dBF16_NNT_Test::GetParam(); break; + case miopenInt8: params = Pooling2dInt8_NNT_Test::GetParam(); break; + case miopenFloat8: params = Pooling2dF8_NNT_Test::GetParam(); break; + case miopenInt32: + case miopenDouble: + case miopenBFloat8: + case miopenInt64: + FAIL() + << "miopenInt32, miopenDouble, miopenBFloat8, miopenInt64 " + "data types not supported by " + "poolingFwd2dNaive test"; + + default: params = Pooling2dFloat_NNT_Test::GetParam(); + } + + std::cerr << "Params: " << params.size() << std::endl; + + for(const auto& test_value : params) + { + std::vector tokens; + GetArgs(test_value, tokens); + std::vector ptrs; + + std::transform(tokens.begin(), tokens.end(), std::back_inserter(ptrs), [](const auto& str) { + return str.data(); + }); + + testing::internal::CaptureStderr(); + test_drive(ptrs.size(), ptrs.data()); + auto capture = testing::internal::GetCapturedStderr(); + std::cout << capture; + } +} + +INSTANTIATE_TEST_SUITE_P(Full, Pooling2dBF16, testing::Values(Get2dTestCases("--bfloat16"))); +INSTANTIATE_TEST_SUITE_P(Full, Pooling2dInt8, testing::Values(Get2dTestCases("--int8"))); +INSTANTIATE_TEST_SUITE_P(Full, Pooling2dFloat, testing::Values(Get2dTestCases("--float"))); +INSTANTIATE_TEST_SUITE_P(Full, Pooling2dHalf, testing::Values(Get2dTestCases("--half"))); +INSTANTIATE_TEST_SUITE_P(Full, Pooling2dF8, testing::Values(Get2dTestCases("--float8"))); + +#endif diff --git a/test/gtest/poolingFwd3dNaive.cpp b/test/gtest/poolingFwd3dNaive.cpp new file mode 100644 index 
0000000000..d3f8975b15 --- /dev/null +++ b/test/gtest/poolingFwd3dNaive.cpp @@ -0,0 +1,265 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef POOLING_GTEST_BUILD + +#include +#include +#include "get_handle.hpp" +#include "test_env.hpp" + +#include "pooling_testing.hpp" +#include "pooling3d.hpp" + +#include "tensor_holder.hpp" +#include "miopen/tensor_layout.hpp" + +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLAGS_ARGS) + +namespace env = miopen::env; + +namespace { + +template +struct layout_data +{ + static std::vector get_layout_lengths(int n, int c, std::vector& dims) + { + auto ret = std::vector{n, c}; + ret.insert(ret.end(), dims.cbegin(), dims.cend()); + + return ret; + } + + static std::vector + get_strides(std::vector& lens, int dims, miopenTensorLayout_t tensor_layout) + { + std::vector strides; + std::string layout_default = miopen::tensor_layout_get_default(dims + 2); + std::string layout_string = miopen::TensorDescriptor::GetLayoutStr(tensor_layout); + +std::cout << "get_strides: dims=" << dims << " lens=" << lens.size() << " " << layout_default << " " << layout_string << std::endl; + miopen::tensor_layout_to_strides(lens, layout_default, layout_string, strides); + + constexpr int min_stride_multiplier = 1; + constexpr int max_stride_multiplier = 4; + + auto c = prng::gen_A_to_B(min_stride_multiplier, max_stride_multiplier); + for(auto& v : strides) + { + // cppcheck-suppress useStlAlgorithm + v = v * c; + } + + return strides; + } + + static miopenTensorDescriptor_t init_tensor_descriptor(miopenDataType_t type, + const std::vector& lens, + const std::vector& strides) + { + miopenTensorDescriptor_t desc; + + EXPECT_TRUE(miopenCreateTensorDescriptor(&desc) == miopenStatusSuccess); + EXPECT_TRUE( + miopenSetTensorDescriptor(desc, type, lens.size(), lens.data(), strides.data()) == + miopenStatusSuccess); + + return desc; + } + + layout_data(int _n, std::vector _dims, int _c, miopenTensorLayout_t _tensor_layout) + { + auto lens = get_layout_lengths(_n, _c, 
_dims); + auto strides = get_strides(lens, _dims.size(), _tensor_layout); + descriptor = miopen::TensorDescriptor{miopen_type{}, lens, strides}; + host = tensor{lens, strides}.generate(gen_value); + } + + ~layout_data() {} + + void read_gpu_data(miopen::Handle& handle, const miopen::Allocator::ManageDataPtr& ddata) + { + check = tensor{descriptor.GetLengths(), descriptor.GetStrides()}; + handle.ReadTo(check.data.data(), ddata, check.data.size()); + } + + tensor check{}; + tensor host; + miopen::TensorDescriptor descriptor; +}; + +} + +class PoolingFwd3d : public testing::TestWithParam> {}; +class PoolingFwd3dFloat : public PoolingFwd3d {}; +class PoolingFwd3dHalf : public PoolingFwd3d {}; +class PoolingFwd3dBF16 : public PoolingFwd3d {}; +class PoolingFwd3dInt8 : public PoolingFwd3d {}; +class PoolingFwd3dF8 : public PoolingFwd3d {}; + +void Run3dDriver(miopenDataType_t prec); + +namespace { + +static bool SkipTest(void) { return env::disabled(MIOPEN_TEST_ALL); } + +void GetArgs(const std::string& param, std::vector& tokens) +{ + std::stringstream ss(param); + std::istream_iterator begin(ss); + std::istream_iterator end; + while(begin != end) + tokens.push_back(*begin++); +} + +bool IsTestSupportedForDevice(const miopen::Handle& handle) { return true; } + +std::vector Get3dTestCases(const std::string precision) +{ + const auto& flag_arg = env::value(MIOPEN_TEST_FLAGS_ARGS); + + const std::vector test_cases = { + // clang-format off + {"test_pooling3d " + precision + " --all --dataset 0 --limit 0 " + flag_arg}, + {"test_pooling3d " + precision + " --all --dataset 1 --limit 0 " + flag_arg}, + {"test_pooling3d " + precision + " --all --dataset 2 --limit 0 " + flag_arg} + // clang-format on + }; + + return test_cases; +} +} // namespace + +TEST_P(PoolingFwd3dFloat, NNT) // NNT=NdNaiveTranspose +{ + const auto& handle = get_handle(); + if(!IsTestRunWith("--float")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT!"
<< std::endl; + + if(IsTestSupportedForDevice(handle) && !SkipTest()) // && IsTestRunWith("--float") TRJS + { + Run3dDriver(miopenFloat); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(PoolingFwd3dHalf, NNT) +{ + const auto& handle = get_handle(); + if(!IsTestRunWith("--half")) std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl; + + if(IsTestSupportedForDevice(handle) && !SkipTest()) // && IsTestRunWith("--half") TRJS + { + Run3dDriver(miopenHalf); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(PoolingFwd3dBF16, NNT) +{ + if(!IsTestRunWith("--bfloat16")) std::cout << "WOULD SKIP BECAUSE NOT BFLOAT16!" << std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--bfloat16") TRJS + GTEST_SKIP(); + + Run3dDriver(miopenBFloat16); +}; + +TEST_P(PoolingFwd3dInt8, NNT) +{ + if(!IsTestRunWith("--int8")) std::cout << "WOULD SKIP BECAUSE NOT INT8!" << std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--int8") TRJS + GTEST_SKIP(); + + Run3dDriver(miopenInt8); +}; + +TEST_P(PoolingFwd3dF8, NNT) +{ + if(!IsTestRunWith("--float8")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT8!" 
<< std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--float8") TRJS + GTEST_SKIP(); + + Run3dDriver(miopenFloat8); +}; + +void Run3dDriver(miopenDataType_t prec) +{ + auto cases = Get3dTestCases("--float"); + + std::vector params; + switch(prec) + { + case miopenFloat: params = PoolingFwd3dFloat_NNT_Test::GetParam(); break; + case miopenHalf: params = PoolingFwd3dHalf_NNT_Test::GetParam(); break; + case miopenBFloat16: params = PoolingFwd3dBF16_NNT_Test::GetParam(); break; + case miopenInt8: params = PoolingFwd3dInt8_NNT_Test::GetParam(); break; + case miopenFloat8: params = PoolingFwd3dF8_NNT_Test::GetParam(); break; + case miopenInt32: + case miopenDouble: + case miopenBFloat8: + case miopenInt64: + FAIL() + << "miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8, miopenInt64 " + "data types not supported by " + "poolingFwdNdNaive test"; + + default: params = PoolingFwd3dFloat_NNT_Test::GetParam(); + } + + for(const auto& test_value : params) + { + std::vector tokens; + GetArgs(test_value, tokens); + std::vector ptrs; + + std::transform(tokens.begin(), tokens.end(), std::back_inserter(ptrs), [](const auto& str) { + return str.data(); + }); + + testing::internal::CaptureStderr(); + test_drive(ptrs.size(), ptrs.data()); + auto capture = testing::internal::GetCapturedStderr(); + std::cout << capture; + } +} + +INSTANTIATE_TEST_SUITE_P(BF16, PoolingFwd3dBF16, testing::Values(Get3dTestCases("--bfloat16"))); +INSTANTIATE_TEST_SUITE_P(Float, PoolingFwd3dFloat, testing::Values(Get3dTestCases("--float"))); +INSTANTIATE_TEST_SUITE_P(Half, PoolingFwd3dHalf, testing::Values(Get3dTestCases("--half"))); +INSTANTIATE_TEST_SUITE_P(Int8, PoolingFwd3dInt8, testing::Values(Get3dTestCases("--int8"))); +INSTANTIATE_TEST_SUITE_P(F8, PoolingFwd3dF8, testing::Values(Get3dTestCases("--float8"))); + +#endif diff --git a/test/gtest/pooling_testing.hpp b/test/gtest/pooling_testing.hpp new file mode 100644 index 0000000000..8e94032708 --- 
/dev/null +++ b/test/gtest/pooling_testing.hpp @@ -0,0 +1,936 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +// TODO: I've hijacked pooling_common here. This is a temporary workaround until +// all pooling tests have been converted to gtest. This work has not been planned yet. 
+#ifndef GUARD_MIOPEN_TEST_POOLING_COMMON_HPP +#define GUARD_MIOPEN_TEST_POOLING_COMMON_HPP + +#define DATASET "0" + +#include +#include +namespace {using sc = std::chrono::steady_clock;} +#undef tomillis +#define tomillis(__DUR) (0.001 * std::chrono::duration_cast(__DUR).count()) +#undef mstocout +#define mstocout(__TP) std::setw(15) << std::fixed << std::setprecision(3) << tomillis(sc::now() - __TP) +#undef coutms +#define coutms(__TOK, __TP) (std::cout << "ms[" << std::setw(16) << __TOK << "]: " << mstocout(__TP) << std::endl) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../driver.hpp" +#include "../get_handle.hpp" +#include "../tensor_holder.hpp" +#include "../verify.hpp" +#include "../cpu_conv.hpp" +#include "../workspace.hpp" + +#define TEST_PADDING_MODE 0 + +namespace { +int num_all_case = 0; +int num_uint16_case = 0; +int num_uint32_case = 0; +int num_uint32_case_imgidx = 0; +int num_uint64_case = 0; +int num_uint64_case_imgidx = 0; +constexpr int max_typed_cases = 5; +constexpr int MAX_ALL_CASES = 0; +auto __start = sc::now(); + +constexpr int RAND_INTEGER_MAX = 12000; +constexpr int RAND_INTEGER_MIN = -8800; + +template +auto gen_value = + [](auto... is) { return static_cast(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX)) / 100; }; + +auto gen_start = + [](auto... is) { return prng::gen_0_to_B(1ULL << 28); }; +} + +static inline void print(std::ostringstream& oss, const miopen::PoolingDescriptor& filter, bool is_default_layout) +{ + oss << "Pooling: "; + if(filter.GetMode() == miopenPoolingAverage) + oss << "Average"; + else if(filter.GetMode() == miopenPoolingAverageInclusive) + oss << "AverageInclusive"; + else + oss << "Max"; + oss << std::endl; + oss << "Layout: " << (is_default_layout ? 
"default" : "transposed") << std::endl; + oss << "Lengths: "; + miopen::LogRange(oss, filter.GetLengths(), ", ") << std::endl; + oss << "Pads: "; + miopen::LogRange(oss, filter.GetPads(), ", ") << std::endl; + oss << "Strides: "; + miopen::LogRange(oss, filter.GetStrides(), ", ") << std::endl; +} + +template +tensor get_output_tensor(const miopen::PoolingDescriptor& filter, const tensor& input) +{ + return tensor{filter.GetForwardOutputTensor(input.desc)}; +} + +template +tensor get_big_output_tensor(const miopen::PoolingDescriptor& filter, const tensor& input) +{ + auto desc = filter.GetForwardOutputTensor(input.desc); + auto lens = desc.GetLengths(); + if(desc.GetElementSize() > 1024) + lens[0] *= 2; + else + lens[0] *= (2047 / desc.GetElementSize()) + 1 ; + auto dbig = miopen::TensorDescriptor{miopenFloat, input.desc.GetLayout_t(), lens, desc.GetStrides()}; + auto big = tensor{dbig}; + for (auto& v : big) v = -2.2222f; + return big; +} + +template +struct pooling_operators +{ + miopen::PoolingDescriptor filter; + pooling_operators(miopen::PoolingDescriptor f) : filter(f) {} + + double initialize() const + { + if(filter.GetMode() == miopenPoolingMax) + return std::numeric_limits::lowest(); + else + return 0.0; + } + + double operator()(double x, double y) const + { + if(filter.GetMode() == miopenPoolingMax) + { + double m = std::max(x, y); + return (m); + } + else + { + return x + y; + } + } + + double finalize(double x, double y) + { + if(filter.GetMode() == miopenPoolingMax) + return (x); + else + return x / y; + } +}; + +template +struct verify_forward_pooling +{ + template + tensor + cpu(const tensor& input, const miopen::PoolingDescriptor& filter, std::vector&) const + { + auto out = get_output_tensor(filter, input); + + std::array in_dim{}; + std::copy_n(input.desc.GetLengths().begin() + 2, SptDim, in_dim.begin()); + std::array strides{}; + std::copy_n(filter.GetStrides().begin(), SptDim, strides.begin()); + std::array pads{}; + 
std::copy_n(filter.GetPads().begin(), SptDim, pads.begin()); + std::array kers{}; + std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin()); + auto pooler = pooling_operators{filter}; + + int b_n = out.desc.GetLengths()[0]; + int k_n = out.desc.GetLengths()[1]; + std::array out_spatial_len{}; + std::copy_n(out.desc.GetLengths().begin() + 2, SptDim, out_spatial_len.begin()); + + auto par_ford_out = + miopen::unpacker(miopen::prepender(par_ford, b_n, k_n))(out_spatial_len); + + par_ford_out([&](int o, int w, auto... out_spatial_id_pack) { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + std::array start_idx{}; + std::array win_sz{}; + for(int i = 0; i < SptDim; ++i) + { + start_idx[i] = out_spatial_id[i] * strides[i] - pads[i]; + int end_idx = start_idx[i] + kers[i]; + end_idx = std::min(end_idx, in_dim[i]); + start_idx[i] = std::max(start_idx[i], 0); + win_sz[i] = end_idx - start_idx[i]; + win_sz[i] = std::max(win_sz[i], 1); + } + + int pool_size = + filter.GetMode() == miopenPoolingAverageInclusive + ? std::accumulate(kers.begin(), kers.end(), 1, std::multiplies()) + : std::accumulate(win_sz.begin(), win_sz.end(), 1, std::multiplies()); + + double acc = pooler.initialize(); + miopen::unpacker(ford)(win_sz)([&](auto... in_spatial_id_pack) { + auto in_spatial_id = make_array(in_spatial_id_pack...); + std::array idx{}; + idx[0] = o; + idx[1] = w; + + bool in_cmp_idx = true; + for(int i = 0; i < SptDim; ++i) + { + idx[i + 2] = start_idx[i] + in_spatial_id[i]; + in_cmp_idx &= (in_dim[i] > idx[i + 2]); + } + + if(in_cmp_idx) + { + acc = pooler(acc, input(idx)); + } + }); + out(o, w, out_spatial_id_pack...) 
= T(pooler.finalize(acc, pool_size)); + }); + + return out; + } + + template + tensor gpu(const tensor& input, + const miopen::PoolingDescriptor& filter, + std::vector& indices) const + { + auto&& handle = get_handle(); + auto out = get_output_tensor(filter, input); + + indices.resize(out.data.size(), 0); + + auto in_dev = handle.Write(input.data); + auto out_dev = handle.Create(out.GetSize()); + Workspace wspace{}; + wspace.Write(indices); + + float alpha = 1, beta = 0; + filter.Forward(handle, + &alpha, + input.desc, + in_dev.get(), + &beta, + out.desc, + out_dev.get(), + true, + wspace.ptr(), + wspace.size()); + handle.ReadTo(out.data.data(), out_dev, out.GetDataByteSize()); + wspace.ReadTo(indices); + + return out; + } + + template + void fail(float, + const tensor& input, + const miopen::PoolingDescriptor& filter, + const std::vector&) const + { + std::ostringstream oss; + oss << "Forward "; + print(oss, filter, input.desc.IsDefaultLayout()); + oss << "Input tensor: " << input.desc.ToString() << std::endl; + oss << "Output tensor: " << filter.GetForwardOutputTensor(input.desc).ToString() + << std::endl; + GTEST_FAIL() << oss.str(); + } +}; + +template +struct verify_backward_pooling +{ + template + tensor cpu(const tensor& input, + const tensor& dout, + const tensor& out, + const miopen::PoolingDescriptor& filter, + const std::vector& indices, + bool use_global_index, + bool verify_index) const + { + const int sptl_dim_offset = 2; + const int chan_dim_offset = 1; + + auto dinput = input; + return dinput; // TRJS + + std::vector din_vec(input.desc.GetElementSpace(), 0.0); + CHECK(dout.desc == out.desc); + std::array in_dim{}; + std::copy_n(input.desc.GetLengths().begin(), SptDim + 2, in_dim.begin()); + std::array in_str{}; + std::copy_n(input.desc.GetStrides().begin(), SptDim + 2, in_str.begin()); + std::array strides{}; + std::copy_n(filter.GetStrides().begin(), SptDim, strides.begin()); + std::array pads{}; + std::copy_n(filter.GetPads().begin(), SptDim, 
pads.begin()); + std::array kers{}; + std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin()); + auto ford_ker = miopen::unpacker(ford)(kers); + + int out_n = out.desc.GetLengths()[0]; + int out_c = out.desc.GetLengths()[chan_dim_offset]; + std::array out_spatial_len{}; + std::copy_n(out.desc.GetLengths().begin() + sptl_dim_offset, SptDim, out_spatial_len.begin()); + + auto ford_out = miopen::unpacker(ford)(out_spatial_len); + + par_ford(out_n, out_c)([&](int o, int w) { + if(filter.GetMode() == miopenPoolingMax) + { + ford_out([&](auto... out_spatial_id_pack) { + auto mx_idx = indices.at(dout.desc.GetIndex(o, w, out_spatial_id_pack...)); + std::array idx{}; + bool in_cmp_idx = true; + if(use_global_index) + { + for(int i = 0; i < SptDim; i++) + { + std::size_t mx_idx_dim = mx_idx; + mx_idx_dim /= std::accumulate(in_dim.begin() + sptl_dim_offset + i + 1, + in_dim.end(), + 1ULL, + std::multiplies()); + mx_idx_dim %= in_dim[i + sptl_dim_offset]; + idx[i + sptl_dim_offset] = mx_idx_dim; + } + } + else + { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + for(int i = 0; i < SptDim; i++) + { + int mx_idx_dim = mx_idx; + mx_idx_dim /= std::accumulate( + kers.begin() + i + 1, kers.end(), 1, std::multiplies()); + mx_idx_dim %= kers[i]; + + mx_idx_dim += (out_spatial_id[i] * strides[i] - pads[i]); + in_cmp_idx &= (in_dim[i + 2] > mx_idx_dim && mx_idx_dim >= 0); + + idx[i + 2] = std::size_t(mx_idx_dim); + } + } + + if(in_cmp_idx) + { + idx[0] = o; + idx[1] = w; + if(false && verify_index) + { + CHECK( + miopen::float_equal(input(idx), out(o, w, out_spatial_id_pack...))); + } + std::size_t din_idx = 0; + for(int i = 0; i < SptDim + 2; i++) + { + din_idx += idx[i] * in_str[i]; + } + din_vec.at(din_idx) += dout(o, w, out_spatial_id_pack...); + } + }); + } + else + { + ford_out([&](auto... 
out_spatial_id_pack) { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + std::array start_idx{}; + std::array win_sz{}; + for(int i = 0; i < SptDim; ++i) + { + start_idx[i] = out_spatial_id[i] * strides[i] - pads[i]; + int end_idx = start_idx[i] + kers[i]; + end_idx = std::min(end_idx, in_dim[i + 2]); + win_sz[i] = end_idx - std::max(start_idx[i], 0); + win_sz[i] = std::max(win_sz[i], 1); + } + + int pool_size = + filter.GetMode() == miopenPoolingAverageInclusive + ? std::accumulate(kers.begin(), kers.end(), 1, std::multiplies()) + : std::accumulate( + win_sz.begin(), win_sz.end(), 1, std::multiplies()); + + ford_ker([&](auto... ker_id_pack) { + auto ker_id = make_array(ker_id_pack...); + + bool in_cmp_idx = true; + std::array in_idx{}; + in_idx[0] = o; + in_idx[1] = w; + for(int i = 0; i < SptDim; ++i) + { + in_idx[i + 2] = start_idx[i] + ker_id[i]; + in_cmp_idx &= (in_dim[i + 2] > in_idx[i + 2] && in_idx[i + 2] >= 0); + } + + if(in_cmp_idx) + { + std::size_t din_idx = 0; + for(int i = 0; i < SptDim + 2; i++) + { + din_idx += in_idx[i] * in_str[i]; + } + + din_vec.at(din_idx) += + static_cast(dout(o, w, out_spatial_id_pack...)) / pool_size; + } + }); + }); + } + }); + + miopen::unpacker(ford)(in_dim)([&](auto... in_id_pack) { + auto in_id = make_array(in_id_pack...); + std::size_t din_idx = 0; + for(int i = 0; i < SptDim + 2; i++) + { + din_idx += in_id[i] * in_str[i]; + } + dinput(in_id_pack...) 
= din_vec.at(din_idx); + }); + + return dinput; + } + + template + tensor gpu(const tensor& input, + const tensor& dout, + const tensor& out, + const miopen::PoolingDescriptor& filter, + const std::vector& indices, + bool, + bool) const + { + auto&& handle = get_handle(); + auto dinput = input; + return dinput; // TRJS + + auto in_dev = handle.Write(input.data); + auto dout_dev = handle.Write(dout.data); + auto out_dev = handle.Write(out.data); + auto din_dev = handle.Create(dinput.data.size()); + + Workspace wspace{}; + wspace.Write(indices); + + float alpha = 1, beta = 0; + filter.Backward(handle, + &alpha, + // y + out.desc, + out_dev.get(), + // dy + dout.desc, + dout_dev.get(), + // x + input.desc, + in_dev.get(), + &beta, + // dx + dinput.desc, + din_dev.get(), + wspace.ptr()); + + handle.ReadTo(dinput.data.data(), din_dev, dinput.data.size()); + + return dinput; + } + + template + void fail(float, + const tensor& input, + const tensor&, + const tensor& out, + const miopen::PoolingDescriptor& filter, + const std::vector&, + bool, + bool) const + { + std::ostringstream oss; + oss << "Backward "; + print(oss, filter, input.desc.IsDefaultLayout()); + oss << "Input tensor: " << input.desc.ToString() << std::endl; + oss << "Output tensor: " << out.desc.ToString() << std::endl; + GTEST_FAIL() << oss.str(); + } +}; + +template +struct pooling_driver : test_driver +{ + miopen::PoolingDescriptor filter; + std::vector in_shape; + std::vector lens; + std::vector pads; + std::vector strides; + std::string index_type; + std::string mode_str; +#if TEST_PADDING_MODE == 1 + std::string pmode; +#endif + int verify_indices{}; + miopenPoolingWorkspaceIndexMode_t wsidx{}; + miopenTensorLayout_t layout{}; + + static void randomize_tensor(tensor& in) + { + static tensor random_data{{1}}; + static tensor starts{std::vector{1}}; + static size_t start_idx = 0; + + const auto size = in.GetSize(); + const auto ran_size = size > 2 ? 
(3 * size) / 2 : 3; + if (random_data.GetSize() < ran_size) + { + random_data = tensor{{ran_size}}.generate(tensor_elem_gen_integer{2503}); + } + if (starts.GetSize() == 1) // TODO TRJS is there a cleaner way to initialize starts? + { + starts = tensor{std::vector{1 << 20}}.generate(gen_start); + } + + const auto r_start = starts[start_idx++] % (random_data.GetSize() / 3); + if (start_idx >= starts.GetSize()) start_idx = 0; + + std::cout << "randomizing " << std::setw(9) << size << " elems from " << std::setw(9) << r_start << " (" << start_idx << ")" // TRJS + // << "(" << std::setw(8) << prng::gen_0_to_B(size / 2) << std::setw(8) << prng::gen_0_to_B(size / 2) << std::setw(8) << prng::gen_0_to_B(size / 2) << std::setw(8) << prng::gen_0_to_B(size / 2) << ")" + << std::endl; + in.data.assign(random_data.begin() + r_start, random_data.begin() + r_start + size); + } + + std::unordered_map index_type_lookup = { + {miopen::ToUpper("miopenIndexUint8"), miopenIndexUint8}, + {miopen::ToUpper("miopenIndexUint16"), miopenIndexUint16}, + {miopen::ToUpper("miopenIndexUint32"), miopenIndexUint32}, + {miopen::ToUpper("miopenIndexUint64"), miopenIndexUint64}, + }; + std::unordered_map mode_lookup = { + {"MAX", miopenPoolingMax}, + {"MIOPENPOOLINGMAX", miopenPoolingMax}, + {"AVERAGE", miopenPoolingAverage}, + {"MIOPENPOOLINGAVERAGE", miopenPoolingAverage}, + {"AVERAGEINCLUSIVE", miopenPoolingAverageInclusive}, + {"MIOPENPOOLINGAVERAGEINCLUSIVE", miopenPoolingAverageInclusive}, + }; +#if TEST_PADDING_MODE == 1 + std::unordered_map pmode_lookup = { + {"DEFAULT", miopenPaddingDefault}, + {"SAME", miopenPaddingSame}, + {"VALID", miopenPaddingValid}, + }; +#endif + pooling_driver() + { + add(index_type, + "index_type", + // generate_data({"miopenIndexUint32"} // TEMPCODE RJS RUN + generate_multi_data( // + {{"miopenIndexUint32", + "miopenIndexUint8" + , + "miopenIndexUint16", + "miopenIndexUint64" + }, // + {"miopenIndexUint8", "miopenIndexUint32"}, // + {"miopenIndexUint32"}} // + )); 
+ add(mode_str, + "mode_str", + generate_data( + {"miopenPoolingMax", "miopenPoolingAverage", "miopenPoolingAverageInclusive"})); +#if TEST_PADDING_MODE == 1 + add(pmode, "pmode", generate_data({"default", "same", "valid"})); +#endif + add(verify_indices, "verify_indices", generate_data({1})); + } + + template + void run_impl() + { + std::vector indices{}; +auto gst = sc::now(); + auto input = tensor{layout, in_shape}; + randomize_tensor(input); +coutms("gen", gst); +auto vst = sc::now(); + auto out = verify(verify_forward_pooling{}, + input, + filter, + indices); +coutms("verify", vst); + if(!std::is_same::value && !std::is_same::value) return; + + // auto dout = out.first; + // dout.generate(tensor_elem_gen_integer{2503}); + // verify(verify_backward_pooling{}, // TRJS + // input, + // dout, + // out.first, + // filter, + // indices, + // wsidx != 0, + // static_cast(this->verify_indices)); + } + +#define CHECK_SKIP \ +if(skip) \ +{ \ + std::cout << "\nSkipping run # " << std::setw(7) << num_all_case++ << " @ET=" << mstocout(__start) << " : "; \ + show_command(); \ + std::cout << "-- " << oss.str() << std::endl; \ + return; \ +} + +#define SKIP_RUN skip = true; CHECK_SKIP + + void run() + { + const bool is_default_layout = miopen::TensorDescriptor::IsDefaultLayout(layout); + + bool skip = false; + std::ostringstream oss; + + if(MAX_ALL_CASES && num_all_case > MAX_ALL_CASES) + { + skip = true; + oss << " : skipped due to MAX_ALL_CASES=" << MAX_ALL_CASES; + } + if(this->dry_run) + { + skip = true; + oss << " : skipped due to dry_run"; + } + if(is_default_layout && (this->type != miopenFloat && this->type != miopenHalf)) + { + skip = true; + oss << " : skipped, no solvers for datatype " << this->type << " and default layouts"; + } + + CHECK_SKIP; + + int sptl_dim = static_cast(in_shape.size()) - 2; + if(sptl_dim != 2 && sptl_dim != 3) + { + oss << "Warning: Config skipped due to invalid dimensions. 'in_shape' must be in NCHW or NCDHW format." 
<< std::endl; + SKIP_RUN; + } + + // To simplify launching, input dimensions to the driver are always default layout. Desire to + // test non-default layouts is communicated exclusively via 'layout'. + + auto mode = mode_lookup.at(miopen::ToUpper(mode_str)); + + auto pad_mode = miopenPaddingDefault; +#if TEST_PADDING_MODE + pad_mode = pmode_lookup.at(miopen::ToUpper(pmode)); +#endif + + auto idx_typ = index_type_lookup.at(miopen::ToUpper(index_type)); + auto idx_sz = sizeof(uint8_t); + const bool skip_many_configs_with_non_int8_index = + (dataset_id == 0) && !full_set; // Otherwise the default dataset takes too much time. + const bool wide_dataset = (dataset_id == 2) && full_set; + + filter = miopen::PoolingDescriptor + { + mode, + pad_mode, + lens, + strides, + pads + }; + + filter.SetIndexType(idx_typ); + filter.SetWorkspaceIndexMode(miopenPoolingWorkspaceIndexMode_t(wsidx)); + bool mask_idx = filter.GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexMask; + + if(mask_idx && sptl_dim == 3 && filter.GetMode() == miopenPoolingMax) + { + oss << "Warning: Config skipped. Workspace index mask mode is not implemented " + "yet in 3D max pooling solvers." + << std::endl; + SKIP_RUN; + } + + if(mask_idx && sptl_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset) + { + oss << "Warning: Config skipped. Workspace index mask mode is not implemented " + "yet in 2D max backward solvers that support wide pooling window." + << std::endl; + SKIP_RUN; + } + + if(mask_idx && filter.ModeIsAveraging()) + { + oss << "Warning: Config skipped. Workspace index modes are irrelevant for " + "Average pooling. " + "In order to optimize performance of full tests, we " + "skip average pooling configs when (wsidx == 0). " + "Please make sure that dataset includes counterparts with (wsidx == 1)." 
+ << std::endl; + SKIP_RUN; + } + + // index size filter + if(filter.GetMode() == miopenPoolingMax) + { + auto index_max = miopen::get_index_max(filter.GetIndexType()); + auto index_needed = mask_idx ? + std::accumulate(lens.begin(), lens.end(), 1, std::multiplies()) : + std::accumulate(in_shape.begin() + 2, in_shape.end(), 1, std::multiplies()); + + if(index_max <= index_needed) + { + oss << "Warning: Config skipped: index mode " << filter.GetWorkspaceIndexMode() + << " type " << filter.GetIndexType() << " is too small. max=" + << index_max << ", needed=" << index_needed << std::endl; + SKIP_RUN; + } + } + + switch(idx_typ) + { + /// The "index is too small" limitation is an approximation + /// of the real limitation, and therefore applied only when + /// the "full test" is ran. See: + /// \ref max_pooling_index_max_restriction + case miopenIndexUint8: { + if(full_set && (sptl_dim == 3 || (mask_idx && sptl_dim == 2)) && + filter.GetMode() == miopenPoolingMax) + { + oss << "Warning: Config skipped: uint8 index is too small " + "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) " + "&& filter.GetMode() == miopenPoolingMax" + << std::endl; + SKIP_RUN; + } + break; + } + case miopenIndexUint16: { + if(full_set && (sptl_dim == 3 || (!mask_idx && sptl_dim == 2)) && + filter.GetMode() == miopenPoolingMax) + { + oss << "Warning: Config skipped: uint16 index is too small " + "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) " + "&& filter.GetMode() == miopenPoolingMax" + << std::endl; + SKIP_RUN; + } + if(skip_many_configs_with_non_int8_index) + { + // test_pooling_test --all limit uint16 cases + if(num_uint16_case >= max_typed_cases) + { + oss << "Warning: Config skipped for the default dataset to speed " + "up testing (num_uint16_case > 5)" + << std::endl; + SKIP_RUN; + } + ++num_uint16_case; + } + idx_sz = sizeof(uint16_t); + break; + } + case miopenIndexUint32: { + if(skip_many_configs_with_non_int8_index) + { + // test_pooling_test --all limit uint32 cases + 
if(mask_idx) + { + if(num_uint32_case >= max_typed_cases) + { + oss << "Warning: Config skipped for the default dataset to speed up " + "testing (wsidx == 0 && num_uint32_case > 5)" + << std::endl; + SKIP_RUN; + } + ++num_uint32_case; + } + else + { + if(num_uint32_case_imgidx >= max_typed_cases) + { + oss << "Warning: Config skipped for the default dataset to speed up " + "testing (wsidx != 0 && num_uint32_case_imgidx > 5)" + << std::endl; + SKIP_RUN; + } + ++num_uint32_case_imgidx; + } + } + idx_sz = sizeof(uint32_t); + break; + } + case miopenIndexUint64: { + if(skip_many_configs_with_non_int8_index) + { + if(mask_idx) + { + if(num_uint64_case >= max_typed_cases) + { + oss << "Warning: Config skipped for the default dataset to speed up " + "testing (wsidx == 0) && (num_uint64_case > 5)" + << std::endl; + SKIP_RUN; + } + ++num_uint64_case; + } + else + { + if(num_uint64_case_imgidx >= max_typed_cases && sptl_dim == 2) + { + oss << "Warning: Config skipped to speed up testing of the " + "default dataset (wsidx != 0) && (num_uint64_case_imgidx > 5 " + "&& sptl_dim == 2)" + << std::endl; + SKIP_RUN; + } + ++num_uint64_case_imgidx; + } + } + idx_sz = sizeof(uint64_t); + break; + } + } + + auto input_desc = miopen::TensorDescriptor(this->type, layout, in_shape); + + for(int i = 0; i < sptl_dim; i++) + { + if(lens[i] > (input_desc.GetLengths()[i + 2] + static_cast(2) * pads[i])) + { + oss << "Warning: Config skipped because it is invalid " + "(lens[i] > (input_desc.GetLengths()[i + 2] + 2 * pads[i]))" + << std::endl; + SKIP_RUN; + } + } + + if(full_set) + { + auto output_desc = filter.GetForwardOutputTensor(input_desc); + size_t total_mem = + 3 * input_desc.GetNumBytes() + output_desc.GetNumBytes() + + idx_sz * output_desc.GetElementSize(); // estimate based on backward pass + + size_t device_mem = get_handle().GetGlobalMemorySize(); + if(total_mem >= device_mem) + { + oss << "Config skipped because it requires " << total_mem + << " Bytes to write all necessary tensors 
to GPU. GPU has " << device_mem + << " Bytes of memory." << std::endl; + SKIP_RUN; + } + } + + CHECK_SKIP; + + std::cout << "\nRun # " << std::setw(7) << num_all_case++ << " @ET=" << mstocout(__start) << " : "; + show_command(); + + std::vector in_dim(input_desc.GetLengths().begin(), + input_desc.GetLengths().begin() + sptl_dim); + std::vector out_dim(sptl_dim); + std::vector ker_dim(filter.GetLengths().begin(), filter.GetLengths().end()); + + switch(filter.GetIndexType()) + { + case miopenIndexUint8: { + if(sptl_dim == 3) + { + run_impl(); + } + else + { + run_impl(); + } + break; + } + case miopenIndexUint16: { + if(sptl_dim == 3) + { + run_impl(); + } + else + { + run_impl(); + } + break; + } + case miopenIndexUint32: { + if(sptl_dim == 3) + { + run_impl(); + } + else + { + run_impl(); + } + break; + } + case miopenIndexUint64: { + if(sptl_dim == 3) + { + run_impl(); + } + else + { + run_impl(); + } + break; + } + } + } +}; + +#endif diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp index 128e81cce2..1bd44a364b 100644 --- a/test/pooling2d.hpp +++ b/test/pooling2d.hpp @@ -29,16 +29,22 @@ #define WORKAROUND_ISSUE_1670 1 #define TEST_GET_INPUT_TENSOR 0 -template -struct pooling2d_driver : pooling_driver +struct pooling2d_shapes { -private: +public: using U = typename std::vector; - std::vector get_2d_pooling_input_shapes() + + static std::vector get_2d_pooling_input_shapes() { - return {{1, 19, 1024, 2048}, - {10, 3, 32, 32}, + return { {5, 32, 8, 8}, + {16, 1, 4096, 4096}, + {1, 16, 4096, 4096}, + {1, 1024, 512, 512}, + {16, 1024, 128, 128}, + {1, 832, 64, 128}, + {10, 3, 32, 32}, + {1, 19, 1024, 2048}, {2, 1024, 12, 12}, {4, 3, 231, 231}, {8, 3, 227, 227}, @@ -47,61 +53,83 @@ struct pooling2d_driver : pooling_driver {2, 160, 7, 7}, {1, 192, 256, 512}, {2, 192, 28, 28}, - {1, 832, 64, 128}, {1, 256, 56, 56}, {4, 3, 224, 224}, {2, 64, 112, 112}, {2, 608, 4, 4}, - {1, 2048, 11, 11}, - {1, 16, 4096, 4096}}; + {1, 2048, 11, 11} + }; } // Dataset 1 is intended for 
testing of asymmetric configs. - std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 4, 4, 4}}; } + static std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 4, 4, 4}, {10, 3, 32, 32}}; } // Dataset 2 is intended for testing of configs with wide window. - std::vector get_2d_pooling_input_shapes_wide() + static std::vector get_2d_pooling_input_shapes_wide() { return {{1, 3, 255, 255}, {2, 3, 227, 227}, {1, 7, 127, 127}, {1, 1, 410, 400}}; } +}; + +template +struct pooling2d_driver : pooling_driver +{ +private: + using U = typename std::vector; + std::vector get_2d_pooling_input_shapes() + { + return pooling2d_shapes::get_2d_pooling_input_shapes(); + } + + // Dataset 1 is intended for testing of asymmetric configs. + std::vector get_2d_pooling_input_shapes_minimal() { return pooling2d_shapes::get_2d_pooling_input_shapes_minimal(); } + + // Dataset 2 is intended for testing of configs with wide window. + std::vector get_2d_pooling_input_shapes_wide() + { + return pooling2d_shapes::get_2d_pooling_input_shapes_wide(); + } public: pooling2d_driver() : pooling_driver() { + // clang-format off #if TEST_GET_INPUT_TENSOR std::set in_dim_set = get_inputs(this->batch_factor); std::vector in_dim_vec(in_dim_set.begin(), in_dim_set.end()); this->add(this->in_shape, "input", this->generate_data(in_dim_vec, {16, 32, 8, 8})); #else - this->add( - this->in_shape, - "input", - this->template generate_multi_data_limited({get_2d_pooling_input_shapes(), - get_2d_pooling_input_shapes_minimal(), - get_2d_pooling_input_shapes_wide()}, - 9)); + this->add(this->in_shape, "input", this->template generate_multi_data_limited({ + get_2d_pooling_input_shapes(), + get_2d_pooling_input_shapes_minimal(), + get_2d_pooling_input_shapes_wide() + }, 9 + )); #endif - this->add(this->lens, - "lens", - this->template generate_multi_data( - {{{2, 2}, {3, 3}}, // - {{2, 2}, {1, 2}, {2, 1}}, // - {{35, 35}, {100, 100}, {255, 255}, {410, 400}}})); - this->add(this->strides, - "strides", 
- this->template generate_multi_data({{{2, 2}, {1, 1}}, // - {{1, 1}, {2, 1}, {1, 2}, {2, 2}}, // - {{1, 1}}})); - // clang-format off + this->add(this->lens, "lens", this->template generate_multi_data({ + {{2, 2}, {3, 3}}, // + {{2, 2}, {1, 2}, {2, 1}}, // + {{35, 35}, {100, 100}, {255, 255}, {410, 400}} + } + )); + this->add(this->strides, "strides", this->template generate_multi_data({ + {{2, 2}, {1, 1}}, // + {{2, 2}, {2, 1}, {1, 2}}, // + {{1, 1}} + } + )); this->add(this->pads, "pads", this->template generate_multi_data({ - {{0, 0}, {1, 1}}, // + {{0, 0}, {1, 1}}, // #if WORKAROUND_ISSUE_1670 - {{0, 0}}, // + {{0, 0}}, // #else - {{0, 0}, {0, 1}, {1, 0}, {1, 1}}, // + {{0, 0}, {0, 1}, {1, 0}, {1, 1}}, // #endif - {{0, 0}}})); + {{0, 0}} + } + )); // clang-format on - this->add(this->wsidx, "wsidx", this->generate_data({0, 1})); - } + this->add(this->wsidx, "wsidx", this->generate_data({miopenPoolingWorkspaceIndexMask, miopenPoolingWorkspaceIndexImage})); + this->add(this->layout, "layout", this->generate_data({miopenTensorNCHW, miopenTensorNHWC})); + } }; diff --git a/test/pooling3d.cpp b/test/pooling3d.cpp index 90b37d5c75..966b3f5303 100644 --- a/test/pooling3d.cpp +++ b/test/pooling3d.cpp @@ -25,30 +25,6 @@ *******************************************************************************/ #include "pooling_common.hpp" - -template -struct pooling3d_driver : pooling_driver -{ - std::vector> get_3d_pooling_input_shapes() - { - return {{16, 64, 3, 4, 4}, - {16, 32, 4, 9, 9}, - {8, 512, 3, 14, 14}, - {8, 512, 4, 28, 28}, - {16, 64, 56, 56, 56}, - {4, 3, 4, 227, 227}, - {4, 4, 4, 161, 700}}; - } - - pooling3d_driver() : pooling_driver() - { - this->add( - this->in_shape, "input", this->generate_data_limited(get_3d_pooling_input_shapes(), 4)); - this->add(this->lens, "lens", this->generate_data({{2, 2, 2}, {3, 3, 3}})); - this->add(this->strides, "strides", this->generate_data({{2, 2, 2}, {1, 1, 1}})); - this->add(this->pads, "pads", this->generate_data({{0, 0, 0}, 
{1, 1, 1}})); - this->add(this->wsidx, "wsidx", this->generate_data({1})); - } -}; +#include "pooling3d.hpp" int main(int argc, const char* argv[]) { test_drive(argc, argv); } diff --git a/test/pooling3d.hpp b/test/pooling3d.hpp new file mode 100644 index 0000000000..ffaaf4a9ee --- /dev/null +++ b/test/pooling3d.hpp @@ -0,0 +1,65 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "pooling_common.hpp" + +struct pooling3d_shapes +{ +public: + using U = typename std::vector; + + static std::vector get_3d_pooling_input_shapes() + { + return { + {16, 64, 3, 4, 4}, + {16, 32, 4, 9, 9}, + {8, 512, 3, 14, 14}, + {8, 512, 4, 28, 28}, + {16, 64, 56, 56, 56}, + {4, 3, 4, 227, 227}, + {4, 4, 4, 161, 700} + }; + } +}; + +template +struct pooling3d_driver : pooling_driver +{ + std::vector> get_3d_pooling_input_shapes() + { + return pooling3d_shapes::get_3d_pooling_input_shapes(); + } + + pooling3d_driver() : pooling_driver() + { + this->add(this->in_shape, "input", this->generate_data_limited(get_3d_pooling_input_shapes(), 4)); + this->add(this->lens, "lens", this->generate_data({{2, 2, 2}, {3, 3, 3}})); + this->add(this->strides, "strides", this->generate_data({{2, 2, 2}, {1, 1, 1}})); + this->add(this->pads, "pads", this->generate_data({{0, 0, 0}, {1, 1, 1}})); + this->add(this->wsidx, "wsidx", this->generate_data({miopenPoolingWorkspaceIndexImage})); + this->add(this->layout, "layout", this->generate_data({miopenTensorNCDHW, miopenTensorNDHWC})); + } +}; diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp index 231b635a63..c9f787b3ed 100644 --- a/test/pooling_common.hpp +++ b/test/pooling_common.hpp @@ -60,6 +60,16 @@ static int num_uint64_case = 0; // NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) static int num_uint64_case_imgidx = 0; +namespace { + +constexpr int RAND_INTEGER_MAX = 12000; +constexpr int RAND_INTEGER_MIN = -8800; + +template +auto gen_value = + [](auto... 
is) { return static_cast(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX)) / 100; }; +} + static inline void print(const miopen::PoolingDescriptor& filter) { std::cout << "Pooling: "; @@ -84,13 +94,23 @@ tensor get_output_tensor(const miopen::PoolingDescriptor& filter, const tenso return tensor{filter.GetForwardOutputTensor(input.desc)}; } +template +tensor get_big_output_tensor(const miopen::PoolingDescriptor& filter, const tensor& input) +{ + auto desc = filter.GetForwardOutputTensor(input.desc); + auto lens = desc.GetLengths(); + lens[0] *= 10; + auto big = miopen::TensorDescriptor{desc.GetType(), input.desc.GetLayout_t(), lens, desc.GetStrides()}; + return tensor{big}; +} + template struct pooling_operators { miopen::PoolingDescriptor filter; pooling_operators(miopen::PoolingDescriptor f) : filter(f) {} - double start() const + double initialize() const { if(filter.GetMode() == miopenPoolingMax) return std::numeric_limits::lowest(); @@ -111,7 +131,7 @@ struct pooling_operators } } - double final(double x, double y) + double finalize(double x, double y) { if(filter.GetMode() == miopenPoolingMax) return (x); @@ -127,22 +147,25 @@ struct verify_forward_pooling tensor cpu(const tensor& input, const miopen::PoolingDescriptor& filter, std::vector&) const { + const int sptl_dim_offset = 2; + const int chan_dim_offset = 1; + auto out = get_output_tensor(filter, input); std::array in_dim{}; - std::copy_n(input.desc.GetLengths().begin() + 2, SptDim, in_dim.begin()); + std::copy_n(input.desc.GetLengths().begin() + sptl_dim_offset, SptDim, in_dim.begin()); std::array strides{}; std::copy_n(filter.GetStrides().begin(), SptDim, strides.begin()); std::array pads{}; std::copy_n(filter.GetPads().begin(), SptDim, pads.begin()); std::array kers{}; std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin()); - auto op = pooling_operators{filter}; + auto pooler = pooling_operators{filter}; int b_n = out.desc.GetLengths()[0]; - int k_n = out.desc.GetLengths()[1]; + int k_n = 
out.desc.GetLengths()[chan_dim_offset]; std::array out_spatial_len{}; - std::copy_n(out.desc.GetLengths().begin() + 2, SptDim, out_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + sptl_dim_offset, SptDim, out_spatial_len.begin()); auto par_ford_out = miopen::unpacker(miopen::prepender(par_ford, b_n, k_n))(out_spatial_len); @@ -167,27 +190,28 @@ struct verify_forward_pooling ? std::accumulate(kers.begin(), kers.end(), 1, std::multiplies()) : std::accumulate(win_sz.begin(), win_sz.end(), 1, std::multiplies()); - double acc = op.start(); + double acc = pooler.initialize(); miopen::unpacker(ford)(win_sz)([&](auto... in_spatial_id_pack) { auto in_spatial_id = make_array(in_spatial_id_pack...); std::array idx{}; idx[0] = o; - idx[1] = w; + idx[chan_dim_offset] = w; bool in_cmp_idx = true; for(int i = 0; i < SptDim; ++i) { - idx[i + 2] = start_idx[i] + in_spatial_id[i]; - in_cmp_idx &= (in_dim[i] > idx[i + 2]); + idx[i + sptl_dim_offset] = start_idx[i] + in_spatial_id[i]; + in_cmp_idx &= (in_dim[i] > idx[i + sptl_dim_offset]); } if(in_cmp_idx) { - acc = op(acc, input(idx)); + acc = pooler(acc, input(idx)); } }); - out(o, w, out_spatial_id_pack...) = T(op.final(acc, pool_size)); + out(o, w, out_spatial_id_pack...) 
= T(pooler.finalize(acc, pool_size)); }); + return out; } @@ -198,6 +222,7 @@ struct verify_forward_pooling { auto&& handle = get_handle(); auto out = get_output_tensor(filter, input); + indices.resize(out.data.size(), 0); auto in_dev = handle.Write(input.data); @@ -218,7 +243,8 @@ struct verify_forward_pooling wspace.size()); indices = wspace.Read>(); - out.data = handle.Read(out_dev, out.data.size()); + handle.ReadTo(out.data.data(), out_dev, out.data.size() * sizeof(T)); + return out; } @@ -249,6 +275,10 @@ struct verify_backward_pooling bool verify_index) const { auto dinput = input; + + constexpr int sptl_dim_offset = 2; + constexpr int chan_dim_offset = 1; + std::vector din_vec(input.desc.GetElementSpace(), 0.0); CHECK(dout.desc == out.desc); std::array in_dim{}; @@ -264,9 +294,9 @@ struct verify_backward_pooling auto ford_ker = miopen::unpacker(ford)(kers); int out_n = out.desc.GetLengths()[0]; - int out_c = out.desc.GetLengths()[1]; + int out_c = out.desc.GetLengths()[chan_dim_offset]; std::array out_spatial_len{}; - std::copy_n(out.desc.GetLengths().begin() + 2, SptDim, out_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + sptl_dim_offset, SptDim, out_spatial_len.begin()); auto ford_out = miopen::unpacker(ford)(out_spatial_len); par_ford(out_n, out_c)([&](int o, int w) { @@ -281,12 +311,12 @@ struct verify_backward_pooling for(int i = 0; i < SptDim; i++) { std::size_t mx_idx_dim = mx_idx; - mx_idx_dim /= std::accumulate(in_dim.begin() + i + 3, + mx_idx_dim /= std::accumulate(in_dim.begin() + sptl_dim_offset + i + 1, in_dim.end(), 1ULL, std::multiplies()); - mx_idx_dim %= in_dim[i + 2]; - idx[i + 2] = mx_idx_dim; + mx_idx_dim %= in_dim[i + sptl_dim_offset]; + idx[i + sptl_dim_offset] = mx_idx_dim; } } else @@ -462,6 +492,7 @@ struct pooling_driver : test_driver #endif int verify_indices{}; int wsidx{}; + miopenTensorLayout_t layout{}; std::unordered_map index_type_lookup = { {miopen::ToUpper("miopenIndexUint8"), miopenIndexUint8}, 
{miopen::ToUpper("miopenIndexUint16"), miopenIndexUint16}, @@ -505,16 +536,20 @@ struct pooling_driver : test_driver add(verify_indices, "verify_indices", generate_data({1})); } - template + template void run_impl() { std::vector indices{}; - auto input = tensor{in_shape}.generate( - tensor_elem_gen_integer{miopen_type{} == miopenHalf ? 5 : 17}); - auto out = verify(verify_forward_pooling{}, input, filter, indices); + auto input = tensor{layout, in_shape}; + for(auto& v : input.data) v = gen_value(); // TODO RJS use generate + + auto out = verify(verify_forward_pooling{}, + input, + filter, + indices); auto dout = out.first; dout.generate(tensor_elem_gen_integer{2503}); - verify(verify_backward_pooling{}, + verify(verify_backward_pooling{}, input, dout, out.first, @@ -528,7 +563,7 @@ struct pooling_driver : test_driver { auto idx_typ = index_type_lookup.at(miopen::ToUpper(index_type)); auto idx_sz = sizeof(uint8_t); - int spt_dim = in_shape.size() - 2; + int sptl_dim = in_shape.size() - 2; const bool skip_many_configs_with_non_int8_index = (dataset_id == 0) && full_set; // Otherwise the default dataset takes too much time. const bool wide_dataset = (dataset_id == 2) && full_set; @@ -537,17 +572,19 @@ struct pooling_driver : test_driver { mode_lookup.at(miopen::ToUpper(mode)), #if TEST_PADDING_MODE == 1 - pmode_lookup.at(miopen::ToUpper(pmode)), + pmode_lookup.at(miopen::ToUpper(pmode)), #else - miopenPaddingDefault, + miopenPaddingDefault, #endif - lens, strides, pads + lens, + strides, + pads }; filter.SetIndexType(idx_typ); filter.SetWorkspaceIndexMode(miopenPoolingWorkspaceIndexMode_t(wsidx)); - if(wsidx == 0 && spt_dim == 3 && filter.GetMode() == miopenPoolingMax && full_set) + if(wsidx == 0 && sptl_dim == 3 && filter.GetMode() == miopenPoolingMax && full_set) { show_command(); std::cout << "Warning: Config skipped. 
Workspace index mask mode is not implemented " @@ -556,7 +593,7 @@ struct pooling_driver : test_driver return; } - if(wsidx == 0 && spt_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset) + if(wsidx == 0 && sptl_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset) { show_command(); std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented " @@ -587,12 +624,12 @@ struct pooling_driver : test_driver /// the "full test" is ran. See: /// \ref max_pooling_index_max_restriction case miopenIndexUint8: { - if((spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) && full_set && + if((sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) && full_set && filter.GetMode() == miopenPoolingMax) { show_command(); std::cout << "Warning: Config skipped: uint8 index is too small " - "(spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) " + "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) " "&& filter.GetMode() == miopenPoolingMax" << std::endl; return; @@ -600,12 +637,12 @@ struct pooling_driver : test_driver break; } case miopenIndexUint16: { - if((spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) && full_set && + if((sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) && full_set && filter.GetMode() == miopenPoolingMax) { show_command(); std::cout << "Warning: Config skipped: uint16 index is too small " - "(spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) " + "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) " "&& filter.GetMode() == miopenPoolingMax" << std::endl; return; @@ -675,12 +712,12 @@ struct pooling_driver : test_driver } else { - if(num_uint64_case_imgidx > 5 && spt_dim == 2) + if(num_uint64_case_imgidx > 5 && sptl_dim == 2) { show_command(); std::cout << "Warning: Config skipped to speed up testing of the " "default dataset (wsidx != 0) && (num_uint64_case_imgidx > 5 " - "&& spt_dim == 2)" + "&& sptl_dim == 2)" << std::endl; return; } @@ -692,23 +729,23 @@ struct pooling_driver : test_driver } } - auto input_desc = 
miopen::TensorDescriptor(this->type, in_shape); + auto input_desc = miopen::TensorDescriptor(this->type, layout, in_shape); - if(spt_dim != 2 && spt_dim != 3) + if(sptl_dim != 2 && sptl_dim != 3) { show_command(); - std::cout << "Warning: Config skipped becuse it is not supported " // - "(spt_dim != 2 && spt_dim != 3)" + std::cout << "Warning: Config skipped because it is not supported " // + "(sptl_dim != 2 && sptl_dim != 3)" << std::endl; return; } - for(int i = 0; i < spt_dim; i++) + for(int i = 0; i < sptl_dim; i++) { if(lens[i] > (input_desc.GetLengths()[i + 2] + static_cast(2) * pads[i])) { show_command(); - std::cout << "Warning: Config skipped becuse it is invalid " + std::cout << "Warning: Config skipped because it is invalid " "(lens[i] > (input_desc.GetLengths()[i + 2] + 2 * pads[i]))" << std::endl; return; @@ -733,9 +770,13 @@ struct pooling_driver : test_driver } } - std::vector in_dim(input_desc.GetLengths().begin() + 2, input_desc.GetLengths().end()); - std::vector out_dim(spt_dim); + constexpr int sptl_index = 2; + + std::vector in_dim(input_desc.GetLengths().begin() + sptl_index, + input_desc.GetLengths().begin() + sptl_index + sptl_dim); + std::vector out_dim(sptl_dim); std::vector ker_dim(filter.GetLengths().begin(), filter.GetLengths().end()); + #if TEST_PADDING_MODE == 1 if(filter.pmode == miopenPaddingSame) { @@ -743,7 +784,7 @@ struct pooling_driver : test_driver return i == 0; })) return; - for(int i = 0; i < spt_dim; i++) + for(int i = 0; i < sptl_dim; i++) { filter.pads[i] = ((in_dim[i] % filter.GetStrides()[i] == 0) @@ -763,7 +804,7 @@ struct pooling_driver : test_driver return i == 0; })) return; - for(int i = 0; i < spt_dim; i++) + for(int i = 0; i < sptl_dim; i++) { filter.pads[i] = 0; @@ -778,7 +819,7 @@ struct pooling_driver : test_driver switch(filter.GetIndexType()) { case miopenIndexUint8: { - if(spt_dim == 3) + if(sptl_dim == 3) { run_impl(); } @@ -789,7 +830,7 @@ struct pooling_driver : test_driver break; } case 
miopenIndexUint16: { - if(spt_dim == 3) + if(sptl_dim == 3) { run_impl(); } @@ -800,7 +841,7 @@ struct pooling_driver : test_driver break; } case miopenIndexUint32: { - if(spt_dim == 3) + if(sptl_dim == 3) { run_impl(); } @@ -811,7 +852,7 @@ struct pooling_driver : test_driver break; } case miopenIndexUint64: { - if(spt_dim == 3) + if(sptl_dim == 3) { run_impl(); }