diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 0f2c2a5cb0..c69c9957c8 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -396,7 +396,7 @@ typedef enum miopenTensorCHWNc4 = 5, /*!< CHWNc4 memory layout (Partially supported) */ miopenTensorCHWNc8 = 6, /*!< CHWNc8 memory layout (Partially supported) */ miopenTensorNCDHW = 7, /*!< NCDHW memory layout (Fully supported) */ - miopenTensorNDHWC = 8, /*!< NCDHW memory layout (Fully supported) */ + miopenTensorNDHWC = 8, /*!< NDHWC memory layout (Fully supported) */ } miopenTensorLayout_t; /*! @ingroup pooling diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 14e7c954b1..5855cf20e7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -288,9 +288,14 @@ set( MIOpen_Source solver/mha/mha_solver_forward.cpp solver/pooling/forward2d.cpp solver/pooling/forwardNaive.cpp + solver/pooling/forwardNdNhwcNaive.cpp solver/pooling/forwardNd.cpp + solver/pooling/forwardCk2d.cpp + solver/pooling/forwardCkNd.cpp solver/pooling/backward2d.cpp solver/pooling/backwardNd.cpp + solver/pooling/backwardCk2d.cpp + solver/pooling/backwardCkNd.cpp solver/reduce/forward_argmax.cpp solver/reduce/forward_argmin.cpp solver/reduce/forward_max.cpp @@ -447,6 +452,8 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/miopen_type_traits.hpp kernels/miopen_utility.hpp kernels/neuron.inc + kernels/pooling_functions.h + include/miopen/pooling/poolingNdNhwcArgs.hpp kernels/rocm_version.inc kernels/stride_array.hpp kernels/utilities.inc @@ -495,6 +502,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenPoolingBwd.cl kernels/MIOpenPoolingBwdND.cl kernels/MIOpenPoolingForwardNaive.cl + kernels/MIOpenPoolingForwardNDNhwcNaive.cpp kernels/MIOpenPoolingND.cl kernels/MIOpenConv1x1S.cl kernels/MIOpenConv1x1J1.cl diff --git a/src/comgr.cpp b/src/comgr.cpp index aa53b71bb5..2c533f9623 100644 --- a/src/comgr.cpp +++ b/src/comgr.cpp 
@@ -988,8 +988,12 @@ void BuildHip(const std::string& name, opts.push_back("-std=c++17"); HiprtcProgram prog(name, text); - prog.Compile(opts); - prog.GetCode(binary); + try{ // TEMPCODE RJS + + std::cout << "Compiling HIP: '" << name << "'" << std::endl; // TEMPCODE RJS + prog.Compile(opts); + } catch(Error& ex) { std::cout << __FUNCTION__ << " : Exception calling prog.Compile!: " << ex.text << std::endl; throw(ex); } + prog.GetCode(binary); } catch(Error& ex) { diff --git a/src/include/miopen/conv/problem_description.hpp b/src/include/miopen/conv/problem_description.hpp index 9447d7a5ca..dcbcac27f3 100644 --- a/src/include/miopen/conv/problem_description.hpp +++ b/src/include/miopen/conv/problem_description.hpp @@ -32,6 +32,7 @@ #include #include +#include #include #include @@ -136,7 +137,7 @@ namespace conv { MIOPEN_INTERNALS_EXPORT miopenAlphaBetaCase_t ClassifyAlphaBeta(const Scalar& alpha, const Scalar& beta); -struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase +struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionWeightsBase #if MIOPEN_ENABLE_SQLITE , SQLiteSerializable @@ -153,13 +154,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase int bias_ = 0, const Scalar& alpha_ = Scalar(1.0), const Scalar& beta_ = Scalar(0.0)) - : in(in_), - weights(weights_), - out(out_), + : ProblemDescriptionWeightsBase(in_, weights_, out_), conv(conv_), - in_layout(ComputeInLayout()), - weights_layout(ComputeWeightsLayout()), - out_layout(ComputeOutLayout()), direction(direction_), bias(bias_), alpha(alpha_), @@ -443,49 +439,7 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase void SetupFloats(ExecutionContext& ctx) const; private: - std::string ComputeInLayout() const - { - if(GetSpatialDims() == 2) - { - return in.GetLayout(in.GetLayout_str()); - } - else - { - return in.GetLayout("NCDHW"); - } - } - - std::string ComputeOutLayout() const - { - if(GetSpatialDims() == 2) - { 
return out.GetLayout(out.GetLayout_str()); - } - else - { - return out.GetLayout("NCDHW"); - } - } - - std::string ComputeWeightsLayout() const - { - if(GetSpatialDims() == 2) - { - return weights.GetLayout(weights.GetLayout_str()); - } - else - { - return weights.GetLayout("NCDHW"); - } - } - - TensorDescriptor in; - TensorDescriptor weights; - TensorDescriptor out; ConvolutionDescriptor conv; - std::string in_layout; - std::string weights_layout; - std::string out_layout; Direction direction = Direction::Forward; int bias = 0; Scalar alpha = Scalar(1.0); diff --git a/src/include/miopen/kernel_info.hpp b/src/include/miopen/kernel_info.hpp index d2571afa32..d824fe5c78 100644 --- a/src/include/miopen/kernel_info.hpp +++ b/src/include/miopen/kernel_info.hpp @@ -48,6 +48,9 @@ struct KernelInfo fs::path kernel_file; std::string kernel_name; friend std::ostream& operator<<(std::ostream& os, const KernelInfo& k); + + /// configures the working set using hip-style indices + void ConfigureHip(size_t l0, size_t l1, size_t l2, size_t g0, size_t g1, size_t g2); }; std::vector PrecompileKernels(const Handle& h, diff --git a/src/include/miopen/pooling.hpp b/src/include/miopen/pooling.hpp index 0ab5ffa1c7..26805f6df8 100644 --- a/src/include/miopen/pooling.hpp +++ b/src/include/miopen/pooling.hpp @@ -113,6 +113,8 @@ struct MIOPEN_EXPORT PoolingDescriptor : miopenPoolingDescriptor void SetWorkspaceIndexMode(miopenPoolingWorkspaceIndexMode_t workspace_index); miopenPoolingMode_t GetMode() const; + + bool ModeIsAveraging() const; miopenPaddingMode_t GetPaddingMode() const; diff --git a/src/include/miopen/pooling/poolingNdNhwcArgs.hpp b/src/include/miopen/pooling/poolingNdNhwcArgs.hpp new file mode 100644 index 0000000000..5084f28b4f --- /dev/null +++ b/src/include/miopen/pooling/poolingNdNhwcArgs.hpp @@ -0,0 +1,47 @@ +#pragma once + +#include <cstdint> + +struct poolingNdNhwcArgs +{ + uint32_t filter_d; + uint32_t filter_h; + uint32_t filter_w; + + uint32_t filter_d_stride; + uint32_t filter_h_stride; + 
uint32_t filter_w_stride; + + uint32_t filter_d_pad; + uint32_t filter_h_pad; + uint32_t filter_w_pad; + + uint32_t all_n; + uint32_t all_c; + + uint32_t bot_d; + uint32_t bot_h; + uint32_t bot_w; + + uint32_t bot_n_stride; + uint32_t bot_c_stride; + uint32_t bot_d_stride; + uint32_t bot_h_stride; + uint32_t bot_w_stride; + + uint32_t top_d; + uint32_t top_h; + uint32_t top_w; + + uint32_t top_n_stride; + uint32_t top_c_stride; + uint32_t top_d_stride; + uint32_t top_h_stride; + uint32_t top_w_stride; + + uint32_t mask_n_stride; + uint32_t mask_c_stride; + uint32_t mask_d_stride; + uint32_t mask_h_stride; + uint32_t mask_w_stride; +}; diff --git a/src/include/miopen/pooling/solvers.hpp b/src/include/miopen/pooling/solvers.hpp index 0d0e35755a..d8836fcdc0 100644 --- a/src/include/miopen/pooling/solvers.hpp +++ b/src/include/miopen/pooling/solvers.hpp @@ -66,6 +66,30 @@ struct PoolingForwardNd final : PoolingSolver const miopen::pooling::ProblemDescription& problem) const override; }; +struct PoolingForwardCk2d final : PoolingSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; +}; + +struct PoolingForwardCkNd final : PoolingSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const 
miopen::pooling::ProblemDescription& problem) const override; +}; + struct PoolingForwardNaive final : PoolingSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } @@ -79,6 +103,18 @@ struct PoolingForwardNaive final : PoolingSolver const miopen::pooling::ProblemDescription& problem) const override; }; +struct PoolingForwardNDNhwcNaive final : PoolingSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; +}; + template struct PoolingFwdNCHWTransposingSolver : TransposingSolver, PoolingSolver, @@ -145,6 +181,18 @@ struct PoolingBackward2d final : PoolingSolver const miopen::pooling::ProblemDescription& problem) const override; }; +struct PoolingBackwardCk2d final : PoolingSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; +}; + struct PoolingBackwardNd final : PoolingSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } @@ -157,6 +205,18 @@ struct PoolingBackwardNd final : PoolingSolver const miopen::pooling::ProblemDescription& problem) const override; }; +struct PoolingBackwardCkNd final : PoolingSolver +{ + const std::string& SolverDbId() const override { return 
GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; +}; + template struct PoolingBwdNCHWTransposingSolver : TransposingSolver, PoolingSolver, diff --git a/src/include/miopen/problem_description_layout.hpp b/src/include/miopen/problem_description_layout.hpp new file mode 100644 index 0000000000..780086afe6 --- /dev/null +++ b/src/include/miopen/problem_description_layout.hpp @@ -0,0 +1,102 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include +#include +#include + +#include + +namespace miopen { + +struct ProblemDescriptionLayoutBase : ProblemDescriptionBase +{ + ProblemDescriptionLayoutBase() = default; + ProblemDescriptionLayoutBase(const ProblemDescriptionLayoutBase&) = default; + ProblemDescriptionLayoutBase(const TensorDescriptor& in_, // x for Forward, y for Backward* + const TensorDescriptor& out_ // y for Forward, x for Backward* + ) + : ProblemDescriptionBase(), + in(in_), + out(out_), + in_layout(ComputeInLayout()), + out_layout(ComputeOutLayout()) + {} + virtual ~ProblemDescriptionLayoutBase() = default; + + ProblemDescriptionLayoutBase& operator=(const ProblemDescriptionLayoutBase&) = default; + + [[nodiscard]] virtual NetworkConfig MakeNetworkConfig() const = 0; + +protected: + TensorDescriptor in; + TensorDescriptor out; + std::string in_layout; + std::string out_layout; + + std::string ComputeInLayout() const + { + return in.GetLayout(in.GetLayout_str()); + } + + std::string ComputeOutLayout() const + { + return out.GetLayout(out.GetLayout_str()); + } +}; + +struct ProblemDescriptionWeightsBase : ProblemDescriptionLayoutBase +{ + ProblemDescriptionWeightsBase() = default; + ProblemDescriptionWeightsBase(const ProblemDescriptionWeightsBase&) = default; + ProblemDescriptionWeightsBase(const TensorDescriptor& in_, // x for Forward, y for Backward* + const TensorDescriptor& weights_, + const TensorDescriptor& out_ // y for Forward, x for Backward* + ) + : ProblemDescriptionLayoutBase(in_, out_), + weights(weights_), + weights_layout(ComputeWeightsLayout()) + {} + virtual 
~ProblemDescriptionWeightsBase() = default; + + ProblemDescriptionWeightsBase& operator=(const ProblemDescriptionWeightsBase&) = default; + + [[nodiscard]] virtual NetworkConfig MakeNetworkConfig() const = 0; + +protected: + TensorDescriptor weights; + std::string weights_layout; + + std::string ComputeWeightsLayout() const + { + return weights.GetLayout(weights.GetLayout_str()); + } +}; + +} // namespace miopen diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp index f4d2b2dca7..3059622374 100644 --- a/src/include/miopen/tensor.hpp +++ b/src/include/miopen/tensor.hpp @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -162,6 +163,10 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor TensorDescriptor(miopenDataType_t t, const std::vector& lens_in, const std::vector& strides_in); + TensorDescriptor(miopenDataType_t t, + miopenTensorLayout_t layout_in, + const std::vector& lens_in, + const std::vector& strides_in); TensorDescriptor(miopenDataType_t t, const std::initializer_list& lens_in, const std::initializer_list& strides_in); @@ -207,6 +212,7 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor miopenTensorLayout_t GetLayout_t() const; static std::string GetLayoutStr(miopenTensorLayout_t layout); std::string GetLayout_str() const; + bool IsDefaultLayout() const; std::size_t GetVectorLength() const; std::optional GetCastType() const; @@ -259,8 +265,10 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor { if(labels.size() != strides.size()) { - MIOPEN_THROW( - "Invalid labels size. Layout labels size must be equavalent to stride size"); + std::ostringstream oss; // TODO TRJS check this print + oss << "Invalid labels size. labels='" << labels << "', strides size=" << strides.size() + << ". Layout labels size must be equivalent to stride size"; + MIOPEN_THROW(oss.str().c_str()); } // Copy construct the result string from labels. 
This allocates the space at one go @@ -276,7 +284,7 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor if(base_label.size() != strides.size()) { MIOPEN_THROW( - "Invalid labels size. Layout labels size must be equavalent to stride size"); + "Invalid labels size. Layout labels size must be equivalent to stride size"); } auto result = base_label; auto p = find_permutation(lens, strides); @@ -285,6 +293,18 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor } } + + static bool IsDefaultLayout(miopenTensorLayout_t layout, unsigned spatial_dims = 2) + { + switch (spatial_dims) + { + case 2: + case 3: + return layout == GetDefaultLayout(spatial_dims); + default: + MIOPEN_THROW(miopenStatusBadParm, "Spatial dimension count must be 2 or 3."); + } + } + friend MIOPEN_INTERNALS_EXPORT std::ostream& operator<<(std::ostream& stream, const TensorDescriptor& t); @@ -292,7 +312,16 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor friend void from_json(const nlohmann::json& j, TensorDescriptor& descriptor); protected: - static miopenTensorLayout_t GetDefaultLayout() { return miopenTensorNCHW; }; + static miopenTensorLayout_t GetDefaultLayout(unsigned spatial_dims = 2) + { + switch (spatial_dims) + { + case 2: return miopenTensorNCHW; + case 3: return miopenTensorNCDHW; + default: + MIOPEN_THROW(miopenStatusBadParm, "Spatial dimension count must be 2 or 3."); + } + }; private: TensorDescriptor(miopenDataType_t t, @@ -328,11 +357,34 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor template constexpr auto GetNCDHW(unsigned spatial_dims, const std::vector& data) +{ + if(spatial_dims == 3) + { + if(data.size() == 5) // NCDHW + return miopen::tien<5>(data, 1); + else if(data.size() == 3) // DHW + return std::make_tuple(static_cast(1), static_cast(1), data[0], data[1], data[2]); + else + MIOPEN_THROW("Invalid data length; must be 5 or 3 with 3 spatial dimensions"); + } + else + { + if(data.size() == 
4) // NCHW + return std::make_tuple(data[0], data[1], static_cast(1), data[2], data[3]); + else if(data.size() == 2) // HW + return std::make_tuple(static_cast(1), static_cast(1), static_cast(1), data[0], data[1]); + else + MIOPEN_THROW("Invalid data length; must be 4 or 2 with 2 spatial dimensions"); + } +} + +template +constexpr auto GetNDHWC(unsigned spatial_dims, const std::vector& data) { if(spatial_dims == 3) return miopen::tien<5>(data, 1); else - return std::make_tuple(data[0], data[1], static_cast(1), data[2], data[3]); + return std::make_tuple(data[0], static_cast(1), data[1], data[2], data[3]); } } // namespace miopen diff --git a/src/kernels/MIOpenPoolingBwd.cl b/src/kernels/MIOpenPoolingBwd.cl index 6c88bebadf..062c5ed337 100644 --- a/src/kernels/MIOpenPoolingBwd.cl +++ b/src/kernels/MIOpenPoolingBwd.cl @@ -218,6 +218,168 @@ mloPoolingAveBwd(const __global _FLOAT* top_diff, } } } + +__attribute__((reqd_work_group_size(MLO_POOLBWD_GROUP_SZ0, + MLO_POOLBWD_GROUP_SZ1, + MLO_POOLBWD_GROUP_SZ2))) __kernel void +mloPoolingAveBwdNhwc(const __global _FLOAT* top_diff, + __global _FLOAT* bot_diff, + int mlo_pad1, + int mlo_pad0, + int mlo_n_outputs, + int mlo_bot_height, + int mlo_bot_width, + int mlo_top_height, + int mlo_top_width, + int mlo_botdf_batch_str, + int mlo_botdf_channel_str, + int mlo_botdf_str, + int mlo_topdf_batch_str, + int mlo_topdf_channel_str, + int mlo_topdf_str) +{ + __local _FLOAT lcl_top_diff[MLO_POOLBWD_LCL_DATA_WIDTH * MLO_POOLBWD_LCL_DATA_HEIGHT]; + + int x = get_group_id(0) * MLO_POOLBWD_GROUP_SZ0 * MLO_POOLBWD_N_HORIZ_OUT_PIX; + int y = get_group_id(1) * MLO_POOLBWD_GROUP_SZ1 * MLO_POOLBWD_N_VERT_OUT_PIX; + int lcl_id0 = get_local_id(0); + int lcl_id1 = get_local_id(1); + // int lcl_id = (lcl_id1 << MLO_POOLBWD_GROUP_LG2SZ1) + lcl_id0; + int ob = get_global_id(2); // outputs * batch_sz + int b = ob / mlo_n_outputs; + int o = ob - b * mlo_n_outputs; + + int top_x = (x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0) < 0 + ? 
0 + : (x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0) / MLO_POOLING_STRIDE0 + 1; + int top_y = (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) < 0 + ? 0 + : (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) / MLO_POOLING_STRIDE1 + 1; + int top_off = b * mlo_topdf_batch_str + o * mlo_topdf_channel_str; + + _FLOAT_ACCUM res[MLO_POOLBWD_N_VERT_OUT_PIX][MLO_POOLBWD_N_HORIZ_OUT_PIX]; + for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) + { + for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) + { + res[k][l] = (_FLOAT_ACCUM)0; + } + } + + // load tile + for(int tj = lcl_id1; tj < MLO_POOLBWD_LCL_DATA_HEIGHT; tj += MLO_POOLBWD_GROUP_SZ1) + { + int top_y_act = top_y + tj; + int top_y_off = top_y_act * mlo_topdf_str; + + int lcl_off_v = tj * MLO_POOLBWD_LCL_DATA_WIDTH; + + bool invisibleY = (top_y_act >= mlo_top_height); + + for(int ti = lcl_id0; ti < MLO_POOLBWD_LCL_DATA_WIDTH; ti += MLO_POOLBWD_GROUP_SZ0) + { + + int top_x_act = top_x + ti; + + bool invisibleX = (top_x_act >= mlo_top_width); + + int top_diff_off = (invisibleX || invisibleY) ? 0 : top_off + top_y_off + top_x_act; + + _FLOAT top_val = top_diff[top_diff_off]; + + top_val = (invisibleX || invisibleY) ? 0 : top_val; + + lcl_top_diff[lcl_off_v + ti] = top_val; +#if 0 + if (lcl_id1==0&&o==0&&b==0) + { + printf("K:in: %d %d %d %f\n", top_off + top_y_off + top_x_act, top_y_act, top_x_act, top_val); + } +#endif + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + int bot_y = (y + lcl_id1 * MLO_POOLBWD_N_VERT_OUT_PIX); + int bot_x = (x + lcl_id0 * MLO_POOLBWD_N_HORIZ_OUT_PIX); + + for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) + { + + int h = bot_y + k + mlo_pad1; + int top_hstart = (h < MLO_POOLING_KERNEL_SZ1) + ? 0 + : (h - MLO_POOLING_KERNEL_SZ1) / MLO_POOLING_STRIDE1 + 1; + int top_hend = min(h / MLO_POOLING_STRIDE1 + 1, mlo_top_height); + + for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) + { + + int w = bot_x + l + mlo_pad0; + int top_wstart = (w < MLO_POOLING_KERNEL_SZ0) + ? 
0 + : (w - MLO_POOLING_KERNEL_SZ0) / MLO_POOLING_STRIDE0 + 1; + int top_wend = min(w / MLO_POOLING_STRIDE0 + 1, mlo_top_width); + + for(int top_h = top_hstart; top_h < top_hend; ++top_h) + { + int hstart = top_h * MLO_POOLING_STRIDE1 - mlo_pad1; + int hend = min(hstart + MLO_POOLING_KERNEL_SZ1, mlo_bot_height); + hstart = max(hstart, 0); + + for(int top_w = top_wstart; top_w < top_wend; ++top_w) + { + // figure out the pooling size + int wstart = top_w * MLO_POOLING_STRIDE0 - mlo_pad0; + int wend = min(wstart + MLO_POOLING_KERNEL_SZ0, mlo_bot_width); + wstart = max(wstart, 0); + int pool_size = +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE + MLO_POOLING_KERNEL_SZ0 * MLO_POOLING_KERNEL_SZ1; + (void)wend; + (void)hend; +#else + (hend - hstart) * (wend - wstart); +#endif + pool_size = (pool_size == 0) ? 1 : pool_size; + int lcl_top_h = top_h - top_y; + int lcl_top_w = top_w - top_x; + _FLOAT_ACCUM add_val = + CVT_FLOAT2ACCUM( + lcl_top_diff[lcl_top_h * MLO_POOLBWD_LCL_DATA_WIDTH + lcl_top_w]) / + CVT_INTEGRAL2ACCUM(pool_size); + res[k][l] += add_val; +#if 0 + if (bot_x+l==6&&bot_y+k==0&&o==3&&b==0) + { + printf("K:com: %d %d %d %d %d %d %10.8f %10.8f %10.8f %d\n", k,l,top_h, top_w, lcl_top_h, lcl_top_w, res[k][l], add_val, lcl_top_diff[lcl_top_h * MLO_POOLBWD_LCL_DATA_WIDTH + lcl_top_w], pool_size); + } +#endif + } + } + } + } + + int bot_off = + b * mlo_botdf_batch_str + o * mlo_botdf_channel_str + bot_y * mlo_botdf_str + bot_x; + for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) + { + for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) + { + if(bot_y + k < mlo_bot_height && bot_x + l < mlo_bot_width) + { + bot_diff[bot_off + k * mlo_botdf_str + l] = CVT_ACCUM2FLOAT(res[k][l]); +#if 0 + if (lcl_id0==0&&lcl_id1==0&&o==0&&b==0) + { + printf("K:out: %d %d %d %f\n", bot_off + k * mlo_botdf_str +l, k, l, bot_diff[bot_off + k * mlo_botdf_str +l]); + } +#endif + } + } + } +} #endif // AVERAGE_OPS #if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX @@ -379,4 +541,164 @@ 
mloPoolingMaxBwd(const __global _FLOAT* top_df, } } } + +__attribute__((reqd_work_group_size(MLO_POOLBWD_GROUP_SZ0, + MLO_POOLBWD_GROUP_SZ1, + MLO_POOLBWD_GROUP_SZ2))) __kernel void +mloPoolingMaxBwdNhwc(const __global _FLOAT* top_df, + __global _FLOAT* bot_df, + __global index_t* mask, + int mlo_pad1, + int mlo_pad0, + int mlo_n_outputs, + int mlo_bot_height, + int mlo_bot_width, + int mlo_top_height, + int mlo_top_width, + int mlo_botdf_batch_str, + int mlo_botdf_channel_str, + int mlo_botdf_str, + int mlo_topdf_batch_str, + int mlo_topdf_channel_str, + int mlo_topdf_str) +{ + __local _FLOAT lcl_top_df[MLO_POOLBWD_LCL_DATA_WIDTH * MLO_POOLBWD_LCL_DATA_HEIGHT]; + __local index_t lcl_mask[MLO_POOLBWD_LCL_DATA_WIDTH * MLO_POOLBWD_LCL_DATA_HEIGHT]; + + int gid0 = get_group_id(0); + int gid1 = get_group_id(1); + int x = gid0 * MLO_POOLBWD_GROUP_SZ0 * MLO_POOLBWD_N_HORIZ_OUT_PIX; + int y = gid1 * MLO_POOLBWD_GROUP_SZ1 * MLO_POOLBWD_N_VERT_OUT_PIX; + int lcl_id0 = get_local_id(0); + int lcl_id1 = get_local_id(1); + int ob = get_global_id(2); // outputs * batch_sz + int b = ob / mlo_n_outputs; + int o = ob - b * mlo_n_outputs; + + int top_x = (x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0) < 0 + ? 0 + : (x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0) / MLO_POOLING_STRIDE0 + 1; + int top_y = (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) < 0 + ? 
0 + : (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) / MLO_POOLING_STRIDE1 + 1; + int top_df_off = b * mlo_topdf_batch_str + o * mlo_topdf_channel_str; + + _FLOAT res[MLO_POOLBWD_N_VERT_OUT_PIX][MLO_POOLBWD_N_HORIZ_OUT_PIX]; + _FLOAT top_df_val; + index_t mask_val; + // load tiles + // top df and top + for(int tj = lcl_id1; tj < MLO_POOLBWD_LCL_DATA_HEIGHT; tj += MLO_POOLBWD_GROUP_SZ1) + { + int top_y_act = top_y + tj; + int top_df_y_off = top_y_act * mlo_topdf_str; + + int lcl_off_v = tj * MLO_POOLBWD_LCL_DATA_WIDTH; + + bool visibleY = (top_y_act < mlo_top_height); + + for(int ti = lcl_id0; ti < MLO_POOLBWD_LCL_DATA_WIDTH; ti += MLO_POOLBWD_GROUP_SZ0) + { + mask_val = MLO_POOLING_INDEX_MAX; + int top_x_act = top_x + ti; + int lcl_idx = lcl_off_v + ti; + + bool visible = visibleY && (top_x_act < mlo_top_width); + if(visible) + { + int idx = top_df_off + top_df_y_off + top_x_act; + + top_df_val = top_df[idx]; + mask_val = mask[idx]; + // top_df_val *= visible; + + lcl_top_df[lcl_idx] = top_df_val; + } + lcl_mask[lcl_idx] = mask_val; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + _FLOAT add_val; + int bt_y = (y + lcl_id1 * MLO_POOLBWD_N_VERT_OUT_PIX); + int bt_x = (x + lcl_id0 * MLO_POOLBWD_N_HORIZ_OUT_PIX); + + for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) + { + int b_y = bt_y + k; + + // top most top y that can be influenced by this bot y + int tt_y1 = + (b_y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1 + MLO_POOLING_STRIDE1) / MLO_POOLING_STRIDE1; + int tt_y = max(0, tt_y1); + + for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) + { + int b_x = bt_x + l; + // left most top x that can be influenced by this bot x + int lt_x1 = (b_x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0 + MLO_POOLING_STRIDE0) / + MLO_POOLING_STRIDE0; + int lt_x = max(0, lt_x1); + + // find and sum up all tops that have been influenced by particular bot + res[k][l] = 0; + + for(int th = tt_y; th < tt_y + (MLO_POOLING_KERNEL_SZ1 + MLO_POOLING_STRIDE1 - 1) / + MLO_POOLING_STRIDE1; + ++th) + { + 
__attribute__((opencl_unroll_hint(2))) for(int tw = lt_x; + tw < lt_x + (MLO_POOLING_KERNEL_SZ0 + + MLO_POOLING_STRIDE0 - 1) / + MLO_POOLING_STRIDE0; + ++tw) + { + int lcl_th = th - top_y; + int lcl_tw = tw - top_x; +#if USE_IMG_INDEX + index_t img_idx = b_x + b_y * mlo_bot_width; +#else + int filter_x = b_x - tw * MLO_POOLING_STRIDE0 + mlo_pad0; + int filter_y = b_y - th * MLO_POOLING_STRIDE1 + mlo_pad1; + int filter_idx = filter_x + filter_y * MLO_POOLING_KERNEL_SZ0; +#endif + bool visible = (lcl_th < MLO_POOLBWD_LCL_DATA_HEIGHT) && + (lcl_tw < MLO_POOLBWD_LCL_DATA_WIDTH); + int lcl_idx = visible ? (lcl_th * MLO_POOLBWD_LCL_DATA_WIDTH + lcl_tw) : 0; + + bool match = visible && +#if USE_IMG_INDEX + (img_idx == lcl_mask[lcl_idx]) +#else + (filter_idx == lcl_mask[lcl_idx]) && (filter_x >= 0) && + (filter_y >= 0) +#endif + ; + + //_FLOAT add_val = lcl_top_df[lcl_idx] * match; + //_FLOAT add_val = match ? lcl_top_df[lcl_idx] : (_FLOAT)0; + if(match) + { + add_val = lcl_top_df[lcl_idx]; + res[k][l] += add_val; + } + } + } + } + } + + int bot_df_off = + b * mlo_botdf_batch_str + o * mlo_botdf_channel_str + bt_y * mlo_botdf_str + bt_x; + for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) + { + for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) + { + if((bt_y + k) < mlo_bot_height && (bt_x + l) < mlo_bot_width) + { + bot_df[bot_df_off + k * mlo_botdf_str + l] = res[k][l]; + } + } + } +} + #endif // MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX diff --git a/src/kernels/MIOpenPoolingBwdND.cl b/src/kernels/MIOpenPoolingBwdND.cl index 7daacd24ab..4e9b4da04e 100644 --- a/src/kernels/MIOpenPoolingBwdND.cl +++ b/src/kernels/MIOpenPoolingBwdND.cl @@ -147,6 +147,120 @@ mloPoolingNDMaxBwd(const __global _FLOAT* top_df, } } +__attribute__((reqd_work_group_size(MLO_POOLING_GROUP_SZ0, 1, 1))) __kernel void +mloPoolingNDMaxBwdNhwc(const __global _FLOAT* top_df, + __global _FLOAT* bot_df, + __global index_t* mask, + const uint pad_d, + const uint pad_h, + const uint pad_w, + const uint 
batch, + const uint chal, + const uint bot_d, + const uint bot_h, + const uint bot_w, + const uint top_d, + const uint top_h, + const uint top_w, + const uint bot_str_b, + const uint bot_str_c, + const uint bot_str_d, + const uint bot_str_h, + const uint top_str_b, + const uint top_str_c, + const uint top_str_d, + const uint top_str_h, + const uint total_work) +{ + + int bot_blk_w = (bot_w + PIX_W_PER_WORK - 1) / PIX_W_PER_WORK; + int bot_blk_h = (bot_h + PIX_H_PER_WORK - 1) / PIX_H_PER_WORK; + int bot_blk_d = (bot_d + PIX_D_PER_WORK - 1) / PIX_D_PER_WORK; + + bot_blk_w = max(bot_blk_w, 1); + bot_blk_h = max(bot_blk_h, 1); + bot_blk_d = max(bot_blk_d, 1); + + for(uint gid = get_global_id(0); gid < total_work; gid += MAX_ACTIV_WORKITEM) + { + int b_id = gid / chal / bot_blk_w / bot_blk_h / bot_blk_d; + int c_id = (gid / bot_blk_w / bot_blk_h / bot_blk_d) % chal; + + int bot_d_id = ((gid / bot_blk_w / bot_blk_h) % bot_blk_d) * PIX_D_PER_WORK; + int bot_h_id = ((gid / bot_blk_w) % bot_blk_h) * PIX_H_PER_WORK; + int bot_w_id = (gid % bot_blk_w) * PIX_W_PER_WORK; + + int top_d_start = + bot_d_id + pad_d < KERNEL_SZ_D ? 0 : (bot_d_id + pad_d - KERNEL_SZ_D) / STRIDE_D + 1; + int top_h_start = + bot_h_id + pad_h < KERNEL_SZ_H ? 0 : (bot_h_id + pad_h - KERNEL_SZ_H) / STRIDE_H + 1; + int top_w_start = + bot_w_id + pad_w < KERNEL_SZ_W ? 
0 : (bot_w_id + pad_w - KERNEL_SZ_W) / STRIDE_W + 1; + + int top_d_end = (bot_d_id + PIX_D_PER_WORK - 1 + pad_d) / STRIDE_D + 1; + int top_h_end = (bot_h_id + PIX_H_PER_WORK - 1 + pad_h) / STRIDE_H + 1; + int top_w_end = (bot_w_id + PIX_W_PER_WORK - 1 + pad_w) / STRIDE_W + 1; + + top_d_end = min(top_d_end, (int)top_d); + top_h_end = min(top_h_end, (int)top_h); + top_w_end = min(top_w_end, (int)top_w); + + _FLOAT bot_data[PIX_D_PER_WORK][PIX_H_PER_WORK][PIX_W_PER_WORK] = {0}; + + for(int h = top_d_start; h < top_d_end; ++h) + { + for(int j = top_h_start; j < top_h_end; ++j) + { + for(int i = top_w_start; i < top_w_end; ++i) + { + uint top_gbl_off = + b_id * top_str_b + c_id * top_str_c + h * top_str_d + j * top_str_h + i; + + _FLOAT top_val = b_id < batch ? top_df[top_gbl_off] : 0; + index_t mask_idx = b_id < batch ? mask[top_gbl_off] : MLO_POOLING_INDEX_MAX; + + uint mask_d_id = mask_idx / bot_h / bot_w; + uint mask_h_id = (mask_idx / bot_w) % bot_h; + uint mask_w_id = mask_idx % bot_w; + + if(mask_d_id >= bot_d_id && mask_h_id >= bot_h_id && mask_w_id >= bot_w_id && + mask_d_id < bot_d_id + PIX_D_PER_WORK && + mask_h_id < bot_h_id + PIX_H_PER_WORK && + mask_w_id < bot_w_id + PIX_W_PER_WORK) + { + mask_d_id -= bot_d_id; + mask_h_id -= bot_h_id; + mask_w_id -= bot_w_id; + + bot_data[mask_d_id][mask_h_id][mask_w_id] += top_val; + } + } + } + } + + uint bot_off = b_id * bot_str_b + c_id * bot_str_c + bot_d_id * bot_str_d + + bot_h_id * bot_str_h + bot_w_id; + + for(uint m = 0; m < PIX_D_PER_WORK; m++) + { + for(uint k = 0; k < PIX_H_PER_WORK; k++) + { + for(uint l = 0; l < PIX_W_PER_WORK; l++) + { + + if(bot_d_id + m < bot_d && bot_h_id + k < bot_h && bot_w_id + l < bot_w && + b_id < batch) + { + uint bot_idx = bot_off + m * bot_str_d + k * bot_str_h + l; + + bot_df[bot_idx] = bot_data[m][k][l]; + } + } + } + } + } +} + #elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE || MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE 
__attribute__((reqd_work_group_size(MLO_POOLING_GROUP_SZ0, 1, 1))) __kernel void @@ -281,4 +395,137 @@ mloPoolingNDAveBwd(const __global _FLOAT* top_df, } } } + +__attribute__((reqd_work_group_size(MLO_POOLING_GROUP_SZ0, 1, 1))) __kernel void +mloPoolingNDAveBwdNhwc(const __global _FLOAT* top_df, + __global _FLOAT* bot_df, + const uint pad_d, + const uint pad_h, + const uint pad_w, + const uint batch, + const uint chal, + const uint bot_d, + const uint bot_h, + const uint bot_w, + const uint top_d, + const uint top_h, + const uint top_w, + const uint bot_str_b, + const uint bot_str_c, + const uint bot_str_d, + const uint bot_str_h, + const uint top_str_b, + const uint top_str_c, + const uint top_str_d, + const uint top_str_h, + const uint total_work) +{ + + int bot_blk_w = (bot_w + PIX_W_PER_WORK - 1) / PIX_W_PER_WORK; + int bot_blk_h = (bot_h + PIX_H_PER_WORK - 1) / PIX_H_PER_WORK; + int bot_blk_d = (bot_d + PIX_D_PER_WORK - 1) / PIX_D_PER_WORK; + + bot_blk_w = max(bot_blk_w, 1); + bot_blk_h = max(bot_blk_h, 1); + bot_blk_d = max(bot_blk_d, 1); + + for(uint gid = get_global_id(0); gid < total_work; gid += MAX_ACTIV_WORKITEM) + { + int b_id = gid / chal / bot_blk_w / bot_blk_h / bot_blk_d; + int c_id = (gid / bot_blk_w / bot_blk_h / bot_blk_d) % chal; + + int bot_d_id = ((gid / bot_blk_w / bot_blk_h) % bot_blk_d) * PIX_D_PER_WORK; + int bot_h_id = ((gid / bot_blk_w) % bot_blk_h) * PIX_H_PER_WORK; + int bot_w_id = (gid % bot_blk_w) * PIX_W_PER_WORK; + + int top_d_start = + bot_d_id + pad_d < KERNEL_SZ_D ? 0 : (bot_d_id + pad_d - KERNEL_SZ_D) / STRIDE_D + 1; + int top_h_start = + bot_h_id + pad_h < KERNEL_SZ_H ? 0 : (bot_h_id + pad_h - KERNEL_SZ_H) / STRIDE_H + 1; + int top_w_start = + bot_w_id + pad_w < KERNEL_SZ_W ? 
0 : (bot_w_id + pad_w - KERNEL_SZ_W) / STRIDE_W + 1; + + int top_d_end = (bot_d_id + PIX_D_PER_WORK - 1 + pad_d) / STRIDE_D + 1; + int top_h_end = (bot_h_id + PIX_H_PER_WORK - 1 + pad_h) / STRIDE_H + 1; + int top_w_end = (bot_w_id + PIX_W_PER_WORK - 1 + pad_w) / STRIDE_W + 1; + + top_d_end = min(top_d_end, (int)top_d); + top_h_end = min(top_h_end, (int)top_h); + top_w_end = min(top_w_end, (int)top_w); + + _FLOAT_ACCUM bot_data[PIX_D_PER_WORK][PIX_H_PER_WORK][PIX_W_PER_WORK] = {0}; + + for(int h = top_d_start; h < top_d_end; ++h) + { + int dstart = h * STRIDE_D - pad_d; + int dend = min((dstart + KERNEL_SZ_D), (int)bot_d); + dstart = max(dstart, 0); + + for(int j = top_h_start; j < top_h_end; ++j) + { + int hstart = j * STRIDE_H - pad_h; + int hend = min((hstart + KERNEL_SZ_H), (int)bot_h); + hstart = max(hstart, 0); + + for(int i = top_w_start; i < top_w_end; ++i) + { + int wstart = i * STRIDE_W - pad_w; + int wend = min((wstart + KERNEL_SZ_W), (int)bot_w); + wstart = max(wstart, 0); + + uint pool_size = +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE + KERNEL_SZ_W * KERNEL_SZ_H * KERNEL_SZ_D; +#else + (dend - dstart) * (hend - hstart) * (wend - wstart); +#endif + pool_size = (pool_size == 0) ? 1 : pool_size; + + uint top_gbl_off = + b_id * top_str_b + c_id * top_str_c + h * top_str_d + j * top_str_h + i; + _FLOAT_ACCUM add_val = + b_id < batch ? 
CVT_FLOAT2ACCUM(top_df[top_gbl_off]) : CVT_FP32_2ACCUM(0.0f); + add_val /= CVT_INTEGRAL2ACCUM(pool_size); + + for(int m = dstart; m < dend; ++m) + { + for(int k = hstart; k < hend; ++k) + { + for(int l = wstart; l < wend; ++l) + { + if(m >= bot_d_id && m < PIX_D_PER_WORK + bot_d_id && + k >= bot_h_id && k < PIX_H_PER_WORK + bot_h_id && + l >= bot_w_id && l < PIX_W_PER_WORK + bot_w_id && b_id < batch) + { + bot_data[m - bot_d_id][k - bot_h_id][l - bot_w_id] += add_val; + } + } + } + } + } + } + } + + uint bot_off = b_id * bot_str_b + c_id * bot_str_c + bot_d_id * bot_str_d + + bot_h_id * bot_str_h + bot_w_id; + + for(uint m = 0; m < PIX_D_PER_WORK; m++) + { + for(uint k = 0; k < PIX_H_PER_WORK; k++) + { + for(uint l = 0; l < PIX_W_PER_WORK; l++) + { + + if(bot_d_id + m < bot_d && bot_h_id + k < bot_h && bot_w_id + l < bot_w && + b_id < batch) + { + uint bot_idx = bot_off + m * bot_str_d + k * bot_str_h + l; + + bot_df[bot_idx] = CVT_ACCUM2FLOAT(bot_data[m][k][l]); + } + } + } + } + } +} #endif diff --git a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp new file mode 100644 index 0000000000..c502cf625f --- /dev/null +++ b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp @@ -0,0 +1,241 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include <hip/hip_runtime.h> +#endif + +#include "pooling_functions.h" +#include "poolingNdNhwcArgs.hpp" + +// TODO: add ability to decode network string into pooling descriptor or similar for targeted debugging + +#if(MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE) || (MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE) +#define AVERAGE_OPS 1 +#else +#define AVERAGE_OPS 0 +#endif + +// Let's use extended-precision accumulator only in FP16 pooling and only for averaging. +// For all other ops and datatypes, use native accumulator, i.e. treat FLOAT_ACCUM as FLOAT.
+#if !(AVERAGE_OPS && MIOPEN_USE_FP16) +#define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 1 +// #else +// #define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 0 +#endif + +#include "float_types.h" +#include "miopen_cstdint.hpp" + +// This implementation is extremely memory-bound, so float type is used for all calculations +#define _FLOAT float +#define _FLOAT_ACCUM float + +#if MIOPEN_USE_INT8 == 1 + #if !AVERAGE_OPS + #ifndef FLT_MAX + #define MAX_VAL 127 /* max value */ + #else + #define MAX_VAL FLT_MAX + #endif + #endif +#endif +#if MIOPEN_USE_BFP16 + #define NATIVE_CAST(_x) (_FLOAT)bfloat16_to_float(_x) + #define NATIVE_UNCAST(_x) (_FLOAT)float_to_bfloat16(_x) +#else + #define NATIVE_CAST(_x) (_FLOAT)(_x) + #define NATIVE_UNCAST(_x) (_FLOAT)(_x) +#endif + +#if AVERAGE_OPS +#define ARG_UNUSED_FOR_AVERAGE __attribute__((__unused__)) +#else +#define ARG_UNUSED_FOR_AVERAGE +#endif + +// Out N, D, H are encoded into the block indices x, y, z +// No 2D-only optimization. +template +__device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, + TI* __restrict__ top_ptr, + ARG_UNUSED_FOR_AVERAGE index_t* __restrict__ mask_ptr, + ARG_UNUSED_FOR_AVERAGE int save_index, + ARG_UNUSED_FOR_AVERAGE int index_mode, + poolingNdNhwcArgs args +) +{ + // naming: caps=count, lowercase=index, _ + const uint32_t nd = blockIdx.x; + const uint32_t h_ = blockIdx.y; + const uint32_t w_c = blockIdx.z; + const uint32_t w_ = w_c % args.top_w; // CAN w=fast index + + const uint32_t C_WH = blockDim.x; + const uint32_t _H = blockDim.y; + const uint32_t _W = blockDim.z; + + const uint32_t c = threadIdx.x; + const uint32_t _h = threadIdx.y; + const uint32_t _w = threadIdx.z; + + const uint32_t nn = nd / args.top_d; // n=slow index + const uint32_t cc = (w_c / args.top_w) * C_WH + c; // c=slow index (lg-C) + const uint32_t td = nd % args.top_d; // top d=fast index + const uint32_t th = h_ * _H + _h; // top h: blockIdx is slow (sm-C) + const uint32_t tw = w_ * _W + _w; // top w: blockIdx is slow (sm-C) + 
+ if(nn >= args.all_n) return; + if(td >= args.top_d) return; + if(th >= args.top_h) return; + if(tw >= args.top_w) return; + if(cc >= args.all_c) return; + + const auto int_dstart = static_cast(td * args.filter_d_stride) - static_cast(args.filter_d_pad); + /* const */ auto dend = static_cast(min(int_dstart + static_cast(args.filter_d), static_cast(args.bot_d))); + const auto dstart = static_cast(max(int_dstart, 0)); + + const auto int_hstart = static_cast(th * args.filter_h_stride) - static_cast(args.filter_h_pad); + /* const */ auto hend = static_cast(min(int_hstart + static_cast(args.filter_h), static_cast(args.bot_h))); + const auto hstart = static_cast(max(int_hstart, 0)); + + const auto int_wstart = static_cast(tw * args.filter_w_stride) - static_cast(args.filter_w_pad); + /* const */ auto wend = static_cast(min(int_wstart + static_cast(args.filter_w), static_cast(args.bot_w))); + const auto wstart = static_cast(max(int_wstart, 0)); + + size_t top_index = + nn * args.top_n_stride + // + cc * args.top_c_stride + // + td * args.top_d_stride + // + th * args.top_h_stride + // + tw * args.top_w_stride; + +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE + uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + pool_size = (pool_size == 0) ? 1 : pool_size; +#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE + const uint32_t pool_size = args.filter_d * args.filter_h * args.filter_w; +#endif + +#if AVERAGE_OPS + _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0); +#else // MAX + _FLOAT_ACCUM res = (_FLOAT_ACCUM)NATIVE_CAST(-MAX_VAL_ACCUM); + bool found = false; // May remain false if bot contains only NaNs/-INFs. 
+ uint32_t d_save = 0; + uint32_t h_save = 0; + uint32_t w_save = 0; + uint32_t saved_index = 0; +#endif + + size_t bot_ncd = static_cast(nn * args.bot_n_stride + cc * args.bot_c_stride + dstart * args.bot_d_stride); + for(size_t bd = dstart; bd < dend; ++bd) + { + size_t bot_ncdh = bot_ncd + hstart * args.bot_h_stride; + for(uint32_t bh = hstart; bh < hend; ++bh) + { + size_t bot_index = bot_ncdh + wstart * args.bot_w_stride; + for(uint32_t bw = wstart; bw < wend; ++bw) + { +#if AVERAGE_OPS + res += static_cast<_FLOAT_ACCUM>(NATIVE_CAST(bot_ptr[bot_index])); +#else // MAX + auto val = static_cast<_FLOAT_ACCUM>(NATIVE_CAST(bot_ptr[bot_index])); + if(val > res) + { + res = val; + if(save_index) + { + found = true; + d_save = bd; + h_save = bh; + w_save = bw; + saved_index = bot_index; + } + } +#endif + bot_index += args.bot_w_stride; + } + bot_ncdh += args.bot_h_stride; + } + bot_ncd += args.bot_d_stride; + } + +#if AVERAGE_OPS + res /= static_cast<_FLOAT_ACCUM>(pool_size); +#else // MAX + if(save_index) + { + index_t res_index = saved_index; + + /// Preventing overflow during computation of res_index: + /// If Index is shorter than uint, then let's perform computation in 32-bit + /// domain and then convert to narrower Index. That would reduce the probability of + /// overflow. If Index is wider then 32 bits, then it seems like it is better to + /// convert to Index type before multiplication. However this is not actually + /// necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in + /// 32 bits and then convert. 
+ + if(found) + { + if(index_mode == 0) + res_index = (index_t)( // + ((d_save - td * args.filter_d_stride + args.filter_d_pad) * args.filter_h * args.filter_w) + // + ((h_save - th * args.filter_h_stride + args.filter_h_pad) * args.filter_w) + // + (w_save - tw * args.filter_w_stride + args.filter_w_pad) // + ); + } + + const size_t mask_index = nn * args.mask_n_stride // + + cc * args.mask_c_stride // + + (size_t)(td * args.mask_d_stride) // + + (size_t)(th * args.mask_h_stride) // + + (size_t)(tw * args.mask_w_stride); + mask_ptr[mask_index] = res_index; + } +#endif + + top_ptr[top_index] = NATIVE_UNCAST(res); +} + +extern "C" __global__ void mloPoolingForwardNDNhwcNaive( + const INPUT_TYPE* __restrict__ bot_ptr, + INPUT_TYPE* __restrict__ top_ptr, + index_t* __restrict__ mask_ptr, + int save_index, + int index_mode, + poolingNdNhwcArgs args +) +{ + poolingForwardNDNhwcNaive( + bot_ptr, + top_ptr, + mask_ptr, + save_index, + index_mode, + args + ); +} diff --git a/src/kernels/MIOpenPoolingForwardNaive.cl b/src/kernels/MIOpenPoolingForwardNaive.cl index 20e0949967..f839193297 100644 --- a/src/kernels/MIOpenPoolingForwardNaive.cl +++ b/src/kernels/MIOpenPoolingForwardNaive.cl @@ -145,7 +145,7 @@ __kernel void mloPoolingForwardNaive(const __global _FLOAT* bot_ptr, uint h_save = 0; uint w_save = 0; #endif - for(uint d = dstart; d < dend; ++d) + for(size_t d = dstart; d < dend; ++d) { for(uint h = hstart; h < hend; ++h) { @@ -153,7 +153,7 @@ __kernel void mloPoolingForwardNaive(const __global _FLOAT* bot_ptr, { const size_t bot_index = b * bot_n_stride // + o * bot_c_stride // - + (size_t)(d * bot_d_stride) // + + d * bot_d_stride // + (size_t)(h * bot_h_stride) // + (size_t)(w * bot_w_stride); #if AVERAGE_OPS diff --git a/src/kernels/pooling_functions.h b/src/kernels/pooling_functions.h index c4821b10fb..aaaa431f53 100644 --- a/src/kernels/pooling_functions.h +++ b/src/kernels/pooling_functions.h @@ -39,8 +39,8 @@ typedef MLO_POOLING_INDEX_TYPE index_t; #define 
MLO_POOLING_OP_STC 2 #define MLO_POOLING_OP_AVE_INCLUSIVE 3 -#ifndef MLO_POOLING_OP_ID -#define MLO_POOLING_OP_ID 0 +#ifndef MLO_POOLING_OP_ID // TODO TRJS check: default used to be 0 +#define MLO_POOLING_OP_ID 1 #endif #endif // GUARD_POOLING_FUNCTIONS_H diff --git a/src/ocl/pooling_ocl.cpp b/src/ocl/pooling_ocl.cpp index 9881c1596f..6b79def0b0 100644 --- a/src/ocl/pooling_ocl.cpp +++ b/src/ocl/pooling_ocl.cpp @@ -42,6 +42,9 @@ static auto PoolingForwardSolvers() return solver::SolverContainer{}; } @@ -50,6 +53,8 @@ static auto PoolingBackwardSolvers() { return solver::SolverContainer{}; } @@ -89,7 +94,7 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle, auto index_max = get_index_max(GetIndexType()); /// \anchor max_pooling_index_max_restriction - /// For kernel implementation max pooling backward pass, + /// For kernel implementation max pooling forward pass, /// "index_max" means ghost, and thus should not be reached. if(mode == miopenPoolingMax && save_index) { @@ -102,13 +107,13 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle, 1, std::multiplies()))) { - MIOPEN_THROW("Index range not enough for max pooling bwd"); + MIOPEN_THROW("Index range not enough for max pooling fwd"); } if(workSpace == nullptr) { throw std::invalid_argument("workSpace cannot be NULL in Forward Pooling MAX mode when " - "backward pass is requested"); + "forward pass is requested"); } } diff --git a/src/pooling.cpp b/src/pooling.cpp index a65cb3c0ab..e0d9112bd2 100644 --- a/src/pooling.cpp +++ b/src/pooling.cpp @@ -96,6 +96,11 @@ miopenPoolingWorkspaceIndexMode_t PoolingDescriptor::GetWorkspaceIndexMode() con miopenPoolingMode_t PoolingDescriptor::GetMode() const { return mode; } +bool PoolingDescriptor::ModeIsAveraging() const +{ + return mode == miopenPoolingAverage || mode == miopenPoolingAverageInclusive; +} + miopenPaddingMode_t PoolingDescriptor::GetPaddingMode() const { return (pmode); } const std::vector& PoolingDescriptor::GetLengths() const { return lens; } 
@@ -217,13 +222,14 @@ TensorDescriptor PoolingDescriptor::GetForwardOutputTensor(const TensorDescripto { std::vector out_dim(xDesc.GetNumDims()); GetForwardOutputDimNd(xDesc, xDesc.GetNumDims(), out_dim.data()); + auto layout_str = xDesc.GetLayout_str(); + auto layout = xDesc.GetLayout_t(); + auto lengths_layout = miopen::tensor_layout_get_default(xDesc.GetNumDims()); - const std::string default_layout = tensor_layout_get_default(xDesc.GetNumDims()); - const std::string in_layout = xDesc.GetLayout(default_layout); std::vector out_strides; - tensor_layout_to_strides(out_dim, default_layout, in_layout, out_strides); + tensor_layout_to_strides(out_dim, lengths_layout, layout_str, out_strides); - return {xDesc.GetType(), out_dim, out_strides}; + return {xDesc.GetType(), layout, out_dim, out_strides}; } std::size_t PoolingDescriptor::GetWorkSpaceSize(const TensorDescriptor& yDesc) const diff --git a/src/pooling/problem_description.cpp b/src/pooling/problem_description.cpp index 8e171a4ac0..df21b8cd93 100644 --- a/src/pooling/problem_description.cpp +++ b/src/pooling/problem_description.cpp @@ -60,6 +60,7 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const : MLO_POOLING_OP_AVE_INCLUSIVE); ss << "m" + std::to_string(pooling_method); + ss << "_dt" << xDesc.GetType(); if(const auto ct = xDesc.GetCastType()) ss << "_dct" << GetDataTypeName(*ct); @@ -83,6 +84,7 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const ss << "_dyd" << get_vect_config(dyDesc.GetLengths()); ss << "_dys" << get_vect_config(dyDesc.GetStrides()); } + ss << "_l" << static_cast(xDesc.GetLayout_t()); return NetworkConfig{ss.str()}; } diff --git a/src/solver.cpp b/src/solver.cpp index e468d38d0a..c9607df252 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -66,6 +66,19 @@ std::ostream& operator<<(std::ostream& os, const KernelInfo& k) return os << "} '" << k.comp_options << '\''; } +void KernelInfo::ConfigureHip(size_t l0, size_t l1, size_t l2, size_t g0, size_t g1, size_t g2) +{ + 
l_wk.clear(); + l_wk.push_back(l0); + l_wk.push_back(l1); + l_wk.push_back(l2); + + g_wk.clear(); + g_wk.push_back(g0 * l0); + g_wk.push_back(g1 * l1); + g_wk.push_back(g2 * l2); +} + std::vector PrecompileKernels(const Handle& h, const std::vector& kernels, bool force_attach_binary) { @@ -561,12 +574,18 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Pooling, pooling::PoolingForward2d{}.SolverDbId()); Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNd{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardCk2d{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardCkNd{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::TransposedPoolingFwd2d{}.SolverDbId()); Register(registry, ++id, Primitive::Pooling, pooling::TransposedPoolingFwdNd{}.SolverDbId()); Register(registry, ++id, Primitive::Pooling, pooling::PoolingBackward2d{}.SolverDbId()); Register(registry, ++id, Primitive::Pooling, pooling::PoolingBackwardNd{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::PoolingBackwardCk2d{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::PoolingBackwardCkNd{}.SolverDbId()); + RegisterWithSolver(registry, ++id, conv::ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC{}, @@ -594,6 +613,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) fusion::ConvCKIgemmFwdBiasActivFused{}.SolverDbId(), miopenConvolutionAlgoImplicitGEMM); Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNaive{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNDNhwcNaive{}.SolverDbId()); RegisterWithSolver(registry, ++id, conv::ConvHipImplicitGemmGroupFwdXdlops{}, diff --git a/src/solver/pooling/backwardCk2d.cpp b/src/solver/pooling/backwardCk2d.cpp new file mode 100644 index 0000000000..d1ee157190 --- /dev/null +++ 
b/src/solver/pooling/backwardCk2d.cpp @@ -0,0 +1,308 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace pooling { + +namespace { + +struct kernel_params +{ + int kernel_size_h; + int kernel_size_w; + int kernel_stride_h; + int kernel_stride_w; + int out_pix_tile0; + int out_pix_tile1; + std::size_t batch_sz; + std::size_t n_inputs; + std::size_t in_height; + std::size_t in_width; + std::size_t grp_tile0; + std::size_t grp_tile1; + + kernel_params(const miopen::pooling::ProblemDescription& problem) + { + const auto& pd = problem.GetPooling(); + + kernel_size_w = pd.lens[1]; + kernel_size_h = pd.lens[0]; + kernel_stride_w = pd.strides[1]; + kernel_stride_h = pd.strides[0]; + + std::tie(batch_sz, n_inputs, in_height, in_width) = + miopen::tien<4>(problem.GetXDesc().GetLengths(), 1); + + out_pix_tile0 = 1; + out_pix_tile1 = 1; + if(pd.GetMode() == miopenPoolingMax) + { + out_pix_tile0 = in_width > 8 && in_width <= 24 ? 4 : 1; + out_pix_tile1 = in_width <= 24 ? 1 : (in_width > 64 && in_width <= 96 ? 4 : 8); + } + + grp_tile0 = 8; + grp_tile1 = 8; + if(pd.GetMode() == miopenPoolingMax) + { + grp_tile0 = in_width <= 8 ? 8 // + : in_width <= 16 ? 4 // + : in_width <= 24 ? 8 // + : in_width <= 32 ? 32 // + : in_width <= 64 ? 8 // + : in_width <= 96 ? 16 // + : in_width <= 128 ? 16 + : 32; + grp_tile1 = in_width <= 8 ? 8 // + : in_width <= 16 ? 16 // + : in_width <= 24 ? 8 // + : in_width <= 32 ? 4 // + : in_width <= 64 ? 8 // + : in_width <= 96 ? 4 // + : in_width <= 128 ? 
16 + : 4; + } + } +}; + +std::size_t sizeof_kernel_FLOAT(const miopen::pooling::ProblemDescription& problem) +{ + const auto datatype = problem.GetXDesc().GetType(); + return get_data_size(datatype); +} + +std::size_t sizeof_kernel_index_t(const miopen::pooling::ProblemDescription& problem) +{ + return get_data_size(problem.GetPooling().GetIndexType()); +} + +inline std::size_t RoundUpToMultiple(std::size_t v, std::size_t m) +{ + assert(m > 0); + return ((v + m - 1) / m) * m; +} + +// Compute amount of local memory required for holding the arrays defined +// in the "mloPoolingAveBwd" and "mloPoolingMaxBwd" kernels. +std::size_t sizeof_local_memory(const miopen::pooling::ProblemDescription& problem) +{ + const kernel_params kp(problem); + + // aliases to ease programming + const auto& MLO_POOLING_KERNEL_SZ0 = kp.kernel_size_w; + const auto& MLO_POOLING_KERNEL_SZ1 = kp.kernel_size_h; + const auto& MLO_POOLBWD_N_HORIZ_OUT_PIX = kp.out_pix_tile0; + const auto& MLO_POOLBWD_N_VERT_OUT_PIX = kp.out_pix_tile1; + const auto& MLO_POOLING_STRIDE0 = kp.kernel_stride_w; + const auto& MLO_POOLING_STRIDE1 = kp.kernel_stride_h; + const auto& MLO_POOLBWD_GROUP_SZ0 = kp.grp_tile0; + const auto& MLO_POOLBWD_GROUP_SZ1 = kp.grp_tile1; + + const auto MLO_POOLBWD_LCL_DATA_WIDTH = + (static_cast(MLO_POOLBWD_GROUP_SZ0) * MLO_POOLBWD_N_HORIZ_OUT_PIX + + MLO_POOLING_KERNEL_SZ0 + MLO_POOLING_STRIDE0 - 2) / + MLO_POOLING_STRIDE0; + const auto MLO_POOLBWD_LCL_DATA_HEIGHT = + (static_cast(MLO_POOLBWD_GROUP_SZ1) * MLO_POOLBWD_N_VERT_OUT_PIX + + MLO_POOLING_KERNEL_SZ1 + MLO_POOLING_STRIDE1 - 2) / + MLO_POOLING_STRIDE1; + + std::size_t rv = 0; + const auto nelem = MLO_POOLBWD_LCL_DATA_WIDTH * MLO_POOLBWD_LCL_DATA_HEIGHT; + if(problem.GetPooling().GetMode() == miopenPoolingMax) + { + const auto sizeof_lcl_top_df = sizeof_kernel_FLOAT(problem) * nelem; + const auto sizeof_lcl_mask = sizeof_kernel_index_t(problem) * nelem; + /// \anchor alignment_of_arrays_in_gpu_memory + /// The total amount of 
memory calculated here is slightly less than the amount calculated + /// by the compiler. As a result, the check here may pass, while then the compiler might + /// refuse to build the kernel. The most likely reason for the difference is padding (due to + /// alignment requirements). We don't know exactly how the compiler takes alignment into + /// account, but what can we do is applying an alignment that imposes a slightly tighter + /// constraints than the compiler. So far, 16-byte (4xDWORD) alignment works well. + rv = RoundUpToMultiple(sizeof_lcl_top_df, 16) + RoundUpToMultiple(sizeof_lcl_mask, 16); + } + else + { + const auto sizeof_lcl_top_diff = sizeof_kernel_FLOAT(problem) * nelem; + rv = RoundUpToMultiple(sizeof_lcl_top_diff, 16); + } + MIOPEN_LOG_T(rv); + return rv; +} + +} // namespace + +bool PoolingBackwardCk2d::IsApplicable(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + return problem.GetDirection() == miopen::pooling::Direction::Backward && + (problem.GetPooling().GetMode() == miopenPoolingMax || + problem.GetPooling().GetMode() == miopenPoolingAverage || + problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) && + problem.GetXDesc().GetNumDims() == 4 && problem.GetXDesc().GetLayout("NCHW") == "NHWC" && + problem.GetYDesc().GetLayout("NCHW") == "NHWC" && + sizeof_local_memory(problem) <= TargetProperties::GetMaxLocalMemorySize(); +} + +ConvSolution +PoolingBackwardCk2d::GetSolution(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const kernel_params kp(problem); + + { + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenPoolingBwd.cl"; +// TODO: backwardCk2d kernel + if(problem.GetPooling().GetMode() == miopenPoolingMax) + { + kernel.kernel_name = "mloPoolingMaxBwd"; + } + else if(problem.GetPooling().GetMode() == miopenPoolingAverage || + problem.GetPooling().GetMode() == 
miopenPoolingAverageInclusive) + { + kernel.kernel_name = "mloPoolingAveBwd"; + } + + const int pooling_method = (problem.GetPooling().GetMode() == miopenPoolingMax) + ? MLO_POOLING_OP_MAX + : ((problem.GetPooling().GetMode() == miopenPoolingAverage) + ? MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE); + + const int g_wk_width = ((kp.in_width + kp.grp_tile0 * kp.out_pix_tile0 - 1) / + (kp.grp_tile0 * kp.out_pix_tile0)); + const int g_wk_height = ((kp.in_height + kp.grp_tile1 * kp.out_pix_tile1 - 1) / + (kp.grp_tile1 * kp.out_pix_tile1)); + + const auto build_params = + KernelBuildParameters{ + {"MLO_POOLING_OP_ID", pooling_method}, + {"MLO_POOLING_KERNEL_SZ1", kp.kernel_size_h}, + {"MLO_POOLING_STRIDE1", kp.kernel_stride_h}, + {"MLO_POOLING_KERNEL_SZ0", kp.kernel_size_w}, + {"MLO_POOLING_STRIDE0", kp.kernel_stride_w}, + {"MLO_POOLBWD_N_HORIZ_OUT_PIX", kp.out_pix_tile0}, + {"MLO_POOLBWD_N_VERT_OUT_PIX", kp.out_pix_tile1}, + {"MLO_POOLBWD_GROUP_SZ0", kp.grp_tile0}, + {"MLO_POOLBWD_GROUP_SZ1", kp.grp_tile1}, + {"MLO_POOLING_INDEX_TYPE", + get_pooling_index_type_name(problem.GetPooling().GetIndexType())}, + {"MLO_POOLING_INDEX_MAX", + get_pooling_index_type_max_name(problem.GetPooling().GetIndexType())}, + {"USE_IMG_INDEX", + problem.GetPooling().GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexImage + ? 
1 + : 0}, + } + << GetDataTypeKBP(problem.GetXDesc().GetType()); + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + + kernel.l_wk = {kp.grp_tile0, kp.grp_tile1, 1}; + kernel.g_wk = { + g_wk_width * kp.grp_tile0, g_wk_height * kp.grp_tile1, kp.n_inputs * kp.batch_sz}; + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + if(params.pooling.GetMode() == miopenPoolingMax) + { + kernel(params.dy, + params.dx, + params.workspace, + static_cast(params.pooling.pads[0]), + static_cast(params.pooling.pads[1]), + static_cast(params.dyDesc.GetLengths()[1]), + static_cast(params.dxDesc.GetLengths()[2]), + static_cast(params.dxDesc.GetLengths()[3]), + static_cast(params.dyDesc.GetLengths()[2]), + static_cast(params.dyDesc.GetLengths()[3]), + static_cast(params.dxDesc.GetStrides()[0]), + static_cast(params.dxDesc.GetStrides()[1]), + static_cast(params.dxDesc.GetStrides()[2]), + static_cast(params.dyDesc.GetStrides()[0]), + static_cast(params.dyDesc.GetStrides()[1]), + static_cast(params.dyDesc.GetStrides()[2])); + } + else + { + kernel(params.dy, + params.dx, + static_cast(params.pooling.pads[0]), + static_cast(params.pooling.pads[1]), + static_cast(params.dyDesc.GetLengths()[1]), + static_cast(params.dxDesc.GetLengths()[2]), + static_cast(params.dxDesc.GetLengths()[3]), + static_cast(params.dyDesc.GetLengths()[2]), + static_cast(params.dyDesc.GetLengths()[3]), + static_cast(params.dxDesc.GetStrides()[0]), + static_cast(params.dxDesc.GetStrides()[1]), + static_cast(params.dxDesc.GetStrides()[2]), + static_cast(params.dyDesc.GetStrides()[0]), + static_cast(params.dyDesc.GetStrides()[1]), + static_cast(params.dyDesc.GetStrides()[2])); + } + }; + }; + + return result; +} + +std::size_t 
+PoolingBackwardCk2d::GetWorkspaceSize(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + if(problem.GetPooling().GetMode() != miopenPoolingMax) + return 0; + return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType()); +} + +} // namespace pooling + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/pooling/backwardCkNd.cpp b/src/solver/pooling/backwardCkNd.cpp new file mode 100644 index 0000000000..b8cdbb0286 --- /dev/null +++ b/src/solver/pooling/backwardCkNd.cpp @@ -0,0 +1,273 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +#include +#include +#include +#include + +#define WORKAROUND_ISSUE_MIFIN_80 1 // https://github.com/ROCm/MIFin/issues/80 + +namespace miopen { + +namespace solver { + +namespace pooling { + +bool PoolingBackwardCkNd::IsApplicable(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + return problem.GetDirection() == miopen::pooling::Direction::Backward // + && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // + && (problem.GetXDesc().GetType() == miopenFloat // + || problem.GetXDesc().GetType() == miopenHalf) // + && (problem.GetPooling().GetMode() == miopenPoolingMax // + || problem.GetPooling().GetMode() == miopenPoolingAverage // + || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) // + && ( // + (problem.GetXDesc().GetNumDims() == 5 // + && problem.GetXDesc().GetLayout("NCDHW") == "NDHWC" // + && problem.GetYDesc().GetLayout("NCDHW") == "NDHWC") // + || // + (problem.GetXDesc().GetNumDims() == 4 // + && problem.GetXDesc().GetLayout("NCHW") == "NHWC" // + && problem.GetYDesc().GetLayout("NCHW") == "NHWC") // + ) // + /// \todo This solver does not support workspace index mask mode yet. 
+ && !(problem.GetPooling().GetMode() == miopenPoolingMax // + && problem.GetPooling().GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexMask); +} + +ConvSolution +PoolingBackwardCkNd::GetSolution(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenPoolingBwdND.cl"; + kernel.kernel_name = "mloPoolingND"; +// TODO: backwardCkNd kernel + if(problem.GetPooling().GetMode() == miopenPoolingMax) + { + kernel.kernel_name += "MaxBwd"; + } + else if(problem.GetPooling().GetMode() == miopenPoolingAverage || + problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) + { + kernel.kernel_name += "AveBwd"; + } + + const auto& bot = problem.GetXDesc(); + const auto& top = problem.GetYDesc(); + + std::size_t batch_sz, n_inputs, in_height, in_width; + std::tie(batch_sz, n_inputs, in_height, in_width) = miopen::tien<4>(bot.GetLengths(), 1); + + const int pooling_method = (problem.GetPooling().GetMode() == miopenPoolingMax) + ? MLO_POOLING_OP_MAX + : ((problem.GetPooling().GetMode() == miopenPoolingAverage) + ? MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE); + + int pix_w_per_work = 1; + int pix_h_per_work = 4; + int pix_d_per_work = 2; + + int batch = top.GetLengths()[0]; + int chal = top.GetLengths()[1]; + + const bool is2d = (bot.GetNumDims() == 4); + + int bot_d = is2d ? 
1 : *(bot.GetLengths().rbegin() + 2);
+    int bot_h = *(bot.GetLengths().rbegin() + 1);
+    int bot_w = *(bot.GetLengths().rbegin());
+
+    // One work-item covers a pix_{w,h,d}_per_work tile; round the image up to whole tiles.
+    int pix_blk_w = std::max((bot_w + pix_w_per_work - 1) / pix_w_per_work, 1);
+    int pix_blk_h = std::max((bot_h + pix_h_per_work - 1) / pix_h_per_work, 1);
+    int pix_blk_d = std::max((bot_d + pix_d_per_work - 1) / pix_d_per_work, 1);
+
+    int max_activ_workitem = 65536;
+    int total_work         = batch * chal * pix_blk_w * pix_blk_h * pix_blk_d;
+    int activ_work         = std::min(total_work, max_activ_workitem);
+
+#if WORKAROUND_ISSUE_MIFIN_80
+    const std::size_t wavesize = 64;
+#else
+    // NOTE(review): this branch does not compile as-is -- 'context' is undeclared because
+    // GetSolution's ExecutionContext parameter is unnamed. Name the parameter before
+    // disabling the workaround.
+    const std::size_t wavesize = context.GetStream().GetWavefrontWidth();
+#endif
+    size_t grp_num = (activ_work + wavesize - 1) / wavesize;
+
+    auto strides = problem.GetPooling().strides;
+    auto lens    = problem.GetPooling().lens;
+    auto pads    = problem.GetPooling().pads;
+
+    // Promote 2D parameters to 3D form {D,H,W} with a unit depth dimension.
+    if(is2d)
+    {
+        strides.push_back(strides[1]);
+        strides[1] = strides[0];
+        // NOTE(review): strides[0] still holds the H stride here (unlike lens[0]/pads[0],
+        // which are reset for the unit D dim). Harmless while top_d == 1, but
+        // strides[0] = 1 would be cleaner -- confirm against the kernel's D loop.
+        lens.push_back(lens[1]);
+        lens[1] = lens[0];
+        lens[0] = 1;
+        pads.push_back(pads[1]);
+        pads[1] = pads[0];
+        pads[0] = 0;
+    }
+
+    // Windows overlap whenever any stride is smaller than the window length.
+    bool territory_overlap = false;
+    for(std::size_t i = 0; i < strides.size(); i++)
+        territory_overlap |= (strides[i] < lens[i]);
+
+    const auto build_params =
+        KernelBuildParameters{
+            {"MLO_POOLING_OP_ID", pooling_method},
+            {"MAX_ACTIV_WORKITEM", max_activ_workitem},
+            {"MLO_POOLING_GROUP_SZ0", wavesize},
+            {"MLO_POOLING_GROUP_SZ1", 1},
+            {"MLO_POOLING_GROUP_SZ2", 1},
+            {"PIX_W_PER_WORK", pix_w_per_work},
+            {"PIX_H_PER_WORK", pix_h_per_work},
+            {"PIX_D_PER_WORK", pix_d_per_work},
+            {"KERNEL_SZ_D", lens[0]},
+            {"KERNEL_SZ_H", lens[1]},
+            {"KERNEL_SZ_W", lens[2]},
+            {"STRIDE_D", strides[0]},
+            {"STRIDE_H", strides[1]},
+            {"STRIDE_W", strides[2]},
+            {"TERRITORY_OVERLAP", static_cast<int>(territory_overlap)},
+            {"MLO_POOLING_INDEX_TYPE",
+             get_pooling_index_type_name(problem.GetPooling().GetIndexType())},
+            {"MLO_POOLING_INDEX_MAX",
+             get_pooling_index_type_max_name(problem.GetPooling().GetIndexType())},
+        }
+        // NOTE(review): keyed on dY's type; IsApplicable only checks X/Y types -- confirm dY matches.
+        << GetDataTypeKBP(problem.GetDYDesc().GetType());
+
+    kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{});
+
+    kernel.l_wk = {wavesize, 1, 1};
+    kernel.g_wk = {wavesize * grp_num, 1, 1};
+
+    result.construction_params.push_back(kernel);
+
+    const auto top_d = is2d ? 1 : *(top.GetLengths().rbegin() + 2);
+    const auto top_h = *(top.GetLengths().rbegin() + 1);
+    const auto top_w = *(top.GetLengths().rbegin());
+
+    // N/C/D/H strides in the 3D-kernel view; for 2D the H stride doubles as the
+    // D stride of the unit depth dimension.
+    auto unpackStrides = [is2d](const auto& strides) {
+        return std::make_tuple(strides[0], // N stride
+                               strides[1], // C stride
+                               strides[2], // D stride. Same as H_stride in 3D converted from 2D.
+                               is2d             //
+                                   ? strides[2] // 2D H stride
+                                   : strides[3] // 3D H stride
+        );
+    };
+
+    std::size_t bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride;
+    std::size_t top_n_stride, top_c_stride, top_d_stride, top_h_stride;
+    std::tie(bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride) =
+        unpackStrides(bot.GetStrides());
+    std::tie(top_n_stride, top_c_stride, top_d_stride, top_h_stride) =
+        unpackStrides(top.GetStrides());
+
+    result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::pooling::BwdInvokeParams>();
+
+            // NOTE(review): cast targets below were reconstructed (lost in transit); they
+            // must match the mloPoolingND*Bwd signatures in MIOpenPoolingBwdND.cl.
+            if(params.pooling.GetMode() == miopenPoolingMax)
+            {
+                // Max pooling replays the saved window indices from the workspace.
+                kernel(params.dy,
+                       params.dx,
+                       params.workspace,
+                       static_cast<unsigned>(pads[0]),
+                       static_cast<unsigned>(pads[1]),
+                       static_cast<unsigned>(pads[2]),
+                       static_cast<unsigned>(batch),
+                       static_cast<unsigned>(chal),
+                       static_cast<unsigned>(bot_d),
+                       static_cast<unsigned>(bot_h),
+                       static_cast<unsigned>(bot_w),
+                       static_cast<unsigned>(top_d),
+                       static_cast<unsigned>(top_h),
+                       static_cast<unsigned>(top_w),
+                       static_cast<unsigned>(bot_n_stride),
+                       static_cast<unsigned>(bot_c_stride),
+                       static_cast<unsigned>(bot_d_stride),
+                       static_cast<unsigned>(bot_h_stride),
+                       static_cast<unsigned>(top_n_stride),
+                       static_cast<unsigned>(top_c_stride),
+                       static_cast<unsigned>(top_d_stride),
+                       static_cast<unsigned>(top_h_stride),
+                       static_cast<unsigned>(total_work));
+            }
+            else
+            {
+                // Average modes need no workspace.
+                kernel(params.dy,
+                       params.dx,
+                       static_cast<unsigned>(pads[0]),
static_cast<unsigned>(pads[1]),
+                       static_cast<unsigned>(pads[2]),
+                       static_cast<unsigned>(batch),
+                       static_cast<unsigned>(chal),
+                       static_cast<unsigned>(bot_d),
+                       static_cast<unsigned>(bot_h),
+                       static_cast<unsigned>(bot_w),
+                       static_cast<unsigned>(top_d),
+                       static_cast<unsigned>(top_h),
+                       static_cast<unsigned>(top_w),
+                       static_cast<unsigned>(bot_n_stride),
+                       static_cast<unsigned>(bot_c_stride),
+                       static_cast<unsigned>(bot_d_stride),
+                       static_cast<unsigned>(bot_h_stride),
+                       static_cast<unsigned>(top_n_stride),
+                       static_cast<unsigned>(top_c_stride),
+                       static_cast<unsigned>(top_d_stride),
+                       static_cast<unsigned>(top_h_stride),
+                       static_cast<unsigned>(total_work));
+            }
+        };
+    };
+
+    return result;
+}
+
+/// Max pooling stores one saved index per output element in the workspace;
+/// average modes need no workspace.
+std::size_t
+PoolingBackwardCkNd::GetWorkspaceSize(const ExecutionContext&,
+                                      const miopen::pooling::ProblemDescription& problem) const
+{
+    if(problem.GetPooling().GetMode() != miopenPoolingMax)
+        return 0;
+    return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType());
+}
+
+} // namespace pooling
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/pooling/forward2d.cpp b/src/solver/pooling/forward2d.cpp
index 87fd0e851f..b757c07d54 100644
--- a/src/solver/pooling/forward2d.cpp
+++ b/src/solver/pooling/forward2d.cpp
@@ -135,15 +135,25 @@ std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& pro
 bool PoolingForward2d::IsApplicable(const ExecutionContext& context,
                                     const miopen::pooling::ProblemDescription& problem) const
 {
-    return problem.GetDirection() == miopen::pooling::Direction::Forward &&
-           problem.GetXDesc().GetNumDims() == 4 &&
-           problem.GetXDesc().GetType() == problem.GetYDesc().GetType() &&
-           (problem.GetXDesc().GetType() == miopenFloat ||
-            problem.GetXDesc().GetType() == miopenHalf) &&
-           problem.GetXDesc().GetLayout("NCHW") == "NCHW" &&
-           problem.GetYDesc().GetLayout("NCHW") == "NCHW" &&
+    auto x_type = problem.GetXDesc().GetType();
+    auto y_type = problem.GetYDesc().GetType();
+    std::vector<miopenDataType_t> types{miopenFloat, miopenHalf}; // TODO(TRJS): add miopenInt8, miopenFloat8
+
+    auto x_layout = problem.GetXDesc().GetLayout_str();
+    auto y_layout = problem.GetYDesc().GetLayout_str();
+    std::vector<std::string> layouts{"NCHW"};
+
+    bool app =
+        problem.GetDirection() == miopen::pooling::Direction::Forward &&
+        problem.GetXDesc().GetNumDims() == 4 &&
+        (x_type == y_type) &&                                                        //
+        (x_layout == y_layout) &&                                                    //
+        (std::find(types.cbegin(), types.cend(), x_type) != types.cend()) &&         //
+        (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.cend()) && //
            sizeof_private_memory(problem) <=
                TargetProperties::GetMaxWaveScratchSize() / context.GetStream().GetWavefrontWidth();
+
+    return app;
 }
 
 ConvSolution PoolingForward2d::GetSolution(const ExecutionContext&,
diff --git a/src/solver/pooling/forwardCk2d.cpp b/src/solver/pooling/forwardCk2d.cpp
new file mode 100644
index 0000000000..7ac0c15170
--- /dev/null
+++ b/src/solver/pooling/forwardCk2d.cpp
@@ -0,0 +1,268 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include + +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace pooling { + +namespace { + +struct kernel_params +{ + int kernel_size_h; + int kernel_size_w; + int kernel_stride_h; + int kernel_stride_w; + int out_height; + int out_width; + int out_pix_tile0; + int out_pix_tile1; + + kernel_params(const miopen::pooling::ProblemDescription& p) + { + const auto& pd = p.GetPooling(); + const auto& yd = p.GetYDesc(); + kernel_size_h = pd.lens[0]; + kernel_size_w = pd.lens[1]; + kernel_stride_h = pd.strides[0]; + kernel_stride_w = pd.strides[1]; + out_height = yd.GetLengths()[2]; + out_width = yd.GetLengths()[3]; + out_pix_tile0 = 1; + out_pix_tile1 = out_height <= 8 ? 1 // + : out_height <= 32 ? 
4 // + : 8; + if(out_height > 16 && out_height % 32 > 16) + out_pix_tile1 = std::min(16, std::max(1, prePow2(out_pix_tile1 * kernel_stride_h))); + } +}; + +std::size_t sizeof_kernel_FLOAT(const miopen::pooling::ProblemDescription& problem) +{ + const auto datatype = problem.GetXDesc().GetType(); + return get_data_size(datatype); +} + +std::size_t sizeof_kernel_FLOAT_ACCUM(const miopen::pooling::ProblemDescription& problem) +{ + const auto datatype = problem.GetXDesc().GetType(); + if(datatype == miopenHalf) + return get_data_size(miopenFloat); // mixed precision + return get_data_size(datatype); +} + +inline std::size_t RoundUpToMultiple(std::size_t v, std::size_t m) +{ + assert(m > 0); + return ((v + m - 1) / m) * m; +} + +// Compute amount of private memory required for holding the arrays defined +// in the "mloPoolingG" kernel: +// +// #define MLO_BOT_DATA_SZ0 +// ((MLO_POOLING_N_HORIZ_OUT_PIX - 1) * MLO_POOLING_STRIDE0 + MLO_POOLING_KERNEL_SZ0) +// +// #define MLO_BOT_DATA_SZ1 +// ((MLO_POOLING_N_VERT_OUT_PIX - 1) * MLO_POOLING_STRIDE1 + MLO_POOLING_KERNEL_SZ1) +// +// _FLOAT bot_data[MLO_BOT_DATA_SZ1][MLO_BOT_DATA_SZ0]; +// _FLOAT_ACCUM res[MLO_POOLING_N_VERT_OUT_PIX][MLO_POOLING_N_HORIZ_OUT_PIX]; +// +std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& problem) +{ + const kernel_params kp(problem); + + // aliases to ease programming + const auto& MLO_POOLING_KERNEL_SZ1 = kp.kernel_size_h; + const auto& MLO_POOLING_STRIDE1 = kp.kernel_stride_h; + const auto& MLO_POOLING_KERNEL_SZ0 = kp.kernel_size_w; + const auto& MLO_POOLING_STRIDE0 = kp.kernel_stride_w; + const auto& MLO_POOLING_N_HORIZ_OUT_PIX = kp.out_pix_tile0; + const auto& MLO_POOLING_N_VERT_OUT_PIX = kp.out_pix_tile1; + + const auto MLO_BOT_DATA_SZ0 = + (static_cast(MLO_POOLING_N_HORIZ_OUT_PIX) - 1) * MLO_POOLING_STRIDE0 + + MLO_POOLING_KERNEL_SZ0; + const auto MLO_BOT_DATA_SZ1 = + (static_cast(MLO_POOLING_N_VERT_OUT_PIX) - 1) * MLO_POOLING_STRIDE1 + + MLO_POOLING_KERNEL_SZ1; + 
+ const auto sizeof_bot_data = sizeof_kernel_FLOAT(problem) * MLO_BOT_DATA_SZ1 * MLO_BOT_DATA_SZ0; + const auto sizeof_res = sizeof_kernel_FLOAT_ACCUM(problem) * MLO_POOLING_N_VERT_OUT_PIX * + MLO_POOLING_N_HORIZ_OUT_PIX; + + MIOPEN_LOG_T("sizeof_bot_data " << sizeof_bot_data << "sizeof_res" << sizeof_res); + + /// \ref alignment_of_arrays_in_gpu_memory + return RoundUpToMultiple(sizeof_bot_data, 16) + RoundUpToMultiple(sizeof_res, 16); +} + +} // namespace + +bool PoolingForwardCk2d::IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const +{ + return false; // TRJS not active yet + return problem.GetDirection() == miopen::pooling::Direction::Forward && + problem.GetXDesc().GetNumDims() == 4 && + problem.GetXDesc().GetType() == problem.GetYDesc().GetType() && + (problem.GetXDesc().GetType() == miopenFloat || + problem.GetXDesc().GetType() == miopenHalf) && + problem.GetXDesc().GetLayout("NCHW") == "NHWC" && + problem.GetYDesc().GetLayout("NCHW") == "NHWC" && + sizeof_private_memory(problem) <= + TargetProperties::GetMaxWaveScratchSize() / context.GetStream().GetWavefrontWidth(); +} + +ConvSolution PoolingForwardCk2d::GetSolution(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + { + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenPooling.cl"; + kernel.kernel_name = "mloPoolingG"; +// TODO: forwardCk2d kernel + const kernel_params kp(problem); + + int batch_sz, n_outputs; + std::tie(batch_sz, n_outputs, std::ignore, std::ignore) = + miopen::tien<4>(problem.GetYDesc().GetLengths(), 1); + + const auto& pool_d = problem.GetPooling(); + const auto wsp_index = pool_d.GetWorkspaceIndexMode(); + + int grp_tile0 = kp.out_width <= 8 ? 8 : (kp.out_width % 32 <= 16 ? 16 : 32); + int grp_tile1 = kp.out_height <= 8 ? 8 + : kp.out_height < 16 ? 16 + : kp.out_height <= 32 ? 32 + : kp.out_height <= 64 ? 
64 + : 128; + grp_tile1 /= kp.out_pix_tile1; + while(grp_tile0 * grp_tile1 > 256 && grp_tile0 > 1) + grp_tile0 >>= 1; + + int pooling_method = + (pool_d.GetMode() == miopenPoolingMax) + ? MLO_POOLING_OP_MAX + : ((pool_d.GetMode() == miopenPoolingAverage) ? MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE); + + auto build_params = KernelBuildParameters{ + {"MLO_POOLING_OP_ID", pooling_method}, + {"MLO_POOLING_KERNEL_SZ1", kp.kernel_size_h}, + {"MLO_POOLING_STRIDE1", kp.kernel_stride_h}, + {"MLO_POOLING_KERNEL_SZ0", kp.kernel_size_w}, + {"MLO_POOLING_STRIDE0", kp.kernel_stride_w}, + {"MLO_POOLING_N_HORIZ_OUT_PIX", kp.out_pix_tile0}, + {"MLO_POOLING_N_VERT_OUT_PIX", kp.out_pix_tile1}, + {"MLO_POOLING_GROUP_SZ0", grp_tile0}, + {"MLO_POOLING_GROUP_SZ1", grp_tile1}, + {"MLO_POOLING_INDEX_TYPE", get_pooling_index_type_name(pool_d.GetIndexType())}, + {"MLO_POOLING_INDEX_MAX", get_pooling_index_type_max_name(pool_d.GetIndexType())}, + }; + + if(problem.SaveIndex()) + { + build_params << KernelBuildParameters{ + {"MLO_POOLING_SAVE_INDEX"}, + {"USE_IMG_INDEX", (wsp_index == miopenPoolingWorkspaceIndexImage ? 
1 : 0)},
+            };
+        }
+
+        build_params << GetDataTypeKBP(problem.GetXDesc().GetType());
+
+        kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{});
+
+        kernel.l_wk.push_back(grp_tile0);
+        kernel.l_wk.push_back(grp_tile1);
+        kernel.l_wk.push_back(1);
+
+        // Grid covers the output in tiles of out_pix_tile0 x out_pix_tile1 pixels per
+        // work-item, with one grid slice per (n, c) pair.
+        int g_wk_width =
+            ((kp.out_width + grp_tile0 * kp.out_pix_tile0 - 1) / (grp_tile0 * kp.out_pix_tile0));
+        int g_wk_height =
+            ((kp.out_height + grp_tile1 * kp.out_pix_tile1 - 1) / (grp_tile1 * kp.out_pix_tile1));
+
+        kernel.g_wk.push_back(static_cast<std::size_t>(g_wk_width) * grp_tile0);
+        kernel.g_wk.push_back(static_cast<std::size_t>(g_wk_height) * grp_tile1);
+        kernel.g_wk.push_back(static_cast<std::size_t>(n_outputs) * batch_sz);
+
+        result.construction_params.push_back(kernel);
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::pooling::FwdInvokeParams>();
+
+            // NOTE(review): cast targets below were reconstructed (lost in transit); they
+            // must match the mloPoolingG signature in MIOpenPooling.cl -- confirm int vs unsigned.
+            kernel(params.x,
+                   params.y,
+                   params.workspace,
+                   static_cast<int>(params.pooling.pads[0]),
+                   static_cast<int>(params.pooling.pads[1]),
+                   static_cast<int>(params.xDesc.GetLengths()[1]),
+                   static_cast<int>(params.xDesc.GetLengths()[2]),
+                   static_cast<int>(params.xDesc.GetLengths()[3]),
+                   static_cast<int>(params.yDesc.GetLengths()[2]),
+                   static_cast<int>(params.yDesc.GetLengths()[3]),
+                   static_cast<int>(params.xDesc.GetStrides()[0]),
+                   static_cast<int>(params.xDesc.GetStrides()[1]),
+                   static_cast<int>(params.xDesc.GetStrides()[2]),
+                   static_cast<int>(params.yDesc.GetStrides()[0]),
+                   static_cast<int>(params.yDesc.GetStrides()[1]),
+                   static_cast<int>(params.yDesc.GetStrides()[2]));
+        };
+    };
+
+    return result;
+}
+
+/// Max pooling stores one saved index per output element in the workspace;
+/// average modes need no workspace.
+std::size_t
+PoolingForwardCk2d::GetWorkspaceSize(const ExecutionContext&,
+                                     const miopen::pooling::ProblemDescription& problem) const
+{
+    if(problem.GetPooling().GetMode() != miopenPoolingMax)
+        return 0;
+    return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType());
+}
+
+} // namespace pooling
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/pooling/forwardCkNd.cpp b/src/solver/pooling/forwardCkNd.cpp new file mode 100644 index 0000000000..a4f2f781c3 --- /dev/null +++ b/src/solver/pooling/forwardCkNd.cpp @@ -0,0 +1,264 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace pooling { + +namespace { + +constexpr int top_w_per_work = 1; +constexpr int top_h_per_work = 4; +constexpr int top_d_per_work = 2; + +struct kernel_params +{ + uint32_t stride_d; + uint32_t stride_h; + uint32_t stride_w; + uint32_t kernel_sz_d; + uint32_t kernel_sz_h; + uint32_t kernel_sz_w; + + kernel_params(const miopen::pooling::ProblemDescription& p) + { + const auto& pd = p.GetPooling(); + stride_d = pd.strides[0]; + stride_h = pd.strides[1]; + stride_w = pd.strides[2]; + kernel_sz_d = pd.lens[0]; + kernel_sz_h = pd.lens[1]; + kernel_sz_w = pd.lens[2]; + } +}; + +std::size_t sizeof_kernel_FLOAT(const miopen::pooling::ProblemDescription& problem) +{ + const auto datatype = problem.GetXDesc().GetType(); + return get_data_size(datatype); +} + +inline std::size_t RoundUpToMultiple(std::size_t v, std::size_t m) +{ + assert(m > 0); + return ((v + m - 1) / m) * m; +} + +// Compute amount of private memory required for holding the arrays defined +// in the "mloPoolingNDFwd" kernel: +// +// #define BOT_TILE_W ((TOP_W_PER_WORK - 1) * STRIDE_W + KERNEL_SZ_W) +// #define BOT_TILE_H ((TOP_H_PER_WORK - 1) * STRIDE_H + KERNEL_SZ_H) +// #define BOT_TILE_D ((TOP_D_PER_WORK - 1) * STRIDE_D + KERNEL_SZ_D) +// +// _FLOAT bot_data[BOT_TILE_D][BOT_TILE_H][BOT_TILE_W]; +// +std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& problem) +{ + const kernel_params kp(problem); + + const std::size_t bot_tile_w = ((top_w_per_work - 1) * kp.stride_w + kp.kernel_sz_w); + const std::size_t bot_tile_h = ((top_h_per_work - 1) * kp.stride_h + kp.kernel_sz_h); + const std::size_t bot_tile_d = ((top_d_per_work - 1) * kp.stride_d + kp.kernel_sz_d); + + const auto sizeof_bot_data = + sizeof_kernel_FLOAT(problem) * bot_tile_d * bot_tile_h * bot_tile_w; + 
MIOPEN_LOG_T("sizeof_bot_data " << sizeof_bot_data); + + /// \ref alignment_of_arrays_in_gpu_memory + return RoundUpToMultiple(sizeof_bot_data, 16); +} + +} // namespace + +bool PoolingForwardCkNd::IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const +{ + + return problem.GetDirection() == miopen::pooling::Direction::Forward // + && problem.GetXDesc().GetNumDims() == 5 // + && problem.GetXDesc().GetLayout("NCDHW") == "NDHWC" // + && problem.GetYDesc().GetLayout("NCDHW") == "NDHWC" // + && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // + && (problem.GetXDesc().GetType() == miopenFloat // + || problem.GetXDesc().GetType() == miopenHalf) // + && (problem.GetPooling().GetMode() == miopenPoolingMax // + || problem.GetPooling().GetMode() == miopenPoolingAverage // + || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) // + && sizeof_private_memory(problem) <= TargetProperties::GetMaxWaveScratchSize() // + / context.GetStream().GetWavefrontWidth() // + /// \todo This solver does not support workspace index mask mode yet. 
+ && + !(problem.GetPooling().GetMode() == miopenPoolingMax // + && problem.GetPooling().GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexMask // + && problem.SaveIndex() == true); +} + +ConvSolution PoolingForwardCkNd::GetSolution(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const int batch = problem.GetXDesc().GetLengths()[0]; + const int chal = problem.GetXDesc().GetLengths()[1]; + + const kernel_params kp(problem); + + const int top_d = *(problem.GetYDesc().GetLengths().rbegin() + 2); + const int top_h = *(problem.GetYDesc().GetLengths().rbegin() + 1); + const int top_w = *(problem.GetYDesc().GetLengths().rbegin()); + + const int top_blk_w = std::max((top_w + top_w_per_work - 1) / top_w_per_work, 1); + const int top_blk_h = std::max((top_h + top_h_per_work - 1) / top_h_per_work, 1); + const int top_blk_d = std::max((top_d + top_d_per_work - 1) / top_d_per_work, 1); + + const int max_activ_workitem = 65536; + const int total_work = batch * chal * top_blk_w * top_blk_h * top_blk_d; + const int activ_work = std::min(total_work, max_activ_workitem); + + { + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenPoolingND.cl"; + kernel.kernel_name = "mloPoolingNDFwd"; +// TODO: forwardCkNd kernel + int pooling_method = (problem.GetPooling().mode == miopenPoolingMax) + ? MLO_POOLING_OP_MAX + : ((problem.GetPooling().mode == miopenPoolingAverage) + ? 
MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE); + + const size_t lcl_work = 64; + const size_t grp_num = (activ_work + lcl_work - 1) / lcl_work; + + auto build_params = KernelBuildParameters{ + {"MLO_POOLING_OP_ID", static_cast(pooling_method)}, + {"MAX_ACTIV_WORKITEM", static_cast(max_activ_workitem)}, + {"MLO_POOLING_GROUP_SZ0", static_cast(lcl_work)}, + {"MLO_POOLING_GROUP_SZ1", 1}, + {"MLO_POOLING_GROUP_SZ2", 1}, + {"TOP_W_PER_WORK", top_w_per_work}, + {"TOP_H_PER_WORK", top_h_per_work}, + {"TOP_D_PER_WORK", top_d_per_work}, + {"KERNEL_SZ_D", kp.kernel_sz_d}, + {"KERNEL_SZ_H", kp.kernel_sz_h}, + {"KERNEL_SZ_W", kp.kernel_sz_w}, + {"STRIDE_D", kp.stride_d}, + {"STRIDE_H", kp.stride_h}, + {"STRIDE_W", kp.stride_w}, + {"MLO_POOLING_INDEX_TYPE", + get_pooling_index_type_name(problem.GetPooling().GetIndexType())}, + {"MLO_POOLING_INDEX_MAX", + get_pooling_index_type_max_name(problem.GetPooling().GetIndexType())}, + }; + + if(problem.SaveIndex()) + { + build_params << KernelBuildParameters{ + {"MLO_POOLING_SAVE_INDEX"}, + }; + } + + build_params << GetDataTypeKBP(problem.GetXDesc().GetType()); + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + + kernel.l_wk = {lcl_work, 1, 1}; + kernel.g_wk = {lcl_work * grp_num, 1, 1}; + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + const int batch_ = params.xDesc.GetLengths()[0]; + const int chal_ = params.xDesc.GetLengths()[1]; + + const int top_d_ = *(params.yDesc.GetLengths().rbegin() + 2); + const int top_h_ = *(params.yDesc.GetLengths().rbegin() + 1); + const int top_w_ = *(params.yDesc.GetLengths().rbegin()); + + const int top_blk_w_ = std::max((top_w_ + top_w_per_work - 1) / top_w_per_work, 1); + const int top_blk_h_ = std::max((top_h_ + top_h_per_work - 1) 
/ top_h_per_work, 1); + const int top_blk_d_ = std::max((top_d_ + top_d_per_work - 1) / top_d_per_work, 1); + + const int total_work_ = batch_ * chal_ * top_blk_w_ * top_blk_h_ * top_blk_d_; + + kernel(params.x, + params.y, + params.workspace, + static_cast(params.pooling.pads[0]), + static_cast(params.pooling.pads[1]), + static_cast(params.pooling.pads[2]), + static_cast(batch_), + static_cast(chal_), + static_cast(params.xDesc.GetLengths()[2]), + static_cast(params.xDesc.GetLengths()[3]), + static_cast(params.xDesc.GetLengths()[4]), + static_cast(top_d_), + static_cast(top_h_), + static_cast(top_w_), + static_cast(params.xDesc.GetStrides()[0]), + static_cast(params.xDesc.GetStrides()[1]), + static_cast(params.xDesc.GetStrides()[2]), + static_cast(params.xDesc.GetStrides()[3]), + static_cast(params.yDesc.GetStrides()[0]), + static_cast(params.yDesc.GetStrides()[1]), + static_cast(params.yDesc.GetStrides()[2]), + static_cast(params.yDesc.GetStrides()[3]), + static_cast(total_work_)); + }; + }; + + return result; +} + +std::size_t +PoolingForwardCkNd::GetWorkspaceSize(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + if(problem.GetPooling().GetMode() != miopenPoolingMax) + return 0; + return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType()); +} + +} // namespace pooling + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/pooling/forwardNaive.cpp b/src/solver/pooling/forwardNaive.cpp index 43406cd508..5bc4835ff5 100644 --- a/src/solver/pooling/forwardNaive.cpp +++ b/src/solver/pooling/forwardNaive.cpp @@ -68,22 +68,22 @@ inline uint32_t RoundUpNearestPower2Positive(uint32_t v) bool PoolingForwardNaive::IsApplicable(const ExecutionContext&, const miopen::pooling::ProblemDescription& problem) const { - return problem.GetDirection() == miopen::pooling::Direction::Forward // - && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // - && 
(problem.GetXDesc().GetType() == miopenFloat // - || problem.GetXDesc().GetType() == miopenHalf) // - && (problem.GetPooling().GetMode() == miopenPoolingMax // - || problem.GetPooling().GetMode() == miopenPoolingAverage // - || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) // - && ( // - (problem.GetXDesc().GetNumDims() == 5 // - && problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" // - && problem.GetYDesc().GetLayout("NCDHW") == "NCDHW") // - || // - (problem.GetXDesc().GetNumDims() == 4 // - && problem.GetXDesc().GetLayout("NCHW") == "NCHW" // - && problem.GetYDesc().GetLayout("NCHW") == "NCHW") // - ); + auto x_type = problem.GetXDesc().GetType(); + auto y_type = problem.GetYDesc().GetType(); + std::vector types {miopenFloat, miopenHalf}; + + auto mode = problem.GetPooling().GetMode(); + std::vector modes {miopenPoolingMax, miopenPoolingAverage, miopenPoolingAverageInclusive}; + + auto x_layout = problem.GetXDesc().GetLayout_str(); + auto y_layout = problem.GetYDesc().GetLayout_str(); + std::vector layouts {"NCHW", "NCDHW"}; + + return (problem.GetDirection() == miopen::pooling::Direction::Forward) // + && (x_type == y_type) // + && (std::find(types.cbegin(), types.cend(), x_type) != types.cend()) // + && (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend()) // + && (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()); } ConvSolution @@ -95,6 +95,7 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, const auto bot = problem.GetXDesc(); const auto top = problem.GetYDesc(); const bool is2d = (bot.GetNumDims() == 4); + const bool isTranspose = problem.GetXDesc().GetLayout_str()[1] != 'C'; // TODO TRJS create member func // To compact code: const auto& pooling = problem.GetPooling(); @@ -158,7 +159,8 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, const size_t mask_c_stride = static_cast(mask_d_stride) * top_d; const size_t mask_n_stride = mask_c_stride * all_c; - /// About optimal 
grid size. The simplest way is to map the problem onto grid is 1:1 mapping of + /// About optimal grid size: + /// NC[D]HW: The simplest way is to map the problem onto grid is 1:1 mapping of /// N,C and top.D onto grid dimensions. /// /// However, this would waste 1 dimension of grid for 2D convolutions, i.e. the grid size would @@ -167,6 +169,10 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, /// access memory in a scattered way, which would affect performance again. Current design /// choice is using separate 2D and 3D kernels (via build-time parameter) and N*C*H grid for 2D. /// + /// N[D]HWC: top N, D, H are mapped directly onto grid dimensions + /// + /// + /// /// \anchor naive_pooling_max_grid_size /// * Assumption: Max grid size is >= 2^32-1 (4G-1) i.e. std::max. /// Currently this limitation is valid for both ROCm HIP and OCL runtimes. @@ -195,8 +201,8 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, const auto is2d_kernel = (top_d == 1); // For 2D + optimize for 3D where the 1st dim is 1. const auto g0 = RoundUpNearestPower2Positive(all_n); - const auto g1 = RoundUpNearestPower2Positive(all_c); - const auto g2 = RoundUpNearestPower2Positive(is2d_kernel ? top_h : top_d); + const auto g1 = RoundUpNearestPower2Positive(isTranspose ? top_d : all_c); + const auto g2 = RoundUpNearestPower2Positive(isTranspose || is2d_kernel ? top_h : top_d); auto work_left = wavesize / 1; const auto w0 = (g0 < work_left) ? 
g0 : work_left; @@ -217,6 +223,7 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, {"MLO_POOLING_INDEX_TYPE", get_pooling_index_type_name(index_type)}, {"MLO_POOLING_IS2D_KERNEL", static_cast(is2d_kernel)}, }; + build_params << GetDataTypeKBP(bot.GetType()); kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); @@ -243,44 +250,45 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, decltype(auto) params = raw_params.CastTo(); kernel(params.x, - params.y, - params.workspace, - save_index, - index_mode, - filter_d, - filter_h, - filter_w, - filter_d_stride, - filter_h_stride, - filter_w_stride, - filter_d_pad, - filter_h_pad, - filter_w_pad, - all_n, - all_c, - bot_d, - bot_h, - bot_w, - bot_n_stride, - bot_c_stride, - bot_d_stride, - bot_h_stride, - bot_w_stride, - top_d, - top_h, - top_w, - top_n_stride, - top_c_stride, - top_d_stride, - top_h_stride, - top_w_stride, - mask_n_stride, - mask_c_stride, - mask_d_stride, - mask_h_stride, - mask_w_stride); + params.y, + params.workspace, + save_index, + index_mode, + filter_d, + filter_h, + filter_w, + filter_d_stride, + filter_h_stride, + filter_w_stride, + filter_d_pad, + filter_h_pad, + filter_w_pad, + all_n, + all_c, + bot_d, + bot_h, + bot_w, + bot_n_stride, + bot_c_stride, + bot_d_stride, + bot_h_stride, + bot_w_stride, + top_d, + top_h, + top_w, + top_n_stride, + top_c_stride, + top_d_stride, + top_h_stride, + top_w_stride, + mask_n_stride, + mask_c_stride, + mask_d_stride, + mask_h_stride, + mask_w_stride); }; }; + return result; } diff --git a/src/solver/pooling/forwardNdNhwcNaive.cpp b/src/solver/pooling/forwardNdNhwcNaive.cpp new file mode 100644 index 0000000000..86555d9560 --- /dev/null +++ b/src/solver/pooling/forwardNdNhwcNaive.cpp @@ -0,0 +1,316 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +#include + +#define WORKAROUND_ISSUE_MIFIN_80 1 // https://github.com/ROCm/MIFin/issues/80 + +namespace miopen { + +namespace solver { + +namespace pooling { + +namespace { + +#if !MIOPEN_NDEBUG && !WORKAROUND_ISSUE_MIFIN_80 +template +bool IsPower2(T v) +{ + return (v != 0) && ((v & (v - 1)) == 0); +} +#endif + +} // namespace + +bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + auto x_type = problem.GetXDesc().GetType(); + auto y_type = problem.GetYDesc().GetType(); + std::vector types {miopenFloat, miopenHalf, miopenInt8, miopenFloat8, miopenBFloat16}; // + + auto mode = problem.GetPooling().GetMode(); + std::vector modes {miopenPoolingMax, miopenPoolingAverage, miopenPoolingAverageInclusive}; + + auto x_layout = problem.GetXDesc().GetLayout_str(); + auto y_layout = problem.GetYDesc().GetLayout_str(); + std::vector layouts {"NHWC", "NDHWC"}; + + bool app = (problem.GetDirection() == miopen::pooling::Direction::Forward) // + && (x_type == y_type) // + && (x_layout == y_layout) // + && (std::find(types.cbegin(), types.cend(), x_type) != types.cend()) // + && (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend()) //) + && (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()); + + return app; +} + +ConvSolution +PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + poolingNdNhwcArgs args; + + auto input_dtype = miopen::GetDataType(problem.GetXDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetYDesc().GetType()); + + const auto bot = problem.GetXDesc(); + const auto top = problem.GetYDesc(); + const bool is2d = (bot.GetNumDims() == 4); + const bool is_transpose = 
problem.GetXDesc().GetLayout_str()[1] != 'C'; + if(!is_transpose) + { + MIOPEN_THROW("Tried to run NHWC solver on NCHW data"); + } + + // To compact code: + const auto& pooling = problem.GetPooling(); + const auto& lengths = pooling.GetLengths(); + const auto& strides = pooling.GetStrides(); + const auto& pads = pooling.GetPads(); + + // This also deduces 3D (DHW) parameters from 2D (HW) descriptor. + uint32_t idx = 0; + args.filter_d = is2d ? 1 : lengths[idx++]; + args.filter_h = lengths[idx++]; + args.filter_w = lengths[idx++]; + + idx = 0; + args.filter_d_stride = is2d ? (strides[0]) : strides[idx++]; + args.filter_h_stride = strides[idx++]; + args.filter_w_stride = strides[idx++]; + + idx = 0; + args.filter_d_pad = is2d ? 0 : pads[idx++]; + args.filter_h_pad = pads[idx++]; + args.filter_w_pad = pads[idx++]; + + // TODO RJS move pooling_method to shared code + const int pooling_method = (pooling.GetMode() == miopenPoolingMax) ? MLO_POOLING_OP_MAX + : (pooling.GetMode() == miopenPoolingAverage) + ? MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE; + + const auto save_index = problem.SaveIndex(); + const auto index_mode = pooling.GetWorkspaceIndexMode(); + const auto index_type = pooling.GetIndexType(); + + /// \anchor multiply_dims_overflow_assumption + /// + /// Preventing overflow during dimension-related computations: + /// Let's assume that multiplication of three dims always fits into 32 bits (unsigned). + /// Then let's use size_t when we need to multiply more than three dims. + /// For example, in NCDHW layout, the N and C strides are results of multiplication + /// of >= 3 dims, so we have to use size_t for storing them. + /// + /// We need to pay special attention to muls of D stride with some other dims. + /// The D stride is a result of 2 muls. Therefore (d_stride * dim) does + /// not require widening to size_t prior mul, but (d_stride * dim * dim) + /// requires it because the total number of muls is 4. + + const auto spatial_dim = is2d ? 
2U : 3U; + + std::tie(args.all_n, args.all_c, args.bot_d, args.bot_h, args.bot_w) = miopen::GetNCDHW(spatial_dim, bot.GetLengths()); + + std::tie(args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride) = + miopen::GetNCDHW(spatial_dim, bot.GetStrides()); + + std::tie(std::ignore, std::ignore, args.top_d, args.top_h, args.top_w) = + miopen::GetNCDHW(spatial_dim, top.GetLengths()); + + std::tie(args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride,args. top_w_stride) = + miopen::GetNCDHW(spatial_dim, top.GetStrides()); + + // Mask data is always NCDHW layout + args.mask_w_stride = 1; + args.mask_h_stride = args.mask_w_stride * args.top_w; + args.mask_d_stride = args.mask_h_stride * args.top_h; + args.mask_c_stride = args.mask_d_stride * args.top_d; + args.mask_n_stride = args.mask_c_stride * args.all_c; + + /// About optimal grid size: + /// top D, H, and W are mapped directly onto grid dimensions, except in very small problems + /// when they are combined into workgroup items in an attempt to improve overlapping and coalescense. + /// N seems to be generally small, so we'll multiply it into the 'D' dimension. + /// + /// \anchor naive_pooling_max_grid_size + /// * Assumption: Max grid size is >= 2^32-1 (4G-1) i.e. std::max. + /// However, assume the product of two dimensions is always <= 2^30. + /// Currently this limitation is valid for both ROCm HIP and OCL runtimes. + /// + /// Selecting the optimal workgroup size is an interesting problem. + /// We'll first map N * D to blockIdx.x. H and W are canonically mapped into + /// blockIdx.y and z, respectively. C, being the fastest index, is mapped + /// into threadIdx.x up to the maximum items. For larger C, the remainder are + /// mapped into blockIdx.z. + /// + /// For small C, we favor more waves over more blocks. W/H are mapped into threadIdx.z/y, + /// in that order, fractionally in powers of 2 if possible, up to a maximum + /// of 256 workitems. 
Finally, any remaining W/H are then mapped onto blockIdx.z/y. + /// + /// The workgroup size does not have the restrictions imposed by synchronization between + /// workitems because the kernel does not require synchronization. + + std::ignore = context; + constexpr uint32_t MAX_THREADS = 512; + constexpr uint32_t LARGE_C_MAX_ITEMS = MAX_THREADS; + constexpr uint32_t SMALL_C_TGT_ITEMS = 256; + + auto nd_ = args.all_n * args.top_d; + auto h_ = args.top_h; + auto w_ = args.top_w; + auto c_ = args.all_c; + + // These are hip-style indexes (not OCL) + uint32_t l1 = 1U; + uint32_t l2 = 1U; + + if(c_ > LARGE_C_MAX_ITEMS) + { + auto c2 = (c_ + LARGE_C_MAX_ITEMS - 1) / LARGE_C_MAX_ITEMS; + c_ = LARGE_C_MAX_ITEMS; + w_ *= c2; + } + else if(c_ <= SMALL_C_TGT_ITEMS / 2) // Small C, remap H and W to increase occupancy + { + if(c_ * w_ < SMALL_C_TGT_ITEMS) + { + std::swap(l2, w_); // full w mapped to threads + } + + while(w_ > 2 && ((c_ * l2) < SMALL_C_TGT_ITEMS)) + { + w_ = (w_ + 1) / 2; // partial w mapped to threads (rounddown-safe) + l2 *= 2; + } + + if(c_ * l2 * h_ < SMALL_C_TGT_ITEMS) + { + std::swap(l1, h_); // full h mapped to threads + } + + while(h_ > 2 && ((c_ * l1 * l2) < SMALL_C_TGT_ITEMS)) + { + h_ = (h_ + 1 ) / 2; // partial h mapped to threads (rounddown-safe) + l1 *= 2; + } + } + + const auto g0 = nd_; + const auto g1 = h_; + const auto g2 = w_; + const auto l0 = c_; + + { + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenPoolingForwardNDNhwcNaive.cpp"; + kernel.kernel_name = "mloPoolingForwardNDNhwcNaive"; + + auto build_params = KernelBuildParameters{ + {"MLO_POOLING_OP_ID", pooling_method}, // We need this at compile time in order to + // engage mixed precision only when necessary. + {"MLO_POOLING_INDEX_TYPE", get_pooling_index_type_name(index_type)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype} + }; + + build_params << GetDataTypeKBP(bot.GetType()); + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + // [Informative] The total number of kernels required to cover the whole + // forward pooling problem space is 3*4*2*2*2 = 96. The solver is dynamic. + // * 3: the number of supported operations + // * 4: the number of supported index types + // * 2: the number of supported data types + // * 2: layout (NCHW vs NHWC) + // * 2: 2D and 3D kernels (optimization) + + kernel.ConfigureHip(l0, l1, l2, g0, g1, g2); + // KernelInfo uses OCL-style indexes + // kernel.l_wk.clear(); + // kernel.l_wk.push_back(l0); + // kernel.l_wk.push_back(l1); + // kernel.l_wk.push_back(l2); + // kernel.g_wk.clear(); + // kernel.g_wk.push_back(g0 * l0); + // kernel.g_wk.push_back(g1 * l1); + // kernel.g_wk.push_back(g2 * l2); + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + kernel( + params.x, + params.y, + params.workspace, + save_index, + index_mode, + args + // args.filter_d, args.filter_h, args.filter_w, + // args.filter_d_stride, args.filter_h_stride, args.filter_w_stride, + // args.filter_d_pad, args.filter_h_pad, args.filter_w_pad, + // args.all_n, + // args.all_c, + // args.bot_d, args.bot_h, args.bot_w, + // args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride, + // args.top_d, args.top_h, args.top_w, + // args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride, args.top_w_stride, + // args.mask_n_stride, args.mask_c_stride, args.mask_d_stride, args.mask_h_stride, args.mask_w_stride + ); + }; + }; + + return result; +} + +std::size_t +PoolingForwardNDNhwcNaive::GetWorkspaceSize(const ExecutionContext&, + const 
miopen::pooling::ProblemDescription& problem) const +{ + if(problem.GetPooling().GetMode() != miopenPoolingMax || !problem.SaveIndex()) + return 0; + return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType()); +} + +} // namespace pooling + +} // namespace solver + +} // namespace miopen diff --git a/src/tensor.cpp b/src/tensor.cpp index 7ec4c4e581..f189dcf8da 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -153,6 +153,14 @@ TensorDescriptor::TensorDescriptor(miopenDataType_t t, { } +TensorDescriptor::TensorDescriptor(miopenDataType_t t, + miopenTensorLayout_t layout_in, + const std::vector& lens_in, + const std::vector& strides_in) + : TensorDescriptor(t, layout_in, ConvertLengthsOrThrow(lens_in, "Lengths must be > 0"), ConvertLengthsOrThrow(strides_in, "Strides must be > 0")) +{ +} + TensorDescriptor::TensorDescriptor(miopenDataType_t t, miopenTensorLayout_t layout_in, const std::initializer_list& lens_in) @@ -446,6 +454,11 @@ std::string TensorDescriptor::GetLayoutStr(miopenTensorLayout_t tensorLayout) std::string TensorDescriptor::GetLayout_str() const { return GetLayoutStr(this->tensorLayout); } +bool TensorDescriptor::IsDefaultLayout() const +{ + return IsDefaultLayout(tensorLayout, lens.size() - 2); +} + std::size_t TensorDescriptor::GetVectorLength() const { return this->vector_length; } std::size_t TensorDescriptor::GetIndex(std::initializer_list l) const diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 434fdfe5df..0211790493 100755 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -192,6 +192,8 @@ message(STATUS "MIOPEN_TEST_COMPOSABLEKERNEL ${MIOPEN_TEST_COMPOSABLEKERNEL}") message(STATUS "MIOPEN_TEST_DISCRETE ${MIOPEN_TEST_DISCRETE}") message(STATUS "MIOPEN_TEST_DBSYNC ${MIOPEN_TEST_DBSYNC}") message(STATUS "CODECOV_TEST ${CODECOV_TEST}") +message(STATUS "CMAKE_CTEST_COMMAND ${CMAKE_CTEST_COMMAND}") +message(STATUS "CMAKE_CFG_INTDIR ${CMAKE_CFG_INTDIR}") if(MIOPEN_TEST_DRIVER_ITER_MODE) 
add_definitions(-DMIOPEN_TEST_DRIVER_MODE=2) diff --git a/test/gtest/ex1.cpp b/test/gtest/ex1.cpp new file mode 100644 index 0000000000..324a3055cb --- /dev/null +++ b/test/gtest/ex1.cpp @@ -0,0 +1,15 @@ +#include +// TODO TRJS delete file +struct paramType { std::string value; }; + +class MyFixture : public testing::TestWithParam {}; +class FixtureA : public MyFixture {}; +class FixtureB : public MyFixture {}; + +TEST_P(FixtureA, TestNameA0) { auto& myParam = GetParam(); EXPECT_GT(myParam.value.size(), 0ULL); } +TEST_P(FixtureB, TestNameA0) { auto& myParam = GetParam(); EXPECT_GT(myParam.value.size(), 0ULL); } + +INSTANTIATE_TEST_SUITE_P(PIN0, FixtureA, testing::Values(paramType{"v00"}, paramType{"v01"})); +INSTANTIATE_TEST_SUITE_P(PIN1, FixtureA, testing::Values(paramType{"v10"}, paramType{"v11"}, paramType{"v12"})); +INSTANTIATE_TEST_SUITE_P(PIN2, FixtureB, testing::Values(paramType{"v00"}, paramType{"v11"})); + diff --git a/test/gtest/poolingFwd2dNaive.cpp b/test/gtest/poolingFwd2dNaive.cpp new file mode 100644 index 0000000000..d09c35302a --- /dev/null +++ b/test/gtest/poolingFwd2dNaive.cpp @@ -0,0 +1,256 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#ifndef POOLING_GTEST_BUILD + +#include +#include +#include "get_handle.hpp" +#include "test_env.hpp" + +#include "pooling_testing.hpp" +#include "pooling2d.hpp" + +#include "tensor_holder.hpp" +#include "miopen/tensor_layout.hpp" + +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLAGS_ARGS) + +namespace env = miopen::env; + +namespace { + +template +struct layout_data +{ + static std::vector get_layout_lengths(int n, int c, std::vector& dims) + { + auto ret = std::vector{n, c}; + ret.insert(ret.end(), dims.cbegin(), dims.cend()); + + return ret; + } + + static std::vector + get_strides(std::vector& lens, int dims, miopenTensorLayout_t tensor_layout) + { + std::vector strides; + std::string layout_default = miopen::tensor_layout_get_default(dims + 2); + std::string layout_string = miopen::TensorDescriptor::GetLayoutStr(tensor_layout); + + miopen::tensor_layout_to_strides(lens, layout_default, layout_string, strides); + + constexpr int min_stride_multiplier = 1; + constexpr int max_stride_multiplier = 4; + + auto c = prng::gen_A_to_B(min_stride_multiplier, max_stride_multiplier); + for(auto& v : strides) + { + // cppcheck-suppress useStlAlgorithm + v = v * c; + } + + return strides; + } + + static miopenTensorDescriptor_t init_tensor_descriptor(miopenDataType_t type, + const std::vector& lens, + const std::vector& strides) + { + miopenTensorDescriptor_t 
desc; + + EXPECT_TRUE(miopenCreateTensorDescriptor(&desc) == miopenStatusSuccess); + EXPECT_TRUE( + miopenSetTensorDescriptor(desc, type, lens.size(), lens.data(), strides.data()) == + miopenStatusSuccess); + + return desc; + } + + layout_data(int _n, std::vector _dims, int _c, miopenTensorLayout_t _tensor_layout) + { + auto lens = get_layout_lengths(_n, _c, _dims); + auto strides = get_strides(lens, _dims.size(), _tensor_layout); + descriptor = miopen::TensorDescriptor{miopen_type{}, lens, strides}; + host = tensor{lens, strides}.generate(gen_value); + } + + ~layout_data() {} + + void read_gpu_data(miopen::Handle& handle, const miopen::Allocator::ManageDataPtr& ddata) + { + check = tensor{descriptor.GetLengths(), descriptor.GetStrides()}; + handle.ReadTo(check.data.data(), ddata, check.data.size()); + } + + tensor check{}; + tensor host; + miopen::TensorDescriptor descriptor; +}; + +} + +class Pooling2d : public testing::TestWithParam> {}; +class Pooling2dInt8 : public Pooling2d {}; +class Pooling2dFloat : public Pooling2d {}; +class Pooling2dHalf : public Pooling2d {}; +class Pooling2dBF16 : public Pooling2d {}; +class Pooling2dF8 : public Pooling2d {}; + +void Run2dDriver(miopenDataType_t prec); + +namespace { + +static bool SkipTest(void) { return env::disabled(MIOPEN_TEST_ALL); } + +void GetArgs(const std::string& param, std::vector& tokens) +{ + std::stringstream ss(param); + std::istream_iterator begin(ss); + std::istream_iterator end; + while(begin != end) + tokens.push_back(*begin++); +} + +bool IsTestSupportedForDevice(const miopen::Handle& handle) { return true; } + +std::vector Get2dTestCases(const std::string precision) +{ + const auto& flag_arg = env::value(MIOPEN_TEST_FLAGS_ARGS); + + const std::vector test_cases = { + // clang-format off + {"test_pooling2d " + precision + " --all --dataset 0 --limit 0 " + flag_arg}, + {"test_pooling2d " + precision + " --all --dataset 1 --limit 0 " + flag_arg}, + {"test_pooling2d " + precision + " --all --dataset 2 
--limit 0 " + flag_arg} + // clang-format on + }; + + return test_cases; +} +} // namespace pooling_tests + +TEST_P(Pooling2dInt8, NNT) +{ + if(!IsTestRunWith("--int8")) std::cout << "WOULD SKIP BECAUSE NOT INT8!" << std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--int8") TRJS + GTEST_SKIP(); + + Run2dDriver(miopenInt8); +}; + +TEST_P(Pooling2dFloat, NNT) +{ + if(!IsTestRunWith("--float")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT!" << std::endl; + + if(SkipTest() || !IsTestSupportedForDevice(get_handle())) // && IsTestRunWith("--float") TRJS + GTEST_SKIP(); + + Run2dDriver(miopenFloat); +}; + +TEST_P(Pooling2dHalf, NNT) +{ + if(!IsTestRunWith("--half")) std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--half") TRJS + GTEST_SKIP(); + + Run2dDriver(miopenHalf); +}; + +TEST_P(Pooling2dBF16, NNT) +{ + if(!IsTestRunWith("--bfloat16")) std::cout << "WOULD SKIP BECAUSE NOT BFLOAT16!" << std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--bfloat16") TRJS + GTEST_SKIP(); + + Run2dDriver(miopenBFloat16); +}; + +TEST_P(Pooling2dF8, NNT) +{ + if(!IsTestRunWith("--float8")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT8!" 
<< std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--float8") TRJS + GTEST_SKIP(); + + Run2dDriver(miopenFloat8); +}; + +void Run2dDriver(miopenDataType_t prec) +{ + auto cases = Get2dTestCases("--float"); + + std::vector params; + switch(prec) + { + case miopenFloat: params = Pooling2dFloat_NNT_Test::GetParam(); break; + case miopenHalf: params = Pooling2dHalf_NNT_Test::GetParam(); break; + case miopenBFloat16: params = Pooling2dBF16_NNT_Test::GetParam(); break; + case miopenInt8: params = Pooling2dInt8_NNT_Test::GetParam(); break; + case miopenFloat8: params = Pooling2dF8_NNT_Test::GetParam(); break; + case miopenInt32: + case miopenDouble: + case miopenBFloat8: + case miopenInt64: + FAIL() + << "miopenInt32, miopenDouble, miopenBFloat8, miopenInt64 " + "data types not supported by " + "poolingFwd2dNaive test"; + + default: params = Pooling2dFloat_NNT_Test::GetParam(); + } + + std::cerr << "Params: " << params.size() << std::endl; + + for(const auto& test_value : params) + { + std::vector tokens; + GetArgs(test_value, tokens); + std::vector ptrs; + + std::transform(tokens.begin(), tokens.end(), std::back_inserter(ptrs), [](const auto& str) { + return str.data(); + }); + + testing::internal::CaptureStderr(); + test_drive(ptrs.size(), ptrs.data()); + auto capture = testing::internal::GetCapturedStderr(); + std::cout << capture; + } +} + +INSTANTIATE_TEST_SUITE_P(Full, Pooling2dBF16, testing::Values(Get2dTestCases("--bfloat16"))); +INSTANTIATE_TEST_SUITE_P(Full, Pooling2dInt8, testing::Values(Get2dTestCases("--int8"))); +INSTANTIATE_TEST_SUITE_P(Full, Pooling2dFloat, testing::Values(Get2dTestCases("--float"))); +INSTANTIATE_TEST_SUITE_P(Full, Pooling2dHalf, testing::Values(Get2dTestCases("--half"))); +INSTANTIATE_TEST_SUITE_P(Full, Pooling2dF8, testing::Values(Get2dTestCases("--float8"))); + +#endif diff --git a/test/gtest/poolingFwd3dNaive.cpp b/test/gtest/poolingFwd3dNaive.cpp new file mode 100644 index 
0000000000..d3f8975b15 --- /dev/null +++ b/test/gtest/poolingFwd3dNaive.cpp @@ -0,0 +1,265 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef POOLING_GTEST_BUILD + +#include +#include +#include "get_handle.hpp" +#include "test_env.hpp" + +#include "pooling_testing.hpp" +#include "pooling3d.hpp" + +#include "tensor_holder.hpp" +#include "miopen/tensor_layout.hpp" + +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLAGS_ARGS) + +namespace env = miopen::env; + +namespace { + +template +struct layout_data +{ + static std::vector get_layout_lengths(int n, int c, std::vector& dims) + { + auto ret = std::vector{n, c}; + ret.insert(ret.end(), dims.cbegin(), dims.cend()); + + return ret; + } + + static std::vector + get_strides(std::vector& lens, int dims, miopenTensorLayout_t tensor_layout) + { + std::vector strides; + std::string layout_default = miopen::tensor_layout_get_default(dims + 2); + std::string layout_string = miopen::TensorDescriptor::GetLayoutStr(tensor_layout); + +std::cout << "get_strides: dims=" << dims << " lens=" << lens.size() << " " << layout_default << " " << layout_string << std::endl; + miopen::tensor_layout_to_strides(lens, layout_default, layout_string, strides); + + constexpr int min_stride_multiplier = 1; + constexpr int max_stride_multiplier = 4; + + auto c = prng::gen_A_to_B(min_stride_multiplier, max_stride_multiplier); + for(auto& v : strides) + { + // cppcheck-suppress useStlAlgorithm + v = v * c; + } + + return strides; + } + + static miopenTensorDescriptor_t init_tensor_descriptor(miopenDataType_t type, + const std::vector& lens, + const std::vector& strides) + { + miopenTensorDescriptor_t desc; + + EXPECT_TRUE(miopenCreateTensorDescriptor(&desc) == miopenStatusSuccess); + EXPECT_TRUE( + miopenSetTensorDescriptor(desc, type, lens.size(), lens.data(), strides.data()) == + miopenStatusSuccess); + + return desc; + } + + layout_data(int _n, std::vector _dims, int _c, miopenTensorLayout_t _tensor_layout) + { + auto lens = get_layout_lengths(_n, _c, 
_dims); + auto strides = get_strides(lens, _dims.size(), _tensor_layout); + descriptor = miopen::TensorDescriptor{miopen_type{}, lens, strides}; + host = tensor{lens, strides}.generate(gen_value); + } + + ~layout_data() {} + + void read_gpu_data(miopen::Handle& handle, const miopen::Allocator::ManageDataPtr& ddata) + { + check = tensor{descriptor.GetLengths(), descriptor.GetStrides()}; + handle.ReadTo(check.data.data(), ddata, check.data.size()); + } + + tensor check{}; + tensor host; + miopen::TensorDescriptor descriptor; +}; + +} + +class PoolingFwd3d : public testing::TestWithParam> {}; +class PoolingFwd3dFloat : public PoolingFwd3d {}; +class PoolingFwd3dHalf : public PoolingFwd3d {}; +class PoolingFwd3dBF16 : public PoolingFwd3d {}; +class PoolingFwd3dInt8 : public PoolingFwd3d {}; +class PoolingFwd3dF8 : public PoolingFwd3d {}; + +void Run3dDriver(miopenDataType_t prec); + +namespace { + +static bool SkipTest(void) { return env::disabled(MIOPEN_TEST_ALL); } + +void GetArgs(const std::string& param, std::vector& tokens) +{ + std::stringstream ss(param); + std::istream_iterator begin(ss); + std::istream_iterator end; + while(begin != end) + tokens.push_back(*begin++); +} + +bool IsTestSupportedForDevice(const miopen::Handle& handle) { return true; } + +std::vector Get3dTestCases(const std::string precision) +{ + const auto& flag_arg = env::value(MIOPEN_TEST_FLAGS_ARGS); + + const std::vector test_cases = { + // clang-format off + {"test_pooling3d " + precision + " --all --dataset 0 --limit 0 " + flag_arg}, + {"test_pooling3d " + precision + " --all --dataset 1 --limit 0 " + flag_arg}, + {"test_pooling3d " + precision + " --all --dataset 2 --limit 0 " + flag_arg} + // clang-format on + }; + + return test_cases; +} +} // namespace + +TEST_P(PoolingFwd3dFloat, NNT) // NNT=NdNaiveTranspose +{ + const auto& handle = get_handle(); + if(!IsTestRunWith("--float")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT!"
<< std::endl; + + if(IsTestSupportedForDevice(handle) && !SkipTest()) // && IsTestRunWith("--float") TRJS + { + Run3dDriver(miopenFloat); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(PoolingFwd3dHalf, NNT) +{ + const auto& handle = get_handle(); + if(!IsTestRunWith("--half")) std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl; + + if(IsTestSupportedForDevice(handle) && !SkipTest()) // && IsTestRunWith("--half") TRJS + { + Run3dDriver(miopenHalf); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(PoolingFwd3dBF16, NNT) +{ + if(!IsTestRunWith("--bfloat16")) std::cout << "WOULD SKIP BECAUSE NOT BFLOAT16!" << std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--bfloat16") TRJS + GTEST_SKIP(); + + Run3dDriver(miopenBFloat16); +}; + +TEST_P(PoolingFwd3dInt8, NNT) +{ + if(!IsTestRunWith("--int8")) std::cout << "WOULD SKIP BECAUSE NOT INT8!" << std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--int8") TRJS + GTEST_SKIP(); + + Run3dDriver(miopenInt8); +}; + +TEST_P(PoolingFwd3dF8, NNT) +{ + if(!IsTestRunWith("--float8")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT8!" 
<< std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--float8") TRJS + GTEST_SKIP(); + + Run3dDriver(miopenFloat8); +}; + +void Run3dDriver(miopenDataType_t prec) +{ + auto cases = Get3dTestCases("--float"); + + std::vector params; + switch(prec) + { + case miopenFloat: params = PoolingFwd3dFloat_NNT_Test::GetParam(); break; + case miopenHalf: params = PoolingFwd3dHalf_NNT_Test::GetParam(); break; + case miopenBFloat16: params = PoolingFwd3dBF16_NNT_Test::GetParam(); break; + case miopenInt8: params = PoolingFwd3dInt8_NNT_Test::GetParam(); break; + case miopenFloat8: params = PoolingFwd3dF8_NNT_Test::GetParam(); break; + case miopenInt32: + case miopenDouble: + case miopenBFloat8: + case miopenInt64: + FAIL() + << "miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8, miopenInt64 " + "data types not supported by " + "poolingFwdNdNaive test"; + + default: params = PoolingFwd3dFloat_NNT_Test::GetParam(); + } + + for(const auto& test_value : params) + { + std::vector tokens; + GetArgs(test_value, tokens); + std::vector ptrs; + + std::transform(tokens.begin(), tokens.end(), std::back_inserter(ptrs), [](const auto& str) { + return str.data(); + }); + + testing::internal::CaptureStderr(); + test_drive(ptrs.size(), ptrs.data()); + auto capture = testing::internal::GetCapturedStderr(); + std::cout << capture; + } +} + +INSTANTIATE_TEST_SUITE_P(BF16, PoolingFwd3dBF16, testing::Values(Get3dTestCases("--bfloat16"))); +INSTANTIATE_TEST_SUITE_P(Float, PoolingFwd3dFloat, testing::Values(Get3dTestCases("--float"))); +INSTANTIATE_TEST_SUITE_P(Half, PoolingFwd3dHalf, testing::Values(Get3dTestCases("--half"))); +INSTANTIATE_TEST_SUITE_P(Int8, PoolingFwd3dInt8, testing::Values(Get3dTestCases("--int8"))); +INSTANTIATE_TEST_SUITE_P(F8, PoolingFwd3dF8, testing::Values(Get3dTestCases("--float8"))); + +#endif diff --git a/test/gtest/pooling_testing.hpp b/test/gtest/pooling_testing.hpp new file mode 100644 index 0000000000..8e94032708 --- 
/dev/null +++ b/test/gtest/pooling_testing.hpp @@ -0,0 +1,936 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +// TODO: I've hijacked pooling_common here. This is a temporary workaround until +// all pooling tests have been converted to gtest. This work has not been planned yet. 
+#ifndef GUARD_MIOPEN_TEST_POOLING_COMMON_HPP +#define GUARD_MIOPEN_TEST_POOLING_COMMON_HPP + +#define DATASET "0" + +#include +#include +namespace {using sc = std::chrono::steady_clock;} +#undef tomillis +#define tomillis(__DUR) (0.001 * std::chrono::duration_cast(__DUR).count()) +#undef mstocout +#define mstocout(__TP) std::setw(15) << std::fixed << std::setprecision(3) << tomillis(sc::now() - __TP) +#undef coutms +#define coutms(__TOK, __TP) (std::cout << "ms[" << std::setw(16) << __TOK << "]: " << mstocout(__TP) << std::endl) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../driver.hpp" +#include "../get_handle.hpp" +#include "../tensor_holder.hpp" +#include "../verify.hpp" +#include "../cpu_conv.hpp" +#include "../workspace.hpp" + +#define TEST_PADDING_MODE 0 + +namespace { +int num_all_case = 0; +int num_uint16_case = 0; +int num_uint32_case = 0; +int num_uint32_case_imgidx = 0; +int num_uint64_case = 0; +int num_uint64_case_imgidx = 0; +constexpr int max_typed_cases = 5; +constexpr int MAX_ALL_CASES = 0; +auto __start = sc::now(); + +constexpr int RAND_INTEGER_MAX = 12000; +constexpr int RAND_INTEGER_MIN = -8800; + +template +auto gen_value = + [](auto... is) { return static_cast(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX)) / 100; }; + +auto gen_start = + [](auto... is) { return prng::gen_0_to_B(1ULL << 28); }; +} + +static inline void print(std::ostringstream& oss, const miopen::PoolingDescriptor& filter, bool is_default_layout) +{ + oss << "Pooling: "; + if(filter.GetMode() == miopenPoolingAverage) + oss << "Average"; + else if(filter.GetMode() == miopenPoolingAverageInclusive) + oss << "AverageInclusive"; + else + oss << "Max"; + oss << std::endl; + oss << "Layout: " << (is_default_layout ? 
"default" : "transposed") << std::endl; + oss << "Lengths: "; + miopen::LogRange(oss, filter.GetLengths(), ", ") << std::endl; + oss << "Pads: "; + miopen::LogRange(oss, filter.GetPads(), ", ") << std::endl; + oss << "Strides: "; + miopen::LogRange(oss, filter.GetStrides(), ", ") << std::endl; +} + +template +tensor get_output_tensor(const miopen::PoolingDescriptor& filter, const tensor& input) +{ + return tensor{filter.GetForwardOutputTensor(input.desc)}; +} + +template +tensor get_big_output_tensor(const miopen::PoolingDescriptor& filter, const tensor& input) +{ + auto desc = filter.GetForwardOutputTensor(input.desc); + auto lens = desc.GetLengths(); + if(desc.GetElementSize() > 1024) + lens[0] *= 2; + else + lens[0] *= (2047 / desc.GetElementSize()) + 1 ; + auto dbig = miopen::TensorDescriptor{miopenFloat, input.desc.GetLayout_t(), lens, desc.GetStrides()}; + auto big = tensor{dbig}; + for (auto& v : big) v = -2.2222f; + return big; +} + +template +struct pooling_operators +{ + miopen::PoolingDescriptor filter; + pooling_operators(miopen::PoolingDescriptor f) : filter(f) {} + + double initialize() const + { + if(filter.GetMode() == miopenPoolingMax) + return std::numeric_limits::lowest(); + else + return 0.0; + } + + double operator()(double x, double y) const + { + if(filter.GetMode() == miopenPoolingMax) + { + double m = std::max(x, y); + return (m); + } + else + { + return x + y; + } + } + + double finalize(double x, double y) + { + if(filter.GetMode() == miopenPoolingMax) + return (x); + else + return x / y; + } +}; + +template +struct verify_forward_pooling +{ + template + tensor + cpu(const tensor& input, const miopen::PoolingDescriptor& filter, std::vector&) const + { + auto out = get_output_tensor(filter, input); + + std::array in_dim{}; + std::copy_n(input.desc.GetLengths().begin() + 2, SptDim, in_dim.begin()); + std::array strides{}; + std::copy_n(filter.GetStrides().begin(), SptDim, strides.begin()); + std::array pads{}; + 
std::copy_n(filter.GetPads().begin(), SptDim, pads.begin()); + std::array kers{}; + std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin()); + auto pooler = pooling_operators{filter}; + + int b_n = out.desc.GetLengths()[0]; + int k_n = out.desc.GetLengths()[1]; + std::array out_spatial_len{}; + std::copy_n(out.desc.GetLengths().begin() + 2, SptDim, out_spatial_len.begin()); + + auto par_ford_out = + miopen::unpacker(miopen::prepender(par_ford, b_n, k_n))(out_spatial_len); + + par_ford_out([&](int o, int w, auto... out_spatial_id_pack) { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + std::array start_idx{}; + std::array win_sz{}; + for(int i = 0; i < SptDim; ++i) + { + start_idx[i] = out_spatial_id[i] * strides[i] - pads[i]; + int end_idx = start_idx[i] + kers[i]; + end_idx = std::min(end_idx, in_dim[i]); + start_idx[i] = std::max(start_idx[i], 0); + win_sz[i] = end_idx - start_idx[i]; + win_sz[i] = std::max(win_sz[i], 1); + } + + int pool_size = + filter.GetMode() == miopenPoolingAverageInclusive + ? std::accumulate(kers.begin(), kers.end(), 1, std::multiplies()) + : std::accumulate(win_sz.begin(), win_sz.end(), 1, std::multiplies()); + + double acc = pooler.initialize(); + miopen::unpacker(ford)(win_sz)([&](auto... in_spatial_id_pack) { + auto in_spatial_id = make_array(in_spatial_id_pack...); + std::array idx{}; + idx[0] = o; + idx[1] = w; + + bool in_cmp_idx = true; + for(int i = 0; i < SptDim; ++i) + { + idx[i + 2] = start_idx[i] + in_spatial_id[i]; + in_cmp_idx &= (in_dim[i] > idx[i + 2]); + } + + if(in_cmp_idx) + { + acc = pooler(acc, input(idx)); + } + }); + out(o, w, out_spatial_id_pack...) 
= T(pooler.finalize(acc, pool_size)); + }); + + return out; + } + + template + tensor gpu(const tensor& input, + const miopen::PoolingDescriptor& filter, + std::vector& indices) const + { + auto&& handle = get_handle(); + auto out = get_output_tensor(filter, input); + + indices.resize(out.data.size(), 0); + + auto in_dev = handle.Write(input.data); + auto out_dev = handle.Create(out.GetSize()); + Workspace wspace{}; + wspace.Write(indices); + + float alpha = 1, beta = 0; + filter.Forward(handle, + &alpha, + input.desc, + in_dev.get(), + &beta, + out.desc, + out_dev.get(), + true, + wspace.ptr(), + wspace.size()); + handle.ReadTo(out.data.data(), out_dev, out.GetDataByteSize()); + wspace.ReadTo(indices); + + return out; + } + + template + void fail(float, + const tensor& input, + const miopen::PoolingDescriptor& filter, + const std::vector&) const + { + std::ostringstream oss; + oss << "Forward "; + print(oss, filter, input.desc.IsDefaultLayout()); + oss << "Input tensor: " << input.desc.ToString() << std::endl; + oss << "Output tensor: " << filter.GetForwardOutputTensor(input.desc).ToString() + << std::endl; + GTEST_FAIL() << oss.str(); + } +}; + +template +struct verify_backward_pooling +{ + template + tensor cpu(const tensor& input, + const tensor& dout, + const tensor& out, + const miopen::PoolingDescriptor& filter, + const std::vector& indices, + bool use_global_index, + bool verify_index) const + { + const int sptl_dim_offset = 2; + const int chan_dim_offset = 1; + + auto dinput = input; + return dinput; // TRJS + + std::vector din_vec(input.desc.GetElementSpace(), 0.0); + CHECK(dout.desc == out.desc); + std::array in_dim{}; + std::copy_n(input.desc.GetLengths().begin(), SptDim + 2, in_dim.begin()); + std::array in_str{}; + std::copy_n(input.desc.GetStrides().begin(), SptDim + 2, in_str.begin()); + std::array strides{}; + std::copy_n(filter.GetStrides().begin(), SptDim, strides.begin()); + std::array pads{}; + std::copy_n(filter.GetPads().begin(), SptDim, 
pads.begin()); + std::array kers{}; + std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin()); + auto ford_ker = miopen::unpacker(ford)(kers); + + int out_n = out.desc.GetLengths()[0]; + int out_c = out.desc.GetLengths()[chan_dim_offset]; + std::array out_spatial_len{}; + std::copy_n(out.desc.GetLengths().begin() + sptl_dim_offset, SptDim, out_spatial_len.begin()); + + auto ford_out = miopen::unpacker(ford)(out_spatial_len); + + par_ford(out_n, out_c)([&](int o, int w) { + if(filter.GetMode() == miopenPoolingMax) + { + ford_out([&](auto... out_spatial_id_pack) { + auto mx_idx = indices.at(dout.desc.GetIndex(o, w, out_spatial_id_pack...)); + std::array idx{}; + bool in_cmp_idx = true; + if(use_global_index) + { + for(int i = 0; i < SptDim; i++) + { + std::size_t mx_idx_dim = mx_idx; + mx_idx_dim /= std::accumulate(in_dim.begin() + sptl_dim_offset + i + 1, + in_dim.end(), + 1ULL, + std::multiplies()); + mx_idx_dim %= in_dim[i + sptl_dim_offset]; + idx[i + sptl_dim_offset] = mx_idx_dim; + } + } + else + { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + for(int i = 0; i < SptDim; i++) + { + int mx_idx_dim = mx_idx; + mx_idx_dim /= std::accumulate( + kers.begin() + i + 1, kers.end(), 1, std::multiplies()); + mx_idx_dim %= kers[i]; + + mx_idx_dim += (out_spatial_id[i] * strides[i] - pads[i]); + in_cmp_idx &= (in_dim[i + 2] > mx_idx_dim && mx_idx_dim >= 0); + + idx[i + 2] = std::size_t(mx_idx_dim); + } + } + + if(in_cmp_idx) + { + idx[0] = o; + idx[1] = w; + if(false && verify_index) + { + CHECK( + miopen::float_equal(input(idx), out(o, w, out_spatial_id_pack...))); + } + std::size_t din_idx = 0; + for(int i = 0; i < SptDim + 2; i++) + { + din_idx += idx[i] * in_str[i]; + } + din_vec.at(din_idx) += dout(o, w, out_spatial_id_pack...); + } + }); + } + else + { + ford_out([&](auto... 
out_spatial_id_pack) { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + std::array start_idx{}; + std::array win_sz{}; + for(int i = 0; i < SptDim; ++i) + { + start_idx[i] = out_spatial_id[i] * strides[i] - pads[i]; + int end_idx = start_idx[i] + kers[i]; + end_idx = std::min(end_idx, in_dim[i + 2]); + win_sz[i] = end_idx - std::max(start_idx[i], 0); + win_sz[i] = std::max(win_sz[i], 1); + } + + int pool_size = + filter.GetMode() == miopenPoolingAverageInclusive + ? std::accumulate(kers.begin(), kers.end(), 1, std::multiplies()) + : std::accumulate( + win_sz.begin(), win_sz.end(), 1, std::multiplies()); + + ford_ker([&](auto... ker_id_pack) { + auto ker_id = make_array(ker_id_pack...); + + bool in_cmp_idx = true; + std::array in_idx{}; + in_idx[0] = o; + in_idx[1] = w; + for(int i = 0; i < SptDim; ++i) + { + in_idx[i + 2] = start_idx[i] + ker_id[i]; + in_cmp_idx &= (in_dim[i + 2] > in_idx[i + 2] && in_idx[i + 2] >= 0); + } + + if(in_cmp_idx) + { + std::size_t din_idx = 0; + for(int i = 0; i < SptDim + 2; i++) + { + din_idx += in_idx[i] * in_str[i]; + } + + din_vec.at(din_idx) += + static_cast(dout(o, w, out_spatial_id_pack...)) / pool_size; + } + }); + }); + } + }); + + miopen::unpacker(ford)(in_dim)([&](auto... in_id_pack) { + auto in_id = make_array(in_id_pack...); + std::size_t din_idx = 0; + for(int i = 0; i < SptDim + 2; i++) + { + din_idx += in_id[i] * in_str[i]; + } + dinput(in_id_pack...) 
= din_vec.at(din_idx); + }); + + return dinput; + } + + template + tensor gpu(const tensor& input, + const tensor& dout, + const tensor& out, + const miopen::PoolingDescriptor& filter, + const std::vector& indices, + bool, + bool) const + { + auto&& handle = get_handle(); + auto dinput = input; + return dinput; // TRJS + + auto in_dev = handle.Write(input.data); + auto dout_dev = handle.Write(dout.data); + auto out_dev = handle.Write(out.data); + auto din_dev = handle.Create(dinput.data.size()); + + Workspace wspace{}; + wspace.Write(indices); + + float alpha = 1, beta = 0; + filter.Backward(handle, + &alpha, + // y + out.desc, + out_dev.get(), + // dy + dout.desc, + dout_dev.get(), + // x + input.desc, + in_dev.get(), + &beta, + // dx + dinput.desc, + din_dev.get(), + wspace.ptr()); + + handle.ReadTo(dinput.data.data(), din_dev, dinput.data.size()); + + return dinput; + } + + template + void fail(float, + const tensor& input, + const tensor&, + const tensor& out, + const miopen::PoolingDescriptor& filter, + const std::vector&, + bool, + bool) const + { + std::ostringstream oss; + oss << "Backward "; + print(oss, filter, input.desc.IsDefaultLayout()); + oss << "Input tensor: " << input.desc.ToString() << std::endl; + oss << "Output tensor: " << out.desc.ToString() << std::endl; + GTEST_FAIL() << oss.str(); + } +}; + +template +struct pooling_driver : test_driver +{ + miopen::PoolingDescriptor filter; + std::vector in_shape; + std::vector lens; + std::vector pads; + std::vector strides; + std::string index_type; + std::string mode_str; +#if TEST_PADDING_MODE == 1 + std::string pmode; +#endif + int verify_indices{}; + miopenPoolingWorkspaceIndexMode_t wsidx{}; + miopenTensorLayout_t layout{}; + + static void randomize_tensor(tensor& in) + { + static tensor random_data{{1}}; + static tensor starts{std::vector{1}}; + static size_t start_idx = 0; + + const auto size = in.GetSize(); + const auto ran_size = size > 2 ? 
(3 * size) / 2 : 3; + if (random_data.GetSize() < ran_size) + { + random_data = tensor{{ran_size}}.generate(tensor_elem_gen_integer{2503}); + } + if (starts.GetSize() == 1) // TODO TRJS is there a cleaner way to initialize starts? + { + starts = tensor{std::vector{1 << 20}}.generate(gen_start); + } + + const auto r_start = starts[start_idx++] % (random_data.GetSize() / 3); + if (start_idx >= starts.GetSize()) start_idx = 0; + + std::cout << "randomizing " << std::setw(9) << size << " elems from " << std::setw(9) << r_start << " (" << start_idx << ")" // TRJS + // << "(" << std::setw(8) << prng::gen_0_to_B(size / 2) << std::setw(8) << prng::gen_0_to_B(size / 2) << std::setw(8) << prng::gen_0_to_B(size / 2) << std::setw(8) << prng::gen_0_to_B(size / 2) << ")" + << std::endl; + in.data.assign(random_data.begin() + r_start, random_data.begin() + r_start + size); + } + + std::unordered_map index_type_lookup = { + {miopen::ToUpper("miopenIndexUint8"), miopenIndexUint8}, + {miopen::ToUpper("miopenIndexUint16"), miopenIndexUint16}, + {miopen::ToUpper("miopenIndexUint32"), miopenIndexUint32}, + {miopen::ToUpper("miopenIndexUint64"), miopenIndexUint64}, + }; + std::unordered_map mode_lookup = { + {"MAX", miopenPoolingMax}, + {"MIOPENPOOLINGMAX", miopenPoolingMax}, + {"AVERAGE", miopenPoolingAverage}, + {"MIOPENPOOLINGAVERAGE", miopenPoolingAverage}, + {"AVERAGEINCLUSIVE", miopenPoolingAverageInclusive}, + {"MIOPENPOOLINGAVERAGEINCLUSIVE", miopenPoolingAverageInclusive}, + }; +#if TEST_PADDING_MODE == 1 + std::unordered_map pmode_lookup = { + {"DEFAULT", miopenPaddingDefault}, + {"SAME", miopenPaddingSame}, + {"VALID", miopenPaddingValid}, + }; +#endif + pooling_driver() + { + add(index_type, + "index_type", + // generate_data({"miopenIndexUint32"} // TEMPCODE RJS RUN + generate_multi_data( // + {{"miopenIndexUint32", + "miopenIndexUint8" + , + "miopenIndexUint16", + "miopenIndexUint64" + }, // + {"miopenIndexUint8", "miopenIndexUint32"}, // + {"miopenIndexUint32"}} // + )); 
+ add(mode_str, + "mode_str", + generate_data( + {"miopenPoolingMax", "miopenPoolingAverage", "miopenPoolingAverageInclusive"})); +#if TEST_PADDING_MODE == 1 + add(pmode, "pmode", generate_data({"default", "same", "valid"})); +#endif + add(verify_indices, "verify_indices", generate_data({1})); + } + + template + void run_impl() + { + std::vector indices{}; +auto gst = sc::now(); + auto input = tensor{layout, in_shape}; + randomize_tensor(input); +coutms("gen", gst); +auto vst = sc::now(); + auto out = verify(verify_forward_pooling{}, + input, + filter, + indices); +coutms("verify", vst); + if(!std::is_same::value && !std::is_same::value) return; + + // auto dout = out.first; + // dout.generate(tensor_elem_gen_integer{2503}); + // verify(verify_backward_pooling{}, // TRJS + // input, + // dout, + // out.first, + // filter, + // indices, + // wsidx != 0, + // static_cast(this->verify_indices)); + } + +#define CHECK_SKIP \ +if(skip) \ +{ \ + std::cout << "\nSkipping run # " << std::setw(7) << num_all_case++ << " @ET=" << mstocout(__start) << " : "; \ + show_command(); \ + std::cout << "-- " << oss.str() << std::endl; \ + return; \ +} + +#define SKIP_RUN skip = true; CHECK_SKIP + + void run() + { + const bool is_default_layout = miopen::TensorDescriptor::IsDefaultLayout(layout); + + bool skip = false; + std::ostringstream oss; + + if(MAX_ALL_CASES && num_all_case > MAX_ALL_CASES) + { + skip = true; + oss << " : skipped due to MAX_ALL_CASES=" << MAX_ALL_CASES; + } + if(this->dry_run) + { + skip = true; + oss << " : skipped due to dry_run"; + } + if(is_default_layout && (this->type != miopenFloat && this->type != miopenHalf)) + { + skip = true; + oss << " : skipped, no solvers for datatype " << this->type << " and default layouts"; + } + + CHECK_SKIP; + + int sptl_dim = static_cast(in_shape.size()) - 2; + if(sptl_dim != 2 && sptl_dim != 3) + { + oss << "Warning: Config skipped due to invalid dimensions. 'in_shape' must be in NCHW or NCDHW format." 
<< std::endl; + SKIP_RUN; + } + + // To simplify launching, input dimensions to the driver are always default layout. Desire to + // test non-default layouts is communicated exclusively via 'layout'. + + auto mode = mode_lookup.at(miopen::ToUpper(mode_str)); + + auto pad_mode = miopenPaddingDefault; +#if TEST_PADDING_MODE + pad_mode = pmode_lookup.at(miopen::ToUpper(pmode)); +#endif + + auto idx_typ = index_type_lookup.at(miopen::ToUpper(index_type)); + auto idx_sz = sizeof(uint8_t); + const bool skip_many_configs_with_non_int8_index = + (dataset_id == 0) && !full_set; // Otherwise the default dataset takes too much time. + const bool wide_dataset = (dataset_id == 2) && full_set; + + filter = miopen::PoolingDescriptor + { + mode, + pad_mode, + lens, + strides, + pads + }; + + filter.SetIndexType(idx_typ); + filter.SetWorkspaceIndexMode(miopenPoolingWorkspaceIndexMode_t(wsidx)); + bool mask_idx = filter.GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexMask; + + if(mask_idx && sptl_dim == 3 && filter.GetMode() == miopenPoolingMax) + { + oss << "Warning: Config skipped. Workspace index mask mode is not implemented " + "yet in 3D max pooling solvers." + << std::endl; + SKIP_RUN; + } + + if(mask_idx && sptl_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset) + { + oss << "Warning: Config skipped. Workspace index mask mode is not implemented " + "yet in 2D max backward solvers that support wide pooling window." + << std::endl; + SKIP_RUN; + } + + if(mask_idx && filter.ModeIsAveraging()) + { + oss << "Warning: Config skipped. Workspace index modes are irrelevant for " + "Average pooling. " + "In order to optimize performance of full tests, we " + "skip average pooling configs when (wsidx == 0). " + "Please make sure that dataset includes counterparts with (wsidx == 1)." 
+ << std::endl; + SKIP_RUN; + } + + // index size filter + if(filter.GetMode() == miopenPoolingMax) + { + auto index_max = miopen::get_index_max(filter.GetIndexType()); + auto index_needed = mask_idx ? + std::accumulate(lens.begin(), lens.end(), 1, std::multiplies()) : + std::accumulate(in_shape.begin() + 2, in_shape.end(), 1, std::multiplies()); + + if(index_max <= index_needed) + { + oss << "Warning: Config skipped: index mode " << filter.GetWorkspaceIndexMode() + << " type " << filter.GetIndexType() << " is too small. max=" + << index_max << ", needed=" << index_needed << std::endl; + SKIP_RUN; + } + } + + switch(idx_typ) + { + /// The "index is too small" limitation is an approximation + /// of the real limitation, and therefore applied only when + /// the "full test" is ran. See: + /// \ref max_pooling_index_max_restriction + case miopenIndexUint8: { + if(full_set && (sptl_dim == 3 || (mask_idx && sptl_dim == 2)) && + filter.GetMode() == miopenPoolingMax) + { + oss << "Warning: Config skipped: uint8 index is too small " + "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) " + "&& filter.GetMode() == miopenPoolingMax" + << std::endl; + SKIP_RUN; + } + break; + } + case miopenIndexUint16: { + if(full_set && (sptl_dim == 3 || (!mask_idx && sptl_dim == 2)) && + filter.GetMode() == miopenPoolingMax) + { + oss << "Warning: Config skipped: uint16 index is too small " + "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) " + "&& filter.GetMode() == miopenPoolingMax" + << std::endl; + SKIP_RUN; + } + if(skip_many_configs_with_non_int8_index) + { + // test_pooling_test --all limit uint16 cases + if(num_uint16_case >= max_typed_cases) + { + oss << "Warning: Config skipped for the default dataset to speed " + "up testing (num_uint16_case > 5)" + << std::endl; + SKIP_RUN; + } + ++num_uint16_case; + } + idx_sz = sizeof(uint16_t); + break; + } + case miopenIndexUint32: { + if(skip_many_configs_with_non_int8_index) + { + // test_pooling_test --all limit uint32 cases + 
if(mask_idx) + { + if(num_uint32_case >= max_typed_cases) + { + oss << "Warning: Config skipped for the default dataset to speed up " + "testing (wsidx == 0 && num_uint32_case > 5)" + << std::endl; + SKIP_RUN; + } + ++num_uint32_case; + } + else + { + if(num_uint32_case_imgidx >= max_typed_cases) + { + oss << "Warning: Config skipped for the default dataset to speed up " + "testing (wsidx != 0 && num_uint32_case_imgidx > 5)" + << std::endl; + SKIP_RUN; + } + ++num_uint32_case_imgidx; + } + } + idx_sz = sizeof(uint32_t); + break; + } + case miopenIndexUint64: { + if(skip_many_configs_with_non_int8_index) + { + if(mask_idx) + { + if(num_uint64_case >= max_typed_cases) + { + oss << "Warning: Config skipped for the default dataset to speed up " + "testing (wsidx == 0) && (num_uint64_case > 5)" + << std::endl; + SKIP_RUN; + } + ++num_uint64_case; + } + else + { + if(num_uint64_case_imgidx >= max_typed_cases && sptl_dim == 2) + { + oss << "Warning: Config skipped to speed up testing of the " + "default dataset (wsidx != 0) && (num_uint64_case_imgidx > 5 " + "&& sptl_dim == 2)" + << std::endl; + SKIP_RUN; + } + ++num_uint64_case_imgidx; + } + } + idx_sz = sizeof(uint64_t); + break; + } + } + + auto input_desc = miopen::TensorDescriptor(this->type, layout, in_shape); + + for(int i = 0; i < sptl_dim; i++) + { + if(lens[i] > (input_desc.GetLengths()[i + 2] + static_cast(2) * pads[i])) + { + oss << "Warning: Config skipped because it is invalid " + "(lens[i] > (input_desc.GetLengths()[i + 2] + 2 * pads[i]))" + << std::endl; + SKIP_RUN; + } + } + + if(full_set) + { + auto output_desc = filter.GetForwardOutputTensor(input_desc); + size_t total_mem = + 3 * input_desc.GetNumBytes() + output_desc.GetNumBytes() + + idx_sz * output_desc.GetElementSize(); // estimate based on backward pass + + size_t device_mem = get_handle().GetGlobalMemorySize(); + if(total_mem >= device_mem) + { + oss << "Config skipped because it requires " << total_mem + << " Bytes to write all necessary tensors 
to GPU. GPU has " << device_mem + << " Bytes of memory." << std::endl; + SKIP_RUN; + } + } + + CHECK_SKIP; + + std::cout << "\nRun # " << std::setw(7) << num_all_case++ << " @ET=" << mstocout(__start) << " : "; + show_command(); + + std::vector in_dim(input_desc.GetLengths().begin(), + input_desc.GetLengths().begin() + sptl_dim); + std::vector out_dim(sptl_dim); + std::vector ker_dim(filter.GetLengths().begin(), filter.GetLengths().end()); + + switch(filter.GetIndexType()) + { + case miopenIndexUint8: { + if(sptl_dim == 3) + { + run_impl(); + } + else + { + run_impl(); + } + break; + } + case miopenIndexUint16: { + if(sptl_dim == 3) + { + run_impl(); + } + else + { + run_impl(); + } + break; + } + case miopenIndexUint32: { + if(sptl_dim == 3) + { + run_impl(); + } + else + { + run_impl(); + } + break; + } + case miopenIndexUint64: { + if(sptl_dim == 3) + { + run_impl(); + } + else + { + run_impl(); + } + break; + } + } + } +}; + +#endif diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp index 128e81cce2..1bd44a364b 100644 --- a/test/pooling2d.hpp +++ b/test/pooling2d.hpp @@ -29,16 +29,22 @@ #define WORKAROUND_ISSUE_1670 1 #define TEST_GET_INPUT_TENSOR 0 -template -struct pooling2d_driver : pooling_driver +struct pooling2d_shapes { -private: +public: using U = typename std::vector; - std::vector get_2d_pooling_input_shapes() + + static std::vector get_2d_pooling_input_shapes() { - return {{1, 19, 1024, 2048}, - {10, 3, 32, 32}, + return { {5, 32, 8, 8}, + {16, 1, 4096, 4096}, + {1, 16, 4096, 4096}, + {1, 1024, 512, 512}, + {16, 1024, 128, 128}, + {1, 832, 64, 128}, + {10, 3, 32, 32}, + {1, 19, 1024, 2048}, {2, 1024, 12, 12}, {4, 3, 231, 231}, {8, 3, 227, 227}, @@ -47,61 +53,83 @@ struct pooling2d_driver : pooling_driver {2, 160, 7, 7}, {1, 192, 256, 512}, {2, 192, 28, 28}, - {1, 832, 64, 128}, {1, 256, 56, 56}, {4, 3, 224, 224}, {2, 64, 112, 112}, {2, 608, 4, 4}, - {1, 2048, 11, 11}, - {1, 16, 4096, 4096}}; + {1, 2048, 11, 11} + }; } // Dataset 1 is intended for 
testing of asymmetric configs. - std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 4, 4, 4}}; } + static std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 4, 4, 4}, {10, 3, 32, 32}}; } // Dataset 2 is intended for testing of configs with wide window. - std::vector get_2d_pooling_input_shapes_wide() + static std::vector get_2d_pooling_input_shapes_wide() { return {{1, 3, 255, 255}, {2, 3, 227, 227}, {1, 7, 127, 127}, {1, 1, 410, 400}}; } +}; + +template +struct pooling2d_driver : pooling_driver +{ +private: + using U = typename std::vector; + std::vector get_2d_pooling_input_shapes() + { + return pooling2d_shapes::get_2d_pooling_input_shapes(); + } + + // Dataset 1 is intended for testing of asymmetric configs. + std::vector get_2d_pooling_input_shapes_minimal() { return pooling2d_shapes::get_2d_pooling_input_shapes_minimal(); } + + // Dataset 2 is intended for testing of configs with wide window. + std::vector get_2d_pooling_input_shapes_wide() + { + return pooling2d_shapes::get_2d_pooling_input_shapes_wide(); + } public: pooling2d_driver() : pooling_driver() { + // clang-format off #if TEST_GET_INPUT_TENSOR std::set in_dim_set = get_inputs(this->batch_factor); std::vector in_dim_vec(in_dim_set.begin(), in_dim_set.end()); this->add(this->in_shape, "input", this->generate_data(in_dim_vec, {16, 32, 8, 8})); #else - this->add( - this->in_shape, - "input", - this->template generate_multi_data_limited({get_2d_pooling_input_shapes(), - get_2d_pooling_input_shapes_minimal(), - get_2d_pooling_input_shapes_wide()}, - 9)); + this->add(this->in_shape, "input", this->template generate_multi_data_limited({ + get_2d_pooling_input_shapes(), + get_2d_pooling_input_shapes_minimal(), + get_2d_pooling_input_shapes_wide() + }, 9 + )); #endif - this->add(this->lens, - "lens", - this->template generate_multi_data( - {{{2, 2}, {3, 3}}, // - {{2, 2}, {1, 2}, {2, 1}}, // - {{35, 35}, {100, 100}, {255, 255}, {410, 400}}})); - this->add(this->strides, - "strides", 
- this->template generate_multi_data({{{2, 2}, {1, 1}}, // - {{1, 1}, {2, 1}, {1, 2}, {2, 2}}, // - {{1, 1}}})); - // clang-format off + this->add(this->lens, "lens", this->template generate_multi_data({ + {{2, 2}, {3, 3}}, // + {{2, 2}, {1, 2}, {2, 1}}, // + {{35, 35}, {100, 100}, {255, 255}, {410, 400}} + } + )); + this->add(this->strides, "strides", this->template generate_multi_data({ + {{2, 2}, {1, 1}}, // + {{2, 2}, {2, 1}, {1, 2}}, // + {{1, 1}} + } + )); this->add(this->pads, "pads", this->template generate_multi_data({ - {{0, 0}, {1, 1}}, // + {{0, 0}, {1, 1}}, // #if WORKAROUND_ISSUE_1670 - {{0, 0}}, // + {{0, 0}}, // #else - {{0, 0}, {0, 1}, {1, 0}, {1, 1}}, // + {{0, 0}, {0, 1}, {1, 0}, {1, 1}}, // #endif - {{0, 0}}})); + {{0, 0}} + } + )); // clang-format on - this->add(this->wsidx, "wsidx", this->generate_data({0, 1})); - } + this->add(this->wsidx, "wsidx", this->generate_data({miopenPoolingWorkspaceIndexMask, miopenPoolingWorkspaceIndexImage})); + this->add(this->layout, "layout", this->generate_data({miopenTensorNCHW, miopenTensorNHWC})); + } }; diff --git a/test/pooling3d.cpp b/test/pooling3d.cpp index 90b37d5c75..966b3f5303 100644 --- a/test/pooling3d.cpp +++ b/test/pooling3d.cpp @@ -25,30 +25,6 @@ *******************************************************************************/ #include "pooling_common.hpp" - -template -struct pooling3d_driver : pooling_driver -{ - std::vector> get_3d_pooling_input_shapes() - { - return {{16, 64, 3, 4, 4}, - {16, 32, 4, 9, 9}, - {8, 512, 3, 14, 14}, - {8, 512, 4, 28, 28}, - {16, 64, 56, 56, 56}, - {4, 3, 4, 227, 227}, - {4, 4, 4, 161, 700}}; - } - - pooling3d_driver() : pooling_driver() - { - this->add( - this->in_shape, "input", this->generate_data_limited(get_3d_pooling_input_shapes(), 4)); - this->add(this->lens, "lens", this->generate_data({{2, 2, 2}, {3, 3, 3}})); - this->add(this->strides, "strides", this->generate_data({{2, 2, 2}, {1, 1, 1}})); - this->add(this->pads, "pads", this->generate_data({{0, 0, 0}, 
{1, 1, 1}})); - this->add(this->wsidx, "wsidx", this->generate_data({1})); - } -}; +#include "pooling3d.hpp" int main(int argc, const char* argv[]) { test_drive(argc, argv); } diff --git a/test/pooling3d.hpp b/test/pooling3d.hpp new file mode 100644 index 0000000000..ffaaf4a9ee --- /dev/null +++ b/test/pooling3d.hpp @@ -0,0 +1,65 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "pooling_common.hpp" + +struct pooling3d_shapes +{ +public: + using U = typename std::vector; + + static std::vector get_3d_pooling_input_shapes() + { + return { + {16, 64, 3, 4, 4}, + {16, 32, 4, 9, 9}, + {8, 512, 3, 14, 14}, + {8, 512, 4, 28, 28}, + {16, 64, 56, 56, 56}, + {4, 3, 4, 227, 227}, + {4, 4, 4, 161, 700} + }; + } +}; + +template +struct pooling3d_driver : pooling_driver +{ + std::vector> get_3d_pooling_input_shapes() + { + return pooling3d_shapes::get_3d_pooling_input_shapes(); + } + + pooling3d_driver() : pooling_driver() + { + this->add(this->in_shape, "input", this->generate_data_limited(get_3d_pooling_input_shapes(), 4)); + this->add(this->lens, "lens", this->generate_data({{2, 2, 2}, {3, 3, 3}})); + this->add(this->strides, "strides", this->generate_data({{2, 2, 2}, {1, 1, 1}})); + this->add(this->pads, "pads", this->generate_data({{0, 0, 0}, {1, 1, 1}})); + this->add(this->wsidx, "wsidx", this->generate_data({miopenPoolingWorkspaceIndexImage})); + this->add(this->layout, "layout", this->generate_data({miopenTensorNCDHW, miopenTensorNDHWC})); + } +}; diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp index 231b635a63..c9f787b3ed 100644 --- a/test/pooling_common.hpp +++ b/test/pooling_common.hpp @@ -60,6 +60,16 @@ static int num_uint64_case = 0; // NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) static int num_uint64_case_imgidx = 0; +namespace { + +constexpr int RAND_INTEGER_MAX = 12000; +constexpr int RAND_INTEGER_MIN = -8800; + +template +auto gen_value = + [](auto... 
is) { return static_cast(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX)) / 100; }; +} + static inline void print(const miopen::PoolingDescriptor& filter) { std::cout << "Pooling: "; @@ -84,13 +94,23 @@ tensor get_output_tensor(const miopen::PoolingDescriptor& filter, const tenso return tensor{filter.GetForwardOutputTensor(input.desc)}; } +template +tensor get_big_output_tensor(const miopen::PoolingDescriptor& filter, const tensor& input) +{ + auto desc = filter.GetForwardOutputTensor(input.desc); + auto lens = desc.GetLengths(); + lens[0] *= 10; + auto big = miopen::TensorDescriptor{desc.GetType(), input.desc.GetLayout_t(), lens, desc.GetStrides()}; + return tensor{big}; +} + template struct pooling_operators { miopen::PoolingDescriptor filter; pooling_operators(miopen::PoolingDescriptor f) : filter(f) {} - double start() const + double initialize() const { if(filter.GetMode() == miopenPoolingMax) return std::numeric_limits::lowest(); @@ -111,7 +131,7 @@ struct pooling_operators } } - double final(double x, double y) + double finalize(double x, double y) { if(filter.GetMode() == miopenPoolingMax) return (x); @@ -127,22 +147,25 @@ struct verify_forward_pooling tensor cpu(const tensor& input, const miopen::PoolingDescriptor& filter, std::vector&) const { + const int sptl_dim_offset = 2; + const int chan_dim_offset = 1; + auto out = get_output_tensor(filter, input); std::array in_dim{}; - std::copy_n(input.desc.GetLengths().begin() + 2, SptDim, in_dim.begin()); + std::copy_n(input.desc.GetLengths().begin() + sptl_dim_offset, SptDim, in_dim.begin()); std::array strides{}; std::copy_n(filter.GetStrides().begin(), SptDim, strides.begin()); std::array pads{}; std::copy_n(filter.GetPads().begin(), SptDim, pads.begin()); std::array kers{}; std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin()); - auto op = pooling_operators{filter}; + auto pooler = pooling_operators{filter}; int b_n = out.desc.GetLengths()[0]; - int k_n = out.desc.GetLengths()[1]; + int k_n = 
out.desc.GetLengths()[chan_dim_offset]; std::array out_spatial_len{}; - std::copy_n(out.desc.GetLengths().begin() + 2, SptDim, out_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + sptl_dim_offset, SptDim, out_spatial_len.begin()); auto par_ford_out = miopen::unpacker(miopen::prepender(par_ford, b_n, k_n))(out_spatial_len); @@ -167,27 +190,28 @@ struct verify_forward_pooling ? std::accumulate(kers.begin(), kers.end(), 1, std::multiplies()) : std::accumulate(win_sz.begin(), win_sz.end(), 1, std::multiplies()); - double acc = op.start(); + double acc = pooler.initialize(); miopen::unpacker(ford)(win_sz)([&](auto... in_spatial_id_pack) { auto in_spatial_id = make_array(in_spatial_id_pack...); std::array idx{}; idx[0] = o; - idx[1] = w; + idx[chan_dim_offset] = w; bool in_cmp_idx = true; for(int i = 0; i < SptDim; ++i) { - idx[i + 2] = start_idx[i] + in_spatial_id[i]; - in_cmp_idx &= (in_dim[i] > idx[i + 2]); + idx[i + sptl_dim_offset] = start_idx[i] + in_spatial_id[i]; + in_cmp_idx &= (in_dim[i] > idx[i + sptl_dim_offset]); } if(in_cmp_idx) { - acc = op(acc, input(idx)); + acc = pooler(acc, input(idx)); } }); - out(o, w, out_spatial_id_pack...) = T(op.final(acc, pool_size)); + out(o, w, out_spatial_id_pack...) 
= T(pooler.finalize(acc, pool_size)); }); + return out; } @@ -198,6 +222,7 @@ struct verify_forward_pooling { auto&& handle = get_handle(); auto out = get_output_tensor(filter, input); + indices.resize(out.data.size(), 0); auto in_dev = handle.Write(input.data); @@ -218,7 +243,8 @@ struct verify_forward_pooling wspace.size()); indices = wspace.Read>(); - out.data = handle.Read(out_dev, out.data.size()); + handle.ReadTo(out.data.data(), out_dev, out.data.size() * sizeof(T)); + return out; } @@ -249,6 +275,10 @@ struct verify_backward_pooling bool verify_index) const { auto dinput = input; + + constexpr int sptl_dim_offset = 2; + constexpr int chan_dim_offset = 1; + std::vector din_vec(input.desc.GetElementSpace(), 0.0); CHECK(dout.desc == out.desc); std::array in_dim{}; @@ -264,9 +294,9 @@ struct verify_backward_pooling auto ford_ker = miopen::unpacker(ford)(kers); int out_n = out.desc.GetLengths()[0]; - int out_c = out.desc.GetLengths()[1]; + int out_c = out.desc.GetLengths()[chan_dim_offset]; std::array out_spatial_len{}; - std::copy_n(out.desc.GetLengths().begin() + 2, SptDim, out_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + sptl_dim_offset, SptDim, out_spatial_len.begin()); auto ford_out = miopen::unpacker(ford)(out_spatial_len); par_ford(out_n, out_c)([&](int o, int w) { @@ -281,12 +311,12 @@ struct verify_backward_pooling for(int i = 0; i < SptDim; i++) { std::size_t mx_idx_dim = mx_idx; - mx_idx_dim /= std::accumulate(in_dim.begin() + i + 3, + mx_idx_dim /= std::accumulate(in_dim.begin() + sptl_dim_offset + i + 1, in_dim.end(), 1ULL, std::multiplies()); - mx_idx_dim %= in_dim[i + 2]; - idx[i + 2] = mx_idx_dim; + mx_idx_dim %= in_dim[i + sptl_dim_offset]; + idx[i + sptl_dim_offset] = mx_idx_dim; } } else @@ -462,6 +492,7 @@ struct pooling_driver : test_driver #endif int verify_indices{}; int wsidx{}; + miopenTensorLayout_t layout{}; std::unordered_map index_type_lookup = { {miopen::ToUpper("miopenIndexUint8"), miopenIndexUint8}, 
{miopen::ToUpper("miopenIndexUint16"), miopenIndexUint16}, @@ -505,16 +536,20 @@ struct pooling_driver : test_driver add(verify_indices, "verify_indices", generate_data({1})); } - template + template void run_impl() { std::vector indices{}; - auto input = tensor{in_shape}.generate( - tensor_elem_gen_integer{miopen_type{} == miopenHalf ? 5 : 17}); - auto out = verify(verify_forward_pooling{}, input, filter, indices); + auto input = tensor{layout, in_shape}; + for(auto& v : input.data) v = gen_value(); // TODO RJS use generate + + auto out = verify(verify_forward_pooling{}, + input, + filter, + indices); auto dout = out.first; dout.generate(tensor_elem_gen_integer{2503}); - verify(verify_backward_pooling{}, + verify(verify_backward_pooling{}, input, dout, out.first, @@ -528,7 +563,7 @@ struct pooling_driver : test_driver { auto idx_typ = index_type_lookup.at(miopen::ToUpper(index_type)); auto idx_sz = sizeof(uint8_t); - int spt_dim = in_shape.size() - 2; + int sptl_dim = in_shape.size() - 2; const bool skip_many_configs_with_non_int8_index = (dataset_id == 0) && full_set; // Otherwise the default dataset takes too much time. const bool wide_dataset = (dataset_id == 2) && full_set; @@ -537,17 +572,19 @@ struct pooling_driver : test_driver { mode_lookup.at(miopen::ToUpper(mode)), #if TEST_PADDING_MODE == 1 - pmode_lookup.at(miopen::ToUpper(pmode)), + pmode_lookup.at(miopen::ToUpper(pmode)), #else - miopenPaddingDefault, + miopenPaddingDefault, #endif - lens, strides, pads + lens, + strides, + pads }; filter.SetIndexType(idx_typ); filter.SetWorkspaceIndexMode(miopenPoolingWorkspaceIndexMode_t(wsidx)); - if(wsidx == 0 && spt_dim == 3 && filter.GetMode() == miopenPoolingMax && full_set) + if(wsidx == 0 && sptl_dim == 3 && filter.GetMode() == miopenPoolingMax && full_set) { show_command(); std::cout << "Warning: Config skipped. 
Workspace index mask mode is not implemented " @@ -556,7 +593,7 @@ struct pooling_driver : test_driver return; } - if(wsidx == 0 && spt_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset) + if(wsidx == 0 && sptl_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset) { show_command(); std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented " @@ -587,12 +624,12 @@ struct pooling_driver : test_driver /// the "full test" is ran. See: /// \ref max_pooling_index_max_restriction case miopenIndexUint8: { - if((spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) && full_set && + if((sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) && full_set && filter.GetMode() == miopenPoolingMax) { show_command(); std::cout << "Warning: Config skipped: uint8 index is too small " - "(spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) " + "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) " "&& filter.GetMode() == miopenPoolingMax" << std::endl; return; @@ -600,12 +637,12 @@ struct pooling_driver : test_driver break; } case miopenIndexUint16: { - if((spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) && full_set && + if((sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) && full_set && filter.GetMode() == miopenPoolingMax) { show_command(); std::cout << "Warning: Config skipped: uint16 index is too small " - "(spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) " + "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) " "&& filter.GetMode() == miopenPoolingMax" << std::endl; return; @@ -675,12 +712,12 @@ struct pooling_driver : test_driver } else { - if(num_uint64_case_imgidx > 5 && spt_dim == 2) + if(num_uint64_case_imgidx > 5 && sptl_dim == 2) { show_command(); std::cout << "Warning: Config skipped to speed up testing of the " "default dataset (wsidx != 0) && (num_uint64_case_imgidx > 5 " - "&& spt_dim == 2)" + "&& sptl_dim == 2)" << std::endl; return; } @@ -692,23 +729,23 @@ struct pooling_driver : test_driver } } - auto input_desc = 
miopen::TensorDescriptor(this->type, in_shape); + auto input_desc = miopen::TensorDescriptor(this->type, layout, in_shape); - if(spt_dim != 2 && spt_dim != 3) + if(sptl_dim != 2 && sptl_dim != 3) { show_command(); - std::cout << "Warning: Config skipped becuse it is not supported " // - "(spt_dim != 2 && spt_dim != 3)" + std::cout << "Warning: Config skipped because it is not supported " // + "(sptl_dim != 2 && sptl_dim != 3)" << std::endl; return; } - for(int i = 0; i < spt_dim; i++) + for(int i = 0; i < sptl_dim; i++) { if(lens[i] > (input_desc.GetLengths()[i + 2] + static_cast(2) * pads[i])) { show_command(); - std::cout << "Warning: Config skipped becuse it is invalid " + std::cout << "Warning: Config skipped because it is invalid " "(lens[i] > (input_desc.GetLengths()[i + 2] + 2 * pads[i]))" << std::endl; return; @@ -733,9 +770,13 @@ struct pooling_driver : test_driver } } - std::vector in_dim(input_desc.GetLengths().begin() + 2, input_desc.GetLengths().end()); - std::vector out_dim(spt_dim); + constexpr int sptl_index = 2; + + std::vector in_dim(input_desc.GetLengths().begin() + sptl_index, + input_desc.GetLengths().begin() + sptl_index + sptl_dim); + std::vector out_dim(sptl_dim); std::vector ker_dim(filter.GetLengths().begin(), filter.GetLengths().end()); + #if TEST_PADDING_MODE == 1 if(filter.pmode == miopenPaddingSame) { @@ -743,7 +784,7 @@ struct pooling_driver : test_driver return i == 0; })) return; - for(int i = 0; i < spt_dim; i++) + for(int i = 0; i < sptl_dim; i++) { filter.pads[i] = ((in_dim[i] % filter.GetStrides()[i] == 0) @@ -763,7 +804,7 @@ struct pooling_driver : test_driver return i == 0; })) return; - for(int i = 0; i < spt_dim; i++) + for(int i = 0; i < sptl_dim; i++) { filter.pads[i] = 0; @@ -778,7 +819,7 @@ struct pooling_driver : test_driver switch(filter.GetIndexType()) { case miopenIndexUint8: { - if(spt_dim == 3) + if(sptl_dim == 3) { run_impl(); } @@ -789,7 +830,7 @@ struct pooling_driver : test_driver break; } case 
miopenIndexUint16: { - if(spt_dim == 3) + if(sptl_dim == 3) { run_impl(); } @@ -800,7 +841,7 @@ struct pooling_driver : test_driver break; } case miopenIndexUint32: { - if(spt_dim == 3) + if(sptl_dim == 3) { run_impl(); } @@ -811,7 +852,7 @@ struct pooling_driver : test_driver break; } case miopenIndexUint64: { - if(spt_dim == 3) + if(sptl_dim == 3) { run_impl(); }