From 51b1ced1db89207fedded6057d01b5842964de5f Mon Sep 17 00:00:00 2001 From: "Randy J. Spaulding" Date: Sun, 28 Jul 2024 06:05:45 +0000 Subject: [PATCH 01/10] checkpoint --- src/CMakeLists.txt | 5 + src/include/miopen/pooling/solvers.hpp | 61 ++++ src/kernels/MIOpenPoolingBwd.cl | 322 ++++++++++++++++++++ src/kernels/MIOpenPoolingBwdND.cl | 247 +++++++++++++++ src/kernels/MIOpenPoolingForwardNaive.cl | 4 +- src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp | 245 +++++++++++++++ src/ocl/pooling_ocl.cpp | 5 + src/solver.cpp | 7 + src/solver/pooling/backwardCk2d.cpp | 308 +++++++++++++++++++ src/solver/pooling/backwardCkNd.cpp | 273 +++++++++++++++++ src/solver/pooling/forwardCk2d.cpp | 267 ++++++++++++++++ src/solver/pooling/forwardCkNd.cpp | 264 ++++++++++++++++ src/solver/pooling/forwardNaive.cpp | 143 ++++++--- test/gtest/poolingFwdNdNaive.cpp | 217 +++++++++++++ test/pooling2d.hpp | 32 +- test/pooling3d.cpp | 26 +- test/pooling3d.hpp | 62 ++++ 17 files changed, 2405 insertions(+), 83 deletions(-) create mode 100644 src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp create mode 100644 src/solver/pooling/backwardCk2d.cpp create mode 100644 src/solver/pooling/backwardCkNd.cpp create mode 100644 src/solver/pooling/forwardCk2d.cpp create mode 100644 src/solver/pooling/forwardCkNd.cpp create mode 100644 test/gtest/poolingFwdNdNaive.cpp create mode 100644 test/pooling3d.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 14e7c954b1..150e5cb76a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -289,8 +289,12 @@ set( MIOpen_Source solver/pooling/forward2d.cpp solver/pooling/forwardNaive.cpp solver/pooling/forwardNd.cpp + solver/pooling/forwardCk2d.cpp + solver/pooling/forwardCkNd.cpp solver/pooling/backward2d.cpp solver/pooling/backwardNd.cpp + solver/pooling/backwardCk2d.cpp + solver/pooling/backwardCkNd.cpp solver/reduce/forward_argmax.cpp solver/reduce/forward_argmin.cpp solver/reduce/forward_max.cpp @@ -495,6 +499,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenPoolingBwd.cl kernels/MIOpenPoolingBwdND.cl kernels/MIOpenPoolingForwardNaive.cl + kernels/MIOpenPoolingFwdNDNhwcNaive.cpp kernels/MIOpenPoolingND.cl kernels/MIOpenConv1x1S.cl kernels/MIOpenConv1x1J1.cl diff --git a/src/include/miopen/pooling/solvers.hpp b/src/include/miopen/pooling/solvers.hpp index 0d0e35755a..ab86a52aae 100644 --- a/src/include/miopen/pooling/solvers.hpp +++ b/src/include/miopen/pooling/solvers.hpp @@ -66,6 +66,30 @@ struct PoolingForwardNd final : PoolingSolver const miopen::pooling::ProblemDescription& problem) const override; }; +struct PoolingForwardCk2d final : PoolingSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; +}; + +struct PoolingForwardCkNd final : PoolingSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; +}; + struct PoolingForwardNaive final : PoolingSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } @@ -79,6 +103,19 @@ struct PoolingForwardNaive final : PoolingSolver const miopen::pooling::ProblemDescription& problem) const override; }; +struct PoolingForwardNdNhwcNaive final : PoolingSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + bool IsDynamic() const override { return true; } + + bool IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; +}; + template struct PoolingFwdNCHWTransposingSolver : TransposingSolver, PoolingSolver, @@ -145,6 +182,18 @@ struct PoolingBackward2d final : PoolingSolver const miopen::pooling::ProblemDescription& problem) const override; }; +struct PoolingBackwardCk2d final : PoolingSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; +}; + struct PoolingBackwardNd final : PoolingSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } @@ -157,6 +206,18 @@ struct PoolingBackwardNd final : PoolingSolver const miopen::pooling::ProblemDescription& problem) const override; }; +struct PoolingBackwardCkNd final : PoolingSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const override; +}; + template struct PoolingBwdNCHWTransposingSolver : TransposingSolver, PoolingSolver, diff --git a/src/kernels/MIOpenPoolingBwd.cl b/src/kernels/MIOpenPoolingBwd.cl index 6c88bebadf..062c5ed337 100644 --- a/src/kernels/MIOpenPoolingBwd.cl +++ b/src/kernels/MIOpenPoolingBwd.cl @@ -218,6 +218,168 @@ mloPoolingAveBwd(const __global _FLOAT* top_diff, } } } + +__attribute__((reqd_work_group_size(MLO_POOLBWD_GROUP_SZ0, + MLO_POOLBWD_GROUP_SZ1, + MLO_POOLBWD_GROUP_SZ2))) __kernel void +mloPoolingAveBwdNhwc(const __global _FLOAT* top_diff, + __global _FLOAT* bot_diff, + int mlo_pad1, + int mlo_pad0, + int mlo_n_outputs, + int mlo_bot_height, + int mlo_bot_width, + int mlo_top_height, + int mlo_top_width, + int mlo_botdf_batch_str, + int mlo_botdf_channel_str, + int mlo_botdf_str, + int mlo_topdf_batch_str, + int mlo_topdf_channel_str, + int mlo_topdf_str) +{ + __local _FLOAT lcl_top_diff[MLO_POOLBWD_LCL_DATA_WIDTH * MLO_POOLBWD_LCL_DATA_HEIGHT]; + + int x = get_group_id(0) * MLO_POOLBWD_GROUP_SZ0 * MLO_POOLBWD_N_HORIZ_OUT_PIX; + int y = get_group_id(1) * MLO_POOLBWD_GROUP_SZ1 * MLO_POOLBWD_N_VERT_OUT_PIX; + int lcl_id0 = get_local_id(0); + int lcl_id1 = get_local_id(1); + // int lcl_id = (lcl_id1 << MLO_POOLBWD_GROUP_LG2SZ1) + lcl_id0; + int ob = get_global_id(2); // outputs * batch_sz + int b = ob / mlo_n_outputs; + int o = ob - b * mlo_n_outputs; + + int top_x = (x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0) < 0 + ? 0 + : (x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0) / MLO_POOLING_STRIDE0 + 1; + int top_y = (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) < 0 + ? 0 + : (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) / MLO_POOLING_STRIDE1 + 1; + int top_off = b * mlo_topdf_batch_str + o * mlo_topdf_channel_str; + + _FLOAT_ACCUM res[MLO_POOLBWD_N_VERT_OUT_PIX][MLO_POOLBWD_N_HORIZ_OUT_PIX]; + for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) + { + for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) + { + res[k][l] = (_FLOAT_ACCUM)0; + } + } + + // load tile + for(int tj = lcl_id1; tj < MLO_POOLBWD_LCL_DATA_HEIGHT; tj += MLO_POOLBWD_GROUP_SZ1) + { + int top_y_act = top_y + tj; + int top_y_off = top_y_act * mlo_topdf_str; + + int lcl_off_v = tj * MLO_POOLBWD_LCL_DATA_WIDTH; + + bool invisibleY = (top_y_act >= mlo_top_height); + + for(int ti = lcl_id0; ti < MLO_POOLBWD_LCL_DATA_WIDTH; ti += MLO_POOLBWD_GROUP_SZ0) + { + + int top_x_act = top_x + ti; + + bool invisibleX = (top_x_act >= mlo_top_width); + + int top_diff_off = (invisibleX || invisibleY) ? 0 : top_off + top_y_off + top_x_act; + + _FLOAT top_val = top_diff[top_diff_off]; + + top_val = (invisibleX || invisibleY) ? 0 : top_val; + + lcl_top_diff[lcl_off_v + ti] = top_val; +#if 0 + if (lcl_id1==0&&o==0&&b==0) + { + printf("K:in: %d %d %d %f\n", top_off + top_y_off + top_x_act, top_y_act, top_x_act, top_val); + } +#endif + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + int bot_y = (y + lcl_id1 * MLO_POOLBWD_N_VERT_OUT_PIX); + int bot_x = (x + lcl_id0 * MLO_POOLBWD_N_HORIZ_OUT_PIX); + + for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) + { + + int h = bot_y + k + mlo_pad1; + int top_hstart = (h < MLO_POOLING_KERNEL_SZ1) + ? 0 + : (h - MLO_POOLING_KERNEL_SZ1) / MLO_POOLING_STRIDE1 + 1; + int top_hend = min(h / MLO_POOLING_STRIDE1 + 1, mlo_top_height); + + for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) + { + + int w = bot_x + l + mlo_pad0; + int top_wstart = (w < MLO_POOLING_KERNEL_SZ0) + ? 0 + : (w - MLO_POOLING_KERNEL_SZ0) / MLO_POOLING_STRIDE0 + 1; + int top_wend = min(w / MLO_POOLING_STRIDE0 + 1, mlo_top_width); + + for(int top_h = top_hstart; top_h < top_hend; ++top_h) + { + int hstart = top_h * MLO_POOLING_STRIDE1 - mlo_pad1; + int hend = min(hstart + MLO_POOLING_KERNEL_SZ1, mlo_bot_height); + hstart = max(hstart, 0); + + for(int top_w = top_wstart; top_w < top_wend; ++top_w) + { + // figure out the pooling size + int wstart = top_w * MLO_POOLING_STRIDE0 - mlo_pad0; + int wend = min(wstart + MLO_POOLING_KERNEL_SZ0, mlo_bot_width); + wstart = max(wstart, 0); + int pool_size = +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE + MLO_POOLING_KERNEL_SZ0 * MLO_POOLING_KERNEL_SZ1; + (void)wend; + (void)hend; +#else + (hend - hstart) * (wend - wstart); +#endif + pool_size = (pool_size == 0) ? 1 : pool_size; + int lcl_top_h = top_h - top_y; + int lcl_top_w = top_w - top_x; + _FLOAT_ACCUM add_val = + CVT_FLOAT2ACCUM( + lcl_top_diff[lcl_top_h * MLO_POOLBWD_LCL_DATA_WIDTH + lcl_top_w]) / + CVT_INTEGRAL2ACCUM(pool_size); + res[k][l] += add_val; +#if 0 + if (bot_x+l==6&&bot_y+k==0&&o==3&&b==0) + { + printf("K:com: %d %d %d %d %d %d %10.8f %10.8f %10.8f %d\n", k,l,top_h, top_w, lcl_top_h, lcl_top_w, res[k][l], add_val, lcl_top_diff[lcl_top_h * MLO_POOLBWD_LCL_DATA_WIDTH + lcl_top_w], pool_size); + } +#endif + } + } + } + } + + int bot_off = + b * mlo_botdf_batch_str + o * mlo_botdf_channel_str + bot_y * mlo_botdf_str + bot_x; + for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) + { + for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) + { + if(bot_y + k < mlo_bot_height && bot_x + l < mlo_bot_width) + { + bot_diff[bot_off + k * mlo_botdf_str + l] = CVT_ACCUM2FLOAT(res[k][l]); +#if 0 + if (lcl_id0==0&&lcl_id1==0&&o==0&&b==0) + { + printf("K:out: %d %d %d %f\n", bot_off + k * mlo_botdf_str +l, k, l, bot_diff[bot_off + k * mlo_botdf_str +l]); + } +#endif + } + } + } +} #endif // AVERAGE_OPS #if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX @@ -379,4 +541,164 @@ mloPoolingMaxBwd(const __global _FLOAT* top_df, } } } + +__attribute__((reqd_work_group_size(MLO_POOLBWD_GROUP_SZ0, + MLO_POOLBWD_GROUP_SZ1, + MLO_POOLBWD_GROUP_SZ2))) __kernel void +mloPoolingMaxBwdNhwc(const __global _FLOAT* top_df, + __global _FLOAT* bot_df, + __global index_t* mask, + int mlo_pad1, + int mlo_pad0, + int mlo_n_outputs, + int mlo_bot_height, + int mlo_bot_width, + int mlo_top_height, + int mlo_top_width, + int mlo_botdf_batch_str, + int mlo_botdf_channel_str, + int mlo_botdf_str, + int mlo_topdf_batch_str, + int mlo_topdf_channel_str, + int mlo_topdf_str) +{ + __local _FLOAT lcl_top_df[MLO_POOLBWD_LCL_DATA_WIDTH * MLO_POOLBWD_LCL_DATA_HEIGHT]; + __local index_t lcl_mask[MLO_POOLBWD_LCL_DATA_WIDTH * MLO_POOLBWD_LCL_DATA_HEIGHT]; + + int gid0 = get_group_id(0); + int gid1 = get_group_id(1); + int x = gid0 * MLO_POOLBWD_GROUP_SZ0 * MLO_POOLBWD_N_HORIZ_OUT_PIX; + int y = gid1 * MLO_POOLBWD_GROUP_SZ1 * MLO_POOLBWD_N_VERT_OUT_PIX; + int lcl_id0 = get_local_id(0); + int lcl_id1 = get_local_id(1); + int ob = get_global_id(2); // outputs * batch_sz + int b = ob / mlo_n_outputs; + int o = ob - b * mlo_n_outputs; + + int top_x = (x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0) < 0 + ? 0 + : (x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0) / MLO_POOLING_STRIDE0 + 1; + int top_y = (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) < 0 + ? 0 + : (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) / MLO_POOLING_STRIDE1 + 1; + int top_df_off = b * mlo_topdf_batch_str + o * mlo_topdf_channel_str; + + _FLOAT res[MLO_POOLBWD_N_VERT_OUT_PIX][MLO_POOLBWD_N_HORIZ_OUT_PIX]; + _FLOAT top_df_val; + index_t mask_val; + // load tiles + // top df and top + for(int tj = lcl_id1; tj < MLO_POOLBWD_LCL_DATA_HEIGHT; tj += MLO_POOLBWD_GROUP_SZ1) + { + int top_y_act = top_y + tj; + int top_df_y_off = top_y_act * mlo_topdf_str; + + int lcl_off_v = tj * MLO_POOLBWD_LCL_DATA_WIDTH; + + bool visibleY = (top_y_act < mlo_top_height); + + for(int ti = lcl_id0; ti < MLO_POOLBWD_LCL_DATA_WIDTH; ti += MLO_POOLBWD_GROUP_SZ0) + { + mask_val = MLO_POOLING_INDEX_MAX; + int top_x_act = top_x + ti; + int lcl_idx = lcl_off_v + ti; + + bool visible = visibleY && (top_x_act < mlo_top_width); + if(visible) + { + int idx = top_df_off + top_df_y_off + top_x_act; + + top_df_val = top_df[idx]; + mask_val = mask[idx]; + // top_df_val *= visible; + + lcl_top_df[lcl_idx] = top_df_val; + } + lcl_mask[lcl_idx] = mask_val; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + _FLOAT add_val; + int bt_y = (y + lcl_id1 * MLO_POOLBWD_N_VERT_OUT_PIX); + int bt_x = (x + lcl_id0 * MLO_POOLBWD_N_HORIZ_OUT_PIX); + + for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) + { + int b_y = bt_y + k; + + // top most top y that can be influenced by this bot y + int tt_y1 = + (b_y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1 + MLO_POOLING_STRIDE1) / MLO_POOLING_STRIDE1; + int tt_y = max(0, tt_y1); + + for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) + { + int b_x = bt_x + l; + // left most top x that can be influenced by this bot x + int lt_x1 = (b_x + mlo_pad0 - MLO_POOLING_KERNEL_SZ0 + MLO_POOLING_STRIDE0) / + MLO_POOLING_STRIDE0; + int lt_x = max(0, lt_x1); + + // find and sum up all tops that have been influenced by particular bot + res[k][l] = 0; + + for(int th = tt_y; th < tt_y + (MLO_POOLING_KERNEL_SZ1 + MLO_POOLING_STRIDE1 - 1) / + MLO_POOLING_STRIDE1; + ++th) + { + __attribute__((opencl_unroll_hint(2))) for(int tw = lt_x; + tw < lt_x + (MLO_POOLING_KERNEL_SZ0 + + MLO_POOLING_STRIDE0 - 1) / + MLO_POOLING_STRIDE0; + ++tw) + { + int lcl_th = th - top_y; + int lcl_tw = tw - top_x; +#if USE_IMG_INDEX + index_t img_idx = b_x + b_y * mlo_bot_width; +#else + int filter_x = b_x - tw * MLO_POOLING_STRIDE0 + mlo_pad0; + int filter_y = b_y - th * MLO_POOLING_STRIDE1 + mlo_pad1; + int filter_idx = filter_x + filter_y * MLO_POOLING_KERNEL_SZ0; +#endif + bool visible = (lcl_th < MLO_POOLBWD_LCL_DATA_HEIGHT) && + (lcl_tw < MLO_POOLBWD_LCL_DATA_WIDTH); + int lcl_idx = visible ? (lcl_th * MLO_POOLBWD_LCL_DATA_WIDTH + lcl_tw) : 0; + + bool match = visible && +#if USE_IMG_INDEX + (img_idx == lcl_mask[lcl_idx]) +#else + (filter_idx == lcl_mask[lcl_idx]) && (filter_x >= 0) && + (filter_y >= 0) +#endif + ; + + //_FLOAT add_val = lcl_top_df[lcl_idx] * match; + //_FLOAT add_val = match ? lcl_top_df[lcl_idx] : (_FLOAT)0; + if(match) + { + add_val = lcl_top_df[lcl_idx]; + res[k][l] += add_val; + } + } + } + } + } + + int bot_df_off = + b * mlo_botdf_batch_str + o * mlo_botdf_channel_str + bt_y * mlo_botdf_str + bt_x; + for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) + { + for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) + { + if((bt_y + k) < mlo_bot_height && (bt_x + l) < mlo_bot_width) + { + bot_df[bot_df_off + k * mlo_botdf_str + l] = res[k][l]; + } + } + } +} + #endif // MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX diff --git a/src/kernels/MIOpenPoolingBwdND.cl b/src/kernels/MIOpenPoolingBwdND.cl index 7daacd24ab..4e9b4da04e 100644 --- a/src/kernels/MIOpenPoolingBwdND.cl +++ b/src/kernels/MIOpenPoolingBwdND.cl @@ -147,6 +147,120 @@ mloPoolingNDMaxBwd(const __global _FLOAT* top_df, } } +__attribute__((reqd_work_group_size(MLO_POOLING_GROUP_SZ0, 1, 1))) __kernel void +mloPoolingNDMaxBwdNhwc(const __global _FLOAT* top_df, + __global _FLOAT* bot_df, + __global index_t* mask, + const uint pad_d, + const uint pad_h, + const uint pad_w, + const uint batch, + const uint chal, + const uint bot_d, + const uint bot_h, + const uint bot_w, + const uint top_d, + const uint top_h, + const uint top_w, + const uint bot_str_b, + const uint bot_str_c, + const uint bot_str_d, + const uint bot_str_h, + const uint top_str_b, + const uint top_str_c, + const uint top_str_d, + const uint top_str_h, + const uint total_work) +{ + + int bot_blk_w = (bot_w + PIX_W_PER_WORK - 1) / PIX_W_PER_WORK; + int bot_blk_h = (bot_h + PIX_H_PER_WORK - 1) / PIX_H_PER_WORK; + int bot_blk_d = (bot_d + PIX_D_PER_WORK - 1) / PIX_D_PER_WORK; + + bot_blk_w = max(bot_blk_w, 1); + bot_blk_h = max(bot_blk_h, 1); + bot_blk_d = max(bot_blk_d, 1); + + for(uint gid = get_global_id(0); gid < total_work; gid += MAX_ACTIV_WORKITEM) + { + int b_id = gid / chal / bot_blk_w / bot_blk_h / bot_blk_d; + int c_id = (gid / bot_blk_w / bot_blk_h / bot_blk_d) % chal; + + int bot_d_id = ((gid / bot_blk_w / bot_blk_h) % bot_blk_d) * PIX_D_PER_WORK; + int bot_h_id = ((gid / bot_blk_w) % bot_blk_h) * PIX_H_PER_WORK; + int bot_w_id = (gid % bot_blk_w) * PIX_W_PER_WORK; + + int top_d_start = + bot_d_id + pad_d < KERNEL_SZ_D ? 0 : (bot_d_id + pad_d - KERNEL_SZ_D) / STRIDE_D + 1; + int top_h_start = + bot_h_id + pad_h < KERNEL_SZ_H ? 0 : (bot_h_id + pad_h - KERNEL_SZ_H) / STRIDE_H + 1; + int top_w_start = + bot_w_id + pad_w < KERNEL_SZ_W ? 0 : (bot_w_id + pad_w - KERNEL_SZ_W) / STRIDE_W + 1; + + int top_d_end = (bot_d_id + PIX_D_PER_WORK - 1 + pad_d) / STRIDE_D + 1; + int top_h_end = (bot_h_id + PIX_H_PER_WORK - 1 + pad_h) / STRIDE_H + 1; + int top_w_end = (bot_w_id + PIX_W_PER_WORK - 1 + pad_w) / STRIDE_W + 1; + + top_d_end = min(top_d_end, (int)top_d); + top_h_end = min(top_h_end, (int)top_h); + top_w_end = min(top_w_end, (int)top_w); + + _FLOAT bot_data[PIX_D_PER_WORK][PIX_H_PER_WORK][PIX_W_PER_WORK] = {0}; + + for(int h = top_d_start; h < top_d_end; ++h) + { + for(int j = top_h_start; j < top_h_end; ++j) + { + for(int i = top_w_start; i < top_w_end; ++i) + { + uint top_gbl_off = + b_id * top_str_b + c_id * top_str_c + h * top_str_d + j * top_str_h + i; + + _FLOAT top_val = b_id < batch ? top_df[top_gbl_off] : 0; + index_t mask_idx = b_id < batch ? mask[top_gbl_off] : MLO_POOLING_INDEX_MAX; + + uint mask_d_id = mask_idx / bot_h / bot_w; + uint mask_h_id = (mask_idx / bot_w) % bot_h; + uint mask_w_id = mask_idx % bot_w; + + if(mask_d_id >= bot_d_id && mask_h_id >= bot_h_id && mask_w_id >= bot_w_id && + mask_d_id < bot_d_id + PIX_D_PER_WORK && + mask_h_id < bot_h_id + PIX_H_PER_WORK && + mask_w_id < bot_w_id + PIX_W_PER_WORK) + { + mask_d_id -= bot_d_id; + mask_h_id -= bot_h_id; + mask_w_id -= bot_w_id; + + bot_data[mask_d_id][mask_h_id][mask_w_id] += top_val; + } + } + } + } + + uint bot_off = b_id * bot_str_b + c_id * bot_str_c + bot_d_id * bot_str_d + + bot_h_id * bot_str_h + bot_w_id; + + for(uint m = 0; m < PIX_D_PER_WORK; m++) + { + for(uint k = 0; k < PIX_H_PER_WORK; k++) + { + for(uint l = 0; l < PIX_W_PER_WORK; l++) + { + + if(bot_d_id + m < bot_d && bot_h_id + k < bot_h && bot_w_id + l < bot_w && + b_id < batch) + { + uint bot_idx = bot_off + m * bot_str_d + k * bot_str_h + l; + + bot_df[bot_idx] = bot_data[m][k][l]; + } + } + } + } + } +} + #elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE || MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE __attribute__((reqd_work_group_size(MLO_POOLING_GROUP_SZ0, 1, 1))) __kernel void @@ -281,4 +395,137 @@ mloPoolingNDAveBwd(const __global _FLOAT* top_df, } } } + +__attribute__((reqd_work_group_size(MLO_POOLING_GROUP_SZ0, 1, 1))) __kernel void +mloPoolingNDAveBwdNhwc(const __global _FLOAT* top_df, + __global _FLOAT* bot_df, + const uint pad_d, + const uint pad_h, + const uint pad_w, + const uint batch, + const uint chal, + const uint bot_d, + const uint bot_h, + const uint bot_w, + const uint top_d, + const uint top_h, + const uint top_w, + const uint bot_str_b, + const uint bot_str_c, + const uint bot_str_d, + const uint bot_str_h, + const uint top_str_b, + const uint top_str_c, + const uint top_str_d, + const uint top_str_h, + const uint total_work) +{ + + int bot_blk_w = (bot_w + PIX_W_PER_WORK - 1) / PIX_W_PER_WORK; + int bot_blk_h = (bot_h + PIX_H_PER_WORK - 1) / PIX_H_PER_WORK; + int bot_blk_d = (bot_d + PIX_D_PER_WORK - 1) / PIX_D_PER_WORK; + + bot_blk_w = max(bot_blk_w, 1); + bot_blk_h = max(bot_blk_h, 1); + bot_blk_d = max(bot_blk_d, 1); + + for(uint gid = get_global_id(0); gid < total_work; gid += MAX_ACTIV_WORKITEM) + { + int b_id = gid / chal / bot_blk_w / bot_blk_h / bot_blk_d; + int c_id = (gid / bot_blk_w / bot_blk_h / bot_blk_d) % chal; + + int bot_d_id = ((gid / bot_blk_w / bot_blk_h) % bot_blk_d) * PIX_D_PER_WORK; + int bot_h_id = ((gid / bot_blk_w) % bot_blk_h) * PIX_H_PER_WORK; + int bot_w_id = (gid % bot_blk_w) * PIX_W_PER_WORK; + + int top_d_start = + bot_d_id + pad_d < KERNEL_SZ_D ? 0 : (bot_d_id + pad_d - KERNEL_SZ_D) / STRIDE_D + 1; + int top_h_start = + bot_h_id + pad_h < KERNEL_SZ_H ? 0 : (bot_h_id + pad_h - KERNEL_SZ_H) / STRIDE_H + 1; + int top_w_start = + bot_w_id + pad_w < KERNEL_SZ_W ? 0 : (bot_w_id + pad_w - KERNEL_SZ_W) / STRIDE_W + 1; + + int top_d_end = (bot_d_id + PIX_D_PER_WORK - 1 + pad_d) / STRIDE_D + 1; + int top_h_end = (bot_h_id + PIX_H_PER_WORK - 1 + pad_h) / STRIDE_H + 1; + int top_w_end = (bot_w_id + PIX_W_PER_WORK - 1 + pad_w) / STRIDE_W + 1; + + top_d_end = min(top_d_end, (int)top_d); + top_h_end = min(top_h_end, (int)top_h); + top_w_end = min(top_w_end, (int)top_w); + + _FLOAT_ACCUM bot_data[PIX_D_PER_WORK][PIX_H_PER_WORK][PIX_W_PER_WORK] = {0}; + + for(int h = top_d_start; h < top_d_end; ++h) + { + int dstart = h * STRIDE_D - pad_d; + int dend = min((dstart + KERNEL_SZ_D), (int)bot_d); + dstart = max(dstart, 0); + + for(int j = top_h_start; j < top_h_end; ++j) + { + int hstart = j * STRIDE_H - pad_h; + int hend = min((hstart + KERNEL_SZ_H), (int)bot_h); + hstart = max(hstart, 0); + + for(int i = top_w_start; i < top_w_end; ++i) + { + int wstart = i * STRIDE_W - pad_w; + int wend = min((wstart + KERNEL_SZ_W), (int)bot_w); + wstart = max(wstart, 0); + + uint pool_size = +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE + KERNEL_SZ_W * KERNEL_SZ_H * KERNEL_SZ_D; +#else + (dend - dstart) * (hend - hstart) * (wend - wstart); +#endif + pool_size = (pool_size == 0) ? 1 : pool_size; + + uint top_gbl_off = + b_id * top_str_b + c_id * top_str_c + h * top_str_d + j * top_str_h + i; + _FLOAT_ACCUM add_val = + b_id < batch ? CVT_FLOAT2ACCUM(top_df[top_gbl_off]) : CVT_FP32_2ACCUM(0.0f); + add_val /= CVT_INTEGRAL2ACCUM(pool_size); + + for(int m = dstart; m < dend; ++m) + { + for(int k = hstart; k < hend; ++k) + { + for(int l = wstart; l < wend; ++l) + { + if(m >= bot_d_id && m < PIX_D_PER_WORK + bot_d_id && + k >= bot_h_id && k < PIX_H_PER_WORK + bot_h_id && + l >= bot_w_id && l < PIX_W_PER_WORK + bot_w_id && b_id < batch) + { + bot_data[m - bot_d_id][k - bot_h_id][l - bot_w_id] += add_val; + } + } + } + } + } + } + } + + uint bot_off = b_id * bot_str_b + c_id * bot_str_c + bot_d_id * bot_str_d + + bot_h_id * bot_str_h + bot_w_id; + + for(uint m = 0; m < PIX_D_PER_WORK; m++) + { + for(uint k = 0; k < PIX_H_PER_WORK; k++) + { + for(uint l = 0; l < PIX_W_PER_WORK; l++) + { + + if(bot_d_id + m < bot_d && bot_h_id + k < bot_h && bot_w_id + l < bot_w && + b_id < batch) + { + uint bot_idx = bot_off + m * bot_str_d + k * bot_str_h + l; + + bot_df[bot_idx] = CVT_ACCUM2FLOAT(bot_data[m][k][l]); + } + } + } + } + } +} #endif diff --git a/src/kernels/MIOpenPoolingForwardNaive.cl b/src/kernels/MIOpenPoolingForwardNaive.cl index 20e0949967..f839193297 100644 --- a/src/kernels/MIOpenPoolingForwardNaive.cl +++ b/src/kernels/MIOpenPoolingForwardNaive.cl @@ -145,7 +145,7 @@ __kernel void mloPoolingForwardNaive(const __global _FLOAT* bot_ptr, uint h_save = 0; uint w_save = 0; #endif - for(uint d = dstart; d < dend; ++d) + for(size_t d = dstart; d < dend; ++d) { for(uint h = hstart; h < hend; ++h) { @@ -153,7 +153,7 @@ __kernel void mloPoolingForwardNaive(const __global _FLOAT* bot_ptr, { const size_t bot_index = b * bot_n_stride // + o * bot_c_stride // - + (size_t)(d * bot_d_stride) // + + d * bot_d_stride // + (size_t)(h * bot_h_stride) // + (size_t)(w * bot_w_stride); #if AVERAGE_OPS diff --git a/src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp new file mode 100644 index 0000000000..bc17af8922 --- /dev/null +++ b/src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp @@ -0,0 +1,245 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "pooling_functions.h" + +#include + +#if(MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE) || (MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE) +#define AVERAGE_OPS 1 +#else +#define AVERAGE_OPS 0 +#endif + +// Let's use extended-precision accumulator only in FP16 pooling and only for averaging. +// For all other ops and datatypes, use native accumulator, i.e. treate FLOAT_ACCUM as FLOAT. +#if !(AVERAGE_OPS && MIOPEN_USE_FP16) +#define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 1 +#endif +#include "float_types.h" + +#ifndef MLO_POOLING_IS2D_KERNEL +#error "MLO_POOLING_IS2D_KERNEL must be defined" +#endif + +#if AVERAGE_OPS +#define ARG_UNUSED_FOR_AVERAGE __attribute__((__unused__)) +#else +#define ARG_UNUSED_FOR_AVERAGE +#endif + +#if MLO_POOLING_IS2D_KERNEL +#define ARG_UNUSED_FOR_2D __attribute__((__unused__)) +#else +#define ARG_UNUSED_FOR_2D +#endif + +// Out N, D, H are encoded into the block indices x, y, z +// Requires all lens, strides, pads to be in DHW[NC] order. The code is +// cleaner and more performant this way. +// No 2D-only optimization. +template +__device__ void poolingFwdNDNhwcNaive(const TI* in_data, + TO* out_data, + ARG_UNUSED_FOR_AVERAGE index_t* mask_ptr, + ARG_UNUSED_FOR_AVERAGE int save_index, + ARG_UNUSED_FOR_AVERAGE int index_mode, + std::vector filter_lens, + std::vector filter_strides, + std::vector filter_pads, + uint32_t all_n, + uint32_t all_c, + std::vector lens, + std::vector strides, + std::vector out_lens, + std::vector out_strides, + ARG_UNUSED_FOR_AVERAGE std::vector mask_strides) +{ + constexpr uint32_t D_IDX = 0; + constexpr uint32_t H_IDX = 1; + constexpr uint32_t W_IDX = 2; + constexpr uint32_t N_IDX = 3; + constexpr uint32_t C_IDX = 4; + + const uint32_t b = blockIdx.x; // out N + if(!(b < all_n)) + return; + + const uint32_t k = blockIdx.y; // out D + if(!(k < out_lens[D_IDX])) + return; + + const uint32_t j = blockIdx.z; // out H + if(!(j < out_lens[H_IDX])) + return; + + for(uint32_t i = 0; i < out_lens[W_IDX]; ++i) // out W + { + for(uint32_t o = 0; o < all_c ++o) // out C + { + const auto int_dstart = static_cast(k * filter_strides[D_IDX]) - static_cast(filter_pads[D_IDX]); + const auto int_hstart = static_cast(j * filter_strides[H_IDX]) - static_cast(filter_pads[H_IDX]); + const auto int_wstart = static_cast(i * filter_strides[W_IDX]) - static_cast(filter_pads[W_IDX]); + const auto dend = static_cast(min(int_dstart + static_cast(filter_lens[D_IDX]), static_cast(out_lens[D_IDX]))); + const auto hend = static_cast(min(int_hstart + static_cast(filter_lens[H_IDX]), static_cast(out_lens[H_IDX]))); + const auto wend = static_cast(min(int_wstart + static_cast(filter_lens[W_IDX]), static_cast(out_lens[W_IDX]))); + const auto dstart = static_cast(max(int_dstart, 0)); + const auto hstart = static_cast(max(int_hstart, 0)); + const auto wstart = static_cast(max(int_wstart, 0)); + +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE + uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + pool_size = (pool_size == 0) ? 1 : pool_size; +#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE + const uint32_t pool_size = filter_lens[D_IDX] * filter_lens[H_IDX] * filter_lens[W_IDX]; +#endif + +#if AVERAGE_OPS + _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0); +#else // MAX + _FLOAT_ACCUM res = (_FLOAT_ACCUM)(-MAX_VAL_ACCUM); + bool found = false; // May remain false if bot contains only NaNs/-INFs. + uint32_t d_save = 0; + uint32_t h_save = 0; + uint32_t w_save = 0; +#endif + for(size_t d = dstart; d < dend; ++d) + { + for(uint32_t h = hstart; h < hend; ++h) + { + for(uint32_t w = wstart; w < wend; ++w) + { + const size_t in_index = b * strides[N_IDX] + // + o * strides[C_IDX] + // + d * strides[D_IDX] + // + static_cast(h * strides[H_IDX]) + // + static_cast(w * strides[W_IDX]); +#if AVERAGE_OPS + res += in_data[in_index]; +#else // MAX + if(static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index] > res)) + { + res = in_data[in_index]; + if(save_index) + { + found = true; + d_save = d; + h_save = h; + w_save = w; + } + } +#endif + } + } + } + +#if AVERAGE_OPS + res *= CVT_FP32_2ACCUM(1.f) / static_cast<_FLOAT_ACCUM>(pool_size); +#else // MAX + if(save_index) + { + index_t res_index = 0; + + /// Preventing overflow during computation of res_index: + /// If Index is shorter than uint, then let's perform computation in 32-bit + /// domain and then convert to narrower Index. That would reduce the probability of + /// overflow. If Index is wider then 32 bits, then it seems like it is better to + /// convert to Index type before multiplication. However this is not actually + /// necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in + /// 32 bits and then convert. + + if(found) + { + if(index_mode == 1) + res_index = (index_t)(d_save * lens[H_IDX] * lens[W_IDX] // + + h_save * lens[W_IDX] // + + w_save); + else + res_index = (index_t)( // + ((d_save - k * filter_strides[D_IDX] + filter_pads[D_IDX]) * filter_lens[W_IDX] * filter_lens[H_IDX]) // + + ((h_save - j * filter_strides[H_IDX] + filter_pads[H_IDX]) * filter_lens[W_IDX]) // + + (w_save - i * filter_strides[W_IDX] + filter_pads[W_IDX]) // + ); + } + + const size_t mask_index = b * mask_strides[N_IDX] // + + o * mask_strides[C_IDX] // + + (size_t)(k * mask_strides[D_IDX]) // + + (size_t)(j * mask_strides[H_IDX]) // + + (size_t)(i * mask_strides[W_IDX]); + mask_ptr[mask_index] = res_index; + } +#endif + const size_t out_index = out_strides[N_IDX] // + + o * out_strides[C_IDX] // + + (size_t)(k * out_strides[D_IDX]) // + + (size_t)(j * out_strides[H_IDX]) // + + (size_t)(i * out_strides]W_IDX]); + + out_data[out_index] = (_FLOAT)res; + } +} +} + +extern "C" __global__ void mloPoolingForwardNDNhwcNaive(const INPUT_TYPE* __restrict__ in_data, + OUTPUT_TYPE* out_data, + ARG_UNUSED_FOR_AVERAGE index_t* mask_ptr, + ARG_UNUSED_FOR_AVERAGE int save_index, + ARG_UNUSED_FOR_AVERAGE int index_mode, + std::vector filter_lens, + std::vector filter_strides, + std::vector filter_pads, + uint32_t all_n, + uint32_t all_c, + std::vector lens, + std::vector strides, + std::vector out_lens, + std::vector out_strides, + ARG_UNUSED_FOR_AVERAGE std::vector mask_strides) +{ + poolingFwdNDNhwcNaive( + in_data, + out_data, + mask_ptr, + save_index, + index_mode, + filter_lens, + filter_strides, + filter_pads, + all_n, + all_c, + lens, + strides, + out_lens, + out_strides, + mask_strides + ); +} diff --git a/src/ocl/pooling_ocl.cpp b/src/ocl/pooling_ocl.cpp index 9881c1596f..9a2258908f 100644 --- a/src/ocl/pooling_ocl.cpp +++ b/src/ocl/pooling_ocl.cpp @@ -42,6 +42,9 @@ static auto PoolingForwardSolvers() return solver::SolverContainer{}; } @@ -50,6 +53,8 @@ static auto PoolingBackwardSolvers() { return solver::SolverContainer{}; } diff --git a/src/solver.cpp b/src/solver.cpp index e468d38d0a..0088a10cb1 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -561,12 +561,18 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Pooling, pooling::PoolingForward2d{}.SolverDbId()); Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNd{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardCk2d{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardCkNd{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::TransposedPoolingFwd2d{}.SolverDbId()); Register(registry, ++id, Primitive::Pooling, pooling::TransposedPoolingFwdNd{}.SolverDbId()); Register(registry, ++id, Primitive::Pooling, pooling::PoolingBackward2d{}.SolverDbId()); Register(registry, ++id, Primitive::Pooling, pooling::PoolingBackwardNd{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::PoolingBackwardCk2d{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::PoolingBackwardCkNd{}.SolverDbId()); + RegisterWithSolver(registry, ++id, conv::ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC{}, @@ -594,6 +600,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) fusion::ConvCKIgemmFwdBiasActivFused{}.SolverDbId(), miopenConvolutionAlgoImplicitGEMM); Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNaive{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNdNhwcNaive{}.SolverDbId()); RegisterWithSolver(registry, ++id, conv::ConvHipImplicitGemmGroupFwdXdlops{}, diff --git a/src/solver/pooling/backwardCk2d.cpp b/src/solver/pooling/backwardCk2d.cpp new file mode 100644 index 0000000000..d1ee157190 --- /dev/null +++ b/src/solver/pooling/backwardCk2d.cpp @@ -0,0 +1,308 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include + +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace pooling { + +namespace { + +struct kernel_params +{ + int kernel_size_h; + int kernel_size_w; + int kernel_stride_h; + int kernel_stride_w; + int out_pix_tile0; + int out_pix_tile1; + std::size_t batch_sz; + std::size_t n_inputs; + std::size_t in_height; + std::size_t in_width; + std::size_t grp_tile0; + std::size_t grp_tile1; + + kernel_params(const miopen::pooling::ProblemDescription& problem) + { + const auto& pd = problem.GetPooling(); + + kernel_size_w = pd.lens[1]; + kernel_size_h = pd.lens[0]; + kernel_stride_w = pd.strides[1]; + kernel_stride_h = pd.strides[0]; + + std::tie(batch_sz, n_inputs, in_height, in_width) = + miopen::tien<4>(problem.GetXDesc().GetLengths(), 1); + + out_pix_tile0 = 1; + out_pix_tile1 = 1; + if(pd.GetMode() == miopenPoolingMax) + { + out_pix_tile0 = in_width > 8 && in_width <= 24 ? 4 : 1; + out_pix_tile1 = in_width <= 24 ? 1 : (in_width > 64 && in_width <= 96 ? 4 : 8); + } + + grp_tile0 = 8; + grp_tile1 = 8; + if(pd.GetMode() == miopenPoolingMax) + { + grp_tile0 = in_width <= 8 ? 8 // + : in_width <= 16 ? 4 // + : in_width <= 24 ? 8 // + : in_width <= 32 ? 32 // + : in_width <= 64 ? 8 // + : in_width <= 96 ? 16 // + : in_width <= 128 ? 16 + : 32; + grp_tile1 = in_width <= 8 ? 8 // + : in_width <= 16 ? 16 // + : in_width <= 24 ? 8 // + : in_width <= 32 ? 4 // + : in_width <= 64 ? 8 // + : in_width <= 96 ? 4 // + : in_width <= 128 ? 16 + : 4; + } + } +}; + +std::size_t sizeof_kernel_FLOAT(const miopen::pooling::ProblemDescription& problem) +{ + const auto datatype = problem.GetXDesc().GetType(); + return get_data_size(datatype); +} + +std::size_t sizeof_kernel_index_t(const miopen::pooling::ProblemDescription& problem) +{ + return get_data_size(problem.GetPooling().GetIndexType()); +} + +inline std::size_t RoundUpToMultiple(std::size_t v, std::size_t m) +{ + assert(m > 0); + return ((v + m - 1) / m) * m; +} + +// Compute amount of local memory required for holding the arrays defined +// in the "mloPoolingAveBwd" and "mloPoolingMaxBwd" kernels. +std::size_t sizeof_local_memory(const miopen::pooling::ProblemDescription& problem) +{ + const kernel_params kp(problem); + + // aliases to ease programming + const auto& MLO_POOLING_KERNEL_SZ0 = kp.kernel_size_w; + const auto& MLO_POOLING_KERNEL_SZ1 = kp.kernel_size_h; + const auto& MLO_POOLBWD_N_HORIZ_OUT_PIX = kp.out_pix_tile0; + const auto& MLO_POOLBWD_N_VERT_OUT_PIX = kp.out_pix_tile1; + const auto& MLO_POOLING_STRIDE0 = kp.kernel_stride_w; + const auto& MLO_POOLING_STRIDE1 = kp.kernel_stride_h; + const auto& MLO_POOLBWD_GROUP_SZ0 = kp.grp_tile0; + const auto& MLO_POOLBWD_GROUP_SZ1 = kp.grp_tile1; + + const auto MLO_POOLBWD_LCL_DATA_WIDTH = + (static_cast(MLO_POOLBWD_GROUP_SZ0) * MLO_POOLBWD_N_HORIZ_OUT_PIX + + MLO_POOLING_KERNEL_SZ0 + MLO_POOLING_STRIDE0 - 2) / + MLO_POOLING_STRIDE0; + const auto MLO_POOLBWD_LCL_DATA_HEIGHT = + (static_cast(MLO_POOLBWD_GROUP_SZ1) * MLO_POOLBWD_N_VERT_OUT_PIX + + MLO_POOLING_KERNEL_SZ1 + MLO_POOLING_STRIDE1 - 2) / + MLO_POOLING_STRIDE1; + + std::size_t rv = 0; + const auto nelem = MLO_POOLBWD_LCL_DATA_WIDTH * MLO_POOLBWD_LCL_DATA_HEIGHT; + if(problem.GetPooling().GetMode() == miopenPoolingMax) + { + const auto sizeof_lcl_top_df = sizeof_kernel_FLOAT(problem) * nelem; + const auto sizeof_lcl_mask = sizeof_kernel_index_t(problem) * nelem; + /// \anchor alignment_of_arrays_in_gpu_memory + /// The total amount of memory calculated here is slightly less than the amount calculated + /// by the compiler. As a result, the check here may pass, while then the compiler might + /// refuse to build the kernel. The most likely reason for the difference is padding (due to + /// alignment requirements). We don't know exactly how the compiler takes alignment into + /// account, but what can we do is applying an alignment that imposes a slightly tighter + /// constraints than the compiler. So far, 16-byte (4xDWORD) alignment works well. + rv = RoundUpToMultiple(sizeof_lcl_top_df, 16) + RoundUpToMultiple(sizeof_lcl_mask, 16); + } + else + { + const auto sizeof_lcl_top_diff = sizeof_kernel_FLOAT(problem) * nelem; + rv = RoundUpToMultiple(sizeof_lcl_top_diff, 16); + } + MIOPEN_LOG_T(rv); + return rv; +} + +} // namespace + +bool PoolingBackwardCk2d::IsApplicable(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + return problem.GetDirection() == miopen::pooling::Direction::Backward && + (problem.GetPooling().GetMode() == miopenPoolingMax || + problem.GetPooling().GetMode() == miopenPoolingAverage || + problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) && + problem.GetXDesc().GetNumDims() == 4 && problem.GetXDesc().GetLayout("NCHW") == "NHWC" && + problem.GetYDesc().GetLayout("NCHW") == "NHWC" && + sizeof_local_memory(problem) <= TargetProperties::GetMaxLocalMemorySize(); +} + +ConvSolution +PoolingBackwardCk2d::GetSolution(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const kernel_params kp(problem); + + { + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenPoolingBwd.cl"; +// TODO: backwardCk2d kernel + if(problem.GetPooling().GetMode() == miopenPoolingMax) + { + kernel.kernel_name = "mloPoolingMaxBwd"; + } + else if(problem.GetPooling().GetMode() == miopenPoolingAverage || + problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) + { + kernel.kernel_name = "mloPoolingAveBwd"; + } + + const int pooling_method = (problem.GetPooling().GetMode() == miopenPoolingMax) + ? MLO_POOLING_OP_MAX + : ((problem.GetPooling().GetMode() == miopenPoolingAverage) + ? MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE); + + const int g_wk_width = ((kp.in_width + kp.grp_tile0 * kp.out_pix_tile0 - 1) / + (kp.grp_tile0 * kp.out_pix_tile0)); + const int g_wk_height = ((kp.in_height + kp.grp_tile1 * kp.out_pix_tile1 - 1) / + (kp.grp_tile1 * kp.out_pix_tile1)); + + const auto build_params = + KernelBuildParameters{ + {"MLO_POOLING_OP_ID", pooling_method}, + {"MLO_POOLING_KERNEL_SZ1", kp.kernel_size_h}, + {"MLO_POOLING_STRIDE1", kp.kernel_stride_h}, + {"MLO_POOLING_KERNEL_SZ0", kp.kernel_size_w}, + {"MLO_POOLING_STRIDE0", kp.kernel_stride_w}, + {"MLO_POOLBWD_N_HORIZ_OUT_PIX", kp.out_pix_tile0}, + {"MLO_POOLBWD_N_VERT_OUT_PIX", kp.out_pix_tile1}, + {"MLO_POOLBWD_GROUP_SZ0", kp.grp_tile0}, + {"MLO_POOLBWD_GROUP_SZ1", kp.grp_tile1}, + {"MLO_POOLING_INDEX_TYPE", + get_pooling_index_type_name(problem.GetPooling().GetIndexType())}, + {"MLO_POOLING_INDEX_MAX", + get_pooling_index_type_max_name(problem.GetPooling().GetIndexType())}, + {"USE_IMG_INDEX", + problem.GetPooling().GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexImage + ? 1 + : 0}, + } + << GetDataTypeKBP(problem.GetXDesc().GetType()); + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + + kernel.l_wk = {kp.grp_tile0, kp.grp_tile1, 1}; + kernel.g_wk = { + g_wk_width * kp.grp_tile0, g_wk_height * kp.grp_tile1, kp.n_inputs * kp.batch_sz}; + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + if(params.pooling.GetMode() == miopenPoolingMax) + { + kernel(params.dy, + params.dx, + params.workspace, + static_cast(params.pooling.pads[0]), + static_cast(params.pooling.pads[1]), + static_cast(params.dyDesc.GetLengths()[1]), + static_cast(params.dxDesc.GetLengths()[2]), + static_cast(params.dxDesc.GetLengths()[3]), + static_cast(params.dyDesc.GetLengths()[2]), + static_cast(params.dyDesc.GetLengths()[3]), + static_cast(params.dxDesc.GetStrides()[0]), + static_cast(params.dxDesc.GetStrides()[1]), + static_cast(params.dxDesc.GetStrides()[2]), + static_cast(params.dyDesc.GetStrides()[0]), + static_cast(params.dyDesc.GetStrides()[1]), + static_cast(params.dyDesc.GetStrides()[2])); + } + else + { + kernel(params.dy, + params.dx, + static_cast(params.pooling.pads[0]), + static_cast(params.pooling.pads[1]), + static_cast(params.dyDesc.GetLengths()[1]), + static_cast(params.dxDesc.GetLengths()[2]), + static_cast(params.dxDesc.GetLengths()[3]), + static_cast(params.dyDesc.GetLengths()[2]), + static_cast(params.dyDesc.GetLengths()[3]), + static_cast(params.dxDesc.GetStrides()[0]), + static_cast(params.dxDesc.GetStrides()[1]), + static_cast(params.dxDesc.GetStrides()[2]), + static_cast(params.dyDesc.GetStrides()[0]), + static_cast(params.dyDesc.GetStrides()[1]), + static_cast(params.dyDesc.GetStrides()[2])); + } + }; + }; + + return result; +} + +std::size_t +PoolingBackwardCk2d::GetWorkspaceSize(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + if(problem.GetPooling().GetMode() != miopenPoolingMax) + return 0; + return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType()); +} + +} // namespace pooling + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/pooling/backwardCkNd.cpp b/src/solver/pooling/backwardCkNd.cpp new file mode 100644 index 0000000000..b8cdbb0286 --- /dev/null +++ b/src/solver/pooling/backwardCkNd.cpp @@ -0,0 +1,273 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include + +#include +#include +#include +#include + +#define WORKAROUND_ISSUE_MIFIN_80 1 // https://github.com/ROCm/MIFin/issues/80 + +namespace miopen { + +namespace solver { + +namespace pooling { + +bool PoolingBackwardCkNd::IsApplicable(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + return problem.GetDirection() == miopen::pooling::Direction::Backward // + && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // + && (problem.GetXDesc().GetType() == miopenFloat // + || problem.GetXDesc().GetType() == miopenHalf) // + && (problem.GetPooling().GetMode() == miopenPoolingMax // + || problem.GetPooling().GetMode() == miopenPoolingAverage // + || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) // + && ( // + (problem.GetXDesc().GetNumDims() == 5 // + && problem.GetXDesc().GetLayout("NCDHW") == "NDHWC" // + && problem.GetYDesc().GetLayout("NCDHW") == "NDHWC") // + || // + (problem.GetXDesc().GetNumDims() == 4 // + && problem.GetXDesc().GetLayout("NCHW") == "NHWC" // + && problem.GetYDesc().GetLayout("NCHW") == "NHWC") // + ) // + /// \todo This solver does not support workspace index mask mode yet. + && !(problem.GetPooling().GetMode() == miopenPoolingMax // + && problem.GetPooling().GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexMask); +} + +ConvSolution +PoolingBackwardCkNd::GetSolution(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenPoolingBwdND.cl"; + kernel.kernel_name = "mloPoolingND"; +// TODO: backwardCkNd kernel + if(problem.GetPooling().GetMode() == miopenPoolingMax) + { + kernel.kernel_name += "MaxBwd"; + } + else if(problem.GetPooling().GetMode() == miopenPoolingAverage || + problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) + { + kernel.kernel_name += "AveBwd"; + } + + const auto& bot = problem.GetXDesc(); + const auto& top = problem.GetYDesc(); + + std::size_t batch_sz, n_inputs, in_height, in_width; + std::tie(batch_sz, n_inputs, in_height, in_width) = miopen::tien<4>(bot.GetLengths(), 1); + + const int pooling_method = (problem.GetPooling().GetMode() == miopenPoolingMax) + ? MLO_POOLING_OP_MAX + : ((problem.GetPooling().GetMode() == miopenPoolingAverage) + ? MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE); + + int pix_w_per_work = 1; + int pix_h_per_work = 4; + int pix_d_per_work = 2; + + int batch = top.GetLengths()[0]; + int chal = top.GetLengths()[1]; + + const bool is2d = (bot.GetNumDims() == 4); + + int bot_d = is2d ? 1 : *(bot.GetLengths().rbegin() + 2); + int bot_h = *(bot.GetLengths().rbegin() + 1); + int bot_w = *(bot.GetLengths().rbegin()); + + int pix_blk_w = std::max((bot_w + pix_w_per_work - 1) / pix_w_per_work, 1); + int pix_blk_h = std::max((bot_h + pix_h_per_work - 1) / pix_h_per_work, 1); + int pix_blk_d = std::max((bot_d + pix_d_per_work - 1) / pix_d_per_work, 1); + + int max_activ_workitem = 65536; + int total_work = batch * chal * pix_blk_w * pix_blk_h * pix_blk_d; + int activ_work = std::min(total_work, max_activ_workitem); + +#if WORKAROUND_ISSUE_MIFIN_80 + const std::size_t wavesize = 64; +#else + const std::size_t wavesize = context.GetStream().GetWavefrontWidth(); +#endif + size_t grp_num = (activ_work + wavesize - 1) / wavesize; + + auto strides = problem.GetPooling().strides; + auto lens = problem.GetPooling().lens; + auto pads = problem.GetPooling().pads; + + if(is2d) + { + strides.push_back(strides[1]); + strides[1] = strides[0]; + lens.push_back(lens[1]); + lens[1] = lens[0]; + lens[0] = 1; + pads.push_back(pads[1]); + pads[1] = pads[0]; + pads[0] = 0; + } + + bool territory_overlap = false; + for(std::size_t i = 0; i < strides.size(); i++) + territory_overlap |= (strides[i] < lens[i]); + + const auto build_params = + KernelBuildParameters{ + {"MLO_POOLING_OP_ID", pooling_method}, + {"MAX_ACTIV_WORKITEM", max_activ_workitem}, + {"MLO_POOLING_GROUP_SZ0", wavesize}, + {"MLO_POOLING_GROUP_SZ1", 1}, + {"MLO_POOLING_GROUP_SZ2", 1}, + {"PIX_W_PER_WORK", pix_w_per_work}, + {"PIX_H_PER_WORK", pix_h_per_work}, + {"PIX_D_PER_WORK", pix_d_per_work}, + {"KERNEL_SZ_D", lens[0]}, + {"KERNEL_SZ_H", lens[1]}, + {"KERNEL_SZ_W", lens[2]}, + {"STRIDE_D", strides[0]}, + {"STRIDE_H", strides[1]}, + {"STRIDE_W", strides[2]}, + {"TERRITORY_OVERLAP", static_cast(territory_overlap)}, + {"MLO_POOLING_INDEX_TYPE", + get_pooling_index_type_name(problem.GetPooling().GetIndexType())}, + {"MLO_POOLING_INDEX_MAX", + get_pooling_index_type_max_name(problem.GetPooling().GetIndexType())}, + } + << GetDataTypeKBP(problem.GetDYDesc().GetType()); + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + + kernel.l_wk = {wavesize, 1, 1}; + kernel.g_wk = {wavesize * grp_num, 1, 1}; + + result.construction_params.push_back(kernel); + + const auto top_d = is2d ? 1 : *(top.GetLengths().rbegin() + 2); + const auto top_h = *(top.GetLengths().rbegin() + 1); + const auto top_w = *(top.GetLengths().rbegin()); + + auto unpackStrides = [is2d](const auto& strides) { + return std::make_tuple(strides[0], // N stride + strides[1], // C stride + strides[2], // D stride. Same as H_stride in 3D converted from 2D. + is2d // + ? strides[2] // 2D H stride + : strides[3] // 3D H stride + ); + }; + + std::size_t bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride; + std::size_t top_n_stride, top_c_stride, top_d_stride, top_h_stride; + std::tie(bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride) = + unpackStrides(bot.GetStrides()); + std::tie(top_n_stride, top_c_stride, top_d_stride, top_h_stride) = + unpackStrides(top.GetStrides()); + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + if(params.pooling.GetMode() == miopenPoolingMax) + { + kernel(params.dy, + params.dx, + params.workspace, + static_cast(pads[0]), + static_cast(pads[1]), + static_cast(pads[2]), + static_cast(batch), + static_cast(chal), + static_cast(bot_d), + static_cast(bot_h), + static_cast(bot_w), + static_cast(top_d), + static_cast(top_h), + static_cast(top_w), + static_cast(bot_n_stride), + static_cast(bot_c_stride), + static_cast(bot_d_stride), + static_cast(bot_h_stride), + static_cast(top_n_stride), + static_cast(top_c_stride), + static_cast(top_d_stride), + static_cast(top_h_stride), + static_cast(total_work)); + } + else + { + kernel(params.dy, + params.dx, + static_cast(pads[0]), + static_cast(pads[1]), + static_cast(pads[2]), + static_cast(batch), + static_cast(chal), + static_cast(bot_d), + static_cast(bot_h), + static_cast(bot_w), + static_cast(top_d), + static_cast(top_h), + static_cast(top_w), + static_cast(bot_n_stride), + static_cast(bot_c_stride), + static_cast(bot_d_stride), + static_cast(bot_h_stride), + static_cast(top_n_stride), + static_cast(top_c_stride), + static_cast(top_d_stride), + static_cast(top_h_stride), + static_cast(total_work)); + } + }; + }; + + return result; +} + +std::size_t +PoolingBackwardCkNd::GetWorkspaceSize(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + if(problem.GetPooling().GetMode() != miopenPoolingMax) + return 0; + return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType()); +} + +} // namespace pooling + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/pooling/forwardCk2d.cpp b/src/solver/pooling/forwardCk2d.cpp new file mode 100644 index 0000000000..d66da12a33 --- /dev/null +++ b/src/solver/pooling/forwardCk2d.cpp @@ -0,0 +1,267 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include + +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace pooling { + +namespace { + +struct kernel_params +{ + int kernel_size_h; + int kernel_size_w; + int kernel_stride_h; + int kernel_stride_w; + int out_height; + int out_width; + int out_pix_tile0; + int out_pix_tile1; + + kernel_params(const miopen::pooling::ProblemDescription& p) + { + const auto& pd = p.GetPooling(); + const auto& yd = p.GetYDesc(); + kernel_size_h = pd.lens[0]; + kernel_size_w = pd.lens[1]; + kernel_stride_h = pd.strides[0]; + kernel_stride_w = pd.strides[1]; + out_height = yd.GetLengths()[2]; + out_width = yd.GetLengths()[3]; + out_pix_tile0 = 1; + out_pix_tile1 = out_height <= 8 ? 1 // + : out_height <= 32 ? 4 // + : 8; + if(out_height > 16 && out_height % 32 > 16) + out_pix_tile1 = std::min(16, std::max(1, prePow2(out_pix_tile1 * kernel_stride_h))); + } +}; + +std::size_t sizeof_kernel_FLOAT(const miopen::pooling::ProblemDescription& problem) +{ + const auto datatype = problem.GetXDesc().GetType(); + return get_data_size(datatype); +} + +std::size_t sizeof_kernel_FLOAT_ACCUM(const miopen::pooling::ProblemDescription& problem) +{ + const auto datatype = problem.GetXDesc().GetType(); + if(datatype == miopenHalf) + return get_data_size(miopenFloat); // mixed precision + return get_data_size(datatype); +} + +inline std::size_t RoundUpToMultiple(std::size_t v, std::size_t m) +{ + assert(m > 0); + return ((v + m - 1) / m) * m; +} + +// Compute amount of private memory required for holding the arrays defined +// in the "mloPoolingG" kernel: +// +// #define MLO_BOT_DATA_SZ0 +// ((MLO_POOLING_N_HORIZ_OUT_PIX - 1) * MLO_POOLING_STRIDE0 + MLO_POOLING_KERNEL_SZ0) +// +// #define MLO_BOT_DATA_SZ1 +// ((MLO_POOLING_N_VERT_OUT_PIX - 1) * MLO_POOLING_STRIDE1 + MLO_POOLING_KERNEL_SZ1) +// +// _FLOAT bot_data[MLO_BOT_DATA_SZ1][MLO_BOT_DATA_SZ0]; +// _FLOAT_ACCUM res[MLO_POOLING_N_VERT_OUT_PIX][MLO_POOLING_N_HORIZ_OUT_PIX]; +// +std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& problem) +{ + const kernel_params kp(problem); + + // aliases to ease programming + const auto& MLO_POOLING_KERNEL_SZ1 = kp.kernel_size_h; + const auto& MLO_POOLING_STRIDE1 = kp.kernel_stride_h; + const auto& MLO_POOLING_KERNEL_SZ0 = kp.kernel_size_w; + const auto& MLO_POOLING_STRIDE0 = kp.kernel_stride_w; + const auto& MLO_POOLING_N_HORIZ_OUT_PIX = kp.out_pix_tile0; + const auto& MLO_POOLING_N_VERT_OUT_PIX = kp.out_pix_tile1; + + const auto MLO_BOT_DATA_SZ0 = + (static_cast(MLO_POOLING_N_HORIZ_OUT_PIX) - 1) * MLO_POOLING_STRIDE0 + + MLO_POOLING_KERNEL_SZ0; + const auto MLO_BOT_DATA_SZ1 = + (static_cast(MLO_POOLING_N_VERT_OUT_PIX) - 1) * MLO_POOLING_STRIDE1 + + MLO_POOLING_KERNEL_SZ1; + + const auto sizeof_bot_data = sizeof_kernel_FLOAT(problem) * MLO_BOT_DATA_SZ1 * MLO_BOT_DATA_SZ0; + const auto sizeof_res = sizeof_kernel_FLOAT_ACCUM(problem) * MLO_POOLING_N_VERT_OUT_PIX * + MLO_POOLING_N_HORIZ_OUT_PIX; + + MIOPEN_LOG_T("sizeof_bot_data " << sizeof_bot_data << "sizeof_res" << sizeof_res); + + /// \ref alignment_of_arrays_in_gpu_memory + return RoundUpToMultiple(sizeof_bot_data, 16) + RoundUpToMultiple(sizeof_res, 16); +} + +} // namespace + +bool PoolingForwardCk2d::IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const +{ + return problem.GetDirection() == miopen::pooling::Direction::Forward && + problem.GetXDesc().GetNumDims() == 4 && + problem.GetXDesc().GetType() == problem.GetYDesc().GetType() && + (problem.GetXDesc().GetType() == miopenFloat || + problem.GetXDesc().GetType() == miopenHalf) && + problem.GetXDesc().GetLayout("NCHW") == "NHWC" && + problem.GetYDesc().GetLayout("NCHW") == "NHWC" && + sizeof_private_memory(problem) <= + TargetProperties::GetMaxWaveScratchSize() / context.GetStream().GetWavefrontWidth(); +} + +ConvSolution PoolingForwardCk2d::GetSolution(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + { + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenPooling.cl"; + kernel.kernel_name = "mloPoolingG"; +// TODO: forwardCk2d kernel + const kernel_params kp(problem); + + int batch_sz, n_outputs; + std::tie(batch_sz, n_outputs, std::ignore, std::ignore) = + miopen::tien<4>(problem.GetYDesc().GetLengths(), 1); + + const auto& pool_d = problem.GetPooling(); + const auto wsp_index = pool_d.GetWorkspaceIndexMode(); + + int grp_tile0 = kp.out_width <= 8 ? 8 : (kp.out_width % 32 <= 16 ? 16 : 32); + int grp_tile1 = kp.out_height <= 8 ? 8 + : kp.out_height < 16 ? 16 + : kp.out_height <= 32 ? 32 + : kp.out_height <= 64 ? 64 + : 128; + grp_tile1 /= kp.out_pix_tile1; + while(grp_tile0 * grp_tile1 > 256 && grp_tile0 > 1) + grp_tile0 >>= 1; + + int pooling_method = + (pool_d.GetMode() == miopenPoolingMax) + ? MLO_POOLING_OP_MAX + : ((pool_d.GetMode() == miopenPoolingAverage) ? MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE); + + auto build_params = KernelBuildParameters{ + {"MLO_POOLING_OP_ID", pooling_method}, + {"MLO_POOLING_KERNEL_SZ1", kp.kernel_size_h}, + {"MLO_POOLING_STRIDE1", kp.kernel_stride_h}, + {"MLO_POOLING_KERNEL_SZ0", kp.kernel_size_w}, + {"MLO_POOLING_STRIDE0", kp.kernel_stride_w}, + {"MLO_POOLING_N_HORIZ_OUT_PIX", kp.out_pix_tile0}, + {"MLO_POOLING_N_VERT_OUT_PIX", kp.out_pix_tile1}, + {"MLO_POOLING_GROUP_SZ0", grp_tile0}, + {"MLO_POOLING_GROUP_SZ1", grp_tile1}, + {"MLO_POOLING_INDEX_TYPE", get_pooling_index_type_name(pool_d.GetIndexType())}, + {"MLO_POOLING_INDEX_MAX", get_pooling_index_type_max_name(pool_d.GetIndexType())}, + }; + + if(problem.SaveIndex()) + { + build_params << KernelBuildParameters{ + {"MLO_POOLING_SAVE_INDEX"}, + {"USE_IMG_INDEX", (wsp_index == miopenPoolingWorkspaceIndexImage ? 1 : 0)}, + }; + } + + build_params << GetDataTypeKBP(problem.GetXDesc().GetType()); + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + + kernel.l_wk.push_back(grp_tile0); + kernel.l_wk.push_back(grp_tile1); + kernel.l_wk.push_back(1); + + int g_wk_width = + ((kp.out_width + grp_tile0 * kp.out_pix_tile0 - 1) / (grp_tile0 * kp.out_pix_tile0)); + int g_wk_height = + ((kp.out_height + grp_tile1 * kp.out_pix_tile1 - 1) / (grp_tile1 * kp.out_pix_tile1)); + + kernel.g_wk.push_back(static_cast(g_wk_width) * grp_tile0); + kernel.g_wk.push_back(static_cast(g_wk_height) * grp_tile1); + kernel.g_wk.push_back(static_cast(n_outputs) * batch_sz); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + kernel(params.x, + params.y, + params.workspace, + static_cast(params.pooling.pads[0]), + static_cast(params.pooling.pads[1]), + static_cast(params.xDesc.GetLengths()[1]), + static_cast(params.xDesc.GetLengths()[2]), + static_cast(params.xDesc.GetLengths()[3]), + static_cast(params.yDesc.GetLengths()[2]), + static_cast(params.yDesc.GetLengths()[3]), + static_cast(params.xDesc.GetStrides()[0]), + static_cast(params.xDesc.GetStrides()[1]), + static_cast(params.xDesc.GetStrides()[2]), + static_cast(params.yDesc.GetStrides()[0]), + static_cast(params.yDesc.GetStrides()[1]), + static_cast(params.yDesc.GetStrides()[2])); + }; + }; + + return result; +} + +std::size_t +PoolingForwardCk2d::GetWorkspaceSize(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + if(problem.GetPooling().GetMode() != miopenPoolingMax) + return 0; + return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType()); +} + +} // namespace pooling + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/pooling/forwardCkNd.cpp b/src/solver/pooling/forwardCkNd.cpp new file mode 100644 index 0000000000..a4f2f781c3 --- /dev/null +++ b/src/solver/pooling/forwardCkNd.cpp @@ -0,0 +1,264 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include + +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace pooling { + +namespace { + +constexpr int top_w_per_work = 1; +constexpr int top_h_per_work = 4; +constexpr int top_d_per_work = 2; + +struct kernel_params +{ + uint32_t stride_d; + uint32_t stride_h; + uint32_t stride_w; + uint32_t kernel_sz_d; + uint32_t kernel_sz_h; + uint32_t kernel_sz_w; + + kernel_params(const miopen::pooling::ProblemDescription& p) + { + const auto& pd = p.GetPooling(); + stride_d = pd.strides[0]; + stride_h = pd.strides[1]; + stride_w = pd.strides[2]; + kernel_sz_d = pd.lens[0]; + kernel_sz_h = pd.lens[1]; + kernel_sz_w = pd.lens[2]; + } +}; + +std::size_t sizeof_kernel_FLOAT(const miopen::pooling::ProblemDescription& problem) +{ + const auto datatype = problem.GetXDesc().GetType(); + return get_data_size(datatype); +} + +inline std::size_t RoundUpToMultiple(std::size_t v, std::size_t m) +{ + assert(m > 0); + return ((v + m - 1) / m) * m; +} + +// Compute amount of private memory required for holding the arrays defined +// in the "mloPoolingNDFwd" kernel: +// +// #define BOT_TILE_W ((TOP_W_PER_WORK - 1) * STRIDE_W + KERNEL_SZ_W) +// #define BOT_TILE_H ((TOP_H_PER_WORK - 1) * STRIDE_H + KERNEL_SZ_H) +// #define BOT_TILE_D ((TOP_D_PER_WORK - 1) * STRIDE_D + KERNEL_SZ_D) +// +// _FLOAT bot_data[BOT_TILE_D][BOT_TILE_H][BOT_TILE_W]; +// +std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& problem) +{ + const kernel_params kp(problem); + + const std::size_t bot_tile_w = ((top_w_per_work - 1) * kp.stride_w + kp.kernel_sz_w); + const std::size_t bot_tile_h = ((top_h_per_work - 1) * kp.stride_h + kp.kernel_sz_h); + const std::size_t bot_tile_d = ((top_d_per_work - 1) * kp.stride_d + kp.kernel_sz_d); + + const auto sizeof_bot_data = + sizeof_kernel_FLOAT(problem) * bot_tile_d * bot_tile_h * bot_tile_w; + MIOPEN_LOG_T("sizeof_bot_data " << sizeof_bot_data); + + /// \ref alignment_of_arrays_in_gpu_memory + return RoundUpToMultiple(sizeof_bot_data, 16); +} + +} // namespace + +bool PoolingForwardCkNd::IsApplicable(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const +{ + + return problem.GetDirection() == miopen::pooling::Direction::Forward // + && problem.GetXDesc().GetNumDims() == 5 // + && problem.GetXDesc().GetLayout("NCDHW") == "NDHWC" // + && problem.GetYDesc().GetLayout("NCDHW") == "NDHWC" // + && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // + && (problem.GetXDesc().GetType() == miopenFloat // + || problem.GetXDesc().GetType() == miopenHalf) // + && (problem.GetPooling().GetMode() == miopenPoolingMax // + || problem.GetPooling().GetMode() == miopenPoolingAverage // + || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) // + && sizeof_private_memory(problem) <= TargetProperties::GetMaxWaveScratchSize() // + / context.GetStream().GetWavefrontWidth() // + /// \todo This solver does not support workspace index mask mode yet. + && + !(problem.GetPooling().GetMode() == miopenPoolingMax // + && problem.GetPooling().GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexMask // + && problem.SaveIndex() == true); +} + +ConvSolution PoolingForwardCkNd::GetSolution(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const int batch = problem.GetXDesc().GetLengths()[0]; + const int chal = problem.GetXDesc().GetLengths()[1]; + + const kernel_params kp(problem); + + const int top_d = *(problem.GetYDesc().GetLengths().rbegin() + 2); + const int top_h = *(problem.GetYDesc().GetLengths().rbegin() + 1); + const int top_w = *(problem.GetYDesc().GetLengths().rbegin()); + + const int top_blk_w = std::max((top_w + top_w_per_work - 1) / top_w_per_work, 1); + const int top_blk_h = std::max((top_h + top_h_per_work - 1) / top_h_per_work, 1); + const int top_blk_d = std::max((top_d + top_d_per_work - 1) / top_d_per_work, 1); + + const int max_activ_workitem = 65536; + const int total_work = batch * chal * top_blk_w * top_blk_h * top_blk_d; + const int activ_work = std::min(total_work, max_activ_workitem); + + { + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenPoolingND.cl"; + kernel.kernel_name = "mloPoolingNDFwd"; +// TODO: forwardCkNd kernel + int pooling_method = (problem.GetPooling().mode == miopenPoolingMax) + ? MLO_POOLING_OP_MAX + : ((problem.GetPooling().mode == miopenPoolingAverage) + ? MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE); + + const size_t lcl_work = 64; + const size_t grp_num = (activ_work + lcl_work - 1) / lcl_work; + + auto build_params = KernelBuildParameters{ + {"MLO_POOLING_OP_ID", static_cast(pooling_method)}, + {"MAX_ACTIV_WORKITEM", static_cast(max_activ_workitem)}, + {"MLO_POOLING_GROUP_SZ0", static_cast(lcl_work)}, + {"MLO_POOLING_GROUP_SZ1", 1}, + {"MLO_POOLING_GROUP_SZ2", 1}, + {"TOP_W_PER_WORK", top_w_per_work}, + {"TOP_H_PER_WORK", top_h_per_work}, + {"TOP_D_PER_WORK", top_d_per_work}, + {"KERNEL_SZ_D", kp.kernel_sz_d}, + {"KERNEL_SZ_H", kp.kernel_sz_h}, + {"KERNEL_SZ_W", kp.kernel_sz_w}, + {"STRIDE_D", kp.stride_d}, + {"STRIDE_H", kp.stride_h}, + {"STRIDE_W", kp.stride_w}, + {"MLO_POOLING_INDEX_TYPE", + get_pooling_index_type_name(problem.GetPooling().GetIndexType())}, + {"MLO_POOLING_INDEX_MAX", + get_pooling_index_type_max_name(problem.GetPooling().GetIndexType())}, + }; + + if(problem.SaveIndex()) + { + build_params << KernelBuildParameters{ + {"MLO_POOLING_SAVE_INDEX"}, + }; + } + + build_params << GetDataTypeKBP(problem.GetXDesc().GetType()); + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + + kernel.l_wk = {lcl_work, 1, 1}; + kernel.g_wk = {lcl_work * grp_num, 1, 1}; + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + const int batch_ = params.xDesc.GetLengths()[0]; + const int chal_ = params.xDesc.GetLengths()[1]; + + const int top_d_ = *(params.yDesc.GetLengths().rbegin() + 2); + const int top_h_ = *(params.yDesc.GetLengths().rbegin() + 1); + const int top_w_ = *(params.yDesc.GetLengths().rbegin()); + + const int top_blk_w_ = std::max((top_w_ + top_w_per_work - 1) / top_w_per_work, 1); + const int top_blk_h_ = std::max((top_h_ + top_h_per_work - 1) / top_h_per_work, 1); + const int top_blk_d_ = std::max((top_d_ + top_d_per_work - 1) / top_d_per_work, 1); + + const int total_work_ = batch_ * chal_ * top_blk_w_ * top_blk_h_ * top_blk_d_; + + kernel(params.x, + params.y, + params.workspace, + static_cast(params.pooling.pads[0]), + static_cast(params.pooling.pads[1]), + static_cast(params.pooling.pads[2]), + static_cast(batch_), + static_cast(chal_), + static_cast(params.xDesc.GetLengths()[2]), + static_cast(params.xDesc.GetLengths()[3]), + static_cast(params.xDesc.GetLengths()[4]), + static_cast(top_d_), + static_cast(top_h_), + static_cast(top_w_), + static_cast(params.xDesc.GetStrides()[0]), + static_cast(params.xDesc.GetStrides()[1]), + static_cast(params.xDesc.GetStrides()[2]), + static_cast(params.xDesc.GetStrides()[3]), + static_cast(params.yDesc.GetStrides()[0]), + static_cast(params.yDesc.GetStrides()[1]), + static_cast(params.yDesc.GetStrides()[2]), + static_cast(params.yDesc.GetStrides()[3]), + static_cast(total_work_)); + }; + }; + + return result; +} + +std::size_t +PoolingForwardCkNd::GetWorkspaceSize(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + if(problem.GetPooling().GetMode() != miopenPoolingMax) + return 0; + return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType()); +} + +} // namespace pooling + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/pooling/forwardNaive.cpp b/src/solver/pooling/forwardNaive.cpp index 43406cd508..86d24a03de 100644 --- a/src/solver/pooling/forwardNaive.cpp +++ b/src/solver/pooling/forwardNaive.cpp @@ -75,14 +75,15 @@ bool PoolingForwardNaive::IsApplicable(const ExecutionContext&, && (problem.GetPooling().GetMode() == miopenPoolingMax // || problem.GetPooling().GetMode() == miopenPoolingAverage // || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) // + && (problem.GetXDesc().GetLayout("NCDHW") == problem.GetYDesc().GetLayout("NCDHW")) // && ( // (problem.GetXDesc().GetNumDims() == 5 // - && problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" // - && problem.GetYDesc().GetLayout("NCDHW") == "NCDHW") // + && (problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" // + || problem.GetXDesc().GetLayout("NCDHW") == "NDHWC")) // || // (problem.GetXDesc().GetNumDims() == 4 // - && problem.GetXDesc().GetLayout("NCHW") == "NCHW" // - && problem.GetYDesc().GetLayout("NCHW") == "NCHW") // + && (problem.GetXDesc().GetLayout("NCHW") == "NCHW" // + || problem.GetYDesc().GetLayout("NCHW") == "NHWC")) // ); } @@ -95,6 +96,7 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, const auto bot = problem.GetXDesc(); const auto top = problem.GetYDesc(); const bool is2d = (bot.GetNumDims() == 4); + const bool isTranspose = problem.GetXDesc().GetLayout("NCHW")[1] == 'C'; // To compact code: const auto& pooling = problem.GetPooling(); @@ -208,8 +210,16 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, { auto kernel = KernelInfo{}; - kernel.kernel_file = "MIOpenPoolingForwardNaive.cl"; - kernel.kernel_name = "mloPoolingForwardNaive"; + if(isTranspose) + { + kernel.kernel_file = "MIOpenPoolingFwdNDNhwcNaive.cpp"; + kernel.kernel_name = "mloPoolingForwardNDNhwcNaive"; + } + else + { + kernel.kernel_file = "MIOpenPoolingForwardNaive.cl"; + kernel.kernel_name = "mloPoolingForwardNaive"; + } auto build_params = KernelBuildParameters{ {"MLO_POOLING_OP_ID", pooling_method}, // We need this at compile time in order to @@ -218,7 +228,10 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, {"MLO_POOLING_IS2D_KERNEL", static_cast(is2d_kernel)}, }; build_params << GetDataTypeKBP(bot.GetType()); - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + if(isTranspose) + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + else + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); // [Informative] The total number of kernels required to cover the whole // forward pooling problem space is 3*4*2*2 = 48. The solver is dynamic. @@ -237,50 +250,80 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, result.construction_params.push_back(kernel); } - result.invoker_factory = [=](const std::vector& kernels) { - return [=](const Handle& handle, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - kernel(params.x, - params.y, - params.workspace, - save_index, - index_mode, - filter_d, - filter_h, - filter_w, - filter_d_stride, - filter_h_stride, - filter_w_stride, - filter_d_pad, - filter_h_pad, - filter_w_pad, - all_n, - all_c, - bot_d, - bot_h, - bot_w, - bot_n_stride, - bot_c_stride, - bot_d_stride, - bot_h_stride, - bot_w_stride, - top_d, - top_h, - top_w, - top_n_stride, - top_c_stride, - top_d_stride, - top_h_stride, - top_w_stride, - mask_n_stride, - mask_c_stride, - mask_d_stride, - mask_h_stride, - mask_w_stride); + if(isTranspose) + { + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + // NOTE: kernel 'mloPoolingForwardNDNhwcNaive' expects indices in DHW[NC] order + kernel(params.x, + params.y, + params.workspace, + save_index, + index_mode, + std::vector{filter_d, filter_h, filter_w}, + std::vector{filter_d_stride, filter_h_stride, filter_w_stride}, + std::vector{filter_d_pad, filter_h_pad, filter_w_pad}, + all_n, + all_c, + std::vector{bot_d, bot_h, bot_w}, + std::vector{bot_d_stride, bot_h_stride, bot_w_stride, bot_n_stride, bot_c_stride}, + std::vector{top_d, top_h, top_w}, + std::vector{top_d_stride, top_h_stride, top_w_stride, top_n_stride, top_c_stride}, + std::vector{mask_d_stride, mask_h_stride, mask_w_stride, mask_n_stride, mask_c_stride}); + }; + }; + } + else + { + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + kernel(params.x, + params.y, + params.workspace, + save_index, + index_mode, + filter_d, + filter_h, + filter_w, + filter_d_stride, + filter_h_stride, + filter_w_stride, + filter_d_pad, + filter_h_pad, + filter_w_pad, + all_n, + all_c, + bot_h, + bot_d, // TODO RJS: broke it + bot_w, + bot_n_stride, + bot_c_stride, + bot_d_stride, + bot_h_stride, + bot_w_stride, + top_d, + top_h, + top_w, + top_n_stride, + top_c_stride, + top_d_stride, + top_h_stride, + top_w_stride, + mask_n_stride, + mask_c_stride, + mask_d_stride, + mask_h_stride, + mask_w_stride); + }; }; - }; + } + return result; } diff --git a/test/gtest/poolingFwdNdNaive.cpp b/test/gtest/poolingFwdNdNaive.cpp new file mode 100644 index 0000000000..3e11396fff --- /dev/null +++ b/test/gtest/poolingFwdNdNaive.cpp @@ -0,0 +1,217 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include "get_handle.hpp" +#include "test_env.hpp" + +#include "pooling2d.hpp" + +#include "tensor_holder.hpp" + +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLAGS_ARGS) + +namespace env = miopen::env; + +namespace { +template +struct tensor_data +{ + static std::vector get_layout_lengths(int n, int c, std::vector& dims) + { + auto ret = std::vector{n, c}; + ret.insert(ret.end(), dims.cbegin(), dims.cend()); + + return ret; + } + + static std::vector + get_strides(std::vector& lens, int dims, miopenTensorLayout_t tensor_layout) + { + std::vector strides; + std::string layout_default = miopen::tensor_layout_get_default(dims + 2); + std::string layout_string = miopen::TensorDescriptor::GetLayoutStr(tensor_layout); + + miopen::tensor_layout_to_strides(lens, layout_default, layout_string, strides); + + constexpr int min_stride_multiplier = 1; + constexpr int max_stride_multiplier = 4; + + auto c = prng::gen_A_to_B(min_stride_multiplier, max_stride_multiplier); + for(auto& v : strides) + { + // cppcheck-suppress useStlAlgorithm + v = v * c; + } + + return strides; + } + + static miopenTensorDescriptor_t init_tensor_descriptor(miopenDataType_t type, + const std::vector& lens, + const std::vector& strides) + { + miopenTensorDescriptor_t desc; + + EXPECT_TRUE(miopenCreateTensorDescriptor(&desc) == miopenStatusSuccess); + EXPECT_TRUE( + miopenSetTensorDescriptor(desc, type, lens.size(), lens.data(), strides.data()) == + miopenStatusSuccess); + + return desc; + } + + layout_data(int _n, std::vector _dims, int _c, miopenTensorLayout_t _tensor_layout) + { + auto lens = get_layout_lengths(_n, _c, _dims); + auto strides = get_strides(lens, _dims.size(), _tensor_layout); + descriptor = miopen::TensorDescriptor{miopen_type{}, lens, strides}; + host = tensor{lens, strides}.generate(gen_value); + } + + ~layout_data() {} + + void read_gpu_data(miopen::Handle& handle, const miopen::Allocator::ManageDataPtr& ddata) + { + check = tensor{descriptor.GetLengths(), descriptor.GetStrides()}; + check.data = handle.Read(ddata, check.data.size()); + } + + tensor check{}; + tensor host; + miopen::TensorDescriptor descriptor; +}; + +} + +namespace pooling_tests { + +class PoolingFwdNdNaive : public testing::TestWithParam> +{ +}; + +static bool SkipTest(void) { return env::disabled(MIOPEN_TEST_ALL); } + +// void GetArgs(const std::string& param, std::vector& tokens) +// { +// std::stringstream ss(param); +// std::istream_iterator begin(ss); +// std::istream_iterator end; +// while(begin != end) +// tokens.push_back(*begin++); +// } + +void Run2dDriver(miopenDataType_t prec) +{ + + std::vector params; + switch(prec) + { + case miopenFloat: params = Pooling2dFloat::GetParam(); break; + case miopenHalf: params = WidePooling2dHalf::GetParam(); break; + case miopenBFloat16: + case miopenInt8: + case miopenFloat8: + case miopenBFloat8: + case miopenInt32: + case miopenInt64: + case miopenDouble: + FAIL() + << "miopenBFloat16, miopenInt8, miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8 " + "data type not supported by " + "immed_conv2d_codecov test"; + + default: params = Pooling2dFloat::GetParam(); + } + + for(const auto& test_value : params) + { + std::vector tokens; + GetArgs(test_value, tokens); + std::vector ptrs; + + std::transform(tokens.begin(), tokens.end(), std::back_inserter(ptrs), [](const auto& str) { + return str.data(); + }); + + testing::internal::CaptureStderr(); + test_drive(ptrs.size(), ptrs.data()); + auto capture = testing::internal::GetCapturedStderr(); + std::cout << capture; + } +}; + +bool IsTestSupportedForDevice(const miopen::Handle& handle) { return true; } + +std::vector GetTestCases(const std::string precision) +{ + const auto& flag_arg = env::value(MIOPEN_TEST_FLAGS_ARGS); + + const std::vector test_cases = { + // clang-format off + {"test_pooling2d " + precision + " --all --dataset 2 --limit 0 "+flag_arg} + // clang-format on + }; + + return test_cases; +} + +} // namespace pooling_tests +using namespace pooling_tests; + +/* +TEST_P(Pooling2dFloat, FloatTest_pooling2d_wide) +{ + const auto& handle = get_handle(); + if(IsTestSupportedForDevice(handle) && !SkipTest() && IsTestRunWith("--float")) + { + Run2dDriver(miopenFloat); + } + else + { + GTEST_SKIP(); + } +}; +*/ + +TEST_P(WidePooling2dHalf, HalfTest_pooling2d_wide) +{ + const auto& handle = get_handle(); + if(IsTestSupportedForDevice(handle) && !SkipTest() && IsTestRunWith("--half")) + { + Run2dDriver(miopenHalf); + } + else + { + GTEST_SKIP(); + } +}; + +// INSTANTIATE_TEST_SUITE_P(Pooling2D, Pooling2dFloat, testing::Values(GetTestCases("--float"))); + +INSTANTIATE_TEST_SUITE_P(Pooling2D, WidePooling2dHalf, testing::Values(GetTestCases("--half"))); diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp index 128e81cce2..399592a1f4 100644 --- a/test/pooling2d.hpp +++ b/test/pooling2d.hpp @@ -29,12 +29,12 @@ #define WORKAROUND_ISSUE_1670 1 #define TEST_GET_INPUT_TENSOR 0 -template -struct pooling2d_driver : pooling_driver +struct pooling2d_shapes { -private: +public: using U = typename std::vector; - std::vector get_2d_pooling_input_shapes() + + static std::vector get_2d_pooling_input_shapes() { return {{1, 19, 1024, 2048}, {10, 3, 32, 32}, @@ -57,13 +57,33 @@ struct pooling2d_driver : pooling_driver } // Dataset 1 is intended for testing of asymmetric configs. - std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 4, 4, 4}}; } + static std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 4, 4, 4}}; } // Dataset 2 is intended for testing of configs with wide window. - std::vector get_2d_pooling_input_shapes_wide() + static std::vector get_2d_pooling_input_shapes_wide() { return {{1, 3, 255, 255}, {2, 3, 227, 227}, {1, 7, 127, 127}, {1, 1, 410, 400}}; } +}; + +template +struct pooling2d_driver : pooling_driver +{ +private: + using U = typename std::vector; + std::vector get_2d_pooling_input_shapes() + { + return pooling2d_shapes::get_2d_pooling_input_shapes(); + } + + // Dataset 1 is intended for testing of asymmetric configs. + std::vector get_2d_pooling_input_shapes_minimal() { return pooling2d_shapes::get_2d_pooling_input_shapes_minimal(); } + + // Dataset 2 is intended for testing of configs with wide window. + std::vector get_2d_pooling_input_shapes_wide() + { + return pooling2d_shapes::get_2d_pooling_input_shapes_wide(); + } public: pooling2d_driver() : pooling_driver() diff --git a/test/pooling3d.cpp b/test/pooling3d.cpp index 90b37d5c75..966b3f5303 100644 --- a/test/pooling3d.cpp +++ b/test/pooling3d.cpp @@ -25,30 +25,6 @@ *******************************************************************************/ #include "pooling_common.hpp" - -template -struct pooling3d_driver : pooling_driver -{ - std::vector> get_3d_pooling_input_shapes() - { - return {{16, 64, 3, 4, 4}, - {16, 32, 4, 9, 9}, - {8, 512, 3, 14, 14}, - {8, 512, 4, 28, 28}, - {16, 64, 56, 56, 56}, - {4, 3, 4, 227, 227}, - {4, 4, 4, 161, 700}}; - } - - pooling3d_driver() : pooling_driver() - { - this->add( - this->in_shape, "input", this->generate_data_limited(get_3d_pooling_input_shapes(), 4)); - this->add(this->lens, "lens", this->generate_data({{2, 2, 2}, {3, 3, 3}})); - this->add(this->strides, "strides", this->generate_data({{2, 2, 2}, {1, 1, 1}})); - this->add(this->pads, "pads", this->generate_data({{0, 0, 0}, {1, 1, 1}})); - this->add(this->wsidx, "wsidx", this->generate_data({1})); - } -}; +#include "pooling3d.hpp" int main(int argc, const char* argv[]) { test_drive(argc, argv); } diff --git a/test/pooling3d.hpp b/test/pooling3d.hpp new file mode 100644 index 0000000000..c6d80e8e98 --- /dev/null +++ b/test/pooling3d.hpp @@ -0,0 +1,62 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "pooling_common.hpp" + +struct pooling3d_shapes +{ +public: + using U = typename std::vector; + + static std::vector get_3d_pooling_input_shapes() + { + return {{16, 64, 3, 4, 4}, + {16, 32, 4, 9, 9}, + {8, 512, 3, 14, 14}, + {8, 512, 4, 28, 28}, + {16, 64, 56, 56, 56}, + {4, 3, 4, 227, 227}, + {4, 4, 4, 161, 700}}; + } +}; + +template +struct pooling3d_driver : pooling_driver +{ + std::vector> get_3d_pooling_input_shapes() + { + return pooling3d_shapes::get_3d_pooling_input_shapes(); + } + pooling3d_driver() : pooling_driver() + { + this->add( + this->in_shape, "input", this->generate_data_limited(get_3d_pooling_input_shapes(), 4)); + this->add(this->lens, "lens", this->generate_data({{2, 2, 2}, {3, 3, 3}})); + this->add(this->strides, "strides", this->generate_data({{2, 2, 2}, {1, 1, 1}})); + this->add(this->pads, "pads", this->generate_data({{0, 0, 0}, {1, 1, 1}})); + this->add(this->wsidx, "wsidx", this->generate_data({1})); + } +}; From 1f1ed93cf1c5f5fdd652022287e33b454b8e4dda Mon Sep 17 00:00:00 2001 From: "Randy J. Spaulding" Date: Tue, 20 Aug 2024 19:25:58 +0000 Subject: [PATCH 02/10] checkpoint --- include/miopen/miopen.h | 2 +- src/CMakeLists.txt | 14 +- src/comgr.cpp | 3 + src/hipoc/hipoc_program.cpp | 1 + .../miopen/conv/problem_description.hpp | 52 +-- src/include/miopen/pooling.hpp | 3 +- src/include/miopen/pooling/invoke_params.hpp | 1 + src/include/miopen/pooling/solvers.hpp | 5 +- .../miopen/problem_description_layout.hpp | 102 +++++ src/include/miopen/tensor.hpp | 63 ++- .../MIOpenPoolingForwardNDNhwcNaive.cpp | 394 ++++++++++++++++++ src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp | 245 ----------- src/ocl/pooling_ocl.cpp | 6 +- src/pooling.cpp | 9 +- src/pooling/problem_description.cpp | 8 +- src/solver.cpp | 2 +- src/solver/pooling/forward2d.cpp | 39 +- src/solver/pooling/forwardCk2d.cpp | 1 + src/solver/pooling/forwardNaive.cpp | 204 +++++---- src/solver/pooling/forwardNdNhwcNaive.cpp | 345 +++++++++++++++ src/tensor.cpp | 13 + test/CMakeLists.txt | 2 + test/gtest/ex1.cpp | 15 + test/gtest/layout_transpose.cpp | 47 ++- test/gtest/poolingFwdNdNaive.cpp | 152 ++++--- test/pooling2d.hpp | 3 +- test/pooling3d.hpp | 2 + test/pooling_common.hpp | 290 ++++++++++--- 28 files changed, 1477 insertions(+), 546 deletions(-) create mode 100644 src/include/miopen/problem_description_layout.hpp create mode 100644 src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp delete mode 100644 src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp create mode 100644 src/solver/pooling/forwardNdNhwcNaive.cpp create mode 100644 test/gtest/ex1.cpp diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 0f2c2a5cb0..c69c9957c8 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -396,7 +396,7 @@ typedef enum miopenTensorCHWNc4 = 5, /*!< CHWNc4 memory layout (Partially supported) */ miopenTensorCHWNc8 = 6, /*!< CHWNc8 memory layout (Partially supported) */ miopenTensorNCDHW = 7, /*!< NCDHW memory layout (Fully supported) */ - miopenTensorNDHWC = 8, /*!< NCDHW memory layout (Fully supported) */ + miopenTensorNDHWC = 8, /*!< NDHWC memory layout (Fully supported) */ } miopenTensorLayout_t; /*! @ingroup pooling diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 150e5cb76a..3b51f60f05 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -288,6 +288,7 @@ set( MIOpen_Source solver/mha/mha_solver_forward.cpp solver/pooling/forward2d.cpp solver/pooling/forwardNaive.cpp + solver/pooling/forwardNdNhwcNaive.cpp solver/pooling/forwardNd.cpp solver/pooling/forwardCk2d.cpp solver/pooling/forwardCkNd.cpp @@ -451,6 +452,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/miopen_type_traits.hpp kernels/miopen_utility.hpp kernels/neuron.inc + kernels/pooling_functions.h kernels/rocm_version.inc kernels/stride_array.hpp kernels/utilities.inc @@ -499,7 +501,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenPoolingBwd.cl kernels/MIOpenPoolingBwdND.cl kernels/MIOpenPoolingForwardNaive.cl - kernels/MIOpenPoolingFwdNDNhwcNaive.cpp + kernels/MIOpenPoolingForwardNDNhwcNaive.cpp kernels/MIOpenPoolingND.cl kernels/MIOpenConv1x1S.cl kernels/MIOpenConv1x1J1.cl @@ -592,6 +594,16 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/xform_bidirect_winograd_out.s kernels/UniversalTranspose.cl) + # TEMPCODE RJS + set(MIOPEN_KERNELS + kernels/MIOpenPoolingForwardNDNhwcNaive.cpp + ) + + set(MIOPEN_KERNEL_INCLUDES + kernels/bfloat16_dev.hpp + kernels/float_types.h + kernels/pooling_functions.h + ) # Kernels in development lists. # Should be ALWAYS empty in develop branch (at the time of PR merge) # Intention: to speed up kernel development rebuild time diff --git a/src/comgr.cpp b/src/comgr.cpp index aa53b71bb5..0fec8fa8ea 100644 --- a/src/comgr.cpp +++ b/src/comgr.cpp @@ -887,6 +887,7 @@ class HiprtcProgram const auto log = GetLog(false); if(!log.empty()) MIOPEN_LOG_I(log); + std::cout << "************** HipRTC compile log: '" << log << "'" << std::endl; // TEMPCODE RJS } void GetCode(std::vector& bytes) const @@ -988,7 +989,9 @@ void BuildHip(const std::string& name, opts.push_back("-std=c++17"); HiprtcProgram prog(name, text); + try{ // TEMPCODE RJS prog.Compile(opts); + } catch(Error& ex) { std::cout << __FUNCTION__ << " : Exception calling prog.Compile!: " << ex.text << std::endl; throw(ex); } prog.GetCode(binary); } catch(Error& ex) diff --git a/src/hipoc/hipoc_program.cpp b/src/hipoc/hipoc_program.cpp index ec090455e8..fa151bd3ed 100644 --- a/src/hipoc/hipoc_program.cpp +++ b/src/hipoc/hipoc_program.cpp @@ -275,6 +275,7 @@ void HIPOCProgramImpl::BuildCodeObjectInMemory(const std::string& params, #endif if(filename.extension() == ".cpp") { + std::cout << "Compling HIP: '" << filename << "'" << std::endl; // TEMPCODE RJS hiprtc::BuildHip(filename.string(), src, params, target, binary); } else if(filename.extension() == ".s") diff --git a/src/include/miopen/conv/problem_description.hpp b/src/include/miopen/conv/problem_description.hpp index 9447d7a5ca..dcbcac27f3 100644 --- a/src/include/miopen/conv/problem_description.hpp +++ b/src/include/miopen/conv/problem_description.hpp @@ -32,6 +32,7 @@ #include #include +#include #include #include @@ -136,7 +137,7 @@ namespace conv { MIOPEN_INTERNALS_EXPORT miopenAlphaBetaCase_t ClassifyAlphaBeta(const Scalar& alpha, const Scalar& beta); -struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase +struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionWeightsBase #if MIOPEN_ENABLE_SQLITE , SQLiteSerializable @@ -153,13 +154,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase int bias_ = 0, const Scalar& alpha_ = Scalar(1.0), const Scalar& beta_ = Scalar(0.0)) - : in(in_), - weights(weights_), - out(out_), + : ProblemDescriptionWeightsBase(in_, weights_, out_), conv(conv_), - in_layout(ComputeInLayout()), - weights_layout(ComputeWeightsLayout()), - out_layout(ComputeOutLayout()), direction(direction_), bias(bias_), alpha(alpha_), @@ -443,49 +439,7 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase void SetupFloats(ExecutionContext& ctx) const; private: - std::string ComputeInLayout() const - { - if(GetSpatialDims() == 2) - { - return in.GetLayout(in.GetLayout_str()); - } - else - { - return in.GetLayout("NCDHW"); - } - } - - std::string ComputeOutLayout() const - { - if(GetSpatialDims() == 2) - { - return out.GetLayout(out.GetLayout_str()); - } - else - { - return out.GetLayout("NCDHW"); - } - } - - std::string ComputeWeightsLayout() const - { - if(GetSpatialDims() == 2) - { - return weights.GetLayout(weights.GetLayout_str()); - } - else - { - return weights.GetLayout("NCDHW"); - } - } - - TensorDescriptor in; - TensorDescriptor weights; - TensorDescriptor out; ConvolutionDescriptor conv; - std::string in_layout; - std::string weights_layout; - std::string out_layout; Direction direction = Direction::Forward; int bias = 0; Scalar alpha = Scalar(1.0); diff --git a/src/include/miopen/pooling.hpp b/src/include/miopen/pooling.hpp index 0ab5ffa1c7..2670c3b09c 100644 --- a/src/include/miopen/pooling.hpp +++ b/src/include/miopen/pooling.hpp @@ -146,7 +146,8 @@ struct MIOPEN_EXPORT PoolingDescriptor : miopenPoolingDescriptor Data_t y, bool save_index, Data_t workSpace, - size_t workSpaceSize) const; + size_t workSpaceSize, + Data_t junk = nullptr) const; // TEMPCODE RJS miopenStatus_t Backward(Handle& handle, const void* alpha, diff --git a/src/include/miopen/pooling/invoke_params.hpp b/src/include/miopen/pooling/invoke_params.hpp index 2d55786c21..2140c0a1fb 100644 --- a/src/include/miopen/pooling/invoke_params.hpp +++ b/src/include/miopen/pooling/invoke_params.hpp @@ -45,6 +45,7 @@ struct FwdInvokeParams : public miopen::InvokeParams Data_t y = nullptr; Data_t workspace = nullptr; std::size_t workspace_size = 0; + Data_t junk = nullptr; // TEMPCODE RJS std::size_t GetWorkspaceSize() const { return workspace_size; } Data_t GetWorkspace() const { return workspace; } diff --git a/src/include/miopen/pooling/solvers.hpp b/src/include/miopen/pooling/solvers.hpp index ab86a52aae..d8836fcdc0 100644 --- a/src/include/miopen/pooling/solvers.hpp +++ b/src/include/miopen/pooling/solvers.hpp @@ -103,10 +103,9 @@ struct PoolingForwardNaive final : PoolingSolver const miopen::pooling::ProblemDescription& problem) const override; }; -struct PoolingForwardNdNhwcNaive final : PoolingSolver +struct PoolingForwardNDNhwcNaive final : PoolingSolver { - const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsDynamic() const override { return true; } + const std::string& SolverDbId() const override { return GetSolverDbId(); } bool IsApplicable(const ExecutionContext& context, const miopen::pooling::ProblemDescription& problem) const override; diff --git a/src/include/miopen/problem_description_layout.hpp b/src/include/miopen/problem_description_layout.hpp new file mode 100644 index 0000000000..aa20c4058c --- /dev/null +++ b/src/include/miopen/problem_description_layout.hpp @@ -0,0 +1,102 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include +#include +#include + +#include + +namespace miopen { + +struct ProblemDescriptionLayoutBase : ProblemDescriptionBase +{ + ProblemDescriptionLayoutBase() = default; + ProblemDescriptionLayoutBase(const ProblemDescriptionLayoutBase&) = default; +ProblemDescriptionLayoutBase(const TensorDescriptor& in_, // x for Forward, y for Backward* + const TensorDescriptor& out_ // y for Forward, x for Backward* + ) + : ProblemDescriptionBase(), + in(in_), + out(out_), + in_layout(ComputeInLayout()), + out_layout(ComputeOutLayout()) + {} + virtual ~ProblemDescriptionLayoutBase() = default; + + ProblemDescriptionLayoutBase& operator=(const ProblemDescriptionLayoutBase&) = default; + + [[nodiscard]] virtual NetworkConfig MakeNetworkConfig() const = 0; + +protected: + TensorDescriptor in; + TensorDescriptor out; + std::string in_layout; + std::string out_layout; + + std::string ComputeInLayout() const + { + return in.GetLayout(in.GetLayout_str()); + } + + std::string ComputeOutLayout() const + { + return out.GetLayout(out.GetLayout_str()); + } +}; + +struct ProblemDescriptionWeightsBase : ProblemDescriptionLayoutBase +{ + ProblemDescriptionWeightsBase() = default; + ProblemDescriptionWeightsBase(const ProblemDescriptionWeightsBase&) = default; + ProblemDescriptionWeightsBase(const TensorDescriptor& in_, // x for Forward, y for Backward* + const TensorDescriptor& weights_, + const TensorDescriptor& out_ // y for Forward, x for Backward* + ) + : ProblemDescriptionLayoutBase(in_, out_), + weights(weights_), + weights_layout(ComputeWeightsLayout()) + {} + virtual ~ProblemDescriptionWeightsBase() = default; + + ProblemDescriptionWeightsBase& operator=(const ProblemDescriptionWeightsBase&) = default; + + [[nodiscard]] virtual NetworkConfig MakeNetworkConfig() const = 0; + +protected: + TensorDescriptor weights; + std::string weights_layout; + + std::string ComputeWeightsLayout() const + { + return weights.GetLayout(weights.GetLayout_str()); + } +}; + +} // namespace miopen diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp index f4d2b2dca7..30a197e503 100644 --- a/src/include/miopen/tensor.hpp +++ b/src/include/miopen/tensor.hpp @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -162,6 +163,10 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor TensorDescriptor(miopenDataType_t t, const std::vector& lens_in, const std::vector& strides_in); + TensorDescriptor(miopenDataType_t t, + miopenTensorLayout_t layout_in, + const std::vector& lens_in, + const std::vector& strides_in); TensorDescriptor(miopenDataType_t t, const std::initializer_list& lens_in, const std::initializer_list& strides_in); @@ -207,6 +212,7 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor miopenTensorLayout_t GetLayout_t() const; static std::string GetLayoutStr(miopenTensorLayout_t layout); std::string GetLayout_str() const; + bool IsDefaultLayout() const; std::size_t GetVectorLength() const; std::optional GetCastType() const; @@ -259,8 +265,10 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor { if(labels.size() != strides.size()) { - MIOPEN_THROW( - "Invalid labels size. Layout labels size must be equavalent to stride size"); + std::ostringstream oss; + oss << "Invalid labels size. labels='" << labels << "', strides size=" << strides.size() + << ". Layout labels size must be equivalent to stride size"; + MIOPEN_THROW(oss.str().c_str()); } // Copy construct the result string from labels. This allocates the space at one go @@ -276,7 +284,7 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor if(base_label.size() != strides.size()) { MIOPEN_THROW( - "Invalid labels size. Layout labels size must be equavalent to stride size"); + "Invalid labels size. Layout labels size must be equivalent to stride size"); } auto result = base_label; auto p = find_permutation(lens, strides); @@ -292,7 +300,29 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor friend void from_json(const nlohmann::json& j, TensorDescriptor& descriptor); protected: - static miopenTensorLayout_t GetDefaultLayout() { return miopenTensorNCHW; }; + static miopenTensorLayout_t GetDefaultLayout(unsigned spatial_dims = 2) + { + switch (spatial_dims) + { + case 2: return miopenTensorNCHW; + case 3: return miopenTensorNCDHW; + default: + MIOPEN_THROW(miopenStatusBadParm, "Spatial dimension count must be 2 or 3."); + } + }; + + static bool IsDefaultLayout(miopenTensorLayout_t layout, unsigned spatial_dims = 2) + { + switch (spatial_dims) + { + case 2: + case 3: + return layout == GetDefaultLayout(); + default: + MIOPEN_THROW(miopenStatusBadParm, "Spatial dimension count must be 2 or 3."); + } + } + private: TensorDescriptor(miopenDataType_t t, @@ -328,11 +358,34 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor template constexpr auto GetNCDHW(unsigned spatial_dims, const std::vector& data) +{ + if(spatial_dims == 3) + { + if(data.size() == 5) // NCDHW + return miopen::tien<5>(data, 1); + else if(data.size() == 3) // DHW + return std::make_tuple(static_cast(1), static_cast(1), data[0], data[1], data[2]); + else + MIOPEN_THROW("Invalid data length; must be 5 or 3 with 3 spatial dimensions"); + } + else + { + if(data.size() == 4) // NCHW + return std::make_tuple(data[0], data[1], static_cast(1), data[2], data[3]); + else if(data.size() == 2) // HW + return std::make_tuple(static_cast(1), static_cast(1), static_cast(1), data[0], data[1]); + else + MIOPEN_THROW("Invalid data length; must be 4 or 2 with 2 spatial dimensions"); + } +} + +template +constexpr auto GetNDHWC(unsigned spatial_dims, const std::vector& data) { if(spatial_dims == 3) return miopen::tien<5>(data, 1); else - return std::make_tuple(data[0], data[1], static_cast(1), data[2], data[3]); + return std::make_tuple(data[0], static_cast(1), data[1], data[2], data[3]); } } // namespace miopen diff --git a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp new file mode 100644 index 0000000000..970104b3b1 --- /dev/null +++ b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp @@ -0,0 +1,394 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +//#define TEMPCODE RJS +#ifdef TEMPCODE +#define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 0 + +#define MLO_POOLING_OP_ID MLO_POOLING_OP_AVE + +#define MLO_POOLING_INDEX_TYPE int +#define MLO_POOLING_IS2D_KERNEL 0 +#define INPUT_TYPE _FLOAT +#define OUTPUT_TYPE _FLOAT +// #define TI INPUT_TYPE +// #define TO OUTPUT_TYPE +#define CVT_FP32_2ACCUM(x) (x) +#endif + +// #define _FLOAT float +// #define _FLOAT_ACCUM _FLOAT + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#ifdef TEMPCODE +#include "float_types.h" +#endif +#include "pooling_functions.h" + +#if(MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE) || (MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE) +#define AVERAGE_OPS 1 +#else +#define AVERAGE_OPS 0 +#endif + +// Let's use extended-precision accumulator only in FP16 pooling and only for averaging. +// For all other ops and datatypes, use native accumulator, i.e. treate FLOAT_ACCUM as FLOAT. +#ifndef TEMPCODE +#if !(AVERAGE_OPS && MIOPEN_USE_FP16) +#define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 1 +// #else +// #define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 0 +#endif + +#include "float_types.h" +#endif // TEMPCODE + +#if AVERAGE_OPS +#define ARG_UNUSED_FOR_AVERAGE __attribute__((__unused__)) +#else +#define ARG_UNUSED_FOR_AVERAGE +#endif + +#define doUU 0 +#if doUU +#define UU __attribute__((__unused__)) +#else +#define UU +#endif + +#define doUU1 1 +#if doUU1 +#define UU1 __attribute__((__unused__)) +#else +#define UU1 +#endif + +// Out N, D, H are encoded into the block indices x, y, z +// No 2D-only optimization. +template +__device__ void poolingForwardNDNhwcNaive(UU1 const TI* __restrict__ bot_ptr, + TO* __restrict__ top_ptr, + UU1 TO* __restrict__ junk_ptr, // TEMPCODE RJS + UU1 ARG_UNUSED_FOR_AVERAGE index_t* __restrict__ mask_ptr, + UU1 ARG_UNUSED_FOR_AVERAGE int save_index, + UU1 ARG_UNUSED_FOR_AVERAGE int index_mode, + UU uint32_t filter_d, + UU uint32_t filter_h, + UU uint32_t filter_w, + UU uint32_t filter_d_stride, + UU uint32_t filter_h_stride, + UU uint32_t filter_w_stride, + UU uint32_t filter_d_pad, + UU uint32_t filter_h_pad, + UU uint32_t filter_w_pad, + uint32_t all_n, + UU uint32_t all_c, // TEMPCODE RJS + UU uint32_t bot_d, + UU uint32_t bot_h, + UU uint32_t bot_w, + UU size_t bot_n_stride, + UU uint32_t bot_c_stride, + UU size_t bot_d_stride, + UU uint32_t bot_h_stride, + UU uint32_t bot_w_stride, + uint32_t top_d, + uint32_t top_h, + uint32_t top_w, + size_t top_n_stride, + uint32_t top_c_stride, + size_t top_d_stride, + uint32_t top_h_stride, + uint32_t top_w_stride, + UU ARG_UNUSED_FOR_AVERAGE size_t mask_n_stride, + UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_c_stride, + UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_d_stride, + UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_h_stride, + UU ARG_UNUSED_FOR_AVERAGE size_t mask_w_stride) +{ + const uint32_t nn = blockIdx.x / top_d; // N=slow index + if(!(nn < all_n)) + return; + + const uint32_t td = blockIdx.x % top_d; // top D=fast index + if(td >= top_d) + return; + + const uint32_t th = blockIdx.y; // top H + // const uint32_t j = (gridDim.y == 1) ? threadIdx.y : blockIdx.y; // top H + if(th >= top_h) + return; + + const uint32_t tw = blockIdx.z % top_w; // top W=fast index + if(tw >= top_w) + return; + + if(nn == 0 && td == 0 && th == 0 && tw == 0) + { + int idx = 0; + top_ptr[idx++] = gridDim.x; + top_ptr[idx++] = gridDim.y; + top_ptr[idx++] = gridDim.z; + top_ptr[idx++] = -9; + + top_ptr[idx++] = blockDim.x; + top_ptr[idx++] = blockDim.y; + top_ptr[idx++] = blockDim.z; + top_ptr[idx++] = -8; + + top_ptr[idx++] = filter_d; + top_ptr[idx++] = filter_h; + top_ptr[idx++] = filter_w; + top_ptr[idx++] = -7; + + top_ptr[idx++] = filter_d_stride; + top_ptr[idx++] = filter_h_stride; + top_ptr[idx++] = filter_w_stride; + top_ptr[idx++] = -6; + + top_ptr[idx++] = filter_d_pad; + top_ptr[idx++] = filter_h_pad; + top_ptr[idx++] = filter_w_pad; + top_ptr[idx++] = -5; + + top_ptr[idx++] = all_n; + top_ptr[idx++] = all_c; + top_ptr[idx++] = bot_n_stride; + top_ptr[idx++] = bot_c_stride; + + top_ptr[idx++] = top_n_stride; + top_ptr[idx++] = top_c_stride; + #if AVERAGE_OPS + top_ptr[idx++] = -4; + top_ptr[idx++] = -4; + #else + top_ptr[idx++] = mask_n_stride; + top_ptr[idx++] = mask_c_stride; + #endif + + top_ptr[idx++] = bot_d; + top_ptr[idx++] = bot_h; + top_ptr[idx++] = bot_w; + top_ptr[idx++] = -3; + + top_ptr[idx++] = bot_d_stride; + top_ptr[idx++] = bot_h_stride; + top_ptr[idx++] = bot_w_stride; + top_ptr[idx++] = -2; + + top_ptr[idx++] = top_d; + top_ptr[idx++] = top_h; + top_ptr[idx++] = top_w; + top_ptr[idx++] = -1; + + top_ptr[idx++] = top_d_stride; + top_ptr[idx++] = top_h_stride; + top_ptr[idx++] = top_w_stride; + top_ptr[idx++] = -9; + + #if AVERAGE_OPS + top_ptr[idx++] = -8; + top_ptr[idx++] = -8; + top_ptr[idx++] = -8; + #else + top_ptr[idx++] = mask_d_stride; + top_ptr[idx++] = mask_h_stride; + top_ptr[idx++] = mask_w_stride; + #endif + top_ptr[idx++] = -7; + } + + uint32_t cc = 0; + // const auto c_base = (blockDim.x == all_c) ? 0 : (blockIdx.z / top_w) * blockDim.x; + size_t top_index = 64 + + nn * top_n_stride // TEMPCODE RJS + + cc * top_c_stride // + + (size_t)(td * top_d_stride) // + + (size_t)(th * top_h_stride) // + + (size_t)(tw * top_w_stride); + + top_ptr[top_index] = top_index; + +// const auto int_dstart = static_cast(td * filter_d_stride) - static_cast(filter_d_pad); +// const auto dend = static_cast(min(int_dstart + static_cast(filter_d), static_cast(bot_d))); +// const auto dstart = static_cast(max(int_dstart, 0)); + +// const auto int_hstart = static_cast(th * filter_h_stride) - static_cast(filter_h_pad); +// const auto hend = static_cast(min(int_hstart + static_cast(filter_h), static_cast(bot_h))); +// const auto hstart = static_cast(max(int_hstart, 0)); + +// const auto int_wstart = static_cast(tw * filter_w_stride) - static_cast(filter_w_pad); +// const auto wend = static_cast(min(int_wstart + static_cast(filter_w), static_cast(bot_w))); +// const auto wstart = static_cast(max(int_wstart, 0)); +// out_ptr[0] = 1.2; +// for(uint32_t cc = 0; cc < 1; ++cc) // top C loop +// { +// if(cc >= all_c) return; +// { +// size_t top_index = nn * top_n_stride // TEMPCODE RJS +// + cc * top_c_stride // +// + (size_t)(td * top_d_stride) // +// + (size_t)(th * top_h_stride) // +// + (size_t)(tw * top_w_stride); +// top_index = 1; + +// junk_ptr[top_index] = (_FLOAT)1.1; +// if(j != 1) +// } +// #if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE +// uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); +// pool_size = (pool_size == 0) ? 1 : pool_size; +// #elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE +// const uint32_t pool_size = filter_d * filter_h * filter_w; +// #endif + +// #if AVERAGE_OPS +// _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0); +// #else // MAX +// _FLOAT_ACCUM res = (_FLOAT_ACCUM)(-MAX_VAL_ACCUM); +// bool found = false; // May remain false if bot contains only NaNs/-INFs. +// uint32_t d_save = 0; +// uint32_t h_save = 0; +// uint32_t w_save = 0; +// #endif +// for(size_t bd = dstart; bd < dend; ++bd) +// { +// for(uint32_t bh = hstart; bh < hend; ++bh) +// { +// for(uint32_t bw = wstart; bw < wend; ++bw) +// { +// const size_t bot_index = nn * bot_n_stride + // +// cc * bot_c_stride + // +// bd * bot_d_stride + // +// static_cast(bh * bot_h_stride) + // +// static_cast(bw * bot_w_stride); +// #if AVERAGE_OPS +// res += static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]); +// #else // MAX +// if(static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]) > res) +// { +// res = bot_ptr[bot_index]; +// if(save_index) +// { +// found = true; +// d_save = bd; +// h_save = bh; +// w_save = bw; +// } +// } +// #endif +// } +// } +// } + +// #if AVERAGE_OPS +// res *= CVT_FP32_2ACCUM(1.f) / static_cast<_FLOAT_ACCUM>(pool_size); +// #else // MAX +// res *= 1.0; // TEMPCODE RJS fix UNUSED +// if(save_index) +// { +// index_t res_index = 0; + +// / Preventing overflow during computation of res_index: +// / If Index is shorter than uint, then let's perform computation in 32-bit +// / domain and then convert to narrower Index. That would reduce the probability of +// / overflow. If Index is wider then 32 bits, then it seems like it is better to +// / convert to Index type before multiplication. However this is not actually +// / necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in +// / 32 bits and then convert. + +// if(found) +// { +// if(index_mode == 1) +// res_index = (index_t)(d_save * bot_h * bot_w // +// + h_save * bot_w // +// + w_save); +// else +// res_index = (index_t)( // +// ((d_save - td * filter_d_stride + filter_d_pad) * filter_h * filter_w) // +// + ((h_save - th * filter_h_stride + filter_h_pad) * filter_w) // +// + (w_save - tw * filter_w_stride + filter_w_pad) // +// ); +// } + +// const size_t mask_index = nn * mask_n_stride // +// + cc * mask_c_stride // +// + (size_t)(td * mask_d_stride) // +// + (size_t)(tw * mask_h_stride) // +// + (size_t)(th * mask_w_stride); +// mask_ptr[mask_index] = res_index; +// } +// #endif +// size_t top_index = nn * top_n_stride // +// + cc * top_c_stride // +// + (size_t)(td * top_d_stride) // +// + (size_t)(th * top_h_stride) // +// + (size_t)(tw * top_w_stride); +// top_index = 1; +// junk_ptr[top_index] = (_FLOAT)res; // TEMPCODE RJS +// } +} + +extern "C" __global__ void mloPoolingForwardNDNhwcNaive( + const INPUT_TYPE* __restrict__ bot_ptr, + OUTPUT_TYPE* __restrict__ top_ptr, + OUTPUT_TYPE* __restrict__ junk_ptr, // TEMPCODE RJS + index_t* __restrict__ mask_ptr, + int save_index, + int index_mode, + uint32_t filter_d, uint32_t filter_h, uint32_t filter_w, + uint32_t filter_d_stride, uint32_t filter_h_stride, uint32_t filter_w_stride, + uint32_t filter_d_pad, uint32_t filter_h_pad, uint32_t filter_w_pad, + uint32_t all_n, + uint32_t all_c, + uint32_t bot_d, uint32_t bot_h, uint32_t bot_w, + size_t bot_n_stride, size_t bot_c_stride, uint32_t bot_d_stride, uint32_t bot_h_stride, uint32_t bot_w_stride, + uint32_t top_d, uint32_t top_h, uint32_t top_w, + size_t top_n_stride, size_t top_c_stride, uint32_t top_d_stride, uint32_t top_h_stride, uint32_t top_w_stride, + size_t mask_n_stride, uint32_t mask_c_stride, uint32_t mask_d_stride, uint32_t mask_h_stride, size_t mask_w_stride) +{ + poolingForwardNDNhwcNaive( + bot_ptr, + top_ptr, + junk_ptr, + mask_ptr, + save_index, + index_mode, + filter_d, filter_h, filter_w, + filter_d_stride, filter_h_stride, filter_w_stride, + filter_d_pad, filter_h_pad, filter_w_pad, + all_n, + all_c, + bot_d, bot_h, bot_w, + bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride, bot_w_stride, + top_d, top_h, top_w, + top_n_stride, top_c_stride, top_d_stride, top_h_stride, top_w_stride, + mask_n_stride, mask_c_stride, mask_d_stride, mask_h_stride, mask_w_stride + ); +} diff --git a/src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp deleted file mode 100644 index bc17af8922..0000000000 --- a/src/kernels/MIOpenPoolingFwdNDNhwcNaive.cpp +++ /dev/null @@ -1,245 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2023 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - -#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS -#include -#include -#endif - -#include "pooling_functions.h" - -#include - -#if(MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE) || (MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE) -#define AVERAGE_OPS 1 -#else -#define AVERAGE_OPS 0 -#endif - -// Let's use extended-precision accumulator only in FP16 pooling and only for averaging. -// For all other ops and datatypes, use native accumulator, i.e. treate FLOAT_ACCUM as FLOAT. -#if !(AVERAGE_OPS && MIOPEN_USE_FP16) -#define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 1 -#endif -#include "float_types.h" - -#ifndef MLO_POOLING_IS2D_KERNEL -#error "MLO_POOLING_IS2D_KERNEL must be defined" -#endif - -#if AVERAGE_OPS -#define ARG_UNUSED_FOR_AVERAGE __attribute__((__unused__)) -#else -#define ARG_UNUSED_FOR_AVERAGE -#endif - -#if MLO_POOLING_IS2D_KERNEL -#define ARG_UNUSED_FOR_2D __attribute__((__unused__)) -#else -#define ARG_UNUSED_FOR_2D -#endif - -// Out N, D, H are encoded into the block indices x, y, z -// Requires all lens, strides, pads to be in DHW[NC] order. The code is -// cleaner and more performant this way. -// No 2D-only optimization. -template -__device__ void poolingFwdNDNhwcNaive(const TI* in_data, - TO* out_data, - ARG_UNUSED_FOR_AVERAGE index_t* mask_ptr, - ARG_UNUSED_FOR_AVERAGE int save_index, - ARG_UNUSED_FOR_AVERAGE int index_mode, - std::vector filter_lens, - std::vector filter_strides, - std::vector filter_pads, - uint32_t all_n, - uint32_t all_c, - std::vector lens, - std::vector strides, - std::vector out_lens, - std::vector out_strides, - ARG_UNUSED_FOR_AVERAGE std::vector mask_strides) -{ - constexpr uint32_t D_IDX = 0; - constexpr uint32_t H_IDX = 1; - constexpr uint32_t W_IDX = 2; - constexpr uint32_t N_IDX = 3; - constexpr uint32_t C_IDX = 4; - - const uint32_t b = blockIdx.x; // out N - if(!(b < all_n)) - return; - - const uint32_t k = blockIdx.y; // out D - if(!(k < out_lens[D_IDX])) - return; - - const uint32_t j = blockIdx.z; // out H - if(!(j < out_lens[H_IDX])) - return; - - for(uint32_t i = 0; i < out_lens[W_IDX]; ++i) // out W - { - for(uint32_t o = 0; o < all_c ++o) // out C - { - const auto int_dstart = static_cast(k * filter_strides[D_IDX]) - static_cast(filter_pads[D_IDX]); - const auto int_hstart = static_cast(j * filter_strides[H_IDX]) - static_cast(filter_pads[H_IDX]); - const auto int_wstart = static_cast(i * filter_strides[W_IDX]) - static_cast(filter_pads[W_IDX]); - const auto dend = static_cast(min(int_dstart + static_cast(filter_lens[D_IDX]), static_cast(out_lens[D_IDX]))); - const auto hend = static_cast(min(int_hstart + static_cast(filter_lens[H_IDX]), static_cast(out_lens[H_IDX]))); - const auto wend = static_cast(min(int_wstart + static_cast(filter_lens[W_IDX]), static_cast(out_lens[W_IDX]))); - const auto dstart = static_cast(max(int_dstart, 0)); - const auto hstart = static_cast(max(int_hstart, 0)); - const auto wstart = static_cast(max(int_wstart, 0)); - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - pool_size = (pool_size == 0) ? 1 : pool_size; -#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE - const uint32_t pool_size = filter_lens[D_IDX] * filter_lens[H_IDX] * filter_lens[W_IDX]; -#endif - -#if AVERAGE_OPS - _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0); -#else // MAX - _FLOAT_ACCUM res = (_FLOAT_ACCUM)(-MAX_VAL_ACCUM); - bool found = false; // May remain false if bot contains only NaNs/-INFs. - uint32_t d_save = 0; - uint32_t h_save = 0; - uint32_t w_save = 0; -#endif - for(size_t d = dstart; d < dend; ++d) - { - for(uint32_t h = hstart; h < hend; ++h) - { - for(uint32_t w = wstart; w < wend; ++w) - { - const size_t in_index = b * strides[N_IDX] + // - o * strides[C_IDX] + // - d * strides[D_IDX] + // - static_cast(h * strides[H_IDX]) + // - static_cast(w * strides[W_IDX]); -#if AVERAGE_OPS - res += in_data[in_index]; -#else // MAX - if(static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index] > res)) - { - res = in_data[in_index]; - if(save_index) - { - found = true; - d_save = d; - h_save = h; - w_save = w; - } - } -#endif - } - } - } - -#if AVERAGE_OPS - res *= CVT_FP32_2ACCUM(1.f) / static_cast<_FLOAT_ACCUM>(pool_size); -#else // MAX - if(save_index) - { - index_t res_index = 0; - - /// Preventing overflow during computation of res_index: - /// If Index is shorter than uint, then let's perform computation in 32-bit - /// domain and then convert to narrower Index. That would reduce the probability of - /// overflow. If Index is wider then 32 bits, then it seems like it is better to - /// convert to Index type before multiplication. However this is not actually - /// necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in - /// 32 bits and then convert. - - if(found) - { - if(index_mode == 1) - res_index = (index_t)(d_save * lens[H_IDX] * lens[W_IDX] // - + h_save * lens[W_IDX] // - + w_save); - else - res_index = (index_t)( // - ((d_save - k * filter_strides[D_IDX] + filter_pads[D_IDX]) * filter_lens[W_IDX] * filter_lens[H_IDX]) // - + ((h_save - j * filter_strides[H_IDX] + filter_pads[H_IDX]) * filter_lens[W_IDX]) // - + (w_save - i * filter_strides[W_IDX] + filter_pads[W_IDX]) // - ); - } - - const size_t mask_index = b * mask_strides[N_IDX] // - + o * mask_strides[C_IDX] // - + (size_t)(k * mask_strides[D_IDX]) // - + (size_t)(j * mask_strides[H_IDX]) // - + (size_t)(i * mask_strides[W_IDX]); - mask_ptr[mask_index] = res_index; - } -#endif - const size_t out_index = out_strides[N_IDX] // - + o * out_strides[C_IDX] // - + (size_t)(k * out_strides[D_IDX]) // - + (size_t)(j * out_strides[H_IDX]) // - + (size_t)(i * out_strides]W_IDX]); - - out_data[out_index] = (_FLOAT)res; - } -} -} - -extern "C" __global__ void mloPoolingForwardNDNhwcNaive(const INPUT_TYPE* __restrict__ in_data, - OUTPUT_TYPE* out_data, - ARG_UNUSED_FOR_AVERAGE index_t* mask_ptr, - ARG_UNUSED_FOR_AVERAGE int save_index, - ARG_UNUSED_FOR_AVERAGE int index_mode, - std::vector filter_lens, - std::vector filter_strides, - std::vector filter_pads, - uint32_t all_n, - uint32_t all_c, - std::vector lens, - std::vector strides, - std::vector out_lens, - std::vector out_strides, - ARG_UNUSED_FOR_AVERAGE std::vector mask_strides) -{ - poolingFwdNDNhwcNaive( - in_data, - out_data, - mask_ptr, - save_index, - index_mode, - filter_lens, - filter_strides, - filter_pads, - all_n, - all_c, - lens, - strides, - out_lens, - out_strides, - mask_strides - ); -} diff --git a/src/ocl/pooling_ocl.cpp b/src/ocl/pooling_ocl.cpp index 9a2258908f..1b22cd91b7 100644 --- a/src/ocl/pooling_ocl.cpp +++ b/src/ocl/pooling_ocl.cpp @@ -42,7 +42,7 @@ static auto PoolingForwardSolvers() return solver::SolverContainer(alpha)), 1.0) || @@ -131,6 +132,7 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle, tmp.y = y; tmp.workspace = workSpace; tmp.workspace_size = workSpaceSize; + tmp.junk = junk; // TEMPCODE RJS return tmp; }(); diff --git a/src/pooling.cpp b/src/pooling.cpp index a65cb3c0ab..000e297f3c 100644 --- a/src/pooling.cpp +++ b/src/pooling.cpp @@ -217,13 +217,14 @@ TensorDescriptor PoolingDescriptor::GetForwardOutputTensor(const TensorDescripto { std::vector out_dim(xDesc.GetNumDims()); GetForwardOutputDimNd(xDesc, xDesc.GetNumDims(), out_dim.data()); + auto layout_str = xDesc.GetLayout_str(); + auto layout = xDesc.GetLayout_t(); + auto lengths_layout = miopen::tensor_layout_get_default(xDesc.GetNumDims()); - const std::string default_layout = tensor_layout_get_default(xDesc.GetNumDims()); - const std::string in_layout = xDesc.GetLayout(default_layout); std::vector out_strides; - tensor_layout_to_strides(out_dim, default_layout, in_layout, out_strides); + tensor_layout_to_strides(out_dim, lengths_layout, layout_str, out_strides); - return {xDesc.GetType(), out_dim, out_strides}; + return {xDesc.GetType(), layout, out_dim, out_strides}; } std::size_t PoolingDescriptor::GetWorkSpaceSize(const TensorDescriptor& yDesc) const diff --git a/src/pooling/problem_description.cpp b/src/pooling/problem_description.cpp index 8e171a4ac0..ad36abfb08 100644 --- a/src/pooling/problem_description.cpp +++ b/src/pooling/problem_description.cpp @@ -82,8 +82,14 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const ss << "_dxs" << get_vect_config(dxDesc.GetStrides()); ss << "_dyd" << get_vect_config(dyDesc.GetLengths()); ss << "_dys" << get_vect_config(dyDesc.GetStrides()); + } // TEMPCODE RJS + std::cout << "\n************** xDesc layout: " << xDesc.GetLayout_str() << " *************************" << std::endl; + if(!xDesc.IsDefaultLayout()) + { + std::cout << " xDesc layout is not default! " << " *************************\n"; } - + ss << "_l" << (xDesc.IsDefaultLayout() ? 0 : 1); +std::cout << " " << ss.str() << std::endl; return NetworkConfig{ss.str()}; } diff --git a/src/solver.cpp b/src/solver.cpp index 0088a10cb1..28ca3751a8 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -600,7 +600,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) fusion::ConvCKIgemmFwdBiasActivFused{}.SolverDbId(), miopenConvolutionAlgoImplicitGEMM); Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNaive{}.SolverDbId()); - Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNdNhwcNaive{}.SolverDbId()); + Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNDNhwcNaive{}.SolverDbId()); RegisterWithSolver(registry, ++id, conv::ConvHipImplicitGemmGroupFwdXdlops{}, diff --git a/src/solver/pooling/forward2d.cpp b/src/solver/pooling/forward2d.cpp index 87fd0e851f..d3fe890ecd 100644 --- a/src/solver/pooling/forward2d.cpp +++ b/src/solver/pooling/forward2d.cpp @@ -135,7 +135,8 @@ std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& pro bool PoolingForward2d::IsApplicable(const ExecutionContext& context, const miopen::pooling::ProblemDescription& problem) const { - return problem.GetDirection() == miopen::pooling::Direction::Forward && + bool app = + problem.GetDirection() == miopen::pooling::Direction::Forward && problem.GetXDesc().GetNumDims() == 4 && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() && (problem.GetXDesc().GetType() == miopenFloat || @@ -144,6 +145,23 @@ bool PoolingForward2d::IsApplicable(const ExecutionContext& context, problem.GetYDesc().GetLayout("NCHW") == "NCHW" && sizeof_private_memory(problem) <= TargetProperties::GetMaxWaveScratchSize() / context.GetStream().GetWavefrontWidth(); + +// TEMPCODE RJS + std::cout << "%%%%%%%%%% PoolingForward2d::IsApplicable: " << app << " " << problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout("NCHW") << std::endl; + return false; + return app; +} + +#include // TEMPCODE RJS +namespace { + template + void printVec(std::string name, const std::vector& vec) + { + return; + std::cout << "Vector Printing: " << std::setw(20) << name << "[" << vec.size() << "]: "; + for(auto i : vec) std::cout << std::setw(8) << i; + std::cout << std::endl; + } } ConvSolution PoolingForward2d::GetSolution(const ExecutionContext&, @@ -182,6 +200,25 @@ ConvSolution PoolingForward2d::GetSolution(const ExecutionContext&, : ((pool_d.GetMode() == miopenPoolingAverage) ? MLO_POOLING_OP_AVE : MLO_POOLING_OP_AVE_INCLUSIVE); + + // TEMPCODE RJS + const auto bot = problem.GetXDesc(); + const auto top = problem.GetYDesc(); + const auto& pooling = problem.GetPooling(); + const auto& lengths = pooling.GetLengths(); + const auto& strides = pooling.GetStrides(); + const auto& pads = pooling.GetPads(); + + std::cout << "======================================================================" << std::endl; + printVec("bot lengths", bot.GetLengths()); + printVec("bot strides", bot.GetStrides()); + printVec("top lengths", top.GetLengths()); + printVec("top strides", top.GetStrides()); + printVec("pool lengths", lengths); + printVec("pool strides", strides); + printVec("pool pads", pads); + std::cout << "======================================================================" << std::endl; + auto build_params = KernelBuildParameters{ {"MLO_POOLING_OP_ID", pooling_method}, {"MLO_POOLING_KERNEL_SZ1", kp.kernel_size_h}, diff --git a/src/solver/pooling/forwardCk2d.cpp b/src/solver/pooling/forwardCk2d.cpp index d66da12a33..9681f7f563 100644 --- a/src/solver/pooling/forwardCk2d.cpp +++ b/src/solver/pooling/forwardCk2d.cpp @@ -135,6 +135,7 @@ std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& pro bool PoolingForwardCk2d::IsApplicable(const ExecutionContext& context, const miopen::pooling::ProblemDescription& problem) const { + return false; return problem.GetDirection() == miopen::pooling::Direction::Forward && problem.GetXDesc().GetNumDims() == 4 && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() && diff --git a/src/solver/pooling/forwardNaive.cpp b/src/solver/pooling/forwardNaive.cpp index 86d24a03de..faf4fac836 100644 --- a/src/solver/pooling/forwardNaive.cpp +++ b/src/solver/pooling/forwardNaive.cpp @@ -68,23 +68,35 @@ inline uint32_t RoundUpNearestPower2Positive(uint32_t v) bool PoolingForwardNaive::IsApplicable(const ExecutionContext&, const miopen::pooling::ProblemDescription& problem) const { - return problem.GetDirection() == miopen::pooling::Direction::Forward // - && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // - && (problem.GetXDesc().GetType() == miopenFloat // - || problem.GetXDesc().GetType() == miopenHalf) // - && (problem.GetPooling().GetMode() == miopenPoolingMax // - || problem.GetPooling().GetMode() == miopenPoolingAverage // - || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) // - && (problem.GetXDesc().GetLayout("NCDHW") == problem.GetYDesc().GetLayout("NCDHW")) // - && ( // - (problem.GetXDesc().GetNumDims() == 5 // - && (problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" // - || problem.GetXDesc().GetLayout("NCDHW") == "NDHWC")) // - || // - (problem.GetXDesc().GetNumDims() == 4 // - && (problem.GetXDesc().GetLayout("NCHW") == "NCHW" // - || problem.GetYDesc().GetLayout("NCHW") == "NHWC")) // - ); + auto x_type = problem.GetXDesc().GetType(); + auto y_type = problem.GetYDesc().GetType(); + std::vector types {miopenFloat, miopenHalf}; + + auto mode = problem.GetPooling().GetMode(); + std::vector modes {miopenPoolingMax, miopenPoolingAverage, miopenPoolingAverageInclusive}; + + auto x_layout = problem.GetXDesc().GetLayout_str(); + auto y_layout = problem.GetYDesc().GetLayout_str(); + std::vector layouts {"NCHW", "NCDHW"}; + + return (problem.GetDirection() == miopen::pooling::Direction::Forward) // + && (x_type == y_type) // + && (std::find(types.cbegin(), types.cend(), x_type) != types.cend()) // + && (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend()) //) + && (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()); +} + + +#include // TEMPCODE RJS +namespace { + template + void printVec(std::string name, const std::vector& vec) + { + return; + std::cout << "Vector Printing: " << std::setw(20) << name << "[" << vec.size() << "]: "; + for(auto i : vec) std::cout << std::setw(8) << i; + std::cout << std::endl; + } } ConvSolution @@ -96,7 +108,7 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, const auto bot = problem.GetXDesc(); const auto top = problem.GetYDesc(); const bool is2d = (bot.GetNumDims() == 4); - const bool isTranspose = problem.GetXDesc().GetLayout("NCHW")[1] == 'C'; + const bool isTranspose = problem.GetXDesc().GetLayout_str()[1] != 'C'; // To compact code: const auto& pooling = problem.GetPooling(); @@ -124,6 +136,17 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, const auto index_mode = pooling.GetWorkspaceIndexMode(); const auto index_type = pooling.GetIndexType(); + // TEMPCODE RJS + std::cout << "======================================================================" << std::endl; + printVec("bot lengths", bot.GetLengths()); + printVec("bot strides", bot.GetStrides()); + printVec("top lengths", top.GetLengths()); + printVec("top strides", top.GetStrides()); + printVec("pool lengths", lengths); + printVec("pool strides", strides); + printVec("pool pads", pads); + std::cout << "======================================================================" << std::endl; + /// \anchor multiply_dims_overflow_assumption /// /// Preventing overflow during dimension-related computations: @@ -160,7 +183,8 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, const size_t mask_c_stride = static_cast(mask_d_stride) * top_d; const size_t mask_n_stride = mask_c_stride * all_c; - /// About optimal grid size. The simplest way is to map the problem onto grid is 1:1 mapping of + /// About optimal grid size: + /// NC[D]HW: The simplest way is to map the problem onto grid is 1:1 mapping of /// N,C and top.D onto grid dimensions. /// /// However, this would waste 1 dimension of grid for 2D convolutions, i.e. the grid size would @@ -169,6 +193,10 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, /// access memory in a scattered way, which would affect performance again. Current design /// choice is using separate 2D and 3D kernels (via build-time parameter) and N*C*H grid for 2D. /// + /// N[D]HWC: top N, D, H are mapped directly onto grid dimensions + /// + /// + /// /// \anchor naive_pooling_max_grid_size /// * Assumption: Max grid size is >= 2^32-1 (4G-1) i.e. std::max. /// Currently this limitation is valid for both ROCm HIP and OCL runtimes. @@ -197,9 +225,10 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, const auto is2d_kernel = (top_d == 1); // For 2D + optimize for 3D where the 1st dim is 1. const auto g0 = RoundUpNearestPower2Positive(all_n); - const auto g1 = RoundUpNearestPower2Positive(all_c); - const auto g2 = RoundUpNearestPower2Positive(is2d_kernel ? top_h : top_d); + const auto g1 = RoundUpNearestPower2Positive(isTranspose ? top_d : all_c); + const auto g2 = RoundUpNearestPower2Positive(isTranspose || is2d_kernel ? top_h : top_d); + // TODO RJS: finish NHWC grid auto work_left = wavesize / 1; const auto w0 = (g0 < work_left) ? g0 : work_left; work_left /= w0; @@ -210,16 +239,8 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, { auto kernel = KernelInfo{}; - if(isTranspose) - { - kernel.kernel_file = "MIOpenPoolingFwdNDNhwcNaive.cpp"; - kernel.kernel_name = "mloPoolingForwardNDNhwcNaive"; - } - else - { - kernel.kernel_file = "MIOpenPoolingForwardNaive.cl"; - kernel.kernel_name = "mloPoolingForwardNaive"; - } + kernel.kernel_file = "MIOpenPoolingForwardNaive.cl"; + kernel.kernel_name = "mloPoolingForwardNaive"; auto build_params = KernelBuildParameters{ {"MLO_POOLING_OP_ID", pooling_method}, // We need this at compile time in order to @@ -227,11 +248,9 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, {"MLO_POOLING_INDEX_TYPE", get_pooling_index_type_name(index_type)}, {"MLO_POOLING_IS2D_KERNEL", static_cast(is2d_kernel)}, }; + build_params << GetDataTypeKBP(bot.GetType()); - if(isTranspose) - kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); - else - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); // [Informative] The total number of kernels required to cover the whole // forward pooling problem space is 3*4*2*2 = 48. The solver is dynamic. @@ -250,79 +269,50 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, result.construction_params.push_back(kernel); } - if(isTranspose) - { - result.invoker_factory = [=](const std::vector& kernels) { - return [=](const Handle& handle, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - // NOTE: kernel 'mloPoolingForwardNDNhwcNaive' expects indices in DHW[NC] order - kernel(params.x, - params.y, - params.workspace, - save_index, - index_mode, - std::vector{filter_d, filter_h, filter_w}, - std::vector{filter_d_stride, filter_h_stride, filter_w_stride}, - std::vector{filter_d_pad, filter_h_pad, filter_w_pad}, - all_n, - all_c, - std::vector{bot_d, bot_h, bot_w}, - std::vector{bot_d_stride, bot_h_stride, bot_w_stride, bot_n_stride, bot_c_stride}, - std::vector{top_d, top_h, top_w}, - std::vector{top_d_stride, top_h_stride, top_w_stride, top_n_stride, top_c_stride}, - std::vector{mask_d_stride, mask_h_stride, mask_w_stride, mask_n_stride, mask_c_stride}); - }; + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + kernel(params.x, + params.y, + params.workspace, + save_index, + index_mode, + filter_d, + filter_h, + filter_w, + filter_d_stride, + filter_h_stride, + filter_w_stride, + filter_d_pad, + filter_h_pad, + filter_w_pad, + all_n, + all_c, + bot_d, // TEMPCODE RJS: have not broke it + bot_h, + bot_w, + bot_n_stride, + bot_c_stride, + bot_d_stride, + bot_h_stride, + bot_w_stride, + top_d, + top_h, + top_w, + top_n_stride, + top_c_stride, + top_d_stride, + top_h_stride, + top_w_stride, + mask_n_stride, + mask_c_stride, + mask_d_stride, + mask_h_stride, + mask_w_stride); }; - } - else - { - result.invoker_factory = [=](const std::vector& kernels) { - return [=](const Handle& handle, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - kernel(params.x, - params.y, - params.workspace, - save_index, - index_mode, - filter_d, - filter_h, - filter_w, - filter_d_stride, - filter_h_stride, - filter_w_stride, - filter_d_pad, - filter_h_pad, - filter_w_pad, - all_n, - all_c, - bot_h, - bot_d, // TODO RJS: broke it - bot_w, - bot_n_stride, - bot_c_stride, - bot_d_stride, - bot_h_stride, - bot_w_stride, - top_d, - top_h, - top_w, - top_n_stride, - top_c_stride, - top_d_stride, - top_h_stride, - top_w_stride, - mask_n_stride, - mask_c_stride, - mask_d_stride, - mask_h_stride, - mask_w_stride); - }; - }; - } + }; return result; } diff --git a/src/solver/pooling/forwardNdNhwcNaive.cpp b/src/solver/pooling/forwardNdNhwcNaive.cpp new file mode 100644 index 0000000000..c946fd8da7 --- /dev/null +++ b/src/solver/pooling/forwardNdNhwcNaive.cpp @@ -0,0 +1,345 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +#define WORKAROUND_ISSUE_MIFIN_80 1 // https://github.com/ROCm/MIFin/issues/80 + +namespace miopen { + +namespace solver { + +namespace pooling { + +namespace { + +#if !MIOPEN_NDEBUG && !WORKAROUND_ISSUE_MIFIN_80 +template +bool IsPower2(T v) +{ + return (v != 0) && ((v & (v - 1)) == 0); +} +#endif + +template +T RoundUpNearestPower2Positive(T v) = delete; + +inline uint32_t RoundUpNearestPower2Positive(uint32_t v) +{ + assert(v > 0); + --v; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return std::max(++v, 1U); // Shut clang-tidy. +} + +} // namespace + +bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + auto x_type = problem.GetXDesc().GetType(); + auto y_type = problem.GetYDesc().GetType(); + std::vector types {miopenFloat, miopenHalf}; + + auto mode = problem.GetPooling().GetMode(); + std::vector modes {miopenPoolingMax, miopenPoolingAverage, miopenPoolingAverageInclusive}; + + auto x_layout = problem.GetXDesc().GetLayout_str(); + auto y_layout = problem.GetYDesc().GetLayout_str(); + std::vector layouts {"NHWC", "NDHWC"}; + + bool app = (problem.GetDirection() == miopen::pooling::Direction::Forward) // + && (x_type == y_type) // + && (x_layout == y_layout) // + && (std::find(types.cbegin(), types.cend(), x_type) != types.cend()) // + && (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend()) //) + && (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()); + + std::cout << "%%%%%%%%%% PoolingForwardNDNhwcNaive::IsApplicable: " << app << " " << problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout("NCHW") + << " " << problem.GetYDesc().GetLayout_str() << "->" << problem.GetYDesc().GetLayout("NCHW") + << " " << (problem.GetDirection() == miopen::pooling::Direction::Forward) + << (x_type == y_type) + << (x_layout == y_layout) << (std::find(types.cbegin(), types.cend(), x_type) != types.cend()) + << (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend()) << (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()) << std::endl; + return app; +} + +#include // TEMPCODE RJS +namespace { + template + void printVec(std::string name, const std::vector& vec) + { + return; + std::cout << "Vector Printing: " << std::setw(20) << name << "[" << vec.size() << "]: "; + for(auto i : vec) std::cout << std::setw(8) << i; + std::cout << std::endl; + } +} + +ConvSolution +PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context, + const miopen::pooling::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto input_dtype = miopen::GetDataType(problem.GetXDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetYDesc().GetType()); + + const auto bot = problem.GetXDesc(); + const auto top = problem.GetYDesc(); + const bool is2d = (bot.GetNumDims() == 4); + const bool is_transpose = problem.GetXDesc().GetLayout_str()[1] != 'C'; + if(!is_transpose) + { + MIOPEN_THROW("Tried to run NHWC solver on NCHW data"); + } + + // To compact code: + const auto& pooling = problem.GetPooling(); + const auto& lengths = pooling.GetLengths(); + const auto& strides = pooling.GetStrides(); + const auto& pads = pooling.GetPads(); + + // This also deduces 3D (DHW) parameters from 2D (HW) descriptor. + const uint32_t filter_w = lengths[is2d ? 1 : 2]; + const uint32_t filter_h = lengths[is2d ? 0 : 1]; + const uint32_t filter_d = is2d ? 1 : lengths[0]; + const uint32_t filter_w_stride = strides[is2d ? 1 : 2]; + const uint32_t filter_h_stride = strides[is2d ? 0 : 1]; + const uint32_t filter_d_stride = is2d ? (filter_h_stride * filter_d) : strides[0]; + const uint32_t filter_w_pad = pads[is2d ? 1 : 2]; + const uint32_t filter_h_pad = pads[is2d ? 0 : 1]; + const uint32_t filter_d_pad = is2d ? 0 : pads[0]; + + const int pooling_method = (pooling.GetMode() == miopenPoolingMax) ? MLO_POOLING_OP_MAX + : (pooling.GetMode() == miopenPoolingAverage) + ? MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE; + + const auto save_index = problem.SaveIndex(); + const auto index_mode = pooling.GetWorkspaceIndexMode(); + const auto index_type = pooling.GetIndexType(); + + /// \anchor multiply_dims_overflow_assumption + /// + /// Preventing overflow during dimension-related computations: + /// Let's assume that multiplication of three dims always fits into 32 bits (unsigned). + /// Then let's use size_t when we need to multiply more than three dims. + /// For example, in NCDHW layout, the N and C strides are results of multiplication + /// of >= 3 dims, so we have to use size_t for storing them. + /// + /// We need to pay special attention to muls of D stride with some other dims. + /// The D stride is a result of 2 muls. Therefore (d_stride * dim) does + /// not require widening to size_t prior mul, but (d_stride * dim * dim) + /// requires it because the total number of muls is 4. + + // TEMPCODE RJS + std::cout << "======================================================================" << std::endl; + printVec("bot lengths", bot.GetLengths()); + printVec("bot strides", bot.GetStrides()); + printVec("top lengths", top.GetLengths()); + printVec("top strides", top.GetStrides()); + printVec("pool lengths", lengths); + printVec("pool strides", strides); + printVec("pool pads", pads); + std::cout << "======================================================================" << std::endl; + + const auto spatial_dim = is2d ? 2U : 3U; + + size_t all_n, all_c, bot_d, bot_h, bot_w; + std::tie(all_n, all_c, bot_d, bot_h, bot_w) = miopen::GetNCDHW(spatial_dim, bot.GetLengths()); + + size_t bot_n_stride, bot_d_stride; + size_t bot_h_stride, bot_w_stride, bot_c_stride; + std::tie(bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride, bot_w_stride) = + miopen::GetNCDHW(spatial_dim, bot.GetStrides()); + + size_t a1, a2, top_d, top_h, top_w; + std::tie(a1, a2, top_d, top_h, top_w) = + miopen::GetNCDHW(spatial_dim, top.GetLengths()); + std::cout << "GetSol: top_lens " << a1 << " " << a2 << " " << top_d << " " << top_h << " " << top_w << std::endl; + + size_t top_n_stride, top_d_stride; + size_t top_h_stride, top_w_stride, top_c_stride; + std::tie(top_n_stride, top_c_stride, top_d_stride, top_h_stride, top_w_stride) = + miopen::GetNCDHW(spatial_dim, top.GetStrides()); + // TEMPCODE RJS + std::cout << "GetSol: top_strides " << top_n_stride << " " << top_c_stride << " " + << top_d_stride << " " << top_h_stride << " " << top_w_stride << std::endl; + // Mask data is always NCDHW layout + const uint32_t mask_w_stride = 1; + const uint32_t mask_h_stride = mask_w_stride * top_w; + const uint32_t mask_d_stride = mask_h_stride * top_h; + const size_t mask_c_stride = static_cast(mask_d_stride) * top_d; + const size_t mask_n_stride = mask_c_stride * all_c; + + /// About optimal grid size: + /// top D, H, and W are mapped directly onto grid dimensions, except in very small problems + /// when they are combined into workgroup items in an attempt to improve overlapping and coalescense. + /// N seems to be generally small, so we'll multiply it into the 'D' dimension. + /// + /// \anchor naive_pooling_max_grid_size + /// * Assumption: Max grid size is >= 2^32-1 (4G-1) i.e. std::max. + /// However, assume the product of two dimensions is always <= 2^30. + /// Currently this limitation is valid for both ROCm HIP and OCL runtimes. + /// + /// Selecting the optimal workgroup size is an interesting problem. + /// We'll first map c into the workgroup up to the maximum 1024 items. For large C, the + /// extra are mapped into the grid dimensions. + /// For small C, w and h are mapped into the workgroup dimensions as needed, in that + /// order, up to a maximum of 128 workitems (favoring more active blocks over more threads). + /// We do permit a partial workgroup when it is not an exact multiple of the wavefront size. + /// As said above, remaining H and W are mapped onto the grid dimensions. + /// + /// The workgroup size does not have the restrictions imposed by synchronization between + /// workitems because the kernel does not require synchronization. + + std::ignore = context; + constexpr uint32_t LARGE_C_MAX_ITEMS = 512; + constexpr uint32_t SMALL_C_MAX_ITEMS = 128; + + auto nd_ = all_n * top_d; + auto h_ = top_h; + auto w_ = top_w; + auto c_ = all_c; + + uint32_t l1 = 1U; + uint32_t l2 = 1U; + + if(c_ > LARGE_C_MAX_ITEMS) + { + auto c2 = c_ / LARGE_C_MAX_ITEMS + 1; + c_ = LARGE_C_MAX_ITEMS; + w_ *= c2; + } + // else if(c_ <= SMALL_C_MAX_ITEMS / 2) + // { + // if(c_ * w_ <= SMALL_C_MAX_ITEMS) + // { + // std::swap(l2, w_); + + // if(c_ * w_ * h_ <= SMALL_C_MAX_ITEMS) + // { + // std::swap(l1, h_); + // } + // } + // } + + const auto g0 = nd_; + const auto g1 = h_; + const auto g2 = w_; + const auto l0 = c_; + + { + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenPoolingForwardNDNhwcNaive.cpp"; + kernel.kernel_name = "mloPoolingForwardNDNhwcNaive"; + + auto build_params = KernelBuildParameters{ + {"MLO_POOLING_OP_ID", pooling_method}, // We need this at compile time in order to + // engage mixed precision only when necessary. + {"MLO_POOLING_INDEX_TYPE", get_pooling_index_type_name(index_type)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype} + }; + + build_params << GetDataTypeKBP(bot.GetType()); + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + // [Informative] The total number of kernels required to cover the whole + // forward pooling problem space is 3*4*2*2*2 = 96. The solver is dynamic. + // * 3: the number of supported operations + // * 4: the number of supported index types + // * 2: the number of supported data types + // * 2: layout (NCHW vs NHWC) + // * 2: 2D and 3D kernels (optimization) + + kernel.g_wk.clear(); + kernel.g_wk.push_back(g0); + kernel.g_wk.push_back(g1); + kernel.g_wk.push_back(g2); + kernel.l_wk.clear(); + kernel.l_wk.push_back(l0); + kernel.l_wk.push_back(l1); + kernel.l_wk.push_back(l2); + + // TEMPCODE RJS + std::cout << "Kernel dims: " << kernel.g_wk.size() << " " << kernel.g_wk[0] << " " << kernel.g_wk[1] << " " << kernel.g_wk[2] + << " | " << kernel.l_wk.size() << " " << kernel.l_wk[0] << " " << kernel.l_wk[1] << " " << kernel.l_wk[2] << std::endl; + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + kernel( + params.x, + params.y, + params.junk, // TEMPCODE RJS + params.workspace, + save_index, + index_mode, + filter_d, filter_h, filter_w, + filter_d_stride, filter_h_stride, filter_w_stride, + filter_d_pad, filter_h_pad, filter_w_pad, + all_n, + all_c, + bot_d, bot_h, bot_w, + bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride, bot_w_stride, + top_d, top_h, top_w, + top_n_stride, top_c_stride, top_d_stride, top_h_stride, top_w_stride, + mask_n_stride, mask_c_stride, mask_d_stride, mask_h_stride, mask_w_stride); + }; + }; + + return result; +} + +std::size_t +PoolingForwardNDNhwcNaive::GetWorkspaceSize(const ExecutionContext&, + const miopen::pooling::ProblemDescription& problem) const +{ + if(problem.GetPooling().GetMode() != miopenPoolingMax || !problem.SaveIndex()) + return 0; + return problem.GetYDesc().GetElementSize() * get_data_size(problem.GetPooling().GetIndexType()); +} + +} // namespace pooling + +} // namespace solver + +} // namespace miopen diff --git a/src/tensor.cpp b/src/tensor.cpp index 7ec4c4e581..f189dcf8da 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -153,6 +153,14 @@ TensorDescriptor::TensorDescriptor(miopenDataType_t t, { } +TensorDescriptor::TensorDescriptor(miopenDataType_t t, + miopenTensorLayout_t layout_in, + const std::vector& lens_in, + const std::vector& strides_in) + : TensorDescriptor(t, layout_in, ConvertLengthsOrThrow(lens_in, "Lengths must be > 0"), ConvertLengthsOrThrow(strides_in, "Strides must be > 0")) +{ +} + TensorDescriptor::TensorDescriptor(miopenDataType_t t, miopenTensorLayout_t layout_in, const std::initializer_list& lens_in) @@ -446,6 +454,11 @@ std::string TensorDescriptor::GetLayoutStr(miopenTensorLayout_t tensorLayout) std::string TensorDescriptor::GetLayout_str() const { return GetLayoutStr(this->tensorLayout); } +bool TensorDescriptor::IsDefaultLayout() const +{ + return IsDefaultLayout(tensorLayout, lens.size() - 2); +} + std::size_t TensorDescriptor::GetVectorLength() const { return this->vector_length; } std::size_t TensorDescriptor::GetIndex(std::initializer_list l) const diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 434fdfe5df..0211790493 100755 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -192,6 +192,8 @@ message(STATUS "MIOPEN_TEST_COMPOSABLEKERNEL ${MIOPEN_TEST_COMPOSABLEKERNEL}") message(STATUS "MIOPEN_TEST_DISCRETE ${MIOPEN_TEST_DISCRETE}") message(STATUS "MIOPEN_TEST_DBSYNC ${MIOPEN_TEST_DBSYNC}") message(STATUS "CODECOV_TEST ${CODECOV_TEST}") +message(STATUS "CMAKE_CTEST_COMMAND ${CMAKE_CTEST_COMMAND}") +message(STATUS "CMAKE_CFG_INTDIR ${CMAKE_CFG_INTDIR}") if(MIOPEN_TEST_DRIVER_ITER_MODE) add_definitions(-DMIOPEN_TEST_DRIVER_MODE=2) diff --git a/test/gtest/ex1.cpp b/test/gtest/ex1.cpp new file mode 100644 index 0000000000..4b1ab5e2b1 --- /dev/null +++ b/test/gtest/ex1.cpp @@ -0,0 +1,15 @@ +#include + +struct paramType { std::string value; }; + +class MyFixture : public testing::TestWithParam {}; +class FixtureA : public MyFixture {}; +class FixtureB : public MyFixture {}; + +TEST_P(FixtureA, TestNameA0) { auto& myParam = GetParam(); EXPECT_GT(myParam.value.size(), 0ULL); } +TEST_P(FixtureB, TestNameA0) { auto& myParam = GetParam(); EXPECT_GT(myParam.value.size(), 0ULL); } + +INSTANTIATE_TEST_SUITE_P(PIN0, FixtureA, testing::Values(paramType{"v00"}, paramType{"v01"})); +INSTANTIATE_TEST_SUITE_P(PIN1, FixtureA, testing::Values(paramType{"v10"}, paramType{"v11"}, paramType{"v12"})); +INSTANTIATE_TEST_SUITE_P(PIN2, FixtureB, testing::Values(paramType{"v00"}, paramType{"v11"})); + diff --git a/test/gtest/layout_transpose.cpp b/test/gtest/layout_transpose.cpp index 54d6cc21a7..1161eff2b9 100644 --- a/test/gtest/layout_transpose.cpp +++ b/test/gtest/layout_transpose.cpp @@ -274,7 +274,52 @@ struct LayoutTransposeTest_2D : public ::testing::TestWithParam{tensor_len, tensor_strides}.generate(gen_value); + auto t_src = tensor{tensor_len, tensor_strides}; + bool printing = true; // in_dim[0]==8 && in_dim[1]==8; + if(printing) + { + auto inlen = t_src.desc.GetLengths(); + auto instr = t_src.desc.GetStrides(); + std::cout << "CPU in : "; + for(auto dim : inlen) std::cout << std::setw(4) << dim; + std::cout << " | "; + for(auto str : instr) std::cout << std::setw(4) << str; + std::cout << std::endl; + + for(int nn = 0; nn < inlen[0]; ++nn) { + for(int cc = 0; cc < inlen[1]; ++cc) { + for(int hh = 0; hh < inlen[2]; ++hh) { + for(int ww = 0; ww < inlen[3]; ++ww) { + std::cout << std::setw(11) << std::setprecision(5) << t_src(nn * instr[0] + cc * instr[1] + hh * instr[2] + ww * instr[3]) << " "; + } + std::cout << std::endl; + } + } + } + } + + t_src.generate(gen_value); + if(printing) + { + auto inlen = t_src.desc.GetLengths(); + auto instr = t_src.desc.GetStrides(); + std::cout << "CPU in : "; + for(auto dim : inlen) std::cout << std::setw(4) << dim; + std::cout << " | "; + for(auto str : instr) std::cout << std::setw(4) << str; + std::cout << std::endl; + + for(int nn = 0; nn < inlen[0]; ++nn) { + for(int cc = 0; cc < inlen[1]; ++cc) { + for(int hh = 0; hh < inlen[2]; ++hh) { + for(int ww = 0; ww < inlen[3]; ++ww) { + std::cout << std::setw(11) << std::setprecision(5) << t_src(nn * instr[0] + cc * instr[1] + hh * instr[2] + ww * instr[3]) << " "; + } + std::cout << std::endl; + } + } + } + } auto t_dst = tensor{tensor_len, tensor_strides}; auto t_dst_gpu = tensor{tensor_len, tensor_strides}; diff --git a/test/gtest/poolingFwdNdNaive.cpp b/test/gtest/poolingFwdNdNaive.cpp index 3e11396fff..0b7682ccc3 100644 --- a/test/gtest/poolingFwdNdNaive.cpp +++ b/test/gtest/poolingFwdNdNaive.cpp @@ -24,6 +24,8 @@ * *******************************************************************************/ +#ifndef POOLING_GTEST_BUILD + #include #include #include "get_handle.hpp" @@ -32,6 +34,7 @@ #include "pooling2d.hpp" #include "tensor_holder.hpp" +#include "miopen/tensor_layout.hpp" MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLAGS_ARGS) @@ -39,8 +42,9 @@ MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLAGS_ARGS) namespace env = miopen::env; namespace { + template -struct tensor_data +struct layout_data { static std::vector get_layout_lengths(int n, int c, std::vector& dims) { @@ -109,62 +113,24 @@ struct tensor_data } -namespace pooling_tests { +class PoolingFwd : public testing::TestWithParam> {}; +class PoolingFwdFloat : public PoolingFwd {}; +class PoolingFwdHalf : public PoolingFwd {}; -class PoolingFwdNdNaive : public testing::TestWithParam> -{ -}; +void Run2dDriver(miopenDataType_t prec); -static bool SkipTest(void) { return env::disabled(MIOPEN_TEST_ALL); } +namespace { -// void GetArgs(const std::string& param, std::vector& tokens) -// { -// std::stringstream ss(param); -// std::istream_iterator begin(ss); -// std::istream_iterator end; -// while(begin != end) -// tokens.push_back(*begin++); -// } +static bool SkipTest(void) { return env::disabled(MIOPEN_TEST_ALL); } -void Run2dDriver(miopenDataType_t prec) +void GetArgs(const std::string& param, std::vector& tokens) { - - std::vector params; - switch(prec) - { - case miopenFloat: params = Pooling2dFloat::GetParam(); break; - case miopenHalf: params = WidePooling2dHalf::GetParam(); break; - case miopenBFloat16: - case miopenInt8: - case miopenFloat8: - case miopenBFloat8: - case miopenInt32: - case miopenInt64: - case miopenDouble: - FAIL() - << "miopenBFloat16, miopenInt8, miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8 " - "data type not supported by " - "immed_conv2d_codecov test"; - - default: params = Pooling2dFloat::GetParam(); - } - - for(const auto& test_value : params) - { - std::vector tokens; - GetArgs(test_value, tokens); - std::vector ptrs; - - std::transform(tokens.begin(), tokens.end(), std::back_inserter(ptrs), [](const auto& str) { - return str.data(); - }); - - testing::internal::CaptureStderr(); - test_drive(ptrs.size(), ptrs.data()); - auto capture = testing::internal::GetCapturedStderr(); - std::cout << capture; - } -}; + std::stringstream ss(param); + std::istream_iterator begin(ss); + std::istream_iterator end; + while(begin != end) + tokens.push_back(*begin++); +} bool IsTestSupportedForDevice(const miopen::Handle& handle) { return true; } @@ -174,21 +140,24 @@ std::vector GetTestCases(const std::string precision) const std::vector test_cases = { // clang-format off - {"test_pooling2d " + precision + " --all --dataset 2 --limit 0 "+flag_arg} + {"test_pooling2d " + precision + " --all --dataset 1 --limit 0 " + flag_arg} // clang-format on }; return test_cases; } - } // namespace pooling_tests -using namespace pooling_tests; +// using namespace pooling_tests; -/* -TEST_P(Pooling2dFloat, FloatTest_pooling2d_wide) +TEST_P(PoolingFwdFloat, NNT) // NDNaiveTranspose { const auto& handle = get_handle(); - if(IsTestSupportedForDevice(handle) && !SkipTest() && IsTestRunWith("--float")) + if(!IsTestSupportedForDevice(handle)) std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl; + if(SkipTest()) std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl; + // if(!IsTestRunWith("--float")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT!" << std::endl; + // Run2dDriver(miopenFloat); return; // TEMPCODE RJS + // && IsTestRunWith("--float") + if(IsTestSupportedForDevice(handle) && !SkipTest()) { Run2dDriver(miopenFloat); } @@ -197,12 +166,15 @@ TEST_P(Pooling2dFloat, FloatTest_pooling2d_wide) GTEST_SKIP(); } }; -*/ -TEST_P(WidePooling2dHalf, HalfTest_pooling2d_wide) +TEST_P(PoolingFwdHalf, NNT) { const auto& handle = get_handle(); - if(IsTestSupportedForDevice(handle) && !SkipTest() && IsTestRunWith("--half")) + if(!IsTestSupportedForDevice(handle)) std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl; + if(SkipTest()) std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl; + // if(!IsTestRunWith("--half")) std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl; + + if(IsTestSupportedForDevice(handle) && !SkipTest()) // && IsTestRunWith("--half") TEMPCODE RJS { Run2dDriver(miopenHalf); } @@ -212,6 +184,60 @@ TEST_P(WidePooling2dHalf, HalfTest_pooling2d_wide) } }; -// INSTANTIATE_TEST_SUITE_P(Pooling2D, Pooling2dFloat, testing::Values(GetTestCases("--float"))); +void Run2dDriver(miopenDataType_t prec) +{ + auto cases = GetTestCases("--float"); + std::cerr << " Cases: " << cases.size() << std::endl; + for(const auto& test_value : cases) + { + std::cerr << " : " << test_value << std::endl; // TEMPCODE RJS + } + + std::vector params; + switch(prec) + { + case miopenFloat: params = PoolingFwdFloat_NNT_Test::GetParam(); break; + case miopenHalf: params = PoolingFwdHalf_NNT_Test::GetParam(); break; + case miopenBFloat16: + case miopenInt8: + case miopenInt32: + case miopenDouble: + case miopenFloat8: + case miopenBFloat8: + case miopenInt64: + FAIL() + << "miopenBFloat16, miopenInt8, miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8, miopenInt64 " + "data type not supported by " + "poolingFwdNdNaive test"; + + default: params = PoolingFwdFloat_NNT_Test::GetParam(); + } + + std::cerr << "Params: " << params.size() << std::endl; + for(const auto& test_value : params) + { + std::cerr << " : " << test_value << std::endl; // TEMPCODE RJS + } + for(const auto& test_value : params) + { + std::cerr << "Testing: " << test_value << std::endl; // TEMPCODE RJS + std::vector tokens; + GetArgs(test_value, tokens); + std::vector ptrs; + + std::transform(tokens.begin(), tokens.end(), std::back_inserter(ptrs), [](const auto& str) { + return str.data(); + }); + + testing::internal::CaptureStderr(); + test_drive(ptrs.size(), ptrs.data()); + auto capture = testing::internal::GetCapturedStderr(); + std::cout << capture; + } +} + +INSTANTIATE_TEST_SUITE_P(Float, PoolingFwdFloat, testing::Values(GetTestCases("--float"))); + +INSTANTIATE_TEST_SUITE_P(Half, PoolingFwdHalf, testing::Values(GetTestCases("--half"))); -INSTANTIATE_TEST_SUITE_P(Pooling2D, WidePooling2dHalf, testing::Values(GetTestCases("--half"))); +#endif diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp index 399592a1f4..a9ca1446b5 100644 --- a/test/pooling2d.hpp +++ b/test/pooling2d.hpp @@ -57,7 +57,7 @@ struct pooling2d_shapes } // Dataset 1 is intended for testing of asymmetric configs. - static std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 4, 4, 4}}; } + static std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 1, 8, 8}}; } // Dataset 2 is intended for testing of configs with wide window. static std::vector get_2d_pooling_input_shapes_wide() @@ -123,5 +123,6 @@ struct pooling2d_driver : pooling_driver {{0, 0}}})); // clang-format on this->add(this->wsidx, "wsidx", this->generate_data({0, 1})); + this->add(this->layout, "layout", this->generate_data({miopenTensorNHWC})); // , miopenTensorNHWC } }; diff --git a/test/pooling3d.hpp b/test/pooling3d.hpp index c6d80e8e98..d2a64540c6 100644 --- a/test/pooling3d.hpp +++ b/test/pooling3d.hpp @@ -50,6 +50,7 @@ struct pooling3d_driver : pooling_driver { return pooling3d_shapes::get_3d_pooling_input_shapes(); } + pooling3d_driver() : pooling_driver() { this->add( @@ -58,5 +59,6 @@ struct pooling3d_driver : pooling_driver this->add(this->strides, "strides", this->generate_data({{2, 2, 2}, {1, 1, 1}})); this->add(this->pads, "pads", this->generate_data({{0, 0, 0}, {1, 1, 1}})); this->add(this->wsidx, "wsidx", this->generate_data({1})); + this->add(this->layout, "layout", this->generate_data({0, 1})); } }; diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp index 231b635a63..69c19478ae 100644 --- a/test/pooling_common.hpp +++ b/test/pooling_common.hpp @@ -60,7 +60,17 @@ static int num_uint64_case = 0; // NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) static int num_uint64_case_imgidx = 0; -static inline void print(const miopen::PoolingDescriptor& filter) +namespace { + +constexpr int RAND_INTEGER_MAX = 1200; +constexpr int RAND_INTEGER_MIN = -880; + +template +auto gen_value = + [](auto... is) { return static_cast(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX)) / 10; }; +} + +static inline void print(const miopen::PoolingDescriptor& filter, bool is_default_layout) { std::cout << "Pooling: "; if(filter.GetMode() == miopenPoolingAverage) @@ -70,6 +80,7 @@ static inline void print(const miopen::PoolingDescriptor& filter) else std::cout << "Max"; std::cout << std::endl; + std::cout << "Layout: " << (is_default_layout ? "default" : "transposed") << std::endl; // TEMPCODE RJS std::cout << "Lengths: "; miopen::LogRange(std::cout, filter.GetLengths(), ", ") << std::endl; std::cout << "Pads: "; @@ -84,13 +95,24 @@ tensor get_output_tensor(const miopen::PoolingDescriptor& filter, const tenso return tensor{filter.GetForwardOutputTensor(input.desc)}; } +template +tensor get_big_output_tensor(const miopen::PoolingDescriptor& filter, const tensor& input) +{ + auto desc = filter.GetForwardOutputTensor(input.desc); + auto lens = desc.GetLengths(); + lens[0] *= 10; + auto big = miopen::TensorDescriptor{desc.GetType(), input.desc.GetLayout_t(), lens, desc.GetStrides()}; + std::cout << "get_big_output_tensor: " << input.desc.GetLayout_str() << " " << desc.GetLayout_str() << std::endl; + return tensor{big}; +} + template struct pooling_operators { miopen::PoolingDescriptor filter; pooling_operators(miopen::PoolingDescriptor f) : filter(f) {} - double start() const + double initialize() const { if(filter.GetMode() == miopenPoolingMax) return std::numeric_limits::lowest(); @@ -111,7 +133,7 @@ struct pooling_operators } } - double final(double x, double y) + double finalize(double x, double y) { if(filter.GetMode() == miopenPoolingMax) return (x); @@ -120,6 +142,8 @@ struct pooling_operators } }; +#include + template struct verify_forward_pooling { @@ -127,22 +151,56 @@ struct verify_forward_pooling tensor cpu(const tensor& input, const miopen::PoolingDescriptor& filter, std::vector&) const { + const bool is_default_layout = input.desc.IsDefaultLayout(); + const int sptl_dim_offset = 2; // is_default_layout ? 2 : 1; TEMPCODE RJS + const int chan_dim_offset = 1; // is_default_layout ? 1 : SptDim + 1; + auto out = get_output_tensor(filter, input); std::array in_dim{}; - std::copy_n(input.desc.GetLengths().begin() + 2, SptDim, in_dim.begin()); + std::copy_n(input.desc.GetLengths().begin() + sptl_dim_offset, SptDim, in_dim.begin()); std::array strides{}; std::copy_n(filter.GetStrides().begin(), SptDim, strides.begin()); std::array pads{}; std::copy_n(filter.GetPads().begin(), SptDim, pads.begin()); std::array kers{}; std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin()); - auto op = pooling_operators{filter}; + auto pooler = pooling_operators{filter}; + + // TEMPCODE RJS print input tensor + bool printing = true; // in_dim[0]==8 && in_dim[1]==8; + if(printing) + { + auto inlen = input.desc.GetLengths(); + auto instr = input.desc.GetStrides(); + std::cout << "CPU in : m" << filter.GetMode() << " t" << input.desc.GetType() << " | "; + for(auto dim : inlen) std::cout << std::setw(4) << dim; + std::cout << " | "; + for(auto str : instr) std::cout << std::setw(4) << str; + std::cout << " | "; + for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str; + std::cout << " | "; + for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str; + std::cout << " | "; + for(auto str : filter.GetPads()) std::cout << std::setw(4) << str; + std::cout << std::endl; + + for(int nn = 0; nn < inlen[0]; ++nn) { + for(int cc = 0; cc < inlen[1]; ++cc) { + for(int hh = 0; hh < inlen[2]; ++hh) { + for(int ww = 0; ww < inlen[3]; ++ww) { + std::cout << std::setw(11) << std::setprecision(5) << input.data[input.desc.GetIndex(nn, cc, hh, ww)] << " "; + } + std::cout << std::endl; + } + } + } + } int b_n = out.desc.GetLengths()[0]; - int k_n = out.desc.GetLengths()[1]; + int k_n = out.desc.GetLengths()[chan_dim_offset]; std::array out_spatial_len{}; - std::copy_n(out.desc.GetLengths().begin() + 2, SptDim, out_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + sptl_dim_offset, SptDim, out_spatial_len.begin()); auto par_ford_out = miopen::unpacker(miopen::prepender(par_ford, b_n, k_n))(out_spatial_len); @@ -167,27 +225,53 @@ struct verify_forward_pooling ? std::accumulate(kers.begin(), kers.end(), 1, std::multiplies()) : std::accumulate(win_sz.begin(), win_sz.end(), 1, std::multiplies()); - double acc = op.start(); + double acc = pooler.initialize(); miopen::unpacker(ford)(win_sz)([&](auto... in_spatial_id_pack) { auto in_spatial_id = make_array(in_spatial_id_pack...); std::array idx{}; idx[0] = o; - idx[1] = w; + idx[chan_dim_offset] = w; bool in_cmp_idx = true; for(int i = 0; i < SptDim; ++i) { - idx[i + 2] = start_idx[i] + in_spatial_id[i]; - in_cmp_idx &= (in_dim[i] > idx[i + 2]); + idx[i + sptl_dim_offset] = start_idx[i] + in_spatial_id[i]; + in_cmp_idx &= (in_dim[i] > idx[i + sptl_dim_offset]); } if(in_cmp_idx) { - acc = op(acc, input(idx)); + acc = pooler(acc, input(idx)); } }); - out(o, w, out_spatial_id_pack...) = T(op.final(acc, pool_size)); + out(o, w, out_spatial_id_pack...) = T(pooler.finalize(acc, pool_size)); }); + if(printing) + { + std::cout << "CPU out: "; + auto outlen = out.desc.GetLengths(); + auto outstr = out.desc.GetStrides(); + for(auto dim : outlen) std::cout << std::setw(4) << dim; + std::cout << " | "; + for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str; + std::cout << " | "; + for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str; + std::cout << " | "; + for(auto str : filter.GetPads()) std::cout << std::setw(4) << str; + std::cout << std::endl; + + for(int nn = 0; nn < outlen[0]; ++nn) { + for(int cc = 0; cc < outlen[1]; ++cc) { + for(int hh = 0; hh < outlen[2]; ++hh) { + for(int ww = 0; ww < outlen[3]; ++ww) { + std::cout << std::setw(11) << std::setprecision(5) << out.data[nn * outstr[0] + cc * outstr[1] + hh * outstr[2] + ww * outstr[3]] << " "; + } + std::cout << std::endl; + } + } + } + } // print output tensor + return out; } @@ -197,11 +281,16 @@ struct verify_forward_pooling std::vector& indices) const { auto&& handle = get_handle(); - auto out = get_output_tensor(filter, input); + auto out = get_big_output_tensor(filter, input); // TEMPCODE RJS + indices.resize(out.data.size(), 0); auto in_dev = handle.Write(input.data); auto out_dev = handle.Create(out.data.size()); + auto junk_dev = handle.Create(out.data.size()); // + std::cout << "gpu Sizes: in:" << input.data.size() << " out: " << out.data.size() << " " << out.desc.GetLayout_str() + << " " << out.desc.GetLengths()[0] << " " << out.desc.GetLengths()[1] << " " << out.desc.GetLengths()[2] << " " << out.desc.GetLengths()[3] + << " " << out.desc.GetStrides()[0] << " " << out.desc.GetStrides()[1] << " " << out.desc.GetStrides()[2] << " " << out.desc.GetStrides()[3] << std::endl; Workspace wspace{}; wspace.Write(indices); @@ -215,10 +304,38 @@ struct verify_forward_pooling out_dev.get(), true, wspace.ptr(), - wspace.size()); + wspace.size(), + junk_dev.get()); // TEMPCODE RJS indices = wspace.Read>(); out.data = handle.Read(out_dev, out.data.size()); + if(true) + { + std::cout << "GPU out: "; + auto outlen = out.desc.GetLengths(); + for(auto dim : outlen) std::cout << std::setw(4) << dim; + std::cout << " | "; + auto outstr = out.desc.GetStrides(); + for(auto dim : outstr) std::cout << std::setw(4) << dim; + std::cout << " | "; + for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str; + std::cout << " | "; + for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str; + std::cout << " | "; + for(auto str : filter.GetPads()) std::cout << std::setw(4) << str; + std::cout << std::endl; + + for(int nn = 0; nn < outlen[0]; ++nn) { + for(int cc = 0; cc < outlen[1]; ++cc) { + for(int hh = 0; hh < outlen[2]; ++hh) { + for(int ww = 0; ww < outlen[3]; ++ww) { + std::cout << std::setw(11) << std::setprecision(5) << out.data[out.desc.GetIndex(nn, cc, hh, ww)] << " "; + } + std::cout << std::endl; + } + } + } + } // print output tensor return out; } @@ -229,7 +346,7 @@ struct verify_forward_pooling const std::vector&) const { std::cout << "Forward "; - print(filter); + print(filter, input.desc.IsDefaultLayout()); std::cout << "Input tensor: " << input.desc.ToString() << std::endl; std::cout << "Output tensor: " << filter.GetForwardOutputTensor(input.desc).ToString() << std::endl; @@ -248,7 +365,12 @@ struct verify_backward_pooling bool use_global_index, bool verify_index) const { + const bool is_default_layout = input.desc.IsDefaultLayout(); + const int sptl_dim_offset = is_default_layout ? 2 : 1; + const int chan_dim_offset = is_default_layout ? 1 : SptDim + 1; + auto dinput = input; + std::vector din_vec(input.desc.GetElementSpace(), 0.0); CHECK(dout.desc == out.desc); std::array in_dim{}; @@ -264,9 +386,9 @@ struct verify_backward_pooling auto ford_ker = miopen::unpacker(ford)(kers); int out_n = out.desc.GetLengths()[0]; - int out_c = out.desc.GetLengths()[1]; + int out_c = out.desc.GetLengths()[chan_dim_offset]; std::array out_spatial_len{}; - std::copy_n(out.desc.GetLengths().begin() + 2, SptDim, out_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + sptl_dim_offset, SptDim, out_spatial_len.begin()); auto ford_out = miopen::unpacker(ford)(out_spatial_len); par_ford(out_n, out_c)([&](int o, int w) { @@ -281,12 +403,12 @@ struct verify_backward_pooling for(int i = 0; i < SptDim; i++) { std::size_t mx_idx_dim = mx_idx; - mx_idx_dim /= std::accumulate(in_dim.begin() + i + 3, + mx_idx_dim /= std::accumulate(in_dim.begin() + sptl_dim_offset + i + 1, in_dim.end(), 1ULL, std::multiplies()); - mx_idx_dim %= in_dim[i + 2]; - idx[i + 2] = mx_idx_dim; + mx_idx_dim %= in_dim[i + sptl_dim_offset]; + idx[i + sptl_dim_offset] = mx_idx_dim; } } else @@ -441,7 +563,7 @@ struct verify_backward_pooling bool) const { std::cout << "Backward "; - print(filter); + print(filter, input.desc.IsDefaultLayout()); std::cout << "Input tensor: " << input.desc.ToString() << std::endl; std::cout << "Output tensor: " << out.desc.ToString() << std::endl; } @@ -462,6 +584,7 @@ struct pooling_driver : test_driver #endif int verify_indices{}; int wsidx{}; + miopenTensorLayout_t layout{}; std::unordered_map index_type_lookup = { {miopen::ToUpper("miopenIndexUint8"), miopenIndexUint8}, {miopen::ToUpper("miopenIndexUint16"), miopenIndexUint16}, @@ -505,49 +628,92 @@ struct pooling_driver : test_driver add(verify_indices, "verify_indices", generate_data({1})); } - template + template void run_impl() { std::vector indices{}; - auto input = tensor{in_shape}.generate( - tensor_elem_gen_integer{miopen_type{} == miopenHalf ? 5 : 17}); - auto out = verify(verify_forward_pooling{}, input, filter, indices); - auto dout = out.first; - dout.generate(tensor_elem_gen_integer{2503}); - verify(verify_backward_pooling{}, - input, - dout, - out.first, - filter, - indices, - wsidx != 0, - static_cast(this->verify_indices)); + auto input = tensor{layout, in_shape}; + for(auto& v : input.data) v = gen_value(); + + // TEMPCODE RJS print input tensor + bool printing = true; // in_dim[0]==8 && in_dim[1]==8; + if(printing) + { + auto inlen = input.desc.GetLengths(); + auto instr = input.desc.GetStrides(); + std::cout << "CPU GEN : " << input.desc.GetLayout_str() << "(" << inlen.size() << ") | " << input.data.size() << " | " << input.desc.GetElementSpace() << " | "; + for(auto dim : inlen) std::cout << std::setw(4) << dim; + std::cout << " | "; + for(auto str : instr) std::cout << std::setw(4) << str; + std::cout << " | "; + for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str; + std::cout << " | "; + for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str; + std::cout << " | "; + for(auto str : filter.GetPads()) std::cout << std::setw(4) << str; + std::cout << std::endl; + + for(int nn = 0; nn < inlen[0]; ++nn) { + for(int cc = 0; cc < inlen[1]; ++cc) { + for(int hh = 0; hh < inlen[2]; ++hh) { + for(int ww = 0; ww < inlen[3]; ++ww) {// nn * instr[0] + cc * instr[1] + hh * instr[2] + ww * instr[3] + std::cout << std::setw(11) << std::setprecision(5) << input.data[input.desc.GetIndex(nn, cc, hh, ww)] << " "; + } + std::cout << std::endl; + } + } + } + } + + auto out = verify(verify_forward_pooling{}, + input, + filter, + indices); + + // auto dout = out.first; + // dout.generate(tensor_elem_gen_integer{2503}); + // verify(verify_backward_pooling{}, // TEMPCODE RJS no backward + // input, + // dout, + // out.first, + // filter, + // indices, + // wsidx != 0, + // static_cast(this->verify_indices)); } void run() { + if(miopen::ToUpper(mode) == "MAX") return; // TEMPCODE RJS skip all except max, do max only + auto idx_typ = index_type_lookup.at(miopen::ToUpper(index_type)); auto idx_sz = sizeof(uint8_t); - int spt_dim = in_shape.size() - 2; + int sptl_dim = in_shape.size() - 2; const bool skip_many_configs_with_non_int8_index = (dataset_id == 0) && full_set; // Otherwise the default dataset takes too much time. const bool wide_dataset = (dataset_id == 2) && full_set; + // Input dimensions to the driver are always NCHW-style + const bool is_default_layout = !(layout == miopenTensorNHWC || layout == miopenTensorNDHWC); + std::cout << "################## pooling_driver run layout=" << (int)layout << " is_default=" << (int)is_default_layout << std::endl; + filter = miopen::PoolingDescriptor { mode_lookup.at(miopen::ToUpper(mode)), #if TEST_PADDING_MODE == 1 - pmode_lookup.at(miopen::ToUpper(pmode)), + pmode_lookup.at(miopen::ToUpper(pmode)), #else - miopenPaddingDefault, + miopenPaddingDefault, #endif - lens, strides, pads + lens, + strides, + pads }; filter.SetIndexType(idx_typ); filter.SetWorkspaceIndexMode(miopenPoolingWorkspaceIndexMode_t(wsidx)); - if(wsidx == 0 && spt_dim == 3 && filter.GetMode() == miopenPoolingMax && full_set) + if(wsidx == 0 && sptl_dim == 3 && filter.GetMode() == miopenPoolingMax && full_set) { show_command(); std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented " @@ -556,7 +722,7 @@ struct pooling_driver : test_driver return; } - if(wsidx == 0 && spt_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset) + if(wsidx == 0 && sptl_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset) { show_command(); std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented " @@ -587,12 +753,12 @@ struct pooling_driver : test_driver /// the "full test" is ran. See: /// \ref max_pooling_index_max_restriction case miopenIndexUint8: { - if((spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) && full_set && + if((sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) && full_set && filter.GetMode() == miopenPoolingMax) { show_command(); std::cout << "Warning: Config skipped: uint8 index is too small " - "(spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) " + "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) " "&& filter.GetMode() == miopenPoolingMax" << std::endl; return; @@ -600,12 +766,12 @@ struct pooling_driver : test_driver break; } case miopenIndexUint16: { - if((spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) && full_set && + if((sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) && full_set && filter.GetMode() == miopenPoolingMax) { show_command(); std::cout << "Warning: Config skipped: uint16 index is too small " - "(spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) " + "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) " "&& filter.GetMode() == miopenPoolingMax" << std::endl; return; @@ -675,12 +841,12 @@ struct pooling_driver : test_driver } else { - if(num_uint64_case_imgidx > 5 && spt_dim == 2) + if(num_uint64_case_imgidx > 5 && sptl_dim == 2) { show_command(); std::cout << "Warning: Config skipped to speed up testing of the " "default dataset (wsidx != 0) && (num_uint64_case_imgidx > 5 " - "&& spt_dim == 2)" + "&& sptl_dim == 2)" << std::endl; return; } @@ -692,18 +858,18 @@ struct pooling_driver : test_driver } } - auto input_desc = miopen::TensorDescriptor(this->type, in_shape); + auto input_desc = miopen::TensorDescriptor(this->type, layout, in_shape); - if(spt_dim != 2 && spt_dim != 3) + if(sptl_dim != 2 && sptl_dim != 3) { show_command(); std::cout << "Warning: Config skipped becuse it is not supported " // - "(spt_dim != 2 && spt_dim != 3)" + "(sptl_dim != 2 && sptl_dim != 3)" << std::endl; return; } - for(int i = 0; i < spt_dim; i++) + for(int i = 0; i < sptl_dim; i++) { if(lens[i] > (input_desc.GetLengths()[i + 2] + static_cast(2) * pads[i])) { @@ -733,9 +899,13 @@ struct pooling_driver : test_driver } } - std::vector in_dim(input_desc.GetLengths().begin() + 2, input_desc.GetLengths().end()); - std::vector out_dim(spt_dim); + int sptl_index = is_default_layout ? 2 : 1; + + std::vector in_dim(input_desc.GetLengths().begin() + sptl_index, + input_desc.GetLengths().begin() + sptl_index + sptl_dim); + std::vector out_dim(sptl_dim); std::vector ker_dim(filter.GetLengths().begin(), filter.GetLengths().end()); + #if TEST_PADDING_MODE == 1 if(filter.pmode == miopenPaddingSame) { @@ -743,7 +913,7 @@ struct pooling_driver : test_driver return i == 0; })) return; - for(int i = 0; i < spt_dim; i++) + for(int i = 0; i < sptl_dim; i++) { filter.pads[i] = ((in_dim[i] % filter.GetStrides()[i] == 0) @@ -763,7 +933,7 @@ struct pooling_driver : test_driver return i == 0; })) return; - for(int i = 0; i < spt_dim; i++) + for(int i = 0; i < sptl_dim; i++) { filter.pads[i] = 0; @@ -778,7 +948,7 @@ struct pooling_driver : test_driver switch(filter.GetIndexType()) { case miopenIndexUint8: { - if(spt_dim == 3) + if(sptl_dim == 3) { run_impl(); } @@ -789,7 +959,7 @@ struct pooling_driver : test_driver break; } case miopenIndexUint16: { - if(spt_dim == 3) + if(sptl_dim == 3) { run_impl(); } @@ -800,7 +970,7 @@ struct pooling_driver : test_driver break; } case miopenIndexUint32: { - if(spt_dim == 3) + if(sptl_dim == 3) { run_impl(); } @@ -811,7 +981,7 @@ struct pooling_driver : test_driver break; } case miopenIndexUint64: { - if(spt_dim == 3) + if(sptl_dim == 3) { run_impl(); } From 04ab85c450470ee5576fa8aa3ab0de6e8864180d Mon Sep 17 00:00:00 2001 From: "Randy J. Spaulding" Date: Wed, 21 Aug 2024 09:49:16 +0000 Subject: [PATCH 03/10] naive kernel works --- .../MIOpenPoolingForwardNDNhwcNaive.cpp | 401 +++++++++--------- src/solver/pooling/forwardNdNhwcNaive.cpp | 19 +- test/pooling_common.hpp | 42 +- 3 files changed, 235 insertions(+), 227 deletions(-) diff --git a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp index 970104b3b1..60aa2aac23 100644 --- a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp +++ b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp @@ -39,8 +39,8 @@ #define CVT_FP32_2ACCUM(x) (x) #endif -// #define _FLOAT float -// #define _FLOAT_ACCUM _FLOAT +#define _FLOAT float +#define _FLOAT_ACCUM _FLOAT #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS #include @@ -83,7 +83,7 @@ #define UU #endif -#define doUU1 1 +#define doUU1 0 #if doUU1 #define UU1 __attribute__((__unused__)) #else @@ -132,227 +132,220 @@ __device__ void poolingForwardNDNhwcNaive(UU1 const TI* __restrict__ bot_ptr, UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_h_stride, UU ARG_UNUSED_FOR_AVERAGE size_t mask_w_stride) { - const uint32_t nn = blockIdx.x / top_d; // N=slow index - if(!(nn < all_n)) - return; - - const uint32_t td = blockIdx.x % top_d; // top D=fast index - if(td >= top_d) - return; - - const uint32_t th = blockIdx.y; // top H - // const uint32_t j = (gridDim.y == 1) ? threadIdx.y : blockIdx.y; // top H - if(th >= top_h) - return; - - const uint32_t tw = blockIdx.z % top_w; // top W=fast index - if(tw >= top_w) - return; + auto log_ptr = junk_ptr; if(nn == 0 && td == 0 && th == 0 && tw == 0) { int idx = 0; - top_ptr[idx++] = gridDim.x; - top_ptr[idx++] = gridDim.y; - top_ptr[idx++] = gridDim.z; - top_ptr[idx++] = -9; - - top_ptr[idx++] = blockDim.x; - top_ptr[idx++] = blockDim.y; - top_ptr[idx++] = blockDim.z; - top_ptr[idx++] = -8; - - top_ptr[idx++] = filter_d; - top_ptr[idx++] = filter_h; - top_ptr[idx++] = filter_w; - top_ptr[idx++] = -7; - - top_ptr[idx++] = filter_d_stride; - top_ptr[idx++] = filter_h_stride; - top_ptr[idx++] = filter_w_stride; - top_ptr[idx++] = -6; - - top_ptr[idx++] = filter_d_pad; - top_ptr[idx++] = filter_h_pad; - top_ptr[idx++] = filter_w_pad; - top_ptr[idx++] = -5; - - top_ptr[idx++] = all_n; - top_ptr[idx++] = all_c; - top_ptr[idx++] = bot_n_stride; - top_ptr[idx++] = bot_c_stride; - - top_ptr[idx++] = top_n_stride; - top_ptr[idx++] = top_c_stride; + log_ptr[idx++] = gridDim.x; + log_ptr[idx++] = gridDim.y; + log_ptr[idx++] = gridDim.z; + log_ptr[idx++] = -9; + + log_ptr[idx++] = blockDim.x; + log_ptr[idx++] = blockDim.y; + log_ptr[idx++] = blockDim.z; + log_ptr[idx++] = -8; + + log_ptr[idx++] = filter_d; + log_ptr[idx++] = filter_h; + log_ptr[idx++] = filter_w; + log_ptr[idx++] = -7; + + log_ptr[idx++] = filter_d_stride; + log_ptr[idx++] = filter_h_stride; + log_ptr[idx++] = filter_w_stride; + log_ptr[idx++] = -6; + + log_ptr[idx++] = filter_d_pad; + log_ptr[idx++] = filter_h_pad; + log_ptr[idx++] = filter_w_pad; + log_ptr[idx++] = -5; + + log_ptr[idx++] = all_n; + log_ptr[idx++] = all_c; + log_ptr[idx++] = bot_n_stride; + log_ptr[idx++] = bot_c_stride; + + log_ptr[idx++] = top_n_stride; + log_ptr[idx++] = top_c_stride; #if AVERAGE_OPS - top_ptr[idx++] = -4; - top_ptr[idx++] = -4; + log_ptr[idx++] = -4; + log_ptr[idx++] = -4; #else - top_ptr[idx++] = mask_n_stride; - top_ptr[idx++] = mask_c_stride; + log_ptr[idx++] = mask_n_stride; + log_ptr[idx++] = mask_c_stride; #endif - top_ptr[idx++] = bot_d; - top_ptr[idx++] = bot_h; - top_ptr[idx++] = bot_w; - top_ptr[idx++] = -3; + log_ptr[idx++] = bot_d; + log_ptr[idx++] = bot_h; + log_ptr[idx++] = bot_w; + log_ptr[idx++] = -3; - top_ptr[idx++] = bot_d_stride; - top_ptr[idx++] = bot_h_stride; - top_ptr[idx++] = bot_w_stride; - top_ptr[idx++] = -2; + log_ptr[idx++] = bot_d_stride; + log_ptr[idx++] = bot_h_stride; + log_ptr[idx++] = bot_w_stride; + log_ptr[idx++] = -2; - top_ptr[idx++] = top_d; - top_ptr[idx++] = top_h; - top_ptr[idx++] = top_w; - top_ptr[idx++] = -1; + log_ptr[idx++] = top_d; + log_ptr[idx++] = top_h; + log_ptr[idx++] = top_w; + log_ptr[idx++] = -1; - top_ptr[idx++] = top_d_stride; - top_ptr[idx++] = top_h_stride; - top_ptr[idx++] = top_w_stride; - top_ptr[idx++] = -9; + log_ptr[idx++] = top_d_stride; + log_ptr[idx++] = top_h_stride; + log_ptr[idx++] = top_w_stride; + log_ptr[idx++] = -9; #if AVERAGE_OPS - top_ptr[idx++] = -8; - top_ptr[idx++] = -8; - top_ptr[idx++] = -8; + log_ptr[idx++] = -8; + log_ptr[idx++] = -8; + log_ptr[idx++] = -8; #else - top_ptr[idx++] = mask_d_stride; - top_ptr[idx++] = mask_h_stride; - top_ptr[idx++] = mask_w_stride; + log_ptr[idx++] = mask_d_stride; + log_ptr[idx++] = mask_h_stride; + log_ptr[idx++] = mask_w_stride; #endif - top_ptr[idx++] = -7; + log_ptr[idx++] = -7; } + const uint32_t nn = blockIdx.x / top_d; // N=slow index + if(nn >= all_n) + return; + + const uint32_t td = blockIdx.x % top_d; // top D=fast index + if(td >= top_d) + return; + + const uint32_t th = blockIdx.y; // top H + const uint32_t j = (gridDim.y == 1) ? threadIdx.y : blockIdx.y; // top H + if(th >= top_h) + return; + const uint32_t tw = blockIdx.z % top_w; // top W=fast index + if(tw >= top_w) + return; +if(true) { uint32_t cc = 0; - // const auto c_base = (blockDim.x == all_c) ? 0 : (blockIdx.z / top_w) * blockDim.x; - size_t top_index = 64 - + nn * top_n_stride // TEMPCODE RJS + size_t top_index = + nn * top_n_stride // TEMPCODE RJS + cc * top_c_stride // + (size_t)(td * top_d_stride) // + (size_t)(th * top_h_stride) // + (size_t)(tw * top_w_stride); - top_ptr[top_index] = top_index; - -// const auto int_dstart = static_cast(td * filter_d_stride) - static_cast(filter_d_pad); -// const auto dend = static_cast(min(int_dstart + static_cast(filter_d), static_cast(bot_d))); -// const auto dstart = static_cast(max(int_dstart, 0)); - -// const auto int_hstart = static_cast(th * filter_h_stride) - static_cast(filter_h_pad); -// const auto hend = static_cast(min(int_hstart + static_cast(filter_h), static_cast(bot_h))); -// const auto hstart = static_cast(max(int_hstart, 0)); - -// const auto int_wstart = static_cast(tw * filter_w_stride) - static_cast(filter_w_pad); -// const auto wend = static_cast(min(int_wstart + static_cast(filter_w), static_cast(bot_w))); -// const auto wstart = static_cast(max(int_wstart, 0)); -// out_ptr[0] = 1.2; -// for(uint32_t cc = 0; cc < 1; ++cc) // top C loop -// { -// if(cc >= all_c) return; -// { -// size_t top_index = nn * top_n_stride // TEMPCODE RJS -// + cc * top_c_stride // -// + (size_t)(td * top_d_stride) // -// + (size_t)(th * top_h_stride) // -// + (size_t)(tw * top_w_stride); -// top_index = 1; - -// junk_ptr[top_index] = (_FLOAT)1.1; -// if(j != 1) -// } -// #if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE -// uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); -// pool_size = (pool_size == 0) ? 1 : pool_size; -// #elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE -// const uint32_t pool_size = filter_d * filter_h * filter_w; -// #endif - -// #if AVERAGE_OPS -// _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0); -// #else // MAX -// _FLOAT_ACCUM res = (_FLOAT_ACCUM)(-MAX_VAL_ACCUM); -// bool found = false; // May remain false if bot contains only NaNs/-INFs. -// uint32_t d_save = 0; -// uint32_t h_save = 0; -// uint32_t w_save = 0; -// #endif -// for(size_t bd = dstart; bd < dend; ++bd) -// { -// for(uint32_t bh = hstart; bh < hend; ++bh) -// { -// for(uint32_t bw = wstart; bw < wend; ++bw) -// { -// const size_t bot_index = nn * bot_n_stride + // -// cc * bot_c_stride + // -// bd * bot_d_stride + // -// static_cast(bh * bot_h_stride) + // -// static_cast(bw * bot_w_stride); -// #if AVERAGE_OPS -// res += static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]); -// #else // MAX -// if(static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]) > res) -// { -// res = bot_ptr[bot_index]; -// if(save_index) -// { -// found = true; -// d_save = bd; -// h_save = bh; -// w_save = bw; -// } -// } -// #endif -// } -// } -// } - -// #if AVERAGE_OPS -// res *= CVT_FP32_2ACCUM(1.f) / static_cast<_FLOAT_ACCUM>(pool_size); -// #else // MAX -// res *= 1.0; // TEMPCODE RJS fix UNUSED -// if(save_index) -// { -// index_t res_index = 0; - -// / Preventing overflow during computation of res_index: -// / If Index is shorter than uint, then let's perform computation in 32-bit -// / domain and then convert to narrower Index. That would reduce the probability of -// / overflow. If Index is wider then 32 bits, then it seems like it is better to -// / convert to Index type before multiplication. However this is not actually -// / necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in -// / 32 bits and then convert. - -// if(found) -// { -// if(index_mode == 1) -// res_index = (index_t)(d_save * bot_h * bot_w // -// + h_save * bot_w // -// + w_save); -// else -// res_index = (index_t)( // -// ((d_save - td * filter_d_stride + filter_d_pad) * filter_h * filter_w) // -// + ((h_save - th * filter_h_stride + filter_h_pad) * filter_w) // -// + (w_save - tw * filter_w_stride + filter_w_pad) // -// ); -// } - -// const size_t mask_index = nn * mask_n_stride // -// + cc * mask_c_stride // -// + (size_t)(td * mask_d_stride) // -// + (size_t)(tw * mask_h_stride) // -// + (size_t)(th * mask_w_stride); -// mask_ptr[mask_index] = res_index; -// } -// #endif -// size_t top_index = nn * top_n_stride // -// + cc * top_c_stride // -// + (size_t)(td * top_d_stride) // -// + (size_t)(th * top_h_stride) // -// + (size_t)(tw * top_w_stride); -// top_index = 1; -// junk_ptr[top_index] = (_FLOAT)res; // TEMPCODE RJS -// } + junk_ptr[top_index] = top_index; +} +if(true) { // TEMPCODE RJS + const auto int_dstart = static_cast(td * filter_d_stride) - static_cast(filter_d_pad); + const auto dend = static_cast(min(int_dstart + static_cast(filter_d), static_cast(bot_d))); + const auto dstart = static_cast(max(int_dstart, 0)); + + const auto int_hstart = static_cast(th * filter_h_stride) - static_cast(filter_h_pad); + const auto hend = static_cast(min(int_hstart + static_cast(filter_h), static_cast(bot_h))); + const auto hstart = static_cast(max(int_hstart, 0)); + + const auto int_wstart = static_cast(tw * filter_w_stride) - static_cast(filter_w_pad); + const auto wend = static_cast(min(int_wstart + static_cast(filter_w), static_cast(bot_w))); + const auto wstart = static_cast(max(int_wstart, 0)); + // const auto c_base = (blockDim.x == all_c) ? 0 : (blockIdx.z / top_w) * blockDim.x; + + for(uint32_t cc = 0; cc < 1; ++cc) // top C loop + { + if(cc >= all_c) return; + +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE + uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + pool_size = (pool_size == 0) ? 1 : pool_size; +#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE + const uint32_t pool_size = filter_d * filter_h * filter_w; +#endif + +#if AVERAGE_OPS + _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0); +#else // MAX + _FLOAT_ACCUM res = (_FLOAT_ACCUM)(-MAX_VAL_ACCUM); + bool found = false; // May remain false if bot contains only NaNs/-INFs. + uint32_t d_save = 0; + uint32_t h_save = 0; + uint32_t w_save = 0; +#endif + for(size_t bd = dstart; bd < dend; ++bd) + { + for(uint32_t bh = hstart; bh < hend; ++bh) + { + for(uint32_t bw = wstart; bw < wend; ++bw) + { + const size_t bot_index = nn * bot_n_stride + // + cc * bot_c_stride + // + bd * bot_d_stride + // + static_cast(bh * bot_h_stride) + // + static_cast(bw * bot_w_stride); +#if AVERAGE_OPS + res += static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]); +#else // MAX + if(static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]) > res) + { + res = bot_ptr[bot_index]; + if(save_index) + { + found = true; + d_save = bd; + h_save = bh; + w_save = bw; + } + } +#endif + } + } + } + +#if AVERAGE_OPS + res *= CVT_FP32_2ACCUM(1.f) / static_cast<_FLOAT_ACCUM>(pool_size); +#else // MAX +res *= 1.0; // TEMPCODE RJS fix UNUSED + if(save_index) + { + index_t res_index = 0; + + // / Preventing overflow during computation of res_index: + // / If Index is shorter than uint, then let's perform computation in 32-bit + // / domain and then convert to narrower Index. That would reduce the probability of + // / overflow. If Index is wider then 32 bits, then it seems like it is better to + // / convert to Index type before multiplication. However this is not actually + // / necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in + // / 32 bits and then convert. + + if(found) + { + if(index_mode == 1) + res_index = (index_t)(d_save * bot_h * bot_w // + + h_save * bot_w // + + w_save); + else + res_index = (index_t)( // + ((d_save - td * filter_d_stride + filter_d_pad) * filter_h * filter_w) // + + ((h_save - th * filter_h_stride + filter_h_pad) * filter_w) // + + (w_save - tw * filter_w_stride + filter_w_pad) // + ); + } + + const size_t mask_index = nn * mask_n_stride // + + cc * mask_c_stride // + + (size_t)(td * mask_d_stride) // + + (size_t)(tw * mask_h_stride) // + + (size_t)(th * mask_w_stride); + mask_ptr[mask_index] = res_index; + } +#endif + size_t top_index = nn * top_n_stride // + + cc * top_c_stride // + + (size_t)(td * top_d_stride) // + + (size_t)(th * top_h_stride) // + + (size_t)(tw * top_w_stride); + + top_ptr[top_index] = (_FLOAT)res; // TEMPCODE RJS + } +} // TEMPCODE } extern "C" __global__ void mloPoolingForwardNDNhwcNaive( @@ -368,10 +361,10 @@ extern "C" __global__ void mloPoolingForwardNDNhwcNaive( uint32_t all_n, uint32_t all_c, uint32_t bot_d, uint32_t bot_h, uint32_t bot_w, - size_t bot_n_stride, size_t bot_c_stride, uint32_t bot_d_stride, uint32_t bot_h_stride, uint32_t bot_w_stride, + size_t bot_n_stride, uint32_t bot_c_stride, size_t bot_d_stride, uint32_t bot_h_stride, uint32_t bot_w_stride, uint32_t top_d, uint32_t top_h, uint32_t top_w, - size_t top_n_stride, size_t top_c_stride, uint32_t top_d_stride, uint32_t top_h_stride, uint32_t top_w_stride, - size_t mask_n_stride, uint32_t mask_c_stride, uint32_t mask_d_stride, uint32_t mask_h_stride, size_t mask_w_stride) + size_t top_n_stride, uint32_t top_c_stride, size_t top_d_stride, uint32_t top_h_stride, uint32_t top_w_stride, + size_t mask_n_stride, size_t mask_c_stride, uint32_t mask_d_stride, uint32_t mask_h_stride, uint32_t mask_w_stride) { poolingForwardNDNhwcNaive( bot_ptr, diff --git a/src/solver/pooling/forwardNdNhwcNaive.cpp b/src/solver/pooling/forwardNdNhwcNaive.cpp index c946fd8da7..70e4d998cf 100644 --- a/src/solver/pooling/forwardNdNhwcNaive.cpp +++ b/src/solver/pooling/forwardNdNhwcNaive.cpp @@ -177,21 +177,24 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context, const auto spatial_dim = is2d ? 2U : 3U; - size_t all_n, all_c, bot_d, bot_h, bot_w; + uint32_t all_n, all_c, bot_d, bot_h, bot_w; std::tie(all_n, all_c, bot_d, bot_h, bot_w) = miopen::GetNCDHW(spatial_dim, bot.GetLengths()); + std::cout << "GetSol: bot_lens " << all_n << " " << all_c << " " << bot_d << " " << bot_h << " " << bot_w << std::endl; size_t bot_n_stride, bot_d_stride; - size_t bot_h_stride, bot_w_stride, bot_c_stride; + uint32_t bot_h_stride, bot_w_stride, bot_c_stride; std::tie(bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride, bot_w_stride) = miopen::GetNCDHW(spatial_dim, bot.GetStrides()); +std::cout << "GetSol: bot_strides " << bot_n_stride << " " << bot_c_stride << " " << bot_d_stride +<< " " << bot_h_stride << " " << bot_w_stride << std::endl; - size_t a1, a2, top_d, top_h, top_w; - std::tie(a1, a2, top_d, top_h, top_w) = + uint32_t top_d, top_h, top_w; + std::tie(std::ignore, std::ignore, top_d, top_h, top_w) = miopen::GetNCDHW(spatial_dim, top.GetLengths()); - std::cout << "GetSol: top_lens " << a1 << " " << a2 << " " << top_d << " " << top_h << " " << top_w << std::endl; + std::cout << "GetSol: top_lens " << top_d << " " << top_h << " " << top_w << std::endl; size_t top_n_stride, top_d_stride; - size_t top_h_stride, top_w_stride, top_c_stride; + uint32_t top_h_stride, top_w_stride, top_c_stride; std::tie(top_n_stride, top_c_stride, top_d_stride, top_h_stride, top_w_stride) = miopen::GetNCDHW(spatial_dim, top.GetStrides()); // TEMPCODE RJS @@ -296,8 +299,8 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context, kernel.l_wk.push_back(l2); // TEMPCODE RJS - std::cout << "Kernel dims: " << kernel.g_wk.size() << " " << kernel.g_wk[0] << " " << kernel.g_wk[1] << " " << kernel.g_wk[2] - << " | " << kernel.l_wk.size() << " " << kernel.l_wk[0] << " " << kernel.l_wk[1] << " " << kernel.l_wk[2] << std::endl; + std::cout << "Kernel dims: g[" << kernel.g_wk.size() << "] " << kernel.g_wk[0] << " " << kernel.g_wk[1] << " " << kernel.g_wk[2] + << " | l[" << kernel.l_wk.size() << "] " << kernel.l_wk[0] << " " << kernel.l_wk[1] << " " << kernel.l_wk[2] << std::endl; result.construction_params.push_back(kernel); } diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp index 69c19478ae..6bbd949f78 100644 --- a/test/pooling_common.hpp +++ b/test/pooling_common.hpp @@ -151,7 +151,7 @@ struct verify_forward_pooling tensor cpu(const tensor& input, const miopen::PoolingDescriptor& filter, std::vector&) const { - const bool is_default_layout = input.desc.IsDefaultLayout(); + // const bool is_default_layout = input.desc.IsDefaultLayout(); const int sptl_dim_offset = 2; // is_default_layout ? 2 : 1; TEMPCODE RJS const int chan_dim_offset = 1; // is_default_layout ? 1 : SptDim + 1; @@ -281,16 +281,14 @@ struct verify_forward_pooling std::vector& indices) const { auto&& handle = get_handle(); - auto out = get_big_output_tensor(filter, input); // TEMPCODE RJS + auto out = get_output_tensor(filter, input); + auto junk = get_big_output_tensor(filter, input); // TEMPCODE RJS indices.resize(out.data.size(), 0); auto in_dev = handle.Write(input.data); auto out_dev = handle.Create(out.data.size()); - auto junk_dev = handle.Create(out.data.size()); // - std::cout << "gpu Sizes: in:" << input.data.size() << " out: " << out.data.size() << " " << out.desc.GetLayout_str() - << " " << out.desc.GetLengths()[0] << " " << out.desc.GetLengths()[1] << " " << out.desc.GetLengths()[2] << " " << out.desc.GetLengths()[3] - << " " << out.desc.GetStrides()[0] << " " << out.desc.GetStrides()[1] << " " << out.desc.GetStrides()[2] << " " << out.desc.GetStrides()[3] << std::endl; + auto junk_dev = handle.Create(junk.data.size()); // Workspace wspace{}; wspace.Write(indices); @@ -308,8 +306,9 @@ struct verify_forward_pooling junk_dev.get()); // TEMPCODE RJS indices = wspace.Read>(); - out.data = handle.Read(out_dev, out.data.size()); - if(true) + handle.ReadTo(out.data.data(), out_dev, out.data.size() * sizeof(T)); + handle.ReadTo(junk.data.data(), junk_dev, junk.data.size() * sizeof(T)); + if(false) { std::cout << "GPU out: "; auto outlen = out.desc.GetLengths(); @@ -335,6 +334,18 @@ struct verify_forward_pooling } } } + if(false){ + std::cout << "GPU out (4-cols): " << std::endl; + for(int idx = 0; idx < 160; ++idx) { + std::cout << std::setw(11) << std::setprecision(5) << out.data[idx] << " "; + if((idx % 4) == 3) std::cout <( // - {{"miopenIndexUint8", - "miopenIndexUint16", - "miopenIndexUint32", - "miopenIndexUint64"}, // - {"miopenIndexUint8", "miopenIndexUint32"}, // - {"miopenIndexUint32"}} // + generate_data({"miopenIndexUint32",} // TEMPCODE RJS + // generate_multi_data( // + // {{"miopenIndexUint8", + // "miopenIndexUint16", + // "miopenIndexUint32", + // "miopenIndexUint64"}, // + // {"miopenIndexUint8", "miopenIndexUint32"}, // + // {"miopenIndexUint32"}} // )); add(mode, "mode", From 7dd833335cd36efc1f39d048e17d541086da4e91 Mon Sep 17 00:00:00 2001 From: "Randy J. Spaulding" Date: Mon, 26 Aug 2024 09:31:51 +0000 Subject: [PATCH 04/10] cp --- src/CMakeLists.txt | 1 + .../miopen/pooling/poolingNdNhwcArgs.hpp | 19 + .../MIOpenPoolingForwardNDNhwcNaive.cpp | 326 +++++++++--------- src/pooling/problem_description.cpp | 6 +- src/solver/pooling/forward2d.cpp | 38 +- src/solver/pooling/forwardNdNhwcNaive.cpp | 143 ++++---- test/gtest/poolingFwdNdNaive.cpp | 8 +- test/pooling2d.hpp | 6 +- test/pooling_common.hpp | 122 ++++--- 9 files changed, 358 insertions(+), 311 deletions(-) create mode 100644 src/include/miopen/pooling/poolingNdNhwcArgs.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3b51f60f05..1bd81f9e87 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -603,6 +603,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/bfloat16_dev.hpp kernels/float_types.h kernels/pooling_functions.h + include/miopen/pooling/poolingNdNhwcArgs.hpp ) # Kernels in development lists. # Should be ALWAYS empty in develop branch (at the time of PR merge) diff --git a/src/include/miopen/pooling/poolingNdNhwcArgs.hpp b/src/include/miopen/pooling/poolingNdNhwcArgs.hpp new file mode 100644 index 0000000000..9fd15bd4bd --- /dev/null +++ b/src/include/miopen/pooling/poolingNdNhwcArgs.hpp @@ -0,0 +1,19 @@ +#pragma once + +// #include + +using BIGONE = uint32_t; // TEMPCODE RJS + +struct poolingNdNhwcArgs +{ + uint32_t filter_d; uint32_t filter_h; uint32_t filter_w; + uint32_t filter_d_stride; uint32_t filter_h_stride; uint32_t filter_w_stride; + uint32_t filter_d_pad; uint32_t filter_h_pad; uint32_t filter_w_pad; + uint32_t all_n; + uint32_t all_c; + uint32_t bot_d; uint32_t bot_h; uint32_t bot_w; + BIGONE bot_n_stride; uint32_t bot_c_stride; BIGONE bot_d_stride; uint32_t bot_h_stride; uint32_t bot_w_stride; + uint32_t top_d; uint32_t top_h; uint32_t top_w; + BIGONE top_n_stride; uint32_t top_c_stride; BIGONE top_d_stride; uint32_t top_h_stride; uint32_t top_w_stride; + BIGONE mask_n_stride; BIGONE mask_c_stride; uint32_t mask_d_stride; uint32_t mask_h_stride; uint32_t mask_w_stride; +}; diff --git a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp index 60aa2aac23..a6aa93aade 100644 --- a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp +++ b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp @@ -51,6 +51,7 @@ #include "float_types.h" #endif #include "pooling_functions.h" +#include "poolingNdNhwcArgs.hpp" #if(MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE) || (MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE) #define AVERAGE_OPS 1 @@ -76,125 +77,127 @@ #define ARG_UNUSED_FOR_AVERAGE #endif -#define doUU 0 -#if doUU -#define UU __attribute__((__unused__)) -#else -#define UU -#endif - -#define doUU1 0 -#if doUU1 -#define UU1 __attribute__((__unused__)) -#else -#define UU1 -#endif - // Out N, D, H are encoded into the block indices x, y, z // No 2D-only optimization. template -__device__ void poolingForwardNDNhwcNaive(UU1 const TI* __restrict__ bot_ptr, +__device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, TO* __restrict__ top_ptr, - UU1 TO* __restrict__ junk_ptr, // TEMPCODE RJS - UU1 ARG_UNUSED_FOR_AVERAGE index_t* __restrict__ mask_ptr, - UU1 ARG_UNUSED_FOR_AVERAGE int save_index, - UU1 ARG_UNUSED_FOR_AVERAGE int index_mode, - UU uint32_t filter_d, - UU uint32_t filter_h, - UU uint32_t filter_w, - UU uint32_t filter_d_stride, - UU uint32_t filter_h_stride, - UU uint32_t filter_w_stride, - UU uint32_t filter_d_pad, - UU uint32_t filter_h_pad, - UU uint32_t filter_w_pad, - uint32_t all_n, - UU uint32_t all_c, // TEMPCODE RJS - UU uint32_t bot_d, - UU uint32_t bot_h, - UU uint32_t bot_w, - UU size_t bot_n_stride, - UU uint32_t bot_c_stride, - UU size_t bot_d_stride, - UU uint32_t bot_h_stride, - UU uint32_t bot_w_stride, - uint32_t top_d, - uint32_t top_h, - uint32_t top_w, - size_t top_n_stride, - uint32_t top_c_stride, - size_t top_d_stride, - uint32_t top_h_stride, - uint32_t top_w_stride, - UU ARG_UNUSED_FOR_AVERAGE size_t mask_n_stride, - UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_c_stride, - UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_d_stride, - UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_h_stride, - UU ARG_UNUSED_FOR_AVERAGE size_t mask_w_stride) + TO* __restrict__ junk_ptr, // TEMPCODE RJS + ARG_UNUSED_FOR_AVERAGE index_t* __restrict__ mask_ptr, + ARG_UNUSED_FOR_AVERAGE int save_index, + ARG_UNUSED_FOR_AVERAGE int index_mode, + poolingNdNhwcArgs args + // UU uint32_t filter_d, + // UU uint32_t filter_h, + // UU uint32_t filter_w, + // UU uint32_t filter_d_stride, + // UU uint32_t filter_h_stride, + // UU uint32_t filter_w_stride, + // UU uint32_t filter_d_pad, + // UU uint32_t filter_h_pad, + // UU uint32_t filter_w_pad, + // uint32_t all_n, + // UU uint32_t all_c, // TEMPCODE RJS + // UU uint32_t bot_d, + // UU uint32_t bot_h, + // UU uint32_t bot_w, + // UU BIGONE bot_n_stride, + // UU uint32_t bot_c_stride, + // UU BIGONE bot_d_stride, + // UU uint32_t bot_h_stride, + // UU uint32_t bot_w_stride, + // uint32_t top_d, + // uint32_t top_h, + // uint32_t top_w, + // BIGONE top_n_stride, + // uint32_t top_c_stride, + // BIGONE top_d_stride, + // uint32_t top_h_stride, + // uint32_t top_w_stride, + // UU ARG_UNUSED_FOR_AVERAGE BIGONE mask_n_stride, + // UU ARG_UNUSED_FOR_AVERAGE BIGONE mask_c_stride, + // UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_d_stride, + // UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_h_stride, + // UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_w_stride +) { + const uint32_t nn = blockIdx.x / args.top_d; // N=slow index + const uint32_t td = blockIdx.x % args.top_d; // top D=fast index + const uint32_t th = blockIdx.y; // top H + const uint32_t tw = blockIdx.z % args.all_c; // top W=fast index + const auto c_base = (blockIdx.z / args.all_c) * blockDim.x; + if(blockDim.x > args.all_c) + { + // // TODO: h, w, or both may be encoded into threadIdx + // if(top_h > 1 && blockDim.y == 1) + } auto log_ptr = junk_ptr; - if(nn == 0 && td == 0 && th == 0 && tw == 0) + if(blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) { + for(int i = 0; i < 320; ++i) + { + junk_ptr[i] = (_FLOAT)1.11111; + } int idx = 0; - log_ptr[idx++] = gridDim.x; - log_ptr[idx++] = gridDim.y; - log_ptr[idx++] = gridDim.z; + log_ptr[idx++] = gridDim.x; // ND + log_ptr[idx++] = gridDim.y; // H + log_ptr[idx++] = gridDim.z; // W (*C overflow) log_ptr[idx++] = -9; - log_ptr[idx++] = blockDim.x; - log_ptr[idx++] = blockDim.y; - log_ptr[idx++] = blockDim.z; + log_ptr[idx++] = blockDim.x; // C + log_ptr[idx++] = blockDim.y; // small-C H + log_ptr[idx++] = blockDim.z; // small-C W log_ptr[idx++] = -8; - log_ptr[idx++] = filter_d; - log_ptr[idx++] = filter_h; - log_ptr[idx++] = filter_w; + log_ptr[idx++] = args.filter_d; + log_ptr[idx++] = args.filter_h; + log_ptr[idx++] = args.filter_w; log_ptr[idx++] = -7; - log_ptr[idx++] = filter_d_stride; - log_ptr[idx++] = filter_h_stride; - log_ptr[idx++] = filter_w_stride; + log_ptr[idx++] = args.filter_d_stride; + log_ptr[idx++] = args.filter_h_stride; + log_ptr[idx++] = args.filter_w_stride; log_ptr[idx++] = -6; - log_ptr[idx++] = filter_d_pad; - log_ptr[idx++] = filter_h_pad; - log_ptr[idx++] = filter_w_pad; + log_ptr[idx++] = args.filter_d_pad; + log_ptr[idx++] = args.filter_h_pad; + log_ptr[idx++] = args.filter_w_pad; log_ptr[idx++] = -5; - log_ptr[idx++] = all_n; - log_ptr[idx++] = all_c; - log_ptr[idx++] = bot_n_stride; - log_ptr[idx++] = bot_c_stride; + log_ptr[idx++] = args.all_n; + log_ptr[idx++] = args.all_c; + log_ptr[idx++] = args.bot_n_stride; + log_ptr[idx++] = args.bot_c_stride; - log_ptr[idx++] = top_n_stride; - log_ptr[idx++] = top_c_stride; + log_ptr[idx++] = args.top_n_stride; + log_ptr[idx++] = args.top_c_stride; #if AVERAGE_OPS log_ptr[idx++] = -4; log_ptr[idx++] = -4; #else - log_ptr[idx++] = mask_n_stride; - log_ptr[idx++] = mask_c_stride; + log_ptr[idx++] = args.mask_n_stride; + log_ptr[idx++] = args.mask_c_stride; #endif - log_ptr[idx++] = bot_d; - log_ptr[idx++] = bot_h; - log_ptr[idx++] = bot_w; + log_ptr[idx++] = args.bot_d; + log_ptr[idx++] = args.bot_h; + log_ptr[idx++] = args.bot_w; log_ptr[idx++] = -3; - log_ptr[idx++] = bot_d_stride; - log_ptr[idx++] = bot_h_stride; - log_ptr[idx++] = bot_w_stride; + log_ptr[idx++] = args.bot_d_stride; + log_ptr[idx++] = args.bot_h_stride; + log_ptr[idx++] = args.bot_w_stride; log_ptr[idx++] = -2; - log_ptr[idx++] = top_d; - log_ptr[idx++] = top_h; - log_ptr[idx++] = top_w; + log_ptr[idx++] = args.top_d; + log_ptr[idx++] = args.top_h; + log_ptr[idx++] = args.top_w; log_ptr[idx++] = -1; - log_ptr[idx++] = top_d_stride; - log_ptr[idx++] = top_h_stride; - log_ptr[idx++] = top_w_stride; + log_ptr[idx++] = args.top_d_stride; + log_ptr[idx++] = args.top_h_stride; + log_ptr[idx++] = args.top_w_stride; log_ptr[idx++] = -9; #if AVERAGE_OPS @@ -202,62 +205,58 @@ __device__ void poolingForwardNDNhwcNaive(UU1 const TI* __restrict__ bot_ptr, log_ptr[idx++] = -8; log_ptr[idx++] = -8; #else - log_ptr[idx++] = mask_d_stride; - log_ptr[idx++] = mask_h_stride; - log_ptr[idx++] = mask_w_stride; + log_ptr[idx++] = args.mask_d_stride; + log_ptr[idx++] = args.mask_h_stride; + log_ptr[idx++] = args.mask_w_stride; #endif log_ptr[idx++] = -7; + while(idx < 64) log_ptr[idx++] = (_FLOAT)0; } - const uint32_t nn = blockIdx.x / top_d; // N=slow index - if(nn >= all_n) - return; - const uint32_t td = blockIdx.x % top_d; // top D=fast index - if(td >= top_d) - return; + // if(nn >= args.all_n) + // return; - const uint32_t th = blockIdx.y; // top H - const uint32_t j = (gridDim.y == 1) ? threadIdx.y : blockIdx.y; // top H - if(th >= top_h) - return; + // if(td >= args.top_d) + // return; + + // if(th >= args.top_h) + // return; + + // if(tw >= args.top_w) + // return; - const uint32_t tw = blockIdx.z % top_w; // top W=fast index - if(tw >= top_w) - return; -if(true) { - uint32_t cc = 0; - size_t top_index = - nn * top_n_stride // TEMPCODE RJS - + cc * top_c_stride // - + (size_t)(td * top_d_stride) // - + (size_t)(th * top_h_stride) // - + (size_t)(tw * top_w_stride); - - junk_ptr[top_index] = top_index; -} if(true) { // TEMPCODE RJS - const auto int_dstart = static_cast(td * filter_d_stride) - static_cast(filter_d_pad); - const auto dend = static_cast(min(int_dstart + static_cast(filter_d), static_cast(bot_d))); + const auto int_dstart = static_cast(td * args.filter_d_stride) - static_cast(args.filter_d_pad); + const auto dend = static_cast(min(int_dstart + static_cast(args.filter_d), static_cast(args.bot_d))); const auto dstart = static_cast(max(int_dstart, 0)); - const auto int_hstart = static_cast(th * filter_h_stride) - static_cast(filter_h_pad); - const auto hend = static_cast(min(int_hstart + static_cast(filter_h), static_cast(bot_h))); + const auto int_hstart = static_cast(th * args.filter_h_stride) - static_cast(args.filter_h_pad); + const auto hend = static_cast(min(int_hstart + static_cast(args.filter_h), static_cast(args.bot_h))); const auto hstart = static_cast(max(int_hstart, 0)); - const auto int_wstart = static_cast(tw * filter_w_stride) - static_cast(filter_w_pad); - const auto wend = static_cast(min(int_wstart + static_cast(filter_w), static_cast(bot_w))); + const auto int_wstart = static_cast(tw * args.filter_w_stride) - static_cast(args.filter_w_pad); + const auto wend = static_cast(min(int_wstart + static_cast(args.filter_w), static_cast(args.bot_w))); const auto wstart = static_cast(max(int_wstart, 0)); - // const auto c_base = (blockDim.x == all_c) ? 0 : (blockIdx.z / top_w) * blockDim.x; - for(uint32_t cc = 0; cc < 1; ++cc) // top C loop - { - if(cc >= all_c) return; + uint32_t cc = c_base + threadIdx.x; + // if(cc > args.all_c) return; + + size_t top_index = + nn * args.top_n_stride // TEMPCODE RJS + + cc * args.top_c_stride // + + (size_t)(td * args.top_d_stride) // + + (size_t)(th * args.top_h_stride) // + + (size_t)(tw * args.top_w_stride); +if(true) { + top_ptr[top_index] = (TO)-1.11111; + junk_ptr[64 + top_index] = top_index; +} #if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); pool_size = (pool_size == 0) ? 1 : pool_size; #elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE - const uint32_t pool_size = filter_d * filter_h * filter_w; + const uint32_t pool_size = args.filter_d * args.filter_h * args.filter_w; #endif #if AVERAGE_OPS @@ -275,11 +274,11 @@ if(true) { // TEMPCODE RJS { for(uint32_t bw = wstart; bw < wend; ++bw) { - const size_t bot_index = nn * bot_n_stride + // - cc * bot_c_stride + // - bd * bot_d_stride + // - static_cast(bh * bot_h_stride) + // - static_cast(bw * bot_w_stride); + const size_t bot_index = nn * args.bot_n_stride + // + cc * args.bot_c_stride + // + bd * args.bot_d_stride + // + static_cast(bh * args.bot_h_stride) + // + static_cast(bw * args.bot_w_stride); #if AVERAGE_OPS res += static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]); #else // MAX @@ -318,33 +317,35 @@ res *= 1.0; // TEMPCODE RJS fix UNUSED if(found) { if(index_mode == 1) - res_index = (index_t)(d_save * bot_h * bot_w // - + h_save * bot_w // + res_index = (index_t)(d_save * args.bot_h * args.bot_w // + + h_save * args.bot_w // + w_save); else res_index = (index_t)( // - ((d_save - td * filter_d_stride + filter_d_pad) * filter_h * filter_w) // - + ((h_save - th * filter_h_stride + filter_h_pad) * filter_w) // - + (w_save - tw * filter_w_stride + filter_w_pad) // + ((d_save - td * args.filter_d_stride + args.filter_d_pad) * args.filter_h * args.filter_w) // + + ((h_save - th * args.filter_h_stride + args.filter_h_pad) * args.filter_w) // + + (w_save - tw * args.filter_w_stride + args.filter_w_pad) // ); } - const size_t mask_index = nn * mask_n_stride // - + cc * mask_c_stride // - + (size_t)(td * mask_d_stride) // - + (size_t)(tw * mask_h_stride) // - + (size_t)(th * mask_w_stride); + const size_t mask_index = nn * args.mask_n_stride // + + cc * args.mask_c_stride // + + (size_t)(td * args.mask_d_stride) // + + (size_t)(tw * args.mask_h_stride) // + + (size_t)(th * args.mask_w_stride); mask_ptr[mask_index] = res_index; } #endif - size_t top_index = nn * top_n_stride // - + cc * top_c_stride // - + (size_t)(td * top_d_stride) // - + (size_t)(th * top_h_stride) // - + (size_t)(tw * top_w_stride); + // top_index = nn * args.top_n_stride // + // + cc * args.top_c_stride // + // + (size_t)(td * args.top_d_stride) // + // + (size_t)(th * args.top_h_stride) // + // + (size_t)(tw * args.top_w_stride); top_ptr[top_index] = (_FLOAT)res; // TEMPCODE RJS - } + top_ptr[top_index] = (_FLOAT)1.11111; // TEMPCODE RJS + + cc += blockDim.x; } // TEMPCODE } @@ -355,16 +356,8 @@ extern "C" __global__ void mloPoolingForwardNDNhwcNaive( index_t* __restrict__ mask_ptr, int save_index, int index_mode, - uint32_t filter_d, uint32_t filter_h, uint32_t filter_w, - uint32_t filter_d_stride, uint32_t filter_h_stride, uint32_t filter_w_stride, - uint32_t filter_d_pad, uint32_t filter_h_pad, uint32_t filter_w_pad, - uint32_t all_n, - uint32_t all_c, - uint32_t bot_d, uint32_t bot_h, uint32_t bot_w, - size_t bot_n_stride, uint32_t bot_c_stride, size_t bot_d_stride, uint32_t bot_h_stride, uint32_t bot_w_stride, - uint32_t top_d, uint32_t top_h, uint32_t top_w, - size_t top_n_stride, uint32_t top_c_stride, size_t top_d_stride, uint32_t top_h_stride, uint32_t top_w_stride, - size_t mask_n_stride, size_t mask_c_stride, uint32_t mask_d_stride, uint32_t mask_h_stride, uint32_t mask_w_stride) +poolingNdNhwcArgs args +) { poolingForwardNDNhwcNaive( bot_ptr, @@ -373,15 +366,16 @@ extern "C" __global__ void mloPoolingForwardNDNhwcNaive( mask_ptr, save_index, index_mode, - filter_d, filter_h, filter_w, - filter_d_stride, filter_h_stride, filter_w_stride, - filter_d_pad, filter_h_pad, filter_w_pad, - all_n, - all_c, - bot_d, bot_h, bot_w, - bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride, bot_w_stride, - top_d, top_h, top_w, - top_n_stride, top_c_stride, top_d_stride, top_h_stride, top_w_stride, - mask_n_stride, mask_c_stride, mask_d_stride, mask_h_stride, mask_w_stride + args + // args.filter_d, args.filter_h, args.filter_w, + // args.filter_d_stride, args.filter_h_stride, args.filter_w_stride, + // args.filter_d_pad, args.filter_h_pad, args.filter_w_pad, + // args.all_n, + // args.all_c, + // args.bot_d, args.bot_h, args.bot_w, + // args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride, + // args.top_d, args.top_h, args.top_w, + // args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride, args.top_w_stride, + // args.mask_n_stride, args.mask_c_stride, args.mask_d_stride, args.mask_h_stride, args.mask_w_stride ); } diff --git a/src/pooling/problem_description.cpp b/src/pooling/problem_description.cpp index ad36abfb08..804bc2bf84 100644 --- a/src/pooling/problem_description.cpp +++ b/src/pooling/problem_description.cpp @@ -83,11 +83,7 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const ss << "_dyd" << get_vect_config(dyDesc.GetLengths()); ss << "_dys" << get_vect_config(dyDesc.GetStrides()); } // TEMPCODE RJS - std::cout << "\n************** xDesc layout: " << xDesc.GetLayout_str() << " *************************" << std::endl; - if(!xDesc.IsDefaultLayout()) - { - std::cout << " xDesc layout is not default! " << " *************************\n"; - } + std::cout << "\n************** xDesc layout: " << xDesc.GetLayout_str() << (xDesc.IsDefaultLayout() ? "" : " (not default)") << " *************************" << std::endl; ss << "_l" << (xDesc.IsDefaultLayout() ? 0 : 1); std::cout << " " << ss.str() << std::endl; return NetworkConfig{ss.str()}; diff --git a/src/solver/pooling/forward2d.cpp b/src/solver/pooling/forward2d.cpp index d3fe890ecd..f1e1cd6e52 100644 --- a/src/solver/pooling/forward2d.cpp +++ b/src/solver/pooling/forward2d.cpp @@ -148,19 +148,18 @@ bool PoolingForward2d::IsApplicable(const ExecutionContext& context, // TEMPCODE RJS std::cout << "%%%%%%%%%% PoolingForward2d::IsApplicable: " << app << " " << problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout("NCHW") << std::endl; - return false; return app; } #include // TEMPCODE RJS namespace { template - void printVec(std::string name, const std::vector& vec) + std::ostream& printVec(std::string name, const std::vector& vec) { - return; + return std::cout; std::cout << "Vector Printing: " << std::setw(20) << name << "[" << vec.size() << "]: "; for(auto i : vec) std::cout << std::setw(8) << i; - std::cout << std::endl; + return std::cout; } } @@ -202,22 +201,21 @@ ConvSolution PoolingForward2d::GetSolution(const ExecutionContext&, // TEMPCODE RJS - const auto bot = problem.GetXDesc(); - const auto top = problem.GetYDesc(); - const auto& pooling = problem.GetPooling(); - const auto& lengths = pooling.GetLengths(); - const auto& strides = pooling.GetStrides(); - const auto& pads = pooling.GetPads(); - - std::cout << "======================================================================" << std::endl; - printVec("bot lengths", bot.GetLengths()); - printVec("bot strides", bot.GetStrides()); - printVec("top lengths", top.GetLengths()); - printVec("top strides", top.GetStrides()); - printVec("pool lengths", lengths); - printVec("pool strides", strides); - printVec("pool pads", pads); - std::cout << "======================================================================" << std::endl; + // const auto bot = problem.GetXDesc(); + // const auto top = problem.GetYDesc(); + // const auto& pooling = problem.GetPooling(); + // const auto& lengths = pooling.GetLengths(); + // const auto& strides = pooling.GetStrides(); + // const auto& pads = pooling.GetPads(); + + // std::cout << "PoolingForward2d GetSolution: " << std::endl; + // printVec(" bot lengths", bot.GetLengths()) << + // printVec(" bot strides", bot.GetStrides()) << std::endl; + // printVec(" top lengths", top.GetLengths()) << + // printVec(" top strides", top.GetStrides()) << std::endl; + // printVec(" pool lengths", lengths) << + // printVec(" pool strides", strides) << + // printVec(" pool pads", pads) << std::endl; auto build_params = KernelBuildParameters{ {"MLO_POOLING_OP_ID", pooling_method}, diff --git a/src/solver/pooling/forwardNdNhwcNaive.cpp b/src/solver/pooling/forwardNdNhwcNaive.cpp index 70e4d998cf..77b5a6ea2d 100644 --- a/src/solver/pooling/forwardNdNhwcNaive.cpp +++ b/src/solver/pooling/forwardNdNhwcNaive.cpp @@ -30,6 +30,8 @@ #include #include +#include + #define WORKAROUND_ISSUE_MIFIN_80 1 // https://github.com/ROCm/MIFin/issues/80 namespace miopen { @@ -98,7 +100,7 @@ bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&, #include // TEMPCODE RJS namespace { template - void printVec(std::string name, const std::vector& vec) + void printVec(std::string name, std::vector vec) { return; std::cout << "Vector Printing: " << std::setw(20) << name << "[" << vec.size() << "]: "; @@ -112,6 +114,7 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context, const miopen::pooling::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; + poolingNdNhwcArgs args; auto input_dtype = miopen::GetDataType(problem.GetXDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetYDesc().GetType()); @@ -132,15 +135,34 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context, const auto& pads = pooling.GetPads(); // This also deduces 3D (DHW) parameters from 2D (HW) descriptor. - const uint32_t filter_w = lengths[is2d ? 1 : 2]; - const uint32_t filter_h = lengths[is2d ? 0 : 1]; - const uint32_t filter_d = is2d ? 1 : lengths[0]; - const uint32_t filter_w_stride = strides[is2d ? 1 : 2]; - const uint32_t filter_h_stride = strides[is2d ? 0 : 1]; - const uint32_t filter_d_stride = is2d ? (filter_h_stride * filter_d) : strides[0]; - const uint32_t filter_w_pad = pads[is2d ? 1 : 2]; - const uint32_t filter_h_pad = pads[is2d ? 0 : 1]; - const uint32_t filter_d_pad = is2d ? 0 : pads[0]; + uint32_t idx = 0; + args.filter_d = is2d ? 1 : lengths[idx++]; + args.filter_h = lengths[idx++]; + args.filter_w = lengths[idx++]; + + idx = 0; + args.filter_d_stride = is2d ? (strides[0]) : strides[idx++]; + args.filter_h_stride = strides[idx++]; + args.filter_w_stride = strides[idx++]; + + idx = 0; + args.filter_d_pad = is2d ? 0 : pads[idx++]; + args.filter_h_pad = pads[idx++]; + args.filter_w_pad = pads[idx++]; + // uint32_t idx = 0; + // const uint32_t filter_d = is2d ? 1 : lengths[idx++]; + // const uint32_t filter_h = lengths[idx++]; + // const uint32_t filter_w = lengths[idx++]; + + // idx = 0; + // const uint32_t filter_d_stride = is2d ? (strides[0]) : strides[idx++]; + // const uint32_t filter_h_stride = strides[idx++]; + // const uint32_t filter_w_stride = strides[idx++]; + + // idx = 0; + // const uint32_t filter_d_pad = is2d ? 0 : pads[idx++]; + // const uint32_t filter_h_pad = pads[idx++]; + // const uint32_t filter_w_pad = pads[idx++]; const int pooling_method = (pooling.GetMode() == miopenPoolingMax) ? MLO_POOLING_OP_MAX : (pooling.GetMode() == miopenPoolingAverage) @@ -165,7 +187,7 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context, /// requires it because the total number of muls is 4. // TEMPCODE RJS - std::cout << "======================================================================" << std::endl; + printVec("======================================================================", std::vector{}); printVec("bot lengths", bot.GetLengths()); printVec("bot strides", bot.GetStrides()); printVec("top lengths", top.GetLengths()); @@ -173,39 +195,35 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context, printVec("pool lengths", lengths); printVec("pool strides", strides); printVec("pool pads", pads); - std::cout << "======================================================================" << std::endl; + printVec("======================================================================", std::vector{}); const auto spatial_dim = is2d ? 2U : 3U; - uint32_t all_n, all_c, bot_d, bot_h, bot_w; - std::tie(all_n, all_c, bot_d, bot_h, bot_w) = miopen::GetNCDHW(spatial_dim, bot.GetLengths()); - std::cout << "GetSol: bot_lens " << all_n << " " << all_c << " " << bot_d << " " << bot_h << " " << bot_w << std::endl; + // uint32_t all_n, all_c, bot_d, bot_h, bot_w; + std::tie(args.all_n, args.all_c, args.bot_d, args.bot_h, args.bot_w) = miopen::GetNCDHW(spatial_dim, bot.GetLengths()); +std::cout << "GetSol: bot_lens " << args.all_n << " " << args.all_c << " " << args.bot_d << " " << args.bot_h << " " << args.bot_w << std::endl; - size_t bot_n_stride, bot_d_stride; - uint32_t bot_h_stride, bot_w_stride, bot_c_stride; - std::tie(bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride, bot_w_stride) = + std::tie(args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride) = miopen::GetNCDHW(spatial_dim, bot.GetStrides()); -std::cout << "GetSol: bot_strides " << bot_n_stride << " " << bot_c_stride << " " << bot_d_stride -<< " " << bot_h_stride << " " << bot_w_stride << std::endl; +std::cout << "GetSol: bot_strides " << args.bot_n_stride << " " << args.bot_c_stride << " " << args.bot_d_stride +<< " " << args.bot_h_stride << " " << args.bot_w_stride << std::endl; - uint32_t top_d, top_h, top_w; - std::tie(std::ignore, std::ignore, top_d, top_h, top_w) = + std::tie(std::ignore, std::ignore, args.top_d, args.top_h, args.top_w) = miopen::GetNCDHW(spatial_dim, top.GetLengths()); - std::cout << "GetSol: top_lens " << top_d << " " << top_h << " " << top_w << std::endl; +std::cout << "GetSol: top_lens " << args.top_d << " " << args.top_h << " " << args.top_w << std::endl; - size_t top_n_stride, top_d_stride; - uint32_t top_h_stride, top_w_stride, top_c_stride; - std::tie(top_n_stride, top_c_stride, top_d_stride, top_h_stride, top_w_stride) = + std::tie(args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride,args. top_w_stride) = miopen::GetNCDHW(spatial_dim, top.GetStrides()); // TEMPCODE RJS - std::cout << "GetSol: top_strides " << top_n_stride << " " << top_c_stride << " " - << top_d_stride << " " << top_h_stride << " " << top_w_stride << std::endl; +std::cout << "GetSol: top_strides " << args.top_n_stride << " " << args.top_c_stride << " " + << args.top_d_stride << " " << args.top_h_stride << " " << args.top_w_stride << std::endl; + // Mask data is always NCDHW layout - const uint32_t mask_w_stride = 1; - const uint32_t mask_h_stride = mask_w_stride * top_w; - const uint32_t mask_d_stride = mask_h_stride * top_h; - const size_t mask_c_stride = static_cast(mask_d_stride) * top_d; - const size_t mask_n_stride = mask_c_stride * all_c; + args.mask_w_stride = 1; + args.mask_h_stride = args.mask_w_stride * args.top_w; + args.mask_d_stride = args.mask_h_stride * args.top_h; + args.mask_c_stride = static_cast(args.mask_d_stride) * args.top_d; + args.mask_n_stride = args.mask_c_stride * args.all_c; /// About optimal grid size: /// top D, H, and W are mapped directly onto grid dimensions, except in very small problems @@ -229,34 +247,40 @@ std::cout << "GetSol: bot_strides " << bot_n_stride << " " << bot_c_stride << " /// workitems because the kernel does not require synchronization. std::ignore = context; - constexpr uint32_t LARGE_C_MAX_ITEMS = 512; + constexpr uint32_t MAX_THREADS = 512; + constexpr uint32_t LARGE_C_MAX_ITEMS = MAX_THREADS; constexpr uint32_t SMALL_C_MAX_ITEMS = 128; - auto nd_ = all_n * top_d; - auto h_ = top_h; - auto w_ = top_w; - auto c_ = all_c; + auto nd_ = args.all_n * args.top_d; + auto h_ = args.top_h; + auto w_ = args.top_w; + auto c_ = args.all_c; +std::cout << "nd_ " << nd_ << " h_ " << h_ << " w_ " << w_ << " c_ " << c_ << std::endl; uint32_t l1 = 1U; uint32_t l2 = 1U; if(c_ > LARGE_C_MAX_ITEMS) { - auto c2 = c_ / LARGE_C_MAX_ITEMS + 1; + auto c2 = (c_ + LARGE_C_MAX_ITEMS - 1) / LARGE_C_MAX_ITEMS; c_ = LARGE_C_MAX_ITEMS; w_ *= c2; } // else if(c_ <= SMALL_C_MAX_ITEMS / 2) // { - // if(c_ * w_ <= SMALL_C_MAX_ITEMS) + // if(c_ * w_ <= MAX_THREADS) // { // std::swap(l2, w_); - - // if(c_ * w_ * h_ <= SMALL_C_MAX_ITEMS) + // + // if(c_ * w_ * h_ <= MAX_THREADS) // { // std::swap(l1, h_); // } // } + // else if(c_ * h_ <= MAX_THREADS) + // { + // std::swap(l1, h_); + // } // } const auto g0 = nd_; @@ -289,18 +313,20 @@ std::cout << "GetSol: bot_strides " << bot_n_stride << " " << bot_c_stride << " // * 2: layout (NCHW vs NHWC) // * 2: 2D and 3D kernels (optimization) - kernel.g_wk.clear(); - kernel.g_wk.push_back(g0); - kernel.g_wk.push_back(g1); - kernel.g_wk.push_back(g2); + // l1 = 11; + // l2 = 11; kernel.l_wk.clear(); kernel.l_wk.push_back(l0); kernel.l_wk.push_back(l1); kernel.l_wk.push_back(l2); + kernel.g_wk.clear(); + kernel.g_wk.push_back(g0 * l0); + kernel.g_wk.push_back(g1 * l1); + kernel.g_wk.push_back(g2 * l2); // TEMPCODE RJS - std::cout << "Kernel dims: g[" << kernel.g_wk.size() << "] " << kernel.g_wk[0] << " " << kernel.g_wk[1] << " " << kernel.g_wk[2] - << " | l[" << kernel.l_wk.size() << "] " << kernel.l_wk[0] << " " << kernel.l_wk[1] << " " << kernel.l_wk[2] << std::endl; +std::cout << "Kernel dims: g[" << kernel.g_wk.size() << "] " << kernel.g_wk[0] << " " << kernel.g_wk[1] << " " << kernel.g_wk[2] +<< " | l[" << kernel.l_wk.size() << "] " << kernel.l_wk[0] << " " << kernel.l_wk[1] << " " << kernel.l_wk[2] << std::endl; result.construction_params.push_back(kernel); } @@ -316,16 +342,17 @@ std::cout << "GetSol: bot_strides " << bot_n_stride << " " << bot_c_stride << " params.workspace, save_index, index_mode, - filter_d, filter_h, filter_w, - filter_d_stride, filter_h_stride, filter_w_stride, - filter_d_pad, filter_h_pad, filter_w_pad, - all_n, - all_c, - bot_d, bot_h, bot_w, - bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride, bot_w_stride, - top_d, top_h, top_w, - top_n_stride, top_c_stride, top_d_stride, top_h_stride, top_w_stride, - mask_n_stride, mask_c_stride, mask_d_stride, mask_h_stride, mask_w_stride); + args.filter_d, args.filter_h, args.filter_w, + args.filter_d_stride, args.filter_h_stride, args.filter_w_stride, + args.filter_d_pad, args.filter_h_pad, args.filter_w_pad, + args.all_n, + args.all_c, + args.bot_d, args.bot_h, args.bot_w, + args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride, + args.top_d, args.top_h, args.top_w, + args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride, args.top_w_stride, + args.mask_n_stride, args.mask_c_stride, args.mask_d_stride, args.mask_h_stride, args.mask_w_stride + ); }; }; diff --git a/test/gtest/poolingFwdNdNaive.cpp b/test/gtest/poolingFwdNdNaive.cpp index 0b7682ccc3..abf9cb0509 100644 --- a/test/gtest/poolingFwdNdNaive.cpp +++ b/test/gtest/poolingFwdNdNaive.cpp @@ -103,7 +103,7 @@ struct layout_data void read_gpu_data(miopen::Handle& handle, const miopen::Allocator::ManageDataPtr& ddata) { check = tensor{descriptor.GetLengths(), descriptor.GetStrides()}; - check.data = handle.Read(ddata, check.data.size()); + handle.ReadTo(check.data.data(), ddata, check.data.size()); } tensor check{}; @@ -140,7 +140,7 @@ std::vector GetTestCases(const std::string precision) const std::vector test_cases = { // clang-format off - {"test_pooling2d " + precision + " --all --dataset 1 --limit 0 " + flag_arg} + {"test_pooling2d " + precision + " --all --dataset 1 --limit 0 " + flag_arg} // TEMPCODE RJS DATASET // clang-format on }; @@ -154,7 +154,7 @@ TEST_P(PoolingFwdFloat, NNT) // NDNaiveTranspose const auto& handle = get_handle(); if(!IsTestSupportedForDevice(handle)) std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl; if(SkipTest()) std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl; - // if(!IsTestRunWith("--float")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT!" << std::endl; + if(!IsTestRunWith("--float")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT!" << std::endl; // Run2dDriver(miopenFloat); return; // TEMPCODE RJS // && IsTestRunWith("--float") if(IsTestSupportedForDevice(handle) && !SkipTest()) @@ -172,7 +172,7 @@ TEST_P(PoolingFwdHalf, NNT) const auto& handle = get_handle(); if(!IsTestSupportedForDevice(handle)) std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl; if(SkipTest()) std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl; - // if(!IsTestRunWith("--half")) std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl; + if(!IsTestRunWith("--half")) std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl; if(IsTestSupportedForDevice(handle) && !SkipTest()) // && IsTestRunWith("--half") TEMPCODE RJS { diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp index a9ca1446b5..fb1556bc1a 100644 --- a/test/pooling2d.hpp +++ b/test/pooling2d.hpp @@ -57,7 +57,7 @@ struct pooling2d_shapes } // Dataset 1 is intended for testing of asymmetric configs. - static std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 1, 8, 8}}; } + static std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 2, 8, 8}}; } // {1, 1, 8, 8}, // Dataset 2 is intended for testing of configs with wide window. static std::vector get_2d_pooling_input_shapes_wide() @@ -110,7 +110,7 @@ struct pooling2d_driver : pooling_driver this->add(this->strides, "strides", this->template generate_multi_data({{{2, 2}, {1, 1}}, // - {{1, 1}, {2, 1}, {1, 2}, {2, 2}}, // + {{2, 2}, {2, 1}}, // , {1, 2}, {1, 1} {{1, 1}}})); // clang-format off this->add(this->pads, "pads", this->template generate_multi_data({ @@ -123,6 +123,6 @@ struct pooling2d_driver : pooling_driver {{0, 0}}})); // clang-format on this->add(this->wsidx, "wsidx", this->generate_data({0, 1})); - this->add(this->layout, "layout", this->generate_data({miopenTensorNHWC})); // , miopenTensorNHWC + this->add(this->layout, "layout", this->generate_data({miopenTensorNHWC})); // , miopenTensorNCHW } }; diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp index 6bbd949f78..8c84180663 100644 --- a/test/pooling_common.hpp +++ b/test/pooling_common.hpp @@ -102,7 +102,6 @@ tensor get_big_output_tensor(const miopen::PoolingDescriptor& filter, const t auto lens = desc.GetLengths(); lens[0] *= 10; auto big = miopen::TensorDescriptor{desc.GetType(), input.desc.GetLayout_t(), lens, desc.GetStrides()}; - std::cout << "get_big_output_tensor: " << input.desc.GetLayout_str() << " " << desc.GetLayout_str() << std::endl; return tensor{big}; } @@ -143,6 +142,9 @@ struct pooling_operators }; #include +#define MAX_PRINT 16 // TEMPCODE RJS +#define GPU_JUNK 240 +#define GPU_4COL false template struct verify_forward_pooling @@ -168,35 +170,8 @@ struct verify_forward_pooling auto pooler = pooling_operators{filter}; // TEMPCODE RJS print input tensor - bool printing = true; // in_dim[0]==8 && in_dim[1]==8; - if(printing) - { - auto inlen = input.desc.GetLengths(); - auto instr = input.desc.GetStrides(); - std::cout << "CPU in : m" << filter.GetMode() << " t" << input.desc.GetType() << " | "; - for(auto dim : inlen) std::cout << std::setw(4) << dim; - std::cout << " | "; - for(auto str : instr) std::cout << std::setw(4) << str; - std::cout << " | "; - for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str; - std::cout << " | "; - for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str; - std::cout << " | "; - for(auto str : filter.GetPads()) std::cout << std::setw(4) << str; - std::cout << std::endl; - - for(int nn = 0; nn < inlen[0]; ++nn) { - for(int cc = 0; cc < inlen[1]; ++cc) { - for(int hh = 0; hh < inlen[2]; ++hh) { - for(int ww = 0; ww < inlen[3]; ++ww) { - std::cout << std::setw(11) << std::setprecision(5) << input.data[input.desc.GetIndex(nn, cc, hh, ww)] << " "; - } - std::cout << std::endl; - } - } - } - } - + bool printing = in_dim[0]<=MAX_PRINT && in_dim[1]<=MAX_PRINT; + if(in_dim.size() > 2) printing &= in_dim[2]<=MAX_PRINT; int b_n = out.desc.GetLengths()[0]; int k_n = out.desc.GetLengths()[chan_dim_offset]; std::array out_spatial_len{}; @@ -262,6 +237,7 @@ struct verify_forward_pooling for(int nn = 0; nn < outlen[0]; ++nn) { for(int cc = 0; cc < outlen[1]; ++cc) { + std::cout << "n=" << nn << " c=" << cc <>(); handle.ReadTo(out.data.data(), out_dev, out.data.size() * sizeof(T)); - handle.ReadTo(junk.data.data(), junk_dev, junk.data.size() * sizeof(T)); - if(false) + bool printing = input.desc.GetLengths()[2] <= MAX_PRINT && input.desc.GetLengths()[3] <= MAX_PRINT; + if(input.desc.GetLengths().size() > 4) printing &= input.desc.GetLengths()[4] <= MAX_PRINT; + if(printing) { + if(GPU_4COL){ + std::cout << "GPU (8-cols): " << std::endl; + for(int idx = 0; idx < GPU_JUNK; ++idx) { + std::cout << std::setw(11) << std::setprecision(5) << out.data[idx] << " "; + if((idx % 8) == 7) std::cout < 0){ + handle.ReadTo(junk.data.data(), junk_dev, junk.data.size() * sizeof(T)); + std::cout << "GPU junk: " << std::endl; + for(int idx = 0; idx < GPU_JUNK; ++idx) { + std::cout << std::setw(11) << std::setprecision(5) << junk.data[idx] << " "; + if((idx % 4) == 3) std::cout <( // - // {{"miopenIndexUint8", - // "miopenIndexUint16", - // "miopenIndexUint32", - // "miopenIndexUint64"}, // - // {"miopenIndexUint8", "miopenIndexUint32"}, // - // {"miopenIndexUint32"}} // + // generate_data({"miopenIndexUint32",} // TEMPCODE RJS + generate_multi_data( // + {{"miopenIndexUint8", + "miopenIndexUint16", + "miopenIndexUint32", + "miopenIndexUint64"}, // + {"miopenIndexUint8", "miopenIndexUint32"}, // + {"miopenIndexUint32"}} // )); add(mode, "mode", @@ -648,7 +659,8 @@ struct pooling_driver : test_driver for(auto& v : input.data) v = gen_value(); // TEMPCODE RJS print input tensor - bool printing = true; // in_dim[0]==8 && in_dim[1]==8; + bool printing = in_shape[0]<=MAX_PRINT && in_shape[1]<=MAX_PRINT; + if (in_shape.size() > 2) printing &= in_shape[2]<=MAX_PRINT; if(printing) { auto inlen = input.desc.GetLengths(); From ea745efd340e083fee8e9aef65f500567a3c81e3 Mon Sep 17 00:00:00 2001 From: "Randy J. Spaulding" Date: Thu, 29 Aug 2024 22:38:35 -0700 Subject: [PATCH 05/10] fwd works! --- src/include/miopen/pooling.hpp | 2 + src/include/miopen/tensor.hpp | 25 +- .../MIOpenPoolingForwardNDNhwcNaive.cpp | 142 +- src/kernels/pooling_functions.h | 2 +- src/pooling.cpp | 5 + src/solver/pooling/forwardNdNhwcNaive.cpp | 4 +- test/gtest/poolingFwdNdNaive.cpp | 5 +- test/gtest/pooling_testing.hpp | 1149 +++++++++++++++++ test/pooling2d.hpp | 45 +- test/pooling_common.hpp | 22 +- 10 files changed, 1268 insertions(+), 133 deletions(-) create mode 100644 test/gtest/pooling_testing.hpp diff --git a/src/include/miopen/pooling.hpp b/src/include/miopen/pooling.hpp index 2670c3b09c..9cf2bae82f 100644 --- a/src/include/miopen/pooling.hpp +++ b/src/include/miopen/pooling.hpp @@ -113,6 +113,8 @@ struct MIOPEN_EXPORT PoolingDescriptor : miopenPoolingDescriptor void SetWorkspaceIndexMode(miopenPoolingWorkspaceIndexMode_t workspace_index); miopenPoolingMode_t GetMode() const; + + bool ModeIsAveraging() const; miopenPaddingMode_t GetPaddingMode() const; diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp index 30a197e503..ec368dbb83 100644 --- a/src/include/miopen/tensor.hpp +++ b/src/include/miopen/tensor.hpp @@ -293,6 +293,18 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor } } + static bool IsDefaultLayout(miopenTensorLayout_t layout, unsigned spatial_dims = 2) + { + switch (spatial_dims) + { + case 2: + case 3: + return layout == GetDefaultLayout(); + default: + MIOPEN_THROW(miopenStatusBadParm, "Spatial dimension count must be 2 or 3."); + } + } + friend MIOPEN_INTERNALS_EXPORT std::ostream& operator<<(std::ostream& stream, const TensorDescriptor& t); @@ -311,19 +323,6 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor } }; - static bool IsDefaultLayout(miopenTensorLayout_t layout, unsigned spatial_dims = 2) - { - switch (spatial_dims) - { - case 2: - case 3: - return layout == GetDefaultLayout(); - default: - MIOPEN_THROW(miopenStatusBadParm, "Spatial dimension count must be 2 or 3."); - } - } - - private: TensorDescriptor(miopenDataType_t t, miopenTensorLayout_t layout_in, diff --git a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp index a6aa93aade..b194b6d44c 100644 --- a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp +++ b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp @@ -53,6 +53,8 @@ #include "pooling_functions.h" #include "poolingNdNhwcArgs.hpp" +// TODO: add ability to decode network string into pooling descriptor or similar for targeted debugging + #if(MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE) || (MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE) #define AVERAGE_OPS 1 #else @@ -87,45 +89,13 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, ARG_UNUSED_FOR_AVERAGE int save_index, ARG_UNUSED_FOR_AVERAGE int index_mode, poolingNdNhwcArgs args - // UU uint32_t filter_d, - // UU uint32_t filter_h, - // UU uint32_t filter_w, - // UU uint32_t filter_d_stride, - // UU uint32_t filter_h_stride, - // UU uint32_t filter_w_stride, - // UU uint32_t filter_d_pad, - // UU uint32_t filter_h_pad, - // UU uint32_t filter_w_pad, - // uint32_t all_n, - // UU uint32_t all_c, // TEMPCODE RJS - // UU uint32_t bot_d, - // UU uint32_t bot_h, - // UU uint32_t bot_w, - // UU BIGONE bot_n_stride, - // UU uint32_t bot_c_stride, - // UU BIGONE bot_d_stride, - // UU uint32_t bot_h_stride, - // UU uint32_t bot_w_stride, - // uint32_t top_d, - // uint32_t top_h, - // uint32_t top_w, - // BIGONE top_n_stride, - // uint32_t top_c_stride, - // BIGONE top_d_stride, - // uint32_t top_h_stride, - // uint32_t top_w_stride, - // UU ARG_UNUSED_FOR_AVERAGE BIGONE mask_n_stride, - // UU ARG_UNUSED_FOR_AVERAGE BIGONE mask_c_stride, - // UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_d_stride, - // UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_h_stride, - // UU ARG_UNUSED_FOR_AVERAGE uint32_t mask_w_stride ) { const uint32_t nn = blockIdx.x / args.top_d; // N=slow index + const auto c_base = (blockIdx.z / args.top_w) * blockDim.x; const uint32_t td = blockIdx.x % args.top_d; // top D=fast index const uint32_t th = blockIdx.y; // top H - const uint32_t tw = blockIdx.z % args.all_c; // top W=fast index - const auto c_base = (blockIdx.z / args.all_c) * blockDim.x; + const uint32_t tw = blockIdx.z % args.top_w; // top W=fast index if(blockDim.x > args.all_c) { // // TODO: h, w, or both may be encoded into threadIdx @@ -210,25 +180,17 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, log_ptr[idx++] = args.mask_w_stride; #endif log_ptr[idx++] = -7; - while(idx < 64) log_ptr[idx++] = (_FLOAT)0; } - // if(nn >= args.all_n) - // return; - - // if(td >= args.top_d) - // return; - - // if(th >= args.top_h) - // return; - - // if(tw >= args.top_w) - // return; + if(nn >= args.all_n) return; + if(td >= args.top_d) return; + if(th >= args.top_h) return; + if(tw >= args.top_w) return; if(true) { // TEMPCODE RJS const auto int_dstart = static_cast(td * args.filter_d_stride) - static_cast(args.filter_d_pad); - const auto dend = static_cast(min(int_dstart + static_cast(args.filter_d), static_cast(args.bot_d))); - const auto dstart = static_cast(max(int_dstart, 0)); + const auto dend = static_cast(min(int_dstart + static_cast(args.filter_d), static_cast(args.bot_d))); + const auto dstart = static_cast(max(int_dstart, 0)); const auto int_hstart = static_cast(th * args.filter_h_stride) - static_cast(args.filter_h_pad); const auto hend = static_cast(min(int_hstart + static_cast(args.filter_h), static_cast(args.bot_h))); @@ -239,7 +201,7 @@ if(true) { // TEMPCODE RJS const auto wstart = static_cast(max(int_wstart, 0)); uint32_t cc = c_base + threadIdx.x; - // if(cc > args.all_c) return; + if(cc >= args.all_c) return; size_t top_index = nn * args.top_n_stride // TEMPCODE RJS @@ -247,9 +209,23 @@ if(true) { // TEMPCODE RJS + (size_t)(td * args.top_d_stride) // + (size_t)(th * args.top_h_stride) // + (size_t)(tw * args.top_w_stride); + size_t junk_idx = 64 + 4 * th; if(true) { - top_ptr[top_index] = (TO)-1.11111; - junk_ptr[64 + top_index] = top_index; + if(nn == 0 && cc == 0 && td == 0 && tw < 8 && th == 0) + { + size_t bot_ncd = static_cast(nn * args.bot_n_stride + cc * args.bot_c_stride + dstart * args.bot_d_stride); + size_t bot_ncdh = bot_ncd + hstart * args.bot_h_stride; + size_t bot_index = bot_ncdh + wstart * args.bot_w_stride; + + junk_ptr[junk_idx++] = top_index; + junk_ptr[junk_idx++] = bot_index; + junk_ptr[junk_idx++] = dstart; + junk_ptr[junk_idx++] = dend; + junk_ptr[junk_idx++] = hstart; + junk_ptr[junk_idx++] = hend; + junk_ptr[junk_idx++] = wstart; + junk_ptr[junk_idx++] = wend; + } } #if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE @@ -267,18 +243,18 @@ if(true) { uint32_t d_save = 0; uint32_t h_save = 0; uint32_t w_save = 0; + uint32_t saved_index = 0; #endif + + size_t bot_ncd = static_cast(nn * args.bot_n_stride + cc * args.bot_c_stride + dstart * args.bot_d_stride); for(size_t bd = dstart; bd < dend; ++bd) { + size_t bot_ncdh = bot_ncd + hstart * args.bot_h_stride; for(uint32_t bh = hstart; bh < hend; ++bh) { + size_t bot_index = bot_ncdh + wstart * args.bot_w_stride; for(uint32_t bw = wstart; bw < wend; ++bw) { - const size_t bot_index = nn * args.bot_n_stride + // - cc * args.bot_c_stride + // - bd * args.bot_d_stride + // - static_cast(bh * args.bot_h_stride) + // - static_cast(bw * args.bot_w_stride); #if AVERAGE_OPS res += static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]); #else // MAX @@ -291,20 +267,34 @@ if(true) { d_save = bd; h_save = bh; w_save = bw; + saved_index = bot_index; } } + if(top_index == 1662 || (nn == 0 && cc == 0 && td == 0 && tw == 2 && th == 0)) + { + junk_ptr[junk_idx++] = nn; + junk_ptr[junk_idx++] = cc; + junk_ptr[junk_idx++] = th; + junk_ptr[junk_idx++] = tw; + junk_ptr[junk_idx++] = bot_ptr[bot_index]; + junk_ptr[junk_idx++] = bot_index; + junk_ptr[junk_idx++] = res; + junk_ptr[junk_idx++] = saved_index; + } #endif + bot_index += args.bot_w_stride; } + bot_ncdh += args.bot_h_stride; } + bot_ncd += args.bot_d_stride; } #if AVERAGE_OPS res *= CVT_FP32_2ACCUM(1.f) / static_cast<_FLOAT_ACCUM>(pool_size); #else // MAX -res *= 1.0; // TEMPCODE RJS fix UNUSED if(save_index) { - index_t res_index = 0; + index_t res_index = 5150; // / Preventing overflow during computation of res_index: // / If Index is shorter than uint, then let's perform computation in 32-bit @@ -316,10 +306,14 @@ res *= 1.0; // TEMPCODE RJS fix UNUSED if(found) { - if(index_mode == 1) - res_index = (index_t)(d_save * args.bot_h * args.bot_w // - + h_save * args.bot_w // - + w_save); + if(index_mode == 1) // TEMPCODE RJS + res_index = saved_index; + // res_index = (index_t)( // + // nn * args.bot_n_stride // + // + cc * args.bot_c_stride // + // + d_save * args.bot_d_stride // + // + h_save * args.bot_h_stride // + // + w_save * args.bot_w_stride); else res_index = (index_t)( // ((d_save - td * args.filter_d_stride + args.filter_d_pad) * args.filter_h * args.filter_w) // @@ -331,21 +325,13 @@ res *= 1.0; // TEMPCODE RJS fix UNUSED const size_t mask_index = nn * args.mask_n_stride // + cc * args.mask_c_stride // + (size_t)(td * args.mask_d_stride) // - + (size_t)(tw * args.mask_h_stride) // - + (size_t)(th * args.mask_w_stride); + + (size_t)(th * args.mask_h_stride) // + + (size_t)(tw * args.mask_w_stride); mask_ptr[mask_index] = res_index; } #endif - // top_index = nn * args.top_n_stride // - // + cc * args.top_c_stride // - // + (size_t)(td * args.top_d_stride) // - // + (size_t)(th * args.top_h_stride) // - // + (size_t)(tw * args.top_w_stride); - - top_ptr[top_index] = (_FLOAT)res; // TEMPCODE RJS - top_ptr[top_index] = (_FLOAT)1.11111; // TEMPCODE RJS - cc += blockDim.x; + top_ptr[top_index] = (_FLOAT)res; } // TEMPCODE } @@ -367,15 +353,5 @@ poolingNdNhwcArgs args save_index, index_mode, args - // args.filter_d, args.filter_h, args.filter_w, - // args.filter_d_stride, args.filter_h_stride, args.filter_w_stride, - // args.filter_d_pad, args.filter_h_pad, args.filter_w_pad, - // args.all_n, - // args.all_c, - // args.bot_d, args.bot_h, args.bot_w, - // args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride, - // args.top_d, args.top_h, args.top_w, - // args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride, args.top_w_stride, - // args.mask_n_stride, args.mask_c_stride, args.mask_d_stride, args.mask_h_stride, args.mask_w_stride ); } diff --git a/src/kernels/pooling_functions.h b/src/kernels/pooling_functions.h index c4821b10fb..6f53a2daab 100644 --- a/src/kernels/pooling_functions.h +++ b/src/kernels/pooling_functions.h @@ -40,7 +40,7 @@ typedef MLO_POOLING_INDEX_TYPE index_t; #define MLO_POOLING_OP_AVE_INCLUSIVE 3 #ifndef MLO_POOLING_OP_ID -#define MLO_POOLING_OP_ID 0 +#define MLO_POOLING_OP_ID 1 #endif #endif // GUARD_POOLING_FUNCTIONS_H diff --git a/src/pooling.cpp b/src/pooling.cpp index 000e297f3c..e0d9112bd2 100644 --- a/src/pooling.cpp +++ b/src/pooling.cpp @@ -96,6 +96,11 @@ miopenPoolingWorkspaceIndexMode_t PoolingDescriptor::GetWorkspaceIndexMode() con miopenPoolingMode_t PoolingDescriptor::GetMode() const { return mode; } +bool PoolingDescriptor::ModeIsAveraging() const +{ + return mode == miopenPoolingAverage || mode == miopenPoolingAverageInclusive; +} + miopenPaddingMode_t PoolingDescriptor::GetPaddingMode() const { return (pmode); } const std::vector& PoolingDescriptor::GetLengths() const { return lens; } diff --git a/src/solver/pooling/forwardNdNhwcNaive.cpp b/src/solver/pooling/forwardNdNhwcNaive.cpp index 77b5a6ea2d..62b65249df 100644 --- a/src/solver/pooling/forwardNdNhwcNaive.cpp +++ b/src/solver/pooling/forwardNdNhwcNaive.cpp @@ -137,8 +137,8 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context, // This also deduces 3D (DHW) parameters from 2D (HW) descriptor. uint32_t idx = 0; args.filter_d = is2d ? 1 : lengths[idx++]; - args.filter_h = lengths[idx++]; - args.filter_w = lengths[idx++]; + args.filter_h = lengths[idx++]; + args.filter_w = lengths[idx++]; idx = 0; args.filter_d_stride = is2d ? (strides[0]) : strides[idx++]; diff --git a/test/gtest/poolingFwdNdNaive.cpp b/test/gtest/poolingFwdNdNaive.cpp index abf9cb0509..3a27b3c786 100644 --- a/test/gtest/poolingFwdNdNaive.cpp +++ b/test/gtest/poolingFwdNdNaive.cpp @@ -31,6 +31,7 @@ #include "get_handle.hpp" #include "test_env.hpp" +#include "pooling_testing.hpp" #include "pooling2d.hpp" #include "tensor_holder.hpp" @@ -140,7 +141,7 @@ std::vector GetTestCases(const std::string precision) const std::vector test_cases = { // clang-format off - {"test_pooling2d " + precision + " --all --dataset 1 --limit 0 " + flag_arg} // TEMPCODE RJS DATASET + {"test_pooling2d " + precision + " --all --dataset 0 --limit 0 " + flag_arg} // TEMPCODE RJS DATASET // clang-format on }; @@ -237,7 +238,7 @@ void Run2dDriver(miopenDataType_t prec) } INSTANTIATE_TEST_SUITE_P(Float, PoolingFwdFloat, testing::Values(GetTestCases("--float"))); - +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwdHalf); INSTANTIATE_TEST_SUITE_P(Half, PoolingFwdHalf, testing::Values(GetTestCases("--half"))); #endif diff --git a/test/gtest/pooling_testing.hpp b/test/gtest/pooling_testing.hpp new file mode 100644 index 0000000000..f114be8764 --- /dev/null +++ b/test/gtest/pooling_testing.hpp @@ -0,0 +1,1149 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +// TODO: I've hijacked pooling_common here. This is a temporary workaround until +// all pooling tests have been converted to gtest. This work has not been planned yet. +#ifndef GUARD_MIOPEN_TEST_POOLING_COMMON_HPP +#define GUARD_MIOPEN_TEST_POOLING_COMMON_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// #include "network_data.hpp" +#include "driver.hpp" +#include "get_handle.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include "cpu_conv.hpp" +#include "workspace.hpp" + +#define TEST_PADDING_MODE 0 + +namespace { +int num_all_case = 0; +int num_uint16_case = 0; +int num_uint32_case = 0; +int num_uint32_case_imgidx = 0; +int num_uint64_case = 0; +int num_uint64_case_imgidx = 0; +constexpr int max_typed_cases = 5; +constexpr int MAX_ALL_CASES = 0; + +constexpr int RAND_INTEGER_MAX = 12000; +constexpr int RAND_INTEGER_MIN = -8800; + +template +auto gen_value = + [](auto... is) { return static_cast(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX)) / 100; }; +} + +static inline void print(std::ostringstream& oss, const miopen::PoolingDescriptor& filter, bool is_default_layout) +{ + oss << "Pooling: "; + if(filter.GetMode() == miopenPoolingAverage) + oss << "Average"; + else if(filter.GetMode() == miopenPoolingAverageInclusive) + oss << "AverageInclusive"; + else + oss << "Max"; + oss << std::endl; + oss << "Layout: " << (is_default_layout ? "default" : "transposed") << std::endl; // TEMPCODE RJS + oss << "Lengths: "; + miopen::LogRange(oss, filter.GetLengths(), ", ") << std::endl; + oss << "Pads: "; + miopen::LogRange(oss, filter.GetPads(), ", ") << std::endl; + oss << "Strides: "; + miopen::LogRange(oss, filter.GetStrides(), ", ") << std::endl; +} + +template +tensor get_output_tensor(const miopen::PoolingDescriptor& filter, const tensor& input) +{ + return tensor{filter.GetForwardOutputTensor(input.desc)}; +} + +template +tensor get_big_output_tensor(const miopen::PoolingDescriptor& filter, const tensor& input) +{ + auto desc = filter.GetForwardOutputTensor(input.desc); + auto lens = desc.GetLengths(); + if(desc.GetNumBytes() > 1000) + lens[0] *= 2; + else + lens[0] *= 10; + auto big = miopen::TensorDescriptor{desc.GetType(), input.desc.GetLayout_t(), lens, desc.GetStrides()}; + return tensor{big}; +} + +template +struct pooling_operators +{ + miopen::PoolingDescriptor filter; + pooling_operators(miopen::PoolingDescriptor f) : filter(f) {} + + double initialize() const + { + if(filter.GetMode() == miopenPoolingMax) + return std::numeric_limits::lowest(); + else + return 0.0; + } + + double operator()(double x, double y) const + { + if(filter.GetMode() == miopenPoolingMax) + { + double m = std::max(x, y); + return (m); + } + else + { + return x + y; + } + } + + double finalize(double x, double y) + { + if(filter.GetMode() == miopenPoolingMax) + return (x); + else + return x / y; + } +}; + +#include +#include +#define MAX_PRINTING 128 // TEMPCODE RJS +#define MAX_PRINT 12 // TEMPCODE RJS +#define MAX_NCD 2 +#define GPU_JUNK 160 +#define PRINT_CPU_IN 0 +#define PRINT_GPU_OUT 0 +#define GPU_4COL false +#define PIPE std::cout + +namespace { + template + std::vector ClampNCS(const std::vector& lens, std::vector& strides) + { + std::vector out; + bool is2d = lens.size() == 4; + constexpr T max_ncd = MAX_NCD; + constexpr T max_print = MAX_PRINT; + int idx = 0; + + out.push_back(std::min(lens[idx++], max_ncd)); + out.push_back(std::min(lens[idx++], max_ncd)); + if(is2d) + out.push_back(1); + else + out.push_back(std::min(lens[idx++], max_ncd)); + + out.push_back(std::min(lens[idx++], max_print)); + out.push_back(std::min(lens[idx++], max_print)); + + if(is2d) + strides.insert(strides.begin() + 2, {strides[2]}); + + return out; + } + + template + std::ostream& printClamped(std::ostream& oss, const std::vector& out, const std::vector& outlen, const std::vector& outstr, int mode = 0) + { + auto ostr = outstr; + auto olen = ClampNCS(outlen, ostr); + + for(int nn = 0; nn < olen[0]; ++nn) { + for(int cc = 0; cc < olen[1]; ++cc) { + for(int dd = 0; dd < olen[2]; ++dd) { + oss << "n= " << nn << " c= " << cc << " d= " << dd << std::endl; + for(int hh = 0; hh < olen[3]; ++hh) { + for(int ww = 0; ww < olen[4]; ++ww) { +switch(mode) { + case 0: + oss << std::setw(11) << std::setprecision(5) << out[nn * ostr[0] + cc * ostr[1] + dd * ostr[2] + hh * ostr[3] + ww * ostr[4]] << " "; + break; + case 1: + oss << std::setw(11) << std::setprecision(5) << nn * ostr[0] + cc * ostr[1] + dd * ostr[2] + hh * ostr[3] + ww * ostr[4] << " "; + break; +} + } + oss << std::endl; + } + } + } + } + return oss; + } +} + + template + std::ostream& printClamped(std::ostream& oss, const tensor& out, const std::vector& outlen, const std::vector& outstr, int mode = 0) + { + return printClamped(oss, out.data, outlen, outstr, mode); + } + +template +struct verify_forward_pooling +{ + template + tensor + cpu(const tensor& input, const miopen::PoolingDescriptor& filter, std::vector&) const + { + auto out = get_output_tensor(filter, input); + + std::array in_dim{}; + std::copy_n(input.desc.GetLengths().begin() + 2, SptDim, in_dim.begin()); + std::array strides{}; + std::copy_n(filter.GetStrides().begin(), SptDim, strides.begin()); + std::array pads{}; + std::copy_n(filter.GetPads().begin(), SptDim, pads.begin()); + std::array kers{}; + std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin()); + auto pooler = pooling_operators{filter}; + + // TEMPCODE RJS print input tensor + bool printing = in_dim[0] <= MAX_PRINTING && in_dim[1] <= MAX_PRINTING; + if(in_dim.size() > 2) printing &= in_dim[2] <= MAX_PRINTING; + + int b_n = out.desc.GetLengths()[0]; + int k_n = out.desc.GetLengths()[1]; + std::array out_spatial_len{}; + std::copy_n(out.desc.GetLengths().begin() + 2, SptDim, out_spatial_len.begin()); + + auto par_ford_out = + miopen::unpacker(miopen::prepender(par_ford, b_n, k_n))(out_spatial_len); + + par_ford_out([&](int o, int w, auto... out_spatial_id_pack) { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + std::array start_idx{}; + std::array win_sz{}; + for(int i = 0; i < SptDim; ++i) + { + start_idx[i] = out_spatial_id[i] * strides[i] - pads[i]; + int end_idx = start_idx[i] + kers[i]; + end_idx = std::min(end_idx, in_dim[i]); + start_idx[i] = std::max(start_idx[i], 0); + win_sz[i] = end_idx - start_idx[i]; + win_sz[i] = std::max(win_sz[i], 1); + } + + int pool_size = + filter.GetMode() == miopenPoolingAverageInclusive + ? std::accumulate(kers.begin(), kers.end(), 1, std::multiplies()) + : std::accumulate(win_sz.begin(), win_sz.end(), 1, std::multiplies()); + + double acc = pooler.initialize(); + miopen::unpacker(ford)(win_sz)([&](auto... in_spatial_id_pack) { + auto in_spatial_id = make_array(in_spatial_id_pack...); + std::array idx{}; + idx[0] = o; + idx[1] = w; + + bool in_cmp_idx = true; + for(int i = 0; i < SptDim; ++i) + { + idx[i + 2] = start_idx[i] + in_spatial_id[i]; + in_cmp_idx &= (in_dim[i] > idx[i + 2]); + } + + if(in_cmp_idx) + { + acc = pooler(acc, input(idx)); + } + }); + out(o, w, out_spatial_id_pack...) = T(pooler.finalize(acc, pool_size)); + }); + + if(printing) + { + PIPE << "CPU out: "; + auto outlen = out.desc.GetLengths(); + for(auto dim : outlen) PIPE << std::setw(4) << dim << " "; PIPE << " | "; + auto outstr = out.desc.GetStrides(); + for(auto dim : outstr) PIPE << std::setw(4) << dim << " "; PIPE << " | "; + for(auto str : filter.GetLengths()) PIPE << std::setw(4) << str << " "; PIPE << " | "; + for(auto str : filter.GetStrides()) PIPE << std::setw(4) << str << " "; PIPE << " | "; + for(auto str : filter.GetPads()) PIPE << std::setw(4) << str << " "; + PIPE << std::endl; + + printClamped(PIPE, out, outlen, outstr); + // printClamped(PIPE, out, outlen, outstr, 1); + } // print output tensor + if(PRINT_CPU_IN && printing) + { + auto inlen = input.desc.GetLengths(); + auto instr = input.desc.GetStrides(); + PIPE << "CPU in : m" << filter.GetMode() << " t" << input.desc.GetType() << " | "; + for(auto dim : inlen) PIPE << std::setw(4) << dim << " "; PIPE << " | "; + for(auto str : instr) PIPE << std::setw(4) << str << " "; PIPE << " | "; + for(auto str : filter.GetLengths()) PIPE << std::setw(4) << str << " "; PIPE << " | "; + for(auto str : filter.GetStrides()) PIPE << std::setw(4) << str << " "; PIPE << " | "; + for(auto str : filter.GetPads()) PIPE << std::setw(4) << str << " "; + PIPE << std::endl; + + printClamped(PIPE, input, inlen, instr); + // printClamped(PIPE, input, inlen, instr, 1); + } + + return out; + } + + template + tensor + cpu_naive(const tensor& input, const miopen::PoolingDescriptor& filter, std::vector&) const + { + auto out = get_output_tensor(filter, input); + + std::array in_dim{}; + std::copy_n(input.desc.GetLengths().begin() + 2, SptDim, in_dim.begin()); + std::array strides{}; + std::copy_n(filter.GetStrides().begin(), SptDim, strides.begin()); + std::array pads{}; + std::copy_n(filter.GetPads().begin(), SptDim, pads.begin()); + std::array kers{}; + std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin()); + auto pooler = pooling_operators{filter}; + + // TEMPCODE RJS print input tensor + bool printing = in_dim[0] <= MAX_PRINTING && in_dim[1] <= MAX_PRINTING; + if(in_dim.size() > 2) printing &= in_dim[2] <= MAX_PRINTING; + + auto flens = filter.GetLengths(); + auto fstrs = filter.GetStrides(); + auto ilens = input.desc.GetLengths(); + auto istrs = input.desc.GetStrides(); + auto olens = out.desc.GetLengths(); + auto ostrs = out.desc.GetStrides(); + + for(int n = 0; n < olens[0]; ++n) + { + for(int c = 0; c < olens[1]; ++c) + { + for(int h = 0; h < olens[2]; ++h) + { + for(int w = 0; w < olens[3]; ++w) + { + int hstart = h * fstrs[0]; + int hend = hstart + flens[0]; + hend = min(hend, ilens[2]); + + int wstart = w * fstrs[1]; + int wend = wstart + flens[1]; + wend = min(wend, ilens[3]); + double res = -10000.0; + for(int fh = hstart; fh < hend; ++fh) + { + for(int fw = wstart; fw < wend; ++fw) + { + double val = input[n * istrs[0] + c * istrs[1] + fh * istrs[2] + fw * istrs[3]]; + if(val > res) + res = val; + } + } + out[n * ostrs[0] + c * ostrs[1] + h * ostrs[2] + w * ostrs[3]] = res; + } + } + } + } + + if(printing) + { + PIPE << "CPU nve: "; + for(auto dim : olens) PIPE << std::setw(4) << dim << " "; PIPE << " | "; + for(auto dim : ostrs) PIPE << std::setw(4) << dim << " "; PIPE << " | "; + for(auto str : filter.GetLengths()) PIPE << std::setw(4) << str << " "; PIPE << " | "; + for(auto str : filter.GetStrides()) PIPE << std::setw(4) << str << " "; PIPE<< " | "; + for(auto str : filter.GetPads()) PIPE << std::setw(4) << str << " "; + PIPE << std::endl; + + printClamped(PIPE, out, olens, ostrs); + } + + return out; + } + + template + tensor gpu(const tensor& input, + const miopen::PoolingDescriptor& filter, + std::vector& indices) const + { + auto&& handle = get_handle(); + auto out = get_output_tensor(filter, input); + auto junk = get_big_output_tensor(filter, input); // TEMPCODE RJS + + indices.resize(out.data.size(), 0); + + auto in_dev = handle.Write(input.data); + auto out_dev = handle.Create(out.GetSize()); + auto junk_dev = handle.Create(junk.GetSize()); // + Workspace wspace{}; + wspace.Write(indices); + + float alpha = 1, beta = 0; + filter.Forward(handle, + &alpha, + input.desc, + in_dev.get(), + &beta, + out.desc, + out_dev.get(), + true, + wspace.ptr(), + wspace.size(), + junk_dev.get()); // TEMPCODE RJS + + handle.ReadTo(out.data.data(), out_dev, out.GetDataByteSize()); + wspace.ReadTo(indices); + bool printing = input.desc.GetLengths()[2] <= MAX_PRINTING && input.desc.GetLengths()[3] <= MAX_PRINTING; + if(input.desc.GetLengths().size() > 4) printing &= input.desc.GetLengths()[4] <= MAX_PRINTING; +std::cout << (printing ? "printing output from GPU..." : "skipping GPU print.") << std::endl; + if(PRINT_GPU_OUT && printing) + { +#if GPU_JUNK > 0 + handle.ReadTo(junk.data.data(), junk_dev, junk.GetDataByteSize()); + if(GPU_4COL){ + PIPE<< "GPU (8-cols): " << std::endl; + for(int idx = 0; idx < GPU_JUNK; ++idx) { + PIPE << std::setw(11) << std::setprecision(5) << out.data[idx] << " "; + if((idx % 8) == 7) PIPE < olen = outlen; + if(olen.size() == 4) olen.insert(olen.begin() + 2, 1); + // Mask data is always NCDHW layout + size_t mask_w_stride = 1; + size_t mask_h_stride = mask_w_stride * olen[4]; + size_t mask_d_stride = mask_h_stride * olen[3]; + size_t mask_c_stride = mask_d_stride * olen[2]; + size_t mask_n_stride = mask_c_stride * olen[1]; + std::vector mask_str{mask_n_stride, mask_c_stride, mask_d_stride,mask_h_stride, mask_w_stride}; + std::cout << "GPU indices: " << mask_n_stride << " " << mask_c_stride << " " << mask_d_stride << " " << mask_h_stride << " " << mask_w_stride << std::endl; + printClamped(std::cout, indices, olen, mask_str); + } + + PIPE << "GPU out: "; + for(auto dim : outlen) PIPE << std::setw(4) << dim << " "; PIPE << " | "; + for(auto dim : outstr) PIPE << std::setw(4) << dim << " "; PIPE << " | "; + for(auto str : filter.GetLengths()) PIPE << std::setw(4) << str << " "; PIPE << " | "; + for(auto str : filter.GetStrides()) PIPE << std::setw(4) << str << " "; PIPE<< " | "; + for(auto str : filter.GetPads()) PIPE << std::setw(4) << str << " "; + PIPE << std::endl; + + printClamped(PIPE, out, outlen, outstr); + // printClamped(std::cout, out, outlen, outstr, 1); + } // print output tensor + + return out; + } + + template + void fail(float, + const tensor& input, + const miopen::PoolingDescriptor& filter, + const std::vector&) const + { + std::ostringstream oss; + oss << "Forward "; + print(oss, filter, input.desc.IsDefaultLayout()); + oss << "Input tensor: " << input.desc.ToString() << std::endl; + oss << "Output tensor: " << filter.GetForwardOutputTensor(input.desc).ToString() + << std::endl; + GTEST_FAIL() << oss.str(); + } +}; + +template +struct verify_backward_pooling +{ + template + tensor cpu(const tensor& input, + const tensor& dout, + const tensor& out, + const miopen::PoolingDescriptor& filter, + const std::vector& indices, + bool use_global_index, + bool verify_index) const + { + const int sptl_dim_offset = 2; + const int chan_dim_offset = 1; + + auto dinput = input; + + std::vector din_vec(input.desc.GetElementSpace(), 0.0); + CHECK(dout.desc == out.desc); + std::array in_dim{}; + std::copy_n(input.desc.GetLengths().begin(), SptDim + 2, in_dim.begin()); + std::array in_str{}; + std::copy_n(input.desc.GetStrides().begin(), SptDim + 2, in_str.begin()); + std::array strides{}; + std::copy_n(filter.GetStrides().begin(), SptDim, strides.begin()); + std::array pads{}; + std::copy_n(filter.GetPads().begin(), SptDim, pads.begin()); + std::array kers{}; + std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin()); + auto ford_ker = miopen::unpacker(ford)(kers); + + int out_n = out.desc.GetLengths()[0]; + int out_c = out.desc.GetLengths()[chan_dim_offset]; + std::array out_spatial_len{}; + std::copy_n(out.desc.GetLengths().begin() + sptl_dim_offset, SptDim, out_spatial_len.begin()); + auto ford_out = miopen::unpacker(ford)(out_spatial_len); + + par_ford(out_n, out_c)([&](int o, int w) { + if(filter.GetMode() == miopenPoolingMax) + { + ford_out([&](auto... out_spatial_id_pack) { + auto mx_idx = indices.at(dout.desc.GetIndex(o, w, out_spatial_id_pack...)); + std::array idx{}; + bool in_cmp_idx = true; + if(use_global_index) + { + for(int i = 0; i < SptDim; i++) + { + std::size_t mx_idx_dim = mx_idx; + mx_idx_dim /= std::accumulate(in_dim.begin() + sptl_dim_offset + i + 1, + in_dim.end(), + 1ULL, + std::multiplies()); + mx_idx_dim %= in_dim[i + sptl_dim_offset]; + idx[i + sptl_dim_offset] = mx_idx_dim; + } + } + else + { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + for(int i = 0; i < SptDim; i++) + { + int mx_idx_dim = mx_idx; + mx_idx_dim /= std::accumulate( + kers.begin() + i + 1, kers.end(), 1, std::multiplies()); + mx_idx_dim %= kers[i]; + + mx_idx_dim += (out_spatial_id[i] * strides[i] - pads[i]); + in_cmp_idx &= (in_dim[i + 2] > mx_idx_dim && mx_idx_dim >= 0); + + idx[i + 2] = std::size_t(mx_idx_dim); + } + } + + if(in_cmp_idx) + { + idx[0] = o; + idx[1] = w; + if(verify_index) + { + CHECK( + miopen::float_equal(input(idx), out(o, w, out_spatial_id_pack...))); + } + std::size_t din_idx = 0; + for(int i = 0; i < SptDim + 2; i++) + { + din_idx += idx[i] * in_str[i]; + } + din_vec.at(din_idx) += dout(o, w, out_spatial_id_pack...); + } + }); + } + else + { + ford_out([&](auto... out_spatial_id_pack) { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + std::array start_idx{}; + std::array win_sz{}; + for(int i = 0; i < SptDim; ++i) + { + start_idx[i] = out_spatial_id[i] * strides[i] - pads[i]; + int end_idx = start_idx[i] + kers[i]; + end_idx = std::min(end_idx, in_dim[i + 2]); + win_sz[i] = end_idx - std::max(start_idx[i], 0); + win_sz[i] = std::max(win_sz[i], 1); + } + + int pool_size = + filter.GetMode() == miopenPoolingAverageInclusive + ? std::accumulate(kers.begin(), kers.end(), 1, std::multiplies()) + : std::accumulate( + win_sz.begin(), win_sz.end(), 1, std::multiplies()); + + ford_ker([&](auto... ker_id_pack) { + auto ker_id = make_array(ker_id_pack...); + + bool in_cmp_idx = true; + std::array in_idx{}; + in_idx[0] = o; + in_idx[1] = w; + for(int i = 0; i < SptDim; ++i) + { + in_idx[i + 2] = start_idx[i] + ker_id[i]; + in_cmp_idx &= (in_dim[i + 2] > in_idx[i + 2] && in_idx[i + 2] >= 0); + } + + if(in_cmp_idx) + { + std::size_t din_idx = 0; + for(int i = 0; i < SptDim + 2; i++) + { + din_idx += in_idx[i] * in_str[i]; + } + + din_vec.at(din_idx) += + static_cast(dout(o, w, out_spatial_id_pack...)) / pool_size; + } + }); + }); + } + }); + + miopen::unpacker(ford)(in_dim)([&](auto... in_id_pack) { + auto in_id = make_array(in_id_pack...); + std::size_t din_idx = 0; + for(int i = 0; i < SptDim + 2; i++) + { + din_idx += in_id[i] * in_str[i]; + } + dinput(in_id_pack...) = din_vec.at(din_idx); + }); + return dinput; + } + + template + tensor gpu(const tensor& input, + const tensor& dout, + const tensor& out, + const miopen::PoolingDescriptor& filter, + const std::vector& indices, + bool, + bool) const + { + auto&& handle = get_handle(); + auto dinput = input; + + auto in_dev = handle.Write(input.data); + auto dout_dev = handle.Write(dout.data); + auto out_dev = handle.Write(out.data); + auto din_dev = handle.Create(dinput.data.size()); + + Workspace wspace{}; + wspace.Write(indices); + + float alpha = 1, beta = 0; + filter.Backward(handle, + &alpha, + // y + out.desc, + out_dev.get(), + // dy + dout.desc, + dout_dev.get(), + // x + input.desc, + in_dev.get(), + &beta, + // dx + dinput.desc, + din_dev.get(), + wspace.ptr()); + + dinput.data = handle.Read(din_dev, dinput.data.size()); + return dinput; + } + + template + void fail(float, + const tensor& input, + const tensor&, + const tensor& out, + const miopen::PoolingDescriptor& filter, + const std::vector&, + bool, + bool) const + { + std::cout << "Backward "; + print(filter, input.desc.IsDefaultLayout()); + std::cout << "Input tensor: " << input.desc.ToString() << std::endl; + std::cout << "Output tensor: " << out.desc.ToString() << std::endl; + } +}; + +template +struct pooling_driver : test_driver +{ + miopen::PoolingDescriptor filter; + std::vector in_shape; + std::vector lens; + std::vector pads; + std::vector strides; + std::string index_type; + std::string mode_str; +#if TEST_PADDING_MODE == 1 + std::string pmode; +#endif + int verify_indices{}; + miopenPoolingWorkspaceIndexMode_t wsidx{}; + miopenTensorLayout_t layout{}; + std::unordered_map index_type_lookup = { + {miopen::ToUpper("miopenIndexUint8"), miopenIndexUint8}, + {miopen::ToUpper("miopenIndexUint16"), miopenIndexUint16}, + {miopen::ToUpper("miopenIndexUint32"), miopenIndexUint32}, + {miopen::ToUpper("miopenIndexUint64"), miopenIndexUint64}, + }; + std::unordered_map mode_lookup = { + {"MAX", miopenPoolingMax}, + {"MIOPENPOOLINGMAX", miopenPoolingMax}, + {"AVERAGE", miopenPoolingAverage}, + {"MIOPENPOOLINGAVERAGE", miopenPoolingAverage}, + {"AVERAGEINCLUSIVE", miopenPoolingAverageInclusive}, + {"MIOPENPOOLINGAVERAGEINCLUSIVE", miopenPoolingAverageInclusive}, + }; +#if TEST_PADDING_MODE == 1 + std::unordered_map pmode_lookup = { + {"DEFAULT", miopenPaddingDefault}, + {"SAME", miopenPaddingSame}, + {"VALID", miopenPaddingValid}, + }; +#endif + pooling_driver() + { + add(index_type, + "index_type", + generate_data({"miopenIndexUint32"} // TEMPCODE RJS + // generate_multi_data( // + // {{"miopenIndexUint8", + // "miopenIndexUint16", + // "miopenIndexUint32", + // "miopenIndexUint64"}, // + // {"miopenIndexUint8", "miopenIndexUint32"}, // + // {"miopenIndexUint32"}} // + )); + add(mode_str, + "mode_str", + generate_data( + {"miopenPoolingMax", "miopenPoolingAverage", "miopenPoolingAverageInclusive"})); // +#if TEST_PADDING_MODE == 1 + add(pmode, "pmode", generate_data({"default", "same", "valid"})); +#endif + add(verify_indices, "verify_indices", generate_data({1})); + } + + template + void run_impl() + { + std::vector indices{}; + auto input = tensor{layout, in_shape}; + for(auto& v : input.data) v = gen_value(); + + // TEMPCODE RJS print input tensor + // bool printing = in_shape[0] <= MAX_PRINTING && in_shape[1] <= MAX_PRINTING; + // if (in_shape.size() > 2) printing &= in_shape[2] <= MAX_PRINTING; + // if(printing) + // { + // auto inlen = input.desc.GetLengths(); + // auto instr = input.desc.GetStrides(); + // std::cout << "CPU GEN : " << input.desc.GetLayout_str() << "(" << inlen.size() << ") | " << input.data.size() << " | " << input.desc.GetElementSpace() << " | "; + // for(auto dim : inlen) std::cout << std::setw(4) << dim; + // std::cout << " | "; + // for(auto str : instr) std::cout << std::setw(4) << str; + // std::cout << " | "; + // for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str; + // std::cout << " | "; + // for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str; + // std::cout << " | "; + // for(auto str : filter.GetPads()) std::cout << std::setw(4) << str; + // std::cout << std::endl; + + // for(int nn = 0; nn < inlen[0]; ++nn) { + // for(int cc = 0; cc < inlen[1]; ++cc) { + // for(int hh = 0; hh < inlen[2]; ++hh) { + // for(int ww = 0; ww < inlen[3]; ++ww) {// nn * instr[0] + cc * instr[1] + hh * instr[2] + ww * instr[3] + // std::cout << std::setw(11) << std::setprecision(5) << input.data[input.desc.GetIndex(nn, cc, hh, ww)] << " "; + // } + // std::cout << std::endl; + // } + // } + // } + // } + + auto out = verify(verify_forward_pooling{}, + input, + filter, + indices); +#ifdef BACKWARD + auto dout = out.first; + dout.generate(tensor_elem_gen_integer{2503}); + verify(verify_backward_pooling{}, // TEMPCODE RJS no backward + input, + dout, + out.first, + filter, + indices, + wsidx != 0, + static_cast(this->verify_indices)); +#endif + } + + void run() + { + std::cout << "\n############ Run # " << std::setw(6) << num_all_case++ << " : "; + if(MAX_ALL_CASES && num_all_case > MAX_ALL_CASES) + { + std::cout << " skipped due to MAX_ALL_CASES=" << MAX_ALL_CASES << " : "; + show_command(); + return; + } + else if(this->dry_run) + { + std::cout << " skipped due to dry_run : "; + show_command(); + return; + } + else + { + show_command(); + } + + int sptl_dim = static_cast(in_shape.size()) - 2; + if(sptl_dim != 2 && sptl_dim != 3) + { + std::cout << "Warning: Config skipped due to invalid dimensions. 'in_shape' must be in NCHW or NCDHW form." + << std::endl; + return; + } + + // To simplify launching, input dimensions to the driver are always default layout. Desire to + // test non-default layouts is communicated exclusively via 'layout'. + // const bool is_default_layout = miopen::TensorDescriptor::IsDefaultLayout(layout); // TEMPCODE RJS + + auto mode = mode_lookup.at(miopen::ToUpper(mode_str)); + if(mode != miopenPoolingMax) return; // TEMPCODE RJS skip all except max, do max only + + auto pad_mode = miopenPaddingDefault; +#if TEST_PADDING_MODE + pad_mode = pmode_lookup.at(miopen::ToUpper(pmode)); +#endif + + auto idx_typ = index_type_lookup.at(miopen::ToUpper(index_type)); + auto idx_sz = sizeof(uint8_t); + const bool skip_many_configs_with_non_int8_index = + (dataset_id == 0) && !full_set; // Otherwise the default dataset takes too much time. + const bool wide_dataset = (dataset_id == 2) && full_set; + + filter = miopen::PoolingDescriptor + { + mode, + pad_mode, + lens, + strides, + pads + }; + + filter.SetIndexType(idx_typ); + filter.SetWorkspaceIndexMode(miopenPoolingWorkspaceIndexMode_t(wsidx)); + bool mask_idx = filter.GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexMask; + + if(mask_idx && sptl_dim == 3 && filter.GetMode() == miopenPoolingMax) + { + std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented " + "yet in 3D max pooling solvers." + << std::endl; + return; + } + + if(mask_idx && sptl_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset) + { + std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented " + "yet in 2D max backward solvers that support wide pooling window." + << std::endl; + return; + } + + if(mask_idx && filter.ModeIsAveraging()) + { + std::cout << "Warning: Config skipped. Workspace index modes are irrelevant for " + "Average pooling. " + "In order to optimize performance of full tests, we " + "skip average pooling configs when (wsidx == 0). " + "Please make sure that dataset includes counterparts with (wsidx == 1)." + << std::endl; + return; + } + + switch(idx_typ) + { + /// The "index is too small" limitation is an approximation + /// of the real limitation, and therefore applied only when + /// the "full test" is ran. See: + /// \ref max_pooling_index_max_restriction + case miopenIndexUint8: { + if(full_set && (sptl_dim == 3 || (mask_idx && sptl_dim == 2)) && + filter.GetMode() == miopenPoolingMax) + { + std::cout << "Warning: Config skipped: uint8 index is too small " + "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) " + "&& filter.GetMode() == miopenPoolingMax" + << std::endl; + return; + } + break; + } + case miopenIndexUint16: { + if(full_set && (sptl_dim == 3 || (!mask_idx && sptl_dim == 2)) && + filter.GetMode() == miopenPoolingMax) + { + std::cout << "Warning: Config skipped: uint16 index is too small " + "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) " + "&& filter.GetMode() == miopenPoolingMax" + << std::endl; + return; + } + if(skip_many_configs_with_non_int8_index) + { + // test_pooling_test --all limit uint16 cases + if(num_uint16_case >= max_typed_cases) + { + std::cout << "Warning: Config skipped for the default dataset to speed " + "up testing (num_uint16_case > 5)" + << std::endl; + return; + } + ++num_uint16_case; + } + idx_sz = sizeof(uint16_t); + break; + } + case miopenIndexUint32: { + if(skip_many_configs_with_non_int8_index) + { + // test_pooling_test --all limit uint32 cases + if(mask_idx) + { + if(num_uint32_case >= max_typed_cases) + { + std::cout << "Warning: Config skipped for the default dataset to speed up " + "testing (wsidx == 0 && num_uint32_case > 5)" + << std::endl; + return; + } + ++num_uint32_case; + } + else + { + if(num_uint32_case_imgidx >= max_typed_cases) + { + std::cout << "Warning: Config skipped for the default dataset to speed up " + "testing (wsidx != 0 && num_uint32_case_imgidx > 5)" + << std::endl; + return; + } + ++num_uint32_case_imgidx; + } + } + idx_sz = sizeof(uint32_t); + break; + } + case miopenIndexUint64: { + if(skip_many_configs_with_non_int8_index) + { + if(mask_idx) + { + if(num_uint64_case >= max_typed_cases) + { + std::cout << "Warning: Config skipped for the default dataset to speed up " + "testing (wsidx == 0) && (num_uint64_case > 5)" + << std::endl; + return; + } + ++num_uint64_case; + } + else + { + if(num_uint64_case_imgidx >= max_typed_cases && sptl_dim == 2) + { + std::cout << "Warning: Config skipped to speed up testing of the " + "default dataset (wsidx != 0) && (num_uint64_case_imgidx > 5 " + "&& sptl_dim == 2)" + << std::endl; + return; + } + ++num_uint64_case_imgidx; + } + } + idx_sz = sizeof(uint64_t); + break; + } + } + + auto input_desc = miopen::TensorDescriptor(this->type, layout, in_shape); + + for(int i = 0; i < sptl_dim; i++) + { + if(lens[i] > (input_desc.GetLengths()[i + 2] + static_cast(2) * pads[i])) + { + std::cout << "Warning: Config skipped becuse it is invalid " + "(lens[i] > (input_desc.GetLengths()[i + 2] + 2 * pads[i]))" + << std::endl; + return; + } + } + + if(full_set) + { + auto output_desc = filter.GetForwardOutputTensor(input_desc); + size_t total_mem = + 3 * input_desc.GetNumBytes() + output_desc.GetNumBytes() + + idx_sz * output_desc.GetElementSize(); // estimate based on backward pass + + size_t device_mem = get_handle().GetGlobalMemorySize(); + if(total_mem >= device_mem) + { + std::cout << "Config skipped because it requires " << total_mem + << " Bytes to write all necessary tensors to GPU. GPU has " << device_mem + << " Bytes of memory." << std::endl; + return; + } + } + + std::vector in_dim(input_desc.GetLengths().begin(), + input_desc.GetLengths().begin() + sptl_dim); + std::vector out_dim(sptl_dim); + std::vector ker_dim(filter.GetLengths().begin(), filter.GetLengths().end()); + +#if TEST_PADDING_MODE == 1 + if(filter.pmode == miopenPaddingSame) + { + if(std::any_of(filter.GetStrides().begin(), filter.GetStrides().end(), [](int i) { + return i == 0; + })) + return; + for(int i = 0; i < sptl_dim; i++) + { + filter.pads[i] = + ((in_dim[i] % filter.GetStrides()[i] == 0) + ? (std::max((ker_dim[i] - filter.GetStrides()[i]), 0)) + : (std::max((ker_dim[i] - (in_dim[i] % filter.GetStrides()[i])), 0))) / + 2; + + out_dim[i] = std::ceil(static_cast(in_dim[i]) / filter.strides[i]); + } + + if(std::any_of(out_dim.begin(), out_dim.end(), [](int i) { return i <= 0; })) + return; + } + else if(filter.pmode == miopenPaddingValid) + { + if(std::any_of(filter.GetStrides().begin(), filter.GetStrides().end(), [](int i) { + return i == 0; + })) + return; + for(int i = 0; i < sptl_dim; i++) + { + filter.pads[i] = 0; + + out_dim[i] = std::ceil(static_cast(in_dim[i] - filter.lens[i] + 1) / + filter.strides[i]); + } + + if(std::any_of(out_dim.begin(), out_dim.end(), [](int i) { return i <= 0; })) + return; + } +#endif + switch(filter.GetIndexType()) + { + case miopenIndexUint8: { + if(sptl_dim == 3) + { + run_impl(); + } + else + { + run_impl(); + } + break; + } + case miopenIndexUint16: { + if(sptl_dim == 3) + { + run_impl(); + } + else + { + run_impl(); + } + break; + } + case miopenIndexUint32: { + if(sptl_dim == 3) + { + run_impl(); + } + else + { + run_impl(); + } + break; + } + case miopenIndexUint64: { + if(sptl_dim == 3) + { + run_impl(); + } + else + { + run_impl(); + } + break; + } + } + } +}; + +#endif diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp index fb1556bc1a..8472b23375 100644 --- a/test/pooling2d.hpp +++ b/test/pooling2d.hpp @@ -24,6 +24,7 @@ * *******************************************************************************/ +#include "gtest/pooling_testing.hpp" #include "pooling_common.hpp" #define WORKAROUND_ISSUE_1670 1 @@ -36,9 +37,11 @@ struct pooling2d_shapes static std::vector get_2d_pooling_input_shapes() { - return {{1, 19, 1024, 2048}, - {10, 3, 32, 32}, + return { + {1, 831, 64, 128}, {5, 32, 8, 8}, + {10, 3, 32, 32}, + {1, 19, 1024, 2048}, {2, 1024, 12, 12}, {4, 3, 231, 231}, {8, 3, 227, 227}, @@ -47,13 +50,13 @@ struct pooling2d_shapes {2, 160, 7, 7}, {1, 192, 256, 512}, {2, 192, 28, 28}, - {1, 832, 64, 128}, {1, 256, 56, 56}, {4, 3, 224, 224}, {2, 64, 112, 112}, {2, 608, 4, 4}, {1, 2048, 11, 11}, - {1, 16, 4096, 4096}}; + {1, 16, 4096, 4096} + }; } // Dataset 1 is intended for testing of asymmetric configs. @@ -88,6 +91,23 @@ struct pooling2d_driver : pooling_driver public: pooling2d_driver() : pooling_driver() { + // clang-format off + this->add(this->pads, "pads", this->template generate_multi_data({ + {{0, 0}, {1, 1}}, // +#if WORKAROUND_ISSUE_1670 + {{0, 0}}, // +#else + {{0, 0}, {0, 1}, {1, 0}, {1, 1}}, // +#endif + {{0, 0}}})); + // clang-format on + this->add(this->layout, "layout", this->generate_data({miopenTensorNHWC})); // , miopenTensorNCHW + this->add(this->wsidx, "wsidx", this->generate_data({miopenPoolingWorkspaceIndexMask, miopenPoolingWorkspaceIndexImage})); + this->add(this->strides, + "strides", + this->template generate_multi_data({{{2, 2}, {1, 1}}, // + {{2, 2}, {2, 1}}, // , {1, 2}, {1, 1} + {{1, 1}}})); #if TEST_GET_INPUT_TENSOR std::set in_dim_set = get_inputs(this->batch_factor); std::vector in_dim_vec(in_dim_set.begin(), in_dim_set.end()); @@ -107,22 +127,5 @@ struct pooling2d_driver : pooling_driver {{{2, 2}, {3, 3}}, // {{2, 2}, {1, 2}, {2, 1}}, // {{35, 35}, {100, 100}, {255, 255}, {410, 400}}})); - this->add(this->strides, - "strides", - this->template generate_multi_data({{{2, 2}, {1, 1}}, // - {{2, 2}, {2, 1}}, // , {1, 2}, {1, 1} - {{1, 1}}})); - // clang-format off - this->add(this->pads, "pads", this->template generate_multi_data({ - {{0, 0}, {1, 1}}, // -#if WORKAROUND_ISSUE_1670 - {{0, 0}}, // -#else - {{0, 0}, {0, 1}, {1, 0}, {1, 1}}, // -#endif - {{0, 0}}})); - // clang-format on - this->add(this->wsidx, "wsidx", this->generate_data({0, 1})); - this->add(this->layout, "layout", this->generate_data({miopenTensorNHWC})); // , miopenTensorNCHW } }; diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp index 8c84180663..956c529533 100644 --- a/test/pooling_common.hpp +++ b/test/pooling_common.hpp @@ -143,7 +143,7 @@ struct pooling_operators #include #define MAX_PRINT 16 // TEMPCODE RJS -#define GPU_JUNK 240 +#define GPU_JUNK 0 #define GPU_4COL false template @@ -349,7 +349,7 @@ struct verify_forward_pooling for(int hh = 0; hh < outlen[2]; ++hh) { for(int ww = 0; ww < outlen[3]; ++ww) { std::cout << std::setw(11) << std::setprecision(5) << out.data[ - nn * outstr[0] + cc * outstr[1] + hh * outstr[2] + ww * outstr[2] + nn * outstr[0] + cc * outstr[1] + hh * outstr[2] + ww * outstr[3] // out.desc.GetIndex(nn, cc, hh, ww) ] << " "; } @@ -632,19 +632,19 @@ struct pooling_driver : test_driver { add(index_type, "index_type", - // generate_data({"miopenIndexUint32",} // TEMPCODE RJS - generate_multi_data( // - {{"miopenIndexUint8", - "miopenIndexUint16", - "miopenIndexUint32", - "miopenIndexUint64"}, // - {"miopenIndexUint8", "miopenIndexUint32"}, // - {"miopenIndexUint32"}} // + generate_data({"miopenIndexUint32"} // TEMPCODE RJS + // generate_multi_data( // + // {{"miopenIndexUint8", + // "miopenIndexUint16", + // "miopenIndexUint32", + // "miopenIndexUint64"}, // + // {"miopenIndexUint8", "miopenIndexUint32"}, // + // {"miopenIndexUint32"}} // )); add(mode, "mode", generate_data( - {"miopenPoolingMax", "miopenPoolingAverage", "miopenPoolingAverageInclusive"})); + {"miopenPoolingMax", "miopenPoolingAverage"})); // , "miopenPoolingAverageInclusive" #if TEST_PADDING_MODE == 1 add(pmode, "pmode", generate_data({"default", "same", "valid"})); #endif From 2acd21486a1305afb3fed740610f04e81125313b Mon Sep 17 00:00:00 2001 From: "Randy J. Spaulding" Date: Thu, 5 Sep 2024 22:17:42 -0700 Subject: [PATCH 06/10] benchmarking --- src/CMakeLists.txt | 6 + .../MIOpenPoolingForwardNDNhwcNaive.cpp | 226 +++++++------- src/ocl/pooling_ocl.cpp | 6 +- src/pooling/problem_description.cpp | 4 +- src/solver/pooling/forward2d.cpp | 23 +- src/solver/pooling/forwardNdNhwcNaive.cpp | 99 +++--- ...ngFwdNdNaive.cpp => poolingFwd2dNaive.cpp} | 114 ++++--- test/gtest/poolingFwd3dNaive.cpp | 283 ++++++++++++++++++ test/gtest/pooling_testing.hpp | 192 ++++++++---- test/pooling2d.hpp | 102 ++++--- test/pooling3d.hpp | 7 +- 11 files changed, 731 insertions(+), 331 deletions(-) rename test/gtest/{poolingFwdNdNaive.cpp => poolingFwd2dNaive.cpp} (66%) create mode 100644 test/gtest/poolingFwd3dNaive.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1bd81f9e87..f536de3b10 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -597,11 +597,17 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN # TEMPCODE RJS set(MIOPEN_KERNELS kernels/MIOpenPoolingForwardNDNhwcNaive.cpp + kernels/MIOpenPooling.cl + kernels/MIOpenPoolingBwd.cl + kernels/MIOpenPoolingBwdND.cl + kernels/MIOpenPoolingForwardNaive.cl + kernels/MIOpenPoolingND.cl ) set(MIOPEN_KERNEL_INCLUDES kernels/bfloat16_dev.hpp kernels/float_types.h + kernels/miopen_cstdint.hpp kernels/pooling_functions.h include/miopen/pooling/poolingNdNhwcArgs.hpp ) diff --git a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp index b194b6d44c..5c197e3b8e 100644 --- a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp +++ b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp @@ -39,9 +39,6 @@ #define CVT_FP32_2ACCUM(x) (x) #endif -#define _FLOAT float -#define _FLOAT_ACCUM _FLOAT - #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS #include #include @@ -71,6 +68,22 @@ #endif #include "float_types.h" +#include "miopen_cstdint.hpp" + +#if MIOPEN_USE_INT8 == 1 + #define _FLOAT char + #if !AVERAGE_OPS + #ifndef FLT_MAX + #define MAX_VAL 127 /* max value */ + #else + #define MAX_VAL FLT_MAX + #endif + #endif +#else + #define _FLOAT float +#endif +#define _FLOAT_ACCUM _FLOAT + #endif // TEMPCODE #if AVERAGE_OPS @@ -91,16 +104,31 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, poolingNdNhwcArgs args ) { - const uint32_t nn = blockIdx.x / args.top_d; // N=slow index - const auto c_base = (blockIdx.z / args.top_w) * blockDim.x; - const uint32_t td = blockIdx.x % args.top_d; // top D=fast index - const uint32_t th = blockIdx.y; // top H - const uint32_t tw = blockIdx.z % args.top_w; // top W=fast index - if(blockDim.x > args.all_c) - { - // // TODO: h, w, or both may be encoded into threadIdx - // if(top_h > 1 && blockDim.y == 1) - } + // naming: caps=count, lowercase=index, _ + const uint32_t nd = blockIdx.x; + const uint32_t h_ = blockIdx.y; + const uint32_t w_c = blockIdx.z; + const uint32_t w_ = w_c % args.top_w; // CAN w=fast index + + const uint32_t C_WH = blockDim.x; + const uint32_t _H = blockDim.y; + const uint32_t _W = blockDim.z; + + const uint32_t c = threadIdx.x; + const uint32_t _h = threadIdx.y; + const uint32_t _w = threadIdx.z; + + const uint32_t nn = nd / args.top_d; // n=slow index + const uint32_t cc = (w_c / args.top_w) * C_WH + c; // c=slow index (lg-C) + const uint32_t td = nd % args.top_d; // top d=fast index + const uint32_t th = h_ * _H + _h; // top h: blockIdx is slow (sm-C) + const uint32_t tw = w_ * _W + _w; // top w: blockIdx is slow (sm-C) + + if(nn >= args.all_n) return; + if(td >= args.top_d) return; + if(th >= args.top_h) return; + if(tw >= args.top_w) return; + if(cc >= args.all_c) return; auto log_ptr = junk_ptr; if(blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) @@ -182,12 +210,6 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, log_ptr[idx++] = -7; } - if(nn >= args.all_n) return; - if(td >= args.top_d) return; - if(th >= args.top_h) return; - if(tw >= args.top_w) return; - -if(true) { // TEMPCODE RJS const auto int_dstart = static_cast(td * args.filter_d_stride) - static_cast(args.filter_d_pad); const auto dend = static_cast(min(int_dstart + static_cast(args.filter_d), static_cast(args.bot_d))); const auto dstart = static_cast(max(int_dstart, 0)); @@ -200,17 +222,15 @@ if(true) { // TEMPCODE RJS const auto wend = static_cast(min(int_wstart + static_cast(args.filter_w), static_cast(args.bot_w))); const auto wstart = static_cast(max(int_wstart, 0)); - uint32_t cc = c_base + threadIdx.x; - if(cc >= args.all_c) return; - size_t top_index = - nn * args.top_n_stride // TEMPCODE RJS - + cc * args.top_c_stride // - + (size_t)(td * args.top_d_stride) // - + (size_t)(th * args.top_h_stride) // - + (size_t)(tw * args.top_w_stride); + nn * args.top_n_stride + // + cc * args.top_c_stride + // + td * args.top_d_stride + // + th * args.top_h_stride + // + tw * args.top_w_stride; + +#if false size_t junk_idx = 64 + 4 * th; -if(true) { if(nn == 0 && cc == 0 && td == 0 && tw < 8 && th == 0) { size_t bot_ncd = static_cast(nn * args.bot_n_stride + cc * args.bot_c_stride + dstart * args.bot_d_stride); @@ -226,113 +246,101 @@ if(true) { junk_ptr[junk_idx++] = wstart; junk_ptr[junk_idx++] = wend; } -} +#endif #if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - pool_size = (pool_size == 0) ? 1 : pool_size; + uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + pool_size = (pool_size == 0) ? 1 : pool_size; #elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE - const uint32_t pool_size = args.filter_d * args.filter_h * args.filter_w; + const uint32_t pool_size = args.filter_d * args.filter_h * args.filter_w; #endif #if AVERAGE_OPS - _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0); + _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0); #else // MAX - _FLOAT_ACCUM res = (_FLOAT_ACCUM)(-MAX_VAL_ACCUM); - bool found = false; // May remain false if bot contains only NaNs/-INFs. - uint32_t d_save = 0; - uint32_t h_save = 0; - uint32_t w_save = 0; - uint32_t saved_index = 0; + _FLOAT_ACCUM res = (_FLOAT_ACCUM)(-MAX_VAL_ACCUM); + bool found = false; // May remain false if bot contains only NaNs/-INFs. + uint32_t d_save = 0; + uint32_t h_save = 0; + uint32_t w_save = 0; + uint32_t saved_index = 0; #endif - size_t bot_ncd = static_cast(nn * args.bot_n_stride + cc * args.bot_c_stride + dstart * args.bot_d_stride); - for(size_t bd = dstart; bd < dend; ++bd) + size_t bot_ncd = static_cast(nn * args.bot_n_stride + cc * args.bot_c_stride + dstart * args.bot_d_stride); + for(size_t bd = dstart; bd < dend; ++bd) + { + size_t bot_ncdh = bot_ncd + hstart * args.bot_h_stride; + for(uint32_t bh = hstart; bh < hend; ++bh) { - size_t bot_ncdh = bot_ncd + hstart * args.bot_h_stride; - for(uint32_t bh = hstart; bh < hend; ++bh) + size_t bot_index = bot_ncdh + wstart * args.bot_w_stride; + for(uint32_t bw = wstart; bw < wend; ++bw) { - size_t bot_index = bot_ncdh + wstart * args.bot_w_stride; - for(uint32_t bw = wstart; bw < wend; ++bw) - { #if AVERAGE_OPS - res += static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]); + res += static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]); #else // MAX - if(static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]) > res) + if(static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]) > res) + { + res = bot_ptr[bot_index]; + if(save_index) { - res = bot_ptr[bot_index]; - if(save_index) - { - found = true; - d_save = bd; - h_save = bh; - w_save = bw; - saved_index = bot_index; - } + found = true; + d_save = bd; + h_save = bh; + w_save = bw; + saved_index = bot_index; } - if(top_index == 1662 || (nn == 0 && cc == 0 && td == 0 && tw == 2 && th == 0)) - { - junk_ptr[junk_idx++] = nn; - junk_ptr[junk_idx++] = cc; - junk_ptr[junk_idx++] = th; - junk_ptr[junk_idx++] = tw; - junk_ptr[junk_idx++] = bot_ptr[bot_index]; - junk_ptr[junk_idx++] = bot_index; - junk_ptr[junk_idx++] = res; - junk_ptr[junk_idx++] = saved_index; - } -#endif - bot_index += args.bot_w_stride; } - bot_ncdh += args.bot_h_stride; +#endif + bot_index += args.bot_w_stride; } - bot_ncd += args.bot_d_stride; + bot_ncdh += args.bot_h_stride; } + bot_ncd += args.bot_d_stride; + } #if AVERAGE_OPS - res *= CVT_FP32_2ACCUM(1.f) / static_cast<_FLOAT_ACCUM>(pool_size); + res /= static_cast<_FLOAT_ACCUM>(pool_size); #else // MAX - if(save_index) - { - index_t res_index = 5150; + if(save_index) + { + index_t res_index = 5150; - // / Preventing overflow during computation of res_index: - // / If Index is shorter than uint, then let's perform computation in 32-bit - // / domain and then convert to narrower Index. That would reduce the probability of - // / overflow. If Index is wider then 32 bits, then it seems like it is better to - // / convert to Index type before multiplication. However this is not actually - // / necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in - // / 32 bits and then convert. + // / Preventing overflow during computation of res_index: + // / If Index is shorter than uint, then let's perform computation in 32-bit + // / domain and then convert to narrower Index. That would reduce the probability of + // / overflow. If Index is wider then 32 bits, then it seems like it is better to + // / convert to Index type before multiplication. However this is not actually + // / necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in + // / 32 bits and then convert. - if(found) - { - if(index_mode == 1) // TEMPCODE RJS - res_index = saved_index; - // res_index = (index_t)( // - // nn * args.bot_n_stride // - // + cc * args.bot_c_stride // - // + d_save * args.bot_d_stride // - // + h_save * args.bot_h_stride // - // + w_save * args.bot_w_stride); - else - res_index = (index_t)( // - ((d_save - td * args.filter_d_stride + args.filter_d_pad) * args.filter_h * args.filter_w) // - + ((h_save - th * args.filter_h_stride + args.filter_h_pad) * args.filter_w) // - + (w_save - tw * args.filter_w_stride + args.filter_w_pad) // - ); - } - - const size_t mask_index = nn * args.mask_n_stride // - + cc * args.mask_c_stride // - + (size_t)(td * args.mask_d_stride) // - + (size_t)(th * args.mask_h_stride) // - + (size_t)(tw * args.mask_w_stride); - mask_ptr[mask_index] = res_index; + if(found) + { + if(index_mode == 1) // TEMPCODE RJS + res_index = saved_index; + // res_index = (index_t)( // + // nn * args.bot_n_stride // + // + cc * args.bot_c_stride // + // + d_save * args.bot_d_stride // + // + h_save * args.bot_h_stride // + // + w_save * args.bot_w_stride); + else + res_index = (index_t)( // + ((d_save - td * args.filter_d_stride + args.filter_d_pad) * args.filter_h * args.filter_w) // + + ((h_save - th * args.filter_h_stride + args.filter_h_pad) * args.filter_w) // + + (w_save - tw * args.filter_w_stride + args.filter_w_pad) // + ); } + + const size_t mask_index = nn * args.mask_n_stride // + + cc * args.mask_c_stride // + + (size_t)(td * args.mask_d_stride) // + + (size_t)(th * args.mask_h_stride) // + + (size_t)(tw * args.mask_w_stride); + mask_ptr[mask_index] = res_index; + } #endif - top_ptr[top_index] = (_FLOAT)res; -} // TEMPCODE + top_ptr[top_index] = (_FLOAT)res; } extern "C" __global__ void mloPoolingForwardNDNhwcNaive( diff --git a/src/ocl/pooling_ocl.cpp b/src/ocl/pooling_ocl.cpp index 1b22cd91b7..9a7e166081 100644 --- a/src/ocl/pooling_ocl.cpp +++ b/src/ocl/pooling_ocl.cpp @@ -95,7 +95,7 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle, auto index_max = get_index_max(GetIndexType()); /// \anchor max_pooling_index_max_restriction - /// For kernel implementation max pooling backward pass, + /// For kernel implementation max pooling forward pass, /// "index_max" means ghost, and thus should not be reached. if(mode == miopenPoolingMax && save_index) { @@ -108,13 +108,13 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle, 1, std::multiplies()))) { - MIOPEN_THROW("Index range not enough for max pooling bwd"); + MIOPEN_THROW("Index range not enough for max pooling fwd"); } if(workSpace == nullptr) { throw std::invalid_argument("workSpace cannot be NULL in Forward Pooling MAX mode when " - "backward pass is requested"); + "forward pass is requested"); } } diff --git a/src/pooling/problem_description.cpp b/src/pooling/problem_description.cpp index 804bc2bf84..7807d5425b 100644 --- a/src/pooling/problem_description.cpp +++ b/src/pooling/problem_description.cpp @@ -83,8 +83,8 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const ss << "_dyd" << get_vect_config(dyDesc.GetLengths()); ss << "_dys" << get_vect_config(dyDesc.GetStrides()); } // TEMPCODE RJS - std::cout << "\n************** xDesc layout: " << xDesc.GetLayout_str() << (xDesc.IsDefaultLayout() ? "" : " (not default)") << " *************************" << std::endl; - ss << "_l" << (xDesc.IsDefaultLayout() ? 0 : 1); + std::cout << "\n************** xDesc layout: " << xDesc.GetLayout_str() << " (" << static_cast(xDesc.GetLayout_t()) << (xDesc.IsDefaultLayout() ? ")" : " not default)") << " *************************" << std::endl; + ss << "_l" << static_cast(xDesc.GetLayout_t()); std::cout << " " << ss.str() << std::endl; return NetworkConfig{ss.str()}; } diff --git a/src/solver/pooling/forward2d.cpp b/src/solver/pooling/forward2d.cpp index f1e1cd6e52..aa8fdb2f6b 100644 --- a/src/solver/pooling/forward2d.cpp +++ b/src/solver/pooling/forward2d.cpp @@ -135,19 +135,26 @@ std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& pro bool PoolingForward2d::IsApplicable(const ExecutionContext& context, const miopen::pooling::ProblemDescription& problem) const { + auto x_type = problem.GetXDesc().GetType(); + auto y_type = problem.GetYDesc().GetType(); + std::vector types {miopenFloat, miopenHalf}; // TEMPCODE RJS fix types , miopenInt8, miopenFloat8 + + auto x_layout = problem.GetXDesc().GetLayout_str(); + auto y_layout = problem.GetYDesc().GetLayout_str(); + std::vector layouts {"NCHW"}; + bool app = - problem.GetDirection() == miopen::pooling::Direction::Forward && - problem.GetXDesc().GetNumDims() == 4 && - problem.GetXDesc().GetType() == problem.GetYDesc().GetType() && - (problem.GetXDesc().GetType() == miopenFloat || - problem.GetXDesc().GetType() == miopenHalf) && - problem.GetXDesc().GetLayout("NCHW") == "NCHW" && - problem.GetYDesc().GetLayout("NCHW") == "NCHW" && + problem.GetDirection() == miopen::pooling::Direction::Forward && + problem.GetXDesc().GetNumDims() == 4 && + (x_type == y_type) && // + (x_layout == y_layout) && // + (std::find(types.cbegin(), types.cend(), x_type) != types.cend()) && // + (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()) && // sizeof_private_memory(problem) <= TargetProperties::GetMaxWaveScratchSize() / context.GetStream().GetWavefrontWidth(); // TEMPCODE RJS - std::cout << "%%%%%%%%%% PoolingForward2d::IsApplicable: " << app << " " << problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout("NCHW") << std::endl; + std::cout << "%%%%%%%%%% PoolingForward2d::IsApplicable: " << app << " " << problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout(x_layout) << std::endl; return app; } diff --git a/src/solver/pooling/forwardNdNhwcNaive.cpp b/src/solver/pooling/forwardNdNhwcNaive.cpp index 62b65249df..6e28b98645 100644 --- a/src/solver/pooling/forwardNdNhwcNaive.cpp +++ b/src/solver/pooling/forwardNdNhwcNaive.cpp @@ -72,7 +72,7 @@ bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&, { auto x_type = problem.GetXDesc().GetType(); auto y_type = problem.GetYDesc().GetType(); - std::vector types {miopenFloat, miopenHalf}; + std::vector types {miopenFloat, miopenHalf, miopenInt8, miopenFloat8}; // , miopenBFloat16 auto mode = problem.GetPooling().GetMode(); std::vector modes {miopenPoolingMax, miopenPoolingAverage, miopenPoolingAverageInclusive}; @@ -88,12 +88,15 @@ bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&, && (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend()) //) && (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()); - std::cout << "%%%%%%%%%% PoolingForwardNDNhwcNaive::IsApplicable: " << app << " " << problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout("NCHW") - << " " << problem.GetYDesc().GetLayout_str() << "->" << problem.GetYDesc().GetLayout("NCHW") + // TODO RJS check grid size + + std::cout << "%%%%%%%%%% PoolingForwardNDNhwcNaive::IsApplicable: " << app << " " << problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout(x_layout) + << " " << problem.GetYDesc().GetLayout_str() << "->" << problem.GetYDesc().GetLayout(y_layout) << " " << (problem.GetDirection() == miopen::pooling::Direction::Forward) << (x_type == y_type) << (x_layout == y_layout) << (std::find(types.cbegin(), types.cend(), x_type) != types.cend()) << (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend()) << (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()) << std::endl; + return app; } @@ -141,29 +144,16 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context, args.filter_w = lengths[idx++]; idx = 0; - args.filter_d_stride = is2d ? (strides[0]) : strides[idx++]; - args.filter_h_stride = strides[idx++]; - args.filter_w_stride = strides[idx++]; + args.filter_d_stride = is2d ? (strides[0]) : strides[idx++]; + args.filter_h_stride = strides[idx++]; + args.filter_w_stride = strides[idx++]; idx = 0; args.filter_d_pad = is2d ? 0 : pads[idx++]; args.filter_h_pad = pads[idx++]; args.filter_w_pad = pads[idx++]; - // uint32_t idx = 0; - // const uint32_t filter_d = is2d ? 1 : lengths[idx++]; - // const uint32_t filter_h = lengths[idx++]; - // const uint32_t filter_w = lengths[idx++]; - - // idx = 0; - // const uint32_t filter_d_stride = is2d ? (strides[0]) : strides[idx++]; - // const uint32_t filter_h_stride = strides[idx++]; - // const uint32_t filter_w_stride = strides[idx++]; - - // idx = 0; - // const uint32_t filter_d_pad = is2d ? 0 : pads[idx++]; - // const uint32_t filter_h_pad = pads[idx++]; - // const uint32_t filter_w_pad = pads[idx++]; + // TODO RJS move pooling_method to shared code const int pooling_method = (pooling.GetMode() == miopenPoolingMax) ? MLO_POOLING_OP_MAX : (pooling.GetMode() == miopenPoolingAverage) ? MLO_POOLING_OP_AVE @@ -199,24 +189,16 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context, const auto spatial_dim = is2d ? 2U : 3U; - // uint32_t all_n, all_c, bot_d, bot_h, bot_w; std::tie(args.all_n, args.all_c, args.bot_d, args.bot_h, args.bot_w) = miopen::GetNCDHW(spatial_dim, bot.GetLengths()); -std::cout << "GetSol: bot_lens " << args.all_n << " " << args.all_c << " " << args.bot_d << " " << args.bot_h << " " << args.bot_w << std::endl; std::tie(args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride) = miopen::GetNCDHW(spatial_dim, bot.GetStrides()); -std::cout << "GetSol: bot_strides " << args.bot_n_stride << " " << args.bot_c_stride << " " << args.bot_d_stride -<< " " << args.bot_h_stride << " " << args.bot_w_stride << std::endl; std::tie(std::ignore, std::ignore, args.top_d, args.top_h, args.top_w) = miopen::GetNCDHW(spatial_dim, top.GetLengths()); -std::cout << "GetSol: top_lens " << args.top_d << " " << args.top_h << " " << args.top_w << std::endl; std::tie(args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride,args. top_w_stride) = miopen::GetNCDHW(spatial_dim, top.GetStrides()); - // TEMPCODE RJS -std::cout << "GetSol: top_strides " << args.top_n_stride << " " << args.top_c_stride << " " - << args.top_d_stride << " " << args.top_h_stride << " " << args.top_w_stride << std::endl; // Mask data is always NCDHW layout args.mask_w_stride = 1; @@ -236,12 +218,14 @@ std::cout << "GetSol: top_strides " << args.top_n_stride << " " << args.top_c_st /// Currently this limitation is valid for both ROCm HIP and OCL runtimes. /// /// Selecting the optimal workgroup size is an interesting problem. - /// We'll first map c into the workgroup up to the maximum 1024 items. For large C, the - /// extra are mapped into the grid dimensions. - /// For small C, w and h are mapped into the workgroup dimensions as needed, in that - /// order, up to a maximum of 128 workitems (favoring more active blocks over more threads). - /// We do permit a partial workgroup when it is not an exact multiple of the wavefront size. - /// As said above, remaining H and W are mapped onto the grid dimensions. + /// We'll first map N * D to blockIdx.x. H and W are canonically mapped into + /// blockIdx.y and z, respectively. C, being the fastest index, is mapped + /// into threadIdx.x up to the maximum items. For larger C, the remainder are + /// mapped into blockIdx.z. + /// + /// For small C, we favor more waves over more blocks. W/H are mapped into threadIdx.z/y, + /// in that order, fractionally in powers of 2 if possible, up to a maximum + /// of 256 workitems. Finally, any remaining W/H are then mapped onto blockIdx.z/y. /// /// The workgroup size does not have the restrictions imposed by synchronization between /// workitems because the kernel does not require synchronization. @@ -249,14 +233,14 @@ std::cout << "GetSol: top_strides " << args.top_n_stride << " " << args.top_c_st std::ignore = context; constexpr uint32_t MAX_THREADS = 512; constexpr uint32_t LARGE_C_MAX_ITEMS = MAX_THREADS; - constexpr uint32_t SMALL_C_MAX_ITEMS = 128; + constexpr uint32_t SMALL_C_TGT_ITEMS = 256; auto nd_ = args.all_n * args.top_d; auto h_ = args.top_h; auto w_ = args.top_w; auto c_ = args.all_c; -std::cout << "nd_ " << nd_ << " h_ " << h_ << " w_ " << w_ << " c_ " << c_ << std::endl; + // These are hip-style indexes (not OCL) uint32_t l1 = 1U; uint32_t l2 = 1U; @@ -266,22 +250,30 @@ std::cout << "nd_ " << nd_ << " h_ " << h_ << " w_ " << w_ << " c_ " << c_ << st c_ = LARGE_C_MAX_ITEMS; w_ *= c2; } - // else if(c_ <= SMALL_C_MAX_ITEMS / 2) - // { - // if(c_ * w_ <= MAX_THREADS) - // { - // std::swap(l2, w_); - // - // if(c_ * w_ * h_ <= MAX_THREADS) - // { - // std::swap(l1, h_); - // } - // } - // else if(c_ * h_ <= MAX_THREADS) - // { - // std::swap(l1, h_); - // } - // } + else if(c_ <= SMALL_C_TGT_ITEMS / 2) // Small C, remap H and W to increase occupancy + { + if(c_ * w_ < SMALL_C_TGT_ITEMS) + { + std::swap(l2, w_); // full w mapped to threads + } + + while(w_ > 2 && ((c_ * l2) < SMALL_C_TGT_ITEMS)) + { + w_ = (w_ + 1) / 2; // partial w mapped to threads (rounddown-safe) + l2 *= 2; + } + + if(c_ * l2 * h_ < SMALL_C_TGT_ITEMS) + { + std::swap(l1, h_); // full h mapped to threads + } + + while(h_ > 2 && ((c_ * l1 * l2) < SMALL_C_TGT_ITEMS)) + { + h_ = (h_ + 1 ) / 2; // partial h mapped to threads (rounddown-safe) + l1 *= 2; + } + } const auto g0 = nd_; const auto g1 = h_; @@ -313,8 +305,7 @@ std::cout << "nd_ " << nd_ << " h_ " << h_ << " w_ " << w_ << " c_ " << c_ << st // * 2: layout (NCHW vs NHWC) // * 2: 2D and 3D kernels (optimization) - // l1 = 11; - // l2 = 11; + // KernelInfo uses OCL-style indexes kernel.l_wk.clear(); kernel.l_wk.push_back(l0); kernel.l_wk.push_back(l1); diff --git a/test/gtest/poolingFwdNdNaive.cpp b/test/gtest/poolingFwd2dNaive.cpp similarity index 66% rename from test/gtest/poolingFwdNdNaive.cpp rename to test/gtest/poolingFwd2dNaive.cpp index 3a27b3c786..45a51525e6 100644 --- a/test/gtest/poolingFwdNdNaive.cpp +++ b/test/gtest/poolingFwd2dNaive.cpp @@ -114,9 +114,12 @@ struct layout_data } -class PoolingFwd : public testing::TestWithParam> {}; -class PoolingFwdFloat : public PoolingFwd {}; -class PoolingFwdHalf : public PoolingFwd {}; +class PoolingFwd2d : public testing::TestWithParam> {}; +class PoolingFwd2dInt8 : public PoolingFwd2d {}; +class PoolingFwd2dFloat : public PoolingFwd2d {}; +class PoolingFwd2dHalf : public PoolingFwd2d {}; +class PoolingFwd2dBF16 : public PoolingFwd2d {}; +class PoolingFwd2dF8 : public PoolingFwd2d {}; void Run2dDriver(miopenDataType_t prec); @@ -135,13 +138,15 @@ void GetArgs(const std::string& param, std::vector& tokens) bool IsTestSupportedForDevice(const miopen::Handle& handle) { return true; } -std::vector GetTestCases(const std::string precision) +std::vector Get2dTestCases(const std::string precision) { const auto& flag_arg = env::value(MIOPEN_TEST_FLAGS_ARGS); const std::vector test_cases = { // clang-format off - {"test_pooling2d " + precision + " --all --dataset 0 --limit 0 " + flag_arg} // TEMPCODE RJS DATASET + {"test_pooling2d " + precision + " --all --dataset 0 --limit 0 " + flag_arg}, // TEMPCODE RJS DATASET + // {"test_pooling2d " + precision + " --all --dataset 1 --limit 0 " + flag_arg}, // TEMPCODE RJS DATASET + // {"test_pooling2d " + precision + " --all --dataset 2 --limit 0 " + flag_arg} // TEMPCODE RJS DATASET // clang-format on }; @@ -150,68 +155,83 @@ std::vector GetTestCases(const std::string precision) } // namespace pooling_tests // using namespace pooling_tests; -TEST_P(PoolingFwdFloat, NNT) // NDNaiveTranspose +TEST_P(PoolingFwd2dInt8, NNT) +{ + if(!IsTestRunWith("--int8")) std::cout << "WOULD SKIP BECAUSE NOT INT8!" << std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--int8") TEMPCODE RJS + GTEST_SKIP(); + + Run2dDriver(miopenInt8); +}; + +TEST_P(PoolingFwd2dFloat, NNT) { - const auto& handle = get_handle(); - if(!IsTestSupportedForDevice(handle)) std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl; - if(SkipTest()) std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl; if(!IsTestRunWith("--float")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT!" << std::endl; - // Run2dDriver(miopenFloat); return; // TEMPCODE RJS - // && IsTestRunWith("--float") - if(IsTestSupportedForDevice(handle) && !SkipTest()) - { - Run2dDriver(miopenFloat); - } - else - { + + if(SkipTest() || !IsTestSupportedForDevice(get_handle())) GTEST_SKIP(); - } + + Run2dDriver(miopenFloat); }; -TEST_P(PoolingFwdHalf, NNT) +TEST_P(PoolingFwd2dHalf, NNT) { - const auto& handle = get_handle(); - if(!IsTestSupportedForDevice(handle)) std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl; - if(SkipTest()) std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl; if(!IsTestRunWith("--half")) std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl; - if(IsTestSupportedForDevice(handle) && !SkipTest()) // && IsTestRunWith("--half") TEMPCODE RJS - { - Run2dDriver(miopenHalf); - } - else - { + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--half") TEMPCODE RJS GTEST_SKIP(); - } + + Run2dDriver(miopenHalf); +}; + +TEST_P(PoolingFwd2dBF16, NNT) +{ + if(!IsTestRunWith("--bfloat16")) std::cout << "WOULD SKIP BECAUSE NOT BFLOAT16!" << std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--bfloat16") TEMPCODE RJS + GTEST_SKIP(); + + Run2dDriver(miopenBFloat16); +}; + +TEST_P(PoolingFwd2dF8, NNT) +{ + if(!IsTestRunWith("--float8")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT8!" << std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--float8") TEMPCODE RJS + GTEST_SKIP(); + + Run2dDriver(miopenFloat8); }; void Run2dDriver(miopenDataType_t prec) { - auto cases = GetTestCases("--float"); - std::cerr << " Cases: " << cases.size() << std::endl; - for(const auto& test_value : cases) - { - std::cerr << " : " << test_value << std::endl; // TEMPCODE RJS - } + auto cases = Get2dTestCases("--float"); + // std::cerr << " Cases: " << cases.size() << std::endl; // TEMPCODE RJS + // for(const auto& test_value : cases) + // { + // std::cerr << " : " << test_value << std::endl; // TEMPCODE RJS + // } std::vector params; switch(prec) { - case miopenFloat: params = PoolingFwdFloat_NNT_Test::GetParam(); break; - case miopenHalf: params = PoolingFwdHalf_NNT_Test::GetParam(); break; - case miopenBFloat16: - case miopenInt8: + case miopenFloat: params = PoolingFwd2dFloat_NNT_Test::GetParam(); break; + case miopenHalf: params = PoolingFwd2dHalf_NNT_Test::GetParam(); break; + case miopenBFloat16: params = PoolingFwd2dBF16_NNT_Test::GetParam(); break; + case miopenInt8: params = PoolingFwd2dInt8_NNT_Test::GetParam(); break; + case miopenFloat8: params = PoolingFwd2dF8_NNT_Test::GetParam(); break; case miopenInt32: case miopenDouble: - case miopenFloat8: case miopenBFloat8: case miopenInt64: FAIL() << "miopenBFloat16, miopenInt8, miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8, miopenInt64 " "data type not supported by " - "poolingFwdNdNaive test"; + "poolingFwd2dNaive test"; - default: params = PoolingFwdFloat_NNT_Test::GetParam(); + default: params = PoolingFwd2dFloat_NNT_Test::GetParam(); } std::cerr << "Params: " << params.size() << std::endl; @@ -237,8 +257,12 @@ void Run2dDriver(miopenDataType_t prec) } } -INSTANTIATE_TEST_SUITE_P(Float, PoolingFwdFloat, testing::Values(GetTestCases("--float"))); -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwdHalf); -INSTANTIATE_TEST_SUITE_P(Half, PoolingFwdHalf, testing::Values(GetTestCases("--half"))); +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dBF16); + +//INSTANTIATE_TEST_SUITE_P(BF16, PoolingFwd2dBF16, testing::Values(Get2dTestCases("--bfloat16"))); +INSTANTIATE_TEST_SUITE_P(Int8, PoolingFwd2dInt8, testing::Values(Get2dTestCases("--int8"))); +INSTANTIATE_TEST_SUITE_P(Float, PoolingFwd2dFloat, testing::Values(Get2dTestCases("--float"))); +INSTANTIATE_TEST_SUITE_P(Half, PoolingFwd2dHalf, testing::Values(Get2dTestCases("--half"))); +INSTANTIATE_TEST_SUITE_P(F8, PoolingFwd2dF8, testing::Values(Get2dTestCases("--float8"))); #endif diff --git a/test/gtest/poolingFwd3dNaive.cpp b/test/gtest/poolingFwd3dNaive.cpp new file mode 100644 index 0000000000..a32432f1e6 --- /dev/null +++ b/test/gtest/poolingFwd3dNaive.cpp @@ -0,0 +1,283 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#ifndef POOLING_GTEST_BUILD + +#include +#include +#include "get_handle.hpp" +#include "test_env.hpp" + +#include "pooling_testing.hpp" +#include "pooling3d.hpp" + +#include "tensor_holder.hpp" +#include "miopen/tensor_layout.hpp" + +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLAGS_ARGS) + +namespace env = miopen::env; + +namespace { + +template +struct layout_data +{ + static std::vector get_layout_lengths(int n, int c, std::vector& dims) + { + auto ret = std::vector{n, c}; + ret.insert(ret.end(), dims.cbegin(), dims.cend()); + + return ret; + } + + static std::vector + get_strides(std::vector& lens, int dims, miopenTensorLayout_t tensor_layout) + { + std::vector strides; + std::string layout_default = miopen::tensor_layout_get_default(dims + 2); + std::string layout_string = miopen::TensorDescriptor::GetLayoutStr(tensor_layout); + +std::cout << "get_strides: dims=" << dims << " lens=" << lens.size() << " " << layout_default << " " << layout_string << std::endl; + miopen::tensor_layout_to_strides(lens, layout_default, layout_string, strides); + + constexpr int min_stride_multiplier = 1; + constexpr int max_stride_multiplier = 4; + + auto c = prng::gen_A_to_B(min_stride_multiplier, max_stride_multiplier); + for(auto& v : strides) + { + // cppcheck-suppress useStlAlgorithm + v = v * c; + } + + return strides; + } + + static miopenTensorDescriptor_t init_tensor_descriptor(miopenDataType_t type, + const std::vector& lens, + const std::vector& strides) + { + miopenTensorDescriptor_t desc; + + EXPECT_TRUE(miopenCreateTensorDescriptor(&desc) == miopenStatusSuccess); + EXPECT_TRUE( + miopenSetTensorDescriptor(desc, type, lens.size(), lens.data(), strides.data()) == + miopenStatusSuccess); + + return desc; + } + + layout_data(int _n, std::vector _dims, int _c, miopenTensorLayout_t _tensor_layout) + { + auto lens = get_layout_lengths(_n, _c, _dims); + auto strides = get_strides(lens, _dims.size(), _tensor_layout); + descriptor = miopen::TensorDescriptor{miopen_type{}, lens, strides}; + host = tensor{lens, strides}.generate(gen_value); + } + + ~layout_data() {} + + void read_gpu_data(miopen::Handle& handle, const miopen::Allocator::ManageDataPtr& ddata) + { + check = tensor{descriptor.GetLengths(), descriptor.GetStrides()}; + handle.ReadTo(check.data.data(), ddata, check.data.size()); + } + + tensor check{}; + tensor host; + miopen::TensorDescriptor descriptor; +}; + +} + +class PoolingFwd3d : public testing::TestWithParam> {}; +class PoolingFwd3dFloat : public PoolingFwd3d {}; +class PoolingFwd3dHalf : public PoolingFwd3d {}; +class PoolingFwd3dBF16 : public PoolingFwd3d {}; +class PoolingFwd3dInt8 : public PoolingFwd3d {}; +class PoolingFwd3dF8 : public PoolingFwd3d {}; + +void Run3dDriver(miopenDataType_t prec); + +namespace { + +static bool SkipTest(void) { return env::disabled(MIOPEN_TEST_ALL); } + +void GetArgs(const std::string& param, std::vector& tokens) +{ + std::stringstream ss(param); + std::istream_iterator begin(ss); + std::istream_iterator end; + while(begin != end) + tokens.push_back(*begin++); +} + +bool IsTestSupportedForDevice(const miopen::Handle& handle) { return true; } + +std::vector Get3dTestCases(const std::string precision) +{ + const auto& flag_arg = env::value(MIOPEN_TEST_FLAGS_ARGS); + + const std::vector test_cases = { + // clang-format off + {"test_pooling3d " + precision + " --all --dataset 0 --limit 0 " + flag_arg}, // TEMPCODE RJS DATASET + // {"test_pooling3d " + precision + " --all --dataset 1 --limit 0 " + flag_arg}, // TEMPCODE RJS DATASET + // {"test_pooling3d " + precision + " --all --dataset 2 --limit 0 " + flag_arg} // TEMPCODE RJS DATASET + // clang-format on + }; + + return test_cases; +} +} // namespace pooling_tests +// using namespace pooling_tests; + +TEST_P(PoolingFwd3dFloat, NNT) // NDNaiveTranspose +{ + const auto& handle = get_handle(); + if(!IsTestSupportedForDevice(handle)) std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl; + if(SkipTest()) std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl; + if(!IsTestRunWith("--float")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT!" << std::endl; + // Run3dDriver(miopenFloat); return; // TEMPCODE RJS + // && IsTestRunWith("--float") + if(IsTestSupportedForDevice(handle) && !SkipTest()) + { + Run3dDriver(miopenFloat); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(PoolingFwd3dHalf, NNT) +{ + const auto& handle = get_handle(); + if(!IsTestSupportedForDevice(handle)) std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl; + if(SkipTest()) std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl; + if(!IsTestRunWith("--half")) std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl; + + if(IsTestSupportedForDevice(handle) && !SkipTest()) // && IsTestRunWith("--half") TEMPCODE RJS + { + Run3dDriver(miopenHalf); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(PoolingFwd3dBF16, NNT) +{ + if(!IsTestRunWith("--bfloat16")) std::cout << "WOULD SKIP BECAUSE NOT BFLOAT16!" << std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--bfloat16") TEMPCODE RJS + GTEST_SKIP(); + + Run3dDriver(miopenBFloat16); +}; + +TEST_P(PoolingFwd3dInt8, NNT) +{ + if(!IsTestRunWith("--int8")) std::cout << "WOULD SKIP BECAUSE NOT INT8!" << std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--int8") TEMPCODE RJS + GTEST_SKIP(); + + Run3dDriver(miopenInt8); +}; + +TEST_P(PoolingFwd3dF8, NNT) +{ + if(!IsTestRunWith("--float8")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT8!" << std::endl; + + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--float8") TEMPCODE RJS + GTEST_SKIP(); + + Run3dDriver(miopenFloat8); +}; + +void Run3dDriver(miopenDataType_t prec) +{ + auto cases = Get3dTestCases("--float"); + std::cerr << " Cases: " << cases.size() << std::endl; + for(const auto& test_value : cases) + { + std::cerr << " : " << test_value << std::endl; // TEMPCODE RJS + } + + std::vector params; + switch(prec) + { + case miopenFloat: params = PoolingFwd3dFloat_NNT_Test::GetParam(); break; + case miopenHalf: params = PoolingFwd3dHalf_NNT_Test::GetParam(); break; + case miopenBFloat16: params = PoolingFwd3dBF16_NNT_Test::GetParam(); break; + case miopenInt8: params = PoolingFwd3dInt8_NNT_Test::GetParam(); break; + case miopenFloat8: params = PoolingFwd3dF8_NNT_Test::GetParam(); break; + case miopenInt32: + case miopenDouble: + case miopenBFloat8: + case miopenInt64: + FAIL() + << "miopenBFloat16, miopenInt8, miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8, miopenInt64 " + "data type not supported by " + "poolingFwdNdNaive test"; + + default: params = PoolingFwd3dFloat_NNT_Test::GetParam(); + } + + for(const auto& test_value : params) + { + std::cerr << "Running Test: " << test_value << std::endl; // TEMPCODE RJS + std::vector tokens; + GetArgs(test_value, tokens); + std::vector ptrs; + + std::transform(tokens.begin(), tokens.end(), std::back_inserter(ptrs), [](const auto& str) { + return str.data(); + }); + + testing::internal::CaptureStderr(); + test_drive(ptrs.size(), ptrs.data()); + auto capture = testing::internal::GetCapturedStderr(); + std::cout << capture; + } +} + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dBF16); +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dFloat); +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dHalf); +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dInt8); +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dF8); + +// INSTANTIATE_TEST_SUITE_P(BF16, PoolingFwd3dBF16, testing::Values(Get3dTestCases("--bfloat16"))); +INSTANTIATE_TEST_SUITE_P(Float, PoolingFwd3dFloat, testing::Values(Get3dTestCases("--float"))); +INSTANTIATE_TEST_SUITE_P(Half, PoolingFwd3dHalf, testing::Values(Get3dTestCases("--half"))); +INSTANTIATE_TEST_SUITE_P(Int8, PoolingFwd3dInt8, testing::Values(Get3dTestCases("--int8"))); +INSTANTIATE_TEST_SUITE_P(F8, PoolingFwd3dF8, testing::Values(Get3dTestCases("--float8"))); + +#endif diff --git a/test/gtest/pooling_testing.hpp b/test/gtest/pooling_testing.hpp index f114be8764..990759c620 100644 --- a/test/gtest/pooling_testing.hpp +++ b/test/gtest/pooling_testing.hpp @@ -29,6 +29,18 @@ #ifndef GUARD_MIOPEN_TEST_POOLING_COMMON_HPP #define GUARD_MIOPEN_TEST_POOLING_COMMON_HPP +#define DATASET "0" + +#include +#include +namespace {using sc = std::chrono::steady_clock;} +#undef tomillis +#define tomillis(__DUR) (0.001 * std::chrono::duration_cast(__DUR).count()) +#undef coutms +#define coutms(__TOK, __TP) (std::cout << "ms[" << std::setw(16) << __TOK << "]: " \ + << std::setw(12) << std::fixed << std::setprecision(3) << tomillis(sc::now() - __TP) << std::endl) + + #include #include #include @@ -36,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -68,7 +81,7 @@ constexpr int RAND_INTEGER_MIN = -8800; template auto gen_value = - [](auto... is) { return static_cast(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX)) / 100; }; + [](auto... is) { return static_cast(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX) / 100); }; } static inline void print(std::ostringstream& oss, const miopen::PoolingDescriptor& filter, bool is_default_layout) @@ -97,7 +110,7 @@ tensor get_output_tensor(const miopen::PoolingDescriptor& filter, const tenso } template -tensor get_big_output_tensor(const miopen::PoolingDescriptor& filter, const tensor& input) +tensor get_big_output_tensor(const miopen::PoolingDescriptor& filter, const tensor& input) { auto desc = filter.GetForwardOutputTensor(input.desc); auto lens = desc.GetLengths(); @@ -105,8 +118,8 @@ tensor get_big_output_tensor(const miopen::PoolingDescriptor& filter, const t lens[0] *= 2; else lens[0] *= 10; - auto big = miopen::TensorDescriptor{desc.GetType(), input.desc.GetLayout_t(), lens, desc.GetStrides()}; - return tensor{big}; + auto big = miopen::TensorDescriptor{miopenFloat, input.desc.GetLayout_t(), lens, desc.GetStrides()}; + return tensor{big}; } template @@ -150,7 +163,7 @@ struct pooling_operators #define MAX_PRINTING 128 // TEMPCODE RJS #define MAX_PRINT 12 // TEMPCODE RJS #define MAX_NCD 2 -#define GPU_JUNK 160 +#define GPU_JUNK 0 // 160 #define PRINT_CPU_IN 0 #define PRINT_GPU_OUT 0 #define GPU_4COL false @@ -194,12 +207,17 @@ namespace { oss << "n= " << nn << " c= " << cc << " d= " << dd << std::endl; for(int hh = 0; hh < olen[3]; ++hh) { for(int ww = 0; ww < olen[4]; ++ww) { + auto index = nn * ostr[0] + cc * ostr[1] + dd * ostr[2] + hh * ostr[3] + ww * ostr[4]; switch(mode) { case 0: - oss << std::setw(11) << std::setprecision(5) << out[nn * ostr[0] + cc * ostr[1] + dd * ostr[2] + hh * ostr[3] + ww * ostr[4]] << " "; + if(std::is_same::value || std::is_same::value) { + oss << std::setw(11) << std::setprecision(5) << (int16_t)out[index] << " "; + } else { + oss << std::setw(11) << std::setprecision(5) << out[index] << " "; + } break; case 1: - oss << std::setw(11) << std::setprecision(5) << nn * ostr[0] + cc * ostr[1] + dd * ostr[2] + hh * ostr[3] + ww * ostr[4] << " "; + oss << std::setw(11) << std::setprecision(5) << index << " "; break; } } @@ -225,6 +243,7 @@ struct verify_forward_pooling tensor cpu(const tensor& input, const miopen::PoolingDescriptor& filter, std::vector&) const { +auto st = sc::now(); auto out = get_output_tensor(filter, input); std::array in_dim{}; @@ -291,7 +310,7 @@ struct verify_forward_pooling out(o, w, out_spatial_id_pack...) = T(pooler.finalize(acc, pool_size)); }); - if(printing) + if(PRINT_GPU_OUT && printing) { PIPE << "CPU out: "; auto outlen = out.desc.GetLengths(); @@ -321,7 +340,7 @@ struct verify_forward_pooling printClamped(PIPE, input, inlen, instr); // printClamped(PIPE, input, inlen, instr, 1); } - +coutms("cpu", st); return out; } @@ -404,6 +423,8 @@ struct verify_forward_pooling const miopen::PoolingDescriptor& filter, std::vector& indices) const { +auto st = sc::now(); +auto write = sc::now(); auto&& handle = get_handle(); auto out = get_output_tensor(filter, input); auto junk = get_big_output_tensor(filter, input); // TEMPCODE RJS @@ -412,10 +433,10 @@ struct verify_forward_pooling auto in_dev = handle.Write(input.data); auto out_dev = handle.Create(out.GetSize()); - auto junk_dev = handle.Create(junk.GetSize()); // + auto junk_dev = handle.Create(junk.GetSize()); // Workspace wspace{}; wspace.Write(indices); - +coutms("GPUwrite", write); float alpha = 1, beta = 0; filter.Forward(handle, &alpha, @@ -428,7 +449,21 @@ struct verify_forward_pooling wspace.ptr(), wspace.size(), junk_dev.get()); // TEMPCODE RJS - +handle.Finish(); // TEMPCODE RJS +coutms("gpu1", st); + filter.Forward(handle, + &alpha, + input.desc, + in_dev.get(), + &beta, + out.desc, + out_dev.get(), + true, + wspace.ptr(), + wspace.size(), + junk_dev.get()); // TEMPCODE RJS +handle.Finish(); // TEMPCODE RJS +auto read = sc::now(); handle.ReadTo(out.data.data(), out_dev, out.GetDataByteSize()); wspace.ReadTo(indices); bool printing = input.desc.GetLengths()[2] <= MAX_PRINTING && input.desc.GetLengths()[3] <= MAX_PRINTING; @@ -437,7 +472,7 @@ std::cout << (printing ? "printing output from GPU..." : "skipping GPU print.") if(PRINT_GPU_OUT && printing) { #if GPU_JUNK > 0 - handle.ReadTo(junk.data.data(), junk_dev, junk.GetDataByteSize()); + handle.ReadTo(junk.data.data(), junk_dev, junk.GetDataByteSize()); if(GPU_4COL){ PIPE<< "GPU (8-cols): " << std::endl; for(int idx = 0; idx < GPU_JUNK; ++idx) { @@ -445,11 +480,11 @@ std::cout << (printing ? "printing output from GPU..." : "skipping GPU print.") if((idx % 8) == 7) PIPE < out_spatial_len{}; std::copy_n(out.desc.GetLengths().begin() + sptl_dim_offset, SptDim, out_spatial_len.begin()); + auto ford_out = miopen::unpacker(ford)(out_spatial_len); par_ford(out_n, out_c)([&](int o, int w) { @@ -653,6 +690,7 @@ struct verify_backward_pooling } dinput(in_id_pack...) = din_vec.at(din_idx); }); + return dinput; } @@ -756,25 +794,28 @@ struct pooling_driver : test_driver { add(index_type, "index_type", - generate_data({"miopenIndexUint32"} // TEMPCODE RJS - // generate_multi_data( // - // {{"miopenIndexUint8", - // "miopenIndexUint16", - // "miopenIndexUint32", - // "miopenIndexUint64"}, // - // {"miopenIndexUint8", "miopenIndexUint32"}, // - // {"miopenIndexUint32"}} // + // generate_data({"miopenIndexUint32"} // TEMPCODE RJS + generate_multi_data( // + {{"miopenIndexUint32", + "miopenIndexUint8" + , + "miopenIndexUint16", + "miopenIndexUint64" + }, // + {"miopenIndexUint8", "miopenIndexUint32"}, // + {"miopenIndexUint32"}} // )); add(mode_str, "mode_str", generate_data( - {"miopenPoolingMax", "miopenPoolingAverage", "miopenPoolingAverageInclusive"})); // + {"miopenPoolingMax", "miopenPoolingAverage"})); // , "miopenPoolingAverageInclusive" #if TEST_PADDING_MODE == 1 add(pmode, "pmode", generate_data({"default", "same", "valid"})); #endif add(verify_indices, "verify_indices", generate_data({1})); } + template void run_impl() { @@ -812,11 +853,12 @@ struct pooling_driver : test_driver // } // } // } - +auto st = sc::now(); auto out = verify(verify_forward_pooling{}, input, filter, indices); + coutms("verifyfwd", st); #ifdef BACKWARD auto dout = out.first; dout.generate(tensor_elem_gen_integer{2503}); @@ -831,24 +873,42 @@ struct pooling_driver : test_driver #endif } + +#define RETURN coutms("Run", st); return + void run() { - std::cout << "\n############ Run # " << std::setw(6) << num_all_case++ << " : "; - if(MAX_ALL_CASES && num_all_case > MAX_ALL_CASES) - { - std::cout << " skipped due to MAX_ALL_CASES=" << MAX_ALL_CASES << " : "; - show_command(); - return; - } - else if(this->dry_run) - { - std::cout << " skipped due to dry_run : "; - show_command(); - return; - } - else +auto st = sc::now(); + const bool is_default_layout = miopen::TensorDescriptor::IsDefaultLayout(layout); // TEMPCODE RJS + { + bool skip = false; + std::ostringstream oss; + + if(MAX_ALL_CASES && num_all_case > MAX_ALL_CASES) + { + skip = true; + oss << " : skipped due to MAX_ALL_CASES=" << MAX_ALL_CASES; + } + if(this->dry_run) + { + skip = true; + oss << " : skipped due to dry_run"; + } + if(is_default_layout && (this->type == miopenInt8 || this->type == miopenFloat8)) + { + skip = true; + oss << " : skipped, no solvers for datatype " << this->type << " and default layouts"; + } + + std::cout << "\n############ Run " << (skip ? " " : "#") << " " << std::setw(6) << num_all_case++ << " : "; show_command(); + + if(skip) + { + std::cout << "-- " << oss.str() << std::endl; + RETURN; + } } int sptl_dim = static_cast(in_shape.size()) - 2; @@ -856,15 +916,13 @@ struct pooling_driver : test_driver { std::cout << "Warning: Config skipped due to invalid dimensions. 'in_shape' must be in NCHW or NCDHW form." << std::endl; - return; + RETURN; } // To simplify launching, input dimensions to the driver are always default layout. Desire to // test non-default layouts is communicated exclusively via 'layout'. - // const bool is_default_layout = miopen::TensorDescriptor::IsDefaultLayout(layout); // TEMPCODE RJS auto mode = mode_lookup.at(miopen::ToUpper(mode_str)); - if(mode != miopenPoolingMax) return; // TEMPCODE RJS skip all except max, do max only auto pad_mode = miopenPaddingDefault; #if TEST_PADDING_MODE @@ -895,7 +953,7 @@ struct pooling_driver : test_driver std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented " "yet in 3D max pooling solvers." << std::endl; - return; + RETURN; } if(mask_idx && sptl_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset) @@ -903,7 +961,7 @@ struct pooling_driver : test_driver std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented " "yet in 2D max backward solvers that support wide pooling window." << std::endl; - return; + RETURN; } if(mask_idx && filter.ModeIsAveraging()) @@ -914,7 +972,24 @@ struct pooling_driver : test_driver "skip average pooling configs when (wsidx == 0). " "Please make sure that dataset includes counterparts with (wsidx == 1)." << std::endl; - return; + RETURN; + } + + // index size filter + if(filter.GetMode() == miopenPoolingMax) + { + auto index_max = miopen::get_index_max(filter.GetIndexType()); + auto index_needed = mask_idx ? + std::accumulate(lens.begin(), lens.end(), 1, std::multiplies()) : + std::accumulate(in_shape.begin() + 2, in_shape.end(), 1, std::multiplies()); + + if(index_max <= index_needed) + { + std::cout << "Warning: Config skipped: index mode " << filter.GetWorkspaceIndexMode() + << " type " << filter.GetIndexType() << " is too small. max=" + << index_max << ", needed=" << index_needed << std::endl; + RETURN; + } } switch(idx_typ) @@ -931,7 +1006,7 @@ struct pooling_driver : test_driver "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) " "&& filter.GetMode() == miopenPoolingMax" << std::endl; - return; + RETURN; } break; } @@ -943,7 +1018,7 @@ struct pooling_driver : test_driver "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) " "&& filter.GetMode() == miopenPoolingMax" << std::endl; - return; + RETURN; } if(skip_many_configs_with_non_int8_index) { @@ -953,7 +1028,7 @@ struct pooling_driver : test_driver std::cout << "Warning: Config skipped for the default dataset to speed " "up testing (num_uint16_case > 5)" << std::endl; - return; + RETURN; } ++num_uint16_case; } @@ -971,7 +1046,7 @@ struct pooling_driver : test_driver std::cout << "Warning: Config skipped for the default dataset to speed up " "testing (wsidx == 0 && num_uint32_case > 5)" << std::endl; - return; + RETURN; } ++num_uint32_case; } @@ -982,7 +1057,7 @@ struct pooling_driver : test_driver std::cout << "Warning: Config skipped for the default dataset to speed up " "testing (wsidx != 0 && num_uint32_case_imgidx > 5)" << std::endl; - return; + RETURN; } ++num_uint32_case_imgidx; } @@ -1000,7 +1075,7 @@ struct pooling_driver : test_driver std::cout << "Warning: Config skipped for the default dataset to speed up " "testing (wsidx == 0) && (num_uint64_case > 5)" << std::endl; - return; + RETURN; } ++num_uint64_case; } @@ -1012,7 +1087,7 @@ struct pooling_driver : test_driver "default dataset (wsidx != 0) && (num_uint64_case_imgidx > 5 " "&& sptl_dim == 2)" << std::endl; - return; + RETURN; } ++num_uint64_case_imgidx; } @@ -1031,7 +1106,7 @@ struct pooling_driver : test_driver std::cout << "Warning: Config skipped becuse it is invalid " "(lens[i] > (input_desc.GetLengths()[i + 2] + 2 * pads[i]))" << std::endl; - return; + RETURN; } } @@ -1048,7 +1123,7 @@ struct pooling_driver : test_driver std::cout << "Config skipped because it requires " << total_mem << " Bytes to write all necessary tensors to GPU. GPU has " << device_mem << " Bytes of memory." << std::endl; - return; + RETURN; } } @@ -1143,6 +1218,7 @@ struct pooling_driver : test_driver break; } } +coutms("Run", st); } }; diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp index 8472b23375..b30b9ce36f 100644 --- a/test/pooling2d.hpp +++ b/test/pooling2d.hpp @@ -38,29 +38,33 @@ struct pooling2d_shapes static std::vector get_2d_pooling_input_shapes() { return { - {1, 831, 64, 128}, {5, 32, 8, 8}, - {10, 3, 32, 32}, - {1, 19, 1024, 2048}, - {2, 1024, 12, 12}, - {4, 3, 231, 231}, - {8, 3, 227, 227}, - {1, 384, 13, 13}, - {1, 96, 27, 27}, - {2, 160, 7, 7}, - {1, 192, 256, 512}, - {2, 192, 28, 28}, - {1, 256, 56, 56}, - {4, 3, 224, 224}, - {2, 64, 112, 112}, - {2, 608, 4, 4}, - {1, 2048, 11, 11}, - {1, 16, 4096, 4096} + {16, 1, 4096, 4096}, + {1, 16, 4096, 4096}, + {1, 1024, 512, 512}, + {16, 1024, 128, 128} + // , + // {1, 832, 64, 128}, + // {10, 3, 32, 32}, + // {1, 19, 1024, 2048}, + // {2, 1024, 12, 12}, + // {4, 3, 231, 231}, + // {8, 3, 227, 227}, + // {1, 384, 13, 13}, + // {1, 96, 27, 27}, + // {2, 160, 7, 7}, + // {1, 192, 256, 512}, + // {2, 192, 28, 28}, + // {1, 256, 56, 56}, + // {4, 3, 224, 224}, + // {2, 64, 112, 112}, + // {2, 608, 4, 4}, + // {1, 2048, 11, 11} }; } // Dataset 1 is intended for testing of asymmetric configs. - static std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 2, 8, 8}}; } // {1, 1, 8, 8}, + static std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 2, 8, 8}, {10, 3, 32, 32}}; } // {1, 1, 8, 8}, // Dataset 2 is intended for testing of configs with wide window. static std::vector get_2d_pooling_input_shapes_wide() @@ -91,41 +95,43 @@ struct pooling2d_driver : pooling_driver public: pooling2d_driver() : pooling_driver() { +#if TEST_GET_INPUT_TENSOR + std::set in_dim_set = get_inputs(this->batch_factor); + std::vector in_dim_vec(in_dim_set.begin(), in_dim_set.end()); + this->add(this->in_shape, "input", this->generate_data(in_dim_vec, {16, 32, 8, 8})); +#else + this->add(this->in_shape, "input", this->template generate_multi_data_limited({ + get_2d_pooling_input_shapes(), + get_2d_pooling_input_shapes_minimal(), + get_2d_pooling_input_shapes_wide() + }, 9 + )); +#endif + this->add(this->lens, "lens", this->template generate_multi_data({ + {{2, 2}, {3, 3}}, // + {{2, 2}, {1, 2}, {2, 1}}, // + {{35, 35}, {100, 100}, {255, 255}, {410, 400}} + } + )); + this->add(this->strides, "strides", this->template generate_multi_data({ + {{2, 2}, {1, 1}}, // + {{2, 2}, {2, 1}, {1, 2}}, // + {{1, 1}} + } + )); // clang-format off this->add(this->pads, "pads", this->template generate_multi_data({ - {{0, 0}, {1, 1}}, // + {{0, 0}, {1, 1}}, // #if WORKAROUND_ISSUE_1670 - {{0, 0}}, // + {{0, 0}}, // #else - {{0, 0}, {0, 1}, {1, 0}, {1, 1}}, // + {{0, 0}, {0, 1}, {1, 0}, {1, 1}}, // #endif - {{0, 0}}})); + {{0, 0}} + } + )); // clang-format on - this->add(this->layout, "layout", this->generate_data({miopenTensorNHWC})); // , miopenTensorNCHW this->add(this->wsidx, "wsidx", this->generate_data({miopenPoolingWorkspaceIndexMask, miopenPoolingWorkspaceIndexImage})); - this->add(this->strides, - "strides", - this->template generate_multi_data({{{2, 2}, {1, 1}}, // - {{2, 2}, {2, 1}}, // , {1, 2}, {1, 1} - {{1, 1}}})); -#if TEST_GET_INPUT_TENSOR - std::set in_dim_set = get_inputs(this->batch_factor); - std::vector in_dim_vec(in_dim_set.begin(), in_dim_set.end()); - this->add(this->in_shape, "input", this->generate_data(in_dim_vec, {16, 32, 8, 8})); -#else - this->add( - this->in_shape, - "input", - this->template generate_multi_data_limited({get_2d_pooling_input_shapes(), - get_2d_pooling_input_shapes_minimal(), - get_2d_pooling_input_shapes_wide()}, - 9)); -#endif - this->add(this->lens, - "lens", - this->template generate_multi_data( - {{{2, 2}, {3, 3}}, // - {{2, 2}, {1, 2}, {2, 1}}, // - {{35, 35}, {100, 100}, {255, 255}, {410, 400}}})); - } + this->add(this->layout, "layout", this->generate_data({miopenTensorNCHW, miopenTensorNHWC})); + } }; diff --git a/test/pooling3d.hpp b/test/pooling3d.hpp index d2a64540c6..6792a5cc84 100644 --- a/test/pooling3d.hpp +++ b/test/pooling3d.hpp @@ -53,12 +53,11 @@ struct pooling3d_driver : pooling_driver pooling3d_driver() : pooling_driver() { - this->add( - this->in_shape, "input", this->generate_data_limited(get_3d_pooling_input_shapes(), 4)); + this->add(this->in_shape, "input", this->generate_data_limited(get_3d_pooling_input_shapes(), 4)); this->add(this->lens, "lens", this->generate_data({{2, 2, 2}, {3, 3, 3}})); this->add(this->strides, "strides", this->generate_data({{2, 2, 2}, {1, 1, 1}})); this->add(this->pads, "pads", this->generate_data({{0, 0, 0}, {1, 1, 1}})); - this->add(this->wsidx, "wsidx", this->generate_data({1})); - this->add(this->layout, "layout", this->generate_data({0, 1})); + this->add(this->wsidx, "wsidx", this->generate_data({miopenPoolingWorkspaceIndexImage})); + this->add(this->layout, "layout", this->generate_data({miopenTensorNCDHW, miopenTensorNDHWC})); } }; From c5a4b11d406aa5bd2351a35476c7f70a41af685e Mon Sep 17 00:00:00 2001 From: "Randy J. Spaulding" Date: Thu, 19 Sep 2024 16:25:26 -0700 Subject: [PATCH 07/10] fwd naive done --- src/CMakeLists.txt | 13 +- .../MIOpenPoolingForwardNDNhwcNaive.cpp | 80 +++--- src/pooling/problem_description.cpp | 1 + src/solver/pooling/forwardNdNhwcNaive.cpp | 4 +- test/gtest/poolingFwd2dNaive.cpp | 6 +- test/gtest/pooling_testing.hpp | 239 ++++++++++-------- test/pooling2d.hpp | 39 +-- test/pooling3d.hpp | 6 +- 8 files changed, 219 insertions(+), 169 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f536de3b10..df8c2f9614 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -26,6 +26,11 @@ cmake_policy(SET CMP0057 NEW) +include_directories(SYSTEM + /opt/rocm/include/gtest + /opt/rocm/include +) + include(ExportHeader) if(MIOPEN_ENABLE_SQLITE) add_subdirectory(sqlite) @@ -594,7 +599,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/xform_bidirect_winograd_out.s kernels/UniversalTranspose.cl) - # TEMPCODE RJS +# TEMPCODE KERNELS set(MIOPEN_KERNELS kernels/MIOpenPoolingForwardNDNhwcNaive.cpp kernels/MIOpenPooling.cl @@ -788,6 +793,12 @@ else() ) endif() +# TEMPCODE LIB_INC +include_directories(SYSTEM + /opt/rocm/include/gtest/ + /opt/rocm/include/ +) + rocm_set_soversion(MIOpen ${MIOpen_SOVERSION}) clang_tidy_check(MIOpen) diff --git a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp index 5c197e3b8e..be6a548091 100644 --- a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp +++ b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp @@ -25,22 +25,19 @@ *******************************************************************************/ //#define TEMPCODE RJS -#ifdef TEMPCODE -#define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 0 +// #ifdef TEMPCODE +// #define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 0 -#define MLO_POOLING_OP_ID MLO_POOLING_OP_AVE +// #define MLO_POOLING_OP_ID MLO_POOLING_OP_AVE -#define MLO_POOLING_INDEX_TYPE int -#define MLO_POOLING_IS2D_KERNEL 0 -#define INPUT_TYPE _FLOAT -#define OUTPUT_TYPE _FLOAT -// #define TI INPUT_TYPE -// #define TO OUTPUT_TYPE -#define CVT_FP32_2ACCUM(x) (x) -#endif +// #define INPUT_TYPE _FLOAT +// #define OUTPUT_TYPE _FLOAT +// // #define TI INPUT_TYPE +// // #define TO OUTPUT_TYPE +// #define CVT_FP32_2ACCUM(x) (x) +// #endif #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS -#include #include #endif @@ -70,8 +67,11 @@ #include "float_types.h" #include "miopen_cstdint.hpp" +// This implementation is extremely memory-bound, so float type is used for all calculations +#define _FLOAT float +#define _FLOAT_ACCUM float + #if MIOPEN_USE_INT8 == 1 - #define _FLOAT char #if !AVERAGE_OPS #ifndef FLT_MAX #define MAX_VAL 127 /* max value */ @@ -79,10 +79,14 @@ #define MAX_VAL FLT_MAX #endif #endif +#endif +#if MIOPEN_USE_BFP16 + #define NATIVE_CAST(_x) (_FLOAT)bfloat16_to_float(_x) + #define NATIVE_UNCAST(_x) (_FLOAT)float_to_bfloat16(_x) #else - #define _FLOAT float + #define NATIVE_CAST(_x) (_FLOAT)(_x) + #define NATIVE_UNCAST(_x) (_FLOAT)(_x) #endif -#define _FLOAT_ACCUM _FLOAT #endif // TEMPCODE @@ -94,10 +98,10 @@ // Out N, D, H are encoded into the block indices x, y, z // No 2D-only optimization. -template +template __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, - TO* __restrict__ top_ptr, - TO* __restrict__ junk_ptr, // TEMPCODE RJS + TI* __restrict__ top_ptr, + float* __restrict__ junk_ptr, // TEMPCODE RJS ARG_UNUSED_FOR_AVERAGE index_t* __restrict__ mask_ptr, ARG_UNUSED_FOR_AVERAGE int save_index, ARG_UNUSED_FOR_AVERAGE int index_mode, @@ -130,14 +134,14 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, if(tw >= args.top_w) return; if(cc >= args.all_c) return; + for(int i = 4; i < 320; ++i) + { + junk_ptr[i] = 1.11111; + } auto log_ptr = junk_ptr; if(blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) { - for(int i = 0; i < 320; ++i) - { - junk_ptr[i] = (_FLOAT)1.11111; - } - int idx = 0; + int idx = 8; log_ptr[idx++] = gridDim.x; // ND log_ptr[idx++] = gridDim.y; // H log_ptr[idx++] = gridDim.z; // W (*C overflow) @@ -229,10 +233,17 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, th * args.top_h_stride + // tw * args.top_w_stride; -#if false - size_t junk_idx = 64 + 4 * th; +#if true + size_t junk_idx = 64; + int bot_idx = 0; + junk_ptr[junk_idx++] = NATIVE_CAST(bot_ptr[bot_idx++]); + junk_ptr[junk_idx++] = NATIVE_CAST(bot_ptr[bot_idx++]); + junk_ptr[junk_idx++] = NATIVE_CAST(bot_ptr[bot_idx++]); + junk_ptr[junk_idx++] = NATIVE_CAST(bot_ptr[bot_idx++]); + junk_idx += 4 * th; if(nn == 0 && cc == 0 && td == 0 && tw < 8 && th == 0) { + size_t bot_ncd = static_cast(nn * args.bot_n_stride + cc * args.bot_c_stride + dstart * args.bot_d_stride); size_t bot_ncdh = bot_ncd + hstart * args.bot_h_stride; size_t bot_index = bot_ncdh + wstart * args.bot_w_stride; @@ -258,7 +269,7 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, #if AVERAGE_OPS _FLOAT_ACCUM res = (_FLOAT_ACCUM)(0); #else // MAX - _FLOAT_ACCUM res = (_FLOAT_ACCUM)(-MAX_VAL_ACCUM); + _FLOAT_ACCUM res = (_FLOAT_ACCUM)NATIVE_CAST(-MAX_VAL_ACCUM); bool found = false; // May remain false if bot contains only NaNs/-INFs. uint32_t d_save = 0; uint32_t h_save = 0; @@ -276,11 +287,12 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, for(uint32_t bw = wstart; bw < wend; ++bw) { #if AVERAGE_OPS - res += static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]); + res += static_cast<_FLOAT_ACCUM>(NATIVE_CAST(bot_ptr[bot_index])); #else // MAX - if(static_cast<_FLOAT_ACCUM>(bot_ptr[bot_index]) > res) + auto val = static_cast<_FLOAT_ACCUM>(NATIVE_CAST(bot_ptr[bot_index])); + if(val > res) { - res = bot_ptr[bot_index]; + res = val; if(save_index) { found = true; @@ -340,20 +352,20 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, } #endif - top_ptr[top_index] = (_FLOAT)res; + top_ptr[top_index] = NATIVE_UNCAST(res); } extern "C" __global__ void mloPoolingForwardNDNhwcNaive( const INPUT_TYPE* __restrict__ bot_ptr, - OUTPUT_TYPE* __restrict__ top_ptr, - OUTPUT_TYPE* __restrict__ junk_ptr, // TEMPCODE RJS + INPUT_TYPE* __restrict__ top_ptr, + float* __restrict__ junk_ptr, // TEMPCODE RJS index_t* __restrict__ mask_ptr, int save_index, int index_mode, -poolingNdNhwcArgs args + poolingNdNhwcArgs args ) { - poolingForwardNDNhwcNaive( + poolingForwardNDNhwcNaive( bot_ptr, top_ptr, junk_ptr, diff --git a/src/pooling/problem_description.cpp b/src/pooling/problem_description.cpp index 7807d5425b..a96101be3a 100644 --- a/src/pooling/problem_description.cpp +++ b/src/pooling/problem_description.cpp @@ -60,6 +60,7 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const : MLO_POOLING_OP_AVE_INCLUSIVE); ss << "m" + std::to_string(pooling_method); + ss << "_dt" << xDesc.GetType(); if(const auto ct = xDesc.GetCastType()) ss << "_dct" << GetDataTypeName(*ct); diff --git a/src/solver/pooling/forwardNdNhwcNaive.cpp b/src/solver/pooling/forwardNdNhwcNaive.cpp index 6e28b98645..d31df3358d 100644 --- a/src/solver/pooling/forwardNdNhwcNaive.cpp +++ b/src/solver/pooling/forwardNdNhwcNaive.cpp @@ -72,7 +72,7 @@ bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&, { auto x_type = problem.GetXDesc().GetType(); auto y_type = problem.GetYDesc().GetType(); - std::vector types {miopenFloat, miopenHalf, miopenInt8, miopenFloat8}; // , miopenBFloat16 + std::vector types {miopenFloat, miopenHalf, miopenInt8, miopenFloat8, miopenBFloat16}; // auto mode = problem.GetPooling().GetMode(); std::vector modes {miopenPoolingMax, miopenPoolingAverage, miopenPoolingAverageInclusive}; @@ -81,7 +81,7 @@ bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&, auto y_layout = problem.GetYDesc().GetLayout_str(); std::vector layouts {"NHWC", "NDHWC"}; - bool app = (problem.GetDirection() == miopen::pooling::Direction::Forward) // + bool app = (problem.GetDirection() == miopen::pooling::Direction::Forward) // && (x_type == y_type) // && (x_layout == y_layout) // && (std::find(types.cbegin(), types.cend(), x_type) != types.cend()) // diff --git a/test/gtest/poolingFwd2dNaive.cpp b/test/gtest/poolingFwd2dNaive.cpp index 45a51525e6..828fc556cb 100644 --- a/test/gtest/poolingFwd2dNaive.cpp +++ b/test/gtest/poolingFwd2dNaive.cpp @@ -258,8 +258,12 @@ void Run2dDriver(miopenDataType_t prec) } GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dBF16); +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dInt8); +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dFloat); +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dHalf); +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dF8); -//INSTANTIATE_TEST_SUITE_P(BF16, PoolingFwd2dBF16, testing::Values(Get2dTestCases("--bfloat16"))); +INSTANTIATE_TEST_SUITE_P(BF16, PoolingFwd2dBF16, testing::Values(Get2dTestCases("--bfloat16"))); INSTANTIATE_TEST_SUITE_P(Int8, PoolingFwd2dInt8, testing::Values(Get2dTestCases("--int8"))); INSTANTIATE_TEST_SUITE_P(Float, PoolingFwd2dFloat, testing::Values(Get2dTestCases("--float"))); INSTANTIATE_TEST_SUITE_P(Half, PoolingFwd2dHalf, testing::Values(Get2dTestCases("--half"))); diff --git a/test/gtest/pooling_testing.hpp b/test/gtest/pooling_testing.hpp index 990759c620..2fdb4ded19 100644 --- a/test/gtest/pooling_testing.hpp +++ b/test/gtest/pooling_testing.hpp @@ -114,12 +114,14 @@ tensor get_big_output_tensor(const miopen::PoolingDescriptor& filter, con { auto desc = filter.GetForwardOutputTensor(input.desc); auto lens = desc.GetLengths(); - if(desc.GetNumBytes() > 1000) + if(desc.GetElementSize() > 1024) lens[0] *= 2; else - lens[0] *= 10; - auto big = miopen::TensorDescriptor{miopenFloat, input.desc.GetLayout_t(), lens, desc.GetStrides()}; - return tensor{big}; + lens[0] *= (2047 / desc.GetElementSize()) + 1 ; + auto dbig = miopen::TensorDescriptor{miopenFloat, input.desc.GetLayout_t(), lens, desc.GetStrides()}; + auto big = tensor{dbig}; + for (auto& v : big) v = -2.2222f; + return big; } template @@ -163,7 +165,7 @@ struct pooling_operators #define MAX_PRINTING 128 // TEMPCODE RJS #define MAX_PRINT 12 // TEMPCODE RJS #define MAX_NCD 2 -#define GPU_JUNK 0 // 160 +#define GPU_JUNK 0 // typ. 160, reasonable max is 320 #define PRINT_CPU_IN 0 #define PRINT_GPU_OUT 0 #define GPU_4COL false @@ -340,7 +342,7 @@ auto st = sc::now(); printClamped(PIPE, input, inlen, instr); // printClamped(PIPE, input, inlen, instr, 1); } -coutms("cpu", st); +coutms("f.cpu", st); return out; } @@ -427,16 +429,24 @@ auto st = sc::now(); auto write = sc::now(); auto&& handle = get_handle(); auto out = get_output_tensor(filter, input); - auto junk = get_big_output_tensor(filter, input); // TEMPCODE RJS + auto junk = get_big_output_tensor(filter, input); // TEMPCODE RJS + +#if GPU_JUNK > 0 + PIPE << "GPU junk init: " << std::endl; + for(int idx = 0; idx < GPU_JUNK; ++idx) { + PIPE << std::setw(11) << std::setprecision(5) << junk.data[idx] << " "; + if((idx % 4) == 3) PIPE <(out.GetSize()); - auto junk_dev = handle.Create(junk.GetSize()); // + auto junk_dev = handle.Write(junk.data); // TEMPCODE RJS Workspace wspace{}; wspace.Write(indices); -coutms("GPUwrite", write); +coutms("f.GPUwrite", write); float alpha = 1, beta = 0; filter.Forward(handle, &alpha, @@ -450,7 +460,7 @@ coutms("GPUwrite", write); wspace.size(), junk_dev.get()); // TEMPCODE RJS handle.Finish(); // TEMPCODE RJS -coutms("gpu1", st); +coutms("f.gpu1", st); filter.Forward(handle, &alpha, input.desc, @@ -515,8 +525,8 @@ std::cout << (printing ? "printing output from GPU..." : "skipping GPU print.") printClamped(PIPE, out, outlen, outstr); // printClamped(std::cout, out, outlen, outstr, 1); } // print output tensor -coutms("GPUread", read); -coutms("gpu", st); +coutms("f.GPUread", read); +coutms("f.gpu", st); return out; } @@ -548,6 +558,7 @@ struct verify_backward_pooling bool use_global_index, bool verify_index) const { +auto st = sc::now(); const int sptl_dim_offset = 2; const int chan_dim_offset = 1; @@ -616,7 +627,7 @@ struct verify_backward_pooling { idx[0] = o; idx[1] = w; - if(verify_index) + if(false && verify_index) { CHECK( miopen::float_equal(input(idx), out(o, w, out_spatial_id_pack...))); @@ -691,7 +702,8 @@ struct verify_backward_pooling dinput(in_id_pack...) = din_vec.at(din_idx); }); - return dinput; + coutms("b.cpu", st); + return dinput; } template @@ -703,6 +715,8 @@ struct verify_backward_pooling bool, bool) const { +auto st = sc::now(); +auto write = sc::now(); auto&& handle = get_handle(); auto dinput = input; @@ -713,6 +727,7 @@ struct verify_backward_pooling Workspace wspace{}; wspace.Write(indices); +coutms("b.GPUwrite", write); float alpha = 1, beta = 0; filter.Backward(handle, @@ -731,8 +746,29 @@ struct verify_backward_pooling dinput.desc, din_dev.get(), wspace.ptr()); - +handle.Finish(); // TEMPCODE RJS +coutms("b.gpu1", st); + filter.Backward(handle, + &alpha, + // y + out.desc, + out_dev.get(), + // dy + dout.desc, + dout_dev.get(), + // x + input.desc, + in_dev.get(), + &beta, + // dx + dinput.desc, + din_dev.get(), + wspace.ptr()); +handle.Finish(); // TEMPCODE RJS +auto read = sc::now(); dinput.data = handle.Read(din_dev, dinput.data.size()); +coutms("b.GPUread", read); +coutms("b.gpu", st); return dinput; } @@ -746,10 +782,12 @@ struct verify_backward_pooling bool, bool) const { - std::cout << "Backward "; - print(filter, input.desc.IsDefaultLayout()); - std::cout << "Input tensor: " << input.desc.ToString() << std::endl; - std::cout << "Output tensor: " << out.desc.ToString() << std::endl; + std::ostringstream oss; + oss << "Backward "; + print(oss, filter, input.desc.IsDefaultLayout()); + oss << "Input tensor: " << input.desc.ToString() << std::endl; + oss << "Output tensor: " << out.desc.ToString() << std::endl; + GTEST_FAIL() << oss.str(); } }; @@ -794,7 +832,7 @@ struct pooling_driver : test_driver { add(index_type, "index_type", - // generate_data({"miopenIndexUint32"} // TEMPCODE RJS + // generate_data({"miopenIndexUint32"} // TEMPCODE RUN generate_multi_data( // {{"miopenIndexUint32", "miopenIndexUint8" @@ -808,13 +846,15 @@ struct pooling_driver : test_driver add(mode_str, "mode_str", generate_data( - {"miopenPoolingMax", "miopenPoolingAverage"})); // , "miopenPoolingAverageInclusive" + {"miopenPoolingMax", "miopenPoolingAverage", "miopenPoolingAverageInclusive"})); // TEMPCODE RUN #if TEST_PADDING_MODE == 1 add(pmode, "pmode", generate_data({"default", "same", "valid"})); #endif add(verify_indices, "verify_indices", generate_data({1})); } +#define FORWARD +#define BACKWARD template void run_impl() @@ -823,46 +863,20 @@ struct pooling_driver : test_driver auto input = tensor{layout, in_shape}; for(auto& v : input.data) v = gen_value(); - // TEMPCODE RJS print input tensor - // bool printing = in_shape[0] <= MAX_PRINTING && in_shape[1] <= MAX_PRINTING; - // if (in_shape.size() > 2) printing &= in_shape[2] <= MAX_PRINTING; - // if(printing) - // { - // auto inlen = input.desc.GetLengths(); - // auto instr = input.desc.GetStrides(); - // std::cout << "CPU GEN : " << input.desc.GetLayout_str() << "(" << inlen.size() << ") | " << input.data.size() << " | " << input.desc.GetElementSpace() << " | "; - // for(auto dim : inlen) std::cout << std::setw(4) << dim; - // std::cout << " | "; - // for(auto str : instr) std::cout << std::setw(4) << str; - // std::cout << " | "; - // for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str; - // std::cout << " | "; - // for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str; - // std::cout << " | "; - // for(auto str : filter.GetPads()) std::cout << std::setw(4) << str; - // std::cout << std::endl; - - // for(int nn = 0; nn < inlen[0]; ++nn) { - // for(int cc = 0; cc < inlen[1]; ++cc) { - // for(int hh = 0; hh < inlen[2]; ++hh) { - // for(int ww = 0; ww < inlen[3]; ++ww) {// nn * instr[0] + cc * instr[1] + hh * instr[2] + ww * instr[3] - // std::cout << std::setw(11) << std::setprecision(5) << input.data[input.desc.GetIndex(nn, cc, hh, ww)] << " "; - // } - // std::cout << std::endl; - // } - // } - // } - // } auto st = sc::now(); +#ifdef FORWARD auto out = verify(verify_forward_pooling{}, input, filter, indices); - coutms("verifyfwd", st); -#ifdef BACKWARD +coutms("f.verify", st); +#endif +#ifdef BACKWARD // TEMPCODE RJS no backward + if(!std::is_same::value && !std::is_same::value) return; +auto bst = sc::now(); auto dout = out.first; dout.generate(tensor_elem_gen_integer{2503}); - verify(verify_backward_pooling{}, // TEMPCODE RJS no backward + verify(verify_backward_pooling{}, input, dout, out.first, @@ -870,53 +884,53 @@ auto st = sc::now(); indices, wsidx != 0, static_cast(this->verify_indices)); +coutms("b.verify", bst); #endif } +#define CHECK_SKIP \ +if(skip) \ +{ \ + std::cout << "\n############ RunSkip # " << std::setw(7) << num_all_case++ << " : "; \ + show_command(); \ + std::cout << "-- " << oss.str() << std::endl; \ +coutms("skipRun", st); return; \ +} -#define RETURN coutms("Run", st); return +#define SKIP_RUN skip = true; CHECK_SKIP void run() { auto st = sc::now(); const bool is_default_layout = miopen::TensorDescriptor::IsDefaultLayout(layout); // TEMPCODE RJS - { - bool skip = false; - std::ostringstream oss; - - if(MAX_ALL_CASES && num_all_case > MAX_ALL_CASES) - { - skip = true; - oss << " : skipped due to MAX_ALL_CASES=" << MAX_ALL_CASES; - } - if(this->dry_run) - { - skip = true; - oss << " : skipped due to dry_run"; - } - if(is_default_layout && (this->type == miopenInt8 || this->type == miopenFloat8)) - { - skip = true; - oss << " : skipped, no solvers for datatype " << this->type << " and default layouts"; - } - - std::cout << "\n############ Run " << (skip ? " " : "#") << " " << std::setw(6) << num_all_case++ << " : "; - show_command(); + bool skip = false; + std::ostringstream oss; - if(skip) - { - std::cout << "-- " << oss.str() << std::endl; - RETURN; - } + if(MAX_ALL_CASES && num_all_case > MAX_ALL_CASES) + { + skip = true; + oss << " : skipped due to MAX_ALL_CASES=" << MAX_ALL_CASES; + } + if(this->dry_run) + { + skip = true; + oss << " : skipped due to dry_run"; + } + if(is_default_layout && (this->type != miopenFloat && this->type != miopenHalf)) + { + skip = true; + oss << " : skipped, no solvers for datatype " << this->type << " and default layouts"; } + CHECK_SKIP; + int sptl_dim = static_cast(in_shape.size()) - 2; if(sptl_dim != 2 && sptl_dim != 3) { - std::cout << "Warning: Config skipped due to invalid dimensions. 'in_shape' must be in NCHW or NCDHW form." + oss << "Warning: Config skipped due to invalid dimensions. 'in_shape' must be in NCHW or NCDHW form." << std::endl; - RETURN; + SKIP_RUN; } // To simplify launching, input dimensions to the driver are always default layout. Desire to @@ -950,29 +964,29 @@ auto st = sc::now(); if(mask_idx && sptl_dim == 3 && filter.GetMode() == miopenPoolingMax) { - std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented " + oss << "Warning: Config skipped. Workspace index mask mode is not implemented " "yet in 3D max pooling solvers." << std::endl; - RETURN; + SKIP_RUN; } if(mask_idx && sptl_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset) { - std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented " + oss << "Warning: Config skipped. Workspace index mask mode is not implemented " "yet in 2D max backward solvers that support wide pooling window." << std::endl; - RETURN; + SKIP_RUN; } if(mask_idx && filter.ModeIsAveraging()) { - std::cout << "Warning: Config skipped. Workspace index modes are irrelevant for " + oss << "Warning: Config skipped. Workspace index modes are irrelevant for " "Average pooling. " "In order to optimize performance of full tests, we " "skip average pooling configs when (wsidx == 0). " "Please make sure that dataset includes counterparts with (wsidx == 1)." << std::endl; - RETURN; + SKIP_RUN; } // index size filter @@ -985,10 +999,10 @@ auto st = sc::now(); if(index_max <= index_needed) { - std::cout << "Warning: Config skipped: index mode " << filter.GetWorkspaceIndexMode() + oss << "Warning: Config skipped: index mode " << filter.GetWorkspaceIndexMode() << " type " << filter.GetIndexType() << " is too small. max=" << index_max << ", needed=" << index_needed << std::endl; - RETURN; + SKIP_RUN; } } @@ -1002,11 +1016,11 @@ auto st = sc::now(); if(full_set && (sptl_dim == 3 || (mask_idx && sptl_dim == 2)) && filter.GetMode() == miopenPoolingMax) { - std::cout << "Warning: Config skipped: uint8 index is too small " + oss << "Warning: Config skipped: uint8 index is too small " "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) " "&& filter.GetMode() == miopenPoolingMax" << std::endl; - RETURN; + SKIP_RUN; } break; } @@ -1014,21 +1028,21 @@ auto st = sc::now(); if(full_set && (sptl_dim == 3 || (!mask_idx && sptl_dim == 2)) && filter.GetMode() == miopenPoolingMax) { - std::cout << "Warning: Config skipped: uint16 index is too small " + oss << "Warning: Config skipped: uint16 index is too small " "(sptl_dim == 3 || (sptl_dim == 2 && wsidx == 1)) " "&& filter.GetMode() == miopenPoolingMax" << std::endl; - RETURN; + SKIP_RUN; } if(skip_many_configs_with_non_int8_index) { // test_pooling_test --all limit uint16 cases if(num_uint16_case >= max_typed_cases) { - std::cout << "Warning: Config skipped for the default dataset to speed " + oss << "Warning: Config skipped for the default dataset to speed " "up testing (num_uint16_case > 5)" << std::endl; - RETURN; + SKIP_RUN; } ++num_uint16_case; } @@ -1043,10 +1057,10 @@ auto st = sc::now(); { if(num_uint32_case >= max_typed_cases) { - std::cout << "Warning: Config skipped for the default dataset to speed up " + oss << "Warning: Config skipped for the default dataset to speed up " "testing (wsidx == 0 && num_uint32_case > 5)" << std::endl; - RETURN; + SKIP_RUN; } ++num_uint32_case; } @@ -1054,10 +1068,10 @@ auto st = sc::now(); { if(num_uint32_case_imgidx >= max_typed_cases) { - std::cout << "Warning: Config skipped for the default dataset to speed up " + oss << "Warning: Config skipped for the default dataset to speed up " "testing (wsidx != 0 && num_uint32_case_imgidx > 5)" << std::endl; - RETURN; + SKIP_RUN; } ++num_uint32_case_imgidx; } @@ -1072,10 +1086,10 @@ auto st = sc::now(); { if(num_uint64_case >= max_typed_cases) { - std::cout << "Warning: Config skipped for the default dataset to speed up " + oss << "Warning: Config skipped for the default dataset to speed up " "testing (wsidx == 0) && (num_uint64_case > 5)" << std::endl; - RETURN; + SKIP_RUN; } ++num_uint64_case; } @@ -1083,11 +1097,11 @@ auto st = sc::now(); { if(num_uint64_case_imgidx >= max_typed_cases && sptl_dim == 2) { - std::cout << "Warning: Config skipped to speed up testing of the " + oss << "Warning: Config skipped to speed up testing of the " "default dataset (wsidx != 0) && (num_uint64_case_imgidx > 5 " "&& sptl_dim == 2)" << std::endl; - RETURN; + SKIP_RUN; } ++num_uint64_case_imgidx; } @@ -1103,10 +1117,10 @@ auto st = sc::now(); { if(lens[i] > (input_desc.GetLengths()[i + 2] + static_cast(2) * pads[i])) { - std::cout << "Warning: Config skipped becuse it is invalid " + oss << "Warning: Config skipped becuse it is invalid " "(lens[i] > (input_desc.GetLengths()[i + 2] + 2 * pads[i]))" << std::endl; - RETURN; + SKIP_RUN; } } @@ -1120,13 +1134,18 @@ auto st = sc::now(); size_t device_mem = get_handle().GetGlobalMemorySize(); if(total_mem >= device_mem) { - std::cout << "Config skipped because it requires " << total_mem + oss << "Config skipped because it requires " << total_mem << " Bytes to write all necessary tensors to GPU. GPU has " << device_mem << " Bytes of memory." << std::endl; - RETURN; + SKIP_RUN; } } + CHECK_SKIP; + + std::cout << "\n############ Run # " << std::setw(7) << num_all_case++ << " : "; + show_command(); + std::vector in_dim(input_desc.GetLengths().begin(), input_desc.GetLengths().begin() + sptl_dim); std::vector out_dim(sptl_dim); diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp index b30b9ce36f..b3acb61e07 100644 --- a/test/pooling2d.hpp +++ b/test/pooling2d.hpp @@ -38,33 +38,34 @@ struct pooling2d_shapes static std::vector get_2d_pooling_input_shapes() { return { - {5, 32, 8, 8}, + {5, 32, 8, 8} // TEMPCODE RUN + , {16, 1, 4096, 4096}, {1, 16, 4096, 4096}, {1, 1024, 512, 512}, {16, 1024, 128, 128} - // , - // {1, 832, 64, 128}, - // {10, 3, 32, 32}, - // {1, 19, 1024, 2048}, - // {2, 1024, 12, 12}, - // {4, 3, 231, 231}, - // {8, 3, 227, 227}, - // {1, 384, 13, 13}, - // {1, 96, 27, 27}, - // {2, 160, 7, 7}, - // {1, 192, 256, 512}, - // {2, 192, 28, 28}, - // {1, 256, 56, 56}, - // {4, 3, 224, 224}, - // {2, 64, 112, 112}, - // {2, 608, 4, 4}, - // {1, 2048, 11, 11} + , + {1, 832, 64, 128}, + {10, 3, 32, 32}, + {1, 19, 1024, 2048}, + {2, 1024, 12, 12}, + {4, 3, 231, 231}, + {8, 3, 227, 227}, + {1, 384, 13, 13}, + {1, 96, 27, 27}, + {2, 160, 7, 7}, + {1, 192, 256, 512}, + {2, 192, 28, 28}, + {1, 256, 56, 56}, + {4, 3, 224, 224}, + {2, 64, 112, 112}, + {2, 608, 4, 4}, + {1, 2048, 11, 11} }; } // Dataset 1 is intended for testing of asymmetric configs. - static std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 2, 8, 8}, {10, 3, 32, 32}}; } // {1, 1, 8, 8}, + static std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 1, 8, 8}, {10, 3, 32, 32}}; } // Dataset 2 is intended for testing of configs with wide window. static std::vector get_2d_pooling_input_shapes_wide() diff --git a/test/pooling3d.hpp b/test/pooling3d.hpp index 6792a5cc84..341cc34660 100644 --- a/test/pooling3d.hpp +++ b/test/pooling3d.hpp @@ -33,13 +33,15 @@ struct pooling3d_shapes static std::vector get_3d_pooling_input_shapes() { - return {{16, 64, 3, 4, 4}, + return {{16, 64, 3, 4, 4} // TEMPCODE RUN + , {16, 32, 4, 9, 9}, {8, 512, 3, 14, 14}, {8, 512, 4, 28, 28}, {16, 64, 56, 56, 56}, {4, 3, 4, 227, 227}, - {4, 4, 4, 161, 700}}; + {4, 4, 4, 161, 700} + }; } }; From e04542086e1e4bf4f1aacf8e9c5c18ab80ae9cb0 Mon Sep 17 00:00:00 2001 From: "Randy J. Spaulding" Date: Tue, 8 Oct 2024 06:40:29 -0700 Subject: [PATCH 08/10] pr chkpt, pads1,1 broken --- src/CMakeLists.txt | 26 +- src/comgr.cpp | 9 +- src/hipoc/hipoc_program.cpp | 1 - src/include/miopen/kernel_info.hpp | 3 + src/include/miopen/pooling.hpp | 3 +- src/include/miopen/pooling/invoke_params.hpp | 1 - .../miopen/pooling/poolingNdNhwcArgs.hpp | 50 +- .../MIOpenPoolingForwardNDNhwcNaive.cpp | 180 +------ src/ocl/pooling_ocl.cpp | 4 +- src/pooling/problem_description.cpp | 5 +- src/solver.cpp | 13 + src/solver/pooling/forward2d.cpp | 36 +- src/solver/pooling/forwardCk2d.cpp | 2 +- src/solver/pooling/forwardNaive.cpp | 26 +- src/solver/pooling/forwardNdNhwcNaive.cpp | 92 +--- test/gtest/layout_transpose.cpp | 44 +- test/gtest/poolingFwd2dNaive.cpp | 86 ++-- test/gtest/poolingFwd3dNaive.cpp | 46 +- test/gtest/pooling_testing.hpp | 454 +++--------------- test/pooling2d.hpp | 2 +- test/pooling_common.hpp | 187 +------- 21 files changed, 251 insertions(+), 1019 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index df8c2f9614..359604bfd3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -458,6 +458,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/miopen_utility.hpp kernels/neuron.inc kernels/pooling_functions.h + include/miopen/pooling/poolingNdNhwcArgs.hpp kernels/rocm_version.inc kernels/stride_array.hpp kernels/utilities.inc @@ -599,24 +600,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/xform_bidirect_winograd_out.s kernels/UniversalTranspose.cl) -# TEMPCODE KERNELS - set(MIOPEN_KERNELS - kernels/MIOpenPoolingForwardNDNhwcNaive.cpp - kernels/MIOpenPooling.cl - kernels/MIOpenPoolingBwd.cl - kernels/MIOpenPoolingBwdND.cl - kernels/MIOpenPoolingForwardNaive.cl - kernels/MIOpenPoolingND.cl - ) - - set(MIOPEN_KERNEL_INCLUDES - kernels/bfloat16_dev.hpp - kernels/float_types.h - kernels/miopen_cstdint.hpp - kernels/pooling_functions.h - include/miopen/pooling/poolingNdNhwcArgs.hpp - ) - # Kernels in development lists. + # Kernels in development lists. # Should be ALWAYS empty in develop branch (at the time of PR merge) # Intention: to speed up kernel development rebuild time set(MIOPEN_DEVELOPMENT_KERNELS) @@ -793,12 +777,6 @@ else() ) endif() -# TEMPCODE LIB_INC -include_directories(SYSTEM - /opt/rocm/include/gtest/ - /opt/rocm/include/ -) - rocm_set_soversion(MIOpen ${MIOpen_SOVERSION}) clang_tidy_check(MIOpen) diff --git a/src/comgr.cpp b/src/comgr.cpp index 0fec8fa8ea..2c533f9623 100644 --- a/src/comgr.cpp +++ b/src/comgr.cpp @@ -887,7 +887,6 @@ class HiprtcProgram const auto log = GetLog(false); if(!log.empty()) MIOPEN_LOG_I(log); - std::cout << "************** HipRTC compile log: '" << log << "'" << std::endl; // TEMPCODE RJS } void GetCode(std::vector& bytes) const @@ -990,9 +989,11 @@ void BuildHip(const std::string& name, HiprtcProgram prog(name, text); try{ // TEMPCODE RJS - prog.Compile(opts); - } catch(Error& ex) { std::cout << __FUNCTION__ << " : Exception calling prog.Compile!: " << ex.text << std::endl; throw(ex); } - prog.GetCode(binary); + + std::cout << "Compling HIP: '" << name << "'" << std::endl; // TEMPCODE RJS + prog.Compile(opts); + } catch(Error& ex) { std::cout << __FUNCTION__ << " : Exception calling prog.Compile!: " << ex.text << std::endl; throw(ex); } + prog.GetCode(binary); } catch(Error& ex) { diff --git a/src/hipoc/hipoc_program.cpp b/src/hipoc/hipoc_program.cpp index fa151bd3ed..ec090455e8 100644 --- a/src/hipoc/hipoc_program.cpp +++ b/src/hipoc/hipoc_program.cpp @@ -275,7 +275,6 @@ void HIPOCProgramImpl::BuildCodeObjectInMemory(const std::string& params, #endif if(filename.extension() == ".cpp") { - std::cout << "Compling HIP: '" << filename << "'" << std::endl; // TEMPCODE RJS hiprtc::BuildHip(filename.string(), src, params, target, binary); } else if(filename.extension() == ".s") diff --git a/src/include/miopen/kernel_info.hpp b/src/include/miopen/kernel_info.hpp index d2571afa32..d824fe5c78 100644 --- a/src/include/miopen/kernel_info.hpp +++ b/src/include/miopen/kernel_info.hpp @@ -48,6 +48,9 @@ struct KernelInfo fs::path kernel_file; std::string kernel_name; friend std::ostream& operator<<(std::ostream& os, const KernelInfo& k); + + /// configures the working set using hip-style indices + void ConfigureHip(size_t l0, size_t l1, size_t l2, size_t g0, size_t g1, size_t g2); }; std::vector PrecompileKernels(const Handle& h, diff --git a/src/include/miopen/pooling.hpp b/src/include/miopen/pooling.hpp index 9cf2bae82f..26805f6df8 100644 --- a/src/include/miopen/pooling.hpp +++ b/src/include/miopen/pooling.hpp @@ -148,8 +148,7 @@ struct MIOPEN_EXPORT PoolingDescriptor : miopenPoolingDescriptor Data_t y, bool save_index, Data_t workSpace, - size_t workSpaceSize, - Data_t junk = nullptr) const; // TEMPCODE RJS + size_t workSpaceSize) const; miopenStatus_t Backward(Handle& handle, const void* alpha, diff --git a/src/include/miopen/pooling/invoke_params.hpp b/src/include/miopen/pooling/invoke_params.hpp index 2140c0a1fb..2d55786c21 100644 --- a/src/include/miopen/pooling/invoke_params.hpp +++ b/src/include/miopen/pooling/invoke_params.hpp @@ -45,7 +45,6 @@ struct FwdInvokeParams : public miopen::InvokeParams Data_t y = nullptr; Data_t workspace = nullptr; std::size_t workspace_size = 0; - Data_t junk = nullptr; // TEMPCODE RJS std::size_t GetWorkspaceSize() const { return workspace_size; } Data_t GetWorkspace() const { return workspace; } diff --git a/src/include/miopen/pooling/poolingNdNhwcArgs.hpp b/src/include/miopen/pooling/poolingNdNhwcArgs.hpp index 9fd15bd4bd..5084f28b4f 100644 --- a/src/include/miopen/pooling/poolingNdNhwcArgs.hpp +++ b/src/include/miopen/pooling/poolingNdNhwcArgs.hpp @@ -1,19 +1,45 @@ #pragma once -// #include - -using BIGONE = uint32_t; // TEMPCODE RJS - struct poolingNdNhwcArgs { - uint32_t filter_d; uint32_t filter_h; uint32_t filter_w; - uint32_t filter_d_stride; uint32_t filter_h_stride; uint32_t filter_w_stride; - uint32_t filter_d_pad; uint32_t filter_h_pad; uint32_t filter_w_pad; + uint32_t filter_d; + uint32_t filter_h; + uint32_t filter_w; + + uint32_t filter_d_stride; + uint32_t filter_h_stride; + uint32_t filter_w_stride; + + uint32_t filter_d_pad; + uint32_t filter_h_pad; + uint32_t filter_w_pad; + uint32_t all_n; uint32_t all_c; - uint32_t bot_d; uint32_t bot_h; uint32_t bot_w; - BIGONE bot_n_stride; uint32_t bot_c_stride; BIGONE bot_d_stride; uint32_t bot_h_stride; uint32_t bot_w_stride; - uint32_t top_d; uint32_t top_h; uint32_t top_w; - BIGONE top_n_stride; uint32_t top_c_stride; BIGONE top_d_stride; uint32_t top_h_stride; uint32_t top_w_stride; - BIGONE mask_n_stride; BIGONE mask_c_stride; uint32_t mask_d_stride; uint32_t mask_h_stride; uint32_t mask_w_stride; + + uint32_t bot_d; + uint32_t bot_h; + uint32_t bot_w; + + uint32_t bot_n_stride; + uint32_t bot_c_stride; + uint32_t bot_d_stride; + uint32_t bot_h_stride; + uint32_t bot_w_stride; + + uint32_t top_d; + uint32_t top_h; + uint32_t top_w; + + uint32_t top_n_stride; + uint32_t top_c_stride; + uint32_t top_d_stride; + uint32_t top_h_stride; + uint32_t top_w_stride; + + uint32_t mask_n_stride; + uint32_t mask_c_stride; + uint32_t mask_d_stride; + uint32_t mask_h_stride; + uint32_t mask_w_stride; }; diff --git a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp index be6a548091..c502cf625f 100644 --- a/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp +++ b/src/kernels/MIOpenPoolingForwardNDNhwcNaive.cpp @@ -24,26 +24,10 @@ * *******************************************************************************/ -//#define TEMPCODE RJS -// #ifdef TEMPCODE -// #define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 0 - -// #define MLO_POOLING_OP_ID MLO_POOLING_OP_AVE - -// #define INPUT_TYPE _FLOAT -// #define OUTPUT_TYPE _FLOAT -// // #define TI INPUT_TYPE -// // #define TO OUTPUT_TYPE -// #define CVT_FP32_2ACCUM(x) (x) -// #endif - #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS #include #endif -#ifdef TEMPCODE -#include "float_types.h" -#endif #include "pooling_functions.h" #include "poolingNdNhwcArgs.hpp" @@ -57,7 +41,6 @@ // Let's use extended-precision accumulator only in FP16 pooling and only for averaging. // For all other ops and datatypes, use native accumulator, i.e. treate FLOAT_ACCUM as FLOAT. -#ifndef TEMPCODE #if !(AVERAGE_OPS && MIOPEN_USE_FP16) #define MIOPEN_USE_NATIVE_DATATYPE_ACCUM 1 // #else @@ -88,8 +71,6 @@ #define NATIVE_UNCAST(_x) (_FLOAT)(_x) #endif -#endif // TEMPCODE - #if AVERAGE_OPS #define ARG_UNUSED_FOR_AVERAGE __attribute__((__unused__)) #else @@ -101,7 +82,6 @@ template __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, TI* __restrict__ top_ptr, - float* __restrict__ junk_ptr, // TEMPCODE RJS ARG_UNUSED_FOR_AVERAGE index_t* __restrict__ mask_ptr, ARG_UNUSED_FOR_AVERAGE int save_index, ARG_UNUSED_FOR_AVERAGE int index_mode, @@ -112,7 +92,7 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, const uint32_t nd = blockIdx.x; const uint32_t h_ = blockIdx.y; const uint32_t w_c = blockIdx.z; - const uint32_t w_ = w_c % args.top_w; // CAN w=fast index + const uint32_t w_ = w_c % args.top_w; // CAN w=fast index const uint32_t C_WH = blockDim.x; const uint32_t _H = blockDim.y; @@ -122,11 +102,11 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, const uint32_t _h = threadIdx.y; const uint32_t _w = threadIdx.z; - const uint32_t nn = nd / args.top_d; // n=slow index - const uint32_t cc = (w_c / args.top_w) * C_WH + c; // c=slow index (lg-C) - const uint32_t td = nd % args.top_d; // top d=fast index - const uint32_t th = h_ * _H + _h; // top h: blockIdx is slow (sm-C) - const uint32_t tw = w_ * _W + _w; // top w: blockIdx is slow (sm-C) + const uint32_t nn = nd / args.top_d; // n=slow index + const uint32_t cc = (w_c / args.top_w) * C_WH + c; // c=slow index (lg-C) + const uint32_t td = nd % args.top_d; // top d=fast index + const uint32_t th = h_ * _H + _h; // top h: blockIdx is slow (sm-C) + const uint32_t tw = w_ * _W + _w; // top w: blockIdx is slow (sm-C) if(nn >= args.all_n) return; if(td >= args.top_d) return; @@ -134,96 +114,16 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, if(tw >= args.top_w) return; if(cc >= args.all_c) return; - for(int i = 4; i < 320; ++i) - { - junk_ptr[i] = 1.11111; - } - auto log_ptr = junk_ptr; - if(blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) - { - int idx = 8; - log_ptr[idx++] = gridDim.x; // ND - log_ptr[idx++] = gridDim.y; // H - log_ptr[idx++] = gridDim.z; // W (*C overflow) - log_ptr[idx++] = -9; - - log_ptr[idx++] = blockDim.x; // C - log_ptr[idx++] = blockDim.y; // small-C H - log_ptr[idx++] = blockDim.z; // small-C W - log_ptr[idx++] = -8; - - log_ptr[idx++] = args.filter_d; - log_ptr[idx++] = args.filter_h; - log_ptr[idx++] = args.filter_w; - log_ptr[idx++] = -7; - - log_ptr[idx++] = args.filter_d_stride; - log_ptr[idx++] = args.filter_h_stride; - log_ptr[idx++] = args.filter_w_stride; - log_ptr[idx++] = -6; - - log_ptr[idx++] = args.filter_d_pad; - log_ptr[idx++] = args.filter_h_pad; - log_ptr[idx++] = args.filter_w_pad; - log_ptr[idx++] = -5; - - log_ptr[idx++] = args.all_n; - log_ptr[idx++] = args.all_c; - log_ptr[idx++] = args.bot_n_stride; - log_ptr[idx++] = args.bot_c_stride; - - log_ptr[idx++] = args.top_n_stride; - log_ptr[idx++] = args.top_c_stride; - #if AVERAGE_OPS - log_ptr[idx++] = -4; - log_ptr[idx++] = -4; - #else - log_ptr[idx++] = args.mask_n_stride; - log_ptr[idx++] = args.mask_c_stride; - #endif - - log_ptr[idx++] = args.bot_d; - log_ptr[idx++] = args.bot_h; - log_ptr[idx++] = args.bot_w; - log_ptr[idx++] = -3; - - log_ptr[idx++] = args.bot_d_stride; - log_ptr[idx++] = args.bot_h_stride; - log_ptr[idx++] = args.bot_w_stride; - log_ptr[idx++] = -2; - - log_ptr[idx++] = args.top_d; - log_ptr[idx++] = args.top_h; - log_ptr[idx++] = args.top_w; - log_ptr[idx++] = -1; - - log_ptr[idx++] = args.top_d_stride; - log_ptr[idx++] = args.top_h_stride; - log_ptr[idx++] = args.top_w_stride; - log_ptr[idx++] = -9; - - #if AVERAGE_OPS - log_ptr[idx++] = -8; - log_ptr[idx++] = -8; - log_ptr[idx++] = -8; - #else - log_ptr[idx++] = args.mask_d_stride; - log_ptr[idx++] = args.mask_h_stride; - log_ptr[idx++] = args.mask_w_stride; - #endif - log_ptr[idx++] = -7; - } - const auto int_dstart = static_cast(td * args.filter_d_stride) - static_cast(args.filter_d_pad); - const auto dend = static_cast(min(int_dstart + static_cast(args.filter_d), static_cast(args.bot_d))); + /* const */ auto dend = static_cast(min(int_dstart + static_cast(args.filter_d), static_cast(args.bot_d))); const auto dstart = static_cast(max(int_dstart, 0)); const auto int_hstart = static_cast(th * args.filter_h_stride) - static_cast(args.filter_h_pad); - const auto hend = static_cast(min(int_hstart + static_cast(args.filter_h), static_cast(args.bot_h))); + /* const */ auto hend = static_cast(min(int_hstart + static_cast(args.filter_h), static_cast(args.bot_h))); const auto hstart = static_cast(max(int_hstart, 0)); const auto int_wstart = static_cast(tw * args.filter_w_stride) - static_cast(args.filter_w_pad); - const auto wend = static_cast(min(int_wstart + static_cast(args.filter_w), static_cast(args.bot_w))); + /* const */ auto wend = static_cast(min(int_wstart + static_cast(args.filter_w), static_cast(args.bot_w))); const auto wstart = static_cast(max(int_wstart, 0)); size_t top_index = @@ -233,32 +133,6 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, th * args.top_h_stride + // tw * args.top_w_stride; -#if true - size_t junk_idx = 64; - int bot_idx = 0; - junk_ptr[junk_idx++] = NATIVE_CAST(bot_ptr[bot_idx++]); - junk_ptr[junk_idx++] = NATIVE_CAST(bot_ptr[bot_idx++]); - junk_ptr[junk_idx++] = NATIVE_CAST(bot_ptr[bot_idx++]); - junk_ptr[junk_idx++] = NATIVE_CAST(bot_ptr[bot_idx++]); - junk_idx += 4 * th; - if(nn == 0 && cc == 0 && td == 0 && tw < 8 && th == 0) - { - - size_t bot_ncd = static_cast(nn * args.bot_n_stride + cc * args.bot_c_stride + dstart * args.bot_d_stride); - size_t bot_ncdh = bot_ncd + hstart * args.bot_h_stride; - size_t bot_index = bot_ncdh + wstart * args.bot_w_stride; - - junk_ptr[junk_idx++] = top_index; - junk_ptr[junk_idx++] = bot_index; - junk_ptr[junk_idx++] = dstart; - junk_ptr[junk_idx++] = dend; - junk_ptr[junk_idx++] = hstart; - junk_ptr[junk_idx++] = hend; - junk_ptr[junk_idx++] = wstart; - junk_ptr[junk_idx++] = wend; - } -#endif - #if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE uint32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); pool_size = (pool_size == 0) ? 1 : pool_size; @@ -315,35 +189,27 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, #else // MAX if(save_index) { - index_t res_index = 5150; + index_t res_index = saved_index; - // / Preventing overflow during computation of res_index: - // / If Index is shorter than uint, then let's perform computation in 32-bit - // / domain and then convert to narrower Index. That would reduce the probability of - // / overflow. If Index is wider then 32 bits, then it seems like it is better to - // / convert to Index type before multiplication. However this is not actually - // / necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in - // / 32 bits and then convert. + /// Preventing overflow during computation of res_index: + /// If Index is shorter than uint, then let's perform computation in 32-bit + /// domain and then convert to narrower Index. That would reduce the probability of + /// overflow. If Index is wider then 32 bits, then it seems like it is better to + /// convert to Index type before multiplication. However this is not actually + /// necessary, see \ref multiply_dims_overflow_assumption. Let's always compute in + /// 32 bits and then convert. if(found) { - if(index_mode == 1) // TEMPCODE RJS - res_index = saved_index; - // res_index = (index_t)( // - // nn * args.bot_n_stride // - // + cc * args.bot_c_stride // - // + d_save * args.bot_d_stride // - // + h_save * args.bot_h_stride // - // + w_save * args.bot_w_stride); - else + if(index_mode == 0) res_index = (index_t)( // - ((d_save - td * args.filter_d_stride + args.filter_d_pad) * args.filter_h * args.filter_w) // - + ((h_save - th * args.filter_h_stride + args.filter_h_pad) * args.filter_w) // - + (w_save - tw * args.filter_w_stride + args.filter_w_pad) // + ((d_save - td * args.filter_d_stride + args.filter_d_pad) * args.filter_h * args.filter_w) + // + ((h_save - th * args.filter_h_stride + args.filter_h_pad) * args.filter_w) + // + (w_save - tw * args.filter_w_stride + args.filter_w_pad) // ); } - const size_t mask_index = nn * args.mask_n_stride // + const size_t mask_index = nn * args.mask_n_stride // + cc * args.mask_c_stride // + (size_t)(td * args.mask_d_stride) // + (size_t)(th * args.mask_h_stride) // @@ -358,7 +224,6 @@ __device__ void poolingForwardNDNhwcNaive(const TI* __restrict__ bot_ptr, extern "C" __global__ void mloPoolingForwardNDNhwcNaive( const INPUT_TYPE* __restrict__ bot_ptr, INPUT_TYPE* __restrict__ top_ptr, - float* __restrict__ junk_ptr, // TEMPCODE RJS index_t* __restrict__ mask_ptr, int save_index, int index_mode, @@ -368,7 +233,6 @@ extern "C" __global__ void mloPoolingForwardNDNhwcNaive( poolingForwardNDNhwcNaive( bot_ptr, top_ptr, - junk_ptr, mask_ptr, save_index, index_mode, diff --git a/src/ocl/pooling_ocl.cpp b/src/ocl/pooling_ocl.cpp index 9a7e166081..6b79def0b0 100644 --- a/src/ocl/pooling_ocl.cpp +++ b/src/ocl/pooling_ocl.cpp @@ -68,8 +68,7 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle, Data_t y, bool save_index, Data_t workSpace, - size_t workSpaceSize, - Data_t junk) const // TEMPCODE RJS + size_t workSpaceSize) const { if(!float_equal(*(static_cast(alpha)), 1.0) || @@ -132,7 +131,6 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle, tmp.y = y; tmp.workspace = workSpace; tmp.workspace_size = workSpaceSize; - tmp.junk = junk; // TEMPCODE RJS return tmp; }(); diff --git a/src/pooling/problem_description.cpp b/src/pooling/problem_description.cpp index a96101be3a..df21b8cd93 100644 --- a/src/pooling/problem_description.cpp +++ b/src/pooling/problem_description.cpp @@ -83,10 +83,9 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const ss << "_dxs" << get_vect_config(dxDesc.GetStrides()); ss << "_dyd" << get_vect_config(dyDesc.GetLengths()); ss << "_dys" << get_vect_config(dyDesc.GetStrides()); - } // TEMPCODE RJS - std::cout << "\n************** xDesc layout: " << xDesc.GetLayout_str() << " (" << static_cast(xDesc.GetLayout_t()) << (xDesc.IsDefaultLayout() ? ")" : " not default)") << " *************************" << std::endl; + } ss << "_l" << static_cast(xDesc.GetLayout_t()); -std::cout << " " << ss.str() << std::endl; + return NetworkConfig{ss.str()}; } diff --git a/src/solver.cpp b/src/solver.cpp index 28ca3751a8..c9607df252 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -66,6 +66,19 @@ std::ostream& operator<<(std::ostream& os, const KernelInfo& k) return os << "} '" << k.comp_options << '\''; } +void KernelInfo::ConfigureHip(size_t l0, size_t l1, size_t l2, size_t g0, size_t g1, size_t g2) +{ + l_wk.clear(); + l_wk.push_back(l0); + l_wk.push_back(l1); + l_wk.push_back(l2); + + g_wk.clear(); + g_wk.push_back(g0 * l0); + g_wk.push_back(g1 * l1); + g_wk.push_back(g2 * l2); +} + std::vector PrecompileKernels(const Handle& h, const std::vector& kernels, bool force_attach_binary) { diff --git a/src/solver/pooling/forward2d.cpp b/src/solver/pooling/forward2d.cpp index aa8fdb2f6b..b757c07d54 100644 --- a/src/solver/pooling/forward2d.cpp +++ b/src/solver/pooling/forward2d.cpp @@ -137,7 +137,7 @@ bool PoolingForward2d::IsApplicable(const ExecutionContext& context, { auto x_type = problem.GetXDesc().GetType(); auto y_type = problem.GetYDesc().GetType(); - std::vector types {miopenFloat, miopenHalf}; // TEMPCODE RJS fix types , miopenInt8, miopenFloat8 + std::vector types {miopenFloat, miopenHalf}; // TRJS TODO fix types , miopenInt8, miopenFloat8 auto x_layout = problem.GetXDesc().GetLayout_str(); auto y_layout = problem.GetYDesc().GetLayout_str(); @@ -153,21 +153,7 @@ bool PoolingForward2d::IsApplicable(const ExecutionContext& context, sizeof_private_memory(problem) <= TargetProperties::GetMaxWaveScratchSize() / context.GetStream().GetWavefrontWidth(); -// TEMPCODE RJS - std::cout << "%%%%%%%%%% PoolingForward2d::IsApplicable: " << app << " " << problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout(x_layout) << std::endl; - return app; -} - -#include // TEMPCODE RJS -namespace { - template - std::ostream& printVec(std::string name, const std::vector& vec) - { - return std::cout; - std::cout << "Vector Printing: " << std::setw(20) << name << "[" << vec.size() << "]: "; - for(auto i : vec) std::cout << std::setw(8) << i; - return std::cout; - } + return app; } ConvSolution PoolingForward2d::GetSolution(const ExecutionContext&, @@ -206,24 +192,6 @@ ConvSolution PoolingForward2d::GetSolution(const ExecutionContext&, : ((pool_d.GetMode() == miopenPoolingAverage) ? MLO_POOLING_OP_AVE : MLO_POOLING_OP_AVE_INCLUSIVE); - - // TEMPCODE RJS - // const auto bot = problem.GetXDesc(); - // const auto top = problem.GetYDesc(); - // const auto& pooling = problem.GetPooling(); - // const auto& lengths = pooling.GetLengths(); - // const auto& strides = pooling.GetStrides(); - // const auto& pads = pooling.GetPads(); - - // std::cout << "PoolingForward2d GetSolution: " << std::endl; - // printVec(" bot lengths", bot.GetLengths()) << - // printVec(" bot strides", bot.GetStrides()) << std::endl; - // printVec(" top lengths", top.GetLengths()) << - // printVec(" top strides", top.GetStrides()) << std::endl; - // printVec(" pool lengths", lengths) << - // printVec(" pool strides", strides) << - // printVec(" pool pads", pads) << std::endl; - auto build_params = KernelBuildParameters{ {"MLO_POOLING_OP_ID", pooling_method}, {"MLO_POOLING_KERNEL_SZ1", kp.kernel_size_h}, diff --git a/src/solver/pooling/forwardCk2d.cpp b/src/solver/pooling/forwardCk2d.cpp index 9681f7f563..7ac0c15170 100644 --- a/src/solver/pooling/forwardCk2d.cpp +++ b/src/solver/pooling/forwardCk2d.cpp @@ -135,7 +135,7 @@ std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& pro bool PoolingForwardCk2d::IsApplicable(const ExecutionContext& context, const miopen::pooling::ProblemDescription& problem) const { - return false; + return false; // TRJS not active yet return problem.GetDirection() == miopen::pooling::Direction::Forward && problem.GetXDesc().GetNumDims() == 4 && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() && diff --git a/src/solver/pooling/forwardNaive.cpp b/src/solver/pooling/forwardNaive.cpp index faf4fac836..9039d49836 100644 --- a/src/solver/pooling/forwardNaive.cpp +++ b/src/solver/pooling/forwardNaive.cpp @@ -86,19 +86,6 @@ bool PoolingForwardNaive::IsApplicable(const ExecutionContext&, && (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()); } - -#include // TEMPCODE RJS -namespace { - template - void printVec(std::string name, const std::vector& vec) - { - return; - std::cout << "Vector Printing: " << std::setw(20) << name << "[" << vec.size() << "]: "; - for(auto i : vec) std::cout << std::setw(8) << i; - std::cout << std::endl; - } -} - ConvSolution PoolingForwardNaive::GetSolution(const ExecutionContext& context, const miopen::pooling::ProblemDescription& problem) const @@ -136,17 +123,6 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, const auto index_mode = pooling.GetWorkspaceIndexMode(); const auto index_type = pooling.GetIndexType(); - // TEMPCODE RJS - std::cout << "======================================================================" << std::endl; - printVec("bot lengths", bot.GetLengths()); - printVec("bot strides", bot.GetStrides()); - printVec("top lengths", top.GetLengths()); - printVec("top strides", top.GetStrides()); - printVec("pool lengths", lengths); - printVec("pool strides", strides); - printVec("pool pads", pads); - std::cout << "======================================================================" << std::endl; - /// \anchor multiply_dims_overflow_assumption /// /// Preventing overflow during dimension-related computations: @@ -290,7 +266,7 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, filter_w_pad, all_n, all_c, - bot_d, // TEMPCODE RJS: have not broke it + bot_d, bot_h, bot_w, bot_n_stride, diff --git a/src/solver/pooling/forwardNdNhwcNaive.cpp b/src/solver/pooling/forwardNdNhwcNaive.cpp index d31df3358d..86555d9560 100644 --- a/src/solver/pooling/forwardNdNhwcNaive.cpp +++ b/src/solver/pooling/forwardNdNhwcNaive.cpp @@ -50,21 +50,6 @@ bool IsPower2(T v) } #endif -template -T RoundUpNearestPower2Positive(T v) = delete; - -inline uint32_t RoundUpNearestPower2Positive(uint32_t v) -{ - assert(v > 0); - --v; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return std::max(++v, 1U); // Shut clang-tidy. -} - } // namespace bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&, @@ -88,30 +73,9 @@ bool PoolingForwardNDNhwcNaive::IsApplicable(const ExecutionContext&, && (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend()) //) && (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()); - // TODO RJS check grid size - - std::cout << "%%%%%%%%%% PoolingForwardNDNhwcNaive::IsApplicable: " << app << " " << problem.GetXDesc().GetLayout_str() << "->" << problem.GetXDesc().GetLayout(x_layout) - << " " << problem.GetYDesc().GetLayout_str() << "->" << problem.GetYDesc().GetLayout(y_layout) - << " " << (problem.GetDirection() == miopen::pooling::Direction::Forward) - << (x_type == y_type) - << (x_layout == y_layout) << (std::find(types.cbegin(), types.cend(), x_type) != types.cend()) - << (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend()) << (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()) << std::endl; - return app; } -#include // TEMPCODE RJS -namespace { - template - void printVec(std::string name, std::vector vec) - { - return; - std::cout << "Vector Printing: " << std::setw(20) << name << "[" << vec.size() << "]: "; - for(auto i : vec) std::cout << std::setw(8) << i; - std::cout << std::endl; - } -} - ConvSolution PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context, const miopen::pooling::ProblemDescription& problem) const @@ -176,17 +140,6 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context, /// not require widening to size_t prior mul, but (d_stride * dim * dim) /// requires it because the total number of muls is 4. - // TEMPCODE RJS - printVec("======================================================================", std::vector{}); - printVec("bot lengths", bot.GetLengths()); - printVec("bot strides", bot.GetStrides()); - printVec("top lengths", top.GetLengths()); - printVec("top strides", top.GetStrides()); - printVec("pool lengths", lengths); - printVec("pool strides", strides); - printVec("pool pads", pads); - printVec("======================================================================", std::vector{}); - const auto spatial_dim = is2d ? 2U : 3U; std::tie(args.all_n, args.all_c, args.bot_d, args.bot_h, args.bot_w) = miopen::GetNCDHW(spatial_dim, bot.GetLengths()); @@ -204,7 +157,7 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context, args.mask_w_stride = 1; args.mask_h_stride = args.mask_w_stride * args.top_w; args.mask_d_stride = args.mask_h_stride * args.top_h; - args.mask_c_stride = static_cast(args.mask_d_stride) * args.top_d; + args.mask_c_stride = args.mask_d_stride * args.top_d; args.mask_n_stride = args.mask_c_stride * args.all_c; /// About optimal grid size: @@ -305,19 +258,16 @@ PoolingForwardNDNhwcNaive::GetSolution(const ExecutionContext& context, // * 2: layout (NCHW vs NHWC) // * 2: 2D and 3D kernels (optimization) + kernel.ConfigureHip(l0, l1, l2, g0, g1, g2); // KernelInfo uses OCL-style indexes - kernel.l_wk.clear(); - kernel.l_wk.push_back(l0); - kernel.l_wk.push_back(l1); - kernel.l_wk.push_back(l2); - kernel.g_wk.clear(); - kernel.g_wk.push_back(g0 * l0); - kernel.g_wk.push_back(g1 * l1); - kernel.g_wk.push_back(g2 * l2); - - // TEMPCODE RJS -std::cout << "Kernel dims: g[" << kernel.g_wk.size() << "] " << kernel.g_wk[0] << " " << kernel.g_wk[1] << " " << kernel.g_wk[2] -<< " | l[" << kernel.l_wk.size() << "] " << kernel.l_wk[0] << " " << kernel.l_wk[1] << " " << kernel.l_wk[2] << std::endl; + // kernel.l_wk.clear(); + // kernel.l_wk.push_back(l0); + // kernel.l_wk.push_back(l1); + // kernel.l_wk.push_back(l2); + // kernel.g_wk.clear(); + // kernel.g_wk.push_back(g0 * l0); + // kernel.g_wk.push_back(g1 * l1); + // kernel.g_wk.push_back(g2 * l2); result.construction_params.push_back(kernel); } @@ -329,20 +279,20 @@ std::cout << "Kernel dims: g[" << kernel.g_wk.size() << "] " << kernel.g_wk[0] < kernel( params.x, params.y, - params.junk, // TEMPCODE RJS params.workspace, save_index, index_mode, - args.filter_d, args.filter_h, args.filter_w, - args.filter_d_stride, args.filter_h_stride, args.filter_w_stride, - args.filter_d_pad, args.filter_h_pad, args.filter_w_pad, - args.all_n, - args.all_c, - args.bot_d, args.bot_h, args.bot_w, - args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride, - args.top_d, args.top_h, args.top_w, - args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride, args.top_w_stride, - args.mask_n_stride, args.mask_c_stride, args.mask_d_stride, args.mask_h_stride, args.mask_w_stride + args + // args.filter_d, args.filter_h, args.filter_w, + // args.filter_d_stride, args.filter_h_stride, args.filter_w_stride, + // args.filter_d_pad, args.filter_h_pad, args.filter_w_pad, + // args.all_n, + // args.all_c, + // args.bot_d, args.bot_h, args.bot_w, + // args.bot_n_stride, args.bot_c_stride, args.bot_d_stride, args.bot_h_stride, args.bot_w_stride, + // args.top_d, args.top_h, args.top_w, + // args.top_n_stride, args.top_c_stride, args.top_d_stride, args.top_h_stride, args.top_w_stride, + // args.mask_n_stride, args.mask_c_stride, args.mask_d_stride, args.mask_h_stride, args.mask_w_stride ); }; }; diff --git a/test/gtest/layout_transpose.cpp b/test/gtest/layout_transpose.cpp index 1161eff2b9..bab82000d7 100644 --- a/test/gtest/layout_transpose.cpp +++ b/test/gtest/layout_transpose.cpp @@ -275,51 +275,9 @@ struct LayoutTransposeTest_2D : public ::testing::TestWithParam{tensor_len, tensor_strides}; - bool printing = true; // in_dim[0]==8 && in_dim[1]==8; - if(printing) - { - auto inlen = t_src.desc.GetLengths(); - auto instr = t_src.desc.GetStrides(); - std::cout << "CPU in : "; - for(auto dim : inlen) std::cout << std::setw(4) << dim; - std::cout << " | "; - for(auto str : instr) std::cout << std::setw(4) << str; - std::cout << std::endl; - - for(int nn = 0; nn < inlen[0]; ++nn) { - for(int cc = 0; cc < inlen[1]; ++cc) { - for(int hh = 0; hh < inlen[2]; ++hh) { - for(int ww = 0; ww < inlen[3]; ++ww) { - std::cout << std::setw(11) << std::setprecision(5) << t_src(nn * instr[0] + cc * instr[1] + hh * instr[2] + ww * instr[3]) << " "; - } - std::cout << std::endl; - } - } - } - } t_src.generate(gen_value); - if(printing) - { - auto inlen = t_src.desc.GetLengths(); - auto instr = t_src.desc.GetStrides(); - std::cout << "CPU in : "; - for(auto dim : inlen) std::cout << std::setw(4) << dim; - std::cout << " | "; - for(auto str : instr) std::cout << std::setw(4) << str; - std::cout << std::endl; - - for(int nn = 0; nn < inlen[0]; ++nn) { - for(int cc = 0; cc < inlen[1]; ++cc) { - for(int hh = 0; hh < inlen[2]; ++hh) { - for(int ww = 0; ww < inlen[3]; ++ww) { - std::cout << std::setw(11) << std::setprecision(5) << t_src(nn * instr[0] + cc * instr[1] + hh * instr[2] + ww * instr[3]) << " "; - } - std::cout << std::endl; - } - } - } - } + auto t_dst = tensor{tensor_len, tensor_strides}; auto t_dst_gpu = tensor{tensor_len, tensor_strides}; diff --git a/test/gtest/poolingFwd2dNaive.cpp b/test/gtest/poolingFwd2dNaive.cpp index 828fc556cb..d09c35302a 100644 --- a/test/gtest/poolingFwd2dNaive.cpp +++ b/test/gtest/poolingFwd2dNaive.cpp @@ -114,12 +114,12 @@ struct layout_data } -class PoolingFwd2d : public testing::TestWithParam> {}; -class PoolingFwd2dInt8 : public PoolingFwd2d {}; -class PoolingFwd2dFloat : public PoolingFwd2d {}; -class PoolingFwd2dHalf : public PoolingFwd2d {}; -class PoolingFwd2dBF16 : public PoolingFwd2d {}; -class PoolingFwd2dF8 : public PoolingFwd2d {}; +class Pooling2d : public testing::TestWithParam> {}; +class Pooling2dInt8 : public Pooling2d {}; +class Pooling2dFloat : public Pooling2d {}; +class Pooling2dHalf : public Pooling2d {}; +class Pooling2dBF16 : public Pooling2d {}; +class Pooling2dF8 : public Pooling2d {}; void Run2dDriver(miopenDataType_t prec); @@ -144,62 +144,61 @@ std::vector Get2dTestCases(const std::string precision) const std::vector test_cases = { // clang-format off - {"test_pooling2d " + precision + " --all --dataset 0 --limit 0 " + flag_arg}, // TEMPCODE RJS DATASET - // {"test_pooling2d " + precision + " --all --dataset 1 --limit 0 " + flag_arg}, // TEMPCODE RJS DATASET - // {"test_pooling2d " + precision + " --all --dataset 2 --limit 0 " + flag_arg} // TEMPCODE RJS DATASET + {"test_pooling2d " + precision + " --all --dataset 0 --limit 0 " + flag_arg}, + {"test_pooling2d " + precision + " --all --dataset 1 --limit 0 " + flag_arg}, + {"test_pooling2d " + precision + " --all --dataset 2 --limit 0 " + flag_arg} // clang-format on }; return test_cases; } } // namespace pooling_tests -// using namespace pooling_tests; -TEST_P(PoolingFwd2dInt8, NNT) +TEST_P(Pooling2dInt8, NNT) { if(!IsTestRunWith("--int8")) std::cout << "WOULD SKIP BECAUSE NOT INT8!" << std::endl; - if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--int8") TEMPCODE RJS + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--int8") TRJS GTEST_SKIP(); Run2dDriver(miopenInt8); }; -TEST_P(PoolingFwd2dFloat, NNT) +TEST_P(Pooling2dFloat, NNT) { if(!IsTestRunWith("--float")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT!" << std::endl; - if(SkipTest() || !IsTestSupportedForDevice(get_handle())) + if(SkipTest() || !IsTestSupportedForDevice(get_handle())) // && IsTestRunWith("--float") TRJS GTEST_SKIP(); Run2dDriver(miopenFloat); }; -TEST_P(PoolingFwd2dHalf, NNT) +TEST_P(Pooling2dHalf, NNT) { if(!IsTestRunWith("--half")) std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl; - if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--half") TEMPCODE RJS + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--half") TRJS GTEST_SKIP(); Run2dDriver(miopenHalf); }; -TEST_P(PoolingFwd2dBF16, NNT) +TEST_P(Pooling2dBF16, NNT) { - if(!IsTestRunWith("--bfloat16")) std::cout << "WOULD SKIP BECAUSE NOT BFLOAT16!" << std::endl; + if(!IsTestRunWith("--bfloat16")) std::cout << "WOULD SKIP BECAUSE NOT BFLOAT16!" << std::endl; - if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--bfloat16") TEMPCODE RJS + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--bfloat16") TRJS GTEST_SKIP(); Run2dDriver(miopenBFloat16); }; -TEST_P(PoolingFwd2dF8, NNT) +TEST_P(Pooling2dF8, NNT) { - if(!IsTestRunWith("--float8")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT8!" << std::endl; + if(!IsTestRunWith("--float8")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT8!" << std::endl; - if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--float8") TEMPCODE RJS + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--float8") TRJS GTEST_SKIP(); Run2dDriver(miopenFloat8); @@ -208,40 +207,31 @@ TEST_P(PoolingFwd2dF8, NNT) void Run2dDriver(miopenDataType_t prec) { auto cases = Get2dTestCases("--float"); - // std::cerr << " Cases: " << cases.size() << std::endl; // TEMPCODE RJS - // for(const auto& test_value : cases) - // { - // std::cerr << " : " << test_value << std::endl; // TEMPCODE RJS - // } std::vector params; switch(prec) { - case miopenFloat: params = PoolingFwd2dFloat_NNT_Test::GetParam(); break; - case miopenHalf: params = PoolingFwd2dHalf_NNT_Test::GetParam(); break; - case miopenBFloat16: params = PoolingFwd2dBF16_NNT_Test::GetParam(); break; - case miopenInt8: params = PoolingFwd2dInt8_NNT_Test::GetParam(); break; - case miopenFloat8: params = PoolingFwd2dF8_NNT_Test::GetParam(); break; + case miopenFloat: params = Pooling2dFloat_NNT_Test::GetParam(); break; + case miopenHalf: params = Pooling2dHalf_NNT_Test::GetParam(); break; + case miopenBFloat16: params = Pooling2dBF16_NNT_Test::GetParam(); break; + case miopenInt8: params = Pooling2dInt8_NNT_Test::GetParam(); break; + case miopenFloat8: params = Pooling2dF8_NNT_Test::GetParam(); break; case miopenInt32: case miopenDouble: case miopenBFloat8: case miopenInt64: FAIL() - << "miopenBFloat16, miopenInt8, miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8, miopenInt64 " - "data type not supported by " + << "miopenInt32, miopenDouble, miopenBFloat8, miopenInt64 " + "data types not supported by " "poolingFwd2dNaive test"; - default: params = PoolingFwd2dFloat_NNT_Test::GetParam(); + default: params = Pooling2dFloat_NNT_Test::GetParam(); } std::cerr << "Params: " << params.size() << std::endl; + for(const auto& test_value : params) { - std::cerr << " : " << test_value << std::endl; // TEMPCODE RJS - } - for(const auto& test_value : params) - { - std::cerr << "Testing: " << test_value << std::endl; // TEMPCODE RJS std::vector tokens; GetArgs(test_value, tokens); std::vector ptrs; @@ -257,16 +247,10 @@ void Run2dDriver(miopenDataType_t prec) } } -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dBF16); -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dInt8); -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dFloat); -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dHalf); -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd2dF8); - -INSTANTIATE_TEST_SUITE_P(BF16, PoolingFwd2dBF16, testing::Values(Get2dTestCases("--bfloat16"))); -INSTANTIATE_TEST_SUITE_P(Int8, PoolingFwd2dInt8, testing::Values(Get2dTestCases("--int8"))); -INSTANTIATE_TEST_SUITE_P(Float, PoolingFwd2dFloat, testing::Values(Get2dTestCases("--float"))); -INSTANTIATE_TEST_SUITE_P(Half, PoolingFwd2dHalf, testing::Values(Get2dTestCases("--half"))); -INSTANTIATE_TEST_SUITE_P(F8, PoolingFwd2dF8, testing::Values(Get2dTestCases("--float8"))); +INSTANTIATE_TEST_SUITE_P(Full, Pooling2dBF16, testing::Values(Get2dTestCases("--bfloat16"))); +INSTANTIATE_TEST_SUITE_P(Full, Pooling2dInt8, testing::Values(Get2dTestCases("--int8"))); +INSTANTIATE_TEST_SUITE_P(Full, Pooling2dFloat, testing::Values(Get2dTestCases("--float"))); +INSTANTIATE_TEST_SUITE_P(Full, Pooling2dHalf, testing::Values(Get2dTestCases("--half"))); +INSTANTIATE_TEST_SUITE_P(Full, Pooling2dF8, testing::Values(Get2dTestCases("--float8"))); #endif diff --git a/test/gtest/poolingFwd3dNaive.cpp b/test/gtest/poolingFwd3dNaive.cpp index a32432f1e6..d3f8975b15 100644 --- a/test/gtest/poolingFwd3dNaive.cpp +++ b/test/gtest/poolingFwd3dNaive.cpp @@ -145,26 +145,22 @@ std::vector Get3dTestCases(const std::string precision) const std::vector test_cases = { // clang-format off - {"test_pooling3d " + precision + " --all --dataset 0 --limit 0 " + flag_arg}, // TEMPCODE RJS DATASET - // {"test_pooling3d " + precision + " --all --dataset 1 --limit 0 " + flag_arg}, // TEMPCODE RJS DATASET - // {"test_pooling3d " + precision + " --all --dataset 2 --limit 0 " + flag_arg} // TEMPCODE RJS DATASET + {"test_pooling3d " + precision + " --all --dataset 0 --limit 0 " + flag_arg}, + {"test_pooling3d " + precision + " --all --dataset 1 --limit 0 " + flag_arg}, + {"test_pooling3d " + precision + " --all --dataset 2 --limit 0 " + flag_arg} // clang-format on }; return test_cases; } } // namespace pooling_tests -// using namespace pooling_tests; -TEST_P(PoolingFwd3dFloat, NNT) // NDNaiveTranspose +TEST_P(PoolingFwd3dFloat, NNT) // NNT=NdNaiveTranspose { const auto& handle = get_handle(); - if(!IsTestSupportedForDevice(handle)) std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl; - if(SkipTest()) std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl; if(!IsTestRunWith("--float")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT!" << std::endl; - // Run3dDriver(miopenFloat); return; // TEMPCODE RJS - // && IsTestRunWith("--float") - if(IsTestSupportedForDevice(handle) && !SkipTest()) + + if(IsTestSupportedForDevice(handle) && !SkipTest()) // && IsTestRunWith("--float") TRJS { Run3dDriver(miopenFloat); } @@ -177,11 +173,9 @@ TEST_P(PoolingFwd3dFloat, NNT) // NDNaiveTranspose TEST_P(PoolingFwd3dHalf, NNT) { const auto& handle = get_handle(); - if(!IsTestSupportedForDevice(handle)) std::cout << "WOULD SKIP BECAUSE NOT SUPPORTED!" << std::endl; - if(SkipTest()) std::cout << "WOULD SKIP BECAUSE SKIPTEST!" << std::endl; - if(!IsTestRunWith("--half")) std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl; + if(!IsTestRunWith("--half")) std::cout << "WOULD SKIP BECAUSE NOT HALF!" << std::endl; - if(IsTestSupportedForDevice(handle) && !SkipTest()) // && IsTestRunWith("--half") TEMPCODE RJS + if(IsTestSupportedForDevice(handle) && !SkipTest()) // && IsTestRunWith("--half") TRJS { Run3dDriver(miopenHalf); } @@ -195,7 +189,7 @@ TEST_P(PoolingFwd3dBF16, NNT) { if(!IsTestRunWith("--bfloat16")) std::cout << "WOULD SKIP BECAUSE NOT BFLOAT16!" << std::endl; - if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--bfloat16") TEMPCODE RJS + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--bfloat16") TRJS GTEST_SKIP(); Run3dDriver(miopenBFloat16); @@ -205,7 +199,7 @@ TEST_P(PoolingFwd3dInt8, NNT) { if(!IsTestRunWith("--int8")) std::cout << "WOULD SKIP BECAUSE NOT INT8!" << std::endl; - if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--int8") TEMPCODE RJS + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--int8") TRJS GTEST_SKIP(); Run3dDriver(miopenInt8); @@ -215,7 +209,7 @@ TEST_P(PoolingFwd3dF8, NNT) { if(!IsTestRunWith("--float8")) std::cout << "WOULD SKIP BECAUSE NOT FLOAT8!" << std::endl; - if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--float8") TEMPCODE RJS + if(!IsTestSupportedForDevice(get_handle()) || SkipTest()) // && IsTestRunWith("--float8") TRJS GTEST_SKIP(); Run3dDriver(miopenFloat8); @@ -224,11 +218,6 @@ TEST_P(PoolingFwd3dF8, NNT) void Run3dDriver(miopenDataType_t prec) { auto cases = Get3dTestCases("--float"); - std::cerr << " Cases: " << cases.size() << std::endl; - for(const auto& test_value : cases) - { - std::cerr << " : " << test_value << std::endl; // TEMPCODE RJS - } std::vector params; switch(prec) @@ -243,8 +232,8 @@ void Run3dDriver(miopenDataType_t prec) case miopenBFloat8: case miopenInt64: FAIL() - << "miopenBFloat16, miopenInt8, miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8, miopenInt64 " - "data type not supported by " + << "miopenInt32, miopenDouble, miopenFloat8, miopenBFloat8, miopenInt64 " + "data types not supported by " "poolingFwdNdNaive test"; default: params = PoolingFwd3dFloat_NNT_Test::GetParam(); @@ -252,7 +241,6 @@ void Run3dDriver(miopenDataType_t prec) for(const auto& test_value : params) { - std::cerr << "Running Test: " << test_value << std::endl; // TEMPCODE RJS std::vector tokens; GetArgs(test_value, tokens); std::vector ptrs; @@ -268,13 +256,7 @@ void Run3dDriver(miopenDataType_t prec) } } -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dBF16); -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dFloat); -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dHalf); -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dInt8); -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PoolingFwd3dF8); - -// INSTANTIATE_TEST_SUITE_P(BF16, PoolingFwd3dBF16, testing::Values(Get3dTestCases("--bfloat16"))); +INSTANTIATE_TEST_SUITE_P(BF16, PoolingFwd3dBF16, testing::Values(Get3dTestCases("--bfloat16"))); INSTANTIATE_TEST_SUITE_P(Float, PoolingFwd3dFloat, testing::Values(Get3dTestCases("--float"))); INSTANTIATE_TEST_SUITE_P(Half, PoolingFwd3dHalf, testing::Values(Get3dTestCases("--half"))); INSTANTIATE_TEST_SUITE_P(Int8, PoolingFwd3dInt8, testing::Values(Get3dTestCases("--int8"))); diff --git a/test/gtest/pooling_testing.hpp b/test/gtest/pooling_testing.hpp index 2fdb4ded19..eacd9ebeb6 100644 --- a/test/gtest/pooling_testing.hpp +++ b/test/gtest/pooling_testing.hpp @@ -36,10 +36,10 @@ namespace {using sc = std::chrono::steady_clock;} #undef tomillis #define tomillis(__DUR) (0.001 * std::chrono::duration_cast(__DUR).count()) +#undef mstocout +#define mstocout(__TP) std::setw(15) << std::fixed << std::setprecision(3) << tomillis(sc::now() - __TP) #undef coutms -#define coutms(__TOK, __TP) (std::cout << "ms[" << std::setw(16) << __TOK << "]: " \ - << std::setw(12) << std::fixed << std::setprecision(3) << tomillis(sc::now() - __TP) << std::endl) - +#define coutms(__TOK, __TP) (std::cout << "ms[" << std::setw(16) << __TOK << "]: " << mstocout(__TP) << std::endl) #include #include @@ -56,13 +56,12 @@ namespace {using sc = std::chrono::steady_clock;} #include #include -// #include "network_data.hpp" -#include "driver.hpp" -#include "get_handle.hpp" -#include "tensor_holder.hpp" -#include "verify.hpp" -#include "cpu_conv.hpp" -#include "workspace.hpp" +#include "../driver.hpp" +#include "../get_handle.hpp" +#include "../tensor_holder.hpp" +#include "../verify.hpp" +#include "../cpu_conv.hpp" +#include "../workspace.hpp" #define TEST_PADDING_MODE 0 @@ -75,13 +74,17 @@ int num_uint64_case = 0; int num_uint64_case_imgidx = 0; constexpr int max_typed_cases = 5; constexpr int MAX_ALL_CASES = 0; +auto __start = sc::now(); constexpr int RAND_INTEGER_MAX = 12000; constexpr int RAND_INTEGER_MIN = -8800; template auto gen_value = - [](auto... is) { return static_cast(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX) / 100); }; + [](auto... is) { return static_cast(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX)) / 100; }; + +auto gen_start = + [](auto... is) { return prng::gen_0_to_B(1ULL << 28); }; } static inline void print(std::ostringstream& oss, const miopen::PoolingDescriptor& filter, bool is_default_layout) @@ -94,7 +97,7 @@ static inline void print(std::ostringstream& oss, const miopen::PoolingDescripto else oss << "Max"; oss << std::endl; - oss << "Layout: " << (is_default_layout ? "default" : "transposed") << std::endl; // TEMPCODE RJS + oss << "Layout: " << (is_default_layout ? "default" : "transposed") << std::endl; oss << "Lengths: "; miopen::LogRange(oss, filter.GetLengths(), ", ") << std::endl; oss << "Pads: "; @@ -160,84 +163,6 @@ struct pooling_operators } }; -#include -#include -#define MAX_PRINTING 128 // TEMPCODE RJS -#define MAX_PRINT 12 // TEMPCODE RJS -#define MAX_NCD 2 -#define GPU_JUNK 0 // typ. 160, reasonable max is 320 -#define PRINT_CPU_IN 0 -#define PRINT_GPU_OUT 0 -#define GPU_4COL false -#define PIPE std::cout - -namespace { - template - std::vector ClampNCS(const std::vector& lens, std::vector& strides) - { - std::vector out; - bool is2d = lens.size() == 4; - constexpr T max_ncd = MAX_NCD; - constexpr T max_print = MAX_PRINT; - int idx = 0; - - out.push_back(std::min(lens[idx++], max_ncd)); - out.push_back(std::min(lens[idx++], max_ncd)); - if(is2d) - out.push_back(1); - else - out.push_back(std::min(lens[idx++], max_ncd)); - - out.push_back(std::min(lens[idx++], max_print)); - out.push_back(std::min(lens[idx++], max_print)); - - if(is2d) - strides.insert(strides.begin() + 2, {strides[2]}); - - return out; - } - - template - std::ostream& printClamped(std::ostream& oss, const std::vector& out, const std::vector& outlen, const std::vector& outstr, int mode = 0) - { - auto ostr = outstr; - auto olen = ClampNCS(outlen, ostr); - - for(int nn = 0; nn < olen[0]; ++nn) { - for(int cc = 0; cc < olen[1]; ++cc) { - for(int dd = 0; dd < olen[2]; ++dd) { - oss << "n= " << nn << " c= " << cc << " d= " << dd << std::endl; - for(int hh = 0; hh < olen[3]; ++hh) { - for(int ww = 0; ww < olen[4]; ++ww) { - auto index = nn * ostr[0] + cc * ostr[1] + dd * ostr[2] + hh * ostr[3] + ww * ostr[4]; -switch(mode) { - case 0: - if(std::is_same::value || std::is_same::value) { - oss << std::setw(11) << std::setprecision(5) << (int16_t)out[index] << " "; - } else { - oss << std::setw(11) << std::setprecision(5) << out[index] << " "; - } - break; - case 1: - oss << std::setw(11) << std::setprecision(5) << index << " "; - break; -} - } - oss << std::endl; - } - } - } - } - return oss; - } -} - - template - std::ostream& printClamped(std::ostream& oss, const tensor& out, const std::vector& outlen, const std::vector& outstr, int mode = 0) - { - return printClamped(oss, out.data, outlen, outstr, mode); - } - template struct verify_forward_pooling { @@ -245,7 +170,6 @@ struct verify_forward_pooling tensor cpu(const tensor& input, const miopen::PoolingDescriptor& filter, std::vector&) const { -auto st = sc::now(); auto out = get_output_tensor(filter, input); std::array in_dim{}; @@ -258,10 +182,6 @@ auto st = sc::now(); std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin()); auto pooler = pooling_operators{filter}; - // TEMPCODE RJS print input tensor - bool printing = in_dim[0] <= MAX_PRINTING && in_dim[1] <= MAX_PRINTING; - if(in_dim.size() > 2) printing &= in_dim[2] <= MAX_PRINTING; - int b_n = out.desc.GetLengths()[0]; int k_n = out.desc.GetLengths()[1]; std::array out_spatial_len{}; @@ -312,111 +232,6 @@ auto st = sc::now(); out(o, w, out_spatial_id_pack...) = T(pooler.finalize(acc, pool_size)); }); - if(PRINT_GPU_OUT && printing) - { - PIPE << "CPU out: "; - auto outlen = out.desc.GetLengths(); - for(auto dim : outlen) PIPE << std::setw(4) << dim << " "; PIPE << " | "; - auto outstr = out.desc.GetStrides(); - for(auto dim : outstr) PIPE << std::setw(4) << dim << " "; PIPE << " | "; - for(auto str : filter.GetLengths()) PIPE << std::setw(4) << str << " "; PIPE << " | "; - for(auto str : filter.GetStrides()) PIPE << std::setw(4) << str << " "; PIPE << " | "; - for(auto str : filter.GetPads()) PIPE << std::setw(4) << str << " "; - PIPE << std::endl; - - printClamped(PIPE, out, outlen, outstr); - // printClamped(PIPE, out, outlen, outstr, 1); - } // print output tensor - if(PRINT_CPU_IN && printing) - { - auto inlen = input.desc.GetLengths(); - auto instr = input.desc.GetStrides(); - PIPE << "CPU in : m" << filter.GetMode() << " t" << input.desc.GetType() << " | "; - for(auto dim : inlen) PIPE << std::setw(4) << dim << " "; PIPE << " | "; - for(auto str : instr) PIPE << std::setw(4) << str << " "; PIPE << " | "; - for(auto str : filter.GetLengths()) PIPE << std::setw(4) << str << " "; PIPE << " | "; - for(auto str : filter.GetStrides()) PIPE << std::setw(4) << str << " "; PIPE << " | "; - for(auto str : filter.GetPads()) PIPE << std::setw(4) << str << " "; - PIPE << std::endl; - - printClamped(PIPE, input, inlen, instr); - // printClamped(PIPE, input, inlen, instr, 1); - } -coutms("f.cpu", st); - return out; - } - - template - tensor - cpu_naive(const tensor& input, const miopen::PoolingDescriptor& filter, std::vector&) const - { - auto out = get_output_tensor(filter, input); - - std::array in_dim{}; - std::copy_n(input.desc.GetLengths().begin() + 2, SptDim, in_dim.begin()); - std::array strides{}; - std::copy_n(filter.GetStrides().begin(), SptDim, strides.begin()); - std::array pads{}; - std::copy_n(filter.GetPads().begin(), SptDim, pads.begin()); - std::array kers{}; - std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin()); - auto pooler = pooling_operators{filter}; - - // TEMPCODE RJS print input tensor - bool printing = in_dim[0] <= MAX_PRINTING && in_dim[1] <= MAX_PRINTING; - if(in_dim.size() > 2) printing &= in_dim[2] <= MAX_PRINTING; - - auto flens = filter.GetLengths(); - auto fstrs = filter.GetStrides(); - auto ilens = input.desc.GetLengths(); - auto istrs = input.desc.GetStrides(); - auto olens = out.desc.GetLengths(); - auto ostrs = out.desc.GetStrides(); - - for(int n = 0; n < olens[0]; ++n) - { - for(int c = 0; c < olens[1]; ++c) - { - for(int h = 0; h < olens[2]; ++h) - { - for(int w = 0; w < olens[3]; ++w) - { - int hstart = h * fstrs[0]; - int hend = hstart + flens[0]; - hend = min(hend, ilens[2]); - - int wstart = w * fstrs[1]; - int wend = wstart + flens[1]; - wend = min(wend, ilens[3]); - double res = -10000.0; - for(int fh = hstart; fh < hend; ++fh) - { - for(int fw = wstart; fw < wend; ++fw) - { - double val = input[n * istrs[0] + c * istrs[1] + fh * istrs[2] + fw * istrs[3]]; - if(val > res) - res = val; - } - } - out[n * ostrs[0] + c * ostrs[1] + h * ostrs[2] + w * ostrs[3]] = res; - } - } - } - } - - if(printing) - { - PIPE << "CPU nve: "; - for(auto dim : olens) PIPE << std::setw(4) << dim << " "; PIPE << " | "; - for(auto dim : ostrs) PIPE << std::setw(4) << dim << " "; PIPE << " | "; - for(auto str : filter.GetLengths()) PIPE << std::setw(4) << str << " "; PIPE << " | "; - for(auto str : filter.GetStrides()) PIPE << std::setw(4) << str << " "; PIPE<< " | "; - for(auto str : filter.GetPads()) PIPE << std::setw(4) << str << " "; - PIPE << std::endl; - - printClamped(PIPE, out, olens, ostrs); - } - return out; } @@ -425,28 +240,16 @@ coutms("f.cpu", st); const miopen::PoolingDescriptor& filter, std::vector& indices) const { -auto st = sc::now(); -auto write = sc::now(); auto&& handle = get_handle(); auto out = get_output_tensor(filter, input); - auto junk = get_big_output_tensor(filter, input); // TEMPCODE RJS - -#if GPU_JUNK > 0 - PIPE << "GPU junk init: " << std::endl; - for(int idx = 0; idx < GPU_JUNK; ++idx) { - PIPE << std::setw(11) << std::setprecision(5) << junk.data[idx] << " "; - if((idx % 4) == 3) PIPE <(out.GetSize()); - auto junk_dev = handle.Write(junk.data); // TEMPCODE RJS Workspace wspace{}; wspace.Write(indices); -coutms("f.GPUwrite", write); + float alpha = 1, beta = 0; filter.Forward(handle, &alpha, @@ -457,76 +260,10 @@ coutms("f.GPUwrite", write); out_dev.get(), true, wspace.ptr(), - wspace.size(), - junk_dev.get()); // TEMPCODE RJS -handle.Finish(); // TEMPCODE RJS -coutms("f.gpu1", st); - filter.Forward(handle, - &alpha, - input.desc, - in_dev.get(), - &beta, - out.desc, - out_dev.get(), - true, - wspace.ptr(), - wspace.size(), - junk_dev.get()); // TEMPCODE RJS -handle.Finish(); // TEMPCODE RJS -auto read = sc::now(); + wspace.size()); handle.ReadTo(out.data.data(), out_dev, out.GetDataByteSize()); wspace.ReadTo(indices); - bool printing = input.desc.GetLengths()[2] <= MAX_PRINTING && input.desc.GetLengths()[3] <= MAX_PRINTING; - if(input.desc.GetLengths().size() > 4) printing &= input.desc.GetLengths()[4] <= MAX_PRINTING; -std::cout << (printing ? "printing output from GPU..." : "skipping GPU print.") << std::endl; - if(PRINT_GPU_OUT && printing) - { -#if GPU_JUNK > 0 - handle.ReadTo(junk.data.data(), junk_dev, junk.GetDataByteSize()); - if(GPU_4COL){ - PIPE<< "GPU (8-cols): " << std::endl; - for(int idx = 0; idx < GPU_JUNK; ++idx) { - PIPE << std::setw(11) << std::setprecision(5) << out.data[idx] << " "; - if((idx % 8) == 7) PIPE < olen = outlen; - if(olen.size() == 4) olen.insert(olen.begin() + 2, 1); - // Mask data is always NCDHW layout - size_t mask_w_stride = 1; - size_t mask_h_stride = mask_w_stride * olen[4]; - size_t mask_d_stride = mask_h_stride * olen[3]; - size_t mask_c_stride = mask_d_stride * olen[2]; - size_t mask_n_stride = mask_c_stride * olen[1]; - std::vector mask_str{mask_n_stride, mask_c_stride, mask_d_stride,mask_h_stride, mask_w_stride}; - std::cout << "GPU indices: " << mask_n_stride << " " << mask_c_stride << " " << mask_d_stride << " " << mask_h_stride << " " << mask_w_stride << std::endl; - printClamped(std::cout, indices, olen, mask_str); - } - PIPE << "GPU out: "; - for(auto dim : outlen) PIPE << std::setw(4) << dim << " "; PIPE << " | "; - for(auto dim : outstr) PIPE << std::setw(4) << dim << " "; PIPE << " | "; - for(auto str : filter.GetLengths()) PIPE << std::setw(4) << str << " "; PIPE << " | "; - for(auto str : filter.GetStrides()) PIPE << std::setw(4) << str << " "; PIPE<< " | "; - for(auto str : filter.GetPads()) PIPE << std::setw(4) << str << " "; - PIPE << std::endl; - - printClamped(PIPE, out, outlen, outstr); - // printClamped(std::cout, out, outlen, outstr, 1); - } // print output tensor -coutms("f.GPUread", read); -coutms("f.gpu", st); return out; } @@ -558,11 +295,11 @@ struct verify_backward_pooling bool use_global_index, bool verify_index) const { -auto st = sc::now(); const int sptl_dim_offset = 2; const int chan_dim_offset = 1; auto dinput = input; + return dinput; // TRJS std::vector din_vec(input.desc.GetElementSpace(), 0.0); CHECK(dout.desc == out.desc); @@ -702,7 +439,6 @@ auto st = sc::now(); dinput(in_id_pack...) = din_vec.at(din_idx); }); - coutms("b.cpu", st); return dinput; } @@ -715,10 +451,9 @@ auto st = sc::now(); bool, bool) const { -auto st = sc::now(); -auto write = sc::now(); auto&& handle = get_handle(); auto dinput = input; + return dinput; // TRJS auto in_dev = handle.Write(input.data); auto dout_dev = handle.Write(dout.data); @@ -727,7 +462,6 @@ auto write = sc::now(); Workspace wspace{}; wspace.Write(indices); -coutms("b.GPUwrite", write); float alpha = 1, beta = 0; filter.Backward(handle, @@ -746,29 +480,9 @@ coutms("b.GPUwrite", write); dinput.desc, din_dev.get(), wspace.ptr()); -handle.Finish(); // TEMPCODE RJS -coutms("b.gpu1", st); - filter.Backward(handle, - &alpha, - // y - out.desc, - out_dev.get(), - // dy - dout.desc, - dout_dev.get(), - // x - input.desc, - in_dev.get(), - &beta, - // dx - dinput.desc, - din_dev.get(), - wspace.ptr()); -handle.Finish(); // TEMPCODE RJS -auto read = sc::now(); - dinput.data = handle.Read(din_dev, dinput.data.size()); -coutms("b.GPUread", read); -coutms("b.gpu", st); + + handle.ReadTo(dinput.data.data(), din_dev, dinput.data.size()); + return dinput; } @@ -807,6 +521,33 @@ struct pooling_driver : test_driver int verify_indices{}; miopenPoolingWorkspaceIndexMode_t wsidx{}; miopenTensorLayout_t layout{}; + + static void randomize_tensor(tensor& in) + { + static tensor random_data{{1}}; + static tensor starts{{1}}; + static size_t start_idx = 0; + + const auto size = in.GetSize(); + const auto ran_size = size > 2 ? (3 * size) / 2 : 3; + if (random_data.GetSize() < ran_size) + { + random_data = tensor{{ran_size}}.generate(tensor_elem_gen_integer{2503}); + } + if (starts.GetSize() == 1) + { + starts = tensor{{1 << 20}}.generate(gen_start); + } + + const auto r_start = starts[start_idx++] % (random_data.GetSize() / 3); + if (start_idx >= starts.GetSize()) start_idx = 0; + + std::cout << "randomizing " << std::setw(9) << size << " elems from " << std::setw(9) << r_start << " (" << start_idx << ")" + // << "(" << std::setw(8) << prng::gen_0_to_B(size / 2) << std::setw(8) << prng::gen_0_to_B(size / 2) << std::setw(8) << prng::gen_0_to_B(size / 2) << std::setw(8) << prng::gen_0_to_B(size / 2) << ")" + << std::endl; + in.data.assign(random_data.begin() + r_start, random_data.begin() + r_start + size); + } + std::unordered_map index_type_lookup = { {miopen::ToUpper("miopenIndexUint8"), miopenIndexUint8}, {miopen::ToUpper("miopenIndexUint16"), miopenIndexUint16}, @@ -832,7 +573,7 @@ struct pooling_driver : test_driver { add(index_type, "index_type", - // generate_data({"miopenIndexUint32"} // TEMPCODE RUN + // generate_data({"miopenIndexUint32"} // TEMPCODE RJS RUN generate_multi_data( // {{"miopenIndexUint32", "miopenIndexUint8" @@ -846,63 +587,55 @@ struct pooling_driver : test_driver add(mode_str, "mode_str", generate_data( - {"miopenPoolingMax", "miopenPoolingAverage", "miopenPoolingAverageInclusive"})); // TEMPCODE RUN + {"miopenPoolingMax", "miopenPoolingAverage", "miopenPoolingAverageInclusive"})); #if TEST_PADDING_MODE == 1 add(pmode, "pmode", generate_data({"default", "same", "valid"})); #endif add(verify_indices, "verify_indices", generate_data({1})); } -#define FORWARD -#define BACKWARD - template void run_impl() { std::vector indices{}; +auto gst = sc::now(); auto input = tensor{layout, in_shape}; - for(auto& v : input.data) v = gen_value(); - -auto st = sc::now(); -#ifdef FORWARD + randomize_tensor(input); +coutms("gen", gst); +auto vst = sc::now(); auto out = verify(verify_forward_pooling{}, input, filter, indices); -coutms("f.verify", st); -#endif -#ifdef BACKWARD // TEMPCODE RJS no backward +coutms("verify", vst); if(!std::is_same::value && !std::is_same::value) return; -auto bst = sc::now(); - auto dout = out.first; - dout.generate(tensor_elem_gen_integer{2503}); - verify(verify_backward_pooling{}, - input, - dout, - out.first, - filter, - indices, - wsidx != 0, - static_cast(this->verify_indices)); -coutms("b.verify", bst); -#endif + + // auto dout = out.first; + // dout.generate(tensor_elem_gen_integer{2503}); + // verify(verify_backward_pooling{}, // TRJS + // input, + // dout, + // out.first, + // filter, + // indices, + // wsidx != 0, + // static_cast(this->verify_indices)); } #define CHECK_SKIP \ if(skip) \ { \ - std::cout << "\n############ RunSkip # " << std::setw(7) << num_all_case++ << " : "; \ + std::cout << "\nSkipping run # " << std::setw(7) << num_all_case++ << " @ET=" << mstocout(__start) << " : "; \ show_command(); \ std::cout << "-- " << oss.str() << std::endl; \ -coutms("skipRun", st); return; \ + return; \ } #define SKIP_RUN skip = true; CHECK_SKIP void run() { -auto st = sc::now(); - const bool is_default_layout = miopen::TensorDescriptor::IsDefaultLayout(layout); // TEMPCODE RJS + const bool is_default_layout = miopen::TensorDescriptor::IsDefaultLayout(layout); bool skip = false; std::ostringstream oss; @@ -928,8 +661,7 @@ auto st = sc::now(); int sptl_dim = static_cast(in_shape.size()) - 2; if(sptl_dim != 2 && sptl_dim != 3) { - oss << "Warning: Config skipped due to invalid dimensions. 'in_shape' must be in NCHW or NCDHW form." - << std::endl; + oss << "Warning: Config skipped due to invalid dimensions. 'in_shape' must be in NCHW or NCDHW format." << std::endl; SKIP_RUN; } @@ -1117,7 +849,7 @@ auto st = sc::now(); { if(lens[i] > (input_desc.GetLengths()[i + 2] + static_cast(2) * pads[i])) { - oss << "Warning: Config skipped becuse it is invalid " + oss << "Warning: Config skipped because it is invalid " "(lens[i] > (input_desc.GetLengths()[i + 2] + 2 * pads[i]))" << std::endl; SKIP_RUN; @@ -1143,7 +875,7 @@ auto st = sc::now(); CHECK_SKIP; - std::cout << "\n############ Run # " << std::setw(7) << num_all_case++ << " : "; + std::cout << "\nRun # " << std::setw(7) << num_all_case++ << " @ET=" << mstocout(__start) << " : "; show_command(); std::vector in_dim(input_desc.GetLengths().begin(), @@ -1151,45 +883,6 @@ auto st = sc::now(); std::vector out_dim(sptl_dim); std::vector ker_dim(filter.GetLengths().begin(), filter.GetLengths().end()); -#if TEST_PADDING_MODE == 1 - if(filter.pmode == miopenPaddingSame) - { - if(std::any_of(filter.GetStrides().begin(), filter.GetStrides().end(), [](int i) { - return i == 0; - })) - return; - for(int i = 0; i < sptl_dim; i++) - { - filter.pads[i] = - ((in_dim[i] % filter.GetStrides()[i] == 0) - ? (std::max((ker_dim[i] - filter.GetStrides()[i]), 0)) - : (std::max((ker_dim[i] - (in_dim[i] % filter.GetStrides()[i])), 0))) / - 2; - - out_dim[i] = std::ceil(static_cast(in_dim[i]) / filter.strides[i]); - } - - if(std::any_of(out_dim.begin(), out_dim.end(), [](int i) { return i <= 0; })) - return; - } - else if(filter.pmode == miopenPaddingValid) - { - if(std::any_of(filter.GetStrides().begin(), filter.GetStrides().end(), [](int i) { - return i == 0; - })) - return; - for(int i = 0; i < sptl_dim; i++) - { - filter.pads[i] = 0; - - out_dim[i] = std::ceil(static_cast(in_dim[i] - filter.lens[i] + 1) / - filter.strides[i]); - } - - if(std::any_of(out_dim.begin(), out_dim.end(), [](int i) { return i <= 0; })) - return; - } -#endif switch(filter.GetIndexType()) { case miopenIndexUint8: { @@ -1237,7 +930,6 @@ auto st = sc::now(); break; } } -coutms("Run", st); } }; diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp index b3acb61e07..546b40876f 100644 --- a/test/pooling2d.hpp +++ b/test/pooling2d.hpp @@ -24,7 +24,7 @@ * *******************************************************************************/ -#include "gtest/pooling_testing.hpp" +// #include "gtest/pooling_testing.hpp" #include "pooling_common.hpp" #define WORKAROUND_ISSUE_1670 1 diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp index 956c529533..f42b6da6aa 100644 --- a/test/pooling_common.hpp +++ b/test/pooling_common.hpp @@ -80,7 +80,7 @@ static inline void print(const miopen::PoolingDescriptor& filter, bool is_defaul else std::cout << "Max"; std::cout << std::endl; - std::cout << "Layout: " << (is_default_layout ? "default" : "transposed") << std::endl; // TEMPCODE RJS + std::cout << "Layout: " << (is_default_layout ? "default" : "transposed") << std::endl; std::cout << "Lengths: "; miopen::LogRange(std::cout, filter.GetLengths(), ", ") << std::endl; std::cout << "Pads: "; @@ -141,11 +141,6 @@ struct pooling_operators } }; -#include -#define MAX_PRINT 16 // TEMPCODE RJS -#define GPU_JUNK 0 -#define GPU_4COL false - template struct verify_forward_pooling { @@ -154,8 +149,8 @@ struct verify_forward_pooling cpu(const tensor& input, const miopen::PoolingDescriptor& filter, std::vector&) const { // const bool is_default_layout = input.desc.IsDefaultLayout(); - const int sptl_dim_offset = 2; // is_default_layout ? 2 : 1; TEMPCODE RJS - const int chan_dim_offset = 1; // is_default_layout ? 1 : SptDim + 1; + const int sptl_dim_offset = 2; + const int chan_dim_offset = 1; auto out = get_output_tensor(filter, input); @@ -169,9 +164,6 @@ struct verify_forward_pooling std::copy_n(filter.GetLengths().begin(), SptDim, kers.begin()); auto pooler = pooling_operators{filter}; - // TEMPCODE RJS print input tensor - bool printing = in_dim[0]<=MAX_PRINT && in_dim[1]<=MAX_PRINT; - if(in_dim.size() > 2) printing &= in_dim[2]<=MAX_PRINT; int b_n = out.desc.GetLengths()[0]; int k_n = out.desc.GetLengths()[chan_dim_offset]; std::array out_spatial_len{}; @@ -221,59 +213,6 @@ struct verify_forward_pooling }); out(o, w, out_spatial_id_pack...) = T(pooler.finalize(acc, pool_size)); }); - if(printing) - { - std::cout << "CPU out: "; - auto outlen = out.desc.GetLengths(); - auto outstr = out.desc.GetStrides(); - for(auto dim : outlen) std::cout << std::setw(4) << dim; - std::cout << " | "; - for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str; - std::cout << " | "; - for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str; - std::cout << " | "; - for(auto str : filter.GetPads()) std::cout << std::setw(4) << str; - std::cout << std::endl; - - for(int nn = 0; nn < outlen[0]; ++nn) { - for(int cc = 0; cc < outlen[1]; ++cc) { - std::cout << "n=" << nn << " c=" << cc <(out.data.size()); - auto junk_dev = handle.Create(junk.data.size()); // Workspace wspace{}; wspace.Write(indices); @@ -305,59 +242,11 @@ struct verify_forward_pooling out_dev.get(), true, wspace.ptr(), - wspace.size(), - junk_dev.get()); // TEMPCODE RJS + wspace.size()); indices = wspace.Read>(); handle.ReadTo(out.data.data(), out_dev, out.data.size() * sizeof(T)); - bool printing = input.desc.GetLengths()[2] <= MAX_PRINT && input.desc.GetLengths()[3] <= MAX_PRINT; - if(input.desc.GetLengths().size() > 4) printing &= input.desc.GetLengths()[4] <= MAX_PRINT; - if(printing) - { - if(GPU_4COL){ - std::cout << "GPU (8-cols): " << std::endl; - for(int idx = 0; idx < GPU_JUNK; ++idx) { - std::cout << std::setw(11) << std::setprecision(5) << out.data[idx] << " "; - if((idx % 8) == 7) std::cout < 0){ - handle.ReadTo(junk.data.data(), junk_dev, junk.data.size() * sizeof(T)); - std::cout << "GPU junk: " << std::endl; - for(int idx = 0; idx < GPU_JUNK; ++idx) { - std::cout << std::setw(11) << std::setprecision(5) << junk.data[idx] << " "; - if((idx % 4) == 3) std::cout <( // - // {{"miopenIndexUint8", - // "miopenIndexUint16", - // "miopenIndexUint32", - // "miopenIndexUint64"}, // - // {"miopenIndexUint8", "miopenIndexUint32"}, // - // {"miopenIndexUint32"}} // + generate_multi_data( // + {{"miopenIndexUint8", + "miopenIndexUint16", + "miopenIndexUint32", + "miopenIndexUint64"}, // + {"miopenIndexUint8", "miopenIndexUint32"}, // + {"miopenIndexUint32"}} // )); add(mode, "mode", generate_data( - {"miopenPoolingMax", "miopenPoolingAverage"})); // , "miopenPoolingAverageInclusive" + {"miopenPoolingMax", "miopenPoolingAverage", "miopenPoolingAverageInclusive"})); #if TEST_PADDING_MODE == 1 add(pmode, "pmode", generate_data({"default", "same", "valid"})); #endif @@ -656,60 +544,16 @@ struct pooling_driver : test_driver { std::vector indices{}; auto input = tensor{layout, in_shape}; - for(auto& v : input.data) v = gen_value(); - - // TEMPCODE RJS print input tensor - bool printing = in_shape[0]<=MAX_PRINT && in_shape[1]<=MAX_PRINT; - if (in_shape.size() > 2) printing &= in_shape[2]<=MAX_PRINT; - if(printing) - { - auto inlen = input.desc.GetLengths(); - auto instr = input.desc.GetStrides(); - std::cout << "CPU GEN : " << input.desc.GetLayout_str() << "(" << inlen.size() << ") | " << input.data.size() << " | " << input.desc.GetElementSpace() << " | "; - for(auto dim : inlen) std::cout << std::setw(4) << dim; - std::cout << " | "; - for(auto str : instr) std::cout << std::setw(4) << str; - std::cout << " | "; - for(auto str : filter.GetLengths()) std::cout << std::setw(4) << str; - std::cout << " | "; - for(auto str : filter.GetStrides()) std::cout << std::setw(4) << str; - std::cout << " | "; - for(auto str : filter.GetPads()) std::cout << std::setw(4) << str; - std::cout << std::endl; - - for(int nn = 0; nn < inlen[0]; ++nn) { - for(int cc = 0; cc < inlen[1]; ++cc) { - for(int hh = 0; hh < inlen[2]; ++hh) { - for(int ww = 0; ww < inlen[3]; ++ww) {// nn * instr[0] + cc * instr[1] + hh * instr[2] + ww * instr[3] - std::cout << std::setw(11) << std::setprecision(5) << input.data[input.desc.GetIndex(nn, cc, hh, ww)] << " "; - } - std::cout << std::endl; - } - } - } - } + for(auto& v : input.data) v = gen_value(); // TODO RJS use generate auto out = verify(verify_forward_pooling{}, input, filter, indices); - - // auto dout = out.first; - // dout.generate(tensor_elem_gen_integer{2503}); - // verify(verify_backward_pooling{}, // TEMPCODE RJS no backward - // input, - // dout, - // out.first, - // filter, - // indices, - // wsidx != 0, - // static_cast(this->verify_indices)); } void run() { - if(miopen::ToUpper(mode) == "MAX") return; // TEMPCODE RJS skip all except max, do max only - auto idx_typ = index_type_lookup.at(miopen::ToUpper(index_type)); auto idx_sz = sizeof(uint8_t); int sptl_dim = in_shape.size() - 2; @@ -719,7 +563,6 @@ struct pooling_driver : test_driver // Input dimensions to the driver are always NCHW-style const bool is_default_layout = !(layout == miopenTensorNHWC || layout == miopenTensorNDHWC); - std::cout << "################## pooling_driver run layout=" << (int)layout << " is_default=" << (int)is_default_layout << std::endl; filter = miopen::PoolingDescriptor { @@ -887,7 +730,7 @@ struct pooling_driver : test_driver if(sptl_dim != 2 && sptl_dim != 3) { show_command(); - std::cout << "Warning: Config skipped becuse it is not supported " // + std::cout << "Warning: Config skipped because it is not supported " // "(sptl_dim != 2 && sptl_dim != 3)" << std::endl; return; @@ -898,7 +741,7 @@ struct pooling_driver : test_driver if(lens[i] > (input_desc.GetLengths()[i + 2] + static_cast(2) * pads[i])) { show_command(); - std::cout << "Warning: Config skipped becuse it is invalid " + std::cout << "Warning: Config skipped because it is invalid " "(lens[i] > (input_desc.GetLengths()[i + 2] + 2 * pads[i]))" << std::endl; return; From 98b6dafee3c8fe263bf599a156a0e0f1a29cfc8f Mon Sep 17 00:00:00 2001 From: "Randy J. Spaulding" Date: Tue, 8 Oct 2024 07:44:42 -0700 Subject: [PATCH 09/10] cleanup --- src/CMakeLists.txt | 7 +---- .../miopen/problem_description_layout.hpp | 4 +-- src/include/miopen/tensor.hpp | 2 +- src/kernels/pooling_functions.h | 2 +- src/solver/pooling/forwardNaive.cpp | 5 ++-- test/gtest/ex1.cpp | 2 +- test/gtest/layout_transpose.cpp | 5 +--- test/gtest/pooling_testing.hpp | 10 +++---- test/pooling2d.hpp | 11 +++---- test/pooling3d.hpp | 18 ++++++------ test/pooling_common.hpp | 29 ++++++++++--------- 11 files changed, 42 insertions(+), 53 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 359604bfd3..5855cf20e7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -26,11 +26,6 @@ cmake_policy(SET CMP0057 NEW) -include_directories(SYSTEM - /opt/rocm/include/gtest - /opt/rocm/include -) - include(ExportHeader) if(MIOPEN_ENABLE_SQLITE) add_subdirectory(sqlite) @@ -600,7 +595,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/xform_bidirect_winograd_out.s kernels/UniversalTranspose.cl) - # Kernels in development lists. + # Kernels in development lists. # Should be ALWAYS empty in develop branch (at the time of PR merge) # Intention: to speed up kernel development rebuild time set(MIOPEN_DEVELOPMENT_KERNELS) diff --git a/src/include/miopen/problem_description_layout.hpp b/src/include/miopen/problem_description_layout.hpp index aa20c4058c..780086afe6 100644 --- a/src/include/miopen/problem_description_layout.hpp +++ b/src/include/miopen/problem_description_layout.hpp @@ -39,8 +39,8 @@ struct ProblemDescriptionLayoutBase : ProblemDescriptionBase { ProblemDescriptionLayoutBase() = default; ProblemDescriptionLayoutBase(const ProblemDescriptionLayoutBase&) = default; -ProblemDescriptionLayoutBase(const TensorDescriptor& in_, // x for Forward, y for Backward* - const TensorDescriptor& out_ // y for Forward, x for Backward* + ProblemDescriptionLayoutBase(const TensorDescriptor& in_, // x for Forward, y for Backward* + const TensorDescriptor& out_ // y for Forward, x for Backward* ) : ProblemDescriptionBase(), in(in_), diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp index ec368dbb83..3059622374 100644 --- a/src/include/miopen/tensor.hpp +++ b/src/include/miopen/tensor.hpp @@ -265,7 +265,7 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor { if(labels.size() != strides.size()) { - std::ostringstream oss; + std::ostringstream oss; // TODO TRJS check this print oss << "Invalid labels size. labels='" << labels << "', strides size=" << strides.size() << ". Layout labels size must be equivalent to stride size"; MIOPEN_THROW(oss.str().c_str()); diff --git a/src/kernels/pooling_functions.h b/src/kernels/pooling_functions.h index 6f53a2daab..aaaa431f53 100644 --- a/src/kernels/pooling_functions.h +++ b/src/kernels/pooling_functions.h @@ -39,7 +39,7 @@ typedef MLO_POOLING_INDEX_TYPE index_t; #define MLO_POOLING_OP_STC 2 #define MLO_POOLING_OP_AVE_INCLUSIVE 3 -#ifndef MLO_POOLING_OP_ID +#ifndef MLO_POOLING_OP_ID // TODO TRJS check: default used to be 0 #define MLO_POOLING_OP_ID 1 #endif diff --git a/src/solver/pooling/forwardNaive.cpp b/src/solver/pooling/forwardNaive.cpp index 9039d49836..5bc4835ff5 100644 --- a/src/solver/pooling/forwardNaive.cpp +++ b/src/solver/pooling/forwardNaive.cpp @@ -82,7 +82,7 @@ bool PoolingForwardNaive::IsApplicable(const ExecutionContext&, return (problem.GetDirection() == miopen::pooling::Direction::Forward) // && (x_type == y_type) // && (std::find(types.cbegin(), types.cend(), x_type) != types.cend()) // - && (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend()) //) + && (std::find(modes.cbegin(), modes.cend(), mode) != modes.cend()) // && (std::find(layouts.cbegin(), layouts.cend(), x_layout) != layouts.end()); } @@ -95,7 +95,7 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, const auto bot = problem.GetXDesc(); const auto top = problem.GetYDesc(); const bool is2d = (bot.GetNumDims() == 4); - const bool isTranspose = problem.GetXDesc().GetLayout_str()[1] != 'C'; + const bool isTranspose = problem.GetXDesc().GetLayout_str()[1] != 'C'; // TODO TRJS create member func // To compact code: const auto& pooling = problem.GetPooling(); @@ -204,7 +204,6 @@ PoolingForwardNaive::GetSolution(const ExecutionContext& context, const auto g1 = RoundUpNearestPower2Positive(isTranspose ? top_d : all_c); const auto g2 = RoundUpNearestPower2Positive(isTranspose || is2d_kernel ? top_h : top_d); - // TODO RJS: finish NHWC grid auto work_left = wavesize / 1; const auto w0 = (g0 < work_left) ? g0 : work_left; work_left /= w0; diff --git a/test/gtest/ex1.cpp b/test/gtest/ex1.cpp index 4b1ab5e2b1..324a3055cb 100644 --- a/test/gtest/ex1.cpp +++ b/test/gtest/ex1.cpp @@ -1,5 +1,5 @@ #include - +// TODO TRJS delete file struct paramType { std::string value; }; class MyFixture : public testing::TestWithParam {}; diff --git a/test/gtest/layout_transpose.cpp b/test/gtest/layout_transpose.cpp index bab82000d7..54d6cc21a7 100644 --- a/test/gtest/layout_transpose.cpp +++ b/test/gtest/layout_transpose.cpp @@ -274,10 +274,7 @@ struct LayoutTransposeTest_2D : public ::testing::TestWithParam{tensor_len, tensor_strides}; - - t_src.generate(gen_value); - + auto t_src = tensor{tensor_len, tensor_strides}.generate(gen_value); auto t_dst = tensor{tensor_len, tensor_strides}; auto t_dst_gpu = tensor{tensor_len, tensor_strides}; diff --git a/test/gtest/pooling_testing.hpp b/test/gtest/pooling_testing.hpp index eacd9ebeb6..8e94032708 100644 --- a/test/gtest/pooling_testing.hpp +++ b/test/gtest/pooling_testing.hpp @@ -525,7 +525,7 @@ struct pooling_driver : test_driver static void randomize_tensor(tensor& in) { static tensor random_data{{1}}; - static tensor starts{{1}}; + static tensor starts{std::vector{1}}; static size_t start_idx = 0; const auto size = in.GetSize(); @@ -534,22 +534,22 @@ struct pooling_driver : test_driver { random_data = tensor{{ran_size}}.generate(tensor_elem_gen_integer{2503}); } - if (starts.GetSize() == 1) + if (starts.GetSize() == 1) // TODO TRJS is there a cleaner way to initialize starts? { - starts = tensor{{1 << 20}}.generate(gen_start); + starts = tensor{std::vector{1 << 20}}.generate(gen_start); } const auto r_start = starts[start_idx++] % (random_data.GetSize() / 3); if (start_idx >= starts.GetSize()) start_idx = 0; - std::cout << "randomizing " << std::setw(9) << size << " elems from " << std::setw(9) << r_start << " (" << start_idx << ")" + std::cout << "randomizing " << std::setw(9) << size << " elems from " << std::setw(9) << r_start << " (" << start_idx << ")" // TRJS // << "(" << std::setw(8) << prng::gen_0_to_B(size / 2) << std::setw(8) << prng::gen_0_to_B(size / 2) << std::setw(8) << prng::gen_0_to_B(size / 2) << std::setw(8) << prng::gen_0_to_B(size / 2) << ")" << std::endl; in.data.assign(random_data.begin() + r_start, random_data.begin() + r_start + size); } std::unordered_map index_type_lookup = { - {miopen::ToUpper("miopenIndexUint8"), miopenIndexUint8}, + {miopen::ToUpper("miopenIndexUint8"), miopenIndexUint8}, {miopen::ToUpper("miopenIndexUint16"), miopenIndexUint16}, {miopen::ToUpper("miopenIndexUint32"), miopenIndexUint32}, {miopen::ToUpper("miopenIndexUint64"), miopenIndexUint64}, diff --git a/test/pooling2d.hpp b/test/pooling2d.hpp index 546b40876f..1bd44a364b 100644 --- a/test/pooling2d.hpp +++ b/test/pooling2d.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -// #include "gtest/pooling_testing.hpp" #include "pooling_common.hpp" #define WORKAROUND_ISSUE_1670 1 @@ -38,13 +37,11 @@ struct pooling2d_shapes static std::vector get_2d_pooling_input_shapes() { return { - {5, 32, 8, 8} // TEMPCODE RUN - , + {5, 32, 8, 8}, {16, 1, 4096, 4096}, {1, 16, 4096, 4096}, {1, 1024, 512, 512}, - {16, 1024, 128, 128} - , + {16, 1024, 128, 128}, {1, 832, 64, 128}, {10, 3, 32, 32}, {1, 19, 1024, 2048}, @@ -65,7 +62,7 @@ struct pooling2d_shapes } // Dataset 1 is intended for testing of asymmetric configs. - static std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 1, 8, 8}, {10, 3, 32, 32}}; } + static std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 4, 4, 4}, {10, 3, 32, 32}}; } // Dataset 2 is intended for testing of configs with wide window. static std::vector get_2d_pooling_input_shapes_wide() @@ -96,6 +93,7 @@ struct pooling2d_driver : pooling_driver public: pooling2d_driver() : pooling_driver() { + // clang-format off #if TEST_GET_INPUT_TENSOR std::set in_dim_set = get_inputs(this->batch_factor); std::vector in_dim_vec(in_dim_set.begin(), in_dim_set.end()); @@ -120,7 +118,6 @@ struct pooling2d_driver : pooling_driver {{1, 1}} } )); - // clang-format off this->add(this->pads, "pads", this->template generate_multi_data({ {{0, 0}, {1, 1}}, // #if WORKAROUND_ISSUE_1670 diff --git a/test/pooling3d.hpp b/test/pooling3d.hpp index 341cc34660..ffaaf4a9ee 100644 --- a/test/pooling3d.hpp +++ b/test/pooling3d.hpp @@ -33,15 +33,15 @@ struct pooling3d_shapes static std::vector get_3d_pooling_input_shapes() { - return {{16, 64, 3, 4, 4} // TEMPCODE RUN - , - {16, 32, 4, 9, 9}, - {8, 512, 3, 14, 14}, - {8, 512, 4, 28, 28}, - {16, 64, 56, 56, 56}, - {4, 3, 4, 227, 227}, - {4, 4, 4, 161, 700} - }; + return { + {16, 64, 3, 4, 4}, + {16, 32, 4, 9, 9}, + {8, 512, 3, 14, 14}, + {8, 512, 4, 28, 28}, + {16, 64, 56, 56, 56}, + {4, 3, 4, 227, 227}, + {4, 4, 4, 161, 700} + }; } }; diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp index f42b6da6aa..0156f781a3 100644 --- a/test/pooling_common.hpp +++ b/test/pooling_common.hpp @@ -62,15 +62,15 @@ static int num_uint64_case_imgidx = 0; namespace { -constexpr int RAND_INTEGER_MAX = 1200; -constexpr int RAND_INTEGER_MIN = -880; +constexpr int RAND_INTEGER_MAX = 12000; +constexpr int RAND_INTEGER_MIN = -8800; template auto gen_value = - [](auto... is) { return static_cast(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX)) / 10; }; + [](auto... is) { return static_cast(prng::gen_A_to_B(RAND_INTEGER_MIN, RAND_INTEGER_MAX)) / 100; }; } -static inline void print(const miopen::PoolingDescriptor& filter, bool is_default_layout) +static inline void print(const miopen::PoolingDescriptor& filter) { std::cout << "Pooling: "; if(filter.GetMode() == miopenPoolingAverage) @@ -80,7 +80,6 @@ static inline void print(const miopen::PoolingDescriptor& filter, bool is_defaul else std::cout << "Max"; std::cout << std::endl; - std::cout << "Layout: " << (is_default_layout ? "default" : "transposed") << std::endl; std::cout << "Lengths: "; miopen::LogRange(std::cout, filter.GetLengths(), ", ") << std::endl; std::cout << "Pads: "; @@ -148,7 +147,6 @@ struct verify_forward_pooling tensor cpu(const tensor& input, const miopen::PoolingDescriptor& filter, std::vector&) const { - // const bool is_default_layout = input.desc.IsDefaultLayout(); const int sptl_dim_offset = 2; const int chan_dim_offset = 1; @@ -276,10 +274,6 @@ struct verify_backward_pooling bool use_global_index, bool verify_index) const { - const bool is_default_layout = input.desc.IsDefaultLayout(); - const int sptl_dim_offset = is_default_layout ? 2 : 1; - const int chan_dim_offset = is_default_layout ? 1 : SptDim + 1; - auto dinput = input; std::vector din_vec(input.desc.GetElementSpace(), 0.0); @@ -550,6 +544,16 @@ struct pooling_driver : test_driver input, filter, indices); + auto dout = out.first; + dout.generate(tensor_elem_gen_integer{2503}); + verify(verify_backward_pooling{}, + input, + dout, + out.first, + filter, + indices, + wsidx != 0, + static_cast(this->verify_indices)); } void run() @@ -561,9 +565,6 @@ struct pooling_driver : test_driver (dataset_id == 0) && full_set; // Otherwise the default dataset takes too much time. const bool wide_dataset = (dataset_id == 2) && full_set; - // Input dimensions to the driver are always NCHW-style - const bool is_default_layout = !(layout == miopenTensorNHWC || layout == miopenTensorNDHWC); - filter = miopen::PoolingDescriptor { mode_lookup.at(miopen::ToUpper(mode)), @@ -766,7 +767,7 @@ struct pooling_driver : test_driver } } - int sptl_index = is_default_layout ? 2 : 1; + constexpr int sptl_index = 2; std::vector in_dim(input_desc.GetLengths().begin() + sptl_index, input_desc.GetLengths().begin() + sptl_index + sptl_dim); From 964bc57f209c065ad5c8a524f59475b2f45476b7 Mon Sep 17 00:00:00 2001 From: "Randy J. Spaulding" Date: Tue, 8 Oct 2024 08:13:52 -0700 Subject: [PATCH 10/10] fix build --- test/pooling_common.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp index 0156f781a3..c9f787b3ed 100644 --- a/test/pooling_common.hpp +++ b/test/pooling_common.hpp @@ -255,7 +255,7 @@ struct verify_forward_pooling const std::vector&) const { std::cout << "Forward "; - print(filter, input.desc.IsDefaultLayout()); + print(filter); std::cout << "Input tensor: " << input.desc.ToString() << std::endl; std::cout << "Output tensor: " << filter.GetForwardOutputTensor(input.desc).ToString() << std::endl; @@ -276,6 +276,9 @@ struct verify_backward_pooling { auto dinput = input; + constexpr int sptl_dim_offset = 2; + constexpr int chan_dim_offset = 1; + std::vector din_vec(input.desc.GetElementSpace(), 0.0); CHECK(dout.desc == out.desc); std::array in_dim{}; @@ -468,7 +471,7 @@ struct verify_backward_pooling bool) const { std::cout << "Backward "; - print(filter, input.desc.IsDefaultLayout()); + print(filter); std::cout << "Input tensor: " << input.desc.ToString() << std::endl; std::cout << "Output tensor: " << out.desc.ToString() << std::endl; } @@ -546,7 +549,7 @@ struct pooling_driver : test_driver indices); auto dout = out.first; dout.generate(tensor_elem_gen_integer{2503}); - verify(verify_backward_pooling{}, + verify(verify_backward_pooling{}, input, dout, out.first,