diff --git a/data/kernels/basic.cl b/data/kernels/basic.cl index eef502c36c8c..728ffb0a87d8 100644 --- a/data/kernels/basic.cl +++ b/data/kernels/basic.cl @@ -3722,8 +3722,6 @@ interpolation_resample (read_only image2d_t in, // store final result if (iy == 0 && x < width && y < height) { - // Clip negative RGB that may be produced by Lanczos undershooting - // Negative RGB are invalid values no matter the RGB space (light is positive) write_ipixel(out, (int2)(x, y), buffer[ylid]); } } diff --git a/data/kernels/common.h b/data/kernels/common.h index fc9db1fb1ae7..b9ccc08d07b1 100644 --- a/data/kernels/common.h +++ b/data/kernels/common.h @@ -280,5 +280,5 @@ static inline float readalpha(read_only image2d_t in, int col, int row) static inline void write_ipixel(write_only image2d_t out, const int2 pos, const float4 pixel) { - write_imagef(out, pos, fmax(0.0f, pixel)); + write_imagef(out, pos, pixel); } \ No newline at end of file diff --git a/data/kernels/extended.cl b/data/kernels/extended.cl index 10597d6da379..62687adf53b6 100644 --- a/data/kernels/extended.cl +++ b/data/kernels/extended.cl @@ -42,7 +42,7 @@ graduatedndp (read_only image2d_t in, float len = length_base + y*length_inc_y + x*length_inc_x; float dens = dtcl_exp2(density * clipf(0.5f + len)); pixel = pixel / (color + ((float4)1.0f - color) * (float4)dens); - write_imagef (out, (int2)(x, y), fmax(0.0f, pixel)); + write_ipixel(out, (int2)(x, y), pixel); } @@ -67,7 +67,7 @@ graduatedndm (read_only image2d_t in, float len = length_base + y*length_inc_y + x*length_inc_x; float dens = dtcl_exp2(-density * clipf(0.5f - len)); pixel = pixel * (color + ((float4)1.0f - color) * (float4)dens); - write_imagef(out, (int2)(x, y), fmax(0.0f, pixel)); + write_ipixel(out, (int2)(x, y), pixel); } __kernel void diff --git a/src/common/interpolation.c b/src/common/interpolation.c index 999fef0debfb..e89bded163a5 100644 --- a/src/common/interpolation.c +++ b/src/common/interpolation.c @@ -44,9 +44,6 @@ enum border_mode // !! Make sure to sync this with the filter array !! #define MAX_HALF_FILTER_WIDTH 3 -// Add *verbose* (like one msg per pixel out) debug message to stderr -#define DEBUG_PRINT_VERBOSE 0 - /* -------------------------------------------------------------------------- * Debug helpers * ------------------------------------------------------------------------*/ @@ -552,7 +549,6 @@ float dt_interpolation_compute_sample(const dt_interpolation_t *itor, s += kernelv[i] * h; in += linestride; } - s = _interpolated_out(s * oonorm); } else if(ix >= 0 && iy >= 0 && ix < width && iy < height) { @@ -574,9 +570,8 @@ float dt_interpolation_compute_sample(const dt_interpolation_t *itor, } s += kernelv[i] * h; } - s = _interpolated_out(s * oonorm); } - return s; // if called for masks make sure to CLIP to avoid interpolator under/overshoots + return s * oonorm; // if called for masks make sure to CLIP to avoid interpolator under/overshoots } /* -------------------------------------------------------------------------- @@ -645,7 +640,7 @@ void dt_interpolation_compute_pixel4c(const dt_interpolation_t *itor, } for_each_channel(c,aligned(out)) - out[c] = _interpolated_out(pixel[c] * oonorm); + out[c] = pixel[c] * oonorm; } else if(ix >= 0 && iy >= 0 && ix < width && iy < height) { @@ -675,7 +670,7 @@ void dt_interpolation_compute_pixel4c(const dt_interpolation_t *itor, } for_each_channel(c,aligned(out)) - out[c] = _interpolated_out(pixel[c] * oonorm); + out[c] = pixel[c] * oonorm; } else { @@ -1118,7 +1113,7 @@ void dt_interpolation_resample(const dt_interpolation_t *itor, dt_aligned_pixel_t pixel; for_each_channel(c, aligned(vs:16)) - pixel[c] = _interpolated_out(vs[c]); + pixel[c] = vs[c]; copy_pixel_nontemporal(out + baseidx, pixel); // Reset vertical resampling context diff --git a/src/common/math.h b/src/common/math.h index ff48555a2f47..689dc1061abd 100644 --- a/src/common/math.h +++ b/src/common/math.h @@ -811,15 +811,6 @@ static inline double rad2deg(const double radians) return radians / M_PI * 180.0; } -/* Reminder: keep in sync with opencl write_ipixel() - All pixel interpolators use this, currently we restrict output of resampling to be at least zero -*/ -static inline float _interpolated_out(const float val) -{ - return MAX(0.0f, val); -} - - // clang-format off // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py // vim: shiftwidth=2 expandtab tabstop=2 cindent diff --git a/src/iop/graduatednd.c b/src/iop/graduatednd.c index d70f36b79fbf..2cfe25700690 100644 --- a/src/iop/graduatednd.c +++ b/src/iop/graduatednd.c @@ -743,35 +743,9 @@ int scrolled( return 0; } -DT_OMP_DECLARE_SIMD(simdlen(4)) -static inline float _density_times_length(const float dens, const float length) -{ - return (dens * CLIP(0.5f + length) / 8.0f); -} - -DT_OMP_DECLARE_SIMD(simdlen(4)) static inline float _compute_density(const float dens, const float length) { -#ifdef __FAST_MATH__ - // !!! approximation is ok only when highest density is 8 - // for input x = (data->density * CLIP( 0.5+length ), calculate 2^x as (e^(ln2*x/8))^8 - // use exp2f approximation to calculate e^(ln2*x/8) - // in worst case - density==8,CLIP(0.5-length) == 1.0 it gives 0.6% of error - const float t = M_LN2f * _density_times_length(dens,length); - const float d1 = t * t * 0.5f; - const float d2 = d1 * t * 0.333333333f; - const float d3 = d2 * t * 0.25f; - const float d = 1 + t + d1 + d2 + d3; /* taylor series for e^x till x^4 */ - float density = d * d; - density = density * density; - density = density * density; -#else - // use fair exp2f - // for GCC10 on recent hardware, exp2f is actually faster than the above approximation, - // but it does not vectorize so it is slower overall - const float density = exp2f(dens * CLIP(0.5f + length)); -#endif - return density; + return exp2f(dens * CLIP(0.5f + length)); } void process(dt_iop_module_t *self, @@ -813,7 +787,6 @@ void process(dt_iop_module_t *self, // these into registers when it vectorizes const dt_aligned_pixel_t color = { data->color[0], data->color[1], data->color[2], data->color[3] }; const dt_aligned_pixel_t color1 = { data->color1[0], data->color1[1], data->color1[2], data->color1[3] }; - const dt_aligned_pixel_t zero = { 0.0f, 0.0f, 0.0f, 0.0f }; if(density > 0) { @@ -843,7 +816,7 @@ void process(dt_iop_module_t *self, dt_aligned_pixel_t res; // the compiler will optimize this into a register for_each_channel(l, aligned(in : 16)) { - res[l] = MAX(zero[l], (in[4*(x+i)+l] / (color[l] + color1[l] * curr_density[i]))); + res[l] = in[4*(x+i)+l] / (color[l] + color1[l] * curr_density[i]); } // use streaming writes to eliminate the memory reads from loading cache lines copy_pixel_nontemporal(out + 4*(x+i), res); @@ -857,7 +830,7 @@ void process(dt_iop_module_t *self, dt_aligned_pixel_t res; // the compiler will optimize this into a register for_each_channel(l, aligned(in : 16)) { - res[l] = MAX(zero[l], (in[4*x+l] / (color[l] + color1[l] * curr_density))); + res[l] = in[4*x+l] / (color[l] + color1[l] * curr_density); } // use streaming writes to eliminate the memory reads from loading cache lines copy_pixel_nontemporal(out + 4*x, res); @@ -894,7 +867,7 @@ void process(dt_iop_module_t *self, dt_aligned_pixel_t res; // the compiler will optimize this into a register for_each_channel(l, aligned(in : 16)) { - res[l] = MAX(zero[l], (in[4*(x+i)+l] * (color[l] + color1[l] * curr_density[i]))); + res[l] = in[4*(x+i)+l] * (color[l] + color1[l] * curr_density[i]); } // use streaming writes to eliminate the memory reads from loading cache lines copy_pixel_nontemporal(out + 4*(x+i), res); @@ -908,7 +881,7 @@ void process(dt_iop_module_t *self, dt_aligned_pixel_t res; // the compiler will optimize this into a register for_each_channel(l, aligned(in : 16)) { - res[l] = MAX(zero[l], (in[4*x+l] * (color[l] + color1[l] * curr_density))); + res[l] = in[4*x+l] * (color[l] + color1[l] * curr_density); } // use streaming writes to eliminate the memory reads from loading cache lines copy_pixel_nontemporal(out + 4*x, res);