diff --git a/data/kernels/basic.cl b/data/kernels/basic.cl
index eef502c36c8c..728ffb0a87d8 100644
--- a/data/kernels/basic.cl
+++ b/data/kernels/basic.cl
@@ -3722,8 +3722,6 @@ interpolation_resample (read_only image2d_t in,
   // store final result
   if (iy == 0 && x < width && y < height)
   {
-    // Clip negative RGB that may be produced by Lanczos undershooting
-    // Negative RGB are invalid values no matter the RGB space (light is positive)
     write_ipixel(out, (int2)(x, y), buffer[ylid]);
   }
 }
diff --git a/data/kernels/common.h b/data/kernels/common.h
index fc9db1fb1ae7..b9ccc08d07b1 100644
--- a/data/kernels/common.h
+++ b/data/kernels/common.h
@@ -280,5 +280,5 @@ static inline float readalpha(read_only image2d_t in, int col, int row)
 
 static inline void write_ipixel(write_only image2d_t out, const int2 pos, const float4 pixel)
 {
-  write_imagef(out, pos, fmax(0.0f, pixel));
+  write_imagef(out, pos, pixel); // stored as-is: the former fmax(0.0f, ...) clamp is gone, so negative values pass through
 }
\ No newline at end of file
diff --git a/data/kernels/extended.cl b/data/kernels/extended.cl
index 10597d6da379..62687adf53b6 100644
--- a/data/kernels/extended.cl
+++ b/data/kernels/extended.cl
@@ -42,7 +42,7 @@ graduatedndp (read_only image2d_t in,
   float len = length_base + y*length_inc_y + x*length_inc_x;
   float dens = dtcl_exp2(density * clipf(0.5f + len));
   pixel = pixel / (color + ((float4)1.0f - color) * (float4)dens);
-  write_imagef (out, (int2)(x, y), fmax(0.0f, pixel));
+  write_ipixel(out, (int2)(x, y), pixel);
 }
 
 
@@ -67,7 +67,7 @@ graduatedndm (read_only image2d_t in,
   float len = length_base + y*length_inc_y + x*length_inc_x;
   float dens = dtcl_exp2(-density * clipf(0.5f - len));
   pixel = pixel * (color + ((float4)1.0f - color) * (float4)dens);
-  write_imagef(out, (int2)(x, y), fmax(0.0f, pixel));
+  write_ipixel(out, (int2)(x, y), pixel);
 }
 
 __kernel void
diff --git a/src/common/interpolation.c b/src/common/interpolation.c
index 999fef0debfb..e89bded163a5 100644
--- a/src/common/interpolation.c
+++ b/src/common/interpolation.c
@@ -44,9 +44,6 @@ enum border_mode
 // !! Make sure to sync this with the filter array !!
 #define MAX_HALF_FILTER_WIDTH 3
 
-// Add *verbose* (like one msg per pixel out) debug message to stderr
-#define DEBUG_PRINT_VERBOSE 0
-
 /* --------------------------------------------------------------------------
  * Debug helpers
  * ------------------------------------------------------------------------*/
@@ -552,7 +549,6 @@ float dt_interpolation_compute_sample(const dt_interpolation_t *itor,
       s += kernelv[i] * h;
       in += linestride;
     }
-    s = _interpolated_out(s * oonorm);
   }
   else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
   {
@@ -574,9 +570,8 @@ float dt_interpolation_compute_sample(const dt_interpolation_t *itor,
       }
       s += kernelv[i] * h;
     }
-    s = _interpolated_out(s * oonorm);
   }
-  return s; // if called for masks make sure to CLIP to avoid interpolator under/overshoots
+  return s * oonorm; // not clamped here: callers sampling masks must CLIP the result to avoid interpolator under/overshoots
 }
 
 /* --------------------------------------------------------------------------
@@ -645,7 +640,7 @@ void dt_interpolation_compute_pixel4c(const dt_interpolation_t *itor,
     }
 
     for_each_channel(c,aligned(out))
-      out[c] = _interpolated_out(pixel[c] * oonorm);
+      out[c] = pixel[c] * oonorm;
   }
   else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
   {
@@ -675,7 +670,7 @@ void dt_interpolation_compute_pixel4c(const dt_interpolation_t *itor,
     }
 
     for_each_channel(c,aligned(out))
-      out[c] = _interpolated_out(pixel[c] * oonorm);
+      out[c] = pixel[c] * oonorm;
   }
   else
   {
@@ -1118,7 +1113,7 @@ void dt_interpolation_resample(const dt_interpolation_t *itor,
 
       dt_aligned_pixel_t pixel;
       for_each_channel(c, aligned(vs:16))
-        pixel[c] = _interpolated_out(vs[c]);
+        pixel[c] = vs[c];
       copy_pixel_nontemporal(out + baseidx, pixel);
 
       // Reset vertical resampling context
diff --git a/src/common/math.h b/src/common/math.h
index ff48555a2f47..689dc1061abd 100644
--- a/src/common/math.h
+++ b/src/common/math.h
@@ -811,15 +811,6 @@ static inline double rad2deg(const double radians)
   return radians / M_PI * 180.0;
 }
 
-/* Reminder: keep in sync with opencl write_ipixel()
-   All pixel interpolators use this, currently we restrict output of resampling to be at least zero
-*/
-static inline float _interpolated_out(const float val)
-{
-  return MAX(0.0f, val);
-}
-
-
 // clang-format off
 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py
 // vim: shiftwidth=2 expandtab tabstop=2 cindent
diff --git a/src/iop/graduatednd.c b/src/iop/graduatednd.c
index d70f36b79fbf..2cfe25700690 100644
--- a/src/iop/graduatednd.c
+++ b/src/iop/graduatednd.c
@@ -743,35 +743,9 @@ int scrolled(
   return 0;
 }
 
-DT_OMP_DECLARE_SIMD(simdlen(4))
-static inline float _density_times_length(const float dens, const float length)
-{
-  return (dens * CLIP(0.5f + length) / 8.0f);
-}
-
-DT_OMP_DECLARE_SIMD(simdlen(4))
 static inline float _compute_density(const float dens, const float length)
 {
-#ifdef __FAST_MATH__
-  // !!! approximation is ok only when highest density is 8
-  // for input x = (data->density * CLIP( 0.5+length ), calculate 2^x as (e^(ln2*x/8))^8
-  // use exp2f approximation to calculate e^(ln2*x/8)
-  // in worst case - density==8,CLIP(0.5-length) == 1.0 it gives 0.6% of error
-  const float t = M_LN2f * _density_times_length(dens,length);
-  const float d1 = t * t * 0.5f;
-  const float d2 = d1 * t * 0.333333333f;
-  const float d3 = d2 * t * 0.25f;
-  const float d = 1 + t + d1 + d2 + d3; /* taylor series for e^x till x^4 */
-  float density = d * d;
-  density = density * density;
-  density = density * density;
-#else
-  // use fair exp2f
-  // for GCC10 on recent hardware, exp2f is actually faster than the above approximation,
-  // but it does not vectorize so it is slower overall
-  const float density = exp2f(dens * CLIP(0.5f + length));
-#endif
-  return density;
+  return exp2f(dens * CLIP(0.5f + length)); // exp2f on every build; the __FAST_MATH__ Taylor approximation is removed
 }
 
 void process(dt_iop_module_t *self,
@@ -813,7 +787,6 @@ void process(dt_iop_module_t *self,
   // these into registers when it vectorizes
   const dt_aligned_pixel_t color = { data->color[0], data->color[1], data->color[2], data->color[3] };
   const dt_aligned_pixel_t color1 = { data->color1[0], data->color1[1], data->color1[2], data->color1[3] };
-  const dt_aligned_pixel_t zero = { 0.0f, 0.0f, 0.0f, 0.0f };
 
   if(density > 0)
   {
@@ -843,7 +816,7 @@ void process(dt_iop_module_t *self,
           dt_aligned_pixel_t res;	// the compiler will optimize this into a register
           for_each_channel(l, aligned(in : 16))
           {
-            res[l] = MAX(zero[l], (in[4*(x+i)+l] / (color[l] + color1[l] * curr_density[i])));
+            res[l] = in[4*(x+i)+l] / (color[l] + color1[l] * curr_density[i]);
           }
           // use streaming writes to eliminate the memory reads from loading cache lines
           copy_pixel_nontemporal(out + 4*(x+i), res);
@@ -857,7 +830,7 @@ void process(dt_iop_module_t *self,
         dt_aligned_pixel_t res;	// the compiler will optimize this into a register
         for_each_channel(l, aligned(in : 16))
         {
-          res[l] = MAX(zero[l], (in[4*x+l] / (color[l] + color1[l] * curr_density)));
+          res[l] = in[4*x+l] / (color[l] + color1[l] * curr_density);
         }
         // use streaming writes to eliminate the memory reads from loading cache lines
         copy_pixel_nontemporal(out + 4*x, res);
@@ -894,7 +867,7 @@ void process(dt_iop_module_t *self,
           dt_aligned_pixel_t res;	// the compiler will optimize this into a register
           for_each_channel(l, aligned(in : 16))
           {
-            res[l] = MAX(zero[l], (in[4*(x+i)+l] * (color[l] + color1[l] * curr_density[i])));
+            res[l] = in[4*(x+i)+l] * (color[l] + color1[l] * curr_density[i]);
           }
           // use streaming writes to eliminate the memory reads from loading cache lines
           copy_pixel_nontemporal(out + 4*(x+i), res);
@@ -908,7 +881,7 @@ void process(dt_iop_module_t *self,
         dt_aligned_pixel_t res;	// the compiler will optimize this into a register
         for_each_channel(l, aligned(in : 16))
         {
-          res[l] = MAX(zero[l], (in[4*x+l] * (color[l] + color1[l] * curr_density)));
+          res[l] = in[4*x+l] * (color[l] + color1[l] * curr_density);
         }
         // use streaming writes to eliminate the memory reads from loading cache lines
         copy_pixel_nontemporal(out + 4*x, res);