From 6596f667c8caba569dbc1a4e6781f21549b42228 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Sat, 9 May 2026 07:53:35 +0200
Subject: [PATCH 1/2] All interpolating functions write data as calculated and
 don't avoid negatives

Until now the interpolating scalers (used in demosaic and finalscale) and all interpolating modules
(ashift, lens, liquify ...) calculated data and made sure output is at least zero.

We did that to avoid undershoots in very dark regions for example by the lanczos variants.

This leads to a few subtle problems:
- we might have valid negative data in some colorspaces so clipping negatives is not desired.
- if a module does not scale but only crops (happening in demosaic or finalscale) the negative
  data are copied for further processing.
- we likely will not clip all channel symmetrically so possibly subtle color casts happen in
  dark areas.

So - make all interpolators write to output as calculated.
---
 data/kernels/basic.cl      | 2 --
 data/kernels/common.h      | 2 +-
 data/kernels/extended.cl   | 4 ++--
 src/common/interpolation.c | 3 ---
 src/common/math.h          | 2 +-
 5 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/data/kernels/basic.cl b/data/kernels/basic.cl
index eef502c36c8c..728ffb0a87d8 100644
--- a/data/kernels/basic.cl
+++ b/data/kernels/basic.cl
@@ -3722,8 +3722,6 @@ interpolation_resample (read_only image2d_t in,
   // store final result
   if (iy == 0 && x < width && y < height)
   {
-    // Clip negative RGB that may be produced by Lanczos undershooting
-    // Negative RGB are invalid values no matter the RGB space (light is positive)
     write_ipixel(out, (int2)(x, y), buffer[ylid]);
   }
 }
diff --git a/data/kernels/common.h b/data/kernels/common.h
index fc9db1fb1ae7..b9ccc08d07b1 100644
--- a/data/kernels/common.h
+++ b/data/kernels/common.h
@@ -280,5 +280,5 @@ static inline float readalpha(read_only image2d_t in, int col, int row)
 
 static inline void write_ipixel(write_only image2d_t out, const int2 pos, const float4 pixel)
 {
-  write_imagef(out, pos, fmax(0.0f, pixel));
+  write_imagef(out, pos, pixel);
 }
\ No newline at end of file
diff --git a/data/kernels/extended.cl b/data/kernels/extended.cl
index 10597d6da379..62687adf53b6 100644
--- a/data/kernels/extended.cl
+++ b/data/kernels/extended.cl
@@ -42,7 +42,7 @@ graduatedndp (read_only image2d_t in,
   float len = length_base + y*length_inc_y + x*length_inc_x;
   float dens = dtcl_exp2(density * clipf(0.5f + len));
   pixel = pixel / (color + ((float4)1.0f - color) * (float4)dens);
-  write_imagef (out, (int2)(x, y), fmax(0.0f, pixel));
+  write_ipixel(out, (int2)(x, y), pixel);
 }
 
 
@@ -67,7 +67,7 @@ graduatedndm (read_only image2d_t in,
   float len = length_base + y*length_inc_y + x*length_inc_x;
   float dens = dtcl_exp2(-density * clipf(0.5f - len));
   pixel = pixel * (color + ((float4)1.0f - color) * (float4)dens);
-  write_imagef(out, (int2)(x, y), fmax(0.0f, pixel));
+  write_ipixel(out, (int2)(x, y), pixel);
 }
 
 __kernel void
diff --git a/src/common/interpolation.c b/src/common/interpolation.c
index 999fef0debfb..811186b6053f 100644
--- a/src/common/interpolation.c
+++ b/src/common/interpolation.c
@@ -44,9 +44,6 @@ enum border_mode
 // !! Make sure to sync this with the filter array !!
 #define MAX_HALF_FILTER_WIDTH 3
 
-// Add *verbose* (like one msg per pixel out) debug message to stderr
-#define DEBUG_PRINT_VERBOSE 0
-
 /* --------------------------------------------------------------------------
  * Debug helpers
  * ------------------------------------------------------------------------*/
diff --git a/src/common/math.h b/src/common/math.h
index ff48555a2f47..d2935d275a8e 100644
--- a/src/common/math.h
+++ b/src/common/math.h
@@ -816,7 +816,7 @@ static inline double rad2deg(const double radians)
 */
 static inline float _interpolated_out(const float val)
 {
-  return MAX(0.0f, val);
+  return val;
 }
 
 

From a04f2533175176cd9d9435d8164308abf19c50c4 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Sat, 9 May 2026 19:44:00 +0200
Subject: [PATCH 2/2] Let's get rid of _interpolated_out()

1. As we don't clip for negatives in output any longer it's not used any longer.
2. Also fix clipping negatives in graduatednd CPU code and use std maths for all builds
   for the sake of quality (less banding).
---
 src/common/interpolation.c | 10 ++++------
 src/common/math.h          |  9 ---------
 src/iop/graduatednd.c      | 37 +++++--------------------------------
 3 files changed, 9 insertions(+), 47 deletions(-)

diff --git a/src/common/interpolation.c b/src/common/interpolation.c
index 811186b6053f..e89bded163a5 100644
--- a/src/common/interpolation.c
+++ b/src/common/interpolation.c
@@ -549,7 +549,6 @@ float dt_interpolation_compute_sample(const dt_interpolation_t *itor,
       s += kernelv[i] * h;
       in += linestride;
     }
-    s = _interpolated_out(s * oonorm);
   }
   else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
   {
@@ -571,9 +570,8 @@ float dt_interpolation_compute_sample(const dt_interpolation_t *itor,
       }
       s += kernelv[i] * h;
     }
-    s = _interpolated_out(s * oonorm);
   }
-  return s; // if called for masks make sure to CLIP to avoid interpolator under/overshoots
+  return s * oonorm; // if called for masks make sure to CLIP to avoid interpolator under/overshoots
 }
 
 /* --------------------------------------------------------------------------
@@ -642,7 +640,7 @@ void dt_interpolation_compute_pixel4c(const dt_interpolation_t *itor,
     }
 
     for_each_channel(c,aligned(out))
-      out[c] = _interpolated_out(pixel[c] * oonorm);
+      out[c] = pixel[c] * oonorm;
   }
   else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
   {
@@ -672,7 +670,7 @@ void dt_interpolation_compute_pixel4c(const dt_interpolation_t *itor,
     }
 
     for_each_channel(c,aligned(out))
-      out[c] = _interpolated_out(pixel[c] * oonorm);
+      out[c] = pixel[c] * oonorm;
   }
   else
   {
@@ -1115,7 +1113,7 @@ void dt_interpolation_resample(const dt_interpolation_t *itor,
 
       dt_aligned_pixel_t pixel;
       for_each_channel(c, aligned(vs:16))
-        pixel[c] = _interpolated_out(vs[c]);
+        pixel[c] = vs[c];
       copy_pixel_nontemporal(out + baseidx, pixel);
 
       // Reset vertical resampling context
diff --git a/src/common/math.h b/src/common/math.h
index d2935d275a8e..689dc1061abd 100644
--- a/src/common/math.h
+++ b/src/common/math.h
@@ -811,15 +811,6 @@ static inline double rad2deg(const double radians)
   return radians / M_PI * 180.0;
 }
 
-/* Reminder: keep in sync with opencl write_ipixel()
-   All pixel interpolators use this, currently we restrict output of resampling to be at least zero
-*/
-static inline float _interpolated_out(const float val)
-{
-  return val;
-}
-
-
 // clang-format off
 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py
 // vim: shiftwidth=2 expandtab tabstop=2 cindent
diff --git a/src/iop/graduatednd.c b/src/iop/graduatednd.c
index d70f36b79fbf..2cfe25700690 100644
--- a/src/iop/graduatednd.c
+++ b/src/iop/graduatednd.c
@@ -743,35 +743,9 @@ int scrolled(
   return 0;
 }
 
-DT_OMP_DECLARE_SIMD(simdlen(4))
-static inline float _density_times_length(const float dens, const float length)
-{
-  return (dens * CLIP(0.5f + length) / 8.0f);
-}
-
-DT_OMP_DECLARE_SIMD(simdlen(4))
 static inline float _compute_density(const float dens, const float length)
 {
-#ifdef __FAST_MATH__
-  // !!! approximation is ok only when highest density is 8
-  // for input x = (data->density * CLIP( 0.5+length ), calculate 2^x as (e^(ln2*x/8))^8
-  // use exp2f approximation to calculate e^(ln2*x/8)
-  // in worst case - density==8,CLIP(0.5-length) == 1.0 it gives 0.6% of error
-  const float t = M_LN2f * _density_times_length(dens,length);
-  const float d1 = t * t * 0.5f;
-  const float d2 = d1 * t * 0.333333333f;
-  const float d3 = d2 * t * 0.25f;
-  const float d = 1 + t + d1 + d2 + d3; /* taylor series for e^x till x^4 */
-  float density = d * d;
-  density = density * density;
-  density = density * density;
-#else
-  // use fair exp2f
-  // for GCC10 on recent hardware, exp2f is actually faster than the above approximation,
-  // but it does not vectorize so it is slower overall
-  const float density = exp2f(dens * CLIP(0.5f + length));
-#endif
-  return density;
+  return exp2f(dens * CLIP(0.5f + length));
 }
 
 void process(dt_iop_module_t *self,
@@ -813,7 +787,6 @@ void process(dt_iop_module_t *self,
   // these into registers when it vectorizes
   const dt_aligned_pixel_t color = { data->color[0], data->color[1], data->color[2], data->color[3] };
   const dt_aligned_pixel_t color1 = { data->color1[0], data->color1[1], data->color1[2], data->color1[3] };
-  const dt_aligned_pixel_t zero = { 0.0f, 0.0f, 0.0f, 0.0f };
 
   if(density > 0)
   {
@@ -843,7 +816,7 @@ void process(dt_iop_module_t *self,
           dt_aligned_pixel_t res;	// the compiler will optimize this into a register
           for_each_channel(l, aligned(in : 16))
           {
-            res[l] = MAX(zero[l], (in[4*(x+i)+l] / (color[l] + color1[l] * curr_density[i])));
+            res[l] = in[4*(x+i)+l] / (color[l] + color1[l] * curr_density[i]);
           }
           // use streaming writes to eliminate the memory reads from loading cache lines
           copy_pixel_nontemporal(out + 4*(x+i), res);
@@ -857,7 +830,7 @@ void process(dt_iop_module_t *self,
         dt_aligned_pixel_t res;	// the compiler will optimize this into a register
         for_each_channel(l, aligned(in : 16))
         {
-          res[l] = MAX(zero[l], (in[4*x+l] / (color[l] + color1[l] * curr_density)));
+          res[l] = in[4*x+l] / (color[l] + color1[l] * curr_density);
         }
         // use streaming writes to eliminate the memory reads from loading cache lines
         copy_pixel_nontemporal(out + 4*x, res);
@@ -894,7 +867,7 @@ void process(dt_iop_module_t *self,
           dt_aligned_pixel_t res;	// the compiler will optimize this into a register
           for_each_channel(l, aligned(in : 16))
           {
-            res[l] = MAX(zero[l], (in[4*(x+i)+l] * (color[l] + color1[l] * curr_density[i])));
+            res[l] = in[4*(x+i)+l] * (color[l] + color1[l] * curr_density[i]);
           }
           // use streaming writes to eliminate the memory reads from loading cache lines
           copy_pixel_nontemporal(out + 4*(x+i), res);
@@ -908,7 +881,7 @@ void process(dt_iop_module_t *self,
         dt_aligned_pixel_t res;	// the compiler will optimize this into a register
         for_each_channel(l, aligned(in : 16))
         {
-          res[l] = MAX(zero[l], (in[4*x+l] * (color[l] + color1[l] * curr_density)));
+          res[l] = in[4*x+l] * (color[l] + color1[l] * curr_density);
         }
         // use streaming writes to eliminate the memory reads from loading cache lines
         copy_pixel_nontemporal(out + 4*x, res);