From aba483362bbcd0356c35e715402b0021c54348e0 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Tue, 3 Mar 2026 07:33:34 +0100
Subject: [PATCH 01/13] Ensure piece sensor patterns

Also use default colorspace for this
---
 src/develop/pixelpipe_hb.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/develop/pixelpipe_hb.c b/src/develop/pixelpipe_hb.c
index dad1946df664..2197bd80696c 100644
--- a/src/develop/pixelpipe_hb.c
+++ b/src/develop/pixelpipe_hb.c
@@ -1236,7 +1236,8 @@ static inline gboolean _module_pipe_stop(dt_dev_pixelpipe_t *pipe, float *input)
 void dt_dev_prepare_piece_cfa(dt_dev_pixelpipe_iop_t *piece, const dt_iop_roi_t *roi)
 {
   dt_iop_module_t *module = piece->module;
-  if(module && module->input_colorspace(module, piece->pipe, piece) == IOP_CS_RAW)
+  if(module && (module->input_colorspace(module, piece->pipe, piece) == IOP_CS_RAW
+              || module->default_colorspace(module, piece->pipe, piece) == IOP_CS_RAW))
   {
     piece->filters = dt_rawspeed_crop_dcraw_filters(piece->pipe->dsc.filters, roi->x, roi->y);
     for(int ii = 0; ii < 6; ++ii)

From ed4578e4eedba7c960c7db6ca55bad84584c60d1 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Sat, 7 Mar 2026 10:02:34 +0100
Subject: [PATCH 02/13] Reorganize some demosaic defines

Moved the demosaic specific #define to shared darktable.h where they belong.
Also added some comments.
---
 src/common/darktable.h       | 32 ++++++++++++++++++++++++++++++++
 src/iop/demosaic.c           |  5 -----
 src/iop/demosaicing/amaze.cc |  5 -----
 src/iop/demosaicing/basics.c |  1 +
 src/iop/demosaicing/xtrans.c |  7 +++----
 5 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/src/common/darktable.h b/src/common/darktable.h
index 0b8baa93809f..44700bb14602 100644
--- a/src/common/darktable.h
+++ b/src/common/darktable.h
@@ -249,6 +249,38 @@ char *dt_version_major_minor();
 #define INVPHI 0.61803398874989479F
 #endif
 
+
+/* Some CPU demosaicers do internal tiling, this decreases mem pressure while
+   increasing performance as processed data is available in cpu cache.
+   The default values have been calculated for a cachesize of 1MB per thread,
+   this is mostly still valid in 2026 but you may tune for performance.
+
+   Due to internal code AMAZETS should be a multiple of 32, for RCD and LMMSE
+   a multiple of 8.
+
+   Note: for the curious, if we calculate the tilesizes at runtime we loose the
+   performance gain as the compiler can't optimize that much.
+*/
+#ifndef DT_RCD_TILESIZE
+#define DT_RCD_TILESIZE 112
+#endif
+
+#ifndef DT_LMMSE_TILESIZE
+#define DT_LMMSE_TILESIZE 136
+#endif
+
+#ifndef AMAZETS
+#define AMAZETS 160
+#endif
+
+#ifndef DT_MARKESTEIJN_TS
+#define DT_MARKESTEIJN_TS 122
+#endif
+
+#ifndef DT_FDC_TS
+#define DT_FDC_TS 122
+#endif
+
 // NaN-safe clamping (NaN compares false, and will thus result in H)
 #define CLAMPS(A, L, H) ((A) > (L) ? ((A) < (H) ? (A) : (H)) : (L))
 
diff --git a/src/iop/demosaic.c b/src/iop/demosaic.c
index 34e91fba86dd..e0254705b294 100644
--- a/src/iop/demosaic.c
+++ b/src/iop/demosaic.c
@@ -51,11 +51,6 @@ DT_MODULE_INTROSPECTION(6, dt_iop_demosaic_params_t)
 
 #define DT_DEMOSAIC_XTRANS 1024 // masks for non-Bayer demosaic ops
 #define DT_DEMOSAIC_DUAL 2048   // masks for dual demosaicing methods
-#define DT_REDUCESIZE_MIN 64
-
-// these are highly depending on CPU architecture (cache size)
-#define DT_RCD_TILESIZE 112
-#define DT_LMMSE_TILESIZE 136
 
 typedef enum dt_iop_demosaic_method_t
 {
diff --git a/src/iop/demosaicing/amaze.cc b/src/iop/demosaicing/amaze.cc
index dc2c4dbd5f58..64ce4b826735 100644
--- a/src/iop/demosaicing/amaze.cc
+++ b/src/iop/demosaicing/amaze.cc
@@ -135,11 +135,6 @@ void amaze_demosaic(const float *const in,
 {
   const float clip_pt8 = 0.8f * clip_pt;
 
-// this allows to pass AMAZETS to the code. On some machines larger AMAZETS is faster
-// If AMAZETS is undefined it will be set to 160, which is the fastest on machines with 1GB cache per thread.
-#ifndef AMAZETS
-#define AMAZETS 160
-#endif
   // Tile size; the image is processed in square tiles to lower memory requirements and facilitate
   // multi-threading
   // We assure that Tile size is a multiple of 32 in the range [96;992]
diff --git a/src/iop/demosaicing/basics.c b/src/iop/demosaicing/basics.c
index 851d880cd6c5..7ffab019b172 100644
--- a/src/iop/demosaicing/basics.c
+++ b/src/iop/demosaicing/basics.c
@@ -298,6 +298,7 @@ static int color_smoothing_cl(const dt_iop_module_t *self,
   return err;
 }
 
+#define DT_REDUCESIZE_MIN 64
 static int green_equilibration_cl(const dt_iop_module_t *self,
                                   const dt_dev_pixelpipe_iop_t *piece,
                                   cl_mem dev_in,
diff --git a/src/iop/demosaicing/xtrans.c b/src/iop/demosaicing/xtrans.c
index c08991062389..054547ec3c44 100644
--- a/src/iop/demosaicing/xtrans.c
+++ b/src/iop/demosaicing/xtrans.c
@@ -1,6 +1,6 @@
 /*
     This file is part of darktable,
-    Copyright (C) 2010-2025 darktable developers.
+    Copyright (C) 2010-2026 darktable developers.
 
     darktable is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -18,8 +18,7 @@
 
 // xtrans_interpolate adapted from dcraw 9.20
 
-// tile size, optimized to keep data in L2 cache
-#define TS 122
+#define TS DT_MARKESTEIJN_TS
 
 #define PAD_G1_G3 3
 #define PAD_G_INTERP 3
@@ -512,7 +511,7 @@ static void xtrans_markesteijn_interpolate(float *out,
 
 #undef TS
 
-#define TS 122
+#define TS DT_FDC_TS
 static void xtrans_fdc_interpolate(float *out,
                                    const float *const in,
                                    const int width,

From 1632494995b192afe48a01c586716263ba74aaf5 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Wed, 4 Mar 2026 09:14:08 +0100
Subject: [PATCH 03/13] Support OpenCL diffdumps for Lab

When writing pfm diff files for a module writing out IOP_CS_LAB we convert
data to linear RGB.
---
 src/develop/pixelpipe_hb.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/develop/pixelpipe_hb.c b/src/develop/pixelpipe_hb.c
index 2197bd80696c..8d46100bf354 100644
--- a/src/develop/pixelpipe_hb.c
+++ b/src/develop/pixelpipe_hb.c
@@ -2336,6 +2336,17 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe,
                     if(terr == CL_SUCCESS)
                     {
                       module->process(module, piece, clindata, cpudata, &roi_in, roi_out);
+                      if(cst_out == IOP_CS_LAB)
+                      {
+                        for(size_t k = 0; k < (size_t)ow * oh * 4; k+=4)
+                        {
+                          dt_aligned_pixel_t XYZ;
+                          dt_Lab_to_XYZ(cpudata + k, XYZ);
+                          dt_XYZ_to_linearRGB(XYZ, cpudata + k);
+                          dt_Lab_to_XYZ(cloutdata + k, XYZ);
+                          dt_XYZ_to_linearRGB(XYZ, cloutdata + k);
+                        }
+                      }
                       dt_dump_pipe_diff_pfm(module->op, cloutdata, cpudata, ow, oh, cho,
                                             dt_dev_pixelpipe_type_to_str(pipe->type));                  }
                   }

From caee4de0d77ba822be9a88f0a4d56a5bc677b98f Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Sun, 8 Mar 2026 11:36:01 +0100
Subject: [PATCH 04/13] Fix RCD, PPG, box3 and LMMSE demosaicers

1. The internal tiling for RCD CPU code requires a border of 10 to avoid subtle instabilities at the
   overlapping parts. Fixed comments accordingly.
2. The PPG demosaicer definitely requires data to be unclipped in internal algorithms thus output
   has been improved both for CPU and OpenCL code, almost no detectable differences any more.
3. As RCD used a slightly modified variant of PPG (we don't need to calculate data outside the border),
   code could be refactored for CPU and OpenCL.
4. In RCD and LMMSE we normalize data so we down/upscale to ensure this.
   We now use safer scalers reducing artifacts and possibly don't loose precision in CPU/OpenCL code.
   For RCD the normalizing is not strictly required ...
5. Demosaicers don't have to clip output to non-negatives
6. Introduce and first use of read_imagef helper inline functions
---
 data/kernels/common.h        |  20 +++
 data/kernels/demosaic_ppg.cl | 164 ++++++++++++------------
 data/kernels/demosaic_rcd.cl | 225 +++------------------------------
 src/iop/demosaic.c           |  13 +-
 src/iop/demosaicing/basics.c |   6 +-
 src/iop/demosaicing/lmmse.c  |   2 +-
 src/iop/demosaicing/ppg.c    | 152 ++++++++++++----------
 src/iop/demosaicing/rcd.c    | 238 ++++-------------------------------
 8 files changed, 242 insertions(+), 578 deletions(-)

diff --git a/data/kernels/common.h b/data/kernels/common.h
index 234f00b44149..d9abcb2918f7 100644
--- a/data/kernels/common.h
+++ b/data/kernels/common.h
@@ -159,3 +159,23 @@ static inline float clipf(const float a)
 {
   return clamp(a, 0.0f, 1.0f);
 }
+
+/* Some inline functions making life easier when reading photosites
+   or pixels from cl_mem images.
+   As we had images with NaNs in data and at least AMD systems suffered
+   as reading NaNs instead of falling back to 0 (nvidia seems to do do)
+   we always read data and make sure values are valid floats.
+*/
+static inline float readsingle(read_only image2d_t in, int col, int row)
+{
+  return fmax(-FLT_MAX, read_imagef(in, sampleri, (int2)(col, row)).x);
+}
+
+static inline float4 readpixel(read_only image2d_t in, int col, int row)
+{
+  return fmax(-FLT_MAX, read_imagef(in, sampleri, (int2)(col, row)));
+}
+static inline float readalpha(read_only image2d_t in, int col, int row)
+{
+  return clipf(read_imagef(in, sampleri, (int2)(col, row)).w);
+}
diff --git a/data/kernels/demosaic_ppg.cl b/data/kernels/demosaic_ppg.cl
index ce88484fd87f..8ef11616cab0 100644
--- a/data/kernels/demosaic_ppg.cl
+++ b/data/kernels/demosaic_ppg.cl
@@ -544,11 +544,16 @@ clip_and_zoom_demosaic_half_size(__read_only image2d_t in,
  * in (float) or (float4).x -> out (float4)
  */
 kernel void
-ppg_demosaic_green (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
-                    const unsigned int filters, local float *buffer)
+ppg_demosaic_green (read_only image2d_t in,
+                    write_only image2d_t out,
+                    const int width,
+                    const int height,
+                    const unsigned int filters,
+                    local float *buffer,
+                    const int border)
 {
-  const int x = get_global_id(0);
-  const int y = get_global_id(1);
+  const int col = get_global_id(0);
+  const int row = get_global_id(1);
   const int xlsz = get_local_size(0);
   const int ylsz = get_local_size(1);
   const int xlid = get_local_id(0);
@@ -577,7 +582,7 @@ ppg_demosaic_green (read_only image2d_t in, write_only image2d_t out, const int
     if(bufidx >= maxbuf) continue;
     const int xx = xul + bufidx % stride;
     const int yy = yul + bufidx / stride;
-    buffer[bufidx] = read_imagef(in, sampleri, (int2)(xx, yy)).x;
+    buffer[bufidx] = readsingle(in, xx, yy);
   }
 
   // center buffer around current x,y-Pixel
@@ -585,23 +590,22 @@ ppg_demosaic_green (read_only image2d_t in, write_only image2d_t out, const int
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  // make sure we dont write the outermost 3 pixels
-  if(x >= width - 3 || x < 3 || y >= height - 3 || y < 3) return;
+  if(col >= width - 3 || col < 3 || row >= height - 3 || row < 3) return;
+  if(col >= border && col < width - border && row >= border && row < height - border) return;
+
   // process all non-green pixels
-  const int row = y;
-  const int col = x;
   const int c = FC(row, col, filters);
-  float4 color; // output color
+  float4 color = 0.0f; // output color
 
   const float pc = buffer[0];
 
-  if     (c == 0) color.x = pc; // red
-  else if(c == 1) color.y = pc; // green1
-  else if(c == 2) color.z = pc; // blue
+  if     (c == RED) color.x = pc;
+  else if(c == GREEN) color.y = pc;
+  else if(c == BLUE) color.z = pc;
   else            color.y = pc; // green2
 
   // fill green layer for red and blue pixels:
-  if(c == 0 || c == 2)
+  if(c == RED || c == BLUE)
   {
     // look up horizontal and vertical neighbours, sharpened weight:
     const float pym  = buffer[-1 * stride];
@@ -640,7 +644,7 @@ ppg_demosaic_green (read_only image2d_t in, write_only image2d_t out, const int
       color.y = fmax(fmin(guessx*0.25f, M), m);
     }
   }
-  write_imagef (out, (int2)(x, y), fmax(color, 0.0f));
+  write_imagef (out, (int2)(col, row), color);
 }
 
 
@@ -649,12 +653,17 @@ ppg_demosaic_green (read_only image2d_t in, write_only image2d_t out, const int
  * in (float4) -> out (float4)
  */
 kernel void
-ppg_demosaic_redblue (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
-                      const unsigned int filters, local float4 *buffer)
+ppg_demosaic_redblue (read_only image2d_t in,
+                      write_only image2d_t out,
+                      const int width,
+                      const int height,
+                      const unsigned int filters,
+                      local float4 *buffer,
+                      const int border)
 {
   // image in contains full green and sparse r b
-  const int x = get_global_id(0);
-  const int y = get_global_id(1);
+  const int col = get_global_id(0);
+  const int row = get_global_id(1);
   const int xlsz = get_local_size(0);
   const int ylsz = get_local_size(1);
   const int xlid = get_local_id(0);
@@ -683,7 +692,7 @@ ppg_demosaic_redblue (read_only image2d_t in, write_only image2d_t out, const in
     if(bufidx >= maxbuf) continue;
     const int xx = xul + bufidx % stride;
     const int yy = yul + bufidx / stride;
-    buffer[bufidx] = read_imagef(in, sampleri, (int2)(xx, yy));
+    buffer[bufidx] = readpixel(in, xx, yy);
   }
 
   // center buffer around current x,y-Pixel
@@ -691,69 +700,66 @@ ppg_demosaic_redblue (read_only image2d_t in, write_only image2d_t out, const in
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  if(x >= width || y >= height) return;
-  const int row = y;
-  const int col = x;
+  if(col >= width || row >= height) return;
+  if(col >= border && col < width - border && row >= border && row < height - border) return;
+
   const int c = FC(row, col, filters);
   float4 color = buffer[0];
-  if(x == 0 || y == 0 || x == (width-1) || y == (height-1))
+  if(row > 0 && col > 0 && col < width - 1 && row < height - 1)
   {
-    write_imagef (out, (int2)(x, y), fmax(color, 0.0f));
-    return;
-  }
-
-  if(c == 1 || c == 3)
-  { // calculate red and blue for green pixels:
-    // need 4-nbhood:
-    const float4 nt = buffer[-stride];
-    const float4 nb = buffer[ stride];
-    const float4 nl = buffer[-1];
-    const float4 nr = buffer[ 1];
-    if(FC(row, col+1, filters) == 0) // red nb in same row
-    {
-      color.z = (nt.z + nb.z + 2.0f*color.y - nt.y - nb.y)*0.5f;
-      color.x = (nl.x + nr.x + 2.0f*color.y - nl.y - nr.y)*0.5f;
+    if(c == GREEN || c == 3)
+    { // calculate red and blue for green pixels:
+      // need 4-nbhood:
+      const float4 nt = buffer[-stride];
+      const float4 nb = buffer[ stride];
+      const float4 nl = buffer[-1];
+      const float4 nr = buffer[ 1];
+      if(FC(row, col+1, filters) == RED) // red nb in same row
+      {
+        color.z = (nt.z + nb.z + 2.0f*color.y - nt.y - nb.y)*0.5f;
+        color.x = (nl.x + nr.x + 2.0f*color.y - nl.y - nr.y)*0.5f;
+      }
+      else
+      { // blue nb
+        color.x = (nt.x + nb.x + 2.0f*color.y - nt.y - nb.y)*0.5f;
+        color.z = (nl.z + nr.z + 2.0f*color.y - nl.y - nr.y)*0.5f;
+      }
     }
     else
-    { // blue nb
-      color.x = (nt.x + nb.x + 2.0f*color.y - nt.y - nb.y)*0.5f;
-      color.z = (nl.z + nr.z + 2.0f*color.y - nl.y - nr.y)*0.5f;
-    }
-  }
-  else
-  {
-    // get 4-star-nbhood:
-    const float4 ntl = buffer[-stride - 1];
-    const float4 ntr = buffer[-stride + 1];
-    const float4 nbl = buffer[ stride - 1];
-    const float4 nbr = buffer[ stride + 1];
-
-    if(c == 0)
-    { // red pixel, fill blue:
-      const float diff1  = fabs(ntl.z - nbr.z) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y);
-      const float guess1 = ntl.z + nbr.z + 2.0f*color.y - ntl.y - nbr.y;
-      const float diff2  = fabs(ntr.z - nbl.z) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y);
-      const float guess2 = ntr.z + nbl.z + 2.0f*color.y - ntr.y - nbl.y;
-      if     (diff1 > diff2) color.z = guess2 * 0.5f;
-      else if(diff1 < diff2) color.z = guess1 * 0.5f;
-      else color.z = (guess1 + guess2)*0.25f;
-    }
-    else // c == 2, blue pixel, fill red:
     {
-      const float diff1  = fabs(ntl.x - nbr.x) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y);
-      const float guess1 = ntl.x + nbr.x + 2.0f*color.y - ntl.y - nbr.y;
-      const float diff2  = fabs(ntr.x - nbl.x) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y);
-      const float guess2 = ntr.x + nbl.x + 2.0f*color.y - ntr.y - nbl.y;
-      if     (diff1 > diff2) color.x = guess2 * 0.5f;
-      else if(diff1 < diff2) color.x = guess1 * 0.5f;
-      else color.x = (guess1 + guess2)*0.25f;
+      // get 4-star-nbhood:
+      const float4 ntl = buffer[-stride - 1];
+      const float4 ntr = buffer[-stride + 1];
+      const float4 nbl = buffer[ stride - 1];
+      const float4 nbr = buffer[ stride + 1];
+
+      if(c == RED)
+      { // red pixel, fill blue:
+        const float diff1  = fabs(ntl.z - nbr.z) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y);
+        const float guess1 = ntl.z + nbr.z + 2.0f*color.y - ntl.y - nbr.y;
+        const float diff2  = fabs(ntr.z - nbl.z) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y);
+        const float guess2 = ntr.z + nbl.z + 2.0f*color.y - ntr.y - nbl.y;
+        if     (diff1 > diff2) color.z = guess2 * 0.5f;
+        else if(diff1 < diff2) color.z = guess1 * 0.5f;
+        else color.z = (guess1 + guess2)*0.25f;
+      }
+      else // c == 2, blue pixel, fill red:
+      {
+        const float diff1  = fabs(ntl.x - nbr.x) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y);
+        const float guess1 = ntl.x + nbr.x + 2.0f*color.y - ntl.y - nbr.y;
+        const float diff2  = fabs(ntr.x - nbl.x) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y);
+        const float guess2 = ntr.x + nbl.x + 2.0f*color.y - ntr.y - nbl.y;
+        if     (diff1 > diff2) color.x = guess2 * 0.5f;
+        else if(diff1 < diff2) color.x = guess1 * 0.5f;
+        else color.x = (guess1 + guess2)*0.25f;
+      }
     }
   }
-  write_imagef (out, (int2)(x, y), fmax(color, 0.0f));
+  write_imagef (out, (int2)(col, row), color);
 }
 
 /**
- * Demosaic image border
+ * Demosaic image border for the three outermost photosites
  */
 kernel void
 border_interpolate(read_only image2d_t in, write_only image2d_t out, const int width, const int height, const unsigned int filters)
@@ -762,27 +768,23 @@ border_interpolate(read_only image2d_t in, write_only image2d_t out, const int w
   const int y = get_global_id(1);
 
   if(x >= width || y >= height) return;
-
-  const int avgwindow = 1;
-  const int border = 3;
-
-  if(x >= border && x < width-border && y >= border && y < height-border) return;
+  if(x >= 3 && x < width-3 && y >= 3 && y < height-3) return;
 
   float4 o;
   float sum[4] = { 0.0f };
   int count[4] = { 0 };
 
-  for (int j=y-avgwindow; j<=y+avgwindow; j++) for (int i=x-avgwindow; i<=x+avgwindow; i++)
+  for(int j=y-1; j<=y+1; j++) for (int i=x-1; i<=x+1; i++)
   {
-    if (j>=0 && i>=0 && j<height && i<width)
+    if(j>=0 && i>=0 && j<height && i<width)
     {
       const int f = FC(j,i,filters);
-      sum[f] += fmax(0.0f, read_imagef(in, sampleri, (int2)(i, j)).x);
+      sum[f] += readsingle(in, i, j);
       count[f]++;
     }
   }
 
-  const float i = fmax(0.0f, read_imagef(in, sampleri, (int2)(x, y)).x);
+  const float i = readsingle(in, x, y);
   o.x = count[0] > 0 ? sum[0]/count[0] : i;
   o.y = count[1]+count[3] > 0 ? (sum[1]+sum[3])/(count[1]+count[3]) : i;
   o.z = count[2] > 0 ? sum[2]/count[2] : i;
diff --git a/data/kernels/demosaic_rcd.cl b/data/kernels/demosaic_rcd.cl
index a6202149cd13..56ad06a26c2e 100644
--- a/data/kernels/demosaic_rcd.cl
+++ b/data/kernels/demosaic_rcd.cl
@@ -32,7 +32,7 @@ __kernel void rcd_populate (__read_only image2d_t in, global float *cfa, global
   const int col = get_global_id(0);
   const int row = get_global_id(1);
   if(col >= w || row >= height) return;
-  const float val = scale * fmax(0.0f, read_imagef(in, sampleri, (int2)(col, row)).x);
+  const float val = scale * fmax(0.0f, readsingle(in, col, row));
   const int color = FC(row, col, filters);
 
   global float *rgbcol = rgb0;
@@ -51,7 +51,7 @@ __kernel void rcd_write_output (__write_only image2d_t out, global float *rgb0,
   if(!(col >= border && col < w - border && row >= border && row < height - border)) return;
   const int idx = mad24(row, w, col);
 
-  write_imagef(out, (int2)(col, row), (float4)(fmax(scale * rgb0[idx], 0.0f), fmax(scale * rgb1[idx], 0.0f), fmax(scale * rgb2[idx], 0.0f), 0.0f));
+  write_imagef(out, (int2)(col, row), (float4)(scale * rgb0[idx], scale * rgb1[idx], scale * rgb2[idx], 0.0f));
 }
 
 #define eps 1e-5f              // Tolerance to avoid dividing by zero
@@ -278,9 +278,9 @@ __kernel void write_blended_dual(__read_only image2d_t high,
   const int row = get_global_id(1);
   if((col >= w) || (row >= height)) return;
 
-  const float4 high_val = read_imagef(high, sampleri, (int2)(col, row));
-  const float4 low_val = read_imagef(low, sampleri, (int2)(col, row));
-  const float4 blender = clipf(mask[mad24(row, w, col)]);
+  const float4 high_val = readpixel(high, col, row);
+  const float4 low_val = readpixel(low, col, row);
+  const float4 blender = (float4)clipf(mask[mad24(row, w, col)]);
   float4 data = mix(low_val, high_val, blender);
   data.w = showmask ? blender.x : 0.0f;
   write_imagef(out, (int2)(col, row), data);
@@ -297,8 +297,8 @@ __kernel void calc_Y0_mask(global float *mask,
   if((col >= w) || (row >= height)) return;
   const int idx = mad24(row, w, col);
 
-  const float4 pt = wb * fmax(0.0f, read_imagef(in, sampleri, (int2)(col, row)));
-  mask[idx] = dtcl_sqrt(0.33333333f *(pt.x + pt.y + pt.z));
+  const float4 pt = wb * fmax(0.0f, readpixel(in, col, row));
+  mask[idx] = dtcl_sqrt(0.33333333f * (pt.x + pt.y + pt.z));
 }
 
 __kernel void calc_scharr_mask(global float *in, global float *out, const int w, const int height)
@@ -331,203 +331,6 @@ __kernel void calc_detail_blend(global float *in, global float *out, const int w
   out[idx] = detail ? blend : 1.0f - blend;
 }
 
-kernel void rcd_border_green(read_only image2d_t in, write_only image2d_t out, const int width, const int height,
-                    const unsigned int filters, local float *buffer, const int border)
-{
-  const int col = get_global_id(0);
-  const int row = get_global_id(1);
-  const int xlsz = get_local_size(0);
-  const int ylsz = get_local_size(1);
-  const int xlid = get_local_id(0);
-  const int ylid = get_local_id(1);
-  const int xgid = get_group_id(0);
-  const int ygid = get_group_id(1);
-
-  // individual control variable in this work group and the work group size
-  const int l = mad24(ylid, xlsz, xlid);
-  const int lsz = mul24(xlsz, ylsz);
-
-  // stride and maximum capacity of local buffer
-  // cells of 1*float per pixel with a surrounding border of 3 cells
-  const int stride = xlsz + 2*3;
-  const int maxbuf = mul24(stride, ylsz + 2*3);
-
-  // coordinates of top left pixel of buffer
-  // this is 3 pixel left and above of the work group origin
-  const int xul = mul24(xgid, xlsz) - 3;
-  const int yul = mul24(ygid, ylsz) - 3;
-
-  // populate local memory buffer
-  for(int n = 0; n <= maxbuf/lsz; n++)
-  {
-    const int bufidx = mad24(n, lsz, l);
-    if(bufidx >= maxbuf) continue;
-    const int xx = xul + bufidx % stride;
-    const int yy = yul + bufidx / stride;
-    buffer[bufidx] = fmax(0.0f, read_imagef(in, sampleri, (int2)(xx, yy)).x);
-  }
-
-  // center buffer around current x,y-Pixel
-  buffer += mad24(ylid + 3, stride, xlid + 3);
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  if(col >= width - 3 || col < 3 || row >= height - 3 || row < 3) return;
-  if(col >= border && col < width - border && row >= border && row < height - border) return;
-
-  // process all non-green pixels
-  const int c = FC(row, col, filters);
-  float4 color = 0.0f; // output color
-
-  const float pc = buffer[0];
-
-  if     (c == RED) color.x = pc;
-  else if(c == GREEN) color.y = pc;
-  else if(c == BLUE) color.z = pc;
-  else            color.y = pc; // green2
-
-  // fill green layer for red and blue pixels:
-  if(c == RED || c == BLUE)
-  {
-    // look up horizontal and vertical neighbours, sharpened weight:
-    const float pym  = buffer[-1 * stride];
-    const float pym2 = buffer[-2 * stride];
-    const float pym3 = buffer[-3 * stride];
-    const float pyM  = buffer[ 1 * stride];
-    const float pyM2 = buffer[ 2 * stride];
-    const float pyM3 = buffer[ 3 * stride];
-    const float pxm  = buffer[-1];
-    const float pxm2 = buffer[-2];
-    const float pxm3 = buffer[-3];
-    const float pxM  = buffer[ 1];
-    const float pxM2 = buffer[ 2];
-    const float pxM3 = buffer[ 3];
-    const float guessx = (pxm + pc + pxM) * 2.0f - pxM2 - pxm2;
-    const float diffx  = (fabs(pxm2 - pc) +
-                          fabs(pxM2 - pc) +
-                          fabs(pxm  - pxM)) * 3.0f +
-                         (fabs(pxM3 - pxM) + fabs(pxm3 - pxm)) * 2.0f;
-    const float guessy = (pym + pc + pyM) * 2.0f - pyM2 - pym2;
-    const float diffy  = (fabs(pym2 - pc) +
-                          fabs(pyM2 - pc) +
-                          fabs(pym  - pyM)) * 3.0f +
-                         (fabs(pyM3 - pyM) + fabs(pym3 - pym)) * 2.0f;
-    if(diffx > diffy)
-    {
-      // use guessy
-      const float m = fmin(pym, pyM);
-      const float M = fmax(pym, pyM);
-      color.y = fmax(fmin(guessy*0.25f, M), m);
-    }
-    else
-    {
-      const float m = fmin(pxm, pxM);
-      const float M = fmax(pxm, pxM);
-      color.y = fmax(fmin(guessx*0.25f, M), m);
-    }
-  }
-  write_imagef (out, (int2)(col, row), fmax(color, 0.0f));
-}
-kernel void rcd_border_redblue(read_only image2d_t in, write_only image2d_t out, const int width, const int height,
-                      const unsigned int filters, local float4 *buffer, const int border)
-{
-  // image in contains full green and sparse r b
-  const int col = get_global_id(0);
-  const int row = get_global_id(1);
-  const int xlsz = get_local_size(0);
-  const int ylsz = get_local_size(1);
-  const int xlid = get_local_id(0);
-  const int ylid = get_local_id(1);
-  const int xgid = get_group_id(0);
-  const int ygid = get_group_id(1);
-
-  // individual control variable in this work group and the work group size
-  const int l = mad24(ylid, xlsz, xlid);
-  const int lsz = mul24(xlsz, ylsz);
-
-  // stride and maximum capacity of local buffer
-  // cells of float4 per pixel with a surrounding border of 1 cell
-  const int stride = xlsz + 2;
-  const int maxbuf = mul24(stride, ylsz + 2);
-
-  // coordinates of top left pixel of buffer
-  // this is 1 pixel left and above of the work group origin
-  const int xul = mul24(xgid, xlsz) - 1;
-  const int yul = mul24(ygid, ylsz) - 1;
-
-  // populate local memory buffer
-  for(int n = 0; n <= maxbuf/lsz; n++)
-  {
-    const int bufidx = mad24(n, lsz, l);
-    if(bufidx >= maxbuf) continue;
-    const int xx = xul + bufidx % stride;
-    const int yy = yul + bufidx / stride;
-    buffer[bufidx] = fmax(0.0f, read_imagef(in, sampleri, (int2)(xx, yy)));
-  }
-
-  // center buffer around current x,y-Pixel
-  buffer += mad24(ylid + 1, stride, xlid + 1);
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  if(col >= width || row >= height) return;
-  if(col >= border && col < width - border && row >= border && row < height - border) return;
-
-  const int c = FC(row, col, filters);
-  float4 color = buffer[0];
-  if(row > 0 && col > 0 && col < width - 1 && row < height - 1)
-  {
-    if(c == GREEN || c == 3)
-    { // calculate red and blue for green pixels:
-      // need 4-nbhood:
-      const float4 nt = buffer[-stride];
-      const float4 nb = buffer[ stride];
-      const float4 nl = buffer[-1];
-      const float4 nr = buffer[ 1];
-      if(FC(row, col+1, filters) == RED) // red nb in same row
-      {
-        color.z = (nt.z + nb.z + 2.0f*color.y - nt.y - nb.y)*0.5f;
-        color.x = (nl.x + nr.x + 2.0f*color.y - nl.y - nr.y)*0.5f;
-      }
-      else
-      { // blue nb
-        color.x = (nt.x + nb.x + 2.0f*color.y - nt.y - nb.y)*0.5f;
-        color.z = (nl.z + nr.z + 2.0f*color.y - nl.y - nr.y)*0.5f;
-      }
-    }
-    else
-    {
-      // get 4-star-nbhood:
-      const float4 ntl = buffer[-stride - 1];
-      const float4 ntr = buffer[-stride + 1];
-      const float4 nbl = buffer[ stride - 1];
-      const float4 nbr = buffer[ stride + 1];
-
-      if(c == RED)
-      { // red pixel, fill blue:
-        const float diff1  = fabs(ntl.z - nbr.z) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y);
-        const float guess1 = ntl.z + nbr.z + 2.0f*color.y - ntl.y - nbr.y;
-        const float diff2  = fabs(ntr.z - nbl.z) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y);
-        const float guess2 = ntr.z + nbl.z + 2.0f*color.y - ntr.y - nbl.y;
-        if     (diff1 > diff2) color.z = guess2 * 0.5f;
-        else if(diff1 < diff2) color.z = guess1 * 0.5f;
-        else color.z = (guess1 + guess2)*0.25f;
-      }
-      else // c == 2, blue pixel, fill red:
-      {
-        const float diff1  = fabs(ntl.x - nbr.x) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y);
-        const float guess1 = ntl.x + nbr.x + 2.0f*color.y - ntl.y - nbr.y;
-        const float diff2  = fabs(ntr.x - nbl.x) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y);
-        const float guess2 = ntr.x + nbl.x + 2.0f*color.y - ntr.y - nbl.y;
-        if     (diff1 > diff2) color.x = guess2 * 0.5f;
-        else if(diff1 < diff2) color.x = guess1 * 0.5f;
-        else color.x = (guess1 + guess2)*0.25f;
-      }
-    }
-  }
-  write_imagef (out, (int2)(col, row), fmax(color, 0.0f));
-}
-
 kernel void demosaic_box3(read_only image2d_t in,
                           write_only image2d_t out,
                           const int width,
@@ -547,7 +350,7 @@ kernel void demosaic_box3(read_only image2d_t in,
       if(x >= 0 && y >= 0 && x < width && y < height)
       {
         const int color = fcol(y, x, filters, xtrans);
-        sum[color] += fmax(0.0f, read_imagef(in, sampleri, (int2)(x, y)).x);
+        sum[color] += readsingle(in, x, y);
         cnt[color] += 1.0f;
       }
     }
@@ -556,3 +359,15 @@ kernel void demosaic_box3(read_only image2d_t in,
   write_imagef(out, (int2)(col, row), rgb);
 }
 
+kernel void
+image_positive(read_only image2d_t in, write_only image2d_t out, const int width, const int height)
+{
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+
+  if(x >= width || y >= height) return;
+
+  float4 pixel = fmax(0.0f, readpixel(in, x, y));
+  write_imagef(out, (int2)(x, y), pixel);
+}
+
diff --git a/src/iop/demosaic.c b/src/iop/demosaic.c
index e0254705b294..78521d165868 100644
--- a/src/iop/demosaic.c
+++ b/src/iop/demosaic.c
@@ -230,8 +230,6 @@ typedef struct dt_iop_demosaic_global_data_t
   int kernel_rcd_step_4_2;
   int kernel_rcd_step_5_1;
   int kernel_rcd_step_5_2;
-  int kernel_rcd_border_redblue;
-  int kernel_rcd_border_green;
   int kernel_demosaic_box3;
   int kernel_write_blended_dual;
   int gaussian_9x9_mul;
@@ -721,7 +719,10 @@ void process(dt_iop_module_t *self,
   const gboolean do_capture = !passthru &&  !is_4bayer && !show_dual && !run_fast && d->cs_enabled;
   const gboolean greens = is_bayer && d->green_eq != DT_IOP_GREEN_EQ_NO && no_masking && !run_fast && !true_monochrome;
 
-  const float procmax = dt_iop_get_processed_maximum(piece);
+  float procmax = dt_iop_get_processed_maximum(piece);
+  if(procmax < 1.5f) procmax = 2.0f;
+  else if(procmax < 3.0f) procmax = 4.0f;
+  else procmax = 8.0f;
   const float procmin = dt_iop_get_processed_minimum(piece);
   const int exif_iso = img->exif_iso;
 
@@ -873,7 +874,7 @@ void process(dt_iop_module_t *self,
         else if(method == DT_IOP_DEMOSAIC_LMMSE)
           lmmse_demosaic(t_out, t_in, width, t_rows, filters, d->lmmse_refine, procmax);
         else if(method != DT_IOP_DEMOSAIC_AMAZE)
-          demosaic_ppg(t_out, t_in, width, t_rows, filters, d->median_thrs);
+          demosaic_ppg(t_out, t_in, width, t_rows, filters, d->median_thrs, 100000);
         else
           amaze_demosaic(t_in, t_out, width, t_rows, filters, procmin);
       }
@@ -1281,8 +1282,6 @@ void init_global(dt_iop_module_so_t *self)
   gd->kernel_rcd_step_4_2 = dt_opencl_create_kernel(rcd, "rcd_step_4_2");
   gd->kernel_rcd_step_5_1 = dt_opencl_create_kernel(rcd, "rcd_step_5_1");
   gd->kernel_rcd_step_5_2 = dt_opencl_create_kernel(rcd, "rcd_step_5_2");
-  gd->kernel_rcd_border_redblue = dt_opencl_create_kernel(rcd, "rcd_border_redblue");
-  gd->kernel_rcd_border_green = dt_opencl_create_kernel(rcd, "rcd_border_green");
   gd->kernel_demosaic_box3 = dt_opencl_create_kernel(rcd, "demosaic_box3");
   gd->kernel_write_blended_dual  = dt_opencl_create_kernel(rcd, "write_blended_dual");
 
@@ -1348,8 +1347,6 @@ void cleanup_global(dt_iop_module_so_t *self)
   dt_opencl_free_kernel(gd->kernel_rcd_step_4_2);
   dt_opencl_free_kernel(gd->kernel_rcd_step_5_1);
   dt_opencl_free_kernel(gd->kernel_rcd_step_5_2);
-  dt_opencl_free_kernel(gd->kernel_rcd_border_redblue);
-  dt_opencl_free_kernel(gd->kernel_rcd_border_green);
   dt_opencl_free_kernel(gd->kernel_demosaic_box3);
   dt_opencl_free_kernel(gd->kernel_write_blended_dual);
   dt_opencl_free_kernel(gd->gaussian_9x9_mul);
diff --git a/src/iop/demosaicing/basics.c b/src/iop/demosaicing/basics.c
index 7ffab019b172..5a1749289afe 100644
--- a/src/iop/demosaicing/basics.c
+++ b/src/iop/demosaicing/basics.c
@@ -549,7 +549,8 @@ static int process_default_cl(const dt_iop_module_t *self,
         const size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
         err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_ppg_green, sizes, local,
           CLARG(dev_med), CLARG(dev_tmp), CLARG(width),
-          CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * (locopt.sizex + 2*3) * (locopt.sizey + 2*3)));
+          CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * (locopt.sizex + 2*3) * (locopt.sizey + 2*3)),
+          CLARGINT(100000));
         if(err != CL_SUCCESS) goto error;
       }
 
@@ -566,7 +567,8 @@ static int process_default_cl(const dt_iop_module_t *self,
         const size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
         err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_ppg_redblue, sizes, local,
           CLARG(dev_tmp), CLARG(dev_out), CLARG(width),
-          CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 2) * (locopt.sizey + 2)));
+          CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 2) * (locopt.sizey + 2)),
+          CLARGINT(100000));
         if(err != CL_SUCCESS) goto error;
       }
     }
diff --git a/src/iop/demosaicing/lmmse.c b/src/iop/demosaicing/lmmse.c
index fcc5d10e32c1..9ed6ebe9000c 100644
--- a/src/iop/demosaicing/lmmse.c
+++ b/src/iop/demosaicing/lmmse.c
@@ -125,7 +125,7 @@ static void lmmse_demosaic(float *const restrict out,
                            const dt_iop_demosaic_lmmse_t mode,
                            const float scaler)
 {
-  rcd_ppg_border(out, in, width, height, filters, BORDER_AROUND);
+  demosaic_ppg(out, in, width, height, filters, 0.0f, BORDER_AROUND);
   if(width < 2 * BORDER_AROUND || height < 2 * BORDER_AROUND)
     return;
 
diff --git a/src/iop/demosaicing/ppg.c b/src/iop/demosaicing/ppg.c
index e88c6f268824..75fa8c079a1c 100644
--- a/src/iop/demosaicing/ppg.c
+++ b/src/iop/demosaicing/ppg.c
@@ -22,9 +22,10 @@ static void demosaic_ppg(float *const out,
                          const int width,
                          const int height,
                          const uint32_t filters,
-                         const float thrs)
+                         const float thrs,
+                         const int margin)
 {
-  // border interpolate
+  // preliminary border interpolate for outermost 3 pixels
   float sum[8];
   for(int j = 0; j < height; j++)
     for(int i = 0; i < width; i++)
@@ -46,10 +47,10 @@ static void demosaic_ppg(float *const out,
       for(int c = 0; c < 3; c++)
       {
         if(c != f && sum[c + 4] > 0.0f)
-          out[4 * ((size_t)j * width + i) + c] = fmaxf(0.0f, sum[c] / sum[c + 4]);
+          out[4 * ((size_t)j * width + i) + c] = sum[c] / sum[c + 4];
         else
           out[4 * ((size_t)j * width + i) + c]
-              = fmaxf(0.0f, in[(size_t)j * width + i]);
+              = in[(size_t)j * width + i];
       }
     }
   const gboolean median = thrs > 0.0f;
@@ -60,8 +61,11 @@ static void demosaic_ppg(float *const out,
     pre_median(med_in, in, width, height, filters, 1, thrs);
     input = med_in;
   }
-// for all pixels except those in the 3 pixel border:
-// interpolate green from input into out float array, or copy color.
+
+  const int border = margin + 3;
+  // for all pixels except those in the 3 pixel border:
+  // interpolate green from input into out float array, or copy color.
+  // avoid processing if outside of given border
   DT_OMP_FOR()
   for(int j = 3; j < height - 3; j++)
   {
@@ -69,6 +73,14 @@ static void demosaic_ppg(float *const out,
     const float *buf_in = input + (size_t)width * j + 3;
     for(int i = 3; i < width - 3; i++)
     {
+      if(i == border && j >= border && j < height - border)
+      {
+        i = width - border;
+        buf = out + (size_t)4 * width * j + 4 * i;
+        buf_in = in + (size_t)width * j + i;
+      }
+      if(i == width) break;
+
       const int c = FC(j, i, filters);
       dt_aligned_pixel_t color;
       const float pc = buf_in[0];
@@ -116,86 +128,94 @@ static void demosaic_ppg(float *const out,
       color[3] = 0.0f;
 
       for_each_channel(k,aligned(buf,color:16)
-        dt_omp_nontemporal(buf)) buf[k] = fmaxf(0.0f, color[k]);
+        dt_omp_nontemporal(buf)) buf[k] = color[k];
 
       buf += 4;
       buf_in++;
     }
   }
 
-// for all pixels except the outermost row/column:
-// interpolate colors using out as input into float out array
   DT_OMP_FOR()
-  for(int j = 1; j < height - 1; j++)
+  for(int j = 0; j < height; j++)
   {
-    float *buf = out + (size_t)4 * width * j + 4;
-    for(int i = 1; i < width - 1; i++)
+    float *buf = out + (size_t)4 * width * j;
+    for(int i = 0; i < width; i++)
     {
+      if(i == margin && j >= margin && j < height - margin)
+      {
+        i = width - margin;
+        buf = out + (size_t)4 * (width * j + i);
+      }
       // also prefetch direct nbs top/bottom
-      const int c = FC(j, i, filters);
       dt_aligned_pixel_t color = { buf[0], buf[1], buf[2], buf[3] };
 
-      // fill all four pixels with correctly interpolated stuff: r/b for green1/2
-      // b for r and r for b
-      if(__builtin_expect(c & 1, 1)) // c == 1 || c == 3)
-      {
-        // calculate red and blue for green pixels:
-        // need 4-nbhood:
-        const float *nt = buf - 4 * width;
-        const float *nb = buf + 4 * width;
-        const float *nl = buf - 4;
-        const float *nr = buf + 4;
-        if(FC(j, i + 1, filters) == 0) // red nb in same row
-        {
-          color[2] = (nt[2] + nb[2] + 2.0f * color[1] - nt[1] - nb[1]) * .5f;
-          color[0] = (nl[0] + nr[0] + 2.0f * color[1] - nl[1] - nr[1]) * .5f;
-        }
-        else
-        {
-          // blue nb
-          color[0] = (nt[0] + nb[0] + 2.0f * color[1] - nt[1] - nb[1]) * .5f;
-          color[2] = (nl[2] + nr[2] + 2.0f * color[1] - nl[1] - nr[1]) * .5f;
-        }
-      }
-      else
+      // for all pixels except the outermost row/column:
+      // interpolate colors using out as input into float out array
+      // the outermost are untouched
+      // but both final results avoid negatives
+      if(j > 0 && i > 0 && i < width - 1 && j < height - 1)
       {
-        // get 4-star-nbhood:
-        const float *ntl = buf - 4 - 4 * width;
-        const float *ntr = buf + 4 - 4 * width;
-        const float *nbl = buf - 4 + 4 * width;
-        const float *nbr = buf + 4 + 4 * width;
-
-        if(c == 0)
+        const int c = FC(j, i, filters);
+        // fill all four pixels with correctly interpolated stuff: r/b for green1/2
+        // b for r and r for b
+        if(__builtin_expect(c & 1, 1)) // c == 1 || c == 3)
         {
-          // red pixel, fill blue:
-          const float diff1 = fabsf(ntl[2] - nbr[2]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]);
-          const float guess1 = ntl[2] + nbr[2] + 2.0f * color[1] - ntl[1] - nbr[1];
-          const float diff2 = fabsf(ntr[2] - nbl[2]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]);
-          const float guess2 = ntr[2] + nbl[2] + 2.0f * color[1] - ntr[1] - nbl[1];
-          if(diff1 > diff2)
-            color[2] = guess2 * .5f;
-          else if(diff1 < diff2)
-            color[2] = guess1 * .5f;
+          // calculate red and blue for green pixels:
+          // need 4-nbhood:
+          const float *nt = buf - 4 * width;
+          const float *nb = buf + 4 * width;
+          const float *nl = buf - 4;
+          const float *nr = buf + 4;
+          if(FC(j, i + 1, filters) == 0) // red nb in same row
+          {
+            color[2] = (nt[2] + nb[2] + 2.0f * color[1] - nt[1] - nb[1]) * .5f;
+            color[0] = (nl[0] + nr[0] + 2.0f * color[1] - nl[1] - nr[1]) * .5f;
+          }
           else
-            color[2] = (guess1 + guess2) * .25f;
+          {
+            // blue nb
+            color[0] = (nt[0] + nb[0] + 2.0f * color[1] - nt[1] - nb[1]) * .5f;
+            color[2] = (nl[2] + nr[2] + 2.0f * color[1] - nl[1] - nr[1]) * .5f;
+          }
         }
-        else // c == 2, blue pixel, fill red:
+        else
         {
-          const float diff1 = fabsf(ntl[0] - nbr[0]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]);
-          const float guess1 = ntl[0] + nbr[0] + 2.0f * color[1] - ntl[1] - nbr[1];
-          const float diff2 = fabsf(ntr[0] - nbl[0]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]);
-          const float guess2 = ntr[0] + nbl[0] + 2.0f * color[1] - ntr[1] - nbl[1];
-          if(diff1 > diff2)
-            color[0] = guess2 * .5f;
-          else if(diff1 < diff2)
-            color[0] = guess1 * .5f;
-          else
-            color[0] = (guess1 + guess2) * .25f;
+          // get 4-star-nbhood:
+          const float *ntl = buf - 4 - 4 * width;
+          const float *ntr = buf + 4 - 4 * width;
+          const float *nbl = buf - 4 + 4 * width;
+          const float *nbr = buf + 4 + 4 * width;
+          if(c == 0)
+          {
+            // red pixel, fill blue:
+            const float diff1 = fabsf(ntl[2] - nbr[2]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]);
+            const float guess1 = ntl[2] + nbr[2] + 2.0f * color[1] - ntl[1] - nbr[1];
+            const float diff2 = fabsf(ntr[2] - nbl[2]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]);
+            const float guess2 = ntr[2] + nbl[2] + 2.0f * color[1] - ntr[1] - nbl[1];
+            if(diff1 > diff2)
+              color[2] = guess2 * .5f;
+            else if(diff1 < diff2)
+              color[2] = guess1 * .5f;
+            else
+              color[2] = (guess1 + guess2) * .25f;
+          }
+          else // c == 2, blue pixel, fill red:
+          {
+            const float diff1 = fabsf(ntl[0] - nbr[0]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]);
+            const float guess1 = ntl[0] + nbr[0] + 2.0f * color[1] - ntl[1] - nbr[1];
+            const float diff2 = fabsf(ntr[0] - nbl[0]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]);
+            const float guess2 = ntr[0] + nbl[0] + 2.0f * color[1] - ntr[1] - nbl[1];
+            if(diff1 > diff2)
+              color[0] = guess2 * .5f;
+            else if(diff1 < diff2)
+              color[0] = guess1 * .5f;
+            else
+              color[0] = (guess1 + guess2) * .25f;
+          }
         }
       }
-
       for_each_channel(k,aligned(buf,color:16)
-        dt_omp_nontemporal(buf)) buf[k] = fmaxf(0.0f, color[k]);
+        dt_omp_nontemporal(buf)) buf[k] = color[k];
 
       buf += 4;
     }
diff --git a/src/iop/demosaicing/rcd.c b/src/iop/demosaicing/rcd.c
index 0adcbd83d915..4704f3ec3e34 100644
--- a/src/iop/demosaicing/rcd.c
+++ b/src/iop/demosaicing/rcd.c
@@ -38,9 +38,8 @@
 */
 
 /* Some notes about the algorithm
-* 1. The calculated data at the tiling borders RCD_BORDER must be at least 9 to be stable.
-* 2. For the outermost tiles we only have to discard a 7 pixel border region interpolated otherwise.
-* 3. The tilesize has a significant influence on performance, the default is a good guess for modern
+* 1. The calculated data at the tiling borders RCD_BORDER must be 10 to be stable.
+* 2. The tilesize has a significant influence on performance, the default is a good guess for modern
 *    x86/64 machines, tested on Xeon E-2288G, i5-8250U.
 */
 
@@ -65,8 +64,7 @@
   #pragma GCC optimize ("fp-contract=fast", "finite-math-only", "no-math-errno")
 #endif
 
-#define RCD_BORDER 9          // avoid tile-overlap errors
-#define RCD_MARGIN 7          // for the outermost tiles we can have a smaller outer border
+#define RCD_BORDER 10 // avoid tile-overlap errors
 #define RCD_TILEVALID (DT_RCD_TILESIZE - 2 * RCD_BORDER)
 #define w1 DT_RCD_TILESIZE
 #define w2 (2 * DT_RCD_TILESIZE)
@@ -82,191 +80,6 @@ static inline float _safe_in(float a, float scale)
   return fmaxf(0.0f, a) * scale;
 }
 
-/** This is basically ppg adopted to only write data to RCD_MARGIN */
-static void rcd_ppg_border(float *const out,
-                           const float *const in,
-                           const int width,
-                           const int height,
-                           const uint32_t filters,
-                           const int margin)
-{
-  const int border = margin + 3;
-  // write approximatad 3-pixel border region to out
-  float sum[8];
-  for(int j = 0; j < height; j++)
-  {
-    for(int i = 0; i < width; i++)
-    {
-      if(i == 3 && j >= 3 && j < height - 3) i = width - 3;
-      if(i == width) break;
-      memset(sum, 0, sizeof(float) * 8);
-      for(int y = j - 1; y != j + 2; y++)
-      {
-        for(int x = i - 1; x != i + 2; x++)
-        {
-          if((y >= 0) && (x >= 0) && (y < height) && (x < width))
-          {
-            const int f = FC(y, x, filters);
-            sum[f] += fmaxf(0.0f, in[(size_t)y * width + x]);
-            sum[f + 4]++;
-          }
-        }
-      }
-      const int f = FC(j, i, filters);
-      for(int c = 0; c < 3; c++)
-      {
-        if(c != f && sum[c + 4] > 0.0f)
-          out[4 * ((size_t)j * width + i) + c] = sum[c] / sum[c + 4];
-        else
-          out[4 * ((size_t)j * width + i) + c] = fmaxf(0.0f, in[(size_t)j * width + i]);
-      }
-    }
-  }
-
-  DT_OMP_FOR()
-  for(int j = 3; j < height - 3; j++)
-  {
-    float *buf = out + (size_t)4 * width * j + 4 * 3;
-    const float *buf_in = in + (size_t)width * j + 3;
-    for(int i = 3; i < width - 3; i++)
-    {
-      if(i == border && j >= border && j < height - border)
-      {
-        i = width - border;
-        buf = out + (size_t)4 * width * j + 4 * i;
-        buf_in = in + (size_t)width * j + i;
-      }
-      if(i == width) break;
-
-      const int c = FC(j, i, filters);
-      dt_aligned_pixel_t color;
-      const float pc = fmaxf(0.0f, buf_in[0]);
-      if(c == 0 || c == 2)
-      {
-        color[c] = pc;
-        const float pym  = fmaxf(0.0f, buf_in[-width * 1]);
-        const float pym2 = fmaxf(0.0f, buf_in[-width * 2]);
-        const float pym3 = fmaxf(0.0f, buf_in[-width * 3]);
-        const float pyM  = fmaxf(0.0f, buf_in[+width * 1]);
-        const float pyM2 = fmaxf(0.0f, buf_in[+width * 2]);
-        const float pyM3 = fmaxf(0.0f, buf_in[+width * 3]);
-        const float pxm  = fmaxf(0.0f, buf_in[-1]);
-        const float pxm2 = fmaxf(0.0f, buf_in[-2]);
-        const float pxm3 = fmaxf(0.0f, buf_in[-3]);
-        const float pxM  = fmaxf(0.0f, buf_in[+1]);
-        const float pxM2 = fmaxf(0.0f, buf_in[+2]);
-        const float pxM3 = fmaxf(0.0f, buf_in[+3]);
-
-        const float guessx = (pxm + pc + pxM) * 2.0f - pxM2 - pxm2;
-        const float diffx = (fabsf(pxm2 - pc) + fabsf(pxM2 - pc) + fabsf(pxm - pxM)) * 3.0f
-                            + (fabsf(pxM3 - pxM) + fabsf(pxm3 - pxm)) * 2.0f;
-        const float guessy = (pym + pc + pyM) * 2.0f - pyM2 - pym2;
-        const float diffy = (fabsf(pym2 - pc) + fabsf(pyM2 - pc) + fabsf(pym - pyM)) * 3.0f
-                            + (fabsf(pyM3 - pyM) + fabsf(pym3 - pym)) * 2.0f;
-        if(diffx > diffy)
-        {
-          // use guessy
-          const float m = fminf(pym, pyM);
-          const float M = fmaxf(pym, pyM);
-          color[1] = fmaxf(fminf(guessy * .25f, M), m);
-        }
-        else
-        {
-          const float m = fminf(pxm, pxM);
-          const float M = fmaxf(pxm, pxM);
-          color[1] = fmaxf(fminf(guessx * .25f, M), m);
-        }
-      }
-      else
-        color[1] = pc;
-
-      color[3] = 0.0f;
-      for_each_channel(k)
-        buf[k] = color[k];
-      buf += 4;
-      buf_in++;
-    }
-  }
-// for all pixels: interpolate colors into float array
-  DT_OMP_FOR()
-  for(int j = 1; j < height - 1; j++)
-  {
-    float *buf = out + (size_t)4 * width * j + 4;
-    for(int i = 1; i < width - 1; i++)
-    {
-      if(i == margin && j >= margin && j < height - margin)
-      {
-        i = width - margin;
-        buf = out + (size_t)4 * (width * j + i);
-      }
-      const int c = FC(j, i, filters);
-      dt_aligned_pixel_t color = { buf[0], buf[1], buf[2], buf[3] };
-      const int linesize = 4 * width;
-      // fill all four pixels with correctly interpolated stuff: r/b for green1/2
-      // b for r and r for b
-      if(__builtin_expect(c & 1, 1)) // c == 1 || c == 3)
-      {
-        // calculate red and blue for green pixels:
-        // need 4-nbhood:
-        const float *nt = buf - linesize;
-        const float *nb = buf + linesize;
-        const float *nl = buf - 4;
-        const float *nr = buf + 4;
-        if(FC(j, i + 1, filters) == 0) // red nb in same row
-        {
-          color[2] = (nt[2] + nb[2] + 2.0f * color[1] - nt[1] - nb[1]) * .5f;
-          color[0] = (nl[0] + nr[0] + 2.0f * color[1] - nl[1] - nr[1]) * .5f;
-        }
-        else
-        {
-          // blue nb
-          color[0] = (nt[0] + nb[0] + 2.0f * color[1] - nt[1] - nb[1]) * .5f;
-          color[2] = (nl[2] + nr[2] + 2.0f * color[1] - nl[1] - nr[1]) * .5f;
-        }
-      }
-      else
-      {
-        // get 4-star-nbhood:
-        const float *ntl = buf - 4 - linesize;
-        const float *ntr = buf + 4 - linesize;
-        const float *nbl = buf - 4 + linesize;
-        const float *nbr = buf + 4 + linesize;
-
-        if(c == 0)
-        {
-          // red pixel, fill blue:
-          const float diff1  = fabsf(ntl[2] - nbr[2]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]);
-          const float guess1 = ntl[2] + nbr[2] + 2.0f * color[1] - ntl[1] - nbr[1];
-          const float diff2  = fabsf(ntr[2] - nbl[2]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]);
-          const float guess2 = ntr[2] + nbl[2] + 2.0f * color[1] - ntr[1] - nbl[1];
-          if(diff1 > diff2)
-            color[2] = guess2 * .5f;
-          else if(diff1 < diff2)
-            color[2] = guess1 * .5f;
-          else
-            color[2] = (guess1 + guess2) * .25f;
-        }
-        else // c == 2, blue pixel, fill red:
-        {
-          const float diff1  = fabsf(ntl[0] - nbr[0]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]);
-          const float guess1 = ntl[0] + nbr[0] + 2.0f * color[1] - ntl[1] - nbr[1];
-          const float diff2  = fabsf(ntr[0] - nbl[0]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]);
-          const float guess2 = ntr[0] + nbl[0] + 2.0f * color[1] - ntr[1] - nbl[1];
-          if(diff1 > diff2)
-            color[0] = guess2 * .5f;
-          else if(diff1 < diff2)
-            color[0] = guess1 * .5f;
-          else
-            color[0] = (guess1 + guess2) * .25f;
-        }
-      }
-      for_each_channel(k)
-        buf[k] = color[k];
-      buf += 4;
-    }
-  }
-}
-
 DT_OMP_DECLARE_SIMD(aligned(in, out : 64))
 static void demosaic_box3(float *const restrict out,
                           const float *const restrict in,
@@ -289,7 +102,7 @@ static void demosaic_box3(float *const restrict out,
           if(x >= 0 && y >= 0 && x < width && y < height)
           {
             const int color = (filters == 9u) ? FCNxtrans(y, x, xtrans) : FC(y, x, filters);
-            sum[color] += MAX(0.0f, in[(size_t)width*y +x]);
+            sum[color] += in[(size_t)width*y +x];
             cnt[color] += 1.0f;
           }
         }
@@ -308,13 +121,9 @@ static void rcd_demosaic(float *const restrict out,
                          const uint32_t filters,
                          const float scaler)
 {
+  demosaic_ppg(out, in, width, height, filters, 0.0f, RCD_BORDER);
   if(width < 2*RCD_BORDER || height < 2*RCD_BORDER)
-  {
-    rcd_ppg_border(out, in, width, height, filters, RCD_BORDER);
     return;
-  }
-
-  rcd_ppg_border(out, in, width, height, filters, RCD_MARGIN);
 
   const float revscaler = 1.0f / scaler;
 
@@ -555,18 +364,15 @@ static void rcd_demosaic(float *const restrict out,
           }
         }
 
-        // For the outermost tiles in all directions we can use a smaller border margin
-        const int first_vertical =   rowStart + ((tile_vertical == 0) ? RCD_MARGIN : RCD_BORDER);
-        const int last_vertical =    rowEnd   - ((tile_vertical == num_vertical - 1)     ? RCD_MARGIN : RCD_BORDER);
-        const int first_horizontal = colStart + ((tile_horizontal == 0) ? RCD_MARGIN : RCD_BORDER);
-        const int last_horizontal =  colEnd   - ((tile_horizontal == num_horizontal - 1) ? RCD_MARGIN : RCD_BORDER);
-        for(int row = first_vertical; row < last_vertical; row++)
+        for(int row = rowStart + RCD_BORDER; row < rowEnd - RCD_BORDER; row++)
         {
-          for(int col = first_horizontal, idx = (row - rowStart) * DT_RCD_TILESIZE + col - colStart, o_idx = (row * width + col) * 4; col < last_horizontal; col++, o_idx += 4, idx++)
+          for(int col = colStart + RCD_BORDER, idx = (row - rowStart) * DT_RCD_TILESIZE + col - colStart, o_idx = (row * width + col) * 4;
+              col < colEnd - RCD_BORDER;
+              col++, o_idx += 4, idx++)
           {
-            out[o_idx]   = scaler * fmaxf(0.0f, rgb[0][idx]);
-            out[o_idx+1] = scaler * fmaxf(0.0f, rgb[1][idx]);
-            out[o_idx+2] = scaler * fmaxf(0.0f, rgb[2][idx]);
+            out[o_idx]   = scaler * rgb[0][idx];
+            out[o_idx+1] = scaler * rgb[1][idx];
+            out[o_idx+2] = scaler * rgb[2][idx];
             out[o_idx+3] = 0.0f;
           }
         }
@@ -625,12 +431,12 @@ static cl_int process_rcd_cl(dt_iop_module_t *self,
                                       .cellsize = sizeof(float) * 1, .overhead = 0,
                                       .sizex = 64, .sizey = 64 };
 
-    err = dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_green, &locopt);
+    err = dt_opencl_local_buffer_opt(devid, gd->kernel_ppg_green, &locopt);
     if(err != CL_SUCCESS) goto error;
 
     size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
     size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
-    err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_rcd_border_green, sizes, local,
+    err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_ppg_green, sizes, local,
         CLARG(dev_in), CLARG(dev_tmp), CLARG(width), CLARG(height), CLARG(filters),
         CLLOCAL(sizeof(float) * (locopt.sizex + 2*3) * (locopt.sizey + 2*3)), CLARGINT(32));
     if(err != CL_SUCCESS) goto error;
@@ -642,12 +448,12 @@ static cl_int process_rcd_cl(dt_iop_module_t *self,
                                       .cellsize = 4 * sizeof(float), .overhead = 0,
                                       .sizex = 64, .sizey = 64 };
 
-    err = dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_redblue, &locopt);
+    err = dt_opencl_local_buffer_opt(devid, gd->kernel_ppg_redblue, &locopt);
     if(err != CL_SUCCESS) goto error;
 
     size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
     size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
-    err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_rcd_border_redblue, sizes, local,
+    err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_ppg_redblue, sizes, local,
       CLARG(dev_tmp), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(filters),
       CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 2) * (locopt.sizey + 2)), CLARGINT(16));
     if(err != CL_SUCCESS) goto error;
@@ -669,10 +475,14 @@ static cl_int process_rcd_cl(dt_iop_module_t *self,
     goto error;
 
   // populate data
-  float scaler = 1.0f / dt_iop_get_processed_maximum(piece);
+  float scaler = dt_iop_get_processed_maximum(piece);
+  if(scaler < 1.5f) scaler = 2.0f;
+  else if(scaler < 3.0f) scaler = 4.0f;
+  else scaler = 8.0f;
+  const float revscaler = 1.0f / scaler;
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_rcd_populate, width, height,
         CLARG(dev_in), CLARG(cfa), CLARG(rgb0), CLARG(rgb1), CLARG(rgb2), CLARG(width), CLARG(height),
-        CLARG(filters), CLARG(scaler));
+        CLARG(filters), CLARG(revscaler));
   if(err != CL_SUCCESS) goto error;
 
   // Step 1.1: Calculate a squared vertical and horizontal high pass filter on color differences
@@ -715,10 +525,9 @@ static cl_int process_rcd_cl(dt_iop_module_t *self,
         CLARG(VH_dir), CLARG(rgb0), CLARG(rgb1), CLARG(rgb2), CLARG(width), CLARG(height), CLARG(filters));
   if(err != CL_SUCCESS) goto error;
 
-  scaler = dt_iop_get_processed_maximum(piece);
   // write output
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_rcd_write_output, width, height,
-        CLARG(dev_out), CLARG(rgb0), CLARG(rgb1), CLARG(rgb2), CLARG(width), CLARG(height), CLARG(scaler), CLARGINT(RCD_MARGIN));
+        CLARG(dev_out), CLARG(rgb0), CLARG(rgb1), CLARG(rgb2), CLARG(width), CLARG(height), CLARG(scaler), CLARGINT(RCD_BORDER));
 
 error:
   dt_opencl_release_mem_object(dev_tmp);
@@ -739,7 +548,6 @@ static cl_int process_rcd_cl(dt_iop_module_t *self,
 #endif
 
 #undef RCD_BORDER
-#undef RCD_MARGIN
 #undef RCD_TILEVALID
 #undef w1
 #undef w2

From 646ca00caa88613f2751c1fd43f6bf44ced526c4 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Sun, 8 Mar 2026 15:01:47 +0100
Subject: [PATCH 05/13] Various VNG demosaicer fixes

1. Fix OpenCL VNG full interpolation, we must not touch the outermost 2 pixels,
   they have been interpolated already in the linear phase.

2. Ensure identical results for VNG initial borders.
   Checked results for two possible algorithms, average vs ppg-like original.
   There are subtle differences, but overall the ppg-like seems to be better
   and we now use it for both code paths.

3. Fix VNG final green mixing for bayer4 sensors.
   Do this at the end of processing for both CPU and GPU code.

4. We don't avoid negative input for slightly better results.
---
 data/kernels/demosaic_vng.cl | 110 +++++++++++++++--------------------
 src/iop/demosaicing/vng.c    |  45 ++++++--------
 2 files changed, 66 insertions(+), 89 deletions(-)

diff --git a/data/kernels/demosaic_vng.cl b/data/kernels/demosaic_vng.cl
index 1cbff4807f1c..3ff6482a3665 100644
--- a/data/kernels/demosaic_vng.cl
+++ b/data/kernels/demosaic_vng.cl
@@ -28,7 +28,7 @@ vng_lin_interpolate(read_only image2d_t in,
                     global const unsigned char (*const xtrans)[6],
                     global const int (*const lookup)[16][32],
                     local float *buffer,
-                    const int equil)
+                    const int only_linear)
 {
   const int x = get_global_id(0);
   const int y = get_global_id(1);
@@ -59,7 +59,7 @@ vng_lin_interpolate(read_only image2d_t in,
     if(bufidx >= maxbuf) continue;
     const int xx = xul + bufidx % stride;
     const int yy = yul + bufidx / stride;
-    buffer[bufidx] = fmax(0.0f, read_imagef(in, sampleri, (int2)(xx, yy)).x);
+    buffer[bufidx] = readsingle(in, xx, yy);
   }
 
   // center buffer around current x,y-Pixel
@@ -73,31 +73,40 @@ vng_lin_interpolate(read_only image2d_t in,
 
   const bool is_xtrans = filters == 9; 
   const int colors = is_xtrans ? 3 : 4;
-  const int av = (filters == 9) ? 2 : 1;
   const int size = is_xtrans ? 6 : 16;
 
   float sum[4] = { 0.0f };
   float o[4] = { 0.0f };
 
+  // approximate outermost single pixels
   if(x < 1 || x >= width-1 || y < 1 || y >= height-1)
   {
     int count[4] = { 0 };
-    for(int j = y-av; j <= y+av; j++)
+    for(int j = y-1; j <= y+1; j++)
     {
-      for(int i = x-av; i <= x+av; i++)
+      for(int i = x-1; i <= x+1; i++)
       {
         if(j >= 0 && i >= 0 && j < height && i < width)
         {
           const int f = fcol(j, i, filters, xtrans);
-          sum[f] += fmax(0.0f, read_imagef(in, sampleri, (int2)(i, j)).x);
+          sum[f] += readsingle(in, i, j);
           count[f]++;
         }
       }
     }
+    const int f = fcol(y, x, filters, xtrans);
+    // for current cell, copy the current sensor's color data,
+    // interpolate the other two colors from surrounding pixels of
+    // their color
     for(int c = 0; c < colors; c++)
-      o[c] = sum[c] / fmax(1.0f, (float)count[c]);
+    {
+      if(c != f && count[c] != 0)
+        o[c] = sum[c] / count[c];
+      else
+        o[c] = readsingle(in, x, y);
+    }
   }
-  else
+  else // do the bilinear stuff
   {
     global const int *ip = lookup[y % size][x % size];
     const int num_pixels = ip[0];
@@ -120,17 +129,18 @@ vng_lin_interpolate(read_only image2d_t in,
     o[ip[0]] = buffer[0];
   }
 
-  if(!is_xtrans && equil)
+  if(only_linear)
   {
-    o[1] = 0.5f * (o[1] + o[3]);
+    o[0] = o[0];
+    o[1] = is_xtrans ? o[1] : 0.5f * (o[1] + o[3]);
+    o[2] = o[2];
     o[3] = 0.0f;
   }
   write_imagef(out, (int2)(x, y), (float4)(o[0], o[1], o[2], o[3]));
 }
 
 kernel void
-vng_interpolate(read_only image2d_t input,
-                read_only image2d_t in,
+vng_interpolate(read_only image2d_t in,
                 write_only image2d_t out,
                 const int width,
                 const int height,
@@ -169,7 +179,7 @@ vng_interpolate(read_only image2d_t input,
     if(bufidx >= maxbuf) continue;
     const int xx = xul + bufidx % stride;
     const int yy = yul + bufidx / stride;
-    const float4 pixel = fmax(0.0f, read_imagef(in, sampleri, (int2)(xx, yy)));
+    const float4 pixel = readpixel(in, xx, yy);
     vstore4(pixel, bufidx, buffer);
   }
 
@@ -179,10 +189,21 @@ vng_interpolate(read_only image2d_t input,
   barrier(CLK_LOCAL_MEM_FENCE);
   if(x >= width || y >= height) return;
 
-  const bool is_xtrans = filters == 9; 
-  const int colors = is_xtrans ? 3 : 4;
-  const int av = is_xtrans ? 2 : 1;
+  const bool is_xtrans = filters == 9;
+  // we don't touch data at the outermost 2 pixels
+  if(x < 2 || x >= width-2 || y < 2 || y >= height-2)
+  {
+    float4 val = readpixel(in, x, y);
+    if(!is_xtrans)
+    {
+      val.y = 0.5f * (val.y + val.w);
+      val.w = 0.0f;
+    }
+    write_imagef(out, (int2)(x, y), val);
+    return;
+  }
 
+  const int colors = is_xtrans ? 3 : 4;
   const int prow = is_xtrans ? 6 : 8;
   const int pcol = is_xtrans ? 6 : 2;
 
@@ -227,41 +248,14 @@ vng_interpolate(read_only image2d_t input,
     if(gmax < gval[g]) gmax = gval[g];
   }
 
-  const bool border = x < 2 || x >= width-2 || y < 2 || y >= height-2;
-  float b[4] = { 0.0f };
-  // Ok lets calculate the border pixel, we'll need it soon
-  if(border)
-  {
-    int count[4] = { 0 };
-    for(int j = y-av; j <= y+av; j++)
-    {
-      for(int i = x-av; i <= x+av; i++)
-      {
-        if(j >= 0 && i >= 0 && j < height && i < width)
-        {
-          const int f = fcol(j, i, filters, xtrans);
-          b[f] += fmax(0.0f, read_imagef(input, sampleri, (int2)(i, j)).x);
-          count[f]++;
-        }
-      }
-    }
-    for(int c = 0; c < colors; c++)
-      b[c] /= fmax(1.0f, (float)count[c]);
-  }
-
+  float o[4] = { 0.0f };
   if(gmax == 0.0f)
   {
-    if(!border)
-    {
-      for(int c = 0; c < colors; c++)
-        b[c] = buffer[c];
-    }
-    if(!is_xtrans)
-    {
-      b[1] = 0.5f * (b[1] + b[3]);
-      b[3] = 0.0f;
-    }
-    write_imagef(out, (int2)(x, y), (float4)(b[0], b[1], b[2], b[3]));
+    o[0] = buffer[0];
+    o[1] = is_xtrans ? buffer[1] : 0.5f * (buffer[1] + buffer[3]);
+    o[2] = buffer[2];
+    o[3] = 0.0f;
+    write_imagef(out, (int2)(x, y), (float4)(o[0], o[1], o[2], o[3]));
     return;
   }
 
@@ -303,25 +297,15 @@ vng_interpolate(read_only image2d_t input,
   }
 
   // save to output
-  float o[4] = { 0.0f };
   for(int c = 0; c < colors; c++)
   {
     float tot = buffer[color];
     if(c != color) tot += (sum[c] - sum[color]) / num;
-    o[c] = fmax(0.0f, tot);
-  }
-
-  if(border)
-  {
-    for(int c = 0; c < colors; c++)
-      o[c] = b[c];
+    o[c] = tot;
   }
 
-  if(!is_xtrans)
-  {
-    o[1] = 0.5f * (o[1] + o[3]);
-    o[3] = 0.0f;
-  }
+  o[1] = is_xtrans ? o[1] : 0.5f * (o[1] + o[3]);
+  o[3] = 0.0f;
   write_imagef(out, (int2)(x, y), (float4)(o[0], o[1], o[2], 0.0f));
 }
 
@@ -362,7 +346,7 @@ clip_and_zoom_demosaic_third_size_xtrans(read_only image2d_t in, write_only imag
     {
       for(int j = 0; j < 3; j++)
         for(int i = 0; i < 3; i++)
-          col[FCxtrans(yy + j, xx + i, xtrans)] += fmax(0.0f, read_imagef(in, sampleri, (int2)(xx + i, yy + j)).x);
+          col[FCxtrans(yy + j, xx + i, xtrans)] += fmax(0.0f, readsingle(in, xx + i, yy + j));
       num++;
     }
 
diff --git a/src/iop/demosaicing/vng.c b/src/iop/demosaicing/vng.c
index 47e5694224ca..0865d6d30c5c 100644
--- a/src/iop/demosaicing/vng.c
+++ b/src/iop/demosaicing/vng.c
@@ -25,7 +25,8 @@ static void lin_interpolate(float *out,
                             const uint32_t filters,
                             const uint8_t (*const xtrans)[6])
 {
-  const int colors = (filters == 9) ? 3 : 4;
+  const gboolean is_xtrans = filters == 9;
+  const int colors = is_xtrans ? 3 : 4;
   // border interpolate
   DT_OMP_FOR()
   for(int row = 0; row < height; row++)
@@ -41,7 +42,7 @@ static void lin_interpolate(float *out,
           if(y >= 0 && x >= 0 && y < height && x < width)
           {
             const int f = fcol(y, x, filters, xtrans);
-            sum[f] += fmaxf(0.0f, in[y * width + x]);
+            sum[f] += in[y * width + x];
             count[f]++;
           }
       const int f = fcol(row, col, filters, xtrans);
@@ -53,7 +54,7 @@ static void lin_interpolate(float *out,
         if(c != f && count[c] != 0)
           out[4 * (row * width + col) + c] = sum[c] / count[c];
         else
-          out[4 * (row * width + col) + c] = fmaxf(0.0f, in[row * width + col]);
+          out[4 * (row * width + col) + c] = in[row * width + col];
       }
     }
 
@@ -111,11 +112,11 @@ static void lin_interpolate(float *out,
       int *ip = &(lookup[row % size][col % size][0]);
       // for each adjoining pixel not of this pixel's color, sum up its weighted values
       for(int i = *ip++; i--; ip += 3)
-        sum[ip[2]] += fmaxf(0.0f, buf_in[ip[0]]) * ip[1];
+        sum[ip[2]] += buf_in[ip[0]] * ip[1];
       // for each interpolated color, load it into the pixel
       for(int i = colors; --i; ip += 2)
         buf[*ip] = sum[ip[0]] / ip[1];
-      buf[*ip] = fmaxf(0.0f, *buf_in);
+      buf[*ip] = *buf_in;
       buf += 4;
       buf_in++;
     }
@@ -137,12 +138,6 @@ static void lin_interpolate(float *out,
    I've extended the basic idea to work with non-Bayer filter arrays.
    Gradients are numbered clockwise from NW=0 to W=7.
 */
-static void _copy_abovezero(float *to, float *from, const int pixels)
-{
-  static dt_aligned_pixel_t zero = { 0.0f, 0.0f, 0.0f, 0.0f};
-  for(int i = 0; i < pixels; i++)
-    dt_vector_max(&to[i*4], zero, &from[i*4]);
-}
 
 static const signed char terms[]
       = { -2, -2, +0, -1, 1, 0x01, -2, -2, +0, +0, 2, 0x01, -2, -1, -1, +0, 1, 0x01, -2, -1, +0, -1, 1, 0x02,
@@ -178,7 +173,6 @@ static void vng_interpolate(float *out,
   float(*brow[4])[4];
   const gboolean is_xtrans = (filters == 9);
   const gboolean is_4bayer = FILTERS_ARE_4BAYER(filters);
-  const gboolean is_bayer = !(is_xtrans || is_4bayer);
   const int prow = is_xtrans ? 6 : 8;
   const int pcol = is_xtrans ? 6 : 2;
   const int colors = is_xtrans ? 3 : 4;
@@ -196,10 +190,7 @@ static void vng_interpolate(float *out,
 
   // if only linear interpolation is requested we can stop it here
   if(only_vng_linear)
-  {
-    if(is_bayer) goto bayer_greens;
-    else return;
-  }
+    goto finish;
 
   char *buffer = dt_alloc_aligned(sizeof(**brow) * width * 3 + sizeof(*ip) * prow * pcol * 320);
   if(!buffer)
@@ -304,22 +295,24 @@ static void vng_interpolate(float *out,
       }
     }
     if(row > 3) /* Write buffer to image */
-      _copy_abovezero(out + 4 * ((row - 2) * width + 2), (float *)(brow[0] + 2), width - 4);
+      dt_iop_image_copy(out + 4 * ((row - 2) * width + 2), (float *)(brow[0] + 2), 4*(width - 4));
 
     // rotate ring buffer
     for(int g = 0; g < 4; g++) brow[(g - 1) & 3] = brow[g];
   }
   // copy the final two rows to the image
-  _copy_abovezero(out + (4 * ((height - 4) * width + 2)), (float *)(brow[0] + 2), width - 4);
-  _copy_abovezero(out + (4 * ((height - 3) * width + 2)), (float *)(brow[1] + 2), width - 4);
+  dt_iop_image_copy(out + (4 * ((height - 4) * width + 2)), (float *)(brow[0] + 2), 4*(width - 4));
+  dt_iop_image_copy(out + (4 * ((height - 3) * width + 2)), (float *)(brow[1] + 2), 4*(width - 4));
   dt_free_align(buffer);
 
-bayer_greens:
-  if(is_bayer)
+finish:
+  DT_OMP_FOR()
+  for(size_t i = 0; i < (size_t)width * height * 4; i+=4)
   {
-    DT_OMP_FOR()
-    for(int i = 0; i < height * width; i++)
-      out[i * 4 + 1] = (out[i * 4 + 1] + out[i * 4 + 3]) / 2.0f;
+    out[i] = out[i];
+    out[i+1] = is_xtrans ? out[i+1] : 0.5f * (out[i+1] + out[i+3]);
+    out[i+2] = out[i+2];
+    out[i+3] = 0.0f;
   }
 }
 
@@ -499,7 +492,7 @@ static cl_int process_vng_cl(const dt_iop_module_t *self,
   if(only_vng_linear)
     goto finish;
 
-  // do full VNG interpolation; linear data is in dev_tmp
+  // do full VNG interpolation; linear data is in dev_tmp; don't touch outermost 2 pixels
   dt_opencl_local_buffer_t locopt
       = (dt_opencl_local_buffer_t){ .xoffset = 2*2, .xfactor = 1, .yoffset = 2*2, .yfactor = 1,
                                       .cellsize = 4 * sizeof(float), .overhead = 0,
@@ -511,7 +504,7 @@ static cl_int process_vng_cl(const dt_iop_module_t *self,
   size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
   size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
   err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_vng_interpolate, sizes, local,
-        CLARG(dev_in), CLARG(dev_tmp), CLARG(dev_out),
+        CLARG(dev_tmp), CLARG(dev_out),
         CLARG(width), CLARG(height), CLARG(filters4),
         CLARG(dev_xtrans), CLARG(dev_ips), CLARG(dev_code), CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 4) * (locopt.sizey + 4)));
 

From 8e61a450c2a60ec35019f77df3b73bb09fced3c1 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Sun, 8 Mar 2026 12:05:35 +0100
Subject: [PATCH 06/13] Fix color smoothing OpenCL code

Don't touch outermost one-pixels, they are used for next/previous row/column,
smoothing over-the-edge leads to clear artifacts as data are mirrored from the opposite
border.
---
 data/kernels/demosaic_ppg.cl | 21 +++++++++++++++------
 src/iop/demosaicing/basics.c |  6 +++---
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/data/kernels/demosaic_ppg.cl b/data/kernels/demosaic_ppg.cl
index 8ef11616cab0..ca12cfe60027 100644
--- a/data/kernels/demosaic_ppg.cl
+++ b/data/kernels/demosaic_ppg.cl
@@ -325,7 +325,11 @@ pre_median(read_only image2d_t in, write_only image2d_t out, const int width, co
 // 3x3 median filter
 // uses a sorting network to sort entirely in registers with no branches
 kernel void
-color_smoothing(read_only image2d_t in, write_only image2d_t out, const int width, const int height, local float4 *buffer)
+color_smoothing(read_only image2d_t in,
+                write_only image2d_t out,
+                const int width,
+                const int height,
+                local float4 *buffer)
 {
   const int lxid = get_local_id(0);
   const int lyid = get_local_id(1);
@@ -352,17 +356,22 @@ color_smoothing(read_only image2d_t in, write_only image2d_t out, const int widt
     // don't read more than needed
     if(gx >= width + 1 || gy >= height + 1) continue;
 
-    buffer[bufidx] = read_imagef(in, sampleri, (int2)(gx, gy));
+    buffer[bufidx] = readpixel(in, gx, gy);
   }
 
   barrier(CLK_LOCAL_MEM_FENCE);
-
   if(x >= width || y >= height) return;
 
   // re-position buffer
   buffer += (lyid + 1) * buffwd + lxid + 1;
 
   float4 o = buffer[0];
+  // don't touch outermost 1 pixels, just copy
+  if(x < 1 || x >= width-1 || y < 1 || y >= height-1)
+  {
+    write_imagef(out, (int2) (x, y), o);
+    return;
+  }
 
   // 3x3 median for R
   float s0 = buffer[-buffwd - 1].x - buffer[-buffwd - 1].y;
@@ -395,7 +404,7 @@ color_smoothing(read_only image2d_t in, write_only image2d_t out, const int widt
   cas(s6, s4);
   cas(s4, s2);
 
-  o.x = fmax(s4 + o.y, 0.0f);
+  o.x = s4 + o.y;
 
 
   // 3x3 median for B
@@ -429,9 +438,9 @@ color_smoothing(read_only image2d_t in, write_only image2d_t out, const int widt
   cas(s6, s4);
   cas(s4, s2);
 
-  o.z = fmax(s4 + o.y, 0.0f);
+  o.z = s4 + o.y;
 
-  write_imagef(out, (int2) (x, y), fmax(o, 0.0f));
+  write_imagef(out, (int2) (x, y), o);
 }
 #undef cas
 
diff --git a/src/iop/demosaicing/basics.c b/src/iop/demosaicing/basics.c
index 5a1749289afe..9c3d2de60039 100644
--- a/src/iop/demosaicing/basics.c
+++ b/src/iop/demosaicing/basics.c
@@ -138,7 +138,7 @@ static void color_smoothing(float *out,
           SWAPmed(4, 2);
           SWAPmed(6, 4);
           SWAPmed(4, 2);
-          outp[c] = fmaxf(med[4] + outp[1], 0.0f);
+          outp[c] = med[4] + outp[1];
         }
       }
     }
@@ -266,10 +266,10 @@ static int color_smoothing_cl(const dt_iop_module_t *self,
   cl_mem dev_t1 = dev_out;
   cl_mem dev_t2 = dev_tmp;
 
+  const size_t sizes[] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
+  const size_t local[] = { locopt.sizex, locopt.sizey, 1 };
   for(int pass = 0; pass < passes; pass++)
   {
-    const size_t sizes[] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
-    const size_t local[] = { locopt.sizex, locopt.sizey, 1 };
     err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_color_smoothing, sizes, local,
       CLARG(dev_t1), CLARG(dev_t2), CLARG(width),
       CLARG(height), CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 2) * (locopt.sizey + 2)));

From 5fed1454a419553dfabf9005b5c40c215b52c81f Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Sun, 8 Mar 2026 12:11:17 +0100
Subject: [PATCH 07/13] Mostly fixing OpenCL green averaging

1. Use doubles to accumulate for full image green averaging in OpenCL sampling,
   the second reduce kernel uses kahan sum for increased precision for less CPU
   vs OpenCL difference, both don't fully fix differences but less diffs.
2. In OpenCL local greens averaging we correct the same lines as we do in cpu code.
---
 data/kernels/common.h         |  9 +++++++
 data/kernels/demosaic_ppg.cl  | 44 +++++++++++++++++++++--------------
 data/kernels/guided_filter.cl | 10 --------
 src/iop/demosaicing/basics.c  | 19 ++++++++-------
 4 files changed, 46 insertions(+), 36 deletions(-)

diff --git a/data/kernels/common.h b/data/kernels/common.h
index d9abcb2918f7..7ff05665ab0b 100644
--- a/data/kernels/common.h
+++ b/data/kernels/common.h
@@ -63,6 +63,15 @@ constant sampler_t samplerA = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE
   #pragma OPENCL FP_CONTRACT OFF
 #endif
 
+// Kahan summation algorithm
+#define Kahan_sum(m, c, add)        \
+  {                                 \
+    const float t1 = (add) - (c);   \
+    const float t2 = (m) + t1;      \
+    c = (t2 - m) - t1;              \
+    m = t2;                         \
+  }
+
 static inline int
 FC(const int row, const int col, const unsigned int filters)
 {
diff --git a/data/kernels/demosaic_ppg.cl b/data/kernels/demosaic_ppg.cl
index ca12cfe60027..564f3bca9846 100644
--- a/data/kernels/demosaic_ppg.cl
+++ b/data/kernels/demosaic_ppg.cl
@@ -74,7 +74,7 @@ green_equilibration_lavg(read_only image2d_t in,
     if(bufidx >= maxbuf) continue;
     const int xx = xul + bufidx % stride;
     const int yy = yul + bufidx / stride;
-    buffer[bufidx] = read_imagef(in, sampleri, (int2)(xx, yy)).x;
+    buffer[bufidx] = readsingle(in, xx, yy);
   }
 
   // center buffer around current x,y-Pixel
@@ -88,7 +88,15 @@ green_equilibration_lavg(read_only image2d_t in,
   const float maximum = 1.0f;
   float o = buffer[0];
 
-  if(c == 1 && (y & 1))
+  // check lines to be used as we do in cpu code
+  int oj = 2, oi = 2;
+  if(FC(oj, oi, filters) != GREEN) oj++;
+  if(FC(oj, oi, filters) != GREEN) oi++;
+  if(FC(oj, oi, filters) != GREEN) oj--;
+
+  int row_ok = (oj & 1) == (y & 1);
+  int col_ok = (oi & 1) == (x & 1);
+  if(row_ok && col_ok && x >= 2 && y >= 2 && x < width -2 && y < height-2)
   {
     const float o1_1 = buffer[-1 * stride - 1];
     const float o1_2 = buffer[-1 * stride + 1];
@@ -102,17 +110,16 @@ green_equilibration_lavg(read_only image2d_t in,
     const float m1 = (o1_1+o1_2+o1_3+o1_4)/4.0f;
     const float m2 = (o2_1+o2_2+o2_3+o2_4)/4.0f;
 
-    if ((m2 > 0.0f) && (m1 > 0.0f) && (m1 / m2 < maximum * 2.0f))
+    if((m2 > 0.0f) && (m1 > 0.0f) && (m1 / m2 < maximum * 2.0f))
     {
       const float c1 = (fabs(o1_1 - o1_2) + fabs(o1_1 - o1_3) + fabs(o1_1 - o1_4) + fabs(o1_2 - o1_3) + fabs(o1_3 - o1_4) + fabs(o1_2 - o1_4)) / 6.0f;
       const float c2 = (fabs(o2_1 - o2_2) + fabs(o2_1 - o2_3) + fabs(o2_1 - o2_4) + fabs(o2_2 - o2_3) + fabs(o2_3 - o2_4) + fabs(o2_2 - o2_4)) / 6.0f;
 
       if((o < maximum * 0.95f) && (c1 < maximum * thr) && (c2 < maximum * thr))
-        o *= m1/m2;
+          o *= m1/m2;
     }
   }
-
-  write_imagef (out, (int2)(x, y), fmax(o, 0.0f));
+  write_imagef (out, (int2)(x, y), o);
 }
 
 
@@ -135,11 +142,12 @@ green_equilibration_favg_reduce_first(read_only image2d_t in,
 
   const int c = FC(y, x, filters);
 
+  // make sure we add same number of greens for even and odd lines
   const int isinimage = (x < 2 * (width / 2) && y < 2 * (height / 2));
-  const int isgreen1 = (c == 1 && !(y & 1));
-  const int isgreen2 = (c == 1 && (y & 1));
+  const int isgreen1 = c == GREEN && !(y & 1);
+  const int isgreen2 = c == GREEN && (y & 1);
 
-  float pixel = read_imagef(in, sampleri, (int2)(x, y)).x;
+  float pixel = readsingle(in, x, y);
 
   buffer[l].x = isinimage && isgreen1 ? pixel : 0.0f;
   buffer[l].y = isinimage && isgreen2 ? pixel : 0.0f;
@@ -170,17 +178,19 @@ kernel void
 green_equilibration_favg_reduce_second(const global float2* input, global float2 *result, const int length, local float2 *buffer)
 {
   int x = get_global_id(0);
-  float2 sum = (float2)0.0f;
-
+  float m1 = 0.0f;
+  float m2 = 0.0f;
+  float sum1 = 0.0f;
+  float sum2 = 0.0f;
   while(x < length)
   {
-    sum += input[x];
-
+    Kahan_sum(sum1, m1, input[x].x);
+    Kahan_sum(sum2, m2, input[x].y);
     x += get_global_size(0);
   }
 
   int lid = get_local_id(0);
-  buffer[lid] = sum;
+  buffer[lid] = (float2)(sum1, sum2);
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -215,15 +225,15 @@ green_equilibration_favg_apply(read_only image2d_t in,
 
   if(x >= width || y >= height) return;
 
-  float pixel = read_imagef(in, sampleri, (int2)(x, y)).x;
+  float pixel = readsingle(in, x, y);
 
   const int c = FC(y, x, filters);
 
-  const int isgreen1 = (c == 1 && !(y & 1));
+  const int isgreen1 = (c == GREEN && !(y & 1)); // on even lines
 
   pixel *= (isgreen1 ? gr_ratio : 1.0f);
 
-  write_imagef (out, (int2)(x, y), fmax(pixel, 0.0f));
+  write_imagef (out, (int2)(x, y), pixel);
 }
 
 #define SWAP(a, b)                \
diff --git a/data/kernels/guided_filter.cl b/data/kernels/guided_filter.cl
index 36ca2cffd284..0c105afdb0ed 100644
--- a/data/kernels/guided_filter.cl
+++ b/data/kernels/guided_filter.cl
@@ -40,16 +40,6 @@ kernel void guided_filter_split_rgb_image(const int width,
 }
 
 
-// Kahan summation algorithm
-#define Kahan_sum(m, c, add)        \
-  {                                 \
-    const float t1 = (add) - (c);   \
-    const float t2 = (m) + t1;      \
-    c = (t2 - m) - t1;              \
-    m = t2;                         \
-  }
-
-
 kernel void guided_filter_box_mean_x(const int width,
                                      const int height,
                                      read_only image2d_t in,
diff --git a/src/iop/demosaicing/basics.c b/src/iop/demosaicing/basics.c
index 9c3d2de60039..b3b92c495858 100644
--- a/src/iop/demosaicing/basics.c
+++ b/src/iop/demosaicing/basics.c
@@ -156,9 +156,9 @@ static void green_equilibration_lavg(float *out,
   const float maximum = 1.0f;
 
   int oj = 2, oi = 2;
-  if(FC(oj, oi, filters) != 1) oj++;
-  if(FC(oj, oi, filters) != 1) oi++;
-  if(FC(oj, oi, filters) != 1) oj--;
+  if(FC(oj, oi, filters) != GREEN) oj++;
+  if(FC(oj, oi, filters) != GREEN) oi++;
+  if(FC(oj, oi, filters) != GREEN) oj--;
 
   dt_iop_image_copy_by_size(out, in, width, height, 1);
 
@@ -190,7 +190,7 @@ static void green_equilibration_lavg(float *out,
                           + fabsf(o2_3 - o2_4) + fabsf(o2_2 - o2_4)) / 6.0f;
         if((in[j * width + i] < maximum * 0.95f) && (c1 < maximum * thr) && (c2 < maximum * thr))
         {
-          out[j * width + i] = fmaxf(0.0f, in[j * width + i] * m1 / m2);
+          out[j * width + i] = in[j * width + i] * m1 / m2;
         }
       }
     }
@@ -230,7 +230,7 @@ static void green_equilibration_favg(float *out,
   {
     for(int i = oi; i < (width - 1 - g2_offset); i += 2)
     {
-      out[(size_t)j * width + i] = fmaxf(0.0f, in[(size_t)j * width + i] * gr_ratio);
+      out[(size_t)j * width + i] = in[(size_t)j * width + i] * gr_ratio;
     }
   }
 }
@@ -415,14 +415,15 @@ static int green_equilibration_cl(const dt_iop_module_t *self,
                                             sizeof(float) * 2 * reducesize, CL_TRUE);
     if(err != CL_SUCCESS) goto error;
 
-    float sum1 = 0.0f, sum2 = 0.0f;
+    double sum1 = 0.0;
+    double sum2 = 0.0;
     for(int k = 0; k < reducesize; k++)
     {
-      sum1 += sumsum[2 * k];
-      sum2 += sumsum[2 * k + 1];
+      sum1 += (double)sumsum[2 * k];
+      sum2 += (double)sumsum[2 * k + 1];
     }
 
-    const float gr_ratio = (sum1 > 0.0f && sum2 > 0.0f) ? sum2 / sum1 : 1.0f;
+    const float gr_ratio = (sum1 > 0.0 && sum2 > 0.0) ? (float)(sum2 / sum1) : 1.0f;
 
     err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_green_eq_favg_apply, width, height,
       CLARG(dev_in1), CLARG(dev_out1), CLARG(width), CLARG(height), CLARG(filters),

From daa4db7643468ba419825a875bca3fff7eb91da0 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Sun, 8 Mar 2026 12:15:35 +0100
Subject: [PATCH 08/13] Fix pre median demosaicing part

Don't ensure non-negatives for this preliminary demosaicing code, the demosaicers
will have to handle this.
---
 data/kernels/demosaic_ppg.cl | 13 +++++++++----
 src/iop/demosaicing/basics.c |  2 +-
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/data/kernels/demosaic_ppg.cl b/data/kernels/demosaic_ppg.cl
index 564f3bca9846..3baff695a837 100644
--- a/data/kernels/demosaic_ppg.cl
+++ b/data/kernels/demosaic_ppg.cl
@@ -246,8 +246,13 @@ green_equilibration_favg_apply(read_only image2d_t in,
 constant int glim[5] = { 0, 1, 2, 1, 0 };
 
 kernel void
-pre_median(read_only image2d_t in, write_only image2d_t out, const int width, const int height,
-           const unsigned int filters, const float threshold, local float *buffer)
+pre_median(read_only image2d_t in,
+           write_only image2d_t out,
+           const int width,
+           const int height,
+           const unsigned int filters,
+           const float threshold,
+           local float *buffer)
 {
   const int x = get_global_id(0);
   const int y = get_global_id(1);
@@ -279,7 +284,7 @@ pre_median(read_only image2d_t in, write_only image2d_t out, const int width, co
     if(bufidx >= maxbuf) continue;
     const int xx = xul + bufidx % stride;
     const int yy = yul + bufidx / stride;
-    buffer[bufidx] = read_imagef(in, sampleri, (int2)(xx, yy)).x;
+    buffer[bufidx] = readsingle(in, xx, yy);
   }
 
   // center buffer around current x,y-Pixel
@@ -317,7 +322,7 @@ pre_median(read_only image2d_t in, write_only image2d_t out, const int width, co
 
   float color = (c & 1) ? (cnt == 1 ? med[4] - 64.0f : med[(cnt - 1) / 2]) : buffer[0];
 
-  write_imagef (out, (int2)(x, y), fmax(color, 0.0f));
+  write_imagef (out, (int2)(x, y), color);
 }
 #undef SWAP
 
diff --git a/src/iop/demosaicing/basics.c b/src/iop/demosaicing/basics.c
index b3b92c495858..5d3472603fe0 100644
--- a/src/iop/demosaicing/basics.c
+++ b/src/iop/demosaicing/basics.c
@@ -65,7 +65,7 @@ static void pre_median_b(float *out,
         for(int i = 0; i < 8; i++)
           for(int ii = i + 1; ii < 9; ii++)
             if(med[i] > med[ii]) SWAP(med[i], med[ii]);
-        pixo[0] = fmaxf(0.0f, (cnt == 1 ? med[4] - 64.0f : med[(cnt - 1) / 2]));
+        pixo[0] = cnt == 1 ? med[4] - 64.0f : med[(cnt - 1) / 2];
         // pixo[0] = med[(cnt-1)/2];
         pixo += 2;
         pixi += 2;

From b7e8eac4719867ce4c731700d95e82b232aaafab Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Sun, 8 Mar 2026 12:21:24 +0100
Subject: [PATCH 09/13] Slightly simplified capture OpenCL interface

Pass icoeffs as a float4 instead of cl_mem, use inlines for reading.
---
 data/kernels/capture.cl       | 14 ++++++++------
 src/iop/demosaicing/capture.c |  6 ++----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/data/kernels/capture.cl b/data/kernels/capture.cl
index 92bb9d2acfad..f81243b46c73 100644
--- a/data/kernels/capture.cl
+++ b/data/kernels/capture.cl
@@ -1,6 +1,6 @@
 /*
     This file is part of darktable,
-    copyright (c) 2025 darktable developer.
+    copyright (c) 2026 darktable developer.
 
     darktable is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -185,7 +185,7 @@ __kernel void prepare_blend(__read_only image2d_t cfa,
                             global const unsigned char (*const xtrans)[6],
                             global float *mask,
                             global float *Yold,
-                            global float *whites,
+                            const float4 wb,
                             const int w,
                             const int height)
 {
@@ -193,7 +193,9 @@ __kernel void prepare_blend(__read_only image2d_t cfa,
   const int row = get_global_id(1);
   if(col >= w || row >= height) return;
 
-  float4 rgb = read_imagef(dev_out, samplerA, (int2)(col, row));
+  float whites[4] = { wb.x, wb.y, wb.z, wb.w };
+
+  float4 rgb = readpixel(dev_out, col, row);
   // Photometric/digital ITU BT.709
   const float4 flum = (float4)( 0.212671f, 0.715160f, 0.072169f, 0.0f );
   rgb *= flum;
@@ -205,7 +207,7 @@ __kernel void prepare_blend(__read_only image2d_t cfa,
   {
     const int w2 = 2 * w;
     const int color = (filters == 9u) ? FCxtrans(row, col, xtrans) : FC(row, col, filters);
-    const float val = read_imagef(cfa, samplerA, (int2)(col, row)).x;
+    const float val = readsingle(cfa, col, row);
     if(val > whites[color] || Y < CAPTURE_YMIN)
     {
       mask[k-w2-1] = mask[k-w2]  = mask[k-w2+1] =
@@ -289,7 +291,7 @@ __kernel void show_blend_mask(__read_only image2d_t in,
   const int row = get_global_id(1);
   if(col >= width || row >= height) return;
 
-  float4 pix = read_imagef(in, samplerA, (int2)(col, row));
+  float4 pix = readpixel(in, col, row);
   const float blend = blender ? blend_mask[mad24(row, width, col)]
                               : (float)sigma_mask[mad24(row, width, col)] / 255.0f;
   pix.w = blend;
@@ -308,7 +310,7 @@ __kernel void capture_result( __read_only image2d_t in,
   const int row = get_global_id(1);
   if(col >= width || row >= height) return;
 
-  float4 pix = read_imagef(in, samplerA, (int2)(col, row));
+  float4 pix = readpixel(in, col, row);
   const int k = mad24(row, width, col);
 
   if(blendmask[k] > 0.0f)
diff --git a/src/iop/demosaicing/capture.c b/src/iop/demosaicing/capture.c
index e7f6beb6377c..27def3bc480f 100644
--- a/src/iop/demosaicing/capture.c
+++ b/src/iop/demosaicing/capture.c
@@ -1058,10 +1058,9 @@ static int _capture_sharpen_cl(dt_iop_module_t *self,
   cl_mem luminance = dt_opencl_alloc_device_buffer(devid, bsize);
   cl_mem tmp2 = dt_opencl_alloc_device_buffer(devid, bsize);
   cl_mem tmp1 = dt_opencl_alloc_device_buffer(devid, bsize);
-  cl_mem whites = dt_opencl_copy_host_to_device_constant(devid, 4 * sizeof(float), icoeffs);
   cl_mem dev_rgb = dt_opencl_duplicate_image(devid, dev_out);
 
-  if(!blendmask || !luminance || !tmp2 || !tmp1 || !whites || !dev_rgb) goto finish;
+  if(!blendmask || !luminance || !tmp2 || !tmp1 || !dev_rgb) goto finish;
 
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->prefill_clip_mask, width, height,
           CLARG(tmp2), CLARG(width), CLARG(height));
@@ -1069,7 +1068,7 @@ static int _capture_sharpen_cl(dt_iop_module_t *self,
 
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->prepare_blend, width, height,
           CLARG(dev_in), CLARG(dev_out), CLARG(filters), CLARG(dev_xtrans), CLARG(tmp2), CLARG(tmp1),
-          CLARG(whites), CLARG(width), CLARG(height));
+          CLFLARRAY(4, icoeffs), CLARG(width), CLARG(height));
   if(err != CL_SUCCESS) goto finish;
 
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->modify_blend, width, height,
@@ -1140,7 +1139,6 @@ static int _capture_sharpen_cl(dt_iop_module_t *self,
   dt_opencl_release_mem_object(tmp2);
   dt_opencl_release_mem_object(tmp1);
   dt_opencl_release_mem_object(luminance);
-  dt_opencl_release_mem_object(whites);
 
   return err;
 }

From f7bd5a4bb8cec126bc83032e9b4b4ce0d1340cab Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Sun, 8 Mar 2026 12:53:04 +0100
Subject: [PATCH 10/13] Markesteijn demosaicer fixes

1. The CPU code deserves the better border calculation as we do for OpenCL since very long.
   There is a small performance penalty but with clearly better results.
   I was likely just overseen for many years.
2. Some minor fixes for initalizing data for far less differences between CPU OpenCL.
3. The VNG linear interpolation code needed some modifications to handle borders.

Reminder: The markesteijn algorithm does very heavy maths with a lot of intermediate results
leading to more-than-usual OpenCL vs CPU differences even without special 'native' instructions.
---
 data/kernels/demosaic_markesteijn.cl |  8 ++++----
 src/iop/demosaic.c                   |  2 +-
 src/iop/demosaicing/vng.c            | 22 +++++++++++++++-------
 src/iop/demosaicing/xtrans.c         |  9 ++++++---
 4 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/data/kernels/demosaic_markesteijn.cl b/data/kernels/demosaic_markesteijn.cl
index 0b49efc54bbc..fa358cc4650b 100644
--- a/data/kernels/demosaic_markesteijn.cl
+++ b/data/kernels/demosaic_markesteijn.cl
@@ -50,7 +50,7 @@ markesteijn_initial_copy(read_only image2d_t in, global float *rgb, const int wi
 
   const int f = FCxtrans(y, x, xtrans);
 
-  const float p = fmax(0.0f, read_imagef(in, sampleri, (int2)(x, y)).x);
+  const float p = readsingle(in, x, y);
 
   for(int c = 0; c < 3; c++)
     pix[c] = (c == f) ? p : 0.0f;
@@ -897,7 +897,7 @@ markesteijn_accu(read_only image2d_t in, write_only image2d_t out, global float
 
   const int glidx = mad24(y, width, x);
 
-  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
+  float4 pixel = readpixel(in, x, y);
   float4 add = vload4(glidx, rgb);
   add.w = 1.0f;
 
@@ -918,11 +918,11 @@ markesteijn_final(read_only image2d_t in, write_only image2d_t out, const int wi
   // take sufficient border into account
   if(x < border || x >= width-border || y < border || y >= height-border) return;
 
-  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
+  float4 pixel = readpixel(in, x, y);
 
   pixel = (pixel.w > 0.0f) ? pixel/pixel.w : (float4)0.0f;
   pixel.w = 0.0f;
 
-  write_imagef(out, (int2)(x, y), fmax(0.0f, pixel));
+  write_imagef(out, (int2)(x, y), pixel);
 }
 
diff --git a/src/iop/demosaic.c b/src/iop/demosaic.c
index 78521d165868..126a5f332420 100644
--- a/src/iop/demosaic.c
+++ b/src/iop/demosaic.c
@@ -854,7 +854,7 @@ void process(dt_iop_module_t *self,
         if(method == DT_IOP_DEMOSAIC_FDC)
           xtrans_fdc_interpolate(t_out, t_in, width, t_rows, xtrans, exif_iso);
         else if(method == DT_IOP_DEMOSAIC_MARKESTEIJN || method == DT_IOP_DEMOSAIC_MARKESTEIJN_3)
-          xtrans_markesteijn_interpolate(t_out, t_in, width, t_rows, xtrans, passes);
+          xtrans_markesteijn_interpolate(t_out, t_in, width, t_rows, xtrans, passes, filters);
         else
           vng_interpolate(t_out, t_in, width, t_rows, filters, xtrans, FALSE);
       }
diff --git a/src/iop/demosaicing/vng.c b/src/iop/demosaicing/vng.c
index 0865d6d30c5c..bb5c8481b2c6 100644
--- a/src/iop/demosaicing/vng.c
+++ b/src/iop/demosaicing/vng.c
@@ -18,12 +18,13 @@
 
 /* taken from dcraw and demosaic_ppg below */
 
-static void lin_interpolate(float *out,
-                            const float *const in,
-                            const int width,
-                            const int height,
-                            const uint32_t filters,
-                            const uint8_t (*const xtrans)[6])
+static void _vng_lininterpolate(float *out,
+                                const float *const in,
+                                const int width,
+                                const int height,
+                                const uint32_t filters,
+                                const uint8_t (*const xtrans)[6],
+                                const int border)
 {
   const gboolean is_xtrans = filters == 9;
   const int colors = is_xtrans ? 3 : 4;
@@ -108,6 +109,13 @@ static void lin_interpolate(float *out,
     const float *buf_in = in + width * row + 1;
     for(int col = 1; col < width - 1; col++)
     {
+      if(col == border && row >= border && row < height - border)
+      {
+        col = width - border;
+        buf = out + (size_t)4 * width * row + 4 * col;
+        buf_in = in + (size_t)width * row + col;
+      }
+      if(col == width) break;
       dt_aligned_pixel_t sum = { 0.0f };
       int *ip = &(lookup[row % size][col % size][0]);
       // for each adjoining pixel not of this pixel's color, sum up its weighted values
@@ -186,7 +194,7 @@ static void vng_interpolate(float *out,
   else
     filters4 = filters | 0x0c0c0c0cu;
 
-  lin_interpolate(out, in, width, height, filters4, xtrans);
+  _vng_lininterpolate(out, in, width, height, filters4, xtrans, 1000000);
 
   // if only linear interpolation is requested we can stop it here
   if(only_vng_linear)
diff --git a/src/iop/demosaicing/xtrans.c b/src/iop/demosaicing/xtrans.c
index 054547ec3c44..0609aae15c97 100644
--- a/src/iop/demosaicing/xtrans.c
+++ b/src/iop/demosaicing/xtrans.c
@@ -47,7 +47,8 @@ static void xtrans_markesteijn_interpolate(float *out,
                                            const int width,
                                            const int height,
                                            const uint8_t (*const xtrans)[6],
-                                           const int passes)
+                                           const int passes,
+                                           const uint32_t filters)
 {
   static const short orth[12] = { 1, 0, 0, 1, -1, 0, 0, -1, 1, 0, 0, 1 },
                      patt[2][16] = { { 0, 1, 0, -1, 2, 0, -1, 0, 1, 1, 1, -1, 0, 0, 0, 0 },
@@ -177,7 +178,8 @@ static void xtrans_markesteijn_interpolate(float *out,
         }
 
       // duplicate rgb[0] to rgb[1], rgb[2], and rgb[3]
-      for(int c = 1; c <= 3; c++) memcpy(rgb[c], rgb[0], sizeof(*rgb));
+      for(int c = 1; c <= 3; c++)
+        dt_iop_image_copy((float*)rgb[c], (float*)rgb[0], sizeof(*rgb) / sizeof(float));
 
       // note that successive calculations are inset within the tile
       // so as to give enough border data, and there needs to be a 6
@@ -502,11 +504,12 @@ static void xtrans_markesteijn_interpolate(float *out,
             }
           }
           for(int c = 0; c < 3; c++)
-            out[4 * (width * (row + top) + col + left) + c] = MAX(0.0f, avg[c]/avg[3]);
+            out[4 * (width * (row + top) + col + left) + c] = avg[c]/avg[3];
         }
     }
   }
   dt_free_align(all_buffers);
+  _vng_lininterpolate(out, in, width, height, filters, xtrans, pad_tile);
 }
 
 #undef TS

From cd448a2821cd418d98ac1538816d338b678a4322 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Sun, 8 Mar 2026 13:04:05 +0100
Subject: [PATCH 11/13] Bugfixing various interpolaters

When doing
- dt_interpolation_resample() dt_interpolation_resample_roi()
- dt_interpolation_resample_cl() dt_interpolation_resample_roi_cl()
- dt_interpolation_resample_1c()

we don't attempt to keep data above zero, if this is required we must correct outside
these interpolators.

Notes:
- the cpu code did badly on intermediate results
- the fast copy modes behave accordingly as keeping data as they are
---
 data/kernels/basic.cl      | 20 ++++++++------------
 src/common/interpolation.c |  6 ++----
 2 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/data/kernels/basic.cl b/data/kernels/basic.cl
index bc16ac290c48..7fef68286151 100644
--- a/data/kernels/basic.cl
+++ b/data/kernels/basic.cl
@@ -2972,11 +2972,11 @@ static inline float get_image_channel(read_only image2d_t in,
                                       const int c)
 {
   float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
-  if(c == 0)
+  if(c == RED)
     return pixel.x;
-  else if(c == 1)
+  else if(c == GREEN)
     return pixel.y;
-  else if(c == 2)
+  else if(c == BLUE)
     return pixel.z;
 
   return pixel.w;
@@ -3846,7 +3846,7 @@ interpolation_resample (read_only image2d_t in,
     for (int ix = 0; ix < hl && yvalid; ix++)
     {
       const int xx = lindex[ix];
-      float4 hpixel = read_imagef(in, sampleri,(int2)(xx, yy));
+      float4 hpixel = readpixel(in, xx, yy);
       vpixel += hpixel * lkernel[ix];
     }
 
@@ -3868,12 +3868,8 @@ interpolation_resample (read_only image2d_t in,
   }
 
   // store final result
-  if (iy == 0 && x < width && y < height)
-  {
-    // Clip negative RGB that may be produced by Lanczos undershooting
-    // Negative RGB are invalid values no matter the RGB space (light is positive)
-    write_imagef (out, (int2)(x, y), fmax(buffer[ylid], 0.f));
-  }
+  if(iy == 0 && x < width && y < height)
+    write_imagef(out, (int2)(x, y), buffer[ylid]);
 }
 
 /* kernel for the interpolation copy helper */
@@ -3890,7 +3886,7 @@ interpolation_copy(read_only image2d_t dev_in,
   const int ocol = get_global_id(0);
   const int orow = get_global_id(1);
 
-  if(ocol >= owidth || orow >= oheight) return;
+  if(ocol < 0 || ocol >= owidth || orow < 0 || orow >= oheight) return;
 
   float4 pix = (float4)( 0.0f, 0.0f, 0.0f, 0.0f );
 
@@ -3899,7 +3895,7 @@ interpolation_copy(read_only image2d_t dev_in,
 
   if(irow < iheight && irow >= 0 && icol < iwidth && icol >= 0)
   {
-    pix = read_imagef(dev_in, samplerA, (int2)(icol, irow));
+    pix = readpixel(dev_in, icol, irow);
   }
   write_imagef(dev_out, (int2)(ocol, orow), pix);
 }
diff --git a/src/common/interpolation.c b/src/common/interpolation.c
index ced583c98a11..5137bd4da9c2 100644
--- a/src/common/interpolation.c
+++ b/src/common/interpolation.c
@@ -1142,11 +1142,9 @@ void dt_interpolation_resample(const dt_interpolation_t *itor,
       // Output pixel is ready
       const size_t baseidx = (size_t)oy * out_stride_floats + (size_t)ox * 4;
 
-      // Clip negative RGB that may be produced by Lanczos undershooting
-      // Negative RGB are invalid values no matter the RGB space (light is positive)
       dt_aligned_pixel_t pixel;
       for_each_channel(c, aligned(vs:16))
-        pixel[c] = fmaxf(0.0f, vs[c]);
+        pixel[c] = vs[c];
       copy_pixel_nontemporal(out + baseidx, pixel);
 
       // Reset vertical resampling context
@@ -1565,7 +1563,7 @@ void dt_interpolation_resample_1c(const dt_interpolation_t *itor,
       // Output pixel is ready
       float *o = (float *)((char *)out + (size_t)oy * out_stride
                            + (size_t)ox * sizeof(float));
-      *o = fmaxf(0.0f, vs);
+      *o = vs;
 
       // Reset vertical resampling context
       viidx -= vl;

From 2f69ceb5e63ba3bfa15b685cfe6aea5ef5dae543 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Sun, 8 Mar 2026 16:03:57 +0100
Subject: [PATCH 12/13] Possibly demosaicers should not write negatives

Demosaicer input has negatives, also it's output can have negs due to internal
algorithms even if input didn't.

All OpenCL demosaicers read data safely (leaving -FLT_MAX) even if the image was not correctly initialized,
we had such issues on AMD systems with bad things following, nvidia/intel seemed to return zero in such cases.

If things in the pixelpipe work fine all should be good even with negs fed into the pipe, following code
in color maths must handle that. Remember there are colorsoaces with neg values so the interpolaters should
not try to keep data non-negative.

We can possibly add code prohibiting negative demosaicing data by uncommenting
DT_DEMOSAIC_POSITIVE
---
 src/iop/demosaic.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/iop/demosaic.c b/src/iop/demosaic.c
index 126a5f332420..a43477d4cb96 100644
--- a/src/iop/demosaic.c
+++ b/src/iop/demosaic.c
@@ -52,6 +52,8 @@ DT_MODULE_INTROSPECTION(6, dt_iop_demosaic_params_t)
 #define DT_DEMOSAIC_XTRANS 1024 // masks for non-Bayer demosaic ops
 #define DT_DEMOSAIC_DUAL 2048   // masks for dual demosaicing methods
 
+// #define DT_DEMOSAIC_POSITIVE
+
 typedef enum dt_iop_demosaic_method_t
 {
   // methods for Bayer images
@@ -240,6 +242,7 @@ typedef struct dt_iop_demosaic_global_data_t
   int show_blend_mask;
   int capture_result;
   int final_blend;
+  int image_positive;
   float *gauss_coeffs;
 } dt_iop_demosaic_global_data_t;
 
@@ -904,6 +907,13 @@ void process(dt_iop_module_t *self,
     dt_iop_clip_and_zoom_roi((float *)o, out, roi_out, roi_in);
     dt_free_align(out);
   }
+
+#ifdef DT_DEMOSAIC_POSITIVE
+  float *oo = (float *)o;
+  DT_OMP_FOR_SIMD(aligned(oo : 64))
+  for(size_t k = 0; k < (size_t)roi_out->width * roi_out->height * 4; k++)
+    oo[k] = fmaxf(0.0f, oo[k]);
+#endif
 }
 
 #ifdef HAVE_OPENCL
@@ -1201,7 +1211,13 @@ int process_cl(dt_iop_module_t *self,
   }
 
   if(!direct)
-     err = dt_iop_clip_and_zoom_roi_cl(devid, dev_out, out_image, roi_out, roi_in);
+    err = dt_iop_clip_and_zoom_roi_cl(devid, dev_out, out_image, roi_out, roi_in);
+
+#ifdef DT_DEMOSAIC_POSITIVE
+  if(err == CL_SUCCESS)
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->image_positive, roi_out->width, roi_out->height,
+        CLARG(dev_out), CLARG(dev_out), CLARG(roi_out->width), CLARG(roi_out->height));
+#endif
 
 finish:
   dt_opencl_release_mem_object(dev_xtrans);
@@ -1284,6 +1300,7 @@ void init_global(dt_iop_module_so_t *self)
   gd->kernel_rcd_step_5_2 = dt_opencl_create_kernel(rcd, "rcd_step_5_2");
   gd->kernel_demosaic_box3 = dt_opencl_create_kernel(rcd, "demosaic_box3");
   gd->kernel_write_blended_dual  = dt_opencl_create_kernel(rcd, "write_blended_dual");
+  gd->image_positive  = dt_opencl_create_kernel(rcd, "image_positive");
 
   const int capt = 38; // capture.cl, from programs.conf
   gd->gaussian_9x9_mul = dt_opencl_create_kernel(capt, "kernel_9x9_mul");
@@ -1357,6 +1374,7 @@ void cleanup_global(dt_iop_module_so_t *self)
   dt_opencl_free_kernel(gd->show_blend_mask);
   dt_opencl_free_kernel(gd->capture_result);
   dt_opencl_free_kernel(gd->final_blend);
+  dt_opencl_free_kernel(gd->image_positive);
   dt_free_align(gd->gauss_coeffs);
   free(self->data);
   self->data = NULL;

From 68fc6a70c2dd71c7eb593c35f5133283bb7f5558 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Mon, 9 Mar 2026 13:36:46 +0100
Subject: [PATCH 13/13] Minor improvements for dual demosaicing and details

1. Calculation of Y0 mask is done minimally different leading to a slightly better
   stability in regions with any channal below zero.
2. The CPU code now works exactly as OpenCL with subtle p3erf gains as we multiply instead of devide
   depending on compiler.

qwerv
---
 data/kernels/demosaic_rcd.cl  |  4 ++--
 src/develop/masks/detail.c    | 14 +++++++-------
 src/develop/pixelpipe_hb.c    |  2 +-
 src/iop/demosaicing/capture.c | 10 +++++-----
 src/iop/demosaicing/dual.c    |  2 +-
 5 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/data/kernels/demosaic_rcd.cl b/data/kernels/demosaic_rcd.cl
index 56ad06a26c2e..73b885a5a4af 100644
--- a/data/kernels/demosaic_rcd.cl
+++ b/data/kernels/demosaic_rcd.cl
@@ -297,8 +297,8 @@ __kernel void calc_Y0_mask(global float *mask,
   if((col >= w) || (row >= height)) return;
   const int idx = mad24(row, w, col);
 
-  const float4 pt = wb * fmax(0.0f, readpixel(in, col, row));
-  mask[idx] = dtcl_sqrt(0.33333333f * (pt.x + pt.y + pt.z));
+  const float4 pt = wb * readpixel(in, col, row);
+  mask[idx] = dtcl_sqrt(0.33333333f * fmax(0.0f, (pt.x + pt.y + pt.z)));
 }
 
 __kernel void calc_scharr_mask(global float *in, global float *out, const int w, const int height)
diff --git a/src/develop/masks/detail.c b/src/develop/masks/detail.c
index 838df90e63df..6c7aec6fc899 100644
--- a/src/develop/masks/detail.c
+++ b/src/develop/masks/detail.c
@@ -122,19 +122,19 @@ float *dt_masks_calc_scharr_mask(dt_dev_pixelpipe_t *pipe,
   }
 
   const gboolean wboff = !pipe->dsc.temperature.enabled || !rawmode;
-  const dt_aligned_pixel_t wb = { wboff ? 1.0f : pipe->dsc.temperature.coeffs[0],
-                                  wboff ? 1.0f : pipe->dsc.temperature.coeffs[1],
-                                  wboff ? 1.0f : pipe->dsc.temperature.coeffs[2] };
+  const dt_aligned_pixel_t wb = { wboff ? 1.0f : 1.0f / pipe->dsc.temperature.coeffs[0],
+                                  wboff ? 1.0f : 1.0f / pipe->dsc.temperature.coeffs[1],
+                                  wboff ? 1.0f : 1.0f / pipe->dsc.temperature.coeffs[2] };
 
 
   DT_OMP_FOR_SIMD(aligned(tmp : 64))
   for(size_t idx =0; idx < msize; idx++)
   {
-    const float val = fmaxf(0.0f, src[4 * idx] / wb[0])
-                    + fmaxf(0.0f, src[4 * idx + 1] / wb[1])
-                    + fmaxf(0.0f, src[4 * idx + 2] / wb[2]);
+    const float val = src[4 * idx] * wb[0]
+                    + src[4 * idx + 1] * wb[1]
+                    + src[4 * idx + 2] * wb[2];
     // add a gamma. sqrtf should make noise variance the same for all image
-    tmp[idx] = sqrtf(val / 3.0f);
+    tmp[idx] = sqrtf(0.33333333f * fmaxf(0.0f, val));
   }
 
   DT_OMP_FOR()
diff --git a/src/develop/pixelpipe_hb.c b/src/develop/pixelpipe_hb.c
index 8d46100bf354..d438e7777db3 100644
--- a/src/develop/pixelpipe_hb.c
+++ b/src/develop/pixelpipe_hb.c
@@ -3634,7 +3634,7 @@ int dt_dev_write_scharr_mask_cl(dt_dev_pixelpipe_iop_t *piece,
         wboff ? 1.0f : 1.0f / p->dsc.temperature.coeffs[2], 1.0f };
 
   err = dt_opencl_enqueue_kernel_2d_args(devid, darktable.opencl->blendop->kernel_calc_Y0_mask, width, height,
-     CLARG(tmp), CLARG(in), CLARG(width), CLARG(height), CLFLARRAY(4, wb));
+     CLARG(tmp), CLARG(in), CLARG(width), CLARG(height), CLARG(wb));
   if(err != CL_SUCCESS) goto error;
 
   err = dt_opencl_enqueue_kernel_2d_args(devid,
diff --git a/src/iop/demosaicing/capture.c b/src/iop/demosaicing/capture.c
index 27def3bc480f..02e4e9f5f48c 100644
--- a/src/iop/demosaicing/capture.c
+++ b/src/iop/demosaicing/capture.c
@@ -1045,10 +1045,10 @@ static int _capture_sharpen_cl(dt_iop_module_t *self,
 
   const dt_iop_buffer_dsc_t *dsc = &pipe->dsc;
   const gboolean wbon = dsc->temperature.enabled;
-  dt_aligned_pixel_t icoeffs = { wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[0] : CAPTURE_CFACLIP,
-                                 wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[1] : CAPTURE_CFACLIP,
-                                 wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[2] : CAPTURE_CFACLIP,
-                                 0.0f };
+  const dt_aligned_pixel_t icoeffs = { wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[0] : CAPTURE_CFACLIP,
+                                       wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[1] : CAPTURE_CFACLIP,
+                                       wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[2] : CAPTURE_CFACLIP,
+                                       0.0f };
 
   cl_mem gcoeffs = NULL;
   cl_mem gauss_idx = NULL;
@@ -1068,7 +1068,7 @@ static int _capture_sharpen_cl(dt_iop_module_t *self,
 
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->prepare_blend, width, height,
           CLARG(dev_in), CLARG(dev_out), CLARG(filters), CLARG(dev_xtrans), CLARG(tmp2), CLARG(tmp1),
-          CLFLARRAY(4, icoeffs), CLARG(width), CLARG(height));
+          CLARG(icoeffs), CLARG(width), CLARG(height));
   if(err != CL_SUCCESS) goto finish;
 
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->modify_blend, width, height,
diff --git a/src/iop/demosaicing/dual.c b/src/iop/demosaicing/dual.c
index 0c703a03ce38..062092ad609a 100644
--- a/src/iop/demosaicing/dual.c
+++ b/src/iop/demosaicing/dual.c
@@ -118,7 +118,7 @@ int dual_demosaic_cl(const dt_iop_module_t *self,
         wboff ? 1.0f : 1.0f / p->dsc.temperature.coeffs[2], 1.0f };
 
   err = dt_opencl_enqueue_kernel_2d_args(devid, darktable.opencl->blendop->kernel_calc_Y0_mask, width, height,
-     CLARG(mask), CLARG(high_image), CLARG(width), CLARG(height), CLFLARRAY(4, wb));
+     CLARG(mask), CLARG(high_image), CLARG(width), CLARG(height), CLARG(wb));
   if(err != CL_SUCCESS) goto finish;
 
   err = dt_opencl_enqueue_kernel_2d_args(devid, darktable.opencl->blendop->kernel_calc_scharr_mask, width, height,