diff --git a/data/kernels/basic.cl b/data/kernels/basic.cl
index bc16ac290c48..7fef68286151 100644
--- a/data/kernels/basic.cl
+++ b/data/kernels/basic.cl
@@ -2972,11 +2972,11 @@ static inline float get_image_channel(read_only image2d_t in,
                                       const int c)
 {
   float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
-  if(c == 0)
+  if(c == RED)
     return pixel.x;
-  else if(c == 1)
+  else if(c == GREEN)
     return pixel.y;
-  else if(c == 2)
+  else if(c == BLUE)
     return pixel.z;
 
   return pixel.w;
@@ -3846,7 +3846,7 @@ interpolation_resample (read_only image2d_t in,
     for (int ix = 0; ix < hl && yvalid; ix++)
     {
       const int xx = lindex[ix];
-      float4 hpixel = read_imagef(in, sampleri,(int2)(xx, yy));
+      float4 hpixel = readpixel(in, xx, yy);
       vpixel += hpixel * lkernel[ix];
     }
 
@@ -3868,12 +3868,8 @@ interpolation_resample (read_only image2d_t in,
   }
 
   // store final result
-  if (iy == 0 && x < width && y < height)
-  {
-    // Clip negative RGB that may be produced by Lanczos undershooting
-    // Negative RGB are invalid values no matter the RGB space (light is positive)
-    write_imagef (out, (int2)(x, y), fmax(buffer[ylid], 0.f));
-  }
+  if(iy == 0 && x < width && y < height)
+    write_imagef(out, (int2)(x, y), buffer[ylid]);
 }
 
 /* kernel for the interpolation copy helper */
@@ -3890,7 +3886,7 @@ interpolation_copy(read_only image2d_t dev_in,
   const int ocol = get_global_id(0);
   const int orow = get_global_id(1);
 
-  if(ocol >= owidth || orow >= oheight) return;
+  if(ocol < 0 || ocol >= owidth || orow < 0 || orow >= oheight) return;
 
   float4 pix = (float4)( 0.0f, 0.0f, 0.0f, 0.0f );
 
@@ -3899,7 +3895,7 @@ interpolation_copy(read_only image2d_t dev_in,
 
   if(irow < iheight && irow >= 0 && icol < iwidth && icol >= 0)
   {
-    pix = read_imagef(dev_in, samplerA, (int2)(icol, irow));
+    pix = readpixel(dev_in, icol, irow);
   }
   write_imagef(dev_out, (int2)(ocol, orow), pix);
 }
diff --git a/data/kernels/capture.cl b/data/kernels/capture.cl
index 92bb9d2acfad..f81243b46c73 100644
--- a/data/kernels/capture.cl
+++ b/data/kernels/capture.cl
@@ -1,6 +1,6 @@
 /*
     This file is part of darktable,
-    copyright (c) 2025 darktable developer.
+    copyright (c) 2026 darktable developer.
 
     darktable is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -185,7 +185,7 @@ __kernel void prepare_blend(__read_only image2d_t cfa,
                             global const unsigned char (*const xtrans)[6],
                             global float *mask,
                             global float *Yold,
-                            global float *whites,
+                            const float4 wb,
                             const int w,
                             const int height)
 {
@@ -193,7 +193,9 @@ __kernel void prepare_blend(__read_only image2d_t cfa,
   const int row = get_global_id(1);
   if(col >= w || row >= height) return;
 
-  float4 rgb = read_imagef(dev_out, samplerA, (int2)(col, row));
+  float whites[4] = { wb.x, wb.y, wb.z, wb.w };
+
+  float4 rgb = readpixel(dev_out, col, row);
   // Photometric/digital ITU BT.709
   const float4 flum = (float4)( 0.212671f, 0.715160f, 0.072169f, 0.0f );
   rgb *= flum;
@@ -205,7 +207,7 @@ __kernel void prepare_blend(__read_only image2d_t cfa,
   {
     const int w2 = 2 * w;
     const int color = (filters == 9u) ? FCxtrans(row, col, xtrans) : FC(row, col, filters);
-    const float val = read_imagef(cfa, samplerA, (int2)(col, row)).x;
+    const float val = readsingle(cfa, col, row);
     if(val > whites[color] || Y < CAPTURE_YMIN)
     {
       mask[k-w2-1] = mask[k-w2]  = mask[k-w2+1] =
@@ -289,7 +291,7 @@ __kernel void show_blend_mask(__read_only image2d_t in,
   const int row = get_global_id(1);
   if(col >= width || row >= height) return;
 
-  float4 pix = read_imagef(in, samplerA, (int2)(col, row));
+  float4 pix = readpixel(in, col, row);
   const float blend = blender ? blend_mask[mad24(row, width, col)]
                               : (float)sigma_mask[mad24(row, width, col)] / 255.0f;
   pix.w = blend;
@@ -308,7 +310,7 @@ __kernel void capture_result( __read_only image2d_t in,
   const int row = get_global_id(1);
   if(col >= width || row >= height) return;
 
-  float4 pix = read_imagef(in, samplerA, (int2)(col, row));
+  float4 pix = readpixel(in, col, row);
   const int k = mad24(row, width, col);
 
   if(blendmask[k] > 0.0f)
diff --git a/data/kernels/common.h b/data/kernels/common.h
index 234f00b44149..7ff05665ab0b 100644
--- a/data/kernels/common.h
+++ b/data/kernels/common.h
@@ -63,6 +63,15 @@ constant sampler_t samplerA = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE
   #pragma OPENCL FP_CONTRACT OFF
 #endif
 
+// Kahan summation algorithm
+#define Kahan_sum(m, c, add)        \
+  {                                 \
+    const float t1 = (add) - (c);   \
+    const float t2 = (m) + t1;      \
+    c = (t2 - m) - t1;              \
+    m = t2;                         \
+  }
+
 static inline int
 FC(const int row, const int col, const unsigned int filters)
 {
@@ -159,3 +168,23 @@ static inline float clipf(const float a)
 {
   return clamp(a, 0.0f, 1.0f);
 }
+
+/* Some inline functions making life easier when reading photosites
+   or pixels from cl_mem images.
+   As we had images with NaNs in data and at least AMD systems suffered
+   as reading NaNs instead of falling back to 0 (nvidia seems to do do)
+   we always read data and make sure values are valid floats.
+*/
+static inline float readsingle(read_only image2d_t in, int col, int row)
+{
+  return fmax(-FLT_MAX, read_imagef(in, sampleri, (int2)(col, row)).x);
+}
+
+static inline float4 readpixel(read_only image2d_t in, int col, int row)
+{
+  return fmax(-FLT_MAX, read_imagef(in, sampleri, (int2)(col, row)));
+}
+static inline float readalpha(read_only image2d_t in, int col, int row)
+{
+  return clipf(read_imagef(in, sampleri, (int2)(col, row)).w);
+}
diff --git a/data/kernels/demosaic_markesteijn.cl b/data/kernels/demosaic_markesteijn.cl
index 0b49efc54bbc..fa358cc4650b 100644
--- a/data/kernels/demosaic_markesteijn.cl
+++ b/data/kernels/demosaic_markesteijn.cl
@@ -50,7 +50,7 @@ markesteijn_initial_copy(read_only image2d_t in, global float *rgb, const int wi
 
   const int f = FCxtrans(y, x, xtrans);
 
-  const float p = fmax(0.0f, read_imagef(in, sampleri, (int2)(x, y)).x);
+  const float p = readsingle(in, x, y);
 
   for(int c = 0; c < 3; c++)
     pix[c] = (c == f) ? p : 0.0f;
@@ -897,7 +897,7 @@ markesteijn_accu(read_only image2d_t in, write_only image2d_t out, global float
 
   const int glidx = mad24(y, width, x);
 
-  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
+  float4 pixel = readpixel(in, x, y);
   float4 add = vload4(glidx, rgb);
   add.w = 1.0f;
 
@@ -918,11 +918,11 @@ markesteijn_final(read_only image2d_t in, write_only image2d_t out, const int wi
   // take sufficient border into account
   if(x < border || x >= width-border || y < border || y >= height-border) return;
 
-  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
+  float4 pixel = readpixel(in, x, y);
 
   pixel = (pixel.w > 0.0f) ? pixel/pixel.w : (float4)0.0f;
   pixel.w = 0.0f;
 
-  write_imagef(out, (int2)(x, y), fmax(0.0f, pixel));
+  write_imagef(out, (int2)(x, y), pixel);
 }
 
diff --git a/data/kernels/demosaic_ppg.cl b/data/kernels/demosaic_ppg.cl
index ce88484fd87f..3baff695a837 100644
--- a/data/kernels/demosaic_ppg.cl
+++ b/data/kernels/demosaic_ppg.cl
@@ -74,7 +74,7 @@ green_equilibration_lavg(read_only image2d_t in,
     if(bufidx >= maxbuf) continue;
     const int xx = xul + bufidx % stride;
     const int yy = yul + bufidx / stride;
-    buffer[bufidx] = read_imagef(in, sampleri, (int2)(xx, yy)).x;
+    buffer[bufidx] = readsingle(in, xx, yy);
   }
 
   // center buffer around current x,y-Pixel
@@ -88,7 +88,15 @@ green_equilibration_lavg(read_only image2d_t in,
   const float maximum = 1.0f;
   float o = buffer[0];
 
-  if(c == 1 && (y & 1))
+  // check lines to be used as we do in cpu code
+  int oj = 2, oi = 2;
+  if(FC(oj, oi, filters) != GREEN) oj++;
+  if(FC(oj, oi, filters) != GREEN) oi++;
+  if(FC(oj, oi, filters) != GREEN) oj--;
+
+  int row_ok = (oj & 1) == (y & 1);
+  int col_ok = (oi & 1) == (x & 1);
+  if(row_ok && col_ok && x >= 2 && y >= 2 && x < width -2 && y < height-2)
   {
     const float o1_1 = buffer[-1 * stride - 1];
     const float o1_2 = buffer[-1 * stride + 1];
@@ -102,17 +110,16 @@ green_equilibration_lavg(read_only image2d_t in,
     const float m1 = (o1_1+o1_2+o1_3+o1_4)/4.0f;
     const float m2 = (o2_1+o2_2+o2_3+o2_4)/4.0f;
 
-    if ((m2 > 0.0f) && (m1 > 0.0f) && (m1 / m2 < maximum * 2.0f))
+    if((m2 > 0.0f) && (m1 > 0.0f) && (m1 / m2 < maximum * 2.0f))
     {
       const float c1 = (fabs(o1_1 - o1_2) + fabs(o1_1 - o1_3) + fabs(o1_1 - o1_4) + fabs(o1_2 - o1_3) + fabs(o1_3 - o1_4) + fabs(o1_2 - o1_4)) / 6.0f;
       const float c2 = (fabs(o2_1 - o2_2) + fabs(o2_1 - o2_3) + fabs(o2_1 - o2_4) + fabs(o2_2 - o2_3) + fabs(o2_3 - o2_4) + fabs(o2_2 - o2_4)) / 6.0f;
 
       if((o < maximum * 0.95f) && (c1 < maximum * thr) && (c2 < maximum * thr))
-        o *= m1/m2;
+          o *= m1/m2;
     }
   }
-
-  write_imagef (out, (int2)(x, y), fmax(o, 0.0f));
+  write_imagef (out, (int2)(x, y), o);
 }
 
 
@@ -135,11 +142,12 @@ green_equilibration_favg_reduce_first(read_only image2d_t in,
 
   const int c = FC(y, x, filters);
 
+  // make sure we add same number of greens for even and odd lines
   const int isinimage = (x < 2 * (width / 2) && y < 2 * (height / 2));
-  const int isgreen1 = (c == 1 && !(y & 1));
-  const int isgreen2 = (c == 1 && (y & 1));
+  const int isgreen1 = c == GREEN && !(y & 1);
+  const int isgreen2 = c == GREEN && (y & 1);
 
-  float pixel = read_imagef(in, sampleri, (int2)(x, y)).x;
+  float pixel = readsingle(in, x, y);
 
   buffer[l].x = isinimage && isgreen1 ? pixel : 0.0f;
   buffer[l].y = isinimage && isgreen2 ? pixel : 0.0f;
@@ -170,17 +178,19 @@ kernel void
 green_equilibration_favg_reduce_second(const global float2* input, global float2 *result, const int length, local float2 *buffer)
 {
   int x = get_global_id(0);
-  float2 sum = (float2)0.0f;
-
+  float m1 = 0.0f;
+  float m2 = 0.0f;
+  float sum1 = 0.0f;
+  float sum2 = 0.0f;
   while(x < length)
   {
-    sum += input[x];
-
+    Kahan_sum(sum1, m1, input[x].x);
+    Kahan_sum(sum2, m2, input[x].y);
     x += get_global_size(0);
   }
 
   int lid = get_local_id(0);
-  buffer[lid] = sum;
+  buffer[lid] = (float2)(sum1, sum2);
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -215,15 +225,15 @@ green_equilibration_favg_apply(read_only image2d_t in,
 
   if(x >= width || y >= height) return;
 
-  float pixel = read_imagef(in, sampleri, (int2)(x, y)).x;
+  float pixel = readsingle(in, x, y);
 
   const int c = FC(y, x, filters);
 
-  const int isgreen1 = (c == 1 && !(y & 1));
+  const int isgreen1 = (c == GREEN && !(y & 1)); // on even lines
 
   pixel *= (isgreen1 ? gr_ratio : 1.0f);
 
-  write_imagef (out, (int2)(x, y), fmax(pixel, 0.0f));
+  write_imagef (out, (int2)(x, y), pixel);
 }
 
 #define SWAP(a, b)                \
@@ -236,8 +246,13 @@ green_equilibration_favg_apply(read_only image2d_t in,
 constant int glim[5] = { 0, 1, 2, 1, 0 };
 
 kernel void
-pre_median(read_only image2d_t in, write_only image2d_t out, const int width, const int height,
-           const unsigned int filters, const float threshold, local float *buffer)
+pre_median(read_only image2d_t in,
+           write_only image2d_t out,
+           const int width,
+           const int height,
+           const unsigned int filters,
+           const float threshold,
+           local float *buffer)
 {
   const int x = get_global_id(0);
   const int y = get_global_id(1);
@@ -269,7 +284,7 @@ pre_median(read_only image2d_t in, write_only image2d_t out, const int width, co
     if(bufidx >= maxbuf) continue;
     const int xx = xul + bufidx % stride;
     const int yy = yul + bufidx / stride;
-    buffer[bufidx] = read_imagef(in, sampleri, (int2)(xx, yy)).x;
+    buffer[bufidx] = readsingle(in, xx, yy);
   }
 
   // center buffer around current x,y-Pixel
@@ -307,7 +322,7 @@ pre_median(read_only image2d_t in, write_only image2d_t out, const int width, co
 
   float color = (c & 1) ? (cnt == 1 ? med[4] - 64.0f : med[(cnt - 1) / 2]) : buffer[0];
 
-  write_imagef (out, (int2)(x, y), fmax(color, 0.0f));
+  write_imagef (out, (int2)(x, y), color);
 }
 #undef SWAP
 
@@ -325,7 +340,11 @@ pre_median(read_only image2d_t in, write_only image2d_t out, const int width, co
 // 3x3 median filter
 // uses a sorting network to sort entirely in registers with no branches
 kernel void
-color_smoothing(read_only image2d_t in, write_only image2d_t out, const int width, const int height, local float4 *buffer)
+color_smoothing(read_only image2d_t in,
+                write_only image2d_t out,
+                const int width,
+                const int height,
+                local float4 *buffer)
 {
   const int lxid = get_local_id(0);
   const int lyid = get_local_id(1);
@@ -352,17 +371,22 @@ color_smoothing(read_only image2d_t in, write_only image2d_t out, const int widt
     // don't read more than needed
     if(gx >= width + 1 || gy >= height + 1) continue;
 
-    buffer[bufidx] = read_imagef(in, sampleri, (int2)(gx, gy));
+    buffer[bufidx] = readpixel(in, gx, gy);
   }
 
   barrier(CLK_LOCAL_MEM_FENCE);
-
   if(x >= width || y >= height) return;
 
   // re-position buffer
   buffer += (lyid + 1) * buffwd + lxid + 1;
 
   float4 o = buffer[0];
+  // don't touch outermost 1 pixels, just copy
+  if(x < 1 || x >= width-1 || y < 1 || y >= height-1)
+  {
+    write_imagef(out, (int2) (x, y), o);
+    return;
+  }
 
   // 3x3 median for R
   float s0 = buffer[-buffwd - 1].x - buffer[-buffwd - 1].y;
@@ -395,7 +419,7 @@ color_smoothing(read_only image2d_t in, write_only image2d_t out, const int widt
   cas(s6, s4);
   cas(s4, s2);
 
-  o.x = fmax(s4 + o.y, 0.0f);
+  o.x = s4 + o.y;
 
 
   // 3x3 median for B
@@ -429,9 +453,9 @@ color_smoothing(read_only image2d_t in, write_only image2d_t out, const int widt
   cas(s6, s4);
   cas(s4, s2);
 
-  o.z = fmax(s4 + o.y, 0.0f);
+  o.z = s4 + o.y;
 
-  write_imagef(out, (int2) (x, y), fmax(o, 0.0f));
+  write_imagef(out, (int2) (x, y), o);
 }
 #undef cas
 
@@ -544,11 +568,16 @@ clip_and_zoom_demosaic_half_size(__read_only image2d_t in,
  * in (float) or (float4).x -> out (float4)
  */
 kernel void
-ppg_demosaic_green (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
-                    const unsigned int filters, local float *buffer)
+ppg_demosaic_green (read_only image2d_t in,
+                    write_only image2d_t out,
+                    const int width,
+                    const int height,
+                    const unsigned int filters,
+                    local float *buffer,
+                    const int border)
 {
-  const int x = get_global_id(0);
-  const int y = get_global_id(1);
+  const int col = get_global_id(0);
+  const int row = get_global_id(1);
   const int xlsz = get_local_size(0);
   const int ylsz = get_local_size(1);
   const int xlid = get_local_id(0);
@@ -577,7 +606,7 @@ ppg_demosaic_green (read_only image2d_t in, write_only image2d_t out, const int
     if(bufidx >= maxbuf) continue;
     const int xx = xul + bufidx % stride;
     const int yy = yul + bufidx / stride;
-    buffer[bufidx] = read_imagef(in, sampleri, (int2)(xx, yy)).x;
+    buffer[bufidx] = readsingle(in, xx, yy);
   }
 
   // center buffer around current x,y-Pixel
@@ -585,23 +614,22 @@ ppg_demosaic_green (read_only image2d_t in, write_only image2d_t out, const int
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  // make sure we dont write the outermost 3 pixels
-  if(x >= width - 3 || x < 3 || y >= height - 3 || y < 3) return;
+  if(col >= width - 3 || col < 3 || row >= height - 3 || row < 3) return;
+  if(col >= border && col < width - border && row >= border && row < height - border) return;
+
   // process all non-green pixels
-  const int row = y;
-  const int col = x;
   const int c = FC(row, col, filters);
-  float4 color; // output color
+  float4 color = 0.0f; // output color
 
   const float pc = buffer[0];
 
-  if     (c == 0) color.x = pc; // red
-  else if(c == 1) color.y = pc; // green1
-  else if(c == 2) color.z = pc; // blue
+  if     (c == RED) color.x = pc;
+  else if(c == GREEN) color.y = pc;
+  else if(c == BLUE) color.z = pc;
   else            color.y = pc; // green2
 
   // fill green layer for red and blue pixels:
-  if(c == 0 || c == 2)
+  if(c == RED || c == BLUE)
   {
     // look up horizontal and vertical neighbours, sharpened weight:
     const float pym  = buffer[-1 * stride];
@@ -640,7 +668,7 @@ ppg_demosaic_green (read_only image2d_t in, write_only image2d_t out, const int
       color.y = fmax(fmin(guessx*0.25f, M), m);
     }
   }
-  write_imagef (out, (int2)(x, y), fmax(color, 0.0f));
+  write_imagef (out, (int2)(col, row), color);
 }
 
 
@@ -649,12 +677,17 @@ ppg_demosaic_green (read_only image2d_t in, write_only image2d_t out, const int
  * in (float4) -> out (float4)
  */
 kernel void
-ppg_demosaic_redblue (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
-                      const unsigned int filters, local float4 *buffer)
+ppg_demosaic_redblue (read_only image2d_t in,
+                      write_only image2d_t out,
+                      const int width,
+                      const int height,
+                      const unsigned int filters,
+                      local float4 *buffer,
+                      const int border)
 {
   // image in contains full green and sparse r b
-  const int x = get_global_id(0);
-  const int y = get_global_id(1);
+  const int col = get_global_id(0);
+  const int row = get_global_id(1);
   const int xlsz = get_local_size(0);
   const int ylsz = get_local_size(1);
   const int xlid = get_local_id(0);
@@ -683,7 +716,7 @@ ppg_demosaic_redblue (read_only image2d_t in, write_only image2d_t out, const in
     if(bufidx >= maxbuf) continue;
     const int xx = xul + bufidx % stride;
     const int yy = yul + bufidx / stride;
-    buffer[bufidx] = read_imagef(in, sampleri, (int2)(xx, yy));
+    buffer[bufidx] = readpixel(in, xx, yy);
   }
 
   // center buffer around current x,y-Pixel
@@ -691,69 +724,66 @@ ppg_demosaic_redblue (read_only image2d_t in, write_only image2d_t out, const in
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  if(x >= width || y >= height) return;
-  const int row = y;
-  const int col = x;
+  if(col >= width || row >= height) return;
+  if(col >= border && col < width - border && row >= border && row < height - border) return;
+
   const int c = FC(row, col, filters);
   float4 color = buffer[0];
-  if(x == 0 || y == 0 || x == (width-1) || y == (height-1))
+  if(row > 0 && col > 0 && col < width - 1 && row < height - 1)
   {
-    write_imagef (out, (int2)(x, y), fmax(color, 0.0f));
-    return;
-  }
-
-  if(c == 1 || c == 3)
-  { // calculate red and blue for green pixels:
-    // need 4-nbhood:
-    const float4 nt = buffer[-stride];
-    const float4 nb = buffer[ stride];
-    const float4 nl = buffer[-1];
-    const float4 nr = buffer[ 1];
-    if(FC(row, col+1, filters) == 0) // red nb in same row
-    {
-      color.z = (nt.z + nb.z + 2.0f*color.y - nt.y - nb.y)*0.5f;
-      color.x = (nl.x + nr.x + 2.0f*color.y - nl.y - nr.y)*0.5f;
+    if(c == GREEN || c == 3)
+    { // calculate red and blue for green pixels:
+      // need 4-nbhood:
+      const float4 nt = buffer[-stride];
+      const float4 nb = buffer[ stride];
+      const float4 nl = buffer[-1];
+      const float4 nr = buffer[ 1];
+      if(FC(row, col+1, filters) == RED) // red nb in same row
+      {
+        color.z = (nt.z + nb.z + 2.0f*color.y - nt.y - nb.y)*0.5f;
+        color.x = (nl.x + nr.x + 2.0f*color.y - nl.y - nr.y)*0.5f;
+      }
+      else
+      { // blue nb
+        color.x = (nt.x + nb.x + 2.0f*color.y - nt.y - nb.y)*0.5f;
+        color.z = (nl.z + nr.z + 2.0f*color.y - nl.y - nr.y)*0.5f;
+      }
     }
     else
-    { // blue nb
-      color.x = (nt.x + nb.x + 2.0f*color.y - nt.y - nb.y)*0.5f;
-      color.z = (nl.z + nr.z + 2.0f*color.y - nl.y - nr.y)*0.5f;
-    }
-  }
-  else
-  {
-    // get 4-star-nbhood:
-    const float4 ntl = buffer[-stride - 1];
-    const float4 ntr = buffer[-stride + 1];
-    const float4 nbl = buffer[ stride - 1];
-    const float4 nbr = buffer[ stride + 1];
-
-    if(c == 0)
-    { // red pixel, fill blue:
-      const float diff1  = fabs(ntl.z - nbr.z) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y);
-      const float guess1 = ntl.z + nbr.z + 2.0f*color.y - ntl.y - nbr.y;
-      const float diff2  = fabs(ntr.z - nbl.z) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y);
-      const float guess2 = ntr.z + nbl.z + 2.0f*color.y - ntr.y - nbl.y;
-      if     (diff1 > diff2) color.z = guess2 * 0.5f;
-      else if(diff1 < diff2) color.z = guess1 * 0.5f;
-      else color.z = (guess1 + guess2)*0.25f;
-    }
-    else // c == 2, blue pixel, fill red:
     {
-      const float diff1  = fabs(ntl.x - nbr.x) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y);
-      const float guess1 = ntl.x + nbr.x + 2.0f*color.y - ntl.y - nbr.y;
-      const float diff2  = fabs(ntr.x - nbl.x) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y);
-      const float guess2 = ntr.x + nbl.x + 2.0f*color.y - ntr.y - nbl.y;
-      if     (diff1 > diff2) color.x = guess2 * 0.5f;
-      else if(diff1 < diff2) color.x = guess1 * 0.5f;
-      else color.x = (guess1 + guess2)*0.25f;
+      // get 4-star-nbhood:
+      const float4 ntl = buffer[-stride - 1];
+      const float4 ntr = buffer[-stride + 1];
+      const float4 nbl = buffer[ stride - 1];
+      const float4 nbr = buffer[ stride + 1];
+
+      if(c == RED)
+      { // red pixel, fill blue:
+        const float diff1  = fabs(ntl.z - nbr.z) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y);
+        const float guess1 = ntl.z + nbr.z + 2.0f*color.y - ntl.y - nbr.y;
+        const float diff2  = fabs(ntr.z - nbl.z) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y);
+        const float guess2 = ntr.z + nbl.z + 2.0f*color.y - ntr.y - nbl.y;
+        if     (diff1 > diff2) color.z = guess2 * 0.5f;
+        else if(diff1 < diff2) color.z = guess1 * 0.5f;
+        else color.z = (guess1 + guess2)*0.25f;
+      }
+      else // c == 2, blue pixel, fill red:
+      {
+        const float diff1  = fabs(ntl.x - nbr.x) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y);
+        const float guess1 = ntl.x + nbr.x + 2.0f*color.y - ntl.y - nbr.y;
+        const float diff2  = fabs(ntr.x - nbl.x) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y);
+        const float guess2 = ntr.x + nbl.x + 2.0f*color.y - ntr.y - nbl.y;
+        if     (diff1 > diff2) color.x = guess2 * 0.5f;
+        else if(diff1 < diff2) color.x = guess1 * 0.5f;
+        else color.x = (guess1 + guess2)*0.25f;
+      }
     }
   }
-  write_imagef (out, (int2)(x, y), fmax(color, 0.0f));
+  write_imagef (out, (int2)(col, row), color);
 }
 
 /**
- * Demosaic image border
+ * Demosaic image border for the three outermost photosites
  */
 kernel void
 border_interpolate(read_only image2d_t in, write_only image2d_t out, const int width, const int height, const unsigned int filters)
@@ -762,27 +792,23 @@ border_interpolate(read_only image2d_t in, write_only image2d_t out, const int w
   const int y = get_global_id(1);
 
   if(x >= width || y >= height) return;
-
-  const int avgwindow = 1;
-  const int border = 3;
-
-  if(x >= border && x < width-border && y >= border && y < height-border) return;
+  if(x >= 3 && x < width-3 && y >= 3 && y < height-3) return;
 
   float4 o;
   float sum[4] = { 0.0f };
   int count[4] = { 0 };
 
-  for (int j=y-avgwindow; j<=y+avgwindow; j++) for (int i=x-avgwindow; i<=x+avgwindow; i++)
+  for(int j=y-1; j<=y+1; j++) for (int i=x-1; i<=x+1; i++)
   {
-    if (j>=0 && i>=0 && j<height && i<width)
+    if(j>=0 && i>=0 && j<height && i<width)
     {
       const int f = FC(j,i,filters);
-      sum[f] += fmax(0.0f, read_imagef(in, sampleri, (int2)(i, j)).x);
+      sum[f] += readsingle(in, i, j);
       count[f]++;
     }
   }
 
-  const float i = fmax(0.0f, read_imagef(in, sampleri, (int2)(x, y)).x);
+  const float i = readsingle(in, x, y);
   o.x = count[0] > 0 ? sum[0]/count[0] : i;
   o.y = count[1]+count[3] > 0 ? (sum[1]+sum[3])/(count[1]+count[3]) : i;
   o.z = count[2] > 0 ? sum[2]/count[2] : i;
diff --git a/data/kernels/demosaic_rcd.cl b/data/kernels/demosaic_rcd.cl
index a6202149cd13..73b885a5a4af 100644
--- a/data/kernels/demosaic_rcd.cl
+++ b/data/kernels/demosaic_rcd.cl
@@ -32,7 +32,7 @@ __kernel void rcd_populate (__read_only image2d_t in, global float *cfa, global
   const int col = get_global_id(0);
   const int row = get_global_id(1);
   if(col >= w || row >= height) return;
-  const float val = scale * fmax(0.0f, read_imagef(in, sampleri, (int2)(col, row)).x);
+  const float val = scale * fmax(0.0f, readsingle(in, col, row));
   const int color = FC(row, col, filters);
 
   global float *rgbcol = rgb0;
@@ -51,7 +51,7 @@ __kernel void rcd_write_output (__write_only image2d_t out, global float *rgb0,
   if(!(col >= border && col < w - border && row >= border && row < height - border)) return;
   const int idx = mad24(row, w, col);
 
-  write_imagef(out, (int2)(col, row), (float4)(fmax(scale * rgb0[idx], 0.0f), fmax(scale * rgb1[idx], 0.0f), fmax(scale * rgb2[idx], 0.0f), 0.0f));
+  write_imagef(out, (int2)(col, row), (float4)(scale * rgb0[idx], scale * rgb1[idx], scale * rgb2[idx], 0.0f));
 }
 
 #define eps 1e-5f              // Tolerance to avoid dividing by zero
@@ -278,9 +278,9 @@ __kernel void write_blended_dual(__read_only image2d_t high,
   const int row = get_global_id(1);
   if((col >= w) || (row >= height)) return;
 
-  const float4 high_val = read_imagef(high, sampleri, (int2)(col, row));
-  const float4 low_val = read_imagef(low, sampleri, (int2)(col, row));
-  const float4 blender = clipf(mask[mad24(row, w, col)]);
+  const float4 high_val = readpixel(high, col, row);
+  const float4 low_val = readpixel(low, col, row);
+  const float4 blender = (float4)clipf(mask[mad24(row, w, col)]);
   float4 data = mix(low_val, high_val, blender);
   data.w = showmask ? blender.x : 0.0f;
   write_imagef(out, (int2)(col, row), data);
@@ -297,8 +297,8 @@ __kernel void calc_Y0_mask(global float *mask,
   if((col >= w) || (row >= height)) return;
   const int idx = mad24(row, w, col);
 
-  const float4 pt = wb * fmax(0.0f, read_imagef(in, sampleri, (int2)(col, row)));
-  mask[idx] = dtcl_sqrt(0.33333333f *(pt.x + pt.y + pt.z));
+  const float4 pt = wb * readpixel(in, col, row);
+  mask[idx] = dtcl_sqrt(0.33333333f * fmax(0.0f, (pt.x + pt.y + pt.z)));
 }
 
 __kernel void calc_scharr_mask(global float *in, global float *out, const int w, const int height)
@@ -331,203 +331,6 @@ __kernel void calc_detail_blend(global float *in, global float *out, const int w
   out[idx] = detail ? blend : 1.0f - blend;
 }
 
-kernel void rcd_border_green(read_only image2d_t in, write_only image2d_t out, const int width, const int height,
-                    const unsigned int filters, local float *buffer, const int border)
-{
-  const int col = get_global_id(0);
-  const int row = get_global_id(1);
-  const int xlsz = get_local_size(0);
-  const int ylsz = get_local_size(1);
-  const int xlid = get_local_id(0);
-  const int ylid = get_local_id(1);
-  const int xgid = get_group_id(0);
-  const int ygid = get_group_id(1);
-
-  // individual control variable in this work group and the work group size
-  const int l = mad24(ylid, xlsz, xlid);
-  const int lsz = mul24(xlsz, ylsz);
-
-  // stride and maximum capacity of local buffer
-  // cells of 1*float per pixel with a surrounding border of 3 cells
-  const int stride = xlsz + 2*3;
-  const int maxbuf = mul24(stride, ylsz + 2*3);
-
-  // coordinates of top left pixel of buffer
-  // this is 3 pixel left and above of the work group origin
-  const int xul = mul24(xgid, xlsz) - 3;
-  const int yul = mul24(ygid, ylsz) - 3;
-
-  // populate local memory buffer
-  for(int n = 0; n <= maxbuf/lsz; n++)
-  {
-    const int bufidx = mad24(n, lsz, l);
-    if(bufidx >= maxbuf) continue;
-    const int xx = xul + bufidx % stride;
-    const int yy = yul + bufidx / stride;
-    buffer[bufidx] = fmax(0.0f, read_imagef(in, sampleri, (int2)(xx, yy)).x);
-  }
-
-  // center buffer around current x,y-Pixel
-  buffer += mad24(ylid + 3, stride, xlid + 3);
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  if(col >= width - 3 || col < 3 || row >= height - 3 || row < 3) return;
-  if(col >= border && col < width - border && row >= border && row < height - border) return;
-
-  // process all non-green pixels
-  const int c = FC(row, col, filters);
-  float4 color = 0.0f; // output color
-
-  const float pc = buffer[0];
-
-  if     (c == RED) color.x = pc;
-  else if(c == GREEN) color.y = pc;
-  else if(c == BLUE) color.z = pc;
-  else            color.y = pc; // green2
-
-  // fill green layer for red and blue pixels:
-  if(c == RED || c == BLUE)
-  {
-    // look up horizontal and vertical neighbours, sharpened weight:
-    const float pym  = buffer[-1 * stride];
-    const float pym2 = buffer[-2 * stride];
-    const float pym3 = buffer[-3 * stride];
-    const float pyM  = buffer[ 1 * stride];
-    const float pyM2 = buffer[ 2 * stride];
-    const float pyM3 = buffer[ 3 * stride];
-    const float pxm  = buffer[-1];
-    const float pxm2 = buffer[-2];
-    const float pxm3 = buffer[-3];
-    const float pxM  = buffer[ 1];
-    const float pxM2 = buffer[ 2];
-    const float pxM3 = buffer[ 3];
-    const float guessx = (pxm + pc + pxM) * 2.0f - pxM2 - pxm2;
-    const float diffx  = (fabs(pxm2 - pc) +
-                          fabs(pxM2 - pc) +
-                          fabs(pxm  - pxM)) * 3.0f +
-                         (fabs(pxM3 - pxM) + fabs(pxm3 - pxm)) * 2.0f;
-    const float guessy = (pym + pc + pyM) * 2.0f - pyM2 - pym2;
-    const float diffy  = (fabs(pym2 - pc) +
-                          fabs(pyM2 - pc) +
-                          fabs(pym  - pyM)) * 3.0f +
-                         (fabs(pyM3 - pyM) + fabs(pym3 - pym)) * 2.0f;
-    if(diffx > diffy)
-    {
-      // use guessy
-      const float m = fmin(pym, pyM);
-      const float M = fmax(pym, pyM);
-      color.y = fmax(fmin(guessy*0.25f, M), m);
-    }
-    else
-    {
-      const float m = fmin(pxm, pxM);
-      const float M = fmax(pxm, pxM);
-      color.y = fmax(fmin(guessx*0.25f, M), m);
-    }
-  }
-  write_imagef (out, (int2)(col, row), fmax(color, 0.0f));
-}
-kernel void rcd_border_redblue(read_only image2d_t in, write_only image2d_t out, const int width, const int height,
-                      const unsigned int filters, local float4 *buffer, const int border)
-{
-  // image in contains full green and sparse r b
-  const int col = get_global_id(0);
-  const int row = get_global_id(1);
-  const int xlsz = get_local_size(0);
-  const int ylsz = get_local_size(1);
-  const int xlid = get_local_id(0);
-  const int ylid = get_local_id(1);
-  const int xgid = get_group_id(0);
-  const int ygid = get_group_id(1);
-
-  // individual control variable in this work group and the work group size
-  const int l = mad24(ylid, xlsz, xlid);
-  const int lsz = mul24(xlsz, ylsz);
-
-  // stride and maximum capacity of local buffer
-  // cells of float4 per pixel with a surrounding border of 1 cell
-  const int stride = xlsz + 2;
-  const int maxbuf = mul24(stride, ylsz + 2);
-
-  // coordinates of top left pixel of buffer
-  // this is 1 pixel left and above of the work group origin
-  const int xul = mul24(xgid, xlsz) - 1;
-  const int yul = mul24(ygid, ylsz) - 1;
-
-  // populate local memory buffer
-  for(int n = 0; n <= maxbuf/lsz; n++)
-  {
-    const int bufidx = mad24(n, lsz, l);
-    if(bufidx >= maxbuf) continue;
-    const int xx = xul + bufidx % stride;
-    const int yy = yul + bufidx / stride;
-    buffer[bufidx] = fmax(0.0f, read_imagef(in, sampleri, (int2)(xx, yy)));
-  }
-
-  // center buffer around current x,y-Pixel
-  buffer += mad24(ylid + 1, stride, xlid + 1);
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  if(col >= width || row >= height) return;
-  if(col >= border && col < width - border && row >= border && row < height - border) return;
-
-  const int c = FC(row, col, filters);
-  float4 color = buffer[0];
-  if(row > 0 && col > 0 && col < width - 1 && row < height - 1)
-  {
-    if(c == GREEN || c == 3)
-    { // calculate red and blue for green pixels:
-      // need 4-nbhood:
-      const float4 nt = buffer[-stride];
-      const float4 nb = buffer[ stride];
-      const float4 nl = buffer[-1];
-      const float4 nr = buffer[ 1];
-      if(FC(row, col+1, filters) == RED) // red nb in same row
-      {
-        color.z = (nt.z + nb.z + 2.0f*color.y - nt.y - nb.y)*0.5f;
-        color.x = (nl.x + nr.x + 2.0f*color.y - nl.y - nr.y)*0.5f;
-      }
-      else
-      { // blue nb
-        color.x = (nt.x + nb.x + 2.0f*color.y - nt.y - nb.y)*0.5f;
-        color.z = (nl.z + nr.z + 2.0f*color.y - nl.y - nr.y)*0.5f;
-      }
-    }
-    else
-    {
-      // get 4-star-nbhood:
-      const float4 ntl = buffer[-stride - 1];
-      const float4 ntr = buffer[-stride + 1];
-      const float4 nbl = buffer[ stride - 1];
-      const float4 nbr = buffer[ stride + 1];
-
-      if(c == RED)
-      { // red pixel, fill blue:
-        const float diff1  = fabs(ntl.z - nbr.z) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y);
-        const float guess1 = ntl.z + nbr.z + 2.0f*color.y - ntl.y - nbr.y;
-        const float diff2  = fabs(ntr.z - nbl.z) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y);
-        const float guess2 = ntr.z + nbl.z + 2.0f*color.y - ntr.y - nbl.y;
-        if     (diff1 > diff2) color.z = guess2 * 0.5f;
-        else if(diff1 < diff2) color.z = guess1 * 0.5f;
-        else color.z = (guess1 + guess2)*0.25f;
-      }
-      else // c == 2, blue pixel, fill red:
-      {
-        const float diff1  = fabs(ntl.x - nbr.x) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y);
-        const float guess1 = ntl.x + nbr.x + 2.0f*color.y - ntl.y - nbr.y;
-        const float diff2  = fabs(ntr.x - nbl.x) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y);
-        const float guess2 = ntr.x + nbl.x + 2.0f*color.y - ntr.y - nbl.y;
-        if     (diff1 > diff2) color.x = guess2 * 0.5f;
-        else if(diff1 < diff2) color.x = guess1 * 0.5f;
-        else color.x = (guess1 + guess2)*0.25f;
-      }
-    }
-  }
-  write_imagef (out, (int2)(col, row), fmax(color, 0.0f));
-}
-
 kernel void demosaic_box3(read_only image2d_t in,
                           write_only image2d_t out,
                           const int width,
@@ -547,7 +350,7 @@ kernel void demosaic_box3(read_only image2d_t in,
       if(x >= 0 && y >= 0 && x < width && y < height)
       {
         const int color = fcol(y, x, filters, xtrans);
-        sum[color] += fmax(0.0f, read_imagef(in, sampleri, (int2)(x, y)).x);
+        sum[color] += readsingle(in, x, y);
         cnt[color] += 1.0f;
       }
     }
@@ -556,3 +359,15 @@ kernel void demosaic_box3(read_only image2d_t in,
   write_imagef(out, (int2)(col, row), rgb);
 }
 
+kernel void
+image_positive(read_only image2d_t in, write_only image2d_t out, const int width, const int height)
+{
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+
+  if(x >= width || y >= height) return;
+
+  float4 pixel = fmax(0.0f, readpixel(in, x, y));
+  write_imagef(out, (int2)(x, y), pixel);
+}
+
diff --git a/data/kernels/demosaic_vng.cl b/data/kernels/demosaic_vng.cl
index 1cbff4807f1c..3ff6482a3665 100644
--- a/data/kernels/demosaic_vng.cl
+++ b/data/kernels/demosaic_vng.cl
@@ -28,7 +28,7 @@ vng_lin_interpolate(read_only image2d_t in,
                     global const unsigned char (*const xtrans)[6],
                     global const int (*const lookup)[16][32],
                     local float *buffer,
-                    const int equil)
+                    const int only_linear)
 {
   const int x = get_global_id(0);
   const int y = get_global_id(1);
@@ -59,7 +59,7 @@ vng_lin_interpolate(read_only image2d_t in,
     if(bufidx >= maxbuf) continue;
     const int xx = xul + bufidx % stride;
     const int yy = yul + bufidx / stride;
-    buffer[bufidx] = fmax(0.0f, read_imagef(in, sampleri, (int2)(xx, yy)).x);
+    buffer[bufidx] = readsingle(in, xx, yy);
   }
 
   // center buffer around current x,y-Pixel
@@ -73,31 +73,40 @@ vng_lin_interpolate(read_only image2d_t in,
 
   const bool is_xtrans = filters == 9; 
   const int colors = is_xtrans ? 3 : 4;
-  const int av = (filters == 9) ? 2 : 1;
   const int size = is_xtrans ? 6 : 16;
 
   float sum[4] = { 0.0f };
   float o[4] = { 0.0f };
 
+  // approximate outermost single pixels
   if(x < 1 || x >= width-1 || y < 1 || y >= height-1)
   {
     int count[4] = { 0 };
-    for(int j = y-av; j <= y+av; j++)
+    for(int j = y-1; j <= y+1; j++)
     {
-      for(int i = x-av; i <= x+av; i++)
+      for(int i = x-1; i <= x+1; i++)
       {
         if(j >= 0 && i >= 0 && j < height && i < width)
         {
           const int f = fcol(j, i, filters, xtrans);
-          sum[f] += fmax(0.0f, read_imagef(in, sampleri, (int2)(i, j)).x);
+          sum[f] += readsingle(in, i, j);
           count[f]++;
         }
       }
     }
+    const int f = fcol(y, x, filters, xtrans);
+    // for current cell, copy the current sensor's color data,
+    // interpolate the other two colors from surrounding pixels of
+    // their color
     for(int c = 0; c < colors; c++)
-      o[c] = sum[c] / fmax(1.0f, (float)count[c]);
+    {
+      if(c != f && count[c] != 0)
+        o[c] = sum[c] / count[c];
+      else
+        o[c] = readsingle(in, x, y);
+    }
   }
-  else
+  else // do the bilinear stuff
   {
     global const int *ip = lookup[y % size][x % size];
     const int num_pixels = ip[0];
@@ -120,17 +129,18 @@ vng_lin_interpolate(read_only image2d_t in,
     o[ip[0]] = buffer[0];
   }
 
-  if(!is_xtrans && equil)
+  if(only_linear)
   {
-    o[1] = 0.5f * (o[1] + o[3]);
+    o[0] = o[0];
+    o[1] = is_xtrans ? o[1] : 0.5f * (o[1] + o[3]);
+    o[2] = o[2];
     o[3] = 0.0f;
   }
   write_imagef(out, (int2)(x, y), (float4)(o[0], o[1], o[2], o[3]));
 }
 
 kernel void
-vng_interpolate(read_only image2d_t input,
-                read_only image2d_t in,
+vng_interpolate(read_only image2d_t in,
                 write_only image2d_t out,
                 const int width,
                 const int height,
@@ -169,7 +179,7 @@ vng_interpolate(read_only image2d_t input,
     if(bufidx >= maxbuf) continue;
     const int xx = xul + bufidx % stride;
     const int yy = yul + bufidx / stride;
-    const float4 pixel = fmax(0.0f, read_imagef(in, sampleri, (int2)(xx, yy)));
+    const float4 pixel = readpixel(in, xx, yy);
     vstore4(pixel, bufidx, buffer);
   }
 
@@ -179,10 +189,21 @@ vng_interpolate(read_only image2d_t input,
   barrier(CLK_LOCAL_MEM_FENCE);
   if(x >= width || y >= height) return;
 
-  const bool is_xtrans = filters == 9; 
-  const int colors = is_xtrans ? 3 : 4;
-  const int av = is_xtrans ? 2 : 1;
+  const bool is_xtrans = filters == 9;
+  // we don't touch data at the outermost 2 pixels
+  if(x < 2 || x >= width-2 || y < 2 || y >= height-2)
+  {
+    float4 val = readpixel(in, x, y);
+    if(!is_xtrans)
+    {
+      val.y = 0.5f * (val.y + val.w);
+      val.w = 0.0f;
+    }
+    write_imagef(out, (int2)(x, y), val);
+    return;
+  }
 
+  const int colors = is_xtrans ? 3 : 4;
   const int prow = is_xtrans ? 6 : 8;
   const int pcol = is_xtrans ? 6 : 2;
 
@@ -227,41 +248,14 @@ vng_interpolate(read_only image2d_t input,
     if(gmax < gval[g]) gmax = gval[g];
   }
 
-  const bool border = x < 2 || x >= width-2 || y < 2 || y >= height-2;
-  float b[4] = { 0.0f };
-  // Ok lets calculate the border pixel, we'll need it soon
-  if(border)
-  {
-    int count[4] = { 0 };
-    for(int j = y-av; j <= y+av; j++)
-    {
-      for(int i = x-av; i <= x+av; i++)
-      {
-        if(j >= 0 && i >= 0 && j < height && i < width)
-        {
-          const int f = fcol(j, i, filters, xtrans);
-          b[f] += fmax(0.0f, read_imagef(input, sampleri, (int2)(i, j)).x);
-          count[f]++;
-        }
-      }
-    }
-    for(int c = 0; c < colors; c++)
-      b[c] /= fmax(1.0f, (float)count[c]);
-  }
-
+  float o[4] = { 0.0f };
   if(gmax == 0.0f)
   {
-    if(!border)
-    {
-      for(int c = 0; c < colors; c++)
-        b[c] = buffer[c];
-    }
-    if(!is_xtrans)
-    {
-      b[1] = 0.5f * (b[1] + b[3]);
-      b[3] = 0.0f;
-    }
-    write_imagef(out, (int2)(x, y), (float4)(b[0], b[1], b[2], b[3]));
+    o[0] = buffer[0];
+    o[1] = is_xtrans ? buffer[1] : 0.5f * (buffer[1] + buffer[3]);
+    o[2] = buffer[2];
+    o[3] = 0.0f;
+    write_imagef(out, (int2)(x, y), (float4)(o[0], o[1], o[2], o[3]));
     return;
   }
 
@@ -303,25 +297,15 @@ vng_interpolate(read_only image2d_t input,
   }
 
   // save to output
-  float o[4] = { 0.0f };
   for(int c = 0; c < colors; c++)
   {
     float tot = buffer[color];
     if(c != color) tot += (sum[c] - sum[color]) / num;
-    o[c] = fmax(0.0f, tot);
-  }
-
-  if(border)
-  {
-    for(int c = 0; c < colors; c++)
-      o[c] = b[c];
+    o[c] = tot;
   }
 
-  if(!is_xtrans)
-  {
-    o[1] = 0.5f * (o[1] + o[3]);
-    o[3] = 0.0f;
-  }
+  o[1] = is_xtrans ? o[1] : 0.5f * (o[1] + o[3]);
+  o[3] = 0.0f;
   write_imagef(out, (int2)(x, y), (float4)(o[0], o[1], o[2], 0.0f));
 }
 
@@ -362,7 +346,7 @@ clip_and_zoom_demosaic_third_size_xtrans(read_only image2d_t in, write_only imag
     {
       for(int j = 0; j < 3; j++)
         for(int i = 0; i < 3; i++)
-          col[FCxtrans(yy + j, xx + i, xtrans)] += fmax(0.0f, read_imagef(in, sampleri, (int2)(xx + i, yy + j)).x);
+          col[FCxtrans(yy + j, xx + i, xtrans)] += fmax(0.0f, readsingle(in, xx + i, yy + j));
       num++;
     }
 
diff --git a/data/kernels/guided_filter.cl b/data/kernels/guided_filter.cl
index 36ca2cffd284..0c105afdb0ed 100644
--- a/data/kernels/guided_filter.cl
+++ b/data/kernels/guided_filter.cl
@@ -40,16 +40,6 @@ kernel void guided_filter_split_rgb_image(const int width,
 }
 
 
-// Kahan summation algorithm
-#define Kahan_sum(m, c, add)        \
-  {                                 \
-    const float t1 = (add) - (c);   \
-    const float t2 = (m) + t1;      \
-    c = (t2 - m) - t1;              \
-    m = t2;                         \
-  }
-
-
 kernel void guided_filter_box_mean_x(const int width,
                                      const int height,
                                      read_only image2d_t in,
diff --git a/src/common/darktable.h b/src/common/darktable.h
index 0b8baa93809f..44700bb14602 100644
--- a/src/common/darktable.h
+++ b/src/common/darktable.h
@@ -249,6 +249,38 @@ char *dt_version_major_minor();
 #define INVPHI 0.61803398874989479F
 #endif
 
+
+/* Some CPU demosaicers do internal tiling, this decreases mem pressure while
+   increasing performance as processed data is available in cpu cache.
+   The default values have been calculated for a cachesize of 1MB per thread,
+   this is mostly still valid in 2026 but you may tune for performance.
+
+   Due to internal code AMAZETS should be a multiple of 32, for RCD and LMMSE
+   a multiple of 8.
+
+   Note: for the curious, if we calculate the tilesizes at runtime we loose the
+   performance gain as the compiler can't optimize that much.
+*/
+#ifndef DT_RCD_TILESIZE
+#define DT_RCD_TILESIZE 112
+#endif
+
+#ifndef DT_LMMSE_TILESIZE
+#define DT_LMMSE_TILESIZE 136
+#endif
+
+#ifndef AMAZETS
+#define AMAZETS 160
+#endif
+
+#ifndef DT_MARKESTEIJN_TS
+#define DT_MARKESTEIJN_TS 122
+#endif
+
+#ifndef DT_FDC_TS
+#define DT_FDC_TS 122
+#endif
+
 // NaN-safe clamping (NaN compares false, and will thus result in H)
 #define CLAMPS(A, L, H) ((A) > (L) ? ((A) < (H) ? (A) : (H)) : (L))
 
diff --git a/src/common/interpolation.c b/src/common/interpolation.c
index ced583c98a11..5137bd4da9c2 100644
--- a/src/common/interpolation.c
+++ b/src/common/interpolation.c
@@ -1142,11 +1142,9 @@ void dt_interpolation_resample(const dt_interpolation_t *itor,
       // Output pixel is ready
       const size_t baseidx = (size_t)oy * out_stride_floats + (size_t)ox * 4;
 
-      // Clip negative RGB that may be produced by Lanczos undershooting
-      // Negative RGB are invalid values no matter the RGB space (light is positive)
       dt_aligned_pixel_t pixel;
       for_each_channel(c, aligned(vs:16))
-        pixel[c] = fmaxf(0.0f, vs[c]);
+        pixel[c] = vs[c];
       copy_pixel_nontemporal(out + baseidx, pixel);
 
       // Reset vertical resampling context
@@ -1565,7 +1563,7 @@ void dt_interpolation_resample_1c(const dt_interpolation_t *itor,
       // Output pixel is ready
       float *o = (float *)((char *)out + (size_t)oy * out_stride
                            + (size_t)ox * sizeof(float));
-      *o = fmaxf(0.0f, vs);
+      *o = vs;
 
       // Reset vertical resampling context
       viidx -= vl;
diff --git a/src/develop/masks/detail.c b/src/develop/masks/detail.c
index 838df90e63df..6c7aec6fc899 100644
--- a/src/develop/masks/detail.c
+++ b/src/develop/masks/detail.c
@@ -122,19 +122,19 @@ float *dt_masks_calc_scharr_mask(dt_dev_pixelpipe_t *pipe,
   }
 
   const gboolean wboff = !pipe->dsc.temperature.enabled || !rawmode;
-  const dt_aligned_pixel_t wb = { wboff ? 1.0f : pipe->dsc.temperature.coeffs[0],
-                                  wboff ? 1.0f : pipe->dsc.temperature.coeffs[1],
-                                  wboff ? 1.0f : pipe->dsc.temperature.coeffs[2] };
+  const dt_aligned_pixel_t wb = { wboff ? 1.0f : 1.0f / pipe->dsc.temperature.coeffs[0],
+                                  wboff ? 1.0f : 1.0f / pipe->dsc.temperature.coeffs[1],
+                                  wboff ? 1.0f : 1.0f / pipe->dsc.temperature.coeffs[2] };
 
 
   DT_OMP_FOR_SIMD(aligned(tmp : 64))
   for(size_t idx =0; idx < msize; idx++)
   {
-    const float val = fmaxf(0.0f, src[4 * idx] / wb[0])
-                    + fmaxf(0.0f, src[4 * idx + 1] / wb[1])
-                    + fmaxf(0.0f, src[4 * idx + 2] / wb[2]);
+    const float val = src[4 * idx] * wb[0]
+                    + src[4 * idx + 1] * wb[1]
+                    + src[4 * idx + 2] * wb[2];
     // add a gamma. sqrtf should make noise variance the same for all image
-    tmp[idx] = sqrtf(val / 3.0f);
+    tmp[idx] = sqrtf(0.33333333f * fmaxf(0.0f, val));
   }
 
   DT_OMP_FOR()
diff --git a/src/develop/pixelpipe_hb.c b/src/develop/pixelpipe_hb.c
index dad1946df664..d438e7777db3 100644
--- a/src/develop/pixelpipe_hb.c
+++ b/src/develop/pixelpipe_hb.c
@@ -1236,7 +1236,8 @@ static inline gboolean _module_pipe_stop(dt_dev_pixelpipe_t *pipe, float *input)
 void dt_dev_prepare_piece_cfa(dt_dev_pixelpipe_iop_t *piece, const dt_iop_roi_t *roi)
 {
   dt_iop_module_t *module = piece->module;
-  if(module && module->input_colorspace(module, piece->pipe, piece) == IOP_CS_RAW)
+  if(module && (module->input_colorspace(module, piece->pipe, piece) == IOP_CS_RAW
+              || module->default_colorspace(module, piece->pipe, piece) == IOP_CS_RAW))
   {
     piece->filters = dt_rawspeed_crop_dcraw_filters(piece->pipe->dsc.filters, roi->x, roi->y);
     for(int ii = 0; ii < 6; ++ii)
@@ -2335,6 +2336,17 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe,
                     if(terr == CL_SUCCESS)
                     {
                       module->process(module, piece, clindata, cpudata, &roi_in, roi_out);
+                      if(cst_out == IOP_CS_LAB)
+                      {
+                        for(size_t k = 0; k < (size_t)ow * oh * 4; k+=4)
+                        {
+                          dt_aligned_pixel_t XYZ;
+                          dt_Lab_to_XYZ(cpudata + k, XYZ);
+                          dt_XYZ_to_linearRGB(XYZ, cpudata + k);
+                          dt_Lab_to_XYZ(cloutdata + k, XYZ);
+                          dt_XYZ_to_linearRGB(XYZ, cloutdata + k);
+                        }
+                      }
                       dt_dump_pipe_diff_pfm(module->op, cloutdata, cpudata, ow, oh, cho,
                                             dt_dev_pixelpipe_type_to_str(pipe->type));                  }
                   }
@@ -3622,7 +3634,7 @@ int dt_dev_write_scharr_mask_cl(dt_dev_pixelpipe_iop_t *piece,
         wboff ? 1.0f : 1.0f / p->dsc.temperature.coeffs[2], 1.0f };
 
   err = dt_opencl_enqueue_kernel_2d_args(devid, darktable.opencl->blendop->kernel_calc_Y0_mask, width, height,
-     CLARG(tmp), CLARG(in), CLARG(width), CLARG(height), CLFLARRAY(4, wb));
+     CLARG(tmp), CLARG(in), CLARG(width), CLARG(height), CLARG(wb));
   if(err != CL_SUCCESS) goto error;
 
   err = dt_opencl_enqueue_kernel_2d_args(devid,
diff --git a/src/iop/demosaic.c b/src/iop/demosaic.c
index 34e91fba86dd..a43477d4cb96 100644
--- a/src/iop/demosaic.c
+++ b/src/iop/demosaic.c
@@ -51,11 +51,8 @@ DT_MODULE_INTROSPECTION(6, dt_iop_demosaic_params_t)
 
 #define DT_DEMOSAIC_XTRANS 1024 // masks for non-Bayer demosaic ops
 #define DT_DEMOSAIC_DUAL 2048   // masks for dual demosaicing methods
-#define DT_REDUCESIZE_MIN 64
 
-// these are highly depending on CPU architecture (cache size)
-#define DT_RCD_TILESIZE 112
-#define DT_LMMSE_TILESIZE 136
+// #define DT_DEMOSAIC_POSITIVE
 
 typedef enum dt_iop_demosaic_method_t
 {
@@ -235,8 +232,6 @@ typedef struct dt_iop_demosaic_global_data_t
   int kernel_rcd_step_4_2;
   int kernel_rcd_step_5_1;
   int kernel_rcd_step_5_2;
-  int kernel_rcd_border_redblue;
-  int kernel_rcd_border_green;
   int kernel_demosaic_box3;
   int kernel_write_blended_dual;
   int gaussian_9x9_mul;
@@ -247,6 +242,7 @@ typedef struct dt_iop_demosaic_global_data_t
   int show_blend_mask;
   int capture_result;
   int final_blend;
+  int image_positive;
   float *gauss_coeffs;
 } dt_iop_demosaic_global_data_t;
 
@@ -726,7 +722,10 @@ void process(dt_iop_module_t *self,
   const gboolean do_capture = !passthru &&  !is_4bayer && !show_dual && !run_fast && d->cs_enabled;
   const gboolean greens = is_bayer && d->green_eq != DT_IOP_GREEN_EQ_NO && no_masking && !run_fast && !true_monochrome;
 
-  const float procmax = dt_iop_get_processed_maximum(piece);
+  float procmax = dt_iop_get_processed_maximum(piece);
+  if(procmax < 1.5f) procmax = 2.0f;
+  else if(procmax < 3.0f) procmax = 4.0f;
+  else procmax = 8.0f;
   const float procmin = dt_iop_get_processed_minimum(piece);
   const int exif_iso = img->exif_iso;
 
@@ -858,7 +857,7 @@ void process(dt_iop_module_t *self,
         if(method == DT_IOP_DEMOSAIC_FDC)
           xtrans_fdc_interpolate(t_out, t_in, width, t_rows, xtrans, exif_iso);
         else if(method == DT_IOP_DEMOSAIC_MARKESTEIJN || method == DT_IOP_DEMOSAIC_MARKESTEIJN_3)
-          xtrans_markesteijn_interpolate(t_out, t_in, width, t_rows, xtrans, passes);
+          xtrans_markesteijn_interpolate(t_out, t_in, width, t_rows, xtrans, passes, filters);
         else
           vng_interpolate(t_out, t_in, width, t_rows, filters, xtrans, FALSE);
       }
@@ -878,7 +877,7 @@ void process(dt_iop_module_t *self,
         else if(method == DT_IOP_DEMOSAIC_LMMSE)
           lmmse_demosaic(t_out, t_in, width, t_rows, filters, d->lmmse_refine, procmax);
         else if(method != DT_IOP_DEMOSAIC_AMAZE)
-          demosaic_ppg(t_out, t_in, width, t_rows, filters, d->median_thrs);
+          demosaic_ppg(t_out, t_in, width, t_rows, filters, d->median_thrs, 100000);
         else
           amaze_demosaic(t_in, t_out, width, t_rows, filters, procmin);
       }
@@ -908,6 +907,13 @@ void process(dt_iop_module_t *self,
     dt_iop_clip_and_zoom_roi((float *)o, out, roi_out, roi_in);
     dt_free_align(out);
   }
+
+#ifdef DT_DEMOSAIC_POSITIVE
+  float *oo = (float *)o;
+  DT_OMP_FOR_SIMD(aligned(oo : 64))
+  for(size_t k = 0; k < (size_t)roi_out->width * roi_out->height * 4; k++)
+    oo[k] = fmaxf(0.0f, oo[k]);
+#endif
 }
 
 #ifdef HAVE_OPENCL
@@ -1205,7 +1211,13 @@ int process_cl(dt_iop_module_t *self,
   }
 
   if(!direct)
-     err = dt_iop_clip_and_zoom_roi_cl(devid, dev_out, out_image, roi_out, roi_in);
+    err = dt_iop_clip_and_zoom_roi_cl(devid, dev_out, out_image, roi_out, roi_in);
+
+#ifdef DT_DEMOSAIC_POSITIVE
+  if(err == CL_SUCCESS)
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->image_positive, roi_out->width, roi_out->height,
+        CLARG(dev_out), CLARG(dev_out), CLARG(roi_out->width), CLARG(roi_out->height));
+#endif
 
 finish:
   dt_opencl_release_mem_object(dev_xtrans);
@@ -1286,10 +1298,9 @@ void init_global(dt_iop_module_so_t *self)
   gd->kernel_rcd_step_4_2 = dt_opencl_create_kernel(rcd, "rcd_step_4_2");
   gd->kernel_rcd_step_5_1 = dt_opencl_create_kernel(rcd, "rcd_step_5_1");
   gd->kernel_rcd_step_5_2 = dt_opencl_create_kernel(rcd, "rcd_step_5_2");
-  gd->kernel_rcd_border_redblue = dt_opencl_create_kernel(rcd, "rcd_border_redblue");
-  gd->kernel_rcd_border_green = dt_opencl_create_kernel(rcd, "rcd_border_green");
   gd->kernel_demosaic_box3 = dt_opencl_create_kernel(rcd, "demosaic_box3");
   gd->kernel_write_blended_dual  = dt_opencl_create_kernel(rcd, "write_blended_dual");
+  gd->image_positive  = dt_opencl_create_kernel(rcd, "image_positive");
 
   const int capt = 38; // capture.cl, from programs.conf
   gd->gaussian_9x9_mul = dt_opencl_create_kernel(capt, "kernel_9x9_mul");
@@ -1353,8 +1364,6 @@ void cleanup_global(dt_iop_module_so_t *self)
   dt_opencl_free_kernel(gd->kernel_rcd_step_4_2);
   dt_opencl_free_kernel(gd->kernel_rcd_step_5_1);
   dt_opencl_free_kernel(gd->kernel_rcd_step_5_2);
-  dt_opencl_free_kernel(gd->kernel_rcd_border_redblue);
-  dt_opencl_free_kernel(gd->kernel_rcd_border_green);
   dt_opencl_free_kernel(gd->kernel_demosaic_box3);
   dt_opencl_free_kernel(gd->kernel_write_blended_dual);
   dt_opencl_free_kernel(gd->gaussian_9x9_mul);
@@ -1365,6 +1374,7 @@ void cleanup_global(dt_iop_module_so_t *self)
   dt_opencl_free_kernel(gd->show_blend_mask);
   dt_opencl_free_kernel(gd->capture_result);
   dt_opencl_free_kernel(gd->final_blend);
+  dt_opencl_free_kernel(gd->image_positive);
   dt_free_align(gd->gauss_coeffs);
   free(self->data);
   self->data = NULL;
diff --git a/src/iop/demosaicing/amaze.cc b/src/iop/demosaicing/amaze.cc
index dc2c4dbd5f58..64ce4b826735 100644
--- a/src/iop/demosaicing/amaze.cc
+++ b/src/iop/demosaicing/amaze.cc
@@ -135,11 +135,6 @@ void amaze_demosaic(const float *const in,
 {
   const float clip_pt8 = 0.8f * clip_pt;
 
-// this allows to pass AMAZETS to the code. On some machines larger AMAZETS is faster
-// If AMAZETS is undefined it will be set to 160, which is the fastest on machines with 1GB cache per thread.
-#ifndef AMAZETS
-#define AMAZETS 160
-#endif
   // Tile size; the image is processed in square tiles to lower memory requirements and facilitate
   // multi-threading
   // We assure that Tile size is a multiple of 32 in the range [96;992]
diff --git a/src/iop/demosaicing/basics.c b/src/iop/demosaicing/basics.c
index 851d880cd6c5..5d3472603fe0 100644
--- a/src/iop/demosaicing/basics.c
+++ b/src/iop/demosaicing/basics.c
@@ -65,7 +65,7 @@ static void pre_median_b(float *out,
         for(int i = 0; i < 8; i++)
           for(int ii = i + 1; ii < 9; ii++)
             if(med[i] > med[ii]) SWAP(med[i], med[ii]);
-        pixo[0] = fmaxf(0.0f, (cnt == 1 ? med[4] - 64.0f : med[(cnt - 1) / 2]));
+        pixo[0] = cnt == 1 ? med[4] - 64.0f : med[(cnt - 1) / 2];
         // pixo[0] = med[(cnt-1)/2];
         pixo += 2;
         pixi += 2;
@@ -138,7 +138,7 @@ static void color_smoothing(float *out,
           SWAPmed(4, 2);
           SWAPmed(6, 4);
           SWAPmed(4, 2);
-          outp[c] = fmaxf(med[4] + outp[1], 0.0f);
+          outp[c] = med[4] + outp[1];
         }
       }
     }
@@ -156,9 +156,9 @@ static void green_equilibration_lavg(float *out,
   const float maximum = 1.0f;
 
   int oj = 2, oi = 2;
-  if(FC(oj, oi, filters) != 1) oj++;
-  if(FC(oj, oi, filters) != 1) oi++;
-  if(FC(oj, oi, filters) != 1) oj--;
+  if(FC(oj, oi, filters) != GREEN) oj++;
+  if(FC(oj, oi, filters) != GREEN) oi++;
+  if(FC(oj, oi, filters) != GREEN) oj--;
 
   dt_iop_image_copy_by_size(out, in, width, height, 1);
 
@@ -190,7 +190,7 @@ static void green_equilibration_lavg(float *out,
                           + fabsf(o2_3 - o2_4) + fabsf(o2_2 - o2_4)) / 6.0f;
         if((in[j * width + i] < maximum * 0.95f) && (c1 < maximum * thr) && (c2 < maximum * thr))
         {
-          out[j * width + i] = fmaxf(0.0f, in[j * width + i] * m1 / m2);
+          out[j * width + i] = in[j * width + i] * m1 / m2;
         }
       }
     }
@@ -230,7 +230,7 @@ static void green_equilibration_favg(float *out,
   {
     for(int i = oi; i < (width - 1 - g2_offset); i += 2)
     {
-      out[(size_t)j * width + i] = fmaxf(0.0f, in[(size_t)j * width + i] * gr_ratio);
+      out[(size_t)j * width + i] = in[(size_t)j * width + i] * gr_ratio;
     }
   }
 }
@@ -266,10 +266,10 @@ static int color_smoothing_cl(const dt_iop_module_t *self,
   cl_mem dev_t1 = dev_out;
   cl_mem dev_t2 = dev_tmp;
 
+  const size_t sizes[] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
+  const size_t local[] = { locopt.sizex, locopt.sizey, 1 };
   for(int pass = 0; pass < passes; pass++)
   {
-    const size_t sizes[] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
-    const size_t local[] = { locopt.sizex, locopt.sizey, 1 };
     err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_color_smoothing, sizes, local,
       CLARG(dev_t1), CLARG(dev_t2), CLARG(width),
       CLARG(height), CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 2) * (locopt.sizey + 2)));
@@ -298,6 +298,7 @@ static int color_smoothing_cl(const dt_iop_module_t *self,
   return err;
 }
 
+#define DT_REDUCESIZE_MIN 64
 static int green_equilibration_cl(const dt_iop_module_t *self,
                                   const dt_dev_pixelpipe_iop_t *piece,
                                   cl_mem dev_in,
@@ -414,14 +415,15 @@ static int green_equilibration_cl(const dt_iop_module_t *self,
                                             sizeof(float) * 2 * reducesize, CL_TRUE);
     if(err != CL_SUCCESS) goto error;
 
-    float sum1 = 0.0f, sum2 = 0.0f;
+    double sum1 = 0.0;
+    double sum2 = 0.0;
     for(int k = 0; k < reducesize; k++)
     {
-      sum1 += sumsum[2 * k];
-      sum2 += sumsum[2 * k + 1];
+      sum1 += (double)sumsum[2 * k];
+      sum2 += (double)sumsum[2 * k + 1];
     }
 
-    const float gr_ratio = (sum1 > 0.0f && sum2 > 0.0f) ? sum2 / sum1 : 1.0f;
+    const float gr_ratio = (sum1 > 0.0 && sum2 > 0.0) ? (float)(sum2 / sum1) : 1.0f;
 
     err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_green_eq_favg_apply, width, height,
       CLARG(dev_in1), CLARG(dev_out1), CLARG(width), CLARG(height), CLARG(filters),
@@ -548,7 +550,8 @@ static int process_default_cl(const dt_iop_module_t *self,
         const size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
         err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_ppg_green, sizes, local,
           CLARG(dev_med), CLARG(dev_tmp), CLARG(width),
-          CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * (locopt.sizex + 2*3) * (locopt.sizey + 2*3)));
+          CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * (locopt.sizex + 2*3) * (locopt.sizey + 2*3)),
+          CLARGINT(100000));
         if(err != CL_SUCCESS) goto error;
       }
 
@@ -565,7 +568,8 @@ static int process_default_cl(const dt_iop_module_t *self,
         const size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
         err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_ppg_redblue, sizes, local,
           CLARG(dev_tmp), CLARG(dev_out), CLARG(width),
-          CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 2) * (locopt.sizey + 2)));
+          CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 2) * (locopt.sizey + 2)),
+          CLARGINT(100000));
         if(err != CL_SUCCESS) goto error;
       }
     }
diff --git a/src/iop/demosaicing/capture.c b/src/iop/demosaicing/capture.c
index e7f6beb6377c..02e4e9f5f48c 100644
--- a/src/iop/demosaicing/capture.c
+++ b/src/iop/demosaicing/capture.c
@@ -1045,10 +1045,10 @@ static int _capture_sharpen_cl(dt_iop_module_t *self,
 
   const dt_iop_buffer_dsc_t *dsc = &pipe->dsc;
   const gboolean wbon = dsc->temperature.enabled;
-  dt_aligned_pixel_t icoeffs = { wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[0] : CAPTURE_CFACLIP,
-                                 wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[1] : CAPTURE_CFACLIP,
-                                 wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[2] : CAPTURE_CFACLIP,
-                                 0.0f };
+  const dt_aligned_pixel_t icoeffs = { wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[0] : CAPTURE_CFACLIP,
+                                       wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[1] : CAPTURE_CFACLIP,
+                                       wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[2] : CAPTURE_CFACLIP,
+                                       0.0f };
 
   cl_mem gcoeffs = NULL;
   cl_mem gauss_idx = NULL;
@@ -1058,10 +1058,9 @@ static int _capture_sharpen_cl(dt_iop_module_t *self,
   cl_mem luminance = dt_opencl_alloc_device_buffer(devid, bsize);
   cl_mem tmp2 = dt_opencl_alloc_device_buffer(devid, bsize);
   cl_mem tmp1 = dt_opencl_alloc_device_buffer(devid, bsize);
-  cl_mem whites = dt_opencl_copy_host_to_device_constant(devid, 4 * sizeof(float), icoeffs);
   cl_mem dev_rgb = dt_opencl_duplicate_image(devid, dev_out);
 
-  if(!blendmask || !luminance || !tmp2 || !tmp1 || !whites || !dev_rgb) goto finish;
+  if(!blendmask || !luminance || !tmp2 || !tmp1 || !dev_rgb) goto finish;
 
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->prefill_clip_mask, width, height,
           CLARG(tmp2), CLARG(width), CLARG(height));
@@ -1069,7 +1068,7 @@ static int _capture_sharpen_cl(dt_iop_module_t *self,
 
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->prepare_blend, width, height,
           CLARG(dev_in), CLARG(dev_out), CLARG(filters), CLARG(dev_xtrans), CLARG(tmp2), CLARG(tmp1),
-          CLARG(whites), CLARG(width), CLARG(height));
+          CLARG(icoeffs), CLARG(width), CLARG(height));
   if(err != CL_SUCCESS) goto finish;
 
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->modify_blend, width, height,
@@ -1140,7 +1139,6 @@ static int _capture_sharpen_cl(dt_iop_module_t *self,
   dt_opencl_release_mem_object(tmp2);
   dt_opencl_release_mem_object(tmp1);
   dt_opencl_release_mem_object(luminance);
-  dt_opencl_release_mem_object(whites);
 
   return err;
 }
diff --git a/src/iop/demosaicing/dual.c b/src/iop/demosaicing/dual.c
index 0c703a03ce38..062092ad609a 100644
--- a/src/iop/demosaicing/dual.c
+++ b/src/iop/demosaicing/dual.c
@@ -118,7 +118,7 @@ int dual_demosaic_cl(const dt_iop_module_t *self,
         wboff ? 1.0f : 1.0f / p->dsc.temperature.coeffs[2], 1.0f };
 
   err = dt_opencl_enqueue_kernel_2d_args(devid, darktable.opencl->blendop->kernel_calc_Y0_mask, width, height,
-     CLARG(mask), CLARG(high_image), CLARG(width), CLARG(height), CLFLARRAY(4, wb));
+     CLARG(mask), CLARG(high_image), CLARG(width), CLARG(height), CLARG(wb));
   if(err != CL_SUCCESS) goto finish;
 
   err = dt_opencl_enqueue_kernel_2d_args(devid, darktable.opencl->blendop->kernel_calc_scharr_mask, width, height,
diff --git a/src/iop/demosaicing/lmmse.c b/src/iop/demosaicing/lmmse.c
index fcc5d10e32c1..9ed6ebe9000c 100644
--- a/src/iop/demosaicing/lmmse.c
+++ b/src/iop/demosaicing/lmmse.c
@@ -125,7 +125,7 @@ static void lmmse_demosaic(float *const restrict out,
                            const dt_iop_demosaic_lmmse_t mode,
                            const float scaler)
 {
-  rcd_ppg_border(out, in, width, height, filters, BORDER_AROUND);
+  demosaic_ppg(out, in, width, height, filters, 0.0f, BORDER_AROUND);
   if(width < 2 * BORDER_AROUND || height < 2 * BORDER_AROUND)
     return;
 
diff --git a/src/iop/demosaicing/ppg.c b/src/iop/demosaicing/ppg.c
index e88c6f268824..75fa8c079a1c 100644
--- a/src/iop/demosaicing/ppg.c
+++ b/src/iop/demosaicing/ppg.c
@@ -22,9 +22,10 @@ static void demosaic_ppg(float *const out,
                          const int width,
                          const int height,
                          const uint32_t filters,
-                         const float thrs)
+                         const float thrs,
+                         const int margin)
 {
-  // border interpolate
+  // preliminary border interpolate for outermost 3 pixels
   float sum[8];
   for(int j = 0; j < height; j++)
     for(int i = 0; i < width; i++)
@@ -46,10 +47,10 @@ static void demosaic_ppg(float *const out,
       for(int c = 0; c < 3; c++)
       {
         if(c != f && sum[c + 4] > 0.0f)
-          out[4 * ((size_t)j * width + i) + c] = fmaxf(0.0f, sum[c] / sum[c + 4]);
+          out[4 * ((size_t)j * width + i) + c] = sum[c] / sum[c + 4];
         else
           out[4 * ((size_t)j * width + i) + c]
-              = fmaxf(0.0f, in[(size_t)j * width + i]);
+              = in[(size_t)j * width + i];
       }
     }
   const gboolean median = thrs > 0.0f;
@@ -60,8 +61,11 @@ static void demosaic_ppg(float *const out,
     pre_median(med_in, in, width, height, filters, 1, thrs);
     input = med_in;
   }
-// for all pixels except those in the 3 pixel border:
-// interpolate green from input into out float array, or copy color.
+
+  const int border = margin + 3;
+  // for all pixels except those in the 3 pixel border:
+  // interpolate green from input into out float array, or copy color.
+  // avoid processing if outside of given border
   DT_OMP_FOR()
   for(int j = 3; j < height - 3; j++)
   {
@@ -69,6 +73,14 @@ static void demosaic_ppg(float *const out,
     const float *buf_in = input + (size_t)width * j + 3;
     for(int i = 3; i < width - 3; i++)
     {
+      if(i == border && j >= border && j < height - border)
+      {
+        i = width - border;
+        buf = out + (size_t)4 * width * j + 4 * i;
+        buf_in = in + (size_t)width * j + i;
+      }
+      if(i == width) break;
+
       const int c = FC(j, i, filters);
       dt_aligned_pixel_t color;
       const float pc = buf_in[0];
@@ -116,86 +128,94 @@ static void demosaic_ppg(float *const out,
       color[3] = 0.0f;
 
       for_each_channel(k,aligned(buf,color:16)
-        dt_omp_nontemporal(buf)) buf[k] = fmaxf(0.0f, color[k]);
+        dt_omp_nontemporal(buf)) buf[k] = color[k];
 
       buf += 4;
       buf_in++;
     }
   }
 
-// for all pixels except the outermost row/column:
-// interpolate colors using out as input into float out array
   DT_OMP_FOR()
-  for(int j = 1; j < height - 1; j++)
+  for(int j = 0; j < height; j++)
   {
-    float *buf = out + (size_t)4 * width * j + 4;
-    for(int i = 1; i < width - 1; i++)
+    float *buf = out + (size_t)4 * width * j;
+    for(int i = 0; i < width; i++)
     {
+      if(i == margin && j >= margin && j < height - margin)
+      {
+        i = width - margin;
+        buf = out + (size_t)4 * (width * j + i);
+      }
       // also prefetch direct nbs top/bottom
-      const int c = FC(j, i, filters);
       dt_aligned_pixel_t color = { buf[0], buf[1], buf[2], buf[3] };
 
-      // fill all four pixels with correctly interpolated stuff: r/b for green1/2
-      // b for r and r for b
-      if(__builtin_expect(c & 1, 1)) // c == 1 || c == 3)
-      {
-        // calculate red and blue for green pixels:
-        // need 4-nbhood:
-        const float *nt = buf - 4 * width;
-        const float *nb = buf + 4 * width;
-        const float *nl = buf - 4;
-        const float *nr = buf + 4;
-        if(FC(j, i + 1, filters) == 0) // red nb in same row
-        {
-          color[2] = (nt[2] + nb[2] + 2.0f * color[1] - nt[1] - nb[1]) * .5f;
-          color[0] = (nl[0] + nr[0] + 2.0f * color[1] - nl[1] - nr[1]) * .5f;
-        }
-        else
-        {
-          // blue nb
-          color[0] = (nt[0] + nb[0] + 2.0f * color[1] - nt[1] - nb[1]) * .5f;
-          color[2] = (nl[2] + nr[2] + 2.0f * color[1] - nl[1] - nr[1]) * .5f;
-        }
-      }
-      else
+      // for all pixels except the outermost row/column:
+      // interpolate colors using out as input into float out array
+      // the outermost are untouched
+      // but both final results avoid negatives
+      if(j > 0 && i > 0 && i < width - 1 && j < height - 1)
       {
-        // get 4-star-nbhood:
-        const float *ntl = buf - 4 - 4 * width;
-        const float *ntr = buf + 4 - 4 * width;
-        const float *nbl = buf - 4 + 4 * width;
-        const float *nbr = buf + 4 + 4 * width;
-
-        if(c == 0)
+        const int c = FC(j, i, filters);
+        // fill all four pixels with correctly interpolated stuff: r/b for green1/2
+        // b for r and r for b
+        if(__builtin_expect(c & 1, 1)) // c == 1 || c == 3)
         {
-          // red pixel, fill blue:
-          const float diff1 = fabsf(ntl[2] - nbr[2]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]);
-          const float guess1 = ntl[2] + nbr[2] + 2.0f * color[1] - ntl[1] - nbr[1];
-          const float diff2 = fabsf(ntr[2] - nbl[2]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]);
-          const float guess2 = ntr[2] + nbl[2] + 2.0f * color[1] - ntr[1] - nbl[1];
-          if(diff1 > diff2)
-            color[2] = guess2 * .5f;
-          else if(diff1 < diff2)
-            color[2] = guess1 * .5f;
+          // calculate red and blue for green pixels:
+          // need 4-nbhood:
+          const float *nt = buf - 4 * width;
+          const float *nb = buf + 4 * width;
+          const float *nl = buf - 4;
+          const float *nr = buf + 4;
+          if(FC(j, i + 1, filters) == 0) // red nb in same row
+          {
+            color[2] = (nt[2] + nb[2] + 2.0f * color[1] - nt[1] - nb[1]) * .5f;
+            color[0] = (nl[0] + nr[0] + 2.0f * color[1] - nl[1] - nr[1]) * .5f;
+          }
           else
-            color[2] = (guess1 + guess2) * .25f;
+          {
+            // blue nb
+            color[0] = (nt[0] + nb[0] + 2.0f * color[1] - nt[1] - nb[1]) * .5f;
+            color[2] = (nl[2] + nr[2] + 2.0f * color[1] - nl[1] - nr[1]) * .5f;
+          }
         }
-        else // c == 2, blue pixel, fill red:
+        else
         {
-          const float diff1 = fabsf(ntl[0] - nbr[0]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]);
-          const float guess1 = ntl[0] + nbr[0] + 2.0f * color[1] - ntl[1] - nbr[1];
-          const float diff2 = fabsf(ntr[0] - nbl[0]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]);
-          const float guess2 = ntr[0] + nbl[0] + 2.0f * color[1] - ntr[1] - nbl[1];
-          if(diff1 > diff2)
-            color[0] = guess2 * .5f;
-          else if(diff1 < diff2)
-            color[0] = guess1 * .5f;
-          else
-            color[0] = (guess1 + guess2) * .25f;
+          // get 4-star-nbhood:
+          const float *ntl = buf - 4 - 4 * width;
+          const float *ntr = buf + 4 - 4 * width;
+          const float *nbl = buf - 4 + 4 * width;
+          const float *nbr = buf + 4 + 4 * width;
+          if(c == 0)
+          {
+            // red pixel, fill blue:
+            const float diff1 = fabsf(ntl[2] - nbr[2]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]);
+            const float guess1 = ntl[2] + nbr[2] + 2.0f * color[1] - ntl[1] - nbr[1];
+            const float diff2 = fabsf(ntr[2] - nbl[2]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]);
+            const float guess2 = ntr[2] + nbl[2] + 2.0f * color[1] - ntr[1] - nbl[1];
+            if(diff1 > diff2)
+              color[2] = guess2 * .5f;
+            else if(diff1 < diff2)
+              color[2] = guess1 * .5f;
+            else
+              color[2] = (guess1 + guess2) * .25f;
+          }
+          else // c == 2, blue pixel, fill red:
+          {
+            const float diff1 = fabsf(ntl[0] - nbr[0]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]);
+            const float guess1 = ntl[0] + nbr[0] + 2.0f * color[1] - ntl[1] - nbr[1];
+            const float diff2 = fabsf(ntr[0] - nbl[0]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]);
+            const float guess2 = ntr[0] + nbl[0] + 2.0f * color[1] - ntr[1] - nbl[1];
+            if(diff1 > diff2)
+              color[0] = guess2 * .5f;
+            else if(diff1 < diff2)
+              color[0] = guess1 * .5f;
+            else
+              color[0] = (guess1 + guess2) * .25f;
+          }
         }
       }
-
       for_each_channel(k,aligned(buf,color:16)
-        dt_omp_nontemporal(buf)) buf[k] = fmaxf(0.0f, color[k]);
+        dt_omp_nontemporal(buf)) buf[k] = color[k];
 
       buf += 4;
     }
diff --git a/src/iop/demosaicing/rcd.c b/src/iop/demosaicing/rcd.c
index 0adcbd83d915..4704f3ec3e34 100644
--- a/src/iop/demosaicing/rcd.c
+++ b/src/iop/demosaicing/rcd.c
@@ -38,9 +38,8 @@
 */
 
 /* Some notes about the algorithm
-* 1. The calculated data at the tiling borders RCD_BORDER must be at least 9 to be stable.
-* 2. For the outermost tiles we only have to discard a 7 pixel border region interpolated otherwise.
-* 3. The tilesize has a significant influence on performance, the default is a good guess for modern
+* 1. The calculated data at the tiling borders RCD_BORDER must be 10 to be stable.
+* 2. The tilesize has a significant influence on performance, the default is a good guess for modern
 *    x86/64 machines, tested on Xeon E-2288G, i5-8250U.
 */
 
@@ -65,8 +64,7 @@
   #pragma GCC optimize ("fp-contract=fast", "finite-math-only", "no-math-errno")
 #endif
 
-#define RCD_BORDER 9          // avoid tile-overlap errors
-#define RCD_MARGIN 7          // for the outermost tiles we can have a smaller outer border
+#define RCD_BORDER 10 // avoid tile-overlap errors
 #define RCD_TILEVALID (DT_RCD_TILESIZE - 2 * RCD_BORDER)
 #define w1 DT_RCD_TILESIZE
 #define w2 (2 * DT_RCD_TILESIZE)
@@ -82,191 +80,6 @@ static inline float _safe_in(float a, float scale)
   return fmaxf(0.0f, a) * scale;
 }
 
-/** This is basically ppg adopted to only write data to RCD_MARGIN */
-static void rcd_ppg_border(float *const out,
-                           const float *const in,
-                           const int width,
-                           const int height,
-                           const uint32_t filters,
-                           const int margin)
-{
-  const int border = margin + 3;
-  // write approximatad 3-pixel border region to out
-  float sum[8];
-  for(int j = 0; j < height; j++)
-  {
-    for(int i = 0; i < width; i++)
-    {
-      if(i == 3 && j >= 3 && j < height - 3) i = width - 3;
-      if(i == width) break;
-      memset(sum, 0, sizeof(float) * 8);
-      for(int y = j - 1; y != j + 2; y++)
-      {
-        for(int x = i - 1; x != i + 2; x++)
-        {
-          if((y >= 0) && (x >= 0) && (y < height) && (x < width))
-          {
-            const int f = FC(y, x, filters);
-            sum[f] += fmaxf(0.0f, in[(size_t)y * width + x]);
-            sum[f + 4]++;
-          }
-        }
-      }
-      const int f = FC(j, i, filters);
-      for(int c = 0; c < 3; c++)
-      {
-        if(c != f && sum[c + 4] > 0.0f)
-          out[4 * ((size_t)j * width + i) + c] = sum[c] / sum[c + 4];
-        else
-          out[4 * ((size_t)j * width + i) + c] = fmaxf(0.0f, in[(size_t)j * width + i]);
-      }
-    }
-  }
-
-  DT_OMP_FOR()
-  for(int j = 3; j < height - 3; j++)
-  {
-    float *buf = out + (size_t)4 * width * j + 4 * 3;
-    const float *buf_in = in + (size_t)width * j + 3;
-    for(int i = 3; i < width - 3; i++)
-    {
-      if(i == border && j >= border && j < height - border)
-      {
-        i = width - border;
-        buf = out + (size_t)4 * width * j + 4 * i;
-        buf_in = in + (size_t)width * j + i;
-      }
-      if(i == width) break;
-
-      const int c = FC(j, i, filters);
-      dt_aligned_pixel_t color;
-      const float pc = fmaxf(0.0f, buf_in[0]);
-      if(c == 0 || c == 2)
-      {
-        color[c] = pc;
-        const float pym  = fmaxf(0.0f, buf_in[-width * 1]);
-        const float pym2 = fmaxf(0.0f, buf_in[-width * 2]);
-        const float pym3 = fmaxf(0.0f, buf_in[-width * 3]);
-        const float pyM  = fmaxf(0.0f, buf_in[+width * 1]);
-        const float pyM2 = fmaxf(0.0f, buf_in[+width * 2]);
-        const float pyM3 = fmaxf(0.0f, buf_in[+width * 3]);
-        const float pxm  = fmaxf(0.0f, buf_in[-1]);
-        const float pxm2 = fmaxf(0.0f, buf_in[-2]);
-        const float pxm3 = fmaxf(0.0f, buf_in[-3]);
-        const float pxM  = fmaxf(0.0f, buf_in[+1]);
-        const float pxM2 = fmaxf(0.0f, buf_in[+2]);
-        const float pxM3 = fmaxf(0.0f, buf_in[+3]);
-
-        const float guessx = (pxm + pc + pxM) * 2.0f - pxM2 - pxm2;
-        const float diffx = (fabsf(pxm2 - pc) + fabsf(pxM2 - pc) + fabsf(pxm - pxM)) * 3.0f
-                            + (fabsf(pxM3 - pxM) + fabsf(pxm3 - pxm)) * 2.0f;
-        const float guessy = (pym + pc + pyM) * 2.0f - pyM2 - pym2;
-        const float diffy = (fabsf(pym2 - pc) + fabsf(pyM2 - pc) + fabsf(pym - pyM)) * 3.0f
-                            + (fabsf(pyM3 - pyM) + fabsf(pym3 - pym)) * 2.0f;
-        if(diffx > diffy)
-        {
-          // use guessy
-          const float m = fminf(pym, pyM);
-          const float M = fmaxf(pym, pyM);
-          color[1] = fmaxf(fminf(guessy * .25f, M), m);
-        }
-        else
-        {
-          const float m = fminf(pxm, pxM);
-          const float M = fmaxf(pxm, pxM);
-          color[1] = fmaxf(fminf(guessx * .25f, M), m);
-        }
-      }
-      else
-        color[1] = pc;
-
-      color[3] = 0.0f;
-      for_each_channel(k)
-        buf[k] = color[k];
-      buf += 4;
-      buf_in++;
-    }
-  }
-// for all pixels: interpolate colors into float array
-  DT_OMP_FOR()
-  for(int j = 1; j < height - 1; j++)
-  {
-    float *buf = out + (size_t)4 * width * j + 4;
-    for(int i = 1; i < width - 1; i++)
-    {
-      if(i == margin && j >= margin && j < height - margin)
-      {
-        i = width - margin;
-        buf = out + (size_t)4 * (width * j + i);
-      }
-      const int c = FC(j, i, filters);
-      dt_aligned_pixel_t color = { buf[0], buf[1], buf[2], buf[3] };
-      const int linesize = 4 * width;
-      // fill all four pixels with correctly interpolated stuff: r/b for green1/2
-      // b for r and r for b
-      if(__builtin_expect(c & 1, 1)) // c == 1 || c == 3)
-      {
-        // calculate red and blue for green pixels:
-        // need 4-nbhood:
-        const float *nt = buf - linesize;
-        const float *nb = buf + linesize;
-        const float *nl = buf - 4;
-        const float *nr = buf + 4;
-        if(FC(j, i + 1, filters) == 0) // red nb in same row
-        {
-          color[2] = (nt[2] + nb[2] + 2.0f * color[1] - nt[1] - nb[1]) * .5f;
-          color[0] = (nl[0] + nr[0] + 2.0f * color[1] - nl[1] - nr[1]) * .5f;
-        }
-        else
-        {
-          // blue nb
-          color[0] = (nt[0] + nb[0] + 2.0f * color[1] - nt[1] - nb[1]) * .5f;
-          color[2] = (nl[2] + nr[2] + 2.0f * color[1] - nl[1] - nr[1]) * .5f;
-        }
-      }
-      else
-      {
-        // get 4-star-nbhood:
-        const float *ntl = buf - 4 - linesize;
-        const float *ntr = buf + 4 - linesize;
-        const float *nbl = buf - 4 + linesize;
-        const float *nbr = buf + 4 + linesize;
-
-        if(c == 0)
-        {
-          // red pixel, fill blue:
-          const float diff1  = fabsf(ntl[2] - nbr[2]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]);
-          const float guess1 = ntl[2] + nbr[2] + 2.0f * color[1] - ntl[1] - nbr[1];
-          const float diff2  = fabsf(ntr[2] - nbl[2]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]);
-          const float guess2 = ntr[2] + nbl[2] + 2.0f * color[1] - ntr[1] - nbl[1];
-          if(diff1 > diff2)
-            color[2] = guess2 * .5f;
-          else if(diff1 < diff2)
-            color[2] = guess1 * .5f;
-          else
-            color[2] = (guess1 + guess2) * .25f;
-        }
-        else // c == 2, blue pixel, fill red:
-        {
-          const float diff1  = fabsf(ntl[0] - nbr[0]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]);
-          const float guess1 = ntl[0] + nbr[0] + 2.0f * color[1] - ntl[1] - nbr[1];
-          const float diff2  = fabsf(ntr[0] - nbl[0]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]);
-          const float guess2 = ntr[0] + nbl[0] + 2.0f * color[1] - ntr[1] - nbl[1];
-          if(diff1 > diff2)
-            color[0] = guess2 * .5f;
-          else if(diff1 < diff2)
-            color[0] = guess1 * .5f;
-          else
-            color[0] = (guess1 + guess2) * .25f;
-        }
-      }
-      for_each_channel(k)
-        buf[k] = color[k];
-      buf += 4;
-    }
-  }
-}
-
 DT_OMP_DECLARE_SIMD(aligned(in, out : 64))
 static void demosaic_box3(float *const restrict out,
                           const float *const restrict in,
@@ -289,7 +102,7 @@ static void demosaic_box3(float *const restrict out,
           if(x >= 0 && y >= 0 && x < width && y < height)
           {
             const int color = (filters == 9u) ? FCNxtrans(y, x, xtrans) : FC(y, x, filters);
-            sum[color] += MAX(0.0f, in[(size_t)width*y +x]);
+            sum[color] += in[(size_t)width*y +x];
             cnt[color] += 1.0f;
           }
         }
@@ -308,13 +121,9 @@ static void rcd_demosaic(float *const restrict out,
                          const uint32_t filters,
                          const float scaler)
 {
+  demosaic_ppg(out, in, width, height, filters, 0.0f, RCD_BORDER);
   if(width < 2*RCD_BORDER || height < 2*RCD_BORDER)
-  {
-    rcd_ppg_border(out, in, width, height, filters, RCD_BORDER);
     return;
-  }
-
-  rcd_ppg_border(out, in, width, height, filters, RCD_MARGIN);
 
   const float revscaler = 1.0f / scaler;
 
@@ -555,18 +364,15 @@ static void rcd_demosaic(float *const restrict out,
           }
         }
 
-        // For the outermost tiles in all directions we can use a smaller border margin
-        const int first_vertical =   rowStart + ((tile_vertical == 0) ? RCD_MARGIN : RCD_BORDER);
-        const int last_vertical =    rowEnd   - ((tile_vertical == num_vertical - 1)     ? RCD_MARGIN : RCD_BORDER);
-        const int first_horizontal = colStart + ((tile_horizontal == 0) ? RCD_MARGIN : RCD_BORDER);
-        const int last_horizontal =  colEnd   - ((tile_horizontal == num_horizontal - 1) ? RCD_MARGIN : RCD_BORDER);
-        for(int row = first_vertical; row < last_vertical; row++)
+        for(int row = rowStart + RCD_BORDER; row < rowEnd - RCD_BORDER; row++)
         {
-          for(int col = first_horizontal, idx = (row - rowStart) * DT_RCD_TILESIZE + col - colStart, o_idx = (row * width + col) * 4; col < last_horizontal; col++, o_idx += 4, idx++)
+          for(int col = colStart + RCD_BORDER, idx = (row - rowStart) * DT_RCD_TILESIZE + col - colStart, o_idx = (row * width + col) * 4;
+              col < colEnd - RCD_BORDER;
+              col++, o_idx += 4, idx++)
           {
-            out[o_idx]   = scaler * fmaxf(0.0f, rgb[0][idx]);
-            out[o_idx+1] = scaler * fmaxf(0.0f, rgb[1][idx]);
-            out[o_idx+2] = scaler * fmaxf(0.0f, rgb[2][idx]);
+            out[o_idx]   = scaler * rgb[0][idx];
+            out[o_idx+1] = scaler * rgb[1][idx];
+            out[o_idx+2] = scaler * rgb[2][idx];
             out[o_idx+3] = 0.0f;
           }
         }
@@ -625,12 +431,12 @@ static cl_int process_rcd_cl(dt_iop_module_t *self,
                                       .cellsize = sizeof(float) * 1, .overhead = 0,
                                       .sizex = 64, .sizey = 64 };
 
-    err = dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_green, &locopt);
+    err = dt_opencl_local_buffer_opt(devid, gd->kernel_ppg_green, &locopt);
     if(err != CL_SUCCESS) goto error;
 
     size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
     size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
-    err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_rcd_border_green, sizes, local,
+    err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_ppg_green, sizes, local,
         CLARG(dev_in), CLARG(dev_tmp), CLARG(width), CLARG(height), CLARG(filters),
         CLLOCAL(sizeof(float) * (locopt.sizex + 2*3) * (locopt.sizey + 2*3)), CLARGINT(32));
     if(err != CL_SUCCESS) goto error;
@@ -642,12 +448,12 @@ static cl_int process_rcd_cl(dt_iop_module_t *self,
                                       .cellsize = 4 * sizeof(float), .overhead = 0,
                                       .sizex = 64, .sizey = 64 };
 
-    err = dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_redblue, &locopt);
+    err = dt_opencl_local_buffer_opt(devid, gd->kernel_ppg_redblue, &locopt);
     if(err != CL_SUCCESS) goto error;
 
     size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
     size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
-    err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_rcd_border_redblue, sizes, local,
+    err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_ppg_redblue, sizes, local,
       CLARG(dev_tmp), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(filters),
       CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 2) * (locopt.sizey + 2)), CLARGINT(16));
     if(err != CL_SUCCESS) goto error;
@@ -669,10 +475,14 @@ static cl_int process_rcd_cl(dt_iop_module_t *self,
     goto error;
 
   // populate data
-  float scaler = 1.0f / dt_iop_get_processed_maximum(piece);
+  float scaler = dt_iop_get_processed_maximum(piece);
+  if(scaler < 1.5f) scaler = 2.0f;
+  else if(scaler < 3.0f) scaler = 4.0f;
+  else scaler = 8.0f;
+  const float revscaler = 1.0f / scaler;
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_rcd_populate, width, height,
         CLARG(dev_in), CLARG(cfa), CLARG(rgb0), CLARG(rgb1), CLARG(rgb2), CLARG(width), CLARG(height),
-        CLARG(filters), CLARG(scaler));
+        CLARG(filters), CLARG(revscaler));
   if(err != CL_SUCCESS) goto error;
 
   // Step 1.1: Calculate a squared vertical and horizontal high pass filter on color differences
@@ -715,10 +525,9 @@ static cl_int process_rcd_cl(dt_iop_module_t *self,
         CLARG(VH_dir), CLARG(rgb0), CLARG(rgb1), CLARG(rgb2), CLARG(width), CLARG(height), CLARG(filters));
   if(err != CL_SUCCESS) goto error;
 
-  scaler = dt_iop_get_processed_maximum(piece);
   // write output
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_rcd_write_output, width, height,
-        CLARG(dev_out), CLARG(rgb0), CLARG(rgb1), CLARG(rgb2), CLARG(width), CLARG(height), CLARG(scaler), CLARGINT(RCD_MARGIN));
+        CLARG(dev_out), CLARG(rgb0), CLARG(rgb1), CLARG(rgb2), CLARG(width), CLARG(height), CLARG(scaler), CLARGINT(RCD_BORDER));
 
 error:
   dt_opencl_release_mem_object(dev_tmp);
@@ -739,7 +548,6 @@ static cl_int process_rcd_cl(dt_iop_module_t *self,
 #endif
 
 #undef RCD_BORDER
-#undef RCD_MARGIN
 #undef RCD_TILEVALID
 #undef w1
 #undef w2
diff --git a/src/iop/demosaicing/vng.c b/src/iop/demosaicing/vng.c
index 47e5694224ca..bb5c8481b2c6 100644
--- a/src/iop/demosaicing/vng.c
+++ b/src/iop/demosaicing/vng.c
@@ -18,14 +18,16 @@
 
 /* taken from dcraw and demosaic_ppg below */
 
-static void lin_interpolate(float *out,
-                            const float *const in,
-                            const int width,
-                            const int height,
-                            const uint32_t filters,
-                            const uint8_t (*const xtrans)[6])
+static void _vng_lininterpolate(float *out,
+                                const float *const in,
+                                const int width,
+                                const int height,
+                                const uint32_t filters,
+                                const uint8_t (*const xtrans)[6],
+                                const int border)
 {
-  const int colors = (filters == 9) ? 3 : 4;
+  const gboolean is_xtrans = filters == 9;
+  const int colors = is_xtrans ? 3 : 4;
   // border interpolate
   DT_OMP_FOR()
   for(int row = 0; row < height; row++)
@@ -41,7 +43,7 @@ static void lin_interpolate(float *out,
           if(y >= 0 && x >= 0 && y < height && x < width)
           {
             const int f = fcol(y, x, filters, xtrans);
-            sum[f] += fmaxf(0.0f, in[y * width + x]);
+            sum[f] += in[y * width + x];
             count[f]++;
           }
       const int f = fcol(row, col, filters, xtrans);
@@ -53,7 +55,7 @@ static void lin_interpolate(float *out,
         if(c != f && count[c] != 0)
           out[4 * (row * width + col) + c] = sum[c] / count[c];
         else
-          out[4 * (row * width + col) + c] = fmaxf(0.0f, in[row * width + col]);
+          out[4 * (row * width + col) + c] = in[row * width + col];
       }
     }
 
@@ -107,15 +109,22 @@ static void lin_interpolate(float *out,
     const float *buf_in = in + width * row + 1;
     for(int col = 1; col < width - 1; col++)
     {
+      if(col == border && row >= border && row < height - border)
+      {
+        col = width - border;
+        buf = out + (size_t)4 * width * row + 4 * col;
+        buf_in = in + (size_t)width * row + col;
+      }
+      if(col == width) break;
       dt_aligned_pixel_t sum = { 0.0f };
       int *ip = &(lookup[row % size][col % size][0]);
       // for each adjoining pixel not of this pixel's color, sum up its weighted values
       for(int i = *ip++; i--; ip += 3)
-        sum[ip[2]] += fmaxf(0.0f, buf_in[ip[0]]) * ip[1];
+        sum[ip[2]] += buf_in[ip[0]] * ip[1];
       // for each interpolated color, load it into the pixel
       for(int i = colors; --i; ip += 2)
         buf[*ip] = sum[ip[0]] / ip[1];
-      buf[*ip] = fmaxf(0.0f, *buf_in);
+      buf[*ip] = *buf_in;
       buf += 4;
       buf_in++;
     }
@@ -137,12 +146,6 @@ static void lin_interpolate(float *out,
    I've extended the basic idea to work with non-Bayer filter arrays.
    Gradients are numbered clockwise from NW=0 to W=7.
 */
-static void _copy_abovezero(float *to, float *from, const int pixels)
-{
-  static dt_aligned_pixel_t zero = { 0.0f, 0.0f, 0.0f, 0.0f};
-  for(int i = 0; i < pixels; i++)
-    dt_vector_max(&to[i*4], zero, &from[i*4]);
-}
 
 static const signed char terms[]
       = { -2, -2, +0, -1, 1, 0x01, -2, -2, +0, +0, 2, 0x01, -2, -1, -1, +0, 1, 0x01, -2, -1, +0, -1, 1, 0x02,
@@ -178,7 +181,6 @@ static void vng_interpolate(float *out,
   float(*brow[4])[4];
   const gboolean is_xtrans = (filters == 9);
   const gboolean is_4bayer = FILTERS_ARE_4BAYER(filters);
-  const gboolean is_bayer = !(is_xtrans || is_4bayer);
   const int prow = is_xtrans ? 6 : 8;
   const int pcol = is_xtrans ? 6 : 2;
   const int colors = is_xtrans ? 3 : 4;
@@ -192,14 +194,11 @@ static void vng_interpolate(float *out,
   else
     filters4 = filters | 0x0c0c0c0cu;
 
-  lin_interpolate(out, in, width, height, filters4, xtrans);
+  _vng_lininterpolate(out, in, width, height, filters4, xtrans, 1000000);
 
   // if only linear interpolation is requested we can stop it here
   if(only_vng_linear)
-  {
-    if(is_bayer) goto bayer_greens;
-    else return;
-  }
+    goto finish;
 
   char *buffer = dt_alloc_aligned(sizeof(**brow) * width * 3 + sizeof(*ip) * prow * pcol * 320);
   if(!buffer)
@@ -304,22 +303,24 @@ static void vng_interpolate(float *out,
       }
     }
     if(row > 3) /* Write buffer to image */
-      _copy_abovezero(out + 4 * ((row - 2) * width + 2), (float *)(brow[0] + 2), width - 4);
+      dt_iop_image_copy(out + 4 * ((row - 2) * width + 2), (float *)(brow[0] + 2), 4*(width - 4));
 
     // rotate ring buffer
     for(int g = 0; g < 4; g++) brow[(g - 1) & 3] = brow[g];
   }
   // copy the final two rows to the image
-  _copy_abovezero(out + (4 * ((height - 4) * width + 2)), (float *)(brow[0] + 2), width - 4);
-  _copy_abovezero(out + (4 * ((height - 3) * width + 2)), (float *)(brow[1] + 2), width - 4);
+  dt_iop_image_copy(out + (4 * ((height - 4) * width + 2)), (float *)(brow[0] + 2), 4*(width - 4));
+  dt_iop_image_copy(out + (4 * ((height - 3) * width + 2)), (float *)(brow[1] + 2), 4*(width - 4));
   dt_free_align(buffer);
 
-bayer_greens:
-  if(is_bayer)
+finish:
+  DT_OMP_FOR()
+  for(size_t i = 0; i < (size_t)width * height * 4; i+=4)
   {
-    DT_OMP_FOR()
-    for(int i = 0; i < height * width; i++)
-      out[i * 4 + 1] = (out[i * 4 + 1] + out[i * 4 + 3]) / 2.0f;
+    out[i] = out[i];
+    out[i+1] = is_xtrans ? out[i+1] : 0.5f * (out[i+1] + out[i+3]);
+    out[i+2] = out[i+2];
+    out[i+3] = 0.0f;
   }
 }
 
@@ -499,7 +500,7 @@ static cl_int process_vng_cl(const dt_iop_module_t *self,
   if(only_vng_linear)
     goto finish;
 
-  // do full VNG interpolation; linear data is in dev_tmp
+  // do full VNG interpolation; linear data is in dev_tmp; don't touch outermost 2 pixels
   dt_opencl_local_buffer_t locopt
       = (dt_opencl_local_buffer_t){ .xoffset = 2*2, .xfactor = 1, .yoffset = 2*2, .yfactor = 1,
                                       .cellsize = 4 * sizeof(float), .overhead = 0,
@@ -511,7 +512,7 @@ static cl_int process_vng_cl(const dt_iop_module_t *self,
   size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
   size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
   err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_vng_interpolate, sizes, local,
-        CLARG(dev_in), CLARG(dev_tmp), CLARG(dev_out),
+        CLARG(dev_tmp), CLARG(dev_out),
         CLARG(width), CLARG(height), CLARG(filters4),
         CLARG(dev_xtrans), CLARG(dev_ips), CLARG(dev_code), CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 4) * (locopt.sizey + 4)));
 
diff --git a/src/iop/demosaicing/xtrans.c b/src/iop/demosaicing/xtrans.c
index c08991062389..0609aae15c97 100644
--- a/src/iop/demosaicing/xtrans.c
+++ b/src/iop/demosaicing/xtrans.c
@@ -1,6 +1,6 @@
 /*
     This file is part of darktable,
-    Copyright (C) 2010-2025 darktable developers.
+    Copyright (C) 2010-2026 darktable developers.
 
     darktable is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -18,8 +18,7 @@
 
 // xtrans_interpolate adapted from dcraw 9.20
 
-// tile size, optimized to keep data in L2 cache
-#define TS 122
+#define TS DT_MARKESTEIJN_TS
 
 #define PAD_G1_G3 3
 #define PAD_G_INTERP 3
@@ -48,7 +47,8 @@ static void xtrans_markesteijn_interpolate(float *out,
                                            const int width,
                                            const int height,
                                            const uint8_t (*const xtrans)[6],
-                                           const int passes)
+                                           const int passes,
+                                           const uint32_t filters)
 {
   static const short orth[12] = { 1, 0, 0, 1, -1, 0, 0, -1, 1, 0, 0, 1 },
                      patt[2][16] = { { 0, 1, 0, -1, 2, 0, -1, 0, 1, 1, 1, -1, 0, 0, 0, 0 },
@@ -178,7 +178,8 @@ static void xtrans_markesteijn_interpolate(float *out,
         }
 
       // duplicate rgb[0] to rgb[1], rgb[2], and rgb[3]
-      for(int c = 1; c <= 3; c++) memcpy(rgb[c], rgb[0], sizeof(*rgb));
+      for(int c = 1; c <= 3; c++)
+        dt_iop_image_copy((float*)rgb[c], (float*)rgb[0], sizeof(*rgb) / sizeof(float));
 
       // note that successive calculations are inset within the tile
       // so as to give enough border data, and there needs to be a 6
@@ -503,16 +504,17 @@ static void xtrans_markesteijn_interpolate(float *out,
             }
           }
           for(int c = 0; c < 3; c++)
-            out[4 * (width * (row + top) + col + left) + c] = MAX(0.0f, avg[c]/avg[3]);
+            out[4 * (width * (row + top) + col + left) + c] = avg[c]/avg[3];
         }
     }
   }
   dt_free_align(all_buffers);
+  _vng_lininterpolate(out, in, width, height, filters, xtrans, pad_tile);
 }
 
 #undef TS
 
-#define TS 122
+#define TS DT_FDC_TS
 static void xtrans_fdc_interpolate(float *out,
                                    const float *const in,
                                    const int width,