diff --git a/data/kernels/basic.cl b/data/kernels/basic.cl index bc16ac290c48..7fef68286151 100644 --- a/data/kernels/basic.cl +++ b/data/kernels/basic.cl @@ -2972,11 +2972,11 @@ static inline float get_image_channel(read_only image2d_t in, const int c) { float4 pixel = read_imagef(in, sampleri, (int2)(x, y)); - if(c == 0) + if(c == RED) return pixel.x; - else if(c == 1) + else if(c == GREEN) return pixel.y; - else if(c == 2) + else if(c == BLUE) return pixel.z; return pixel.w; @@ -3846,7 +3846,7 @@ interpolation_resample (read_only image2d_t in, for (int ix = 0; ix < hl && yvalid; ix++) { const int xx = lindex[ix]; - float4 hpixel = read_imagef(in, sampleri,(int2)(xx, yy)); + float4 hpixel = readpixel(in, xx, yy); vpixel += hpixel * lkernel[ix]; } @@ -3868,12 +3868,8 @@ interpolation_resample (read_only image2d_t in, } // store final result - if (iy == 0 && x < width && y < height) - { - // Clip negative RGB that may be produced by Lanczos undershooting - // Negative RGB are invalid values no matter the RGB space (light is positive) - write_imagef (out, (int2)(x, y), fmax(buffer[ylid], 0.f)); - } + if(iy == 0 && x < width && y < height) + write_imagef(out, (int2)(x, y), buffer[ylid]); } /* kernel for the interpolation copy helper */ @@ -3890,7 +3886,7 @@ interpolation_copy(read_only image2d_t dev_in, const int ocol = get_global_id(0); const int orow = get_global_id(1); - if(ocol >= owidth || orow >= oheight) return; + if(ocol < 0 || ocol >= owidth || orow < 0 || orow >= oheight) return; float4 pix = (float4)( 0.0f, 0.0f, 0.0f, 0.0f ); @@ -3899,7 +3895,7 @@ interpolation_copy(read_only image2d_t dev_in, if(irow < iheight && irow >= 0 && icol < iwidth && icol >= 0) { - pix = read_imagef(dev_in, samplerA, (int2)(icol, irow)); + pix = readpixel(dev_in, icol, irow); } write_imagef(dev_out, (int2)(ocol, orow), pix); } diff --git a/data/kernels/capture.cl b/data/kernels/capture.cl index 92bb9d2acfad..f81243b46c73 100644 --- a/data/kernels/capture.cl +++ b/data/kernels/capture.cl @@ -1,6 +1,6 @@ /* This file is part of darktable, - copyright (c) 2025 darktable developer. + copyright (c) 2026 darktable developer. darktable is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -185,7 +185,7 @@ __kernel void prepare_blend(__read_only image2d_t cfa, global const unsigned char (*const xtrans)[6], global float *mask, global float *Yold, - global float *whites, + const float4 wb, const int w, const int height) { @@ -193,7 +193,9 @@ __kernel void prepare_blend(__read_only image2d_t cfa, const int row = get_global_id(1); if(col >= w || row >= height) return; - float4 rgb = read_imagef(dev_out, samplerA, (int2)(col, row)); + float whites[4] = { wb.x, wb.y, wb.z, wb.w }; + + float4 rgb = readpixel(dev_out, col, row); // Photometric/digital ITU BT.709 const float4 flum = (float4)( 0.212671f, 0.715160f, 0.072169f, 0.0f ); rgb *= flum; @@ -205,7 +207,7 @@ __kernel void prepare_blend(__read_only image2d_t cfa, { const int w2 = 2 * w; const int color = (filters == 9u) ? FCxtrans(row, col, xtrans) : FC(row, col, filters); - const float val = read_imagef(cfa, samplerA, (int2)(col, row)).x; + const float val = readsingle(cfa, col, row); if(val > whites[color] || Y < CAPTURE_YMIN) { mask[k-w2-1] = mask[k-w2] = mask[k-w2+1] = @@ -289,7 +291,7 @@ __kernel void show_blend_mask(__read_only image2d_t in, const int row = get_global_id(1); if(col >= width || row >= height) return; - float4 pix = read_imagef(in, samplerA, (int2)(col, row)); + float4 pix = readpixel(in, col, row); const float blend = blender ? blend_mask[mad24(row, width, col)] : (float)sigma_mask[mad24(row, width, col)] / 255.0f; pix.w = blend; @@ -308,7 +310,7 @@ __kernel void capture_result( __read_only image2d_t in, const int row = get_global_id(1); if(col >= width || row >= height) return; - float4 pix = read_imagef(in, samplerA, (int2)(col, row)); + float4 pix = readpixel(in, col, row); const int k = mad24(row, width, col); if(blendmask[k] > 0.0f) diff --git a/data/kernels/common.h b/data/kernels/common.h index 234f00b44149..7ff05665ab0b 100644 --- a/data/kernels/common.h +++ b/data/kernels/common.h @@ -63,6 +63,15 @@ constant sampler_t samplerA = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE #pragma OPENCL FP_CONTRACT OFF #endif +// Kahan summation algorithm +#define Kahan_sum(m, c, add) \ + { \ + const float t1 = (add) - (c); \ + const float t2 = (m) + t1; \ + c = (t2 - m) - t1; \ + m = t2; \ + } + static inline int FC(const int row, const int col, const unsigned int filters) { @@ -159,3 +168,23 @@ static inline float clipf(const float a) { return clamp(a, 0.0f, 1.0f); } + +/* Some inline functions making life easier when reading photosites + or pixels from cl_mem images. + As we had images with NaNs in data and at least AMD systems suffered + as reading NaNs instead of falling back to 0 (nvidia seems to do do) + we always read data and make sure values are valid floats. +*/ +static inline float readsingle(read_only image2d_t in, int col, int row) +{ + return fmax(-FLT_MAX, read_imagef(in, sampleri, (int2)(col, row)).x); +} + +static inline float4 readpixel(read_only image2d_t in, int col, int row) +{ + return fmax(-FLT_MAX, read_imagef(in, sampleri, (int2)(col, row))); +} +static inline float readalpha(read_only image2d_t in, int col, int row) +{ + return clipf(read_imagef(in, sampleri, (int2)(col, row)).w); +} diff --git a/data/kernels/demosaic_markesteijn.cl b/data/kernels/demosaic_markesteijn.cl index 0b49efc54bbc..fa358cc4650b 100644 --- a/data/kernels/demosaic_markesteijn.cl +++ b/data/kernels/demosaic_markesteijn.cl @@ -50,7 +50,7 @@ markesteijn_initial_copy(read_only image2d_t in, global float *rgb, const int wi const int f = FCxtrans(y, x, xtrans); - const float p = fmax(0.0f, read_imagef(in, sampleri, (int2)(x, y)).x); + const float p = readsingle(in, x, y); for(int c = 0; c < 3; c++) pix[c] = (c == f) ? p : 0.0f; @@ -897,7 +897,7 @@ markesteijn_accu(read_only image2d_t in, write_only image2d_t out, global float const int glidx = mad24(y, width, x); - float4 pixel = read_imagef(in, sampleri, (int2)(x, y)); + float4 pixel = readpixel(in, x, y); float4 add = vload4(glidx, rgb); add.w = 1.0f; @@ -918,11 +918,11 @@ markesteijn_final(read_only image2d_t in, write_only image2d_t out, const int wi // take sufficient border into account if(x < border || x >= width-border || y < border || y >= height-border) return; - float4 pixel = read_imagef(in, sampleri, (int2)(x, y)); + float4 pixel = readpixel(in, x, y); pixel = (pixel.w > 0.0f) ? pixel/pixel.w : (float4)0.0f; pixel.w = 0.0f; - write_imagef(out, (int2)(x, y), fmax(0.0f, pixel)); + write_imagef(out, (int2)(x, y), pixel); } diff --git a/data/kernels/demosaic_ppg.cl b/data/kernels/demosaic_ppg.cl index ce88484fd87f..3baff695a837 100644 --- a/data/kernels/demosaic_ppg.cl +++ b/data/kernels/demosaic_ppg.cl @@ -74,7 +74,7 @@ green_equilibration_lavg(read_only image2d_t in, if(bufidx >= maxbuf) continue; const int xx = xul + bufidx % stride; const int yy = yul + bufidx / stride; - buffer[bufidx] = read_imagef(in, sampleri, (int2)(xx, yy)).x; + buffer[bufidx] = readsingle(in, xx, yy); } // center buffer around current x,y-Pixel @@ -88,7 +88,15 @@ green_equilibration_lavg(read_only image2d_t in, const float maximum = 1.0f; float o = buffer[0]; - if(c == 1 && (y & 1)) + // check lines to be used as we do in cpu code + int oj = 2, oi = 2; + if(FC(oj, oi, filters) != GREEN) oj++; + if(FC(oj, oi, filters) != GREEN) oi++; + if(FC(oj, oi, filters) != GREEN) oj--; + + int row_ok = (oj & 1) == (y & 1); + int col_ok = (oi & 1) == (x & 1); + if(row_ok && col_ok && x >= 2 && y >= 2 && x < width -2 && y < height-2) { const float o1_1 = buffer[-1 * stride - 1]; const float o1_2 = buffer[-1 * stride + 1]; @@ -102,17 +110,16 @@ green_equilibration_lavg(read_only image2d_t in, const float m1 = (o1_1+o1_2+o1_3+o1_4)/4.0f; const float m2 = (o2_1+o2_2+o2_3+o2_4)/4.0f; - if ((m2 > 0.0f) && (m1 > 0.0f) && (m1 / m2 < maximum * 2.0f)) + if((m2 > 0.0f) && (m1 > 0.0f) && (m1 / m2 < maximum * 2.0f)) { const float c1 = (fabs(o1_1 - o1_2) + fabs(o1_1 - o1_3) + fabs(o1_1 - o1_4) + fabs(o1_2 - o1_3) + fabs(o1_3 - o1_4) + fabs(o1_2 - o1_4)) / 6.0f; const float c2 = (fabs(o2_1 - o2_2) + fabs(o2_1 - o2_3) + fabs(o2_1 - o2_4) + fabs(o2_2 - o2_3) + fabs(o2_3 - o2_4) + fabs(o2_2 - o2_4)) / 6.0f; if((o < maximum * 0.95f) && (c1 < maximum * thr) && (c2 < maximum * thr)) - o *= m1/m2; + o *= m1/m2; } } - - write_imagef (out, (int2)(x, y), fmax(o, 0.0f)); + write_imagef (out, (int2)(x, y), o); } @@ -135,11 +142,12 @@ green_equilibration_favg_reduce_first(read_only image2d_t in, const int c = FC(y, x, filters); + // make sure we add same number of greens for even and odd lines const int isinimage = (x < 2 * (width / 2) && y < 2 * (height / 2)); - const int isgreen1 = (c == 1 && !(y & 1)); - const int isgreen2 = (c == 1 && (y & 1)); + const int isgreen1 = c == GREEN && !(y & 1); + const int isgreen2 = c == GREEN && (y & 1); - float pixel = read_imagef(in, sampleri, (int2)(x, y)).x; + float pixel = readsingle(in, x, y); buffer[l].x = isinimage && isgreen1 ? pixel : 0.0f; buffer[l].y = isinimage && isgreen2 ? pixel : 0.0f; @@ -170,17 +178,19 @@ kernel void green_equilibration_favg_reduce_second(const global float2* input, global float2 *result, const int length, local float2 *buffer) { int x = get_global_id(0); - float2 sum = (float2)0.0f; - + float m1 = 0.0f; + float m2 = 0.0f; + float sum1 = 0.0f; + float sum2 = 0.0f; while(x < length) { - sum += input[x]; - + Kahan_sum(sum1, m1, input[x].x); + Kahan_sum(sum2, m2, input[x].y); x += get_global_size(0); } int lid = get_local_id(0); - buffer[lid] = sum; + buffer[lid] = (float2)(sum1, sum2); barrier(CLK_LOCAL_MEM_FENCE); @@ -215,15 +225,15 @@ green_equilibration_favg_apply(read_only image2d_t in, if(x >= width || y >= height) return; - float pixel = read_imagef(in, sampleri, (int2)(x, y)).x; + float pixel = readsingle(in, x, y); const int c = FC(y, x, filters); - const int isgreen1 = (c == 1 && !(y & 1)); + const int isgreen1 = (c == GREEN && !(y & 1)); // on even lines pixel *= (isgreen1 ? gr_ratio : 1.0f); - write_imagef (out, (int2)(x, y), fmax(pixel, 0.0f)); + write_imagef (out, (int2)(x, y), pixel); } #define SWAP(a, b) \ @@ -236,8 +246,13 @@ green_equilibration_favg_apply(read_only image2d_t in, constant int glim[5] = { 0, 1, 2, 1, 0 }; kernel void -pre_median(read_only image2d_t in, write_only image2d_t out, const int width, const int height, - const unsigned int filters, const float threshold, local float *buffer) +pre_median(read_only image2d_t in, + write_only image2d_t out, + const int width, + const int height, + const unsigned int filters, + const float threshold, + local float *buffer) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -269,7 +284,7 @@ pre_median(read_only image2d_t in, write_only image2d_t out, const int width, co if(bufidx >= maxbuf) continue; const int xx = xul + bufidx % stride; const int yy = yul + bufidx / stride; - buffer[bufidx] = read_imagef(in, sampleri, (int2)(xx, yy)).x; + buffer[bufidx] = readsingle(in, xx, yy); } // center buffer around current x,y-Pixel @@ -307,7 +322,7 @@ pre_median(read_only image2d_t in, write_only image2d_t out, const int width, co float color = (c & 1) ? (cnt == 1 ? med[4] - 64.0f : med[(cnt - 1) / 2]) : buffer[0]; - write_imagef (out, (int2)(x, y), fmax(color, 0.0f)); + write_imagef (out, (int2)(x, y), color); } #undef SWAP @@ -325,7 +340,11 @@ pre_median(read_only image2d_t in, write_only image2d_t out, const int width, co // 3x3 median filter // uses a sorting network to sort entirely in registers with no branches kernel void -color_smoothing(read_only image2d_t in, write_only image2d_t out, const int width, const int height, local float4 *buffer) +color_smoothing(read_only image2d_t in, + write_only image2d_t out, + const int width, + const int height, + local float4 *buffer) { const int lxid = get_local_id(0); const int lyid = get_local_id(1); @@ -352,17 +371,22 @@ color_smoothing(read_only image2d_t in, write_only image2d_t out, const int widt // don't read more than needed if(gx >= width + 1 || gy >= height + 1) continue; - buffer[bufidx] = read_imagef(in, sampleri, (int2)(gx, gy)); + buffer[bufidx] = readpixel(in, gx, gy); } barrier(CLK_LOCAL_MEM_FENCE); - if(x >= width || y >= height) return; // re-position buffer buffer += (lyid + 1) * buffwd + lxid + 1; float4 o = buffer[0]; + // don't touch outermost 1 pixels, just copy + if(x < 1 || x >= width-1 || y < 1 || y >= height-1) + { + write_imagef(out, (int2) (x, y), o); + return; + } // 3x3 median for R float s0 = buffer[-buffwd - 1].x - buffer[-buffwd - 1].y; @@ -395,7 +419,7 @@ color_smoothing(read_only image2d_t in, write_only image2d_t out, const int widt cas(s6, s4); cas(s4, s2); - o.x = fmax(s4 + o.y, 0.0f); + o.x = s4 + o.y; // 3x3 median for B @@ -429,9 +453,9 @@ color_smoothing(read_only image2d_t in, write_only image2d_t out, const int widt cas(s6, s4); cas(s4, s2); - o.z = fmax(s4 + o.y, 0.0f); + o.z = s4 + o.y; - write_imagef(out, (int2) (x, y), fmax(o, 0.0f)); + write_imagef(out, (int2) (x, y), o); } #undef cas @@ -544,11 +568,16 @@ clip_and_zoom_demosaic_half_size(__read_only image2d_t in, * in (float) or (float4).x -> out (float4) */ kernel void -ppg_demosaic_green (read_only image2d_t in, write_only image2d_t out, const int width, const int height, - const unsigned int filters, local float *buffer) +ppg_demosaic_green (read_only image2d_t in, + write_only image2d_t out, + const int width, + const int height, + const unsigned int filters, + local float *buffer, + const int border) { - const int x = get_global_id(0); - const int y = get_global_id(1); + const int col = get_global_id(0); + const int row = get_global_id(1); const int xlsz = get_local_size(0); const int ylsz = get_local_size(1); const int xlid = get_local_id(0); @@ -577,7 +606,7 @@ ppg_demosaic_green (read_only image2d_t in, write_only image2d_t out, const int if(bufidx >= maxbuf) continue; const int xx = xul + bufidx % stride; const int yy = yul + bufidx / stride; - buffer[bufidx] = read_imagef(in, sampleri, (int2)(xx, yy)).x; + buffer[bufidx] = readsingle(in, xx, yy); } // center buffer around current x,y-Pixel @@ -585,23 +614,22 @@ ppg_demosaic_green (read_only image2d_t in, write_only image2d_t out, const int barrier(CLK_LOCAL_MEM_FENCE); - // make sure we dont write the outermost 3 pixels - if(x >= width - 3 || x < 3 || y >= height - 3 || y < 3) return; + if(col >= width - 3 || col < 3 || row >= height - 3 || row < 3) return; + if(col >= border && col < width - border && row >= border && row < height - border) return; + // process all non-green pixels - const int row = y; - const int col = x; const int c = FC(row, col, filters); - float4 color; // output color + float4 color = 0.0f; // output color const float pc = buffer[0]; - if (c == 0) color.x = pc; // red - else if(c == 1) color.y = pc; // green1 - else if(c == 2) color.z = pc; // blue + if (c == RED) color.x = pc; + else if(c == GREEN) color.y = pc; + else if(c == BLUE) color.z = pc; else color.y = pc; // green2 // fill green layer for red and blue pixels: - if(c == 0 || c == 2) + if(c == RED || c == BLUE) { // look up horizontal and vertical neighbours, sharpened weight: const float pym = buffer[-1 * stride]; @@ -640,7 +668,7 @@ ppg_demosaic_green (read_only image2d_t in, write_only image2d_t out, const int color.y = fmax(fmin(guessx*0.25f, M), m); } } - write_imagef (out, (int2)(x, y), fmax(color, 0.0f)); + write_imagef (out, (int2)(col, row), color); } @@ -649,12 +677,17 @@ ppg_demosaic_green (read_only image2d_t in, write_only image2d_t out, const int * in (float4) -> out (float4) */ kernel void -ppg_demosaic_redblue (read_only image2d_t in, write_only image2d_t out, const int width, const int height, - const unsigned int filters, local float4 *buffer) +ppg_demosaic_redblue (read_only image2d_t in, + write_only image2d_t out, + const int width, + const int height, + const unsigned int filters, + local float4 *buffer, + const int border) { // image in contains full green and sparse r b - const int x = get_global_id(0); - const int y = get_global_id(1); + const int col = get_global_id(0); + const int row = get_global_id(1); const int xlsz = get_local_size(0); const int ylsz = get_local_size(1); const int xlid = get_local_id(0); @@ -683,7 +716,7 @@ ppg_demosaic_redblue (read_only image2d_t in, write_only image2d_t out, const in if(bufidx >= maxbuf) continue; const int xx = xul + bufidx % stride; const int yy = yul + bufidx / stride; - buffer[bufidx] = read_imagef(in, sampleri, (int2)(xx, yy)); + buffer[bufidx] = readpixel(in, xx, yy); } // center buffer around current x,y-Pixel @@ -691,69 +724,66 @@ ppg_demosaic_redblue (read_only image2d_t in, write_only image2d_t out, const in barrier(CLK_LOCAL_MEM_FENCE); - if(x >= width || y >= height) return; - const int row = y; - const int col = x; + if(col >= width || row >= height) return; + if(col >= border && col < width - border && row >= border && row < height - border) return; + const int c = FC(row, col, filters); float4 color = buffer[0]; - if(x == 0 || y == 0 || x == (width-1) || y == (height-1)) + if(row > 0 && col > 0 && col < width - 1 && row < height - 1) { - write_imagef (out, (int2)(x, y), fmax(color, 0.0f)); - return; - } - - if(c == 1 || c == 3) - { // calculate red and blue for green pixels: - // need 4-nbhood: - const float4 nt = buffer[-stride]; - const float4 nb = buffer[ stride]; - const float4 nl = buffer[-1]; - const float4 nr = buffer[ 1]; - if(FC(row, col+1, filters) == 0) // red nb in same row - { - color.z = (nt.z + nb.z + 2.0f*color.y - nt.y - nb.y)*0.5f; - color.x = (nl.x + nr.x + 2.0f*color.y - nl.y - nr.y)*0.5f; + if(c == GREEN || c == 3) + { // calculate red and blue for green pixels: + // need 4-nbhood: + const float4 nt = buffer[-stride]; + const float4 nb = buffer[ stride]; + const float4 nl = buffer[-1]; + const float4 nr = buffer[ 1]; + if(FC(row, col+1, filters) == RED) // red nb in same row + { + color.z = (nt.z + nb.z + 2.0f*color.y - nt.y - nb.y)*0.5f; + color.x = (nl.x + nr.x + 2.0f*color.y - nl.y - nr.y)*0.5f; + } + else + { // blue nb + color.x = (nt.x + nb.x + 2.0f*color.y - nt.y - nb.y)*0.5f; + color.z = (nl.z + nr.z + 2.0f*color.y - nl.y - nr.y)*0.5f; + } } else - { // blue nb - color.x = (nt.x + nb.x + 2.0f*color.y - nt.y - nb.y)*0.5f; - color.z = (nl.z + nr.z + 2.0f*color.y - nl.y - nr.y)*0.5f; - } - } - else - { - // get 4-star-nbhood: - const float4 ntl = buffer[-stride - 1]; - const float4 ntr = buffer[-stride + 1]; - const float4 nbl = buffer[ stride - 1]; - const float4 nbr = buffer[ stride + 1]; - - if(c == 0) - { // red pixel, fill blue: - const float diff1 = fabs(ntl.z - nbr.z) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y); - const float guess1 = ntl.z + nbr.z + 2.0f*color.y - ntl.y - nbr.y; - const float diff2 = fabs(ntr.z - nbl.z) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y); - const float guess2 = ntr.z + nbl.z + 2.0f*color.y - ntr.y - nbl.y; - if (diff1 > diff2) color.z = guess2 * 0.5f; - else if(diff1 < diff2) color.z = guess1 * 0.5f; - else color.z = (guess1 + guess2)*0.25f; - } - else // c == 2, blue pixel, fill red: { - const float diff1 = fabs(ntl.x - nbr.x) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y); - const float guess1 = ntl.x + nbr.x + 2.0f*color.y - ntl.y - nbr.y; - const float diff2 = fabs(ntr.x - nbl.x) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y); - const float guess2 = ntr.x + nbl.x + 2.0f*color.y - ntr.y - nbl.y; - if (diff1 > diff2) color.x = guess2 * 0.5f; - else if(diff1 < diff2) color.x = guess1 * 0.5f; - else color.x = (guess1 + guess2)*0.25f; + // get 4-star-nbhood: + const float4 ntl = buffer[-stride - 1]; + const float4 ntr = buffer[-stride + 1]; + const float4 nbl = buffer[ stride - 1]; + const float4 nbr = buffer[ stride + 1]; + + if(c == RED) + { // red pixel, fill blue: + const float diff1 = fabs(ntl.z - nbr.z) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y); + const float guess1 = ntl.z + nbr.z + 2.0f*color.y - ntl.y - nbr.y; + const float diff2 = fabs(ntr.z - nbl.z) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y); + const float guess2 = ntr.z + nbl.z + 2.0f*color.y - ntr.y - nbl.y; + if (diff1 > diff2) color.z = guess2 * 0.5f; + else if(diff1 < diff2) color.z = guess1 * 0.5f; + else color.z = (guess1 + guess2)*0.25f; + } + else // c == 2, blue pixel, fill red: + { + const float diff1 = fabs(ntl.x - nbr.x) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y); + const float guess1 = ntl.x + nbr.x + 2.0f*color.y - ntl.y - nbr.y; + const float diff2 = fabs(ntr.x - nbl.x) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y); + const float guess2 = ntr.x + nbl.x + 2.0f*color.y - ntr.y - nbl.y; + if (diff1 > diff2) color.x = guess2 * 0.5f; + else if(diff1 < diff2) color.x = guess1 * 0.5f; + else color.x = (guess1 + guess2)*0.25f; + } } } - write_imagef (out, (int2)(x, y), fmax(color, 0.0f)); + write_imagef (out, (int2)(col, row), color); } /** - * Demosaic image border + * Demosaic image border for the three outermost photosites */ kernel void border_interpolate(read_only image2d_t in, write_only image2d_t out, const int width, const int height, const unsigned int filters) @@ -762,27 +792,23 @@ border_interpolate(read_only image2d_t in, write_only image2d_t out, const int w const int y = get_global_id(1); if(x >= width || y >= height) return; - - const int avgwindow = 1; - const int border = 3; - - if(x >= border && x < width-border && y >= border && y < height-border) return; + if(x >= 3 && x < width-3 && y >= 3 && y < height-3) return; float4 o; float sum[4] = { 0.0f }; int count[4] = { 0 }; - for (int j=y-avgwindow; j<=y+avgwindow; j++) for (int i=x-avgwindow; i<=x+avgwindow; i++) + for(int j=y-1; j<=y+1; j++) for (int i=x-1; i<=x+1; i++) { - if (j>=0 && i>=0 && j=0 && i>=0 && j 0 ? sum[0]/count[0] : i; o.y = count[1]+count[3] > 0 ? (sum[1]+sum[3])/(count[1]+count[3]) : i; o.z = count[2] > 0 ? sum[2]/count[2] : i; diff --git a/data/kernels/demosaic_rcd.cl b/data/kernels/demosaic_rcd.cl index a6202149cd13..73b885a5a4af 100644 --- a/data/kernels/demosaic_rcd.cl +++ b/data/kernels/demosaic_rcd.cl @@ -32,7 +32,7 @@ __kernel void rcd_populate (__read_only image2d_t in, global float *cfa, global const int col = get_global_id(0); const int row = get_global_id(1); if(col >= w || row >= height) return; - const float val = scale * fmax(0.0f, read_imagef(in, sampleri, (int2)(col, row)).x); + const float val = scale * fmax(0.0f, readsingle(in, col, row)); const int color = FC(row, col, filters); global float *rgbcol = rgb0; @@ -51,7 +51,7 @@ __kernel void rcd_write_output (__write_only image2d_t out, global float *rgb0, if(!(col >= border && col < w - border && row >= border && row < height - border)) return; const int idx = mad24(row, w, col); - write_imagef(out, (int2)(col, row), (float4)(fmax(scale * rgb0[idx], 0.0f), fmax(scale * rgb1[idx], 0.0f), fmax(scale * rgb2[idx], 0.0f), 0.0f)); + write_imagef(out, (int2)(col, row), (float4)(scale * rgb0[idx], scale * rgb1[idx], scale * rgb2[idx], 0.0f)); } #define eps 1e-5f // Tolerance to avoid dividing by zero @@ -278,9 +278,9 @@ __kernel void write_blended_dual(__read_only image2d_t high, const int row = get_global_id(1); if((col >= w) || (row >= height)) return; - const float4 high_val = read_imagef(high, sampleri, (int2)(col, row)); - const float4 low_val = read_imagef(low, sampleri, (int2)(col, row)); - const float4 blender = clipf(mask[mad24(row, w, col)]); + const float4 high_val = readpixel(high, col, row); + const float4 low_val = readpixel(low, col, row); + const float4 blender = (float4)clipf(mask[mad24(row, w, col)]); float4 data = mix(low_val, high_val, blender); data.w = showmask ? blender.x : 0.0f; write_imagef(out, (int2)(col, row), data); @@ -297,8 +297,8 @@ __kernel void calc_Y0_mask(global float *mask, if((col >= w) || (row >= height)) return; const int idx = mad24(row, w, col); - const float4 pt = wb * fmax(0.0f, read_imagef(in, sampleri, (int2)(col, row))); - mask[idx] = dtcl_sqrt(0.33333333f *(pt.x + pt.y + pt.z)); + const float4 pt = wb * readpixel(in, col, row); + mask[idx] = dtcl_sqrt(0.33333333f * fmax(0.0f, (pt.x + pt.y + pt.z))); } __kernel void calc_scharr_mask(global float *in, global float *out, const int w, const int height) @@ -331,203 +331,6 @@ __kernel void calc_detail_blend(global float *in, global float *out, const int w out[idx] = detail ? blend : 1.0f - blend; } -kernel void rcd_border_green(read_only image2d_t in, write_only image2d_t out, const int width, const int height, - const unsigned int filters, local float *buffer, const int border) -{ - const int col = get_global_id(0); - const int row = get_global_id(1); - const int xlsz = get_local_size(0); - const int ylsz = get_local_size(1); - const int xlid = get_local_id(0); - const int ylid = get_local_id(1); - const int xgid = get_group_id(0); - const int ygid = get_group_id(1); - - // individual control variable in this work group and the work group size - const int l = mad24(ylid, xlsz, xlid); - const int lsz = mul24(xlsz, ylsz); - - // stride and maximum capacity of local buffer - // cells of 1*float per pixel with a surrounding border of 3 cells - const int stride = xlsz + 2*3; - const int maxbuf = mul24(stride, ylsz + 2*3); - - // coordinates of top left pixel of buffer - // this is 3 pixel left and above of the work group origin - const int xul = mul24(xgid, xlsz) - 3; - const int yul = mul24(ygid, ylsz) - 3; - - // populate local memory buffer - for(int n = 0; n <= maxbuf/lsz; n++) - { - const int bufidx = mad24(n, lsz, l); - if(bufidx >= maxbuf) continue; - const int xx = xul + bufidx % stride; - const int yy = yul + bufidx / stride; - buffer[bufidx] = fmax(0.0f, read_imagef(in, sampleri, (int2)(xx, yy)).x); - } - - // center buffer around current x,y-Pixel - buffer += mad24(ylid + 3, stride, xlid + 3); - - barrier(CLK_LOCAL_MEM_FENCE); - - if(col >= width - 3 || col < 3 || row >= height - 3 || row < 3) return; - if(col >= border && col < width - border && row >= border && row < height - border) return; - - // process all non-green pixels - const int c = FC(row, col, filters); - float4 color = 0.0f; // output color - - const float pc = buffer[0]; - - if (c == RED) color.x = pc; - else if(c == GREEN) color.y = pc; - else if(c == BLUE) color.z = pc; - else color.y = pc; // green2 - - // fill green layer for red and blue pixels: - if(c == RED || c == BLUE) - { - // look up horizontal and vertical neighbours, sharpened weight: - const float pym = buffer[-1 * stride]; - const float pym2 = buffer[-2 * stride]; - const float pym3 = buffer[-3 * stride]; - const float pyM = buffer[ 1 * stride]; - const float pyM2 = buffer[ 2 * stride]; - const float pyM3 = buffer[ 3 * stride]; - const float pxm = buffer[-1]; - const float pxm2 = buffer[-2]; - const float pxm3 = buffer[-3]; - const float pxM = buffer[ 1]; - const float pxM2 = buffer[ 2]; - const float pxM3 = buffer[ 3]; - const float guessx = (pxm + pc + pxM) * 2.0f - pxM2 - pxm2; - const float diffx = (fabs(pxm2 - pc) + - fabs(pxM2 - pc) + - fabs(pxm - pxM)) * 3.0f + - (fabs(pxM3 - pxM) + fabs(pxm3 - pxm)) * 2.0f; - const float guessy = (pym + pc + pyM) * 2.0f - pyM2 - pym2; - const float diffy = (fabs(pym2 - pc) + - fabs(pyM2 - pc) + - fabs(pym - pyM)) * 3.0f + - (fabs(pyM3 - pyM) + fabs(pym3 - pym)) * 2.0f; - if(diffx > diffy) - { - // use guessy - const float m = fmin(pym, pyM); - const float M = fmax(pym, pyM); - color.y = fmax(fmin(guessy*0.25f, M), m); - } - else - { - const float m = fmin(pxm, pxM); - const float M = fmax(pxm, pxM); - color.y = fmax(fmin(guessx*0.25f, M), m); - } - } - write_imagef (out, (int2)(col, row), fmax(color, 0.0f)); -} -kernel void rcd_border_redblue(read_only image2d_t in, write_only image2d_t out, const int width, const int height, - const unsigned int filters, local float4 *buffer, const int border) -{ - // image in contains full green and sparse r b - const int col = get_global_id(0); - const int row = get_global_id(1); - const int xlsz = get_local_size(0); - const int ylsz = get_local_size(1); - const int xlid = get_local_id(0); - const int ylid = get_local_id(1); - const int xgid = get_group_id(0); - const int ygid = get_group_id(1); - - // individual control variable in this work group and the work group size - const int l = mad24(ylid, xlsz, xlid); - const int lsz = mul24(xlsz, ylsz); - - // stride and maximum capacity of local buffer - // cells of float4 per pixel with a surrounding border of 1 cell - const int stride = xlsz + 2; - const int maxbuf = mul24(stride, ylsz + 2); - - // coordinates of top left pixel of buffer - // this is 1 pixel left and above of the work group origin - const int xul = mul24(xgid, xlsz) - 1; - const int yul = mul24(ygid, ylsz) - 1; - - // populate local memory buffer - for(int n = 0; n <= maxbuf/lsz; n++) - { - const int bufidx = mad24(n, lsz, l); - if(bufidx >= maxbuf) continue; - const int xx = xul + bufidx % stride; - const int yy = yul + bufidx / stride; - buffer[bufidx] = fmax(0.0f, read_imagef(in, sampleri, (int2)(xx, yy))); - } - - // center buffer around current x,y-Pixel - buffer += mad24(ylid + 1, stride, xlid + 1); - - barrier(CLK_LOCAL_MEM_FENCE); - - if(col >= width || row >= height) return; - if(col >= border && col < width - border && row >= border && row < height - border) return; - - const int c = FC(row, col, filters); - float4 color = buffer[0]; - if(row > 0 && col > 0 && col < width - 1 && row < height - 1) - { - if(c == GREEN || c == 3) - { // calculate red and blue for green pixels: - // need 4-nbhood: - const float4 nt = buffer[-stride]; - const float4 nb = buffer[ stride]; - const float4 nl = buffer[-1]; - const float4 nr = buffer[ 1]; - if(FC(row, col+1, filters) == RED) // red nb in same row - { - color.z = (nt.z + nb.z + 2.0f*color.y - nt.y - nb.y)*0.5f; - color.x = (nl.x + nr.x + 2.0f*color.y - nl.y - nr.y)*0.5f; - } - else - { // blue nb - color.x = (nt.x + nb.x + 2.0f*color.y - nt.y - nb.y)*0.5f; - color.z = (nl.z + nr.z + 2.0f*color.y - nl.y - nr.y)*0.5f; - } - } - else - { - // get 4-star-nbhood: - const float4 ntl = buffer[-stride - 1]; - const float4 ntr = buffer[-stride + 1]; - const float4 nbl = buffer[ stride - 1]; - const float4 nbr = buffer[ stride + 1]; - - if(c == RED) - { // red pixel, fill blue: - const float diff1 = fabs(ntl.z - nbr.z) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y); - const float guess1 = ntl.z + nbr.z + 2.0f*color.y - ntl.y - nbr.y; - const float diff2 = fabs(ntr.z - nbl.z) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y); - const float guess2 = ntr.z + nbl.z + 2.0f*color.y - ntr.y - nbl.y; - if (diff1 > diff2) color.z = guess2 * 0.5f; - else if(diff1 < diff2) color.z = guess1 * 0.5f; - else color.z = (guess1 + guess2)*0.25f; - } - else // c == 2, blue pixel, fill red: - { - const float diff1 = fabs(ntl.x - nbr.x) + fabs(ntl.y - color.y) + fabs(nbr.y - color.y); - const float guess1 = ntl.x + nbr.x + 2.0f*color.y - ntl.y - nbr.y; - const float diff2 = fabs(ntr.x - nbl.x) + fabs(ntr.y - color.y) + fabs(nbl.y - color.y); - const float guess2 = ntr.x + nbl.x + 2.0f*color.y - ntr.y - nbl.y; - if (diff1 > diff2) color.x = guess2 * 0.5f; - else if(diff1 < diff2) color.x = guess1 * 0.5f; - else color.x = (guess1 + guess2)*0.25f; - } - } - } - write_imagef (out, (int2)(col, row), fmax(color, 0.0f)); -} - kernel void demosaic_box3(read_only image2d_t in, write_only image2d_t out, const int width, @@ -547,7 +350,7 @@ kernel void demosaic_box3(read_only image2d_t in, if(x >= 0 && y >= 0 && x < width && y < height) { const int color = fcol(y, x, filters, xtrans); - sum[color] += fmax(0.0f, read_imagef(in, sampleri, (int2)(x, y)).x); + sum[color] += readsingle(in, x, y); cnt[color] += 1.0f; } } @@ -556,3 +359,15 @@ kernel void demosaic_box3(read_only image2d_t in, write_imagef(out, (int2)(col, row), rgb); } +kernel void +image_positive(read_only image2d_t in, write_only image2d_t out, const int width, const int height) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if(x >= width || y >= height) return; + + float4 pixel = fmax(0.0f, readpixel(in, x, y)); + write_imagef(out, (int2)(x, y), pixel); +} + diff --git a/data/kernels/demosaic_vng.cl b/data/kernels/demosaic_vng.cl index 1cbff4807f1c..3ff6482a3665 100644 --- a/data/kernels/demosaic_vng.cl +++ b/data/kernels/demosaic_vng.cl @@ -28,7 +28,7 @@ vng_lin_interpolate(read_only image2d_t in, global const unsigned char (*const xtrans)[6], global const int (*const lookup)[16][32], local float *buffer, - const int equil) + const int only_linear) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -59,7 +59,7 @@ vng_lin_interpolate(read_only image2d_t in, if(bufidx >= maxbuf) continue; const int xx = xul + bufidx % stride; const int yy = yul + bufidx / stride; - buffer[bufidx] = fmax(0.0f, read_imagef(in, sampleri, (int2)(xx, yy)).x); + buffer[bufidx] = readsingle(in, xx, yy); } // center buffer around current x,y-Pixel @@ -73,31 +73,40 @@ vng_lin_interpolate(read_only image2d_t in, const bool is_xtrans = filters == 9; const int colors = is_xtrans ? 3 : 4; - const int av = (filters == 9) ? 2 : 1; const int size = is_xtrans ? 6 : 16; float sum[4] = { 0.0f }; float o[4] = { 0.0f }; + // approximate outermost single pixels if(x < 1 || x >= width-1 || y < 1 || y >= height-1) { int count[4] = { 0 }; - for(int j = y-av; j <= y+av; j++) + for(int j = y-1; j <= y+1; j++) { - for(int i = x-av; i <= x+av; i++) + for(int i = x-1; i <= x+1; i++) { if(j >= 0 && i >= 0 && j < height && i < width) { const int f = fcol(j, i, filters, xtrans); - sum[f] += fmax(0.0f, read_imagef(in, sampleri, (int2)(i, j)).x); + sum[f] += readsingle(in, i, j); count[f]++; } } } + const int f = fcol(y, x, filters, xtrans); + // for current cell, copy the current sensor's color data, + // interpolate the other two colors from surrounding pixels of + // their color for(int c = 0; c < colors; c++) - o[c] = sum[c] / fmax(1.0f, (float)count[c]); + { + if(c != f && count[c] != 0) + o[c] = sum[c] / count[c]; + else + o[c] = readsingle(in, x, y); + } } - else + else // do the bilinear stuff { global const int *ip = lookup[y % size][x % size]; const int num_pixels = ip[0]; @@ -120,17 +129,18 @@ vng_lin_interpolate(read_only image2d_t in, o[ip[0]] = buffer[0]; } - if(!is_xtrans && equil) + if(only_linear) { - o[1] = 0.5f * (o[1] + o[3]); + o[0] = o[0]; + o[1] = is_xtrans ? o[1] : 0.5f * (o[1] + o[3]); + o[2] = o[2]; o[3] = 0.0f; } write_imagef(out, (int2)(x, y), (float4)(o[0], o[1], o[2], o[3])); } kernel void -vng_interpolate(read_only image2d_t input, - read_only image2d_t in, +vng_interpolate(read_only image2d_t in, write_only image2d_t out, const int width, const int height, @@ -169,7 +179,7 @@ vng_interpolate(read_only image2d_t input, if(bufidx >= maxbuf) continue; const int xx = xul + bufidx % stride; const int yy = yul + bufidx / stride; - const float4 pixel = fmax(0.0f, read_imagef(in, sampleri, (int2)(xx, yy))); + const float4 pixel = readpixel(in, xx, yy); vstore4(pixel, bufidx, buffer); } @@ -179,10 +189,21 @@ vng_interpolate(read_only image2d_t input, barrier(CLK_LOCAL_MEM_FENCE); if(x >= width || y >= height) return; - const bool is_xtrans = filters == 9; - const int colors = is_xtrans ? 3 : 4; - const int av = is_xtrans ? 2 : 1; + const bool is_xtrans = filters == 9; + // we don't touch data at the outermost 2 pixels + if(x < 2 || x >= width-2 || y < 2 || y >= height-2) + { + float4 val = readpixel(in, x, y); + if(!is_xtrans) + { + val.y = 0.5f * (val.y + val.w); + val.w = 0.0f; + } + write_imagef(out, (int2)(x, y), val); + return; + } + const int colors = is_xtrans ? 3 : 4; const int prow = is_xtrans ? 6 : 8; const int pcol = is_xtrans ? 6 : 2; @@ -227,41 +248,14 @@ vng_interpolate(read_only image2d_t input, if(gmax < gval[g]) gmax = gval[g]; } - const bool border = x < 2 || x >= width-2 || y < 2 || y >= height-2; - float b[4] = { 0.0f }; - // Ok lets calculate the border pixel, we'll need it soon - if(border) - { - int count[4] = { 0 }; - for(int j = y-av; j <= y+av; j++) - { - for(int i = x-av; i <= x+av; i++) - { - if(j >= 0 && i >= 0 && j < height && i < width) - { - const int f = fcol(j, i, filters, xtrans); - b[f] += fmax(0.0f, read_imagef(input, sampleri, (int2)(i, j)).x); - count[f]++; - } - } - } - for(int c = 0; c < colors; c++) - b[c] /= fmax(1.0f, (float)count[c]); - } - + float o[4] = { 0.0f }; if(gmax == 0.0f) { - if(!border) - { - for(int c = 0; c < colors; c++) - b[c] = buffer[c]; - } - if(!is_xtrans) - { - b[1] = 0.5f * (b[1] + b[3]); - b[3] = 0.0f; - } - write_imagef(out, (int2)(x, y), (float4)(b[0], b[1], b[2], b[3])); + o[0] = buffer[0]; + o[1] = is_xtrans ? buffer[1] : 0.5f * (buffer[1] + buffer[3]); + o[2] = buffer[2]; + o[3] = 0.0f; + write_imagef(out, (int2)(x, y), (float4)(o[0], o[1], o[2], o[3])); return; } @@ -303,25 +297,15 @@ vng_interpolate(read_only image2d_t input, } // save to output - float o[4] = { 0.0f }; for(int c = 0; c < colors; c++) { float tot = buffer[color]; if(c != color) tot += (sum[c] - sum[color]) / num; - o[c] = fmax(0.0f, tot); - } - - if(border) - { - for(int c = 0; c < colors; c++) - o[c] = b[c]; + o[c] = tot; } - if(!is_xtrans) - { - o[1] = 0.5f * (o[1] + o[3]); - o[3] = 0.0f; - } + o[1] = is_xtrans ? o[1] : 0.5f * (o[1] + o[3]); + o[3] = 0.0f; write_imagef(out, (int2)(x, y), (float4)(o[0], o[1], o[2], 0.0f)); } @@ -362,7 +346,7 @@ clip_and_zoom_demosaic_third_size_xtrans(read_only image2d_t in, write_only imag { for(int j = 0; j < 3; j++) for(int i = 0; i < 3; i++) - col[FCxtrans(yy + j, xx + i, xtrans)] += fmax(0.0f, read_imagef(in, sampleri, (int2)(xx + i, yy + j)).x); + col[FCxtrans(yy + j, xx + i, xtrans)] += fmax(0.0f, readsingle(in, xx + i, yy + j)); num++; } diff --git a/data/kernels/guided_filter.cl b/data/kernels/guided_filter.cl index 36ca2cffd284..0c105afdb0ed 100644 --- a/data/kernels/guided_filter.cl +++ b/data/kernels/guided_filter.cl @@ -40,16 +40,6 @@ kernel void guided_filter_split_rgb_image(const int width, } -// Kahan summation algorithm -#define Kahan_sum(m, c, add) \ - { \ - const float t1 = (add) - (c); \ - const float t2 = (m) + t1; \ - c = (t2 - m) - t1; \ - m = t2; \ - } - - kernel void guided_filter_box_mean_x(const int width, const int height, read_only image2d_t in, diff --git a/src/common/darktable.h b/src/common/darktable.h index 0b8baa93809f..44700bb14602 100644 --- a/src/common/darktable.h +++ b/src/common/darktable.h @@ -249,6 +249,38 @@ char *dt_version_major_minor(); #define INVPHI 0.61803398874989479F #endif + +/* Some CPU demosaicers do internal tiling, this decreases mem pressure while + increasing performance as processed data is available in cpu cache. + The default values have been calculated for a cachesize of 1MB per thread, + this is mostly still valid in 2026 but you may tune for performance. + + Due to internal code AMAZETS should be a multiple of 32, for RCD and LMMSE + a multiple of 8. + + Note: for the curious, if we calculate the tilesizes at runtime we loose the + performance gain as the compiler can't optimize that much. +*/ +#ifndef DT_RCD_TILESIZE +#define DT_RCD_TILESIZE 112 +#endif + +#ifndef DT_LMMSE_TILESIZE +#define DT_LMMSE_TILESIZE 136 +#endif + +#ifndef AMAZETS +#define AMAZETS 160 +#endif + +#ifndef DT_MARKESTEIJN_TS +#define DT_MARKESTEIJN_TS 122 +#endif + +#ifndef DT_FDC_TS +#define DT_FDC_TS 122 +#endif + // NaN-safe clamping (NaN compares false, and will thus result in H) #define CLAMPS(A, L, H) ((A) > (L) ? ((A) < (H) ? (A) : (H)) : (L)) diff --git a/src/common/interpolation.c b/src/common/interpolation.c index ced583c98a11..5137bd4da9c2 100644 --- a/src/common/interpolation.c +++ b/src/common/interpolation.c @@ -1142,11 +1142,9 @@ void dt_interpolation_resample(const dt_interpolation_t *itor, // Output pixel is ready const size_t baseidx = (size_t)oy * out_stride_floats + (size_t)ox * 4; - // Clip negative RGB that may be produced by Lanczos undershooting - // Negative RGB are invalid values no matter the RGB space (light is positive) dt_aligned_pixel_t pixel; for_each_channel(c, aligned(vs:16)) - pixel[c] = fmaxf(0.0f, vs[c]); + pixel[c] = vs[c]; copy_pixel_nontemporal(out + baseidx, pixel); // Reset vertical resampling context @@ -1565,7 +1563,7 @@ void dt_interpolation_resample_1c(const dt_interpolation_t *itor, // Output pixel is ready float *o = (float *)((char *)out + (size_t)oy * out_stride + (size_t)ox * sizeof(float)); - *o = fmaxf(0.0f, vs); + *o = vs; // Reset vertical resampling context viidx -= vl; diff --git a/src/develop/masks/detail.c b/src/develop/masks/detail.c index 838df90e63df..6c7aec6fc899 100644 --- a/src/develop/masks/detail.c +++ b/src/develop/masks/detail.c @@ -122,19 +122,19 @@ float *dt_masks_calc_scharr_mask(dt_dev_pixelpipe_t *pipe, } const gboolean wboff = !pipe->dsc.temperature.enabled || !rawmode; - const dt_aligned_pixel_t wb = { wboff ? 1.0f : pipe->dsc.temperature.coeffs[0], - wboff ? 1.0f : pipe->dsc.temperature.coeffs[1], - wboff ? 1.0f : pipe->dsc.temperature.coeffs[2] }; + const dt_aligned_pixel_t wb = { wboff ? 1.0f : 1.0f / pipe->dsc.temperature.coeffs[0], + wboff ? 1.0f : 1.0f / pipe->dsc.temperature.coeffs[1], + wboff ? 1.0f : 1.0f / pipe->dsc.temperature.coeffs[2] }; DT_OMP_FOR_SIMD(aligned(tmp : 64)) for(size_t idx =0; idx < msize; idx++) { - const float val = fmaxf(0.0f, src[4 * idx] / wb[0]) - + fmaxf(0.0f, src[4 * idx + 1] / wb[1]) - + fmaxf(0.0f, src[4 * idx + 2] / wb[2]); + const float val = src[4 * idx] * wb[0] + + src[4 * idx + 1] * wb[1] + + src[4 * idx + 2] * wb[2]; // add a gamma. sqrtf should make noise variance the same for all image - tmp[idx] = sqrtf(val / 3.0f); + tmp[idx] = sqrtf(0.33333333f * fmaxf(0.0f, val)); } DT_OMP_FOR() diff --git a/src/develop/pixelpipe_hb.c b/src/develop/pixelpipe_hb.c index dad1946df664..d438e7777db3 100644 --- a/src/develop/pixelpipe_hb.c +++ b/src/develop/pixelpipe_hb.c @@ -1236,7 +1236,8 @@ static inline gboolean _module_pipe_stop(dt_dev_pixelpipe_t *pipe, float *input) void dt_dev_prepare_piece_cfa(dt_dev_pixelpipe_iop_t *piece, const dt_iop_roi_t *roi) { dt_iop_module_t *module = piece->module; - if(module && module->input_colorspace(module, piece->pipe, piece) == IOP_CS_RAW) + if(module && (module->input_colorspace(module, piece->pipe, piece) == IOP_CS_RAW + || module->default_colorspace(module, piece->pipe, piece) == IOP_CS_RAW)) { piece->filters = dt_rawspeed_crop_dcraw_filters(piece->pipe->dsc.filters, roi->x, roi->y); for(int ii = 0; ii < 6; ++ii) @@ -2335,6 +2336,17 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe, if(terr == CL_SUCCESS) { module->process(module, piece, clindata, cpudata, &roi_in, roi_out); + if(cst_out == IOP_CS_LAB) + { + for(size_t k = 0; k < (size_t)ow * oh * 4; k+=4) + { + dt_aligned_pixel_t XYZ; + dt_Lab_to_XYZ(cpudata + k, XYZ); + dt_XYZ_to_linearRGB(XYZ, cpudata + k); + dt_Lab_to_XYZ(cloutdata + k, XYZ); + dt_XYZ_to_linearRGB(XYZ, cloutdata + k); + } + } dt_dump_pipe_diff_pfm(module->op, cloutdata, cpudata, ow, oh, cho, dt_dev_pixelpipe_type_to_str(pipe->type)); } } @@ -3622,7 +3634,7 @@ int dt_dev_write_scharr_mask_cl(dt_dev_pixelpipe_iop_t *piece, wboff ? 1.0f : 1.0f / p->dsc.temperature.coeffs[2], 1.0f }; err = dt_opencl_enqueue_kernel_2d_args(devid, darktable.opencl->blendop->kernel_calc_Y0_mask, width, height, - CLARG(tmp), CLARG(in), CLARG(width), CLARG(height), CLFLARRAY(4, wb)); + CLARG(tmp), CLARG(in), CLARG(width), CLARG(height), CLARG(wb)); if(err != CL_SUCCESS) goto error; err = dt_opencl_enqueue_kernel_2d_args(devid, diff --git a/src/iop/demosaic.c b/src/iop/demosaic.c index 34e91fba86dd..a43477d4cb96 100644 --- a/src/iop/demosaic.c +++ b/src/iop/demosaic.c @@ -51,11 +51,8 @@ DT_MODULE_INTROSPECTION(6, dt_iop_demosaic_params_t) #define DT_DEMOSAIC_XTRANS 1024 // masks for non-Bayer demosaic ops #define DT_DEMOSAIC_DUAL 2048 // masks for dual demosaicing methods -#define DT_REDUCESIZE_MIN 64 -// these are highly depending on CPU architecture (cache size) -#define DT_RCD_TILESIZE 112 -#define DT_LMMSE_TILESIZE 136 +// #define DT_DEMOSAIC_POSITIVE typedef enum dt_iop_demosaic_method_t { @@ -235,8 +232,6 @@ typedef struct dt_iop_demosaic_global_data_t int kernel_rcd_step_4_2; int kernel_rcd_step_5_1; int kernel_rcd_step_5_2; - int kernel_rcd_border_redblue; - int kernel_rcd_border_green; int kernel_demosaic_box3; int kernel_write_blended_dual; int gaussian_9x9_mul; @@ -247,6 +242,7 @@ typedef struct dt_iop_demosaic_global_data_t int show_blend_mask; int capture_result; int final_blend; + int image_positive; float *gauss_coeffs; } dt_iop_demosaic_global_data_t; @@ -726,7 +722,10 @@ void process(dt_iop_module_t *self, const gboolean do_capture = !passthru && !is_4bayer && !show_dual && !run_fast && d->cs_enabled; const gboolean greens = is_bayer && d->green_eq != DT_IOP_GREEN_EQ_NO && no_masking && !run_fast && !true_monochrome; - const float procmax = dt_iop_get_processed_maximum(piece); + float procmax = dt_iop_get_processed_maximum(piece); + if(procmax < 1.5f) procmax = 2.0f; + else if(procmax < 3.0f) procmax = 4.0f; + else procmax = 8.0f; const float procmin = dt_iop_get_processed_minimum(piece); const int exif_iso = img->exif_iso; @@ -858,7 +857,7 @@ void process(dt_iop_module_t *self, if(method == DT_IOP_DEMOSAIC_FDC) xtrans_fdc_interpolate(t_out, t_in, width, t_rows, xtrans, exif_iso); else if(method == DT_IOP_DEMOSAIC_MARKESTEIJN || method == DT_IOP_DEMOSAIC_MARKESTEIJN_3) - xtrans_markesteijn_interpolate(t_out, t_in, width, t_rows, xtrans, passes); + xtrans_markesteijn_interpolate(t_out, t_in, width, t_rows, xtrans, passes, filters); else vng_interpolate(t_out, t_in, width, t_rows, filters, xtrans, FALSE); } @@ -878,7 +877,7 @@ void process(dt_iop_module_t *self, else if(method == DT_IOP_DEMOSAIC_LMMSE) lmmse_demosaic(t_out, t_in, width, t_rows, filters, d->lmmse_refine, procmax); else if(method != DT_IOP_DEMOSAIC_AMAZE) - demosaic_ppg(t_out, t_in, width, t_rows, filters, d->median_thrs); + demosaic_ppg(t_out, t_in, width, t_rows, filters, d->median_thrs, 100000); else amaze_demosaic(t_in, t_out, width, t_rows, filters, procmin); } @@ -908,6 +907,13 @@ void process(dt_iop_module_t *self, dt_iop_clip_and_zoom_roi((float *)o, out, roi_out, roi_in); dt_free_align(out); } + +#ifdef DT_DEMOSAIC_POSITIVE + float *oo = (float *)o; + DT_OMP_FOR_SIMD(aligned(oo : 64)) + for(size_t k = 0; k < (size_t)roi_out->width * roi_out->height * 4; k++) + oo[k] = fmaxf(0.0f, oo[k]); +#endif } #ifdef HAVE_OPENCL @@ -1205,7 +1211,13 @@ int process_cl(dt_iop_module_t *self, } if(!direct) - err = dt_iop_clip_and_zoom_roi_cl(devid, dev_out, out_image, roi_out, roi_in); + err = dt_iop_clip_and_zoom_roi_cl(devid, dev_out, out_image, roi_out, roi_in); + +#ifdef DT_DEMOSAIC_POSITIVE + if(err == CL_SUCCESS) + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->image_positive, roi_out->width, roi_out->height, + CLARG(dev_out), CLARG(dev_out), CLARG(roi_out->width), CLARG(roi_out->height)); +#endif finish: dt_opencl_release_mem_object(dev_xtrans); @@ -1286,10 +1298,9 @@ void init_global(dt_iop_module_so_t *self) gd->kernel_rcd_step_4_2 = dt_opencl_create_kernel(rcd, "rcd_step_4_2"); gd->kernel_rcd_step_5_1 = dt_opencl_create_kernel(rcd, "rcd_step_5_1"); gd->kernel_rcd_step_5_2 = dt_opencl_create_kernel(rcd, "rcd_step_5_2"); - gd->kernel_rcd_border_redblue = dt_opencl_create_kernel(rcd, "rcd_border_redblue"); - gd->kernel_rcd_border_green = dt_opencl_create_kernel(rcd, "rcd_border_green"); gd->kernel_demosaic_box3 = dt_opencl_create_kernel(rcd, "demosaic_box3"); gd->kernel_write_blended_dual = dt_opencl_create_kernel(rcd, "write_blended_dual"); + gd->image_positive = dt_opencl_create_kernel(rcd, "image_positive"); const int capt = 38; // capture.cl, from programs.conf gd->gaussian_9x9_mul = dt_opencl_create_kernel(capt, "kernel_9x9_mul"); @@ -1353,8 +1364,6 @@ void cleanup_global(dt_iop_module_so_t *self) dt_opencl_free_kernel(gd->kernel_rcd_step_4_2); dt_opencl_free_kernel(gd->kernel_rcd_step_5_1); dt_opencl_free_kernel(gd->kernel_rcd_step_5_2); - dt_opencl_free_kernel(gd->kernel_rcd_border_redblue); - dt_opencl_free_kernel(gd->kernel_rcd_border_green); dt_opencl_free_kernel(gd->kernel_demosaic_box3); dt_opencl_free_kernel(gd->kernel_write_blended_dual); dt_opencl_free_kernel(gd->gaussian_9x9_mul); @@ -1365,6 +1374,7 @@ void cleanup_global(dt_iop_module_so_t *self) dt_opencl_free_kernel(gd->show_blend_mask); dt_opencl_free_kernel(gd->capture_result); dt_opencl_free_kernel(gd->final_blend); + dt_opencl_free_kernel(gd->image_positive); dt_free_align(gd->gauss_coeffs); free(self->data); self->data = NULL; diff --git a/src/iop/demosaicing/amaze.cc b/src/iop/demosaicing/amaze.cc index dc2c4dbd5f58..64ce4b826735 100644 --- a/src/iop/demosaicing/amaze.cc +++ b/src/iop/demosaicing/amaze.cc @@ -135,11 +135,6 @@ void amaze_demosaic(const float *const in, { const float clip_pt8 = 0.8f * clip_pt; -// this allows to pass AMAZETS to the code. On some machines larger AMAZETS is faster -// If AMAZETS is undefined it will be set to 160, which is the fastest on machines with 1GB cache per thread. -#ifndef AMAZETS -#define AMAZETS 160 -#endif // Tile size; the image is processed in square tiles to lower memory requirements and facilitate // multi-threading // We assure that Tile size is a multiple of 32 in the range [96;992] diff --git a/src/iop/demosaicing/basics.c b/src/iop/demosaicing/basics.c index 851d880cd6c5..5d3472603fe0 100644 --- a/src/iop/demosaicing/basics.c +++ b/src/iop/demosaicing/basics.c @@ -65,7 +65,7 @@ static void pre_median_b(float *out, for(int i = 0; i < 8; i++) for(int ii = i + 1; ii < 9; ii++) if(med[i] > med[ii]) SWAP(med[i], med[ii]); - pixo[0] = fmaxf(0.0f, (cnt == 1 ? med[4] - 64.0f : med[(cnt - 1) / 2])); + pixo[0] = cnt == 1 ? med[4] - 64.0f : med[(cnt - 1) / 2]; // pixo[0] = med[(cnt-1)/2]; pixo += 2; pixi += 2; @@ -138,7 +138,7 @@ static void color_smoothing(float *out, SWAPmed(4, 2); SWAPmed(6, 4); SWAPmed(4, 2); - outp[c] = fmaxf(med[4] + outp[1], 0.0f); + outp[c] = med[4] + outp[1]; } } } @@ -156,9 +156,9 @@ static void green_equilibration_lavg(float *out, const float maximum = 1.0f; int oj = 2, oi = 2; - if(FC(oj, oi, filters) != 1) oj++; - if(FC(oj, oi, filters) != 1) oi++; - if(FC(oj, oi, filters) != 1) oj--; + if(FC(oj, oi, filters) != GREEN) oj++; + if(FC(oj, oi, filters) != GREEN) oi++; + if(FC(oj, oi, filters) != GREEN) oj--; dt_iop_image_copy_by_size(out, in, width, height, 1); @@ -190,7 +190,7 @@ static void green_equilibration_lavg(float *out, + fabsf(o2_3 - o2_4) + fabsf(o2_2 - o2_4)) / 6.0f; if((in[j * width + i] < maximum * 0.95f) && (c1 < maximum * thr) && (c2 < maximum * thr)) { - out[j * width + i] = fmaxf(0.0f, in[j * width + i] * m1 / m2); + out[j * width + i] = in[j * width + i] * m1 / m2; } } } @@ -230,7 +230,7 @@ static void green_equilibration_favg(float *out, { for(int i = oi; i < (width - 1 - g2_offset); i += 2) { - out[(size_t)j * width + i] = fmaxf(0.0f, in[(size_t)j * width + i] * gr_ratio); + out[(size_t)j * width + i] = in[(size_t)j * width + i] * gr_ratio; } } } @@ -266,10 +266,10 @@ static int color_smoothing_cl(const dt_iop_module_t *self, cl_mem dev_t1 = dev_out; cl_mem dev_t2 = dev_tmp; + const size_t sizes[] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 }; + const size_t local[] = { locopt.sizex, locopt.sizey, 1 }; for(int pass = 0; pass < passes; pass++) { - const size_t sizes[] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 }; - const size_t local[] = { locopt.sizex, locopt.sizey, 1 }; err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_color_smoothing, sizes, local, CLARG(dev_t1), CLARG(dev_t2), CLARG(width), CLARG(height), CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 2) * (locopt.sizey + 2))); @@ -298,6 +298,7 @@ static int color_smoothing_cl(const dt_iop_module_t *self, return err; } +#define DT_REDUCESIZE_MIN 64 static int green_equilibration_cl(const dt_iop_module_t *self, const dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, @@ -414,14 +415,15 @@ static int green_equilibration_cl(const dt_iop_module_t *self, sizeof(float) * 2 * reducesize, CL_TRUE); if(err != CL_SUCCESS) goto error; - float sum1 = 0.0f, sum2 = 0.0f; + double sum1 = 0.0; + double sum2 = 0.0; for(int k = 0; k < reducesize; k++) { - sum1 += sumsum[2 * k]; - sum2 += sumsum[2 * k + 1]; + sum1 += (double)sumsum[2 * k]; + sum2 += (double)sumsum[2 * k + 1]; } - const float gr_ratio = (sum1 > 0.0f && sum2 > 0.0f) ? sum2 / sum1 : 1.0f; + const float gr_ratio = (sum1 > 0.0 && sum2 > 0.0) ? (float)(sum2 / sum1) : 1.0f; err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_green_eq_favg_apply, width, height, CLARG(dev_in1), CLARG(dev_out1), CLARG(width), CLARG(height), CLARG(filters), @@ -548,7 +550,8 @@ static int process_default_cl(const dt_iop_module_t *self, const size_t local[3] = { locopt.sizex, locopt.sizey, 1 }; err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_ppg_green, sizes, local, CLARG(dev_med), CLARG(dev_tmp), CLARG(width), - CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * (locopt.sizex + 2*3) * (locopt.sizey + 2*3))); + CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * (locopt.sizex + 2*3) * (locopt.sizey + 2*3)), + CLARGINT(100000)); if(err != CL_SUCCESS) goto error; } @@ -565,7 +568,8 @@ static int process_default_cl(const dt_iop_module_t *self, const size_t local[3] = { locopt.sizex, locopt.sizey, 1 }; err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_ppg_redblue, sizes, local, CLARG(dev_tmp), CLARG(dev_out), CLARG(width), - CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 2) * (locopt.sizey + 2))); + CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 2) * (locopt.sizey + 2)), + CLARGINT(100000)); if(err != CL_SUCCESS) goto error; } } diff --git a/src/iop/demosaicing/capture.c b/src/iop/demosaicing/capture.c index e7f6beb6377c..02e4e9f5f48c 100644 --- a/src/iop/demosaicing/capture.c +++ b/src/iop/demosaicing/capture.c @@ -1045,10 +1045,10 @@ static int _capture_sharpen_cl(dt_iop_module_t *self, const dt_iop_buffer_dsc_t *dsc = &pipe->dsc; const gboolean wbon = dsc->temperature.enabled; - dt_aligned_pixel_t icoeffs = { wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[0] : CAPTURE_CFACLIP, - wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[1] : CAPTURE_CFACLIP, - wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[2] : CAPTURE_CFACLIP, - 0.0f }; + const dt_aligned_pixel_t icoeffs = { wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[0] : CAPTURE_CFACLIP, + wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[1] : CAPTURE_CFACLIP, + wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[2] : CAPTURE_CFACLIP, + 0.0f }; cl_mem gcoeffs = NULL; cl_mem gauss_idx = NULL; @@ -1058,10 +1058,9 @@ static int _capture_sharpen_cl(dt_iop_module_t *self, cl_mem luminance = dt_opencl_alloc_device_buffer(devid, bsize); cl_mem tmp2 = dt_opencl_alloc_device_buffer(devid, bsize); cl_mem tmp1 = dt_opencl_alloc_device_buffer(devid, bsize); - cl_mem whites = dt_opencl_copy_host_to_device_constant(devid, 4 * sizeof(float), icoeffs); cl_mem dev_rgb = dt_opencl_duplicate_image(devid, dev_out); - if(!blendmask || !luminance || !tmp2 || !tmp1 || !whites || !dev_rgb) goto finish; + if(!blendmask || !luminance || !tmp2 || !tmp1 || !dev_rgb) goto finish; err = dt_opencl_enqueue_kernel_2d_args(devid, gd->prefill_clip_mask, width, height, CLARG(tmp2), CLARG(width), CLARG(height)); @@ -1069,7 +1068,7 @@ static int _capture_sharpen_cl(dt_iop_module_t *self, err = dt_opencl_enqueue_kernel_2d_args(devid, gd->prepare_blend, width, height, CLARG(dev_in), CLARG(dev_out), CLARG(filters), CLARG(dev_xtrans), CLARG(tmp2), CLARG(tmp1), - CLARG(whites), CLARG(width), CLARG(height)); + CLARG(icoeffs), CLARG(width), CLARG(height)); if(err != CL_SUCCESS) goto finish; err = dt_opencl_enqueue_kernel_2d_args(devid, gd->modify_blend, width, height, @@ -1140,7 +1139,6 @@ static int _capture_sharpen_cl(dt_iop_module_t *self, dt_opencl_release_mem_object(tmp2); dt_opencl_release_mem_object(tmp1); dt_opencl_release_mem_object(luminance); - dt_opencl_release_mem_object(whites); return err; } diff --git a/src/iop/demosaicing/dual.c b/src/iop/demosaicing/dual.c index 0c703a03ce38..062092ad609a 100644 --- a/src/iop/demosaicing/dual.c +++ b/src/iop/demosaicing/dual.c @@ -118,7 +118,7 @@ int dual_demosaic_cl(const dt_iop_module_t *self, wboff ? 1.0f : 1.0f / p->dsc.temperature.coeffs[2], 1.0f }; err = dt_opencl_enqueue_kernel_2d_args(devid, darktable.opencl->blendop->kernel_calc_Y0_mask, width, height, - CLARG(mask), CLARG(high_image), CLARG(width), CLARG(height), CLFLARRAY(4, wb)); + CLARG(mask), CLARG(high_image), CLARG(width), CLARG(height), CLARG(wb)); if(err != CL_SUCCESS) goto finish; err = dt_opencl_enqueue_kernel_2d_args(devid, darktable.opencl->blendop->kernel_calc_scharr_mask, width, height, diff --git a/src/iop/demosaicing/lmmse.c b/src/iop/demosaicing/lmmse.c index fcc5d10e32c1..9ed6ebe9000c 100644 --- a/src/iop/demosaicing/lmmse.c +++ b/src/iop/demosaicing/lmmse.c @@ -125,7 +125,7 @@ static void lmmse_demosaic(float *const restrict out, const dt_iop_demosaic_lmmse_t mode, const float scaler) { - rcd_ppg_border(out, in, width, height, filters, BORDER_AROUND); + demosaic_ppg(out, in, width, height, filters, 0.0f, BORDER_AROUND); if(width < 2 * BORDER_AROUND || height < 2 * BORDER_AROUND) return; diff --git a/src/iop/demosaicing/ppg.c b/src/iop/demosaicing/ppg.c index e88c6f268824..75fa8c079a1c 100644 --- a/src/iop/demosaicing/ppg.c +++ b/src/iop/demosaicing/ppg.c @@ -22,9 +22,10 @@ static void demosaic_ppg(float *const out, const int width, const int height, const uint32_t filters, - const float thrs) + const float thrs, + const int margin) { - // border interpolate + // preliminary border interpolate for outermost 3 pixels float sum[8]; for(int j = 0; j < height; j++) for(int i = 0; i < width; i++) @@ -46,10 +47,10 @@ static void demosaic_ppg(float *const out, for(int c = 0; c < 3; c++) { if(c != f && sum[c + 4] > 0.0f) - out[4 * ((size_t)j * width + i) + c] = fmaxf(0.0f, sum[c] / sum[c + 4]); + out[4 * ((size_t)j * width + i) + c] = sum[c] / sum[c + 4]; else out[4 * ((size_t)j * width + i) + c] - = fmaxf(0.0f, in[(size_t)j * width + i]); + = in[(size_t)j * width + i]; } } const gboolean median = thrs > 0.0f; @@ -60,8 +61,11 @@ static void demosaic_ppg(float *const out, pre_median(med_in, in, width, height, filters, 1, thrs); input = med_in; } -// for all pixels except those in the 3 pixel border: -// interpolate green from input into out float array, or copy color. + + const int border = margin + 3; + // for all pixels except those in the 3 pixel border: + // interpolate green from input into out float array, or copy color. + // avoid processing if outside of given border DT_OMP_FOR() for(int j = 3; j < height - 3; j++) { @@ -69,6 +73,14 @@ static void demosaic_ppg(float *const out, const float *buf_in = input + (size_t)width * j + 3; for(int i = 3; i < width - 3; i++) { + if(i == border && j >= border && j < height - border) + { + i = width - border; + buf = out + (size_t)4 * width * j + 4 * i; + buf_in = in + (size_t)width * j + i; + } + if(i == width) break; + const int c = FC(j, i, filters); dt_aligned_pixel_t color; const float pc = buf_in[0]; @@ -116,86 +128,94 @@ static void demosaic_ppg(float *const out, color[3] = 0.0f; for_each_channel(k,aligned(buf,color:16) - dt_omp_nontemporal(buf)) buf[k] = fmaxf(0.0f, color[k]); + dt_omp_nontemporal(buf)) buf[k] = color[k]; buf += 4; buf_in++; } } -// for all pixels except the outermost row/column: -// interpolate colors using out as input into float out array DT_OMP_FOR() - for(int j = 1; j < height - 1; j++) + for(int j = 0; j < height; j++) { - float *buf = out + (size_t)4 * width * j + 4; - for(int i = 1; i < width - 1; i++) + float *buf = out + (size_t)4 * width * j; + for(int i = 0; i < width; i++) { + if(i == margin && j >= margin && j < height - margin) + { + i = width - margin; + buf = out + (size_t)4 * (width * j + i); + } // also prefetch direct nbs top/bottom - const int c = FC(j, i, filters); dt_aligned_pixel_t color = { buf[0], buf[1], buf[2], buf[3] }; - // fill all four pixels with correctly interpolated stuff: r/b for green1/2 - // b for r and r for b - if(__builtin_expect(c & 1, 1)) // c == 1 || c == 3) - { - // calculate red and blue for green pixels: - // need 4-nbhood: - const float *nt = buf - 4 * width; - const float *nb = buf + 4 * width; - const float *nl = buf - 4; - const float *nr = buf + 4; - if(FC(j, i + 1, filters) == 0) // red nb in same row - { - color[2] = (nt[2] + nb[2] + 2.0f * color[1] - nt[1] - nb[1]) * .5f; - color[0] = (nl[0] + nr[0] + 2.0f * color[1] - nl[1] - nr[1]) * .5f; - } - else - { - // blue nb - color[0] = (nt[0] + nb[0] + 2.0f * color[1] - nt[1] - nb[1]) * .5f; - color[2] = (nl[2] + nr[2] + 2.0f * color[1] - nl[1] - nr[1]) * .5f; - } - } - else + // for all pixels except the outermost row/column: + // interpolate colors using out as input into float out array + // the outermost are untouched + // but both final results avoid negatives + if(j > 0 && i > 0 && i < width - 1 && j < height - 1) { - // get 4-star-nbhood: - const float *ntl = buf - 4 - 4 * width; - const float *ntr = buf + 4 - 4 * width; - const float *nbl = buf - 4 + 4 * width; - const float *nbr = buf + 4 + 4 * width; - - if(c == 0) + const int c = FC(j, i, filters); + // fill all four pixels with correctly interpolated stuff: r/b for green1/2 + // b for r and r for b + if(__builtin_expect(c & 1, 1)) // c == 1 || c == 3) { - // red pixel, fill blue: - const float diff1 = fabsf(ntl[2] - nbr[2]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]); - const float guess1 = ntl[2] + nbr[2] + 2.0f * color[1] - ntl[1] - nbr[1]; - const float diff2 = fabsf(ntr[2] - nbl[2]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]); - const float guess2 = ntr[2] + nbl[2] + 2.0f * color[1] - ntr[1] - nbl[1]; - if(diff1 > diff2) - color[2] = guess2 * .5f; - else if(diff1 < diff2) - color[2] = guess1 * .5f; + // calculate red and blue for green pixels: + // need 4-nbhood: + const float *nt = buf - 4 * width; + const float *nb = buf + 4 * width; + const float *nl = buf - 4; + const float *nr = buf + 4; + if(FC(j, i + 1, filters) == 0) // red nb in same row + { + color[2] = (nt[2] + nb[2] + 2.0f * color[1] - nt[1] - nb[1]) * .5f; + color[0] = (nl[0] + nr[0] + 2.0f * color[1] - nl[1] - nr[1]) * .5f; + } else - color[2] = (guess1 + guess2) * .25f; + { + // blue nb + color[0] = (nt[0] + nb[0] + 2.0f * color[1] - nt[1] - nb[1]) * .5f; + color[2] = (nl[2] + nr[2] + 2.0f * color[1] - nl[1] - nr[1]) * .5f; + } } - else // c == 2, blue pixel, fill red: + else { - const float diff1 = fabsf(ntl[0] - nbr[0]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]); - const float guess1 = ntl[0] + nbr[0] + 2.0f * color[1] - ntl[1] - nbr[1]; - const float diff2 = fabsf(ntr[0] - nbl[0]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]); - const float guess2 = ntr[0] + nbl[0] + 2.0f * color[1] - ntr[1] - nbl[1]; - if(diff1 > diff2) - color[0] = guess2 * .5f; - else if(diff1 < diff2) - color[0] = guess1 * .5f; - else - color[0] = (guess1 + guess2) * .25f; + // get 4-star-nbhood: + const float *ntl = buf - 4 - 4 * width; + const float *ntr = buf + 4 - 4 * width; + const float *nbl = buf - 4 + 4 * width; + const float *nbr = buf + 4 + 4 * width; + if(c == 0) + { + // red pixel, fill blue: + const float diff1 = fabsf(ntl[2] - nbr[2]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]); + const float guess1 = ntl[2] + nbr[2] + 2.0f * color[1] - ntl[1] - nbr[1]; + const float diff2 = fabsf(ntr[2] - nbl[2]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]); + const float guess2 = ntr[2] + nbl[2] + 2.0f * color[1] - ntr[1] - nbl[1]; + if(diff1 > diff2) + color[2] = guess2 * .5f; + else if(diff1 < diff2) + color[2] = guess1 * .5f; + else + color[2] = (guess1 + guess2) * .25f; + } + else // c == 2, blue pixel, fill red: + { + const float diff1 = fabsf(ntl[0] - nbr[0]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]); + const float guess1 = ntl[0] + nbr[0] + 2.0f * color[1] - ntl[1] - nbr[1]; + const float diff2 = fabsf(ntr[0] - nbl[0]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]); + const float guess2 = ntr[0] + nbl[0] + 2.0f * color[1] - ntr[1] - nbl[1]; + if(diff1 > diff2) + color[0] = guess2 * .5f; + else if(diff1 < diff2) + color[0] = guess1 * .5f; + else + color[0] = (guess1 + guess2) * .25f; + } } } - for_each_channel(k,aligned(buf,color:16) - dt_omp_nontemporal(buf)) buf[k] = fmaxf(0.0f, color[k]); + dt_omp_nontemporal(buf)) buf[k] = color[k]; buf += 4; } diff --git a/src/iop/demosaicing/rcd.c b/src/iop/demosaicing/rcd.c index 0adcbd83d915..4704f3ec3e34 100644 --- a/src/iop/demosaicing/rcd.c +++ b/src/iop/demosaicing/rcd.c @@ -38,9 +38,8 @@ */ /* Some notes about the algorithm -* 1. The calculated data at the tiling borders RCD_BORDER must be at least 9 to be stable. -* 2. For the outermost tiles we only have to discard a 7 pixel border region interpolated otherwise. -* 3. The tilesize has a significant influence on performance, the default is a good guess for modern +* 1. The calculated data at the tiling borders RCD_BORDER must be 10 to be stable. +* 2. The tilesize has a significant influence on performance, the default is a good guess for modern * x86/64 machines, tested on Xeon E-2288G, i5-8250U. */ @@ -65,8 +64,7 @@ #pragma GCC optimize ("fp-contract=fast", "finite-math-only", "no-math-errno") #endif -#define RCD_BORDER 9 // avoid tile-overlap errors -#define RCD_MARGIN 7 // for the outermost tiles we can have a smaller outer border +#define RCD_BORDER 10 // avoid tile-overlap errors #define RCD_TILEVALID (DT_RCD_TILESIZE - 2 * RCD_BORDER) #define w1 DT_RCD_TILESIZE #define w2 (2 * DT_RCD_TILESIZE) @@ -82,191 +80,6 @@ static inline float _safe_in(float a, float scale) return fmaxf(0.0f, a) * scale; } -/** This is basically ppg adopted to only write data to RCD_MARGIN */ -static void rcd_ppg_border(float *const out, - const float *const in, - const int width, - const int height, - const uint32_t filters, - const int margin) -{ - const int border = margin + 3; - // write approximatad 3-pixel border region to out - float sum[8]; - for(int j = 0; j < height; j++) - { - for(int i = 0; i < width; i++) - { - if(i == 3 && j >= 3 && j < height - 3) i = width - 3; - if(i == width) break; - memset(sum, 0, sizeof(float) * 8); - for(int y = j - 1; y != j + 2; y++) - { - for(int x = i - 1; x != i + 2; x++) - { - if((y >= 0) && (x >= 0) && (y < height) && (x < width)) - { - const int f = FC(y, x, filters); - sum[f] += fmaxf(0.0f, in[(size_t)y * width + x]); - sum[f + 4]++; - } - } - } - const int f = FC(j, i, filters); - for(int c = 0; c < 3; c++) - { - if(c != f && sum[c + 4] > 0.0f) - out[4 * ((size_t)j * width + i) + c] = sum[c] / sum[c + 4]; - else - out[4 * ((size_t)j * width + i) + c] = fmaxf(0.0f, in[(size_t)j * width + i]); - } - } - } - - DT_OMP_FOR() - for(int j = 3; j < height - 3; j++) - { - float *buf = out + (size_t)4 * width * j + 4 * 3; - const float *buf_in = in + (size_t)width * j + 3; - for(int i = 3; i < width - 3; i++) - { - if(i == border && j >= border && j < height - border) - { - i = width - border; - buf = out + (size_t)4 * width * j + 4 * i; - buf_in = in + (size_t)width * j + i; - } - if(i == width) break; - - const int c = FC(j, i, filters); - dt_aligned_pixel_t color; - const float pc = fmaxf(0.0f, buf_in[0]); - if(c == 0 || c == 2) - { - color[c] = pc; - const float pym = fmaxf(0.0f, buf_in[-width * 1]); - const float pym2 = fmaxf(0.0f, buf_in[-width * 2]); - const float pym3 = fmaxf(0.0f, buf_in[-width * 3]); - const float pyM = fmaxf(0.0f, buf_in[+width * 1]); - const float pyM2 = fmaxf(0.0f, buf_in[+width * 2]); - const float pyM3 = fmaxf(0.0f, buf_in[+width * 3]); - const float pxm = fmaxf(0.0f, buf_in[-1]); - const float pxm2 = fmaxf(0.0f, buf_in[-2]); - const float pxm3 = fmaxf(0.0f, buf_in[-3]); - const float pxM = fmaxf(0.0f, buf_in[+1]); - const float pxM2 = fmaxf(0.0f, buf_in[+2]); - const float pxM3 = fmaxf(0.0f, buf_in[+3]); - - const float guessx = (pxm + pc + pxM) * 2.0f - pxM2 - pxm2; - const float diffx = (fabsf(pxm2 - pc) + fabsf(pxM2 - pc) + fabsf(pxm - pxM)) * 3.0f - + (fabsf(pxM3 - pxM) + fabsf(pxm3 - pxm)) * 2.0f; - const float guessy = (pym + pc + pyM) * 2.0f - pyM2 - pym2; - const float diffy = (fabsf(pym2 - pc) + fabsf(pyM2 - pc) + fabsf(pym - pyM)) * 3.0f - + (fabsf(pyM3 - pyM) + fabsf(pym3 - pym)) * 2.0f; - if(diffx > diffy) - { - // use guessy - const float m = fminf(pym, pyM); - const float M = fmaxf(pym, pyM); - color[1] = fmaxf(fminf(guessy * .25f, M), m); - } - else - { - const float m = fminf(pxm, pxM); - const float M = fmaxf(pxm, pxM); - color[1] = fmaxf(fminf(guessx * .25f, M), m); - } - } - else - color[1] = pc; - - color[3] = 0.0f; - for_each_channel(k) - buf[k] = color[k]; - buf += 4; - buf_in++; - } - } -// for all pixels: interpolate colors into float array - DT_OMP_FOR() - for(int j = 1; j < height - 1; j++) - { - float *buf = out + (size_t)4 * width * j + 4; - for(int i = 1; i < width - 1; i++) - { - if(i == margin && j >= margin && j < height - margin) - { - i = width - margin; - buf = out + (size_t)4 * (width * j + i); - } - const int c = FC(j, i, filters); - dt_aligned_pixel_t color = { buf[0], buf[1], buf[2], buf[3] }; - const int linesize = 4 * width; - // fill all four pixels with correctly interpolated stuff: r/b for green1/2 - // b for r and r for b - if(__builtin_expect(c & 1, 1)) // c == 1 || c == 3) - { - // calculate red and blue for green pixels: - // need 4-nbhood: - const float *nt = buf - linesize; - const float *nb = buf + linesize; - const float *nl = buf - 4; - const float *nr = buf + 4; - if(FC(j, i + 1, filters) == 0) // red nb in same row - { - color[2] = (nt[2] + nb[2] + 2.0f * color[1] - nt[1] - nb[1]) * .5f; - color[0] = (nl[0] + nr[0] + 2.0f * color[1] - nl[1] - nr[1]) * .5f; - } - else - { - // blue nb - color[0] = (nt[0] + nb[0] + 2.0f * color[1] - nt[1] - nb[1]) * .5f; - color[2] = (nl[2] + nr[2] + 2.0f * color[1] - nl[1] - nr[1]) * .5f; - } - } - else - { - // get 4-star-nbhood: - const float *ntl = buf - 4 - linesize; - const float *ntr = buf + 4 - linesize; - const float *nbl = buf - 4 + linesize; - const float *nbr = buf + 4 + linesize; - - if(c == 0) - { - // red pixel, fill blue: - const float diff1 = fabsf(ntl[2] - nbr[2]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]); - const float guess1 = ntl[2] + nbr[2] + 2.0f * color[1] - ntl[1] - nbr[1]; - const float diff2 = fabsf(ntr[2] - nbl[2]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]); - const float guess2 = ntr[2] + nbl[2] + 2.0f * color[1] - ntr[1] - nbl[1]; - if(diff1 > diff2) - color[2] = guess2 * .5f; - else if(diff1 < diff2) - color[2] = guess1 * .5f; - else - color[2] = (guess1 + guess2) * .25f; - } - else // c == 2, blue pixel, fill red: - { - const float diff1 = fabsf(ntl[0] - nbr[0]) + fabsf(ntl[1] - color[1]) + fabsf(nbr[1] - color[1]); - const float guess1 = ntl[0] + nbr[0] + 2.0f * color[1] - ntl[1] - nbr[1]; - const float diff2 = fabsf(ntr[0] - nbl[0]) + fabsf(ntr[1] - color[1]) + fabsf(nbl[1] - color[1]); - const float guess2 = ntr[0] + nbl[0] + 2.0f * color[1] - ntr[1] - nbl[1]; - if(diff1 > diff2) - color[0] = guess2 * .5f; - else if(diff1 < diff2) - color[0] = guess1 * .5f; - else - color[0] = (guess1 + guess2) * .25f; - } - } - for_each_channel(k) - buf[k] = color[k]; - buf += 4; - } - } -} - DT_OMP_DECLARE_SIMD(aligned(in, out : 64)) static void demosaic_box3(float *const restrict out, const float *const restrict in, @@ -289,7 +102,7 @@ static void demosaic_box3(float *const restrict out, if(x >= 0 && y >= 0 && x < width && y < height) { const int color = (filters == 9u) ? FCNxtrans(y, x, xtrans) : FC(y, x, filters); - sum[color] += MAX(0.0f, in[(size_t)width*y +x]); + sum[color] += in[(size_t)width*y +x]; cnt[color] += 1.0f; } } @@ -308,13 +121,9 @@ static void rcd_demosaic(float *const restrict out, const uint32_t filters, const float scaler) { + demosaic_ppg(out, in, width, height, filters, 0.0f, RCD_BORDER); if(width < 2*RCD_BORDER || height < 2*RCD_BORDER) - { - rcd_ppg_border(out, in, width, height, filters, RCD_BORDER); return; - } - - rcd_ppg_border(out, in, width, height, filters, RCD_MARGIN); const float revscaler = 1.0f / scaler; @@ -555,18 +364,15 @@ static void rcd_demosaic(float *const restrict out, } } - // For the outermost tiles in all directions we can use a smaller border margin - const int first_vertical = rowStart + ((tile_vertical == 0) ? RCD_MARGIN : RCD_BORDER); - const int last_vertical = rowEnd - ((tile_vertical == num_vertical - 1) ? RCD_MARGIN : RCD_BORDER); - const int first_horizontal = colStart + ((tile_horizontal == 0) ? RCD_MARGIN : RCD_BORDER); - const int last_horizontal = colEnd - ((tile_horizontal == num_horizontal - 1) ? RCD_MARGIN : RCD_BORDER); - for(int row = first_vertical; row < last_vertical; row++) + for(int row = rowStart + RCD_BORDER; row < rowEnd - RCD_BORDER; row++) { - for(int col = first_horizontal, idx = (row - rowStart) * DT_RCD_TILESIZE + col - colStart, o_idx = (row * width + col) * 4; col < last_horizontal; col++, o_idx += 4, idx++) + for(int col = colStart + RCD_BORDER, idx = (row - rowStart) * DT_RCD_TILESIZE + col - colStart, o_idx = (row * width + col) * 4; + col < colEnd - RCD_BORDER; + col++, o_idx += 4, idx++) { - out[o_idx] = scaler * fmaxf(0.0f, rgb[0][idx]); - out[o_idx+1] = scaler * fmaxf(0.0f, rgb[1][idx]); - out[o_idx+2] = scaler * fmaxf(0.0f, rgb[2][idx]); + out[o_idx] = scaler * rgb[0][idx]; + out[o_idx+1] = scaler * rgb[1][idx]; + out[o_idx+2] = scaler * rgb[2][idx]; out[o_idx+3] = 0.0f; } } @@ -625,12 +431,12 @@ static cl_int process_rcd_cl(dt_iop_module_t *self, .cellsize = sizeof(float) * 1, .overhead = 0, .sizex = 64, .sizey = 64 }; - err = dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_green, &locopt); + err = dt_opencl_local_buffer_opt(devid, gd->kernel_ppg_green, &locopt); if(err != CL_SUCCESS) goto error; size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 }; size_t local[3] = { locopt.sizex, locopt.sizey, 1 }; - err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_rcd_border_green, sizes, local, + err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_ppg_green, sizes, local, CLARG(dev_in), CLARG(dev_tmp), CLARG(width), CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * (locopt.sizex + 2*3) * (locopt.sizey + 2*3)), CLARGINT(32)); if(err != CL_SUCCESS) goto error; @@ -642,12 +448,12 @@ static cl_int process_rcd_cl(dt_iop_module_t *self, .cellsize = 4 * sizeof(float), .overhead = 0, .sizex = 64, .sizey = 64 }; - err = dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_redblue, &locopt); + err = dt_opencl_local_buffer_opt(devid, gd->kernel_ppg_redblue, &locopt); if(err != CL_SUCCESS) goto error; size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 }; size_t local[3] = { locopt.sizex, locopt.sizey, 1 }; - err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_rcd_border_redblue, sizes, local, + err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_ppg_redblue, sizes, local, CLARG(dev_tmp), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 2) * (locopt.sizey + 2)), CLARGINT(16)); if(err != CL_SUCCESS) goto error; @@ -669,10 +475,14 @@ static cl_int process_rcd_cl(dt_iop_module_t *self, goto error; // populate data - float scaler = 1.0f / dt_iop_get_processed_maximum(piece); + float scaler = dt_iop_get_processed_maximum(piece); + if(scaler < 1.5f) scaler = 2.0f; + else if(scaler < 3.0f) scaler = 4.0f; + else scaler = 8.0f; + const float revscaler = 1.0f / scaler; err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_rcd_populate, width, height, CLARG(dev_in), CLARG(cfa), CLARG(rgb0), CLARG(rgb1), CLARG(rgb2), CLARG(width), CLARG(height), - CLARG(filters), CLARG(scaler)); + CLARG(filters), CLARG(revscaler)); if(err != CL_SUCCESS) goto error; // Step 1.1: Calculate a squared vertical and horizontal high pass filter on color differences @@ -715,10 +525,9 @@ static cl_int process_rcd_cl(dt_iop_module_t *self, CLARG(VH_dir), CLARG(rgb0), CLARG(rgb1), CLARG(rgb2), CLARG(width), CLARG(height), CLARG(filters)); if(err != CL_SUCCESS) goto error; - scaler = dt_iop_get_processed_maximum(piece); // write output err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_rcd_write_output, width, height, - CLARG(dev_out), CLARG(rgb0), CLARG(rgb1), CLARG(rgb2), CLARG(width), CLARG(height), CLARG(scaler), CLARGINT(RCD_MARGIN)); + CLARG(dev_out), CLARG(rgb0), CLARG(rgb1), CLARG(rgb2), CLARG(width), CLARG(height), CLARG(scaler), CLARGINT(RCD_BORDER)); error: dt_opencl_release_mem_object(dev_tmp); @@ -739,7 +548,6 @@ static cl_int process_rcd_cl(dt_iop_module_t *self, #endif #undef RCD_BORDER -#undef RCD_MARGIN #undef RCD_TILEVALID #undef w1 #undef w2 diff --git a/src/iop/demosaicing/vng.c b/src/iop/demosaicing/vng.c index 47e5694224ca..bb5c8481b2c6 100644 --- a/src/iop/demosaicing/vng.c +++ b/src/iop/demosaicing/vng.c @@ -18,14 +18,16 @@ /* taken from dcraw and demosaic_ppg below */ -static void lin_interpolate(float *out, - const float *const in, - const int width, - const int height, - const uint32_t filters, - const uint8_t (*const xtrans)[6]) +static void _vng_lininterpolate(float *out, + const float *const in, + const int width, + const int height, + const uint32_t filters, + const uint8_t (*const xtrans)[6], + const int border) { - const int colors = (filters == 9) ? 3 : 4; + const gboolean is_xtrans = filters == 9; + const int colors = is_xtrans ? 3 : 4; // border interpolate DT_OMP_FOR() for(int row = 0; row < height; row++) @@ -41,7 +43,7 @@ static void lin_interpolate(float *out, if(y >= 0 && x >= 0 && y < height && x < width) { const int f = fcol(y, x, filters, xtrans); - sum[f] += fmaxf(0.0f, in[y * width + x]); + sum[f] += in[y * width + x]; count[f]++; } const int f = fcol(row, col, filters, xtrans); @@ -53,7 +55,7 @@ static void lin_interpolate(float *out, if(c != f && count[c] != 0) out[4 * (row * width + col) + c] = sum[c] / count[c]; else - out[4 * (row * width + col) + c] = fmaxf(0.0f, in[row * width + col]); + out[4 * (row * width + col) + c] = in[row * width + col]; } } @@ -107,15 +109,22 @@ static void lin_interpolate(float *out, const float *buf_in = in + width * row + 1; for(int col = 1; col < width - 1; col++) { + if(col == border && row >= border && row < height - border) + { + col = width - border; + buf = out + (size_t)4 * width * row + 4 * col; + buf_in = in + (size_t)width * row + col; + } + if(col == width) break; dt_aligned_pixel_t sum = { 0.0f }; int *ip = &(lookup[row % size][col % size][0]); // for each adjoining pixel not of this pixel's color, sum up its weighted values for(int i = *ip++; i--; ip += 3) - sum[ip[2]] += fmaxf(0.0f, buf_in[ip[0]]) * ip[1]; + sum[ip[2]] += buf_in[ip[0]] * ip[1]; // for each interpolated color, load it into the pixel for(int i = colors; --i; ip += 2) buf[*ip] = sum[ip[0]] / ip[1]; - buf[*ip] = fmaxf(0.0f, *buf_in); + buf[*ip] = *buf_in; buf += 4; buf_in++; } @@ -137,12 +146,6 @@ static void lin_interpolate(float *out, I've extended the basic idea to work with non-Bayer filter arrays. Gradients are numbered clockwise from NW=0 to W=7. */ -static void _copy_abovezero(float *to, float *from, const int pixels) -{ - static dt_aligned_pixel_t zero = { 0.0f, 0.0f, 0.0f, 0.0f}; - for(int i = 0; i < pixels; i++) - dt_vector_max(&to[i*4], zero, &from[i*4]); -} static const signed char terms[] = { -2, -2, +0, -1, 1, 0x01, -2, -2, +0, +0, 2, 0x01, -2, -1, -1, +0, 1, 0x01, -2, -1, +0, -1, 1, 0x02, @@ -178,7 +181,6 @@ static void vng_interpolate(float *out, float(*brow[4])[4]; const gboolean is_xtrans = (filters == 9); const gboolean is_4bayer = FILTERS_ARE_4BAYER(filters); - const gboolean is_bayer = !(is_xtrans || is_4bayer); const int prow = is_xtrans ? 6 : 8; const int pcol = is_xtrans ? 6 : 2; const int colors = is_xtrans ? 3 : 4; @@ -192,14 +194,11 @@ static void vng_interpolate(float *out, else filters4 = filters | 0x0c0c0c0cu; - lin_interpolate(out, in, width, height, filters4, xtrans); + _vng_lininterpolate(out, in, width, height, filters4, xtrans, 1000000); // if only linear interpolation is requested we can stop it here if(only_vng_linear) - { - if(is_bayer) goto bayer_greens; - else return; - } + goto finish; char *buffer = dt_alloc_aligned(sizeof(**brow) * width * 3 + sizeof(*ip) * prow * pcol * 320); if(!buffer) @@ -304,22 +303,24 @@ static void vng_interpolate(float *out, } } if(row > 3) /* Write buffer to image */ - _copy_abovezero(out + 4 * ((row - 2) * width + 2), (float *)(brow[0] + 2), width - 4); + dt_iop_image_copy(out + 4 * ((row - 2) * width + 2), (float *)(brow[0] + 2), 4*(width - 4)); // rotate ring buffer for(int g = 0; g < 4; g++) brow[(g - 1) & 3] = brow[g]; } // copy the final two rows to the image - _copy_abovezero(out + (4 * ((height - 4) * width + 2)), (float *)(brow[0] + 2), width - 4); - _copy_abovezero(out + (4 * ((height - 3) * width + 2)), (float *)(brow[1] + 2), width - 4); + dt_iop_image_copy(out + (4 * ((height - 4) * width + 2)), (float *)(brow[0] + 2), 4*(width - 4)); + dt_iop_image_copy(out + (4 * ((height - 3) * width + 2)), (float *)(brow[1] + 2), 4*(width - 4)); dt_free_align(buffer); -bayer_greens: - if(is_bayer) +finish: + DT_OMP_FOR() + for(size_t i = 0; i < (size_t)width * height * 4; i+=4) { - DT_OMP_FOR() - for(int i = 0; i < height * width; i++) - out[i * 4 + 1] = (out[i * 4 + 1] + out[i * 4 + 3]) / 2.0f; + out[i] = out[i]; + out[i+1] = is_xtrans ? out[i+1] : 0.5f * (out[i+1] + out[i+3]); + out[i+2] = out[i+2]; + out[i+3] = 0.0f; } } @@ -499,7 +500,7 @@ static cl_int process_vng_cl(const dt_iop_module_t *self, if(only_vng_linear) goto finish; - // do full VNG interpolation; linear data is in dev_tmp + // do full VNG interpolation; linear data is in dev_tmp; don't touch outermost 2 pixels dt_opencl_local_buffer_t locopt = (dt_opencl_local_buffer_t){ .xoffset = 2*2, .xfactor = 1, .yoffset = 2*2, .yfactor = 1, .cellsize = 4 * sizeof(float), .overhead = 0, @@ -511,7 +512,7 @@ static cl_int process_vng_cl(const dt_iop_module_t *self, size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 }; size_t local[3] = { locopt.sizex, locopt.sizey, 1 }; err = dt_opencl_enqueue_kernel_2d_local_args(devid, gd->kernel_vng_interpolate, sizes, local, - CLARG(dev_in), CLARG(dev_tmp), CLARG(dev_out), + CLARG(dev_tmp), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(filters4), CLARG(dev_xtrans), CLARG(dev_ips), CLARG(dev_code), CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 4) * (locopt.sizey + 4))); diff --git a/src/iop/demosaicing/xtrans.c b/src/iop/demosaicing/xtrans.c index c08991062389..0609aae15c97 100644 --- a/src/iop/demosaicing/xtrans.c +++ b/src/iop/demosaicing/xtrans.c @@ -1,6 +1,6 @@ /* This file is part of darktable, - Copyright (C) 2010-2025 darktable developers. + Copyright (C) 2010-2026 darktable developers. darktable is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,8 +18,7 @@ // xtrans_interpolate adapted from dcraw 9.20 -// tile size, optimized to keep data in L2 cache -#define TS 122 +#define TS DT_MARKESTEIJN_TS #define PAD_G1_G3 3 #define PAD_G_INTERP 3 @@ -48,7 +47,8 @@ static void xtrans_markesteijn_interpolate(float *out, const int width, const int height, const uint8_t (*const xtrans)[6], - const int passes) + const int passes, + const uint32_t filters) { static const short orth[12] = { 1, 0, 0, 1, -1, 0, 0, -1, 1, 0, 0, 1 }, patt[2][16] = { { 0, 1, 0, -1, 2, 0, -1, 0, 1, 1, 1, -1, 0, 0, 0, 0 }, @@ -178,7 +178,8 @@ static void xtrans_markesteijn_interpolate(float *out, } // duplicate rgb[0] to rgb[1], rgb[2], and rgb[3] - for(int c = 1; c <= 3; c++) memcpy(rgb[c], rgb[0], sizeof(*rgb)); + for(int c = 1; c <= 3; c++) + dt_iop_image_copy((float*)rgb[c], (float*)rgb[0], sizeof(*rgb) / sizeof(float)); // note that successive calculations are inset within the tile // so as to give enough border data, and there needs to be a 6 @@ -503,16 +504,17 @@ static void xtrans_markesteijn_interpolate(float *out, } } for(int c = 0; c < 3; c++) - out[4 * (width * (row + top) + col + left) + c] = MAX(0.0f, avg[c]/avg[3]); + out[4 * (width * (row + top) + col + left) + c] = avg[c]/avg[3]; } } } dt_free_align(all_buffers); + _vng_lininterpolate(out, in, width, height, filters, xtrans, pad_tile); } #undef TS -#define TS 122 +#define TS DT_FDC_TS static void xtrans_fdc_interpolate(float *out, const float *const in, const int width,