diff --git a/IBKeymaster/IBKeyer.cpp b/IBKeymaster/IBKeyer.cpp index 1eaced1..7a5f42f 100644 --- a/IBKeymaster/IBKeyer.cpp +++ b/IBKeymaster/IBKeyer.cpp @@ -36,7 +36,11 @@ constexpr const char* kPluginDescription = "Based on IBKeyer by Jed Smith (gaffer-tools) + He et al. guided filter."; constexpr const char* kPluginIdentifier = "com.OpenFXSample.IBKeyer"; constexpr int kPluginVersionMajor = 2; -constexpr int kPluginVersionMinor = 1; +// This changed the public OFX surface in a non-trivial way: new clips, new params, and new +// backend-routing rules. Keeping the old version number after that can leave hosts holding onto a +// stale descriptor cache and trying to reconcile it with a different binary. Bumping the version is +// the polite way to tell Resolve "this is materially a new plugin shape, please rescan it fresh." +constexpr int kPluginVersionMinor = 2; constexpr bool kSupportsTiles = false; constexpr bool kSupportsMultiResolution = false; constexpr bool kSupportsMultipleClipPARs = false; @@ -63,6 +67,9 @@ class IBKeyerPlugin : public OFX::ImageEffect OFX::Clip* m_DstClip; OFX::Clip* m_SrcClip; OFX::Clip* m_ScreenClip; + OFX::Clip* m_BgClip; + OFX::Clip* m_GarbageMatteClip; + OFX::Clip* m_OcclusionMatteClip; OFX::ChoiceParam* m_ScreenColor; OFX::BooleanParam* m_UseScreenInput; @@ -73,12 +80,31 @@ class IBKeyerPlugin : public OFX::ImageEffect OFX::BooleanParam* m_Premultiply; OFX::DoubleParam* m_BlackClip; OFX::DoubleParam* m_WhiteClip; + OFX::DoubleParam* m_MatteGamma; + OFX::BooleanParam* m_PrematteEnabled; + OFX::IntParam* m_PrematteBlur; + OFX::IntParam* m_PrematteErode; + OFX::IntParam* m_PrematteIterations; OFX::BooleanParam* m_NearGreyExtract; OFX::DoubleParam* m_NearGreyAmount; + OFX::DoubleParam* m_NearGreySoftness; OFX::BooleanParam* m_GuidedFilterEnabled; + OFX::ChoiceParam* m_GuidedFilterMode; OFX::IntParam* m_GuidedRadius; OFX::DoubleParam* m_GuidedEpsilon; OFX::DoubleParam* m_GuidedMix; + OFX::DoubleParam* m_EdgeProtect; + OFX::IntParam* 
m_RefineIterations; + OFX::DoubleParam* m_EdgeColorCorrect; + OFX::BooleanParam* m_BgWrapEnabled; + OFX::IntParam* m_BgWrapBlur; + OFX::DoubleParam* m_BgWrapAmount; + OFX::BooleanParam* m_AdditiveKeyEnabled; + OFX::ChoiceParam* m_AdditiveKeyMode; + OFX::DoubleParam* m_AdditiveKeySaturation; + OFX::DoubleParam* m_AdditiveKeyAmount; + OFX::BooleanParam* m_AdditiveKeyBlackClamp; + OFX::ChoiceParam* m_ViewMode; }; IBKeyerPlugin::IBKeyerPlugin(OfxImageEffectHandle p_Handle) @@ -86,6 +112,9 @@ IBKeyerPlugin::IBKeyerPlugin(OfxImageEffectHandle p_Handle) , m_DstClip(fetchClip(kOfxImageEffectOutputClipName)) , m_SrcClip(fetchClip(kOfxImageEffectSimpleSourceClipName)) , m_ScreenClip(fetchClip("Screen")) + , m_BgClip(fetchClip("Background")) + , m_GarbageMatteClip(fetchClip("GarbageMatte")) + , m_OcclusionMatteClip(fetchClip("OcclusionMatte")) , m_ScreenColor(fetchChoiceParam("screenColor")) , m_UseScreenInput(fetchBooleanParam("useScreenInput")) , m_PickColor(fetchRGBParam("pickColor")) @@ -95,12 +124,31 @@ IBKeyerPlugin::IBKeyerPlugin(OfxImageEffectHandle p_Handle) , m_Premultiply(fetchBooleanParam("premultiply")) , m_BlackClip(fetchDoubleParam("blackClip")) , m_WhiteClip(fetchDoubleParam("whiteClip")) + , m_MatteGamma(fetchDoubleParam("matteGamma")) + , m_PrematteEnabled(fetchBooleanParam("prematteEnabled")) + , m_PrematteBlur(fetchIntParam("prematteBlur")) + , m_PrematteErode(fetchIntParam("prematteErode")) + , m_PrematteIterations(fetchIntParam("prematteIterations")) , m_NearGreyExtract(fetchBooleanParam("nearGreyExtract")) , m_NearGreyAmount(fetchDoubleParam("nearGreyAmount")) + , m_NearGreySoftness(fetchDoubleParam("nearGreySoftness")) , m_GuidedFilterEnabled(fetchBooleanParam("guidedFilterEnabled")) + , m_GuidedFilterMode(fetchChoiceParam("guidedFilterMode")) , m_GuidedRadius(fetchIntParam("guidedRadius")) , m_GuidedEpsilon(fetchDoubleParam("guidedEpsilon")) , m_GuidedMix(fetchDoubleParam("guidedMix")) + , m_EdgeProtect(fetchDoubleParam("edgeProtect")) + , 
m_RefineIterations(fetchIntParam("refineIterations")) + , m_EdgeColorCorrect(fetchDoubleParam("edgeColorCorrect")) + , m_BgWrapEnabled(fetchBooleanParam("bgWrapEnabled")) + , m_BgWrapBlur(fetchIntParam("bgWrapBlur")) + , m_BgWrapAmount(fetchDoubleParam("bgWrapAmount")) + , m_AdditiveKeyEnabled(fetchBooleanParam("additiveKeyEnabled")) + , m_AdditiveKeyMode(fetchChoiceParam("additiveKeyMode")) + , m_AdditiveKeySaturation(fetchDoubleParam("additiveKeySaturation")) + , m_AdditiveKeyAmount(fetchDoubleParam("additiveKeyAmount")) + , m_AdditiveKeyBlackClamp(fetchBooleanParam("additiveKeyBlackClamp")) + , m_ViewMode(fetchChoiceParam("viewMode")) { setEnabledness(); } @@ -149,6 +197,43 @@ void IBKeyerPlugin::render(const OFX::RenderArguments& p_Args) } } + // Get background image (optional light-wrap source). + std::unique_ptr<OFX::Image> background; + if (m_BgClip && m_BgClip->isConnected()) { + background.reset(m_BgClip->fetchImage(p_Args.time)); + if (background && + (background->getPixelDepth() != OFX::eBitDepthFloat || + (background->getPixelComponents() != OFX::ePixelComponentRGB && + background->getPixelComponents() != OFX::ePixelComponentRGBA))) { + OFX::throwSuiteStatusException(kOfxStatErrUnsupported); + } + } + + // External matte clips are deliberately optional. They are constraints on the matte, not + // mandatory inputs to the key itself, so disconnecting them should never change routing or + // make the effect invalid. 
+ std::unique_ptr<OFX::Image> garbageMatte; + if (m_GarbageMatteClip && m_GarbageMatteClip->isConnected()) { + garbageMatte.reset(m_GarbageMatteClip->fetchImage(p_Args.time)); + if (garbageMatte && + (garbageMatte->getPixelDepth() != OFX::eBitDepthFloat || + (garbageMatte->getPixelComponents() != OFX::ePixelComponentRGB && + garbageMatte->getPixelComponents() != OFX::ePixelComponentRGBA))) { + OFX::throwSuiteStatusException(kOfxStatErrUnsupported); + } + } + + std::unique_ptr<OFX::Image> occlusionMatte; + if (m_OcclusionMatteClip && m_OcclusionMatteClip->isConnected()) { + occlusionMatte.reset(m_OcclusionMatteClip->fetchImage(p_Args.time)); + if (occlusionMatte && + (occlusionMatte->getPixelDepth() != OFX::eBitDepthFloat || + (occlusionMatte->getPixelComponents() != OFX::ePixelComponentRGB && + occlusionMatte->getPixelComponents() != OFX::ePixelComponentRGBA))) { + OFX::throwSuiteStatusException(kOfxStatErrUnsupported); + } + } + // Fetch parameter values once and freeze them into a backend-agnostic request. The old file // pushed these directly into an ImageProcessor instance; the split version keeps that same // intent, but makes the backend choice explicit and testable. 
@@ -179,16 +264,44 @@ void IBKeyerPlugin::render(const OFX::RenderArguments& p_Args) params.premultiply = m_Premultiply->getValueAtTime(p_Args.time); params.blackClip = static_cast<float>(m_BlackClip->getValueAtTime(p_Args.time)); params.whiteClip = static_cast<float>(m_WhiteClip->getValueAtTime(p_Args.time)); + params.matteGamma = static_cast<float>(m_MatteGamma->getValueAtTime(p_Args.time)); + params.prematteEnabled = m_PrematteEnabled->getValueAtTime(p_Args.time); + params.prematteBlur = m_PrematteBlur->getValueAtTime(p_Args.time); + params.prematteErode = m_PrematteErode->getValueAtTime(p_Args.time); + params.prematteIterations = m_PrematteIterations->getValueAtTime(p_Args.time); params.nearGreyExtract = m_NearGreyExtract->getValueAtTime(p_Args.time); params.nearGreyAmount = static_cast<float>(m_NearGreyAmount->getValueAtTime(p_Args.time)); + params.nearGreySoftness = static_cast<float>(m_NearGreySoftness->getValueAtTime(p_Args.time)); params.guidedFilterEnabled = m_GuidedFilterEnabled->getValueAtTime(p_Args.time); + int guidedFilterMode = 0; + m_GuidedFilterMode->getValueAtTime(p_Args.time, guidedFilterMode); + params.guidedFilterMode = guidedFilterMode; params.guidedRadius = m_GuidedRadius->getValueAtTime(p_Args.time); params.guidedEpsilon = static_cast<float>(m_GuidedEpsilon->getValueAtTime(p_Args.time)); params.guidedMix = static_cast<float>(m_GuidedMix->getValueAtTime(p_Args.time)); + params.edgeProtect = static_cast<float>(m_EdgeProtect->getValueAtTime(p_Args.time)); + params.refineIterations = m_RefineIterations->getValueAtTime(p_Args.time); + params.edgeColorCorrect = static_cast<float>(m_EdgeColorCorrect->getValueAtTime(p_Args.time)); + params.bgWrapEnabled = m_BgWrapEnabled->getValueAtTime(p_Args.time) && static_cast<bool>(background); + params.bgWrapBlur = m_BgWrapBlur->getValueAtTime(p_Args.time); + params.bgWrapAmount = static_cast<float>(m_BgWrapAmount->getValueAtTime(p_Args.time)); + params.additiveKeyEnabled = m_AdditiveKeyEnabled->getValueAtTime(p_Args.time); + int additiveKeyMode = 0; + 
m_AdditiveKeyMode->getValueAtTime(p_Args.time, additiveKeyMode); + params.additiveKeyMode = additiveKeyMode; + params.additiveKeySaturation = static_cast<float>(m_AdditiveKeySaturation->getValueAtTime(p_Args.time)); + params.additiveKeyAmount = static_cast<float>(m_AdditiveKeyAmount->getValueAtTime(p_Args.time)); + params.additiveKeyBlackClamp = m_AdditiveKeyBlackClamp->getValueAtTime(p_Args.time); + int viewMode = 0; + m_ViewMode->getValueAtTime(p_Args.time, viewMode); + params.viewMode = viewMode; IBKeyerCore::RenderRequest request; request.srcImage = src.get(); request.screenImage = screen.get(); + request.backgroundImage = background.get(); + request.garbageMatteImage = garbageMatte.get(); + request.occlusionMatteImage = occlusionMatte.get(); request.dstImage = dst.get(); request.renderWindow = p_Args.renderWindow; request.hostCudaEnabled = p_Args.isEnabledCudaRender; @@ -224,8 +337,12 @@ void IBKeyerPlugin::changedParam(const OFX::InstanceChangedArgs&, const std::string& p_ParamName) { if (p_ParamName == "useScreenInput" || + p_ParamName == "prematteEnabled" || p_ParamName == "guidedFilterEnabled" || - p_ParamName == "nearGreyExtract") { + p_ParamName == "additiveKeyEnabled" || + p_ParamName == "additiveKeyMode" || + p_ParamName == "nearGreyExtract" || + p_ParamName == "bgWrapEnabled") { setEnabledness(); } } @@ -240,7 +357,11 @@ void IBKeyerPlugin::changedParam(const OFX::InstanceChangedArgs&, void IBKeyerPlugin::changedClip(const OFX::InstanceChangedArgs&, const std::string& p_ClipName) { - if (p_ClipName == "Screen" || p_ClipName == kOfxImageEffectSimpleSourceClipName) { + if (p_ClipName == "Screen" || + p_ClipName == "Background" || + p_ClipName == "GarbageMatte" || + p_ClipName == "OcclusionMatte" || + p_ClipName == kOfxImageEffectSimpleSourceClipName) { setEnabledness(); } } @@ -258,13 +379,40 @@ void IBKeyerPlugin::setEnabledness() const bool useScreenInput = m_UseScreenInput->getValue(); m_PickColor->setEnabled(!useScreenInput || !m_ScreenClip->isConnected()); + 
const bool prematteEnabled = m_PrematteEnabled->getValue(); + m_PrematteBlur->setEnabled(prematteEnabled); + m_PrematteErode->setEnabled(prematteEnabled); + m_PrematteIterations->setEnabled(prematteEnabled); + const bool guidedEnabled = m_GuidedFilterEnabled->getValue(); + m_GuidedFilterMode->setEnabled(guidedEnabled); m_GuidedRadius->setEnabled(guidedEnabled); m_GuidedEpsilon->setEnabled(guidedEnabled); m_GuidedMix->setEnabled(guidedEnabled); + m_EdgeProtect->setEnabled(guidedEnabled); + m_RefineIterations->setEnabled(guidedEnabled); + m_EdgeColorCorrect->setEnabled(guidedEnabled); const bool nearGreyEnabled = m_NearGreyExtract->getValue(); m_NearGreyAmount->setEnabled(nearGreyEnabled); + m_NearGreySoftness->setEnabled(nearGreyEnabled); + + // Resolve can be a little awkward about when optional secondary-input connection state becomes + // visible to the plugin UI. If we require "checked + connected" here, users can end up in a + // dead-feeling state where they enabled Background Wrap but still cannot edit its controls. + // + // Render-time validation still requires a real Background clip, so loosening the UI gate here + // is a usability fix rather than a behavior change. + const bool bgWrapControlsEnabled = m_BgWrapEnabled->getValue() || + (m_BgClip != nullptr && m_BgClip->isConnected()); + m_BgWrapBlur->setEnabled(bgWrapControlsEnabled); + m_BgWrapAmount->setEnabled(bgWrapControlsEnabled); + + const bool additiveEnabled = m_AdditiveKeyEnabled->getValue(); + m_AdditiveKeyMode->setEnabled(additiveEnabled); + m_AdditiveKeySaturation->setEnabled(additiveEnabled); + m_AdditiveKeyAmount->setEnabled(additiveEnabled); + m_AdditiveKeyBlackClamp->setEnabled(additiveEnabled); } OFX::DoubleParamDescriptor* defineDoubleParam(OFX::ImageEffectDescriptor& p_Desc, @@ -365,27 +513,52 @@ void IBKeyerFactory::describeInContext(ImageEffectDescriptor& p_Desc, ContextEnu // Screen clip (optional clean plate). 
ClipDescriptor* screenClip = p_Desc.defineClip("Screen"); - screenClip->addSupportedComponent(ePixelComponentRGB); screenClip->addSupportedComponent(ePixelComponentRGBA); + screenClip->addSupportedComponent(ePixelComponentRGB); screenClip->setTemporalClipAccess(false); screenClip->setSupportsTiles(kSupportsTiles); screenClip->setOptional(true); screenClip->setIsMask(false); + // Background clip (optional, used for background/light wrap parity with the older plugin). + ClipDescriptor* bgClip = p_Desc.defineClip("Background"); + bgClip->addSupportedComponent(ePixelComponentRGBA); + bgClip->addSupportedComponent(ePixelComponentRGB); + bgClip->setTemporalClipAccess(false); + bgClip->setSupportsTiles(kSupportsTiles); + bgClip->setOptional(true); + bgClip->setIsMask(false); + + ClipDescriptor* garbageClip = p_Desc.defineClip("GarbageMatte"); + garbageClip->addSupportedComponent(ePixelComponentRGBA); + garbageClip->addSupportedComponent(ePixelComponentRGB); + garbageClip->setTemporalClipAccess(false); + garbageClip->setSupportsTiles(kSupportsTiles); + garbageClip->setOptional(true); + // These are semantically mattes, but treating them as normal optional image clips keeps the + // host contract boring. That is useful on Resolve/Windows, where startup stability matters more + // than hinting extra semantics that the effect does not strictly need. + garbageClip->setIsMask(false); + + ClipDescriptor* occlusionClip = p_Desc.defineClip("OcclusionMatte"); + occlusionClip->addSupportedComponent(ePixelComponentRGBA); + occlusionClip->addSupportedComponent(ePixelComponentRGB); + occlusionClip->setTemporalClipAccess(false); + occlusionClip->setSupportsTiles(kSupportsTiles); + occlusionClip->setOptional(true); + occlusionClip->setIsMask(false); + // Output clip. ClipDescriptor* dstClip = p_Desc.defineClip(kOfxImageEffectOutputClipName); dstClip->addSupportedComponent(ePixelComponentRGBA); dstClip->setSupportsTiles(kSupportsTiles); - // Page. 
PageParamDescriptor* page = p_Desc.definePageParam("Controls"); - // Group: Screen Settings. GroupParamDescriptor* screenGroup = p_Desc.defineGroupParam("ScreenGroup"); screenGroup->setHint("Screen and keying parameters"); screenGroup->setLabels("Screen Settings", "Screen Settings", "Screen Settings"); - // Screen colour choice (Red / Green / Blue). ChoiceParamDescriptor* screenColor = p_Desc.defineChoiceParam("screenColor"); screenColor->setLabel("Screen Color"); screenColor->setHint("Dominant chroma of the backing screen."); @@ -397,7 +570,6 @@ void IBKeyerFactory::describeInContext(ImageEffectDescriptor& p_Desc, ContextEnu screenColor->setParent(*screenGroup); page->addChild(*screenColor); - // Use screen input toggle. BooleanParamDescriptor* useScreenInput = p_Desc.defineBooleanParam("useScreenInput"); useScreenInput->setDefault(true); useScreenInput->setHint("When enabled, reads screen colour from the Screen clip. When disabled, uses the Pick Color constant."); @@ -405,7 +577,6 @@ void IBKeyerFactory::describeInContext(ImageEffectDescriptor& p_Desc, ContextEnu useScreenInput->setParent(*screenGroup); page->addChild(*useScreenInput); - // Pick colour (constant fallback). RGBParamDescriptor* pickColor = p_Desc.defineRGBParam("pickColor"); pickColor->setLabels("Pick Color", "Pick Color", "Pick Color"); pickColor->setHint("Constant screen colour when Screen input is not connected."); @@ -413,7 +584,6 @@ void IBKeyerFactory::describeInContext(ImageEffectDescriptor& p_Desc, ContextEnu pickColor->setParent(*screenGroup); page->addChild(*pickColor); - // Group: Keyer Controls. GroupParamDescriptor* keyerGroup = p_Desc.defineGroupParam("KeyerGroup"); keyerGroup->setHint("Keying and despill controls"); keyerGroup->setLabels("Keyer Controls", "Keyer Controls", "Keyer Controls"); @@ -425,7 +595,6 @@ void IBKeyerFactory::describeInContext(ImageEffectDescriptor& p_Desc, ContextEnu "Scales the despill subtraction. 
1.0 = standard.", keyerGroup, 1.0, 0.0, 5.0, 0.01)); - // Respill colour. RGBParamDescriptor* respillColor = p_Desc.defineRGBParam("respillColor"); respillColor->setLabels("Respill Color", "Respill Color", "Respill Color"); respillColor->setHint("Colour to add back where screen spill was removed."); @@ -433,7 +602,6 @@ void IBKeyerFactory::describeInContext(ImageEffectDescriptor& p_Desc, ContextEnu respillColor->setParent(*keyerGroup); page->addChild(*respillColor); - // Premultiply. BooleanParamDescriptor* premultiply = p_Desc.defineBooleanParam("premultiply"); premultiply->setDefault(false); premultiply->setHint("Premultiply RGB by alpha for compositing."); @@ -441,52 +609,106 @@ void IBKeyerFactory::describeInContext(ImageEffectDescriptor& p_Desc, ContextEnu premultiply->setParent(*keyerGroup); page->addChild(*premultiply); - // Group: Matte Controls. GroupParamDescriptor* matteGroup = p_Desc.defineGroupParam("MatteGroup"); - matteGroup->setHint("Matte refinement controls"); + matteGroup->setHint("Matte refinement controls — adjust black/white points of the raw key"); matteGroup->setLabels("Matte Controls", "Matte Controls", "Matte Controls"); page->addChild(*defineDoubleParam(p_Desc, "blackClip", "Black Clip", - "Crush blacks in the raw matte. Values below this become fully transparent.", + "Crush blacks in the raw matte. Values below this become fully transparent. Useful for cleaning up noise in the screen area.", matteGroup, 0.0, 0.0, 1.0, 0.001)); page->addChild(*defineDoubleParam(p_Desc, "whiteClip", "White Clip", - "Push whites in the raw matte. Values above this become fully opaque.", + "Push whites in the raw matte. Values above this become fully opaque. 
Useful for solidifying the foreground core.", matteGroup, 1.0, 0.0, 1.0, 0.001)); + page->addChild(*defineDoubleParam(p_Desc, "matteGamma", "Matte Gamma", + "Applies a power curve to the alpha after black/white clipping.\n" + "Values < 1.0 push semi-transparent edges toward opaque.\n" + "Values > 1.0 push them toward transparent.\n" + "1.0 = no change.", + matteGroup, 1.0, 0.1, 4.0, 0.01)); + + GroupParamDescriptor* prematteGroup = p_Desc.defineGroupParam("PrematteGroup"); + prematteGroup->setHint("Synthetic clean-plate generation used to re-key difficult shots."); + prematteGroup->setLabels("Prematte", "Prematte", "Prematte"); + + BooleanParamDescriptor* prematteEnabled = p_Desc.defineBooleanParam("prematteEnabled"); + prematteEnabled->setDefault(false); + prematteEnabled->setHint("Builds a synthetic clean plate from the source and re-runs the core keyer. This moved out of the private Metal branch because it changes the actual key, not just the display."); + prematteEnabled->setLabels("Enable", "Enable", "Enable"); + prematteEnabled->setParent(*prematteGroup); + page->addChild(*prematteEnabled); + + IntParamDescriptor* prematteBlur = p_Desc.defineIntParam("prematteBlur"); + prematteBlur->setLabels("Blur Radius", "Blur Radius", "Blur Radius"); + prematteBlur->setScriptName("prematteBlur"); + prematteBlur->setHint("Blur radius for the synthetic clean plate."); + prematteBlur->setDefault(8); + prematteBlur->setRange(1, 200); + prematteBlur->setDisplayRange(1, 50); + prematteBlur->setParent(*prematteGroup); + page->addChild(*prematteBlur); + + IntParamDescriptor* prematteErode = p_Desc.defineIntParam("prematteErode"); + prematteErode->setLabels("Erode", "Erode", "Erode"); + prematteErode->setScriptName("prematteErode"); + prematteErode->setHint("Erodes the initial matte before clean-plate estimation to reduce foreground contamination."); + prematteErode->setDefault(0); + prematteErode->setRange(0, 20); + prematteErode->setDisplayRange(0, 10); + 
prematteErode->setParent(*prematteGroup); + page->addChild(*prematteErode); + + IntParamDescriptor* prematteIterations = p_Desc.defineIntParam("prematteIterations"); + prematteIterations->setLabels("Iterations", "Iterations", "Iterations"); + prematteIterations->setScriptName("prematteIterations"); + prematteIterations->setHint("How many times the synthetic clean plate is rebuilt and re-keyed."); + prematteIterations->setDefault(1); + prematteIterations->setRange(1, 5); + prematteIterations->setDisplayRange(1, 5); + prematteIterations->setParent(*prematteGroup); + page->addChild(*prematteIterations); + + GroupParamDescriptor* ngeGroup = p_Desc.defineGroupParam("NGEGroup"); + ngeGroup->setHint("Near Grey Extraction controls"); + ngeGroup->setLabels("Near Grey Extract", "Near Grey Extract", "Near Grey Extract"); - // Group: Near Grey Extract. - GroupParamDescriptor* nearGreyGroup = p_Desc.defineGroupParam("NGEGroup"); - nearGreyGroup->setHint("Near Grey Extraction controls"); - nearGreyGroup->setLabels("Near Grey Extract", "Near Grey Extract", "Near Grey Extract"); - - // Near Grey Extract toggle. BooleanParamDescriptor* nearGreyExtract = p_Desc.defineBooleanParam("nearGreyExtract"); nearGreyExtract->setDefault(true); nearGreyExtract->setHint("Improves matte quality in near-grey or ambiguous areas."); nearGreyExtract->setLabels("Enable", "Enable", "Enable"); - nearGreyExtract->setParent(*nearGreyGroup); + nearGreyExtract->setParent(*ngeGroup); page->addChild(*nearGreyExtract); - // Near Grey Amount. 
- page->addChild(*defineDoubleParam(p_Desc, "nearGreyAmount", "Amount", - "Controls the near-grey response curve used by the keyer.", - nearGreyGroup, 1.0, 0.0, 1.0, 0.01)); + page->addChild(*defineDoubleParam(p_Desc, "nearGreyAmount", "Strength", + "How much the near-grey extraction contributes to the final alpha.", + ngeGroup, 0.5, 0.0, 1.0, 0.01)); + page->addChild(*defineDoubleParam(p_Desc, "nearGreySoftness", "Softness", + "Controls how the keyer measures 'greyness' in ambiguous regions.", + ngeGroup, 1.0, 0.0, 1.0, 0.01)); - // Group: Guided Filter. GroupParamDescriptor* guidedGroup = p_Desc.defineGroupParam("GuidedFilterGroup"); guidedGroup->setHint("Edge-aware matte refinement using the source luminance as guide"); guidedGroup->setLabels("Guided Filter", "Guided Filter", "Guided Filter"); BooleanParamDescriptor* guidedEnabled = p_Desc.defineBooleanParam("guidedFilterEnabled"); - // The original Gaffer IBKeyer stops at the raw IBK-style result plus optional premultiply. - // Guided filtering is useful, but it is an extension we added in the OFX port, not part of - // the source graph itself. Defaulting it off keeps "fresh instance" behaviour closer to the - // original tool and makes backend parity checks less confusing. - guidedEnabled->setDefault(false); - guidedEnabled->setHint("Enable guided filter matte refinement."); + // This intentionally matches the older IBKeymaster defaults now. Earlier in the port I turned + // this off to stay closer to the simpler Gaffer graph, but once the goal shifted to full + // IBKeymaster parity that default became misleading. + guidedEnabled->setDefault(true); + guidedEnabled->setHint("Enable guided filter matte refinement. 
Uses source luminance as an edge guide to recover hair detail and soft edges."); guidedEnabled->setLabels("Enable", "Enable", "Enable"); guidedEnabled->setParent(*guidedGroup); page->addChild(*guidedEnabled); + ChoiceParamDescriptor* guidedMode = p_Desc.defineChoiceParam("guidedFilterMode"); + guidedMode->setLabel("Guide Mode"); + guidedMode->setHint("Luma uses the simpler scalar guide. RGB uses the full 3-channel guided filter from the private Metal branch."); + guidedMode->appendOption("Luma"); + guidedMode->appendOption("RGB"); + guidedMode->setDefault(0); + guidedMode->setAnimates(true); + guidedMode->setParent(*guidedGroup); + page->addChild(*guidedMode); + IntParamDescriptor* guidedRadius = p_Desc.defineIntParam("guidedRadius"); guidedRadius->setLabels("Radius", "Radius", "Radius"); guidedRadius->setScriptName("guidedRadius"); @@ -498,11 +720,106 @@ void IBKeyerFactory::describeInContext(ImageEffectDescriptor& p_Desc, ContextEnu page->addChild(*guidedRadius); page->addChild(*defineDoubleParam(p_Desc, "guidedEpsilon", "Epsilon", - "Edge sensitivity for the guided filter.", + "Edge sensitivity. 
Smaller values preserve more edges but may introduce noise.", guidedGroup, 0.01, 0.0001, 1.0, 0.001)); page->addChild(*defineDoubleParam(p_Desc, "guidedMix", "Mix", - "Blend between raw matte and guided-filter-refined matte.", + "Blend between raw matte (0.0) and guided-filter-refined matte (1.0).", guidedGroup, 1.0, 0.0, 1.0, 0.01)); + page->addChild(*defineDoubleParam(p_Desc, "edgeProtect", "Edge Protection", + "Blends the guide signal from source luminance toward the raw alpha.", + guidedGroup, 0.5, 0.0, 1.0, 0.01)); + + IntParamDescriptor* refineIterations = p_Desc.defineIntParam("refineIterations"); + refineIterations->setLabels("Refine Iterations", "Refine Iterations", "Refine Iterations"); + refineIterations->setScriptName("refineIterations"); + refineIterations->setHint("Number of iterative guided-filter refinement passes."); + refineIterations->setDefault(2); + refineIterations->setRange(1, 5); + refineIterations->setDisplayRange(1, 5); + refineIterations->setParent(*guidedGroup); + page->addChild(*refineIterations); + + page->addChild(*defineDoubleParam(p_Desc, "edgeColorCorrect", "Edge Color Correct", + "Re-estimates foreground colour at semi-transparent edges using the matting equation.", + guidedGroup, 0.0, 0.0, 1.0, 0.01)); + + GroupParamDescriptor* bgGroup = p_Desc.defineGroupParam("BgWrapGroup"); + bgGroup->setHint("Bleeds a blurred version of the new background into the foreground edges."); + bgGroup->setLabels("Background Wrap", "Background Wrap", "Background Wrap"); + + BooleanParamDescriptor* bgWrapEnabled = p_Desc.defineBooleanParam("bgWrapEnabled"); + bgWrapEnabled->setDefault(false); + bgWrapEnabled->setHint("Enable background wrap. 
Requires the Background clip to be connected."); + bgWrapEnabled->setLabels("Enable", "Enable", "Enable"); + bgWrapEnabled->setParent(*bgGroup); + page->addChild(*bgWrapEnabled); + + IntParamDescriptor* bgWrapBlur = p_Desc.defineIntParam("bgWrapBlur"); + bgWrapBlur->setLabels("Blur Radius", "Blur Radius", "Blur Radius"); + bgWrapBlur->setScriptName("bgWrapBlur"); + bgWrapBlur->setHint("Gaussian blur radius applied to the background before wrapping."); + bgWrapBlur->setDefault(20); + bgWrapBlur->setRange(1, 200); + bgWrapBlur->setDisplayRange(1, 100); + bgWrapBlur->setParent(*bgGroup); + page->addChild(*bgWrapBlur); + + page->addChild(*defineDoubleParam(p_Desc, "bgWrapAmount", "Amount", + "How much blurred background to bleed into the foreground edges.", + bgGroup, 0.5, 0.0, 2.0, 0.01)); + + GroupParamDescriptor* additiveGroup = p_Desc.defineGroupParam("AdditiveKeyGroup"); + additiveGroup->setHint("Recovers transparent detail the alpha missed by adding back source-minus-screen detail."); + additiveGroup->setLabels("Additive Key", "Additive Key", "Additive Key"); + + BooleanParamDescriptor* additiveEnabled = p_Desc.defineBooleanParam("additiveKeyEnabled"); + additiveEnabled->setDefault(false); + additiveEnabled->setHint("Enable additive detail recovery."); + additiveEnabled->setLabels("Enable", "Enable", "Enable"); + additiveEnabled->setParent(*additiveGroup); + page->addChild(*additiveEnabled); + + ChoiceParamDescriptor* additiveMode = p_Desc.defineChoiceParam("additiveKeyMode"); + additiveMode->setLabel("Mode"); + additiveMode->setHint("Addition uses source minus screen directly. 
Multiply uses a factor against the blurred background."); + additiveMode->appendOption("Addition"); + additiveMode->appendOption("Multiply"); + additiveMode->setDefault(0); + additiveMode->setAnimates(true); + additiveMode->setParent(*additiveGroup); + page->addChild(*additiveMode); + + page->addChild(*defineDoubleParam(p_Desc, "additiveKeySaturation", "Saturation", + "Desaturates recovered detail to reduce residual screen colour.", + additiveGroup, 0.0, 0.0, 1.0, 0.01)); + page->addChild(*defineDoubleParam(p_Desc, "additiveKeyAmount", "Amount", + "Strength of the additive detail recovery.", + additiveGroup, 0.0, 0.0, 2.0, 0.01)); + + BooleanParamDescriptor* additiveClamp = p_Desc.defineBooleanParam("additiveKeyBlackClamp"); + additiveClamp->setDefault(false); + additiveClamp->setHint("Clamp additive detail to positive values only."); + additiveClamp->setLabels("Black Clamp", "Black Clamp", "Black Clamp"); + additiveClamp->setParent(*additiveGroup); + page->addChild(*additiveClamp); + + GroupParamDescriptor* displayGroup = p_Desc.defineGroupParam("DisplayGroup"); + displayGroup->setHint("Diagnostic views from the richer private branch."); + displayGroup->setLabels("Display", "Display", "Display"); + + ChoiceParamDescriptor* viewMode = p_Desc.defineChoiceParam("viewMode"); + viewMode->setLabel("View Mode"); + viewMode->setHint("Displays intermediate pipeline stages for debugging and tuning."); + viewMode->appendOption("Composite"); + viewMode->appendOption("Raw Matte"); + viewMode->appendOption("Clean Plate"); + viewMode->appendOption("Refined Matte"); + viewMode->appendOption("Despilled Source"); + viewMode->appendOption("Blurred Background"); + viewMode->setDefault(0); + viewMode->setAnimates(true); + viewMode->setParent(*displayGroup); + page->addChild(*viewMode); } //////////////////////////////////////////////////////////////////////////////// diff --git a/IBKeymaster/IBKeyerBackend.cpp b/IBKeymaster/IBKeyerBackend.cpp index f471816..6818e4a 100644 --- 
a/IBKeymaster/IBKeyerBackend.cpp +++ b/IBKeymaster/IBKeyerBackend.cpp @@ -33,13 +33,19 @@ extern void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height, int p_Premultiply, int p_NearGreyExtract, float p_NearGreyAmount, float p_NearGreySoftness, float p_BlackClip, float p_WhiteClip, float p_MatteGamma, - int p_GuidedFilterEnabled, int p_GuidedRadius, + int p_PrematteEnabled, int p_PrematteBlur, int p_PrematteErode, int p_PrematteIterations, + int p_GuidedFilterEnabled, int p_GuidedFilterMode, int p_GuidedRadius, float p_GuidedEpsilon, float p_GuidedMix, float p_EdgeProtect, int p_RefineIterations, float p_EdgeColorCorrect, int p_BgWrapEnabled, int p_BgWrapBlur, float p_BgWrapAmount, + int p_AdditiveKeyEnabled, int p_AdditiveKeyMode, + float p_AdditiveKeySat, float p_AdditiveKeyAmount, int p_AdditiveKeyBlackClamp, + int p_ViewMode, const float* p_Input, const float* p_Screen, - const float* p_Background, float* p_Output); + const float* p_Background, + const float* p_GarbageMatte, const float* p_OcclusionMatte, + float* p_Output); #endif namespace IBKeyerCore { @@ -76,6 +82,16 @@ bool hostCudaForceSyncEnabled() return enabled; } +bool requiresReferenceOnlyFeatures(const RenderRequest& request) +{ + // This helper started life as a safety rail while the private-branch guide/composite features + // only existed in the CPU reference path. Once the CUDA path learned the same features, keeping + // this list would silently strand Windows/Linux on CPU and make host-CUDA look "broken" even + // when the device code was ready. Leaving the helper in place keeps that migration story visible. + (void)request; + return false; +} + // One selector controls both descriptor advertising and runtime routing. 
CudaRenderMode selectedCudaRenderModeImpl() { @@ -250,6 +266,9 @@ DeviceRenderFrame buildDeviceRenderFrame(const RenderRequest& request) DeviceRenderFrame frame; frame.src = makeImagePlaneDesc(request.srcImage); frame.screen = makeImagePlaneDesc(request.screenImage); + frame.background = makeImagePlaneDesc(request.backgroundImage); + frame.garbageMatte = makeImagePlaneDesc(request.garbageMatteImage); + frame.occlusionMatte = makeImagePlaneDesc(request.occlusionMatteImage); frame.dst = makeMutableImagePlaneDesc(request.dstImage); frame.renderWindow = request.renderWindow; return frame; @@ -309,91 +328,150 @@ void gaussianBlurSingle(float* data, } } -// Moved from: the old "CPU PROCESSING — FALLBACK" section. -// -// CPU code is slower, but it is the least dependent on host-specific GPU contracts. That makes it -// the best place to preserve the algorithm "as intended" and compare GPU paths against it when -// debugging correctness regressions. -void renderCpuPacked(const IBKeyerParams& params, const PackedFrame& frame) +void erodeSingle(const float* src, float* dst, int width, int height, int radius) { - // This is the CPU processing fallback from the old file, moved out so it can remain the - // reference implementation while CUDA/Metal evolve independently. - const int width = frame.width; - const int height = frame.height; - const int pixelCount = width * height; - const bool doGF = guidedFilterActive(params); + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + float minValue = 1.0f; + for (int dy = -radius; dy <= radius; ++dy) { + const int sy = std::max(0, std::min(height - 1, y + dy)); + for (int dx = -radius; dx <= radius; ++dx) { + const int sx = std::max(0, std::min(width - 1, x + dx)); + minValue = std::min(minValue, src[(sy * width) + sx]); + } + } + dst[(y * width) + x] = minValue; + } + } +} - std::vector<float> rawAlpha(doGF ? pixelCount : 0); - std::vector<float> guide(doGF ? pixelCount : 0); - std::vector<float> meanI(doGF ? pixelCount : 0); - std::vector<float> meanP(doGF ? 
pixelCount : 0); - std::vector meanP(doGF ? pixelCount : 0); - std::vector meanIp(doGF ? pixelCount : 0); - std::vector meanII(doGF ? pixelCount : 0); - std::vector scratch(doGF ? pixelCount : 0); - std::vector gaussianWeights(doGF ? buildGaussianWeights(params.guidedRadius) : std::vector{}); +float smoothstep01(float value) +{ + const float t = clamp01(value); + return t * t * (3.0f - 2.0f * t); +} + +void buildCleanPlate(const PackedFrame& frame, + const IBKeyerParams& params, + const std::vector& alpha, + std::vector& cleanPlate, + std::vector& cleanR, + std::vector& cleanG, + std::vector& cleanB, + std::vector& scratch) +{ + const int pixelCount = frame.width * frame.height; + cleanR.resize(pixelCount); + cleanG.resize(pixelCount); + cleanB.resize(pixelCount); + cleanPlate.resize(static_cast(pixelCount) * 4u, 0.0f); for (int index = 0; index < pixelCount; ++index) { const int rgba = index * 4; - const float srcR = frame.srcRgba[rgba + 0]; - const float srcG = frame.srcRgba[rgba + 1]; - const float srcB = frame.srcRgba[rgba + 2]; - - float scrR = params.pickR; - float scrG = params.pickG; - float scrB = params.pickB; - if (params.useScreenInput && frame.screenRgba != nullptr) { - scrR = frame.screenRgba[rgba + 0]; - scrG = frame.screenRgba[rgba + 1]; - scrB = frame.screenRgba[rgba + 2]; - } - - // 1. Despill of source and screen. - const float despillRGB = despillValue(srcR, srcG, srcB, params.screenColor, params.bias, params.limit); - const float despillScreen = despillValue(scrR, scrG, scrB, params.screenColor, params.bias, params.limit); - - // 2. Normalise. - const float normalized = safeDivide(despillRGB, despillScreen); + const float alphaSoft = smoothstep01(alpha[index]); + cleanR[index] = frame.srcRgba[rgba + 0] * (1.0f - alphaSoft) + params.pickR * alphaSoft; + cleanG[index] = frame.srcRgba[rgba + 1] * (1.0f - alphaSoft) + params.pickG * alphaSoft; + cleanB[index] = frame.srcRgba[rgba + 2] * (1.0f - alphaSoft) + params.pickB * alphaSoft; + } - // 3. 
Spill map and screen subtraction. - const float spillMul = std::max(0.0f, normalized); - const float ssR = srcR - spillMul * scrR; - const float ssG = srcG - spillMul * scrG; - const float ssB = srcB - spillMul * scrB; + const int blurRadius = std::max(1, params.prematteBlur); + const std::vector weights = buildGaussianWeights(blurRadius); + gaussianBlurSingle(cleanR.data(), scratch.data(), frame.width, frame.height, weights, blurRadius); + gaussianBlurSingle(cleanG.data(), scratch.data(), frame.width, frame.height, weights, blurRadius); + gaussianBlurSingle(cleanB.data(), scratch.data(), frame.width, frame.height, weights, blurRadius); - // 4. Initial alpha. - float alpha = clamp01(1.0f - normalized); + for (int index = 0; index < pixelCount; ++index) { + const int rgba = index * 4; + cleanPlate[rgba + 0] = cleanR[index]; + cleanPlate[rgba + 1] = cleanG[index]; + cleanPlate[rgba + 2] = cleanB[index]; + cleanPlate[rgba + 3] = 1.0f; + } +} - // 5. Near Grey Extraction (optional). - if (params.nearGreyExtract) { - const float divR = safeDivide(ssR, srcR); - const float divG = safeDivide(ssG, srcG); - const float divB = safeDivide(ssB, srcB); - const float ngeAlpha = nearGreyAlpha(divR, divG, divB, params.screenColor, params.nearGreyAmount); - // Screen composite: a + b - a*b. 
- alpha = ngeAlpha + alpha - ngeAlpha * alpha; +void applyExternalMatte(float* alphaBuffer, + float* dstRgba, + int pixelCount, + const float* matteRgba, + bool garbage) +{ + if (matteRgba == nullptr) { + return; + } + for (int index = 0; index < pixelCount; ++index) { + const int rgba = index * 4; + const float matteAlpha = matteRgba[rgba + 3]; + float alpha = alphaBuffer[index]; + if (garbage) { + alpha *= (1.0f - matteAlpha); + } else { + alpha = std::max(alpha, matteAlpha); } + alphaBuffer[index] = clamp01(alpha); + dstRgba[rgba + 3] = alphaBuffer[index]; + } +} - if (params.whiteClip > params.blackClip + 1e-6f) { - alpha = clamp01((alpha - params.blackClip) / (params.whiteClip - params.blackClip)); - } +void writeAlphaDiagnostic(float* dstRgba, const float* alpha, int pixelCount) +{ + for (int index = 0; index < pixelCount; ++index) { + const int rgba = index * 4; + const float a = alpha[index]; + dstRgba[rgba + 0] = a; + dstRgba[rgba + 1] = a; + dstRgba[rgba + 2] = a; + dstRgba[rgba + 3] = 1.0f; + } +} - // 6. Output = screen-subtracted + respill. 
- const float respillMul = std::max(0.0f, despillScreen * normalized); - frame.dstRgba[rgba + 0] = ssR + respillMul * params.respillR; - frame.dstRgba[rgba + 1] = ssG + respillMul * params.respillG; - frame.dstRgba[rgba + 2] = ssB + respillMul * params.respillB; - frame.dstRgba[rgba + 3] = alpha; +void copyDiagnosticRgba(float* dstRgba, const float* srcRgba, int pixelCount) +{ + if (srcRgba == nullptr) { + return; + } + std::copy(srcRgba, srcRgba + static_cast(pixelCount) * 4u, dstRgba); +} - if (doGF) { - rawAlpha[index] = alpha; - guide[index] = luminance(srcR, srcG, srcB); - } +void writeAlphaFromRgbaDiagnostic(float* dstRgba, int pixelCount) +{ + for (int index = 0; index < pixelCount; ++index) { + const int rgba = index * 4; + const float a = dstRgba[rgba + 3]; + dstRgba[rgba + 0] = a; + dstRgba[rgba + 1] = a; + dstRgba[rgba + 2] = a; + dstRgba[rgba + 3] = 1.0f; } +} + +void runScalarGuidedFilter(const PackedFrame& frame, + const IBKeyerParams& params, + std::vector& rawAlpha, + std::vector& guide, + std::vector& meanI, + std::vector& meanP, + std::vector& meanIp, + std::vector& meanII, + std::vector& scratch, + float* dstRgba) +{ + const int pixelCount = frame.width * frame.height; + const std::vector gaussianWeights = buildGaussianWeights(params.guidedRadius); + std::vector savedRawAlpha = rawAlpha; + const int numIter = std::max(1, std::min(params.refineIterations, 5)); + + for (int iter = 0; iter < numIter; ++iter) { + if (iter > 0) { + for (int index = 0; index < pixelCount; ++index) { + const int rgba = index * 4; + const float alpha = rawAlpha[index]; + const float fgLum = luminance(frame.srcRgba[rgba + 0] * alpha, + frame.srcRgba[rgba + 1] * alpha, + frame.srcRgba[rgba + 2] * alpha); + guide[index] = fgLum * (1.0f - params.edgeProtect) + alpha * params.edgeProtect; + } + } - if (doGF) { - // Guided filter refinement is a post-pass layered over the original IBKeyer math. 
- // Keeping it after the numbered IBK steps makes it easier to compare against the - // pre-guided historical version when debugging parity. for (int index = 0; index < pixelCount; ++index) { meanI[index] = guide[index]; meanP[index] = rawAlpha[index]; @@ -401,13 +479,10 @@ void renderCpuPacked(const IBKeyerParams& params, const PackedFrame& frame) meanII[index] = guide[index] * guide[index]; } - // The macOS Metal path already uses a normalized Gaussian kernel for the guided - // filter. Matching that here keeps the matte stable across OSes instead of letting - // Windows/Linux drift because they happened to use a cheaper box-blur approximation. - gaussianBlurSingle(meanI.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); - gaussianBlurSingle(meanP.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); - gaussianBlurSingle(meanIp.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); - gaussianBlurSingle(meanII.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(meanI.data(), scratch.data(), frame.width, frame.height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(meanP.data(), scratch.data(), frame.width, frame.height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(meanIp.data(), scratch.data(), frame.width, frame.height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(meanII.data(), scratch.data(), frame.width, frame.height, gaussianWeights, params.guidedRadius); for (int index = 0; index < pixelCount; ++index) { const float variance = meanII[index] - meanI[index] * meanI[index]; @@ -418,23 +493,354 @@ void renderCpuPacked(const IBKeyerParams& params, const PackedFrame& frame) meanP[index] = b; } - gaussianBlurSingle(meanI.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(meanI.data(), scratch.data(), frame.width, frame.height, gaussianWeights, 
params.guidedRadius); + gaussianBlurSingle(meanP.data(), scratch.data(), frame.width, frame.height, gaussianWeights, params.guidedRadius); + + if (iter < numIter - 1) { + for (int index = 0; index < pixelCount; ++index) { + rawAlpha[index] = clamp01(meanI[index] * guide[index] + meanP[index]); + } + } + } + + for (int index = 0; index < pixelCount; ++index) { + const int rgba = index * 4; + const float guidedAlpha = clamp01(meanI[index] * guide[index] + meanP[index]); + const float alpha = savedRawAlpha[index] * (1.0f - params.guidedMix) + guidedAlpha * params.guidedMix; + if (params.premultiply) { + dstRgba[rgba + 0] *= alpha; + dstRgba[rgba + 1] *= alpha; + dstRgba[rgba + 2] *= alpha; + } + dstRgba[rgba + 3] = alpha; + } +} + +void runRgbGuidedFilter(const PackedFrame& frame, + const IBKeyerParams& params, + std::vector& rawAlpha, + std::vector& scratch, + float* dstRgba) +{ + const int width = frame.width; + const int height = frame.height; + const int pixelCount = width * height; + const int numIter = std::max(1, std::min(params.refineIterations, 5)); + const std::vector gaussianWeights = buildGaussianWeights(params.guidedRadius); + std::vector meanIr(pixelCount), meanIg(pixelCount), meanIb(pixelCount), meanP(pixelCount); + std::vector irir(pixelCount), irig(pixelCount), irib(pixelCount), igig(pixelCount), igib(pixelCount), ibib(pixelCount); + std::vector irp(pixelCount), igp(pixelCount), ibp(pixelCount); + std::vector meanAr(pixelCount), meanAg(pixelCount), meanAb(pixelCount), meanB(pixelCount); + const std::vector savedRawAlpha = rawAlpha; + + for (int iter = 0; iter < numIter; ++iter) { + for (int index = 0; index < pixelCount; ++index) { + const int rgba = index * 4; + const float ir = frame.srcRgba[rgba + 0]; + const float ig = frame.srcRgba[rgba + 1]; + const float ib = frame.srcRgba[rgba + 2]; + const float p = (iter == 0) ? 
rawAlpha[index] : rawAlpha[index]; + meanIr[index] = ir; + meanIg[index] = ig; + meanIb[index] = ib; + meanP[index] = p; + irir[index] = ir * ir; + irig[index] = ir * ig; + irib[index] = ir * ib; + igig[index] = ig * ig; + igib[index] = ig * ib; + ibib[index] = ib * ib; + irp[index] = ir * p; + igp[index] = ig * p; + ibp[index] = ib * p; + } + + gaussianBlurSingle(meanIr.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(meanIg.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(meanIb.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); gaussianBlurSingle(meanP.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(irir.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(irig.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(irib.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(igig.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(igib.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(ibib.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(irp.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(igp.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(ibp.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + for (int index = 0; index < pixelCount; ++index) { + const float mIr = meanIr[index]; + const float mIg = meanIg[index]; + const float mIb = meanIb[index]; + const float mP = meanP[index]; + + float s_rr = irir[index] - mIr * mIr; + const float s_rg = irig[index] - mIr * mIg; + const float s_rb = 
irib[index] - mIr * mIb; + float s_gg = igig[index] - mIg * mIg; + const float s_gb = igib[index] - mIg * mIb; + float s_bb = ibib[index] - mIb * mIb; + + const float c_rp = irp[index] - mIr * mP; + const float c_gp = igp[index] - mIg * mP; + const float c_bp = ibp[index] - mIb * mP; + + const float trace = s_rr + s_gg + s_bb; + const float adaptEps = params.guidedEpsilon * params.guidedEpsilon / + ((trace / 3.0f) + params.guidedEpsilon + 1e-10f); + s_rr += adaptEps; + s_gg += adaptEps; + s_bb += adaptEps; + + const float det = s_rr * (s_gg * s_bb - s_gb * s_gb) + - s_rg * (s_rg * s_bb - s_gb * s_rb) + + s_rb * (s_rg * s_gb - s_gg * s_rb); + const float invDet = (std::fabs(det) > 1e-12f) ? (1.0f / det) : 0.0f; + + const float inv_rr = (s_gg * s_bb - s_gb * s_gb) * invDet; + const float inv_rg = (s_rb * s_gb - s_rg * s_bb) * invDet; + const float inv_rb = (s_rg * s_gb - s_rb * s_gg) * invDet; + const float inv_gg = (s_rr * s_bb - s_rb * s_rb) * invDet; + const float inv_gb = (s_rb * s_rg - s_rr * s_gb) * invDet; + const float inv_bb = (s_rr * s_gg - s_rg * s_rg) * invDet; + + meanAr[index] = inv_rr * c_rp + inv_rg * c_gp + inv_rb * c_bp; + meanAg[index] = inv_rg * c_rp + inv_gg * c_gp + inv_gb * c_bp; + meanAb[index] = inv_rb * c_rp + inv_gb * c_gp + inv_bb * c_bp; + meanB[index] = mP - meanAr[index] * mIr - meanAg[index] * mIg - meanAb[index] * mIb; + } + + gaussianBlurSingle(meanAr.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(meanAg.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(meanAb.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + gaussianBlurSingle(meanB.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius); + + if (iter < numIter - 1) { + for (int index = 0; index < pixelCount; ++index) { + const int rgba = index * 4; + const float q = meanAr[index] * frame.srcRgba[rgba + 0] + + meanAg[index] * 
frame.srcRgba[rgba + 1] + + meanAb[index] * frame.srcRgba[rgba + 2] + + meanB[index]; + rawAlpha[index] = clamp01(q); + } + } + } + + for (int index = 0; index < pixelCount; ++index) { + const int rgba = index * 4; + const float guidedAlpha = clamp01(meanAr[index] * frame.srcRgba[rgba + 0] + + meanAg[index] * frame.srcRgba[rgba + 1] + + meanAb[index] * frame.srcRgba[rgba + 2] + + meanB[index]); + const float alpha = savedRawAlpha[index] * (1.0f - params.guidedMix) + guidedAlpha * params.guidedMix; + if (params.premultiply) { + dstRgba[rgba + 0] *= alpha; + dstRgba[rgba + 1] *= alpha; + dstRgba[rgba + 2] *= alpha; + } + dstRgba[rgba + 3] = alpha; + } +} + +void applyAdditiveKey(const PackedFrame& frame, + const IBKeyerParams& params, + const std::vector& blurBgR, + const std::vector& blurBgG, + const std::vector& blurBgB, + float* dstRgba) +{ + const int pixelCount = frame.width * frame.height; + for (int index = 0; index < pixelCount; ++index) { + const int rgba = index * 4; + const float alpha = dstRgba[rgba + 3]; + if (alpha > 0.999f) { + continue; + } + + float scrR = params.pickR; + float scrG = params.pickG; + float scrB = params.pickB; + if (params.useScreenInput && frame.screenRgba != nullptr) { + scrR = frame.screenRgba[rgba + 0]; + scrG = frame.screenRgba[rgba + 1]; + scrB = frame.screenRgba[rgba + 2]; + } + + const float srcR = frame.srcRgba[rgba + 0]; + const float srcG = frame.srcRgba[rgba + 1]; + const float srcB = frame.srcRgba[rgba + 2]; + + float resR = 0.0f; + float resG = 0.0f; + float resB = 0.0f; + if (params.additiveKeyMode == 0) { + resR = srcR - scrR; + resG = srcG - scrG; + resB = srcB - scrB; + const float lum = luminance(resR, resG, resB); + resR = lum * (1.0f - params.additiveKeySaturation) + resR * params.additiveKeySaturation; + resG = lum * (1.0f - params.additiveKeySaturation) + resG * params.additiveKeySaturation; + resB = lum * (1.0f - params.additiveKeySaturation) + resB * params.additiveKeySaturation; + } else { + float fR = (scrR 
> 1e-6f) ? srcR / scrR : 1.0f; + float fG = (scrG > 1e-6f) ? srcG / scrG : 1.0f; + float fB = (scrB > 1e-6f) ? srcB / scrB : 1.0f; + const float fLum = luminance(fR, fG, fB); + fR = fLum * (1.0f - params.additiveKeySaturation) + fR * params.additiveKeySaturation; + fG = fLum * (1.0f - params.additiveKeySaturation) + fG * params.additiveKeySaturation; + fB = fLum * (1.0f - params.additiveKeySaturation) + fB * params.additiveKeySaturation; + if (!blurBgR.empty()) { + resR = blurBgR[index] * (fR - 1.0f); + resG = blurBgG[index] * (fG - 1.0f); + resB = blurBgB[index] * (fB - 1.0f); + } + } + + if (params.additiveKeyBlackClamp) { + resR = std::max(resR, 0.0f); + resG = std::max(resG, 0.0f); + resB = std::max(resB, 0.0f); + } + + const float weight = (1.0f - alpha) * params.additiveKeyAmount; + dstRgba[rgba + 0] += resR * weight; + dstRgba[rgba + 1] += resG * weight; + dstRgba[rgba + 2] += resB * weight; + } +} + +// Moved from: the old "CPU PROCESSING — FALLBACK" section. +// +// CPU code is slower, but it is the least dependent on host-specific GPU contracts. That makes it +// the best place to preserve the algorithm "as intended" and compare GPU paths against it when +// debugging correctness regressions. +void renderCpuPacked(const IBKeyerParams& params, const PackedFrame& frame) +{ + // This section is intentionally close to the old IBKeymaster CPU fallback. + // When I first split the plugin, I simplified this path too much and that made CPU/CUDA/Metal + // parity harder to reason about because the "reference" path was no longer actually the old + // algorithm. The fuller structure here is deliberate: it restores the original pass ordering. 
+ const int width = frame.width; + const int height = frame.height; + const int pixelCount = width * height; + const bool doGF = guidedFilterActive(params); + const bool doPrematte = params.prematteEnabled && params.prematteBlur > 0; + const bool doBgWrap = params.bgWrapEnabled && frame.backgroundRgba != nullptr && params.bgWrapAmount > 0.0f; + const bool doAdditive = params.additiveKeyEnabled && params.additiveKeyAmount > 0.0f; + const bool needBgBlur = doBgWrap || (doAdditive && params.additiveKeyMode == 1 && frame.backgroundRgba != nullptr); + + std::vector rawAlpha(pixelCount, 0.0f); + std::vector guide(doGF ? pixelCount : 0); + std::vector meanI(doGF ? pixelCount : 0); + std::vector meanP(doGF ? pixelCount : 0); + std::vector meanIp(doGF ? pixelCount : 0); + std::vector meanII(doGF ? pixelCount : 0); + std::vector scratch((doGF || needBgBlur || doPrematte) ? pixelCount : 1, 0.0f); + std::vector cleanPlate; + std::vector cleanR; + std::vector cleanG; + std::vector cleanB; + std::vector erodedAlpha; + const float* activeScreen = (params.useScreenInput && frame.screenRgba != nullptr) ? 
frame.screenRgba : nullptr; + + auto runCorePass = [&](const float* screenRgba, bool forceScreen) { for (int index = 0; index < pixelCount; ++index) { const int rgba = index * 4; - const float raw = frame.dstRgba[rgba + 3]; - const float guided = clamp01(meanI[index] * guide[index] + meanP[index]); - const float alpha = raw * (1.0f - params.guidedMix) + guided * params.guidedMix; - if (params.premultiply) { - frame.dstRgba[rgba + 0] *= alpha; - frame.dstRgba[rgba + 1] *= alpha; - frame.dstRgba[rgba + 2] *= alpha; + const float srcR = frame.srcRgba[rgba + 0]; + const float srcG = frame.srcRgba[rgba + 1]; + const float srcB = frame.srcRgba[rgba + 2]; + + float scrR = params.pickR; + float scrG = params.pickG; + float scrB = params.pickB; + if (forceScreen && screenRgba != nullptr) { + scrR = screenRgba[rgba + 0]; + scrG = screenRgba[rgba + 1]; + scrB = screenRgba[rgba + 2]; } + + const float despillRGB = despillValue(srcR, srcG, srcB, params.screenColor, params.bias, params.limit); + const float despillScreen = despillValue(scrR, scrG, scrB, params.screenColor, params.bias, params.limit); + const float normalized = safeDivide(despillRGB, despillScreen); + const float spillMul = std::max(0.0f, normalized); + const float ssR = srcR - spillMul * scrR; + const float ssG = srcG - spillMul * scrG; + const float ssB = srcB - spillMul * scrB; + + float alpha = clamp01(1.0f - normalized); + if (params.nearGreyExtract && params.nearGreyAmount > 0.0f) { + const float divR = safeDivide(ssR, srcR); + const float divG = safeDivide(ssG, srcG); + const float divB = safeDivide(ssB, srcB); + const float ngeAlpha = nearGreyAlpha(divR, divG, divB, params.screenColor, params.nearGreySoftness); + alpha = alpha + params.nearGreyAmount * ngeAlpha * (1.0f - alpha); + } + if (params.whiteClip > params.blackClip + 1e-6f) { + alpha = clamp01((alpha - params.blackClip) / (params.whiteClip - params.blackClip)); + } + if (params.matteGamma != 1.0f && alpha > 0.0f && alpha < 1.0f) { + alpha = 
std::pow(alpha, params.matteGamma); + } + + const float respillMul = std::max(0.0f, despillScreen * normalized); + frame.dstRgba[rgba + 0] = ssR + respillMul * params.respillR; + frame.dstRgba[rgba + 1] = ssG + respillMul * params.respillG; + frame.dstRgba[rgba + 2] = ssB + respillMul * params.respillB; frame.dstRgba[rgba + 3] = alpha; + rawAlpha[index] = alpha; + + if (doGF && params.guidedFilterMode == 0) { + const float lum = luminance(srcR, srcG, srcB); + guide[index] = lum * (1.0f - params.edgeProtect) + alpha * params.edgeProtect; + } + } + }; + + runCorePass(activeScreen, params.useScreenInput && activeScreen != nullptr); + + if (doPrematte) { + const int iterations = std::max(1, std::min(params.prematteIterations, 5)); + std::vector prematteAlpha = rawAlpha; + erodedAlpha.resize(pixelCount); + for (int iter = 0; iter < iterations; ++iter) { + const float* alphaSource = prematteAlpha.data(); + if (params.prematteErode > 0) { + erodeSingle(prematteAlpha.data(), erodedAlpha.data(), width, height, params.prematteErode); + alphaSource = erodedAlpha.data(); + } + buildCleanPlate(frame, params, std::vector(alphaSource, alphaSource + pixelCount), + cleanPlate, cleanR, cleanG, cleanB, scratch); + runCorePass(cleanPlate.data(), true); + prematteAlpha = rawAlpha; + } + } + + if (params.viewMode == 2) { + if (doPrematte && !cleanPlate.empty()) { + copyDiagnosticRgba(frame.dstRgba, cleanPlate.data(), pixelCount); + } else if (activeScreen != nullptr) { + copyDiagnosticRgba(frame.dstRgba, activeScreen, pixelCount); + } + return; + } + + applyExternalMatte(rawAlpha.data(), frame.dstRgba, pixelCount, frame.garbageMatteRgba, true); + applyExternalMatte(rawAlpha.data(), frame.dstRgba, pixelCount, frame.occlusionMatteRgba, false); + + if (params.viewMode == 1) { + writeAlphaDiagnostic(frame.dstRgba, rawAlpha.data(), pixelCount); + return; + } + + if (doGF) { + if (params.guidedFilterMode == 1) { + runRgbGuidedFilter(frame, params, rawAlpha, scratch, frame.dstRgba); + } else { 
+ runScalarGuidedFilter(frame, params, rawAlpha, guide, meanI, meanP, meanIp, meanII, scratch, frame.dstRgba); } } else if (params.premultiply) { - // 7. Optional premultiply. for (int index = 0; index < pixelCount; ++index) { const int rgba = index * 4; const float alpha = frame.dstRgba[rgba + 3]; @@ -443,6 +849,121 @@ void renderCpuPacked(const IBKeyerParams& params, const PackedFrame& frame) frame.dstRgba[rgba + 2] *= alpha; } } + + if (params.viewMode == 3) { + writeAlphaFromRgbaDiagnostic(frame.dstRgba, pixelCount); + return; + } + + // This edge-colour pass was one of the features lost in the first split. It looks optional in + // UI terms, but omitting it changes the comped edge colour in ways users absolutely notice. + if (params.edgeColorCorrect > 0.0f) { + const bool isPremult = params.premultiply; + for (int index = 0; index < pixelCount; ++index) { + const int rgba = index * 4; + const float alpha = frame.dstRgba[rgba + 3]; + if (alpha <= 0.005f || alpha >= 0.995f) { + continue; + } + + float scrR = params.pickR; + float scrG = params.pickG; + float scrB = params.pickB; + if (params.useScreenInput && frame.screenRgba != nullptr) { + scrR = frame.screenRgba[rgba + 0]; + scrG = frame.screenRgba[rgba + 1]; + scrB = frame.screenRgba[rgba + 2]; + } + + const float srcR = frame.srcRgba[rgba + 0]; + const float srcG = frame.srcRgba[rgba + 1]; + const float srcB = frame.srcRgba[rgba + 2]; + const float invA = 1.0f / alpha; + float fgR = (srcR - scrR * (1.0f - alpha)) * invA; + float fgG = (srcG - scrG * (1.0f - alpha)) * invA; + float fgB = (srcB - scrB * (1.0f - alpha)) * invA; + + fgR = std::max(-0.5f, std::min(2.0f, fgR)); + fgG = std::max(-0.5f, std::min(2.0f, fgG)); + fgB = std::max(-0.5f, std::min(2.0f, fgB)); + + float curR = frame.dstRgba[rgba + 0]; + float curG = frame.dstRgba[rgba + 1]; + float curB = frame.dstRgba[rgba + 2]; + if (isPremult) { + curR *= invA; + curG *= invA; + curB *= invA; + } + + const float edgeFactor = alpha * (1.0f - alpha) * 
4.0f * params.edgeColorCorrect; + float outR = curR + (fgR - curR) * edgeFactor; + float outG = curG + (fgG - curG) * edgeFactor; + float outB = curB + (fgB - curB) * edgeFactor; + + if (isPremult) { + outR *= alpha; + outG *= alpha; + outB *= alpha; + } + + frame.dstRgba[rgba + 0] = outR; + frame.dstRgba[rgba + 1] = outG; + frame.dstRgba[rgba + 2] = outB; + } + } + + if (params.viewMode == 4) { + return; + } + + std::vector bgR; + std::vector bgG; + std::vector bgB; + if (needBgBlur) { + const int blurRadius = std::max(1, params.bgWrapBlur); + const std::vector bgWeights = buildGaussianWeights(blurRadius); + bgR.resize(pixelCount); + bgG.resize(pixelCount); + bgB.resize(pixelCount); + + for (int index = 0; index < pixelCount; ++index) { + const int rgba = index * 4; + bgR[index] = frame.backgroundRgba[rgba + 0]; + bgG[index] = frame.backgroundRgba[rgba + 1]; + bgB[index] = frame.backgroundRgba[rgba + 2]; + } + + gaussianBlurSingle(bgR.data(), scratch.data(), width, height, bgWeights, blurRadius); + gaussianBlurSingle(bgG.data(), scratch.data(), width, height, bgWeights, blurRadius); + gaussianBlurSingle(bgB.data(), scratch.data(), width, height, bgWeights, blurRadius); + + if (params.viewMode == 5) { + for (int index = 0; index < pixelCount; ++index) { + const int rgba = index * 4; + frame.dstRgba[rgba + 0] = bgR[index]; + frame.dstRgba[rgba + 1] = bgG[index]; + frame.dstRgba[rgba + 2] = bgB[index]; + frame.dstRgba[rgba + 3] = 1.0f; + } + return; + } + + if (doBgWrap) { + for (int index = 0; index < pixelCount; ++index) { + const int rgba = index * 4; + const float alpha = frame.dstRgba[rgba + 3]; + const float wrapWeight = alpha * (1.0f - alpha) * 4.0f * params.bgWrapAmount; + frame.dstRgba[rgba + 0] += bgR[index] * wrapWeight; + frame.dstRgba[rgba + 1] += bgG[index] * wrapWeight; + frame.dstRgba[rgba + 2] += bgB[index] * wrapWeight; + } + } + } + + if (doAdditive) { + applyAdditiveKey(frame, params, bgR, bgG, bgB, frame.dstRgba); + } } // Moved from: the old 
direct CPU/GPU setup path where images were sampled ad hoc. @@ -480,6 +1001,37 @@ void packImageWindow(const OFX::Image* image, const OfxRectI& renderWindow, std: } } +void packMatteWindow(const OFX::Image* image, const OfxRectI& renderWindow, std::vector& packed) +{ + const int width = renderWindow.x2 - renderWindow.x1; + const int height = renderWindow.y2 - renderWindow.y1; + packed.assign(static_cast(width) * static_cast(height) * 4u, 0.0f); + + if (image == nullptr) { + return; + } + + const OFX::PixelComponentEnum components = image->getPixelComponents(); + const int componentCount = (components == OFX::ePixelComponentRGBA) ? 4 : 3; + + for (int y = renderWindow.y1; y < renderWindow.y2; ++y) { + for (int x = renderWindow.x1; x < renderWindow.x2; ++x) { + const float* sourcePixel = static_cast(image->getPixelAddress(x, y)); + if (sourcePixel == nullptr) { + continue; + } + const int localIndex = ((y - renderWindow.y1) * width + (x - renderWindow.x1)) * 4; + const float matte = (componentCount == 4) + ? sourcePixel[3] + : luminance(sourcePixel[0], sourcePixel[1], sourcePixel[2]); + packed[localIndex + 0] = matte; + packed[localIndex + 1] = matte; + packed[localIndex + 2] = matte; + packed[localIndex + 3] = matte; + } + } +} + // Companion to packImageWindow(). 
// // Once the staged CUDA path finishes, we still need to write the result back through the host's @@ -512,18 +1064,33 @@ BackendResult renderCpu(const RenderRequest& request) const int height = request.renderWindow.y2 - request.renderWindow.y1; std::vector srcPacked; std::vector screenPacked; + std::vector backgroundPacked; + std::vector garbagePacked; + std::vector occlusionPacked; std::vector dstPacked(static_cast(width) * static_cast(height) * 4u, 0.0f); packImageWindow(request.srcImage, request.renderWindow, srcPacked); if (request.params.useScreenInput && request.screenImage != nullptr) { packImageWindow(request.screenImage, request.renderWindow, screenPacked); } + if (request.params.bgWrapEnabled && request.backgroundImage != nullptr) { + packImageWindow(request.backgroundImage, request.renderWindow, backgroundPacked); + } + if (request.garbageMatteImage != nullptr) { + packMatteWindow(request.garbageMatteImage, request.renderWindow, garbagePacked); + } + if (request.occlusionMatteImage != nullptr) { + packMatteWindow(request.occlusionMatteImage, request.renderWindow, occlusionPacked); + } PackedFrame frame; frame.width = width; frame.height = height; frame.srcRgba = srcPacked.data(); frame.screenRgba = screenPacked.empty() ? nullptr : screenPacked.data(); + frame.backgroundRgba = backgroundPacked.empty() ? nullptr : backgroundPacked.data(); + frame.garbageMatteRgba = garbagePacked.empty() ? nullptr : garbagePacked.data(); + frame.occlusionMatteRgba = occlusionPacked.empty() ? nullptr : occlusionPacked.data(); frame.dstRgba = dstPacked.data(); // CPU is intentionally kept as the reference path. 
When GPU behavior differs, this @@ -545,18 +1112,33 @@ BackendResult renderInternalCuda(const RenderRequest& request) const int height = request.renderWindow.y2 - request.renderWindow.y1; std::vector srcPacked; std::vector screenPacked; + std::vector backgroundPacked; + std::vector garbagePacked; + std::vector occlusionPacked; std::vector dstPacked(static_cast(width) * static_cast(height) * 4u, 0.0f); packImageWindow(request.srcImage, request.renderWindow, srcPacked); if (request.params.useScreenInput && request.screenImage != nullptr) { packImageWindow(request.screenImage, request.renderWindow, screenPacked); } + if (request.params.bgWrapEnabled && request.backgroundImage != nullptr) { + packImageWindow(request.backgroundImage, request.renderWindow, backgroundPacked); + } + if (request.garbageMatteImage != nullptr) { + packMatteWindow(request.garbageMatteImage, request.renderWindow, garbagePacked); + } + if (request.occlusionMatteImage != nullptr) { + packMatteWindow(request.occlusionMatteImage, request.renderWindow, occlusionPacked); + } PackedFrame frame; frame.width = width; frame.height = height; frame.srcRgba = srcPacked.data(); frame.screenRgba = screenPacked.empty() ? nullptr : screenPacked.data(); + frame.backgroundRgba = backgroundPacked.empty() ? nullptr : backgroundPacked.data(); + frame.garbageMatteRgba = garbagePacked.empty() ? nullptr : garbagePacked.data(); + frame.occlusionMatteRgba = occlusionPacked.empty() ? nullptr : occlusionPacked.data(); frame.dstRgba = dstPacked.data(); std::string error; @@ -612,6 +1194,36 @@ BackendResult renderHostCuda(const RenderRequest& request) } } + if (request.params.bgWrapEnabled && request.backgroundImage != nullptr) { + if (frame.background.data == nullptr || frame.background.rowBytes == 0 || (frame.background.components != 3 && frame.background.components != 4)) { + return { + false, + BackendKind::HostCUDA, + "Host CUDA declined because the Background clip did not expose a usable RGB/RGBA device buffer." 
+ }; + } + } + if (request.garbageMatteImage != nullptr) { + if (frame.garbageMatte.data == nullptr || frame.garbageMatte.rowBytes == 0 || (frame.garbageMatte.components != 3 && frame.garbageMatte.components != 4)) { + return {false, BackendKind::HostCUDA, "Host CUDA declined because the Garbage Matte clip did not expose a usable RGB/RGBA device buffer."}; + } + } + if (request.occlusionMatteImage != nullptr) { + if (frame.occlusionMatte.data == nullptr || frame.occlusionMatte.rowBytes == 0 || (frame.occlusionMatte.components != 3 && frame.occlusionMatte.components != 4)) { + return {false, BackendKind::HostCUDA, "Host CUDA declined because the Occlusion Matte clip did not expose a usable RGB/RGBA device buffer."}; + } + } + + logMessage(false, formatString( + "IBKeyer: HostCUDA zero-copy validated. prematte=%d rgbGuide=%d bgWrap=%d additive=%d garbage=%d occlusion=%d viewMode=%d", + request.params.prematteEnabled ? 1 : 0, + (request.params.guidedFilterEnabled && request.params.guidedFilterMode == 1) ? 1 : 0, + request.params.bgWrapEnabled ? 1 : 0, + request.params.additiveKeyEnabled ? 1 : 0, + request.garbageMatteImage != nullptr ? 1 : 0, + request.occlusionMatteImage != nullptr ? 1 : 0, + request.params.viewMode)); + std::string error; if (!renderCudaHost(request.params, frame, request.hostCudaStream, error)) { return {false, BackendKind::HostCUDA, error}; @@ -671,10 +1283,53 @@ BackendResult renderHostMetal(const RenderRequest& request) } } - // The shared Metal kernel still expects host-provided MTLBuffer handles. We keep that - // path only on macOS, and only when the host gave us a command queue and matching bounds. - const float nearGreySoftness = request.params.nearGreyAmount; - const float nearGreyAmount = request.params.nearGreyExtract ? 
1.0f : 0.0f; + if (request.params.bgWrapEnabled && request.backgroundImage != nullptr) { + const OfxRectI& bgBounds = request.backgroundImage->getBounds(); + if (bgBounds.x1 != srcBounds.x1 || + bgBounds.y1 != srcBounds.y1 || + bgBounds.x2 != srcBounds.x2 || + bgBounds.y2 != srcBounds.y2) { + return { + false, + BackendKind::HostMetal, + "Host Metal requires the Background clip to match the Source bounds; mismatched host buffers fall back to CPU rather than guessing at per-host buffer layouts." + }; + } + + if (request.backgroundImage->getPixelComponents() != OFX::ePixelComponentRGBA) { + return { + false, + BackendKind::HostMetal, + "Host Metal currently assumes the Background clip arrives as RGBA device memory; RGB Background clips fall back to CPU so the wrapper does not lie about parity." + }; + } + } + + auto validateMetalMatteClip = [&](const OFX::Image* image, const char* name) -> BackendResult { + if (image == nullptr) { + return {true, BackendKind::HostMetal, ""}; + } + const OfxRectI& matteBounds = image->getBounds(); + if (matteBounds.x1 != srcBounds.x1 || matteBounds.y1 != srcBounds.y1 || + matteBounds.x2 != srcBounds.x2 || matteBounds.y2 != srcBounds.y2) { + return {false, BackendKind::HostMetal, formatString("Host Metal requires the %s clip to match the Source bounds; mismatched host buffers fall back to CPU rather than guessing at per-host buffer layouts.", name)}; + } + if (image->getPixelComponents() != OFX::ePixelComponentRGBA) { + return {false, BackendKind::HostMetal, formatString("Host Metal currently assumes the %s clip arrives as RGBA device memory; RGB mattes fall back to CPU so the wrapper does not misread host Metal buffers.", name)}; + } + return {true, BackendKind::HostMetal, ""}; + }; + + if (const BackendResult matte = validateMetalMatteClip(request.garbageMatteImage, "Garbage Matte"); !matte.success) { + return matte; + } + if (const BackendResult matte = validateMetalMatteClip(request.occlusionMatteImage, "Occlusion Matte"); 
!matte.success) { + return matte; + } + + // The shared Metal kernel already had support for the richer IBKeymaster control surface. + // The cross-platform wrapper was previously hard-coding neutral placeholders here, which made + // macOS look feature-complete in code while quietly disabling those controls in practice. const int width = srcBounds.x2 - srcBounds.x1; const int height = srcBounds.y2 - srcBounds.y1; @@ -694,26 +1349,45 @@ BackendResult renderHostMetal(const RenderRequest& request) request.params.respillB, request.params.premultiply ? 1 : 0, request.params.nearGreyExtract ? 1 : 0, - nearGreyAmount, - nearGreySoftness, + request.params.nearGreyAmount, + request.params.nearGreySoftness, request.params.blackClip, request.params.whiteClip, - 1.0f, + request.params.matteGamma, + request.params.prematteEnabled ? 1 : 0, + request.params.prematteBlur, + request.params.prematteErode, + request.params.prematteIterations, request.params.guidedFilterEnabled ? 1 : 0, + request.params.guidedFilterMode, request.params.guidedRadius, request.params.guidedEpsilon, request.params.guidedMix, - 0.0f, - 1, - 0.0f, - 0, - 0, - 0.0f, + request.params.edgeProtect, + request.params.refineIterations, + request.params.edgeColorCorrect, + request.params.bgWrapEnabled ? 1 : 0, + request.params.bgWrapBlur, + request.params.bgWrapAmount, + request.params.additiveKeyEnabled ? 1 : 0, + request.params.additiveKeyMode, + request.params.additiveKeySaturation, + request.params.additiveKeyAmount, + request.params.additiveKeyBlackClamp ? 1 : 0, + request.params.viewMode, static_cast(request.srcImage->getPixelData()), (request.params.useScreenInput && request.screenImage != nullptr) ? static_cast(request.screenImage->getPixelData()) : nullptr, - nullptr, + (request.params.bgWrapEnabled && request.backgroundImage != nullptr) + ? static_cast(request.backgroundImage->getPixelData()) + : nullptr, + (request.garbageMatteImage != nullptr) + ? 
static_cast(request.garbageMatteImage->getPixelData()) + : nullptr, + (request.occlusionMatteImage != nullptr) + ? static_cast(request.occlusionMatteImage->getPixelData()) + : nullptr, static_cast(request.dstImage->getPixelData())); return {true, BackendKind::HostMetal, "Rendered with the host Metal backend."}; @@ -725,6 +1399,8 @@ BackendResult renderHostMetal(const RenderRequest& request) // hard to debug. This helper makes the decision and the reason visible in one place. BackendKind chooseBackend(const RenderRequest& request, std::string& reason) { + const bool needsReferenceOnly = requiresReferenceOnlyFeatures(request); + if (envFlagEnabled("IBKEYER_FORCE_CPU")) { reason = "IBKEYER_FORCE_CPU forced the reference CPU path."; return BackendKind::CPU; @@ -740,12 +1416,22 @@ BackendKind chooseBackend(const RenderRequest& request, std::string& reason) const CudaRenderMode cudaMode = selectedCudaRenderMode(); if (request.hostCudaEnabled) { if (request.hostCudaStream != nullptr) { - reason = "The host enabled OFX CUDA render and supplied a CUDA stream, so IBKeyer must stay on the host-CUDA memory path."; + // This used to branch on "reference-only" features. Once prematte, external mattes, + // RGB guide mode, additive key, and diagnostics landed in CUDA, keeping that older + // wording became actively misleading during zero-copy debugging. + reason = "The host enabled OFX CUDA render and supplied a CUDA stream, so IBKeyer stays on the host-CUDA memory path."; return BackendKind::HostCUDA; } reason = "The host enabled OFX CUDA render but did not supply a CUDA stream. That leaves no safe CPU-readable fallback for the CUDA images."; return BackendKind::HostCUDA; } + if (needsReferenceOnly) { + // These newer private-branch features were ported CPU-first so the result stays trustworthy + // while the shared CUDA implementation catches up. This is only safe when the host has not + // already switched fetchImage() over to CUDA device memory. 
+ reason = "The requested feature set currently relies on the CPU reference path on Windows/Linux to preserve parity (prematte, external mattes, additive key, RGB guided filter, or diagnostic views)."; + return BackendKind::CPU; + } if (cudaMode == CudaRenderMode::HostPreferred && request.hostCudaStream != nullptr) { reason = "Host CUDA is the selected policy and the host supplied CUDA device images plus a CUDA stream, so the zero-copy path is preferred."; diff --git a/IBKeymaster/IBKeyerBackend.h b/IBKeymaster/IBKeyerBackend.h index 9fd7493..18c15fa 100644 --- a/IBKeymaster/IBKeyerBackend.h +++ b/IBKeymaster/IBKeyerBackend.h @@ -43,6 +43,9 @@ struct DeviceRenderFrame { ImagePlaneDesc src; ImagePlaneDesc screen; + ImagePlaneDesc background; + ImagePlaneDesc garbageMatte; + ImagePlaneDesc occlusionMatte; MutableImagePlaneDesc dst; OfxRectI renderWindow = {0, 0, 0, 0}; }; @@ -51,6 +54,9 @@ struct RenderRequest { const OFX::Image* srcImage = nullptr; const OFX::Image* screenImage = nullptr; + const OFX::Image* backgroundImage = nullptr; + const OFX::Image* garbageMatteImage = nullptr; + const OFX::Image* occlusionMatteImage = nullptr; OFX::Image* dstImage = nullptr; OfxRectI renderWindow = {0, 0, 0, 0}; bool hostCudaEnabled = false; diff --git a/IBKeymaster/IBKeyerCuda.cu b/IBKeymaster/IBKeyerCuda.cu index ffc4b16..e2dffb2 100644 --- a/IBKeymaster/IBKeyerCuda.cu +++ b/IBKeymaster/IBKeyerCuda.cu @@ -44,12 +44,36 @@ bool hostCudaForceSyncEnabled() struct CudaScratchCache { float* rawAlpha = nullptr; + float* savedRawAlpha = nullptr; float* guide = nullptr; float* meanI = nullptr; float* meanP = nullptr; float* meanIp = nullptr; float* meanII = nullptr; float* scratch = nullptr; + float* bgR = nullptr; + float* bgG = nullptr; + float* bgB = nullptr; + float* prematteAlpha = nullptr; + float* cleanPlate = nullptr; + + float* rgbMeanIr = nullptr; + float* rgbMeanIg = nullptr; + float* rgbMeanIb = nullptr; + float* rgbMeanP = nullptr; + float* rgbIrIr = nullptr; + float* 
rgbIrIg = nullptr; + float* rgbIrIb = nullptr; + float* rgbIgIg = nullptr; + float* rgbIgIb = nullptr; + float* rgbIbIb = nullptr; + float* rgbIrP = nullptr; + float* rgbIgP = nullptr; + float* rgbIbP = nullptr; + float* rgbMeanAr = nullptr; + float* rgbMeanAg = nullptr; + float* rgbMeanAb = nullptr; + float* rgbMeanB = nullptr; int pixelCapacity = 0; float* gaussianWeights = nullptr; @@ -61,24 +85,70 @@ struct CudaScratchCache void release() { cudaFree(gaussianWeights); + cudaFree(rgbMeanB); + cudaFree(rgbMeanAb); + cudaFree(rgbMeanAg); + cudaFree(rgbMeanAr); + cudaFree(rgbIbP); + cudaFree(rgbIgP); + cudaFree(rgbIrP); + cudaFree(rgbIbIb); + cudaFree(rgbIgIb); + cudaFree(rgbIgIg); + cudaFree(rgbIrIb); + cudaFree(rgbIrIg); + cudaFree(rgbIrIr); + cudaFree(rgbMeanP); + cudaFree(rgbMeanIb); + cudaFree(rgbMeanIg); + cudaFree(rgbMeanIr); + cudaFree(cleanPlate); + cudaFree(prematteAlpha); + cudaFree(bgB); + cudaFree(bgG); + cudaFree(bgR); cudaFree(scratch); cudaFree(meanII); cudaFree(meanIp); cudaFree(meanP); cudaFree(meanI); cudaFree(guide); + cudaFree(savedRawAlpha); cudaFree(rawAlpha); if (inFlightEvent != nullptr) { cudaEventDestroy(inFlightEvent); } gaussianWeights = nullptr; + rgbMeanB = nullptr; + rgbMeanAb = nullptr; + rgbMeanAg = nullptr; + rgbMeanAr = nullptr; + rgbIbP = nullptr; + rgbIgP = nullptr; + rgbIrP = nullptr; + rgbIbIb = nullptr; + rgbIgIb = nullptr; + rgbIgIg = nullptr; + rgbIrIb = nullptr; + rgbIrIg = nullptr; + rgbIrIr = nullptr; + rgbMeanP = nullptr; + rgbMeanIb = nullptr; + rgbMeanIg = nullptr; + rgbMeanIr = nullptr; + cleanPlate = nullptr; + prematteAlpha = nullptr; + bgB = nullptr; + bgG = nullptr; + bgR = nullptr; scratch = nullptr; meanII = nullptr; meanIp = nullptr; meanP = nullptr; meanI = nullptr; guide = nullptr; + savedRawAlpha = nullptr; rawAlpha = nullptr; pixelCapacity = 0; weightRadius = -1; @@ -93,30 +163,100 @@ struct CudaScratchCache return true; } + cudaFree(bgB); + cudaFree(bgG); + cudaFree(bgR); + cudaFree(prematteAlpha); + 
cudaFree(cleanPlate); + cudaFree(rgbMeanB); + cudaFree(rgbMeanAb); + cudaFree(rgbMeanAg); + cudaFree(rgbMeanAr); + cudaFree(rgbIbP); + cudaFree(rgbIgP); + cudaFree(rgbIrP); + cudaFree(rgbIbIb); + cudaFree(rgbIgIb); + cudaFree(rgbIgIg); + cudaFree(rgbIrIb); + cudaFree(rgbIrIg); + cudaFree(rgbIrIr); + cudaFree(rgbMeanP); + cudaFree(rgbMeanIb); + cudaFree(rgbMeanIg); + cudaFree(rgbMeanIr); cudaFree(scratch); cudaFree(meanII); cudaFree(meanIp); cudaFree(meanP); cudaFree(meanI); cudaFree(guide); + cudaFree(savedRawAlpha); cudaFree(rawAlpha); + bgB = nullptr; + bgG = nullptr; + bgR = nullptr; + prematteAlpha = nullptr; + cleanPlate = nullptr; + rgbMeanB = nullptr; + rgbMeanAb = nullptr; + rgbMeanAg = nullptr; + rgbMeanAr = nullptr; + rgbIbP = nullptr; + rgbIgP = nullptr; + rgbIrP = nullptr; + rgbIbIb = nullptr; + rgbIgIb = nullptr; + rgbIgIg = nullptr; + rgbIrIb = nullptr; + rgbIrIg = nullptr; + rgbIrIr = nullptr; + rgbMeanP = nullptr; + rgbMeanIb = nullptr; + rgbMeanIg = nullptr; + rgbMeanIr = nullptr; scratch = nullptr; meanII = nullptr; meanIp = nullptr; meanP = nullptr; meanI = nullptr; guide = nullptr; + savedRawAlpha = nullptr; rawAlpha = nullptr; const size_t channelBytes = static_cast(pixelCount) * sizeof(float); + const size_t rgbaBytes = static_cast(pixelCount) * 4u * sizeof(float); if (cudaMalloc(&rawAlpha, channelBytes) != cudaSuccess || + cudaMalloc(&savedRawAlpha, channelBytes) != cudaSuccess || cudaMalloc(&guide, channelBytes) != cudaSuccess || cudaMalloc(&meanI, channelBytes) != cudaSuccess || cudaMalloc(&meanP, channelBytes) != cudaSuccess || cudaMalloc(&meanIp, channelBytes) != cudaSuccess || cudaMalloc(&meanII, channelBytes) != cudaSuccess || - cudaMalloc(&scratch, channelBytes) != cudaSuccess) { + cudaMalloc(&scratch, channelBytes) != cudaSuccess || + cudaMalloc(&bgR, channelBytes) != cudaSuccess || + cudaMalloc(&bgG, channelBytes) != cudaSuccess || + cudaMalloc(&bgB, channelBytes) != cudaSuccess || + cudaMalloc(&prematteAlpha, channelBytes) != 
cudaSuccess || + cudaMalloc(&cleanPlate, rgbaBytes) != cudaSuccess || + cudaMalloc(&rgbMeanIr, channelBytes) != cudaSuccess || + cudaMalloc(&rgbMeanIg, channelBytes) != cudaSuccess || + cudaMalloc(&rgbMeanIb, channelBytes) != cudaSuccess || + cudaMalloc(&rgbMeanP, channelBytes) != cudaSuccess || + cudaMalloc(&rgbIrIr, channelBytes) != cudaSuccess || + cudaMalloc(&rgbIrIg, channelBytes) != cudaSuccess || + cudaMalloc(&rgbIrIb, channelBytes) != cudaSuccess || + cudaMalloc(&rgbIgIg, channelBytes) != cudaSuccess || + cudaMalloc(&rgbIgIb, channelBytes) != cudaSuccess || + cudaMalloc(&rgbIbIb, channelBytes) != cudaSuccess || + cudaMalloc(&rgbIrP, channelBytes) != cudaSuccess || + cudaMalloc(&rgbIgP, channelBytes) != cudaSuccess || + cudaMalloc(&rgbIbP, channelBytes) != cudaSuccess || + cudaMalloc(&rgbMeanAr, channelBytes) != cudaSuccess || + cudaMalloc(&rgbMeanAg, channelBytes) != cudaSuccess || + cudaMalloc(&rgbMeanAb, channelBytes) != cudaSuccess || + cudaMalloc(&rgbMeanB, channelBytes) != cudaSuccess) { error = "Failed to allocate CUDA scratch buffers for the guided filter."; release(); return false; @@ -285,6 +425,24 @@ IBKEYER_HOST_DEVICE inline void sampleRgb(const ImagePlaneDesc& image, int x, in b = pixel[2]; } +IBKEYER_HOST_DEVICE inline float sampleMatteValue(const ImagePlaneDesc& image, int x, int y) +{ + const float* pixel = pixelAddress(image, x, y); + if (pixel == nullptr) { + return 0.0f; + } + if (image.components >= 4) { + return clamp01(pixel[3]); + } + return clamp01(luminance(pixel[0], pixel[1], pixel[2])); +} + +IBKEYER_HOST_DEVICE inline float smoothstep01(float value) +{ + const float t = clamp01(value); + return t * t * (3.0f - 2.0f * t); +} + IBKEYER_HOST_DEVICE inline void storeRgba(const MutableImagePlaneDesc& image, int x, int y, float r, float g, float b, float a) { float* pixel = pixelAddress(image, x, y); @@ -352,18 +510,20 @@ __global__ void coreKernel(IBKeyerParams params, float alpha = clamp01(1.0f - normalized); // 5. 
Near Grey Extraction (optional). - if (params.nearGreyExtract) { + if (params.nearGreyExtract && params.nearGreyAmount > 0.0f) { const float divR = safeDivide(ssR, srcR); const float divG = safeDivide(ssG, srcG); const float divB = safeDivide(ssB, srcB); - const float ngeAlpha = nearGreyAlpha(divR, divG, divB, params.screenColor, params.nearGreyAmount); - // Screen composite: a + b - a*b. - alpha = ngeAlpha + alpha - ngeAlpha * alpha; + const float ngeAlpha = nearGreyAlpha(divR, divG, divB, params.screenColor, params.nearGreySoftness); + alpha = alpha + params.nearGreyAmount * ngeAlpha * (1.0f - alpha); } if (params.whiteClip > params.blackClip + 1e-6f) { alpha = clamp01((alpha - params.blackClip) / (params.whiteClip - params.blackClip)); } + if (params.matteGamma != 1.0f && alpha > 0.0f && alpha < 1.0f) { + alpha = powf(alpha, params.matteGamma); + } // 6. Output = screen-subtracted + respill. const float respillMul = despillScreen * normalized > 0.0f ? despillScreen * normalized : 0.0f; @@ -376,7 +536,8 @@ __global__ void coreKernel(IBKeyerParams params, alpha); rawAlpha[pixelIndex] = alpha; - guide[pixelIndex] = luminance(srcR, srcG, srcB); + const float lum = luminance(srcR, srcG, srcB); + guide[pixelIndex] = lum * (1.0f - params.edgeProtect) + alpha * params.edgeProtect; } __global__ void computeProductsKernel(int pixelCount, @@ -400,6 +561,15 @@ __global__ void computeProductsKernel(int pixelCount, meanII[index] = I * I; } +__global__ void copyBufferKernel(int pixelCount, const float* src, float* dst) +{ + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= pixelCount) { + return; + } + dst[index] = src[index]; +} + __global__ void gaussianBlurHorizontalKernel(int width, int height, int radius, @@ -462,6 +632,48 @@ __global__ void guidedCoeffKernel(int pixelCount, meanP[index] = b; } +__global__ void refineGuideKernel(int width, + int height, + ImagePlaneDesc src, + int renderX1, + int renderY1, + float edgeProtect, + const float* 
alphaBuffer, + float* guideBuffer) +{ + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) { + return; + } + + const int imageX = renderX1 + x; + const int imageY = renderY1 + y; + const int pixelIndex = y * width + x; + const float alpha = alphaBuffer[pixelIndex]; + + float srcR = 0.0f; + float srcG = 0.0f; + float srcB = 0.0f; + sampleRgb(src, imageX, imageY, srcR, srcG, srcB); + + const float fgLum = luminance(srcR * alpha, srcG * alpha, srcB * alpha); + guideBuffer[pixelIndex] = fgLum * (1.0f - edgeProtect) + alpha * edgeProtect; +} + +__global__ void guidedEvalKernel(int pixelCount, + const float* guide, + const float* meanA, + const float* meanB, + float* dst) +{ + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= pixelCount) { + return; + } + dst[index] = clamp01(meanA[index] * guide[index] + meanB[index]); +} + __global__ void guidedApplyKernel(IBKeyerParams params, MutableImagePlaneDesc dst, int renderX1, @@ -498,6 +710,77 @@ __global__ void guidedApplyKernel(IBKeyerParams params, pixel[3] = alpha; } +__global__ void edgeColorCorrectKernel(IBKeyerParams params, + ImagePlaneDesc src, + ImagePlaneDesc screen, + MutableImagePlaneDesc dst, + int renderX1, + int renderY1, + int width, + int height) +{ + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) { + return; + } + + const int imageX = renderX1 + x; + const int imageY = renderY1 + y; + float* pixel = pixelAddress(dst, imageX, imageY); + if (pixel == nullptr) { + return; + } + + const float alpha = pixel[3]; + if (alpha <= 0.005f || alpha >= 0.995f) { + return; + } + + float srcR = 0.0f; + float srcG = 0.0f; + float srcB = 0.0f; + sampleRgb(src, imageX, imageY, srcR, srcG, srcB); + + float scrR = params.pickR; + float scrG = params.pickG; + float scrB = params.pickB; + if (params.useScreenInput && 
screen.data != nullptr) { + sampleRgb(screen, imageX, imageY, scrR, scrG, scrB); + } + + const float invA = 1.0f / alpha; + float fgR = (srcR - scrR * (1.0f - alpha)) * invA; + float fgG = (srcG - scrG * (1.0f - alpha)) * invA; + float fgB = (srcB - scrB * (1.0f - alpha)) * invA; + fgR = fmaxf(-0.5f, fminf(2.0f, fgR)); + fgG = fmaxf(-0.5f, fminf(2.0f, fgG)); + fgB = fmaxf(-0.5f, fminf(2.0f, fgB)); + + float curR = pixel[0]; + float curG = pixel[1]; + float curB = pixel[2]; + if (params.premultiply) { + curR *= invA; + curG *= invA; + curB *= invA; + } + + const float edgeFactor = alpha * (1.0f - alpha) * 4.0f * params.edgeColorCorrect; + float outR = curR + (fgR - curR) * edgeFactor; + float outG = curG + (fgG - curG) * edgeFactor; + float outB = curB + (fgB - curB) * edgeFactor; + if (params.premultiply) { + outR *= alpha; + outG *= alpha; + outB *= alpha; + } + + pixel[0] = outR; + pixel[1] = outG; + pixel[2] = outB; +} + __global__ void premultiplyKernel(MutableImagePlaneDesc dst, int renderX1, int renderY1, @@ -523,6 +806,530 @@ __global__ void premultiplyKernel(MutableImagePlaneDesc dst, pixel[2] *= alpha; } +__global__ void extractBackgroundChannelsKernel(int width, + int height, + ImagePlaneDesc background, + int renderX1, + int renderY1, + float* outR, + float* outG, + float* outB) +{ + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) { + return; + } + + const int imageX = renderX1 + x; + const int imageY = renderY1 + y; + const int pixelIndex = y * width + x; + + float bgR = 0.0f; + float bgG = 0.0f; + float bgB = 0.0f; + sampleRgb(background, imageX, imageY, bgR, bgG, bgB); + outR[pixelIndex] = bgR; + outG[pixelIndex] = bgG; + outB[pixelIndex] = bgB; +} + +__global__ void bgWrapKernel(MutableImagePlaneDesc dst, + int renderX1, + int renderY1, + int width, + int height, + float amount, + const float* bgR, + const float* bgG, + const float* bgB) +{ + const int x = 
blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) { + return; + } + + const int imageX = renderX1 + x; + const int imageY = renderY1 + y; + const int pixelIndex = y * width + x; + float* pixel = pixelAddress(dst, imageX, imageY); + if (pixel == nullptr) { + return; + } + + const float alpha = pixel[3]; + const float wrapWeight = alpha * (1.0f - alpha) * 4.0f * amount; + pixel[0] += bgR[pixelIndex] * wrapWeight; + pixel[1] += bgG[pixelIndex] * wrapWeight; + pixel[2] += bgB[pixelIndex] * wrapWeight; +} + +__global__ void applyMatteKernel(float* rawAlpha, + MutableImagePlaneDesc dst, + ImagePlaneDesc matte, + int renderX1, + int renderY1, + int width, + int height, + int mode) +{ + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) { + return; + } + + const int imageX = renderX1 + x; + const int imageY = renderY1 + y; + const int pixelIndex = y * width + x; + float* dstPixel = pixelAddress(dst, imageX, imageY); + if (dstPixel == nullptr) { + return; + } + + const float matteValue = sampleMatteValue(matte, imageX, imageY); + float alpha = rawAlpha[pixelIndex]; + if (mode == 0) { + alpha *= (1.0f - matteValue); + } else { + alpha = fmaxf(alpha, matteValue); + } + alpha = clamp01(alpha); + rawAlpha[pixelIndex] = alpha; + dstPixel[3] = alpha; +} + +__global__ void writeAlphaDiagnosticKernel(const float* rawAlpha, + MutableImagePlaneDesc dst, + int renderX1, + int renderY1, + int width, + int height) +{ + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) { + return; + } + + const int imageX = renderX1 + x; + const int imageY = renderY1 + y; + const int pixelIndex = y * width + x; + const float a = rawAlpha[pixelIndex]; + storeRgba(dst, imageX, imageY, a, a, a, 1.0f); +} + +__global__ void 
extractOutputAlphaDiagnosticKernel(MutableImagePlaneDesc dst, + int renderX1, + int renderY1, + int width, + int height) +{ + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) { + return; + } + + const int imageX = renderX1 + x; + const int imageY = renderY1 + y; + float* pixel = pixelAddress(dst, imageX, imageY); + if (pixel == nullptr) { + return; + } + + const float a = pixel[3]; + pixel[0] = a; + pixel[1] = a; + pixel[2] = a; + pixel[3] = 1.0f; +} + +__global__ void copyImageKernel(ImagePlaneDesc src, + MutableImagePlaneDesc dst, + int renderX1, + int renderY1, + int width, + int height) +{ + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) { + return; + } + + const int imageX = renderX1 + x; + const int imageY = renderY1 + y; + const float* srcPixel = pixelAddress(src, imageX, imageY); + float* dstPixel = pixelAddress(dst, imageX, imageY); + if (srcPixel == nullptr || dstPixel == nullptr) { + return; + } + + dstPixel[0] = srcPixel[0]; + dstPixel[1] = srcPixel[1]; + dstPixel[2] = srcPixel[2]; + dstPixel[3] = (src.components >= 4) ? 
srcPixel[3] : 1.0f; +} + +__global__ void packRgbKernel(MutableImagePlaneDesc dst, + int renderX1, + int renderY1, + int width, + int height, + const float* srcR, + const float* srcG, + const float* srcB) +{ + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) { + return; + } + const int pixelIndex = y * width + x; + storeRgba(dst, renderX1 + x, renderY1 + y, srcR[pixelIndex], srcG[pixelIndex], srcB[pixelIndex], 1.0f); +} + +__global__ void erodeAlphaKernel(const float* src, + float* dst, + int width, + int height, + int radius) +{ + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) { + return; + } + + float minValue = 1.0f; + for (int dy = -radius; dy <= radius; ++dy) { + const int sy = max(0, min(height - 1, y + dy)); + for (int dx = -radius; dx <= radius; ++dx) { + const int sx = max(0, min(width - 1, x + dx)); + minValue = fminf(minValue, src[sy * width + sx]); + } + } + dst[y * width + x] = minValue; +} + +__global__ void cleanPlateEstimateKernel(ImagePlaneDesc src, + int renderX1, + int renderY1, + int width, + int height, + float pickR, + float pickG, + float pickB, + const float* alpha, + float* outR, + float* outG, + float* outB) +{ + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) { + return; + } + + const int imageX = renderX1 + x; + const int imageY = renderY1 + y; + const int pixelIndex = y * width + x; + float srcR = 0.0f; + float srcG = 0.0f; + float srcB = 0.0f; + sampleRgb(src, imageX, imageY, srcR, srcG, srcB); + const float t = smoothstep01(alpha[pixelIndex]); + outR[pixelIndex] = srcR * (1.0f - t) + pickR * t; + outG[pixelIndex] = srcG * (1.0f - t) + pickG * t; + outB[pixelIndex] = srcB * (1.0f - t) + pickB * t; +} + +__global__ void packCleanPlateKernel(int width, + int 
height, + const float* srcR, + const float* srcG, + const float* srcB, + float* cleanPlate) +{ + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) { + return; + } + const int pixelIndex = y * width + x; + const int rgba = pixelIndex * 4; + cleanPlate[rgba + 0] = srcR[pixelIndex]; + cleanPlate[rgba + 1] = srcG[pixelIndex]; + cleanPlate[rgba + 2] = srcB[pixelIndex]; + cleanPlate[rgba + 3] = 1.0f; +} + +__global__ void rgbComputeProductsKernel(ImagePlaneDesc src, + int renderX1, + int renderY1, + int width, + int height, + const float* rawAlpha, + float* meanIr, + float* meanIg, + float* meanIb, + float* meanP, + float* irir, + float* irig, + float* irib, + float* igig, + float* igib, + float* ibib, + float* irp, + float* igp, + float* ibp) +{ + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) { + return; + } + const int pixelIndex = y * width + x; + float ir = 0.0f; + float ig = 0.0f; + float ib = 0.0f; + sampleRgb(src, renderX1 + x, renderY1 + y, ir, ig, ib); + const float p = rawAlpha[pixelIndex]; + meanIr[pixelIndex] = ir; + meanIg[pixelIndex] = ig; + meanIb[pixelIndex] = ib; + meanP[pixelIndex] = p; + irir[pixelIndex] = ir * ir; + irig[pixelIndex] = ir * ig; + irib[pixelIndex] = ir * ib; + igig[pixelIndex] = ig * ig; + igib[pixelIndex] = ig * ib; + ibib[pixelIndex] = ib * ib; + irp[pixelIndex] = ir * p; + igp[pixelIndex] = ig * p; + ibp[pixelIndex] = ib * p; +} + +__global__ void rgbGuidedCoeffKernel(int pixelCount, + float epsilon, + const float* meanIr, + const float* meanIg, + const float* meanIb, + const float* meanP, + const float* irir, + const float* irig, + const float* irib, + const float* igig, + const float* igib, + const float* ibib, + const float* irp, + const float* igp, + const float* ibp, + float* outAr, + float* outAg, + float* outAb, + float* outB) +{ + const int 
index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= pixelCount) { + return; + } + + const float mIr = meanIr[index]; + const float mIg = meanIg[index]; + const float mIb = meanIb[index]; + const float mP = meanP[index]; + /* Entries of the symmetric 3x3 matrix: each is a product buffer minus the product of the two corresponding mean buffers. Only the diagonal entries are non-const because the regulariser is added to them below. */ + float s_rr = irir[index] - mIr * mIr; + const float s_rg = irig[index] - mIr * mIg; + const float s_rb = irib[index] - mIr * mIb; + float s_gg = igig[index] - mIg * mIg; + const float s_gb = igib[index] - mIg * mIb; + float s_bb = ibib[index] - mIb * mIb; + /* Right-hand side: the same difference form between each colour channel and p. */ + const float c_rp = irp[index] - mIr * mP; + const float c_gp = igp[index] - mIg * mP; + const float c_bp = ibp[index] - mIb * mP; + /* Regulariser added to the diagonal: epsilon squared divided by (trace / 3 + epsilon + 1e-10), so it becomes smaller as the average diagonal entry grows. */ + const float trace = s_rr + s_gg + s_bb; + const float adaptEps = epsilon * epsilon / ((trace / 3.0f) + epsilon + 1e-10f); + s_rr += adaptEps; + s_gg += adaptEps; + s_bb += adaptEps; + /* Determinant by expansion along the first row. When its magnitude is at or below 1e-12, invDet is 0; every inverse entry and coefficient is then 0 and outB equals mP. */ + const float det = s_rr * (s_gg * s_bb - s_gb * s_gb) + - s_rg * (s_rg * s_bb - s_gb * s_rb) + + s_rb * (s_rg * s_gb - s_gg * s_rb); + const float invDet = (fabsf(det) > 1e-12f) ? (1.0f / det) : 0.0f; + /* Six distinct entries of the symmetric inverse (cofactors times invDet). */ + const float inv_rr = (s_gg * s_bb - s_gb * s_gb) * invDet; + const float inv_rg = (s_rb * s_gb - s_rg * s_bb) * invDet; + const float inv_rb = (s_rg * s_gb - s_rb * s_gg) * invDet; + const float inv_gg = (s_rr * s_bb - s_rb * s_rb) * invDet; + const float inv_gb = (s_rb * s_rg - s_rr * s_gb) * invDet; + const float inv_bb = (s_rr * s_gg - s_rg * s_rg) * invDet; + /* Coefficients are the inverse matrix applied to (c_rp, c_gp, c_bp); the offset is mP minus the coefficients dotted with the colour means. */ + const float ar = inv_rr * c_rp + inv_rg * c_gp + inv_rb * c_bp; + const float ag = inv_rg * c_rp + inv_gg * c_gp + inv_gb * c_bp; + const float ab = inv_rb * c_rp + inv_gb * c_gp + inv_bb * c_bp; + outAr[index] = ar; + outAg[index] = ag; + outAb[index] = ab; + outB[index] = mP - ar * mIr - ag * mIg - ab * mIb; +} + /* rgbGuidedEvalKernel: per pixel, samples the source RGB and writes clamp01(meanAr * ir + meanAg * ig + meanAb * ib + meanB) to outAlpha, with all buffers indexed as y * width + x. */ +__global__ void rgbGuidedEvalKernel(ImagePlaneDesc src, + int renderX1, + int renderY1, + int width, + int height, + const float* meanAr, + const float* meanAg, + const float* meanAb, + const float* meanB, + float* outAlpha) +{ + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int
y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) { + return; + } + const int pixelIndex = y * width + x; + float ir = 0.0f; + float ig = 0.0f; + float ib = 0.0f; + sampleRgb(src, renderX1 + x, renderY1 + y, ir, ig, ib); + outAlpha[pixelIndex] = clamp01(meanAr[pixelIndex] * ir + + meanAg[pixelIndex] * ig + + meanAb[pixelIndex] * ib + + meanB[pixelIndex]); +} + /* rgbGuidedApplyKernel: per pixel, computes guidedAlpha = clamp01(meanAr * ir + meanAg * ig + meanAb * ib + meanB) from the sampled source RGB, then mixes it with rawAlpha using params.guidedMix (0 keeps rawAlpha, 1 uses guidedAlpha). The mixed value is written to the fourth component of the destination pixel; when params.premultiply is set, the three colour components already stored in the destination are multiplied by it first. Pixels for which pixelAddress returns nullptr are skipped. */ +__global__ void rgbGuidedApplyKernel(IBKeyerParams params, + ImagePlaneDesc src, + MutableImagePlaneDesc dst, + int renderX1, + int renderY1, + int width, + int height, + const float* rawAlpha, + const float* meanAr, + const float* meanAg, + const float* meanAb, + const float* meanB) +{ + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) { + return; + } + + const int imageX = renderX1 + x; + const int imageY = renderY1 + y; + const int pixelIndex = y * width + x; + float ir = 0.0f; + float ig = 0.0f; + float ib = 0.0f; + sampleRgb(src, imageX, imageY, ir, ig, ib); + const float guidedAlpha = clamp01(meanAr[pixelIndex] * ir + + meanAg[pixelIndex] * ig + + meanAb[pixelIndex] * ib + + meanB[pixelIndex]); + const float alpha = rawAlpha[pixelIndex] * (1.0f - params.guidedMix) + guidedAlpha * params.guidedMix; + /* The destination is modified in place: colour is scaled only under premultiply, alpha is always overwritten. */ + float* pixel = pixelAddress(dst, imageX, imageY); + if (pixel == nullptr) { + return; + } + if (params.premultiply) { + pixel[0] *= alpha; + pixel[1] *= alpha; + pixel[2] *= alpha; + } + pixel[3] = alpha; +} + /* additiveKeyKernel: adds a residual colour to the destination RGB in place, weighted by (1 - alpha) * params.additiveKeyAmount, where alpha is the fourth component already stored in the destination pixel. When params.additiveKeyMode is 0 the residual is source minus screen colour; otherwise it is blurred background times (source / screen - 1), computed only if all three blurBg pointers are non-null (the residual stays 0 if any is null). */ +__global__ void additiveKeyKernel(IBKeyerParams params, + ImagePlaneDesc src, + ImagePlaneDesc screen, + MutableImagePlaneDesc dst, + int renderX1, + int renderY1, + int width, + int height, + const float* blurBgR, + const float* blurBgG, + const float* blurBgB) +{ + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) { + return; + } + + const int imageX = renderX1 + x; + const int imageY =
renderY1 + y; + const int pixelIndex = y * width + x; + float* outPixel = pixelAddress(dst, imageX, imageY); + if (outPixel == nullptr) { + return; + } + /* Pixels whose stored alpha is above 0.999 are left untouched. */ + const float alpha = outPixel[3]; + if (alpha > 0.999f) { + return; + } + + float srcR = 0.0f; + float srcG = 0.0f; + float srcB = 0.0f; + sampleRgb(src, imageX, imageY, srcR, srcG, srcB); + /* The screen colour starts as the pick colour and is replaced by a sample of the screen plane only when useScreenInput is set and that plane has data. */ + float scrR = params.pickR; + float scrG = params.pickG; + float scrB = params.pickB; + if (params.useScreenInput && screen.data != nullptr) { + sampleRgb(screen, imageX, imageY, scrR, scrG, scrB); + } + /* The residual defaults to zero and keeps that value if neither branch below runs. */ + float resR = 0.0f; + float resG = 0.0f; + float resB = 0.0f; + if (params.additiveKeyMode == 0) { + /* Mode 0: source minus screen, then interpolated between its luminance (saturation 0) and the unmodified difference (saturation 1). */ resR = srcR - scrR; + resG = srcG - scrG; + resB = srcB - scrB; + const float lum = luminance(resR, resG, resB); + resR = lum * (1.0f - params.additiveKeySaturation) + resR * params.additiveKeySaturation; + resG = lum * (1.0f - params.additiveKeySaturation) + resG * params.additiveKeySaturation; + resB = lum * (1.0f - params.additiveKeySaturation) + resB * params.additiveKeySaturation; + } else if (blurBgR != nullptr && blurBgG != nullptr && blurBgB != nullptr) { + /* Other modes: per-channel ratio source / screen; a channel whose screen value is at or below 1e-6 uses a ratio of 1 instead of dividing. */ float fR = (scrR > 1e-6f) ? srcR / scrR : 1.0f; + float fG = (scrG > 1e-6f) ? srcG / scrG : 1.0f; + float fB = (scrB > 1e-6f) ?
srcB / scrB : 1.0f; + /* Interpolate each ratio between the ratio luminance (saturation 0) and the per-channel ratio (saturation 1), then scale the blurred background by (ratio - 1). */ const float fLum = luminance(fR, fG, fB); + fR = fLum * (1.0f - params.additiveKeySaturation) + fR * params.additiveKeySaturation; + fG = fLum * (1.0f - params.additiveKeySaturation) + fG * params.additiveKeySaturation; + fB = fLum * (1.0f - params.additiveKeySaturation) + fB * params.additiveKeySaturation; + resR = blurBgR[pixelIndex] * (fR - 1.0f); + resG = blurBgG[pixelIndex] * (fG - 1.0f); + resB = blurBgB[pixelIndex] * (fB - 1.0f); + } + /* Optional clamp: negative residual channels are raised to 0 so the step can only brighten. */ + if (params.additiveKeyBlackClamp) { + resR = fmaxf(resR, 0.0f); + resG = fmaxf(resG, 0.0f); + resB = fmaxf(resB, 0.0f); + } + /* The residual is scaled by (1 - alpha) and the user amount, then added to the colour already in the destination. */ + const float weight = (1.0f - alpha) * params.additiveKeyAmount; + outPixel[0] += resR * weight; + outPixel[1] += resG * weight; + outPixel[2] += resB * weight; +} + //////////////////////////////////////////////////////////////////////////////// // CUDA LAUNCH HELPERS //////////////////////////////////////////////////////////////////////////////// @@ -600,6 +1407,11 @@ bool renderCudaFrame(const IBKeyerParams& params, const int pixelCount = width * height; const bool doGF = guidedFilterActive(params); + /* Per-frame feature gates. RGB guided filtering needs doGF and guidedFilterMode == 1; prematte needs prematteBlur > 0; background wrap needs a background plane with data and a positive amount; additive key needs a positive amount; the background blur is needed for wrap, or for additive mode 1 when a background plane has data. */ const bool doRgbGF = doGF && params.guidedFilterMode == 1; + const bool doPrematte = params.prematteEnabled && params.prematteBlur > 0; + const bool doBgWrap = params.bgWrapEnabled && frame.background.data != nullptr && params.bgWrapAmount > 0.0f; + const bool doAdditive = params.additiveKeyEnabled && params.additiveKeyAmount > 0.0f; + const bool needBgBlur = doBgWrap || (doAdditive && params.additiveKeyMode == 1 && frame.background.data != nullptr); if (!scratch.ensurePixelCapacity(pixelCount, error)) { return false; } @@ -637,58 +1449,336 @@ bool renderCudaFrame(const IBKeyerParams& params, return false; } - if (doGF) { - computeProductsKernel<<>>( - pixelCount, - scratch.rawAlpha, - scratch.guide, - scratch.meanI, - scratch.meanP, - scratch.meanIp, - scratch.meanII); - if (!captureKernelStage("guided products", stream, waitForCompletion, error)) { + if (doPrematte) { + const
int prematteIterations = max(1, min(params.prematteIterations, 5)); + if (!scratch.ensureGaussianWeights(buildGaussianWeights(max(1, params.prematteBlur)), max(1, params.prematteBlur), error)) { return false; } - // This is the part zero-copy does not remove: the guided filter still needs temporary - // working buffers on the GPU. What zero-copy changes is that Source/Screen/Output stay - // on the host-owned device images instead of bouncing through CPU memory first. - if (!runGaussianBlur(scratch.meanI, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || - !runGaussianBlur(scratch.meanP, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || - !runGaussianBlur(scratch.meanIp, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || - !runGaussianBlur(scratch.meanII, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error)) { - return false; + ImagePlaneDesc cleanScreen; + cleanScreen.data = scratch.cleanPlate; + cleanScreen.rowBytes = static_cast(width) * 4u * sizeof(float); + cleanScreen.bounds = frame.renderWindow; + cleanScreen.components = 4; + + for (int iter = 0; iter < prematteIterations; ++iter) { + const float* alphaSource = scratch.rawAlpha; + if (params.prematteErode > 0) { + erodeAlphaKernel<<>>( + scratch.rawAlpha, + scratch.prematteAlpha, + width, + height, + params.prematteErode); + if (!captureKernelStage("prematte erode", stream, waitForCompletion, error)) { + return false; + } + alphaSource = scratch.prematteAlpha; + } + + cleanPlateEstimateKernel<<>>( + frame.src, + frame.renderWindow.x1, + frame.renderWindow.y1, + width, + height, + params.pickR, + params.pickG, + params.pickB, + alphaSource, + scratch.bgR, + scratch.bgG, + scratch.bgB); + if (!captureKernelStage("prematte clean plate estimate", stream, 
waitForCompletion, error)) { + return false; + } + + if (!runGaussianBlur(scratch.bgR, scratch.scratch, scratch.gaussianWeights, width, height, max(1, params.prematteBlur), stream, waitForCompletion, error) || + !runGaussianBlur(scratch.bgG, scratch.scratch, scratch.gaussianWeights, width, height, max(1, params.prematteBlur), stream, waitForCompletion, error) || + !runGaussianBlur(scratch.bgB, scratch.scratch, scratch.gaussianWeights, width, height, max(1, params.prematteBlur), stream, waitForCompletion, error)) { + return false; + } + + packCleanPlateKernel<<>>( + width, + height, + scratch.bgR, + scratch.bgG, + scratch.bgB, + scratch.cleanPlate); + if (!captureKernelStage("prematte pack clean plate", stream, waitForCompletion, error)) { + return false; + } + + coreKernel<<>>(params, + frame.src, + cleanScreen, + frame.dst, + frame.renderWindow.x1, + frame.renderWindow.y1, + width, + height, + scratch.rawAlpha, + scratch.guide); + if (!captureKernelStage("prematte rekey", stream, waitForCompletion, error)) { + return false; + } } - guidedCoeffKernel<<>>( - pixelCount, - params.guidedEpsilon, - scratch.meanI, - scratch.meanP, - scratch.meanIp, - scratch.meanII); - if (!captureKernelStage("guided coefficients", stream, waitForCompletion, error)) { - return false; + if (params.viewMode == 2) { + copyImageKernel<<>>( + cleanScreen, + frame.dst, + frame.renderWindow.x1, + frame.renderWindow.y1, + width, + height); + return captureKernelStage("diagnostic clean plate", stream, waitForCompletion, error); } + } else if (params.viewMode == 2 && params.useScreenInput && frame.screen.data != nullptr) { + copyImageKernel<<>>( + frame.screen, + frame.dst, + frame.renderWindow.x1, + frame.renderWindow.y1, + width, + height); + return captureKernelStage("diagnostic screen input", stream, waitForCompletion, error); + } - if (!runGaussianBlur(scratch.meanI, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || - 
!runGaussianBlur(scratch.meanP, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error)) { + /* Optional mattes run only when the corresponding plane has data; the last kernel argument is 0 for the garbage matte and 1 for the occlusion matte. */ if (frame.garbageMatte.data != nullptr) { + applyMatteKernel<<>>( + scratch.rawAlpha, + frame.dst, + frame.garbageMatte, + frame.renderWindow.x1, + frame.renderWindow.y1, + width, + height, + 0); + if (!captureKernelStage("garbage matte", stream, waitForCompletion, error)) { + return false; + } + } + if (frame.occlusionMatte.data != nullptr) { + applyMatteKernel<<>>( + scratch.rawAlpha, + frame.dst, + frame.occlusionMatte, + frame.renderWindow.x1, + frame.renderWindow.y1, + width, + height, + 1); + if (!captureKernelStage("occlusion matte", stream, waitForCompletion, error)) { return false; } + } + /* View mode 1 writes rawAlpha out as a diagnostic and ends the render here. */ + if (params.viewMode == 1) { + writeAlphaDiagnosticKernel<<>>( + scratch.rawAlpha, + frame.dst, + frame.renderWindow.x1, + frame.renderWindow.y1, + width, + height); + return captureKernelStage("diagnostic raw matte", stream, waitForCompletion, error); + } - guidedApplyKernel<<>>(params, - frame.dst, - frame.renderWindow.x1, - frame.renderWindow.y1, - width, - height, - scratch.rawAlpha, - scratch.guide, - scratch.meanI, - scratch.meanP); - if (!captureKernelStage("guided apply", stream, waitForCompletion, error)) { + /* NOTE(review): every runGaussianBlur call in this guided-filter section passes scratch.gaussianWeights together with params.guidedRadius, but the prematte branch earlier in this function calls ensureGaussianWeights for max(1, params.prematteBlur). No ensureGaussianWeights call for guidedRadius is visible between the two in this patch - confirm the weights are rebuilt for guidedRadius when prematte is enabled. */ if (doGF) { + /* rawAlpha is copied to savedRawAlpha before refinement; the apply kernels later in this function take savedRawAlpha as their raw-alpha argument. */ copyBufferKernel<<>>(pixelCount, scratch.rawAlpha, scratch.savedRawAlpha); + if (!captureKernelStage("save raw alpha", stream, waitForCompletion, error)) { return false; } + /* The refinement iteration count is clamped to the range 1..5. */ + const int numIter = std::max(1, std::min(params.refineIterations, 5)); + if (doRgbGF) { + for (int iter = 0; iter < numIter; ++iter) { + rgbComputeProductsKernel<<>>( + frame.src, + frame.renderWindow.x1, + frame.renderWindow.y1, + width, + height, + scratch.rawAlpha, + scratch.rgbMeanIr, + scratch.rgbMeanIg, + scratch.rgbMeanIb, + scratch.rgbMeanP, + scratch.rgbIrIr, + scratch.rgbIrIg, + scratch.rgbIrIb, + scratch.rgbIgIg, + scratch.rgbIgIb, + scratch.rgbIbIb, + scratch.rgbIrP, + scratch.rgbIgP, + scratch.rgbIbP); + if
(!captureKernelStage("rgb guided products", stream, waitForCompletion, error)) { + return false; + } + + if (!runGaussianBlur(scratch.rgbMeanIr, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.rgbMeanIg, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.rgbMeanIb, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.rgbMeanP, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.rgbIrIr, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.rgbIrIg, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.rgbIrIb, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.rgbIgIg, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.rgbIgIb, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.rgbIbIb, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.rgbIrP, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.rgbIgP, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.rgbIbP, scratch.scratch, 
scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error)) { + return false; + } + + rgbGuidedCoeffKernel<<>>( + pixelCount, + params.guidedEpsilon, + scratch.rgbMeanIr, + scratch.rgbMeanIg, + scratch.rgbMeanIb, + scratch.rgbMeanP, + scratch.rgbIrIr, + scratch.rgbIrIg, + scratch.rgbIrIb, + scratch.rgbIgIg, + scratch.rgbIgIb, + scratch.rgbIbIb, + scratch.rgbIrP, + scratch.rgbIgP, + scratch.rgbIbP, + scratch.rgbMeanAr, + scratch.rgbMeanAg, + scratch.rgbMeanAb, + scratch.rgbMeanB); + if (!captureKernelStage("rgb guided coefficients", stream, waitForCompletion, error)) { + return false; + } + + if (!runGaussianBlur(scratch.rgbMeanAr, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.rgbMeanAg, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.rgbMeanAb, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.rgbMeanB, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error)) { + return false; + } + + if (iter < numIter - 1) { + rgbGuidedEvalKernel<<>>( + frame.src, + frame.renderWindow.x1, + frame.renderWindow.y1, + width, + height, + scratch.rgbMeanAr, + scratch.rgbMeanAg, + scratch.rgbMeanAb, + scratch.rgbMeanB, + scratch.rawAlpha); + if (!captureKernelStage("rgb guided eval", stream, waitForCompletion, error)) { + return false; + } + } + } + + rgbGuidedApplyKernel<<>>( + params, + frame.src, + frame.dst, + frame.renderWindow.x1, + frame.renderWindow.y1, + width, + height, + scratch.savedRawAlpha, + scratch.rgbMeanAr, + scratch.rgbMeanAg, + scratch.rgbMeanAb, + scratch.rgbMeanB); + if (!captureKernelStage("rgb guided apply", stream, waitForCompletion, error)) { + return false; + } + } else { + for (int iter = 0; 
iter < numIter; ++iter) { + if (iter > 0) { + refineGuideKernel<<>>( + width, + height, + frame.src, + frame.renderWindow.x1, + frame.renderWindow.y1, + params.edgeProtect, + scratch.rawAlpha, + scratch.guide); + if (!captureKernelStage("refine guide", stream, waitForCompletion, error)) { + return false; + } + } + + computeProductsKernel<<>>( + pixelCount, + scratch.rawAlpha, + scratch.guide, + scratch.meanI, + scratch.meanP, + scratch.meanIp, + scratch.meanII); + if (!captureKernelStage("guided products", stream, waitForCompletion, error)) { + return false; + } + + if (!runGaussianBlur(scratch.meanI, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.meanP, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.meanIp, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.meanII, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error)) { + return false; + } + + guidedCoeffKernel<<>>( + pixelCount, + params.guidedEpsilon, + scratch.meanI, + scratch.meanP, + scratch.meanIp, + scratch.meanII); + if (!captureKernelStage("guided coefficients", stream, waitForCompletion, error)) { + return false; + } + + if (!runGaussianBlur(scratch.meanI, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.meanP, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error)) { + return false; + } + + if (iter < numIter - 1) { + guidedEvalKernel<<>>( + pixelCount, + scratch.guide, + scratch.meanI, + scratch.meanP, + scratch.rawAlpha); + if (!captureKernelStage("guided eval", stream, waitForCompletion, error)) { + return false; + } + 
} + } + + guidedApplyKernel<<>>(params, + frame.dst, + frame.renderWindow.x1, + frame.renderWindow.y1, + width, + height, + scratch.savedRawAlpha, + scratch.guide, + scratch.meanI, + scratch.meanP); + if (!captureKernelStage("guided apply", stream, waitForCompletion, error)) { + return false; + } + } } else if (params.premultiply) { premultiplyKernel<<>>( frame.dst, @@ -701,6 +1791,107 @@ bool renderCudaFrame(const IBKeyerParams& params, } } + if (params.viewMode == 3) { + extractOutputAlphaDiagnosticKernel<<>>( + frame.dst, + frame.renderWindow.x1, + frame.renderWindow.y1, + width, + height); + return captureKernelStage("diagnostic refined matte", stream, waitForCompletion, error); + } + + if (params.edgeColorCorrect > 0.0f) { + edgeColorCorrectKernel<<>>( + params, + frame.src, + frame.screen, + frame.dst, + frame.renderWindow.x1, + frame.renderWindow.y1, + width, + height); + if (!captureKernelStage("edge color correct", stream, waitForCompletion, error)) { + return false; + } + } + + if (params.viewMode == 4) { + return true; + } + + if (needBgBlur) { + extractBackgroundChannelsKernel<<>>( + width, + height, + frame.background, + frame.renderWindow.x1, + frame.renderWindow.y1, + scratch.bgR, + scratch.bgG, + scratch.bgB); + if (!captureKernelStage("background extract", stream, waitForCompletion, error)) { + return false; + } + + const int blurRadius = std::max(1, params.bgWrapBlur); + if (!scratch.ensureGaussianWeights(buildGaussianWeights(blurRadius), blurRadius, error)) { + return false; + } + if (!runGaussianBlur(scratch.bgR, scratch.scratch, scratch.gaussianWeights, width, height, blurRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.bgG, scratch.scratch, scratch.gaussianWeights, width, height, blurRadius, stream, waitForCompletion, error) || + !runGaussianBlur(scratch.bgB, scratch.scratch, scratch.gaussianWeights, width, height, blurRadius, stream, waitForCompletion, error)) { + return false; + } + + if (params.viewMode == 5) { + 
packRgbKernel<<>>( + frame.dst, + frame.renderWindow.x1, + frame.renderWindow.y1, + width, + height, + scratch.bgR, + scratch.bgG, + scratch.bgB); + return captureKernelStage("diagnostic blurred background", stream, waitForCompletion, error); + } + + if (doBgWrap) { + bgWrapKernel<<>>( + frame.dst, + frame.renderWindow.x1, + frame.renderWindow.y1, + width, + height, + params.bgWrapAmount, + scratch.bgR, + scratch.bgG, + scratch.bgB); + if (!captureKernelStage("background wrap", stream, waitForCompletion, error)) { + return false; + } + } + } + + if (doAdditive) { + additiveKeyKernel<<>>( + params, + frame.src, + frame.screen, + frame.dst, + frame.renderWindow.x1, + frame.renderWindow.y1, + width, + height, + (needBgBlur ? scratch.bgR : nullptr), + (needBgBlur ? scratch.bgG : nullptr), + (needBgBlur ? scratch.bgB : nullptr)); + if (!captureKernelStage("additive key", stream, waitForCompletion, error)) { + return false; + } + } + if (!waitForCompletion && !scratch.markInFlight(stream)) { // If we cannot record the fence event, we give up some performance and force the stream // to finish now so those scratch buffers are still safe to reuse on the next frame. 
@@ -761,12 +1952,18 @@ bool renderCudaInternal(const IBKeyerParams& params, const PackedFrame& frame, s float* dSrc = nullptr; float* dScreen = nullptr; + float* dBackground = nullptr; + float* dGarbageMatte = nullptr; + float* dOcclusionMatte = nullptr; float* dDst = nullptr; DeviceRenderFrame deviceFrame; if (cudaMalloc(&dSrc, rgbaBytes) != cudaSuccess || cudaMalloc(&dDst, rgbaBytes) != cudaSuccess || - (frame.screenRgba != nullptr && cudaMalloc(&dScreen, rgbaBytes) != cudaSuccess)) { + (frame.screenRgba != nullptr && cudaMalloc(&dScreen, rgbaBytes) != cudaSuccess) || + (frame.backgroundRgba != nullptr && cudaMalloc(&dBackground, rgbaBytes) != cudaSuccess) || + (frame.garbageMatteRgba != nullptr && cudaMalloc(&dGarbageMatte, rgbaBytes) != cudaSuccess) || + (frame.occlusionMatteRgba != nullptr && cudaMalloc(&dOcclusionMatte, rgbaBytes) != cudaSuccess)) { error = "cudaMalloc failed for the staged CUDA path."; goto cleanup; } @@ -780,6 +1977,21 @@ bool renderCudaInternal(const IBKeyerParams& params, const PackedFrame& frame, s error = "cudaMemcpy(screen) failed for the staged CUDA path."; goto cleanup; } + if (frame.backgroundRgba != nullptr && + cudaMemcpy(dBackground, frame.backgroundRgba, rgbaBytes, cudaMemcpyHostToDevice) != cudaSuccess) { + error = "cudaMemcpy(background) failed for the staged CUDA path."; + goto cleanup; + } + if (frame.garbageMatteRgba != nullptr && + cudaMemcpy(dGarbageMatte, frame.garbageMatteRgba, rgbaBytes, cudaMemcpyHostToDevice) != cudaSuccess) { + error = "cudaMemcpy(garbage matte) failed for the staged CUDA path."; + goto cleanup; + } + if (frame.occlusionMatteRgba != nullptr && + cudaMemcpy(dOcclusionMatte, frame.occlusionMatteRgba, rgbaBytes, cudaMemcpyHostToDevice) != cudaSuccess) { + error = "cudaMemcpy(occlusion matte) failed for the staged CUDA path."; + goto cleanup; + } deviceFrame.src.data = dSrc; deviceFrame.src.rowBytes = static_cast(frame.width) * 4u * sizeof(float); @@ -789,6 +2001,18 @@ bool renderCudaInternal(const 
IBKeyerParams& params, const PackedFrame& frame, s deviceFrame.screen.rowBytes = (dScreen != nullptr) ? static_cast(frame.width) * 4u * sizeof(float) : 0u; deviceFrame.screen.bounds = {0, 0, frame.width, frame.height}; deviceFrame.screen.components = (dScreen != nullptr) ? 4 : 0; + deviceFrame.background.data = dBackground; + deviceFrame.background.rowBytes = (dBackground != nullptr) ? static_cast(frame.width) * 4u * sizeof(float) : 0u; + deviceFrame.background.bounds = {0, 0, frame.width, frame.height}; + deviceFrame.background.components = (dBackground != nullptr) ? 4 : 0; + deviceFrame.garbageMatte.data = dGarbageMatte; + deviceFrame.garbageMatte.rowBytes = (dGarbageMatte != nullptr) ? static_cast(frame.width) * 4u * sizeof(float) : 0u; + deviceFrame.garbageMatte.bounds = {0, 0, frame.width, frame.height}; + deviceFrame.garbageMatte.components = (dGarbageMatte != nullptr) ? 4 : 0; + deviceFrame.occlusionMatte.data = dOcclusionMatte; + deviceFrame.occlusionMatte.rowBytes = (dOcclusionMatte != nullptr) ? static_cast(frame.width) * 4u * sizeof(float) : 0u; + deviceFrame.occlusionMatte.bounds = {0, 0, frame.width, frame.height}; + deviceFrame.occlusionMatte.components = (dOcclusionMatte != nullptr) ? 
4 : 0; deviceFrame.dst.data = dDst; deviceFrame.dst.rowBytes = static_cast(frame.width) * 4u * sizeof(float); deviceFrame.dst.bounds = {0, 0, frame.width, frame.height}; @@ -809,6 +2033,9 @@ bool renderCudaInternal(const IBKeyerParams& params, const PackedFrame& frame, s cleanup: cudaFree(dDst); + cudaFree(dOcclusionMatte); + cudaFree(dGarbageMatte); + cudaFree(dBackground); cudaFree(dScreen); cudaFree(dSrc); return error.empty(); diff --git a/IBKeymaster/IBKeyerShared.h b/IBKeymaster/IBKeyerShared.h index 6021a38..bac8ae5 100644 --- a/IBKeymaster/IBKeyerShared.h +++ b/IBKeymaster/IBKeyerShared.h @@ -32,13 +32,32 @@ struct IBKeyerParams float respillB = 0.0f; bool premultiply = false; bool nearGreyExtract = true; - float nearGreyAmount = 1.0f; + float nearGreyAmount = 0.5f; + float nearGreySoftness = 1.0f; float blackClip = 0.0f; float whiteClip = 1.0f; + float matteGamma = 1.0f; + bool prematteEnabled = false; + int prematteBlur = 8; + int prematteErode = 0; + int prematteIterations = 1; bool guidedFilterEnabled = true; + int guidedFilterMode = 0; int guidedRadius = 8; float guidedEpsilon = 0.01f; float guidedMix = 1.0f; + float edgeProtect = 0.5f; + int refineIterations = 2; + float edgeColorCorrect = 0.0f; + bool bgWrapEnabled = false; + int bgWrapBlur = 20; + float bgWrapAmount = 0.5f; + bool additiveKeyEnabled = false; + int additiveKeyMode = 0; + float additiveKeySaturation = 0.0f; + float additiveKeyAmount = 0.0f; + bool additiveKeyBlackClamp = false; + int viewMode = 0; }; struct PackedFrame @@ -47,6 +66,9 @@ struct PackedFrame int height = 0; const float* srcRgba = nullptr; const float* screenRgba = nullptr; + const float* backgroundRgba = nullptr; + const float* garbageMatteRgba = nullptr; + const float* occlusionMatteRgba = nullptr; float* dstRgba = nullptr; }; @@ -87,13 +109,13 @@ IBKEYER_HOST_DEVICE inline float despillValue(float r, float g, float b, int scr } IBKEYER_HOST_DEVICE inline float nearGreyAlpha(float r, float g, float b, int screenColor, - 
float amount) + float softness) { float c0, c1, c2; reorderChannels(r, g, b, screenColor, c0, c1, c2); const float mx = fmaxf(c0, fmaxf(c1, c2)); const float comp = (mx == c1) ? c1 : c2; - const float value = c0 * (1.0f - amount) + comp * amount; + const float value = c0 * (1.0f - softness) + comp * softness; return clamp01(value); } diff --git a/IBKeymaster/MetalKernel.mm b/IBKeymaster/MetalKernel.mm index de2f98e..2e46dbe 100644 --- a/IBKeymaster/MetalKernel.mm +++ b/IBKeymaster/MetalKernel.mm @@ -228,6 +228,162 @@ kernel void GaussianBlurV( p_Dst[y * w + x] = sum; } +// ════════════════════════════════════════════════════════════════════════ +// Gaussian Blur — Horizontal TILED (threadgroup shared memory) +// Each row of the threadgroup cooperatively loads a strip into fast +// shared memory, then each thread sums from the tile. Dramatically +// reduces global-memory bandwidth for large radii. +// Static 4096-float tile supports radius up to ~240 with (32,8) tg. +// ════════════════════════════════════════════════════════════════════════ + +kernel void GaussianBlurH_Tiled( + const device float* p_Src [[buffer(3)]], + device float* p_Dst [[buffer(4)]], + const device float* p_Weights [[buffer(5)]], + constant int& p_Width [[buffer(10)]], + constant int& p_Height [[buffer(11)]], + constant int& p_Radius [[buffer(12)]], + uint2 gid [[thread_position_in_grid]], + uint2 tid [[thread_position_in_threadgroup]], + uint2 tgs [[threads_per_threadgroup]]) +{ + threadgroup float tile[4096]; + + int x = (int)gid.x, y = (int)gid.y; + if (y >= p_Height) return; + + int r = p_Radius; + int tileW = (int)tgs.x + 2 * r; + int rowOff = (int)tid.y * tileW; + int baseX = x - (int)tid.x - r; + + for (int i = (int)tid.x; i < tileW; i += (int)tgs.x) { + int sx = clamp(baseX + i, 0, p_Width - 1); + tile[rowOff + i] = p_Src[y * p_Width + sx]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (x >= p_Width) return; + + float sum = 0.0f; + int center = rowOff + (int)tid.x + r; + 
for (int dx = -r; dx <= r; dx++) { + sum += tile[center + dx] * p_Weights[dx + r]; + } + p_Dst[y * p_Width + x] = sum; +} + +// ════════════════════════════════════════════════════════════════════════ +// Gaussian Blur — Vertical TILED (threadgroup shared memory) +// ════════════════════════════════════════════════════════════════════════ + +kernel void GaussianBlurV_Tiled( + const device float* p_Src [[buffer(3)]], + device float* p_Dst [[buffer(4)]], + const device float* p_Weights [[buffer(5)]], + constant int& p_Width [[buffer(10)]], + constant int& p_Height [[buffer(11)]], + constant int& p_Radius [[buffer(12)]], + uint2 gid [[thread_position_in_grid]], + uint2 tid [[thread_position_in_threadgroup]], + uint2 tgs [[threads_per_threadgroup]]) +{ + threadgroup float tile[4096]; + + int x = (int)gid.x, y = (int)gid.y; + if (x >= p_Width) return; + + int r = p_Radius; + int tileH = (int)tgs.y + 2 * r; + int colOff = (int)tid.x * tileH; + int baseY = y - (int)tid.y - r; + + for (int i = (int)tid.y; i < tileH; i += (int)tgs.y) { + int sy = clamp(baseY + i, 0, p_Height - 1); + tile[colOff + i] = p_Src[sy * p_Width + x]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (y >= p_Height) return; + + float sum = 0.0f; + int center = colOff + (int)tid.y + r; + for (int dy = -r; dy <= r; dy++) { + sum += tile[center + dy] * p_Weights[dy + r]; + } + p_Dst[y * p_Width + x] = sum; +} + +// ════════════════════════════════════════════════════════════════════════ +// Gaussian Blur — Horizontal, 4-channel (processes 4 separate buffers) +// ════════════════════════════════════════════════════════════════════════ + +kernel void GaussianBlurH4( + const device float* p_A [[buffer(0)]], + const device float* p_B [[buffer(1)]], + const device float* p_C [[buffer(2)]], + const device float* p_D [[buffer(3)]], + device float* p_OA [[buffer(4)]], + device float* p_OB [[buffer(5)]], + device float* p_OC [[buffer(6)]], + device float* p_OD [[buffer(7)]], + const device float* 
p_Weights [[buffer(8)]], + constant int& p_Width [[buffer(10)]], + constant int& p_Height [[buffer(11)]], + constant int& p_Radius [[buffer(12)]], + uint2 id [[thread_position_in_grid]]) +{ + if ((int)id.x >= p_Width || (int)id.y >= p_Height) return; + int x = (int)id.x, y = (int)id.y, r = p_Radius, w = p_Width; + float sA = 0.0f, sB = 0.0f, sC = 0.0f, sD = 0.0f; + for (int dx = -r; dx <= r; dx++) { + int sx = clamp(x + dx, 0, w - 1); + int si = y * w + sx; + float wt = p_Weights[dx + r]; + sA += p_A[si] * wt; + sB += p_B[si] * wt; + sC += p_C[si] * wt; + sD += p_D[si] * wt; + } + int oi = y * w + x; + p_OA[oi] = sA; p_OB[oi] = sB; p_OC[oi] = sC; p_OD[oi] = sD; +} + +// ════════════════════════════════════════════════════════════════════════ +// Gaussian Blur — Vertical, 4-channel (processes 4 separate buffers) +// ════════════════════════════════════════════════════════════════════════ + +kernel void GaussianBlurV4( + const device float* p_A [[buffer(0)]], + const device float* p_B [[buffer(1)]], + const device float* p_C [[buffer(2)]], + const device float* p_D [[buffer(3)]], + device float* p_OA [[buffer(4)]], + device float* p_OB [[buffer(5)]], + device float* p_OC [[buffer(6)]], + device float* p_OD [[buffer(7)]], + const device float* p_Weights [[buffer(8)]], + constant int& p_Width [[buffer(10)]], + constant int& p_Height [[buffer(11)]], + constant int& p_Radius [[buffer(12)]], + uint2 id [[thread_position_in_grid]]) +{ + if ((int)id.x >= p_Width || (int)id.y >= p_Height) return; + int x = (int)id.x, y = (int)id.y, r = p_Radius, w = p_Width, h = p_Height; + float sA = 0.0f, sB = 0.0f, sC = 0.0f, sD = 0.0f; + for (int dy = -r; dy <= r; dy++) { + int sy = clamp(y + dy, 0, h - 1); + int si = sy * w + x; + float wt = p_Weights[dy + r]; + sA += p_A[si] * wt; + sB += p_B[si] * wt; + sC += p_C[si] * wt; + sD += p_D[si] * wt; + } + int oi = y * w + x; + p_OA[oi] = sA; p_OB[oi] = sB; p_OC[oi] = sC; p_OD[oi] = sD; +} + // 
════════════════════════════════════════════════════════════════════════ // Compute products: I*p and I*I, and copy I for blurring // ════════════════════════════════════════════════════════════════════════ @@ -281,7 +437,13 @@ kernel void GuidedFilterCoeff( float varI = mII - mI * mI; float covIp = mIp - mI * mP; - float a = covIp / (varI + p_Epsilon); + // Adaptive epsilon: scales down at edges (high variance) for better + // edge preservation, stays full-strength in flat regions to suppress noise. + // adaptEps = eps^2 / (varI + eps), so: + // flat (varI≈0): adaptEps ≈ eps → smooths normally + // edge (varI>>eps): adaptEps ≈ eps^2/varI → tiny → preserves edges + float adaptEps = p_Epsilon * p_Epsilon / (varI + p_Epsilon + 1e-10f); + float a = covIp / (varI + adaptEps); float b = mP - a * mI; p_MeanI[idx] = a; @@ -468,6 +630,96 @@ kernel void BgWrapKernel( p_Output[idx4 + 2] += p_BgB[idx1] * w; } +// ════════════════════════════════════════════════════════════════════════ +// Additive Key — recovers fine detail (hair, motion blur, transparency) +// the alpha-based key lost, by superimposing source-minus-screen onto +// the composite. Two paths: +// Addition: residual = src - screen → desaturate → add +// Multiplication: factor = src/screen - 1 → desaturate → × BG +// Weighted by (1-alpha) so only transparent areas are affected. 
+// ════════════════════════════════════════════════════════════════════════ + +kernel void AdditiveKeyKernel( + const device float* p_Source [[buffer(0)]], + const device float* p_Screen [[buffer(1)]], + device float* p_Output [[buffer(2)]], + const device float* p_BlurBgR [[buffer(3)]], + const device float* p_BlurBgG [[buffer(4)]], + const device float* p_BlurBgB [[buffer(5)]], + constant int& p_Width [[buffer(10)]], + constant int& p_Height [[buffer(11)]], + constant int& p_Mode [[buffer(12)]], + constant int& p_UseScr [[buffer(13)]], + constant float& p_ScrR [[buffer(14)]], + constant float& p_ScrG [[buffer(15)]], + constant float& p_ScrB [[buffer(16)]], + constant float& p_Sat [[buffer(17)]], + constant float& p_Amount [[buffer(18)]], + constant int& p_ClampBlk [[buffer(19)]], + uint2 id [[thread_position_in_grid]]) +{ + if ((int)id.x >= p_Width || (int)id.y >= p_Height) return; + const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x; + const int idx4 = idx1 * 4; + + float alpha = p_Output[idx4 + 3]; + if (alpha > 0.999f) return; // solid FG — standard key handles it + + float srcR = p_Source[idx4 + 0]; + float srcG = p_Source[idx4 + 1]; + float srcB = p_Source[idx4 + 2]; + + float scrR, scrG, scrB; + if (p_UseScr != 0) { + scrR = p_Screen[idx4 + 0]; + scrG = p_Screen[idx4 + 1]; + scrB = p_Screen[idx4 + 2]; + } else { + scrR = p_ScrR; scrG = p_ScrG; scrB = p_ScrB; + } + + float resR, resG, resB; + + if (p_Mode == 0) { + // ── Addition path: source - screen ── + resR = srcR - scrR; + resG = srcG - scrG; + resB = srcB - scrB; + // Desaturate to remove color cast + float lum = 0.2126f * resR + 0.7152f * resG + 0.0722f * resB; + resR = mix(lum, resR, p_Sat); + resG = mix(lum, resG, p_Sat); + resB = mix(lum, resB, p_Sat); + } else { + // ── Multiplication path: (source/screen) × BG ── + float fR = (scrR > 1e-6f) ? srcR / scrR : 1.0f; + float fG = (scrG > 1e-6f) ? srcG / scrG : 1.0f; + float fB = (scrB > 1e-6f) ? 
srcB / scrB : 1.0f; + // Desaturate factor (neutral = 1.0) + float fLum = 0.2126f * fR + 0.7152f * fG + 0.0722f * fB; + fR = mix(fLum, fR, p_Sat); + fG = mix(fLum, fG, p_Sat); + fB = mix(fLum, fB, p_Sat); + // Delta from original BG: BG × (factor-1) + resR = p_BlurBgR[idx1] * (fR - 1.0f); + resG = p_BlurBgG[idx1] * (fG - 1.0f); + resB = p_BlurBgB[idx1] * (fB - 1.0f); + } + + // Black clamp (optional — keeps only brighter-than-screen detail) + if (p_ClampBlk != 0) { + resR = max(resR, 0.0f); + resG = max(resG, 0.0f); + resB = max(resB, 0.0f); + } + + // Weight by (1-alpha) · amount and add to output + float w = (1.0f - alpha) * p_Amount; + p_Output[idx4 + 0] += resR * w; + p_Output[idx4 + 1] += resG * w; + p_Output[idx4 + 2] += resB * w; +} + // ════════════════════════════════════════════════════════════════════════ // Edge Color Correction: re-estimate FG color at semi-transparent edges // using the matting equation: fg = (src - screen*(1-alpha)) / alpha @@ -546,16 +798,421 @@ kernel void EdgeColorCorrectKernel( p_Output[idx4 + 1] = outG; p_Output[idx4 + 2] = outB; } -)"; -// ═══════════════════════════════════════════════════════════════════════════════ -// Pipeline cache -// ═══════════════════════════════════════════════════════════════════════════════ +// ════════════════════════════════════════════════════════════════════════ +// RGB Guided Filter — Compute Products +// Extracts RGB guide channels from source, copies alpha, and computes +// all 13 statistical channels needed for the 3-channel guided filter: +// 4 means (Ir, Ig, Ib, p) + 6 auto-covariance (IrIr..IbIb) +// + 3 cross-covariance (Irp, Igp, Ibp) +// ════════════════════════════════════════════════════════════════════════ + +kernel void RGBComputeProducts( + const device float* p_Input [[buffer(0)]], // source RGBA + const device float* p_Output [[buffer(1)]], // output from core keyer (alpha in .w) + device float* p_MeanIr [[buffer(2)]], + device float* p_MeanIg [[buffer(3)]], + device float* 
p_MeanIb [[buffer(4)]], + device float* p_MeanP [[buffer(5)]], + device float* p_IrIr [[buffer(6)]], + device float* p_IrIg [[buffer(7)]], + device float* p_IrIb [[buffer(8)]], + device float* p_IgIg [[buffer(9)]], + device float* p_IgIb [[buffer(10)]], + device float* p_IbIb [[buffer(11)]], + device float* p_IrP [[buffer(12)]], + device float* p_IgP [[buffer(13)]], + device float* p_IbP [[buffer(14)]], + constant int& p_Width [[buffer(20)]], + constant int& p_Height [[buffer(21)]], + uint2 id [[thread_position_in_grid]]) +{ + if ((int)id.x >= p_Width || (int)id.y >= p_Height) return; + const int idx4 = ((int)(id.y * (uint)p_Width) + (int)id.x) * 4; + const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x; + + float Ir = p_Input[idx4 + 0]; + float Ig = p_Input[idx4 + 1]; + float Ib = p_Input[idx4 + 2]; + float p = p_Output[idx4 + 3]; // alpha from core keyer + + // Copies for blurring (will become means) + p_MeanIr[idx1] = Ir; + p_MeanIg[idx1] = Ig; + p_MeanIb[idx1] = Ib; + p_MeanP[idx1] = p; + + // Auto-covariance products + p_IrIr[idx1] = Ir * Ir; + p_IrIg[idx1] = Ir * Ig; + p_IrIb[idx1] = Ir * Ib; + p_IgIg[idx1] = Ig * Ig; + p_IgIb[idx1] = Ig * Ib; + p_IbIb[idx1] = Ib * Ib; + + // Cross-covariance products + p_IrP[idx1] = Ir * p; + p_IgP[idx1] = Ig * p; + p_IbP[idx1] = Ib * p; +} + +// ════════════════════════════════════════════════════════════════════════ +// RGB Guided Filter — Coefficient Computation +// Reads 13 blurred channels, solves the 3×3 system: +// (Σ + εI) · a = cov(I, p) +// b = mean_p − aᵀ · mean_I +// Writes a_r, a_g, a_b, b into 4 output buffers. 
+// ════════════════════════════════════════════════════════════════════════ + +kernel void RGBGuidedCoeff( + const device float* p_MeanIr [[buffer(0)]], + const device float* p_MeanIg [[buffer(1)]], + const device float* p_MeanIb [[buffer(2)]], + const device float* p_MeanP [[buffer(3)]], + device float* p_IrIr [[buffer(4)]], // overwritten with a_r + const device float* p_IrIg [[buffer(5)]], + const device float* p_IrIb [[buffer(6)]], + device float* p_IgIg [[buffer(7)]], // overwritten with a_g + const device float* p_IgIb [[buffer(8)]], + device float* p_IbIb [[buffer(9)]], // overwritten with a_b + const device float* p_IrP [[buffer(10)]], + const device float* p_IgP [[buffer(11)]], + const device float* p_IbP [[buffer(12)]], + device float* p_OutB [[buffer(13)]], // output b coefficient + constant int& p_Width [[buffer(20)]], + constant int& p_Height [[buffer(21)]], + constant float& p_Epsilon [[buffer(22)]], + uint2 id [[thread_position_in_grid]]) +{ + if ((int)id.x >= p_Width || (int)id.y >= p_Height) return; + const int i = (int)(id.y * (uint)p_Width) + (int)id.x; + + float mIr = p_MeanIr[i], mIg = p_MeanIg[i], mIb = p_MeanIb[i], mP = p_MeanP[i]; + + // Covariance matrix Σ (symmetric 3×3) + float s_rr = p_IrIr[i] - mIr * mIr; + float s_rg = p_IrIg[i] - mIr * mIg; + float s_rb = p_IrIb[i] - mIr * mIb; + float s_gg = p_IgIg[i] - mIg * mIg; + float s_gb = p_IgIb[i] - mIg * mIb; + float s_bb = p_IbIb[i] - mIb * mIb; + + // Cross-covariance cov(I, p) + float c_rp = p_IrP[i] - mIr * mP; + float c_gp = p_IgP[i] - mIg * mP; + float c_bp = p_IbP[i] - mIb * mP; + + // Adaptive epsilon: same approach as scalar but applied to trace + float trace = s_rr + s_gg + s_bb; + float adaptEps = p_Epsilon * p_Epsilon / (trace / 3.0f + p_Epsilon + 1e-10f); + + // Add ε·I to diagonal + s_rr += adaptEps; + s_gg += adaptEps; + s_bb += adaptEps; + + // Solve 3×3 symmetric system via Cramer's rule + // det(M) + float det = s_rr * (s_gg * s_bb - s_gb * s_gb) + - s_rg * (s_rg * s_bb - 
s_gb * s_rb) + + s_rb * (s_rg * s_gb - s_gg * s_rb); + + float invDet = (abs(det) > 1e-12f) ? (1.0f / det) : 0.0f; + + // Cofactor matrix (symmetric) for inverse + float inv_rr = (s_gg * s_bb - s_gb * s_gb) * invDet; + float inv_rg = (s_rb * s_gb - s_rg * s_bb) * invDet; + float inv_rb = (s_rg * s_gb - s_rb * s_gg) * invDet; + float inv_gg = (s_rr * s_bb - s_rb * s_rb) * invDet; + float inv_gb = (s_rb * s_rg - s_rr * s_gb) * invDet; + float inv_bb = (s_rr * s_gg - s_rg * s_rg) * invDet; + + // a = inv(Σ+εI) · cov(I,p) + float ar = inv_rr * c_rp + inv_rg * c_gp + inv_rb * c_bp; + float ag = inv_rg * c_rp + inv_gg * c_gp + inv_gb * c_bp; + float ab = inv_rb * c_rp + inv_gb * c_gp + inv_bb * c_bp; + float b = mP - ar * mIr - ag * mIg - ab * mIb; + + // Write coefficients (reuse 4 buffers) + p_IrIr[i] = ar; // buf[4] = a_r + p_IgIg[i] = ag; // buf[7] = a_g + p_IbIb[i] = ab; // buf[9] = a_b + p_OutB[i] = b; // buf[13] = b +} + +// ════════════════════════════════════════════════════════════════════════ +// RGB Guided Filter — Intermediate Evaluation +// For iterative refinement: compute refined alpha without premultiply. 
+// q = mean_ar * Ir + mean_ag * Ig + mean_ab * Ib + mean_b +// ════════════════════════════════════════════════════════════════════════ + +kernel void RGBGuidedEval( + const device float* p_Input [[buffer(0)]], // source RGBA (for RGB guide) + const device float* p_MeanAr [[buffer(2)]], + const device float* p_MeanAg [[buffer(3)]], + const device float* p_MeanAb [[buffer(4)]], + const device float* p_MeanB [[buffer(5)]], + device float* p_OutAlpha [[buffer(6)]], + constant int& p_Width [[buffer(20)]], + constant int& p_Height [[buffer(21)]], + uint2 id [[thread_position_in_grid]]) +{ + if ((int)id.x >= p_Width || (int)id.y >= p_Height) return; + const int idx4 = ((int)(id.y * (uint)p_Width) + (int)id.x) * 4; + const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x; + + float Ir = p_Input[idx4 + 0]; + float Ig = p_Input[idx4 + 1]; + float Ib = p_Input[idx4 + 2]; + + float q = p_MeanAr[idx1] * Ir + p_MeanAg[idx1] * Ig + + p_MeanAb[idx1] * Ib + p_MeanB[idx1]; + p_OutAlpha[idx1] = clamp(q, 0.0f, 1.0f); +} + +// ════════════════════════════════════════════════════════════════════════ +// RGB Guided Filter — Final Apply +// Evaluates q, mixes with raw alpha, writes RGBA output + premultiply. 
+// ════════════════════════════════════════════════════════════════════════ + +kernel void RGBGuidedApply( + const device float* p_Input [[buffer(0)]], // source RGBA (for RGB guide) + device float* p_Output [[buffer(1)]], // output RGBA (read+write) + const device float* p_MeanAr [[buffer(2)]], + const device float* p_MeanAg [[buffer(3)]], + const device float* p_MeanAb [[buffer(4)]], + const device float* p_MeanB [[buffer(5)]], + const device float* p_RawAlpha [[buffer(6)]], + constant int& p_Width [[buffer(20)]], + constant int& p_Height [[buffer(21)]], + constant int& p_Premultiply [[buffer(22)]], + constant float& p_GFMix [[buffer(23)]], + uint2 id [[thread_position_in_grid]]) +{ + if ((int)id.x >= p_Width || (int)id.y >= p_Height) return; + const int idx4 = ((int)(id.y * (uint)p_Width) + (int)id.x) * 4; + const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x; + + float Ir = p_Input[idx4 + 0]; + float Ig = p_Input[idx4 + 1]; + float Ib = p_Input[idx4 + 2]; + + float rawAlpha = p_RawAlpha[idx1]; + float guidedAlpha = clamp( + p_MeanAr[idx1] * Ir + p_MeanAg[idx1] * Ig + + p_MeanAb[idx1] * Ib + p_MeanB[idx1], 0.0f, 1.0f); + float alpha = mix(rawAlpha, guidedAlpha, p_GFMix); + + float outR = p_Output[idx4 + 0]; + float outG = p_Output[idx4 + 1]; + float outB = p_Output[idx4 + 2]; + + if (p_Premultiply != 0) { + outR *= alpha; + outG *= alpha; + outB *= alpha; + } + + p_Output[idx4 + 0] = outR; + p_Output[idx4 + 1] = outG; + p_Output[idx4 + 2] = outB; + p_Output[idx4 + 3] = alpha; +} + +// ════════════════════════════════════════════════════════════════════════ +// Write Alpha — copies 1ch alpha buffer to RGBA alpha channel +// Used during iterative RGB guided filter refinement. 
+// ════════════════════════════════════════════════════════════════════════ + +kernel void WriteAlphaKernel( + const device float* p_Alpha [[buffer(0)]], + device float* p_RGBA [[buffer(1)]], + constant int& p_Width [[buffer(10)]], + constant int& p_Height [[buffer(11)]], + uint2 id [[thread_position_in_grid]]) +{ + if ((int)id.x >= p_Width || (int)id.y >= p_Height) return; + const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x; + p_RGBA[idx1 * 4 + 3] = p_Alpha[idx1]; +} + +// ════════════════════════════════════════════════════════════════════════ +// Erode Alpha — morphological minimum filter on 1ch alpha +// Shrinks the initial matte before clean plate estimation to prevent +// foreground contamination in the synthetic screen. +// ════════════════════════════════════════════════════════════════════════ + +kernel void ErodeAlphaKernel( + const device float* p_Src [[buffer(0)]], + device float* p_Dst [[buffer(1)]], + constant int& p_Width [[buffer(10)]], + constant int& p_Height [[buffer(11)]], + constant int& p_Radius [[buffer(12)]], + uint2 id [[thread_position_in_grid]]) +{ + if ((int)id.x >= p_Width || (int)id.y >= p_Height) return; + int x = (int)id.x, y = (int)id.y, r = p_Radius; + int w = p_Width, h = p_Height; + float minVal = 1.0f; + for (int dy = -r; dy <= r; dy++) { + int sy = clamp(y + dy, 0, h - 1); + for (int dx = -r; dx <= r; dx++) { + int sx = clamp(x + dx, 0, w - 1); + minVal = min(minVal, p_Src[sy * w + sx]); + } + } + p_Dst[y * w + x] = minVal; +} + +// ════════════════════════════════════════════════════════════════════════ +// Clean Plate Estimate — IBKColour-style synthetic screen generation +// Where alpha ≈ 0 (pure screen): keeps source pixel (preserves screen +// variation — light falloff, wrinkles, color gradients) +// Where alpha ≈ 1 (pure FG): replaces with picked screen color +// Smoothstep blending avoids hard transition artifacts. +// The subsequent blur fills FG holes from surrounding BG information. 
+// ════════════════════════════════════════════════════════════════════════ + +kernel void CleanPlateEstimateKernel( + const device float* p_Input [[buffer(0)]], // source RGBA + const device float* p_Alpha [[buffer(1)]], // 1ch initial alpha (eroded) + device float* p_OutR [[buffer(2)]], // clean plate R channel + device float* p_OutG [[buffer(3)]], // clean plate G channel + device float* p_OutB [[buffer(4)]], // clean plate B channel + constant int& p_Width [[buffer(10)]], + constant int& p_Height [[buffer(11)]], + constant float& p_PickR [[buffer(12)]], + constant float& p_PickG [[buffer(13)]], + constant float& p_PickB [[buffer(14)]], + uint2 id [[thread_position_in_grid]]) +{ + if ((int)id.x >= p_Width || (int)id.y >= p_Height) return; + const int idx4 = ((int)(id.y * (uint)p_Width) + (int)id.x) * 4; + const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x; + + float srcR = p_Input[idx4 + 0]; + float srcG = p_Input[idx4 + 1]; + float srcB = p_Input[idx4 + 2]; + float alpha = clamp(p_Alpha[idx1], 0.0f, 1.0f); + + // Smoothstep for softer transition + float t = alpha * alpha * (3.0f - 2.0f * alpha); + + // Blend: keep source at alpha=0 (screen), replace with picked color at alpha=1 (FG) + p_OutR[idx1] = mix(srcR, p_PickR, t); + p_OutG[idx1] = mix(srcG, p_PickG, t); + p_OutB[idx1] = mix(srcB, p_PickB, t); +} + +// ════════════════════════════════════════════════════════════════════════ +// Pack RGBA — assembles 3 float channels into an interleaved RGBA buffer +// Builds the clean plate RGBA for the second core key pass. 
+// ════════════════════════════════════════════════════════════════════════ + +kernel void PackRGBAKernel( + const device float* p_R [[buffer(0)]], + const device float* p_G [[buffer(1)]], + const device float* p_B [[buffer(2)]], + device float* p_RGBA [[buffer(3)]], + constant int& p_Width [[buffer(10)]], + constant int& p_Height [[buffer(11)]], + uint2 id [[thread_position_in_grid]]) +{ + if ((int)id.x >= p_Width || (int)id.y >= p_Height) return; + const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x; + const int idx4 = idx1 * 4; + p_RGBA[idx4 + 0] = p_R[idx1]; + p_RGBA[idx4 + 1] = p_G[idx1]; + p_RGBA[idx4 + 2] = p_B[idx1]; + p_RGBA[idx4 + 3] = 1.0f; +} + +// ════════════════════════════════════════════════════════════════════════ +// Apply External Matte — garbage / occlusion matte support +// Reads alpha from an RGBA matte buffer and modifies both the 1ch alpha +// and the output RGBA buffer's alpha channel. +// Mode 0 = garbage (white = remove: alpha *= 1 - matte) +// Mode 1 = occlusion (white = keep: alpha = max(alpha, matte)) +// ════════════════════════════════════════════════════════════════════════ + +kernel void ApplyMatteKernel( + device float* p_Alpha [[buffer(0)]], + device float* p_Output [[buffer(1)]], + const device float* p_Matte [[buffer(2)]], + constant int& p_Width [[buffer(10)]], + constant int& p_Height [[buffer(11)]], + constant int& p_Mode [[buffer(12)]], + uint2 id [[thread_position_in_grid]]) +{ + if ((int)id.x >= p_Width || (int)id.y >= p_Height) return; + const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x; + const int idx4 = idx1 * 4; + + float matteVal = p_Matte[idx4 + 3]; + float alpha = p_Alpha[idx1]; + + if (p_Mode == 0) { + // Garbage: white areas = remove from key + alpha *= (1.0f - matteVal); + } else { + // Occlusion: white areas = force opaque + alpha = max(alpha, matteVal); + } + + p_Alpha[idx1] = alpha; + p_Output[idx4 + 3] = alpha; +} + +// ════════════════════════════════════════════════════════════════════════ 
+// Diagnostic Output — writes intermediate pipeline data to output +// Mode 0: 1-channel alpha buffer → greyscale RGBA (R=G=B=alpha, A=1) +// Mode 1: copy RGBA buffer to output +// Mode 2: extract alpha from RGBA → greyscale (R=G=B=src.a, A=1) +// ════════════════════════════════════════════════════════════════════════ + +kernel void DiagnosticOutputKernel( + const device float* p_SrcA [[buffer(0)]], + const device float* p_SrcRGBA [[buffer(1)]], + device float* p_Output [[buffer(2)]], + constant int& p_Width [[buffer(10)]], + constant int& p_Height [[buffer(11)]], + constant int& p_Mode [[buffer(12)]], + uint2 id [[thread_position_in_grid]]) +{ + if ((int)id.x >= p_Width || (int)id.y >= p_Height) return; + const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x; + const int idx4 = idx1 * 4; + + if (p_Mode == 0) { + // 1-channel alpha → greyscale RGBA + float a = p_SrcA[idx1]; + p_Output[idx4 + 0] = a; + p_Output[idx4 + 1] = a; + p_Output[idx4 + 2] = a; + p_Output[idx4 + 3] = 1.0f; + } else if (p_Mode == 1) { + // Copy RGBA + p_Output[idx4 + 0] = p_SrcRGBA[idx4 + 0]; + p_Output[idx4 + 1] = p_SrcRGBA[idx4 + 1]; + p_Output[idx4 + 2] = p_SrcRGBA[idx4 + 2]; + p_Output[idx4 + 3] = p_SrcRGBA[idx4 + 3]; + } else { + // Mode 2: extract alpha from RGBA → greyscale + float a = p_SrcRGBA[idx4 + 3]; + p_Output[idx4 + 0] = a; + p_Output[idx4 + 1] = a; + p_Output[idx4 + 2] = a; + p_Output[idx4 + 3] = 1.0f; + } +} +)"; // end of kernelSource struct PipelineSet { id coreKeyer; id gaussianBlurH; id gaussianBlurV; + id gaussianBlurH4; + id gaussianBlurV4; id computeProducts; id guidedCoeff; id refineGuide; @@ -566,26 +1223,47 @@ kernel void EdgeColorCorrectKernel( id extractChannel; id bgWrap; id edgeColorCorrect; + // RGB guided filter + id rgbComputeProducts; + id rgbGuidedCoeff; + id rgbGuidedEval; + id rgbGuidedApply; + id writeAlpha; + // Prematte (clean plate generation) + id erodeAlpha; + id cleanPlateEstimate; + id packRGBA; + // External mattes + id applyMatte; + // 
Diagnostic output + id diagnosticOutput; + // Tiled blur (threadgroup shared memory) + id gaussianBlurHTiled; + id gaussianBlurVTiled; + // Additive key + id additiveKey; }; struct QueueState { PipelineSet pipes; - // Cached temp buffers (persist across frames — avoids alloc/free churn) - // tempA: raw alpha / p / mean_p / b / mean_b - // tempB: guide / I / mean_I / a / mean_a - // tempC: I*p / mean_Ip / eval scratch - // tempD: I*I / mean_II - // tempE: guide copy (preserved through blur passes) - // tempF: Gaussian blur scratch (H/V intermediate) - // tempG: saved raw alpha (for final mix against original) - id tempA = nil; - id tempB = nil; - id tempC = nil; - id tempD = nil; - id tempE = nil; - id tempF = nil; - id tempG = nil; + // Temp buffer pool: + // Scalar GF: uses temp[0..6] + // [0]=rawAlpha/p, [1]=guide/I, [2]=Ip, [3]=II, + // [4]=guideCopy, [5]=blurScratch, [6]=savedRawAlpha + // RGB GF: uses temp[0..17] + // [0..3]=meanIr,Ig,Ib,P [4..9]=IrIr,IrIg,IrIb,IgIg,IgIb,IbIb + // [10..12]=Irp,Igp,Ibp [13..16]=blur4 scratch [17]=savedRawAlpha + static const int MAX_TEMP = 18; + id temp[MAX_TEMP] = {}; size_t cachedChanBytes = 0; + int cachedBufCount = 0; // how many temps are currently allocated + // Prematte clean plate buffer (RGBA-sized) + id cleanPlateBuf = nil; + size_t cachedCleanPlateBytes = 0; + // Cached Gaussian weight buffers (avoid per-frame alloc/free) + id gfWeightBuf = nil; int gfWeightRadius = -1; + id pmWeightBuf = nil; int pmWeightRadius = -1; + id bwWeightBuf = nil; int bwWeightRadius = -1; }; std::mutex s_PipelineMutex; @@ -617,6 +1295,9 @@ static void dispatch2D(id enc, // ═══════════════════════════════════════════════════════════════════════════════ // Gaussian blur helper: single separable H+V pass with pre-computed weights. // Blurs bufA in-place, uses bufScratch as intermediate. 
+// Uses threadgroup-tiled kernels when the tile fits in 4096-float shared +// memory (radius ≤ 240), falls back to the simple global-read kernels +// for very large radii. // ═══════════════════════════════════════════════════════════════════════════════ static void gaussianBlur(id enc, @@ -625,25 +1306,103 @@ static void gaussianBlur(id enc, id weightBuf, int w, int h, int radius) { - // Horizontal: A → scratch - [enc setComputePipelineState:ps.gaussianBlurH]; - [enc setBuffer:bufA offset:0 atIndex:3]; - [enc setBuffer:bufScratch offset:0 atIndex:4]; - [enc setBuffer:weightBuf offset:0 atIndex:5]; + // Threadgroup sizes: (32,8)=256 for H, (8,32)=256 for V + const int tgW_H = 32, tgH_H = 8; + const int tgW_V = 8, tgH_V = 32; + bool canTileH = (tgH_H * (tgW_H + 2 * radius) <= 4096); + bool canTileV = (tgW_V * (tgH_V + 2 * radius) <= 4096); + + if (canTileH && canTileV && ps.gaussianBlurHTiled && ps.gaussianBlurVTiled) { + // ── Tiled H: A → scratch ── + [enc setComputePipelineState:ps.gaussianBlurHTiled]; + [enc setBuffer:bufA offset:0 atIndex:3]; + [enc setBuffer:bufScratch offset:0 atIndex:4]; + [enc setBuffer:weightBuf offset:0 atIndex:5]; + [enc setBytes:&w length:sizeof(int) atIndex:10]; + [enc setBytes:&h length:sizeof(int) atIndex:11]; + [enc setBytes:&radius length:sizeof(int) atIndex:12]; + MTLSize tgSizeH = MTLSizeMake(tgW_H, tgH_H, 1); + MTLSize gridH = MTLSizeMake((w + tgW_H - 1) / tgW_H, (h + tgH_H - 1) / tgH_H, 1); + [enc dispatchThreadgroups:gridH threadsPerThreadgroup:tgSizeH]; + + // ── Tiled V: scratch → A ── + [enc setComputePipelineState:ps.gaussianBlurVTiled]; + [enc setBuffer:bufScratch offset:0 atIndex:3]; + [enc setBuffer:bufA offset:0 atIndex:4]; + [enc setBuffer:weightBuf offset:0 atIndex:5]; + [enc setBytes:&w length:sizeof(int) atIndex:10]; + [enc setBytes:&h length:sizeof(int) atIndex:11]; + [enc setBytes:&radius length:sizeof(int) atIndex:12]; + MTLSize tgSizeV = MTLSizeMake(tgW_V, tgH_V, 1); + MTLSize gridV = MTLSizeMake((w + tgW_V - 
1) / tgW_V, (h + tgH_V - 1) / tgH_V, 1); + [enc dispatchThreadgroups:gridV threadsPerThreadgroup:tgSizeV]; + } else { + // ── Fallback: global-read kernels ── + [enc setComputePipelineState:ps.gaussianBlurH]; + [enc setBuffer:bufA offset:0 atIndex:3]; + [enc setBuffer:bufScratch offset:0 atIndex:4]; + [enc setBuffer:weightBuf offset:0 atIndex:5]; + [enc setBytes:&w length:sizeof(int) atIndex:10]; + [enc setBytes:&h length:sizeof(int) atIndex:11]; + [enc setBytes:&radius length:sizeof(int) atIndex:12]; + dispatch2D(enc, ps.gaussianBlurH, w, h); + + [enc setComputePipelineState:ps.gaussianBlurV]; + [enc setBuffer:bufScratch offset:0 atIndex:3]; + [enc setBuffer:bufA offset:0 atIndex:4]; + [enc setBuffer:weightBuf offset:0 atIndex:5]; + [enc setBytes:&w length:sizeof(int) atIndex:10]; + [enc setBytes:&h length:sizeof(int) atIndex:11]; + [enc setBytes:&radius length:sizeof(int) atIndex:12]; + dispatch2D(enc, ps.gaussianBlurV, w, h); + } +} + +// ═══════════════════════════════════════════════════════════════════════════════ +// Gaussian blur helper: 4-channel separable H+V in 2 dispatches. +// Blurs a,b,c,d in-place using sa,sb,sc,sd as scratch. 
+// ═══════════════════════════════════════════════════════════════════════════════ + +static void gaussianBlur4(id enc, + const PipelineSet& ps, + id a, id b, + id c, id d, + id sa, id sb, + id sc, id sd, + id weightBuf, + int w, int h, int radius) +{ + // Horizontal: a,b,c,d → sa,sb,sc,sd + [enc setComputePipelineState:ps.gaussianBlurH4]; + [enc setBuffer:a offset:0 atIndex:0]; + [enc setBuffer:b offset:0 atIndex:1]; + [enc setBuffer:c offset:0 atIndex:2]; + [enc setBuffer:d offset:0 atIndex:3]; + [enc setBuffer:sa offset:0 atIndex:4]; + [enc setBuffer:sb offset:0 atIndex:5]; + [enc setBuffer:sc offset:0 atIndex:6]; + [enc setBuffer:sd offset:0 atIndex:7]; + [enc setBuffer:weightBuf offset:0 atIndex:8]; [enc setBytes:&w length:sizeof(int) atIndex:10]; [enc setBytes:&h length:sizeof(int) atIndex:11]; [enc setBytes:&radius length:sizeof(int) atIndex:12]; - dispatch2D(enc, ps.gaussianBlurH, w, h); - - // Vertical: scratch → A - [enc setComputePipelineState:ps.gaussianBlurV]; - [enc setBuffer:bufScratch offset:0 atIndex:3]; - [enc setBuffer:bufA offset:0 atIndex:4]; - [enc setBuffer:weightBuf offset:0 atIndex:5]; + dispatch2D(enc, ps.gaussianBlurH4, w, h); + + // Vertical: sa,sb,sc,sd → a,b,c,d + [enc setComputePipelineState:ps.gaussianBlurV4]; + [enc setBuffer:sa offset:0 atIndex:0]; + [enc setBuffer:sb offset:0 atIndex:1]; + [enc setBuffer:sc offset:0 atIndex:2]; + [enc setBuffer:sd offset:0 atIndex:3]; + [enc setBuffer:a offset:0 atIndex:4]; + [enc setBuffer:b offset:0 atIndex:5]; + [enc setBuffer:c offset:0 atIndex:6]; + [enc setBuffer:d offset:0 atIndex:7]; + [enc setBuffer:weightBuf offset:0 atIndex:8]; [enc setBytes:&w length:sizeof(int) atIndex:10]; [enc setBytes:&h length:sizeof(int) atIndex:11]; [enc setBytes:&radius length:sizeof(int) atIndex:12]; - dispatch2D(enc, ps.gaussianBlurV, w, h); + dispatch2D(enc, ps.gaussianBlurV4, w, h); } // ═══════════════════════════════════════════════════════════════════════════════ @@ -658,12 +1417,19 @@ void 
RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height, int p_Premultiply, int p_NearGreyExtract, float p_NearGreyAmount, float p_NearGreySoftness, float p_BlackClip, float p_WhiteClip, float p_MatteGamma, - int p_GuidedFilterEnabled, int p_GuidedRadius, float p_GuidedEpsilon, + int p_PrematteEnabled, int p_PrematteBlur, int p_PrematteErode, int p_PrematteIterations, + int p_GuidedFilterEnabled, int p_GuidedFilterMode, + int p_GuidedRadius, float p_GuidedEpsilon, float p_GuidedMix, float p_EdgeProtect, int p_RefineIterations, float p_EdgeColorCorrect, int p_BgWrapEnabled, int p_BgWrapBlur, float p_BgWrapAmount, + int p_AdditiveKeyEnabled, int p_AdditiveKeyMode, + float p_AdditiveKeySat, float p_AdditiveKeyAmount, int p_AdditiveKeyBlackClamp, + int p_ViewMode, const float* p_Input, const float* p_Screen, - const float* p_Background, float* p_Output) + const float* p_Background, + const float* p_GarbageMatte, const float* p_OcclusionMatte, + float* p_Output) { @autoreleasepool { id queue = static_cast>(p_CmdQ); @@ -675,10 +1441,10 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height, auto it = s_QueueStateMap.find(queue); if (it == s_QueueStateMap.end()) { MTLCompileOptions* options = [MTLCompileOptions new]; - // Keep Metal math conservative here. Fast-math is tempting, but parity debugging gets much - // harder when the Metal backend quietly takes a different numerical path from CPU/CUDA and - // from the original Gaffer graph. If we revisit this as a performance optimization later, - // it should be treated as a measured opt-in change rather than the default behavior. + // Keep Metal math conservative here. The private branch was using fast math to chase speed, + // but this cross-platform port leans on CPU as the parity anchor. Safe math makes it much + // easier to compare Metal against CPU/CUDA without backend-specific numeric drift muddying + // whether a feature port is actually correct. 
#if defined(__MAC_OS_X_VERSION_MAX_ALLOWED) && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 options.mathMode = MTLMathModeSafe; #else @@ -694,6 +1460,8 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height, qs.pipes.coreKeyer = makePipeline(lib, "IBKeymasterCoreKernel"); qs.pipes.gaussianBlurH = makePipeline(lib, "GaussianBlurH"); qs.pipes.gaussianBlurV = makePipeline(lib, "GaussianBlurV"); + qs.pipes.gaussianBlurH4 = makePipeline(lib, "GaussianBlurH4"); + qs.pipes.gaussianBlurV4 = makePipeline(lib, "GaussianBlurV4"); qs.pipes.computeProducts = makePipeline(lib, "ComputeProducts"); qs.pipes.guidedCoeff = makePipeline(lib, "GuidedFilterCoeff"); qs.pipes.refineGuide = makePipeline(lib, "RefineGuideKernel"); @@ -704,6 +1472,19 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height, qs.pipes.extractChannel = makePipeline(lib, "ExtractChannel"); qs.pipes.bgWrap = makePipeline(lib, "BgWrapKernel"); qs.pipes.edgeColorCorrect = makePipeline(lib, "EdgeColorCorrectKernel"); + qs.pipes.rgbComputeProducts = makePipeline(lib, "RGBComputeProducts"); + qs.pipes.rgbGuidedCoeff = makePipeline(lib, "RGBGuidedCoeff"); + qs.pipes.rgbGuidedEval = makePipeline(lib, "RGBGuidedEval"); + qs.pipes.rgbGuidedApply = makePipeline(lib, "RGBGuidedApply"); + qs.pipes.writeAlpha = makePipeline(lib, "WriteAlphaKernel"); + qs.pipes.erodeAlpha = makePipeline(lib, "ErodeAlphaKernel"); + qs.pipes.cleanPlateEstimate = makePipeline(lib, "CleanPlateEstimateKernel"); + qs.pipes.packRGBA = makePipeline(lib, "PackRGBAKernel"); + qs.pipes.applyMatte = makePipeline(lib, "ApplyMatteKernel"); + qs.pipes.diagnosticOutput = makePipeline(lib, "DiagnosticOutputKernel"); + qs.pipes.gaussianBlurHTiled = makePipeline(lib, "GaussianBlurH_Tiled"); + qs.pipes.gaussianBlurVTiled = makePipeline(lib, "GaussianBlurV_Tiled"); + qs.pipes.additiveKey = makePipeline(lib, "AdditiveKeyKernel"); [lib release]; s_QueueStateMap[queue] = qs; it = s_QueueStateMap.find(queue); @@ -713,47 +1494,56 @@ void RunMetalKernel(void* 
p_CmdQ, int p_Width, int p_Height, // ── Cached temp buffers (reused across frames) ── size_t chanBytes = (size_t)p_Width * (size_t)p_Height * sizeof(float); + bool doPrematte = p_PrematteEnabled && p_PrematteBlur > 0; bool doGF = p_GuidedFilterEnabled && p_GuidedRadius > 0; bool doBgWrap = p_BgWrapEnabled && p_Background && p_BgWrapAmount > 0.0f; + bool doAdditiveKey = p_AdditiveKeyEnabled && p_AdditiveKeyAmount > 0.0f; + bool needBgBlur = doBgWrap || (doAdditiveKey && p_AdditiveKeyMode == 1 && p_Background != nullptr); + bool rgbGF = doGF && (p_GuidedFilterMode == 1); + + // Determine how many temp buffers are needed + int neededBufs = 2; // always need temp[0..1] for core keyer output + if (doPrematte) neededBufs = std::max(neededBufs, 7); // prematte uses temp[0..6] + if (rgbGF) neededBufs = 18; // temp[0..17] + else if (doGF || needBgBlur) neededBufs = std::max(neededBufs, 7); // temp[0..6] // Reallocate if resolution changed if (chanBytes != state.cachedChanBytes) { - if (state.tempA) [state.tempA release]; - if (state.tempB) [state.tempB release]; - if (state.tempC) [state.tempC release]; - if (state.tempD) [state.tempD release]; - if (state.tempE) [state.tempE release]; - if (state.tempF) [state.tempF release]; - if (state.tempG) [state.tempG release]; - state.tempA = nil; state.tempB = nil; state.tempC = nil; - state.tempD = nil; state.tempE = nil; state.tempF = nil; - state.tempG = nil; + for (int i = 0; i < QueueState::MAX_TEMP; i++) { + if (state.temp[i]) { [state.temp[i] release]; state.temp[i] = nil; } + } + if (state.cleanPlateBuf) { [state.cleanPlateBuf release]; state.cleanPlateBuf = nil; } state.cachedChanBytes = chanBytes; + state.cachedCleanPlateBytes = 0; + state.cachedBufCount = 0; + } + + // Allocate temp buffers on demand + if (neededBufs > state.cachedBufCount) { + for (int i = state.cachedBufCount; i < neededBufs; i++) { + if (!state.temp[i]) + state.temp[i] = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate]; + 
} + state.cachedBufCount = neededBufs; } - // Allocate on demand - if ((doGF || doBgWrap) && !state.tempC) { - // Need buffers for GF and/or BG wrap - if (!state.tempA) state.tempA = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate]; - if (!state.tempB) state.tempB = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate]; - state.tempC = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate]; - state.tempD = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate]; - state.tempE = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate]; - state.tempF = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate]; - state.tempG = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate]; - } else if (!doGF && !doBgWrap && !state.tempA) { - // Need tempA and tempB even without GF (core kernel writes to them) - state.tempA = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate]; - state.tempB = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate]; + // Allocate RGBA clean plate buffer for prematte + size_t rgbaBytes = chanBytes * 4; + if (doPrematte && state.cachedCleanPlateBytes != rgbaBytes) { + if (state.cleanPlateBuf) { [state.cleanPlateBuf release]; state.cleanPlateBuf = nil; } + state.cleanPlateBuf = [device newBufferWithLength:rgbaBytes options:MTLResourceStorageModePrivate]; + state.cachedCleanPlateBytes = rgbaBytes; } - id tempA = state.tempA; - id tempB = state.tempB; - id tempC = state.tempC; - id tempD = state.tempD; - id tempE = state.tempE; - id tempF = state.tempF; - id tempG = state.tempG; + // Scalar aliases (backward compatible with existing scalar dispatch code) + id tempA = state.temp[0]; + id tempB = state.temp[1]; + id tempC = (neededBufs > 2) ? state.temp[2] : nil; + id tempD = (neededBufs > 3) ? state.temp[3] : nil; + id tempE = (neededBufs > 4) ? 
state.temp[4] : nil; + id tempF = (neededBufs > 5) ? state.temp[5] : nil; + id tempG = (neededBufs > 6) ? state.temp[6] : nil; + id* t = state.temp; // direct array access for RGB path lock.unlock(); // ── Resolve Metal buffers ── @@ -771,26 +1561,29 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height, createdDummy = true; } - // ── Pre-compute Gaussian weights (tiny shared buffer, freed per-frame) ── + // ── Cached Gaussian weights — only re-created when radius changes ── id weightBuf = nil; if (doGF) { int r = p_GuidedRadius; - int kernelSize = 2 * r + 1; - float sigma = fmaxf(r / 3.0f, 0.5f); - float invTwoSigmaSq = 1.0f / (2.0f * sigma * sigma); - - float* weights = (float*)alloca(kernelSize * sizeof(float)); - float wsum = 0.0f; - for (int i = -r; i <= r; i++) { - float w = expf(-(float)(i * i) * invTwoSigmaSq); - weights[i + r] = w; - wsum += w; + if (state.gfWeightRadius != r) { + if (state.gfWeightBuf) [state.gfWeightBuf release]; + int kernelSize = 2 * r + 1; + float sigma = fmaxf(r / 3.0f, 0.5f); + float invTwoSigmaSq = 1.0f / (2.0f * sigma * sigma); + float* weights = (float*)alloca(kernelSize * sizeof(float)); + float wsum = 0.0f; + for (int i = -r; i <= r; i++) { + float w = expf(-(float)(i * i) * invTwoSigmaSq); + weights[i + r] = w; + wsum += w; + } + for (int i = 0; i < kernelSize; i++) weights[i] /= wsum; + state.gfWeightBuf = [device newBufferWithBytes:weights + length:kernelSize * sizeof(float) + options:MTLResourceStorageModeShared]; + state.gfWeightRadius = r; } - for (int i = 0; i < kernelSize; i++) weights[i] /= wsum; - - weightBuf = [device newBufferWithBytes:weights - length:kernelSize * sizeof(float) - options:MTLResourceStorageModeShared]; + weightBuf = state.gfWeightBuf; } // ══════════════════════════════════════════════════════════════════════ @@ -829,7 +1622,191 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height, // After core: dstBuf=RGBA(despilled+rawAlpha), tempA=rawAlpha, tempB=guide - if (doGF) { + // 
══════════════════════════════════════════════════════════════════════ + // PREMATTE — IBKColour-style synthetic clean plate generation + // 1. Erode initial alpha (prevents FG bleed into plate estimate) + // 2. Estimate clean plate: blend source → picked screen based on alpha + // 3. Blur the plate (fills FG holes from surrounding BG information) + // 4. Pack into RGBA and re-run core keyer with the clean plate as screen + // ══════════════════════════════════════════════════════════════════════ + if (doPrematte) { + // Cached Gaussian weights for prematte blur + int pmR = std::max(1, p_PrematteBlur); + if (state.pmWeightRadius != pmR) { + if (state.pmWeightBuf) [state.pmWeightBuf release]; + int pmKernelSize = 2 * pmR + 1; + float pmSigma = fmaxf(pmR / 3.0f, 0.5f); + float pmInv2s2 = 1.0f / (2.0f * pmSigma * pmSigma); + float* pmW = (float*)alloca(pmKernelSize * sizeof(float)); + float pmSum = 0.0f; + for (int i = -pmR; i <= pmR; i++) { + float wt = expf(-(float)(i * i) * pmInv2s2); + pmW[i + pmR] = wt; + pmSum += wt; + } + for (int i = 0; i < pmKernelSize; i++) pmW[i] /= pmSum; + state.pmWeightBuf = [device newBufferWithBytes:pmW + length:pmKernelSize * sizeof(float) + options:MTLResourceStorageModeShared]; + state.pmWeightRadius = pmR; + } + id pmWeightBuf = state.pmWeightBuf; + + int pmIter = std::max(1, std::min(p_PrematteIterations, 5)); + for (int pi = 0; pi < pmIter; pi++) { + + // Step 1: Erode alpha (tempA → tempC, or use tempA directly if no erode) + id erodeAlpha = tempA; + if (p_PrematteErode > 0) { + [enc setComputePipelineState:pipes.erodeAlpha]; + [enc setBuffer:tempA offset:0 atIndex:0]; + [enc setBuffer:tempC offset:0 atIndex:1]; + [enc setBytes:&p_Width length:sizeof(int) atIndex:10]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; + [enc setBytes:&p_PrematteErode length:sizeof(int) atIndex:12]; + dispatch2D(enc, pipes.erodeAlpha, p_Width, p_Height); + erodeAlpha = tempC; + } + + // Step 2: Estimate clean plate (source + eroded alpha → 
3 channel buffers) + [enc setComputePipelineState:pipes.cleanPlateEstimate]; + [enc setBuffer:srcBuf offset:0 atIndex:0]; + [enc setBuffer:erodeAlpha offset:0 atIndex:1]; + [enc setBuffer:tempD offset:0 atIndex:2]; // cleanR + [enc setBuffer:tempE offset:0 atIndex:3]; // cleanG + [enc setBuffer:tempF offset:0 atIndex:4]; // cleanB + [enc setBytes:&p_Width length:sizeof(int) atIndex:10]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; + [enc setBytes:&p_PickR length:sizeof(float) atIndex:12]; + [enc setBytes:&p_PickG length:sizeof(float) atIndex:13]; + [enc setBytes:&p_PickB length:sizeof(float) atIndex:14]; + dispatch2D(enc, pipes.cleanPlateEstimate, p_Width, p_Height); + + // Step 3: Blur each channel of the clean plate + // Use tempA as blur scratch (it's consumed; will be overwritten by re-key) + gaussianBlur(enc, pipes, tempD, tempA, pmWeightBuf, p_Width, p_Height, pmR); + gaussianBlur(enc, pipes, tempE, tempA, pmWeightBuf, p_Width, p_Height, pmR); + gaussianBlur(enc, pipes, tempF, tempA, pmWeightBuf, p_Width, p_Height, pmR); + + // Step 4: Pack into RGBA clean plate buffer + [enc setComputePipelineState:pipes.packRGBA]; + [enc setBuffer:tempD offset:0 atIndex:0]; // R + [enc setBuffer:tempE offset:0 atIndex:1]; // G + [enc setBuffer:tempF offset:0 atIndex:2]; // B + [enc setBuffer:state.cleanPlateBuf offset:0 atIndex:3]; // RGBA out + [enc setBytes:&p_Width length:sizeof(int) atIndex:10]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; + dispatch2D(enc, pipes.packRGBA, p_Width, p_Height); + + // Step 5: Re-run core keyer with clean plate as screen + int useScreenOn = 1; + [enc setComputePipelineState:pipes.coreKeyer]; + [enc setBuffer:srcBuf offset:0 atIndex:0]; + [enc setBuffer:state.cleanPlateBuf offset:0 atIndex:1]; // clean plate as screen + [enc setBuffer:dstBuf offset:0 atIndex:2]; + [enc setBuffer:tempA offset:0 atIndex:3]; // fresh raw alpha + [enc setBuffer:tempB offset:0 atIndex:4]; // fresh guide + [enc setBytes:&p_Width 
length:sizeof(int) atIndex:10]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; + [enc setBytes:&p_ScreenColor length:sizeof(int) atIndex:12]; + [enc setBytes:&useScreenOn length:sizeof(int) atIndex:13]; + [enc setBytes:&p_PickR length:sizeof(float) atIndex:14]; + [enc setBytes:&p_PickG length:sizeof(float) atIndex:15]; + [enc setBytes:&p_PickB length:sizeof(float) atIndex:16]; + [enc setBytes:&p_Bias length:sizeof(float) atIndex:17]; + [enc setBytes:&p_Limit length:sizeof(float) atIndex:18]; + [enc setBytes:&p_RespillR length:sizeof(float) atIndex:19]; + [enc setBytes:&p_RespillG length:sizeof(float) atIndex:20]; + [enc setBytes:&p_RespillB length:sizeof(float) atIndex:21]; + [enc setBytes:&p_NearGreyExtract length:sizeof(int) atIndex:22]; + [enc setBytes:&p_NearGreyAmount length:sizeof(float) atIndex:23]; + [enc setBytes:&p_NearGreySoftness length:sizeof(float) atIndex:28]; + [enc setBytes:&p_BlackClip length:sizeof(float) atIndex:24]; + [enc setBytes:&p_WhiteClip length:sizeof(float) atIndex:25]; + [enc setBytes:&p_EdgeProtect length:sizeof(float) atIndex:26]; + [enc setBytes:&p_MatteGamma length:sizeof(float) atIndex:27]; + dispatch2D(enc, pipes.coreKeyer, p_Width, p_Height); + + } // prematte iteration loop + } + + // After prematte (or core if prematte off): dstBuf=RGBA, tempA=alpha, tempB=guide + + bool viewDone = false; + + // ── Diagnostic: Clean Plate ────────────────────────────────────────── + if (p_ViewMode == 2) { + if (doPrematte && state.cleanPlateBuf) { + int dMode = 1; // RGBA copy + [enc setComputePipelineState:pipes.diagnosticOutput]; + [enc setBuffer:state.cleanPlateBuf offset:0 atIndex:0]; + [enc setBuffer:state.cleanPlateBuf offset:0 atIndex:1]; + [enc setBuffer:dstBuf offset:0 atIndex:2]; + [enc setBytes:&p_Width length:sizeof(int) atIndex:10]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; + [enc setBytes:&dMode length:sizeof(int) atIndex:12]; + dispatch2D(enc, pipes.diagnosticOutput, p_Width, p_Height); + } else { + // 
Prematte off: show the screen input as fallback + int dMode = 1; + [enc setComputePipelineState:pipes.diagnosticOutput]; + [enc setBuffer:scrBuf offset:0 atIndex:0]; + [enc setBuffer:scrBuf offset:0 atIndex:1]; + [enc setBuffer:dstBuf offset:0 atIndex:2]; + [enc setBytes:&p_Width length:sizeof(int) atIndex:10]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; + [enc setBytes:&dMode length:sizeof(int) atIndex:12]; + dispatch2D(enc, pipes.diagnosticOutput, p_Width, p_Height); + } + viewDone = true; + } + + // ══════════════════════════════════════════════════════════════════════ + // EXTERNAL MATTES — Garbage and Occlusion + // Applied after core keyer (+ prematte) but before guided filter, + // so the GF refines the combined matte with external constraints. + // ══════════════════════════════════════════════════════════════════════ + if (p_GarbageMatte && !viewDone) { + id garbageBuf = reinterpret_cast>(const_cast(p_GarbageMatte)); + int mode = 0; + [enc setComputePipelineState:pipes.applyMatte]; + [enc setBuffer:tempA offset:0 atIndex:0]; + [enc setBuffer:dstBuf offset:0 atIndex:1]; + [enc setBuffer:garbageBuf offset:0 atIndex:2]; + [enc setBytes:&p_Width length:sizeof(int) atIndex:10]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; + [enc setBytes:&mode length:sizeof(int) atIndex:12]; + dispatch2D(enc, pipes.applyMatte, p_Width, p_Height); + } + + if (p_OcclusionMatte && !viewDone) { + id occlusionBuf = reinterpret_cast>(const_cast(p_OcclusionMatte)); + int mode = 1; + [enc setComputePipelineState:pipes.applyMatte]; + [enc setBuffer:tempA offset:0 atIndex:0]; + [enc setBuffer:dstBuf offset:0 atIndex:1]; + [enc setBuffer:occlusionBuf offset:0 atIndex:2]; + [enc setBytes:&p_Width length:sizeof(int) atIndex:10]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; + [enc setBytes:&mode length:sizeof(int) atIndex:12]; + dispatch2D(enc, pipes.applyMatte, p_Width, p_Height); + } + + // ── Diagnostic: Raw Matte 
──────────────────────────────────────────── + if (p_ViewMode == 1 && !viewDone) { + int dMode = 0; // 1ch alpha → greyscale + [enc setComputePipelineState:pipes.diagnosticOutput]; + [enc setBuffer:tempA offset:0 atIndex:0]; + [enc setBuffer:tempA offset:0 atIndex:1]; + [enc setBuffer:dstBuf offset:0 atIndex:2]; + [enc setBytes:&p_Width length:sizeof(int) atIndex:10]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; + [enc setBytes:&dMode length:sizeof(int) atIndex:12]; + dispatch2D(enc, pipes.diagnosticOutput, p_Width, p_Height); + viewDone = true; + } + + if (!viewDone && doGF && !rgbGF) { // ══════════════════════════════════════════════════════════════════ // GUIDED FILTER (iterative refinement) // ══════════════════════════════════════════════════════════════════ @@ -934,7 +1911,133 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height, } } // iteration loop - } else if (p_Premultiply) { + } else if (!viewDone && rgbGF) { + // ══════════════════════════════════════════════════════════════════ + // RGB COLOR-AWARE GUIDED FILTER + // Uses full 3-channel RGB guide with 3×3 covariance matrix. + // Dramatically better color-edge awareness than scalar luminance. + // Optimized: 4-channel vectorized blur reduces dispatches 3.4×. 
+ // ══════════════════════════════════════════════════════════════════ + int r = p_GuidedRadius; + int numIter = std::max(1, std::min(p_RefineIterations, 5)); + + // Save raw alpha to t[17] for final mix + [enc setComputePipelineState:pipes.copyBuffer]; + [enc setBuffer:t[0] offset:0 atIndex:3]; // raw alpha from core keyer + [enc setBuffer:t[17] offset:0 atIndex:4]; + [enc setBytes:&p_Width length:sizeof(int) atIndex:10]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; + dispatch2D(enc, pipes.copyBuffer, p_Width, p_Height); + + for (int iter = 0; iter < numIter; iter++) { + bool isLast = (iter == numIter - 1); + + // ── Compute products: source RGB + current alpha → 13 channels ── + [enc setComputePipelineState:pipes.rgbComputeProducts]; + [enc setBuffer:srcBuf offset:0 atIndex:0]; // source RGBA (guide RGB) + [enc setBuffer:dstBuf offset:0 atIndex:1]; // output (alpha in .w) + [enc setBuffer:t[0] offset:0 atIndex:2]; // → mean_Ir + [enc setBuffer:t[1] offset:0 atIndex:3]; // → mean_Ig + [enc setBuffer:t[2] offset:0 atIndex:4]; // → mean_Ib + [enc setBuffer:t[3] offset:0 atIndex:5]; // → mean_p + [enc setBuffer:t[4] offset:0 atIndex:6]; // → IrIr + [enc setBuffer:t[5] offset:0 atIndex:7]; // → IrIg + [enc setBuffer:t[6] offset:0 atIndex:8]; // → IrIb + [enc setBuffer:t[7] offset:0 atIndex:9]; // → IgIg + [enc setBuffer:t[8] offset:0 atIndex:10]; // → IgIb + [enc setBuffer:t[9] offset:0 atIndex:11]; // → IbIb + [enc setBuffer:t[10] offset:0 atIndex:12]; // → Irp + [enc setBuffer:t[11] offset:0 atIndex:13]; // → Igp + [enc setBuffer:t[12] offset:0 atIndex:14]; // → Ibp + [enc setBytes:&p_Width length:sizeof(int) atIndex:20]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:21]; + dispatch2D(enc, pipes.rgbComputeProducts, p_Width, p_Height); + + // ── Blur 13 channels: 3 groups of 4 + 1 single ── + // (t[13..16] are dedicated scratch for blur4) + gaussianBlur4(enc, pipes, + t[0], t[1], t[2], t[3], + t[13], t[14], t[15], t[16], + weightBuf, p_Width, 
p_Height, r); + gaussianBlur4(enc, pipes, + t[4], t[5], t[6], t[7], + t[13], t[14], t[15], t[16], + weightBuf, p_Width, p_Height, r); + gaussianBlur4(enc, pipes, + t[8], t[9], t[10], t[11], + t[13], t[14], t[15], t[16], + weightBuf, p_Width, p_Height, r); + gaussianBlur(enc, pipes, t[12], t[13], weightBuf, p_Width, p_Height, r); + + // ── Compute 3×3 coefficients: ar, ag, ab, b ── + // Writes: t[4]=ar, t[7]=ag, t[9]=ab, t[0]=b + // (b goes to t[0] since means are consumed; t[13..16] stay as scratch) + [enc setComputePipelineState:pipes.rgbGuidedCoeff]; + [enc setBuffer:t[0] offset:0 atIndex:0]; // mean_Ir (consumed → becomes b output) + [enc setBuffer:t[1] offset:0 atIndex:1]; // mean_Ig + [enc setBuffer:t[2] offset:0 atIndex:2]; // mean_Ib + [enc setBuffer:t[3] offset:0 atIndex:3]; // mean_p + [enc setBuffer:t[4] offset:0 atIndex:4]; // IrIr → ar + [enc setBuffer:t[5] offset:0 atIndex:5]; // IrIg + [enc setBuffer:t[6] offset:0 atIndex:6]; // IrIb + [enc setBuffer:t[7] offset:0 atIndex:7]; // IgIg → ag + [enc setBuffer:t[8] offset:0 atIndex:8]; // IgIb + [enc setBuffer:t[9] offset:0 atIndex:9]; // IbIb → ab + [enc setBuffer:t[10] offset:0 atIndex:10]; // Irp + [enc setBuffer:t[11] offset:0 atIndex:11]; // Igp + [enc setBuffer:t[12] offset:0 atIndex:12]; // Ibp + [enc setBuffer:t[0] offset:0 atIndex:13]; // → b (overwrites mean_Ir) + [enc setBytes:&p_Width length:sizeof(int) atIndex:20]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:21]; + [enc setBytes:&p_GuidedEpsilon length:sizeof(float) atIndex:22]; + dispatch2D(enc, pipes.rgbGuidedCoeff, p_Width, p_Height); + + // After coeff: t[4]=ar, t[7]=ag, t[9]=ab, t[0]=b + // Blur 4 coefficients in a single vectorized pass + gaussianBlur4(enc, pipes, + t[4], t[7], t[9], t[0], + t[13], t[14], t[15], t[16], + weightBuf, p_Width, p_Height, r); + + if (isLast) { + // Final: apply with mix against saved raw alpha + premultiply + [enc setComputePipelineState:pipes.rgbGuidedApply]; + [enc setBuffer:srcBuf offset:0 
atIndex:0]; // source RGB + [enc setBuffer:dstBuf offset:0 atIndex:1]; // output RGBA + [enc setBuffer:t[4] offset:0 atIndex:2]; // mean_ar + [enc setBuffer:t[7] offset:0 atIndex:3]; // mean_ag + [enc setBuffer:t[9] offset:0 atIndex:4]; // mean_ab + [enc setBuffer:t[0] offset:0 atIndex:5]; // mean_b + [enc setBuffer:t[17] offset:0 atIndex:6]; // saved raw alpha + [enc setBytes:&p_Width length:sizeof(int) atIndex:20]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:21]; + [enc setBytes:&p_Premultiply length:sizeof(int) atIndex:22]; + [enc setBytes:&p_GuidedMix length:sizeof(float) atIndex:23]; + dispatch2D(enc, pipes.rgbGuidedApply, p_Width, p_Height); + } else { + // Intermediate: eval refined alpha → t[1], then write to dstBuf alpha + [enc setComputePipelineState:pipes.rgbGuidedEval]; + [enc setBuffer:srcBuf offset:0 atIndex:0]; // source RGB + [enc setBuffer:t[4] offset:0 atIndex:2]; // mean_ar + [enc setBuffer:t[7] offset:0 atIndex:3]; // mean_ag + [enc setBuffer:t[9] offset:0 atIndex:4]; // mean_ab + [enc setBuffer:t[0] offset:0 atIndex:5]; // mean_b + [enc setBuffer:t[1] offset:0 atIndex:6]; // output: refined alpha + [enc setBytes:&p_Width length:sizeof(int) atIndex:20]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:21]; + dispatch2D(enc, pipes.rgbGuidedEval, p_Width, p_Height); + + // Write refined alpha back to dstBuf's .w channel for next iteration + [enc setComputePipelineState:pipes.writeAlpha]; + [enc setBuffer:t[1] offset:0 atIndex:0]; // 1ch refined alpha + [enc setBuffer:dstBuf offset:0 atIndex:1]; // RGBA buffer + [enc setBytes:&p_Width length:sizeof(int) atIndex:10]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; + dispatch2D(enc, pipes.writeAlpha, p_Width, p_Height); + } + } // RGB iteration loop + + } else if (!viewDone && p_Premultiply) { // No GF — just premultiply [enc setComputePipelineState:pipes.premultiply]; [enc setBuffer:dstBuf offset:0 atIndex:2]; @@ -943,12 +2046,26 @@ void RunMetalKernel(void* p_CmdQ, int 
p_Width, int p_Height, dispatch2D(enc, pipes.premultiply, p_Width, p_Height); } + // ── Diagnostic: Refined Matte ──────────────────────────────────────── + if (p_ViewMode == 3 && !viewDone) { + int dMode = 2; // extract alpha from RGBA → greyscale + [enc setComputePipelineState:pipes.diagnosticOutput]; + [enc setBuffer:dstBuf offset:0 atIndex:0]; + [enc setBuffer:dstBuf offset:0 atIndex:1]; + [enc setBuffer:dstBuf offset:0 atIndex:2]; + [enc setBytes:&p_Width length:sizeof(int) atIndex:10]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; + [enc setBytes:&dMode length:sizeof(int) atIndex:12]; + dispatch2D(enc, pipes.diagnosticOutput, p_Width, p_Height); + viewDone = true; + } + // ══════════════════════════════════════════════════════════════════════ // PASS 2.5: Edge Color Correction // Re-estimates FG color at semi-transparent edges using the matting // equation: fg = (src - screen*(1-alpha)) / alpha // ══════════════════════════════════════════════════════════════════════ - if (p_EdgeColorCorrect > 0.0f) { + if (!viewDone && p_EdgeColorCorrect > 0.0f) { [enc setComputePipelineState:pipes.edgeColorCorrect]; [enc setBuffer:srcBuf offset:0 atIndex:0]; [enc setBuffer:scrBuf offset:0 atIndex:1]; @@ -964,30 +2081,43 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height, dispatch2D(enc, pipes.edgeColorCorrect, p_Width, p_Height); } + // ── Diagnostic: Despilled Source ───────────────────────────────────── + if (p_ViewMode == 4 && !viewDone) { + viewDone = true; // skip BG wrap — output despilled FG as-is + } + // ══════════════════════════════════════════════════════════════════════ - // PASS 3: Background Wrap - // Blurs the BG and bleeds it into FG edges weighted by (1-alpha) + // PASS 3: Background Stage — BG Wrap + Additive Key + // Blurs the BG and bleeds it into FG edges weighted by (1-alpha). + // Additive key: recovers fine detail the alpha missed by + // superimposing source-minus-screen onto the composite. 
// ══════════════════════════════════════════════════════════════════════ - if (doBgWrap) { + + // BG extraction/blur needed for: bg wrap OR additive key multiplication mode + if (needBgBlur && !viewDone) { id bgBuf = reinterpret_cast>(const_cast(p_Background)); - // Pre-compute Gaussian weights for BG blur + // Cached Gaussian weights for BG blur int bwR = std::max(1, p_BgWrapBlur); - int bwKernelSize = 2 * bwR + 1; - float bwSigma = fmaxf(bwR / 3.0f, 0.5f); - float bwInv2s2 = 1.0f / (2.0f * bwSigma * bwSigma); - float* bwW = (float*)alloca(bwKernelSize * sizeof(float)); - float bwSum = 0.0f; - for (int i = -bwR; i <= bwR; i++) { - float wt = expf(-(float)(i * i) * bwInv2s2); - bwW[i + bwR] = wt; - bwSum += wt; + if (state.bwWeightRadius != bwR) { + if (state.bwWeightBuf) [state.bwWeightBuf release]; + int bwKernelSize = 2 * bwR + 1; + float bwSigma = fmaxf(bwR / 3.0f, 0.5f); + float bwInv2s2 = 1.0f / (2.0f * bwSigma * bwSigma); + float* bwW = (float*)alloca(bwKernelSize * sizeof(float)); + float bwSum = 0.0f; + for (int i = -bwR; i <= bwR; i++) { + float wt = expf(-(float)(i * i) * bwInv2s2); + bwW[i + bwR] = wt; + bwSum += wt; + } + for (int i = 0; i < bwKernelSize; i++) bwW[i] /= bwSum; + state.bwWeightBuf = [device newBufferWithBytes:bwW + length:bwKernelSize * sizeof(float) + options:MTLResourceStorageModeShared]; + state.bwWeightRadius = bwR; } - for (int i = 0; i < bwKernelSize; i++) bwW[i] /= bwSum; - - id bwWeightBuf = [device newBufferWithBytes:bwW - length:bwKernelSize * sizeof(float) - options:MTLResourceStorageModeShared]; + id bwWeightBuf = state.bwWeightBuf; // Extract R, G, B from BG into tempA, tempB, tempC for (int ch = 0; ch < 3; ch++) { @@ -1006,26 +2136,81 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height, gaussianBlur(enc, pipes, tempB, tempF, bwWeightBuf, p_Width, p_Height, bwR); gaussianBlur(enc, pipes, tempC, tempF, bwWeightBuf, p_Width, p_Height, bwR); - // Apply wrap - [enc setComputePipelineState:pipes.bgWrap]; + if 
(p_ViewMode == 5) { + // ── Diagnostic: Blurred Background ────────────────────────── + [enc setComputePipelineState:pipes.packRGBA]; + [enc setBuffer:tempA offset:0 atIndex:0]; + [enc setBuffer:tempB offset:0 atIndex:1]; + [enc setBuffer:tempC offset:0 atIndex:2]; + [enc setBuffer:dstBuf offset:0 atIndex:3]; + [enc setBytes:&p_Width length:sizeof(int) atIndex:10]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; + dispatch2D(enc, pipes.packRGBA, p_Width, p_Height); + } else { + // Apply BG wrap (light wrap) + if (doBgWrap) { + [enc setComputePipelineState:pipes.bgWrap]; + [enc setBuffer:dstBuf offset:0 atIndex:2]; + [enc setBuffer:tempA offset:0 atIndex:3]; + [enc setBuffer:tempB offset:0 atIndex:4]; + [enc setBuffer:tempC offset:0 atIndex:5]; + [enc setBytes:&p_Width length:sizeof(int) atIndex:10]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; + [enc setBytes:&p_BgWrapAmount length:sizeof(float) atIndex:12]; + dispatch2D(enc, pipes.bgWrap, p_Width, p_Height); + } + + // Apply additive key (multiplication mode uses blurred BG in tempA/B/C) + if (doAdditiveKey) { + [enc setComputePipelineState:pipes.additiveKey]; + [enc setBuffer:srcBuf offset:0 atIndex:0]; + [enc setBuffer:scrBuf offset:0 atIndex:1]; + [enc setBuffer:dstBuf offset:0 atIndex:2]; + [enc setBuffer:tempA offset:0 atIndex:3]; + [enc setBuffer:tempB offset:0 atIndex:4]; + [enc setBuffer:tempC offset:0 atIndex:5]; + [enc setBytes:&p_Width length:sizeof(int) atIndex:10]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; + [enc setBytes:&p_AdditiveKeyMode length:sizeof(int) atIndex:12]; + [enc setBytes:&p_UseScreenInput length:sizeof(int) atIndex:13]; + [enc setBytes:&p_PickR length:sizeof(float) atIndex:14]; + [enc setBytes:&p_PickG length:sizeof(float) atIndex:15]; + [enc setBytes:&p_PickB length:sizeof(float) atIndex:16]; + [enc setBytes:&p_AdditiveKeySat length:sizeof(float) atIndex:17]; + [enc setBytes:&p_AdditiveKeyAmount length:sizeof(float) atIndex:18]; + [enc 
setBytes:&p_AdditiveKeyBlackClamp length:sizeof(int) atIndex:19]; + dispatch2D(enc, pipes.additiveKey, p_Width, p_Height); + } + } + } + + // Additive key (addition mode) — works without BG input + if (doAdditiveKey && p_AdditiveKeyMode == 0 && !needBgBlur && !viewDone) { + [enc setComputePipelineState:pipes.additiveKey]; + [enc setBuffer:srcBuf offset:0 atIndex:0]; + [enc setBuffer:scrBuf offset:0 atIndex:1]; [enc setBuffer:dstBuf offset:0 atIndex:2]; - [enc setBuffer:tempA offset:0 atIndex:3]; + [enc setBuffer:tempA offset:0 atIndex:3]; // unused in addition mode [enc setBuffer:tempB offset:0 atIndex:4]; [enc setBuffer:tempC offset:0 atIndex:5]; - [enc setBytes:&p_Width length:sizeof(int) atIndex:10]; - [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; - [enc setBytes:&p_BgWrapAmount length:sizeof(float) atIndex:12]; - dispatch2D(enc, pipes.bgWrap, p_Width, p_Height); - - [bwWeightBuf release]; + [enc setBytes:&p_Width length:sizeof(int) atIndex:10]; + [enc setBytes:&p_Height length:sizeof(int) atIndex:11]; + [enc setBytes:&p_AdditiveKeyMode length:sizeof(int) atIndex:12]; + [enc setBytes:&p_UseScreenInput length:sizeof(int) atIndex:13]; + [enc setBytes:&p_PickR length:sizeof(float) atIndex:14]; + [enc setBytes:&p_PickG length:sizeof(float) atIndex:15]; + [enc setBytes:&p_PickB length:sizeof(float) atIndex:16]; + [enc setBytes:&p_AdditiveKeySat length:sizeof(float) atIndex:17]; + [enc setBytes:&p_AdditiveKeyAmount length:sizeof(float) atIndex:18]; + [enc setBytes:&p_AdditiveKeyBlackClamp length:sizeof(int) atIndex:19]; + dispatch2D(enc, pipes.additiveKey, p_Width, p_Height); } [enc endEncoding]; [cmdBuf commit]; [cmdBuf waitUntilCompleted]; - // ── Cleanup (temp buffers are cached — only release per-frame objects) ── + // ── Cleanup (temp + weight buffers are cached — only release per-frame objects) ── if (createdDummy) [scrBuf release]; - if (weightBuf) [weightBuf release]; } // @autoreleasepool }