diff --git a/IBKeymaster/IBKeyer.cpp b/IBKeymaster/IBKeyer.cpp
index 1eaced1..7a5f42f 100644
--- a/IBKeymaster/IBKeyer.cpp
+++ b/IBKeymaster/IBKeyer.cpp
@@ -36,7 +36,11 @@ constexpr const char* kPluginDescription =
     "Based on IBKeyer by Jed Smith (gaffer-tools) + He et al. guided filter.";
 constexpr const char* kPluginIdentifier = "com.OpenFXSample.IBKeyer";
 constexpr int kPluginVersionMajor = 2;
-constexpr int kPluginVersionMinor = 1;
+// Version 2.2 changed the public OFX surface in a non-trivial way: new clips, new params, and new
+// backend-routing rules. Keeping the old version number after that can leave hosts holding onto a
+// stale descriptor cache and trying to reconcile it with a different binary. Bumping the version is
+// the polite way to tell Resolve "this is materially a new plugin shape, please rescan it fresh."
+constexpr int kPluginVersionMinor = 2;
 constexpr bool kSupportsTiles = false;
 constexpr bool kSupportsMultiResolution = false;
 constexpr bool kSupportsMultipleClipPARs = false;
@@ -63,6 +67,9 @@ class IBKeyerPlugin : public OFX::ImageEffect
     OFX::Clip* m_DstClip;
     OFX::Clip* m_SrcClip;
     OFX::Clip* m_ScreenClip;
+    OFX::Clip* m_BgClip;
+    OFX::Clip* m_GarbageMatteClip;
+    OFX::Clip* m_OcclusionMatteClip;
 
     OFX::ChoiceParam* m_ScreenColor;
     OFX::BooleanParam* m_UseScreenInput;
@@ -73,12 +80,31 @@ class IBKeyerPlugin : public OFX::ImageEffect
     OFX::BooleanParam* m_Premultiply;
     OFX::DoubleParam* m_BlackClip;
     OFX::DoubleParam* m_WhiteClip;
+    OFX::DoubleParam* m_MatteGamma;
+    OFX::BooleanParam* m_PrematteEnabled;
+    OFX::IntParam* m_PrematteBlur;
+    OFX::IntParam* m_PrematteErode;
+    OFX::IntParam* m_PrematteIterations;
     OFX::BooleanParam* m_NearGreyExtract;
     OFX::DoubleParam* m_NearGreyAmount;
+    OFX::DoubleParam* m_NearGreySoftness;
     OFX::BooleanParam* m_GuidedFilterEnabled;
+    OFX::ChoiceParam* m_GuidedFilterMode;
     OFX::IntParam* m_GuidedRadius;
     OFX::DoubleParam* m_GuidedEpsilon;
     OFX::DoubleParam* m_GuidedMix;
+    OFX::DoubleParam* m_EdgeProtect;
+    OFX::IntParam* m_RefineIterations;
+    OFX::DoubleParam* m_EdgeColorCorrect;
+    OFX::BooleanParam* m_BgWrapEnabled;
+    OFX::IntParam* m_BgWrapBlur;
+    OFX::DoubleParam* m_BgWrapAmount;
+    OFX::BooleanParam* m_AdditiveKeyEnabled;
+    OFX::ChoiceParam* m_AdditiveKeyMode;
+    OFX::DoubleParam* m_AdditiveKeySaturation;
+    OFX::DoubleParam* m_AdditiveKeyAmount;
+    OFX::BooleanParam* m_AdditiveKeyBlackClamp;
+    OFX::ChoiceParam* m_ViewMode;
 };
 
 IBKeyerPlugin::IBKeyerPlugin(OfxImageEffectHandle p_Handle)
@@ -86,6 +112,9 @@ IBKeyerPlugin::IBKeyerPlugin(OfxImageEffectHandle p_Handle)
     , m_DstClip(fetchClip(kOfxImageEffectOutputClipName))
     , m_SrcClip(fetchClip(kOfxImageEffectSimpleSourceClipName))
     , m_ScreenClip(fetchClip("Screen"))
+    , m_BgClip(fetchClip("Background"))
+    , m_GarbageMatteClip(fetchClip("GarbageMatte"))
+    , m_OcclusionMatteClip(fetchClip("OcclusionMatte"))
     , m_ScreenColor(fetchChoiceParam("screenColor"))
     , m_UseScreenInput(fetchBooleanParam("useScreenInput"))
     , m_PickColor(fetchRGBParam("pickColor"))
@@ -95,12 +124,31 @@ IBKeyerPlugin::IBKeyerPlugin(OfxImageEffectHandle p_Handle)
     , m_Premultiply(fetchBooleanParam("premultiply"))
     , m_BlackClip(fetchDoubleParam("blackClip"))
     , m_WhiteClip(fetchDoubleParam("whiteClip"))
+    , m_MatteGamma(fetchDoubleParam("matteGamma"))
+    , m_PrematteEnabled(fetchBooleanParam("prematteEnabled"))
+    , m_PrematteBlur(fetchIntParam("prematteBlur"))
+    , m_PrematteErode(fetchIntParam("prematteErode"))
+    , m_PrematteIterations(fetchIntParam("prematteIterations"))
     , m_NearGreyExtract(fetchBooleanParam("nearGreyExtract"))
     , m_NearGreyAmount(fetchDoubleParam("nearGreyAmount"))
+    , m_NearGreySoftness(fetchDoubleParam("nearGreySoftness"))
     , m_GuidedFilterEnabled(fetchBooleanParam("guidedFilterEnabled"))
+    , m_GuidedFilterMode(fetchChoiceParam("guidedFilterMode"))
     , m_GuidedRadius(fetchIntParam("guidedRadius"))
     , m_GuidedEpsilon(fetchDoubleParam("guidedEpsilon"))
     , m_GuidedMix(fetchDoubleParam("guidedMix"))
+    , m_EdgeProtect(fetchDoubleParam("edgeProtect"))
+    , m_RefineIterations(fetchIntParam("refineIterations"))
+    , m_EdgeColorCorrect(fetchDoubleParam("edgeColorCorrect"))
+    , m_BgWrapEnabled(fetchBooleanParam("bgWrapEnabled"))
+    , m_BgWrapBlur(fetchIntParam("bgWrapBlur"))
+    , m_BgWrapAmount(fetchDoubleParam("bgWrapAmount"))
+    , m_AdditiveKeyEnabled(fetchBooleanParam("additiveKeyEnabled"))
+    , m_AdditiveKeyMode(fetchChoiceParam("additiveKeyMode"))
+    , m_AdditiveKeySaturation(fetchDoubleParam("additiveKeySaturation"))
+    , m_AdditiveKeyAmount(fetchDoubleParam("additiveKeyAmount"))
+    , m_AdditiveKeyBlackClamp(fetchBooleanParam("additiveKeyBlackClamp"))
+    , m_ViewMode(fetchChoiceParam("viewMode"))
 {
     setEnabledness();
 }
@@ -149,6 +197,43 @@ void IBKeyerPlugin::render(const OFX::RenderArguments& p_Args)
         }
     }
 
+    // Get background image (optional light-wrap source).
+    std::unique_ptr<OFX::Image> background;
+    if (m_BgClip && m_BgClip->isConnected()) {
+        background.reset(m_BgClip->fetchImage(p_Args.time));
+        if (background &&
+            (background->getPixelDepth() != OFX::eBitDepthFloat ||
+             (background->getPixelComponents() != OFX::ePixelComponentRGB &&
+              background->getPixelComponents() != OFX::ePixelComponentRGBA))) {
+            OFX::throwSuiteStatusException(kOfxStatErrUnsupported);
+        }
+    }
+
+    // External matte clips are deliberately optional. They are constraints on the matte, not
+    // mandatory inputs to the key itself, so disconnecting them should never change routing or
+    // make the effect invalid.
+    std::unique_ptr<OFX::Image> garbageMatte;
+    if (m_GarbageMatteClip && m_GarbageMatteClip->isConnected()) {
+        garbageMatte.reset(m_GarbageMatteClip->fetchImage(p_Args.time));
+        if (garbageMatte &&
+            (garbageMatte->getPixelDepth() != OFX::eBitDepthFloat ||
+             (garbageMatte->getPixelComponents() != OFX::ePixelComponentRGB &&
+              garbageMatte->getPixelComponents() != OFX::ePixelComponentRGBA))) {
+            OFX::throwSuiteStatusException(kOfxStatErrUnsupported);
+        }
+    }
+
+    std::unique_ptr<OFX::Image> occlusionMatte;
+    if (m_OcclusionMatteClip && m_OcclusionMatteClip->isConnected()) {
+        occlusionMatte.reset(m_OcclusionMatteClip->fetchImage(p_Args.time));
+        if (occlusionMatte &&
+            (occlusionMatte->getPixelDepth() != OFX::eBitDepthFloat ||
+             (occlusionMatte->getPixelComponents() != OFX::ePixelComponentRGB &&
+              occlusionMatte->getPixelComponents() != OFX::ePixelComponentRGBA))) {
+            OFX::throwSuiteStatusException(kOfxStatErrUnsupported);
+        }
+    }
+
     // Fetch parameter values once and freeze them into a backend-agnostic request. The old file
     // pushed these directly into an ImageProcessor instance; the split version keeps that same
     // intent, but makes the backend choice explicit and testable.
@@ -179,16 +264,44 @@ void IBKeyerPlugin::render(const OFX::RenderArguments& p_Args)
     params.premultiply = m_Premultiply->getValueAtTime(p_Args.time);
     params.blackClip = static_cast<float>(m_BlackClip->getValueAtTime(p_Args.time));
     params.whiteClip = static_cast<float>(m_WhiteClip->getValueAtTime(p_Args.time));
+    params.matteGamma = static_cast<float>(m_MatteGamma->getValueAtTime(p_Args.time));
+    params.prematteEnabled = m_PrematteEnabled->getValueAtTime(p_Args.time);
+    params.prematteBlur = m_PrematteBlur->getValueAtTime(p_Args.time);
+    params.prematteErode = m_PrematteErode->getValueAtTime(p_Args.time);
+    params.prematteIterations = m_PrematteIterations->getValueAtTime(p_Args.time);
     params.nearGreyExtract = m_NearGreyExtract->getValueAtTime(p_Args.time);
     params.nearGreyAmount = static_cast<float>(m_NearGreyAmount->getValueAtTime(p_Args.time));
+    params.nearGreySoftness = static_cast<float>(m_NearGreySoftness->getValueAtTime(p_Args.time));
     params.guidedFilterEnabled = m_GuidedFilterEnabled->getValueAtTime(p_Args.time);
+    int guidedFilterMode = 0;
+    m_GuidedFilterMode->getValueAtTime(p_Args.time, guidedFilterMode);
+    params.guidedFilterMode = guidedFilterMode;
     params.guidedRadius = m_GuidedRadius->getValueAtTime(p_Args.time);
     params.guidedEpsilon = static_cast<float>(m_GuidedEpsilon->getValueAtTime(p_Args.time));
     params.guidedMix = static_cast<float>(m_GuidedMix->getValueAtTime(p_Args.time));
+    params.edgeProtect = static_cast<float>(m_EdgeProtect->getValueAtTime(p_Args.time));
+    params.refineIterations = m_RefineIterations->getValueAtTime(p_Args.time);
+    params.edgeColorCorrect = static_cast<float>(m_EdgeColorCorrect->getValueAtTime(p_Args.time));
+    params.bgWrapEnabled = m_BgWrapEnabled->getValueAtTime(p_Args.time) && static_cast<bool>(background);
+    params.bgWrapBlur = m_BgWrapBlur->getValueAtTime(p_Args.time);
+    params.bgWrapAmount = static_cast<float>(m_BgWrapAmount->getValueAtTime(p_Args.time));
+    params.additiveKeyEnabled = m_AdditiveKeyEnabled->getValueAtTime(p_Args.time);
+    int additiveKeyMode = 0;
+    m_AdditiveKeyMode->getValueAtTime(p_Args.time, additiveKeyMode);
+    params.additiveKeyMode = additiveKeyMode;
+    params.additiveKeySaturation = static_cast<float>(m_AdditiveKeySaturation->getValueAtTime(p_Args.time));
+    params.additiveKeyAmount = static_cast<float>(m_AdditiveKeyAmount->getValueAtTime(p_Args.time));
+    params.additiveKeyBlackClamp = m_AdditiveKeyBlackClamp->getValueAtTime(p_Args.time);
+    int viewMode = 0;
+    m_ViewMode->getValueAtTime(p_Args.time, viewMode);
+    params.viewMode = viewMode;
 
     IBKeyerCore::RenderRequest request;
     request.srcImage = src.get();
     request.screenImage = screen.get();
+    request.backgroundImage = background.get();
+    request.garbageMatteImage = garbageMatte.get();
+    request.occlusionMatteImage = occlusionMatte.get();
     request.dstImage = dst.get();
     request.renderWindow = p_Args.renderWindow;
     request.hostCudaEnabled = p_Args.isEnabledCudaRender;
@@ -224,8 +337,12 @@ void IBKeyerPlugin::changedParam(const OFX::InstanceChangedArgs&,
                                  const std::string& p_ParamName)
 {
     if (p_ParamName == "useScreenInput" ||
+        p_ParamName == "prematteEnabled" ||
         p_ParamName == "guidedFilterEnabled" ||
-        p_ParamName == "nearGreyExtract") {
+        p_ParamName == "additiveKeyEnabled" ||
+        p_ParamName == "additiveKeyMode" ||
+        p_ParamName == "nearGreyExtract" ||
+        p_ParamName == "bgWrapEnabled") {
         setEnabledness();
     }
 }
@@ -240,7 +357,11 @@ void IBKeyerPlugin::changedParam(const OFX::InstanceChangedArgs&,
 void IBKeyerPlugin::changedClip(const OFX::InstanceChangedArgs&,
                                 const std::string& p_ClipName)
 {
-    if (p_ClipName == "Screen" || p_ClipName == kOfxImageEffectSimpleSourceClipName) {
+    if (p_ClipName == "Screen" ||
+        p_ClipName == "Background" ||
+        p_ClipName == "GarbageMatte" ||
+        p_ClipName == "OcclusionMatte" ||
+        p_ClipName == kOfxImageEffectSimpleSourceClipName) {
         setEnabledness();
     }
 }
@@ -258,13 +379,40 @@ void IBKeyerPlugin::setEnabledness()
     const bool useScreenInput = m_UseScreenInput->getValue();
     m_PickColor->setEnabled(!useScreenInput || !m_ScreenClip->isConnected());
 
+    const bool prematteEnabled = m_PrematteEnabled->getValue();
+    m_PrematteBlur->setEnabled(prematteEnabled);
+    m_PrematteErode->setEnabled(prematteEnabled);
+    m_PrematteIterations->setEnabled(prematteEnabled);
+
     const bool guidedEnabled = m_GuidedFilterEnabled->getValue();
+    m_GuidedFilterMode->setEnabled(guidedEnabled);
     m_GuidedRadius->setEnabled(guidedEnabled);
     m_GuidedEpsilon->setEnabled(guidedEnabled);
     m_GuidedMix->setEnabled(guidedEnabled);
+    m_EdgeProtect->setEnabled(guidedEnabled);
+    m_RefineIterations->setEnabled(guidedEnabled);
+    m_EdgeColorCorrect->setEnabled(guidedEnabled);
 
     const bool nearGreyEnabled = m_NearGreyExtract->getValue();
     m_NearGreyAmount->setEnabled(nearGreyEnabled);
+    m_NearGreySoftness->setEnabled(nearGreyEnabled);
+
+    // Resolve can be a little awkward about when optional secondary-input connection state becomes
+    // visible to the plugin UI. If we require "checked + connected" here, users can end up in a
+    // dead-feeling state where they enabled Background Wrap but still cannot edit its controls.
+    //
+    // Render still applies the wrap only when a Background image was fetched, so loosening the UI
+    // gate to "checked OR connected" here is a usability fix rather than a behavior change.
+    const bool bgWrapControlsEnabled = m_BgWrapEnabled->getValue() ||
+                                       (m_BgClip != nullptr && m_BgClip->isConnected());
+    m_BgWrapBlur->setEnabled(bgWrapControlsEnabled);
+    m_BgWrapAmount->setEnabled(bgWrapControlsEnabled);
+
+    const bool additiveEnabled = m_AdditiveKeyEnabled->getValue();
+    m_AdditiveKeyMode->setEnabled(additiveEnabled);
+    m_AdditiveKeySaturation->setEnabled(additiveEnabled);
+    m_AdditiveKeyAmount->setEnabled(additiveEnabled);
+    m_AdditiveKeyBlackClamp->setEnabled(additiveEnabled);
 }
 
 OFX::DoubleParamDescriptor* defineDoubleParam(OFX::ImageEffectDescriptor& p_Desc,
@@ -365,27 +513,52 @@ void IBKeyerFactory::describeInContext(ImageEffectDescriptor& p_Desc, ContextEnu
 
     // Screen clip (optional clean plate).
     ClipDescriptor* screenClip = p_Desc.defineClip("Screen");
-    screenClip->addSupportedComponent(ePixelComponentRGB);
     screenClip->addSupportedComponent(ePixelComponentRGBA);
+    screenClip->addSupportedComponent(ePixelComponentRGB);
     screenClip->setTemporalClipAccess(false);
     screenClip->setSupportsTiles(kSupportsTiles);
     screenClip->setOptional(true);
     screenClip->setIsMask(false);
 
+    // Background clip (optional, used for background/light wrap parity with the older plugin).
+    ClipDescriptor* bgClip = p_Desc.defineClip("Background");
+    bgClip->addSupportedComponent(ePixelComponentRGBA);
+    bgClip->addSupportedComponent(ePixelComponentRGB);
+    bgClip->setTemporalClipAccess(false);
+    bgClip->setSupportsTiles(kSupportsTiles);
+    bgClip->setOptional(true);
+    bgClip->setIsMask(false);
+
+    ClipDescriptor* garbageClip = p_Desc.defineClip("GarbageMatte");
+    garbageClip->addSupportedComponent(ePixelComponentRGBA);
+    garbageClip->addSupportedComponent(ePixelComponentRGB);
+    garbageClip->setTemporalClipAccess(false);
+    garbageClip->setSupportsTiles(kSupportsTiles);
+    garbageClip->setOptional(true);
+    // These are semantically mattes, but treating them as normal optional image clips keeps the
+    // host contract boring. That is useful on Resolve/Windows, where startup stability matters more
+    // than hinting extra semantics that the effect does not strictly need.
+    garbageClip->setIsMask(false);
+
+    ClipDescriptor* occlusionClip = p_Desc.defineClip("OcclusionMatte");
+    occlusionClip->addSupportedComponent(ePixelComponentRGBA);
+    occlusionClip->addSupportedComponent(ePixelComponentRGB);
+    occlusionClip->setTemporalClipAccess(false);
+    occlusionClip->setSupportsTiles(kSupportsTiles);
+    occlusionClip->setOptional(true);
+    occlusionClip->setIsMask(false);
+
     // Output clip.
     ClipDescriptor* dstClip = p_Desc.defineClip(kOfxImageEffectOutputClipName);
     dstClip->addSupportedComponent(ePixelComponentRGBA);
     dstClip->setSupportsTiles(kSupportsTiles);
 
-    // Page.
     PageParamDescriptor* page = p_Desc.definePageParam("Controls");
 
-    // Group: Screen Settings.
     GroupParamDescriptor* screenGroup = p_Desc.defineGroupParam("ScreenGroup");
     screenGroup->setHint("Screen and keying parameters");
     screenGroup->setLabels("Screen Settings", "Screen Settings", "Screen Settings");
 
-    // Screen colour choice (Red / Green / Blue).
     ChoiceParamDescriptor* screenColor = p_Desc.defineChoiceParam("screenColor");
     screenColor->setLabel("Screen Color");
     screenColor->setHint("Dominant chroma of the backing screen.");
@@ -397,7 +570,6 @@ void IBKeyerFactory::describeInContext(ImageEffectDescriptor& p_Desc, ContextEnu
     screenColor->setParent(*screenGroup);
     page->addChild(*screenColor);
 
-    // Use screen input toggle.
     BooleanParamDescriptor* useScreenInput = p_Desc.defineBooleanParam("useScreenInput");
     useScreenInput->setDefault(true);
     useScreenInput->setHint("When enabled, reads screen colour from the Screen clip. When disabled, uses the Pick Color constant.");
@@ -405,7 +577,6 @@ void IBKeyerFactory::describeInContext(ImageEffectDescriptor& p_Desc, ContextEnu
     useScreenInput->setParent(*screenGroup);
     page->addChild(*useScreenInput);
 
-    // Pick colour (constant fallback).
     RGBParamDescriptor* pickColor = p_Desc.defineRGBParam("pickColor");
     pickColor->setLabels("Pick Color", "Pick Color", "Pick Color");
     pickColor->setHint("Constant screen colour when Screen input is not connected.");
@@ -413,7 +584,6 @@ void IBKeyerFactory::describeInContext(ImageEffectDescriptor& p_Desc, ContextEnu
     pickColor->setParent(*screenGroup);
     page->addChild(*pickColor);
 
-    // Group: Keyer Controls.
     GroupParamDescriptor* keyerGroup = p_Desc.defineGroupParam("KeyerGroup");
     keyerGroup->setHint("Keying and despill controls");
     keyerGroup->setLabels("Keyer Controls", "Keyer Controls", "Keyer Controls");
@@ -425,7 +595,6 @@ void IBKeyerFactory::describeInContext(ImageEffectDescriptor& p_Desc, ContextEnu
                                       "Scales the despill subtraction. 1.0 = standard.",
                                       keyerGroup, 1.0, 0.0, 5.0, 0.01));
 
-    // Respill colour.
     RGBParamDescriptor* respillColor = p_Desc.defineRGBParam("respillColor");
     respillColor->setLabels("Respill Color", "Respill Color", "Respill Color");
     respillColor->setHint("Colour to add back where screen spill was removed.");
@@ -433,7 +602,6 @@ void IBKeyerFactory::describeInContext(ImageEffectDescriptor& p_Desc, ContextEnu
     respillColor->setParent(*keyerGroup);
     page->addChild(*respillColor);
 
-    // Premultiply.
     BooleanParamDescriptor* premultiply = p_Desc.defineBooleanParam("premultiply");
     premultiply->setDefault(false);
     premultiply->setHint("Premultiply RGB by alpha for compositing.");
@@ -441,52 +609,106 @@ void IBKeyerFactory::describeInContext(ImageEffectDescriptor& p_Desc, ContextEnu
     premultiply->setParent(*keyerGroup);
     page->addChild(*premultiply);
 
-    // Group: Matte Controls.
     GroupParamDescriptor* matteGroup = p_Desc.defineGroupParam("MatteGroup");
-    matteGroup->setHint("Matte refinement controls");
+    matteGroup->setHint("Matte refinement controls — adjust black/white points of the raw key");
     matteGroup->setLabels("Matte Controls", "Matte Controls", "Matte Controls");
 
     page->addChild(*defineDoubleParam(p_Desc, "blackClip", "Black Clip",
-                                      "Crush blacks in the raw matte. Values below this become fully transparent.",
+                                      "Crush blacks in the raw matte. Values below this become fully transparent. Useful for cleaning up noise in the screen area.",
                                       matteGroup, 0.0, 0.0, 1.0, 0.001));
     page->addChild(*defineDoubleParam(p_Desc, "whiteClip", "White Clip",
-                                      "Push whites in the raw matte. Values above this become fully opaque.",
+                                      "Push whites in the raw matte. Values above this become fully opaque. Useful for solidifying the foreground core.",
                                       matteGroup, 1.0, 0.0, 1.0, 0.001));
+    page->addChild(*defineDoubleParam(p_Desc, "matteGamma", "Matte Gamma",
+                                      "Applies a power curve to the alpha after black/white clipping.\n"
+                                      "Values < 1.0 push semi-transparent edges toward opaque.\n"
+                                      "Values > 1.0 push them toward transparent.\n"
+                                      "1.0 = no change.",
+                                      matteGroup, 1.0, 0.1, 4.0, 0.01));
+
+    GroupParamDescriptor* prematteGroup = p_Desc.defineGroupParam("PrematteGroup");
+    prematteGroup->setHint("Synthetic clean-plate generation used to re-key difficult shots.");
+    prematteGroup->setLabels("Prematte", "Prematte", "Prematte");
+
+    BooleanParamDescriptor* prematteEnabled = p_Desc.defineBooleanParam("prematteEnabled");
+    prematteEnabled->setDefault(false);
+    prematteEnabled->setHint("Builds a synthetic clean plate from the source and re-runs the core keyer. This moved out of the private Metal branch because it changes the actual key, not just the display.");
+    prematteEnabled->setLabels("Enable", "Enable", "Enable");
+    prematteEnabled->setParent(*prematteGroup);
+    page->addChild(*prematteEnabled);
+
+    IntParamDescriptor* prematteBlur = p_Desc.defineIntParam("prematteBlur");
+    prematteBlur->setLabels("Blur Radius", "Blur Radius", "Blur Radius");
+    prematteBlur->setScriptName("prematteBlur");
+    prematteBlur->setHint("Blur radius for the synthetic clean plate.");
+    prematteBlur->setDefault(8);
+    prematteBlur->setRange(1, 200);
+    prematteBlur->setDisplayRange(1, 50);
+    prematteBlur->setParent(*prematteGroup);
+    page->addChild(*prematteBlur);
+
+    IntParamDescriptor* prematteErode = p_Desc.defineIntParam("prematteErode");
+    prematteErode->setLabels("Erode", "Erode", "Erode");
+    prematteErode->setScriptName("prematteErode");
+    prematteErode->setHint("Erodes the initial matte before clean-plate estimation to reduce foreground contamination.");
+    prematteErode->setDefault(0);
+    prematteErode->setRange(0, 20);
+    prematteErode->setDisplayRange(0, 10);
+    prematteErode->setParent(*prematteGroup);
+    page->addChild(*prematteErode);
+
+    IntParamDescriptor* prematteIterations = p_Desc.defineIntParam("prematteIterations");
+    prematteIterations->setLabels("Iterations", "Iterations", "Iterations");
+    prematteIterations->setScriptName("prematteIterations");
+    prematteIterations->setHint("How many times the synthetic clean plate is rebuilt and re-keyed.");
+    prematteIterations->setDefault(1);
+    prematteIterations->setRange(1, 5);
+    prematteIterations->setDisplayRange(1, 5);
+    prematteIterations->setParent(*prematteGroup);
+    page->addChild(*prematteIterations);
+
+    GroupParamDescriptor* ngeGroup = p_Desc.defineGroupParam("NGEGroup");
+    ngeGroup->setHint("Near Grey Extraction controls");
+    ngeGroup->setLabels("Near Grey Extract", "Near Grey Extract", "Near Grey Extract");
 
-    // Group: Near Grey Extract.
-    GroupParamDescriptor* nearGreyGroup = p_Desc.defineGroupParam("NGEGroup");
-    nearGreyGroup->setHint("Near Grey Extraction controls");
-    nearGreyGroup->setLabels("Near Grey Extract", "Near Grey Extract", "Near Grey Extract");
-
-    // Near Grey Extract toggle.
     BooleanParamDescriptor* nearGreyExtract = p_Desc.defineBooleanParam("nearGreyExtract");
     nearGreyExtract->setDefault(true);
     nearGreyExtract->setHint("Improves matte quality in near-grey or ambiguous areas.");
     nearGreyExtract->setLabels("Enable", "Enable", "Enable");
-    nearGreyExtract->setParent(*nearGreyGroup);
+    nearGreyExtract->setParent(*ngeGroup);
     page->addChild(*nearGreyExtract);
 
-    // Near Grey Amount.
-    page->addChild(*defineDoubleParam(p_Desc, "nearGreyAmount", "Amount",
-                                      "Controls the near-grey response curve used by the keyer.",
-                                      nearGreyGroup, 1.0, 0.0, 1.0, 0.01));
+    page->addChild(*defineDoubleParam(p_Desc, "nearGreyAmount", "Strength",
+                                      "How much the near-grey extraction contributes to the final alpha.",
+                                      ngeGroup, 0.5, 0.0, 1.0, 0.01));
+    page->addChild(*defineDoubleParam(p_Desc, "nearGreySoftness", "Softness",
+                                      "Controls how the keyer measures 'greyness' in ambiguous regions.",
+                                      ngeGroup, 1.0, 0.0, 1.0, 0.01));
 
-    // Group: Guided Filter.
     GroupParamDescriptor* guidedGroup = p_Desc.defineGroupParam("GuidedFilterGroup");
     guidedGroup->setHint("Edge-aware matte refinement using the source luminance as guide");
     guidedGroup->setLabels("Guided Filter", "Guided Filter", "Guided Filter");
 
     BooleanParamDescriptor* guidedEnabled = p_Desc.defineBooleanParam("guidedFilterEnabled");
-    // The original Gaffer IBKeyer stops at the raw IBK-style result plus optional premultiply.
-    // Guided filtering is useful, but it is an extension we added in the OFX port, not part of
-    // the source graph itself. Defaulting it off keeps "fresh instance" behaviour closer to the
-    // original tool and makes backend parity checks less confusing.
-    guidedEnabled->setDefault(false);
-    guidedEnabled->setHint("Enable guided filter matte refinement.");
+    // This intentionally matches the older IBKeymaster defaults now. Earlier in the port I turned
+    // this off to stay closer to the simpler Gaffer graph, but once the goal shifted to full
+    // IBKeymaster parity that default became misleading.
+    guidedEnabled->setDefault(true);
+    guidedEnabled->setHint("Enable guided filter matte refinement. Uses source luminance as an edge guide to recover hair detail and soft edges.");
     guidedEnabled->setLabels("Enable", "Enable", "Enable");
     guidedEnabled->setParent(*guidedGroup);
     page->addChild(*guidedEnabled);
 
+    ChoiceParamDescriptor* guidedMode = p_Desc.defineChoiceParam("guidedFilterMode");
+    guidedMode->setLabel("Guide Mode");
+    guidedMode->setHint("Luma uses the simpler scalar guide. RGB uses the full 3-channel guided filter from the private Metal branch.");
+    guidedMode->appendOption("Luma");
+    guidedMode->appendOption("RGB");
+    guidedMode->setDefault(0);
+    guidedMode->setAnimates(true);
+    guidedMode->setParent(*guidedGroup);
+    page->addChild(*guidedMode);
+
     IntParamDescriptor* guidedRadius = p_Desc.defineIntParam("guidedRadius");
     guidedRadius->setLabels("Radius", "Radius", "Radius");
     guidedRadius->setScriptName("guidedRadius");
@@ -498,11 +720,106 @@ void IBKeyerFactory::describeInContext(ImageEffectDescriptor& p_Desc, ContextEnu
     page->addChild(*guidedRadius);
 
     page->addChild(*defineDoubleParam(p_Desc, "guidedEpsilon", "Epsilon",
-                                      "Edge sensitivity for the guided filter.",
+                                      "Edge sensitivity. Smaller values preserve more edges but may introduce noise.",
                                       guidedGroup, 0.01, 0.0001, 1.0, 0.001));
     page->addChild(*defineDoubleParam(p_Desc, "guidedMix", "Mix",
-                                      "Blend between raw matte and guided-filter-refined matte.",
+                                      "Blend between raw matte (0.0) and guided-filter-refined matte (1.0).",
                                       guidedGroup, 1.0, 0.0, 1.0, 0.01));
+    page->addChild(*defineDoubleParam(p_Desc, "edgeProtect", "Edge Protection",
+                                      "Blends the guide signal from source luminance toward the raw alpha.",
+                                      guidedGroup, 0.5, 0.0, 1.0, 0.01));
+
+    IntParamDescriptor* refineIterations = p_Desc.defineIntParam("refineIterations");
+    refineIterations->setLabels("Refine Iterations", "Refine Iterations", "Refine Iterations");
+    refineIterations->setScriptName("refineIterations");
+    refineIterations->setHint("Number of iterative guided-filter refinement passes.");
+    refineIterations->setDefault(2);
+    refineIterations->setRange(1, 5);
+    refineIterations->setDisplayRange(1, 5);
+    refineIterations->setParent(*guidedGroup);
+    page->addChild(*refineIterations);
+
+    page->addChild(*defineDoubleParam(p_Desc, "edgeColorCorrect", "Edge Color Correct",
+                                      "Re-estimates foreground colour at semi-transparent edges using the matting equation.",
+                                      guidedGroup, 0.0, 0.0, 1.0, 0.01));
+
+    GroupParamDescriptor* bgGroup = p_Desc.defineGroupParam("BgWrapGroup");
+    bgGroup->setHint("Bleeds a blurred version of the new background into the foreground edges.");
+    bgGroup->setLabels("Background Wrap", "Background Wrap", "Background Wrap");
+
+    BooleanParamDescriptor* bgWrapEnabled = p_Desc.defineBooleanParam("bgWrapEnabled");
+    bgWrapEnabled->setDefault(false);
+    bgWrapEnabled->setHint("Enable background wrap. Requires the Background clip to be connected.");
+    bgWrapEnabled->setLabels("Enable", "Enable", "Enable");
+    bgWrapEnabled->setParent(*bgGroup);
+    page->addChild(*bgWrapEnabled);
+
+    IntParamDescriptor* bgWrapBlur = p_Desc.defineIntParam("bgWrapBlur");
+    bgWrapBlur->setLabels("Blur Radius", "Blur Radius", "Blur Radius");
+    bgWrapBlur->setScriptName("bgWrapBlur");
+    bgWrapBlur->setHint("Gaussian blur radius applied to the background before wrapping.");
+    bgWrapBlur->setDefault(20);
+    bgWrapBlur->setRange(1, 200);
+    bgWrapBlur->setDisplayRange(1, 100);
+    bgWrapBlur->setParent(*bgGroup);
+    page->addChild(*bgWrapBlur);
+
+    page->addChild(*defineDoubleParam(p_Desc, "bgWrapAmount", "Amount",
+                                      "How much blurred background to bleed into the foreground edges.",
+                                      bgGroup, 0.5, 0.0, 2.0, 0.01));
+
+    GroupParamDescriptor* additiveGroup = p_Desc.defineGroupParam("AdditiveKeyGroup");
+    additiveGroup->setHint("Recovers transparent detail the alpha missed by adding back source-minus-screen detail.");
+    additiveGroup->setLabels("Additive Key", "Additive Key", "Additive Key");
+
+    BooleanParamDescriptor* additiveEnabled = p_Desc.defineBooleanParam("additiveKeyEnabled");
+    additiveEnabled->setDefault(false);
+    additiveEnabled->setHint("Enable additive detail recovery.");
+    additiveEnabled->setLabels("Enable", "Enable", "Enable");
+    additiveEnabled->setParent(*additiveGroup);
+    page->addChild(*additiveEnabled);
+
+    ChoiceParamDescriptor* additiveMode = p_Desc.defineChoiceParam("additiveKeyMode");
+    additiveMode->setLabel("Mode");
+    additiveMode->setHint("Addition uses source minus screen directly. Multiply uses a factor against the blurred background.");
+    additiveMode->appendOption("Addition");
+    additiveMode->appendOption("Multiply");
+    additiveMode->setDefault(0);
+    additiveMode->setAnimates(true);
+    additiveMode->setParent(*additiveGroup);
+    page->addChild(*additiveMode);
+
+    page->addChild(*defineDoubleParam(p_Desc, "additiveKeySaturation", "Saturation",
+                                      "Desaturates recovered detail to reduce residual screen colour.",
+                                      additiveGroup, 0.0, 0.0, 1.0, 0.01));
+    page->addChild(*defineDoubleParam(p_Desc, "additiveKeyAmount", "Amount",
+                                      "Strength of the additive detail recovery.",
+                                      additiveGroup, 0.0, 0.0, 2.0, 0.01));
+
+    BooleanParamDescriptor* additiveClamp = p_Desc.defineBooleanParam("additiveKeyBlackClamp");
+    additiveClamp->setDefault(false);
+    additiveClamp->setHint("Clamp additive detail to positive values only.");
+    additiveClamp->setLabels("Black Clamp", "Black Clamp", "Black Clamp");
+    additiveClamp->setParent(*additiveGroup);
+    page->addChild(*additiveClamp);
+
+    GroupParamDescriptor* displayGroup = p_Desc.defineGroupParam("DisplayGroup");
+    displayGroup->setHint("Diagnostic views from the richer private branch.");
+    displayGroup->setLabels("Display", "Display", "Display");
+
+    ChoiceParamDescriptor* viewMode = p_Desc.defineChoiceParam("viewMode");
+    viewMode->setLabel("View Mode");
+    viewMode->setHint("Displays intermediate pipeline stages for debugging and tuning.");
+    viewMode->appendOption("Composite");
+    viewMode->appendOption("Raw Matte");
+    viewMode->appendOption("Clean Plate");
+    viewMode->appendOption("Refined Matte");
+    viewMode->appendOption("Despilled Source");
+    viewMode->appendOption("Blurred Background");
+    viewMode->setDefault(0);
+    viewMode->setAnimates(true);
+    viewMode->setParent(*displayGroup);
+    page->addChild(*viewMode);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/IBKeymaster/IBKeyerBackend.cpp b/IBKeymaster/IBKeyerBackend.cpp
index f471816..6818e4a 100644
--- a/IBKeymaster/IBKeyerBackend.cpp
+++ b/IBKeymaster/IBKeyerBackend.cpp
@@ -33,13 +33,19 @@ extern void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height,
                            int p_Premultiply, int p_NearGreyExtract,
                            float p_NearGreyAmount, float p_NearGreySoftness,
                            float p_BlackClip, float p_WhiteClip, float p_MatteGamma,
-                           int p_GuidedFilterEnabled, int p_GuidedRadius,
+                           int p_PrematteEnabled, int p_PrematteBlur, int p_PrematteErode, int p_PrematteIterations,
+                           int p_GuidedFilterEnabled, int p_GuidedFilterMode, int p_GuidedRadius,
                            float p_GuidedEpsilon, float p_GuidedMix,
                            float p_EdgeProtect, int p_RefineIterations,
                            float p_EdgeColorCorrect,
                            int p_BgWrapEnabled, int p_BgWrapBlur, float p_BgWrapAmount,
+                           int p_AdditiveKeyEnabled, int p_AdditiveKeyMode,
+                           float p_AdditiveKeySat, float p_AdditiveKeyAmount, int p_AdditiveKeyBlackClamp,
+                           int p_ViewMode,
                            const float* p_Input, const float* p_Screen,
-                           const float* p_Background, float* p_Output);
+                           const float* p_Background,
+                           const float* p_GarbageMatte, const float* p_OcclusionMatte,
+                           float* p_Output);
 #endif
 
 namespace IBKeyerCore {
@@ -76,6 +82,16 @@ bool hostCudaForceSyncEnabled()
     return enabled;
 }
 
+bool requiresReferenceOnlyFeatures(const RenderRequest& request)
+{
+    // Always answers "no": nothing is treated as reference-only any more. The helper began as a
+    // safety rail while the private-branch guide/composite features existed only in the CPU
+    // reference path; once CUDA learned them, keeping a feature list here would have silently
+    // stranded Windows/Linux on CPU. It is left in place so that migration story stays visible.
+    static_cast<void>(request);  // The request is deliberately not inspected.
+    return false;
+}
+
 // One selector controls both descriptor advertising and runtime routing.
 CudaRenderMode selectedCudaRenderModeImpl()
 {
@@ -250,6 +266,9 @@ DeviceRenderFrame buildDeviceRenderFrame(const RenderRequest& request)
     DeviceRenderFrame frame;
     frame.src = makeImagePlaneDesc(request.srcImage);
     frame.screen = makeImagePlaneDesc(request.screenImage);
+    frame.background = makeImagePlaneDesc(request.backgroundImage);
+    frame.garbageMatte = makeImagePlaneDesc(request.garbageMatteImage);
+    frame.occlusionMatte = makeImagePlaneDesc(request.occlusionMatteImage);
     frame.dst = makeMutableImagePlaneDesc(request.dstImage);
     frame.renderWindow = request.renderWindow;
     return frame;
@@ -309,91 +328,150 @@ void gaussianBlurSingle(float* data,
     }
 }
 
-// Moved from: the old "CPU PROCESSING — FALLBACK" section.
-//
-// CPU code is slower, but it is the least dependent on host-specific GPU contracts. That makes it
-// the best place to preserve the algorithm "as intended" and compare GPU paths against it when
-// debugging correctness regressions.
-void renderCpuPacked(const IBKeyerParams& params, const PackedFrame& frame)
+void erodeSingle(const float* src, float* dst, int width, int height, int radius)
 {
-    // This is the CPU processing fallback from the old file, moved out so it can remain the
-    // reference implementation while CUDA/Metal evolve independently.
-    const int width = frame.width;
-    const int height = frame.height;
-    const int pixelCount = width * height;
-    const bool doGF = guidedFilterActive(params);
+    for (int y = 0; y < height; ++y) {  // Min filter: dst = minimum of src over a (2r+1)^2 window.
+        for (int x = 0; x < width; ++x) {
+            float minValue = src[(y * width) + x];  // Seed with the centre sample so no constant caps the result.
+            for (int dy = -radius; dy <= radius; ++dy) {  // A negative radius skips the window: dst = src.
+                const int sy = std::max(0, std::min(height - 1, y + dy));  // Rows clamp to the image edge.
+                for (int dx = -radius; dx <= radius; ++dx) {
+                    const int sx = std::max(0, std::min(width - 1, x + dx));  // Columns clamp as well.
+                    minValue = std::min(minValue, src[(sy * width) + sx]);
+                }
+            }
+            dst[(y * width) + x] = minValue;
+        }
+    }
+}
 
-    std::vector<float> rawAlpha(doGF ? pixelCount : 0);
-    std::vector<float> guide(doGF ? pixelCount : 0);
-    std::vector<float> meanI(doGF ? pixelCount : 0);
-    std::vector<float> meanP(doGF ? pixelCount : 0);
-    std::vector<float> meanIp(doGF ? pixelCount : 0);
-    std::vector<float> meanII(doGF ? pixelCount : 0);
-    std::vector<float> scratch(doGF ? pixelCount : 0);
-    std::vector<float> gaussianWeights(doGF ? buildGaussianWeights(params.guidedRadius) : std::vector<float>{});
+float smoothstep01(float value)
+{
+    const float x = clamp01(value);  // Cubic 3x^2 - 2x^3 evaluated on the clamped input.
+    return x * x * (3.0f - 2.0f * x);
+}
+
+void buildCleanPlate(const PackedFrame& frame,
+                     const IBKeyerParams& params,
+                     const std::vector<float>& alpha,
+                     std::vector<float>& cleanPlate,
+                     std::vector<float>& cleanR,
+                     std::vector<float>& cleanG,
+                     std::vector<float>& cleanB,
+                     std::vector<float>& scratch)
+{  // Fills cleanR/G/B and the packed RGBA cleanPlate from the source, the matte and the pick colour.
+    const int pixelCount = frame.width * frame.height;
+    cleanR.resize(pixelCount);
+    cleanG.resize(pixelCount);
+    cleanB.resize(pixelCount);
+    cleanPlate.resize(static_cast<size_t>(pixelCount) * 4u, 0.0f);  // Four floats per pixel; new elements start at 0.
 
     for (int index = 0; index < pixelCount; ++index) {
         const int rgba = index * 4;
-        const float srcR = frame.srcRgba[rgba + 0];
-        const float srcG = frame.srcRgba[rgba + 1];
-        const float srcB = frame.srcRgba[rgba + 2];
-
-        float scrR = params.pickR;
-        float scrG = params.pickG;
-        float scrB = params.pickB;
-        if (params.useScreenInput && frame.screenRgba != nullptr) {
-            scrR = frame.screenRgba[rgba + 0];
-            scrG = frame.screenRgba[rgba + 1];
-            scrB = frame.screenRgba[rgba + 2];
-        }
-
-        // 1. Despill of source and screen.
-        const float despillRGB = despillValue(srcR, srcG, srcB, params.screenColor, params.bias, params.limit);
-        const float despillScreen = despillValue(scrR, scrG, scrB, params.screenColor, params.bias, params.limit);
-
-        // 2. Normalise.
-        const float normalized = safeDivide(despillRGB, despillScreen);
+        const float alphaSoft = smoothstep01(alpha[index]);  // Smoothed matte value used as the blend weight.
+        cleanR[index] = frame.srcRgba[rgba + 0] * (1.0f - alphaSoft) + params.pickR * alphaSoft;  // weight 1 -> pick colour, 0 -> source
+        cleanG[index] = frame.srcRgba[rgba + 1] * (1.0f - alphaSoft) + params.pickG * alphaSoft;
+        cleanB[index] = frame.srcRgba[rgba + 2] * (1.0f - alphaSoft) + params.pickB * alphaSoft;
+    }
 
-        // 3. Spill map and screen subtraction.
-        const float spillMul = std::max(0.0f, normalized);
-        const float ssR = srcR - spillMul * scrR;
-        const float ssG = srcG - spillMul * scrG;
-        const float ssB = srcB - spillMul * scrB;
+    const int blurRadius = std::max(1, params.prematteBlur);  // The blur radius is never allowed below 1.
+    const std::vector<float> weights = buildGaussianWeights(blurRadius);
+    gaussianBlurSingle(cleanR.data(), scratch.data(), frame.width, frame.height, weights, blurRadius);  // NOTE(review): presumably needs scratch.size() >= pixelCount - confirm at call sites.
+    gaussianBlurSingle(cleanG.data(), scratch.data(), frame.width, frame.height, weights, blurRadius);
+    gaussianBlurSingle(cleanB.data(), scratch.data(), frame.width, frame.height, weights, blurRadius);
 
-        // 4. Initial alpha.
-        float alpha = clamp01(1.0f - normalized);
+    for (int index = 0; index < pixelCount; ++index) {  // Interleave the three planes into packed RGBA.
+        const int rgba = index * 4;
+        cleanPlate[rgba + 0] = cleanR[index];
+        cleanPlate[rgba + 1] = cleanG[index];
+        cleanPlate[rgba + 2] = cleanB[index];
+        cleanPlate[rgba + 3] = 1.0f;  // The plate is always written fully opaque.
+    }
+}
 
-        // 5. Near Grey Extraction (optional).
-        if (params.nearGreyExtract) {
-            const float divR = safeDivide(ssR, srcR);
-            const float divG = safeDivide(ssG, srcG);
-            const float divB = safeDivide(ssB, srcB);
-            const float ngeAlpha = nearGreyAlpha(divR, divG, divB, params.screenColor, params.nearGreyAmount);
-            // Screen composite: a + b - a*b.
-            alpha = ngeAlpha + alpha - ngeAlpha * alpha;
+void applyExternalMatte(float* alphaBuffer,
+                        float* dstRgba,
+                        int pixelCount,
+                        const float* matteRgba,
+                        bool garbage)
+{
+    if (matteRgba == nullptr) {  // No matte supplied: both buffers stay as they are.
+        return;
+    }
+    for (int index = 0; index < pixelCount; ++index) {
+        const int alphaSlot = index * 4 + 3;  // Only the matte's alpha channel is consulted.
+        const float matteAlpha = matteRgba[alphaSlot];
+        float combined = alphaBuffer[index];
+        if (!garbage) {
+            combined = std::max(combined, matteAlpha);  // Non-garbage matte: raise alpha to the matte.
+        } else {
+            combined *= (1.0f - matteAlpha);  // Garbage matte: scale alpha by (1 - matte).
         }
+        alphaBuffer[index] = clamp01(combined);
+        dstRgba[alphaSlot] = alphaBuffer[index];
+    }
+}
 
-        if (params.whiteClip > params.blackClip + 1e-6f) {
-            alpha = clamp01((alpha - params.blackClip) / (params.whiteClip - params.blackClip));
-        }
+void writeAlphaDiagnostic(float* dstRgba, const float* alpha, int pixelCount)
+{
+    // Broadcasts each single-channel matte sample into the RGB of the matching RGBA pixel
+    // and writes that pixel's alpha as 1.
+    for (int index = 0; index < pixelCount; ++index) {
+        float* const pixel = dstRgba + index * 4;
+        const float grey = alpha[index];
+        pixel[0] = pixel[1] = pixel[2] = grey;
+        pixel[3] = 1.0f;
+    }
+}
 
-        // 6. Output = screen-subtracted + respill.
-        const float respillMul = std::max(0.0f, despillScreen * normalized);
-        frame.dstRgba[rgba + 0] = ssR + respillMul * params.respillR;
-        frame.dstRgba[rgba + 1] = ssG + respillMul * params.respillG;
-        frame.dstRgba[rgba + 2] = ssB + respillMul * params.respillB;
-        frame.dstRgba[rgba + 3] = alpha;
+void copyDiagnosticRgba(float* dstRgba, const float* srcRgba, int pixelCount)
+{
+    if (srcRgba != nullptr) {  // A missing source plane leaves dstRgba untouched.
+        const size_t floatCount = static_cast<size_t>(pixelCount) * 4u;  // Four floats per pixel.
+        std::copy(srcRgba, srcRgba + floatCount, dstRgba);
+    }
+}
 
-        if (doGF) {
-            rawAlpha[index] = alpha;
-            guide[index] = luminance(srcR, srcG, srcB);
-        }
+void writeAlphaFromRgbaDiagnostic(float* dstRgba, int pixelCount)
+{
+    // Replaces every pixel's RGB with that pixel's own alpha value, in place.
+    for (int index = 0; index < pixelCount; ++index) {
+        float* const pixel = dstRgba + index * 4;
+        const float grey = pixel[3];
+        pixel[0] = pixel[1] = pixel[2] = grey;
+        // Alpha is read above before it is overwritten with 1 here.
+        pixel[3] = 1.0f;
     }
+}
+
+void runScalarGuidedFilter(const PackedFrame& frame,
+                           const IBKeyerParams& params,
+                           std::vector<float>& rawAlpha,
+                           std::vector<float>& guide,
+                           std::vector<float>& meanI,
+                           std::vector<float>& meanP,
+                           std::vector<float>& meanIp,
+                           std::vector<float>& meanII,
+                           std::vector<float>& scratch,
+                           float* dstRgba)
+{
+    const int pixelCount = frame.width * frame.height;
+    const std::vector<float> gaussianWeights = buildGaussianWeights(params.guidedRadius);
+    std::vector<float> savedRawAlpha = rawAlpha;
+    const int numIter = std::max(1, std::min(params.refineIterations, 5));
+
+    for (int iter = 0; iter < numIter; ++iter) {
+        if (iter > 0) {
+            for (int index = 0; index < pixelCount; ++index) {
+                const int rgba = index * 4;
+                const float alpha = rawAlpha[index];
+                const float fgLum = luminance(frame.srcRgba[rgba + 0] * alpha,
+                                              frame.srcRgba[rgba + 1] * alpha,
+                                              frame.srcRgba[rgba + 2] * alpha);
+                guide[index] = fgLum * (1.0f - params.edgeProtect) + alpha * params.edgeProtect;
+            }
+        }
 
-    if (doGF) {
-        // Guided filter refinement is a post-pass layered over the original IBKeyer math.
-        // Keeping it after the numbered IBK steps makes it easier to compare against the
-        // pre-guided historical version when debugging parity.
         for (int index = 0; index < pixelCount; ++index) {
             meanI[index] = guide[index];
             meanP[index] = rawAlpha[index];
@@ -401,13 +479,10 @@ void renderCpuPacked(const IBKeyerParams& params, const PackedFrame& frame)
             meanII[index] = guide[index] * guide[index];
         }
 
-        // The macOS Metal path already uses a normalized Gaussian kernel for the guided
-        // filter. Matching that here keeps the matte stable across OSes instead of letting
-        // Windows/Linux drift because they happened to use a cheaper box-blur approximation.
-        gaussianBlurSingle(meanI.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
-        gaussianBlurSingle(meanP.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
-        gaussianBlurSingle(meanIp.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
-        gaussianBlurSingle(meanII.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(meanI.data(), scratch.data(), frame.width, frame.height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(meanP.data(), scratch.data(), frame.width, frame.height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(meanIp.data(), scratch.data(), frame.width, frame.height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(meanII.data(), scratch.data(), frame.width, frame.height, gaussianWeights, params.guidedRadius);
 
         for (int index = 0; index < pixelCount; ++index) {
             const float variance = meanII[index] - meanI[index] * meanI[index];
@@ -418,23 +493,354 @@ void renderCpuPacked(const IBKeyerParams& params, const PackedFrame& frame)
             meanP[index] = b;
         }
 
-        gaussianBlurSingle(meanI.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(meanI.data(), scratch.data(), frame.width, frame.height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(meanP.data(), scratch.data(), frame.width, frame.height, gaussianWeights, params.guidedRadius);
+
+        if (iter < numIter - 1) {
+            for (int index = 0; index < pixelCount; ++index) {
+                rawAlpha[index] = clamp01(meanI[index] * guide[index] + meanP[index]);
+            }
+        }
+    }
+
+    for (int index = 0; index < pixelCount; ++index) {
+        const int rgba = index * 4;
+        const float guidedAlpha = clamp01(meanI[index] * guide[index] + meanP[index]);
+        const float alpha = savedRawAlpha[index] * (1.0f - params.guidedMix) + guidedAlpha * params.guidedMix;
+        if (params.premultiply) {
+            dstRgba[rgba + 0] *= alpha;
+            dstRgba[rgba + 1] *= alpha;
+            dstRgba[rgba + 2] *= alpha;
+        }
+        dstRgba[rgba + 3] = alpha;
+    }
+}
+
+void runRgbGuidedFilter(const PackedFrame& frame,      // srcRgba supplies the RGB guide image
+                        const IBKeyerParams& params,   // radius, epsilon, mix, iterations, premultiply
+                        std::vector<float>& rawAlpha,  // matte to refine; overwritten between passes
+                        std::vector<float>& scratch,   // workspace handed to gaussianBlurSingle
+                        float* dstRgba)                // RGB optionally premultiplied, A replaced
+{
+    const int width = frame.width;
+    const int height = frame.height;
+    const int pixelCount = width * height;
+    const int numIter = std::max(1, std::min(params.refineIterations, 5));  // Always 1..5 passes.
+    const std::vector<float> gaussianWeights = buildGaussianWeights(params.guidedRadius);
+    std::vector<float> meanIr(pixelCount), meanIg(pixelCount), meanIb(pixelCount), meanP(pixelCount);
+    std::vector<float> irir(pixelCount), irig(pixelCount), irib(pixelCount), igig(pixelCount), igib(pixelCount), ibib(pixelCount);
+    std::vector<float> irp(pixelCount), igp(pixelCount), ibp(pixelCount);
+    std::vector<float> meanAr(pixelCount), meanAg(pixelCount), meanAb(pixelCount), meanB(pixelCount);
+    const std::vector<float> savedRawAlpha = rawAlpha;  // Unrefined matte, kept for the final mix.
+
+    for (int iter = 0; iter < numIter; ++iter) {
+        for (int index = 0; index < pixelCount; ++index) {  // Per-pixel products, blurred into local moments below.
+            const int rgba = index * 4;
+            const float ir = frame.srcRgba[rgba + 0];
+            const float ig = frame.srcRgba[rgba + 1];
+            const float ib = frame.srcRgba[rgba + 2];
+            const float p = rawAlpha[index];  // Every pass filters the current matte estimate.
+            meanIr[index] = ir;
+            meanIg[index] = ig;
+            meanIb[index] = ib;
+            meanP[index] = p;
+            irir[index] = ir * ir;
+            irig[index] = ir * ig;
+            irib[index] = ir * ib;
+            igig[index] = ig * ig;
+            igib[index] = ig * ib;
+            ibib[index] = ib * ib;
+            irp[index] = ir * p;
+            igp[index] = ig * p;
+            ibp[index] = ib * p;
+        }
+
+        gaussianBlurSingle(meanIr.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(meanIg.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(meanIb.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
         gaussianBlurSingle(meanP.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(irir.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(irig.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(irib.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(igig.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(igib.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(ibib.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(irp.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(igp.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(ibp.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
 
+        for (int index = 0; index < pixelCount; ++index) {  // Solve the 3x3 system for (a_r, a_g, a_b) and b.
+            const float mIr = meanIr[index];
+            const float mIg = meanIg[index];
+            const float mIb = meanIb[index];
+            const float mP = meanP[index];
+
+            float s_rr = irir[index] - mIr * mIr;  // Symmetric guide covariance: six unique entries.
+            const float s_rg = irig[index] - mIr * mIg;
+            const float s_rb = irib[index] - mIr * mIb;
+            float s_gg = igig[index] - mIg * mIg;
+            const float s_gb = igib[index] - mIg * mIb;
+            float s_bb = ibib[index] - mIb * mIb;
+
+            const float c_rp = irp[index] - mIr * mP;  // Covariance of each guide channel with the matte.
+            const float c_gp = igp[index] - mIg * mP;
+            const float c_bp = ibp[index] - mIb * mP;
+
+            const float trace = s_rr + s_gg + s_bb;
+            const float adaptEps = params.guidedEpsilon * params.guidedEpsilon /
+                                   ((trace / 3.0f) + params.guidedEpsilon + 1e-10f);  // Shrinks as trace grows.
+            s_rr += adaptEps;  // The regulariser is added to the diagonal only.
+            s_gg += adaptEps;
+            s_bb += adaptEps;
+
+            const float det = s_rr * (s_gg * s_bb - s_gb * s_gb)
+                            - s_rg * (s_rg * s_bb - s_gb * s_rb)
+                            + s_rb * (s_rg * s_gb - s_gg * s_rb);
+            const float invDet = (std::fabs(det) > 1e-12f) ? (1.0f / det) : 0.0f;  // Tiny det: a = 0, b = mP.
+
+            const float inv_rr = (s_gg * s_bb - s_gb * s_gb) * invDet;  // Inverse via cofactors of the symmetric matrix.
+            const float inv_rg = (s_rb * s_gb - s_rg * s_bb) * invDet;
+            const float inv_rb = (s_rg * s_gb - s_rb * s_gg) * invDet;
+            const float inv_gg = (s_rr * s_bb - s_rb * s_rb) * invDet;
+            const float inv_gb = (s_rb * s_rg - s_rr * s_gb) * invDet;
+            const float inv_bb = (s_rr * s_gg - s_rg * s_rg) * invDet;
+
+            meanAr[index] = inv_rr * c_rp + inv_rg * c_gp + inv_rb * c_bp;
+            meanAg[index] = inv_rg * c_rp + inv_gg * c_gp + inv_gb * c_bp;
+            meanAb[index] = inv_rb * c_rp + inv_gb * c_gp + inv_bb * c_bp;
+            meanB[index] = mP - meanAr[index] * mIr - meanAg[index] * mIg - meanAb[index] * mIb;
+        }
+
+        gaussianBlurSingle(meanAr.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(meanAg.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(meanAb.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+        gaussianBlurSingle(meanB.data(), scratch.data(), width, height, gaussianWeights, params.guidedRadius);
+
+        if (iter < numIter - 1) {  // Feed the clamped result into the next pass; the last pass is evaluated below.
+            for (int index = 0; index < pixelCount; ++index) {
+                const int rgba = index * 4;
+                const float q = meanAr[index] * frame.srcRgba[rgba + 0] +
+                                meanAg[index] * frame.srcRgba[rgba + 1] +
+                                meanAb[index] * frame.srcRgba[rgba + 2] +
+                                meanB[index];
+                rawAlpha[index] = clamp01(q);
+            }
+        }
+    }
+
+    for (int index = 0; index < pixelCount; ++index) {  // Final output: mix saved and refined matte by guidedMix.
+        const int rgba = index * 4;
+        const float guidedAlpha = clamp01(meanAr[index] * frame.srcRgba[rgba + 0] +
+                                          meanAg[index] * frame.srcRgba[rgba + 1] +
+                                          meanAb[index] * frame.srcRgba[rgba + 2] +
+                                          meanB[index]);
+        const float alpha = savedRawAlpha[index] * (1.0f - params.guidedMix) + guidedAlpha * params.guidedMix;
+        if (params.premultiply) {  // RGB already in dstRgba is scaled by the final alpha.
+            dstRgba[rgba + 0] *= alpha;
+            dstRgba[rgba + 1] *= alpha;
+            dstRgba[rgba + 2] *= alpha;
+        }
+        dstRgba[rgba + 3] = alpha;
+    }
+}
+
+void applyAdditiveKey(const PackedFrame& frame,
+                      const IBKeyerParams& params,
+                      const std::vector<float>& blurBgR,
+                      const std::vector<float>& blurBgG,
+                      const std::vector<float>& blurBgB,
+                      float* dstRgba)
+{
+    // Adds (1 - alpha) * additiveKeyAmount * residual to the RGB of each pixel with alpha <= 0.999.
+    const int pixelCount = frame.width * frame.height;
+    const bool perPixelScreen = params.useScreenInput && frame.screenRgba != nullptr;
+    const float keep = params.additiveKeySaturation;
+    const float toGrey = 1.0f - keep;
+    for (int index = 0; index < pixelCount; ++index) {
+        const int base = index * 4;
+        const float alpha = dstRgba[base + 3];
+        if (alpha > 0.999f) {
+            continue;
+        }
+        // Screen reference: the per-pixel screen plane when enabled and present, else the pick.
+        const float scrR = perPixelScreen ? frame.screenRgba[base + 0] : params.pickR;
+        const float scrG = perPixelScreen ? frame.screenRgba[base + 1] : params.pickG;
+        const float scrB = perPixelScreen ? frame.screenRgba[base + 2] : params.pickB;
+        const float srcR = frame.srcRgba[base + 0];
+        const float srcG = frame.srcRgba[base + 1];
+        const float srcB = frame.srcRgba[base + 2];
+        float resR = 0.0f;
+        float resG = 0.0f;
+        float resB = 0.0f;
+        if (params.additiveKeyMode != 0) {
+            // Ratio mode: per-channel source/screen ratio (1 when the screen channel is ~0),
+            // desaturated, then applied to the blurred background when one was supplied.
+            float fR = (scrR > 1e-6f) ? srcR / scrR : 1.0f;
+            float fG = (scrG > 1e-6f) ? srcG / scrG : 1.0f;
+            float fB = (scrB > 1e-6f) ? srcB / scrB : 1.0f;
+            const float fLum = luminance(fR, fG, fB);
+            fR = fLum * toGrey + fR * keep;
+            fG = fLum * toGrey + fG * keep;
+            fB = fLum * toGrey + fB * keep;
+            if (!blurBgR.empty()) {
+                resR = blurBgR[index] * (fR - 1.0f);
+                resG = blurBgG[index] * (fG - 1.0f);
+                resB = blurBgB[index] * (fB - 1.0f);
+            }
+        } else {
+            // Difference mode: source minus screen, desaturated towards its own luminance.
+            const float diffR = srcR - scrR;
+            const float diffG = srcG - scrG;
+            const float diffB = srcB - scrB;
+            const float lum = luminance(diffR, diffG, diffB);
+            resR = lum * toGrey + diffR * keep;
+            resG = lum * toGrey + diffG * keep;
+            resB = lum * toGrey + diffB * keep;
+        }
+
+        if (params.additiveKeyBlackClamp) {
+            resR = std::max(resR, 0.0f);
+            resG = std::max(resG, 0.0f);
+            resB = std::max(resB, 0.0f);
+        }
+
+        const float weight = (1.0f - alpha) * params.additiveKeyAmount;
+        dstRgba[base + 0] += resR * weight;
+        dstRgba[base + 1] += resG * weight;
+        dstRgba[base + 2] += resB * weight;
+    }
+}
+
+// Moved from: the old "CPU PROCESSING — FALLBACK" section.
+//
+// CPU code is slower, but it is the least dependent on host-specific GPU contracts. That makes it
+// the best place to preserve the algorithm "as intended" and compare GPU paths against it when
+// debugging correctness regressions.
+void renderCpuPacked(const IBKeyerParams& params, const PackedFrame& frame)
+{
+    // This section is intentionally close to the old IBKeymaster CPU fallback.
+    // When I first split the plugin, I simplified this path too much and that made CPU/CUDA/Metal
+    // parity harder to reason about because the "reference" path was no longer actually the old
+    // algorithm. The fuller structure here is deliberate: it restores the original pass ordering.
+    const int width = frame.width;
+    const int height = frame.height;
+    const int pixelCount = width * height;
+    const bool doGF = guidedFilterActive(params);
+    const bool doPrematte = params.prematteEnabled && params.prematteBlur > 0;
+    const bool doBgWrap = params.bgWrapEnabled && frame.backgroundRgba != nullptr && params.bgWrapAmount > 0.0f;
+    const bool doAdditive = params.additiveKeyEnabled && params.additiveKeyAmount > 0.0f;
+    const bool needBgBlur = doBgWrap || (doAdditive && params.additiveKeyMode == 1 && frame.backgroundRgba != nullptr);
+
+    std::vector<float> rawAlpha(pixelCount, 0.0f);
+    std::vector<float> guide(doGF ? pixelCount : 0);
+    std::vector<float> meanI(doGF ? pixelCount : 0);
+    std::vector<float> meanP(doGF ? pixelCount : 0);
+    std::vector<float> meanIp(doGF ? pixelCount : 0);
+    std::vector<float> meanII(doGF ? pixelCount : 0);
+    std::vector<float> scratch((doGF || needBgBlur || doPrematte) ? pixelCount : 1, 0.0f);
+    std::vector<float> cleanPlate;
+    std::vector<float> cleanR;
+    std::vector<float> cleanG;
+    std::vector<float> cleanB;
+    std::vector<float> erodedAlpha;
+    const float* activeScreen = (params.useScreenInput && frame.screenRgba != nullptr) ? frame.screenRgba : nullptr;
+
+    auto runCorePass = [&](const float* screenRgba, bool forceScreen) {
         for (int index = 0; index < pixelCount; ++index) {
             const int rgba = index * 4;
-            const float raw = frame.dstRgba[rgba + 3];
-            const float guided = clamp01(meanI[index] * guide[index] + meanP[index]);
-            const float alpha = raw * (1.0f - params.guidedMix) + guided * params.guidedMix;
-            if (params.premultiply) {
-                frame.dstRgba[rgba + 0] *= alpha;
-                frame.dstRgba[rgba + 1] *= alpha;
-                frame.dstRgba[rgba + 2] *= alpha;
+            const float srcR = frame.srcRgba[rgba + 0];
+            const float srcG = frame.srcRgba[rgba + 1];
+            const float srcB = frame.srcRgba[rgba + 2];
+
+            float scrR = params.pickR;
+            float scrG = params.pickG;
+            float scrB = params.pickB;
+            if (forceScreen && screenRgba != nullptr) {
+                scrR = screenRgba[rgba + 0];
+                scrG = screenRgba[rgba + 1];
+                scrB = screenRgba[rgba + 2];
             }
+
+            const float despillRGB = despillValue(srcR, srcG, srcB, params.screenColor, params.bias, params.limit);
+            const float despillScreen = despillValue(scrR, scrG, scrB, params.screenColor, params.bias, params.limit);
+            const float normalized = safeDivide(despillRGB, despillScreen);
+            const float spillMul = std::max(0.0f, normalized);
+            const float ssR = srcR - spillMul * scrR;
+            const float ssG = srcG - spillMul * scrG;
+            const float ssB = srcB - spillMul * scrB;
+
+            float alpha = clamp01(1.0f - normalized);
+            if (params.nearGreyExtract && params.nearGreyAmount > 0.0f) {
+                const float divR = safeDivide(ssR, srcR);
+                const float divG = safeDivide(ssG, srcG);
+                const float divB = safeDivide(ssB, srcB);
+                const float ngeAlpha = nearGreyAlpha(divR, divG, divB, params.screenColor, params.nearGreySoftness);
+                alpha = alpha + params.nearGreyAmount * ngeAlpha * (1.0f - alpha);
+            }
+            if (params.whiteClip > params.blackClip + 1e-6f) {
+                alpha = clamp01((alpha - params.blackClip) / (params.whiteClip - params.blackClip));
+            }
+            if (params.matteGamma != 1.0f && alpha > 0.0f && alpha < 1.0f) {
+                alpha = std::pow(alpha, params.matteGamma);
+            }
+
+            const float respillMul = std::max(0.0f, despillScreen * normalized);
+            frame.dstRgba[rgba + 0] = ssR + respillMul * params.respillR;
+            frame.dstRgba[rgba + 1] = ssG + respillMul * params.respillG;
+            frame.dstRgba[rgba + 2] = ssB + respillMul * params.respillB;
             frame.dstRgba[rgba + 3] = alpha;
+            rawAlpha[index] = alpha;
+
+            if (doGF && params.guidedFilterMode == 0) {
+                const float lum = luminance(srcR, srcG, srcB);
+                guide[index] = lum * (1.0f - params.edgeProtect) + alpha * params.edgeProtect;
+            }
+        }
+    };
+
+    runCorePass(activeScreen, params.useScreenInput && activeScreen != nullptr);
+
+    if (doPrematte) {
+        const int iterations = std::max(1, std::min(params.prematteIterations, 5));
+        std::vector<float> prematteAlpha = rawAlpha;
+        erodedAlpha.resize(pixelCount);
+        for (int iter = 0; iter < iterations; ++iter) {
+            const float* alphaSource = prematteAlpha.data();
+            if (params.prematteErode > 0) {
+                erodeSingle(prematteAlpha.data(), erodedAlpha.data(), width, height, params.prematteErode);
+                alphaSource = erodedAlpha.data();
+            }
+            buildCleanPlate(frame, params, std::vector<float>(alphaSource, alphaSource + pixelCount),
+                            cleanPlate, cleanR, cleanG, cleanB, scratch);
+            runCorePass(cleanPlate.data(), true);
+            prematteAlpha = rawAlpha;
+        }
+    }
+
+    if (params.viewMode == 2) {
+        if (doPrematte && !cleanPlate.empty()) {
+            copyDiagnosticRgba(frame.dstRgba, cleanPlate.data(), pixelCount);
+        } else if (activeScreen != nullptr) {
+            copyDiagnosticRgba(frame.dstRgba, activeScreen, pixelCount);
+        }
+        return;
+    }
+
+    applyExternalMatte(rawAlpha.data(), frame.dstRgba, pixelCount, frame.garbageMatteRgba, true);
+    applyExternalMatte(rawAlpha.data(), frame.dstRgba, pixelCount, frame.occlusionMatteRgba, false);
+
+    if (params.viewMode == 1) {
+        writeAlphaDiagnostic(frame.dstRgba, rawAlpha.data(), pixelCount);
+        return;
+    }
+
+    if (doGF) {
+        if (params.guidedFilterMode == 1) {
+            runRgbGuidedFilter(frame, params, rawAlpha, scratch, frame.dstRgba);
+        } else {
+            runScalarGuidedFilter(frame, params, rawAlpha, guide, meanI, meanP, meanIp, meanII, scratch, frame.dstRgba);
         }
     } else if (params.premultiply) {
-        // 7. Optional premultiply.
         for (int index = 0; index < pixelCount; ++index) {
             const int rgba = index * 4;
             const float alpha = frame.dstRgba[rgba + 3];
@@ -443,6 +849,121 @@ void renderCpuPacked(const IBKeyerParams& params, const PackedFrame& frame)
             frame.dstRgba[rgba + 2] *= alpha;
         }
     }
+
+    if (params.viewMode == 3) {
+        writeAlphaFromRgbaDiagnostic(frame.dstRgba, pixelCount);
+        return;
+    }
+
+    // This edge-colour pass was one of the features lost in the first split. It looks optional in
+    // UI terms, but omitting it changes the comped edge colour in ways users absolutely notice.
+    if (params.edgeColorCorrect > 0.0f) {
+        const bool isPremult = params.premultiply;
+        for (int index = 0; index < pixelCount; ++index) {
+            const int rgba = index * 4;
+            const float alpha = frame.dstRgba[rgba + 3];
+            if (alpha <= 0.005f || alpha >= 0.995f) {
+                continue;
+            }
+
+            float scrR = params.pickR;
+            float scrG = params.pickG;
+            float scrB = params.pickB;
+            if (params.useScreenInput && frame.screenRgba != nullptr) {
+                scrR = frame.screenRgba[rgba + 0];
+                scrG = frame.screenRgba[rgba + 1];
+                scrB = frame.screenRgba[rgba + 2];
+            }
+
+            const float srcR = frame.srcRgba[rgba + 0];
+            const float srcG = frame.srcRgba[rgba + 1];
+            const float srcB = frame.srcRgba[rgba + 2];
+            const float invA = 1.0f / alpha;
+            float fgR = (srcR - scrR * (1.0f - alpha)) * invA;
+            float fgG = (srcG - scrG * (1.0f - alpha)) * invA;
+            float fgB = (srcB - scrB * (1.0f - alpha)) * invA;
+
+            fgR = std::max(-0.5f, std::min(2.0f, fgR));
+            fgG = std::max(-0.5f, std::min(2.0f, fgG));
+            fgB = std::max(-0.5f, std::min(2.0f, fgB));
+
+            float curR = frame.dstRgba[rgba + 0];
+            float curG = frame.dstRgba[rgba + 1];
+            float curB = frame.dstRgba[rgba + 2];
+            if (isPremult) {
+                curR *= invA;
+                curG *= invA;
+                curB *= invA;
+            }
+
+            const float edgeFactor = alpha * (1.0f - alpha) * 4.0f * params.edgeColorCorrect;
+            float outR = curR + (fgR - curR) * edgeFactor;
+            float outG = curG + (fgG - curG) * edgeFactor;
+            float outB = curB + (fgB - curB) * edgeFactor;
+
+            if (isPremult) {
+                outR *= alpha;
+                outG *= alpha;
+                outB *= alpha;
+            }
+
+            frame.dstRgba[rgba + 0] = outR;
+            frame.dstRgba[rgba + 1] = outG;
+            frame.dstRgba[rgba + 2] = outB;
+        }
+    }
+
+    if (params.viewMode == 4) {
+        return;
+    }
+
+    std::vector<float> bgR;
+    std::vector<float> bgG;
+    std::vector<float> bgB;
+    if (needBgBlur) {
+        const int blurRadius = std::max(1, params.bgWrapBlur);
+        const std::vector<float> bgWeights = buildGaussianWeights(blurRadius);
+        bgR.resize(pixelCount);
+        bgG.resize(pixelCount);
+        bgB.resize(pixelCount);
+
+        for (int index = 0; index < pixelCount; ++index) {
+            const int rgba = index * 4;
+            bgR[index] = frame.backgroundRgba[rgba + 0];
+            bgG[index] = frame.backgroundRgba[rgba + 1];
+            bgB[index] = frame.backgroundRgba[rgba + 2];
+        }
+
+        gaussianBlurSingle(bgR.data(), scratch.data(), width, height, bgWeights, blurRadius);
+        gaussianBlurSingle(bgG.data(), scratch.data(), width, height, bgWeights, blurRadius);
+        gaussianBlurSingle(bgB.data(), scratch.data(), width, height, bgWeights, blurRadius);
+
+        if (params.viewMode == 5) {
+            for (int index = 0; index < pixelCount; ++index) {
+                const int rgba = index * 4;
+                frame.dstRgba[rgba + 0] = bgR[index];
+                frame.dstRgba[rgba + 1] = bgG[index];
+                frame.dstRgba[rgba + 2] = bgB[index];
+                frame.dstRgba[rgba + 3] = 1.0f;
+            }
+            return;
+        }
+
+        if (doBgWrap) {
+            for (int index = 0; index < pixelCount; ++index) {
+                const int rgba = index * 4;
+                const float alpha = frame.dstRgba[rgba + 3];
+                const float wrapWeight = alpha * (1.0f - alpha) * 4.0f * params.bgWrapAmount;
+                frame.dstRgba[rgba + 0] += bgR[index] * wrapWeight;
+                frame.dstRgba[rgba + 1] += bgG[index] * wrapWeight;
+                frame.dstRgba[rgba + 2] += bgB[index] * wrapWeight;
+            }
+        }
+    }
+
+    if (doAdditive) {
+        applyAdditiveKey(frame, params, bgR, bgG, bgB, frame.dstRgba);
+    }
 }
 
 // Moved from: the old direct CPU/GPU setup path where images were sampled ad hoc.
@@ -480,6 +1001,37 @@ void packImageWindow(const OFX::Image* image, const OfxRectI& renderWindow, std:
     }
 }
 
+// Packs a matte clip into a tightly packed RGBA float buffer for renderWindow, replicating one
+// matte value into all four channels. A null image, or a pixel with no address, leaves zeros.
+void packMatteWindow(const OFX::Image* image, const OfxRectI& renderWindow, std::vector<float>& packed)
+{
+    const int width = renderWindow.x2 - renderWindow.x1;
+    const int height = renderWindow.y2 - renderWindow.y1;
+    packed.assign(static_cast<size_t>(width) * static_cast<size_t>(height) * 4u, 0.0f);
+    if (image == nullptr) {
+        return;
+    }
+
+    // Alpha-only pixels hold a single float, so they must not take the RGB luminance path.
+    const OFX::PixelComponentEnum components = image->getPixelComponents();
+    const bool hasAlphaChannel = (components == OFX::ePixelComponentRGBA);
+    const bool isAlphaOnly = (components == OFX::ePixelComponentAlpha);
+    for (int y = renderWindow.y1; y < renderWindow.y2; ++y) {
+        for (int x = renderWindow.x1; x < renderWindow.x2; ++x) {
+            const float* sourcePixel = static_cast<const float*>(image->getPixelAddress(x, y));
+            if (sourcePixel == nullptr) {
+                continue;
+            }
+            const int localIndex = ((y - renderWindow.y1) * width + (x - renderWindow.x1)) * 4;
+            const float matte = hasAlphaChannel ? sourcePixel[3]
+                : isAlphaOnly ? sourcePixel[0]
+                : luminance(sourcePixel[0], sourcePixel[1], sourcePixel[2]);
+            packed[localIndex + 0] = packed[localIndex + 1] = matte;
+            packed[localIndex + 2] = packed[localIndex + 3] = matte;
+        }
+    }
+}
+
 // Companion to packImageWindow().
 //
 // Once the staged CUDA path finishes, we still need to write the result back through the host's
@@ -512,18 +1064,33 @@ BackendResult renderCpu(const RenderRequest& request)
     const int height = request.renderWindow.y2 - request.renderWindow.y1;
     std::vector<float> srcPacked;
     std::vector<float> screenPacked;
+    std::vector<float> backgroundPacked;
+    std::vector<float> garbagePacked;
+    std::vector<float> occlusionPacked;
     std::vector<float> dstPacked(static_cast<size_t>(width) * static_cast<size_t>(height) * 4u, 0.0f);
 
     packImageWindow(request.srcImage, request.renderWindow, srcPacked);
     if (request.params.useScreenInput && request.screenImage != nullptr) {
         packImageWindow(request.screenImage, request.renderWindow, screenPacked);
     }
+    if (request.params.bgWrapEnabled && request.backgroundImage != nullptr) {
+        packImageWindow(request.backgroundImage, request.renderWindow, backgroundPacked);
+    }
+    if (request.garbageMatteImage != nullptr) {
+        packMatteWindow(request.garbageMatteImage, request.renderWindow, garbagePacked);
+    }
+    if (request.occlusionMatteImage != nullptr) {
+        packMatteWindow(request.occlusionMatteImage, request.renderWindow, occlusionPacked);
+    }
 
     PackedFrame frame;
     frame.width = width;
     frame.height = height;
     frame.srcRgba = srcPacked.data();
     frame.screenRgba = screenPacked.empty() ? nullptr : screenPacked.data();
+    frame.backgroundRgba = backgroundPacked.empty() ? nullptr : backgroundPacked.data();
+    frame.garbageMatteRgba = garbagePacked.empty() ? nullptr : garbagePacked.data();
+    frame.occlusionMatteRgba = occlusionPacked.empty() ? nullptr : occlusionPacked.data();
     frame.dstRgba = dstPacked.data();
 
     // CPU is intentionally kept as the reference path. When GPU behavior differs, this
@@ -545,18 +1112,33 @@ BackendResult renderInternalCuda(const RenderRequest& request)
     const int height = request.renderWindow.y2 - request.renderWindow.y1;
     std::vector<float> srcPacked;
     std::vector<float> screenPacked;
+    std::vector<float> backgroundPacked;
+    std::vector<float> garbagePacked;
+    std::vector<float> occlusionPacked;
     std::vector<float> dstPacked(static_cast<size_t>(width) * static_cast<size_t>(height) * 4u, 0.0f);
 
     packImageWindow(request.srcImage, request.renderWindow, srcPacked);
     if (request.params.useScreenInput && request.screenImage != nullptr) {
         packImageWindow(request.screenImage, request.renderWindow, screenPacked);
     }
+    if (request.params.bgWrapEnabled && request.backgroundImage != nullptr) {
+        packImageWindow(request.backgroundImage, request.renderWindow, backgroundPacked);
+    }
+    if (request.garbageMatteImage != nullptr) {
+        packMatteWindow(request.garbageMatteImage, request.renderWindow, garbagePacked);
+    }
+    if (request.occlusionMatteImage != nullptr) {
+        packMatteWindow(request.occlusionMatteImage, request.renderWindow, occlusionPacked);
+    }
 
     PackedFrame frame;
     frame.width = width;
     frame.height = height;
     frame.srcRgba = srcPacked.data();
     frame.screenRgba = screenPacked.empty() ? nullptr : screenPacked.data();
+    frame.backgroundRgba = backgroundPacked.empty() ? nullptr : backgroundPacked.data();
+    frame.garbageMatteRgba = garbagePacked.empty() ? nullptr : garbagePacked.data();
+    frame.occlusionMatteRgba = occlusionPacked.empty() ? nullptr : occlusionPacked.data();
     frame.dstRgba = dstPacked.data();
 
     std::string error;
@@ -612,6 +1194,36 @@ BackendResult renderHostCuda(const RenderRequest& request)
         }
     }
 
+    if (request.params.bgWrapEnabled && request.backgroundImage != nullptr) {
+        if (frame.background.data == nullptr || frame.background.rowBytes == 0 || (frame.background.components != 3 && frame.background.components != 4)) {
+            return {
+                false,
+                BackendKind::HostCUDA,
+                "Host CUDA declined because the Background clip did not expose a usable RGB/RGBA device buffer."
+            };
+        }
+    }
+    if (request.garbageMatteImage != nullptr) {
+        if (frame.garbageMatte.data == nullptr || frame.garbageMatte.rowBytes == 0 || (frame.garbageMatte.components != 3 && frame.garbageMatte.components != 4)) {
+            return {false, BackendKind::HostCUDA, "Host CUDA declined because the Garbage Matte clip did not expose a usable RGB/RGBA device buffer."};
+        }
+    }
+    if (request.occlusionMatteImage != nullptr) {
+        if (frame.occlusionMatte.data == nullptr || frame.occlusionMatte.rowBytes == 0 || (frame.occlusionMatte.components != 3 && frame.occlusionMatte.components != 4)) {
+            return {false, BackendKind::HostCUDA, "Host CUDA declined because the Occlusion Matte clip did not expose a usable RGB/RGBA device buffer."};
+        }
+    }
+
+    logMessage(false, formatString(
+        "IBKeyer: HostCUDA zero-copy validated. prematte=%d rgbGuide=%d bgWrap=%d additive=%d garbage=%d occlusion=%d viewMode=%d",
+        request.params.prematteEnabled ? 1 : 0,
+        (request.params.guidedFilterEnabled && request.params.guidedFilterMode == 1) ? 1 : 0,
+        request.params.bgWrapEnabled ? 1 : 0,
+        request.params.additiveKeyEnabled ? 1 : 0,
+        request.garbageMatteImage != nullptr ? 1 : 0,
+        request.occlusionMatteImage != nullptr ? 1 : 0,
+        request.params.viewMode));
+
     std::string error;
     if (!renderCudaHost(request.params, frame, request.hostCudaStream, error)) {
         return {false, BackendKind::HostCUDA, error};
@@ -671,10 +1283,53 @@ BackendResult renderHostMetal(const RenderRequest& request)
         }
     }
 
-    // The shared Metal kernel still expects host-provided MTLBuffer handles. We keep that
-    // path only on macOS, and only when the host gave us a command queue and matching bounds.
-    const float nearGreySoftness = request.params.nearGreyAmount;
-    const float nearGreyAmount = request.params.nearGreyExtract ? 1.0f : 0.0f;
+    if (request.params.bgWrapEnabled && request.backgroundImage != nullptr) {
+        const OfxRectI& bgBounds = request.backgroundImage->getBounds();
+        if (bgBounds.x1 != srcBounds.x1 ||
+            bgBounds.y1 != srcBounds.y1 ||
+            bgBounds.x2 != srcBounds.x2 ||
+            bgBounds.y2 != srcBounds.y2) {
+            return {
+                false,
+                BackendKind::HostMetal,
+                "Host Metal requires the Background clip to match the Source bounds; mismatched host buffers fall back to CPU rather than guessing at per-host buffer layouts."
+            };
+        }
+
+        if (request.backgroundImage->getPixelComponents() != OFX::ePixelComponentRGBA) {
+            return {
+                false,
+                BackendKind::HostMetal,
+                "Host Metal currently assumes the Background clip arrives as RGBA device memory; RGB Background clips fall back to CPU so the wrapper does not lie about parity."
+            };
+        }
+    }
+
+    auto validateMetalMatteClip = [&](const OFX::Image* image, const char* name) -> BackendResult {
+        if (image == nullptr) {
+            return {true, BackendKind::HostMetal, ""};
+        }
+        const OfxRectI& matteBounds = image->getBounds();
+        if (matteBounds.x1 != srcBounds.x1 || matteBounds.y1 != srcBounds.y1 ||
+            matteBounds.x2 != srcBounds.x2 || matteBounds.y2 != srcBounds.y2) {
+            return {false, BackendKind::HostMetal, formatString("Host Metal requires the %s clip to match the Source bounds; mismatched host buffers fall back to CPU rather than guessing at per-host buffer layouts.", name)};
+        }
+        if (image->getPixelComponents() != OFX::ePixelComponentRGBA) {
+            return {false, BackendKind::HostMetal, formatString("Host Metal currently assumes the %s clip arrives as RGBA device memory; RGB mattes fall back to CPU so the wrapper does not misread host Metal buffers.", name)};
+        }
+        return {true, BackendKind::HostMetal, ""};
+    };
+
+    if (const BackendResult matte = validateMetalMatteClip(request.garbageMatteImage, "Garbage Matte"); !matte.success) {
+        return matte;
+    }
+    if (const BackendResult matte = validateMetalMatteClip(request.occlusionMatteImage, "Occlusion Matte"); !matte.success) {
+        return matte;
+    }
+
+    // The shared Metal kernel already had support for the richer IBKeymaster control surface.
+    // The cross-platform wrapper was previously hard-coding neutral placeholders here, which made
+    // macOS look feature-complete in code while quietly disabling those controls in practice.
     const int width = srcBounds.x2 - srcBounds.x1;
     const int height = srcBounds.y2 - srcBounds.y1;
 
@@ -694,26 +1349,45 @@ BackendResult renderHostMetal(const RenderRequest& request)
         request.params.respillB,
         request.params.premultiply ? 1 : 0,
         request.params.nearGreyExtract ? 1 : 0,
-        nearGreyAmount,
-        nearGreySoftness,
+        request.params.nearGreyAmount,
+        request.params.nearGreySoftness,
         request.params.blackClip,
         request.params.whiteClip,
-        1.0f,
+        request.params.matteGamma,
+        request.params.prematteEnabled ? 1 : 0,
+        request.params.prematteBlur,
+        request.params.prematteErode,
+        request.params.prematteIterations,
         request.params.guidedFilterEnabled ? 1 : 0,
+        request.params.guidedFilterMode,
         request.params.guidedRadius,
         request.params.guidedEpsilon,
         request.params.guidedMix,
-        0.0f,
-        1,
-        0.0f,
-        0,
-        0,
-        0.0f,
+        request.params.edgeProtect,
+        request.params.refineIterations,
+        request.params.edgeColorCorrect,
+        request.params.bgWrapEnabled ? 1 : 0,
+        request.params.bgWrapBlur,
+        request.params.bgWrapAmount,
+        request.params.additiveKeyEnabled ? 1 : 0,
+        request.params.additiveKeyMode,
+        request.params.additiveKeySaturation,
+        request.params.additiveKeyAmount,
+        request.params.additiveKeyBlackClamp ? 1 : 0,
+        request.params.viewMode,
         static_cast<const float*>(request.srcImage->getPixelData()),
         (request.params.useScreenInput && request.screenImage != nullptr)
             ? static_cast<const float*>(request.screenImage->getPixelData())
             : nullptr,
-        nullptr,
+        (request.params.bgWrapEnabled && request.backgroundImage != nullptr)
+            ? static_cast<const float*>(request.backgroundImage->getPixelData())
+            : nullptr,
+        (request.garbageMatteImage != nullptr)
+            ? static_cast<const float*>(request.garbageMatteImage->getPixelData())
+            : nullptr,
+        (request.occlusionMatteImage != nullptr)
+            ? static_cast<const float*>(request.occlusionMatteImage->getPixelData())
+            : nullptr,
         static_cast<float*>(request.dstImage->getPixelData()));
 
     return {true, BackendKind::HostMetal, "Rendered with the host Metal backend."};
@@ -725,6 +1399,8 @@ BackendResult renderHostMetal(const RenderRequest& request)
 // hard to debug. This helper makes the decision and the reason visible in one place.
 BackendKind chooseBackend(const RenderRequest& request, std::string& reason)
 {
+    const bool needsReferenceOnly = requiresReferenceOnlyFeatures(request);
+
     if (envFlagEnabled("IBKEYER_FORCE_CPU")) {
         reason = "IBKEYER_FORCE_CPU forced the reference CPU path.";
         return BackendKind::CPU;
@@ -740,12 +1416,22 @@ BackendKind chooseBackend(const RenderRequest& request, std::string& reason)
     const CudaRenderMode cudaMode = selectedCudaRenderMode();
     if (request.hostCudaEnabled) {
         if (request.hostCudaStream != nullptr) {
-            reason = "The host enabled OFX CUDA render and supplied a CUDA stream, so IBKeyer must stay on the host-CUDA memory path.";
+            // This used to branch on "reference-only" features. Once prematte, external mattes,
+            // RGB guide mode, additive key, and diagnostics landed in CUDA, keeping that older
+            // wording became actively misleading during zero-copy debugging.
+            reason = "The host enabled OFX CUDA render and supplied a CUDA stream, so IBKeyer stays on the host-CUDA memory path.";
             return BackendKind::HostCUDA;
         }
         reason = "The host enabled OFX CUDA render but did not supply a CUDA stream. That leaves no safe CPU-readable fallback for the CUDA images.";
         return BackendKind::HostCUDA;
     }
+    if (needsReferenceOnly) {
+        // These newer private-branch features were ported CPU-first so the result stays trustworthy
+        // while the shared CUDA implementation catches up. This is only safe when the host has not
+        // already switched fetchImage() over to CUDA device memory.
+        reason = "The requested feature set currently relies on the CPU reference path on Windows/Linux to preserve parity (prematte, external mattes, additive key, RGB guided filter, or diagnostic views).";
+        return BackendKind::CPU;
+    }
     if (cudaMode == CudaRenderMode::HostPreferred &&
         request.hostCudaStream != nullptr) {
         reason = "Host CUDA is the selected policy and the host supplied CUDA device images plus a CUDA stream, so the zero-copy path is preferred.";
diff --git a/IBKeymaster/IBKeyerBackend.h b/IBKeymaster/IBKeyerBackend.h
index 9fd7493..18c15fa 100644
--- a/IBKeymaster/IBKeyerBackend.h
+++ b/IBKeymaster/IBKeyerBackend.h
@@ -43,6 +43,9 @@ struct DeviceRenderFrame
 {
     ImagePlaneDesc src;
     ImagePlaneDesc screen;
+    ImagePlaneDesc background;
+    ImagePlaneDesc garbageMatte;
+    ImagePlaneDesc occlusionMatte;
     MutableImagePlaneDesc dst;
     OfxRectI renderWindow = {0, 0, 0, 0};
 };
@@ -51,6 +54,9 @@ struct RenderRequest
 {
     const OFX::Image* srcImage = nullptr;
     const OFX::Image* screenImage = nullptr;
+    const OFX::Image* backgroundImage = nullptr;
+    const OFX::Image* garbageMatteImage = nullptr;
+    const OFX::Image* occlusionMatteImage = nullptr;
     OFX::Image* dstImage = nullptr;
     OfxRectI renderWindow = {0, 0, 0, 0};
     bool hostCudaEnabled = false;
diff --git a/IBKeymaster/IBKeyerCuda.cu b/IBKeymaster/IBKeyerCuda.cu
index ffc4b16..e2dffb2 100644
--- a/IBKeymaster/IBKeyerCuda.cu
+++ b/IBKeymaster/IBKeyerCuda.cu
@@ -44,12 +44,36 @@ bool hostCudaForceSyncEnabled()
 struct CudaScratchCache
 {
     float* rawAlpha = nullptr;
+    float* savedRawAlpha = nullptr;
     float* guide = nullptr;
     float* meanI = nullptr;
     float* meanP = nullptr;
     float* meanIp = nullptr;
     float* meanII = nullptr;
     float* scratch = nullptr;
+    float* bgR = nullptr;
+    float* bgG = nullptr;
+    float* bgB = nullptr;
+    float* prematteAlpha = nullptr;
+    float* cleanPlate = nullptr;
+
+    float* rgbMeanIr = nullptr;
+    float* rgbMeanIg = nullptr;
+    float* rgbMeanIb = nullptr;
+    float* rgbMeanP = nullptr;
+    float* rgbIrIr = nullptr;
+    float* rgbIrIg = nullptr;
+    float* rgbIrIb = nullptr;
+    float* rgbIgIg = nullptr;
+    float* rgbIgIb = nullptr;
+    float* rgbIbIb = nullptr;
+    float* rgbIrP = nullptr;
+    float* rgbIgP = nullptr;
+    float* rgbIbP = nullptr;
+    float* rgbMeanAr = nullptr;
+    float* rgbMeanAg = nullptr;
+    float* rgbMeanAb = nullptr;
+    float* rgbMeanB = nullptr;
     int pixelCapacity = 0;
 
     float* gaussianWeights = nullptr;
@@ -61,24 +85,70 @@ struct CudaScratchCache
     void release()
     {
         cudaFree(gaussianWeights);
+        cudaFree(rgbMeanB);
+        cudaFree(rgbMeanAb);
+        cudaFree(rgbMeanAg);
+        cudaFree(rgbMeanAr);
+        cudaFree(rgbIbP);
+        cudaFree(rgbIgP);
+        cudaFree(rgbIrP);
+        cudaFree(rgbIbIb);
+        cudaFree(rgbIgIb);
+        cudaFree(rgbIgIg);
+        cudaFree(rgbIrIb);
+        cudaFree(rgbIrIg);
+        cudaFree(rgbIrIr);
+        cudaFree(rgbMeanP);
+        cudaFree(rgbMeanIb);
+        cudaFree(rgbMeanIg);
+        cudaFree(rgbMeanIr);
+        cudaFree(cleanPlate);
+        cudaFree(prematteAlpha);
+        cudaFree(bgB);
+        cudaFree(bgG);
+        cudaFree(bgR);
         cudaFree(scratch);
         cudaFree(meanII);
         cudaFree(meanIp);
         cudaFree(meanP);
         cudaFree(meanI);
         cudaFree(guide);
+        cudaFree(savedRawAlpha);
         cudaFree(rawAlpha);
         if (inFlightEvent != nullptr) {
             cudaEventDestroy(inFlightEvent);
         }
 
         gaussianWeights = nullptr;
+        rgbMeanB = nullptr;
+        rgbMeanAb = nullptr;
+        rgbMeanAg = nullptr;
+        rgbMeanAr = nullptr;
+        rgbIbP = nullptr;
+        rgbIgP = nullptr;
+        rgbIrP = nullptr;
+        rgbIbIb = nullptr;
+        rgbIgIb = nullptr;
+        rgbIgIg = nullptr;
+        rgbIrIb = nullptr;
+        rgbIrIg = nullptr;
+        rgbIrIr = nullptr;
+        rgbMeanP = nullptr;
+        rgbMeanIb = nullptr;
+        rgbMeanIg = nullptr;
+        rgbMeanIr = nullptr;
+        cleanPlate = nullptr;
+        prematteAlpha = nullptr;
+        bgB = nullptr;
+        bgG = nullptr;
+        bgR = nullptr;
         scratch = nullptr;
         meanII = nullptr;
         meanIp = nullptr;
         meanP = nullptr;
         meanI = nullptr;
         guide = nullptr;
+        savedRawAlpha = nullptr;
         rawAlpha = nullptr;
         pixelCapacity = 0;
         weightRadius = -1;
@@ -93,30 +163,100 @@ struct CudaScratchCache
             return true;
         }
 
+        cudaFree(bgB);
+        cudaFree(bgG);
+        cudaFree(bgR);
+        cudaFree(prematteAlpha);
+        cudaFree(cleanPlate);
+        cudaFree(rgbMeanB);
+        cudaFree(rgbMeanAb);
+        cudaFree(rgbMeanAg);
+        cudaFree(rgbMeanAr);
+        cudaFree(rgbIbP);
+        cudaFree(rgbIgP);
+        cudaFree(rgbIrP);
+        cudaFree(rgbIbIb);
+        cudaFree(rgbIgIb);
+        cudaFree(rgbIgIg);
+        cudaFree(rgbIrIb);
+        cudaFree(rgbIrIg);
+        cudaFree(rgbIrIr);
+        cudaFree(rgbMeanP);
+        cudaFree(rgbMeanIb);
+        cudaFree(rgbMeanIg);
+        cudaFree(rgbMeanIr);
         cudaFree(scratch);
         cudaFree(meanII);
         cudaFree(meanIp);
         cudaFree(meanP);
         cudaFree(meanI);
         cudaFree(guide);
+        cudaFree(savedRawAlpha);
         cudaFree(rawAlpha);
 
+        bgB = nullptr;
+        bgG = nullptr;
+        bgR = nullptr;
+        prematteAlpha = nullptr;
+        cleanPlate = nullptr;
+        rgbMeanB = nullptr;
+        rgbMeanAb = nullptr;
+        rgbMeanAg = nullptr;
+        rgbMeanAr = nullptr;
+        rgbIbP = nullptr;
+        rgbIgP = nullptr;
+        rgbIrP = nullptr;
+        rgbIbIb = nullptr;
+        rgbIgIb = nullptr;
+        rgbIgIg = nullptr;
+        rgbIrIb = nullptr;
+        rgbIrIg = nullptr;
+        rgbIrIr = nullptr;
+        rgbMeanP = nullptr;
+        rgbMeanIb = nullptr;
+        rgbMeanIg = nullptr;
+        rgbMeanIr = nullptr;
         scratch = nullptr;
         meanII = nullptr;
         meanIp = nullptr;
         meanP = nullptr;
         meanI = nullptr;
         guide = nullptr;
+        savedRawAlpha = nullptr;
         rawAlpha = nullptr;
 
         const size_t channelBytes = static_cast<size_t>(pixelCount) * sizeof(float);
+        const size_t rgbaBytes = static_cast<size_t>(pixelCount) * 4u * sizeof(float);
         if (cudaMalloc(&rawAlpha, channelBytes) != cudaSuccess ||
+            cudaMalloc(&savedRawAlpha, channelBytes) != cudaSuccess ||
             cudaMalloc(&guide, channelBytes) != cudaSuccess ||
             cudaMalloc(&meanI, channelBytes) != cudaSuccess ||
             cudaMalloc(&meanP, channelBytes) != cudaSuccess ||
             cudaMalloc(&meanIp, channelBytes) != cudaSuccess ||
             cudaMalloc(&meanII, channelBytes) != cudaSuccess ||
-            cudaMalloc(&scratch, channelBytes) != cudaSuccess) {
+            cudaMalloc(&scratch, channelBytes) != cudaSuccess ||
+            cudaMalloc(&bgR, channelBytes) != cudaSuccess ||
+            cudaMalloc(&bgG, channelBytes) != cudaSuccess ||
+            cudaMalloc(&bgB, channelBytes) != cudaSuccess ||
+            cudaMalloc(&prematteAlpha, channelBytes) != cudaSuccess ||
+            cudaMalloc(&cleanPlate, rgbaBytes) != cudaSuccess ||
+            cudaMalloc(&rgbMeanIr, channelBytes) != cudaSuccess ||
+            cudaMalloc(&rgbMeanIg, channelBytes) != cudaSuccess ||
+            cudaMalloc(&rgbMeanIb, channelBytes) != cudaSuccess ||
+            cudaMalloc(&rgbMeanP, channelBytes) != cudaSuccess ||
+            cudaMalloc(&rgbIrIr, channelBytes) != cudaSuccess ||
+            cudaMalloc(&rgbIrIg, channelBytes) != cudaSuccess ||
+            cudaMalloc(&rgbIrIb, channelBytes) != cudaSuccess ||
+            cudaMalloc(&rgbIgIg, channelBytes) != cudaSuccess ||
+            cudaMalloc(&rgbIgIb, channelBytes) != cudaSuccess ||
+            cudaMalloc(&rgbIbIb, channelBytes) != cudaSuccess ||
+            cudaMalloc(&rgbIrP, channelBytes) != cudaSuccess ||
+            cudaMalloc(&rgbIgP, channelBytes) != cudaSuccess ||
+            cudaMalloc(&rgbIbP, channelBytes) != cudaSuccess ||
+            cudaMalloc(&rgbMeanAr, channelBytes) != cudaSuccess ||
+            cudaMalloc(&rgbMeanAg, channelBytes) != cudaSuccess ||
+            cudaMalloc(&rgbMeanAb, channelBytes) != cudaSuccess ||
+            cudaMalloc(&rgbMeanB, channelBytes) != cudaSuccess) {
             error = "Failed to allocate CUDA scratch buffers for the guided filter.";
             release();
             return false;
@@ -285,6 +425,24 @@ IBKEYER_HOST_DEVICE inline void sampleRgb(const ImagePlaneDesc& image, int x, in
     b = pixel[2];
 }
 
+// Returns the matte value at (x, y) after clamp01(): the fourth channel when the plane has four or
+// more components, otherwise the luminance of its first three. An unaddressable pixel yields 0.
+IBKEYER_HOST_DEVICE inline float sampleMatteValue(const ImagePlaneDesc& image, int x, int y)
+{
+    const float* texel = pixelAddress(image, x, y);
+    if (texel == nullptr) {
+        return 0.0f;
+    }
+    const float value = (image.components >= 4) ? texel[3] : luminance(texel[0], texel[1], texel[2]);
+    return clamp01(value);
+}
+
+IBKEYER_HOST_DEVICE inline float smoothstep01(float value)
+{
+    const float t = clamp01(value);  // the easing below is applied to the clamp01() result
+    return t * t * (3.0f - 2.0f * t);  // cubic ease: 3t^2 - 2t^3
+}
+
 IBKEYER_HOST_DEVICE inline void storeRgba(const MutableImagePlaneDesc& image, int x, int y, float r, float g, float b, float a)
 {
     float* pixel = pixelAddress(image, x, y);
@@ -352,18 +510,20 @@ __global__ void coreKernel(IBKeyerParams params,
     float alpha = clamp01(1.0f - normalized);
 
     // 5. Near Grey Extraction (optional).
-    if (params.nearGreyExtract) {
+    if (params.nearGreyExtract && params.nearGreyAmount > 0.0f) {
         const float divR = safeDivide(ssR, srcR);
         const float divG = safeDivide(ssG, srcG);
         const float divB = safeDivide(ssB, srcB);
-        const float ngeAlpha = nearGreyAlpha(divR, divG, divB, params.screenColor, params.nearGreyAmount);
-        // Screen composite: a + b - a*b.
-        alpha = ngeAlpha + alpha - ngeAlpha * alpha;
+        const float ngeAlpha = nearGreyAlpha(divR, divG, divB, params.screenColor, params.nearGreySoftness);
+        alpha = alpha + params.nearGreyAmount * ngeAlpha * (1.0f - alpha);
     }
 
     if (params.whiteClip > params.blackClip + 1e-6f) {
         alpha = clamp01((alpha - params.blackClip) / (params.whiteClip - params.blackClip));
     }
+    if (params.matteGamma != 1.0f && alpha > 0.0f && alpha < 1.0f) {
+        alpha = powf(alpha, params.matteGamma);
+    }
 
     // 6. Output = screen-subtracted + respill.
     const float respillMul = despillScreen * normalized > 0.0f ? despillScreen * normalized : 0.0f;
@@ -376,7 +536,8 @@ __global__ void coreKernel(IBKeyerParams params,
               alpha);
 
     rawAlpha[pixelIndex] = alpha;
-    guide[pixelIndex] = luminance(srcR, srcG, srcB);
+    const float lum = luminance(srcR, srcG, srcB);
+    guide[pixelIndex] = lum * (1.0f - params.edgeProtect) + alpha * params.edgeProtect;
 }
 
 __global__ void computeProductsKernel(int pixelCount,
@@ -400,6 +561,15 @@ __global__ void computeProductsKernel(int pixelCount,
     meanII[index] = I * I;
 }
 
+// Element-wise copy of pixelCount floats from src to dst, one element per thread.
+__global__ void copyBufferKernel(int pixelCount, const float* src, float* dst)
+{
+    const int element = blockIdx.x * blockDim.x + threadIdx.x;
+    // Threads whose index falls at or beyond pixelCount do nothing.
+    if (element < pixelCount) {
+        dst[element] = src[element];
+    }
+}
+
 __global__ void gaussianBlurHorizontalKernel(int width,
                                              int height,
                                              int radius,
@@ -462,6 +632,48 @@ __global__ void guidedCoeffKernel(int pixelCount,
     meanP[index] = b;
 }
 
+// Rewrites the guide buffer from the current alpha: each pixel becomes a blend, weighted by
+// edgeProtect, of the luminance of (source RGB * alpha) and the alpha value itself.
+__global__ void refineGuideKernel(int width,
+                                  int height,
+                                  ImagePlaneDesc src,
+                                  int renderX1,
+                                  int renderY1,
+                                  float edgeProtect,
+                                  const float* alphaBuffer,
+                                  float* guideBuffer)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) {
+        return;
+    }
+
+    // Alpha/guide use window-local indices; the source is sampled offset by renderX1/renderY1.
+    const int offset = row * width + col;
+    const float coverage = alphaBuffer[offset];
+    float red = 0.0f;
+    float green = 0.0f;
+    float blue = 0.0f;
+    sampleRgb(src, renderX1 + col, renderY1 + row, red, green, blue);
+
+    const float weightedLum = luminance(red * coverage, green * coverage, blue * coverage);
+    guideBuffer[offset] = weightedLum * (1.0f - edgeProtect) + coverage * edgeProtect;
+}
+
+// Evaluates dst = clamp01(meanA * guide + meanB) per element over pixelCount values.
+__global__ void guidedEvalKernel(int pixelCount,
+                                 const float* guide,
+                                 const float* meanA,
+                                 const float* meanB,
+                                 float* dst)
+{
+    const int element = blockIdx.x * blockDim.x + threadIdx.x;
+    if (element < pixelCount) {
+        dst[element] = clamp01(meanA[element] * guide[element] + meanB[element]);
+    }
+}
+
 __global__ void guidedApplyKernel(IBKeyerParams params,
                                   MutableImagePlaneDesc dst,
                                   int renderX1,
@@ -498,6 +710,77 @@ __global__ void guidedApplyKernel(IBKeyerParams params,
     pixel[3] = alpha;
 }
 
+// Edge colour correction for semi-transparent output pixels. Estimates the un-mixed
+// foreground as fg = (src - screen * (1 - alpha)) / alpha, limits it to [-0.5, 2], and moves
+// the current output colour toward it by alpha * (1 - alpha) * 4 * params.edgeColorCorrect.
+// Alpha is read from dst and left unchanged; only RGB is rewritten.
+__global__ void edgeColorCorrectKernel(IBKeyerParams params,
+                                       ImagePlaneDesc src,
+                                       ImagePlaneDesc screen,
+                                       MutableImagePlaneDesc dst,
+                                       int renderX1,
+                                       int renderY1,
+                                       int width,
+                                       int height)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) return;
+
+    const int px = renderX1 + col;
+    const int py = renderY1 + row;
+    float* rgba = pixelAddress(dst, px, py);
+    if (rgba == nullptr) return;
+
+    // Pixels at or beyond these alpha thresholds are skipped, which also bounds 1 / alpha.
+    const float a = rgba[3];
+    if (a <= 0.005f || a >= 0.995f) return;
+
+    float inR = 0.0f;
+    float inG = 0.0f;
+    float inB = 0.0f;
+    sampleRgb(src, px, py, inR, inG, inB);
+
+    // Screen colour starts as the picked colour; an enabled, bound screen plate overrides it.
+    float keyR = params.pickR;
+    float keyG = params.pickG;
+    float keyB = params.pickB;
+    if (params.useScreenInput && screen.data != nullptr) {
+        sampleRgb(screen, px, py, keyR, keyG, keyB);
+    }
+
+    // Remove the screen contribution from the source and limit the estimate to [-0.5, 2].
+    const float invA = 1.0f / a;
+    const float fgR = fmaxf(-0.5f, fminf(2.0f, (inR - keyR * (1.0f - a)) * invA));
+    const float fgG = fmaxf(-0.5f, fminf(2.0f, (inG - keyG * (1.0f - a)) * invA));
+    const float fgB = fmaxf(-0.5f, fminf(2.0f, (inB - keyB * (1.0f - a)) * invA));
+
+    // When the output is premultiplied, divide alpha back out before blending.
+    float curR = rgba[0];
+    float curG = rgba[1];
+    float curB = rgba[2];
+    if (params.premultiply) {
+        curR *= invA;
+        curG *= invA;
+        curB *= invA;
+    }
+
+    // The blend weight is zero at alpha 0 or 1 and largest at alpha 0.5.
+    const float edgeFactor = a * (1.0f - a) * 4.0f * params.edgeColorCorrect;
+    float outR = curR + (fgR - curR) * edgeFactor;
+    float outG = curG + (fgG - curG) * edgeFactor;
+    float outB = curB + (fgB - curB) * edgeFactor;
+    if (params.premultiply) {
+        outR *= a;
+        outG *= a;
+        outB *= a;
+    }
+
+    rgba[0] = outR;
+    rgba[1] = outG;
+    rgba[2] = outB;
+}
+
 __global__ void premultiplyKernel(MutableImagePlaneDesc dst,
                                   int renderX1,
                                   int renderY1,
@@ -523,6 +806,530 @@ __global__ void premultiplyKernel(MutableImagePlaneDesc dst,
     pixel[2] *= alpha;
 }
 
+// Splits the background image into three planar float buffers. Each thread samples the
+// background at (renderX1 + x, renderY1 + y) via sampleRgb and stores the three channels
+// at row * width + col in outR / outG / outB.
+__global__ void extractBackgroundChannelsKernel(int width,
+                                                int height,
+                                                ImagePlaneDesc background,
+                                                int renderX1,
+                                                int renderY1,
+                                                float* outR,
+                                                float* outG,
+                                                float* outB)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) return;
+
+    float r = 0.0f;
+    float g = 0.0f;
+    float b = 0.0f;
+    sampleRgb(background, renderX1 + col, renderY1 + row, r, g, b);
+
+    // Planar outputs are addressed relative to the render window, not the image bounds.
+    const int flat = row * width + col;
+    outR[flat] = r;
+    outG[flat] = g;
+    outB[flat] = b;
+}
+
+// Adds background colour from the planar bgR / bgG / bgB buffers onto dst RGB. The per-pixel
+// weight is alpha * (1 - alpha) * 4 * amount, using the alpha already stored in dst, so the
+// weight is zero where alpha is exactly 0 or 1. Alpha itself is not modified.
+__global__ void bgWrapKernel(MutableImagePlaneDesc dst,
+                             int renderX1,
+                             int renderY1,
+                             int width,
+                             int height,
+                             float amount,
+                             const float* bgR,
+                             const float* bgG,
+                             const float* bgB)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) return;
+
+    float* rgba = pixelAddress(dst, renderX1 + col, renderY1 + row);
+    if (rgba == nullptr) {
+        return;
+    }
+
+    // Background planes are indexed relative to the render window.
+    const float a = rgba[3];
+    const float gain = a * (1.0f - a) * 4.0f * amount;
+    const int flat = row * width + col;
+    rgba[0] += bgR[flat] * gain;
+    rgba[1] += bgG[flat] * gain;
+    rgba[2] += bgB[flat] * gain;
+}
+
+// Combines an external matte with the keyed alpha. The result is clamped with clamp01 and
+// written both to the rawAlpha scratch plane and to the alpha channel of dst; dst RGB is not
+// touched. Threads whose dst pixel address is null return without updating either buffer.
+// mode selects how the matte is combined (see the comment at the combine step).
+__global__ void applyMatteKernel(float* rawAlpha,
+                                 MutableImagePlaneDesc dst,
+                                 ImagePlaneDesc matte,
+                                 int renderX1,
+                                 int renderY1,
+                                 int width,
+                                 int height,
+                                 int mode)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) return;
+
+    const int px = renderX1 + col;
+    const int py = renderY1 + row;
+    float* out = pixelAddress(dst, px, py);
+    if (out == nullptr) return;
+
+    const int flat = row * width + col;
+    const float m = sampleMatteValue(matte, px, py);
+    const float keyed = rawAlpha[flat];
+    // mode 0: the matte removes alpha (alpha * (1 - matte)).
+    // any other mode: the matte forces alpha up (max(alpha, matte)).
+    const float combined = (mode == 0) ? keyed * (1.0f - m) : fmaxf(keyed, m);
+
+    // Keep the scratch plane and the output alpha channel in agreement.
+    const float result = clamp01(combined);
+    rawAlpha[flat] = result;
+    out[3] = result;
+}
+
+// Diagnostic view of the raw matte. For each pixel in the render window, the rawAlpha value
+// is passed to storeRgba as (a, a, a, 1), i.e. the matte replicated into the three colour
+// arguments with a constant 1.0 as the fourth.
+__global__ void writeAlphaDiagnosticKernel(const float* rawAlpha,
+                                           MutableImagePlaneDesc dst,
+                                           int renderX1,
+                                           int renderY1,
+                                           int width,
+                                           int height)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) return;
+
+    // rawAlpha is window-relative; dst is addressed in image coordinates.
+    const int flat = row * width + col;
+    const float matte = rawAlpha[flat];
+    storeRgba(dst, renderX1 + col, renderY1 + row, matte, matte, matte, 1.0f);
+}
+
+// Diagnostic view of the alpha currently stored in dst. Works in place: each addressable
+// pixel in the render window has its alpha copied into R, G and B, after which its alpha is
+// set to 1.0.
+__global__ void extractOutputAlphaDiagnosticKernel(MutableImagePlaneDesc dst,
+                                                   int renderX1,
+                                                   int renderY1,
+                                                   int width,
+                                                   int height)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) return;
+
+    float* rgba = pixelAddress(dst, renderX1 + col, renderY1 + row);
+    if (rgba == nullptr) {
+        return;
+    }
+
+    // Read alpha first: the channel is overwritten below.
+    const float matte = rgba[3];
+    rgba[0] = matte;
+    rgba[1] = matte;
+    rgba[2] = matte;
+    rgba[3] = 1.0f;
+}
+
+// Copies src into dst over the render window, pixel for pixel. Pixels for which either
+// image returns a null address are skipped. RGB is copied as-is; alpha is copied only when
+// src has at least four components and is otherwise written as 1.0.
+__global__ void copyImageKernel(ImagePlaneDesc src,
+                                MutableImagePlaneDesc dst,
+                                int renderX1,
+                                int renderY1,
+                                int width,
+                                int height)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) return;
+
+    const int px = renderX1 + col;
+    const int py = renderY1 + row;
+    const float* from = pixelAddress(src, px, py);
+    float* to = pixelAddress(dst, px, py);
+    if (from == nullptr || to == nullptr) return;
+
+    // NOTE(review): reads three channels from src unconditionally; assumes >= 3 components.
+    to[0] = from[0];
+    to[1] = from[1];
+    to[2] = from[2];
+    to[3] = (src.components < 4) ? 1.0f : from[3];
+}
+
+// Interleaves three planar colour buffers into dst: each window pixel is handed to
+// storeRgba as (srcR, srcG, srcB, 1.0) at image position (renderX1 + x, renderY1 + y).
+__global__ void packRgbKernel(MutableImagePlaneDesc dst,
+                              int renderX1,
+                              int renderY1,
+                              int width,
+                              int height,
+                              const float* srcR,
+                              const float* srcG,
+                              const float* srcB)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) return;
+    const int flat = row * width + col;
+    storeRgba(dst, renderX1 + col, renderY1 + row, srcR[flat], srcG[flat], srcB[flat], 1.0f);
+}
+
+// Square-window minimum filter: dst = min over a (2 * radius + 1)^2 neighbourhood of src, with
+// coordinates clamped to the plane edges. The running minimum starts at 1.0, capping the result.
+__global__ void erodeAlphaKernel(const float* src,
+                                 float* dst,
+                                 int width,
+                                 int height,
+                                 int radius)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) return;
+
+    float lowest = 1.0f;
+    for (int oy = -radius; oy <= radius; ++oy) {
+        const int rowBase = max(0, min(height - 1, row + oy)) * width;
+        for (int ox = -radius; ox <= radius; ++ox) {
+            const int sx = max(0, min(width - 1, col + ox));
+            lowest = fminf(lowest, src[rowBase + sx]);
+        }
+    }
+    dst[row * width + col] = lowest;
+}
+
+// Per-pixel clean-plate estimate written to three planar buffers: the source colour is blended
+// toward the picked colour by t = smoothstep01(alpha), i.e. out = src * (1 - t) + pick * t.
+__global__ void cleanPlateEstimateKernel(ImagePlaneDesc src,
+                                         int renderX1,
+                                         int renderY1,
+                                         int width,
+                                         int height,
+                                         float pickR,
+                                         float pickG,
+                                         float pickB,
+                                         const float* alpha,
+                                         float* outR,
+                                         float* outG,
+                                         float* outB)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) return;
+
+    float r = 0.0f;
+    float g = 0.0f;
+    float b = 0.0f;
+    sampleRgb(src, renderX1 + col, renderY1 + row, r, g, b);
+
+    // The higher the alpha, the more of the picked colour replaces the source colour.
+    const int flat = row * width + col;
+    const float blend = smoothstep01(alpha[flat]);
+    outR[flat] = r * (1.0f - blend) + pickR * blend;
+    outG[flat] = g * (1.0f - blend) + pickG * blend;
+    outB[flat] = b * (1.0f - blend) + pickB * blend;
+}
+
+// Interleaves three planar colour buffers into a tightly packed RGBA float buffer
+// (four floats per pixel, row * width + col ordering) with alpha fixed at 1.0.
+__global__ void packCleanPlateKernel(int width,
+                                     int height,
+                                     const float* srcR,
+                                     const float* srcG,
+                                     const float* srcB,
+                                     float* cleanPlate)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) return;
+    const int flat = row * width + col;
+    float* texel = cleanPlate + flat * 4;
+    texel[0] = srcR[flat];
+    texel[1] = srcG[flat];
+    texel[2] = srcB[flat];
+    texel[3] = 1.0f;
+}
+
+// Seeds the colour guided-filter planes: per pixel, stores the source colour I, the input p
+// (rawAlpha), the six unique products I_a * I_b, and the three products I_a * p.
+__global__ void rgbComputeProductsKernel(ImagePlaneDesc src,
+                                         int renderX1,
+                                         int renderY1,
+                                         int width,
+                                         int height,
+                                         const float* rawAlpha,
+                                         float* meanIr,
+                                         float* meanIg,
+                                         float* meanIb,
+                                         float* meanP,
+                                         float* irir,
+                                         float* irig,
+                                         float* irib,
+                                         float* igig,
+                                         float* igib,
+                                         float* ibib,
+                                         float* irp,
+                                         float* igp,
+                                         float* ibp)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) return;
+    const int i = row * width + col;
+    float r = 0.0f;
+    float g = 0.0f;
+    float b = 0.0f;
+    sampleRgb(src, renderX1 + col, renderY1 + row, r, g, b);
+    const float a = rawAlpha[i];
+    meanIr[i] = r;
+    meanIg[i] = g;
+    meanIb[i] = b;
+    meanP[i] = a;
+    irir[i] = r * r;
+    irig[i] = r * g;
+    irib[i] = r * b;
+    igig[i] = g * g;
+    igib[i] = g * b;
+    ibib[i] = b * b;
+    irp[i] = r * a;
+    igp[i] = g * a;
+    ibp[i] = b * a;
+}
+
+// Solves the per-pixel colour guided-filter system (Sigma + eps * Id) * a = cov(I, p) and
+// writes a = (outAr, outAg, outAb) plus the offset outB = mean(p) - a . mean(I).
+__global__ void rgbGuidedCoeffKernel(int pixelCount,
+                                     float epsilon,
+                                     const float* meanIr,
+                                     const float* meanIg,
+                                     const float* meanIb,
+                                     const float* meanP,
+                                     const float* irir,
+                                     const float* irig,
+                                     const float* irib,
+                                     const float* igig,
+                                     const float* igib,
+                                     const float* ibib,
+                                     const float* irp,
+                                     const float* igp,
+                                     const float* ibp,
+                                     float* outAr,
+                                     float* outAg,
+                                     float* outAb,
+                                     float* outB)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= pixelCount) return;
+
+    const float muR = meanIr[i];
+    const float muG = meanIg[i];
+    const float muB = meanIb[i];
+    const float muP = meanP[i];
+    // Guide covariance Sigma = E[I I^T] - E[I] E[I]^T; symmetric, so six unique entries.
+    float varRR = irir[i] - muR * muR;
+    const float varRG = irig[i] - muR * muG;
+    const float varRB = irib[i] - muR * muB;
+    float varGG = igig[i] - muG * muG;
+    const float varGB = igib[i] - muG * muB;
+    float varBB = ibib[i] - muB * muB;
+    // Covariance between each guide channel and the input p.
+    const float covRP = irp[i] - muR * muP;
+    const float covGP = igp[i] - muG * muP;
+    const float covBP = ibp[i] - muB * muP;
+    // Diagonal regulariser that shrinks as the mean channel variance (trace / 3) grows.
+    const float trace = varRR + varGG + varBB;
+    const float adaptEps = epsilon * epsilon / ((trace / 3.0f) + epsilon + 1e-10f);
+    varRR += adaptEps;
+    varGG += adaptEps;
+    varBB += adaptEps;
+
+    const float det = varRR * (varGG * varBB - varGB * varGB)
+                    - varRG * (varRG * varBB - varGB * varRB)
+                    + varRB * (varRG * varGB - varGG * varRB);
+    const float invDet = (fabsf(det) > 1e-12f) ? (1.0f / det) : 0.0f;
+    // Adjugate-based inverse; with invDet == 0 every coefficient is 0 and outB becomes mean(p).
+    const float invRR = (varGG * varBB - varGB * varGB) * invDet;
+    const float invRG = (varRB * varGB - varRG * varBB) * invDet;
+    const float invRB = (varRG * varGB - varRB * varGG) * invDet;
+    const float invGG = (varRR * varBB - varRB * varRB) * invDet;
+    const float invGB = (varRB * varRG - varRR * varGB) * invDet;
+    const float invBB = (varRR * varGG - varRG * varRG) * invDet;
+
+    const float coefR = invRR * covRP + invRG * covGP + invRB * covBP;
+    const float coefG = invRG * covRP + invGG * covGP + invGB * covBP;
+    const float coefB = invRB * covRP + invGB * covGP + invBB * covBP;
+    outAr[i] = coefR;
+    outAg[i] = coefG;
+    outAb[i] = coefB;
+    outB[i] = muP - coefR * muR - coefG * muG - coefB * muB;
+}
+
+// Evaluates the colour guided-filter fit per pixel and stores the clamped result:
+// outAlpha = clamp01(meanAr * R + meanAg * G + meanAb * B + meanB), with RGB sampled from src.
+__global__ void rgbGuidedEvalKernel(ImagePlaneDesc src,
+                                    int renderX1,
+                                    int renderY1,
+                                    int width,
+                                    int height,
+                                    const float* meanAr,
+                                    const float* meanAg,
+                                    const float* meanAb,
+                                    const float* meanB,
+                                    float* outAlpha)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) return;
+
+    const int i = row * width + col;
+    float r = 0.0f;
+    float g = 0.0f;
+    float b = 0.0f;
+    sampleRgb(src, renderX1 + col, renderY1 + row, r, g, b);
+    // Linear model in the guide colour; coefficient planes are window-relative.
+    const float fitted = meanAr[i] * r + meanAg[i] * g + meanAb[i] * b + meanB[i];
+    outAlpha[i] = clamp01(fitted);
+}
+
+// Final step of the colour guided filter. Evaluates the clamped linear fit from the source
+// colour, mixes it with rawAlpha by params.guidedMix, and writes the result to dst alpha.
+// When params.premultiply is set, dst RGB is additionally multiplied by that alpha.
+__global__ void rgbGuidedApplyKernel(IBKeyerParams params,
+                                     ImagePlaneDesc src,
+                                     MutableImagePlaneDesc dst,
+                                     int renderX1,
+                                     int renderY1,
+                                     int width,
+                                     int height,
+                                     const float* rawAlpha,
+                                     const float* meanAr,
+                                     const float* meanAg,
+                                     const float* meanAb,
+                                     const float* meanB)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) return;
+
+    const int px = renderX1 + col;
+    const int py = renderY1 + row;
+    const int i = row * width + col;
+    float r = 0.0f;
+    float g = 0.0f;
+    float b = 0.0f;
+    sampleRgb(src, px, py, r, g, b);
+
+    // guidedMix = 0 keeps the raw alpha, guidedMix = 1 uses only the filtered alpha.
+    const float fitted =
+        clamp01(meanAr[i] * r + meanAg[i] * g + meanAb[i] * b + meanB[i]);
+    const float a = rawAlpha[i] * (1.0f - params.guidedMix) + fitted * params.guidedMix;
+
+    float* rgba = pixelAddress(dst, px, py);
+    if (rgba == nullptr) return;
+    // RGB is scaled in place; the colour already in dst is what gets premultiplied.
+    if (params.premultiply) {
+        rgba[0] *= a;
+        rgba[1] *= a;
+        rgba[2] *= a;
+    }
+    rgba[3] = a;
+}
+
+// Additive key: adds a residual colour to dst RGB, weighted by (1 - alpha) * additiveKeyAmount,
+// for every pixel whose dst alpha is not above 0.999. additiveKeyMode 0 uses source minus
+// screen; any other mode uses the source/screen ratio applied to the blurBg* planes, if given.
+__global__ void additiveKeyKernel(IBKeyerParams params,
+                                  ImagePlaneDesc src,
+                                  ImagePlaneDesc screen,
+                                  MutableImagePlaneDesc dst,
+                                  int renderX1,
+                                  int renderY1,
+                                  int width,
+                                  int height,
+                                  const float* blurBgR,
+                                  const float* blurBgG,
+                                  const float* blurBgB)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) return;
+
+    const int px = renderX1 + col;
+    const int py = renderY1 + row;
+    const int flat = row * width + col;
+    float* rgba = pixelAddress(dst, px, py);
+    if (rgba == nullptr) return;
+
+    const float a = rgba[3];
+    if (a > 0.999f) return;
+
+    float inR = 0.0f;
+    float inG = 0.0f;
+    float inB = 0.0f;
+    sampleRgb(src, px, py, inR, inG, inB);
+
+    float keyR = params.pickR;
+    float keyG = params.pickG;
+    float keyB = params.pickB;
+    if (params.useScreenInput && screen.data != nullptr) {
+        sampleRgb(screen, px, py, keyR, keyG, keyB);
+    }
+
+    const float sat = params.additiveKeySaturation;
+    float addR = 0.0f;
+    float addG = 0.0f;
+    float addB = 0.0f;
+    if (params.additiveKeyMode == 0) {
+        // Residual = source - screen, then blended with its own luminance by the saturation.
+        addR = inR - keyR;
+        addG = inG - keyG;
+        addB = inB - keyB;
+        const float lum = luminance(addR, addG, addB);
+        addR = lum * (1.0f - sat) + addR * sat;
+        addG = lum * (1.0f - sat) + addG * sat;
+        addB = lum * (1.0f - sat) + addB * sat;
+    } else if (blurBgR != nullptr && blurBgG != nullptr && blurBgB != nullptr) {
+        // Per-channel source/screen ratio; falls back to 1 where the screen is <= 1e-6.
+        float ratioR = (keyR > 1e-6f) ? inR / keyR : 1.0f;
+        float ratioG = (keyG > 1e-6f) ? inG / keyG : 1.0f;
+        float ratioB = (keyB > 1e-6f) ? inB / keyB : 1.0f;
+        const float ratioLum = luminance(ratioR, ratioG, ratioB);
+        ratioR = ratioLum * (1.0f - sat) + ratioR * sat;
+        ratioG = ratioLum * (1.0f - sat) + ratioG * sat;
+        ratioB = ratioLum * (1.0f - sat) + ratioB * sat;
+        addR = blurBgR[flat] * (ratioR - 1.0f);
+        addG = blurBgG[flat] * (ratioG - 1.0f);
+        addB = blurBgB[flat] * (ratioB - 1.0f);
+    }
+
+    if (params.additiveKeyBlackClamp) {
+        addR = fmaxf(addR, 0.0f);
+        addG = fmaxf(addG, 0.0f);
+        addB = fmaxf(addB, 0.0f);
+    }
+
+    const float gain = (1.0f - a) * params.additiveKeyAmount;
+    rgba[0] += addR * gain;
+    rgba[1] += addG * gain;
+    rgba[2] += addB * gain;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // CUDA LAUNCH HELPERS
 ////////////////////////////////////////////////////////////////////////////////
@@ -600,6 +1407,11 @@ bool renderCudaFrame(const IBKeyerParams& params,
 
     const int pixelCount = width * height;
     const bool doGF = guidedFilterActive(params);
+    const bool doRgbGF = doGF && params.guidedFilterMode == 1;
+    const bool doPrematte = params.prematteEnabled && params.prematteBlur > 0;
+    const bool doBgWrap = params.bgWrapEnabled && frame.background.data != nullptr && params.bgWrapAmount > 0.0f;
+    const bool doAdditive = params.additiveKeyEnabled && params.additiveKeyAmount > 0.0f;
+    const bool needBgBlur = doBgWrap || (doAdditive && params.additiveKeyMode == 1 && frame.background.data != nullptr);
     if (!scratch.ensurePixelCapacity(pixelCount, error)) {
         return false;
     }
@@ -637,58 +1449,336 @@ bool renderCudaFrame(const IBKeyerParams& params,
         return false;
     }
 
-    if (doGF) {
-        computeProductsKernel<<<flatBlocks, flatThreads, 0, stream>>>(
-            pixelCount,
-            scratch.rawAlpha,
-            scratch.guide,
-            scratch.meanI,
-            scratch.meanP,
-            scratch.meanIp,
-            scratch.meanII);
-        if (!captureKernelStage("guided products", stream, waitForCompletion, error)) {
+    if (doPrematte) {
+        const int prematteIterations = max(1, min(params.prematteIterations, 5));
+        if (!scratch.ensureGaussianWeights(buildGaussianWeights(max(1, params.prematteBlur)), max(1, params.prematteBlur), error)) {
             return false;
         }
 
-        // This is the part zero-copy does not remove: the guided filter still needs temporary
-        // working buffers on the GPU. What zero-copy changes is that Source/Screen/Output stay
-        // on the host-owned device images instead of bouncing through CPU memory first.
-        if (!runGaussianBlur(scratch.meanI, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
-            !runGaussianBlur(scratch.meanP, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
-            !runGaussianBlur(scratch.meanIp, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
-            !runGaussianBlur(scratch.meanII, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error)) {
-            return false;
+        ImagePlaneDesc cleanScreen;
+        cleanScreen.data = scratch.cleanPlate;
+        cleanScreen.rowBytes = static_cast<size_t>(width) * 4u * sizeof(float);
+        cleanScreen.bounds = frame.renderWindow;
+        cleanScreen.components = 4;
+
+        for (int iter = 0; iter < prematteIterations; ++iter) {
+            const float* alphaSource = scratch.rawAlpha;
+            if (params.prematteErode > 0) {
+                erodeAlphaKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+                    scratch.rawAlpha,
+                    scratch.prematteAlpha,
+                    width,
+                    height,
+                    params.prematteErode);
+                if (!captureKernelStage("prematte erode", stream, waitForCompletion, error)) {
+                    return false;
+                }
+                alphaSource = scratch.prematteAlpha;
+            }
+
+            cleanPlateEstimateKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+                frame.src,
+                frame.renderWindow.x1,
+                frame.renderWindow.y1,
+                width,
+                height,
+                params.pickR,
+                params.pickG,
+                params.pickB,
+                alphaSource,
+                scratch.bgR,
+                scratch.bgG,
+                scratch.bgB);
+            if (!captureKernelStage("prematte clean plate estimate", stream, waitForCompletion, error)) {
+                return false;
+            }
+
+            if (!runGaussianBlur(scratch.bgR, scratch.scratch, scratch.gaussianWeights, width, height, max(1, params.prematteBlur), stream, waitForCompletion, error) ||
+                !runGaussianBlur(scratch.bgG, scratch.scratch, scratch.gaussianWeights, width, height, max(1, params.prematteBlur), stream, waitForCompletion, error) ||
+                !runGaussianBlur(scratch.bgB, scratch.scratch, scratch.gaussianWeights, width, height, max(1, params.prematteBlur), stream, waitForCompletion, error)) {
+                return false;
+            }
+
+            packCleanPlateKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+                width,
+                height,
+                scratch.bgR,
+                scratch.bgG,
+                scratch.bgB,
+                scratch.cleanPlate);
+            if (!captureKernelStage("prematte pack clean plate", stream, waitForCompletion, error)) {
+                return false;
+            }
+
+            coreKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(params,
+                                                                 frame.src,
+                                                                 cleanScreen,
+                                                                 frame.dst,
+                                                                 frame.renderWindow.x1,
+                                                                 frame.renderWindow.y1,
+                                                                 width,
+                                                                 height,
+                                                                 scratch.rawAlpha,
+                                                                 scratch.guide);
+            if (!captureKernelStage("prematte rekey", stream, waitForCompletion, error)) {
+                return false;
+            }
         }
 
-        guidedCoeffKernel<<<flatBlocks, flatThreads, 0, stream>>>(
-            pixelCount,
-            params.guidedEpsilon,
-            scratch.meanI,
-            scratch.meanP,
-            scratch.meanIp,
-            scratch.meanII);
-        if (!captureKernelStage("guided coefficients", stream, waitForCompletion, error)) {
-            return false;
+        if (params.viewMode == 2) {
+            copyImageKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+                cleanScreen,
+                frame.dst,
+                frame.renderWindow.x1,
+                frame.renderWindow.y1,
+                width,
+                height);
+            return captureKernelStage("diagnostic clean plate", stream, waitForCompletion, error);
         }
+    } else if (params.viewMode == 2 && params.useScreenInput && frame.screen.data != nullptr) {
+        copyImageKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+            frame.screen,
+            frame.dst,
+            frame.renderWindow.x1,
+            frame.renderWindow.y1,
+            width,
+            height);
+        return captureKernelStage("diagnostic screen input", stream, waitForCompletion, error);
+    }
 
-        if (!runGaussianBlur(scratch.meanI, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
-            !runGaussianBlur(scratch.meanP, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error)) {
+    if (frame.garbageMatte.data != nullptr) {
+        applyMatteKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+            scratch.rawAlpha,
+            frame.dst,
+            frame.garbageMatte,
+            frame.renderWindow.x1,
+            frame.renderWindow.y1,
+            width,
+            height,
+            0);
+        if (!captureKernelStage("garbage matte", stream, waitForCompletion, error)) {
+            return false;
+        }
+    }
+    if (frame.occlusionMatte.data != nullptr) {
+        applyMatteKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+            scratch.rawAlpha,
+            frame.dst,
+            frame.occlusionMatte,
+            frame.renderWindow.x1,
+            frame.renderWindow.y1,
+            width,
+            height,
+            1);
+        if (!captureKernelStage("occlusion matte", stream, waitForCompletion, error)) {
             return false;
         }
+    }
+
+    if (params.viewMode == 1) {
+        writeAlphaDiagnosticKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+            scratch.rawAlpha,
+            frame.dst,
+            frame.renderWindow.x1,
+            frame.renderWindow.y1,
+            width,
+            height);
+        return captureKernelStage("diagnostic raw matte", stream, waitForCompletion, error);
+    }
 
-        guidedApplyKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(params,
-                                                                    frame.dst,
-                                                                    frame.renderWindow.x1,
-                                                                    frame.renderWindow.y1,
-                                                                    width,
-                                                                    height,
-                                                                    scratch.rawAlpha,
-                                                                    scratch.guide,
-                                                                    scratch.meanI,
-                                                                    scratch.meanP);
-        if (!captureKernelStage("guided apply", stream, waitForCompletion, error)) {
+    if (doGF) {
+        copyBufferKernel<<<flatBlocks, flatThreads, 0, stream>>>(pixelCount, scratch.rawAlpha, scratch.savedRawAlpha);
+        if (!captureKernelStage("save raw alpha", stream, waitForCompletion, error)) {
             return false;
         }
+
+        const int numIter = std::max(1, std::min(params.refineIterations, 5));
+        if (doRgbGF) {
+            for (int iter = 0; iter < numIter; ++iter) {
+                rgbComputeProductsKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+                    frame.src,
+                    frame.renderWindow.x1,
+                    frame.renderWindow.y1,
+                    width,
+                    height,
+                    scratch.rawAlpha,
+                    scratch.rgbMeanIr,
+                    scratch.rgbMeanIg,
+                    scratch.rgbMeanIb,
+                    scratch.rgbMeanP,
+                    scratch.rgbIrIr,
+                    scratch.rgbIrIg,
+                    scratch.rgbIrIb,
+                    scratch.rgbIgIg,
+                    scratch.rgbIgIb,
+                    scratch.rgbIbIb,
+                    scratch.rgbIrP,
+                    scratch.rgbIgP,
+                    scratch.rgbIbP);
+                if (!captureKernelStage("rgb guided products", stream, waitForCompletion, error)) {
+                    return false;
+                }
+
+                if (!runGaussianBlur(scratch.rgbMeanIr, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.rgbMeanIg, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.rgbMeanIb, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.rgbMeanP, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.rgbIrIr, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.rgbIrIg, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.rgbIrIb, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.rgbIgIg, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.rgbIgIb, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.rgbIbIb, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.rgbIrP, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.rgbIgP, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.rgbIbP, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error)) {
+                    return false;
+                }
+
+                rgbGuidedCoeffKernel<<<flatBlocks, flatThreads, 0, stream>>>(
+                    pixelCount,
+                    params.guidedEpsilon,
+                    scratch.rgbMeanIr,
+                    scratch.rgbMeanIg,
+                    scratch.rgbMeanIb,
+                    scratch.rgbMeanP,
+                    scratch.rgbIrIr,
+                    scratch.rgbIrIg,
+                    scratch.rgbIrIb,
+                    scratch.rgbIgIg,
+                    scratch.rgbIgIb,
+                    scratch.rgbIbIb,
+                    scratch.rgbIrP,
+                    scratch.rgbIgP,
+                    scratch.rgbIbP,
+                    scratch.rgbMeanAr,
+                    scratch.rgbMeanAg,
+                    scratch.rgbMeanAb,
+                    scratch.rgbMeanB);
+                if (!captureKernelStage("rgb guided coefficients", stream, waitForCompletion, error)) {
+                    return false;
+                }
+
+                if (!runGaussianBlur(scratch.rgbMeanAr, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.rgbMeanAg, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.rgbMeanAb, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.rgbMeanB, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error)) {
+                    return false;
+                }
+
+                if (iter < numIter - 1) {
+                    rgbGuidedEvalKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+                        frame.src,
+                        frame.renderWindow.x1,
+                        frame.renderWindow.y1,
+                        width,
+                        height,
+                        scratch.rgbMeanAr,
+                        scratch.rgbMeanAg,
+                        scratch.rgbMeanAb,
+                        scratch.rgbMeanB,
+                        scratch.rawAlpha);
+                    if (!captureKernelStage("rgb guided eval", stream, waitForCompletion, error)) {
+                        return false;
+                    }
+                }
+            }
+
+            rgbGuidedApplyKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+                params,
+                frame.src,
+                frame.dst,
+                frame.renderWindow.x1,
+                frame.renderWindow.y1,
+                width,
+                height,
+                scratch.savedRawAlpha,
+                scratch.rgbMeanAr,
+                scratch.rgbMeanAg,
+                scratch.rgbMeanAb,
+                scratch.rgbMeanB);
+            if (!captureKernelStage("rgb guided apply", stream, waitForCompletion, error)) {
+                return false;
+            }
+        } else {
+            for (int iter = 0; iter < numIter; ++iter) {
+                if (iter > 0) {
+                    refineGuideKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+                        width,
+                        height,
+                        frame.src,
+                        frame.renderWindow.x1,
+                        frame.renderWindow.y1,
+                        params.edgeProtect,
+                        scratch.rawAlpha,
+                        scratch.guide);
+                    if (!captureKernelStage("refine guide", stream, waitForCompletion, error)) {
+                        return false;
+                    }
+                }
+
+                computeProductsKernel<<<flatBlocks, flatThreads, 0, stream>>>(
+                    pixelCount,
+                    scratch.rawAlpha,
+                    scratch.guide,
+                    scratch.meanI,
+                    scratch.meanP,
+                    scratch.meanIp,
+                    scratch.meanII);
+                if (!captureKernelStage("guided products", stream, waitForCompletion, error)) {
+                    return false;
+                }
+
+                if (!runGaussianBlur(scratch.meanI, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.meanP, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.meanIp, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.meanII, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error)) {
+                    return false;
+                }
+
+                guidedCoeffKernel<<<flatBlocks, flatThreads, 0, stream>>>(
+                    pixelCount,
+                    params.guidedEpsilon,
+                    scratch.meanI,
+                    scratch.meanP,
+                    scratch.meanIp,
+                    scratch.meanII);
+                if (!captureKernelStage("guided coefficients", stream, waitForCompletion, error)) {
+                    return false;
+                }
+
+                if (!runGaussianBlur(scratch.meanI, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error) ||
+                    !runGaussianBlur(scratch.meanP, scratch.scratch, scratch.gaussianWeights, width, height, params.guidedRadius, stream, waitForCompletion, error)) {
+                    return false;
+                }
+
+                if (iter < numIter - 1) {
+                    guidedEvalKernel<<<flatBlocks, flatThreads, 0, stream>>>(
+                        pixelCount,
+                        scratch.guide,
+                        scratch.meanI,
+                        scratch.meanP,
+                        scratch.rawAlpha);
+                    if (!captureKernelStage("guided eval", stream, waitForCompletion, error)) {
+                        return false;
+                    }
+                }
+            }
+
+            guidedApplyKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(params,
+                                                                        frame.dst,
+                                                                        frame.renderWindow.x1,
+                                                                        frame.renderWindow.y1,
+                                                                        width,
+                                                                        height,
+                                                                        scratch.savedRawAlpha,
+                                                                        scratch.guide,
+                                                                        scratch.meanI,
+                                                                        scratch.meanP);
+            if (!captureKernelStage("guided apply", stream, waitForCompletion, error)) {
+                return false;
+            }
+        }
     } else if (params.premultiply) {
         premultiplyKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
             frame.dst,
@@ -701,6 +1791,107 @@ bool renderCudaFrame(const IBKeyerParams& params,
         }
     }
 
+    if (params.viewMode == 3) {
+        extractOutputAlphaDiagnosticKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+            frame.dst,
+            frame.renderWindow.x1,
+            frame.renderWindow.y1,
+            width,
+            height);
+        return captureKernelStage("diagnostic refined matte", stream, waitForCompletion, error);
+    }
+
+    if (params.edgeColorCorrect > 0.0f) {
+        edgeColorCorrectKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+            params,
+            frame.src,
+            frame.screen,
+            frame.dst,
+            frame.renderWindow.x1,
+            frame.renderWindow.y1,
+            width,
+            height);
+        if (!captureKernelStage("edge color correct", stream, waitForCompletion, error)) {
+            return false;
+        }
+    }
+
+    if (params.viewMode == 4) {
+        return true;
+    }
+
+    if (needBgBlur) {
+        extractBackgroundChannelsKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+            width,
+            height,
+            frame.background,
+            frame.renderWindow.x1,
+            frame.renderWindow.y1,
+            scratch.bgR,
+            scratch.bgG,
+            scratch.bgB);
+        if (!captureKernelStage("background extract", stream, waitForCompletion, error)) {
+            return false;
+        }
+
+        const int blurRadius = std::max(1, params.bgWrapBlur);
+        if (!scratch.ensureGaussianWeights(buildGaussianWeights(blurRadius), blurRadius, error)) {
+            return false;
+        }
+        if (!runGaussianBlur(scratch.bgR, scratch.scratch, scratch.gaussianWeights, width, height, blurRadius, stream, waitForCompletion, error) ||
+            !runGaussianBlur(scratch.bgG, scratch.scratch, scratch.gaussianWeights, width, height, blurRadius, stream, waitForCompletion, error) ||
+            !runGaussianBlur(scratch.bgB, scratch.scratch, scratch.gaussianWeights, width, height, blurRadius, stream, waitForCompletion, error)) {
+            return false;
+        }
+
+        if (params.viewMode == 5) {
+            packRgbKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+                frame.dst,
+                frame.renderWindow.x1,
+                frame.renderWindow.y1,
+                width,
+                height,
+                scratch.bgR,
+                scratch.bgG,
+                scratch.bgB);
+            return captureKernelStage("diagnostic blurred background", stream, waitForCompletion, error);
+        }
+
+        if (doBgWrap) {
+            bgWrapKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+                frame.dst,
+                frame.renderWindow.x1,
+                frame.renderWindow.y1,
+                width,
+                height,
+                params.bgWrapAmount,
+                scratch.bgR,
+                scratch.bgG,
+                scratch.bgB);
+            if (!captureKernelStage("background wrap", stream, waitForCompletion, error)) {
+                return false;
+            }
+        }
+    }
+
+    if (doAdditive) {
+        additiveKeyKernel<<<pixelBlocks, pixelThreads, 0, stream>>>(
+            params,
+            frame.src,
+            frame.screen,
+            frame.dst,
+            frame.renderWindow.x1,
+            frame.renderWindow.y1,
+            width,
+            height,
+            (needBgBlur ? scratch.bgR : nullptr),
+            (needBgBlur ? scratch.bgG : nullptr),
+            (needBgBlur ? scratch.bgB : nullptr));
+        if (!captureKernelStage("additive key", stream, waitForCompletion, error)) {
+            return false;
+        }
+    }
+
     if (!waitForCompletion && !scratch.markInFlight(stream)) {
         // If we cannot record the fence event, we give up some performance and force the stream
         // to finish now so those scratch buffers are still safe to reuse on the next frame.
@@ -761,12 +1952,18 @@ bool renderCudaInternal(const IBKeyerParams& params, const PackedFrame& frame, s
 
     float* dSrc = nullptr;
     float* dScreen = nullptr;
+    float* dBackground = nullptr;
+    float* dGarbageMatte = nullptr;
+    float* dOcclusionMatte = nullptr;
     float* dDst = nullptr;
     DeviceRenderFrame deviceFrame;
 
     if (cudaMalloc(&dSrc, rgbaBytes) != cudaSuccess ||
         cudaMalloc(&dDst, rgbaBytes) != cudaSuccess ||
-        (frame.screenRgba != nullptr && cudaMalloc(&dScreen, rgbaBytes) != cudaSuccess)) {
+        (frame.screenRgba != nullptr && cudaMalloc(&dScreen, rgbaBytes) != cudaSuccess) ||
+        (frame.backgroundRgba != nullptr && cudaMalloc(&dBackground, rgbaBytes) != cudaSuccess) ||
+        (frame.garbageMatteRgba != nullptr && cudaMalloc(&dGarbageMatte, rgbaBytes) != cudaSuccess) ||
+        (frame.occlusionMatteRgba != nullptr && cudaMalloc(&dOcclusionMatte, rgbaBytes) != cudaSuccess)) {
         error = "cudaMalloc failed for the staged CUDA path.";
         goto cleanup;
     }
@@ -780,6 +1977,21 @@ bool renderCudaInternal(const IBKeyerParams& params, const PackedFrame& frame, s
         error = "cudaMemcpy(screen) failed for the staged CUDA path.";
         goto cleanup;
     }
+    if (frame.backgroundRgba != nullptr &&
+        cudaMemcpy(dBackground, frame.backgroundRgba, rgbaBytes, cudaMemcpyHostToDevice) != cudaSuccess) {
+        error = "cudaMemcpy(background) failed for the staged CUDA path.";
+        goto cleanup;
+    }
+    if (frame.garbageMatteRgba != nullptr &&
+        cudaMemcpy(dGarbageMatte, frame.garbageMatteRgba, rgbaBytes, cudaMemcpyHostToDevice) != cudaSuccess) {
+        error = "cudaMemcpy(garbage matte) failed for the staged CUDA path.";
+        goto cleanup;
+    }
+    if (frame.occlusionMatteRgba != nullptr &&
+        cudaMemcpy(dOcclusionMatte, frame.occlusionMatteRgba, rgbaBytes, cudaMemcpyHostToDevice) != cudaSuccess) {
+        error = "cudaMemcpy(occlusion matte) failed for the staged CUDA path.";
+        goto cleanup;
+    }
 
     deviceFrame.src.data = dSrc;
     deviceFrame.src.rowBytes = static_cast<size_t>(frame.width) * 4u * sizeof(float);
@@ -789,6 +2001,18 @@ bool renderCudaInternal(const IBKeyerParams& params, const PackedFrame& frame, s
     deviceFrame.screen.rowBytes = (dScreen != nullptr) ? static_cast<size_t>(frame.width) * 4u * sizeof(float) : 0u;
     deviceFrame.screen.bounds = {0, 0, frame.width, frame.height};
     deviceFrame.screen.components = (dScreen != nullptr) ? 4 : 0;
+    deviceFrame.background.data = dBackground;
+    deviceFrame.background.rowBytes = (dBackground != nullptr) ? static_cast<size_t>(frame.width) * 4u * sizeof(float) : 0u;
+    deviceFrame.background.bounds = {0, 0, frame.width, frame.height};
+    deviceFrame.background.components = (dBackground != nullptr) ? 4 : 0;
+    deviceFrame.garbageMatte.data = dGarbageMatte;
+    deviceFrame.garbageMatte.rowBytes = (dGarbageMatte != nullptr) ? static_cast<size_t>(frame.width) * 4u * sizeof(float) : 0u;
+    deviceFrame.garbageMatte.bounds = {0, 0, frame.width, frame.height};
+    deviceFrame.garbageMatte.components = (dGarbageMatte != nullptr) ? 4 : 0;
+    deviceFrame.occlusionMatte.data = dOcclusionMatte;
+    deviceFrame.occlusionMatte.rowBytes = (dOcclusionMatte != nullptr) ? static_cast<size_t>(frame.width) * 4u * sizeof(float) : 0u;
+    deviceFrame.occlusionMatte.bounds = {0, 0, frame.width, frame.height};
+    deviceFrame.occlusionMatte.components = (dOcclusionMatte != nullptr) ? 4 : 0;
     deviceFrame.dst.data = dDst;
     deviceFrame.dst.rowBytes = static_cast<size_t>(frame.width) * 4u * sizeof(float);
     deviceFrame.dst.bounds = {0, 0, frame.width, frame.height};
@@ -809,6 +2033,9 @@ bool renderCudaInternal(const IBKeyerParams& params, const PackedFrame& frame, s
 
 cleanup:
     cudaFree(dDst);
+    cudaFree(dOcclusionMatte);
+    cudaFree(dGarbageMatte);
+    cudaFree(dBackground);
     cudaFree(dScreen);
     cudaFree(dSrc);
     return error.empty();
diff --git a/IBKeymaster/IBKeyerShared.h b/IBKeymaster/IBKeyerShared.h
index 6021a38..bac8ae5 100644
--- a/IBKeymaster/IBKeyerShared.h
+++ b/IBKeymaster/IBKeyerShared.h
@@ -32,13 +32,32 @@ struct IBKeyerParams
     float respillB = 0.0f;
     bool premultiply = false;
     bool nearGreyExtract = true;
-    float nearGreyAmount = 1.0f;
+    float nearGreyAmount = 0.5f;
+    float nearGreySoftness = 1.0f;
     float blackClip = 0.0f;
     float whiteClip = 1.0f;
+    float matteGamma = 1.0f;
+    bool prematteEnabled = false;
+    int prematteBlur = 8;
+    int prematteErode = 0;
+    int prematteIterations = 1;
     bool guidedFilterEnabled = true;
+    int guidedFilterMode = 0;
     int guidedRadius = 8;
     float guidedEpsilon = 0.01f;
     float guidedMix = 1.0f;
+    float edgeProtect = 0.5f;
+    int refineIterations = 2;
+    float edgeColorCorrect = 0.0f;
+    bool bgWrapEnabled = false;
+    int bgWrapBlur = 20;
+    float bgWrapAmount = 0.5f;
+    bool additiveKeyEnabled = false;
+    int additiveKeyMode = 0;
+    float additiveKeySaturation = 0.0f;
+    float additiveKeyAmount = 0.0f;
+    bool additiveKeyBlackClamp = false;
+    int viewMode = 0;
 };
 
 struct PackedFrame
@@ -47,6 +66,9 @@ struct PackedFrame
     int height = 0;
     const float* srcRgba = nullptr;
     const float* screenRgba = nullptr;
+    const float* backgroundRgba = nullptr;
+    const float* garbageMatteRgba = nullptr;
+    const float* occlusionMatteRgba = nullptr;
     float* dstRgba = nullptr;
 };
 
@@ -87,13 +109,13 @@ IBKEYER_HOST_DEVICE inline float despillValue(float r, float g, float b, int scr
 }
 
 IBKEYER_HOST_DEVICE inline float nearGreyAlpha(float r, float g, float b, int screenColor,
-                                               float amount)
+                                               float softness)  // blend weight: 0 selects c0, 1 selects comp
 {
     float c0, c1, c2;
     reorderChannels(r, g, b, screenColor, c0, c1, c2);
     const float mx = fmaxf(c0, fmaxf(c1, c2));
     const float comp = (mx == c1) ? c1 : c2;
-    const float value = c0 * (1.0f - amount) + comp * amount;
+    const float value = c0 * (1.0f - softness) + comp * softness;  // linear blend; result goes through clamp01 below
     return clamp01(value);
 }
 
diff --git a/IBKeymaster/MetalKernel.mm b/IBKeymaster/MetalKernel.mm
index de2f98e..2e46dbe 100644
--- a/IBKeymaster/MetalKernel.mm
+++ b/IBKeymaster/MetalKernel.mm
@@ -228,6 +228,162 @@ kernel void GaussianBlurV(
     p_Dst[y * w + x] = sum;
 }
 
+// ════════════════════════════════════════════════════════════════════════
+//  Gaussian Blur — Horizontal TILED (threadgroup shared memory)
+//  Each row of the threadgroup cooperatively loads a strip into fast
+//  shared memory, then each thread sums from the tile.  Dramatically
+//  reduces global-memory bandwidth for large radii.
+//  Static 4096-float tile supports radius up to ~240 with (32,8) tg.
+// ════════════════════════════════════════════════════════════════════════
+
+kernel void GaussianBlurH_Tiled(
+    const device float* p_Src     [[buffer(3)]],
+    device float*       p_Dst     [[buffer(4)]],
+    const device float* p_Weights [[buffer(5)]],
+    constant int&       p_Width   [[buffer(10)]],
+    constant int&       p_Height  [[buffer(11)]],
+    constant int&       p_Radius  [[buffer(12)]],
+    uint2 gid  [[thread_position_in_grid]],
+    uint2 tid  [[thread_position_in_threadgroup]],
+    uint2 tgs  [[threads_per_threadgroup]])
+{
+    threadgroup float tile[4096];  // must hold tgs.y rows of (tgs.x + 2 * p_Radius) floats
+
+    int x = (int)gid.x, y = (int)gid.y;
+    // Never return before the barrier: every thread of the threadgroup has to reach it.
+    const bool rowValid = (y < p_Height);
+    int r = p_Radius;
+    int tileW = (int)tgs.x + 2 * r;
+    int rowOff = (int)tid.y * tileW;
+    int baseX = x - (int)tid.x - r;
+    // Cooperative, edge-clamped load of this row's strip; rows outside the image load nothing.
+    for (int i = (int)tid.x; rowValid && i < tileW; i += (int)tgs.x) {
+        int sx = clamp(baseX + i, 0, p_Width - 1);
+        tile[rowOff + i] = p_Src[y * p_Width + sx];
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    if (!rowValid || x >= p_Width) return;
+    // Weighted sum of the 2r + 1 taps centred on this pixel, read from the shared tile.
+    float sum = 0.0f;
+    int center = rowOff + (int)tid.x + r;
+    for (int dx = -r; dx <= r; dx++) {
+        sum += tile[center + dx] * p_Weights[dx + r];
+    }
+    p_Dst[y * p_Width + x] = sum;
+}
+
+// ════════════════════════════════════════════════════════════════════════
+//  Gaussian Blur — Vertical TILED (threadgroup shared memory)
+// ════════════════════════════════════════════════════════════════════════
+
+kernel void GaussianBlurV_Tiled(
+    const device float* p_Src     [[buffer(3)]],
+    device float*       p_Dst     [[buffer(4)]],
+    const device float* p_Weights [[buffer(5)]],
+    constant int&       p_Width   [[buffer(10)]],
+    constant int&       p_Height  [[buffer(11)]],
+    constant int&       p_Radius  [[buffer(12)]],
+    uint2 gid  [[thread_position_in_grid]],
+    uint2 tid  [[thread_position_in_threadgroup]],
+    uint2 tgs  [[threads_per_threadgroup]])
+{
+    threadgroup float tile[4096];  // must hold tgs.x columns of (tgs.y + 2 * p_Radius) floats
+
+    int x = (int)gid.x, y = (int)gid.y;
+    // Never return before the barrier: every thread of the threadgroup has to reach it.
+    const bool colValid = (x < p_Width);
+    int r = p_Radius;
+    int tileH = (int)tgs.y + 2 * r;
+    int colOff = (int)tid.x * tileH;
+    int baseY = y - (int)tid.y - r;
+    // Cooperative, edge-clamped load of this column's strip; columns outside the image load nothing.
+    for (int i = (int)tid.y; colValid && i < tileH; i += (int)tgs.y) {
+        int sy = clamp(baseY + i, 0, p_Height - 1);
+        tile[colOff + i] = p_Src[sy * p_Width + x];
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    if (!colValid || y >= p_Height) return;
+    // Weighted sum of the 2r + 1 taps centred on this pixel, read from the shared tile.
+    float sum = 0.0f;
+    int center = colOff + (int)tid.y + r;
+    for (int dy = -r; dy <= r; dy++) {
+        sum += tile[center + dy] * p_Weights[dy + r];
+    }
+    p_Dst[y * p_Width + x] = sum;
+}
+
+// ════════════════════════════════════════════════════════════════════════
+//  Gaussian Blur — Horizontal, 4-channel (processes 4 separate buffers)
+// ════════════════════════════════════════════════════════════════════════
+
+kernel void GaussianBlurH4(
+    const device float* p_A [[buffer(0)]],
+    const device float* p_B [[buffer(1)]],
+    const device float* p_C [[buffer(2)]],
+    const device float* p_D [[buffer(3)]],
+    device float* p_OA [[buffer(4)]],
+    device float* p_OB [[buffer(5)]],
+    device float* p_OC [[buffer(6)]],
+    device float* p_OD [[buffer(7)]],
+    const device float* p_Weights [[buffer(8)]],
+    constant int& p_Width  [[buffer(10)]],
+    constant int& p_Height [[buffer(11)]],
+    constant int& p_Radius [[buffer(12)]],
+    uint2 id [[thread_position_in_grid]])
+{
+    // Row-wise weighted sum over four planes sharing one tap table; taps are clamped to the row.
+    const int col = (int)id.x, row = (int)id.y;
+    if (col >= p_Width || row >= p_Height) return;
+    const int rowBase = row * p_Width, lastCol = p_Width - 1, dst = rowBase + col;
+    float accA = 0.0f, accB = 0.0f, accC = 0.0f, accD = 0.0f;
+    for (int tap = 0; tap <= 2 * p_Radius; tap++) {  // tap - p_Radius is the horizontal offset
+        const float wt = p_Weights[tap];
+        const int src = rowBase + clamp(col + tap - p_Radius, 0, lastCol);
+        accA += p_A[src] * wt;
+        accB += p_B[src] * wt;
+        accC += p_C[src] * wt;
+        accD += p_D[src] * wt;
+    }
+    p_OA[dst] = accA; p_OB[dst] = accB; p_OC[dst] = accC; p_OD[dst] = accD;
+}
+
+// ════════════════════════════════════════════════════════════════════════
+//  Gaussian Blur — Vertical, 4-channel (processes 4 separate buffers)
+// ════════════════════════════════════════════════════════════════════════
+
+kernel void GaussianBlurV4(
+    const device float* p_A [[buffer(0)]],
+    const device float* p_B [[buffer(1)]],
+    const device float* p_C [[buffer(2)]],
+    const device float* p_D [[buffer(3)]],
+    device float* p_OA [[buffer(4)]],
+    device float* p_OB [[buffer(5)]],
+    device float* p_OC [[buffer(6)]],
+    device float* p_OD [[buffer(7)]],
+    const device float* p_Weights [[buffer(8)]],
+    constant int& p_Width  [[buffer(10)]],
+    constant int& p_Height [[buffer(11)]],
+    constant int& p_Radius [[buffer(12)]],
+    uint2 id [[thread_position_in_grid]])
+{
+    // Column-wise weighted sum over four planes sharing one tap table; taps are clamped to the column.
+    const int col = (int)id.x, row = (int)id.y;
+    if (col >= p_Width || row >= p_Height) return;
+    const int lastRow = p_Height - 1, dst = row * p_Width + col;
+    float accA = 0.0f, accB = 0.0f, accC = 0.0f, accD = 0.0f;
+    for (int tap = 0; tap <= 2 * p_Radius; tap++) {  // tap - p_Radius is the vertical offset
+        const float wt = p_Weights[tap];
+        const int src = clamp(row + tap - p_Radius, 0, lastRow) * p_Width + col;
+        accA += p_A[src] * wt;
+        accB += p_B[src] * wt;
+        accC += p_C[src] * wt;
+        accD += p_D[src] * wt;
+    }
+    p_OA[dst] = accA; p_OB[dst] = accB; p_OC[dst] = accC; p_OD[dst] = accD;
+}
+
 // ════════════════════════════════════════════════════════════════════════
 //  Compute products: I*p and I*I, and copy I for blurring
 // ════════════════════════════════════════════════════════════════════════
@@ -281,7 +437,13 @@ kernel void GuidedFilterCoeff(
     float varI  = mII - mI * mI;
     float covIp = mIp - mI * mP;
 
-    float a = covIp / (varI + p_Epsilon);
+    // Adaptive epsilon: scales down at edges (high variance) for better
+    // edge preservation, stays full-strength in flat regions to suppress noise.
+    // adaptEps = eps^2 / (varI + eps), so:
+    //   flat (varI≈0): adaptEps ≈ eps  → smooths normally
+    //   edge (varI>>eps): adaptEps ≈ eps^2/varI → tiny → preserves edges
+    float adaptEps = p_Epsilon * p_Epsilon / (varI + p_Epsilon + 1e-10f);
+    float a = covIp / (varI + adaptEps);
     float b = mP - a * mI;
 
     p_MeanI[idx] = a;
@@ -468,6 +630,96 @@ kernel void BgWrapKernel(
     p_Output[idx4 + 2] += p_BgB[idx1] * w;
 }
 
+// ════════════════════════════════════════════════════════════════════════
+//  Additive Key — recovers fine detail (hair, motion blur, transparency)
+//  the alpha-based key lost, by superimposing source-minus-screen onto
+//  the composite.  Two paths:
+//    Addition:       residual = src - screen  →  desaturate  →  add
+//    Multiplication: factor = src/screen - 1  →  desaturate  →  × BG
+//  Weighted by (1-alpha) so only transparent areas are affected.
+// ════════════════════════════════════════════════════════════════════════
+
+kernel void AdditiveKeyKernel(
+    const device float* p_Source   [[buffer(0)]],
+    const device float* p_Screen   [[buffer(1)]],
+    device float*       p_Output   [[buffer(2)]],
+    const device float* p_BlurBgR  [[buffer(3)]],
+    const device float* p_BlurBgG  [[buffer(4)]],
+    const device float* p_BlurBgB  [[buffer(5)]],
+    constant int&       p_Width    [[buffer(10)]],
+    constant int&       p_Height   [[buffer(11)]],
+    constant int&       p_Mode     [[buffer(12)]],
+    constant int&       p_UseScr   [[buffer(13)]],
+    constant float&     p_ScrR     [[buffer(14)]],
+    constant float&     p_ScrG     [[buffer(15)]],
+    constant float&     p_ScrB     [[buffer(16)]],
+    constant float&     p_Sat      [[buffer(17)]],
+    constant float&     p_Amount   [[buffer(18)]],
+    constant int&       p_ClampBlk [[buffer(19)]],
+    uint2 id [[thread_position_in_grid]])
+{
+    // One thread per pixel: adds a (1 - alpha) * p_Amount weighted detail term to the output RGB.
+    if ((int)id.x >= p_Width || (int)id.y >= p_Height) return;
+    const int px = (int)(id.y * (uint)p_Width) + (int)id.x;  // index into the single-channel p_BlurBg* buffers
+    const int rgba = px * 4;                                  // index into the interleaved 4-float buffers
+
+    // Pixels whose output alpha exceeds 0.999 are left untouched.
+    const float a = p_Output[rgba + 3];
+    if (a > 0.999f) return;
+
+    const float inR = p_Source[rgba + 0];
+    const float inG = p_Source[rgba + 1];
+    const float inB = p_Source[rgba + 2];
+
+    // Screen reference: the uniform p_Scr* colour unless p_UseScr selects the per-pixel screen buffer.
+    float keyR = p_ScrR, keyG = p_ScrG, keyB = p_ScrB;
+    if (p_UseScr != 0) {
+        keyR = p_Screen[rgba + 0];
+        keyG = p_Screen[rgba + 1];
+        keyB = p_Screen[rgba + 2];
+    }
+
+    float addR, addG, addB;
+    if (p_Mode != 0) {
+        // Multiplication path: per-channel source/screen ratio, 1.0 where the screen is <= 1e-6.
+        float ratioR = (keyR > 1e-6f) ? inR / keyR : 1.0f;
+        float ratioG = (keyG > 1e-6f) ? inG / keyG : 1.0f;
+        float ratioB = (keyB > 1e-6f) ? inB / keyB : 1.0f;
+        // Blend from the weighted channel average toward each per-channel ratio by p_Sat.
+        const float ratioLum = 0.2126f * ratioR + 0.7152f * ratioG + 0.0722f * ratioB;
+        ratioR = mix(ratioLum, ratioR, p_Sat);
+        ratioG = mix(ratioLum, ratioG, p_Sat);
+        ratioB = mix(ratioLum, ratioB, p_Sat);
+        // Express the result as a delta on the p_BlurBg* sample: BG * (ratio - 1).
+        addR = p_BlurBgR[px] * (ratioR - 1.0f);
+        addG = p_BlurBgG[px] * (ratioG - 1.0f);
+        addB = p_BlurBgB[px] * (ratioB - 1.0f);
+    } else {
+        // Addition path: residual between source and screen.
+        addR = inR - keyR;
+        addG = inG - keyG;
+        addB = inB - keyB;
+        // Blend from the residual's weighted channel average toward the coloured residual by p_Sat.
+        const float addLum = 0.2126f * addR + 0.7152f * addG + 0.0722f * addB;
+        addR = mix(addLum, addR, p_Sat);
+        addG = mix(addLum, addG, p_Sat);
+        addB = mix(addLum, addB, p_Sat);
+    }
+
+    // Optional black clamp: drop negative contributions when p_ClampBlk is set.
+    if (p_ClampBlk != 0) {
+        addR = max(addR, 0.0f);
+        addG = max(addG, 0.0f);
+        addB = max(addB, 0.0f);
+    }
+
+    // Accumulate into RGB only; the alpha channel is not written here.
+    const float gain = (1.0f - a) * p_Amount;
+    p_Output[rgba + 0] += addR * gain;
+    p_Output[rgba + 1] += addG * gain;
+    p_Output[rgba + 2] += addB * gain;
+}
+
 // ════════════════════════════════════════════════════════════════════════
 //  Edge Color Correction: re-estimate FG color at semi-transparent edges
 //  using the matting equation: fg = (src - screen*(1-alpha)) / alpha
@@ -546,16 +798,421 @@ kernel void EdgeColorCorrectKernel(
     p_Output[idx4 + 1] = outG;
     p_Output[idx4 + 2] = outB;
 }
-)";
 
-// ═══════════════════════════════════════════════════════════════════════════════
-//  Pipeline cache
-// ═══════════════════════════════════════════════════════════════════════════════
+// ════════════════════════════════════════════════════════════════════════
+//  RGB Guided Filter — Compute Products
+//  Per pixel: reads guide RGB from p_Input and alpha (channel 3) from
+//  p_Output, then fills 13 single-channel planes with un-blurred values:
+//    4 raw copies (Ir, Ig, Ib, p)  +  6 guide products (IrIr..IbIb)
+//    + 3 guide·alpha products (IrP, IgP, IbP)
+// ════════════════════════════════════════════════════════════════════════
+
+kernel void RGBComputeProducts(
+    const device float* p_Input   [[buffer(0)]],   // source RGBA; RGB is used as the guide
+    const device float* p_Output  [[buffer(1)]],   // core keyer RGBA; only channel 3 (alpha) is read
+    device float* p_MeanIr   [[buffer(2)]],
+    device float* p_MeanIg   [[buffer(3)]],
+    device float* p_MeanIb   [[buffer(4)]],
+    device float* p_MeanP    [[buffer(5)]],
+    device float* p_IrIr     [[buffer(6)]],
+    device float* p_IrIg     [[buffer(7)]],
+    device float* p_IrIb     [[buffer(8)]],
+    device float* p_IgIg     [[buffer(9)]],
+    device float* p_IgIb     [[buffer(10)]],
+    device float* p_IbIb     [[buffer(11)]],
+    device float* p_IrP      [[buffer(12)]],
+    device float* p_IgP      [[buffer(13)]],
+    device float* p_IbP      [[buffer(14)]],
+    constant int& p_Width    [[buffer(20)]],
+    constant int& p_Height   [[buffer(21)]],
+    uint2 id [[thread_position_in_grid]])
+{
+    if ((int)id.x >= p_Width || (int)id.y >= p_Height) return;  // threads outside the image do nothing
+    const int idx4 = ((int)(id.y * (uint)p_Width) + (int)id.x) * 4;  // offset into interleaved 4-float data
+    const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x;        // offset into a 1-float-per-pixel plane
+
+    float Ir = p_Input[idx4 + 0];
+    float Ig = p_Input[idx4 + 1];
+    float Ib = p_Input[idx4 + 2];
+    float p  = p_Output[idx4 + 3];  // alpha from core keyer
+
+    // Raw copies; named "Mean" because a later blur is meant to turn them into means
+    p_MeanIr[idx1] = Ir;
+    p_MeanIg[idx1] = Ig;
+    p_MeanIb[idx1] = Ib;
+    p_MeanP[idx1]  = p;
+
+    // Guide second-moment products (the 6 unique entries of the symmetric I·Iᵀ)
+    p_IrIr[idx1] = Ir * Ir;
+    p_IrIg[idx1] = Ir * Ig;
+    p_IrIb[idx1] = Ir * Ib;
+    p_IgIg[idx1] = Ig * Ig;
+    p_IgIb[idx1] = Ig * Ib;
+    p_IbIb[idx1] = Ib * Ib;
+
+    // Guide × alpha products
+    p_IrP[idx1] = Ir * p;
+    p_IgP[idx1] = Ig * p;
+    p_IbP[idx1] = Ib * p;
+}
+
+// ════════════════════════════════════════════════════════════════════════
+//  RGB Guided Filter — Coefficient Computation
+//  Reads the 13 planes (expected to be blurred already) and solves, per pixel:
+//    (Σ + εI) · a = cov(I, p)      with ε derived adaptively from trace(Σ)
+//    b = mean_p − aᵀ · mean_I
+//  a_r/a_g/a_b overwrite the IrIr/IgIg/IbIb planes in place; b goes to p_OutB.
+// ════════════════════════════════════════════════════════════════════════
+
+kernel void RGBGuidedCoeff(
+    const device float* p_MeanIr  [[buffer(0)]],
+    const device float* p_MeanIg  [[buffer(1)]],
+    const device float* p_MeanIb  [[buffer(2)]],
+    const device float* p_MeanP   [[buffer(3)]],
+    device float*       p_IrIr    [[buffer(4)]],   // read as E[Ir·Ir], then overwritten with a_r
+    const device float* p_IrIg    [[buffer(5)]],
+    const device float* p_IrIb    [[buffer(6)]],
+    device float*       p_IgIg    [[buffer(7)]],   // read as E[Ig·Ig], then overwritten with a_g
+    const device float* p_IgIb    [[buffer(8)]],
+    device float*       p_IbIb    [[buffer(9)]],   // read as E[Ib·Ib], then overwritten with a_b
+    const device float* p_IrP     [[buffer(10)]],
+    const device float* p_IgP     [[buffer(11)]],
+    const device float* p_IbP     [[buffer(12)]],
+    device float*       p_OutB    [[buffer(13)]],   // output b coefficient (write-only here)
+    constant int&       p_Width   [[buffer(20)]],
+    constant int&       p_Height  [[buffer(21)]],
+    constant float&     p_Epsilon [[buffer(22)]],
+    uint2 id [[thread_position_in_grid]])
+{
+    if ((int)id.x >= p_Width || (int)id.y >= p_Height) return;  // threads outside the image do nothing
+    const int i = (int)(id.y * (uint)p_Width) + (int)id.x;      // 1-float-per-pixel plane offset
+
+    float mIr = p_MeanIr[i], mIg = p_MeanIg[i], mIb = p_MeanIb[i], mP = p_MeanP[i];
+
+    // Covariance matrix Σ (symmetric 3×3): E[Ix·Iy] − E[Ix]·E[Iy], 6 unique entries
+    float s_rr = p_IrIr[i] - mIr * mIr;
+    float s_rg = p_IrIg[i] - mIr * mIg;
+    float s_rb = p_IrIb[i] - mIr * mIb;
+    float s_gg = p_IgIg[i] - mIg * mIg;
+    float s_gb = p_IgIb[i] - mIg * mIb;
+    float s_bb = p_IbIb[i] - mIb * mIb;
+
+    // Cross-covariance cov(I, p): E[Ix·p] − E[Ix]·E[p]
+    float c_rp = p_IrP[i] - mIr * mP;
+    float c_gp = p_IgP[i] - mIg * mP;
+    float c_bp = p_IbP[i] - mIb * mP;
+
+    // Adaptive epsilon = ε² / (trace/3 + ε): ≈ε where variance is ~0, smaller as it grows
+    float trace = s_rr + s_gg + s_bb;
+    float adaptEps = p_Epsilon * p_Epsilon / (trace / 3.0f + p_Epsilon + 1e-10f);  // 1e-10 keeps the denominator non-zero when trace and ε are both 0
+
+    // Add ε·I to diagonal
+    s_rr += adaptEps;
+    s_gg += adaptEps;
+    s_bb += adaptEps;
+
+    // Solve the symmetric 3×3 system through an explicit inverse (adjugate / determinant)
+    // det(M), expanded along the first row
+    float det = s_rr * (s_gg * s_bb - s_gb * s_gb)
+              - s_rg * (s_rg * s_bb - s_gb * s_rb)
+              + s_rb * (s_rg * s_gb - s_gg * s_rb);
+
+    float invDet = (abs(det) > 1e-12f) ? (1.0f / det) : 0.0f;  // near-singular: invDet = 0 → a = 0 and b = mean_p below
+
+    // Inverse entries (cofactor × 1/det); the matrix is symmetric so 6 entries suffice
+    float inv_rr = (s_gg * s_bb - s_gb * s_gb) * invDet;
+    float inv_rg = (s_rb * s_gb - s_rg * s_bb) * invDet;
+    float inv_rb = (s_rg * s_gb - s_rb * s_gg) * invDet;
+    float inv_gg = (s_rr * s_bb - s_rb * s_rb) * invDet;
+    float inv_gb = (s_rb * s_rg - s_rr * s_gb) * invDet;
+    float inv_bb = (s_rr * s_gg - s_rg * s_rg) * invDet;
+
+    // a = inv(Σ+εI) · cov(I,p)
+    float ar = inv_rr * c_rp + inv_rg * c_gp + inv_rb * c_bp;
+    float ag = inv_rg * c_rp + inv_gg * c_gp + inv_gb * c_bp;
+    float ab = inv_rb * c_rp + inv_gb * c_gp + inv_bb * c_bp;
+    float b  = mP - ar * mIr - ag * mIg - ab * mIb;
+
+    // Write coefficients: a_* reuse three input planes (already read above at index i), b has its own
+    p_IrIr[i] = ar;     // buffer(4)  ← a_r
+    p_IgIg[i] = ag;     // buffer(7)  ← a_g
+    p_IbIb[i] = ab;     // buffer(9)  ← a_b
+    p_OutB[i] = b;      // buffer(13) ← b
+}
+
+// ════════════════════════════════════════════════════════════════════════
+//  RGB Guided Filter — Intermediate Evaluation
+//  For iterative refinement: writes the refined alpha to a 1-channel plane
+//  (no RGBA write, no premultiply).  q = clamp(mean_a · I + mean_b, 0, 1)
+// ════════════════════════════════════════════════════════════════════════
+
+kernel void RGBGuidedEval(
+    const device float* p_Input   [[buffer(0)]],   // source RGBA (for RGB guide)
+    const device float* p_MeanAr  [[buffer(2)]],   // note: buffer index 1 is not used by this kernel
+    const device float* p_MeanAg  [[buffer(3)]],
+    const device float* p_MeanAb  [[buffer(4)]],
+    const device float* p_MeanB   [[buffer(5)]],
+    device float*       p_OutAlpha [[buffer(6)]],  // 1-float-per-pixel destination
+    constant int&       p_Width   [[buffer(20)]],
+    constant int&       p_Height  [[buffer(21)]],
+    uint2 id [[thread_position_in_grid]])
+{
+    if ((int)id.x >= p_Width || (int)id.y >= p_Height) return;  // threads outside the image do nothing
+    const int idx4 = ((int)(id.y * (uint)p_Width) + (int)id.x) * 4;  // offset into interleaved RGBA
+    const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x;        // offset into 1ch planes
+
+    float Ir = p_Input[idx4 + 0];
+    float Ig = p_Input[idx4 + 1];
+    float Ib = p_Input[idx4 + 2];
+
+    float q = p_MeanAr[idx1] * Ir + p_MeanAg[idx1] * Ig
+            + p_MeanAb[idx1] * Ib + p_MeanB[idx1];   // linear model evaluated at this pixel's guide colour
+    p_OutAlpha[idx1] = clamp(q, 0.0f, 1.0f);         // stored alpha is always within [0, 1]
+}
+
+// ════════════════════════════════════════════════════════════════════════
+//  RGB Guided Filter — Final Apply
+//  alpha = mix(raw, clamp(q, 0, 1), p_GFMix); optionally premultiplies RGB in place.
+// ════════════════════════════════════════════════════════════════════════
+
+kernel void RGBGuidedApply(
+    const device float* p_Input    [[buffer(0)]],   // source RGBA (for RGB guide)
+    device float*       p_Output   [[buffer(1)]],   // output RGBA (read+write)
+    const device float* p_MeanAr   [[buffer(2)]],
+    const device float* p_MeanAg   [[buffer(3)]],
+    const device float* p_MeanAb   [[buffer(4)]],
+    const device float* p_MeanB    [[buffer(5)]],
+    const device float* p_RawAlpha [[buffer(6)]],   // 1ch alpha to blend the guided result against
+    constant int&       p_Width    [[buffer(20)]],
+    constant int&       p_Height   [[buffer(21)]],
+    constant int&       p_Premultiply [[buffer(22)]],   // non-zero: scale RGB by the final alpha
+    constant float&     p_GFMix    [[buffer(23)]],   // mix weight: 0 → raw alpha, 1 → guided alpha
+    uint2 id [[thread_position_in_grid]])
+{
+    if ((int)id.x >= p_Width || (int)id.y >= p_Height) return;  // threads outside the image do nothing
+    const int idx4 = ((int)(id.y * (uint)p_Width) + (int)id.x) * 4;  // offset into interleaved RGBA
+    const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x;        // offset into 1ch planes
+
+    float Ir = p_Input[idx4 + 0];
+    float Ig = p_Input[idx4 + 1];
+    float Ib = p_Input[idx4 + 2];
+
+    float rawAlpha = p_RawAlpha[idx1];
+    float guidedAlpha = clamp(
+        p_MeanAr[idx1] * Ir + p_MeanAg[idx1] * Ig
+        + p_MeanAb[idx1] * Ib + p_MeanB[idx1], 0.0f, 1.0f);   // same model as RGBGuidedEval
+    float alpha = mix(rawAlpha, guidedAlpha, p_GFMix);       // rawAlpha itself is not clamped here
+
+    float outR = p_Output[idx4 + 0];
+    float outG = p_Output[idx4 + 1];
+    float outB = p_Output[idx4 + 2];
+
+    if (p_Premultiply != 0) {
+        outR *= alpha;
+        outG *= alpha;
+        outB *= alpha;
+    }
+
+    p_Output[idx4 + 0] = outR;   // RGB is written back even when premultiply is off (values unchanged)
+    p_Output[idx4 + 1] = outG;
+    p_Output[idx4 + 2] = outB;
+    p_Output[idx4 + 3] = alpha;
+}
+
+// ════════════════════════════════════════════════════════════════════════
+//  Write Alpha — copies a 1ch alpha plane into channel 3 of an RGBA buffer
+//  (RGB untouched). Used during iterative RGB guided filter refinement.
+// ════════════════════════════════════════════════════════════════════════
+
+kernel void WriteAlphaKernel(
+    const device float* p_Alpha  [[buffer(0)]],   // 1 float per pixel
+    device float*       p_RGBA   [[buffer(1)]],   // 4 floats per pixel; only index 3 is written
+    constant int&       p_Width  [[buffer(10)]],
+    constant int&       p_Height [[buffer(11)]],
+    uint2 id [[thread_position_in_grid]])
+{
+    if ((int)id.x >= p_Width || (int)id.y >= p_Height) return;  // threads outside the image do nothing
+    const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x;   // pixel offset in the 1ch plane
+    p_RGBA[idx1 * 4 + 3] = p_Alpha[idx1];                       // value copied as-is, no clamp
+}
+
+// ════════════════════════════════════════════════════════════════════════
+//  Erode Alpha — minimum over a (2r+1)×(2r+1) square window on 1ch alpha,
+//  edge pixels replicated. Shrinks the initial matte before clean plate
+//  estimation to prevent foreground contamination in the synthetic screen.
+// ════════════════════════════════════════════════════════════════════════
+
+kernel void ErodeAlphaKernel(
+    const device float* p_Src    [[buffer(0)]],   // NOTE(review): neighbours are read here while other threads write p_Dst —
+    device float*       p_Dst    [[buffer(1)]],   // assumes p_Src and p_Dst are different buffers; confirm at the call site
+    constant int&       p_Width  [[buffer(10)]],
+    constant int&       p_Height [[buffer(11)]],
+    constant int&       p_Radius [[buffer(12)]],  // window half-size in pixels; 0 reduces to a copy capped at 1.0
+    uint2 id [[thread_position_in_grid]])
+{
+    if ((int)id.x >= p_Width || (int)id.y >= p_Height) return;  // threads outside the image do nothing
+    int x = (int)id.x, y = (int)id.y, r = p_Radius;
+    int w = p_Width, h = p_Height;
+    float minVal = 1.0f;  // seed caps the result at 1.0; it is also the output if r < 0 (loops never run)
+    for (int dy = -r; dy <= r; dy++) {
+        int sy = clamp(y + dy, 0, h - 1);       // clamp-to-edge in Y
+        for (int dx = -r; dx <= r; dx++) {
+            int sx = clamp(x + dx, 0, w - 1);   // clamp-to-edge in X
+            minVal = min(minVal, p_Src[sy * w + sx]);
+        }
+    }
+    p_Dst[y * w + x] = minVal;
+}
+
+// ════════════════════════════════════════════════════════════════════════
+//  Clean Plate Estimate — IBKColour-style synthetic screen generation
+//  Where alpha ≈ 0 (pure screen): keeps source pixel (preserves screen
+//  variation — light falloff, wrinkles, color gradients)
+//  Where alpha ≈ 1 (pure FG): replaces with picked screen color
+//  Smoothstep blending avoids hard transition artifacts.
+//  The subsequent blur fills FG holes from surrounding BG information.
+// ════════════════════════════════════════════════════════════════════════
+
+kernel void CleanPlateEstimateKernel(
+    const device float* p_Input  [[buffer(0)]],   // source RGBA (alpha channel is not read)
+    const device float* p_Alpha  [[buffer(1)]],   // 1ch initial alpha (eroded)
+    device float*       p_OutR   [[buffer(2)]],   // clean plate R channel
+    device float*       p_OutG   [[buffer(3)]],   // clean plate G channel
+    device float*       p_OutB   [[buffer(4)]],   // clean plate B channel
+    constant int&       p_Width  [[buffer(10)]],
+    constant int&       p_Height [[buffer(11)]],
+    constant float&     p_PickR  [[buffer(12)]],  // picked screen colour, substituted where alpha → 1
+    constant float&     p_PickG  [[buffer(13)]],
+    constant float&     p_PickB  [[buffer(14)]],
+    uint2 id [[thread_position_in_grid]])
+{
+    if ((int)id.x >= p_Width || (int)id.y >= p_Height) return;  // threads outside the image do nothing
+    const int idx4 = ((int)(id.y * (uint)p_Width) + (int)id.x) * 4;  // offset into interleaved RGBA
+    const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x;        // offset into 1ch planes
+
+    float srcR = p_Input[idx4 + 0];
+    float srcG = p_Input[idx4 + 1];
+    float srcB = p_Input[idx4 + 2];
+    float alpha = clamp(p_Alpha[idx1], 0.0f, 1.0f);  // keeps the blend weight below within [0, 1]
+
+    // Smoothstep weight 3a² − 2a³: 0 at a=0, 1 at a=1, zero slope at both ends
+    float t = alpha * alpha * (3.0f - 2.0f * alpha);
+
+    // Blend: keep source at alpha=0 (screen), replace with picked color at alpha=1 (FG)
+    p_OutR[idx1] = mix(srcR, p_PickR, t);
+    p_OutG[idx1] = mix(srcG, p_PickG, t);
+    p_OutB[idx1] = mix(srcB, p_PickB, t);
+}
+
+// ════════════════════════════════════════════════════════════════════════
+//  Pack RGBA — interleaves three 1ch planes into an RGBA buffer, alpha = 1.0
+//  Builds the clean plate RGBA for the second core key pass.
+// ════════════════════════════════════════════════════════════════════════
+
+kernel void PackRGBAKernel(
+    const device float* p_R      [[buffer(0)]],   // 1 float per pixel
+    const device float* p_G      [[buffer(1)]],   // 1 float per pixel
+    const device float* p_B      [[buffer(2)]],   // 1 float per pixel
+    device float*       p_RGBA   [[buffer(3)]],   // 4 floats per pixel, fully overwritten
+    constant int&       p_Width  [[buffer(10)]],
+    constant int&       p_Height [[buffer(11)]],
+    uint2 id [[thread_position_in_grid]])
+{
+    if ((int)id.x >= p_Width || (int)id.y >= p_Height) return;  // threads outside the image do nothing
+    const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x;   // pixel offset in the 1ch planes
+    const int idx4 = idx1 * 4;                                  // same pixel in the interleaved buffer
+    p_RGBA[idx4 + 0] = p_R[idx1];
+    p_RGBA[idx4 + 1] = p_G[idx1];
+    p_RGBA[idx4 + 2] = p_B[idx1];
+    p_RGBA[idx4 + 3] = 1.0f;   // constant opaque alpha
+}
+
+// ════════════════════════════════════════════════════════════════════════
+//  Apply External Matte — garbage / occlusion matte support
+//  Reads channel 3 of an RGBA matte buffer (used as-is, not clamped) and
+//  updates both the 1ch alpha plane and the output RGBA alpha channel.
+//  Mode 0     = garbage   (white = remove: alpha *= 1 - matte)
+//  Mode != 0  = occlusion (white = keep:   alpha = max(alpha, matte))
+// ════════════════════════════════════════════════════════════════════════
+
+kernel void ApplyMatteKernel(
+    device float*       p_Alpha   [[buffer(0)]],   // 1ch alpha, read and rewritten
+    device float*       p_Output  [[buffer(1)]],   // RGBA; only channel 3 is written
+    const device float* p_Matte   [[buffer(2)]],   // RGBA; only channel 3 is read
+    constant int&       p_Width   [[buffer(10)]],
+    constant int&       p_Height  [[buffer(11)]],
+    constant int&       p_Mode    [[buffer(12)]],
+    uint2 id [[thread_position_in_grid]])
+{
+    if ((int)id.x >= p_Width || (int)id.y >= p_Height) return;  // threads outside the image do nothing
+    const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x;   // pixel offset in the 1ch plane
+    const int idx4 = idx1 * 4;                                  // same pixel in the RGBA buffers
+
+    float matteVal = p_Matte[idx4 + 3];
+    float alpha = p_Alpha[idx1];
+
+    if (p_Mode == 0) {
+        // Garbage: white areas = remove from key (a matte value above 1 would drive alpha negative)
+        alpha *= (1.0f - matteVal);
+    } else {
+        // Occlusion: white areas = force opaque (alpha can only increase)
+        alpha = max(alpha, matteVal);
+    }
+
+    p_Alpha[idx1] = alpha;
+    p_Output[idx4 + 3] = alpha;   // RGB of p_Output is left as it was
+}
+
+// ════════════════════════════════════════════════════════════════════════
+//  Diagnostic Output — writes intermediate pipeline data to output
+//  Mode 0: 1-channel alpha buffer → greyscale RGBA (R=G=B=alpha, A=1)
+//  Mode 1: copy RGBA buffer to output
+//  Any other mode (nominally 2): RGBA alpha → greyscale (R=G=B=src.a, A=1)
+// ════════════════════════════════════════════════════════════════════════
+
+kernel void DiagnosticOutputKernel(
+    const device float* p_SrcA    [[buffer(0)]],   // 1ch source; read only in mode 0
+    const device float* p_SrcRGBA [[buffer(1)]],   // RGBA source; read only when mode != 0
+    device float*       p_Output  [[buffer(2)]],   // RGBA destination, all 4 channels written
+    constant int&       p_Width   [[buffer(10)]],
+    constant int&       p_Height  [[buffer(11)]],
+    constant int&       p_Mode    [[buffer(12)]],
+    uint2 id [[thread_position_in_grid]])
+{
+    if ((int)id.x >= p_Width || (int)id.y >= p_Height) return;  // threads outside the image do nothing
+    const int idx1 = (int)(id.y * (uint)p_Width) + (int)id.x;   // pixel offset in the 1ch plane
+    const int idx4 = idx1 * 4;                                  // same pixel in the RGBA buffers
+
+    if (p_Mode == 0) {
+        // 1-channel alpha → greyscale RGBA
+        float a = p_SrcA[idx1];
+        p_Output[idx4 + 0] = a;
+        p_Output[idx4 + 1] = a;
+        p_Output[idx4 + 2] = a;
+        p_Output[idx4 + 3] = 1.0f;
+    } else if (p_Mode == 1) {
+        // Copy RGBA verbatim, alpha included
+        p_Output[idx4 + 0] = p_SrcRGBA[idx4 + 0];
+        p_Output[idx4 + 1] = p_SrcRGBA[idx4 + 1];
+        p_Output[idx4 + 2] = p_SrcRGBA[idx4 + 2];
+        p_Output[idx4 + 3] = p_SrcRGBA[idx4 + 3];
+    } else {
+        // Fallthrough for every other mode value: extract alpha from RGBA → greyscale
+        float a = p_SrcRGBA[idx4 + 3];
+        p_Output[idx4 + 0] = a;
+        p_Output[idx4 + 1] = a;
+        p_Output[idx4 + 2] = a;
+        p_Output[idx4 + 3] = 1.0f;
+    }
+}
+)";  // end of kernelSource
 
 struct PipelineSet {
     id<MTLComputePipelineState> coreKeyer;
     id<MTLComputePipelineState> gaussianBlurH;
     id<MTLComputePipelineState> gaussianBlurV;
+    id<MTLComputePipelineState> gaussianBlurH4;
+    id<MTLComputePipelineState> gaussianBlurV4;
     id<MTLComputePipelineState> computeProducts;
     id<MTLComputePipelineState> guidedCoeff;
     id<MTLComputePipelineState> refineGuide;
@@ -566,26 +1223,47 @@ kernel void EdgeColorCorrectKernel(
     id<MTLComputePipelineState> extractChannel;
     id<MTLComputePipelineState> bgWrap;
     id<MTLComputePipelineState> edgeColorCorrect;
+    // RGB guided filter
+    id<MTLComputePipelineState> rgbComputeProducts;
+    id<MTLComputePipelineState> rgbGuidedCoeff;
+    id<MTLComputePipelineState> rgbGuidedEval;
+    id<MTLComputePipelineState> rgbGuidedApply;
+    id<MTLComputePipelineState> writeAlpha;
+    // Prematte (clean plate generation)
+    id<MTLComputePipelineState> erodeAlpha;
+    id<MTLComputePipelineState> cleanPlateEstimate;
+    id<MTLComputePipelineState> packRGBA;
+    // External mattes
+    id<MTLComputePipelineState> applyMatte;
+    // Diagnostic output
+    id<MTLComputePipelineState> diagnosticOutput;
+    // Tiled blur (threadgroup shared memory)
+    id<MTLComputePipelineState> gaussianBlurHTiled;
+    id<MTLComputePipelineState> gaussianBlurVTiled;
+    // Additive key
+    id<MTLComputePipelineState> additiveKey;
 };
 
 struct QueueState {
     PipelineSet pipes;
-    // Cached temp buffers (persist across frames — avoids alloc/free churn)
-    //   tempA: raw alpha / p / mean_p / b / mean_b
-    //   tempB: guide / I / mean_I / a / mean_a
-    //   tempC: I*p / mean_Ip / eval scratch
-    //   tempD: I*I / mean_II
-    //   tempE: guide copy (preserved through blur passes)
-    //   tempF: Gaussian blur scratch (H/V intermediate)
-    //   tempG: saved raw alpha (for final mix against original)
-    id<MTLBuffer> tempA = nil;
-    id<MTLBuffer> tempB = nil;
-    id<MTLBuffer> tempC = nil;
-    id<MTLBuffer> tempD = nil;
-    id<MTLBuffer> tempE = nil;
-    id<MTLBuffer> tempF = nil;
-    id<MTLBuffer> tempG = nil;
+    // Temp buffer pool (replaces the named tempA..tempG members); slot roles per mode:
+    //   Scalar GF: uses temp[0..6]
+    //     [0]=rawAlpha/p, [1]=guide/I, [2]=Ip, [3]=II,
+    //     [4]=guideCopy, [5]=blurScratch, [6]=savedRawAlpha
+    //   RGB GF: uses temp[0..17]  (4 + 6 + 3 + 4 + 1 = 18 slots)
+    //     [0..3]=meanIr,Ig,Ib,P  [4..9]=IrIr,IrIg,IrIb,IgIg,IgIb,IbIb
+    //     [10..12]=Irp,Igp,Ibp  [13..16]=blur4 scratch  [17]=savedRawAlpha
+    static const int MAX_TEMP = 18;    // pool capacity = size of the RGB GF layout above
+    id<MTLBuffer> temp[MAX_TEMP] = {}; // all slots start out nil
     size_t cachedChanBytes = 0;
+    int cachedBufCount = 0;   // how many temps are currently allocated
+    // Prematte clean plate buffer (RGBA-sized)
+    id<MTLBuffer> cleanPlateBuf = nil;
+    size_t cachedCleanPlateBytes = 0;   // presumably the byte size cleanPlateBuf was allocated with — confirm
+    // Cached Gaussian weight buffers (avoid per-frame alloc/free); -1 radius = nothing cached yet (presumably)
+    id<MTLBuffer> gfWeightBuf = nil;   int gfWeightRadius = -1;   // presumably guided-filter blur weights — confirm
+    id<MTLBuffer> pmWeightBuf = nil;   int pmWeightRadius = -1;   // presumably prematte blur weights — confirm
+    id<MTLBuffer> bwWeightBuf = nil;   int bwWeightRadius = -1;   // presumably bg-wrap blur weights — confirm
 };
 
 std::mutex s_PipelineMutex;
@@ -617,6 +1295,9 @@ static void dispatch2D(id<MTLComputeCommandEncoder> enc,
 // ═══════════════════════════════════════════════════════════════════════════════
 //  Gaussian blur helper: single separable H+V pass with pre-computed weights.
 //  Blurs bufA in-place, uses bufScratch as intermediate.
+//  Uses threadgroup-tiled kernels when the tile fits in 4096-float shared
+//  memory (radius ≤ 240), falls back to the simple global-read kernels
+//  for very large radii.
 // ═══════════════════════════════════════════════════════════════════════════════
 
 static void gaussianBlur(id<MTLComputeCommandEncoder> enc,
@@ -625,25 +1306,103 @@ static void gaussianBlur(id<MTLComputeCommandEncoder> enc,
                          id<MTLBuffer> weightBuf,
                          int w, int h, int radius)
 {
-    // Horizontal: A → scratch
-    [enc setComputePipelineState:ps.gaussianBlurH];
-    [enc setBuffer:bufA       offset:0 atIndex:3];
-    [enc setBuffer:bufScratch offset:0 atIndex:4];
-    [enc setBuffer:weightBuf  offset:0 atIndex:5];
+    // Threadgroup sizes: (32,8)=256 for H, (8,32)=256 for V
+    const int tgW_H = 32, tgH_H = 8;
+    const int tgW_V = 8,  tgH_V = 32;
+    bool canTileH = (tgH_H * (tgW_H + 2 * radius) <= 4096);
+    bool canTileV = (tgW_V * (tgH_V + 2 * radius) <= 4096);
+
+    if (canTileH && canTileV && ps.gaussianBlurHTiled && ps.gaussianBlurVTiled) {
+        // ── Tiled H: A → scratch ──
+        [enc setComputePipelineState:ps.gaussianBlurHTiled];
+        [enc setBuffer:bufA       offset:0 atIndex:3];
+        [enc setBuffer:bufScratch offset:0 atIndex:4];
+        [enc setBuffer:weightBuf  offset:0 atIndex:5];
+        [enc setBytes:&w      length:sizeof(int) atIndex:10];
+        [enc setBytes:&h      length:sizeof(int) atIndex:11];
+        [enc setBytes:&radius length:sizeof(int) atIndex:12];
+        MTLSize tgSizeH = MTLSizeMake(tgW_H, tgH_H, 1);
+        MTLSize gridH   = MTLSizeMake((w + tgW_H - 1) / tgW_H, (h + tgH_H - 1) / tgH_H, 1);
+        [enc dispatchThreadgroups:gridH threadsPerThreadgroup:tgSizeH];
+
+        // ── Tiled V: scratch → A ──
+        [enc setComputePipelineState:ps.gaussianBlurVTiled];
+        [enc setBuffer:bufScratch offset:0 atIndex:3];
+        [enc setBuffer:bufA       offset:0 atIndex:4];
+        [enc setBuffer:weightBuf  offset:0 atIndex:5];
+        [enc setBytes:&w      length:sizeof(int) atIndex:10];
+        [enc setBytes:&h      length:sizeof(int) atIndex:11];
+        [enc setBytes:&radius length:sizeof(int) atIndex:12];
+        MTLSize tgSizeV = MTLSizeMake(tgW_V, tgH_V, 1);
+        MTLSize gridV   = MTLSizeMake((w + tgW_V - 1) / tgW_V, (h + tgH_V - 1) / tgH_V, 1);
+        [enc dispatchThreadgroups:gridV threadsPerThreadgroup:tgSizeV];
+    } else {
+        // ── Fallback: global-read kernels ──
+        [enc setComputePipelineState:ps.gaussianBlurH];
+        [enc setBuffer:bufA       offset:0 atIndex:3];
+        [enc setBuffer:bufScratch offset:0 atIndex:4];
+        [enc setBuffer:weightBuf  offset:0 atIndex:5];
+        [enc setBytes:&w      length:sizeof(int) atIndex:10];
+        [enc setBytes:&h      length:sizeof(int) atIndex:11];
+        [enc setBytes:&radius length:sizeof(int) atIndex:12];
+        dispatch2D(enc, ps.gaussianBlurH, w, h);
+
+        [enc setComputePipelineState:ps.gaussianBlurV];
+        [enc setBuffer:bufScratch offset:0 atIndex:3];
+        [enc setBuffer:bufA       offset:0 atIndex:4];
+        [enc setBuffer:weightBuf  offset:0 atIndex:5];
+        [enc setBytes:&w      length:sizeof(int) atIndex:10];
+        [enc setBytes:&h      length:sizeof(int) atIndex:11];
+        [enc setBytes:&radius length:sizeof(int) atIndex:12];
+        dispatch2D(enc, ps.gaussianBlurV, w, h);
+    }
+}
+
+// ═══════════════════════════════════════════════════════════════════════════════
+//  Gaussian blur helper: blurs four 1ch planes with one H and one V dispatch.
+//  a,b,c,d are blurred in place; sa,sb,sc,sd are scratch and get overwritten.
+// ═══════════════════════════════════════════════════════════════════════════════
+
+static void gaussianBlur4(id<MTLComputeCommandEncoder> enc,
+                          const PipelineSet& ps,
+                          id<MTLBuffer> a, id<MTLBuffer> b,
+                          id<MTLBuffer> c, id<MTLBuffer> d,
+                          id<MTLBuffer> sa, id<MTLBuffer> sb,
+                          id<MTLBuffer> sc, id<MTLBuffer> sd,
+                          id<MTLBuffer> weightBuf,
+                          int w, int h, int radius)
+{
+    // Horizontal: a,b,c,d → sa,sb,sc,sd  (bindings 0-3 = inputs, 4-7 = outputs)
+    [enc setComputePipelineState:ps.gaussianBlurH4];
+    [enc setBuffer:a  offset:0 atIndex:0];
+    [enc setBuffer:b  offset:0 atIndex:1];
+    [enc setBuffer:c  offset:0 atIndex:2];
+    [enc setBuffer:d  offset:0 atIndex:3];
+    [enc setBuffer:sa offset:0 atIndex:4];
+    [enc setBuffer:sb offset:0 atIndex:5];
+    [enc setBuffer:sc offset:0 atIndex:6];
+    [enc setBuffer:sd offset:0 atIndex:7];
+    [enc setBuffer:weightBuf offset:0 atIndex:8];  // weights sit at index 8 here (the 1ch blur binds them at 5)
     [enc setBytes:&w      length:sizeof(int) atIndex:10];
     [enc setBytes:&h      length:sizeof(int) atIndex:11];
     [enc setBytes:&radius length:sizeof(int) atIndex:12];
-    dispatch2D(enc, ps.gaussianBlurH, w, h);
-
-    // Vertical: scratch → A
-    [enc setComputePipelineState:ps.gaussianBlurV];
-    [enc setBuffer:bufScratch offset:0 atIndex:3];
-    [enc setBuffer:bufA       offset:0 atIndex:4];
-    [enc setBuffer:weightBuf  offset:0 atIndex:5];
+    dispatch2D(enc, ps.gaussianBlurH4, w, h);
+
+    // Vertical: sa,sb,sc,sd → a,b,c,d  (inputs and outputs swapped so the result lands in a..d)
+    [enc setComputePipelineState:ps.gaussianBlurV4];
+    [enc setBuffer:sa offset:0 atIndex:0];
+    [enc setBuffer:sb offset:0 atIndex:1];
+    [enc setBuffer:sc offset:0 atIndex:2];
+    [enc setBuffer:sd offset:0 atIndex:3];
+    [enc setBuffer:a  offset:0 atIndex:4];
+    [enc setBuffer:b  offset:0 atIndex:5];
+    [enc setBuffer:c  offset:0 atIndex:6];
+    [enc setBuffer:d  offset:0 atIndex:7];
+    [enc setBuffer:weightBuf offset:0 atIndex:8];  // same weights for both passes
     [enc setBytes:&w      length:sizeof(int) atIndex:10];
     [enc setBytes:&h      length:sizeof(int) atIndex:11];
     [enc setBytes:&radius length:sizeof(int) atIndex:12];
-    dispatch2D(enc, ps.gaussianBlurV, w, h);
+    dispatch2D(enc, ps.gaussianBlurV4, w, h);
 }
 
 // ═══════════════════════════════════════════════════════════════════════════════
@@ -658,12 +1417,19 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height,
                     int p_Premultiply, int p_NearGreyExtract,
                     float p_NearGreyAmount, float p_NearGreySoftness,
                     float p_BlackClip, float p_WhiteClip, float p_MatteGamma,
-                    int p_GuidedFilterEnabled, int p_GuidedRadius, float p_GuidedEpsilon,
+                    int p_PrematteEnabled, int p_PrematteBlur, int p_PrematteErode, int p_PrematteIterations,
+                    int p_GuidedFilterEnabled, int p_GuidedFilterMode,
+                    int p_GuidedRadius, float p_GuidedEpsilon,
                     float p_GuidedMix, float p_EdgeProtect, int p_RefineIterations,
                     float p_EdgeColorCorrect,
                     int p_BgWrapEnabled, int p_BgWrapBlur, float p_BgWrapAmount,
+                    int p_AdditiveKeyEnabled, int p_AdditiveKeyMode,
+                    float p_AdditiveKeySat, float p_AdditiveKeyAmount, int p_AdditiveKeyBlackClamp,
+                    int p_ViewMode,
                     const float* p_Input, const float* p_Screen,
-                    const float* p_Background, float* p_Output)
+                    const float* p_Background,
+                    const float* p_GarbageMatte, const float* p_OcclusionMatte,
+                    float* p_Output)
 {
   @autoreleasepool {
     id<MTLCommandQueue> queue = static_cast<id<MTLCommandQueue>>(p_CmdQ);
@@ -675,10 +1441,10 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height,
     auto it = s_QueueStateMap.find(queue);
     if (it == s_QueueStateMap.end()) {
         MTLCompileOptions* options = [MTLCompileOptions new];
-        // Keep Metal math conservative here. Fast-math is tempting, but parity debugging gets much
-        // harder when the Metal backend quietly takes a different numerical path from CPU/CUDA and
-        // from the original Gaffer graph. If we revisit this as a performance optimization later,
-        // it should be treated as a measured opt-in change rather than the default behavior.
+        // Keep Metal math conservative here. The private branch was using fast math to chase speed,
+        // but this cross-platform port leans on CPU as the parity anchor. Safe math makes it much
+        // easier to compare Metal against CPU/CUDA without backend-specific numeric drift muddying
+        // whether a feature port is actually correct.
 #if defined(__MAC_OS_X_VERSION_MAX_ALLOWED) && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
         options.mathMode = MTLMathModeSafe;
 #else
@@ -694,6 +1460,8 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height,
         qs.pipes.coreKeyer       = makePipeline(lib, "IBKeymasterCoreKernel");
         qs.pipes.gaussianBlurH   = makePipeline(lib, "GaussianBlurH");
         qs.pipes.gaussianBlurV   = makePipeline(lib, "GaussianBlurV");
+        qs.pipes.gaussianBlurH4  = makePipeline(lib, "GaussianBlurH4");
+        qs.pipes.gaussianBlurV4  = makePipeline(lib, "GaussianBlurV4");
         qs.pipes.computeProducts = makePipeline(lib, "ComputeProducts");
         qs.pipes.guidedCoeff     = makePipeline(lib, "GuidedFilterCoeff");
         qs.pipes.refineGuide     = makePipeline(lib, "RefineGuideKernel");
@@ -704,6 +1472,19 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height,
         qs.pipes.extractChannel  = makePipeline(lib, "ExtractChannel");
         qs.pipes.bgWrap          = makePipeline(lib, "BgWrapKernel");
         qs.pipes.edgeColorCorrect = makePipeline(lib, "EdgeColorCorrectKernel");
+        qs.pipes.rgbComputeProducts = makePipeline(lib, "RGBComputeProducts");
+        qs.pipes.rgbGuidedCoeff     = makePipeline(lib, "RGBGuidedCoeff");
+        qs.pipes.rgbGuidedEval      = makePipeline(lib, "RGBGuidedEval");
+        qs.pipes.rgbGuidedApply     = makePipeline(lib, "RGBGuidedApply");
+        qs.pipes.writeAlpha         = makePipeline(lib, "WriteAlphaKernel");
+        qs.pipes.erodeAlpha         = makePipeline(lib, "ErodeAlphaKernel");
+        qs.pipes.cleanPlateEstimate = makePipeline(lib, "CleanPlateEstimateKernel");
+        qs.pipes.packRGBA           = makePipeline(lib, "PackRGBAKernel");
+        qs.pipes.applyMatte         = makePipeline(lib, "ApplyMatteKernel");
+        qs.pipes.diagnosticOutput   = makePipeline(lib, "DiagnosticOutputKernel");
+        qs.pipes.gaussianBlurHTiled = makePipeline(lib, "GaussianBlurH_Tiled");
+        qs.pipes.gaussianBlurVTiled = makePipeline(lib, "GaussianBlurV_Tiled");
+        qs.pipes.additiveKey        = makePipeline(lib, "AdditiveKeyKernel");
         [lib release];
         s_QueueStateMap[queue] = qs;
         it = s_QueueStateMap.find(queue);
@@ -713,47 +1494,56 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height,
 
     // ── Cached temp buffers (reused across frames) ──
     size_t chanBytes = (size_t)p_Width * (size_t)p_Height * sizeof(float);
+    bool doPrematte = p_PrematteEnabled && p_PrematteBlur > 0;
     bool doGF = p_GuidedFilterEnabled && p_GuidedRadius > 0;
     bool doBgWrap = p_BgWrapEnabled && p_Background && p_BgWrapAmount > 0.0f;
+    bool doAdditiveKey = p_AdditiveKeyEnabled && p_AdditiveKeyAmount > 0.0f;
+    bool needBgBlur = doBgWrap || (doAdditiveKey && p_AdditiveKeyMode == 1 && p_Background != nullptr);
+    bool rgbGF = doGF && (p_GuidedFilterMode == 1);
+
+    // Determine how many temp buffers are needed
+    int neededBufs = 2;  // always need temp[0..1] for core keyer output
+    if (doPrematte)                      neededBufs = std::max(neededBufs, 7);  // prematte binds temp[0..5]; 7 matches the GF/BG-blur allocation below
+    if (rgbGF)                           neededBufs = 18;  // temp[0..17]
+    else if (doGF || needBgBlur)         neededBufs = std::max(neededBufs, 7);  // temp[0..6]
 
     // Reallocate if resolution changed
     if (chanBytes != state.cachedChanBytes) {
-        if (state.tempA) [state.tempA release];
-        if (state.tempB) [state.tempB release];
-        if (state.tempC) [state.tempC release];
-        if (state.tempD) [state.tempD release];
-        if (state.tempE) [state.tempE release];
-        if (state.tempF) [state.tempF release];
-        if (state.tempG) [state.tempG release];
-        state.tempA = nil; state.tempB = nil; state.tempC = nil;
-        state.tempD = nil; state.tempE = nil; state.tempF = nil;
-        state.tempG = nil;
+        for (int i = 0; i < QueueState::MAX_TEMP; i++) {
+            if (state.temp[i]) { [state.temp[i] release]; state.temp[i] = nil; }
+        }
+        if (state.cleanPlateBuf) { [state.cleanPlateBuf release]; state.cleanPlateBuf = nil; }
         state.cachedChanBytes = chanBytes;
+        state.cachedCleanPlateBytes = 0;
+        state.cachedBufCount = 0;
+    }
+
+    // Allocate temp buffers on demand
+    if (neededBufs > state.cachedBufCount) {
+        for (int i = state.cachedBufCount; i < neededBufs; i++) {
+            if (!state.temp[i])
+                state.temp[i] = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate];
+        }
+        state.cachedBufCount = neededBufs;
     }
 
-    // Allocate on demand
-    if ((doGF || doBgWrap) && !state.tempC) {
-        // Need buffers for GF and/or BG wrap
-        if (!state.tempA) state.tempA = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate];
-        if (!state.tempB) state.tempB = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate];
-        state.tempC = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate];
-        state.tempD = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate];
-        state.tempE = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate];
-        state.tempF = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate];
-        state.tempG = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate];
-    } else if (!doGF && !doBgWrap && !state.tempA) {
-        // Need tempA and tempB even without GF (core kernel writes to them)
-        state.tempA = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate];
-        state.tempB = [device newBufferWithLength:chanBytes options:MTLResourceStorageModePrivate];
+    // Allocate RGBA clean plate buffer for prematte
+    size_t rgbaBytes = chanBytes * 4;
+    if (doPrematte && state.cachedCleanPlateBytes != rgbaBytes) {
+        if (state.cleanPlateBuf) { [state.cleanPlateBuf release]; state.cleanPlateBuf = nil; }
+        state.cleanPlateBuf = [device newBufferWithLength:rgbaBytes options:MTLResourceStorageModePrivate];
+        state.cachedCleanPlateBytes = rgbaBytes;
     }
 
-    id<MTLBuffer> tempA = state.tempA;
-    id<MTLBuffer> tempB = state.tempB;
-    id<MTLBuffer> tempC = state.tempC;
-    id<MTLBuffer> tempD = state.tempD;
-    id<MTLBuffer> tempE = state.tempE;
-    id<MTLBuffer> tempF = state.tempF;
-    id<MTLBuffer> tempG = state.tempG;
+    // Scalar aliases (backward compatible with existing scalar dispatch code)
+    id<MTLBuffer> tempA = state.temp[0];
+    id<MTLBuffer> tempB = state.temp[1];
+    id<MTLBuffer> tempC = (neededBufs > 2) ? state.temp[2] : nil;
+    id<MTLBuffer> tempD = (neededBufs > 3) ? state.temp[3] : nil;
+    id<MTLBuffer> tempE = (neededBufs > 4) ? state.temp[4] : nil;
+    id<MTLBuffer> tempF = (neededBufs > 5) ? state.temp[5] : nil;
+    id<MTLBuffer> tempG = (neededBufs > 6) ? state.temp[6] : nil;
+    id<MTLBuffer>* t = state.temp;   // direct array access for RGB path
     lock.unlock();
 
     // ── Resolve Metal buffers ──
@@ -771,26 +1561,29 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height,
         createdDummy = true;
     }
 
-    // ── Pre-compute Gaussian weights (tiny shared buffer, freed per-frame) ──
+    // ── Cached Gaussian weights — only re-created when radius changes ──
     id<MTLBuffer> weightBuf = nil;
     if (doGF) {
         int r = p_GuidedRadius;
-        int kernelSize = 2 * r + 1;
-        float sigma = fmaxf(r / 3.0f, 0.5f);
-        float invTwoSigmaSq = 1.0f / (2.0f * sigma * sigma);
-
-        float* weights = (float*)alloca(kernelSize * sizeof(float));
-        float wsum = 0.0f;
-        for (int i = -r; i <= r; i++) {
-            float w = expf(-(float)(i * i) * invTwoSigmaSq);
-            weights[i + r] = w;
-            wsum += w;
+        if (state.gfWeightRadius != r) {
+            if (state.gfWeightBuf) [state.gfWeightBuf release];
+            int kernelSize = 2 * r + 1;
+            float sigma = fmaxf(r / 3.0f, 0.5f);
+            float invTwoSigmaSq = 1.0f / (2.0f * sigma * sigma);
+            float* weights = (float*)alloca(kernelSize * sizeof(float));
+            float wsum = 0.0f;
+            for (int i = -r; i <= r; i++) {
+                float w = expf(-(float)(i * i) * invTwoSigmaSq);
+                weights[i + r] = w;
+                wsum += w;
+            }
+            for (int i = 0; i < kernelSize; i++) weights[i] /= wsum;
+            state.gfWeightBuf = [device newBufferWithBytes:weights
+                                                    length:kernelSize * sizeof(float)
+                                                   options:MTLResourceStorageModeShared];
+            state.gfWeightRadius = r;
         }
-        for (int i = 0; i < kernelSize; i++) weights[i] /= wsum;
-
-        weightBuf = [device newBufferWithBytes:weights
-                                        length:kernelSize * sizeof(float)
-                                       options:MTLResourceStorageModeShared];
+        weightBuf = state.gfWeightBuf;
     }
 
     // ══════════════════════════════════════════════════════════════════════
@@ -829,7 +1622,191 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height,
 
     // After core: dstBuf=RGBA(despilled+rawAlpha), tempA=rawAlpha, tempB=guide
 
-    if (doGF) {
+    // ══════════════════════════════════════════════════════════════════════
+    //  PREMATTE — IBKColour-style synthetic clean plate generation
+    //  1. Erode initial alpha (prevents FG bleed into plate estimate)
+    //  2. Estimate clean plate: blend source → picked screen based on alpha
+    //  3. Blur the plate (fills FG holes from surrounding BG information)
+    //  4. Pack into RGBA and re-run core keyer with the clean plate as screen
+    // ══════════════════════════════════════════════════════════════════════
+    if (doPrematte) {
+        // Cached Gaussian weights for prematte blur
+        int pmR = std::max(1, p_PrematteBlur);
+        if (state.pmWeightRadius != pmR) {
+            if (state.pmWeightBuf) [state.pmWeightBuf release];
+            int pmKernelSize = 2 * pmR + 1;
+            float pmSigma = fmaxf(pmR / 3.0f, 0.5f);
+            float pmInv2s2 = 1.0f / (2.0f * pmSigma * pmSigma);
+            float* pmW = (float*)alloca(pmKernelSize * sizeof(float));
+            float pmSum = 0.0f;
+            for (int i = -pmR; i <= pmR; i++) {
+                float wt = expf(-(float)(i * i) * pmInv2s2);
+                pmW[i + pmR] = wt;
+                pmSum += wt;
+            }
+            for (int i = 0; i < pmKernelSize; i++) pmW[i] /= pmSum;
+            state.pmWeightBuf = [device newBufferWithBytes:pmW
+                                                    length:pmKernelSize * sizeof(float)
+                                                   options:MTLResourceStorageModeShared];
+            state.pmWeightRadius = pmR;
+        }
+        id<MTLBuffer> pmWeightBuf = state.pmWeightBuf;
+
+        int pmIter = std::max(1, std::min(p_PrematteIterations, 5));
+        for (int pi = 0; pi < pmIter; pi++) {
+
+        // Step 1: Erode alpha (tempA → tempC, or use tempA directly if no erode)
+        id<MTLBuffer> erodeAlpha = tempA;
+        if (p_PrematteErode > 0) {
+            [enc setComputePipelineState:pipes.erodeAlpha];
+            [enc setBuffer:tempA offset:0 atIndex:0];
+            [enc setBuffer:tempC offset:0 atIndex:1];
+            [enc setBytes:&p_Width         length:sizeof(int) atIndex:10];
+            [enc setBytes:&p_Height        length:sizeof(int) atIndex:11];
+            [enc setBytes:&p_PrematteErode length:sizeof(int) atIndex:12];
+            dispatch2D(enc, pipes.erodeAlpha, p_Width, p_Height);
+            erodeAlpha = tempC;
+        }
+
+        // Step 2: Estimate clean plate (source + eroded alpha → 3 channel buffers)
+        [enc setComputePipelineState:pipes.cleanPlateEstimate];
+        [enc setBuffer:srcBuf    offset:0 atIndex:0];
+        [enc setBuffer:erodeAlpha offset:0 atIndex:1];
+        [enc setBuffer:tempD     offset:0 atIndex:2];   // cleanR
+        [enc setBuffer:tempE     offset:0 atIndex:3];   // cleanG
+        [enc setBuffer:tempF     offset:0 atIndex:4];   // cleanB
+        [enc setBytes:&p_Width  length:sizeof(int)   atIndex:10];
+        [enc setBytes:&p_Height length:sizeof(int)   atIndex:11];
+        [enc setBytes:&p_PickR  length:sizeof(float) atIndex:12];
+        [enc setBytes:&p_PickG  length:sizeof(float) atIndex:13];
+        [enc setBytes:&p_PickB  length:sizeof(float) atIndex:14];
+        dispatch2D(enc, pipes.cleanPlateEstimate, p_Width, p_Height);
+
+        // Step 3: Blur each channel of the clean plate
+        // Use tempA as blur scratch (it's consumed; will be overwritten by re-key)
+        gaussianBlur(enc, pipes, tempD, tempA, pmWeightBuf, p_Width, p_Height, pmR);
+        gaussianBlur(enc, pipes, tempE, tempA, pmWeightBuf, p_Width, p_Height, pmR);
+        gaussianBlur(enc, pipes, tempF, tempA, pmWeightBuf, p_Width, p_Height, pmR);
+
+        // Step 4: Pack into RGBA clean plate buffer
+        [enc setComputePipelineState:pipes.packRGBA];
+        [enc setBuffer:tempD              offset:0 atIndex:0];   // R
+        [enc setBuffer:tempE              offset:0 atIndex:1];   // G
+        [enc setBuffer:tempF              offset:0 atIndex:2];   // B
+        [enc setBuffer:state.cleanPlateBuf offset:0 atIndex:3];  // RGBA out
+        [enc setBytes:&p_Width  length:sizeof(int) atIndex:10];
+        [enc setBytes:&p_Height length:sizeof(int) atIndex:11];
+        dispatch2D(enc, pipes.packRGBA, p_Width, p_Height);
+
+        // Step 5: Re-run core keyer with clean plate as screen
+        int useScreenOn = 1;
+        [enc setComputePipelineState:pipes.coreKeyer];
+        [enc setBuffer:srcBuf              offset:0 atIndex:0];
+        [enc setBuffer:state.cleanPlateBuf offset:0 atIndex:1];   // clean plate as screen
+        [enc setBuffer:dstBuf              offset:0 atIndex:2];
+        [enc setBuffer:tempA               offset:0 atIndex:3];   // fresh raw alpha
+        [enc setBuffer:tempB               offset:0 atIndex:4];   // fresh guide
+        [enc setBytes:&p_Width           length:sizeof(int)   atIndex:10];
+        [enc setBytes:&p_Height          length:sizeof(int)   atIndex:11];
+        [enc setBytes:&p_ScreenColor     length:sizeof(int)   atIndex:12];
+        [enc setBytes:&useScreenOn       length:sizeof(int)   atIndex:13];
+        [enc setBytes:&p_PickR           length:sizeof(float) atIndex:14];
+        [enc setBytes:&p_PickG           length:sizeof(float) atIndex:15];
+        [enc setBytes:&p_PickB           length:sizeof(float) atIndex:16];
+        [enc setBytes:&p_Bias            length:sizeof(float) atIndex:17];
+        [enc setBytes:&p_Limit           length:sizeof(float) atIndex:18];
+        [enc setBytes:&p_RespillR        length:sizeof(float) atIndex:19];
+        [enc setBytes:&p_RespillG        length:sizeof(float) atIndex:20];
+        [enc setBytes:&p_RespillB        length:sizeof(float) atIndex:21];
+        [enc setBytes:&p_NearGreyExtract length:sizeof(int)   atIndex:22];
+        [enc setBytes:&p_NearGreyAmount  length:sizeof(float) atIndex:23];
+        [enc setBytes:&p_NearGreySoftness length:sizeof(float) atIndex:28];
+        [enc setBytes:&p_BlackClip       length:sizeof(float) atIndex:24];
+        [enc setBytes:&p_WhiteClip       length:sizeof(float) atIndex:25];
+        [enc setBytes:&p_EdgeProtect     length:sizeof(float) atIndex:26];
+        [enc setBytes:&p_MatteGamma      length:sizeof(float) atIndex:27];
+        dispatch2D(enc, pipes.coreKeyer, p_Width, p_Height);
+
+        } // prematte iteration loop
+    }
+
+    // After prematte (or core if prematte off): dstBuf=RGBA, tempA=alpha, tempB=guide
+
+    bool viewDone = false;
+
+    // ── Diagnostic: Clean Plate ──────────────────────────────────────────
+    if (p_ViewMode == 2) {
+        if (doPrematte && state.cleanPlateBuf) {
+            int dMode = 1;  // RGBA copy
+            [enc setComputePipelineState:pipes.diagnosticOutput];
+            [enc setBuffer:state.cleanPlateBuf offset:0 atIndex:0];
+            [enc setBuffer:state.cleanPlateBuf offset:0 atIndex:1];
+            [enc setBuffer:dstBuf              offset:0 atIndex:2];
+            [enc setBytes:&p_Width  length:sizeof(int) atIndex:10];
+            [enc setBytes:&p_Height length:sizeof(int) atIndex:11];
+            [enc setBytes:&dMode    length:sizeof(int) atIndex:12];
+            dispatch2D(enc, pipes.diagnosticOutput, p_Width, p_Height);
+        } else {
+            // Prematte off: show the screen input as fallback
+            int dMode = 1;
+            [enc setComputePipelineState:pipes.diagnosticOutput];
+            [enc setBuffer:scrBuf offset:0 atIndex:0];
+            [enc setBuffer:scrBuf offset:0 atIndex:1];
+            [enc setBuffer:dstBuf offset:0 atIndex:2];
+            [enc setBytes:&p_Width  length:sizeof(int) atIndex:10];
+            [enc setBytes:&p_Height length:sizeof(int) atIndex:11];
+            [enc setBytes:&dMode    length:sizeof(int) atIndex:12];
+            dispatch2D(enc, pipes.diagnosticOutput, p_Width, p_Height);
+        }
+        viewDone = true;
+    }
+
+    // ══════════════════════════════════════════════════════════════════════
+    //  EXTERNAL MATTES — Garbage and Occlusion
+    //  Applied after core keyer (+ prematte) but before guided filter,
+    //  so the GF refines the combined matte with external constraints.
+    // ══════════════════════════════════════════════════════════════════════
+    if (p_GarbageMatte && !viewDone) {
+        id<MTLBuffer> garbageBuf = reinterpret_cast<id<MTLBuffer>>(const_cast<float*>(p_GarbageMatte));
+        int mode = 0;
+        [enc setComputePipelineState:pipes.applyMatte];
+        [enc setBuffer:tempA      offset:0 atIndex:0];
+        [enc setBuffer:dstBuf     offset:0 atIndex:1];
+        [enc setBuffer:garbageBuf offset:0 atIndex:2];
+        [enc setBytes:&p_Width  length:sizeof(int) atIndex:10];
+        [enc setBytes:&p_Height length:sizeof(int) atIndex:11];
+        [enc setBytes:&mode     length:sizeof(int) atIndex:12];
+        dispatch2D(enc, pipes.applyMatte, p_Width, p_Height);
+    }
+
+    if (p_OcclusionMatte && !viewDone) {
+        id<MTLBuffer> occlusionBuf = reinterpret_cast<id<MTLBuffer>>(const_cast<float*>(p_OcclusionMatte));
+        int mode = 1;
+        [enc setComputePipelineState:pipes.applyMatte];
+        [enc setBuffer:tempA        offset:0 atIndex:0];
+        [enc setBuffer:dstBuf       offset:0 atIndex:1];
+        [enc setBuffer:occlusionBuf offset:0 atIndex:2];
+        [enc setBytes:&p_Width  length:sizeof(int) atIndex:10];
+        [enc setBytes:&p_Height length:sizeof(int) atIndex:11];
+        [enc setBytes:&mode     length:sizeof(int) atIndex:12];
+        dispatch2D(enc, pipes.applyMatte, p_Width, p_Height);
+    }
+
+    // ── Diagnostic: Raw Matte ────────────────────────────────────────────
+    if (p_ViewMode == 1 && !viewDone) {
+        int dMode = 0;  // 1ch alpha → greyscale
+        [enc setComputePipelineState:pipes.diagnosticOutput];
+        [enc setBuffer:tempA  offset:0 atIndex:0];
+        [enc setBuffer:tempA  offset:0 atIndex:1];
+        [enc setBuffer:dstBuf offset:0 atIndex:2];
+        [enc setBytes:&p_Width  length:sizeof(int) atIndex:10];
+        [enc setBytes:&p_Height length:sizeof(int) atIndex:11];
+        [enc setBytes:&dMode    length:sizeof(int) atIndex:12];
+        dispatch2D(enc, pipes.diagnosticOutput, p_Width, p_Height);
+        viewDone = true;
+    }
+
+    if (!viewDone && doGF && !rgbGF) {
         // ══════════════════════════════════════════════════════════════════
         //  GUIDED FILTER (iterative refinement)
         // ══════════════════════════════════════════════════════════════════
@@ -934,7 +1911,133 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height,
             }
         } // iteration loop
 
-    } else if (p_Premultiply) {
+    } else if (!viewDone && rgbGF) {
+        // ══════════════════════════════════════════════════════════════════
+        //  RGB COLOR-AWARE GUIDED FILTER
+        //  Uses full 3-channel RGB guide with 3×3 covariance matrix.
+        //  Dramatically better color-edge awareness than scalar luminance.
+        //  Optimized: 4-channel vectorized blur reduces dispatches 3.4×.
+        // ══════════════════════════════════════════════════════════════════
+        int r = p_GuidedRadius;
+        int numIter = std::max(1, std::min(p_RefineIterations, 5));
+
+        // Save raw alpha to t[17] for final mix
+        [enc setComputePipelineState:pipes.copyBuffer];
+        [enc setBuffer:t[0]  offset:0 atIndex:3];    // raw alpha from core keyer
+        [enc setBuffer:t[17] offset:0 atIndex:4];
+        [enc setBytes:&p_Width  length:sizeof(int) atIndex:10];
+        [enc setBytes:&p_Height length:sizeof(int) atIndex:11];
+        dispatch2D(enc, pipes.copyBuffer, p_Width, p_Height);
+
+        for (int iter = 0; iter < numIter; iter++) {
+            bool isLast = (iter == numIter - 1);
+
+            // ── Compute products: source RGB + current alpha → 13 channels ──
+            [enc setComputePipelineState:pipes.rgbComputeProducts];
+            [enc setBuffer:srcBuf offset:0 atIndex:0];   // source RGBA (guide RGB)
+            [enc setBuffer:dstBuf offset:0 atIndex:1];   // output (alpha in .w)
+            [enc setBuffer:t[0]   offset:0 atIndex:2];   // → mean_Ir
+            [enc setBuffer:t[1]   offset:0 atIndex:3];   // → mean_Ig
+            [enc setBuffer:t[2]   offset:0 atIndex:4];   // → mean_Ib
+            [enc setBuffer:t[3]   offset:0 atIndex:5];   // → mean_p
+            [enc setBuffer:t[4]   offset:0 atIndex:6];   // → IrIr
+            [enc setBuffer:t[5]   offset:0 atIndex:7];   // → IrIg
+            [enc setBuffer:t[6]   offset:0 atIndex:8];   // → IrIb
+            [enc setBuffer:t[7]   offset:0 atIndex:9];   // → IgIg
+            [enc setBuffer:t[8]   offset:0 atIndex:10];  // → IgIb
+            [enc setBuffer:t[9]   offset:0 atIndex:11];  // → IbIb
+            [enc setBuffer:t[10]  offset:0 atIndex:12];  // → Irp
+            [enc setBuffer:t[11]  offset:0 atIndex:13];  // → Igp
+            [enc setBuffer:t[12]  offset:0 atIndex:14];  // → Ibp
+            [enc setBytes:&p_Width  length:sizeof(int) atIndex:20];
+            [enc setBytes:&p_Height length:sizeof(int) atIndex:21];
+            dispatch2D(enc, pipes.rgbComputeProducts, p_Width, p_Height);
+
+            // ── Blur 13 channels: 3 groups of 4 + 1 single ──
+            // (t[13..16] are dedicated scratch for blur4)
+            gaussianBlur4(enc, pipes,
+                          t[0], t[1], t[2], t[3],
+                          t[13], t[14], t[15], t[16],
+                          weightBuf, p_Width, p_Height, r);
+            gaussianBlur4(enc, pipes,
+                          t[4], t[5], t[6], t[7],
+                          t[13], t[14], t[15], t[16],
+                          weightBuf, p_Width, p_Height, r);
+            gaussianBlur4(enc, pipes,
+                          t[8], t[9], t[10], t[11],
+                          t[13], t[14], t[15], t[16],
+                          weightBuf, p_Width, p_Height, r);
+            gaussianBlur(enc, pipes, t[12], t[13], weightBuf, p_Width, p_Height, r);
+
+            // ── Compute 3×3 coefficients: ar, ag, ab, b ──
+            // Writes: t[4]=ar, t[7]=ag, t[9]=ab, t[0]=b
+            // (b goes to t[0] since means are consumed; t[13..16] stay as scratch)
+            [enc setComputePipelineState:pipes.rgbGuidedCoeff];
+            [enc setBuffer:t[0]   offset:0 atIndex:0];   // mean_Ir (consumed → becomes b output)
+            [enc setBuffer:t[1]   offset:0 atIndex:1];   // mean_Ig
+            [enc setBuffer:t[2]   offset:0 atIndex:2];   // mean_Ib
+            [enc setBuffer:t[3]   offset:0 atIndex:3];   // mean_p
+            [enc setBuffer:t[4]   offset:0 atIndex:4];   // IrIr → ar
+            [enc setBuffer:t[5]   offset:0 atIndex:5];   // IrIg
+            [enc setBuffer:t[6]   offset:0 atIndex:6];   // IrIb
+            [enc setBuffer:t[7]   offset:0 atIndex:7];   // IgIg → ag
+            [enc setBuffer:t[8]   offset:0 atIndex:8];   // IgIb
+            [enc setBuffer:t[9]   offset:0 atIndex:9];   // IbIb → ab
+            [enc setBuffer:t[10]  offset:0 atIndex:10];  // Irp
+            [enc setBuffer:t[11]  offset:0 atIndex:11];  // Igp
+            [enc setBuffer:t[12]  offset:0 atIndex:12];  // Ibp
+            [enc setBuffer:t[0]   offset:0 atIndex:13];  // → b (overwrites mean_Ir)
+            [enc setBytes:&p_Width          length:sizeof(int)   atIndex:20];
+            [enc setBytes:&p_Height         length:sizeof(int)   atIndex:21];
+            [enc setBytes:&p_GuidedEpsilon  length:sizeof(float) atIndex:22];
+            dispatch2D(enc, pipes.rgbGuidedCoeff, p_Width, p_Height);
+
+            // After coeff: t[4]=ar, t[7]=ag, t[9]=ab, t[0]=b
+            // Blur 4 coefficients in a single vectorized pass
+            gaussianBlur4(enc, pipes,
+                          t[4], t[7], t[9], t[0],
+                          t[13], t[14], t[15], t[16],
+                          weightBuf, p_Width, p_Height, r);
+
+            if (isLast) {
+                // Final: apply with mix against saved raw alpha + premultiply
+                [enc setComputePipelineState:pipes.rgbGuidedApply];
+                [enc setBuffer:srcBuf offset:0 atIndex:0];   // source RGB
+                [enc setBuffer:dstBuf offset:0 atIndex:1];   // output RGBA
+                [enc setBuffer:t[4]   offset:0 atIndex:2];   // mean_ar
+                [enc setBuffer:t[7]   offset:0 atIndex:3];   // mean_ag
+                [enc setBuffer:t[9]   offset:0 atIndex:4];   // mean_ab
+                [enc setBuffer:t[0]   offset:0 atIndex:5];   // mean_b
+                [enc setBuffer:t[17]  offset:0 atIndex:6];   // saved raw alpha
+                [enc setBytes:&p_Width        length:sizeof(int)   atIndex:20];
+                [enc setBytes:&p_Height       length:sizeof(int)   atIndex:21];
+                [enc setBytes:&p_Premultiply  length:sizeof(int)   atIndex:22];
+                [enc setBytes:&p_GuidedMix    length:sizeof(float) atIndex:23];
+                dispatch2D(enc, pipes.rgbGuidedApply, p_Width, p_Height);
+            } else {
+                // Intermediate: eval refined alpha → t[1], then write to dstBuf alpha
+                [enc setComputePipelineState:pipes.rgbGuidedEval];
+                [enc setBuffer:srcBuf offset:0 atIndex:0];   // source RGB
+                [enc setBuffer:t[4]   offset:0 atIndex:2];   // mean_ar
+                [enc setBuffer:t[7]   offset:0 atIndex:3];   // mean_ag
+                [enc setBuffer:t[9]   offset:0 atIndex:4];   // mean_ab
+                [enc setBuffer:t[0]   offset:0 atIndex:5];   // mean_b
+                [enc setBuffer:t[1]   offset:0 atIndex:6];   // output: refined alpha
+                [enc setBytes:&p_Width  length:sizeof(int) atIndex:20];
+                [enc setBytes:&p_Height length:sizeof(int) atIndex:21];
+                dispatch2D(enc, pipes.rgbGuidedEval, p_Width, p_Height);
+
+                // Write refined alpha back to dstBuf's .w channel for next iteration
+                [enc setComputePipelineState:pipes.writeAlpha];
+                [enc setBuffer:t[1]   offset:0 atIndex:0];   // 1ch refined alpha
+                [enc setBuffer:dstBuf offset:0 atIndex:1];   // RGBA buffer
+                [enc setBytes:&p_Width  length:sizeof(int) atIndex:10];
+                [enc setBytes:&p_Height length:sizeof(int) atIndex:11];
+                dispatch2D(enc, pipes.writeAlpha, p_Width, p_Height);
+            }
+        } // RGB iteration loop
+
+    } else if (!viewDone && p_Premultiply) {
         // No GF — just premultiply
         [enc setComputePipelineState:pipes.premultiply];
         [enc setBuffer:dstBuf offset:0 atIndex:2];
@@ -943,12 +2046,26 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height,
         dispatch2D(enc, pipes.premultiply, p_Width, p_Height);
     }
 
+    // ── Diagnostic: Refined Matte ────────────────────────────────────────
+    if (p_ViewMode == 3 && !viewDone) {
+        int dMode = 2;  // extract alpha from RGBA → greyscale
+        [enc setComputePipelineState:pipes.diagnosticOutput];
+        [enc setBuffer:dstBuf offset:0 atIndex:0];
+        [enc setBuffer:dstBuf offset:0 atIndex:1];
+        [enc setBuffer:dstBuf offset:0 atIndex:2];
+        [enc setBytes:&p_Width  length:sizeof(int) atIndex:10];
+        [enc setBytes:&p_Height length:sizeof(int) atIndex:11];
+        [enc setBytes:&dMode    length:sizeof(int) atIndex:12];
+        dispatch2D(enc, pipes.diagnosticOutput, p_Width, p_Height);
+        viewDone = true;
+    }
+
     // ══════════════════════════════════════════════════════════════════════
     //  PASS 2.5: Edge Color Correction
     //  Re-estimates FG color at semi-transparent edges using the matting
     //  equation: fg = (src - screen*(1-alpha)) / alpha
     // ══════════════════════════════════════════════════════════════════════
-    if (p_EdgeColorCorrect > 0.0f) {
+    if (!viewDone && p_EdgeColorCorrect > 0.0f) {
         [enc setComputePipelineState:pipes.edgeColorCorrect];
         [enc setBuffer:srcBuf offset:0 atIndex:0];
         [enc setBuffer:scrBuf offset:0 atIndex:1];
@@ -964,30 +2081,43 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height,
         dispatch2D(enc, pipes.edgeColorCorrect, p_Width, p_Height);
     }
 
+    // ── Diagnostic: Despilled Source ─────────────────────────────────────
+    if (p_ViewMode == 4 && !viewDone) {
+        viewDone = true;  // skip BG wrap — output despilled FG as-is
+    }
+
     // ══════════════════════════════════════════════════════════════════════
-    //  PASS 3: Background Wrap
-    //  Blurs the BG and bleeds it into FG edges weighted by (1-alpha)
+    //  PASS 3: Background Stage — BG Wrap + Additive Key
+    //  Blurs the BG and bleeds it into FG edges weighted by (1-alpha).
+    //  Additive key: recovers fine detail the alpha missed by
+    //  superimposing source-minus-screen onto the composite.
     // ══════════════════════════════════════════════════════════════════════
-    if (doBgWrap) {
+
+    // BG extraction/blur needed for: bg wrap OR additive key multiplication mode
+    if (needBgBlur && !viewDone) {
         id<MTLBuffer> bgBuf = reinterpret_cast<id<MTLBuffer>>(const_cast<float*>(p_Background));
 
-        // Pre-compute Gaussian weights for BG blur
+        // Cached Gaussian weights for BG blur
         int bwR = std::max(1, p_BgWrapBlur);
-        int bwKernelSize = 2 * bwR + 1;
-        float bwSigma = fmaxf(bwR / 3.0f, 0.5f);
-        float bwInv2s2 = 1.0f / (2.0f * bwSigma * bwSigma);
-        float* bwW = (float*)alloca(bwKernelSize * sizeof(float));
-        float bwSum = 0.0f;
-        for (int i = -bwR; i <= bwR; i++) {
-            float wt = expf(-(float)(i * i) * bwInv2s2);
-            bwW[i + bwR] = wt;
-            bwSum += wt;
+        if (state.bwWeightRadius != bwR) {
+            if (state.bwWeightBuf) [state.bwWeightBuf release];
+            int bwKernelSize = 2 * bwR + 1;
+            float bwSigma = fmaxf(bwR / 3.0f, 0.5f);
+            float bwInv2s2 = 1.0f / (2.0f * bwSigma * bwSigma);
+            float* bwW = (float*)alloca(bwKernelSize * sizeof(float));
+            float bwSum = 0.0f;
+            for (int i = -bwR; i <= bwR; i++) {
+                float wt = expf(-(float)(i * i) * bwInv2s2);
+                bwW[i + bwR] = wt;
+                bwSum += wt;
+            }
+            for (int i = 0; i < bwKernelSize; i++) bwW[i] /= bwSum;
+            state.bwWeightBuf = [device newBufferWithBytes:bwW
+                                                    length:bwKernelSize * sizeof(float)
+                                                   options:MTLResourceStorageModeShared];
+            state.bwWeightRadius = bwR;
         }
-        for (int i = 0; i < bwKernelSize; i++) bwW[i] /= bwSum;
-
-        id<MTLBuffer> bwWeightBuf = [device newBufferWithBytes:bwW
-                                                        length:bwKernelSize * sizeof(float)
-                                                       options:MTLResourceStorageModeShared];
+        id<MTLBuffer> bwWeightBuf = state.bwWeightBuf;
 
         // Extract R, G, B from BG into tempA, tempB, tempC
         for (int ch = 0; ch < 3; ch++) {
@@ -1006,26 +2136,81 @@ void RunMetalKernel(void* p_CmdQ, int p_Width, int p_Height,
         gaussianBlur(enc, pipes, tempB, tempF, bwWeightBuf, p_Width, p_Height, bwR);
         gaussianBlur(enc, pipes, tempC, tempF, bwWeightBuf, p_Width, p_Height, bwR);
 
-        // Apply wrap
-        [enc setComputePipelineState:pipes.bgWrap];
+        if (p_ViewMode == 5) {
+            // ── Diagnostic: Blurred Background ──────────────────────────
+            [enc setComputePipelineState:pipes.packRGBA];
+            [enc setBuffer:tempA  offset:0 atIndex:0];
+            [enc setBuffer:tempB  offset:0 atIndex:1];
+            [enc setBuffer:tempC  offset:0 atIndex:2];
+            [enc setBuffer:dstBuf offset:0 atIndex:3];
+            [enc setBytes:&p_Width  length:sizeof(int) atIndex:10];
+            [enc setBytes:&p_Height length:sizeof(int) atIndex:11];
+            dispatch2D(enc, pipes.packRGBA, p_Width, p_Height);
+        } else {
+            // Apply BG wrap (light wrap)
+            if (doBgWrap) {
+                [enc setComputePipelineState:pipes.bgWrap];
+                [enc setBuffer:dstBuf offset:0 atIndex:2];
+                [enc setBuffer:tempA  offset:0 atIndex:3];
+                [enc setBuffer:tempB  offset:0 atIndex:4];
+                [enc setBuffer:tempC  offset:0 atIndex:5];
+                [enc setBytes:&p_Width       length:sizeof(int)   atIndex:10];
+                [enc setBytes:&p_Height      length:sizeof(int)   atIndex:11];
+                [enc setBytes:&p_BgWrapAmount length:sizeof(float) atIndex:12];
+                dispatch2D(enc, pipes.bgWrap, p_Width, p_Height);
+            }
+
+            // Apply additive key (multiplication mode uses blurred BG in tempA/B/C)
+            if (doAdditiveKey) {
+                [enc setComputePipelineState:pipes.additiveKey];
+                [enc setBuffer:srcBuf offset:0 atIndex:0];
+                [enc setBuffer:scrBuf offset:0 atIndex:1];
+                [enc setBuffer:dstBuf offset:0 atIndex:2];
+                [enc setBuffer:tempA  offset:0 atIndex:3];
+                [enc setBuffer:tempB  offset:0 atIndex:4];
+                [enc setBuffer:tempC  offset:0 atIndex:5];
+                [enc setBytes:&p_Width              length:sizeof(int)   atIndex:10];
+                [enc setBytes:&p_Height             length:sizeof(int)   atIndex:11];
+                [enc setBytes:&p_AdditiveKeyMode    length:sizeof(int)   atIndex:12];
+                [enc setBytes:&p_UseScreenInput     length:sizeof(int)   atIndex:13];
+                [enc setBytes:&p_PickR              length:sizeof(float) atIndex:14];
+                [enc setBytes:&p_PickG              length:sizeof(float) atIndex:15];
+                [enc setBytes:&p_PickB              length:sizeof(float) atIndex:16];
+                [enc setBytes:&p_AdditiveKeySat     length:sizeof(float) atIndex:17];
+                [enc setBytes:&p_AdditiveKeyAmount  length:sizeof(float) atIndex:18];
+                [enc setBytes:&p_AdditiveKeyBlackClamp length:sizeof(int) atIndex:19];
+                dispatch2D(enc, pipes.additiveKey, p_Width, p_Height);
+            }
+        }
+    }
+
+    // Additive key (addition mode) — works without BG input
+    if (doAdditiveKey && p_AdditiveKeyMode == 0 && !needBgBlur && !viewDone) {
+        [enc setComputePipelineState:pipes.additiveKey];
+        [enc setBuffer:srcBuf offset:0 atIndex:0];
+        [enc setBuffer:scrBuf offset:0 atIndex:1];
         [enc setBuffer:dstBuf offset:0 atIndex:2];
-        [enc setBuffer:tempA  offset:0 atIndex:3];
+        [enc setBuffer:tempA  offset:0 atIndex:3];  // unused in addition mode
         [enc setBuffer:tempB  offset:0 atIndex:4];
         [enc setBuffer:tempC  offset:0 atIndex:5];
-        [enc setBytes:&p_Width       length:sizeof(int)   atIndex:10];
-        [enc setBytes:&p_Height      length:sizeof(int)   atIndex:11];
-        [enc setBytes:&p_BgWrapAmount length:sizeof(float) atIndex:12];
-        dispatch2D(enc, pipes.bgWrap, p_Width, p_Height);
-
-        [bwWeightBuf release];
+        [enc setBytes:&p_Width              length:sizeof(int)   atIndex:10];
+        [enc setBytes:&p_Height             length:sizeof(int)   atIndex:11];
+        [enc setBytes:&p_AdditiveKeyMode    length:sizeof(int)   atIndex:12];
+        [enc setBytes:&p_UseScreenInput     length:sizeof(int)   atIndex:13];
+        [enc setBytes:&p_PickR              length:sizeof(float) atIndex:14];
+        [enc setBytes:&p_PickG              length:sizeof(float) atIndex:15];
+        [enc setBytes:&p_PickB              length:sizeof(float) atIndex:16];
+        [enc setBytes:&p_AdditiveKeySat     length:sizeof(float) atIndex:17];
+        [enc setBytes:&p_AdditiveKeyAmount  length:sizeof(float) atIndex:18];
+        [enc setBytes:&p_AdditiveKeyBlackClamp length:sizeof(int) atIndex:19];
+        dispatch2D(enc, pipes.additiveKey, p_Width, p_Height);
     }
 
     [enc endEncoding];
     [cmdBuf commit];
     [cmdBuf waitUntilCompleted];
 
-    // ── Cleanup (temp buffers are cached — only release per-frame objects) ──
+    // ── Cleanup (temp + weight buffers are cached — only release per-frame objects) ──
     if (createdDummy) [scrBuf release];
-    if (weightBuf) [weightBuf release];
   } // @autoreleasepool
 }