microsoft · Copilot · Nov 19, 2025 · Nov 19, 2025 · Nov 19, 2025 · Nov 20, 2025
diff --git a/README.md b/README.md
@@ -162,6 +162,88 @@ catch(OverflowException)
 ```
 </details>
 
+<details>
+<summary>Custom Extractors for Additional File Types</summary>
+<br/>
+You can extend RecursiveExtractor with custom extractors to support additional archive or file formats not natively supported. This is useful for formats like MSI, MSP, or other proprietary archive formats.
+
+To create a custom extractor, implement the `ICustomAsyncExtractor` interface and pass an instance of your implementation to the `Extractor` constructor:
+
+```csharp
+using Microsoft.CST.RecursiveExtractor;
+using Microsoft.CST.RecursiveExtractor.Extractors;
+using System.IO;
+using System.Collections.Generic;
+using System.Linq;
+
+// Example: Custom extractor for a hypothetical archive format with magic bytes "MYARC"
+public class MyCustomExtractor : ICustomAsyncExtractor
+{
+    private readonly Extractor context;
+    private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("MYARC");
+
+    public MyCustomExtractor(Extractor ctx)
+    {
+        context = ctx;
+    }
+
+    // Returns true only when the stream begins with MAGIC_BYTES; the stream position is left unchanged
+    public bool CanExtract(Stream stream)
+    {
+        if (stream == null || !stream.CanRead || !stream.CanSeek || stream.Length < MAGIC_BYTES.Length)
+        {
+            return false;
+        }
+
+        var initialPosition = stream.Position;
+        try
+        {
+            stream.Position = 0;
+            var buffer = new byte[MAGIC_BYTES.Length];
+            var bytesRead = stream.Read(buffer, 0, MAGIC_BYTES.Length);
+            return bytesRead == MAGIC_BYTES.Length && buffer.SequenceEqual(MAGIC_BYTES);
+        }
+        finally
+        {
+            // Always restore the original position
+            stream.Position = initialPosition;
+        }
+    }
+
+    // Implement extraction logic
+    public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+    {
+        // Your extraction logic here
+        // For example, parse the archive and yield FileEntry objects for each contained file
+        yield break;
+    }
+
+    public async IAsyncEnumerable<FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+    {
+        // Your async extraction logic here; this placeholder await avoids compiler warning CS1998
+        await System.Threading.Tasks.Task.CompletedTask;
+        yield break;
+    }
+}
+
+// Custom extractors are handed to the Extractor through its constructor
+var myArcExtractor = new MyCustomExtractor(null);
+var recursiveExtractor = new Extractor(new[] { myArcExtractor });
+
+// From here on, files that satisfy your CanExtract check are handled by the custom extractor
+var extractedEntries = recursiveExtractor.Extract("path/to/custom/archive.myarc");
+```
+
+Key points:
+- The `CanExtract` method should check the stream's binary signature (like MiniMagic does) and return true if this extractor can handle the format
+- Always preserve the stream's original position in `CanExtract`
+- Custom extractors are provided via the constructor as an `IEnumerable<ICustomAsyncExtractor>`
+- Custom extractors are consulted only when the detected file type is UNKNOWN, that is, when no built-in extractor recognizes the file
+- Multiple custom extractors can be registered; they are checked in the order provided
+- Custom extractors are invoked for both synchronous and asynchronous extraction paths
+
+</details>
+
 ## Exceptions
 RecursiveExtractor protects against [ZipSlip](https://snyk.io/research/zip-slip-vulnerability), [Quines, and Zip Bombs](https://en.wikipedia.org/wiki/Zip_bomb).
 Calls to Extract will throw an `OverflowException` when a Quine or Zip bomb is detected and a `TimeOutException` if `EnableTiming` is set and the specified time period has elapsed before completion.

diff --git a/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs
@@ -0,0 +1,274 @@
+using Microsoft.CST.RecursiveExtractor;
+using Microsoft.CST.RecursiveExtractor.Extractors;
+using Microsoft.VisualStudio.TestTools.UnitTesting;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Threading.Tasks;
+
+namespace RecursiveExtractor.Tests.ExtractorTests;
+
+[TestClass]
+public class CustomExtractorTests
+{
+    /// <summary>
+    /// Custom extractor used by these tests. It accepts readable, seekable streams whose first
+    /// bytes are the ASCII text "CUSTOM1" and always yields one synthetic entry.
+    /// </summary>
+    private class TestCustomExtractor : ICustomAsyncExtractor
+    {
+        private static readonly byte[] Signature = System.Text.Encoding.ASCII.GetBytes("CUSTOM1");
+        private readonly Extractor context;
+
+        public TestCustomExtractor(Extractor ctx)
+        {
+            context = ctx;
+        }
+
+        /// <summary>Checks the start of the stream for the signature, then restores the stream position.</summary>
+        public bool CanExtract(Stream stream)
+        {
+            if (stream is null || !(stream.CanRead && stream.CanSeek) || stream.Length < Signature.Length)
+            {
+                return false;
+            }
+
+            var savedPosition = stream.Position;
+            try
+            {
+                stream.Position = 0;
+                var header = new byte[Signature.Length];
+                var count = stream.Read(header, 0, header.Length);
+
+                // A short read can never equal the full signature.
+                return count == header.Length && header.SequenceEqual(Signature);
+            }
+            finally
+            {
+                stream.Position = savedPosition;
+            }
+        }
+
+        /// <summary>Yields one in-memory entry so tests can tell this extractor handled the file.</summary>
+        public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+        {
+            var payload = System.Text.Encoding.UTF8.GetBytes("Extracted by TestCustomExtractor");
+            yield return new FileEntry("extracted_from_custom.txt", new MemoryStream(payload), fileEntry);
+        }
+
+        /// <summary>Async counterpart of Extract; yields the same single synthetic entry.</summary>
+        public async IAsyncEnumerable<FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+        {
+            var payload = System.Text.Encoding.UTF8.GetBytes("Extracted by TestCustomExtractor");
+            yield return new FileEntry("extracted_from_custom.txt", new MemoryStream(payload), fileEntry);
+
+            // Awaits an already-completed task; presumably here so the async iterator contains an await.
+            await Task.CompletedTask;
+        }
+    }
+
+    /// <summary>
+    /// Second custom extractor for these tests; it accepts streams that begin with the ASCII text "CUSTOM2".
+    /// </summary>
+    private class SecondTestCustomExtractor : ICustomAsyncExtractor
+    {
+        private static readonly byte[] Signature = System.Text.Encoding.ASCII.GetBytes("CUSTOM2");
+        private readonly Extractor context;
+
+        public SecondTestCustomExtractor(Extractor ctx)
+        {
+            context = ctx;
+        }
+
+        /// <summary>Checks the start of the stream for the signature, then restores the stream position.</summary>
+        public bool CanExtract(Stream stream)
+        {
+            if (stream is null || !(stream.CanRead && stream.CanSeek) || stream.Length < Signature.Length)
+            {
+                return false;
+            }
+
+            var savedPosition = stream.Position;
+            try
+            {
+                stream.Position = 0;
+                var header = new byte[Signature.Length];
+                var count = stream.Read(header, 0, header.Length);
+
+                return count == header.Length && header.SequenceEqual(Signature);
+            }
+            finally
+            {
+                stream.Position = savedPosition;
+            }
+        }
+
+        /// <summary>Yields one in-memory entry so tests can tell this extractor handled the file.</summary>
+        public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+        {
+            var payload = System.Text.Encoding.UTF8.GetBytes("Extracted by SecondTestCustomExtractor");
+            yield return new FileEntry("extracted_from_second_custom.txt", new MemoryStream(payload), fileEntry);
+        }
+
+        /// <summary>Async counterpart of Extract; yields the same single synthetic entry.</summary>
+        public async IAsyncEnumerable<FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+        {
+            var payload = System.Text.Encoding.UTF8.GetBytes("Extracted by SecondTestCustomExtractor");
+            yield return new FileEntry("extracted_from_second_custom.txt", new MemoryStream(payload), fileEntry);
+            // Awaits an already-completed task; presumably here so the async iterator contains an await.
+            await Task.CompletedTask;
+        }
+    }
+
+    /// <summary>One custom extractor passed to the constructor shows up in CustomExtractors.</summary>
+    [TestMethod]
+    public void Constructor_WithCustomExtractors_RegistersExtractors()
+    {
+        var sut = new Extractor(new[] { new TestCustomExtractor(null!) });
+        var registered = sut.CustomExtractors.Count;
+        Assert.AreEqual(1, registered);
+    }
+
+    /// <summary>Two distinct custom extractors passed to the constructor are both registered.</summary>
+    [TestMethod]
+    public void Constructor_WithMultipleCustomExtractors_RegistersAll()
+    {
+        ICustomAsyncExtractor[] supplied = { new TestCustomExtractor(null!), new SecondTestCustomExtractor(null!) };
+        var sut = new Extractor(supplied);
+
+        Assert.AreEqual(2, sut.CustomExtractors.Count);
+    }
+
+    /// <summary>A null element in the supplied collection is not counted among the registered extractors.</summary>
+    [TestMethod]
+    public void Constructor_WithNullInCollection_IgnoresNull()
+    {
+        ICustomAsyncExtractor[] supplied = { new TestCustomExtractor(null!), null! };
+        var sut = new Extractor(supplied);
+        Assert.AreEqual(1, sut.CustomExtractors.Count);
+    }
+
+    /// <summary>A null collection yields an Extractor with no custom extractors.</summary>
+    [TestMethod]
+    public void Constructor_WithNullCollection_CreatesEmptyExtractor()
+    {
+        IEnumerable<ICustomAsyncExtractor> noExtractors = null!;
+        Assert.AreEqual(0, new Extractor(noExtractors).CustomExtractors.Count);
+    }
+
+    /// <summary>Sync extraction of data starting with "CUSTOM1" yields the custom extractor's synthetic entry.</summary>
+    [TestMethod]
+    public void Extract_WithMatchingCustomExtractor_UsesCustomExtractor()
+    {
+        var sut = new Extractor(new[] { new TestCustomExtractor(null!) });
+        var input = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data");
+
+        var entries = sut.Extract("test.custom", input).ToList();
+
+        Assert.AreEqual(1, entries.Count);
+        var entry = entries[0];
+        Assert.AreEqual("extracted_from_custom.txt", entry.Name);
+
+        // Rewind before reading so the whole stream content is compared.
+        entry.Content.Position = 0;
+        using var reader = new StreamReader(entry.Content);
+        var text = reader.ReadToEnd();
+        Assert.AreEqual("Extracted by TestCustomExtractor", text);
+    }
+
+    /// <summary>Async extraction of data starting with "CUSTOM1" yields the custom extractor's synthetic entry.</summary>
+    [TestMethod]
+    public async Task ExtractAsync_WithMatchingCustomExtractor_UsesCustomExtractor()
+    {
+        var sut = new Extractor(new[] { new TestCustomExtractor(null!) });
+        var input = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data");
+
+        var entries = await sut.ExtractAsync("test.custom", input).ToListAsync();
+
+        Assert.AreEqual(1, entries.Count);
+        var entry = entries[0];
+        Assert.AreEqual("extracted_from_custom.txt", entry.Name);
+
+        // Rewind before reading so the whole stream content is compared.
+        entry.Content.Position = 0;
+        using var reader = new StreamReader(entry.Content);
+        var text = reader.ReadToEnd();
+        Assert.AreEqual("Extracted by TestCustomExtractor", text);
+    }
+
+    /// <summary>Data without the "CUSTOM1" signature comes back unchanged as the only entry.</summary>
+    [TestMethod]
+    public void Extract_WithoutMatchingCustomExtractor_ReturnsOriginalFile()
+    {
+        const string original = "NOTCUSTOM This is test data";
+        var sut = new Extractor(new[] { new TestCustomExtractor(null!) });
+
+        // The payload does not start with "CUSTOM1", so the custom extractor should not claim it.
+        var entries = sut.Extract("test.txt", System.Text.Encoding.ASCII.GetBytes(original)).ToList();
+
+        // The input is expected back as the only entry, under its own name...
+        Assert.AreEqual(1, entries.Count);
+        var entry = entries[0];
+        Assert.AreEqual("test.txt", entry.Name);
+
+        // ...and with its original content.
+        entry.Content.Position = 0;
+        using var reader = new StreamReader(entry.Content);
+        Assert.AreEqual(original, reader.ReadToEnd());
+    }
+
+    /// <summary>With both custom extractors registered, each input is handled by the one whose signature it carries.</summary>
+    [TestMethod]
+    public void Extract_MultipleCustomExtractors_UsesCorrectOne()
+    {
+        ICustomAsyncExtractor[] supplied = { new TestCustomExtractor(null!), new SecondTestCustomExtractor(null!) };
+        var sut = new Extractor(supplied);
+
+        // Runs one extraction and checks that exactly one entry with the expected name comes back.
+        void AssertSingleEntry(string fileName, string payload, string expectedName)
+        {
+            var entries = sut.Extract(fileName, System.Text.Encoding.ASCII.GetBytes(payload)).ToList();
+            Assert.AreEqual(1, entries.Count);
+            Assert.AreEqual(expectedName, entries[0].Name);
+        }
+
+        // First custom format
+        AssertSingleEntry("test1.custom", "CUSTOM1 data", "extracted_from_custom.txt");
+
+        // Second custom format
+        AssertSingleEntry("test2.custom", "CUSTOM2 data", "extracted_from_second_custom.txt");
+    }
+
+    /// <summary>Without custom extractors, data carrying a custom signature comes back as one entry named like the input.</summary>
+    [TestMethod]
+    public void Extract_NoCustomExtractors_ReturnsOriginalFile()
+    {
+        // Default construction: no custom extractors are supplied.
+        var sut = new Extractor();
+        var input = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data");
+
+        var entries = sut.Extract("test.custom", input).ToList();
+
+        Assert.AreEqual(1, entries.Count);
+        Assert.AreEqual("test.custom", entries[0].Name);
+    }
+
+    /// <summary>A real ZIP archive is extracted normally, without custom-extractor output, even when a custom extractor is registered.</summary>
+    [TestMethod]
+    public void Extract_CustomExtractorForKnownFormat_UsesBuiltInExtractor()
+    {
+        var extractor = new Extractor(new[] { new TestCustomExtractor(null!) });
+        var path = Path.Combine(Directory.GetCurrentDirectory(), "TestData", "TestDataArchives", "EmptyFile.txt.zip");
+        // Report a missing fixture as inconclusive instead of passing without asserting anything.
+        if (!File.Exists(path))
+        {
+            Assert.Inconclusive($"Test archive not found: {path}");
+        }
+
+        var results = extractor.Extract(path).ToList();
+        // Should extract the ZIP normally, and the custom extractor's synthetic entry must be absent
+        Assert.IsTrue(results.Any(r => r.Name.Contains("EmptyFile")));
+        Assert.IsFalse(results.Any(r => r.Name == "extracted_from_custom.txt"));
+    }
+}