diff --git a/README.md b/README.md
index fea318c..65cccf5 100644
--- a/README.md
+++ b/README.md
@@ -162,6 +162,88 @@ catch(OverflowException)
```
+
+## Custom Extractors for Additional File Types
+
+You can extend RecursiveExtractor with custom extractors to support additional archive or file formats not natively supported. This is useful for formats like MSI, MSP, or other proprietary archive formats.
+
+To create a custom extractor, implement the `ICustomAsyncExtractor` interface and register it with the extractor:
+
+```csharp
+using Microsoft.CST.RecursiveExtractor;
+using Microsoft.CST.RecursiveExtractor.Extractors;
+using System.IO;
+using System.Collections.Generic;
+using System.Linq;
+
+// Example: custom extractor for a hypothetical archive format whose files start with the ASCII bytes "MYARC"
+public class MyCustomExtractor : ICustomAsyncExtractor
+{
+    private readonly Extractor context;
+    private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("MYARC");
+
+    public MyCustomExtractor(Extractor ctx)
+    {
+        context = ctx;
+    }
+
+    // Returns true only when the stream is readable, seekable, and begins with MAGIC_BYTES
+    public bool CanExtract(Stream stream)
+    {
+        if (stream == null || !stream.CanRead || !stream.CanSeek || stream.Length < MAGIC_BYTES.Length)
+        {
+            return false;
+        }
+
+        var initialPosition = stream.Position;
+        try
+        {
+            stream.Position = 0;
+            var buffer = new byte[MAGIC_BYTES.Length];
+            var bytesRead = stream.Read(buffer, 0, MAGIC_BYTES.Length);
+
+            return bytesRead == MAGIC_BYTES.Length && buffer.SequenceEqual(MAGIC_BYTES);
+        }
+        finally
+        {
+            // Always restore the original position, even when reading throws
+            stream.Position = initialPosition;
+        }
+    }
+
+    // Synchronous extraction: yield one FileEntry per file contained in the archive
+    public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+    {
+        // Your extraction logic here
+        // For example, parse the archive and yield FileEntry objects for each contained file
+        yield break;
+    }
+
+    public async IAsyncEnumerable<FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+    {
+        // Your async extraction logic here (yield the same entries as Extract, as an async stream)
+        yield break;
+    }
+}
+
+// Register the custom extractor by passing it to the Extractor constructor
+var customExtractor = new MyCustomExtractor(null); // this example never reads its context, so null is acceptable here
+var extractor = new Extractor(new[] { customExtractor });
+
+// The extractor now delegates to your custom extractor for otherwise-unrecognized files that pass your CanExtract check
+var results = extractor.Extract("path/to/custom/archive.myarc");
+```
+
+Key points:
+- The `CanExtract` method should check the stream's binary signature (like MiniMagic does) and return true if this extractor can handle the format
+- Always preserve the stream's original position in `CanExtract`
+- Custom extractors are provided via the constructor as an `IEnumerable<ICustomAsyncExtractor>`; a null collection or null elements are ignored
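+- Custom extractors are only consulted when the detected file type is UNKNOWN or has no built-in extractor registered; files whose extension is listed in `RawExtensions` are returned as-is and never passed to a custom extractor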
+- Multiple custom extractors can be registered; they are checked in the order provided
+- Custom extractors are invoked for both synchronous and asynchronous extraction paths
+
+
+
## Exceptions
RecursiveExtractor protects against [ZipSlip](https://snyk.io/research/zip-slip-vulnerability), [Quines, and Zip Bombs](https://en.wikipedia.org/wiki/Zip_bomb).
Calls to Extract will throw an `OverflowException` when a Quine or Zip bomb is detected and a `TimeOutException` if `EnableTiming` is set and the specified time period has elapsed before completion.
diff --git a/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs
new file mode 100644
index 0000000..547ba87
--- /dev/null
+++ b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs
@@ -0,0 +1,274 @@
+using Microsoft.CST.RecursiveExtractor;
+using Microsoft.CST.RecursiveExtractor.Extractors;
+using Microsoft.VisualStudio.TestTools.UnitTesting;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Threading.Tasks;
+
+namespace RecursiveExtractor.Tests.ExtractorTests;
+
+[TestClass]
+public class CustomExtractorTests
+{
+    /// <summary>
+    /// A simple test custom extractor that extracts files with a specific magic number
+    /// For testing purposes, it recognizes files starting with "CUSTOM1"
+    /// </summary>
+    private class TestCustomExtractor : ICustomAsyncExtractor
+    {
+        private readonly Extractor context;
+        private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("CUSTOM1");
+
+        public TestCustomExtractor(Extractor ctx)
+        {
+            context = ctx;
+        }
+
+        public bool CanExtract(Stream stream)
+        {
+            if (stream == null || !stream.CanRead || !stream.CanSeek || stream.Length < MAGIC_BYTES.Length)
+            {
+                return false;
+            }
+
+            var initialPosition = stream.Position;
+            try
+            {
+                stream.Position = 0;
+                var buffer = new byte[MAGIC_BYTES.Length];
+                var bytesRead = stream.Read(buffer, 0, MAGIC_BYTES.Length);
+
+                if (bytesRead == MAGIC_BYTES.Length && buffer.SequenceEqual(MAGIC_BYTES))
+                {
+                    return true;
+                }
+                return false;
+            }
+            finally
+            {
+                stream.Position = initialPosition; // probing must leave the stream where it was found
+            }
+        }
+
+        public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+        {
+            // For this test, we just return a synthetic file entry showing the custom extractor worked
+            var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by TestCustomExtractor"));
+            yield return new FileEntry("extracted_from_custom.txt", content, fileEntry);
+        }
+
+        public async IAsyncEnumerable<FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+        {
+            // For this test, we just return a synthetic file entry showing the custom extractor worked
+            var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by TestCustomExtractor"));
+            yield return new FileEntry("extracted_from_custom.txt", content, fileEntry);
+            await Task.CompletedTask; // gives the async iterator an await; nothing is actually awaited
+        }
+    }
+
+    /// <summary>
+    /// A second test custom extractor that recognizes files starting with "CUSTOM2"
+    /// </summary>
+    private class SecondTestCustomExtractor : ICustomAsyncExtractor
+    {
+        private readonly Extractor context;
+        private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("CUSTOM2");
+
+        public SecondTestCustomExtractor(Extractor ctx)
+        {
+            context = ctx;
+        }
+
+        public bool CanExtract(Stream stream)
+        {
+            if (stream == null || !stream.CanRead || !stream.CanSeek || stream.Length < MAGIC_BYTES.Length)
+            {
+                return false;
+            }
+
+            var initialPosition = stream.Position;
+            try
+            {
+                stream.Position = 0;
+                var buffer = new byte[MAGIC_BYTES.Length];
+                var bytesRead = stream.Read(buffer, 0, MAGIC_BYTES.Length);
+
+                if (bytesRead == MAGIC_BYTES.Length && buffer.SequenceEqual(MAGIC_BYTES))
+                {
+                    return true;
+                }
+                return false;
+            }
+            finally
+            {
+                stream.Position = initialPosition; // probing must leave the stream where it was found
+            }
+        }
+
+        public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+        {
+            var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by SecondTestCustomExtractor"));
+            yield return new FileEntry("extracted_from_second_custom.txt", content, fileEntry);
+        }
+
+        public async IAsyncEnumerable<FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+        {
+            var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by SecondTestCustomExtractor"));
+            yield return new FileEntry("extracted_from_second_custom.txt", content, fileEntry);
+            await Task.CompletedTask; // gives the async iterator an await; nothing is actually awaited
+        }
+    }
+
+    [TestMethod]
+    public void Constructor_WithCustomExtractors_RegistersExtractors()
+    {
+        var customExtractor = new TestCustomExtractor(null!);
+        var extractor = new Extractor(new[] { customExtractor });
+
+        Assert.AreEqual(1, extractor.CustomExtractors.Count);
+    }
+
+    [TestMethod]
+    public void Constructor_WithMultipleCustomExtractors_RegistersAll()
+    {
+        var customExtractor1 = new TestCustomExtractor(null!);
+        var customExtractor2 = new SecondTestCustomExtractor(null!);
+        var extractor = new Extractor(new ICustomAsyncExtractor[] { customExtractor1, customExtractor2 });
+
+        Assert.AreEqual(2, extractor.CustomExtractors.Count);
+    }
+
+    [TestMethod]
+    public void Constructor_WithNullInCollection_IgnoresNull()
+    {
+        var customExtractor = new TestCustomExtractor(null!);
+        var extractor = new Extractor(new ICustomAsyncExtractor[] { customExtractor, null! });
+
+        Assert.AreEqual(1, extractor.CustomExtractors.Count);
+    }
+
+    [TestMethod]
+    public void Constructor_WithNullCollection_CreatesEmptyExtractor()
+    {
+        var extractor = new Extractor((IEnumerable<ICustomAsyncExtractor>)null!);
+
+        Assert.AreEqual(0, extractor.CustomExtractors.Count);
+    }
+
+    [TestMethod]
+    public void Extract_WithMatchingCustomExtractor_UsesCustomExtractor()
+    {
+        var customExtractor = new TestCustomExtractor(null!);
+        var extractor = new Extractor(new[] { customExtractor });
+
+        // Create a test file with the custom magic bytes
+        var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data");
+        var results = extractor.Extract("test.custom", testData).ToList();
+
+        Assert.AreEqual(1, results.Count);
+        Assert.AreEqual("extracted_from_custom.txt", results[0].Name);
+
+        // Read the content to verify it was processed by our custom extractor
+        using var reader = new StreamReader(results[0].Content);
+        results[0].Content.Position = 0;
+        var content = reader.ReadToEnd();
+        Assert.AreEqual("Extracted by TestCustomExtractor", content);
+    }
+
+    [TestMethod]
+    public async Task ExtractAsync_WithMatchingCustomExtractor_UsesCustomExtractor()
+    {
+        var customExtractor = new TestCustomExtractor(null!);
+        var extractor = new Extractor(new[] { customExtractor });
+
+        // Create a test file with the custom magic bytes
+        var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data");
+        var results = await extractor.ExtractAsync("test.custom", testData).ToListAsync();
+
+        Assert.AreEqual(1, results.Count);
+        Assert.AreEqual("extracted_from_custom.txt", results[0].Name);
+
+        // Read the content to verify it was processed by our custom extractor
+        using var reader = new StreamReader(results[0].Content);
+        results[0].Content.Position = 0;
+        var content = reader.ReadToEnd();
+        Assert.AreEqual("Extracted by TestCustomExtractor", content);
+    }
+
+    [TestMethod]
+    public void Extract_WithoutMatchingCustomExtractor_ReturnsOriginalFile()
+    {
+        var customExtractor = new TestCustomExtractor(null!);
+        var extractor = new Extractor(new[] { customExtractor });
+
+        // Create a test file that doesn't match the custom magic bytes
+        var testData = System.Text.Encoding.ASCII.GetBytes("NOTCUSTOM This is test data");
+        var results = extractor.Extract("test.txt", testData).ToList();
+
+        // Should return the original file since no custom extractor matched
+        Assert.AreEqual(1, results.Count);
+        Assert.AreEqual("test.txt", results[0].Name);
+
+        // Verify it's the original content
+        using var reader = new StreamReader(results[0].Content);
+        results[0].Content.Position = 0;
+        var content = reader.ReadToEnd();
+        Assert.AreEqual("NOTCUSTOM This is test data", content);
+    }
+
+    [TestMethod]
+    public void Extract_MultipleCustomExtractors_UsesCorrectOne()
+    {
+        var extractor = new Extractor(new ICustomAsyncExtractor[]
+        {
+            new TestCustomExtractor(null!),
+            new SecondTestCustomExtractor(null!)
+        });
+
+        // Test with first custom format
+        var testData1 = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 data");
+        var results1 = extractor.Extract("test1.custom", testData1).ToList();
+        Assert.AreEqual(1, results1.Count);
+        Assert.AreEqual("extracted_from_custom.txt", results1[0].Name);
+
+        // Test with second custom format
+        var testData2 = System.Text.Encoding.ASCII.GetBytes("CUSTOM2 data");
+        var results2 = extractor.Extract("test2.custom", testData2).ToList();
+        Assert.AreEqual(1, results2.Count);
+        Assert.AreEqual("extracted_from_second_custom.txt", results2[0].Name);
+    }
+
+    [TestMethod]
+    public void Extract_NoCustomExtractors_ReturnsOriginalFile()
+    {
+        var extractor = new Extractor();
+
+        // Don't add any custom extractors
+        var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data");
+        var results = extractor.Extract("test.custom", testData).ToList();
+
+        // Should return the original file since no custom extractor is registered
+        Assert.AreEqual(1, results.Count);
+        Assert.AreEqual("test.custom", results[0].Name);
+    }
+
+    [TestMethod]
+    public void Extract_CustomExtractorForKnownFormat_UsesBuiltInExtractor()
+    {
+        var customExtractor = new TestCustomExtractor(null!);
+        var extractor = new Extractor(new[] { customExtractor });
+
+        // Test with a real ZIP file - should use built-in extractor, not custom
+        var path = Path.Combine(Directory.GetCurrentDirectory(), "TestData", "TestDataArchives", "EmptyFile.txt.zip");
+        if (!File.Exists(path))
+        {
+            Assert.Inconclusive($"Test asset not found: {path}"); // surface a missing asset instead of passing with zero assertions
+        }
+        var results = extractor.Extract(path).ToList();
+        // Should extract the ZIP normally, not use the custom extractor
+        Assert.IsTrue(results.Count > 0);
+        Assert.IsTrue(results.Any(r => r.Name.Contains("EmptyFile")));
+    }
+}
diff --git a/RecursiveExtractor/Extractor.cs b/RecursiveExtractor/Extractor.cs
index 20564b5..15dafc5 100644
--- a/RecursiveExtractor/Extractor.cs
+++ b/RecursiveExtractor/Extractor.cs
@@ -39,8 +39,32 @@ public Extractor()
SetDefaultExtractors();
}
+        /// <summary>
+        /// Instantiate an extractor with the default extractors and custom extractors.
+        /// </summary>
+        /// <param name="customExtractors">Custom extractors to register for handling file types not natively supported. A null collection, null elements, and duplicates are ignored.</param>
+        public Extractor(IEnumerable<ICustomAsyncExtractor> customExtractors) : this()
+        {
+            if (customExtractors != null)
+            {
+                foreach (var extractor in customExtractors)
+                {
+                    if (extractor != null && !CustomExtractors.Contains(extractor)) // keep the first occurrence only, as the former set did
+                    {
+                        CustomExtractors.Add(extractor); // ICollection<T>.Add needs no downcast to the concrete collection type
+                    }
+                }
+            }
+        }
+
internal Dictionary Extractors { get; } = new Dictionary();
+        /// <summary>
+        /// Custom extractors for file types not natively supported, kept in registration order (a List, because HashSet does not guarantee enumeration order).
+        /// These are checked, in that order, when a file type is detected as UNKNOWN or has no built-in extractor registered.
+        /// </summary>
+        internal ICollection<ICustomAsyncExtractor> CustomExtractors { get; } = new List<ICustomAsyncExtractor>();
+
///
/// Set up the Default Extractors compatible with this platform.
///
@@ -308,6 +332,31 @@ public async IAsyncEnumerable ExtractAsync(string filename, byte[] ar
///
private readonly NLog.Logger Logger = NLog.LogManager.GetCurrentClassLogger();
+        /// <summary>
+        /// Probes the registered custom extractors and returns the first one whose CanExtract accepts the entry's content stream.
+        /// </summary>
+        /// <param name="fileEntry">The file entry whose content stream is offered to each custom extractor.</param>
+        /// <returns>The first custom extractor that reports it can handle the content, or null when none does.</returns>
+        private ICustomAsyncExtractor? FindMatchingCustomExtractor(FileEntry fileEntry)
+        {
+            foreach (var candidate in CustomExtractors)
+            {
+                try
+                {
+                    if (candidate.CanExtract(fileEntry.Content))
+                    {
+                        Logger.Debug("Custom extractor {0} matched for file {1}", candidate.GetType().Name, fileEntry.FullPath);
+                        return candidate;
+                    }
+                }
+                catch (Exception probeFailure) // swallowed and logged so the remaining extractors are still tried
+                {
+                    Logger.Debug("Custom extractor {0} threw exception when checking {1}: {2}", candidate.GetType().Name, fileEntry.FullPath, probeFailure.Message);
+                }
+            }
+            return null;
+        }
+
///
/// Extract asynchronously from a FileEntry.
///
@@ -348,13 +397,39 @@ public async IAsyncEnumerable ExtractAsync(FileEntry fileEntry, Extra
var type = fileEntry.ArchiveType;
if (options.IsAcceptableType(type))
{
- if (((opts?.RawExtensions?.Any(x => Path.GetExtension(fileEntry.FullPath).Equals(x)) ?? false) || type == ArchiveFileType.UNKNOWN || !Extractors.ContainsKey(type)))
+ // If this file should be treated as a raw file based on extension, just yield it
+ if (opts?.RawExtensions?.Any(x => Path.GetExtension(fileEntry.FullPath).Equals(x)) ?? false)
{
if (options.FileNamePasses(fileEntry.FullPath))
{
yield return fileEntry;
}
}
+ // If type is UNKNOWN or no extractor is registered, check custom extractors
+ else if (type == ArchiveFileType.UNKNOWN || !Extractors.ContainsKey(type))
+ {
+ // Try to find a custom extractor that can handle this file
+ var customExtractor = FindMatchingCustomExtractor(fileEntry);
+ if (customExtractor != null)
+ {
+ // Use the custom extractor
+ await foreach (var result in customExtractor.ExtractAsync(fileEntry, options, resourceGovernor, false))
+ {
+ if (options.FileNamePasses(result.FullPath))
+ {
+ yield return result;
+ }
+ }
+ }
+ else
+ {
+ // No custom extractor found, yield as raw file
+ if (options.FileNamePasses(fileEntry.FullPath))
+ {
+ yield return fileEntry;
+ }
+ }
+ }
else
{
await foreach (var result in Extractors[type].ExtractAsync(fileEntry, options, resourceGovernor, false))
@@ -640,14 +715,38 @@ public IEnumerable Extract(FileEntry fileEntry, ExtractorOptions? opt
resourceGovernor.AdjustRemainingBytes(-fileEntry.Content.Length);
// If this file should be treated as a raw file, and not extracted, just yield it
- if (options.RawExtensions.Any(x => Path.GetExtension(fileEntry.FullPath).Equals(x)) ||
- type == ArchiveFileType.UNKNOWN || !Extractors.ContainsKey(type))
+ if (options.RawExtensions.Any(x => Path.GetExtension(fileEntry.FullPath).Equals(x)))
{
if (options.FileNamePasses(fileEntry.FullPath))
{
yield return fileEntry;
}
}
+ // If type is UNKNOWN or no extractor is registered, check custom extractors
+ else if (type == ArchiveFileType.UNKNOWN || !Extractors.ContainsKey(type))
+ {
+ // Try to find a custom extractor that can handle this file
+ var customExtractor = FindMatchingCustomExtractor(fileEntry);
+ if (customExtractor != null)
+ {
+ // Use the custom extractor
+ foreach (var extractedResult in customExtractor.Extract(fileEntry, options, resourceGovernor, false))
+ {
+ if (options.FileNamePasses(extractedResult.FullPath))
+ {
+ yield return extractedResult;
+ }
+ }
+ }
+ else
+ {
+ // No custom extractor found, yield as raw file
+ if (options.FileNamePasses(fileEntry.FullPath))
+ {
+ yield return fileEntry;
+ }
+ }
+ }
// Otherwise yield all the results from its extraction
else
{
diff --git a/RecursiveExtractor/Extractors/ICustomAsyncExtractor.cs b/RecursiveExtractor/Extractors/ICustomAsyncExtractor.cs
new file mode 100644
index 0000000..64f2dc0
--- /dev/null
+++ b/RecursiveExtractor/Extractors/ICustomAsyncExtractor.cs
@@ -0,0 +1,20 @@
+using System.IO;
+
+namespace Microsoft.CST.RecursiveExtractor.Extractors
+{
+ /// <summary>
+ /// An interface for custom extractors that can determine if they can handle a given stream.
+ /// This allows library users to extend the extractor with support for additional archive types.
+ /// </summary>
+ public interface ICustomAsyncExtractor : AsyncExtractorInterface
+ {
+ /// <summary>
+ /// Determines if this extractor can extract the given stream based on binary signatures or other criteria.
+ /// This method should check the stream's content (similar to how MiniMagic works) and return true if this
+ /// extractor supports the file format.
+ /// </summary>
+ /// <param name="stream">The stream to check. The implementation should preserve the stream's original position.</param>
+ /// <returns>True if this extractor can handle the stream, false otherwise.</returns>
+ bool CanExtract(Stream stream);
+ }
+}
diff --git a/RecursiveExtractor/RecursiveExtractor.csproj b/RecursiveExtractor/RecursiveExtractor.csproj
index 09b0c90..9fdec44 100644
--- a/RecursiveExtractor/RecursiveExtractor.csproj
+++ b/RecursiveExtractor/RecursiveExtractor.csproj
@@ -24,6 +24,12 @@
snupkg
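+  <ItemGroup>
+    <AssemblyAttribute Include="System.Runtime.CompilerServices.InternalsVisibleTo">
+      <_Parameter1>RecursiveExtractor.Tests</_Parameter1>
+    </AssemblyAttribute>
+  </ItemGroup>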
+