diff --git a/README.md b/README.md index fea318c..65cccf5 100644 --- a/README.md +++ b/README.md @@ -162,6 +162,88 @@ catch(OverflowException) ``` +
+Custom Extractors for Additional File Types +
+You can extend RecursiveExtractor with custom extractors to support additional archive or file formats not natively supported. This is useful for formats like MSI, MSP, or other proprietary archive formats. + +To create a custom extractor, implement the `ICustomAsyncExtractor` interface and register it with the extractor: + +```csharp +using Microsoft.CST.RecursiveExtractor; +using Microsoft.CST.RecursiveExtractor.Extractors; +using System.IO; +using System.Collections.Generic; +using System.Linq; + +// Example: Custom extractor for a hypothetical archive format with magic bytes "MYARC" +public class MyCustomExtractor : ICustomAsyncExtractor +{ + private readonly Extractor context; + private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("MYARC"); + + public MyCustomExtractor(Extractor ctx) + { + context = ctx; + } + + // Check if this extractor can handle the file based on binary signatures + public bool CanExtract(Stream stream) + { + if (stream == null || !stream.CanRead || !stream.CanSeek || stream.Length < MAGIC_BYTES.Length) + { + return false; + } + + var initialPosition = stream.Position; + try + { + stream.Position = 0; + var buffer = new byte[MAGIC_BYTES.Length]; + var bytesRead = stream.Read(buffer, 0, MAGIC_BYTES.Length); + + return bytesRead == MAGIC_BYTES.Length && buffer.SequenceEqual(MAGIC_BYTES); + } + finally + { + // Always restore the original position + stream.Position = initialPosition; + } + } + + // Implement extraction logic + public IEnumerable Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true) + { + // Your extraction logic here + // For example, parse the archive and yield FileEntry objects for each contained file + yield break; + } + + public async IAsyncEnumerable ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true) + { + // Your async extraction logic here + yield break; + } +} + +// Register the 
custom extractor via constructor +var customExtractor = new MyCustomExtractor(null); +var extractor = new Extractor(new[] { customExtractor }); + +// Now the extractor will use your custom extractor for files matching your CanExtract criteria +var results = extractor.Extract("path/to/custom/archive.myarc"); +``` + +Key points: +- The `CanExtract` method should check the stream's binary signature (like MiniMagic does) and return true if this extractor can handle the format +- Always preserve the stream's original position in `CanExtract` +- Custom extractors are provided via the constructor as an `IEnumerable<ICustomAsyncExtractor>` +- Custom extractors are only checked when the file type is UNKNOWN or has no registered built-in extractor, and the file's extension is not listed in `RawExtensions` +- Multiple custom extractors can be registered; they are checked in the order provided +- Custom extractors are invoked for both synchronous and asynchronous extraction paths + +
+ ## Exceptions RecursiveExtractor protects against [ZipSlip](https://snyk.io/research/zip-slip-vulnerability), [Quines, and Zip Bombs](https://en.wikipedia.org/wiki/Zip_bomb). Calls to Extract will throw an `OverflowException` when a Quine or Zip bomb is detected and a `TimeOutException` if `EnableTiming` is set and the specified time period has elapsed before completion. diff --git a/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs new file mode 100644 index 0000000..547ba87 --- /dev/null +++ b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs @@ -0,0 +1,274 @@ +using Microsoft.CST.RecursiveExtractor; +using Microsoft.CST.RecursiveExtractor.Extractors; +using Microsoft.VisualStudio.TestTools.UnitTesting; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Threading.Tasks; + +namespace RecursiveExtractor.Tests.ExtractorTests; + +[TestClass] +public class CustomExtractorTests +{ + /// + /// A simple test custom extractor that extracts files with a specific magic number + /// For testing purposes, it recognizes files starting with "CUSTOM1" + /// + private class TestCustomExtractor : ICustomAsyncExtractor + { + private readonly Extractor context; + private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("CUSTOM1"); + + public TestCustomExtractor(Extractor ctx) + { + context = ctx; + } + + public bool CanExtract(Stream stream) + { + if (stream == null || !stream.CanRead || !stream.CanSeek || stream.Length < MAGIC_BYTES.Length) + { + return false; + } + + var initialPosition = stream.Position; + try + { + stream.Position = 0; + var buffer = new byte[MAGIC_BYTES.Length]; + var bytesRead = stream.Read(buffer, 0, MAGIC_BYTES.Length); + + if (bytesRead == MAGIC_BYTES.Length && buffer.SequenceEqual(MAGIC_BYTES)) + { + return true; + } + return false; + } + finally + { + stream.Position = initialPosition; + 
} + } + + public IEnumerable Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true) + { + // For this test, we just return a synthetic file entry showing the custom extractor worked + var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by TestCustomExtractor")); + yield return new FileEntry("extracted_from_custom.txt", content, fileEntry); + } + + public async IAsyncEnumerable ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true) + { + // For this test, we just return a synthetic file entry showing the custom extractor worked + var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by TestCustomExtractor")); + yield return new FileEntry("extracted_from_custom.txt", content, fileEntry); + await Task.CompletedTask; + } + } + + /// + /// A second test custom extractor that recognizes files starting with "CUSTOM2" + /// + private class SecondTestCustomExtractor : ICustomAsyncExtractor + { + private readonly Extractor context; + private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("CUSTOM2"); + + public SecondTestCustomExtractor(Extractor ctx) + { + context = ctx; + } + + public bool CanExtract(Stream stream) + { + if (stream == null || !stream.CanRead || !stream.CanSeek || stream.Length < MAGIC_BYTES.Length) + { + return false; + } + + var initialPosition = stream.Position; + try + { + stream.Position = 0; + var buffer = new byte[MAGIC_BYTES.Length]; + var bytesRead = stream.Read(buffer, 0, MAGIC_BYTES.Length); + + if (bytesRead == MAGIC_BYTES.Length && buffer.SequenceEqual(MAGIC_BYTES)) + { + return true; + } + return false; + } + finally + { + stream.Position = initialPosition; + } + } + + public IEnumerable Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true) + { + var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted 
by SecondTestCustomExtractor")); + yield return new FileEntry("extracted_from_second_custom.txt", content, fileEntry); + } + + public async IAsyncEnumerable ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true) + { + var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by SecondTestCustomExtractor")); + yield return new FileEntry("extracted_from_second_custom.txt", content, fileEntry); + await Task.CompletedTask; + } + } + + [TestMethod] + public void Constructor_WithCustomExtractors_RegistersExtractors() + { + var customExtractor = new TestCustomExtractor(null!); + var extractor = new Extractor(new[] { customExtractor }); + + Assert.AreEqual(1, extractor.CustomExtractors.Count); + } + + [TestMethod] + public void Constructor_WithMultipleCustomExtractors_RegistersAll() + { + var customExtractor1 = new TestCustomExtractor(null!); + var customExtractor2 = new SecondTestCustomExtractor(null!); + var extractor = new Extractor(new ICustomAsyncExtractor[] { customExtractor1, customExtractor2 }); + + Assert.AreEqual(2, extractor.CustomExtractors.Count); + } + + [TestMethod] + public void Constructor_WithNullInCollection_IgnoresNull() + { + var customExtractor = new TestCustomExtractor(null!); + var extractor = new Extractor(new ICustomAsyncExtractor[] { customExtractor, null! 
}); + + Assert.AreEqual(1, extractor.CustomExtractors.Count); + } + + [TestMethod] + public void Constructor_WithNullCollection_CreatesEmptyExtractor() + { + var extractor = new Extractor((IEnumerable)null!); + + Assert.AreEqual(0, extractor.CustomExtractors.Count); + } + + [TestMethod] + public void Extract_WithMatchingCustomExtractor_UsesCustomExtractor() + { + var customExtractor = new TestCustomExtractor(null!); + var extractor = new Extractor(new[] { customExtractor }); + + // Create a test file with the custom magic bytes + var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data"); + var results = extractor.Extract("test.custom", testData).ToList(); + + Assert.AreEqual(1, results.Count); + Assert.AreEqual("extracted_from_custom.txt", results[0].Name); + + // Read the content to verify it was processed by our custom extractor + using var reader = new StreamReader(results[0].Content); + results[0].Content.Position = 0; + var content = reader.ReadToEnd(); + Assert.AreEqual("Extracted by TestCustomExtractor", content); + } + + [TestMethod] + public async Task ExtractAsync_WithMatchingCustomExtractor_UsesCustomExtractor() + { + var customExtractor = new TestCustomExtractor(null!); + var extractor = new Extractor(new[] { customExtractor }); + + // Create a test file with the custom magic bytes + var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data"); + var results = await extractor.ExtractAsync("test.custom", testData).ToListAsync(); + + Assert.AreEqual(1, results.Count); + Assert.AreEqual("extracted_from_custom.txt", results[0].Name); + + // Read the content to verify it was processed by our custom extractor + using var reader = new StreamReader(results[0].Content); + results[0].Content.Position = 0; + var content = reader.ReadToEnd(); + Assert.AreEqual("Extracted by TestCustomExtractor", content); + } + + [TestMethod] + public void Extract_WithoutMatchingCustomExtractor_ReturnsOriginalFile() + { + var customExtractor 
= new TestCustomExtractor(null!); + var extractor = new Extractor(new[] { customExtractor }); + + // Create a test file that doesn't match the custom magic bytes + var testData = System.Text.Encoding.ASCII.GetBytes("NOTCUSTOM This is test data"); + var results = extractor.Extract("test.txt", testData).ToList(); + + // Should return the original file since no custom extractor matched + Assert.AreEqual(1, results.Count); + Assert.AreEqual("test.txt", results[0].Name); + + // Verify it's the original content + using var reader = new StreamReader(results[0].Content); + results[0].Content.Position = 0; + var content = reader.ReadToEnd(); + Assert.AreEqual("NOTCUSTOM This is test data", content); + } + + [TestMethod] + public void Extract_MultipleCustomExtractors_UsesCorrectOne() + { + var extractor = new Extractor(new ICustomAsyncExtractor[] + { + new TestCustomExtractor(null!), + new SecondTestCustomExtractor(null!) + }); + + // Test with first custom format + var testData1 = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 data"); + var results1 = extractor.Extract("test1.custom", testData1).ToList(); + Assert.AreEqual(1, results1.Count); + Assert.AreEqual("extracted_from_custom.txt", results1[0].Name); + + // Test with second custom format + var testData2 = System.Text.Encoding.ASCII.GetBytes("CUSTOM2 data"); + var results2 = extractor.Extract("test2.custom", testData2).ToList(); + Assert.AreEqual(1, results2.Count); + Assert.AreEqual("extracted_from_second_custom.txt", results2[0].Name); + } + + [TestMethod] + public void Extract_NoCustomExtractors_ReturnsOriginalFile() + { + var extractor = new Extractor(); + + // Don't add any custom extractors + var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data"); + var results = extractor.Extract("test.custom", testData).ToList(); + + // Should return the original file since no custom extractor is registered + Assert.AreEqual(1, results.Count); + Assert.AreEqual("test.custom", results[0].Name); + } + + 
[TestMethod] + public void Extract_CustomExtractorForKnownFormat_UsesBuiltInExtractor() + { + var customExtractor = new TestCustomExtractor(null!); + var extractor = new Extractor(new[] { customExtractor }); + + // Test with a real ZIP file - should use built-in extractor, not custom + var path = Path.Combine(Directory.GetCurrentDirectory(), "TestData", "TestDataArchives", "EmptyFile.txt.zip"); + if (File.Exists(path)) + { + var results = extractor.Extract(path).ToList(); + + // Should extract the ZIP normally, not use the custom extractor + Assert.IsTrue(results.Count > 0); + Assert.IsTrue(results.Any(r => r.Name.Contains("EmptyFile"))); + } + } +} diff --git a/RecursiveExtractor/Extractor.cs b/RecursiveExtractor/Extractor.cs index 20564b5..15dafc5 100644 --- a/RecursiveExtractor/Extractor.cs +++ b/RecursiveExtractor/Extractor.cs @@ -39,8 +39,32 @@ public Extractor() SetDefaultExtractors(); } + /// + /// Instantiate an extractor with the default extractors and custom extractors. + /// + /// Custom extractors to register, in priority order; null entries and repeated instances are skipped. + public Extractor(IEnumerable customExtractors) : this() + { + if (customExtractors != null) + { + foreach (var extractor in customExtractors) + { + if (extractor != null && !CustomExtractors.Contains(extractor)) // keep only the first occurrence so registration order is preserved + { + CustomExtractors.Add(extractor); + } + } + } + } + internal Dictionary Extractors { get; } = new Dictionary(); + /// + /// Collection of custom extractors that can handle file types not natively supported. + /// These are checked, in registration order, when a file type is UNKNOWN or has no registered built-in extractor. + /// + internal ICollection<ICustomAsyncExtractor> CustomExtractors { get; } = new List<ICustomAsyncExtractor>(); + /// /// Set up the Default Extractors compatible with this platform. /// @@ -308,6 +332,31 @@ public async IAsyncEnumerable ExtractAsync(string filename, byte[] ar /// private readonly NLog.Logger Logger = NLog.LogManager.GetCurrentClassLogger(); + /// + /// Finds a custom extractor that can handle the given file entry. + /// + /// The file entry to check. 
+ /// A custom extractor that can handle the file, or null if none found. + private ICustomAsyncExtractor? FindMatchingCustomExtractor(FileEntry fileEntry) + { + foreach (var customExtractor in CustomExtractors) + { + try + { + if (customExtractor.CanExtract(fileEntry.Content)) + { + Logger.Debug("Custom extractor {0} matched for file {1}", customExtractor.GetType().Name, fileEntry.FullPath); + return customExtractor; + } + } + catch (Exception e) + { + Logger.Debug("Custom extractor {0} threw exception when checking {1}: {2}", customExtractor.GetType().Name, fileEntry.FullPath, e.Message); + } + } + return null; + } + /// /// Extract asynchronously from a FileEntry. /// @@ -348,13 +397,39 @@ public async IAsyncEnumerable ExtractAsync(FileEntry fileEntry, Extra var type = fileEntry.ArchiveType; if (options.IsAcceptableType(type)) { - if (((opts?.RawExtensions?.Any(x => Path.GetExtension(fileEntry.FullPath).Equals(x)) ?? false) || type == ArchiveFileType.UNKNOWN || !Extractors.ContainsKey(type))) + // If this file should be treated as a raw file based on extension, just yield it + if (opts?.RawExtensions?.Any(x => Path.GetExtension(fileEntry.FullPath).Equals(x)) ?? 
false) { if (options.FileNamePasses(fileEntry.FullPath)) { yield return fileEntry; } } + // If type is UNKNOWN or no extractor is registered, check custom extractors + else if (type == ArchiveFileType.UNKNOWN || !Extractors.ContainsKey(type)) + { + // Try to find a custom extractor that can handle this file + var customExtractor = FindMatchingCustomExtractor(fileEntry); + if (customExtractor != null) + { + // Use the custom extractor + await foreach (var result in customExtractor.ExtractAsync(fileEntry, options, resourceGovernor, false)) + { + if (options.FileNamePasses(result.FullPath)) + { + yield return result; + } + } + } + else + { + // No custom extractor found, yield as raw file + if (options.FileNamePasses(fileEntry.FullPath)) + { + yield return fileEntry; + } + } + } else { await foreach (var result in Extractors[type].ExtractAsync(fileEntry, options, resourceGovernor, false)) @@ -640,14 +715,38 @@ public IEnumerable Extract(FileEntry fileEntry, ExtractorOptions? opt resourceGovernor.AdjustRemainingBytes(-fileEntry.Content.Length); // If this file should be treated as a raw file, and not extracted, just yield it - if (options.RawExtensions.Any(x => Path.GetExtension(fileEntry.FullPath).Equals(x)) || - type == ArchiveFileType.UNKNOWN || !Extractors.ContainsKey(type)) + if (options.RawExtensions.Any(x => Path.GetExtension(fileEntry.FullPath).Equals(x))) { if (options.FileNamePasses(fileEntry.FullPath)) { yield return fileEntry; } } + // If type is UNKNOWN or no extractor is registered, check custom extractors + else if (type == ArchiveFileType.UNKNOWN || !Extractors.ContainsKey(type)) + { + // Try to find a custom extractor that can handle this file + var customExtractor = FindMatchingCustomExtractor(fileEntry); + if (customExtractor != null) + { + // Use the custom extractor + foreach (var extractedResult in customExtractor.Extract(fileEntry, options, resourceGovernor, false)) + { + if (options.FileNamePasses(extractedResult.FullPath)) + { + yield return 
extractedResult; + } + } + } + else + { + // No custom extractor found, yield as raw file + if (options.FileNamePasses(fileEntry.FullPath)) + { + yield return fileEntry; + } + } + } // Otherwise yield all the results from its extraction else { diff --git a/RecursiveExtractor/Extractors/ICustomAsyncExtractor.cs b/RecursiveExtractor/Extractors/ICustomAsyncExtractor.cs new file mode 100644 index 0000000..64f2dc0 --- /dev/null +++ b/RecursiveExtractor/Extractors/ICustomAsyncExtractor.cs @@ -0,0 +1,20 @@ +using System.IO; + +namespace Microsoft.CST.RecursiveExtractor.Extractors +{ + /// + /// An interface for custom extractors that can determine if they can handle a given stream. + /// This allows library users to extend the extractor with support for additional archive types. + /// + public interface ICustomAsyncExtractor : AsyncExtractorInterface + { + /// + /// Determines if this extractor can extract the given stream based on binary signatures or other criteria. + /// This method should check the stream's content (similar to how MiniMagic works) and return true if this + /// extractor supports the file format. + /// + /// The stream to check. The implementation should preserve the stream's original position. + /// True if this extractor can handle the stream, false otherwise. + bool CanExtract(Stream stream); + } +} diff --git a/RecursiveExtractor/RecursiveExtractor.csproj b/RecursiveExtractor/RecursiveExtractor.csproj index 09b0c90..9fdec44 100644 --- a/RecursiveExtractor/RecursiveExtractor.csproj +++ b/RecursiveExtractor/RecursiveExtractor.csproj @@ -24,6 +24,12 @@ snupkg + + + <_Parameter1>RecursiveExtractor.Tests + + +