-
Notifications
You must be signed in to change notification settings - Fork 125
Optimize matchesFilenamePattern #875
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -69,18 +69,45 @@ public struct Path: Serializable, Sendable { | |
| /// The system path separator. | ||
| #if os(Windows) | ||
| public static let pathSeparator = Character("\\") | ||
| public static let pathSeparatorUTF8 = UInt8(ascii: "\\") | ||
| public static let pathSeparatorsUTF8 = Set([UInt8(ascii: "\\"), UInt8(ascii: "/")]) | ||
| @inline(__always) public static var pathSeparatorUTF8: UInt8 { UInt8(ascii: "\\") } | ||
| public static let pathEnvironmentSeparator = Character(";") | ||
| public static let pathSeparators = Set("\\/") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here we replace a full Set lookup of Strings with a few integer comparisons. The non-default separators path is probably slower, but there's no reason I can see to care about that. Also removes a heap allocation and a lazy initialization. |
||
| @inline(__always) public static func isUTF8PathSeparator(_ char: UInt8, separators: (some Collection<Character>)? = ([Character]?).none) -> Bool { | ||
| guard let separators else { | ||
| return char == pathSeparatorUTF8 || char == UInt8(ascii: "/") | ||
| } | ||
| // This is a bit inefficient, but separators should always be nil outside of tests | ||
| return separators.contains(String(decoding: CollectionOfOne(char), as: UTF8.self)) | ||
| } | ||
| @inline(__always) public static func firstPathSeparatorIndex(in str: some StringProtocol, separators: (some Collection<Character>)?) -> String.Index? { | ||
| guard let separators else { | ||
| return str.utf8.firstIndex(where: { Path.isUTF8PathSeparator($0, separators: separators) }) | ||
| } | ||
| return str.firstIndex(where: { separators.contains($0) }) | ||
| } | ||
| #else | ||
| public static let pathSeparator = Character("/") | ||
| public static let pathSeparatorUTF8 = UInt8(ascii: "/") | ||
| public static let pathSeparatorsUTF8 = Set([UInt8(ascii: "/")]) | ||
| @inline(__always) public static var pathSeparatorUTF8: UInt8 { UInt8(ascii: "/") } | ||
| public static let pathEnvironmentSeparator = Character(":") | ||
| public static let pathSeparators = Set([Character("/")]) | ||
| @inline(__always) public static func isUTF8PathSeparator(_ char: UInt8, separators: (some Collection<Character>)? = ([Character]?).none) -> Bool { | ||
| guard let separators else { | ||
| return char == pathSeparatorUTF8 | ||
| } | ||
| // This is a bit inefficient, but separators should always be nil outside of tests | ||
| return separators.contains(String(decoding: CollectionOfOne(char), as: UTF8.self)) | ||
| } | ||
| @inline(__always) public static func firstPathSeparatorIndex(in str: some StringProtocol, separators: (some Collection<Character>)?) -> String.Index? { | ||
| guard let separators else { | ||
| return str.utf8.index(of: pathSeparatorUTF8) | ||
| } | ||
| return str.firstIndex(where: { separators.contains($0) }) | ||
| } | ||
| #endif | ||
|
|
||
| @inline(__always) public static func isPathSeparator(_ char: Character, separators: (some Collection<Character>)?) -> Bool { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, replacing set lookups with integer comparisons |
||
| guard let c = char.utf8.first else { return false } | ||
| return isUTF8PathSeparator(c, separators: separators) | ||
| } | ||
|
|
||
| /// The system path separator, as a string. | ||
| public static let pathSeparatorString = String(pathSeparator) | ||
|
|
||
|
|
@@ -717,9 +744,10 @@ public struct Path: Serializable, Sendable { | |
| var numComponents = 0 | ||
| var isInPathComponent = false | ||
| var nextCharacterIsEscaped = false | ||
| for idx in pattern.indices { | ||
| for byte in pattern.utf8 { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Everything we're comparing this to is a single grapheme, so we can skip grapheme semantics here |
||
| // Skip over path separators, unless they're escaped. | ||
| if pattern[idx] == Path.pathSeparator { | ||
| //TODO: should this (and other similar uses) be Path.isUTF8PathSeparator(byte) instead for Windows? | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd like feedback on this please. To my eye it looks like a spot where Windows would want us to check both path separators, but I wasn't sure, and the old code didn't do so. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's hard to say. Generally you wouldn't ever want to match \ without also looking for /, but I think \ is also used as an escape sequence within the pattern here, so I'm not sure... |
||
| if byte == Path.pathSeparatorUTF8 { | ||
| if !nextCharacterIsEscaped { | ||
| isInPathComponent = false | ||
| } | ||
|
|
@@ -736,7 +764,7 @@ public struct Path: Serializable, Sendable { | |
| nextCharacterIsEscaped = false | ||
| } | ||
| else { | ||
| nextCharacterIsEscaped = (pattern[idx] == Character("\\")) | ||
| nextCharacterIsEscaped = (byte == UInt8(ascii: "\\")) | ||
| } | ||
| } | ||
| return numComponents | ||
|
|
@@ -746,19 +774,20 @@ public struct Path: Serializable, Sendable { | |
| var numPathComponentsInPath = 0 | ||
| var isInPathComponent = false | ||
| var firstIdx: String.Index? | ||
| for idx in self.str.indices.reversed() { | ||
| let utf8Str = self.str.utf8 | ||
| for idx in utf8Str.indices.reversed() { | ||
| // Skip over path separators. We ignore backslashes here, since paths don't have escape characters. | ||
| if self.str[idx] == Path.pathSeparator { | ||
| if utf8Str[idx] == Path.pathSeparatorUTF8 { | ||
| isInPathComponent = false | ||
| // If we've found the expected number of path components, then we stop, and record the index of the first character we want to match against. | ||
| if numPathComponentsInPath == numPathComponentsInPattern { | ||
| if idx != self.str.endIndex { | ||
| firstIdx = self.str.index(after: idx) | ||
| if idx != utf8Str.endIndex { | ||
| firstIdx = utf8Str.index(after: idx) | ||
| } | ||
| break | ||
| } | ||
| } | ||
| else if idx == self.str.startIndex { | ||
| else if idx == utf8Str.startIndex { | ||
| // If we didn't encounter a path separator, then the full string is the trailing subpath. | ||
| firstIdx = idx | ||
| break | ||
|
|
@@ -781,7 +810,7 @@ public struct Path: Serializable, Sendable { | |
| } | ||
|
|
||
| // Create a string from the first index we found to the end of the path. | ||
| let trailingSubpath = String(self.str[first..<self.str.endIndex]) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This just got converted back to a Substring immediately in the callee, no reason not to leave it as one to begin with. |
||
| let trailingSubpath = self.str[first..<self.str.endIndex] | ||
|
|
||
| // Match the pattern against the requisite number of trailing path components. | ||
| do { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -36,16 +36,6 @@ private enum RangeStatus { | |
| case error | ||
| } | ||
|
|
||
| private extension StringProtocol { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was unused after my other changes |
||
| @inline(__always) | ||
| func firstIndex(matching characters: Set<Character>) -> String.Index? { | ||
| if characters.isEmpty { | ||
| return nil | ||
| } | ||
| return firstIndex(where: {characters.contains($0)}) | ||
| } | ||
| } | ||
|
|
||
| /// Multi-platform fnmatch implementation. This is intended to be a close match the the POSIX fnmatch of all platforms including Windows (though not all options are supported). | ||
| /// | ||
| /// - parameter pattern: The pattern to match. When using the ``FnmatchOptions/pathname`` option, any path representation in the pattern is expected to use the POSIX path separator (`/`) to match with the input, and on Windows, the path separator (`/`) will be matched to either separator in the input string ( both `/` and `\` will be matched). | ||
|
|
@@ -54,7 +44,7 @@ private extension StringProtocol { | |
| /// - returns: `true` if the pattern matches the input, `false` otherwise. | ||
| /// | ||
| /// - note: On Windows and when using the ``FnmatchOptions/pathname`` option, both separators (`/` and `\`) are recognized (see note on pattern parameter). | ||
| public func fnmatch(pattern: some StringProtocol, input: some StringProtocol, options: FnmatchOptions = .default, pathSeparators: Set<Character> = Path.pathSeparators) throws | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Defaulting to nil (sorry about the weird spelling of nil, it couldn't infer the type properly otherwise) lets us use the new default behavior that avoids lookups entirely. |
||
| public func fnmatch(pattern: some StringProtocol, input: some StringProtocol, options: FnmatchOptions = .default, pathSeparators: (some Collection<Character>)? = ([Character]?).none) throws | ||
| -> Bool | ||
| { | ||
| // Use Substrings to avoid String allocations | ||
|
|
@@ -76,32 +66,32 @@ public func fnmatch(pattern: some StringProtocol, input: some StringProtocol, op | |
| return false | ||
| } | ||
| case "?": | ||
| guard let _sc = input.first else { | ||
| guard let _sc = input.utf8.first else { | ||
| return false | ||
| } | ||
| if options.contains(.pathname) && pathSeparators.contains(_sc) { | ||
| if options.contains(.pathname) && Path.isUTF8PathSeparator(_sc, separators: pathSeparators) { | ||
| if backtrack() { | ||
| return false | ||
| } | ||
| } | ||
| input = input.dropFirst() | ||
| case "*": | ||
| var p = pattern.first | ||
| while pattern.first == "*" { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "*" is always a single grapheme, so using utf8 will get us the same results as characters |
||
| var p = pattern.utf8.first | ||
| while pattern.utf8.first == UInt8(ascii: "*") { | ||
| // consume multiple '*' in pattern | ||
| pattern = pattern.dropFirst() | ||
| p = pattern.first | ||
| p = pattern.utf8.first | ||
| } | ||
| if p == nil { | ||
| if options.contains(.pathname) { | ||
| // make sure input does not have any more path separators | ||
| return input.firstIndex(matching: pathSeparators) == nil ? true : false | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I really wanted to delete the pathSeparators parameter entirely, but it's used for testing Windows on non-Windows hosts |
||
| return Path.firstPathSeparatorIndex(in: input, separators: pathSeparators) == nil | ||
| } else { | ||
| return true // pattern matched everything else in input | ||
| } | ||
| } else if pattern.first == "/" && options.contains(.pathname) { | ||
| } else if p == UInt8(ascii: "/") && options.contains(.pathname) { | ||
| // we have a '*/' pattern input must have an path separators to continue | ||
| guard let newInputIndex = input.firstIndex(matching: pathSeparators) else { | ||
| guard let newInputIndex = Path.firstPathSeparatorIndex(in: input, separators: pathSeparators) else { | ||
| return false | ||
| } | ||
| input.removeSubrange(..<newInputIndex) | ||
|
|
@@ -110,14 +100,14 @@ public func fnmatch(pattern: some StringProtocol, input: some StringProtocol, op | |
| bt_pattern = pattern | ||
| bt_input = input | ||
| case "[": | ||
| guard let _sc = input.first else { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Reordering these checks lets us do a quick utf8 check without having to construct a Character (since those are String under the hood) |
||
| return false | ||
| } | ||
| if pathSeparators.contains(_sc) && options.contains(.pathname) { | ||
| if let first = input.utf8.first, Path.isUTF8PathSeparator(first, separators: pathSeparators) && options.contains(.pathname) { | ||
| if backtrack() { | ||
| return false | ||
| } | ||
| } | ||
| guard let _sc = input.first else { | ||
| return false | ||
| } | ||
| var new_input = input.dropFirst() | ||
| var new_pattern = pattern | ||
| switch rangematch(pattern: &new_pattern, input: &new_input, test: _sc, options: options, pathSeparators: pathSeparators) { | ||
|
|
@@ -146,7 +136,7 @@ public func fnmatch(pattern: some StringProtocol, input: some StringProtocol, op | |
| continue | ||
| } else { | ||
| // windows need to test for both path separators | ||
| if _pc == "/" && options.contains(.pathname) && pathSeparators.contains(_sc) { | ||
| if _pc == "/" && options.contains(.pathname) && Path.isPathSeparator(_sc, separators: pathSeparators) { | ||
| continue | ||
| } | ||
| if backtrack() { | ||
|
|
@@ -177,15 +167,15 @@ public func fnmatch(pattern: some StringProtocol, input: some StringProtocol, op | |
| } | ||
|
|
||
| @inline(__always) | ||
| private func rangematch(pattern: inout Substring, input: inout Substring, test: Character, options: FnmatchOptions, pathSeparators: Set<Character>) -> RangeStatus { | ||
| private func rangematch(pattern: inout Substring, input: inout Substring, test: Character, options: FnmatchOptions, pathSeparators: (some Collection<Character>)? = ([Character]?).none) -> RangeStatus { | ||
| var test = test | ||
|
|
||
| if !pattern.contains("]") { | ||
| if !pattern.utf8.contains(UInt8(ascii: "]")) { | ||
| // unmatched '[' test as literal '[' | ||
| return "[" == test ? .match : .noMatch | ||
| } | ||
|
|
||
| let negate = pattern.first == "!" | ||
| let negate = pattern.utf8.first == UInt8(ascii: "!") | ||
| if negate { | ||
| pattern = pattern.dropFirst() | ||
| } | ||
|
|
@@ -198,13 +188,13 @@ private func rangematch(pattern: inout Substring, input: inout Substring, test: | |
| if c == "]" { | ||
| break | ||
| } | ||
| if options.contains(.pathname) && pathSeparators.contains(c) { | ||
| if options.contains(.pathname) && Path.isPathSeparator(c, separators: pathSeparators) { | ||
| return .noMatch | ||
| } | ||
| if options.contains(.caseInsensitive) { | ||
| c = Character(c.lowercased()) | ||
| } | ||
| if pattern.first == "-" { | ||
| if pattern.utf8.first == UInt8(ascii: "-") { | ||
| let subPattern = pattern.dropFirst() | ||
| if var c2 = subPattern.first { | ||
| if c2 != "]" { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -247,7 +247,7 @@ import SWBUtil | |
|
|
||
| @Test(arguments: [true, false]) | ||
| func pathnameMatch(isWindows: Bool) throws { | ||
| let separators = isWindows ? Set("\\/") : Set([Character("/")]) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Set("\/") looks so much like a Set containing one two-character-long String that I got confused. The thing it's passed to is generic now, so let's save the next person the wasted time. |
||
| let separators = isWindows ? "\\/" : "/" | ||
|
|
||
| try assertFnmatch(pattern: "x?y", input: "x/y", separators: separators) | ||
| try assertFnmatch(pattern: "x?y", input: "x/y", shouldMatch: false, options: [.pathname], separators: separators) | ||
|
|
@@ -272,7 +272,7 @@ import SWBUtil | |
| } | ||
|
|
||
| func assertFnmatch( | ||
| pattern: String, input: String, shouldMatch: Bool = true, options: FnmatchOptions = .default, separators: Set<Character> = Path.pathSeparators, sourceLocation: SourceLocation = #_sourceLocation) throws { | ||
| pattern: String, input: String, shouldMatch: Bool = true, options: FnmatchOptions = .default, separators: (some Collection<Character>)? = ([Character]?).none, sourceLocation: SourceLocation = #_sourceLocation) throws { | ||
| let comment = Comment(stringLiteral: "\(pattern) \(shouldMatch ? "should" : "should not") match \(input)") | ||
| let result = try fnmatch(pattern: pattern, input: input, options: options, pathSeparators: separators) | ||
| shouldMatch ? #expect(result, comment, sourceLocation: sourceLocation) : #expect(!result, comment, sourceLocation: sourceLocation) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Using an inline computed var instead of a stored let saves a little memory and a little indirection. Is it actually a relevant amount? Not really.