Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ public final class RecursiveSearchPathResolver: Sendable {
result.append(Path("."))
} else if sourcePath.isRoot {
result.append(Path(String(path.str[path.str.utf8.index(after: path.str.utf8.startIndex)...])))
} else if path.str.hasPrefix(sourcePath.str) && Path.pathSeparatorsUTF8.contains(path.str.utf8[path.str.utf8.index(path.str.utf8.startIndex, offsetBy: sourcePath.str.utf8.count)]) {
} else if path.str.hasPrefix(sourcePath.str) && Path.isUTF8PathSeparator(path.str.utf8[path.str.utf8.index(path.str.utf8.startIndex, offsetBy: sourcePath.str.utf8.count)]) {
// FIXME: Use dropFirst() once available everywhere.
result.append(Path(String(path.str[path.str.utf8.index(path.str.utf8.startIndex, offsetBy: sourcePath.str.utf8.count + 1)...])))
} else {
Expand Down
59 changes: 44 additions & 15 deletions Sources/SWBUtil/Path.swift
Original file line number Diff line number Diff line change
Expand Up @@ -69,18 +69,45 @@ public struct Path: Serializable, Sendable {
/// The system path separator.
#if os(Windows)
public static let pathSeparator = Character("\\")
public static let pathSeparatorUTF8 = UInt8(ascii: "\\")
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using an inline computed var instead of a stored let saves a little memory and a little indirection. Is it actually a relevant amount? Not really.

public static let pathSeparatorsUTF8 = Set([UInt8(ascii: "\\"), UInt8(ascii: "/")])
@inline(__always) public static var pathSeparatorUTF8: UInt8 { UInt8(ascii: "\\") }
public static let pathEnvironmentSeparator = Character(";")
public static let pathSeparators = Set("\\/")
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we replace a full Set lookup of Strings with a few integer comparisons. The non-default separators path is probably slower, but there's no reason I can see to care about that.

Also removes a heap allocation and a lazy initialization.

@inline(__always) public static func isUTF8PathSeparator(_ char: UInt8, separators: (some Collection<Character>)? = ([Character]?).none) -> Bool {
guard let separators else {
return char == pathSeparatorUTF8 || char == UInt8(ascii: "/")
}
// This is a bit inefficient, but separators should always be nil outside of tests
return separators.contains(String(decoding: CollectionOfOne(char), as: UTF8.self))
}
@inline(__always) public static func firstPathSeparatorIndex(in str: some StringProtocol, separators: (some Collection<Character>)?) -> String.Index? {
guard let separators else {
return str.utf8.firstIndex(where: { Path.isUTF8PathSeparator($0, separators: separators) })
}
return str.firstIndex(where: { separators.contains($0) })
}
#else
public static let pathSeparator = Character("/")
public static let pathSeparatorUTF8 = UInt8(ascii: "/")
public static let pathSeparatorsUTF8 = Set([UInt8(ascii: "/")])
@inline(__always) public static var pathSeparatorUTF8: UInt8 { UInt8(ascii: "/") }
public static let pathEnvironmentSeparator = Character(":")
public static let pathSeparators = Set([Character("/")])
@inline(__always) public static func isUTF8PathSeparator(_ char: UInt8, separators: (some Collection<Character>)? = ([Character]?).none) -> Bool {
guard let separators else {
return char == pathSeparatorUTF8
}
// This is a bit inefficient, but separators should always be nil outside of tests
return separators.contains(String(decoding: CollectionOfOne(char), as: UTF8.self))
}
@inline(__always) public static func firstPathSeparatorIndex(in str: some StringProtocol, separators: (some Collection<Character>)?) -> String.Index? {
guard let separators else {
return str.utf8.index(of: pathSeparatorUTF8)
}
return str.firstIndex(where: { separators.contains($0) })
}
#endif

@inline(__always) public static func isPathSeparator(_ char: Character, separators: (some Collection<Character>)?) -> Bool {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, replacing set lookups with integer comparisons

guard let c = char.utf8.first else { return false }
return isUTF8PathSeparator(c, separators: separators)
}

/// The system path separator, as a string.
public static let pathSeparatorString = String(pathSeparator)

Expand Down Expand Up @@ -717,9 +744,10 @@ public struct Path: Serializable, Sendable {
var numComponents = 0
var isInPathComponent = false
var nextCharacterIsEscaped = false
for idx in pattern.indices {
for byte in pattern.utf8 {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Everything we're comparing this to is a single grapheme, so we can skip grapheme semantics here

// Skip over path separators, unless they're escaped.
if pattern[idx] == Path.pathSeparator {
//TODO: should this (and other similar uses) be Path.isUTF8PathSeparator(byte) instead for Windows?
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like feedback on this please. To my eye it looks like a spot where Windows would want us to check both path separators, but I wasn't sure, and the old code didn't do so.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's hard to say. Generally you wouldn't ever want to match \ without also looking for /, but I think \ is also used as an escape sequence within the pattern here, so I'm not sure...

if byte == Path.pathSeparatorUTF8 {
if !nextCharacterIsEscaped {
isInPathComponent = false
}
Expand All @@ -736,7 +764,7 @@ public struct Path: Serializable, Sendable {
nextCharacterIsEscaped = false
}
else {
nextCharacterIsEscaped = (pattern[idx] == Character("\\"))
nextCharacterIsEscaped = (byte == UInt8(ascii: "\\"))
}
}
return numComponents
Expand All @@ -746,19 +774,20 @@ public struct Path: Serializable, Sendable {
var numPathComponentsInPath = 0
var isInPathComponent = false
var firstIdx: String.Index?
for idx in self.str.indices.reversed() {
let utf8Str = self.str.utf8
for idx in utf8Str.indices.reversed() {
// Skip over path separators. We ignore backslashes here, since paths don't have escape characters.
if self.str[idx] == Path.pathSeparator {
if utf8Str[idx] == Path.pathSeparatorUTF8 {
isInPathComponent = false
// If we've found the expected number of path components, then we stop, and record the index of the first character we want to match against.
if numPathComponentsInPath == numPathComponentsInPattern {
if idx != self.str.endIndex {
firstIdx = self.str.index(after: idx)
if idx != utf8Str.endIndex {
firstIdx = utf8Str.index(after: idx)
}
break
}
}
else if idx == self.str.startIndex {
else if idx == utf8Str.startIndex {
// If we didn't encounter a path separator, then the full string is the trailing subpath.
firstIdx = idx
break
Expand All @@ -781,7 +810,7 @@ public struct Path: Serializable, Sendable {
}

// Create a string from the first index we found to the end of the path.
let trailingSubpath = String(self.str[first..<self.str.endIndex])
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This just got converted back to a Substring immediately in the callee, no reason not to leave it as one to begin with.

let trailingSubpath = self.str[first..<self.str.endIndex]

// Match the pattern against the requisite number of trailing path components.
do {
Expand Down
48 changes: 19 additions & 29 deletions Sources/SWBUtil/fnmatch.swift
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,6 @@ private enum RangeStatus {
case error
}

private extension StringProtocol {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was unused after my other changes

@inline(__always)
func firstIndex(matching characters: Set<Character>) -> String.Index? {
if characters.isEmpty {
return nil
}
return firstIndex(where: {characters.contains($0)})
}
}

/// Multi-platform fnmatch implementation. This is intended to be a close match the the POSIX fnmatch of all platforms including Windows (though not all options are supported).
///
/// - parameter pattern: The pattern to match. When using the ``FnmatchOptions/pathname`` option, any path representation in the pattern is expected to use the POSIX path separator (`/`) to match with the input, and on Windows, the path separator (`/`) will be matched to either separator in the input string ( both `/` and `\` will be matched).
Expand All @@ -54,7 +44,7 @@ private extension StringProtocol {
/// - returns: `true` if the pattern matches the input, `false` otherwise.
///
/// - note: On Windows and when using the ``FnmatchOptions/pathname`` option, both separators (`/` and `\`) are recognized (see note on pattern parameter).
public func fnmatch(pattern: some StringProtocol, input: some StringProtocol, options: FnmatchOptions = .default, pathSeparators: Set<Character> = Path.pathSeparators) throws
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

some Collection lets us pass in non-Set types efficiently. Having a whole separate allocation to store 1-2 items is wasteful.

Defaulting to nil (sorry about the weird spelling of nil, it couldn't infer the type properly otherwise) lets us use the new default behavior that avoids lookups entirely.

public func fnmatch(pattern: some StringProtocol, input: some StringProtocol, options: FnmatchOptions = .default, pathSeparators: (some Collection<Character>)? = ([Character]?).none) throws
-> Bool
{
// Use Substrings to avoid String allocations
Expand All @@ -76,32 +66,32 @@ public func fnmatch(pattern: some StringProtocol, input: some StringProtocol, op
return false
}
case "?":
guard let _sc = input.first else {
guard let _sc = input.utf8.first else {
return false
}
if options.contains(.pathname) && pathSeparators.contains(_sc) {
if options.contains(.pathname) && Path.isUTF8PathSeparator(_sc, separators: pathSeparators) {
if backtrack() {
return false
}
}
input = input.dropFirst()
case "*":
var p = pattern.first
while pattern.first == "*" {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"*" is always a single grapheme, so using utf8 will get us the same results as characters

var p = pattern.utf8.first
while pattern.utf8.first == UInt8(ascii: "*") {
// consume multiple '*' in pattern
pattern = pattern.dropFirst()
p = pattern.first
p = pattern.utf8.first
}
if p == nil {
if options.contains(.pathname) {
// make sure input does not have any more path separators
return input.firstIndex(matching: pathSeparators) == nil ? true : false
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I really wanted to delete the pathSeparators parameter entirely, but it's used for testing Windows on non-Windows hosts

return Path.firstPathSeparatorIndex(in: input, separators: pathSeparators) == nil
} else {
return true // pattern matched everything else in input
}
} else if pattern.first == "/" && options.contains(.pathname) {
} else if p == UInt8(ascii: "/") && options.contains(.pathname) {
// we have a '*/' pattern input must have an path separators to continue
guard let newInputIndex = input.firstIndex(matching: pathSeparators) else {
guard let newInputIndex = Path.firstPathSeparatorIndex(in: input, separators: pathSeparators) else {
return false
}
input.removeSubrange(..<newInputIndex)
Expand All @@ -110,14 +100,14 @@ public func fnmatch(pattern: some StringProtocol, input: some StringProtocol, op
bt_pattern = pattern
bt_input = input
case "[":
guard let _sc = input.first else {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reordering these checks lets us do a quick utf8 check without having to construct a Character (since those are String under the hood)

return false
}
if pathSeparators.contains(_sc) && options.contains(.pathname) {
if let first = input.utf8.first, Path.isUTF8PathSeparator(first, separators: pathSeparators) && options.contains(.pathname) {
if backtrack() {
return false
}
}
guard let _sc = input.first else {
return false
}
var new_input = input.dropFirst()
var new_pattern = pattern
switch rangematch(pattern: &new_pattern, input: &new_input, test: _sc, options: options, pathSeparators: pathSeparators) {
Expand Down Expand Up @@ -146,7 +136,7 @@ public func fnmatch(pattern: some StringProtocol, input: some StringProtocol, op
continue
} else {
// windows need to test for both path separators
if _pc == "/" && options.contains(.pathname) && pathSeparators.contains(_sc) {
if _pc == "/" && options.contains(.pathname) && Path.isPathSeparator(_sc, separators: pathSeparators) {
continue
}
if backtrack() {
Expand Down Expand Up @@ -177,15 +167,15 @@ public func fnmatch(pattern: some StringProtocol, input: some StringProtocol, op
}

@inline(__always)
private func rangematch(pattern: inout Substring, input: inout Substring, test: Character, options: FnmatchOptions, pathSeparators: Set<Character>) -> RangeStatus {
private func rangematch(pattern: inout Substring, input: inout Substring, test: Character, options: FnmatchOptions, pathSeparators: (some Collection<Character>)? = ([Character]?).none) -> RangeStatus {
var test = test

if !pattern.contains("]") {
if !pattern.utf8.contains(UInt8(ascii: "]")) {
// unmatched '[' test as literal '['
return "[" == test ? .match : .noMatch
}

let negate = pattern.first == "!"
let negate = pattern.utf8.first == UInt8(ascii: "!")
if negate {
pattern = pattern.dropFirst()
}
Expand All @@ -198,13 +188,13 @@ private func rangematch(pattern: inout Substring, input: inout Substring, test:
if c == "]" {
break
}
if options.contains(.pathname) && pathSeparators.contains(c) {
if options.contains(.pathname) && Path.isPathSeparator(c, separators: pathSeparators) {
return .noMatch
}
if options.contains(.caseInsensitive) {
c = Character(c.lowercased())
}
if pattern.first == "-" {
if pattern.utf8.first == UInt8(ascii: "-") {
let subPattern = pattern.dropFirst()
if var c2 = subPattern.first {
if c2 != "]" {
Expand Down
4 changes: 2 additions & 2 deletions Tests/SWBUtilTests/FnmatchTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ import SWBUtil

@Test(arguments: [true, false])
func pathnameMatch(isWindows: Bool) throws {
let separators = isWindows ? Set("\\/") : Set([Character("/")])
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Set("\/") looks so much like a Set containing one two-character-long String that I got confused. The thing it's passed to is generic now, so let's save the next person the wasted time.

let separators = isWindows ? "\\/" : "/"

try assertFnmatch(pattern: "x?y", input: "x/y", separators: separators)
try assertFnmatch(pattern: "x?y", input: "x/y", shouldMatch: false, options: [.pathname], separators: separators)
Expand All @@ -272,7 +272,7 @@ import SWBUtil
}

func assertFnmatch(
pattern: String, input: String, shouldMatch: Bool = true, options: FnmatchOptions = .default, separators: Set<Character> = Path.pathSeparators, sourceLocation: SourceLocation = #_sourceLocation) throws {
pattern: String, input: String, shouldMatch: Bool = true, options: FnmatchOptions = .default, separators: (some Collection<Character>)? = ([Character]?).none, sourceLocation: SourceLocation = #_sourceLocation) throws {
let comment = Comment(stringLiteral: "\(pattern) \(shouldMatch ? "should" : "should not") match \(input)")
let result = try fnmatch(pattern: pattern, input: input, options: options, pathSeparators: separators)
shouldMatch ? #expect(result, comment, sourceLocation: sourceLocation) : #expect(!result, comment, sourceLocation: sourceLocation)
Expand Down
Loading