diff --git a/dcs-bagit/dcs-bagit-support/pom.xml b/dcs-bagit/dcs-bagit-support/pom.xml new file mode 100644 index 00000000..fc12c29b --- /dev/null +++ b/dcs-bagit/dcs-bagit-support/pom.xml @@ -0,0 +1,83 @@ + + + + + + + + + 4.0.0 + + Data Conservancy BagIt Support + Support classes for BagIt implementations and clients + + org.dataconservancy + dcs-bagit-support + jar + + + org.dataconservancy + dcs-bagit + 1.0.0-SNAPSHOT + + + + + + + + + + + + + + + + + + + org.slf4j + slf4j-api + compile + + + + + + org.slf4j + slf4j-log4j12 + test + + + + log4j + log4j + test + + + + junit + junit + test + + + + + diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/BoundToken.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/BoundToken.java new file mode 100644 index 00000000..589bef0e --- /dev/null +++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/BoundToken.java @@ -0,0 +1,100 @@ +/* + * Copyright 2015 Johns Hopkins University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.dataconservancy.bagit.rules; + +/** + * Binds a {@link Token} to the string that it represents. Most Tokens have their strings bound already, by the + * {@link org.dataconservancy.bagit.rules.Token#getTokenString()} method: + *
+ *
{@code PATH_SEPARATOR}:
{@code /}
+ *
{@code EXACTLY_ONE_CHARACTER}:
{@code ?}
+ *
{@code ZERO_OR_MORE_CHARACTERS}:
{@code *}
+ *
{@code DIRECTORY}:
{@code **}
+ *
+ * The exception is the {@link Token#LITERAL LITERAL token}, because it isn't known, a priori, what the + * literal characters will be. + *

+ * Therefore this class is mostly redundant, and may fail the smell test, but it serves to bind the string + * representation to all Tokens, useful really for only the {@code LITERAL} token. + *

+ */
+class BoundToken {
+
+    /** The string this token stands for; for a {@code LITERAL} token, the literal characters themselves. */
+    String bound;
+
+    /** The kind of token that the {@link #bound} string represents. */
+    Token token;
+
+    /**
+     * Creates a token of the given kind, bound to the string it represents.
+     *
+     * @param token the kind of token
+     * @param toBind the string the token stands for
+     */
+    BoundToken(Token token, String toBind) {
+        this.bound = toBind;
+        this.token = token;
+    }
+
+    /**
+     * Answers whether the bound value consists of exactly one character.
+     *
+     * @return true if the bound value is exactly one character long
+     */
+    boolean isSingleChar() {
+        return 1 == bound.length();
+    }
+
+    /**
+     * Answers the first character of the bound value.
+     *
+     * @return the first character of the bound value
+     */
+    char asChar() {
+        return bound.charAt(0);
+    }
+
+    /**
+     * Answers the whole bound value as a character array.  This is what you would use when
+     * {@link #isSingleChar()} is false.
+     *
+     * @return the bound value as a character array
+     */
+    char[] asCharArray() {
+        return bound.toCharArray();
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (o == this) {
+            return true;
+        }
+        if (o == null || o.getClass() != getClass()) {
+            return false;
+        }
+
+        final BoundToken other = (BoundToken) o;
+        // null-safe comparison of the bound strings; tokens are compared by identity
+        final boolean sameBound = (bound == null) ? other.bound == null : bound.equals(other.bound);
+        return sameBound && token == other.token;
+    }
+
+    @Override
+    public int hashCode() {
+        final int boundHash = (bound == null) ? 0 : bound.hashCode();
+        final int tokenHash = (token == null) ? 0 : token.hashCode();
+        return 31 * boundHash + tokenHash;
+    }
+
+    @Override
+    public String toString() {
+        return "BoundToken{bound='" + bound + "', token=" + token + '}';
+    }
+}
diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Expression.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Expression.java
new file mode 100644
index 00000000..4fcd862e
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Expression.java
@@ -0,0 +1,141 @@
+/*
+ *
+ * * Copyright 2015 Johns Hopkins University
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at
+ * *
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing, software
+ * * distributed under the License is distributed on an "AS IS" BASIS,
+ * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * * See the License for the specific language governing permissions and
+ * * limitations under the License.
+ *
+ */
+
+package org.dataconservancy.bagit.rules;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * An Expression is a String that represents a hierarchical path.  An Expression may represent a path, or a pattern
+ * meant to match a path.
+ *

+ * * Even though a "path" and a "pattern" are both instances of an {@link Expression}, their semantics differ. A + * "path" only contains literal and path separator tokens. A "pattern" may contain literals, path separators, and + * matching tokens like '*' and '?'. Path segments are the tokens between consecutive path separators, addressable + * by their zero-indexed {@link org.dataconservancy.bagit.rules.Expression#depth() depth}. For example, the Expression + * '/foo/bar/baz.txt' has three path segments, 'foo' (depth = 0), 'bar' (depth = 1), and 'baz.txt' (depth = 2). The + * depth of the Expression is 2. + *

+ */
+public class Expression {
+
+    /**
+     * Tokens that make up this Expression, with the left-most token at the head of the list.
+     */
+    private final List<BoundToken> tokens;
+
+    /**
+     * Tokens that make up this Expression, except any leading or trailing path separator tokens are stripped.
+     * This is more amenable to streams operations.
+     */
+    private final List<BoundToken> sanitized;
+
+    /**
+     * Map of path segments, keyed by their depth.  A path segment is a List of BoundTokens that lie between
+     * consecutive path separators.  So a path segment will never contain a path separator character.
+     */
+    private final ConcurrentHashMap<Integer, List<BoundToken>> segments = new ConcurrentHashMap<>();
+
+    /**
+     * Creates a new {@code Expression} instance from the supplied string.  Normally an Expression represents a
+     * hierarchical path, so the supplied string will resemble a pattern matching a path, or an actual path.
+     * <p>
+     * An expression that tokenizes to nothing, or to a lone path separator (e.g. {@code /}), yields an Expression
+     * with no path segments and a {@link #depth() depth} of 0.
+     * </p>
+     *
+     * @param expression a string representing an expression.
+     */
+    public Expression(String expression) {
+        this.tokens = ExpressionTokenizer.tokenize(expression);
+        this.sanitized = new ArrayList<>(this.tokens);
+
+        // Strip at most one leading and one trailing separator.  Each check must tolerate an empty list: "/" is
+        // empty after the first removal, and indexing into it would throw IndexOutOfBoundsException.
+        if (!this.sanitized.isEmpty() && this.sanitized.get(0).token == Token.PATH_SEPARATOR) {
+            this.sanitized.remove(0);
+        }
+
+        if (!this.sanitized.isEmpty()
+                && this.sanitized.get(this.sanitized.size() - 1).token == Token.PATH_SEPARATOR) {
+            this.sanitized.remove(this.sanitized.size() - 1);
+        }
+    }
+
+    /**
+     * The entire list of tokens that make up this {@code Expression}, including all path separators.
+     *
+     * @return the tokens that make up this {@code Expression}
+     */
+    List<BoundToken> getTokens() {
+        return tokens;
+    }
+
+    /**
+     * A zero-based index representing the depth of the {@code Expression}.
+     * <dl>
+     *   <dt>{@code /}</dt>
+     *   <dd>depth == 0</dd>
+     *   <dt>{@code dir/}</dt>
+     *   <dd>depth == 0</dd>
+     *   <dt>{@code /dir}</dt>
+     *   <dd>depth == 0</dd>
+     *   <dt>{@code /dir/foo}</dt>
+     *   <dd>depth == 1</dd>
+     *   <dt>{@code /dir/foo/bar.txt}</dt>
+     *   <dd>depth == 2</dd>
+     *   <dt><code>**&#47;*.java</code></dt>
+     *   <dd>depth == 1</dd>
+     * </dl>
+     *
+     * @return the depth of this {@code Expression}, always 0 or greater.
+     */
+    public int depth() {
+        // every separator left after stripping the leading/trailing one starts a new, deeper segment
+        return (int) sanitized.stream().filter(bt -> bt.token == Token.PATH_SEPARATOR).count();
+    }
+
+    /**
+     * A path segment is the run of tokens that occurs between two consecutive path separators.  This method obtains
+     * the tokens for the path segment at the specified {@code depth}.  Path separator tokens will not be included in
+     * the returned list.
+     * <p>
+     * The result is computed once per depth and cached; the returned list is the cached instance, so callers must
+     * not modify it.
+     * </p>
+     *
+     * @param depth the zero-indexed depth of the path segment to retrieve
+     * @return the tokens making up the path segment, or an empty List if the depth is out of bounds
+     */
+    public List<BoundToken> getPathSegment(int depth) {
+        return segments.computeIfAbsent(depth, d -> {
+            List<BoundToken> segment = new ArrayList<>();
+            int currentDepth = 0;
+            for (BoundToken t : sanitized) {
+                if (currentDepth > d) {
+                    // walked past the requested segment, nothing more to record
+                    break;
+                }
+
+                if (t.token == Token.PATH_SEPARATOR) {
+                    // a separator closes the current segment; separators themselves are never recorded
+                    currentDepth++;
+                    continue;
+                }
+
+                if (currentDepth == d) {
+                    segment.add(t);
+                }
+            }
+
+            return segment;
+        });
+    }
+
+    @Override
+    public String toString() {
+        // re-assemble the original expression, separators included
+        StringBuilder sb = new StringBuilder();
+        for (BoundToken bt : tokens) {
+            sb.append(bt.bound);
+        }
+        return sb.toString();
+    }
+}
diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java
new file mode 100644
index 00000000..4bb0f3ab
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java
@@ -0,0 +1,733 @@
+/*
+ *
+ * * Copyright 2015 Johns Hopkins University
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at + * * + * * http://www.apache.org/licenses/LICENSE-2.0 + * * + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, + * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * * See the License for the specific language governing permissions and + * * limitations under the License. + * + */ + +package org.dataconservancy.bagit.rules; + +import java.util.List; + +/** + * Responsible for matching an Expression representing a path against a pattern. This is quite possibly the most + * heavy-weight string parsing library you'll ever encounter. It is inspired by Ant-style pattern matching, and + * attempts to follow the same rules as the Ant + * implementation: + *

+ *

+ * These patterns look very much like the patterns used in DOS and UNIX:
+ *
+ * '*' matches zero or more characters, '?' matches one character.
+ *
+ * In general, patterns are considered relative paths, relative to a task dependent base directory (the dir attribute in
+ * the case of {@code <fileset>}). Only files found below that base directory are considered. So while a pattern like
+ * ../foo.java is possible, it will not match anything when applied since the base directory's parent is never scanned
+ * for files.
+ *
+ * Examples:
+ *
+ * *.java  matches  .java, x.java and FooBar.java, but not FooBar.xml (does not end with .java).
+ *
+ * ?.java  matches  x.java, A.java, but not .java or xyz.java (both don't have one character before .java).
+ *
+ * Combinations of *'s and ?'s are allowed.
+ *
+ * Matching is done per-directory. This means that first the first directory in the pattern is matched against the first
+ * directory in the path to match. Then the second directory is matched, and so on. For example, when we have the
+ * pattern /?abc/&#42;/&#42;.java and the path /xabc/foobar/test.java, the first ?abc is matched with
+ * xabc, then * is matched with foobar, and finally *.java is matched with test.java. They all match, so the path
+ * matches the pattern.
+ *
+ * To make things a bit more flexible, we add one extra feature, which makes it possible to match multiple directory
+ * levels. This can be used to match a complete directory tree, or a file anywhere in the directory tree. To do this,
+ * ** must be used as the name of a directory. When ** is used as the name of a directory in the
+ * pattern, it matches zero or more directories. For example: /test/** matches all files/directories under
+ * /test/, such as /test/x.java, or /test/foo/bar/xyz.html, but not /xyz.xml.
+ *
+ * There is one "shorthand": if a pattern ends with / or \, then ** is appended. For example,
+ * mypackage/test/ is interpreted as if it were mypackage/test/**.
+ * 
+ *

+ *

+ * Even though a "path" and a "pattern" are both instances of an {@link Expression}, their semantics differ. A + * "path" only contains literal and path separator tokens. A "pattern" may contain literals, path separators, and + * matching tokens like '*' and '?'. Path segments are the tokens between consecutive path separators, addressable + * by their zero-indexed {@link org.dataconservancy.bagit.rules.Expression#depth() depth}. For example, the Expression + * '/foo/bar/baz.txt' has three path segments, 'foo' (depth = 0), 'bar' (depth = 1), and 'baz.txt' (depth = 2). The + * depth of the Expression is 2. + *

+ *

+ * Note that methods on this class are package-private, and are not meant to be exposed publicly. + *

+ *

+ * Because the methods of ExpressionMatcher are package-private, it can be hard to tell what the entry points into the + * ExpressionMatcher class are, and this class doesn't help you determine that. Clients of ExpressionMatcher should be calling + *

    + *
  • {@link org.dataconservancy.bagit.rules.ExpressionMatcher#match(Expression, Expression)}
  • + *
+ *

+ */
+public class ExpressionMatcher {
+
+    /**
+     * Shared {@code BoundToken} for the "zero or more characters" wildcard (i.e. '*').
+     * Its {@code char} counterpart is {@link #zero_plus}.
+     */
+    private static final BoundToken ZERO_OR_MORE = new BoundToken(Token.ZERO_OR_MORE_CHARACTERS,
+            Token.ZERO_OR_MORE_CHARACTERS.getTokenString());
+
+    /**
+     * Shared {@code BoundToken} for the "exactly one character" wildcard (i.e. '?').
+     * Its {@code char} counterpart is {@link #exactly_one}.
+     */
+    private static final BoundToken EXACTLY_ONE = new BoundToken(Token.EXACTLY_ONE_CHARACTER,
+            Token.EXACTLY_ONE_CHARACTER.getTokenString());
+
+    /**
+     * The {@code char} form of {@link #EXACTLY_ONE}
+     */
+    private final char exactly_one;
+
+    /**
+     * The {@code char} form of {@link #ZERO_OR_MORE}
+     */
+    private final char zero_plus;
+
+    /**
+     * Constructs a new matcher, caching the single-character form of each wildcard token.
+     * TODO: probably could be private and methods be made static.
+     *
+     * @throws RuntimeException if either wildcard token is bound to something other than a single character
+     */
+    ExpressionMatcher() {
+        // the character-level matching below compares single chars, so multi-character wildcards are rejected
+        if (!EXACTLY_ONE.isSingleChar()) {
+            throw new RuntimeException("Implementation doesn't handle multi-character token: " +
+                    Token.EXACTLY_ONE_CHARACTER);
+        }
+        exactly_one = EXACTLY_ONE.asChar();
+
+        if (!ZERO_OR_MORE.isSingleChar()) {
+            throw new RuntimeException("Implementation doesn't handle multi-character token: " +
+                    Token.ZERO_OR_MORE_CHARACTERS);
+        }
+        zero_plus = ZERO_OR_MORE.asChar();
+    }
+
+    /**
+     * Match the supplied path against the pattern.  Matching is applied 'per-directory' as described
+     * {@link org.dataconservancy.bagit.rules.ExpressionMatcher above}.  This is the main entry point into the pattern
+     * matching logic.
+     *
+     * @param pattern the pattern meant to match a path
+     * @param path the path to match against the pattern
+     * @return true if the pattern matches
+     */
+    boolean match(Expression pattern, Expression path) {
+
+        // a path may only consist of path separators and literals
+        if (!isPath(path.getTokens())) {
+            return false; // probably should be an IAE
+        }
+
+        final int patternDepth = pattern.depth();
+        final int pathDepth = path.depth();
+
+        // a pattern deeper than the path can never match, so short-circuit
+        if (patternDepth > pathDepth) {
+            return false; // probably should be an IAE
+        }
+
+        if (patternDepth != pathDepth) {
+            // the pattern is shallower than the path: anchor on the pattern segments that contain literals
+            final int firstLiteral = nextLiteral(pattern, 0);
+            return matchPathSegment(pattern, path, 0, 0, firstLiteral);
+        }
+
+        // depths line up, so compare segment by segment; every pair is evaluated, none is skipped
+        boolean allMatched = true;
+        for (int depth = 0; depth <= patternDepth; depth++) {
+            allMatched = match(pattern.getPathSegment(depth), path.getPathSegment(depth)) && allMatched;
+        }
+
+        return allMatched;
+    }
+
+    /**
+     * Attempt to match all of the path segments in {@code path} against {@code pattern}, starting from
+     * {@code pathDepth} and {@code patternDepth}.  The {@code nextLiteral} parameter contains the depth of the next
+     * path segment in {@code pattern} containing a literal (or -1 if there isn't any).
+     *
+     * @param pattern the expression containing a matching tokens (i.e. pattern semantics)
+     * @param path the expression containing only literals or path separators (i.e.
path semantics)
+     * @param patternDepth the depth to begin matching the pattern
+     * @param pathDepth the depth to begin matching the path
+     * @param nextLiteral the depth of the next pattern segment that contains a literal, or -1 if it doesn't exist
+     * @return true if all of the segments (starting from pathDepth) in the path can be matched in the pattern (starting
+     *         from patternDepth)
+     */
+    private boolean matchPathSegment(Expression pattern, Expression path, int patternDepth, int pathDepth,
+                                     int nextLiteral) {
+
+        // Each pass consumes the path up to the segment that matches the next literal-bearing pattern segment.
+        while (nextLiteral != -1) {
+            // find the first path segment (at or after pathDepth) matching the literal-bearing pattern segment
+            final int rightAnchor = nextMatch(path, pathDepth, pattern.getPathSegment(nextLiteral));
+            if (rightAnchor == -1) {
+                return false;
+            }
+
+            // every path segment left of the anchor has to match the current pattern segment
+            boolean consumed = true;
+            for (int i = pathDepth; i < rightAnchor; i++) {
+                consumed = match(pattern.getPathSegment(patternDepth), path.getPathSegment(i)) && consumed;
+            }
+            if (!consumed) {
+                return false;
+            }
+
+            // advance past the anchor and look for the next literal-bearing pattern segment
+            pathDepth = rightAnchor;
+            patternDepth++;
+            nextLiteral = nextLiteral(pattern, nextLiteral + 1);
+        }
+
+        // out of literals: whatever is left of the path must match the current pattern segment
+        boolean remainder = true;
+        for (int i = pathDepth; i <= path.depth(); i++) {
+            remainder = match(pattern.getPathSegment(patternDepth), path.getPathSegment(i)) && remainder;
+        }
+
+        return remainder;
+    }
+
+    /**
+     * Search the supplied pattern starting at {@code depth} for path segments that contain literals.  Useful for
+     * finding the depth of path segment 'Foo??.java' in the pattern expression '**&#47;Foo??.java'.
+     *
+     * @param pattern an expression with pattern semantics
+     * @param depth the depth to begin searching from
+     * @return the index of the next path segment (i.e. depth) that contains literals, or -1 if not found
+     */
+    int nextLiteral(Expression pattern, int depth) {
+        // depth() walks the whole token list, so evaluate it once; a start beyond it simply skips the loop
+        final int maxDepth = pattern.depth();
+        for (int i = depth; i <= maxDepth; i++) {
+            if (containsLiterals(pattern.getPathSegment(i))) {
+                return i;
+            }
+        }
+
+        return -1;
+    }
+
+    /**
+     * Attempts to match every path segment starting from {@code path.getPathSegment(depth)} against the
+     * {@code pattern}.  The {@code path} is an {@code Expression} with path semantics (i.e. only containing literals
+     * and path separators).  Each path segment (starting from {@code depth}) is matched against {@code pattern}.
+     *
+     * @param path an Expression with path semantics
+     * @param depth the depth of the expression to begin matching from
+     * @param pattern the pattern each path segment of {@code path} is matched against.
+     * @return the index of the first path segment (i.e. depth) that matched {@code pattern}, or -1 if no match
+     */
+    int nextMatch(Expression path, int depth, List<BoundToken> pattern) {
+        final int maxDepth = path.depth();
+        for (int i = depth; i <= maxDepth; i++) {
+            if (match(pattern, path.getPathSegment(i))) {
+                return i;
+            }
+        }
+
+        return -1;
+    }
+
+    /**
+     * Expected input are two Lists of BoundTokens.  Each List is expected to be a path segment; that is, a List
+     * will contain all BoundTokens between two consecutive path separators, not including the separators.  Therefore
+     * the path segment will not ever contain a path separator ('/'), nor should it contain a directory match
+     * token ('**').
+     *

+ * Essentially this method is evaluating a pattern that may contain literals, '*', and '?' against a + * string of literals. + *

+     *
+     * @param patternPathSegment the pattern
+     * @param pathPathSegment the string (i.e. path) to match the pattern against
+     * @return true if the pattern matches the path
+     */
+    boolean match(List<BoundToken> patternPathSegment, List<BoundToken> pathPathSegment) {
+
+        // first, handle the short-circuit cases:
+        //   patternPathSegment only contains '*'  ; doesn't matter what pathPathSegment has, all tokens match
+        //   patternPathSegment only contains '**' ; doesn't matter what pathPathSegment has, all tokens match
+        //   patternPathSegment contains '?' and pathPathSegment only has a single token, the single token matches
+        //   patternPathSegment is all literals ; see if the pathPathSegment equals
+
+        // the '**' check also guards the character-level match(CharSequence, ...) from ever seeing '**' tokens
+        if (isZeroOrMore(patternPathSegment) || isDirectoryMatchToken(patternPathSegment)) {
+            return true;
+        }
+
+        if (pathPathSegment.size() == 1 && isExactlyOne(patternPathSegment)) {
+            return true;
+        }
+
+        if (allLiterals(patternPathSegment)) {
+            return tokenEquals(patternPathSegment, pathPathSegment);
+        }
+
+        // Otherwise, we have a multiple-token pattern that contains a mixture of literals
+        // and at least one of '*' or '?'; fall through to character-level matching from the start of both strings.
+
+        final CharSequence pattern = toCharSeq(patternPathSegment);
+        final CharSequence path = toCharSeq(pathPathSegment);
+
+        return match(pattern, path, 0, findNextToken(pattern, 0), findNextLiteral(pattern, 0), 0);
+    }
+
+    /**
+     * A recursive method for matching a {@code path} against a {@code pattern}.  The method terminates when there are
+     * no more literals or tokens to be matched, or as soon as it determines a match isn't possible and returns early.
+     * N.B. this method cannot handle a directory matching token: '**'.
It is expected that the caller + * has filtered these tokens out (see {@link #match(java.util.List, java.util.List)} and its + * {@link #isDirectoryMatchToken(java.util.List)} check. + *

+ * Developers, when reading this implementation, keep in mind that anchors are always indexes into the {@code path}, + * while {@code tokenIndex} and {@code literalIndex} are always indexes into {@code pattern}. The first major + * decision made is whether the method is attempting to match a token (e.g. '?' in "Foo??.java") or match a literal + * (e.g. "Foo", ".java" in "Foo??.java"). + *

+ *

+ * When matching a token, the first decision to make is whether you are going to match forward from the current + * token, or work backward from the end of the pattern. When matching a literal, the objective is to determine the + * anchors of the literal in the path and attempt to match it against the pattern. + *

+     *
+     * @param pattern the pattern to match against
+     * @param path the path to match
+     * @param fPathIndex the index into the {@code path} that has matched
+     * @param tokenIndex the index into {@code pattern} of the next token to be matched, or
+     *                   {@code Integer.MAX_VALUE} when no token remains
+     * @param literalIndex the index into the {@code pattern} of the next literal to be matched, or
+     *                     {@code Integer.MAX_VALUE} when no literal remains
+     * @param leftAnchor index into {@code path}; it is overwritten with {@code fPathIndex} before being read on every
+     *                   branch except the "out of literals" branch, which reads the incoming value
+     * @return true if {@code path} matches {@code pattern}
+     */
+    private boolean match(CharSequence pattern, CharSequence path, int fPathIndex, int tokenIndex, int literalIndex, int leftAnchor) {
+        // Index description:
+        //  - fPathIndex, left and right anchors are always indexes in the path
+        //  - token and literal are always indexes in the pattern.
+
+
+        int rightAnchor = Integer.MIN_VALUE;
+
+        // findNextToken and findNextLiteral both report "nothing left" as Integer.MAX_VALUE
+        if (tokenIndex == Integer.MAX_VALUE && literalIndex == Integer.MAX_VALUE) {
+            // we've matched everything?
+            return true;
+        }
+
+        if (tokenIndex < literalIndex) {
+
+            // We are matching a token (because tokenIndex < literalIndex)
+            //
+            // If we are matching the last token in the pattern, we work backward in the path.
+            // If we are matching a token, and there are still more tokens left, we work forward in the path.
+            //
+            // - Find the left and right anchors in the path.
+            //   - Find right anchor
+            //     - Find the next literal in the pattern (using the literalIndex, and the [end of string|next token index])
+            //     - Match that literal in the path (from offset fPathIndex)
+            //     - Set the right anchor at the start of the literal.
+            //   - Find left anchor
+            //     - Equal to the forward path index (fPathIndex)
+            //
+            // - If the token is a '*', we match.
+            // - If the token is a '?', and rightAnchor - leftAnchor == 1, we match.
+            //
+            // - If we match:
+            //   - set the fPathIndex to the rightAnchor (because fPathIndex keeps track of what we've matched in the path)
+            //   - set the next token index (or Integer.MAX_VALUE if the pattern is exhausted, or out of tokens)
+            //   - leave literalIndex alone, because we didn't match a literal this go-around, we matched a token.
+
+            // Find the right anchor.
+            int nextTokenIndex = findNextToken(pattern, tokenIndex + 1);
+
+            leftAnchor = fPathIndex;
+
+            CharSequence literal = null;
+
+            // NOTE(review): both operands test nextTokenIndex; the second was possibly meant to test literalIndex.
+            // As written, literalIndex may still be Integer.MAX_VALUE inside this branch (a pattern with no literal,
+            // e.g. "??"), in which case the subSequence call below is out of range -- confirm.
+            if (nextTokenIndex != Integer.MAX_VALUE && nextTokenIndex != Integer.MAX_VALUE) {
+                // we are not at the last token, work forward
+                // in the case of consecutive tokens (e.g. "??"), the literalIndex will be greater than the nextTokenIndex
+                int literalLen = Math.max(nextTokenIndex - literalIndex, 1);
+                // NOTE(review): literalLen is at least 1 after Math.max, so the "" alternative is never taken.
+                literal = (literalLen <= 0) ? "" : pattern.subSequence(literalIndex, literalIndex + literalLen);
+
+                if ((rightAnchor = matchNextLiteral(path, fPathIndex, literal)) == Integer.MIN_VALUE) {
+                    // we didn't match the literal in the path, so we won't match
+                    return false;
+                }
+            } else {
+                // we are at the last token, work backward from the end of the pattern by matching the literal at
+                // the end of the pattern, then checking the remaining characters in the path with the pattern token
+
+                // the special case is if the token we are matching is the last character of the pattern, in which
+                // case there won't be a literal to match.  in this case, the right anchor will be set to
+                // the end of the path.
+
+                if (tokenIndex == pattern.length() - 1) {
+                    rightAnchor = path.length();
+                } else {
+                    literal = pattern.subSequence(tokenIndex + 1, pattern.length());
+
+                    // if we don't match the literal in the path, then we don't match
+                    if ((rightAnchor = matchNextLiteral(path, fPathIndex, literal)) == Integer.MIN_VALUE) {
+                        return false;
+                    }
+                }
+            }
+
+            // - If the token is a '*', we match.
+            // - If the token is a '?', and rightAnchor - leftAnchor == 1, we match.
+
+            // if the next token is inside of the right anchor, we have multiple tokens (e.g. '??') in a row.
+            if (pattern.charAt(tokenIndex) == exactly_one) {
+                // NOTE(review): nextTokenIndex is a pattern index while rightAnchor is a path index -- confirm that
+                // comparing them is intended.
+                if (nextTokenIndex < rightAnchor) {
+                    // NOTE(review): '?' is hard-coded here, whereas the rest of the method compares against
+                    // exactly_one.
+                    if (pattern.subSequence(tokenIndex, literalIndex).chars().allMatch(c -> ((char) c) == '?')) {
+                        // consume a single path character for this '?' and move on to the next token
+                        fPathIndex = ++leftAnchor;
+                        tokenIndex = findNextToken(pattern, tokenIndex + 1);
+                        return match(pattern, path, fPathIndex, tokenIndex, literalIndex, leftAnchor);
+                    } else {
+                        return false;
+                    }
+                }
+
+                if (rightAnchor - leftAnchor == 1) {
+                    fPathIndex = rightAnchor;
+                    tokenIndex = findNextToken(pattern, tokenIndex + 1);
+                    return match(pattern, path, fPathIndex, tokenIndex, literalIndex, leftAnchor);
+                } else {
+                    return false;
+                }
+            }
+
+            if (pattern.charAt(tokenIndex) == zero_plus) { //||
+                    //pattern.charAt(tokenIndex) == exactly_one && (rightAnchor - leftAnchor == 1) // ) {
+                    // || ((nextTokenIndex < rightAnchor) && pattern.subSequence(tokenIndex, literalIndex).chars().allMatch(c -> ((char) c) == '?'))) {
+
+                // - If we match:
+                //   - set the fPathIndex to the rightAnchor (because fPathIndex keeps track of what we've matched in the path)
+                //   - set the next token index (or Integer.MAX_VALUE if the pattern is exhausted, or out of tokens)
+                //   - leave literalIndex alone, because we didn't match a literal this go-around, we matched a token.
+
+                fPathIndex = rightAnchor;
+                tokenIndex = findNextToken(pattern, tokenIndex + 1);
+
+                return match(pattern, path, fPathIndex, tokenIndex, literalIndex, leftAnchor);
+
+            } else {
+                return false;
+            }
+
+        } else if (literalIndex < tokenIndex) {
+
+            // We are matching a literal (because literalIndex < tokenIndex)
+            CharSequence literalToMatch;
+
+            // NOTE(review): findNextLiteral signals "no literal" with Integer.MAX_VALUE and otherwise returns a
+            // non-negative index, so this comparison with Integer.MIN_VALUE looks unreachable -- confirm.
+            if (literalIndex == Integer.MIN_VALUE) {
+                // we're out of literals, so we just have to match that last token
+                rightAnchor = path.length();
+                if (pattern.charAt(tokenIndex) == zero_plus ||
+                        pattern.charAt(tokenIndex) == exactly_one && rightAnchor - leftAnchor == 1) {
+                    return true;
+                } else {
+                    return false;
+                }
+            } else {
+                leftAnchor = fPathIndex;
+
+                // if we can't find the right anchor, then we can't match.
+                if ((rightAnchor = findRightAnchor(pattern, path, fPathIndex, literalIndex)) == Integer.MAX_VALUE) {
+                    return false;
+                }
+
+                // the literal runs up to the next token, or to the end of the pattern when tokenIndex is MAX_VALUE
+                literalToMatch = pattern.subSequence(literalIndex, Math.min(tokenIndex, pattern.length()));
+            }
+
+            // does the literal in the pattern match the literal between the anchors?
+            if (path.subSequence(leftAnchor, rightAnchor).equals(literalToMatch)) {
+
+                // - If we match:
+                //   - set the fPathIndex to the rightAnchor (because fPathIndex keeps track of what we've matched in the path)
+                //   - leave the next token index alone, because we didn't match a token this go around, we matched a literal
+                //   - set the literal index to the beginning of the next literal
+
+                fPathIndex = rightAnchor;
+                literalIndex = findNextLiteral(pattern, literalIndex + literalToMatch.length());
+
+                return match(pattern, path, fPathIndex, tokenIndex, literalIndex, leftAnchor);
+            }
+        }
+        // tokenIndex == literalIndex (and not both MAX_VALUE), or the literal did not match between the anchors
+        return false;
+    }
+
+    /**
+     * Attempt to find the next occurrence of a token in {@code pattern}, starting from {@code offset}.
+     *

+ * Remember that even though paths and patterns are both instances of {@link Expression}, the semantics of + * a 'path' are that it contains only literals and path separators, and differs from a 'pattern' which can contain + * matching tokens like '*' and '?'. + *

+     *
+     * @param pattern the pattern to search through
+     * @param offset the offset into pattern to start searching from
+     * @return the offset in the pattern with the next occurrence of a token, or {@code Integer.MAX_VALUE} if not
+     *         found.
+     * @see #findNextLiteral(CharSequence, int)
+     */
+    int findNextToken(CharSequence pattern, int offset) {
+        // a negative offset never matches; an offset at or past the end simply skips the scan
+        if (offset >= 0) {
+            for (int idx = offset, end = pattern.length(); idx < end; idx++) {
+                final char candidate = pattern.charAt(idx);
+                if (candidate == exactly_one || candidate == zero_plus) {
+                    return idx;
+                }
+            }
+        }
+
+        return Integer.MAX_VALUE;
+    }
+
+    /**
+     * Attempt to find the next occurrence of a literal in {@code pattern}, starting from {@code offset}.
+     *

+ * Remember that even though paths and patterns are both instances of {@link Expression}, the semantics of + * a 'path' are that it contains only literals and path separators, and differs from a 'pattern' which can contain + * matching tokens like '*' and '?'. + *

+     *
+     * @param pattern the pattern to search through
+     * @param offset the offset into pattern to start searching from
+     * @return the offset in the pattern with the next occurrence of a literal, or {@code Integer.MAX_VALUE} if not
+     *         found.
+     * @see #findNextToken(CharSequence, int)
+     */
+    int findNextLiteral(CharSequence pattern, int offset) {
+        // a negative offset never matches; an offset at or past the end simply skips the scan
+        if (offset >= 0) {
+            for (int idx = offset, end = pattern.length(); idx < end; idx++) {
+                final char candidate = pattern.charAt(idx);
+                if (candidate != exactly_one && candidate != zero_plus) {
+                    return idx;
+                }
+            }
+        }
+
+        return Integer.MAX_VALUE;
+    }
+
+    /**
+     * Finds the first occurrence of {@code literal} in {@code path} at or after {@code offset}, and returns the
+     * offset (relative to the start of {@code path}) where it begins.  Overlapping candidates are handled, e.g.
+     * {@code "ab"} is found at offset 1 of {@code "aab"}.  An empty literal is found at {@code offset}.
+     *
+     * @param path the path being searched for a literal string
+     * @param offset the offset in path to start searching from
+     * @param literal the literal string to find
+     * @return the offset of {@code literal} in {@code path}, or {@code Integer.MIN_VALUE} if not found or if
+     *         {@code offset} is negative or not less than the length of {@code path}
+     */
+    int matchNextLiteral(CharSequence path, int offset, CharSequence literal) {
+        if (offset < 0 || offset >= path.length()) {
+            return Integer.MIN_VALUE;
+        }
+
+        // String.indexOf re-examines the characters after a partial match, which a single-pass scan that
+        // merely resets its literal index does not.
+        final int found = path.toString().indexOf(literal.toString(), offset);
+
+        return found < 0 ? Integer.MIN_VALUE : found;
+    }
+
+    /**
+     * Returns true if the path segment contains a single {@link Token#ZERO_OR_MORE_CHARACTERS} token.
+     *
+     * @param pathSegment the path segment containing arbitrary tokens
+     * @return true if the only token in the path segment is a {@code ZERO_OR_MORE_CHARACTERS} token.
+     */
+    boolean isZeroOrMore(List<BoundToken> pathSegment) {
+        return pathSegment.size() == 1 && pathSegment.get(0).token == Token.ZERO_OR_MORE_CHARACTERS;
+    }
+
+    /**
+     * Returns true if the path segment contains a single {@link Token#DIRECTORY} token, or exactly two
+     * {@link Token#ZERO_OR_MORE_CHARACTERS} tokens.
+     *
+     * @param pathSegment the path segment containing arbitrary tokens
+     * @return true if the path segment will match a directory
+     */
+    boolean isDirectoryMatchToken(List<BoundToken> pathSegment) {
+        if (pathSegment.size() == 1) {
+            return pathSegment.get(0).token == Token.DIRECTORY;
+        }
+
+        // '**' may also arrive as two consecutive '*' tokens
+        return pathSegment.size() == 2
+                && pathSegment.get(0).token == Token.ZERO_OR_MORE_CHARACTERS
+                && pathSegment.get(1).token == Token.ZERO_OR_MORE_CHARACTERS;
+    }
+
+    /**
+     * Returns true if the path segment contains a single token, and the token is a {@link Token#EXACTLY_ONE_CHARACTER}.
+     *
+     * @param pathSegment the path segment containing arbitrary tokens
+     * @return true if the single token in the path segment is a {@code EXACTLY_ONE_CHARACTER} token.
+     */
+    boolean isExactlyOne(List<BoundToken> pathSegment) {
+        return pathSegment.size() == 1 && pathSegment.get(0).token == Token.EXACTLY_ONE_CHARACTER;
+    }
+
+    /**
+     * Answers a {@code CharSequence} that contains the value of each token, in the same order, as supplied by
+     * {@code tokens}.
+ * + * @param tokens a List of arbitrary tokens + * @return the sequence of token values + */ + private CharSequence toCharSeq(List<BoundToken> tokens) { + return tokens.stream().collect(StringBuilder::new, (sb, bt) -> sb.append(bt.bound), + StringBuilder::append).toString(); + } + + /** + * Returns true if each list is equal in size, and contains + * {@link org.dataconservancy.bagit.rules.BoundToken#equals(Object) equal} tokens, in the same order. + * + * @param one the first list of arbitrary tokens + * @param two the second list of arbitrary tokens + * @return true if the lists contain equal content + */ + private boolean tokenEquals(List<BoundToken> one, List<BoundToken> two) { + if (one.size() != two.size()) { + return false; + } + + for (int i = 0; i < one.size(); i++) { + if (!one.get(i).equals(two.get(i))) { + return false; + } + } + + return true; + } + + /** + * Finds the right anchor for the literal that begins at {@code patternOff} in {@code pattern}. The literal runs + * up to the next token in the pattern (or the end of the pattern); the anchor is the offset in {@code path} + * immediately after the occurrence of that literal located by + * {@link #matchNextLiteral(CharSequence, int, CharSequence)}, searching from {@code pathOff}. + * + * @param pattern the pattern that supplies the literal + * @param path the path to search for the literal + * @param pathOff the offset in {@code path} to start searching from + * @param patternOff the offset in {@code pattern} where the literal starts; assumed not to be positioned at a token + * @return the offset in {@code path} immediately following the matched literal, or {@code Integer.MAX_VALUE} if + * the literal is empty or is not found in {@code path} + */ + int findRightAnchor(CharSequence pattern, CharSequence path, int pathOff, int patternOff) { + // - Find the next token in the pattern from patternOff + // - The token index (or end of string) is the end of the literal. + + // Assumes that patternOff isn't positioned at a token, and that patternOff + 1 isn't a token either + int nextToken = Math.min(findNextToken(pattern, patternOff), pattern.length()); + + CharSequence literalToMatch = pattern.subSequence(patternOff, nextToken); + + // - Match that literal in the path (from offset pathOff) + // - If the literalToMatch isn't found in 'path', or if the matched literal is an empty string, return Integer.MAX_VALUE + if (literalToMatch.length() == 0) { + return Integer.MAX_VALUE; + } + int tmpIdx = matchNextLiteral(path, pathOff, literalToMatch); + if (tmpIdx == Integer.MIN_VALUE) { + return Integer.MAX_VALUE; + } + + // - Set the right anchor at the end of the literal. 
+ return tmpIdx + literalToMatch.length(); + } + + /** + * Returns true if any of the tokens in {@code pathTokens} represent a literal. + * + * @param pathTokens the tokens to check, normally these are the tokens from a single path segment. + * @return true if any of the supplied tokens is a literal; false for an empty list. + */ + boolean containsLiterals(List<BoundToken> pathTokens) { + return pathTokens.stream().anyMatch(bt -> bt.token == Token.LITERAL); + } + + /** + * Returns true if all of the tokens in {@code pathTokens} are literal. + * + * @param pathTokens the tokens to check; normally these are tokens from a single path segment. + * @return true if every supplied token is a literal; an empty list also yields true. + */ + boolean allLiterals(List<BoundToken> pathTokens) { + return pathTokens.stream().allMatch(bt -> bt.token == Token.LITERAL); + } + + /** + * Returns true if all of the tokens in {@code pathTokens} represent a literal or + * path separator. This method is used to determine if a List of tokens represents a path or a pattern. + *

+ * For example, tokens for {@code /foo/bar/baz.txt} would return {@code true}; tokens for {@code Foo??.java} would + * return {@code false}, because the '?' characters are not a literal or path separator token. + *

+ * + * @param pathTokens the tokens to check, normally these are tokens for a path or pattern + * @return true if the list of tokens contains only literals or separators + */ + boolean isPath(List pathTokens) { + return (pathTokens.stream().filter( + bt -> bt.token == Token.LITERAL || bt.token == Token.PATH_SEPARATOR).count() == pathTokens.size()); + } + +} diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Token.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Token.java new file mode 100644 index 00000000..dc6b96b4 --- /dev/null +++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Token.java @@ -0,0 +1,171 @@ +/* + * + * * Copyright 2015 Johns Hopkins University + * * + * * Licensed under the Apache License, Version 2.0 (the "License"); + * * you may not use this file except in compliance with the License. + * * You may obtain a copy of the License at + * * + * * http://www.apache.org/licenses/LICENSE-2.0 + * * + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, + * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * * See the License for the specific language governing permissions and + * * limitations under the License. + * + */ + +package org.dataconservancy.bagit.rules; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +import static org.dataconservancy.bagit.rules.Message.ERR_NULL; + +/** + * Tokens are strings that make up a location expressions. Location expressions are patterns that are matched against + * paths. Location expressions are inspired by Apache Ant file pattern matching. + */ +enum Token { + + /** + * A token matching exactly one character in an expression. + */ + EXACTLY_ONE_CHARACTER("?"), + + /** + * A token that will match multiple directory levels in an expression. 
+ */ + DIRECTORY("**"), + + /** + * A token matching zero or more characters in an expression. Must always be defined sometime after + * {@link #DIRECTORY} + */ + ZERO_OR_MORE_CHARACTERS("*"), + + + /** + * A token that separates path segments in an expression. + */ + PATH_SEPARATOR("/"), + + /** + * A special token with a {@code null} token string. Must always be defined last + */ + LITERAL(); + + private static final String ERR_MULTIPLE_TOKENS = "Candidate sequence '%s' contains multiple tokens. " + + "Try splitting up the tokens and submitting the tokens one at a time."; + + /** + * String representation of the token, if there is one. + */ + private String tokenString; + + /** + * Construct a Token with no string representation. Currently reserved for {@link #LITERAL} tokens. + */ + private Token() { + this.tokenString = null; + } + + /** + * Construct a token with the supplied string representation. + * + * @param tokenString the string representation of the token. + * @throws java.lang.IllegalArgumentException if the {@code tokenString} is {@code null} + */ + private Token(String tokenString) { + if (tokenString == null) { + throw new IllegalArgumentException(String.format(ERR_NULL, "tokenString")); + } + this.tokenString = tokenString; + } + + /** + * Obtain the string form of the token, may be {@code null}. {@link #LITERAL} tokens will not + * have a string form, because a literal is the set of characters that do not represent a token. + * + * @return the string form of the token, or {@code null} in the case of {@code LITERAL} tokens. 
+ */ + String getTokenString() { + return tokenString; + } + + /** + * Attempts to parse a character sequence which represents a single token into a {@code BoundToken}. + * + * @param candidate the candidate token string; may be any {@code CharSequence}, not only a {@code String} + * @return a {@code BoundToken} binding the matching {@code Token} (or {@link #LITERAL}) to the candidate string + * @throws java.lang.IllegalArgumentException if {@code candidate} is {@code null}, empty, or contains multiple + * tokens + */ + static BoundToken parse(CharSequence candidate) { + if (candidate == null || candidate.length() == 0) { + throw new IllegalArgumentException(String.format(ERR_NULL, "candidate")); + } + + for (Token m : Token.values()) { + + // See if the candidate token string equals the string representation + // of the token (except LITERAL), and return it. contentEquals compares characters, + // whereas String.equals is always false for a CharSequence that is not a String. + if (m.tokenString != null && m.tokenString.contentEquals(candidate)) { + return new BoundToken(m, candidate.toString()); + } + + // Check to see if the candidate token string _contains_ the string representation + // of the token (except LITERAL). If so, that means that the candidate contains multiple + // tokens, which isn't allowed. + if (candidate.length() > 1 && + m.tokenString != null && + candidate.chars().anyMatch( + c -> m.tokenString.contains(Character.toString((char) c)))) { + throw new IllegalArgumentException(String.format(ERR_MULTIPLE_TOKENS, candidate)); + } + } + + // None of our Token string representations equaled the candidate string. + // The candidate string did not _contain_ any of the Token string representations + // We must be left with a LITERAL. + + return new BoundToken(Token.LITERAL, candidate.toString()); + } + + /** + * Parses each character of {@code candidate} into its own {@code BoundToken}. A character equal to the string + * form of a token is bound to that token; every other character is bound as a {@link #LITERAL}. Because each + * character is considered on its own, the two-character {@link #DIRECTORY} token is never produced: {@code **} + * yields two {@link #ZERO_OR_MORE_CHARACTERS} tokens. + * + * @param candidate the character sequence to parse, must not be {@code null} or empty + * @return a List containing one {@code BoundToken} per character of {@code candidate}, in order + * @throws java.lang.IllegalArgumentException if {@code candidate} is {@code null} or empty + */ + static List<BoundToken> parseString(CharSequence candidate) { + if (candidate == null || candidate.length() == 0) { + throw new IllegalArgumentException(String.format(ERR_NULL, "candidate")); + } + + return + candidate.chars().mapToObj(c -> { + // This code block maps each character in the sequence to a BoundToken. 
+ + // Cast the int to a char, and parse it as a String + String s = String.valueOf((char) c); + BoundToken bound = null; + + // Iterate over every Token (except LITERAL), and see if the string matches + for (Token t : Token.values()) { + if (t.getTokenString() != null && t.getTokenString().equals(s)) { + bound = new BoundToken(t, s); + } + } + + // If there was no match, then we must have a LITERAL. + if (bound == null) { + bound = new BoundToken(LITERAL, s); + } + + return bound; + }).collect(Collectors.toList()); + } + + @Override + public String toString() { + return "Token{" + + "tokenString='" + tokenString + '\'' + + '}'; + } +} diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagFileRole.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagFileRole.java new file mode 100644 index 00000000..ec33bb84 --- /dev/null +++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagFileRole.java @@ -0,0 +1,82 @@ +/* + * Copyright 2015 Johns Hopkins University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.dataconservancy.bagit.support; + +/** + * Roles of tag files seen in a Bag. Roles are defined independently of concrete files because: 1) a single role may + * be served by multiple files; 2) names of files that fulfill these roles may change. + *

+ * While changing the name of the {@code bagit.txt} file is hard to imagine, it is reasonable that additional files may + * fulfill a particular role as the BagIt specification evolves. If the name of {@code bagit.txt} does change, it is + * likely that the role of a bag declaration will continue to be needed, even if it is no longer fulfilled by + * {@code bagit.txt}. + *

+ *

+ * BagIt requires that there be a bag declaration and a payload manifest. These roles are enumerated in this class as + * {@link #BAG_DECL} and {@link #PAYLOAD_MANIFEST}, respectively. Other roles such as a tag manifest, payload + * directory, and fetch file are enumerated in this class. A payload manifest role may be fulfilled by two different + * files, a {@code manifest-sha1.txt} file containing SHA-1 checksums, and a {@code manifest-md5.txt} file containing MD5 + * checksums. In the future, implementations may use SHA-256 or other algorithms. Regardless of the name of future + * files, their role will be enumerated in this class. + *

+ *

+ * The documentation for each role includes example file names from the specification; these are informative. They are + * meant as examples to aid comprehension of what the role represents; they are not normative. + *

+ */ +public enum BagFileRole { + + /** + * The bag payload (e.g. {@code data/}) directory. + */ + PAYLOAD_DIRECTORY, + + /** + * Bag payload itself (e.g. content in the {@code data/} directory. + */ + PAYLOAD_CONTENT, + + /** + * Tag file corresponding to the {@code bagit.txt} file, at the base of the bag. + */ + BAG_DECL, + + /** + * Tag file corresponding to the {@code bag-info.txt} file, at the base of the bag. + */ + BAG_INFO, + + /** + * Tag file(s) corresponding to the payload {@code manifest-<algorithm>.txt} file, at the base of the bag. + */ + PAYLOAD_MANIFEST, + + /** + * Tag file(s) corresponding to the {@code tagmanifest-<algorithm>.txt} file, at the base of the bag. + */ + TAG_MANIFEST, + + /** + * Tag file corresponding to the {@code fetch.txt} file, at the base of the bag. + */ + FETCH, + + /** + * Tag files corresponding to additional tag files, not covered by the BagIt specification. + */ + OTHER_TAG +} diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagUri.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagUri.java new file mode 100644 index 00000000..407cb1fb --- /dev/null +++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagUri.java @@ -0,0 +1,174 @@ +/* + * Copyright 2015 Johns Hopkins University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.dataconservancy.bagit.support; + +import java.net.URI; +import java.net.URISyntaxException; + +/** + * URI scheme for addressing resources contained within a Bag. The form of a Bag URI is: + * {@code bag:///path/to/resource#optional-fragment} + *

+ * The Bag URI {@link #BAG_SCHEME scheme} is equal to the string '{@code bag}'; resources inside the bag are unique + * within the scope of a single bag. The {@code authority} component of a Bag URI is equal to the name of the Bag + * serialization (as discussed in BagIt + * section 4), minus any file name extensions. Query parameters are disallowed in Bag URIs, as they have no semantic + * analog in the BagIt specification. + *

+ * + * @see BagIt Draft Specification version 0.97, expires December 25, 2015 + * @see RFC 2396: Uniform Resource Identifiers (URI): Generic Syntax + * @see Data Conservancy BagIt Profile 1.0, section X + */ +public class BagUri { + + /** + * The value of the Bag URI {@code scheme} (RFC 2396 sec. 3) + */ + public static final String BAG_SCHEME = "bag"; + + /** + * Characters that are reserved (i.e. illegal) for URI authority portion (RFC 2396 sec. 3.2) + */ +// private static final char[] RESERVED_AUTHORITY_CHARACTERS = new char[] { ';', ':', '@', '?', '/' }; + + private static final String ERR_NULL = "Argument '%s' must not be null or empty."; + + private static final String ERR_INVALID_SCHEME = "Invalid scheme '%s' for " + BagUri.class.getName() + ": scheme " + + "must be equal to '" + BAG_SCHEME + "'"; + + private static final String ERR_PARSE_URI = "Unable to parse URI string '%s': %s"; + + private static final String ERR_CREATE_URI = "Unable to construct a URI with scheme '%s', authority '%s', path '%s', and fragment '%s': %s"; + + /** + * Internal representation of the BagUri as a java.net.URI. + */ + private URI bagUri; + + /** + * The authority string (must not be {@code null}). It semantically aligns with, and should be equal to, the name + * of the bag. We keep this state for our own equals() and hashCode() implementation. + */ + private String authority; + + /** + * The path string (may be {@code null}). We keep this state for our own equals() and hashCode() implementation. + */ + private String path; + + /** + * The fragment string (may be {@code null}). We keep this state for our own equals() and hashCode() + * implementation. + */ + private String fragment; + + /** + * Constructs a new Bag URI, which addresses a resource in a Bag named by {@code authority}. + *

+ * Exemplars:
+ *

    + *
  • The {@code path} "data" with {@code authority} "mybag" would address the data directory inside + * of a Bag named 'mybag': {@code bag://mybag/data}.
  • + *
  • The {@code path} "bag-info.txt" would identify the Bag metadata file: {@code bag://mybag/bag-info.txt}.
  • + *
  • The {@code path} "data/dataobject.rdf" with a {@code fragment} "#obj-3" would identify a resource + * "{@code obj-3}" inside of the payload file {@code data/dataobject.rdf}: + * {@code bag://mybag/data/dataobject.rdf#obj-3}.
  • + *
+ *

+ * + * @param authority the authority portion of the URI, which is expected to be the Bag name. Must not be + * {@code null}. + * @param path the path to the resource within the Bag + * @param fragment an optional fragment identifier, useful for referencing individual resources within a file + * @throws java.lang.IllegalArgumentException if any required parameters are {@code null} or invalid URI components. + */ + public BagUri(String authority, String path, String fragment) { + if (authority == null || authority.trim().length() == 0) { + throw new IllegalArgumentException(String.format(ERR_NULL, "authority")); + } + try { + bagUri = new URI(BAG_SCHEME, authority, path, null, fragment); + } catch (URISyntaxException e) { + throw new IllegalArgumentException( + String.format(ERR_CREATE_URI, BAG_SCHEME, authority, path, fragment, e.getMessage()), e); + } + + this.authority = authority; + this.path = path; + this.fragment = fragment; + } + + /** + * TODO javadoc + * @return + */ + public String getAuthority() { + return bagUri.getAuthority(); + } + + /** + * TODO javadoc + * @return + */ + public String getFragment() { + return bagUri.getFragment(); + } + + /** + * TODO javadoc + * @return + */ + public String getPath() { + return bagUri.getPath(); + } + + public URI asUri() { + return bagUri; + } + + /** + * {@inheritDoc} + *

+ * Instances of this class are considered equal if their authority, path, and fragment components are equal. + *

+ * + * @param o the object to determine equivalence against. + * @return {@code true} if the instances are equal, {@code false} otherwise + */ + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + BagUri bagUri = (BagUri) o; + + if (authority != null ? !authority.equals(bagUri.authority) : bagUri.authority != null) return false; + if (fragment != null ? !fragment.equals(bagUri.fragment) : bagUri.fragment != null) return false; + if (path != null ? !path.equals(bagUri.path) : bagUri.path != null) return false; + + return true; + } + + @Override + public int hashCode() { + int result = authority != null ? authority.hashCode() : 0; + result = 31 * result + (path != null ? path.hashCode() : 0); + result = 31 * result + (fragment != null ? fragment.hashCode() : 0); + return result; + } +} diff --git a/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/BoundTokensTestUtil.java b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/BoundTokensTestUtil.java new file mode 100644 index 00000000..6cfae0b4 --- /dev/null +++ b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/BoundTokensTestUtil.java @@ -0,0 +1,119 @@ +/* + * + * * Copyright 2015 Johns Hopkins University + * * + * * Licensed under the Apache License, Version 2.0 (the "License"); + * * you may not use this file except in compliance with the License. + * * You may obtain a copy of the License at + * * + * * http://www.apache.org/licenses/LICENSE-2.0 + * * + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, + * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * * See the License for the specific language governing permissions and + * * limitations under the License. 
+ * + */ + +package org.dataconservancy.bagit.rules; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; + +/** + * BoundTokens that are shared across unit tests. + */ +class BoundTokensTestUtil { + + /** + * The {@code BoundToken} version of {@link Token#ZERO_OR_MORE_CHARACTERS} + */ + static final BoundToken ZERO_OR_MORE = new BoundToken(Token.ZERO_OR_MORE_CHARACTERS, "*"); + + /** + * The {@code BoundToken} version of {@link Token#ZERO_OR_MORE_CHARACTERS}, in a single element List + */ + static final List ZERO_OR_MORE_L = Arrays.asList( + new BoundToken(Token.ZERO_OR_MORE_CHARACTERS, "*")); + + /** + * The {@code BoundToken} version of {@link Token#EXACTLY_ONE_CHARACTER} + */ + static final BoundToken EXACTLY_ONE = new BoundToken(Token.EXACTLY_ONE_CHARACTER, "?"); + + /** + * The {@code BoundToken} version of {@link Token#EXACTLY_ONE_CHARACTER}, in a single element List + */ + static final List EXACTLY_ONE_L = Arrays.asList( + new BoundToken(Token.EXACTLY_ONE_CHARACTER, "?")); + + /** + * The {@code BoundToken} version of {@link Token#PATH_SEPARATOR} + */ + static final BoundToken PATH_SEP = new BoundToken(Token.PATH_SEPARATOR, "/"); + + /** + * The {@code BoundToken} version of {@link Token#PATH_SEPARATOR}, in a single element List + */ + static final List PATH_SEP_L = Arrays.asList( + new BoundToken(Token.PATH_SEPARATOR, "/")); + + /** + * The {@code BoundToken} version of {@link Token#DIRECTORY}, in a single element List + */ + static final BoundToken DIR = new BoundToken(Token.DIRECTORY, "**"); + + /** + * The {@code BoundToken} version of {@link Token#DIRECTORY}, represented as a List containing two + * {@link #ZERO_OR_MORE} BoundTokens. + */ + static final List DIR_L = Arrays.asList(ZERO_OR_MORE, ZERO_OR_MORE); + + /** + * Convenience method for creating a {@link Token#LITERAL literal} token for each character in {@code s}. 
It does + * not evaluate the characters in {@code s} for whether or not they should actually be made literals. That is the + * responsibility of the developer. (For example, this method will happily make literal tokens of "*", "?", and + * "/", which are not allowed by {@link Token#parse(CharSequence)}.) + * + * @param s the string to represent as a List of BoundTokens + * @return a List containing LITERAL BoundTokens for each character in {@code s} + */ + static List literalsForString(String s) { + ArrayList literals = new ArrayList<>(); + s.chars().forEach(c -> literals.add(new BoundToken(Token.LITERAL, String.valueOf((char) c)))); + return literals; + } + + /** + * Asserts that the values in the expected and actual Lists are equal. This method will assert that + * the lists are the same size before comparing their values. + * + * @param expected the expected List of BoundTokens + * @param actual the actual List of BoundTokens, normally representing a test result. + */ + static void assertTokenListEquals(List expected, List actual) { + assertExpectedListCount(expected.size(), actual); + for (int i = 0; i < expected.size(); i++) { + assertEquals("Expected token: '" + expected.get(i) + "' but found '" + actual.get(i) + "'", + expected.get(i), actual.get(i)); + } + } + + /** + * Asserts that the supplied list of BoundTokens has the expected count. + * + * @param expectedCount the expected number of BoundTokens in {@code actual} + * @param actual a List of BoundTokens, normally representing the result of a test. 
+ */ + static void assertExpectedListCount(int expectedCount, List actual) { + assertEquals("Expected " + expectedCount + " BoundTokens, found " + actual.size() + ": " + + actual.stream().map(bt -> "['" + bt.token.name() + "', '" + bt.bound + "']") + .collect(Collectors.joining(", ")), + expectedCount, actual.size()); + } +} diff --git a/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionMatcherTest.java b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionMatcherTest.java new file mode 100644 index 00000000..152c31b9 --- /dev/null +++ b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionMatcherTest.java @@ -0,0 +1,499 @@ +/* + * + * * Copyright 2015 Johns Hopkins University + * * + * * Licensed under the Apache License, Version 2.0 (the "License"); + * * you may not use this file except in compliance with the License. + * * You may obtain a copy of the License at + * * + * * http://www.apache.org/licenses/LICENSE-2.0 + * * + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, + * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * * See the License for the specific language governing permissions and + * * limitations under the License. 
+ * + */ + +package org.dataconservancy.bagit.rules; + +import org.junit.Before; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.DIR_L; +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.EXACTLY_ONE; +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.ZERO_OR_MORE; +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.assertTokenListEquals; +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.literalsForString; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +/** + * Many, many tests against various methods in the ExpressionMatcher class. + * Most of the test methods in this class contain multiple assertions. Normally there will be one assertion for a + * sanity check - an assertion that should always be true. Often there will be multiple sanity checks. + *

+ * Because ExpressionMatcher is package-private, it can be hard to tell what the entry points into the ExpressionMatcher + * class are, and this test class doesn't help you determine that. Clients of ExpressionMatcher should be calling + * either: + *

    + *
  • {@link org.dataconservancy.bagit.rules.ExpressionMatcher#match(Expression, Expression)}
  • + *
  • {@link org.dataconservancy.bagit.rules.ExpressionMatcher#match(java.util.List, java.util.List)}
  • + *
+ * This test class covers not only these entry point methods, but other utility methods as well. + *

+ */ +public class ExpressionMatcherTest { + + private ExpressionMatcher underTest; + + @Before + public void setUp() throws Exception { + underTest = new ExpressionMatcher(); + } + + /** + * Attempts a match using an Expression that starts with '**' and contains consecutive '?' matching tokens. + */ + @Test + public void testMatchExpressionLeadingDirectoryAndConsecutiveExactlyOne() throws Exception { + // The pattern to match against: leading '**' and consecutive '??' + Expression pattern = new Expression("**/Foo??.java"); + + // This path should match the pattern: src/test/java matches '**' and FooIT.java matches 'Foo??.java' + Expression path = new Expression("src/test/java/FooIT.java"); + + // This path should not match (the consecutive token '??' will remain unmatched) + Expression nonMatchingPath = new Expression("src/test/java/FooI.java"); + + // sanity: a path should match itself. + assertTrue(underTest.match(path, path)); + + assertTrue(underTest.match(pattern, path)); + assertFalse(underTest.match(pattern, nonMatchingPath)); + } + + /** + * Attempts a match using an Expression that starts with '**' and contains a '*' matching token. + */ + @Test + public void testMatchExpressionLeadingDirectoryAndZeroPlus() throws Exception { + // The pattern to match against: leading '**' and a '*' + Expression pattern = new Expression("**/*IT.java"); + + // This path should match the pattern: src/test/java matches '**' and FooIT.java matches '*IT.java' + Expression path = new Expression("src/test/java/FooIT.java"); + + // This path should not match (the path segment Bar.java will remain unmatched) + Expression nonMatchingPath = new Expression("src/test/java/Bar.java"); + + // sanity: a path should match itself + assertTrue(underTest.match(path, path)); + + assertTrue(underTest.match(pattern, path)); + // The pattern under test (not the non-matching path) must be the first argument, as in the sibling tests + assertFalse(underTest.match(pattern, nonMatchingPath)); + } + + /** + * Attempts a match using an Expression that starts with a '**' matching token. 
+ */ + @Test + public void testMatchExpressionLeadingDirectory() throws Exception { + // The pattern to match against: leading '**' + Expression pattern = new Expression("**/FooIT.java"); + + // This path should match the pattern: src/test/java matches '**', and FooIT.java matches the 'FooIT.java' literal + Expression path = new Expression("src/test/java/FooIT.java"); + + // This path should not match + Expression nonMatchingPath = new Expression("src/test/java/BarIT.java"); + + // sanity: a path should match itself + assertTrue(underTest.match(path, path)); + + assertTrue(underTest.match(pattern, path)); + assertFalse(underTest.match(pattern, nonMatchingPath)); + } + + /** + * Attempts a match using equal lists of {@code List<BoundToken>} containing only literals (no matching tokens or + * path separators) + */ + @Test + public void testMatchWithOnlyLiterals() throws Exception { + List pattern = literalsForString("bar"); + List path = literalsForString("bar"); + assertTokenListEquals(path, pattern); + + // sanity: non-equal literal token lists should not match + assertFalse(underTest.match(literalsForString("foo"), path)); + + // test to make sure that equal literal token lists will match + assertTrue(underTest.match(pattern, path)); + } + + /** + * Verifies that a literal will not match a pattern that contains leading directory match tokens followed by + * a non-matching literal. A complicated way of saying that we verify that the pattern "*IT.java" won't match + * "src". + */ + @Test + public void testNoMatchBeginningZeroPlus() throws Exception { + // pattern: *IT.java + List pattern = new ArrayList<>(); + pattern.add(ZERO_OR_MORE); + pattern.addAll(literalsForString("IT.java")); + + // path: src + List path = literalsForString("src"); + + assertFalse(underTest.match(pattern, path)); + } + + /** + * Attempts a match {@code List<BoundToken>} leading with a '?' matching token, ending with a '*' matching + * token, and with a single '?' token in the middle. 
+ */ + @Test + public void testLiteralsWithExactlyOne() throws Exception { + // pattern: "?tart?IT.jav?" + List pattern = new ArrayList<>(); + pattern.add(EXACTLY_ONE); + pattern.addAll(literalsForString("tart")); + pattern.add(EXACTLY_ONE); + pattern.addAll(literalsForString("IT.jav")); + pattern.add(EXACTLY_ONE); + + // path: startXIT.java (sanity, should pass) + List path = literalsForString("startXIT.java"); + assertTrue(underTest.match(pattern, path)); + + // path: strtXIT.java (first literal 'tart' doesn't match) + path = literalsForString("strtXIT.java"); + assertFalse(underTest.match(pattern, path)); + + // path: startXITT.java (middle literal 'IT.jav' doesn't match) + path = literalsForString("startXITT.java"); + assertFalse(underTest.match(pattern, path)); + + // path: startXIT.jav (last token '?' doesn't match - missing character in path) + path = literalsForString("startXIT.jav"); + assertFalse(underTest.match(pattern, path)); + + // path: startXIT.javaa (last literal 'a' in path doesn't match) + path = literalsForString("startXIT.javaa"); + assertFalse(underTest.match(pattern, path)); + } + + /** + * Attempts a match {@code List<BoundToken>} leading with a '*' matching token, ending with a '*' matching + * token, and with a single '*' token in the middle. 
+ */ + @Test + public void testLiteralsWithZeroPlus() throws Exception { + // pattern: "*tart*IT.jav*" + + List pattern = new ArrayList<>(); + pattern.add(ZERO_OR_MORE); + pattern.addAll(literalsForString("tart")); + pattern.add(ZERO_OR_MORE); + pattern.addAll(literalsForString("IT.jav")); + pattern.add(ZERO_OR_MORE); + + // path: startXIT.java (sanity, should pass) + List path = literalsForString("startXIT.java"); + assertTrue(underTest.match(pattern, path)); + + // path: startXIT.jav (sanity, should pass) + path = literalsForString("startXIT.jav"); + assertTrue(underTest.match(pattern, path)); + + // path: tartXIT.java (sanity, should pass) + path = literalsForString("tartXIT.java"); + assertTrue(underTest.match(pattern, path)); + + // path: tartXIT.jav (sanity, should pass) + path = literalsForString("tartXIT.jav"); + assertTrue(underTest.match(pattern, path)); + + // path: strtXIT.java (first literal 'tart' doesn't match) + path = literalsForString("strtXIT.java"); + assertFalse(underTest.match(pattern, path)); + + // path: startXITT.java (middle literal 'IT.jav' doesn't match) + path = literalsForString("startXITT.java"); + assertFalse(underTest.match(pattern, path)); + } + + /** + * Attempts various path matches against a pattern that contains three matching '?' tokens, at the + * beginning, middle, and end of the pattern. + */ + @Test + public void testMultipleSingleCharacterTokens() throws Exception { + // pattern: "?tart?IT.jav?" 
+ + List pattern = new ArrayList<>(); + pattern.add(EXACTLY_ONE); + pattern.addAll(literalsForString("tart")); + pattern.add(EXACTLY_ONE); + pattern.addAll(literalsForString("IT.jav")); + pattern.add(EXACTLY_ONE); + + // path: startXIT.java (sanity, should pass) + List path = literalsForString("startXIT.java"); + assertTrue(underTest.match(pattern, path)); + + // path: FootartXIT.java (too many characters for first token) + path = literalsForString("FootartXIT.java"); + assertFalse(underTest.match(pattern, path)); + + // path: tartXIT.java (no characters for first token) + path = literalsForString("tartXIT.java"); + assertFalse(underTest.match(pattern, path)); + + // path: StartItUpIT.java (too many characters for middle token) + path = literalsForString("StartItUpIT.java"); + assertFalse(underTest.match(pattern, path)); + + // path: StartIT.java (no characters for middle token) + path = literalsForString("StartIT.java"); + assertFalse(underTest.match(pattern, path)); + + // path: StartXIT.jav (no characters for last token) + path = literalsForString("StartXIT.jav"); + assertFalse(underTest.match(pattern, path)); + + // path: StartXIT.javaa (too many characters for last token) + path = literalsForString("StartXIT.javaa"); + assertFalse(underTest.match(pattern, path)); + } + + /** + * Attempts to match a path against a pattern containing a single matching token '?' in the middle. 
+ */ + @Test + public void testMatchLiteralFirstExactlyOneNoMatch() throws Exception { + // pattern: "Start?IT.java" + + List pattern = new ArrayList<>(); + pattern.addAll(literalsForString("Start")); + pattern.add(EXACTLY_ONE); + pattern.addAll(literalsForString("IT.java")); + + // path: startXIT.java (sanity, should pass) + List path = literalsForString("StartXIT.java"); + assertTrue(underTest.match(pattern, path)); + + // path: StartFooIT.java (won't match) + path = literalsForString("StartFooIT.java"); + + assertFalse(underTest.match(pattern, path)); + } + + /** + * Attempts to match a path against a pattern containing a single matching token '*' in the middle. + */ + @Test + public void testMatchLiteralFirstZeroPlus() throws Exception { + // pattern: "Start*IT.java" + + List pattern = new ArrayList<>(); + pattern.addAll(literalsForString("Start")); + pattern.add(ZERO_OR_MORE); + pattern.addAll(literalsForString("IT.java")); + + // path: StartCarIT.java ('*' should match 'Car') + List path = literalsForString("StartCarIT.java"); + assertTrue(underTest.match(pattern, path)); + + // path: StartIT.java ('*' should match zero characters) + path = literalsForString("StartIT.java"); + assertTrue(underTest.match(pattern, path)); + } + + @Test + public void testMatchTokenFirst() throws Exception { + // pattern: "*File*IT.java" + + List pattern = new ArrayList<>(); + pattern.add(ZERO_OR_MORE); + pattern.addAll(literalsForString("File")); + pattern.add(ZERO_OR_MORE); + pattern.addAll(literalsForString("IT.java")); + + // path: UnixFileSmallIT.java + List path = literalsForString("UnixFileSmallIT.java"); + + assertTrue(underTest.match(pattern, path)); + } + + @Test + public void testMatchConsecutiveMatchTokens() throws Exception { + // pattern: "Foo??.java" + List pattern = new ArrayList<>(); + pattern.addAll(literalsForString("Foo")); + pattern.add(EXACTLY_ONE); + pattern.add(EXACTLY_ONE); + pattern.addAll(literalsForString(".java")); + + // path: FooIT.java + List path 
= literalsForString("FooIT.java"); + + assertTrue(underTest.match(pattern, path)); + } + + /** + * Attempt to match a directory against the directory match token '**' + */ + @Test + public void testMatchZeroPlusAndLiteral() throws Exception { + // pattern: "**" + List pattern = DIR_L; + + // path: "src" + List path = literalsForString("src"); + + assertTrue(underTest.match(pattern, path)); + } + + /** + * Insures that a pattern like 'Foo**IT.java' - while almost certainly a mistake by the person who created the + * pattern - is a valid pattern. Make sure it matches. + */ + @Test + public void testMatchMultipleZeroPlusTokens() throws Exception { + // pattern: "Foo**IT.java" + List pattern = new ArrayList<>(); + pattern.addAll(literalsForString("Foo")); + pattern.add(ZERO_OR_MORE); + pattern.add(ZERO_OR_MORE); + pattern.addAll(literalsForString("IT.java")); + + // path: "FooIT.java" should match - '**' matches zero characters + List path = literalsForString("FooIT.java"); + assertTrue(underTest.match(pattern, path)); + + // path: "FooBarBazIT.java" should match - '**' matches "BarBaz" + path = literalsForString("FooBarBazIT.java"); + assertTrue(underTest.match(pattern, path)); + } + + @Test + public void testFindNextToken() throws Exception { + assertEquals(19, underTest.findNextToken("src/test/resources/*IT.java", 0)); + assertEquals(0, underTest.findNextToken("*File*IT.java", 0)); + assertEquals(5, underTest.findNextToken("*File*IT.java", 1)); + } + + @Test + public void testFindNextLiteral() throws Exception { + assertEquals(0, underTest.findNextLiteral("src/test/resources/*IT.java", 0)); + assertEquals(1, underTest.findNextLiteral("*File*IT.java", 0)); + assertEquals(2, underTest.findNextLiteral("*File*IT.java", 2)); + assertEquals(5, underTest.findNextLiteral("Foo??.java", 3)); + } + + @Test + public void testFindNextLiteralString() throws Exception { + assertEquals(1, underTest.matchNextLiteral("*File*IT.java", 0, "File")); + assertEquals(6, 
underTest.matchNextLiteral("*File*IT.java", 0, "IT")); + assertEquals(6, underTest.matchNextLiteral("*File*IT.java", 0, "IT.java")); + assertEquals(6, underTest.matchNextLiteral("*File*IT.java", 4, "IT.java")); + assertEquals(Integer.MIN_VALUE, underTest.matchNextLiteral("*FileIT.java", 0, "doodle")); + } + + @Test + public void testFindRightAnchorFromBeginning() throws Exception { + String pattern = "File*IT.java"; + String path = "FileUnixIT.java"; + + assertEquals("File".length(), underTest.findRightAnchor(pattern, path, 0,0 )); + } + + @Test + public void testFindRightAnchorFromMiddle() throws Exception { + String pattern = "File*IT.java"; + String path = "FileUnixIT.java"; + + // find the right anchor after we've matched pattern "File*" to path "FileUnix" + assertEquals(path.length(), underTest.findRightAnchor(pattern, path, "FileUnix".length(), "File*".length())); + } + + @Test + public void testFindRightAnchorMultipleTokens() throws Exception { + String pattern = "Foo*Bar*Baz"; + String path = "FooXBarYBaz"; + + assertEquals(3, underTest.findRightAnchor(pattern, path, 0, 0)); + assertEquals(7, underTest.findRightAnchor(pattern, path, "FooX".length(), "Foo*".length())); + assertEquals(11, underTest.findRightAnchor(pattern, path, "FooXBarY".length(), "Foo*Bar*".length())); + } + + @Test + public void testFindRightAnchorFoo() throws Exception { + String pattern = "Foo??.java"; + String path = "src"; + + // behavior when the path is not in the pattern + assertEquals(Integer.MAX_VALUE, underTest.findRightAnchor(pattern, path, 0, 0)); + } + + @Test + public void testFindRightAnchorFooIT() throws Exception { + String pattern = "Foo??.java"; + String path = "FooIT.java"; + + // behavior when the pattern offset is positioned at a token + assertEquals(Integer.MAX_VALUE, underTest.findRightAnchor(pattern, path, 0, 3)); + } + + @Test + public void testRightAnchorBar() throws Exception { + String pattern = "*File*IT.java"; + String path = "UnixFileSmallIT.java"; + + // 
behavior when the path offset is already positioned at the right anchor + assertEquals(8, underTest.findRightAnchor(pattern, path, 4, 1)); + } + + /** + * Insures that the match token '**' - represented as a single BoundToken containing a DIRECTORY, or two + * consecutive BoundTokens containing a ZERO_OR_MORE_CHARACTERS - are both considered a "directory match" token + * by the ExpressionMatcher. + * + * @throws Exception + */ + @Test + public void testIsDirectoryMatch() throws Exception { + List directory = Arrays.asList(new BoundToken(Token.DIRECTORY, Token.DIRECTORY.getTokenString())); + List consecutiveZeroOrMore = Arrays.asList( + new BoundToken(Token.ZERO_OR_MORE_CHARACTERS, Token.ZERO_OR_MORE_CHARACTERS.getTokenString()), + new BoundToken(Token.ZERO_OR_MORE_CHARACTERS, Token.ZERO_OR_MORE_CHARACTERS.getTokenString())); + + assertTrue(underTest.isDirectoryMatchToken(directory)); + assertTrue(underTest.isDirectoryMatchToken(consecutiveZeroOrMore)); + } + + void assertListsEqual(List expected, List actual) { + assertExpectedCount(expected.size(), actual); + + for (int i = 0; i < expected.size(); i++) { + assertEquals("Expected path segments to be equal. Expected: '" + expected.get(i) + + "', Actual: '" + actual.get(i) + "'", expected.get(i), actual.get(i)); + } + } + + void assertExpectedCount(int expectedCount, List actual) { + assertEquals("Expected List to contain " + expectedCount + " elements. 
Contained " + actual.size() + ": " + + actual.stream().map(v -> "'" + v + "'").collect(Collectors.joining(", ")), + expectedCount, actual.size()); + } +} \ No newline at end of file diff --git a/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionTest.java b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionTest.java new file mode 100644 index 00000000..a65b5e8f --- /dev/null +++ b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionTest.java @@ -0,0 +1,101 @@ +/* + * + * * Copyright 2015 Johns Hopkins University + * * + * * Licensed under the Apache License, Version 2.0 (the "License"); + * * you may not use this file except in compliance with the License. + * * You may obtain a copy of the License at + * * + * * http://www.apache.org/licenses/LICENSE-2.0 + * * + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, + * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * * See the License for the specific language governing permissions and + * * limitations under the License. 
+ * + */ + +package org.dataconservancy.bagit.rules; + +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; + +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.PATH_SEP; +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.PATH_SEP_L; +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.ZERO_OR_MORE; +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.assertTokenListEquals; +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.literalsForString; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class ExpressionTest { + + @Test + public void testSimple() throws Exception { + Expression exp = new Expression("src/test/resources/*IT.java"); + + // == src/test/resources/*IT.java + List expected = literalsForString("src"); + expected.add(PATH_SEP); + expected.addAll(literalsForString("test")); + expected.add(PATH_SEP); + expected.addAll(literalsForString("resources")); + expected.add(PATH_SEP); + expected.add(ZERO_OR_MORE); + expected.addAll(literalsForString("IT.java")); + + List actual = exp.getTokens(); + + assertTokenListEquals(expected, actual); + + // depth is an index + assertEquals(3, exp.depth()); + + // get path segment by depth test + assertTokenListEquals(literalsForString("src"), exp.getPathSegment(0)); + assertTokenListEquals(literalsForString("test"), exp.getPathSegment(1)); + assertTokenListEquals(literalsForString("resources"), exp.getPathSegment(2)); + + expected = new ArrayList<>(); + expected.add(ZERO_OR_MORE); + expected.addAll(literalsForString("IT.java")); + assertTokenListEquals(expected, exp.getPathSegment(3)); + + // out of bounds tests + assertTrue(exp.getPathSegment(exp.depth() + 5).isEmpty()); + assertTrue(exp.getPathSegment(-1).isEmpty()); + } + + @Test + public void testWithEmptyRoot() throws Exception { + Expression exp = new Expression("/"); + 
assertEquals(0, exp.depth()); + + // TODO decide what to do with the automatic addition of '**' + // for example, the Expression "/" is tokenized as "/**". + // any path ending in "/" is going to be tokenized with a trailing "**", + // and the user may not intend that behavior (for example if they are just wanting + // to express a path (not a pattern). + assertTokenListEquals(PATH_SEP_L, exp.getTokens()); + assertTrue(exp.getPathSegment(0).isEmpty()); + } + + @Test + public void testWithSingleFileRoot() throws Exception { + Expression exp = new Expression("/foo.txt"); + assertEquals(0, exp.depth()); + + // "/foo.txt" + List expected = new ArrayList<>(); + expected.add(PATH_SEP); + expected.addAll(literalsForString("foo.txt")); + + assertTokenListEquals(expected, exp.getTokens()); + assertFalse(exp.getPathSegment(0).isEmpty()); + assertEquals(literalsForString("foo.txt"), exp.getPathSegment(0)); + } +} \ No newline at end of file diff --git a/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/TokenTest.java b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/TokenTest.java new file mode 100644 index 00000000..99db35dd --- /dev/null +++ b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/TokenTest.java @@ -0,0 +1,178 @@ +/* + * + * * Copyright 2015 Johns Hopkins University + * * + * * Licensed under the Apache License, Version 2.0 (the "License"); + * * you may not use this file except in compliance with the License. + * * You may obtain a copy of the License at + * * + * * http://www.apache.org/licenses/LICENSE-2.0 + * * + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, + * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * * See the License for the specific language governing permissions and + * * limitations under the License. 
+ * + */ + +package org.dataconservancy.bagit.rules; + + +import org.junit.Test; + +import java.util.Arrays; + +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.DIR; +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.DIR_L; +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.EXACTLY_ONE; +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.EXACTLY_ONE_L; +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.PATH_SEP; +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.PATH_SEP_L; +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.ZERO_OR_MORE; +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.ZERO_OR_MORE_L; +import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.assertTokenListEquals; +import static org.junit.Assert.assertEquals; + +/** + * Insures that the {@link Token} class properly parses tokens. + */ +public class TokenTest { + + /** + * Tokens are strings with special meanings. Insure all single-character tokens can be parsed. + */ + @Test + public void testParseSingleCharacterString() throws Exception { + + // With parse(...) + assertEquals(ZERO_OR_MORE, Token.parse("*")); + assertEquals(EXACTLY_ONE, Token.parse("?")); + assertEquals(new BoundToken(Token.LITERAL, "f"), Token.parse("f")); + assertEquals(PATH_SEP, Token.parse("/")); + + // With parseString(...) + assertTokenListEquals(ZERO_OR_MORE_L, Token.parseString("*")); + assertTokenListEquals(EXACTLY_ONE_L, Token.parseString("?")); + assertTokenListEquals(Arrays.asList(new BoundToken(Token.LITERAL, "f")), Token.parseString("f")); + assertTokenListEquals(PATH_SEP_L, Token.parseString("/")); + } + + /** + * Tokens are strings with special meanings. Insure all multi-character tokens can be parsed. + */ + @Test + public void testParseMultipleCharacterStrings() throws Exception { + // With parse(...) 
+ assertEquals(DIR, Token.parse("**")); + assertEquals(new BoundToken(Token.LITERAL, "foobarbaz"), Token.parse("foobarbaz")); + + // With parseString(...) + assertTokenListEquals(DIR_L, Token.parseString("**")); + assertTokenListEquals( + Arrays.asList(new BoundToken(Token.LITERAL, "f"), new BoundToken(Token.LITERAL, "o"), + new BoundToken(Token.LITERAL, "o"), new BoundToken(Token.LITERAL, "b"), + new BoundToken(Token.LITERAL, "a"), new BoundToken(Token.LITERAL, "r"), + new BoundToken(Token.LITERAL, "b"), new BoundToken(Token.LITERAL, "a"), + new BoundToken(Token.LITERAL, "z")), Token.parseString("foobarbaz")); + } + + /** + * Attempting to parse a string with multiple tokens is an error. + * Legal with {@link #testParseStringSingleStringContainingDifferentTokens}. + */ + @Test(expected = IllegalArgumentException.class) + public void testParseSingleStringContainingDifferentTokens() throws Exception { + // With parse(...) + Token.parse("*/?**abc"); + } + + /** + * Attempting to parseString a string with multiple tokens is ok. + * An error with {@link #testParseSingleStringContainingDifferentTokens()} + */ + @Test + public void testParseStringSingleStringContainingDifferentTokens() throws Exception { + // With parseString(...) + assertTokenListEquals(Arrays.asList(ZERO_OR_MORE, PATH_SEP, EXACTLY_ONE, ZERO_OR_MORE, ZERO_OR_MORE, + new BoundToken(Token.LITERAL, "a"), new BoundToken(Token.LITERAL, "b"), + new BoundToken(Token.LITERAL, "c")), Token.parseString("*/?**abc")); + } + + /** + * Attempting to parse a string with multiple tokens is an error. Essentially the same test as + * {@link #testParseSingleStringContainingDifferentTokens()}. Note this is legal with + * {@link #testParseStringLiteralEndingWithPathSep()}. + */ + @Test(expected = IllegalArgumentException.class) + public void testParseLiteralEndingWithPathSep() throws Exception { + // With parse(...) + Token.parse("directory/"); + } + + /** + * Legal form of {@link #testParseLiteralEndingWithPathSep()}. 
+ */ + @Test + public void testParseStringLiteralEndingWithPathSep() throws Exception { + // With parseString(...) + assertTokenListEquals(Arrays.asList(new BoundToken(Token.LITERAL, "d"), new BoundToken(Token.LITERAL, "i"), + new BoundToken(Token.LITERAL, "r"), new BoundToken(Token.LITERAL, "e"), + new BoundToken(Token.LITERAL, "c"), new BoundToken(Token.LITERAL, "t"), + new BoundToken(Token.LITERAL, "o"), new BoundToken(Token.LITERAL, "r"), + new BoundToken(Token.LITERAL, "y"), PATH_SEP), Token.parseString("directory/")); + } + + /** + * Parsing zero length strings results in an error. + */ + @Test(expected = IllegalArgumentException.class) + public void testParseZeroLengthString() throws Exception { + // With parse(...) + assertEquals(new BoundToken(Token.LITERAL, ""), Token.parse("")); + } + + /** + * Parsing zero length strings results in an error. + */ + @Test(expected = IllegalArgumentException.class) + public void testParseStringZeroLengthString() throws Exception { + // With parseString(...) + assertTokenListEquals(Arrays.asList(new BoundToken(Token.LITERAL, "")), Token.parseString("")); + } + + /** + * Empty strings would be parsed as a literal. + */ + @Test + public void testParseEmptyString() throws Exception { + // With parse(...) + assertEquals(new BoundToken(Token.LITERAL, " "), Token.parse(" ")); + + // With parseString(...) + assertTokenListEquals(Arrays.asList(new BoundToken(Token.LITERAL, " ")), Token.parseString(" ")); + } + + /** + * Parsing {@code null} results in an error + * + * @throws Exception + */ + @Test(expected = IllegalArgumentException.class) + public void testParseNull() throws Exception { + // With parse(...) + assertEquals(new BoundToken(Token.LITERAL, null), Token.parse(null)); + } + + /** + * Parsing {@code null} with parseString is also an error + * + * @throws Exception + */ + @Test(expected = IllegalArgumentException.class) + public void testParseStringNull() throws Exception { + // With parseString(...) 
+ assertTokenListEquals(Arrays.asList(new BoundToken(Token.LITERAL, null)), Token.parseString(null)); + } + +} \ No newline at end of file diff --git a/dcs-bagit/pom.xml b/dcs-bagit/pom.xml new file mode 100644 index 00000000..1f8d95bd --- /dev/null +++ b/dcs-bagit/pom.xml @@ -0,0 +1,113 @@ + + + + + + + + + 4.0.0 + + Data Conservancy BagIt Tools and Utilities + Data Conservancy BagIt packaging tools and utilities + + org.dataconservancy + dcs-bagit + 1.0.0-SNAPSHOT + pom + + + + org.dataconservancy + project-pom + 1.1.2-SNAPSHOT + + + + + + dcs-bagit-support + dcs-bagit-vfs + dcs-bagit-compress + + + + + + + + + + + + + + org.apache.commons + commons-vfs2 + 2.1-SNAPSHOT + + + + org.apache.commons + commons-compress + 1.10 + + + + junit + junit + 4.12 + + + + + + + + + + + + + + + + + + + org.slf4j + slf4j-log4j12 + test + + + + log4j + log4j + test + + + + junit + junit + test + + + + +