pathSegments = new ArrayList<>();
+ int i = 0;
+ for (BoundToken t : sanitized) {
+ if (i > d) {
+ // done recording tokens, break
+ break;
+ }
+
+ if (t.token == Token.PATH_SEPARATOR) {
+ // increment depth
+ i++;
+ // continue, we don't record path separators
+ continue;
+ }
+
+
+ if (d - i == 0) {
+ // record the token
+ pathSegments.add(t);
+ }
+ }
+
+ return pathSegments;
+ });
+ }
+
+ /**
+  * Renders this expression by appending the {@code bound} value of each token, in list order.
+  *
+  * @return the concatenation of every token's {@code bound} value
+  */
+ @Override
+ public String toString() {
+     final StringBuilder joined = new StringBuilder();
+     tokens.forEach(bt -> joined.append(bt.bound));
+     return joined.toString();
+ }
+}
diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java
new file mode 100644
index 00000000..4bb0f3ab
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java
@@ -0,0 +1,733 @@
+/*
+ *
+ * * Copyright 2015 Johns Hopkins University
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at
+ * *
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing, software
+ * * distributed under the License is distributed on an "AS IS" BASIS,
+ * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * * See the License for the specific language governing permissions and
+ * * limitations under the License.
+ *
+ */
+
+package org.dataconservancy.bagit.rules;
+
+import java.util.List;
+
+/**
+ * Responsible for matching an Expression representing a path against a pattern. This is quite possibly the most
+ * heavy-weight string parsing library you'll ever encounter. It is inspired by Ant-style pattern matching, and
+ * attempts to follow the same rules as the Ant
+ * implementation:
+ *
+ *
+ * These patterns look very much like the patterns used in DOS and UNIX:
+ *
+ * '*' matches zero or more characters, '?' matches one character.
+ *
+ * In general, patterns are considered relative paths, relative to a task dependent base directory (the dir attribute in
+ * the case of {@code <fileset>}). Only files found below that base directory are considered. So while a pattern like
+ * ../foo.java is possible, it will not match anything when applied since the base directory's parent is never scanned
+ * for files.
+ *
+ * Examples:
+ *
+ * *.java matches .java, x.java and FooBar.java, but not FooBar.xml (does not end with .java).
+ *
+ * ?.java matches x.java, A.java, but not .java or xyz.java (both don't have one character before .java).
+ *
+ * Combinations of *'s and ?'s are allowed.
+ *
+ * Matching is done per-directory. This means that first the first directory in the pattern is matched against the first
+ * directory in the path to match. Then the second directory is matched, and so on. For example, when we have the
+ * pattern /?abc/*&#47;*.java and the path /xabc/foobar/test.java, the first ?abc is matched with
+ * xabc, then * is matched with foobar, and finally *.java is matched with test.java. They all match, so the path
+ * matches the pattern.
+ *
+ * To make things a bit more flexible, we add one extra feature, which makes it possible to match multiple directory
+ * levels. This can be used to match a complete directory tree, or a file anywhere in the directory tree. To do this,
+ * ** must be used as the name of a directory. When ** is used as the name of a directory in the
+ * pattern, it matches zero or more directories. For example: /test/** matches all files/directories under
+ * /test/, such as /test/x.java, or /test/foo/bar/xyz.html, but not /xyz.xml.
+ *
+ * There is one "shorthand": if a pattern ends with / or \, then ** is appended. For example,
+ * mypackage/test/ is interpreted as if it were mypackage/test/**.
+ *
+ *
+ *
+ * Even though a "path" and a "pattern" are both instances of an {@link Expression}, their semantics differ. A
+ * "path" only contains literal and path separator tokens. A "pattern" may contain literals, path separators, and
+ * matching tokens like '*' and '?'. Path segments are the tokens between consecutive path separators, addressable
+ * by their zero-indexed {@link org.dataconservancy.bagit.rules.Expression#depth() depth}. For example, the Expression
+ * '/foo/bar/baz.txt' has three path segments, 'foo' (depth = 0), 'bar' (depth = 1), and 'baz.txt' (depth = 2). The
+ * depth of the Expression is 2.
+ *
+ *
+ * Note that methods on this class are package-private, and are not meant to be exposed publicly.
+ *
+ *
+ * Because the methods of ExpressionMatcher are package-private, it can be hard to tell what the entry points into the
+ * class are, and the accompanying test class doesn't help you determine that. Clients of ExpressionMatcher should be calling
+ *
+ * - {@link org.dataconservancy.bagit.rules.ExpressionMatcher#match(Expression, Expression)}
+ *
+ *
+ */
+public class ExpressionMatcher {
+
+ /**
+ * Convenience reference to a {@code BoundToken} that matches zero or more characters (i.e. '*').
+ * See {@link #zero_plus} for the {@code char} analog.
+ */
+ private static final BoundToken ZERO_OR_MORE = new BoundToken(Token.ZERO_OR_MORE_CHARACTERS,
+ Token.ZERO_OR_MORE_CHARACTERS.getTokenString());
+
+ /**
+ * Convenience reference to a {@code BoundToken} that matches exactly one character (i.e. '?').
+ * See {@link #exactly_one} for the {@code char} analog.
+ */
+ private static final BoundToken EXACTLY_ONE = new BoundToken(Token.EXACTLY_ONE_CHARACTER,
+ Token.EXACTLY_ONE_CHARACTER.getTokenString());
+
+ /**
+ * The {@code char} analog of {@link #EXACTLY_ONE}. Assigned once, by the constructor, from
+ * {@code EXACTLY_ONE.asChar()}; the character-level matching methods compare pattern characters against it.
+ */
+ private final char exactly_one;
+
+ /**
+ * The {@code char} analog of {@link #ZERO_OR_MORE}. Assigned once, by the constructor, from
+ * {@code ZERO_OR_MORE.asChar()}; the character-level matching methods compare pattern characters against it.
+ */
+ private final char zero_plus;
+
+ /**
+  * Constructs a new instance of a matcher, caching the {@code char} form of the '?' and '*' tokens.
+  * TODO: probably could be private and methods be made static.
+  *
+  * @throws RuntimeException if {@code isSingleChar()} is false for either {@link #EXACTLY_ONE} or
+  *                          {@link #ZERO_OR_MORE}
+  */
+ ExpressionMatcher() {
+     // Reject unsupported tokens up front, then cache the char form of each one.
+     if (!EXACTLY_ONE.isSingleChar()) {
+         throw new RuntimeException("Implementation doesn't handle multi-character token: " +
+                 Token.EXACTLY_ONE_CHARACTER);
+     }
+     exactly_one = EXACTLY_ONE.asChar();
+
+     if (!ZERO_OR_MORE.isSingleChar()) {
+         throw new RuntimeException("Implementation doesn't handle multi-character token: " +
+                 Token.ZERO_OR_MORE_CHARACTERS);
+     }
+     zero_plus = ZERO_OR_MORE.asChar();
+ }
+
+ /**
+  * Decides whether {@code path} is matched by {@code pattern}, comparing the two one path segment ("directory") at
+  * a time as described {@link org.dataconservancy.bagit.rules.ExpressionMatcher above}. This is the main entry
+  * point into the pattern matching logic.
+  *
+  * @param pattern the pattern meant to match a path
+  * @param path the path to match against the pattern; it must consist only of literal and path separator tokens
+  * @return true if the pattern matches; false if it does not, if {@code path} contains tokens other than literals
+  *         and path separators, or if the pattern is deeper than the path
+  */
+ boolean match(Expression pattern, Expression path) {
+     // A path is made up of separators and literals only, and a pattern deeper than the path cannot match.
+     // (Both rejections probably should be an IAE rather than a quiet 'false'.)
+     if (!isPath(path.getTokens()) || pattern.depth() > path.depth()) {
+         return false;
+     }
+
+     if (pattern.depth() != path.depth()) {
+         // The pattern is shallower than the path: start at depth 0 on both sides and anchor on the first
+         // pattern segment that contains a literal.
+         return matchPathSegment(pattern, path, 0, 0, nextLiteral(pattern, 0));
+     }
+
+     // Equal depth, so segment N of the pattern lines up with segment N of the path. Every pair is evaluated,
+     // even after a mismatch has already been seen.
+     boolean allSegmentsMatch = true;
+     for (int depth = 0; depth <= pattern.depth(); depth++) {
+         if (!match(pattern.getPathSegment(depth), path.getPathSegment(depth))) {
+             allSegmentsMatch = false;
+         }
+     }
+
+     return allSegmentsMatch;
+ }
+
+ /**
+  * Recursively matches the path segments of {@code path}, from {@code pathDepth} onward, against the segments of
+  * {@code pattern}, from {@code patternDepth} onward. Pattern segments that contain a literal act as anchors: the
+  * first path segment matching the anchor is located, and every path segment before it must match the current
+  * pattern segment.
+  *
+  * @param pattern the expression containing matching tokens (i.e. pattern semantics)
+  * @param path the expression containing only literals or path separators (i.e. path semantics)
+  * @param patternDepth the depth to begin matching the pattern
+  * @param pathDepth the depth to begin matching the path
+  * @param nextLiteral the depth of the next pattern segment that contains a literal, or -1 if it doesn't exist
+  * @return true if all of the segments (starting from pathDepth) in the path can be matched in the pattern (starting
+  *         from patternDepth)
+  */
+ private boolean matchPathSegment(Expression pattern, Expression path, int patternDepth, int pathDepth,
+         int nextLiteral) {
+
+     if (nextLiteral == -1) {
+         // No literal anchors remain: every remaining path segment has to match the current pattern segment.
+         boolean remainderMatches = true;
+         for (int depth = pathDepth; depth <= path.depth(); depth++) {
+             if (!match(pattern.getPathSegment(patternDepth), path.getPathSegment(depth))) {
+                 remainderMatches = false;
+             }
+         }
+         return remainderMatches;
+     }
+
+     // Locate the first path segment that matches the pattern segment containing literals.
+     final int anchorDepth = nextMatch(path, pathDepth, pattern.getPathSegment(nextLiteral));
+     if (anchorDepth == -1) {
+         return false;
+     }
+
+     // Everything between the starting depth and the anchor must match the current pattern segment.
+     boolean gapMatches = true;
+     for (int depth = pathDepth; depth < anchorDepth; depth++) {
+         if (!match(pattern.getPathSegment(patternDepth), path.getPathSegment(depth))) {
+             gapMatches = false;
+         }
+     }
+     if (!gapMatches) {
+         return false;
+     }
+
+     // Continue from the anchor with the next pattern segment and the next literal-bearing segment.
+     return matchPathSegment(pattern, path, patternDepth + 1, anchorDepth, nextLiteral(pattern, nextLiteral + 1));
+ }
+
+ /**
+  * Finds the first path segment of {@code pattern}, at or after {@code depth}, that contains at least one literal
+  * token. Useful for finding the depth of path segment 'Foo??.java' in the pattern expression
+  * '**&#47;Foo??.java'.
+  *
+  * @param pattern an expression with pattern semantics
+  * @param depth the depth to begin searching from
+  * @return the index of the next path segment (i.e. depth) that contains literals, or -1 if not found
+  */
+ int nextLiteral(Expression pattern, int depth) {
+     if (depth <= pattern.depth()) {
+         int candidate = depth;
+         while (candidate <= pattern.depth()) {
+             if (containsLiterals(pattern.getPathSegment(candidate))) {
+                 return candidate;
+             }
+             candidate++;
+         }
+     }
+
+     return -1;
+ }
+
+ /**
+  * Attempts to match every path segment starting from {@code path.getPathSegment(depth)} against the
+  * {@code pattern}, stopping at the first segment that matches. The {@code path} is an {@code Expression} with
+  * path semantics (i.e. only containing literals and path separators).
+  *
+  * @param path an Expression with path semantics
+  * @param depth the depth of the expression to begin matching from
+  * @param pattern the tokens of the single pattern segment each path segment of {@code path} is matched against
+  * @return the index of the first path segment (i.e. depth) that matched {@code pattern}, or -1 if no match
+  */
+ int nextMatch(Expression path, int depth, List<BoundToken> pattern) {
+     for (int i = depth; i <= path.depth(); i++) {
+         if (match(pattern, path.getPathSegment(i))) {
+             return i;
+         }
+     }
+
+     return -1;
+ }
+
+ /**
+  * Expected input are two Lists of BoundTokens. Each List is expected to be a path segment; that is, a List
+  * will contain all BoundTokens between two consecutive path separators, not including the separators. Therefore
+  * the path segment will not ever contain a path separator ('/').
+  *
+  * Essentially this method is evaluating a pattern that may contain literals, '*', and '?' against a
+  * string of literals. Trivial patterns are answered directly; anything else is flattened to character sequences
+  * and handed to the recursive character-level matcher.
+  *
+  * @param patternPathSegment the pattern
+  * @param pathPathSegment the string (i.e. path) to match the pattern against
+  * @return true if the pattern matches the path
+  */
+ boolean match(List<BoundToken> patternPathSegment, List<BoundToken> pathPathSegment) {
+
+     // Short-circuit cases, tested in this order:
+     //   - the pattern is a lone '*': it doesn't matter what the path segment has, all tokens match
+     //   - the pattern is a directory match ('**'): likewise; this also guards the character-level matcher
+     //     from having to handle '**' tokens
+     //   - the pattern is a lone '?' and the path segment holds a single token: the single token matches
+     if (isZeroOrMore(patternPathSegment)
+             || isDirectoryMatchToken(patternPathSegment)
+             || (pathPathSegment.size() == 1 && isExactlyOne(patternPathSegment))) {
+         return true;
+     }
+
+     // A pattern that is all literals matches only an equal sequence of tokens.
+     if (allLiterals(patternPathSegment)) {
+         return tokenEquals(patternPathSegment, pathPathSegment);
+     }
+
+     // Otherwise, we have a multiple-token pattern that contains a mixture of literals
+     // and at least one of '*' or '?'
+     final CharSequence pattern = toCharSeq(patternPathSegment);
+     final CharSequence path = toCharSeq(pathPathSegment);
+
+     // Start at the beginning of both sequences, with the first token and first literal of the pattern located.
+     return match(pattern, path, 0, findNextToken(pattern, 0), findNextLiteral(pattern, 0), 0);
+ }
+
+ /**
+ * A recursive method for matching a {@code path} against a {@code pattern}. The method terminates when there are
+ * no more literals or tokens to be matched, or as soon as it determines a match isn't possible and returns early.
+ * N.B. this method cannot handle a directory matching token: '**'. It is expected that the caller
+ * has filtered these tokens out (see {@link #match(java.util.List, java.util.List)} and its
+ * {@link #isDirectoryMatchToken(java.util.List)} check).
+ *
+ * Developers, when reading this implementation, keep in mind that anchors are always indexes into the {@code path},
+ * while {@code tokenIndex} and {@code literalIndex} are always indexes into {@code pattern}. The first major
+ * decision made is whether the method is attempting to match a token (e.g. '?' in "Foo??.java") or match a literal
+ * (e.g. "Foo", ".java" in "Foo??.java").
+ *
+ *
+ * When matching a token, the first decision to make is whether you are going to match forward from the current
+ * token, or work backward from the end of the pattern. When matching a literal, the objective is to determine the
+ * anchors of the literal in the path and attempt to match it against the pattern.
+ *
+ * Both {@code tokenIndex} and {@code literalIndex} use {@code Integer.MAX_VALUE} to mean "none left"; that is the
+ * value {@link #findNextToken(CharSequence, int)} and {@link #findNextLiteral(CharSequence, int)} return when
+ * nothing is found. Right anchors obtained from {@link #matchNextLiteral(CharSequence, int, CharSequence)} use
+ * {@code Integer.MIN_VALUE} for "not found".
+ *
+ * @param pattern the pattern to match against
+ * @param path the path to match
+ * @param fPathIndex the index into the {@code path} up to which the path has already been matched
+ * @param tokenIndex the index into {@code pattern} of the next token to be matched
+ * @param literalIndex the index into the {@code pattern} of the next literal to be matched
+ * @param leftAnchor overwritten with {@code fPathIndex} before it is read, both when matching a token and when
+ * matching a literal between anchors; TODO remove
+ * @return true if {@code path} matches {@code pattern}
+ */
+ private boolean match(CharSequence pattern, CharSequence path, int fPathIndex, int tokenIndex, int literalIndex, int leftAnchor) {
+ // Index description:
+ // - fPathIndex, left and right anchors are always indexes in the path
+ // - token and literal are always indexes in the pattern.
+
+
+ int rightAnchor = Integer.MIN_VALUE;
+
+ if (tokenIndex == Integer.MAX_VALUE && literalIndex == Integer.MAX_VALUE) {
+ // the pattern is exhausted. NOTE(review): fPathIndex is not compared with path.length() here, so any
+ // unconsumed tail of the path is accepted - confirm that is intended.
+ return true;
+ }
+
+ if (tokenIndex < literalIndex) {
+
+ // We are matching a token (because tokenIndex < literalIndex)
+ //
+ // If we are matching the last token in the pattern, we work backward in the path.
+ // If we are matching a token, and there are still more tokens left, we work forward in the path.
+ //
+ // - Find the left and right anchors in the path.
+ // - Find right anchor
+ // - Find the next literal in the pattern (using the literalIndex, and the [end of string|next token index])
+ // - Match that literal in the path (from offset fPathIndex)
+ // - Set the right anchor at the start of the literal.
+ // - Find left anchor
+ // - Equal to the forward path index (fPathIndex)
+ //
+ // - If the token is a '*', we match.
+ // - If the token is a '?', and rightAnchor - leftAnchor == 1, we match.
+ //
+ // - If we match:
+ // - set the fPathIndex to the rightAnchor (because fPathIndex keeps track of what we've matched in the path)
+ // - set the next token index (or Integer.MAX_VALUE if the pattern is exhausted, or out of tokens)
+ // - leave literalIndex alone, because we didn't match a literal this go-around, we matched a token.
+
+ // Find the right anchor.
+ int nextTokenIndex = findNextToken(pattern, tokenIndex + 1);
+
+ leftAnchor = fPathIndex;
+
+ CharSequence literal = null;
+
+ // NOTE(review): both operands of the && below test nextTokenIndex; the second was presumably meant to test
+ // literalIndex. When literalIndex is Integer.MAX_VALUE (no literal left) the subSequence call in this
+ // branch receives out-of-range indexes - confirm and fix.
+ if (nextTokenIndex != Integer.MAX_VALUE && nextTokenIndex != Integer.MAX_VALUE) {
+ // we are not at the last token, work forward
+ // in the case of consecutive tokens (e.g. "??"), the literalIndex will be greater than the nextTokenIndex
+ int literalLen = Math.max(nextTokenIndex - literalIndex, 1);
+ // NOTE(review): literalLen is at least 1 because of Math.max above, so the "" alternative is never taken.
+ literal = (literalLen <= 0) ? "" : pattern.subSequence(literalIndex, literalIndex + literalLen);
+
+ if ((rightAnchor = matchNextLiteral(path, fPathIndex, literal)) == Integer.MIN_VALUE) {
+ // we didn't match the literal in the path, so we won't match
+ return false;
+ }
+ } else {
+ // we are at the last token, work backward from the end of the pattern by matching the literal at
+ // the end of the pattern, then checking the remaining characters in the path with the pattern token
+
+ // the special case is if the token we are matching is the last character of the pattern, in which
+ // case there won't be a literal to match. in this case, the right anchor will be set to
+ // the end of the path.
+
+ if (tokenIndex == pattern.length() - 1) {
+ rightAnchor = path.length();
+ } else {
+ literal = pattern.subSequence(tokenIndex + 1, pattern.length());
+
+ // if we don't match the literal in the path, then we don't match
+ if ((rightAnchor = matchNextLiteral(path, fPathIndex, literal)) == Integer.MIN_VALUE) {
+ return false;
+ }
+ }
+ }
+
+ // - If the token is a '*', we match.
+ // - If the token is a '?', and rightAnchor - leftAnchor == 1, we match.
+
+ // if the next token is inside of the right anchor, we have multiple tokens (e.g. '??') in a row.
+ // NOTE(review): nextTokenIndex is an index into the pattern while rightAnchor is an index into the path.
+ if (pattern.charAt(tokenIndex) == exactly_one) {
+ if (nextTokenIndex < rightAnchor) {
+ // the run of tokens up to the next literal must be all '?' (the literal '?' is hard-coded here rather
+ // than using exactly_one); consume one path character for the current '?' and move to the next token
+ if (pattern.subSequence(tokenIndex, literalIndex).chars().allMatch(c -> ((char) c) == '?')) {
+ fPathIndex = ++leftAnchor;
+ tokenIndex = findNextToken(pattern, tokenIndex + 1);
+ return match(pattern, path, fPathIndex, tokenIndex, literalIndex, leftAnchor);
+ } else {
+ return false;
+ }
+ }
+
+ // a single '?' must account for exactly one path character between the anchors
+ if (rightAnchor - leftAnchor == 1) {
+ fPathIndex = rightAnchor;
+ tokenIndex = findNextToken(pattern, tokenIndex + 1);
+ return match(pattern, path, fPathIndex, tokenIndex, literalIndex, leftAnchor);
+ } else {
+ return false;
+ }
+ }
+
+ if (pattern.charAt(tokenIndex) == zero_plus) { //||
+ //pattern.charAt(tokenIndex) == exactly_one && (rightAnchor - leftAnchor == 1) // ) {
+ // || ((nextTokenIndex < rightAnchor) && pattern.subSequence(tokenIndex, literalIndex).chars().allMatch(c -> ((char) c) == '?'))) {
+
+ // - If we match:
+ // - set the fPathIndex to the rightAnchor (because fPathIndex keeps track of what we've matched in the path)
+ // - set the next token index (or Integer.MAX_VALUE if the pattern is exhausted, or out of tokens)
+ // - leave literalIndex alone, because we didn't match a literal this go-around, we matched a token.
+
+ fPathIndex = rightAnchor;
+ tokenIndex = findNextToken(pattern, tokenIndex + 1);
+
+ return match(pattern, path, fPathIndex, tokenIndex, literalIndex, leftAnchor);
+
+ } else {
+ return false;
+ }
+
+ } else if (literalIndex < tokenIndex) {
+
+ // We are matching a literal (because literalIndex < tokenIndex)
+ CharSequence literalToMatch;
+
+ // NOTE(review): findNextLiteral reports "not found" as Integer.MAX_VALUE, never Integer.MIN_VALUE, so
+ // this first branch looks unreachable from the callers in this class - confirm.
+ if (literalIndex == Integer.MIN_VALUE) {
+ // we're out of literals, so we just have to match that last token
+ rightAnchor = path.length();
+ if (pattern.charAt(tokenIndex) == zero_plus ||
+ pattern.charAt(tokenIndex) == exactly_one && rightAnchor - leftAnchor == 1) {
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ leftAnchor = fPathIndex;
+
+ // if we can't find the right anchor, then we can't match.
+ if ((rightAnchor = findRightAnchor(pattern, path, fPathIndex, literalIndex)) == Integer.MAX_VALUE) {
+ return false;
+ }
+
+ // the literal runs up to the next token, or to the end of the pattern when tokenIndex is Integer.MAX_VALUE
+ literalToMatch = pattern.subSequence(literalIndex, Math.min(tokenIndex, pattern.length()));
+ }
+
+ // does the literal in the pattern match the literal between the anchors?
+ if (path.subSequence(leftAnchor, rightAnchor).equals(literalToMatch)) {
+
+ // - If we match:
+ // - set the fPathIndex to the rightAnchor (because fPathIndex keeps track of what we've matched in the path)
+ // - leave the next token index alone, because we didn't match a token this go around, we matched a literal
+ // - set the literal index to the beginning of the next literal
+
+ fPathIndex = rightAnchor;
+ literalIndex = findNextLiteral(pattern, literalIndex + literalToMatch.length());
+
+ return match(pattern, path, fPathIndex, tokenIndex, literalIndex, leftAnchor);
+ }
+ }
+ // either the literal did not match between the anchors, or tokenIndex == literalIndex
+ return false;
+ }
+
+ /**
+  * Scans {@code pattern} from {@code offset} for the first matching token, i.e. the first character equal to
+  * {@link #exactly_one} or {@link #zero_plus}.
+  *
+  * Remember that even though paths and patterns are both instances of {@link Expression}, the semantics of
+  * a 'path' are that it contains only literals and path separators, and differs from a 'pattern' which can contain
+  * matching tokens like '*' and '?'.
+  *
+  * @param pattern the pattern to search through
+  * @param offset the offset into pattern to start searching from
+  * @return the offset in the pattern with the next occurrence of a token, or {@code Integer.MAX_VALUE} if not
+  *         found, including when {@code offset} is negative or not less than the pattern length
+  * @see #findNextLiteral(CharSequence, int)
+  */
+ int findNextToken(CharSequence pattern, int offset) {
+     // A negative offset never matches; an offset at or past the end simply leaves nothing to scan.
+     if (offset >= 0) {
+         for (int idx = offset; idx < pattern.length(); idx++) {
+             final char current = pattern.charAt(idx);
+             if (current == exactly_one || current == zero_plus) {
+                 return idx;
+             }
+         }
+     }
+
+     return Integer.MAX_VALUE;
+ }
+
+ /**
+  * Scans {@code pattern} from {@code offset} for the first literal character, i.e. the first character that is
+  * neither {@link #exactly_one} nor {@link #zero_plus}.
+  *
+  * Remember that even though paths and patterns are both instances of {@link Expression}, the semantics of
+  * a 'path' are that it contains only literals and path separators, and differs from a 'pattern' which can contain
+  * matching tokens like '*' and '?'.
+  *
+  * @param pattern the pattern to search through
+  * @param offset the offset into pattern to start searching from
+  * @return the offset in the pattern with the next occurrence of a literal, or {@code Integer.MAX_VALUE} if not
+  *         found, including when {@code offset} is negative or not less than the pattern length
+  * @see #findNextToken(CharSequence, int)
+  */
+ int findNextLiteral(CharSequence pattern, int offset) {
+     // A negative offset never matches; an offset at or past the end simply leaves nothing to scan.
+     if (offset >= 0) {
+         for (int idx = offset; idx < pattern.length(); idx++) {
+             final char current = pattern.charAt(idx);
+             final boolean isToken = current == exactly_one || current == zero_plus;
+             if (!isToken) {
+                 return idx;
+             }
+         }
+     }
+
+     return Integer.MAX_VALUE;
+ }
+
+ /**
+  * Finds the first occurrence of {@code literal} in {@code path} at or after {@code offset}, and returns the
+  * offset in {@code path} where that occurrence begins.
+  *
+  * The search is delegated to {@link String#indexOf(String, int)}. The previous hand-rolled scan restarted the
+  * literal on a mismatch without re-examining the current character, so it missed occurrences that overlap a
+  * failed partial match (e.g. "ab" in "aab", or "aab" in "aaab").
+  *
+  * @param path the path being searched for a literal string
+  * @param offset the offset in path to start searching from
+  * @param literal the literal string to find; an empty literal is found at {@code offset}
+  * @return the offset of {@code literal} in {@code path}, or {@code Integer.MIN_VALUE} if not found or if
+  *         {@code offset} is negative or not less than the length of {@code path}
+  */
+ int matchNextLiteral(CharSequence path, int offset, CharSequence literal) {
+     // An out-of-range offset (which includes every offset into an empty path) can never match.
+     if (offset < 0 || offset >= path.length()) {
+         return Integer.MIN_VALUE;
+     }
+
+     final int found = path.toString().indexOf(literal.toString(), offset);
+
+     // indexOf reports "not found" with a negative value; callers of this method expect Integer.MIN_VALUE.
+     return found < 0 ? Integer.MIN_VALUE : found;
+ }
+
+ /**
+  * Returns true if the path segment contains a single {@link Token#ZERO_OR_MORE_CHARACTERS} token.
+  *
+  * @param pathSegment the path segment containing arbitrary tokens
+  * @return true if the only token in the path segment is a {@code ZERO_OR_MORE_CHARACTERS} token.
+  */
+ boolean isZeroOrMore(List<BoundToken> pathSegment) {
+     return pathSegment.size() == 1 && pathSegment.get(0).token == Token.ZERO_OR_MORE_CHARACTERS;
+ }
+
+ /**
+  * Returns true if the path segment contains a single {@link Token#DIRECTORY} token, or exactly two
+  * {@link Token#ZERO_OR_MORE_CHARACTERS} tokens.
+  *
+  * @param pathSegment the path segment containing arbitrary tokens
+  * @return true if the path segment will match a directory
+  */
+ boolean isDirectoryMatchToken(List<BoundToken> pathSegment) {
+     // Either one explicit DIRECTORY token ...
+     if (pathSegment.size() == 1) {
+         return pathSegment.get(0).token == Token.DIRECTORY;
+     }
+
+     // ... or two consecutive '*' tokens, which together spell the same thing.
+     return pathSegment.size() == 2
+             && pathSegment.get(0).token == Token.ZERO_OR_MORE_CHARACTERS
+             && pathSegment.get(1).token == Token.ZERO_OR_MORE_CHARACTERS;
+ }
+
+ /**
+  * Returns true if the path segment contains a single token, and the token is a {@link Token#EXACTLY_ONE_CHARACTER}.
+  *
+  * @param pathSegment the path segment containing arbitrary tokens
+  * @return true if the single token in the path segment is a {@code EXACTLY_ONE_CHARACTER} token.
+  */
+ boolean isExactlyOne(List<BoundToken> pathSegment) {
+     return pathSegment.size() == 1 && pathSegment.get(0).token == Token.EXACTLY_ONE_CHARACTER;
+ }
+
+ /**
+  * Answers a {@code CharSequence} that contains the {@code bound} value of each token, in the same order, as
+  * supplied by {@code tokens}.
+  *
+  * @param tokens a List of arbitrary tokens
+  * @return the sequence of token values; empty when {@code tokens} is empty
+  */
+ private CharSequence toCharSeq(List<BoundToken> tokens) {
+     final StringBuilder joined = new StringBuilder();
+     for (BoundToken bt : tokens) {
+         joined.append(bt.bound);
+     }
+     return joined.toString();
+ }
+
+ /**
+  * Returns true if each list is equal in size, and contains
+  * {@link org.dataconservancy.bagit.rules.BoundToken#equals(Object) equal} tokens, in the same order.
+  *
+  * @param one the first list of arbitrary tokens
+  * @param two the second list of arbitrary tokens
+  * @return true if the lists contain equal content
+  */
+ private boolean tokenEquals(List<BoundToken> one, List<BoundToken> two) {
+     final int size = one.size();
+     if (size != two.size()) {
+         return false;
+     }
+
+     // Compare position by position, stopping at the first pair that differs.
+     for (int i = 0; i < size; i++) {
+         if (!one.get(i).equals(two.get(i))) {
+             return false;
+         }
+     }
+
+     return true;
+ }
+
+ /**
+  * Locates, in {@code path}, the end of the literal that starts at {@code patternOff} in {@code pattern}. The
+  * literal extends from {@code patternOff} to the next token in the pattern, or to the end of the pattern if no
+  * token follows.
+  *
+  * @param pattern the pattern that supplies the literal
+  * @param path the path in which the literal is searched for
+  * @param pathOff the offset into {@code path} to start searching from
+  * @param patternOff the offset into {@code pattern} where the literal begins
+  * @return the offset in {@code path} just past the matched literal, or {@code Integer.MAX_VALUE} if the literal is
+  *         empty or does not occur in {@code path}
+  */
+ int findRightAnchor(CharSequence pattern, CharSequence path, int pathOff, int patternOff) {
+     // The next token index (or the end of the pattern) marks the end of the literal.
+     // Assumes that patternOff isn't positioned at a token, and that patternOff + 1 isn't a token either
+     final int literalEnd = Math.min(findNextToken(pattern, patternOff), pattern.length());
+     final CharSequence literal = pattern.subSequence(patternOff, literalEnd);
+
+     // An empty literal cannot provide an anchor.
+     if (literal.length() == 0) {
+         return Integer.MAX_VALUE;
+     }
+
+     // Find the literal in the path; the right anchor sits at the end of the literal.
+     final int literalStart = matchNextLiteral(path, pathOff, literal);
+     return literalStart == Integer.MIN_VALUE ? Integer.MAX_VALUE : literalStart + literal.length();
+ }
+
+ /**
+  * Returns true if any of the tokens in {@code pathTokens} represent a literal.
+  *
+  * @param pathTokens the tokens to check, normally these are the tokens from a single path segment.
+  * @return true if any of the supplied token is a literal; false for an empty list
+  */
+ boolean containsLiterals(List<BoundToken> pathTokens) {
+     // anyMatch stops at the first literal instead of counting every one.
+     return pathTokens.stream().anyMatch(bt -> bt.token == Token.LITERAL);
+ }
+
+ /**
+  * Returns true if all of the tokens in {@code pathTokens} are literal.
+  *
+  * @param pathTokens the tokens to check; normally these are tokens from a single path segment.
+  * @return true if every supplied token is a literal; also true for an empty list
+  */
+ boolean allLiterals(List<BoundToken> pathTokens) {
+     // allMatch stops at the first non-literal instead of counting every literal.
+     return pathTokens.stream().allMatch(bt -> bt.token == Token.LITERAL);
+ }
+
+ /**
+  * Returns true if all of the tokens in {@code pathTokens} represent a literal or
+  * path separator. This method is used to determine if a List of tokens represents a path or a pattern.
+  *
+  * For example, tokens for {@code /foo/bar/baz.txt} would return {@code true}; tokens for {@code Foo??.java} would
+  * return {@code false}, because the '?' characters are not a literal or path separator token.
+  *
+  * @param pathTokens the tokens to check, normally these are tokens for a path or pattern
+  * @return true if the list of tokens contains only literals or separators; also true for an empty list
+  */
+ boolean isPath(List<BoundToken> pathTokens) {
+     // allMatch stops at the first token that is neither a literal nor a separator.
+     return pathTokens.stream().allMatch(
+             bt -> bt.token == Token.LITERAL || bt.token == Token.PATH_SEPARATOR);
+ }
+
+}
diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Token.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Token.java
new file mode 100644
index 00000000..dc6b96b4
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Token.java
@@ -0,0 +1,171 @@
+/*
+ *
+ * * Copyright 2015 Johns Hopkins University
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at
+ * *
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing, software
+ * * distributed under the License is distributed on an "AS IS" BASIS,
+ * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * * See the License for the specific language governing permissions and
+ * * limitations under the License.
+ *
+ */
+
+package org.dataconservancy.bagit.rules;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import static org.dataconservancy.bagit.rules.Message.ERR_NULL;
+
+/**
+ * Tokens are the strings that make up a location expression. Location expressions are patterns that are matched
+ * against paths. Location expressions are inspired by Apache Ant file pattern matching.
+ */
+enum Token {
+
+    /**
+     * A token matching exactly one character in an expression.
+     */
+    EXACTLY_ONE_CHARACTER("?"),
+
+    /**
+     * A token that will match multiple directory levels in an expression.
+     */
+    DIRECTORY("**"),
+
+    /**
+     * A token matching zero or more characters in an expression. Must always be defined sometime after
+     * {@link #DIRECTORY}
+     */
+    ZERO_OR_MORE_CHARACTERS("*"),
+
+
+    /**
+     * A token that separates path segments in an expression.
+     */
+    PATH_SEPARATOR("/"),
+
+    /**
+     * A special token with a {@code null} token string. Must always be defined last
+     */
+    LITERAL();
+
+    private static final String ERR_MULTIPLE_TOKENS = "Candidate sequence '%s' contains multiple tokens. " +
+            "Try splitting up the tokens and submitting the tokens one at a time.";
+
+    /**
+     * String representation of the token, if there is one. Never reassigned after construction.
+     */
+    private final String tokenString;
+
+    /**
+     * Construct a Token with no string representation. Currently reserved for {@link #LITERAL} tokens.
+     */
+    private Token() {
+        this.tokenString = null;
+    }
+
+    /**
+     * Construct a token with the supplied string representation.
+     *
+     * @param tokenString the string representation of the token.
+     * @throws java.lang.IllegalArgumentException if the {@code tokenString} is {@code null}
+     */
+    private Token(String tokenString) {
+        if (tokenString == null) {
+            throw new IllegalArgumentException(String.format(ERR_NULL, "tokenString"));
+        }
+        this.tokenString = tokenString;
+    }
+
+    /**
+     * Obtain the string form of the token, may be {@code null}. {@link #LITERAL} tokens will not
+     * have a string form, because a literal is the set of characters that do not represent a token.
+     *
+     * @return the string form of the token, or {@code null} in the case of {@code LITERAL} tokens.
+     */
+    String getTokenString() {
+        return tokenString;
+    }
+
+    /**
+     * Attempts to parse a character sequence which represents a single token into a {@code BoundToken}.
+     *
+     * The candidate is compared by content, so every {@code CharSequence} implementation (not only
+     * {@code String}) is parsed the same way.
+     *
+     * @param candidate the candidate token string
+     * @return the matching {@code Token} bound to {@code candidate}; a {@link #LITERAL} if no token string matches
+     * @throws java.lang.IllegalArgumentException if {@code candidate} is {@code null} or empty, or if it contains
+     *         token characters but is not exactly one token
+     */
+    static BoundToken parse(CharSequence candidate) {
+        if (candidate == null || candidate.length() == 0) {
+            throw new IllegalArgumentException(String.format(ERR_NULL, "candidate"));
+        }
+
+        for (Token m : Token.values()) {
+            if (m.tokenString == null) {
+                // LITERAL has no string form; it is the fallback handled after the loop.
+                continue;
+            }
+
+            // String.equals(CharSequence) is false for anything that is not a String, so compare by content.
+            if (m.tokenString.contentEquals(candidate)) {
+                return new BoundToken(m, candidate.toString());
+            }
+
+            // A longer candidate that shares any character with this token's string holds several tokens
+            // (or a token mixed with literals), which isn't allowed.
+            if (candidate.length() > 1 && candidate.chars().anyMatch(c -> m.tokenString.indexOf(c) >= 0)) {
+                throw new IllegalArgumentException(String.format(ERR_MULTIPLE_TOKENS, candidate));
+            }
+        }
+
+        // No token string equalled the candidate or shared a character with it: we are left with a LITERAL.
+        return new BoundToken(Token.LITERAL, candidate.toString());
+    }
+
+    /**
+     * Maps every character of {@code candidate} to a {@code BoundToken}. A character equal to a single-character
+     * token string becomes that token; any other character becomes a {@link #LITERAL}. {@link #DIRECTORY} is
+     * never produced here; its string form appears as two consecutive {@link #ZERO_OR_MORE_CHARACTERS} tokens.
+     *
+     * @param candidate the character sequence to tokenize
+     * @return one {@code BoundToken} per character, in order
+     * @throws java.lang.IllegalArgumentException if {@code candidate} is {@code null} or empty
+     */
+    static List<BoundToken> parseString(CharSequence candidate) {
+        if (candidate == null || candidate.length() == 0) {
+            throw new IllegalArgumentException(String.format(ERR_NULL, "candidate"));
+        }
+
+        return candidate.chars().mapToObj(c -> {
+            // Cast the int to a char, and represent it as a String
+            String s = String.valueOf((char) c);
+            // Token strings are distinct, so the first token whose string equals the character wins.
+            for (Token t : Token.values()) {
+                if (s.equals(t.getTokenString())) {
+                    return new BoundToken(t, s);
+                }
+            }
+
+            // If there was no match, then we must have a LITERAL.
+            return new BoundToken(LITERAL, s);
+        }).collect(Collectors.toList());
+    }
+
+    @Override
+    public String toString() {
+        return "Token{" +
+                "tokenString='" + tokenString + '\'' +
+                '}';
+    }
+}
diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagFileRole.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagFileRole.java
new file mode 100644
index 00000000..ec33bb84
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagFileRole.java
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2015 Johns Hopkins University
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dataconservancy.bagit.support;
+
+/**
+ * Roles of tag files seen in a Bag. Roles are defined independently of concrete files because: 1) a single role may
+ * be served by multiple files; 2) names of files that fulfill these roles may change.
+ *
+ * While changing the name of the {@code bagit.txt} file is hard to imagine, it is reasonable that additional files may
+ * fulfill a particular role as the BagIt specification evolves. If the name of {@code bagit.txt} does change, it is
+ * likely that the role of a bag declaration will continue to be needed, even if it is no longer fulfilled by
+ * {@code bagit.txt}.
+ *
+ *
+ * BagIt requires that there be a bag declaration and a payload manifest. These roles are enumerated in this class as
+ * {@link #BAG_DECL} and {@link #PAYLOAD_MANIFEST}, respectively. Other roles such as a tag manifest, payload
+ * directory, and fetch file are enumerated in this class. A payload manifest role may be fulfilled by two different
+ * files, a {@code manifest-sha1.txt} file containing SHA checksums, and a {@code manifest-md5.txt} file containing MD5
+ * checksums. In the future, implementations may use SHA-256 or other algorithms. Regardless of the name of future
+ * files, their role will be enumerated in this class.
+ *
+ *
+ * The documentation for each role includes example file names from the specification; these are informative. They are
+ * meant to be examples in aiding the comprehension of what the role represents; they are not normative.
+ *
+ */
+public enum BagFileRole {
+
+ /**
+ * The bag payload (e.g. {@code data/}) directory.
+ */
+ PAYLOAD_DIRECTORY,
+
+ /**
+ * Bag payload itself (e.g. content in the {@code data/} directory).
+ */
+ PAYLOAD_CONTENT,
+
+ /**
+ * Tag file corresponding to the {@code bagit.txt} file, at the base of the bag.
+ */
+ BAG_DECL,
+
+ /**
+ * Tag file corresponding to the {@code bag-info.txt} file, at the base of the bag.
+ */
+ BAG_INFO,
+
+ /**
+ * Tag file(s) corresponding to the payload {@code manifest-<algorithm>.txt} file, at the base of the bag.
+ */
+ PAYLOAD_MANIFEST,
+
+ /**
+ * Tag file(s) corresponding to the {@code tagmanifest-<algorithm>.txt} file, at the base of the bag.
+ */
+ TAG_MANIFEST,
+
+ /**
+ * Tag file corresponding to the {@code fetch.txt} file, at the base of the bag.
+ */
+ FETCH,
+
+ /**
+ * Tag files corresponding to additional tag files, not covered by the BagIt specification.
+ */
+ OTHER_TAG
+}
diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagUri.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagUri.java
new file mode 100644
index 00000000..407cb1fb
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagUri.java
@@ -0,0 +1,174 @@
+/*
+ * Copyright 2015 Johns Hopkins University
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dataconservancy.bagit.support;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+
+/**
+ * URI scheme for addressing resources contained within a Bag. The form of a Bag URI is:
+ * {@code bag://authority/path/to/resource#optional-fragment}
+ *
+ * The Bag URI {@link #BAG_SCHEME scheme} is equal to the string '{@code bag}'; resources inside the bag are unique
+ * within the scope of a single bag. The {@code authority} component of a Bag URI is equal to the name of the Bag
+ * serialization (as discussed in BagIt
+ * section 4), minus any file name extensions. Query parameters are disallowed in Bag URIs, as they have no semantic
+ * analog in the BagIt specification.
+ *
+ * Instances of this class are immutable: all state is assigned once, in the constructor.
+ * @see "BagIt Draft Specification version 0.97, expires December 25, 2015"
+ * @see "RFC 2396: Uniform Resource Identifiers (URI): Generic Syntax"
+ * @see "Data Conservancy BagIt Profile 1.0, section X"
+ */
+public class BagUri {
+
+    /**
+     * The value of the Bag URI {@code scheme} (RFC 2396 sec. 3)
+     */
+    public static final String BAG_SCHEME = "bag";
+
+    // Characters that are reserved (i.e. illegal) for the URI authority portion (RFC 2396 sec. 3.2). The check is
+    // not currently implemented; the constant is kept, commented out, for reference:
+    //
+    // private static final char[] RESERVED_AUTHORITY_CHARACTERS = new char[] { ';', ':', '@', '?', '/' };
+
+    private static final String ERR_NULL = "Argument '%s' must not be null or empty.";
+
+    private static final String ERR_INVALID_SCHEME = "Invalid scheme '%s' for " + BagUri.class.getName() + ": scheme " +
+            "must be equal to '" + BAG_SCHEME + "'";
+
+    private static final String ERR_PARSE_URI = "Unable to parse URI string '%s': %s";
+
+    private static final String ERR_CREATE_URI = "Unable to construct a URI with scheme '%s', authority '%s', path '%s', and fragment '%s': %s";
+
+    /**
+     * Internal representation of the BagUri as a java.net.URI.
+     */
+    private final URI bagUri;
+
+    /**
+     * The authority string (never {@code null}). It semantically aligns with, and should be equal to, the name
+     * of the bag. We keep this state for our own equals() and hashCode() implementation.
+     */
+    private final String authority;
+
+    /**
+     * The path string (may be {@code null}). We keep this state for our own equals() and hashCode() implementation.
+     */
+    private final String path;
+
+    /**
+     * The fragment string (may be {@code null}). We keep this state for our own equals() and hashCode()
+     * implementation.
+     */
+    private final String fragment;
+
+    /**
+     * Constructs a new Bag URI, which addresses a resource in a Bag named by {@code authority}.
+     *
+     * Exemplars:
+     * - The {@code path} "/data" with {@code authority} "mybag" would address the data directory inside
+     *   of a Bag named 'mybag': {@code bag://mybag/data}.
+     * - The {@code path} "/bag-info.txt" would identify the Bag metadata file: {@code bag://mybag/bag-info.txt}.
+     * - The {@code path} "/data/dataobject.rdf" with a {@code fragment} "obj-3" would identify a resource
+     *   "{@code obj-3}" inside of the payload file {@code data/dataobject.rdf}:
+     *   {@code bag://mybag/data/dataobject.rdf#obj-3}.
+     *
+     * The components are handed to {@link URI#URI(String, String, String, String, String)} without a query, so a
+     * non-empty {@code path} must begin with '/'; a relative path is rejected.
+     *
+     * @param authority the authority portion of the URI, which is expected to be the Bag name. Must not be
+     *                  {@code null} or blank.
+     * @param path the path to the resource within the Bag; may be {@code null}
+     * @param fragment an optional fragment identifier, useful for referencing individual resources within a file
+     * @throws java.lang.IllegalArgumentException if any required parameters are {@code null} or invalid URI components.
+     */
+    public BagUri(String authority, String path, String fragment) {
+        if (authority == null || authority.trim().length() == 0) {
+            throw new IllegalArgumentException(String.format(ERR_NULL, "authority"));
+        }
+        try {
+            bagUri = new URI(BAG_SCHEME, authority, path, null, fragment);
+        } catch (URISyntaxException e) {
+            throw new IllegalArgumentException(
+                    String.format(ERR_CREATE_URI, BAG_SCHEME, authority, path, fragment, e.getMessage()), e);
+        }
+
+        this.authority = authority;
+        this.path = path;
+        this.fragment = fragment;
+    }
+
+    /**
+     * Obtains the authority component of this Bag URI, as reported by the underlying {@code java.net.URI}.
+     * @return the authority component
+     */
+    public String getAuthority() {
+        return bagUri.getAuthority();
+    }
+
+    /**
+     * Obtains the fragment component of this Bag URI, as reported by the underlying {@code java.net.URI}.
+     * @return the fragment component, or {@code null} if this Bag URI has no fragment
+     */
+    public String getFragment() {
+        return bagUri.getFragment();
+    }
+
+    /**
+     * Obtains the path component of this Bag URI, as reported by the underlying {@code java.net.URI}.
+     * @return the path component
+     */
+    public String getPath() {
+        return bagUri.getPath();
+    }
+
+    /**
+     * Exposes this Bag URI as a {@code java.net.URI}.
+     *
+     * @return the {@code java.net.URI} built from the scheme, authority, path and fragment given at construction
+     */
+    public URI asUri() {
+        return bagUri;
+    }
+
+    /**
+     * {@inheritDoc}
+     *
+     * Instances of this class are considered equal if their authority, path, and fragment components are equal.
+     *
+     * @param o the object to determine equivalence against.
+     * @return {@code true} if the instances are equal, {@code false} otherwise
+     */
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        BagUri that = (BagUri) o;
+        return authority.equals(that.authority)
+                && (path != null ? path.equals(that.path) : that.path == null)
+                && (fragment != null ? fragment.equals(that.fragment) : that.fragment == null);
+    }
+
+    @Override
+    public int hashCode() {
+        int result = authority.hashCode();
+        result = 31 * result + (path != null ? path.hashCode() : 0);
+        result = 31 * result + (fragment != null ? fragment.hashCode() : 0);
+        return result;
+    }
+}
diff --git a/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/BoundTokensTestUtil.java b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/BoundTokensTestUtil.java
new file mode 100644
index 00000000..6cfae0b4
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/BoundTokensTestUtil.java
@@ -0,0 +1,119 @@
+/*
+ *
+ * * Copyright 2015 Johns Hopkins University
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at
+ * *
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing, software
+ * * distributed under the License is distributed on an "AS IS" BASIS,
+ * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * * See the License for the specific language governing permissions and
+ * * limitations under the License.
+ *
+ */
+
+package org.dataconservancy.bagit.rules;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * BoundTokens and assertions that are shared across unit tests.
+ */
+class BoundTokensTestUtil {
+
+    /**
+     * The {@code BoundToken} version of {@link Token#ZERO_OR_MORE_CHARACTERS}
+     */
+    static final BoundToken ZERO_OR_MORE = new BoundToken(Token.ZERO_OR_MORE_CHARACTERS, "*");
+
+    /**
+     * The {@code BoundToken} version of {@link Token#ZERO_OR_MORE_CHARACTERS}, in a single element List
+     */
+    static final List<BoundToken> ZERO_OR_MORE_L = Arrays.asList(
+            new BoundToken(Token.ZERO_OR_MORE_CHARACTERS, "*"));
+
+    /**
+     * The {@code BoundToken} version of {@link Token#EXACTLY_ONE_CHARACTER}
+     */
+    static final BoundToken EXACTLY_ONE = new BoundToken(Token.EXACTLY_ONE_CHARACTER, "?");
+
+    /**
+     * The {@code BoundToken} version of {@link Token#EXACTLY_ONE_CHARACTER}, in a single element List
+     */
+    static final List<BoundToken> EXACTLY_ONE_L = Arrays.asList(
+            new BoundToken(Token.EXACTLY_ONE_CHARACTER, "?"));
+
+    /**
+     * The {@code BoundToken} version of {@link Token#PATH_SEPARATOR}
+     */
+    static final BoundToken PATH_SEP = new BoundToken(Token.PATH_SEPARATOR, "/");
+
+    /**
+     * The {@code BoundToken} version of {@link Token#PATH_SEPARATOR}, in a single element List
+     */
+    static final List<BoundToken> PATH_SEP_L = Arrays.asList(
+            new BoundToken(Token.PATH_SEPARATOR, "/"));
+
+    /**
+     * The {@code BoundToken} version of {@link Token#DIRECTORY}, as a single token bound to "**"
+     */
+    static final BoundToken DIR = new BoundToken(Token.DIRECTORY, "**");
+
+    /**
+     * The {@code BoundToken} version of {@link Token#DIRECTORY}, represented as a List containing two
+     * {@link #ZERO_OR_MORE} BoundTokens.
+     */
+    static final List<BoundToken> DIR_L = Arrays.asList(ZERO_OR_MORE, ZERO_OR_MORE);
+
+    /**
+     * Convenience method for creating a {@link Token#LITERAL literal} token for each character in {@code s}. It does
+     * not evaluate the characters in {@code s} for whether or not they should actually be made literals. That is the
+     * responsibility of the developer. (For example, this method will happily make literal tokens of "*", "?", and
+     * "/", which are not allowed by {@link Token#parse(CharSequence)}.)
+     *
+     * @param s the string to represent as a List of BoundTokens
+     * @return a new, modifiable List containing LITERAL BoundTokens for each character in {@code s}
+     */
+    static List<BoundToken> literalsForString(String s) {
+        return s.chars()
+                .mapToObj(c -> new BoundToken(Token.LITERAL, String.valueOf((char) c)))
+                .collect(Collectors.toCollection(ArrayList::new));
+    }
+
+    /**
+     * Asserts that the values in the expected and actual Lists are equal. This method will assert that
+     * the lists are the same size before comparing their values.
+     *
+     * @param expected the expected List of BoundTokens
+     * @param actual the actual List of BoundTokens, normally representing a test result.
+     */
+    static void assertTokenListEquals(List<BoundToken> expected, List<BoundToken> actual) {
+        assertExpectedListCount(expected.size(), actual);
+        for (int i = 0; i < expected.size(); i++) {
+            assertEquals("Expected token: '" + expected.get(i) + "' but found '" + actual.get(i) + "'",
+                    expected.get(i), actual.get(i));
+        }
+    }
+
+    /**
+     * Asserts that the supplied list of BoundTokens has the expected count.
+     *
+     * @param expectedCount the expected number of BoundTokens in {@code actual}
+     * @param actual a List of BoundTokens, normally representing the result of a test.
+     */
+    static void assertExpectedListCount(int expectedCount, List<BoundToken> actual) {
+        assertEquals("Expected " + expectedCount + " BoundTokens, found " + actual.size() + ": " +
+                        actual.stream().map(bt -> "['" + bt.token.name() + "', '" + bt.bound + "']")
+                                .collect(Collectors.joining(", ")),
+                expectedCount, actual.size());
+    }
+}
diff --git a/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionMatcherTest.java b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionMatcherTest.java
new file mode 100644
index 00000000..152c31b9
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionMatcherTest.java
@@ -0,0 +1,499 @@
+/*
+ *
+ * * Copyright 2015 Johns Hopkins University
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at
+ * *
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing, software
+ * * distributed under the License is distributed on an "AS IS" BASIS,
+ * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * * See the License for the specific language governing permissions and
+ * * limitations under the License.
+ *
+ */
+
+package org.dataconservancy.bagit.rules;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.DIR_L;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.EXACTLY_ONE;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.ZERO_OR_MORE;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.assertTokenListEquals;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.literalsForString;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Many, many tests against various methods in the ExpressionMatcher class.
+ * Most of the test methods in this class contain multiple assertions. Normally there will be one assertion for a
+ * sanity check - an assertion that should always be true. Often there will be multiple sanity checks.
+ *
+ * Because ExpressionMatcher is package-private, it can be hard to tell what the entry points into the ExpressionMatcher
+ * class are, and this test class doesn't help you determine that. Clients of ExpressionMatcher should be calling
+ * either:
+ *
+ * - {@link org.dataconservancy.bagit.rules.ExpressionMatcher#match(Expression, Expression)}
+ * - {@link org.dataconservancy.bagit.rules.ExpressionMatcher#match(java.util.List, java.util.List)}
+ *
+ * This test class covers not only these entry point methods, but other utility methods as well.
+ *
+ */
+public class ExpressionMatcherTest {
+
+ private ExpressionMatcher underTest;
+
+ @Before
+ public void setUp() throws Exception {
+ underTest = new ExpressionMatcher(); // every test method starts from a freshly constructed matcher
+ }
+
+ /**
+  * Pattern with a leading '**' and two consecutive '?' tokens: matches only if each '?' gets its own character.
+  */
+ @Test
+ public void testMatchExpressionLeadingDirectoryAndConsecutiveExactlyOne() throws Exception {
+     // leading '**' and consecutive '??'
+     final Expression glob = new Expression("**/Foo??.java");
+
+     // expected to match: 'src/test/java' is taken by '**', and 'FooIT.java' satisfies 'Foo??.java'
+     final Expression hit = new Expression("src/test/java/FooIT.java");
+
+     // expected not to match: 'FooI.java' leaves one of the two '?' tokens without a character
+     final Expression miss = new Expression("src/test/java/FooI.java");
+
+     // sanity: a path should match itself.
+     assertTrue(underTest.match(hit, hit));
+
+     assertTrue(underTest.match(glob, hit));
+     assertFalse(underTest.match(glob, miss));
+ }
+
+ /**
+  * Attempts a match using an Expression that starts with '**' and contains a '*' matching token.
+  */
+ @Test
+ public void testMatchExpressionLeadingDirectoryAndZeroPlus() throws Exception {
+     // The pattern to match against: leading '**' and a '*'
+     Expression pattern = new Expression("**/*IT.java");
+
+     // This path should match the pattern: src/test/java matches '**' and FooIT.java matches '*IT.java'
+     Expression path = new Expression("src/test/java/FooIT.java");
+
+     // This path should not match (the path segment Bar.java will remain unmatched)
+     Expression nonMatchingPath = new Expression("src/test/java/Bar.java");
+
+     // sanity: a path should match itself
+     assertTrue(underTest.match(path, path));
+     // below, the pattern is the first argument to match() and the candidate path is the second
+     assertTrue(underTest.match(pattern, path));
+     assertFalse(underTest.match(pattern, nonMatchingPath));
+ }
+
+ /**
+  * Pattern with a leading '**' followed by a literal file name: only a path ending in that file name matches.
+  */
+ @Test
+ public void testMatchExpressionLeadingDirectory() throws Exception {
+     // leading '**', then the literal 'FooIT.java'
+     final Expression glob = new Expression("**/FooIT.java");
+
+     // expected to match: 'src/test/java' is taken by '**', and 'FooIT.java' equals the literal
+     final Expression hit = new Expression("src/test/java/FooIT.java");
+
+     // expected not to match: the last segment is 'BarIT.java'
+     final Expression miss = new Expression("src/test/java/BarIT.java");
+
+     // sanity: a path should match itself
+     assertTrue(underTest.match(hit, hit));
+
+     assertTrue(underTest.match(glob, hit));
+     assertFalse(underTest.match(glob, miss));
+ }
+
+ /**
+  * Two equal token lists made only of literals (no matching tokens, no path separators) must match each other,
+  * while a different literal list must not.
+  */
+ @Test
+ public void testMatchWithOnlyLiterals() throws Exception {
+     final List<BoundToken> expected = literalsForString("bar");
+     final List<BoundToken> candidate = literalsForString("bar");
+     assertTokenListEquals(candidate, expected);
+
+     // sanity: non-equal literal token lists should not match
+     assertFalse(underTest.match(literalsForString("foo"), candidate));
+
+     // equal literal token lists are expected to match
+     assertTrue(underTest.match(expected, candidate));
+ }
+
+ /**
+  * Verifies that a literal will not match a pattern made of a leading '*' token followed by a literal that
+  * the path does not contain. A complicated way of saying that we verify that the pattern "*IT.java" won't
+  * match "src".
+  */
+ @Test
+ public void testNoMatchBeginningZeroPlus() throws Exception {
+     // pattern: *IT.java
+     final List<BoundToken> glob = new ArrayList<>(Arrays.asList(ZERO_OR_MORE));
+     glob.addAll(literalsForString("IT.java"));
+
+     // path: src
+     final List<BoundToken> candidate = literalsForString("src");
+
+     // the literal "IT.java" required by the pattern is absent from "src"
+     assertFalse(underTest.match(glob, candidate));
+ }
+
+ /**
+  * Matches paths against a {@code List<BoundToken>} pattern that leads with a '?' matching token, ends with a
+  * '?' matching token, and has a single '?' token in the middle.
+  */
+ @Test
+ public void testLiteralsWithExactlyOne() throws Exception {
+     // pattern: "?tart?IT.jav?"
+     // (a '?' at the start, in the middle, and at the end)
+     final List<BoundToken> glob = new ArrayList<>(Arrays.asList(EXACTLY_ONE));
+     glob.addAll(literalsForString("tart"));
+     glob.add(EXACTLY_ONE);
+     glob.addAll(literalsForString("IT.jav"));
+     glob.add(EXACTLY_ONE);
+
+     // path: startXIT.java (sanity, should pass)
+     final List<BoundToken> matching = literalsForString("startXIT.java");
+     assertTrue(underTest.match(glob, matching));
+
+     // path: strtXIT.java (first literal 'tart' doesn't match)
+     final List<BoundToken> brokenFirstLiteral = literalsForString("strtXIT.java");
+     assertFalse(underTest.match(glob, brokenFirstLiteral));
+
+     // path: startXITT.java (middle literal 'IT.jav' doesn't match)
+     final List<BoundToken> brokenMiddleLiteral = literalsForString("startXITT.java");
+     assertFalse(underTest.match(glob, brokenMiddleLiteral));
+
+     // path: startXIT.jav (last token '?' doesn't match - missing character in path)
+     final List<BoundToken> missingLastCharacter = literalsForString("startXIT.jav");
+     assertFalse(underTest.match(glob, missingLastCharacter));
+
+     // path: startXIT.javaa (last literal 'a' in path doesn't match)
+     final List<BoundToken> extraLastCharacter = literalsForString("startXIT.javaa");
+     assertFalse(underTest.match(glob, extraLastCharacter));
+ }
+
+ /**
+  * Matches paths against a {@code List<BoundToken>} pattern that leads with a '*' matching token, ends with a
+  * '*' matching token, and has a single '*' token in the middle.
+  */
+ @Test
+ public void testLiteralsWithZeroPlus() throws Exception {
+     // pattern: "*tart*IT.jav*"
+     // (a '*' at the start, in the middle, and at the end)
+
+     final List<BoundToken> glob = new ArrayList<>(Arrays.asList(ZERO_OR_MORE));
+     glob.addAll(literalsForString("tart"));
+     glob.add(ZERO_OR_MORE);
+     glob.addAll(literalsForString("IT.jav"));
+     glob.add(ZERO_OR_MORE);
+
+     // path: startXIT.java (sanity, should pass)
+     final List<BoundToken> leadingAndTrailing = literalsForString("startXIT.java");
+     assertTrue(underTest.match(glob, leadingAndTrailing));
+
+     // path: startXIT.jav (sanity, should pass)
+     final List<BoundToken> leadingOnly = literalsForString("startXIT.jav");
+     assertTrue(underTest.match(glob, leadingOnly));
+
+     // path: tartXIT.java (sanity, should pass)
+     final List<BoundToken> trailingOnly = literalsForString("tartXIT.java");
+     assertTrue(underTest.match(glob, trailingOnly));
+
+     // path: tartXIT.jav (sanity, should pass)
+     final List<BoundToken> neitherEnd = literalsForString("tartXIT.jav");
+     assertTrue(underTest.match(glob, neitherEnd));
+
+     // path: strtXIT.java (first literal 'tart' doesn't match)
+     final List<BoundToken> brokenFirstLiteral = literalsForString("strtXIT.java");
+     assertFalse(underTest.match(glob, brokenFirstLiteral));
+
+     // path: startXITT.java (middle literal 'IT.jav' doesn't match)
+     final List<BoundToken> brokenMiddleLiteral = literalsForString("startXITT.java");
+     assertFalse(underTest.match(glob, brokenMiddleLiteral));
+ }
+
+ /**
+  * Runs several paths against a pattern holding three '?' matching tokens - one at the beginning, one in the
+  * middle, and one at the end - checking that each '?' accepts neither zero nor several characters.
+  */
+ @Test
+ public void testMultipleSingleCharacterTokens() throws Exception {
+     // pattern: "?tart?IT.jav?"
+     // (a '?' at the start, in the middle, and at the end)
+
+     final List<BoundToken> glob = new ArrayList<>(Arrays.asList(EXACTLY_ONE));
+     glob.addAll(literalsForString("tart"));
+     glob.add(EXACTLY_ONE);
+     glob.addAll(literalsForString("IT.jav"));
+     glob.add(EXACTLY_ONE);
+
+     // path: startXIT.java (sanity, should pass)
+     final List<BoundToken> matching = literalsForString("startXIT.java");
+     assertTrue(underTest.match(glob, matching));
+
+     // path: FootartXIT.java (too many characters for first token)
+     final List<BoundToken> longFirst = literalsForString("FootartXIT.java");
+     assertFalse(underTest.match(glob, longFirst));
+
+     // path: tartXIT.java (no characters for first token)
+     final List<BoundToken> emptyFirst = literalsForString("tartXIT.java");
+     assertFalse(underTest.match(glob, emptyFirst));
+
+     // path: StartItUpIT.java (too many characters for middle token)
+     final List<BoundToken> longMiddle = literalsForString("StartItUpIT.java");
+     assertFalse(underTest.match(glob, longMiddle));
+
+     // path: StartIT.java (no characters for middle token)
+     final List<BoundToken> emptyMiddle = literalsForString("StartIT.java");
+     assertFalse(underTest.match(glob, emptyMiddle));
+
+     // path: StartXIT.jav (no characters for last token)
+     final List<BoundToken> emptyLast = literalsForString("StartXIT.jav");
+     assertFalse(underTest.match(glob, emptyLast));
+
+     // path: StartXIT.javaa (too many characters for last token)
+     final List<BoundToken> longLast = literalsForString("StartXIT.javaa");
+     assertFalse(underTest.match(glob, longLast));
+ }
+
+ /**
+  * A pattern that starts with a literal and holds one '?' in the middle accepts exactly one character there.
+  */
+ @Test
+ public void testMatchLiteralFirstExactlyOneNoMatch() throws Exception {
+     // pattern: "Start?IT.java"
+     // (literal, then a single '?', then literal)
+
+     final List<BoundToken> glob = new ArrayList<>(literalsForString("Start"));
+     glob.add(EXACTLY_ONE);
+     glob.addAll(literalsForString("IT.java"));
+
+     // path: StartXIT.java (sanity, should pass)
+     final List<BoundToken> oneCharacter = literalsForString("StartXIT.java");
+     assertTrue(underTest.match(glob, oneCharacter));
+
+     // path: StartFooIT.java (won't match)
+     final List<BoundToken> threeCharacters = literalsForString("StartFooIT.java");
+
+     assertFalse(underTest.match(glob, threeCharacters));
+ }
+
+ /**
+  * A pattern that starts with a literal and holds one '*' in the middle accepts any run of characters there.
+  */
+ @Test
+ public void testMatchLiteralFirstZeroPlus() throws Exception {
+     // pattern: "Start*IT.java"
+     // (literal, then a single '*', then literal)
+
+     final List<BoundToken> glob = new ArrayList<>(literalsForString("Start"));
+     glob.add(ZERO_OR_MORE);
+     glob.addAll(literalsForString("IT.java"));
+
+     // path: StartCarIT.java ('*' should match 'Car')
+     final List<BoundToken> severalCharacters = literalsForString("StartCarIT.java");
+     assertTrue(underTest.match(glob, severalCharacters));
+
+     // path: StartIT.java ('*' should match zero characters)
+     final List<BoundToken> noCharacters = literalsForString("StartIT.java");
+     assertTrue(underTest.match(glob, noCharacters));
+ }
+
+    /** Verifies that a pattern beginning with a '*' token matches a path with characters ahead of the first literal. */
+    @Test
+    public void testMatchTokenFirst() throws Exception {
+        // pattern: "*File*IT.java"
+        List<BoundToken> pattern = new ArrayList<>();
+        pattern.add(ZERO_OR_MORE);
+        pattern.addAll(literalsForString("File"));
+        pattern.add(ZERO_OR_MORE);
+        pattern.addAll(literalsForString("IT.java"));
+
+        // path: UnixFileSmallIT.java
+        List<BoundToken> path = literalsForString("UnixFileSmallIT.java");
+
+        assertTrue(underTest.match(pattern, path));
+    }
+
+    /** Verifies that two consecutive '?' tokens match a path supplying exactly two characters ("IT") for them. */
+    @Test
+    public void testMatchConsecutiveMatchTokens() throws Exception {
+        // pattern: "Foo??.java"
+        List<BoundToken> pattern = new ArrayList<>();
+        pattern.addAll(literalsForString("Foo"));
+        pattern.add(EXACTLY_ONE);
+        pattern.add(EXACTLY_ONE);
+        pattern.addAll(literalsForString(".java"));
+
+        // path: FooIT.java
+        List<BoundToken> path = literalsForString("FooIT.java");
+        assertTrue(underTest.match(pattern, path));
+    }
+
+    /**
+     * Attempts to match a single directory name ("src") against the directory match token '**'.
+     */
+    @Test
+    public void testMatchZeroPlusAndLiteral() throws Exception {
+        // pattern: "**"
+        List<BoundToken> pattern = DIR_L;
+
+        // path: "src"
+        List<BoundToken> path = literalsForString("src");
+
+        assertTrue(underTest.match(pattern, path));
+    }
+
+    /**
+     * Ensures that a pattern like 'Foo**IT.java' - while almost certainly a mistake by the person who created the
+     * pattern - is a valid pattern. Make sure it matches.
+     */
+    @Test
+    public void testMatchMultipleZeroPlusTokens() throws Exception {
+        // pattern: "Foo**IT.java"
+        List<BoundToken> pattern = new ArrayList<>();
+        pattern.addAll(literalsForString("Foo"));
+        pattern.add(ZERO_OR_MORE);
+        pattern.add(ZERO_OR_MORE);
+        pattern.addAll(literalsForString("IT.java"));
+
+        // path: "FooIT.java" should match - '**' matches zero characters
+        List<BoundToken> path = literalsForString("FooIT.java");
+        assertTrue(underTest.match(pattern, path));
+
+        // path: "FooBarBazIT.java" should match - '**' matches "BarBaz"
+        path = literalsForString("FooBarBazIT.java");
+        assertTrue(underTest.match(pattern, path));
+    }
+
+ @Test
+ public void testFindNextToken() throws Exception { // pins the index findNextToken reports for a string and a start offset
+ assertEquals(19, underTest.findNextToken("src/test/resources/*IT.java", 0)); // '*' directly follows the 19-character prefix "src/test/resources/"; the '/' characters are not reported
+ assertEquals(0, underTest.findNextToken("*File*IT.java", 0)); // a '*' sitting at the start offset itself is reported
+ assertEquals(5, underTest.findNextToken("*File*IT.java", 1)); // starting at offset 1, the second '*' (index 5) is reported
+ }
+
+ @Test
+ public void testFindNextLiteral() throws Exception { // pins the index findNextLiteral reports for a string and a start offset
+ assertEquals(0, underTest.findNextLiteral("src/test/resources/*IT.java", 0)); // 's' at the start offset is already a literal
+ assertEquals(1, underTest.findNextLiteral("*File*IT.java", 0)); // the leading '*' is skipped; 'F' is at index 1
+ assertEquals(2, underTest.findNextLiteral("*File*IT.java", 2)); // 'i' at the start offset is already a literal
+ assertEquals(5, underTest.findNextLiteral("Foo??.java", 3)); // both '?' characters (indices 3 and 4) are skipped; '.' is at index 5
+ }
+
+ @Test
+ public void testFindNextLiteralString() throws Exception { // despite its name, this test exercises matchNextLiteral(string, offset, literal)
+ assertEquals(1, underTest.matchNextLiteral("*File*IT.java", 0, "File")); // "File" begins at index 1
+ assertEquals(6, underTest.matchNextLiteral("*File*IT.java", 0, "IT")); // "IT" begins at index 6
+ assertEquals(6, underTest.matchNextLiteral("*File*IT.java", 0, "IT.java")); // a longer literal beginning at the same index
+ assertEquals(6, underTest.matchNextLiteral("*File*IT.java", 4, "IT.java")); // same expectation when the start offset is 4
+ assertEquals(Integer.MIN_VALUE, underTest.matchNextLiteral("*FileIT.java", 0, "doodle")); // Integer.MIN_VALUE is expected when the literal does not occur
+ }
+
+    /** With both offsets at zero, the expected right anchor is the length of the leading literal "File". */
+    @Test
+    public void testFindRightAnchorFromBeginning() throws Exception {
+        final int expectedAnchor = "File".length();
+        final int actualAnchor = underTest.findRightAnchor("File*IT.java", "FileUnixIT.java", 0, 0);
+        assertEquals(expectedAnchor, actualAnchor);
+    }
+
+    /** After pattern "File*" has been matched to path "FileUnix", the expected right anchor is the path's length. */
+    @Test
+    public void testFindRightAnchorFromMiddle() throws Exception {
+        final String glob = "File*IT.java";
+        final String candidate = "FileUnixIT.java";
+        final int anchor = underTest.findRightAnchor(glob, candidate, "FileUnix".length(), "File*".length());
+        assertEquals(candidate.length(), anchor);
+    }
+
+    /** Steps through "Foo*Bar*Baz" against "FooXBarYBaz", checking the anchor expected at each resume point. */
+    @Test
+    public void testFindRightAnchorMultipleTokens() throws Exception {
+        final String glob = "Foo*Bar*Baz";
+        final String candidate = "FooXBarYBaz";
+        assertEquals(3, underTest.findRightAnchor(glob, candidate, 0, 0)); // length of "Foo"
+        assertEquals(7, underTest.findRightAnchor(glob, candidate, "FooX".length(), "Foo*".length())); // "FooXBar"
+        assertEquals(11, underTest.findRightAnchor(glob, candidate, "FooXBarY".length(), "Foo*Bar*".length())); // all
+    }
+
+    /** Integer.MAX_VALUE is the expected result when the path ("src") is not in the pattern. */
+    @Test
+    public void testFindRightAnchorFoo() throws Exception {
+        final String glob = "Foo??.java";
+        final String unrelatedPath = "src";
+        final int anchor = underTest.findRightAnchor(glob, unrelatedPath, 0, 0);
+        assertEquals(Integer.MAX_VALUE, anchor);
+    }
+
+    /** Integer.MAX_VALUE is the expected result when the pattern offset (3) is positioned at a token ('?'). */
+    @Test
+    public void testFindRightAnchorFooIT() throws Exception {
+        final String glob = "Foo??.java";
+        final String candidate = "FooIT.java";
+        final int anchor = underTest.findRightAnchor(glob, candidate, 0, 3);
+        assertEquals(Integer.MAX_VALUE, anchor);
+    }
+
+    /** Starts with both offsets on the literal "File" (path 4, pattern 1); expects 8, the length of "UnixFile". */
+    @Test
+    public void testRightAnchorBar() throws Exception {
+        final String glob = "*File*IT.java";
+        final String candidate = "UnixFileSmallIT.java";
+        final int anchor = underTest.findRightAnchor(glob, candidate, 4, 1);
+        assertEquals(8, anchor);
+    }
+
+    /**
+     * Ensures that the match token '**' - represented either as a single BoundToken containing a DIRECTORY, or as
+     * two consecutive BoundTokens each containing a ZERO_OR_MORE_CHARACTERS - is considered a "directory match"
+     * token by the ExpressionMatcher.
+     *
+     * @throws Exception if the matcher fails unexpectedly, which fails the test
+     */
+    @Test
+    public void testIsDirectoryMatch() throws Exception {
+        List<BoundToken> directory = Arrays.asList(new BoundToken(Token.DIRECTORY, Token.DIRECTORY.getTokenString()));
+        List<BoundToken> consecutiveZeroOrMore = Arrays.asList(
+                new BoundToken(Token.ZERO_OR_MORE_CHARACTERS, Token.ZERO_OR_MORE_CHARACTERS.getTokenString()),
+                new BoundToken(Token.ZERO_OR_MORE_CHARACTERS, Token.ZERO_OR_MORE_CHARACTERS.getTokenString()));
+
+        assertTrue(underTest.isDirectoryMatchToken(directory));
+        assertTrue(underTest.isDirectoryMatchToken(consecutiveZeroOrMore));
+    }
+
+    /** Asserts that both lists have the same size and that elements at the same index are equal. */
+    void assertListsEqual(List<?> expected, List<?> actual) {
+        assertExpectedCount(expected.size(), actual);
+        for (int i = 0; i < expected.size(); i++) {
+            assertEquals("Expected path segments to be equal. Expected: '" + expected.get(i) +
+                    "', Actual: '" + actual.get(i) + "'", expected.get(i), actual.get(i));
+        }
+    }
+
+    void assertExpectedCount(int expectedCount, List<?> actual) { // size check; the message lists the elements
+        assertEquals("Expected List to contain " + expectedCount + " elements. Contained " + actual.size() + ": " +
+                actual.stream().map(v -> "'" + v + "'").collect(Collectors.joining(", ")),
+                expectedCount, actual.size());
+    }
+}
\ No newline at end of file
diff --git a/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionTest.java b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionTest.java
new file mode 100644
index 00000000..a65b5e8f
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionTest.java
@@ -0,0 +1,101 @@
+/*
+ *
+ * * Copyright 2015 Johns Hopkins University
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at
+ * *
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing, software
+ * * distributed under the License is distributed on an "AS IS" BASIS,
+ * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * * See the License for the specific language governing permissions and
+ * * limitations under the License.
+ *
+ */
+
+package org.dataconservancy.bagit.rules;
+
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.PATH_SEP;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.PATH_SEP_L;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.ZERO_OR_MORE;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.assertTokenListEquals;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.literalsForString;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+/** Tests the tokens, depth and path segments reported by {@link Expression}. */
+public class ExpressionTest {
+    /** Checks the tokens, depth and individual path segments of a relative, four-segment pattern. */
+    @Test
+    public void testSimple() throws Exception {
+        Expression exp = new Expression("src/test/resources/*IT.java");
+        // == src/test/resources/*IT.java
+        List<BoundToken> expected = literalsForString("src");
+        expected.add(PATH_SEP);
+        expected.addAll(literalsForString("test"));
+        expected.add(PATH_SEP);
+        expected.addAll(literalsForString("resources"));
+        expected.add(PATH_SEP);
+        expected.add(ZERO_OR_MORE);
+        expected.addAll(literalsForString("IT.java"));
+
+        List<BoundToken> actual = exp.getTokens();
+        assertTokenListEquals(expected, actual);
+
+        // depth is a zero-based index: the four path segments above are expected to yield 3
+        assertEquals(3, exp.depth());
+
+        // get path segment by depth test
+        assertTokenListEquals(literalsForString("src"), exp.getPathSegment(0));
+        assertTokenListEquals(literalsForString("test"), exp.getPathSegment(1));
+        assertTokenListEquals(literalsForString("resources"), exp.getPathSegment(2));
+
+        expected = new ArrayList<>();
+        expected.add(ZERO_OR_MORE);
+        expected.addAll(literalsForString("IT.java"));
+        assertTokenListEquals(expected, exp.getPathSegment(3));
+
+        // out of bounds tests
+        assertTrue(exp.getPathSegment(exp.depth() + 5).isEmpty());
+        assertTrue(exp.getPathSegment(-1).isEmpty());
+    }
+
+    /** Checks that the expression "/" has depth 0, the token list PATH_SEP_L and an empty segment 0. */
+    @Test
+    public void testWithEmptyRoot() throws Exception {
+        Expression exp = new Expression("/");
+        assertEquals(0, exp.depth());
+
+        // TODO decide what to do with the automatic addition of '**'.
+        // For example, the Expression "/" is tokenized as "/**":
+        // any path ending in "/" is going to be tokenized with a trailing "**",
+        // and the user may not intend that behavior (for example, if they just want
+        // to express a path, not a pattern).
+        assertTokenListEquals(PATH_SEP_L, exp.getTokens());
+        assertTrue(exp.getPathSegment(0).isEmpty());
+    }
+
+    /** Checks that a single file directly under the root has depth 0 and is itself path segment 0. */
+    @Test
+    public void testWithSingleFileRoot() throws Exception {
+        Expression exp = new Expression("/foo.txt");
+        assertEquals(0, exp.depth());
+
+        // "/foo.txt"
+        List<BoundToken> expected = new ArrayList<>();
+        expected.add(PATH_SEP);
+        expected.addAll(literalsForString("foo.txt"));
+        assertTokenListEquals(expected, exp.getTokens());
+        assertFalse(exp.getPathSegment(0).isEmpty());
+        assertEquals(literalsForString("foo.txt"), exp.getPathSegment(0));
+    }
+}
\ No newline at end of file
diff --git a/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/TokenTest.java b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/TokenTest.java
new file mode 100644
index 00000000..99db35dd
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/TokenTest.java
@@ -0,0 +1,178 @@
+/*
+ *
+ * * Copyright 2015 Johns Hopkins University
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at
+ * *
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing, software
+ * * distributed under the License is distributed on an "AS IS" BASIS,
+ * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * * See the License for the specific language governing permissions and
+ * * limitations under the License.
+ *
+ */
+
+package org.dataconservancy.bagit.rules;
+
+
+import org.junit.Test;
+
+import java.util.Arrays;
+
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.DIR;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.DIR_L;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.EXACTLY_ONE;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.EXACTLY_ONE_L;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.PATH_SEP;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.PATH_SEP_L;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.ZERO_OR_MORE;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.ZERO_OR_MORE_L;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.assertTokenListEquals;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Ensures that the {@link Token} class properly parses tokens.
+ */
+public class TokenTest {
+
+    /**
+     * Tokens are strings with special meanings. Ensure all single-character tokens can be parsed.
+     */
+    @Test
+    public void testParseSingleCharacterString() throws Exception {
+
+        // With parse(...)
+        assertEquals(ZERO_OR_MORE, Token.parse("*"));
+        assertEquals(EXACTLY_ONE, Token.parse("?"));
+        assertEquals(new BoundToken(Token.LITERAL, "f"), Token.parse("f"));
+        assertEquals(PATH_SEP, Token.parse("/"));
+
+        // With parseString(...)
+        assertTokenListEquals(ZERO_OR_MORE_L, Token.parseString("*"));
+        assertTokenListEquals(EXACTLY_ONE_L, Token.parseString("?"));
+        assertTokenListEquals(Arrays.asList(new BoundToken(Token.LITERAL, "f")), Token.parseString("f"));
+        assertTokenListEquals(PATH_SEP_L, Token.parseString("/"));
+    }
+
+    /**
+     * Tokens are strings with special meanings. Ensure all multi-character tokens can be parsed.
+     */
+    @Test
+    public void testParseMultipleCharacterStrings() throws Exception {
+        // With parse(...): the whole string becomes a single token
+        assertEquals(DIR, Token.parse("**"));
+        assertEquals(new BoundToken(Token.LITERAL, "foobarbaz"), Token.parse("foobarbaz"));
+
+        // With parseString(...): a literal string becomes one LITERAL token per character
+        assertTokenListEquals(DIR_L, Token.parseString("**"));
+        assertTokenListEquals(
+                Arrays.asList(new BoundToken(Token.LITERAL, "f"), new BoundToken(Token.LITERAL, "o"),
+                        new BoundToken(Token.LITERAL, "o"), new BoundToken(Token.LITERAL, "b"),
+                        new BoundToken(Token.LITERAL, "a"), new BoundToken(Token.LITERAL, "r"),
+                        new BoundToken(Token.LITERAL, "b"), new BoundToken(Token.LITERAL, "a"),
+                        new BoundToken(Token.LITERAL, "z")), Token.parseString("foobarbaz"));
+    }
+
+    /**
+     * Attempting to parse a string with multiple tokens is an error.
+     * Legal with {@link #testParseStringSingleStringContainingDifferentTokens}.
+     */
+    @Test(expected = IllegalArgumentException.class)
+    public void testParseSingleStringContainingDifferentTokens() throws Exception {
+        // With parse(...)
+        Token.parse("*/?**abc");
+    }
+
+    /**
+     * Attempting to parseString a string with multiple tokens is ok.
+     * An error with {@link #testParseSingleStringContainingDifferentTokens()}.
+     */
+    @Test
+    public void testParseStringSingleStringContainingDifferentTokens() throws Exception {
+        // With parseString(...)
+        assertTokenListEquals(Arrays.asList(ZERO_OR_MORE, PATH_SEP, EXACTLY_ONE, ZERO_OR_MORE, ZERO_OR_MORE,
+                new BoundToken(Token.LITERAL, "a"), new BoundToken(Token.LITERAL, "b"),
+                new BoundToken(Token.LITERAL, "c")), Token.parseString("*/?**abc"));
+    }
+
+    /**
+     * Attempting to parse a string with multiple tokens is an error. Essentially the same test as
+     * {@link #testParseSingleStringContainingDifferentTokens()}. Note this is legal with
+     * {@link #testParseStringLiteralEndingWithPathSep()}.
+     */
+    @Test(expected = IllegalArgumentException.class)
+    public void testParseLiteralEndingWithPathSep() throws Exception {
+        // With parse(...)
+        Token.parse("directory/");
+    }
+
+    /**
+     * Legal form of {@link #testParseLiteralEndingWithPathSep()}.
+     */
+    @Test
+    public void testParseStringLiteralEndingWithPathSep() throws Exception {
+        // With parseString(...)
+        assertTokenListEquals(Arrays.asList(new BoundToken(Token.LITERAL, "d"), new BoundToken(Token.LITERAL, "i"),
+                new BoundToken(Token.LITERAL, "r"), new BoundToken(Token.LITERAL, "e"),
+                new BoundToken(Token.LITERAL, "c"), new BoundToken(Token.LITERAL, "t"),
+                new BoundToken(Token.LITERAL, "o"), new BoundToken(Token.LITERAL, "r"),
+                new BoundToken(Token.LITERAL, "y"), PATH_SEP), Token.parseString("directory/"));
+    }
+
+    /**
+     * Parsing a zero-length string with parse(...) results in an error.
+     */
+    @Test(expected = IllegalArgumentException.class)
+    public void testParseZeroLengthString() throws Exception {
+        // With parse(...); called bare so the expected exception can only come from the parser
+        Token.parse("");
+    }
+
+    /**
+     * Parsing a zero-length string with parseString(...) results in an error.
+     */
+    @Test(expected = IllegalArgumentException.class)
+    public void testParseStringZeroLengthString() throws Exception {
+        // With parseString(...); called bare so the expected exception can only come from the parser
+        Token.parseString("");
+    }
+
+    /**
+     * A blank string (a single space) is not zero-length: it is parsed as a literal.
+     */
+    @Test
+    public void testParseEmptyString() throws Exception {
+        // With parse(...)
+        assertEquals(new BoundToken(Token.LITERAL, " "), Token.parse(" "));
+
+        // With parseString(...)
+        assertTokenListEquals(Arrays.asList(new BoundToken(Token.LITERAL, " ")), Token.parseString(" "));
+    }
+
+    /**
+     * Parsing {@code null} results in an error.
+     *
+     * @throws Exception the {@link IllegalArgumentException} this test expects
+     */
+    @Test(expected = IllegalArgumentException.class)
+    public void testParseNull() throws Exception {
+        // With parse(...); called bare so the expected exception can only come from the parser
+        Token.parse(null);
+    }
+
+    /**
+     * Parsing {@code null} with parseString is also an error.
+     *
+     * @throws Exception the {@link IllegalArgumentException} this test expects
+     */
+    @Test(expected = IllegalArgumentException.class)
+    public void testParseStringNull() throws Exception {
+        // With parseString(...); called bare so the expected exception can only come from the parser
+        Token.parseString(null);
+    }
+
+}
\ No newline at end of file
diff --git a/dcs-bagit/pom.xml b/dcs-bagit/pom.xml
new file mode 100644
index 00000000..1f8d95bd
--- /dev/null
+++ b/dcs-bagit/pom.xml
@@ -0,0 +1,113 @@
+
+
+
+
+
+
+
+
+ 4.0.0
+
+ Data Conservancy BagIt Tools and Utilities
+ Data Conservancy BagIt packaging tools and utilities
+
+ org.dataconservancy
+ dcs-bagit
+ 1.0.0-SNAPSHOT
+ pom
+
+
+
+ org.dataconservancy
+ project-pom
+ 1.1.2-SNAPSHOT
+
+
+
+
+
+ dcs-bagit-support
+ dcs-bagit-vfs
+ dcs-bagit-compress
+
+
+
+
+
+
+
+
+
+
+
+
+
+ org.apache.commons
+ commons-vfs2
+ 2.1-SNAPSHOT
+
+
+
+ org.apache.commons
+ commons-compress
+ 1.10
+
+
+
+ junit
+ junit
+ 4.12
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ org.slf4j
+ slf4j-log4j12
+ test
+
+
+
+ log4j
+ log4j
+ test
+
+
+
+ junit
+ junit
+ test
+
+
+
+
+