From 25bc17e3f762e1cce60eff52a65e0cd6357fae71 Mon Sep 17 00:00:00 2001
From: Elliot Metsger
Date: Wed, 9 Sep 2015 11:33:30 -0400
Subject: [PATCH 1/9] DC-2101: Initial BagIt module poms.
---
dcs-bagit/dcs-bagit-support/pom.xml | 83 ++++++++++++++++++++
dcs-bagit/pom.xml | 113 ++++++++++++++++++++++++++++
2 files changed, 196 insertions(+)
create mode 100644 dcs-bagit/dcs-bagit-support/pom.xml
create mode 100644 dcs-bagit/pom.xml
diff --git a/dcs-bagit/dcs-bagit-support/pom.xml b/dcs-bagit/dcs-bagit-support/pom.xml
new file mode 100644
index 00000000..fc12c29b
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/pom.xml
@@ -0,0 +1,83 @@
+
+
+
+
+
+
+
+
+ 4.0.0
+
+ Data Conservancy BagIt Support
+ Support classes for BagIt implementations and clients
+
+ org.dataconservancy
+ dcs-bagit-support
+ jar
+
+
+ org.dataconservancy
+ dcs-bagit
+ 1.0.0-SNAPSHOT
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ org.slf4j
+ slf4j-api
+ compile
+
+
+
+
+
+ org.slf4j
+ slf4j-log4j12
+ test
+
+
+
+ log4j
+ log4j
+ test
+
+
+
+ junit
+ junit
+ test
+
+
+
+
+
diff --git a/dcs-bagit/pom.xml b/dcs-bagit/pom.xml
new file mode 100644
index 00000000..1f8d95bd
--- /dev/null
+++ b/dcs-bagit/pom.xml
@@ -0,0 +1,113 @@
+
+
+
+
+
+
+
+
+ 4.0.0
+
+ Data Conservancy BagIt Tools and Utilities
+ Data Conservancy BagIt packaging tools and utilities
+
+ org.dataconservancy
+ dcs-bagit
+ 1.0.0-SNAPSHOT
+ pom
+
+
+
+ org.dataconservancy
+ project-pom
+ 1.1.2-SNAPSHOT
+
+
+
+
+
+ dcs-bagit-support
+ dcs-bagit-vfs
+ dcs-bagit-compress
+
+
+
+
+
+
+
+
+
+
+
+
+
+ org.apache.commons
+ commons-vfs2
+ 2.1-SNAPSHOT
+
+
+
+ org.apache.commons
+ commons-compress
+ 1.10
+
+
+
+ junit
+ junit
+ 4.12
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ org.slf4j
+ slf4j-log4j12
+ test
+
+
+
+ log4j
+ log4j
+ test
+
+
+
+ junit
+ junit
+ test
+
+
+
+
+
From bf9a1e4d73b9106aeea7cf0cbb5de64529537e83 Mon Sep 17 00:00:00 2001
From: Elliot Metsger
Date: Wed, 9 Sep 2015 11:34:08 -0400
Subject: [PATCH 2/9] DC-2101: Initial BagUri class.
---
.../dataconservancy/bagit/support/BagUri.java | 174 ++++++++++++++++++
1 file changed, 174 insertions(+)
create mode 100644 dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagUri.java
diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagUri.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagUri.java
new file mode 100644
index 00000000..407cb1fb
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagUri.java
@@ -0,0 +1,174 @@
+/*
+ * Copyright 2015 Johns Hopkins University
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dataconservancy.bagit.support;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+
+/**
+ * URI scheme for addressing resources contained within a Bag. The form of a Bag URI is:
+ * {@code bag://bag-name/path/to/resource#optional-fragment}
+ *
+ * The Bag URI {@link #BAG_SCHEME scheme} is equal to the string '{@code bag}'; resources inside the bag are unique
+ * within the scope of a single bag. The {@code authority} component of a Bag URI is equal to the name of the Bag
+ * serialization (as discussed in BagIt
+ * section 4), minus any file name extensions. Query parameters are disallowed in Bag URIs, as they have no semantic
+ * analog in the BagIt specification.
+ *
+ *
+ * @see BagIt Draft Specification version 0.97, expires December 25, 2015
+ * @see RFC 2396: Uniform Resource Identifiers (URI): Generic Syntax
+ * @see Data Conservancy BagIt Profile 1.0, section X
+ */
+public class BagUri {
+
+ /**
+ * The value of the Bag URI {@code scheme} (RFC 2396 sec. 3)
+ */
+ public static final String BAG_SCHEME = "bag";
+
+ /**
+ * Characters that are reserved (i.e. illegal) for URI authority portion (RFC 2396 sec. 3.2)
+ */
+// private static final char[] RESERVED_AUTHORITY_CHARACTERS = new char[] { ';', ':', '@', '?', '/' };
+
+ private static final String ERR_NULL = "Argument '%s' must not be null or empty.";
+
+ private static final String ERR_INVALID_SCHEME = "Invalid scheme '%s' for " + BagUri.class.getName() + ": scheme " +
+ "must be equal to '" + BAG_SCHEME + "'";
+
+ private static final String ERR_PARSE_URI = "Unable to parse URI string '%s': %s";
+
+ private static final String ERR_CREATE_URI = "Unable to construct a URI with scheme '%s', authority '%s', path '%s', and fragment '%s': %s";
+
+ /**
+ * Internal representation of the BagUri as a java.net.URI.
+ */
+ private URI bagUri;
+
+ /**
+ * The authority string (must not be {@code null}). It semantically aligns with, and should be equal to, the name
+ * of the bag. We keep this state for our own equals() and hashCode() implementation.
+ */
+ private String authority;
+
+ /**
+ * The path string (may be {@code null}). We keep this state for our own equals() and hashCode() implementation.
+ */
+ private String path;
+
+ /**
+ * The fragment string (may be {@code null}). We keep this state for our own equals() and hashCode()
+ * implementation.
+ */
+ private String fragment;
+
+ /**
+ * Constructs a new Bag URI, which addresses a resource in a Bag named by {@code authority}.
+ *
+ * Exemplars:
+ *
+ * - The {@code path} "/data" with {@code authority} "mybag" would address the data directory inside
+ * of a Bag named 'mybag': {@code bag://mybag/data}.
+ * - The {@code path} "/bag-info.txt" would identify the Bag metadata file: {@code bag://mybag/bag-info.txt}.
+ * - The {@code path} "/data/dataobject.rdf" with a {@code fragment} "obj-3" would identify a resource
+ * "{@code obj-3}" inside of the payload file {@code data/dataobject.rdf}:
+ * {@code bag://mybag/data/dataobject.rdf#obj-3}.
+ *
+ * A non-empty {@code path} must begin with "/": {@link URI} rejects a relative path when a scheme is present.
+ *
+ * @param authority the authority portion of the URI, which is expected to be the Bag name. Must not be
+ * {@code null} or blank.
+ * @param path the path to the resource within the Bag
+ * @param fragment an optional fragment identifier, useful for referencing individual resources within a file
+ * @throws java.lang.IllegalArgumentException if any required parameters are {@code null} or invalid URI components.
+ */
+ public BagUri(String authority, String path, String fragment) {
+ if (authority == null || authority.trim().length() == 0) {
+ throw new IllegalArgumentException(String.format(ERR_NULL, "authority"));
+ }
+ try {
+ bagUri = new URI(BAG_SCHEME, authority, path, null, fragment);
+ } catch (URISyntaxException e) {
+ throw new IllegalArgumentException(
+ String.format(ERR_CREATE_URI, BAG_SCHEME, authority, path, fragment, e.getMessage()), e);
+ }
+
+ this.authority = authority;
+ this.path = path;
+ this.fragment = fragment;
+ }
+
+ /**
+ * Returns the authority component of this Bag URI, as reported by the underlying {@link URI}.
+ * @return the authority component of the URI
+ */
+ public String getAuthority() {
+ return bagUri.getAuthority();
+ }
+
+ /**
+ * Returns the fragment component of this Bag URI, as reported by the underlying {@link URI}.
+ * @return the fragment component of the URI
+ */
+ public String getFragment() {
+ return bagUri.getFragment();
+ }
+
+ /**
+ * Returns the path component of this Bag URI, as reported by the underlying {@link URI}.
+ * @return the path component of the URI
+ */
+ public String getPath() {
+ return bagUri.getPath();
+ }
+
+ public URI asUri() {
+ return bagUri;
+ }
+
+ /**
+ * {@inheritDoc}
+ *
+ * Instances of this class are considered equal if their authority, path, and fragment components are equal.
+ *
+ *
+ * @param o the object to determine equivalence against.
+ * @return {@code true} if the instances are equal, {@code false} otherwise
+ */
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+
+ BagUri bagUri = (BagUri) o;
+
+ if (authority != null ? !authority.equals(bagUri.authority) : bagUri.authority != null) return false;
+ if (fragment != null ? !fragment.equals(bagUri.fragment) : bagUri.fragment != null) return false;
+ if (path != null ? !path.equals(bagUri.path) : bagUri.path != null) return false;
+
+ return true;
+ }
+
+ @Override
+ public int hashCode() {
+ int result = authority != null ? authority.hashCode() : 0;
+ result = 31 * result + (path != null ? path.hashCode() : 0);
+ result = 31 * result + (fragment != null ? fragment.hashCode() : 0);
+ return result;
+ }
+}
From e0c38dbc7ef7671b623455116f7493ac62d2cb00 Mon Sep 17 00:00:00 2001
From: Elliot Metsger
Date: Wed, 9 Sep 2015 11:35:41 -0400
Subject: [PATCH 3/9] DC-2101: Roles played by files or directories in a bag.
---
.../bagit/support/BagFileRole.java | 82 +++++++++++++++++++
1 file changed, 82 insertions(+)
create mode 100644 dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagFileRole.java
diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagFileRole.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagFileRole.java
new file mode 100644
index 00000000..ec33bb84
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/support/BagFileRole.java
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2015 Johns Hopkins University
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dataconservancy.bagit.support;
+
+/**
+ * Roles of files and directories seen in a Bag. Roles are defined independently of concrete files because: 1) a
+ * single role may be served by multiple files; 2) names of files that fulfill these roles may change.
+ *
+ * While changing the name of the {@code bagit.txt} file is hard to imagine, it is reasonable that additional files may
+ * fulfill a particular role as the BagIt specification evolves. If the name of {@code bagit.txt} does change, it is
+ * likely that the role of a bag declaration will continue to be needed, even if it is no longer fulfilled by
+ * {@code bagit.txt}.
+ *
+ *
+ * BagIt requires that there be a bag declaration and a payload manifest. These roles are enumerated in this class as
+ * {@link #BAG_DECL} and {@link #PAYLOAD_MANIFEST}, respectively. Other roles such as a tag manifest, payload
+ * directory, and fetch file are enumerated in this class. A payload manifest role may be fulfilled by two different
+ * files, a {@code manifest-sha1.txt} file containing SHA checksums, and a {@code manifest-md5.txt} file containing MD5
+ * checksums. In the future, implementations may use SHA-256 or other algorithms. Regardless of the name of future
+ * files, their role will be enumerated in this class.
+ *
+ *
+ * The documentation for each role includes example file names from the specification. These are informative: they
+ * are meant as examples to aid comprehension of what the role represents, and are not normative.
+ *
+ */
+public enum BagFileRole {
+
+ /**
+ * The bag payload (e.g. {@code data/}) directory.
+ */
+ PAYLOAD_DIRECTORY,
+
+ /**
+ * Bag payload itself (e.g. content in the {@code data/} directory).
+ */
+ PAYLOAD_CONTENT,
+
+ /**
+ * Tag file corresponding to the {@code bagit.txt} file, at the base of the bag.
+ */
+ BAG_DECL,
+
+ /**
+ * Tag file corresponding to the {@code bag-info.txt} file, at the base of the bag.
+ */
+ BAG_INFO,
+
+ /**
+ * Tag file(s) corresponding to the payload {@code manifest-<algorithm>.txt} file, at the base of the bag.
+ */
+ PAYLOAD_MANIFEST,
+
+ /**
+ * Tag file(s) corresponding to the {@code tagmanifest-<algorithm>.txt} file, at the base of the bag.
+ */
+ TAG_MANIFEST,
+
+ /**
+ * Tag file corresponding to the {@code fetch.txt} file, at the base of the bag.
+ */
+ FETCH,
+
+ /**
+ * Additional tag files that are not covered by the BagIt specification.
+ */
+ OTHER_TAG
+}
From 2715c44123c5d58cd05947a684080c8c54cceb8d Mon Sep 17 00:00:00 2001
From: Elliot Metsger
Date: Wed, 9 Sep 2015 22:11:57 -0400
Subject: [PATCH 4/9] DC-2101: Initial Token class and tests. Tokens are
strings with special meaning used to make up expressions.
---
.../dataconservancy/bagit/rules/Token.java | 138 ++++++++++++++++++
.../bagit/rules/TokenTest.java | 92 ++++++++++++
2 files changed, 230 insertions(+)
create mode 100644 dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Token.java
create mode 100644 dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/TokenTest.java
diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Token.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Token.java
new file mode 100644
index 00000000..af8f7643
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Token.java
@@ -0,0 +1,138 @@
+/*
+ *
+ * * Copyright 2015 Johns Hopkins University
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at
+ * *
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing, software
+ * * distributed under the License is distributed on an "AS IS" BASIS,
+ * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * * See the License for the specific language governing permissions and
+ * * limitations under the License.
+ *
+ */
+
+package org.dataconservancy.bagit.rules;
+
+import static org.dataconservancy.bagit.rules.Message.ERR_NULL;
+
+/**
+ * Tokens are strings that make up location expressions. Location expressions are patterns that are matched against
+ * paths. Location expressions are inspired by Apache Ant file pattern matching.
+ */
+enum Token {
+
+ /**
+ * A token matching exactly one character in an expression.
+ */
+ EXACTLY_ONE_CHARACTER("?"),
+
+ /**
+ * A token that will match multiple directory levels in an expression.
+ */
+ DIRECTORY("**"),
+
+ /**
+ * A token matching zero or more characters in an expression. Must always be defined sometime after
+ * {@link #DIRECTORY}: {@code parse} checks tokens in declaration order, and would otherwise reject {@code **}.
+ */
+ ZERO_OR_MORE_CHARACTERS("*"),
+
+
+ /**
+ * A token that separates path segments in an expression.
+ */
+ PATH_SEPARATOR("/"),
+
+ /**
+ * A special token with a {@code null} token string. Must always be defined last
+ */
+ LITERAL();
+
+ private static final String ERR_MULTIPLE_TOKENS = "Candidate sequence '%s' contains multiple tokens. " +
+ "Try splitting up the tokens and submitting the tokens one at a time.";
+
+ /**
+ * String representation of the token, if there is one.
+ */
+ private String tokenString;
+
+ /**
+ * Construct a Token with no string representation. Currently reserved for {@link #LITERAL} tokens.
+ */
+ private Token() {
+ this.tokenString = null;
+ }
+
+ /**
+ * Construct a token with the supplied string representation.
+ *
+ * @param tokenString the string representation of the token.
+ * @throws java.lang.IllegalArgumentException if the {@code tokenString} is {@code null}
+ */
+ private Token(String tokenString) {
+ if (tokenString == null) {
+ throw new IllegalArgumentException(String.format(ERR_NULL, "tokenString"));
+ }
+ this.tokenString = tokenString;
+ }
+
+ /**
+ * Obtain the string form of the token, may be {@code null}. {@link #LITERAL} tokens will not
+ * have a string form, because a literal is the set of characters that do not represent a token.
+ *
+ * @return the string form of the token, or {@code null} in the case of {@code LITERAL} tokens.
+ */
+ String getTokenString() {
+ return tokenString;
+ }
+
+ /**
+ * Attempts to parse a string which represents a single token into a {@code Token}
+ *
+ * @param candidate the candidate token string; NOTE(review): matched via String.equals, so only a String can equal a token
+ * @return a {@code Token} if {@code candidate} represents a valid token
+ * @throws java.lang.IllegalArgumentException if {@code candidate} does not represent a valid token
+ */
+ static Token parse(CharSequence candidate) {
+ if (candidate == null) {
+ throw new IllegalArgumentException(String.format(ERR_NULL, "candidate"));
+ }
+
+ for (Token m : Token.values()) {
+
+ // See if the candidate token string equals the string representation
+ // of the token (except LITERAL), and return it
+ if (m.tokenString != null && m.tokenString.equals(candidate)) {
+ return m;
+ }
+
+ // Check to see if the candidate token string _contains_ the string representation
+ // of the token (except LITERAL); the test is per character: any candidate character that appears in the
+ // token string counts. If so, the candidate contains multiple tokens, which isn't allowed.
+ if (candidate.length() > 1 &&
+ m.tokenString != null &&
+ candidate.chars().anyMatch(
+ c -> m.tokenString.contains(Character.toString((char) c)))) {
+ throw new IllegalArgumentException(String.format(ERR_MULTIPLE_TOKENS, candidate));
+ }
+ }
+
+ // None of our Token string representations equaled the candidate string.
+ // The candidate string did not _contain_ any of the Token string representations
+ // We must be left with a LITERAL.
+
+ return LITERAL;
+ }
+
+ @Override
+ public String toString() {
+ return "Token{" +
+ "tokenString='" + tokenString + '\'' +
+ '}';
+ }
+}
diff --git a/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/TokenTest.java b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/TokenTest.java
new file mode 100644
index 00000000..d81d2b9c
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/TokenTest.java
@@ -0,0 +1,92 @@
+/*
+ *
+ * * Copyright 2015 Johns Hopkins University
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at
+ * *
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing, software
+ * * distributed under the License is distributed on an "AS IS" BASIS,
+ * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * * See the License for the specific language governing permissions and
+ * * limitations under the License.
+ *
+ */
+
+package org.dataconservancy.bagit.rules;
+
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Ensures that the {@link Token} class properly parses tokens.
+ */
+public class TokenTest {
+
+ /**
+ * Tokens are strings with special meanings. Ensure all single-character tokens can be parsed.
+ */
+ @Test
+ public void testParseSingleCharacterString() throws Exception {
+ assertEquals(Token.ZERO_OR_MORE_CHARACTERS, Token.parse("*"));
+ assertEquals(Token.EXACTLY_ONE_CHARACTER, Token.parse("?"));
+ assertEquals(Token.LITERAL, Token.parse("f"));
+ assertEquals(Token.PATH_SEPARATOR, Token.parse("/"));
+ }
+
+ /**
+ * Tokens are strings with special meanings. Ensure all multi-character tokens can be parsed.
+ */
+ @Test
+ public void testParseMultipleCharacterStrings() throws Exception {
+ assertEquals(Token.DIRECTORY, Token.parse("**"));
+ assertEquals(Token.LITERAL, Token.parse("foobarbaz"));
+ }
+
+ /**
+ * Attempting to parse a string with multiple tokens is an error.
+ */
+ @Test(expected = IllegalArgumentException.class)
+ public void testParseSingleStringContainingDifferentTokens() throws Exception {
+ Token.parse("*/?**abc");
+ }
+
+ /**
+ * Attempting to parse a string with multiple tokens is an error. (Just another case similar to above)
+ */
+ @Test(expected = IllegalArgumentException.class)
+ public void testParseLiteralEndingWithPathSep() throws Exception {
+ Token.parse("directory/");
+ }
+
+ /**
+ * Zero length strings would be parsed as a literal.
+ */
+ @Test
+ public void testParseZeroLengthString() throws Exception {
+ assertEquals(Token.LITERAL, Token.parse(""));
+ }
+
+ /**
+ * Empty strings would be parsed as a literal.
+ */
+ @Test
+ public void testParseEmptyString() throws Exception {
+ assertEquals(Token.LITERAL, Token.parse(" "));
+ }
+
+ /**
+ * Parsing {@code null} results in an error
+ *
+ * @throws Exception
+ */
+ @Test(expected = IllegalArgumentException.class)
+ public void testParseNull() throws Exception {
+ assertEquals(Token.LITERAL, Token.parse(null));
+ }
+}
\ No newline at end of file
From 68b8f2eb8ca866f5236580c830f3f8008ff40199 Mon Sep 17 00:00:00 2001
From: Elliot Metsger
Date: Thu, 10 Sep 2015 23:12:13 -0400
Subject: [PATCH 5/9] DC-2101: Added the concept of a BoundToken: a Token that
is bound to a value. BoundToken doesn't quite pass the smell test because it
really only serves a purpose for LITERAL tokens. The other tokens already
have a value in their 'tokenString' field.
---
.../bagit/rules/BoundToken.java | 72 +++++++++++++++++++
1 file changed, 72 insertions(+)
create mode 100644 dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/BoundToken.java
diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/BoundToken.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/BoundToken.java
new file mode 100644
index 00000000..9bff74fa
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/BoundToken.java
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2015 Johns Hopkins University
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dataconservancy.bagit.rules;
+
+/**
+ * Binds a {@link Token} to the string that it represents. Most Tokens have their strings bound already, by the
+ * {@link org.dataconservancy.bagit.rules.Token#getTokenString()} method:
+ * <dl>
+ * <dt>{@code PATH_SEPARATOR}:</dt><dd>{@code /}</dd>
+ * <dt>{@code EXACTLY_ONE_CHARACTER}:</dt><dd>{@code ?}</dd>
+ * <dt>{@code ZERO_OR_MORE_CHARACTERS}:</dt><dd>{@code *}</dd>
+ * <dt>{@code DIRECTORY}:</dt><dd>{@code **}</dd>
+ * </dl>
+ * The exception is the {@link Token#LITERAL LITERAL token}, because it isn't known, a priori, what the
+ * literal characters will be.
+ *
+ * Therefore this class is mostly redundant, and may fail the smell test, but it serves to bind the string
+ * representation to all Tokens, useful really for only the {@code LITERAL} token.
+ * Neither field is validated: either may be {@code null}, which {@code equals} and {@code hashCode} tolerate.
+ */
+class BoundToken {
+
+ String bound;
+ Token token;
+
+ BoundToken(Token token, String toBind) {
+ this.token = token;
+ this.bound = toBind;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+
+ BoundToken that = (BoundToken) o;
+
+ if (bound != null ? !bound.equals(that.bound) : that.bound != null) return false;
+ if (token != that.token) return false;
+
+ return true;
+ }
+
+ @Override
+ public int hashCode() {
+ int result = bound != null ? bound.hashCode() : 0;
+ result = 31 * result + (token != null ? token.hashCode() : 0);
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ return "BoundToken{" +
+ "bound='" + bound + '\'' +
+ ", token=" + token +
+ '}';
+ }
+}
From 0faf54a0fc32b4cb62ed2e9a9b7fe5c4eefe6674 Mon Sep 17 00:00:00 2001
From: Elliot Metsger
Date: Thu, 10 Sep 2015 23:19:42 -0400
Subject: [PATCH 6/9] DC-2101: Additional parsing logic to Token, unit tests,
javadoc 1. Added a parseString(...) method to Token, which will return a
List containing all the Tokens in the supplied String. 2.
Changed the behavior of parse(...) to return BoundToken instead of Token. 3.
Changed behavior of parse(...) to return a BoundToken for every literal
character encountered. Before it would return a single LITERAL token even if
multiple literal characters were encountered. The behavior of
parseString(...) also returns a LITERAL BoundToken for each literal
character.
The behaviors of parse(...) and parseString(...) differ, however, and this may be fixed in the future:
- parse("**") returns a single BoundToken(DIR, "**")
- parseString("**") returns a List containing two BoundToken(ZERO_OR_MORE_CHARACTERS, "*")
---
.../bagit/rules/BoundToken.java | 28 +++++
.../dataconservancy/bagit/rules/Token.java | 41 +++++-
.../bagit/rules/BoundTokensTestUtil.java | 119 ++++++++++++++++++
.../bagit/rules/TokenTest.java | 108 ++++++++++++++--
4 files changed, 281 insertions(+), 15 deletions(-)
create mode 100644 dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/BoundTokensTestUtil.java
diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/BoundToken.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/BoundToken.java
index 9bff74fa..589bef0e 100644
--- a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/BoundToken.java
+++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/BoundToken.java
@@ -42,6 +42,34 @@ class BoundToken {
this.bound = toBind;
}
+ /**
+ * Return true if the value bound to this token is exactly one character.
+ *
+ * @return true if the bound value is exactly one character; false otherwise, including when no value is bound.
+ */
+ boolean isSingleChar() {
+ return bound != null && bound.length() == 1;
+ }
+
+ /**
+ * Return the first character of the bound value as a character.
+ * The bound value must hold at least one character: {@code charAt(0)} fails on an empty or {@code null} value.
+ * @return the first character of the bound value.
+ */
+ char asChar() {
+ return bound.charAt(0);
+ }
+
+ /**
+ * Return the entire bound value as a character array. This is what you would
+ * use if {@link #isSingleChar()} was false. The bound value must not be {@code null}.
+ *
+ * @return the bound value as a character array.
+ */
+ char[] asCharArray() {
+ return bound.toCharArray();
+ }
+
@Override
public boolean equals(Object o) {
if (this == o) return true;
diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Token.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Token.java
index af8f7643..dc6b96b4 100644
--- a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Token.java
+++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Token.java
@@ -18,6 +18,10 @@
package org.dataconservancy.bagit.rules;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
import static org.dataconservancy.bagit.rules.Message.ERR_NULL;
/**
@@ -98,8 +102,8 @@ String getTokenString() {
* @return a {@code Token} if {@code candidate} represents a valid token
* @throws java.lang.IllegalArgumentException if {@code candidate} does not represent a valid token
*/
- static Token parse(CharSequence candidate) {
- if (candidate == null) {
+ static BoundToken parse(CharSequence candidate) {
+ if (candidate == null || candidate.length() == 0) {
throw new IllegalArgumentException(String.format(ERR_NULL, "candidate"));
}
@@ -108,7 +112,7 @@ static Token parse(CharSequence candidate) {
// See if the candidate token string equals the string representation
// of the token (except LITERAL), and return it
if (m.tokenString != null && m.tokenString.equals(candidate)) {
- return m;
+ return new BoundToken(m, candidate.toString());
}
// Check to see if the candidate token string _contains_ the string representation
@@ -126,7 +130,36 @@ static Token parse(CharSequence candidate) {
// The candidate string did not _contain_ any of the Token string representations
// We must be left with a LITERAL.
- return LITERAL;
+ return new BoundToken(Token.LITERAL, candidate.toString());
+ }
+
+ /**
+ * Parses each character of {@code candidate} into its own {@code BoundToken}; {@code DIRECTORY} is never produced.
+ *
+ * @param candidate the characters to tokenize, must not be {@code null} or empty
+ * @return one {@code BoundToken} per character, in the order the characters appear in {@code candidate}
+ * @throws java.lang.IllegalArgumentException if {@code candidate} is {@code null} or empty
+ */
+ static List<BoundToken> parseString(CharSequence candidate) {
+ if (candidate == null || candidate.length() == 0) {
+ throw new IllegalArgumentException(String.format(ERR_NULL, "candidate"));
+ }
+
+ // Map each character in the sequence to a BoundToken.
+ return candidate.chars().mapToObj(c -> {
+ // Cast the int to a char, and represent it as a String
+ String s = String.valueOf((char) c);
+
+ // A single character can equal at most one token string (LITERAL has none), so stop at the first match.
+ for (Token t : Token.values()) {
+ if (t.getTokenString() != null && t.getTokenString().equals(s)) {
+ return new BoundToken(t, s);
+ }
+ }
+
+ // If there was no match, then we must have a LITERAL.
+ return new BoundToken(LITERAL, s);
+ }).collect(Collectors.toList());
 }
@Override
diff --git a/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/BoundTokensTestUtil.java b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/BoundTokensTestUtil.java
new file mode 100644
index 00000000..6cfae0b4
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/BoundTokensTestUtil.java
@@ -0,0 +1,119 @@
+/*
+ *
+ * * Copyright 2015 Johns Hopkins University
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at
+ * *
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing, software
+ * * distributed under the License is distributed on an "AS IS" BASIS,
+ * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * * See the License for the specific language governing permissions and
+ * * limitations under the License.
+ *
+ */
+
+package org.dataconservancy.bagit.rules;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * BoundTokens that are shared across unit tests.
+ */
+class BoundTokensTestUtil {
+
+ /**
+ * The {@code BoundToken} version of {@link Token#ZERO_OR_MORE_CHARACTERS}
+ */
+ static final BoundToken ZERO_OR_MORE = new BoundToken(Token.ZERO_OR_MORE_CHARACTERS, "*");
+
+ /**
+ * The {@code BoundToken} version of {@link Token#ZERO_OR_MORE_CHARACTERS}, in a single element List
+ */
+ static final List<BoundToken> ZERO_OR_MORE_L = Arrays.asList(
+ new BoundToken(Token.ZERO_OR_MORE_CHARACTERS, "*"));
+
+ /**
+ * The {@code BoundToken} version of {@link Token#EXACTLY_ONE_CHARACTER}
+ */
+ static final BoundToken EXACTLY_ONE = new BoundToken(Token.EXACTLY_ONE_CHARACTER, "?");
+
+ /**
+ * The {@code BoundToken} version of {@link Token#EXACTLY_ONE_CHARACTER}, in a single element List
+ */
+ static final List<BoundToken> EXACTLY_ONE_L = Arrays.asList(
+ new BoundToken(Token.EXACTLY_ONE_CHARACTER, "?"));
+
+ /**
+ * The {@code BoundToken} version of {@link Token#PATH_SEPARATOR}
+ */
+ static final BoundToken PATH_SEP = new BoundToken(Token.PATH_SEPARATOR, "/");
+
+ /**
+ * The {@code BoundToken} version of {@link Token#PATH_SEPARATOR}, in a single element List
+ */
+ static final List<BoundToken> PATH_SEP_L = Arrays.asList(
+ new BoundToken(Token.PATH_SEPARATOR, "/"));
+
+ /**
+ * The {@code BoundToken} version of {@link Token#DIRECTORY}
+ */
+ static final BoundToken DIR = new BoundToken(Token.DIRECTORY, "**");
+
+ /**
+ * The {@code BoundToken} version of {@link Token#DIRECTORY}, represented as a List containing two
+ * {@link #ZERO_OR_MORE} BoundTokens.
+ */
+ static final List<BoundToken> DIR_L = Arrays.asList(ZERO_OR_MORE, ZERO_OR_MORE);
+
+ /**
+ * Convenience method for creating a {@link Token#LITERAL literal} token for each character in {@code s}. It does
+ * not evaluate the characters in {@code s} for whether or not they should actually be made literals. That is the
+ * responsibility of the developer. (For example, this method will happily make literal tokens of "*", "?", and
+ * "/", which are not allowed by {@link Token#parse(CharSequence)}.)
+ *
+ * @param s the string to represent as a List of BoundTokens
+ * @return a List containing LITERAL BoundTokens for each character in {@code s}
+ */
+ static List<BoundToken> literalsForString(String s) {
+ List<BoundToken> literals = new ArrayList<>();
+ s.chars().forEach(c -> literals.add(new BoundToken(Token.LITERAL, String.valueOf((char) c))));
+ return literals;
+ }
+
+ /**
+ * Asserts that the values in the expected and actual Lists are equal. This method will assert that
+ * the lists are the same size before comparing their values.
+ *
+ * @param expected the expected List of BoundTokens
+ * @param actual the actual List of BoundTokens, normally representing a test result.
+ */
+ static void assertTokenListEquals(List<BoundToken> expected, List<BoundToken> actual) {
+ assertExpectedListCount(expected.size(), actual);
+ for (int i = 0; i < expected.size(); i++) {
+ assertEquals("Expected token: '" + expected.get(i) + "' but found '" + actual.get(i) + "'",
+ expected.get(i), actual.get(i));
+ }
+ }
+
+ /**
+ * Asserts that the supplied list of BoundTokens has the expected count.
+ *
+ * @param expectedCount the expected number of BoundTokens in {@code actual}
+ * @param actual a List of BoundTokens, normally representing the result of a test.
+ */
+ static void assertExpectedListCount(int expectedCount, List<BoundToken> actual) {
+ assertEquals("Expected " + expectedCount + " BoundTokens, found " + actual.size() + ": " +
+ actual.stream().map(bt -> "['" + bt.token.name() + "', '" + bt.bound + "']")
+ .collect(Collectors.joining(", ")),
+ expectedCount, actual.size());
+ }
+}
diff --git a/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/TokenTest.java b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/TokenTest.java
index d81d2b9c..99db35dd 100644
--- a/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/TokenTest.java
+++ b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/TokenTest.java
@@ -21,6 +21,17 @@
import org.junit.Test;
+import java.util.Arrays;
+
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.DIR;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.DIR_L;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.EXACTLY_ONE;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.EXACTLY_ONE_L;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.PATH_SEP;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.PATH_SEP_L;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.ZERO_OR_MORE;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.ZERO_OR_MORE_L;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.assertTokenListEquals;
import static org.junit.Assert.assertEquals;
/**
@@ -33,10 +44,18 @@ public class TokenTest {
*/
@Test
public void testParseSingleCharacterString() throws Exception {
- assertEquals(Token.ZERO_OR_MORE_CHARACTERS, Token.parse("*"));
- assertEquals(Token.EXACTLY_ONE_CHARACTER, Token.parse("?"));
- assertEquals(Token.LITERAL, Token.parse("f"));
- assertEquals(Token.PATH_SEPARATOR, Token.parse("/"));
+
+ // With parse(...)
+ assertEquals(ZERO_OR_MORE, Token.parse("*"));
+ assertEquals(EXACTLY_ONE, Token.parse("?"));
+ assertEquals(new BoundToken(Token.LITERAL, "f"), Token.parse("f"));
+ assertEquals(PATH_SEP, Token.parse("/"));
+
+ // With parseString(...)
+ assertTokenListEquals(ZERO_OR_MORE_L, Token.parseString("*"));
+ assertTokenListEquals(EXACTLY_ONE_L, Token.parseString("?"));
+ assertTokenListEquals(Arrays.asList(new BoundToken(Token.LITERAL, "f")), Token.parseString("f"));
+ assertTokenListEquals(PATH_SEP_L, Token.parseString("/"));
}
/**
@@ -44,32 +63,82 @@ public void testParseSingleCharacterString() throws Exception {
*/
@Test
public void testParseMultipleCharacterStrings() throws Exception {
- assertEquals(Token.DIRECTORY, Token.parse("**"));
- assertEquals(Token.LITERAL, Token.parse("foobarbaz"));
+ // With parse(...)
+ assertEquals(DIR, Token.parse("**"));
+ assertEquals(new BoundToken(Token.LITERAL, "foobarbaz"), Token.parse("foobarbaz"));
+
+ // With parseString(...)
+ assertTokenListEquals(DIR_L, Token.parseString("**"));
+ assertTokenListEquals(
+ Arrays.asList(new BoundToken(Token.LITERAL, "f"), new BoundToken(Token.LITERAL, "o"),
+ new BoundToken(Token.LITERAL, "o"), new BoundToken(Token.LITERAL, "b"),
+ new BoundToken(Token.LITERAL, "a"), new BoundToken(Token.LITERAL, "r"),
+ new BoundToken(Token.LITERAL, "b"), new BoundToken(Token.LITERAL, "a"),
+ new BoundToken(Token.LITERAL, "z")), Token.parseString("foobarbaz"));
}
/**
* Attempting to parse a string with multiple tokens is an error.
+ * Legal with {@link #testParseStringSingleStringContainingDifferentTokens}.
*/
@Test(expected = IllegalArgumentException.class)
public void testParseSingleStringContainingDifferentTokens() throws Exception {
+ // With parse(...)
Token.parse("*/?**abc");
}
/**
- * Attempting to parse a string with multiple tokens is an error. (Just another case similar to above)
+ * Attempting to parseString a string with multiple tokens is ok.
+ * An error with {@link #testParseSingleStringContainingDifferentTokens()}
+ */
+ @Test
+ public void testParseStringSingleStringContainingDifferentTokens() throws Exception {
+ // With parseString(...)
+ assertTokenListEquals(Arrays.asList(ZERO_OR_MORE, PATH_SEP, EXACTLY_ONE, ZERO_OR_MORE, ZERO_OR_MORE,
+ new BoundToken(Token.LITERAL, "a"), new BoundToken(Token.LITERAL, "b"),
+ new BoundToken(Token.LITERAL, "c")), Token.parseString("*/?**abc"));
+ }
+
+ /**
+ * Attempting to parse a string with multiple tokens is an error. Essentially the same test as
+ * {@link #testParseSingleStringContainingDifferentTokens()}. Note this is legal with
+ * {@link #testParseStringLiteralEndingWithPathSep()}.
*/
@Test(expected = IllegalArgumentException.class)
public void testParseLiteralEndingWithPathSep() throws Exception {
+ // With parse(...)
Token.parse("directory/");
}
/**
- * Zero length strings would be parsed as a literal.
+ * Legal form of {@link #testParseLiteralEndingWithPathSep()}.
*/
@Test
+ public void testParseStringLiteralEndingWithPathSep() throws Exception {
+ // With parseString(...)
+ assertTokenListEquals(Arrays.asList(new BoundToken(Token.LITERAL, "d"), new BoundToken(Token.LITERAL, "i"),
+ new BoundToken(Token.LITERAL, "r"), new BoundToken(Token.LITERAL, "e"),
+ new BoundToken(Token.LITERAL, "c"), new BoundToken(Token.LITERAL, "t"),
+ new BoundToken(Token.LITERAL, "o"), new BoundToken(Token.LITERAL, "r"),
+ new BoundToken(Token.LITERAL, "y"), PATH_SEP), Token.parseString("directory/"));
+ }
+
+ /**
+ * Parsing zero length strings results in an error.
+ */
+ @Test(expected = IllegalArgumentException.class)
public void testParseZeroLengthString() throws Exception {
- assertEquals(Token.LITERAL, Token.parse(""));
+ // With parse(...)
+ assertEquals(new BoundToken(Token.LITERAL, ""), Token.parse(""));
+ }
+
+ /**
+ * Parsing zero length strings results in an error.
+ */
+ @Test(expected = IllegalArgumentException.class)
+ public void testParseStringZeroLengthString() throws Exception {
+ // With parseString(...)
+ assertTokenListEquals(Arrays.asList(new BoundToken(Token.LITERAL, "")), Token.parseString(""));
}
/**
@@ -77,7 +146,11 @@ public void testParseZeroLengthString() throws Exception {
*/
@Test
public void testParseEmptyString() throws Exception {
- assertEquals(Token.LITERAL, Token.parse(" "));
+ // With parse(...)
+ assertEquals(new BoundToken(Token.LITERAL, " "), Token.parse(" "));
+
+ // With parseString(...)
+ assertTokenListEquals(Arrays.asList(new BoundToken(Token.LITERAL, " ")), Token.parseString(" "));
}
/**
@@ -87,6 +160,19 @@ public void testParseEmptyString() throws Exception {
*/
@Test(expected = IllegalArgumentException.class)
public void testParseNull() throws Exception {
- assertEquals(Token.LITERAL, Token.parse(null));
+ // With parse(...)
+ assertEquals(new BoundToken(Token.LITERAL, null), Token.parse(null));
+ }
+
+ /**
+ * Parsing {@code null} with parseString is also an error
+ *
+ * @throws Exception
+ */
+ @Test(expected = IllegalArgumentException.class)
+ public void testParseStringNull() throws Exception {
+ // With parseString(...)
+ assertTokenListEquals(Arrays.asList(new BoundToken(Token.LITERAL, null)), Token.parseString(null));
}
+
}
\ No newline at end of file
From 1b0a6db796eb3223a8b3b8ae12dc34bc0afe64cb Mon Sep 17 00:00:00 2001
From: Elliot Metsger
Date: Sun, 13 Sep 2015 11:19:01 -0400
Subject: [PATCH 7/9] DC-2101: Initial Expression and ExpressionMatcher class,
with tests and Javadoc. Expression represents a path, or a pattern to match
a path. The ExpressionMatcher is responsible for matching a path Expression
against a pattern Expression. There is still some work to do here with
regard to path separators, and tokenizing Expression strings that end with
"/".
There is one unit test to resolve, and some more Javadoc to do, class/method level as well as package level.
---
.../bagit/rules/Expression.java | 141 ++++
.../bagit/rules/ExpressionMatcher.java | 701 ++++++++++++++++++
.../bagit/rules/ExpressionMatcherTest.java | 393 ++++++++++
.../bagit/rules/ExpressionTest.java | 101 +++
4 files changed, 1336 insertions(+)
create mode 100644 dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Expression.java
create mode 100644 dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java
create mode 100644 dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionMatcherTest.java
create mode 100644 dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionTest.java
diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Expression.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Expression.java
new file mode 100644
index 00000000..4fcd862e
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/Expression.java
@@ -0,0 +1,141 @@
+/*
+ *
+ * * Copyright 2015 Johns Hopkins University
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at
+ * *
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing, software
+ * * distributed under the License is distributed on an "AS IS" BASIS,
+ * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * * See the License for the specific language governing permissions and
+ * * limitations under the License.
+ *
+ */
+
+package org.dataconservancy.bagit.rules;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * An Expression is a String that represents a hierarchical path. An Expression may represent a path, or a pattern
+ * meant to match a path.
+ *
+ * Even though a "path" and a "pattern" are both instances of an {@link Expression}, their semantics differ. A
+ * "path" only contains literal and path separator tokens. A "pattern" may contain literals, path separators, and
+ * matching tokens like '*' and '?'. Path segments are the tokens between consecutive path separators, addressable
+ * by their zero-indexed {@link org.dataconservancy.bagit.rules.Expression#depth() depth}. For example, the Expression
+ * '/foo/bar/baz.txt' has three path segments, 'foo' (depth = 0), 'bar' (depth = 1), and 'baz.txt' (depth = 2). The
+ * depth of the Expression is 2.
+ *
+ */
+public class Expression {
+
+    /**
+     * Tokens that make up this Expression, with the left-most token at the head of the list.
+     */
+    private final List<BoundToken> tokens;
+
+    /**
+     * A copy of {@link #tokens} from which at most one leading and one trailing path separator token have been
+     * removed. This form is more amenable to stream operations.
+     */
+    private final List<BoundToken> sanitized;
+
+    /**
+     * Cache of path segments keyed by depth, filled lazily by {@link #getPathSegment(int)}. A path segment is the
+     * List of BoundTokens that lie between consecutive path separators, so it never contains a path separator.
+     */
+    private final ConcurrentHashMap<Integer, List<BoundToken>> segments = new ConcurrentHashMap<>();
+
+    /**
+     * Creates a new {@code Expression} from the supplied string, which normally resembles a hierarchical path or a
+     * pattern matching one. A token list that is empty after stripping (e.g. a lone path separator) is tolerated.
+     *
+     * @param expression a string representing an expression.
+     */
+    public Expression(String expression) {
+        this.tokens = ExpressionTokenizer.tokenize(expression);
+        this.sanitized = new ArrayList<>(this.tokens);
+        if (!sanitized.isEmpty() && sanitized.get(0).token == Token.PATH_SEPARATOR) {
+            sanitized.remove(0);
+        }
+        // Re-check emptiness: once the only token has been stripped there is no last element to inspect.
+        if (!sanitized.isEmpty() && sanitized.get(sanitized.size() - 1).token == Token.PATH_SEPARATOR) {
+            sanitized.remove(sanitized.size() - 1);
+        }
+    }
+
+    /**
+     * The entire list of tokens that make up this {@code Expression}, including all path separators.
+     *
+     * @return the tokens that make up this {@code Expression}
+     */
+    List<BoundToken> getTokens() {
+        return tokens;
+    }
+
+    /**
+     * A zero-based index representing the depth of the {@code Expression}.
+     * <dl>
+     *   <dt>{@code /}</dt><dd>depth == 0</dd>
+     *   <dt>{@code dir/}</dt><dd>depth == 0</dd>
+     *   <dt>{@code /dir}</dt><dd>depth == 0</dd>
+     *   <dt>{@code /dir/foo}</dt><dd>depth == 1</dd>
+     *   <dt>{@code /dir/foo/bar.txt}</dt><dd>depth == 2</dd>
+     *   <dt><code>**&#47;*.java</code></dt><dd>depth == 1</dd>
+     * </dl>
+     *
+     * @return the depth of this {@code Expression}, always 0 or greater.
+     */
+    public int depth() {
+        return (int) sanitized.stream().filter(bt -> bt.token == Token.PATH_SEPARATOR).count();
+    }
+
+    /**
+     * A path segment is the run of tokens that occurs between two consecutive path separators. This method obtains
+     * the tokens of the path segment at the specified {@code depth}. Path separator tokens are never included in
+     * the returned list. Results are cached, so repeated calls with the same depth return the same List instance.
+     *
+     * @param depth the zero-indexed depth of the path segment to retrieve
+     * @return the tokens making up the path segment, or an empty List if the depth is out of bounds
+     */
+    public List<BoundToken> getPathSegment(int depth) {
+        return segments.computeIfAbsent(depth, (d) -> {
+            List<BoundToken> pathSegment = new ArrayList<>();
+            int i = 0;
+            for (BoundToken t : sanitized) {
+                if (i > d) {
+                    // walked past the requested segment, nothing more to record
+                    break;
+                }
+
+                if (t.token == Token.PATH_SEPARATOR) {
+                    // a separator closes the current segment and opens the next one
+                    i++;
+                    // separators themselves are never recorded
+                    continue;
+                }
+
+                // only tokens lying at exactly the requested depth belong to the segment
+                if (i == d) {
+                    // record the token
+                    pathSegment.add(t);
+                }
+            }
+
+            return pathSegment;
+        });
+    }
+
+    @Override
+    public String toString() {
+        return tokens.stream()
+                .collect(StringBuilder::new, (s, bt) -> s.append(bt.bound), StringBuilder::append).toString();
+    }
+}
diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java
new file mode 100644
index 00000000..6aea666f
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java
@@ -0,0 +1,701 @@
+/*
+ *
+ * * Copyright 2015 Johns Hopkins University
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at
+ * *
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing, software
+ * * distributed under the License is distributed on an "AS IS" BASIS,
+ * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * * See the License for the specific language governing permissions and
+ * * limitations under the License.
+ *
+ */
+
+package org.dataconservancy.bagit.rules;
+
+import java.util.List;
+
+/**
+ * Responsible for matching an Expression representing a path against a pattern. This is quite possibly the most
+ * heavy-weight string parsing library you'll ever encounter. It is inspired by Ant-style pattern matching, and
+ * attempts to follow the same rules as the Ant
+ * implementation:
+ *
+ *
+ * These patterns look very much like the patterns used in DOS and UNIX:
+ *
+ * '*' matches zero or more characters, '?' matches one character.
+ *
+ * In general, patterns are considered relative paths, relative to a task dependent base directory (the dir attribute in
+ * the case of {@code <fileset>}). Only files found below that base directory are considered. So while a pattern like
+ * ../foo.java is possible, it will not match anything when applied since the base directory's parent is never scanned
+ * for files.
+ *
+ * Examples:
+ *
+ * {@code *.java} matches .java, x.java and FooBar.java, but not FooBar.xml (does not end with .java).
+ *
+ * ?.java matches x.java, A.java, but not .java or xyz.java (both don't have one character before .java).
+ *
+ * Combinations of *'s and ?'s are allowed.
+ *
+ * Matching is done per-directory. This means that first the first directory in the pattern is matched against the first
+ * directory in the path to match. Then the second directory is matched, and so on. For example, when we have the
+ * pattern /?abc/*&#47;*.java and the path /xabc/foobar/test.java, the first ?abc is matched with
+ * xabc, then * is matched with foobar, and finally *.java is matched with test.java. They all match, so the path
+ * matches the pattern.
+ *
+ * To make things a bit more flexible, we add one extra feature, which makes it possible to match multiple directory
+ * levels. This can be used to match a complete directory tree, or a file anywhere in the directory tree. To do this,
+ * ** must be used as the name of a directory. When ** is used as the name of a directory in the
+ * pattern, it matches zero or more directories. For example: /test/** matches all files/directories under
+ * /test/, such as /test/x.java, or /test/foo/bar/xyz.html, but not /xyz.xml.
+ *
+ * There is one "shorthand": if a pattern ends with / or \, then ** is appended. For example,
+ * mypackage/test/ is interpreted as if it were mypackage/test/**.
+ *
+ *
+ *
+ * Even though a "path" and a "pattern" are both instances of an {@link Expression}, their semantics differ. A
+ * "path" only contains literal and path separator tokens. A "pattern" may contain literals, path separators, and
+ * matching tokens like '*' and '?'. Path segments are the tokens between consecutive path separators, addressable
+ * by their zero-indexed {@link org.dataconservancy.bagit.rules.Expression#depth() depth}. For example, the Expression
+ * '/foo/bar/baz.txt' has three path segments, 'foo' (depth = 0), 'bar' (depth = 1), and 'baz.txt' (depth = 2). The
+ * depth of the Expression is 2.
+ *
+ *
+ * Note that methods on this class are package-private, and are not meant to be exposed publicly.
+ *
+ */
+public class ExpressionMatcher {
+
+ /**
+ * A {@code BoundToken} pairing {@code Token.ZERO_OR_MORE_CHARACTERS} with its own token string (i.e. '*').
+ * See {@link #zero_plus} for the {@code char} analog, which the constructor derives from this constant.
+ */
+ private static final BoundToken ZERO_OR_MORE = new BoundToken(Token.ZERO_OR_MORE_CHARACTERS,
+ Token.ZERO_OR_MORE_CHARACTERS.getTokenString());
+
+ /**
+ * A {@code BoundToken} pairing {@code Token.EXACTLY_ONE_CHARACTER} with its own token string (i.e. '?').
+ * See {@link #exactly_one} for the {@code char} analog, which the constructor derives from this constant.
+ */
+ private static final BoundToken EXACTLY_ONE = new BoundToken(Token.EXACTLY_ONE_CHARACTER,
+ Token.EXACTLY_ONE_CHARACTER.getTokenString());
+
+ /**
+ * The {@code char} analog of {@link #EXACTLY_ONE}, assigned once by the constructor via {@code asChar()}.
+ */
+ private final char exactly_one;
+
+ /**
+ * The {@code char} analog of {@link #ZERO_OR_MORE}, assigned once by the constructor via {@code asChar()}.
+ */
+ private final char zero_plus;
+
+ /**
+ * Constructs a new instance of a matcher.
+ * TODO: probably could be private and methods be made static.
+ */
+    ExpressionMatcher() {
+        // Matching works on individual chars, so each matching token must be exactly one character long.
+        if (!EXACTLY_ONE.isSingleChar()) {
+            throw new RuntimeException("Implementation doesn't handle multi-character token: " +
+                    Token.EXACTLY_ONE_CHARACTER);
+        }
+        // keep the char form around for direct comparison with pattern characters
+        exactly_one = EXACTLY_ONE.asChar();
+
+        // the same requirement and caching applies to the zero-or-more token
+        if (!ZERO_OR_MORE.isSingleChar()) {
+            throw new RuntimeException("Implementation doesn't handle multi-character token: " +
+                    Token.ZERO_OR_MORE_CHARACTERS);
+        }
+        zero_plus = ZERO_OR_MORE.asChar();
+    }
+
+ /**
+ * Match the supplied path against the pattern. Matching is applied 'per-directory' as described
+ * {@link org.dataconservancy.bagit.rules.ExpressionMatcher above}. This is the main entry point into the pattern
+ * matching logic.
+ *
+ * @param pattern the pattern meant to match a path
+ * @param path the path to match against the pattern
+ * @return true if the pattern matches
+ */
+    boolean match(Expression pattern, Expression path) {
+        // A path may only consist of literals and path separators; anything else is rejected.
+        if (!isPath(path.getTokens())) {
+            return false; // probably should be an IAE
+        }
+
+        final int patternDepth = pattern.depth();
+        final int pathDepth = path.depth();
+
+        // A pattern that is deeper than the path can never match it, so bail out early.
+        if (patternDepth > pathDepth) {
+            return false; // probably should be an IAE
+        }
+
+        if (patternDepth == pathDepth) {
+            // Segments line up one-to-one: every pattern segment must match its counterpart in the path.
+            boolean allMatched = true;
+            for (int depth = 0; depth <= patternDepth; depth++) {
+                if (!match(pattern.getPathSegment(depth), path.getPathSegment(depth))) {
+                    allMatched = false;
+                }
+            }
+            return allMatched;
+        }
+
+        // The pattern is shallower than the path: walk both from depth zero, anchoring on the first
+        // pattern segment that contains a literal (or -1 when there is none).
+        return matchPathSegment(pattern, path, 0, 0, nextLiteral(pattern, 0));
+    }
+
+ /**
+ * Attempt to match all of the path segments in {@code path} against {@code pattern}, starting from
+ * {@code pathDepth} and {@code patternDepth}. The {@code nextLiteral} parameter contains the depth of the next
+ * path segment in {@code pattern} containing a literal (or -1 if there isn't any).
+ *
+ * @param pattern the expression containing a matching tokens (i.e. pattern semantics)
+ * @param path the expression containing only literals or path separators (i.e. path semantics)
+ * @param patternDepth the depth to begin matching the pattern
+ * @param pathDepth the depth to begin matching the path
+ * @param nextLiteral the depth of the next pattern segment that contains a literal, or -1 if it doesn't exist
+ * @return true if all of the segments (starting from pathDepth) in the path can be matched in the pattern (starting
+ * from patternDepth)
+ */
+    private boolean matchPathSegment(Expression pattern, Expression path, int patternDepth, int pathDepth,
+                                     int nextLiteral) {
+
+        if (nextLiteral == -1) {
+            // No pattern segment with a literal remains, so every remaining path segment is checked
+            // against the pattern segment at patternDepth.
+            boolean restMatched = true;
+            for (int depth = pathDepth; depth <= path.depth(); depth++) {
+                if (!match(pattern.getPathSegment(patternDepth), path.getPathSegment(depth))) {
+                    restMatched = false;
+                }
+            }
+            return restMatched;
+        }
+
+        // Locate the first path segment, at or after pathDepth, that matches the literal-bearing pattern segment.
+        final int rightAnchor = nextMatch(path, pathDepth, pattern.getPathSegment(nextLiteral));
+        if (rightAnchor == -1) {
+            // the literal-bearing segment matches nowhere, so the whole path cannot match
+            return false;
+        }
+
+        // Each path segment before the anchor has to match the current pattern segment.
+        boolean gapMatched = true;
+        for (int depth = pathDepth; depth < rightAnchor; depth++) {
+            if (!match(pattern.getPathSegment(patternDepth), path.getPathSegment(depth))) {
+                gapMatched = false;
+            }
+        }
+        if (!gapMatched) {
+            return false;
+        }
+
+        // Matched up to the anchor: resume from the anchor in the path, the next segment in the pattern,
+        // and the next literal-bearing pattern segment after the one just used.
+        return matchPathSegment(pattern, path, patternDepth + 1, rightAnchor,
+                nextLiteral(pattern, nextLiteral + 1));
+    }
+
+ /**
+ * Search the supplied pattern starting at {@code depth} for path segments that contain literals. Useful for
+ * finding the depth of path segment 'Foo??.java' in the pattern expression '**&#47;Foo??.java'.
+ *
+ * @param pattern an expression with pattern semantics
+ * @param depth the depth to begin searching from
+ * @return the index of the next path segment (i.e. depth) that contains literals, or -1 if not found
+ */
+    int nextLiteral(Expression pattern, int depth) {
+        final int deepest = pattern.depth();
+        // Scan forward from the requested depth; when depth already lies past the deepest segment the
+        // loop body never runs and the "not found" marker is returned.
+        int candidate = depth;
+        while (candidate <= deepest) {
+            if (containsLiterals(pattern.getPathSegment(candidate))) {
+                return candidate;
+            }
+            candidate++;
+        }
+        return -1;
+    }
+
+ /**
+ * Attempts to match every path segment starting from {@code path.getPathSegment(depth)} against the
+ * {@code pattern}. The {@code path} is an {@code Expression} with path semantics (i.e. only containing literals
+ * and path separators). Each path segment (starting from {@code depth}) is matched against {@code pattern}.
+ *
+ * @param path an Expression with path semantics
+ * @param depth the depth of the expression to begin matching from
+ * @param pattern the pattern each path segment of {@code path} is matched against.
+ * @return the index of the first path segment (i.e. depth) that matched {@code pattern}, or -1 if no match
+ */
+    int nextMatch(Expression path, int depth, List pattern) {
+        final int deepest = path.depth();
+        int candidate = depth;
+        // advance through the path segments until one satisfies the pattern segment
+        while (candidate <= deepest && !match(pattern, path.getPathSegment(candidate))) {
+            candidate++;
+        }
+        return candidate <= deepest ? candidate : -1;
+    }
+
+ /**
+ * Expected input are two Lists of BoundTokens. Each List is expected to be a path segment; that is, a List
+ * will contain all BoundTokens between two consecutive path separators, not including the separators. Therefore
+ * the path segment will not ever contain a path separator ('/'), nor should it contain a directory match
+ * token ('**').
+ *
+ * Essentially this method is evaluating a pattern that may contain literals, '*', and '?' against a
+ * string of literals.
+ *
+ *
+ * @param patternPathSegment the pattern
+ * @param pathPathSegment the string (i.e. path) to match the pattern against
+ * @return true if the pattern matches the path
+ */
+    boolean match(List patternPathSegment, List pathPathSegment) {
+
+        // Cheap outcomes are settled first, in this order:
+        //   1. isZeroOrMore: the pattern segment is just '*', so whatever the path segment holds is accepted
+        //   2. isExactlyOne: the pattern segment is the '?' case and the path segment holds a single token
+        //   3. allLiterals: the pattern segment has no matching tokens and must equal the path segment
+
+        if (isZeroOrMore(patternPathSegment)
+                || (pathPathSegment.size() == 1 && isExactlyOne(patternPathSegment))) {
+            return true;
+        }
+
+        if (allLiterals(patternPathSegment)) {
+            return tokenEquals(patternPathSegment, pathPathSegment);
+        }
+
+        // What is left is a multiple-token pattern segment mixing literals with at least one of
+        // '*' or '?'. Both segments are flattened to character sequences and handed to the
+        // recursive, character-level matcher.
+
+        final CharSequence patternChars = toCharSeq(patternPathSegment);
+        final CharSequence pathChars = toCharSeq(pathPathSegment);
+
+        // Scanning of the pattern begins at its first character ...
+        final int patternStart = 0;
+        final int firstToken = findNextToken(patternChars, patternStart);
+        final int firstLiteral = findNextLiteral(patternChars, patternStart);
+
+        // ... and so do the matched-so-far index and the left anchor of the path.
+        final int pathStart = 0;
+        final int initialLeftAnchor = 0;
+
+        return match(patternChars, pathChars, pathStart, firstToken, firstLiteral, initialLeftAnchor);
+    }
+
+ /**
+ * A recursive method for matching a {@code path} against a {@code pattern}. The method terminates when there are
+ * no more literals or tokens to be matched, or as soon as it determines a match isn't possible and returns early.
+ *
+ * Developers, when reading this implementation, keep in mind that anchors are always indexes into the {@code path},
+ * while {@code tokenIndex} and {@code literalIndex} are always indexes into {@code pattern}. The first major
+ * decision made is whether the method is attempting to match a token (e.g. '?' in "Foo??.java") or match a literal
+ * (e.g. "Foo", ".java" in "Foo??.java").
+ *
+ *
+ * When matching a token, the first decision to make is whether you are going to match forward from the current
+ * token, or work backward from the end of the pattern. When matching a literal, the objective is to determine the
+ * anchors of the literal in the path and attempt to match it against the pattern.
+ *
+ *
+ * @param pattern the pattern to match against
+ * @param path the path to match
+ * @param fPathIndex the index into the {@code path} that has matched
+ * @param tokenIndex the index into {@code pattern} of the next token to be matched
+ * @param literalIndex the index into the {@code pattern} of the next literal to be matched
+ * @param leftAnchor not used TODO remove
+ * @return true if {@code path} matches {@code pattern}
+ */
+ private boolean match(CharSequence pattern, CharSequence path, int fPathIndex, int tokenIndex, int literalIndex, int leftAnchor) {
+ // Index description:
+ // - fPathIndex, left and right anchors are always indexes in the path
+ // - token and literal are always indexes in the pattern.
+ // - Integer.MAX_VALUE in tokenIndex means "no further token" (that is what findNextToken returns).
+
+ int rightAnchor = Integer.MIN_VALUE;
+
+ if (tokenIndex == Integer.MAX_VALUE && literalIndex == Integer.MAX_VALUE) {
+ // both indexes are at Integer.MAX_VALUE, i.e. no token and (presumably) no literal remains to be matched
+ return true;
+ }
+
+ if (tokenIndex < literalIndex) {
+
+ // We are matching a token (because tokenIndex < literalIndex)
+ //
+ // If we are matching the last token in the pattern, we work backward in the path.
+ // If we are matching a token, and there are still more tokens left, we work forward in the path.
+ //
+ // - Find the left and right anchors in the path.
+ // - Find right anchor
+ // - Find the next literal in the pattern (using the literalIndex, and the [end of string|next token index])
+ // - Match that literal in the path (from offset fPathIndex)
+ // - Set the right anchor at the start of the literal.
+ // - Find left anchor
+ // - Equal to the forward path index (fPathIndex)
+ //
+ // - If the token is a '*', we match.
+ // - If the token is a '?', and rightAnchor - leftAnchor == 1, we match.
+ //
+ // - If we match:
+ // - set the fPathIndex to the rightAnchor (because fPathIndex keeps track of what we've matched in the path)
+ // - set the next token index (or Integer.MAX_VALUE if the pattern is exhausted, or out of tokens)
+ // - leave literalIndex alone, because we didn't match a literal this go-around, we matched a token.
+
+ // Find the right anchor.
+ int nextTokenIndex = findNextToken(pattern, tokenIndex + 1);
+
+ leftAnchor = fPathIndex;
+
+ CharSequence literal = null;
+ // NOTE(review): both operands of the && below are identical; the second may have been meant for literalIndex - confirm
+ if (nextTokenIndex != Integer.MAX_VALUE && nextTokenIndex != Integer.MAX_VALUE) {
+ // we are not at the last token, work forward
+ // in the case of consecutive tokens (e.g. "??"), the literalIndex will be greater than the nextTokenIndex
+ int literalLen = Math.max(nextTokenIndex - literalIndex, 1);
+ literal = (literalLen <= 0) ? "" : pattern.subSequence(literalIndex, literalIndex + literalLen);
+
+ if ((rightAnchor = matchNextLiteral(path, fPathIndex, literal)) == Integer.MIN_VALUE) {
+ // we didn't match the literal in the path, so we won't match
+ return false;
+ }
+ } else {
+ // we are at the last token, work backward from the end of the pattern by matching the literal at
+ // the end of the pattern, then checking the remaining characters in the path with the pattern token
+
+ // the special case is if the token we are matching is the last character of the pattern, in which
+ // case there won't be a literal to match. in this case, the right anchor will be set to
+ // the end of the path.
+
+ if (tokenIndex == pattern.length() - 1) {
+ rightAnchor = path.length();
+ } else {
+ literal = pattern.subSequence(tokenIndex + 1, pattern.length());
+
+ // if we don't match the literal in the path, then we don't match
+ if ((rightAnchor = matchNextLiteral(path, fPathIndex, literal)) == Integer.MIN_VALUE) {
+ return false;
+ }
+ }
+ }
+
+ // - If the token is a '*', we match.
+ // - If the token is a '?', and rightAnchor - leftAnchor == 1, we match.
+ // NOTE(review): the inner check below compares nextTokenIndex (a pattern index) with rightAnchor (a path index) - confirm
+ // if the next token is inside of the right anchor, we have multiple tokens (e.g. '??') in a row.
+ if (pattern.charAt(tokenIndex) == exactly_one) {
+ if (nextTokenIndex < rightAnchor) {
+ if (pattern.subSequence(tokenIndex, literalIndex).chars().allMatch(c -> ((char) c) == '?')) {
+ fPathIndex = ++leftAnchor;
+ tokenIndex = findNextToken(pattern, tokenIndex + 1);
+ return match(pattern, path, fPathIndex, tokenIndex, literalIndex, leftAnchor);
+ } else {
+ return false;
+ }
+ }
+
+ if (rightAnchor - leftAnchor == 1) {
+ fPathIndex = rightAnchor;
+ tokenIndex = findNextToken(pattern, tokenIndex + 1);
+ return match(pattern, path, fPathIndex, tokenIndex, literalIndex, leftAnchor);
+ } else {
+ return false;
+ }
+ }
+
+ if (pattern.charAt(tokenIndex) == zero_plus) { // the token is the zero-or-more token
+ // (every exactly-one case, including runs of consecutive tokens, has already returned in the branch above)
+ // the zero-or-more token places no constraint on the characters between the anchors, so it matches as-is
+
+ // - If we match:
+ // - set the fPathIndex to the rightAnchor (because fPathIndex keeps track of what we've matched in the path)
+ // - set the next token index (or Integer.MAX_VALUE if the pattern is exhausted, or out of tokens)
+ // - leave literalIndex alone, because we didn't match a literal this go-around, we matched a token.
+
+ fPathIndex = rightAnchor;
+ tokenIndex = findNextToken(pattern, tokenIndex + 1);
+
+ return match(pattern, path, fPathIndex, tokenIndex, literalIndex, leftAnchor);
+
+ } else {
+ return false;
+ }
+
+ } else if (literalIndex < tokenIndex) {
+
+ // We are matching a literal (because literalIndex < tokenIndex)
+ CharSequence literalToMatch;
+ // NOTE(review): findNextToken reports "not found" as MAX_VALUE; if findNextLiteral does too, the MIN_VALUE branch never runs - confirm
+ if (literalIndex == Integer.MIN_VALUE) {
+ // we're out of literals, so we just have to match that last token
+ rightAnchor = path.length();
+ if (pattern.charAt(tokenIndex) == zero_plus ||
+ pattern.charAt(tokenIndex) == exactly_one && rightAnchor - leftAnchor == 1) {
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ leftAnchor = fPathIndex;
+
+ // if we can't find the right anchor, then we can't match.
+ if ((rightAnchor = findRightAnchor(pattern, path, fPathIndex, literalIndex)) == Integer.MAX_VALUE) {
+ return false;
+ }
+
+ literalToMatch = pattern.subSequence(literalIndex, Math.min(tokenIndex, pattern.length()));
+ }
+
+ // does the literal in the pattern match the literal between the anchors?
+ if (path.subSequence(leftAnchor, rightAnchor).equals(literalToMatch)) {
+ // NOTE(review): equals() on CharSequence is only dependable if toCharSeq yields Strings - confirm
+ // - If we match:
+ // - set the fPathIndex to the rightAnchor (because fPathIndex keeps track of what we've matched in the path)
+ // - leave the next token index alone, because we didn't match a token this go around, we matched a literal
+ // - set the literal index to the beginning of the next literal
+
+ fPathIndex = rightAnchor;
+ literalIndex = findNextLiteral(pattern, literalIndex + literalToMatch.length());
+
+ return match(pattern, path, fPathIndex, tokenIndex, literalIndex, leftAnchor);
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Attempt to find the next occurrence of a token in {@code pattern}, starting from {@code offset}.
+ *
+ * Remember that even though paths and patterns are both instances of {@link Expression}, the semantics of
+ * a 'path' are that it contains only literals and path separators, and differs from a 'pattern' which can contain
+ * matching tokens like '*' and '?'.
+ *
+ * A token is the {@code exactly_one} or {@code zero_plus} character; an {@code offset} outside the pattern yields {@code Integer.MAX_VALUE}.
+ * @param pattern the pattern to search through
+ * @param offset the offset into pattern to start searching from
+ * @return the offset in the pattern with the next occurrence of a token, or {@code Integer.MAX_VALUE} if not
+ * found.
+ * @see #findNextLiteral(CharSequence, int)
+ */
+ int findNextToken(CharSequence pattern, int offset) {
+ if (offset < 0 || offset >= pattern.length() || pattern.length() == 0) {
+ return Integer.MAX_VALUE;
+ }
+
+ for (int i = offset; i < pattern.length(); i++) {
+ if (pattern.charAt(i) == exactly_one || pattern.charAt(i) == zero_plus) {
+ return i;
+ }
+ }
+
+ return Integer.MAX_VALUE;
+ }
+
+ /**
+ * Attempt to find the next occurrence of a literal in {@code pattern}, starting from {@code offset}.
+ *
+ * Remember that even though paths and patterns are both instances of {@link Expression}, the semantics of
+ * a 'path' are that it contains only literals and path separators, and differs from a 'pattern' which can contain
+ * matching tokens like '*' and '?'.
+ *
+ * A literal is any character other than {@code exactly_one} or {@code zero_plus}; an {@code offset} outside the pattern yields {@code Integer.MAX_VALUE}.
+ * @param pattern the pattern to search through
+ * @param offset the offset into pattern to start searching from
+ * @return the offset in the pattern with the next occurrence of a literal, or {@code Integer.MAX_VALUE} if not
+ * found.
+ * @see #findNextToken(CharSequence, int)
+ */
+ int findNextLiteral(CharSequence pattern, int offset) {
+ if (offset < 0 || offset >= pattern.length() || pattern.length() == 0) {
+ return Integer.MAX_VALUE;
+ }
+
+ for (int i = offset; i < pattern.length(); i++) {
+ if (pattern.charAt(i) != exactly_one && pattern.charAt(i) != zero_plus) {
+ return i;
+ }
+ }
+
+ return Integer.MAX_VALUE;
+ }
+
+ /**
+ * Attempts to match the literal in the path, from the supplied offset, and returns the offset where the
+ * literal occurs.
+ *
+ * @param path the path being searched for a literal string
+ * @param offset the offset in path to start searching from
+ * @param literal the literal string to find
+ * @return the offset of {@code literal} in {@code path}, or {@code Integer.MIN_VALUE} if not found or if {@code offset} is outside {@code path}
+ */
+ int matchNextLiteral(CharSequence path, int offset, CharSequence literal) {
+ if (offset < 0 || offset >= path.length() || path.length() == 0) {
+ return Integer.MIN_VALUE;
+ }
+
+ CharSequence sub = path.subSequence(offset, path.length());
+
+ int litIdx = 0;
+ int subIdx = 0;
+ while (litIdx < literal.length() && subIdx < sub.length()) {
+ if (literal.charAt(litIdx) == sub.charAt(subIdx)) {
+ // advance within the literal; litIdx counts how many consecutive literal characters have matched so far
+ litIdx++;
+ } else {
+ // reset litIdx to 0. NOTE(review): subIdx is not rewound and the current character is not re-tested, so an occurrence overlapping a failed partial match is missed (literal ab in path aab)
+ litIdx = 0;
+ }
+ ;
+
+ subIdx++; // always increment the substring index
+ }
+
+ // the whole literal was matched if litIdx reached the literal's length (an empty literal therefore matches at offset)
+ if (literal.length() - litIdx == 0) {
+ // subIdx now sits just past the matched literal within sub, so the literal begins at
+ // offset + ( subIdx - literal.length() )
+ return offset + (subIdx - literal.length());
+ }
+
+ return Integer.MIN_VALUE; // literal wasn't found.
+ }
+
+ /**
+ * Returns true if every token in the path segment is a {@link Token#ZERO_OR_MORE_CHARACTERS}.
+ *
+ * @param pathSegment the path segment containing arbitrary tokens
+ * @return true if every token in the path segment is a {@code ZERO_OR_MORE_CHARACTERS} token.
+ */
+ private boolean isZeroOrMore(List pathSegment) {
+ return pathSegment.size() == 1 && pathSegment.get(0).token == Token.ZERO_OR_MORE_CHARACTERS;
+ }
+
+ /**
+ * Returns true if the path segment contains a single token, and the token is a {@link Token#EXACTLY_ONE_CHARACTER}.
+ *
+ * @param pathSegment the path segment containing arbitrary tokens
+ * @return true if the single token in the path segment is a {@code EXACTLY_ONE_CHARACTER} token.
+ */
+ private boolean isExactlyOne(List pathSegment) {
+ return pathSegment.size() == 1 && pathSegment.get(0).token == Token.EXACTLY_ONE_CHARACTER;
+ }
+
+ /**
+ * Answers a {@code CharSequence} that contains the value of each token, in the same order, as supplied by
+ * {@code tokens}.
+ *
+ * @param tokens a List of arbitrary tokens
+ * @return the sequence of token values
+ */
+ private CharSequence toCharSeq(List tokens) {
+ return tokens.stream().collect(StringBuilder::new, StringBuilder::append, StringBuilder::append);
+ }
+
+ /**
+ * Returns true if each list is equal in size, and contains
+ * {@link org.dataconservancy.bagit.rules.BoundToken#equals(Object) equal} tokens, in the same order.
+ *
+ * @param one the first list of arbitrary tokens
+ * @param two the second list of arbitrary tokens
+ * @return true if the lists contain equal content
+ */
+ private boolean tokenEquals(List one, List two) {
+ if (one.size() != two.size()) {
+ return false;
+ }
+
+ for (int i = 0; i < one.size(); i++) {
+ if (!one.get(i).equals(two.get(i))) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ /**
+ * Locates, in {@code path}, the pattern literal that starts at {@code patternOff} and returns the path offset just past it.
+ * @param pattern the pattern supplying the literal; the literal runs from {@code patternOff} to the next token or the end of the pattern
+ * @param path the path that is searched for the literal
+ * @param pathOff the offset into {@code path} to start searching from
+ * @param patternOff the offset into {@code pattern} at which the literal begins
+ * @return the offset in {@code path} immediately after the matched literal, or {@code Integer.MAX_VALUE} if the literal is empty or is not found
+ */
+ int findRightAnchor(CharSequence pattern, CharSequence path, int pathOff, int patternOff) {
+ // - Find the next token in the pattern from patternOff
+ // - The token index (or end of string) is the end of the literal.
+
+ // Assumes that patternOff isn't positioned at a token, and that patternOff + 1 isn't a token either
+ int nextToken = Math.min(findNextToken(pattern, patternOff), pattern.length());
+
+ CharSequence literalToMatch = pattern.subSequence(patternOff, nextToken);
+
+ // - Match that literal in the path (from offset fPathIndex)
+ // - If the literalToMatch isn't found in 'path', or if the matched literal is an empty string, return Integer.MAX_VALUE
+ if (literalToMatch.length() == 0) {
+ return Integer.MAX_VALUE;
+ }
+ int tmpIdx = matchNextLiteral(path, pathOff, literalToMatch);
+ if (tmpIdx == Integer.MIN_VALUE) {
+ return Integer.MAX_VALUE;
+ }
+
+ // - Set the right anchor at the end of the literal.
+ return tmpIdx + literalToMatch.length();
+ }
+
+ /**
+ * Returns true if any of the tokens in {@code pathTokens} represent a literal.
+ *
+ * @param pathTokens the tokens to check, normally these are the tokens from a single path segment.
+ * @return true if at least one of the supplied tokens is a literal; false for an empty list.
+ */
+ boolean containsLiterals(List pathTokens) {
+ return (pathTokens.stream().filter(bt -> bt.token == Token.LITERAL).count() > 0);
+ }
+
+ /**
+ * Returns true if all of the tokens in {@code pathTokens} are literal.
+ *
+ * @param pathTokens the tokens to check; normally these are tokens from a single path segment.
+ * @return true if every supplied token is a literal; an empty list also returns true
+ */
+ boolean allLiterals(List pathTokens) {
+ return (pathTokens.stream().filter(bt -> bt.token == Token.LITERAL).count() == pathTokens.size());
+ }
+
+ /**
+ * Returns true if all of the tokens in {@code pathTokens} represent a literal or
+ * path separator. This method is used to determine if a List of tokens represents a path or a pattern.
+ *
+ * For example, tokens for {@code /foo/bar/baz.txt} would return {@code true}; tokens for {@code Foo??.java} would
+ * return {@code false}, because the '?' characters are not a literal or path separator token.
+ *
+ *
+ * @param pathTokens the tokens to check, normally these are tokens for a path or pattern
+ * @return true if the list of tokens contains only literals or separators
+ */
+ boolean isPath(List pathTokens) {
+ return (pathTokens.stream().filter(
+ bt -> bt.token == Token.LITERAL || bt.token == Token.PATH_SEPARATOR).count() == pathTokens.size());
+ }
+
+}
diff --git a/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionMatcherTest.java b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionMatcherTest.java
new file mode 100644
index 00000000..fe3a0fd7
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionMatcherTest.java
@@ -0,0 +1,393 @@
+/*
+ *
+ * * Copyright 2015 Johns Hopkins University
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at
+ * *
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing, software
+ * * distributed under the License is distributed on an "AS IS" BASIS,
+ * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * * See the License for the specific language governing permissions and
+ * * limitations under the License.
+ *
+ */
+
+package org.dataconservancy.bagit.rules;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class ExpressionMatcherTest {
+
+ private ExpressionMatcher underTest;
+
+ @Before
+ public void setUp() throws Exception {
+ underTest = new ExpressionMatcher();
+ }
+
+ @Test
+ public void testMatchExpressionWip3() throws Exception {
+ Expression pattern = new Expression("**/Foo??.java");
+ Expression path = new Expression("src/test/java/FooIT.java");
+ Expression nonMatchingPath = new Expression("src/test/java/FooI.java");
+
+ // sanity
+ assertTrue(underTest.match(path, path));
+
+ assertTrue(underTest.match(pattern, path));
+
+ assertFalse(underTest.match(pattern, nonMatchingPath));
+ }
+
+
+ @Test
+ public void testMatchExpressionWip2() throws Exception {
+ Expression pattern = new Expression("**/*IT.java");
+ Expression path = new Expression("src/test/java/FooIT.java");
+ Expression nonMatchingPath = new Expression("src/test/java/Bar.java");
+
+// // sanity - insure that "*IT.java" will match "FooIT.java"
+// List tokenPattern = new ArrayList<>();
+// tokenPattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
+// tokenPattern.addAll(BoundTokensTestUtil.literalsForString("IT.java"));
+//
+// List tokenPath = new ArrayList<>();
+// tokenPath.addAll(BoundTokensTestUtil.literalsForString("FooIT.java"));
+// assertTrue(underTest.match(tokenPattern, tokenPath));
+
+ // sanity
+ assertTrue(underTest.match(path, path));
+
+ assertTrue(underTest.match(pattern, path));
+
+ assertFalse(underTest.match(nonMatchingPath, path));
+ }
+
+ @Test
+ public void testMatchExpressionWip() throws Exception {
+ Expression pattern = new Expression("**/FooIT.java");
+ Expression path = new Expression("src/test/java/FooIT.java");
+ Expression nonMatchingPath = new Expression("src/test/java/BarIT.java");
+
+ // sanity
+ assertTrue(underTest.match(path, path));
+
+ assertTrue(underTest.match(pattern, path));
+
+ assertFalse(underTest.match(pattern, nonMatchingPath));
+ }
+
+ @Test
+ public void testMatchWithOnlyLiterals() throws Exception {
+ List pattern = BoundTokensTestUtil.literalsForString("bar");
+ List path = BoundTokensTestUtil.literalsForString("bar");
+
+ // sanity
+ assertTrue(underTest.match(pattern, path));
+
+ assertFalse(underTest.match(BoundTokensTestUtil.literalsForString("foo"), path));
+ }
+
+ @Test
+ public void testNoMatchBeginningZeroPlus() throws Exception {
+ List pattern = new ArrayList<>();
+ pattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
+ pattern.addAll(BoundTokensTestUtil.literalsForString("IT.java"));
+
+ List path = BoundTokensTestUtil.literalsForString("src");
+
+ assertFalse(underTest.match(pattern, path));
+ }
+
+ @Test
+ public void testLiteralsWithExactlyOne() throws Exception {
+ // pattern: "?tart?IT.jav*"
+
+ List pattern = new ArrayList<>();
+ pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
+ pattern.addAll(BoundTokensTestUtil.literalsForString("tart"));
+ pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
+ pattern.addAll(BoundTokensTestUtil.literalsForString("IT.jav"));
+ pattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
+
+ // path: startXIT.java (sanity, should pass)
+ List path = BoundTokensTestUtil.literalsForString("startXIT.java");
+ assertTrue(underTest.match(pattern, path));
+
+ // path: startXIT.jav (sanity, should pass)
+ path = BoundTokensTestUtil.literalsForString("startXIT.jav");
+ assertTrue(underTest.match(pattern, path));
+
+ // path: strtXIT.java (first literal 'tart' doesn't match)
+ path = BoundTokensTestUtil.literalsForString("strtXIT.java");
+ assertFalse(underTest.match(pattern, path));
+
+ // path: startXITT.java (middle literal 'IT.jav' doesn't match)
+ path = BoundTokensTestUtil.literalsForString("startXITT.java");
+ assertFalse(underTest.match(pattern, path));
+ }
+
+ @Test
+ public void testLiteralsWithZeroPlus() throws Exception {
+ // pattern: "*tart*IT.jav*"
+
+ List pattern = new ArrayList<>();
+ pattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
+ pattern.addAll(BoundTokensTestUtil.literalsForString("tart"));
+ pattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
+ pattern.addAll(BoundTokensTestUtil.literalsForString("IT.jav"));
+ pattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
+
+ // path: startXIT.java (sanity, should pass)
+ List path = BoundTokensTestUtil.literalsForString("startXIT.java");
+ assertTrue(underTest.match(pattern, path));
+
+ // path: startXIT.jav (sanity, should pass)
+ path = BoundTokensTestUtil.literalsForString("startXIT.jav");
+ assertTrue(underTest.match(pattern, path));
+
+ // path: tartXIT.java (sanity, should pass)
+ path = BoundTokensTestUtil.literalsForString("tartXIT.java");
+ assertTrue(underTest.match(pattern, path));
+
+ // path: tartXIT.jav (sanity, should pass)
+ path = BoundTokensTestUtil.literalsForString("tartXIT.jav");
+ assertTrue(underTest.match(pattern, path));
+
+ // path: strtXIT.java (first literal 'tart' doesn't match)
+ path = BoundTokensTestUtil.literalsForString("strtXIT.java");
+ assertFalse(underTest.match(pattern, path));
+
+ // path: startXITT.java (middle literal 'IT.jav' doesn't match)
+ path = BoundTokensTestUtil.literalsForString("startXITT.java");
+ assertFalse(underTest.match(pattern, path));
+ }
+
+
+ @Test
+ public void testMultipleSingleCharacterTokens() throws Exception {
+ // pattern: "?tart?IT.jav?"
+
+ List pattern = new ArrayList<>();
+ pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
+ pattern.addAll(BoundTokensTestUtil.literalsForString("tart"));
+ pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
+ pattern.addAll(BoundTokensTestUtil.literalsForString("IT.jav"));
+ pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
+
+ // path: startXIT.java (sanity, should pass)
+ List path = BoundTokensTestUtil.literalsForString("startXIT.java");
+ assertTrue(underTest.match(pattern, path));
+
+ // path: FootartXIT.java (too many characters for first token)
+ path = BoundTokensTestUtil.literalsForString("FootartXIT.java");
+ assertFalse(underTest.match(pattern, path));
+
+ // path: tartXIT.java (no characters for first token)
+ path = BoundTokensTestUtil.literalsForString("tartXIT.java");
+ assertFalse(underTest.match(pattern, path));
+
+ // path: StartItUpIT.java (too many characters for middle token)
+ path = BoundTokensTestUtil.literalsForString("StartItUpIT.java");
+ assertFalse(underTest.match(pattern, path));
+
+ // path: StartIT.java (no characters for middle token)
+ path = BoundTokensTestUtil.literalsForString("StartIT.java");
+ assertFalse(underTest.match(pattern, path));
+
+ // path: StartXIT.jav (no characters for last token)
+ path = BoundTokensTestUtil.literalsForString("StartXIT.jav");
+ assertFalse(underTest.match(pattern, path));
+
+ // path: StartXIT.javaa (too many characters for last token)
+ path = BoundTokensTestUtil.literalsForString("StartXIT.javaa");
+ assertFalse(underTest.match(pattern, path));
+ }
+
+ @Test
+ public void testMatchLiteralFirstExactlyOneNoMatch() throws Exception {
+ // pattern: "Start?IT.java"
+
+ List pattern = new ArrayList<>();
+ pattern.addAll(BoundTokensTestUtil.literalsForString("Start"));
+ pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
+ pattern.addAll(BoundTokensTestUtil.literalsForString("IT.java"));
+
+ // path: startXIT.java (sanity, should pass)
+ List path = BoundTokensTestUtil.literalsForString("StartXIT.java");
+ assertTrue(underTest.match(pattern, path));
+
+ // path: StartFooIT.java (won't match)
+ path = BoundTokensTestUtil.literalsForString("StartFooIT.java");
+
+ assertFalse(underTest.match(pattern, path));
+ }
+
+ @Test
+ public void testMatchLiteralFirstExactlyOne() throws Exception {
+ // pattern: "Start?IT.java"
+
+ List pattern = new ArrayList<>();
+ pattern.addAll(BoundTokensTestUtil.literalsForString("Start"));
+ pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
+ pattern.addAll(BoundTokensTestUtil.literalsForString("IT.java"));
+
+ // path: StartXIT.java
+ List path = BoundTokensTestUtil.literalsForString("StartXIT.java");
+
+ assertTrue(underTest.match(pattern, path));
+ }
+
+ @Test
+ public void testMatchLiteralFirstZeroPlus() throws Exception {
+ // pattern: "Start*IT.java"
+
+ List pattern = new ArrayList<>();
+ pattern.addAll(BoundTokensTestUtil.literalsForString("Start"));
+ pattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
+ pattern.addAll(BoundTokensTestUtil.literalsForString("IT.java"));
+
+ // path: StartCarIT.java
+ List path = BoundTokensTestUtil.literalsForString("StartCarIT.java");
+
+ assertTrue(underTest.match(pattern, path));
+ }
+
+ @Test
+ public void testMatchTokenFirst() throws Exception {
+ // pattern: "*File*IT.java"
+
+ List pattern = new ArrayList<>();
+ pattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
+ pattern.addAll(BoundTokensTestUtil.literalsForString("File"));
+ pattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
+ pattern.addAll(BoundTokensTestUtil.literalsForString("IT.java"));
+
+ // path: UnixFileSmallIT.java
+ List path = BoundTokensTestUtil.literalsForString("UnixFileSmallIT.java");
+
+ assertTrue(underTest.match(pattern, path));
+ }
+
+ @Test
+ public void testMatchConsecutiveMatchTokens() throws Exception {
+ // pattern: "Foo??.java"
+ List pattern = new ArrayList<>();
+ pattern.addAll(BoundTokensTestUtil.literalsForString("Foo"));
+ pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
+ pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
+ pattern.addAll(BoundTokensTestUtil.literalsForString(".java"));
+
+ // path: FooIT.java
+ List path = BoundTokensTestUtil.literalsForString("FooIT.java");
+
+ assertTrue(underTest.match(pattern, path));
+ }
+
+ @Test
+ public void testFindNextToken() throws Exception {
+ assertEquals(19, underTest.findNextToken("src/test/resources/*IT.java", 0));
+ assertEquals(0, underTest.findNextToken("*File*IT.java", 0));
+ assertEquals(5, underTest.findNextToken("*File*IT.java", 1));
+ }
+
+ @Test
+ public void testFindNextLiteral() throws Exception {
+ assertEquals(0, underTest.findNextLiteral("src/test/resources/*IT.java", 0));
+ assertEquals(1, underTest.findNextLiteral("*File*IT.java", 0));
+ assertEquals(2, underTest.findNextLiteral("*File*IT.java", 2));
+ assertEquals(5, underTest.findNextLiteral("Foo??.java", 3));
+ }
+
+ @Test
+ public void testFindNextLiteralString() throws Exception {
+ assertEquals(1, underTest.matchNextLiteral("*File*IT.java", 0, "File"));
+ assertEquals(6, underTest.matchNextLiteral("*File*IT.java", 0, "IT"));
+ assertEquals(6, underTest.matchNextLiteral("*File*IT.java", 0, "IT.java"));
+ assertEquals(6, underTest.matchNextLiteral("*File*IT.java", 4, "IT.java"));
+ assertEquals(Integer.MIN_VALUE, underTest.matchNextLiteral("*FileIT.java", 0, "doodle"));
+ }
+
+ @Test
+ public void testFindRightAnchorFromBeginning() throws Exception {
+ String pattern = "File*IT.java";
+ String path = "FileUnixIT.java";
+
+ assertEquals("File".length(), underTest.findRightAnchor(pattern, path, 0,0 ));
+ }
+
+ @Test
+ public void testFindRightAnchorFromMiddle() throws Exception {
+ String pattern = "File*IT.java";
+ String path = "FileUnixIT.java";
+
+ // find the right anchor after we've matched pattern "File*" to path "FileUnix"
+ assertEquals(path.length(), underTest.findRightAnchor(pattern, path, "FileUnix".length(), "File*".length()));
+ }
+
+ @Test
+ public void testFindRightAnchorMultipleTokens() throws Exception {
+ String pattern = "Foo*Bar*Baz";
+ String path = "FooXBarYBaz";
+
+ assertEquals(3, underTest.findRightAnchor(pattern, path, 0, 0));
+ assertEquals(7, underTest.findRightAnchor(pattern, path, "FooX".length(), "Foo*".length()));
+ assertEquals(11, underTest.findRightAnchor(pattern, path, "FooXBarY".length(), "Foo*Bar*".length()));
+ }
+
+ @Test
+ public void testFindRightAnchorFoo() throws Exception {
+ String pattern = "Foo??.java";
+ String path = "src";
+
+ // behavior when the path is not in the pattern
+ assertEquals(Integer.MAX_VALUE, underTest.findRightAnchor(pattern, path, 0, 0));
+ }
+
+ @Test
+ public void testFindRightAnchorFooIT() throws Exception {
+ String pattern = "Foo??.java";
+ String path = "FooIT.java";
+
+ // behavior when the pattern offset is positioned at a token
+ assertEquals(Integer.MAX_VALUE, underTest.findRightAnchor(pattern, path, 0, 3));
+ }
+
+ @Test
+ public void testRightAnchorBar() throws Exception {
+ String pattern = "*File*IT.java";
+ String path = "UnixFileSmallIT.java";
+
+ // behavior when the path offset is already positioned at the right anchor
+ assertEquals(8, underTest.findRightAnchor(pattern, path, 4, 1));
+ }
+
+ void assertListsEqual(List expected, List actual) {
+ assertExpectedCount(expected.size(), actual);
+
+ for (int i = 0; i < expected.size(); i++) {
+ assertEquals("Expected path segments to be equal. Expected: '" + expected.get(i) +
+ "', Actual: '" + actual.get(i) + "'", expected.get(i), actual.get(i));
+ }
+ }
+
+ void assertExpectedCount(int expectedCount, List actual) {
+ assertEquals("Expected List to contain " + expectedCount + " elements. Contained " + actual.size() + ": " +
+ actual.stream().map(v -> "'" + v + "'").collect(Collectors.joining(", ")),
+ expectedCount, actual.size());
+ }
+}
\ No newline at end of file
diff --git a/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionTest.java b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionTest.java
new file mode 100644
index 00000000..a65b5e8f
--- /dev/null
+++ b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionTest.java
@@ -0,0 +1,101 @@
+/*
+ *
+ * * Copyright 2015 Johns Hopkins University
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at
+ * *
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing, software
+ * * distributed under the License is distributed on an "AS IS" BASIS,
+ * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * * See the License for the specific language governing permissions and
+ * * limitations under the License.
+ *
+ */
+
+package org.dataconservancy.bagit.rules;
+
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.PATH_SEP;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.PATH_SEP_L;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.ZERO_OR_MORE;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.assertTokenListEquals;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.literalsForString;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class ExpressionTest {
+
+ @Test
+ public void testSimple() throws Exception {
+ Expression exp = new Expression("src/test/resources/*IT.java");
+
+ // == src/test/resources/*IT.java
+ List expected = literalsForString("src");
+ expected.add(PATH_SEP);
+ expected.addAll(literalsForString("test"));
+ expected.add(PATH_SEP);
+ expected.addAll(literalsForString("resources"));
+ expected.add(PATH_SEP);
+ expected.add(ZERO_OR_MORE);
+ expected.addAll(literalsForString("IT.java"));
+
+ List actual = exp.getTokens();
+
+ assertTokenListEquals(expected, actual);
+
+ // depth is an index
+ assertEquals(3, exp.depth());
+
+ // get path segment by depth test
+ assertTokenListEquals(literalsForString("src"), exp.getPathSegment(0));
+ assertTokenListEquals(literalsForString("test"), exp.getPathSegment(1));
+ assertTokenListEquals(literalsForString("resources"), exp.getPathSegment(2));
+
+ expected = new ArrayList<>();
+ expected.add(ZERO_OR_MORE);
+ expected.addAll(literalsForString("IT.java"));
+ assertTokenListEquals(expected, exp.getPathSegment(3));
+
+ // out of bounds tests
+ assertTrue(exp.getPathSegment(exp.depth() + 5).isEmpty());
+ assertTrue(exp.getPathSegment(-1).isEmpty());
+ }
+
+ @Test
+ public void testWithEmptyRoot() throws Exception {
+ Expression exp = new Expression("/");
+ assertEquals(0, exp.depth());
+
+ // TODO decide what to do with the automatic addition of '**'
+ // for example, the Expression "/" is tokenized as "/**".
+ // any path ending in "/" is going to be tokenized with a trailing "**",
+ // and the user may not intend that behavior (for example if they are just wanting
+ // to express a path (not a pattern).
+ assertTokenListEquals(PATH_SEP_L, exp.getTokens());
+ assertTrue(exp.getPathSegment(0).isEmpty());
+ }
+
+ @Test
+ public void testWithSingleFileRoot() throws Exception {
+ Expression exp = new Expression("/foo.txt");
+ assertEquals(0, exp.depth());
+
+ // "/foo.txt"
+ List expected = new ArrayList<>();
+ expected.add(PATH_SEP);
+ expected.addAll(literalsForString("foo.txt"));
+
+ assertTokenListEquals(expected, exp.getTokens());
+ assertFalse(exp.getPathSegment(0).isEmpty());
+ assertEquals(literalsForString("foo.txt"), exp.getPathSegment(0));
+ }
+}
\ No newline at end of file
From 400dbe9f57d498d2d403f6d0c468fa30fd0e5f48 Mon Sep 17 00:00:00 2001
From: Elliot Metsger
Date: Tue, 15 Sep 2015 10:22:12 -0400
Subject: [PATCH 8/9] DC-2101: Fix an error with the string representation of
the Expression: mistakenly used BoundToken.toString() instead of
intentionally composing a string representation of the BoundToken.
---
.../org/dataconservancy/bagit/rules/ExpressionMatcher.java | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java
index 6aea666f..c3ff89aa 100644
--- a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java
+++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java
@@ -606,7 +606,8 @@ private boolean isExactlyOne(List pathSegment) {
* @return the sequence of token values
*/
private CharSequence toCharSeq(List tokens) {
- return tokens.stream().collect(StringBuilder::new, StringBuilder::append, StringBuilder::append);
+ return tokens.stream().collect(StringBuilder::new, (sb, bt) -> sb.append(bt.bound),
+ StringBuilder::append).toString();
}
/**
From 3fa605b39123fa74b63ee65f014bfca307922d9c Mon Sep 17 00:00:00 2001
From: Elliot Metsger
Date: Tue, 15 Sep 2015 16:25:49 -0400
Subject: [PATCH 9/9] DC-2101: new method
isDirectoryMatchToken(patternPathSegment) guards match(CharSequence,
CharSequence, int, int, int, int) from having to handle '**' tokens.
Includes test and Javadoc updates.
---
.../bagit/rules/ExpressionMatcher.java | 43 ++-
.../bagit/rules/ExpressionMatcherTest.java | 306 ++++++++++++------
2 files changed, 243 insertions(+), 106 deletions(-)
diff --git a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java
index c3ff89aa..4bb0f3ab 100644
--- a/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java
+++ b/dcs-bagit/dcs-bagit-support/src/main/java/org/dataconservancy/bagit/rules/ExpressionMatcher.java
@@ -71,6 +71,13 @@
*
* Note that methods on this class are package-private, and are not meant to be exposed publicly.
*
+ *
+ * Because ExpressionMatcher is package-private, it can be hard to tell what the entry points into the ExpressionMatcher
+ * class are, and this test class doesn't help you determine that. Clients of ExpressionMatcher should be calling
+ *
+ * - {@link org.dataconservancy.bagit.rules.ExpressionMatcher#match(Expression, Expression)}
+ *
+ *
*/
public class ExpressionMatcher {
@@ -185,7 +192,7 @@ private boolean matchPathSegment(Expression pattern, Expression path, int patter
return match;
}
- // match the pattern segment containing literals against every path segment until we get a match
+ // attempt to match every path segment against the pattern segment containing literals.
int rightAnchor = nextMatch(path, pathDepth, pattern.getPathSegment(nextLiteral));
// if we don't match ...
@@ -193,7 +200,7 @@ private boolean matchPathSegment(Expression pattern, Expression path, int patter
return false;
}
- // make sure that every path segment from the left anchor to the right anchor matches the current path expression
+ // make sure that every path segment from the left anchor to the right anchor matches the current pattern
boolean match = true;
for (int i = pathDepth; i < rightAnchor; i++) {
match &= match(pattern.getPathSegment(patternDepth), path.getPathSegment(i));
@@ -270,6 +277,7 @@ boolean match(List patternPathSegment, List pathPathSegm
// first, handle the short-circuit cases:
// patternPathSegment only contains '*' ; doesn't matter what pathPathSegment has, all tokens match
+ // patternPathSegment only contains '**' ; doesn't matter what pathPathSegment has, all tokens match
// patternPathSegment contains '?' and pathPathSegment only has a single token, the single token matches
// patternPathSegment is all literals ; see if the pathPathSegment equals
@@ -277,6 +285,11 @@ boolean match(List patternPathSegment, List pathPathSegm
return true;
}
+ if (isDirectoryMatchToken(patternPathSegment)) {
+ // this guards match(CharSequence, CharSequence, int, int, int, int) from having to handle '**' tokens.
+ return true;
+ }
+
if (pathPathSegment.size() == 1 && isExactlyOne(patternPathSegment)) {
return true;
}
@@ -304,6 +317,9 @@ boolean match(List patternPathSegment, List pathPathSegm
/**
* A recursive method for matching a {@code path} against a {@code pattern}. The method terminates when there are
* no more literals or tokens to be matched, or as soon as it determines a match isn't possible and returns early.
+ * N.B. this method cannot handle a directory matching token: '**'. It is expected that the caller
+ * has filtered these tokens out (see {@link #match(java.util.List, java.util.List)} and its
+ * {@link #isDirectoryMatchToken(java.util.List)} check).
*
* Developers, when reading this implementation, keep in mind that anchors are always indexes into the {@code path},
* while {@code tokenIndex} and {@code literalIndex} are always indexes into {@code pattern}. The first major
@@ -579,13 +595,28 @@ int matchNextLiteral(CharSequence path, int offset, CharSequence literal) {
}
/**
- * Returns true if every token in the path segment is a {@link Token#ZERO_OR_MORE_CHARACTERS}.
+ * Returns true if the path segment contains a single {@link Token#ZERO_OR_MORE_CHARACTERS} token.
*
* @param pathSegment the path segment containing arbitrary tokens
- * @return true if every token in the path segment is a {@code ZERO_OR_MORE_CHARACTERS} token.
+ * @return true if the only token in the path segment is a {@code ZERO_OR_MORE_CHARACTERS} token.
*/
- private boolean isZeroOrMore(List pathSegment) {
+ boolean isZeroOrMore(List pathSegment) {
return pathSegment.size() == 1 && pathSegment.get(0).token == Token.ZERO_OR_MORE_CHARACTERS;
+
+
+ }
+
+ /**
+ * Returns true if the path segment contains a single {@link Token#DIRECTORY} token, or exactly two
+ * {@link Token#ZERO_OR_MORE_CHARACTERS} tokens.
+ *
+ * @param pathSegment the path segment containing arbitrary tokens
+ * @return true if the path segment will match a directory
+ */
+ boolean isDirectoryMatchToken(List pathSegment) {
+ return (pathSegment.size() == 1 && pathSegment.get(0).token == Token.DIRECTORY) ||
+ (pathSegment.size() == 2 && pathSegment.get(0).token == Token.ZERO_OR_MORE_CHARACTERS
+ && pathSegment.get(1).token == Token.ZERO_OR_MORE_CHARACTERS);
}
/**
@@ -594,7 +625,7 @@ private boolean isZeroOrMore(List pathSegment) {
* @param pathSegment the path segment containing arbitrary tokens
* @return true if the single token in the path segment is a {@code EXACTLY_ONE_CHARACTER} token.
*/
- private boolean isExactlyOne(List pathSegment) {
+ boolean isExactlyOne(List pathSegment) {
return pathSegment.size() == 1 && pathSegment.get(0).token == Token.EXACTLY_ONE_CHARACTER;
}
diff --git a/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionMatcherTest.java b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionMatcherTest.java
index fe3a0fd7..152c31b9 100644
--- a/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionMatcherTest.java
+++ b/dcs-bagit/dcs-bagit-support/src/test/java/org/dataconservancy/bagit/rules/ExpressionMatcherTest.java
@@ -26,10 +26,30 @@
import java.util.List;
import java.util.stream.Collectors;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.DIR_L;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.EXACTLY_ONE;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.ZERO_OR_MORE;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.assertTokenListEquals;
+import static org.dataconservancy.bagit.rules.BoundTokensTestUtil.literalsForString;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
+/**
+ * Many, many tests against various methods in the ExpressionMatcher class.
+ * Most of the test methods in this class contain multiple assertions. Normally there will be one assertion for a
+ * sanity check - an assertion that should always be true. Often there will be multiple sanity checks.
+ *
+ * Because ExpressionMatcher is package-private, it can be hard to tell what the entry points into the ExpressionMatcher
+ * class are, and this test class doesn't help you determine that. Clients of ExpressionMatcher should be calling
+ * either:
+ *
+ * - {@link org.dataconservancy.bagit.rules.ExpressionMatcher#match(Expression, Expression)}
+ * - {@link org.dataconservancy.bagit.rules.ExpressionMatcher#match(java.util.List, java.util.List)}
+ *
+ * This test class covers not only these entry point methods, but other utility methods as well.
+ *
+ */
public class ExpressionMatcherTest {
private ExpressionMatcher underTest;
@@ -39,231 +59,263 @@ public void setUp() throws Exception {
underTest = new ExpressionMatcher();
}
+ /**
+ * Attempts a match using an Expression that starts with '**' and contains consecutive '?' matching tokens.
+ */
@Test
- public void testMatchExpressionWip3() throws Exception {
+ public void testMatchExpressionLeadingDirectoryAndConsecutiveExactlyOne() throws Exception {
+ // The pattern to match against: leading '**' and consecutive '??'
Expression pattern = new Expression("**/Foo??.java");
+
+ // This path should match the pattern: src/test/java matches '**' and FooIT.java matches 'Foo??.java'
Expression path = new Expression("src/test/java/FooIT.java");
+
+ // This path should not match (the consecutive token '??' will remain unmatched)
Expression nonMatchingPath = new Expression("src/test/java/FooI.java");
- // sanity
+ // sanity: a path should match itself.
assertTrue(underTest.match(path, path));
assertTrue(underTest.match(pattern, path));
-
assertFalse(underTest.match(pattern, nonMatchingPath));
}
-
+ /**
+ * Attempts a match using an Expression that starts with '**' and contains a '*' matching token.
+ */
@Test
- public void testMatchExpressionWip2() throws Exception {
+ public void testMatchExpressionLeadingDirectoryAndZeroPlus() throws Exception {
+ // The pattern to match against: leading '**' and a '*'
Expression pattern = new Expression("**/*IT.java");
+
+ // This path should match the pattern: src/test/java matches '**' and FooIT.java matches '*IT.java'
Expression path = new Expression("src/test/java/FooIT.java");
- Expression nonMatchingPath = new Expression("src/test/java/Bar.java");
-// // sanity - insure that "*IT.java" will match "FooIT.java"
-// List tokenPattern = new ArrayList<>();
-// tokenPattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
-// tokenPattern.addAll(BoundTokensTestUtil.literalsForString("IT.java"));
-//
-// List tokenPath = new ArrayList<>();
-// tokenPath.addAll(BoundTokensTestUtil.literalsForString("FooIT.java"));
-// assertTrue(underTest.match(tokenPattern, tokenPath));
+ // This path should not match (the path segment Bar.java will remain unmatched)
+ Expression nonMatchingPath = new Expression("src/test/java/Bar.java");
- // sanity
+ // sanity: a path should match itself
assertTrue(underTest.match(path, path));
assertTrue(underTest.match(pattern, path));
-
assertFalse(underTest.match(nonMatchingPath, path));
}
+ /**
+ * Attempts a match using an Expression that starts with a '**' matching token.
+ */
@Test
- public void testMatchExpressionWip() throws Exception {
+ public void testMatchExpressionLeadingDirectory() throws Exception {
+ // The pattern to match against: leading '**'
Expression pattern = new Expression("**/FooIT.java");
+
+ // This path should match the pattern: src/test/java matches '**', and FooIT.java matches the 'FooIT.java' literal
Expression path = new Expression("src/test/java/FooIT.java");
+
+ // This path should not match
Expression nonMatchingPath = new Expression("src/test/java/BarIT.java");
- // sanity
+ // sanity: a path should match itself
assertTrue(underTest.match(path, path));
assertTrue(underTest.match(pattern, path));
-
assertFalse(underTest.match(pattern, nonMatchingPath));
}
+ /**
+ * Attempts a match using equal lists of {@code List<BoundToken>} containing only literals (no matching tokens or
+ * path separators)
+ */
@Test
public void testMatchWithOnlyLiterals() throws Exception {
- List pattern = BoundTokensTestUtil.literalsForString("bar");
- List path = BoundTokensTestUtil.literalsForString("bar");
+ List pattern = literalsForString("bar");
+ List path = literalsForString("bar");
+ assertTokenListEquals(path, pattern);
- // sanity
- assertTrue(underTest.match(pattern, path));
+ // sanity: non-equal literal token lists should not match
+ assertFalse(underTest.match(literalsForString("foo"), path));
- assertFalse(underTest.match(BoundTokensTestUtil.literalsForString("foo"), path));
+ // test to make sure that equal literal token lists will match
+ assertTrue(underTest.match(pattern, path));
}
+ /**
+ * Verifies that a literal will not match a pattern that contains a leading zero-or-more ('*') match token
+ * followed by a non-matching literal. A complicated way of saying that we verify that the pattern "*IT.java"
+ * won't match "src".
+ */
@Test
public void testNoMatchBeginningZeroPlus() throws Exception {
+ // pattern: *IT.java
List pattern = new ArrayList<>();
- pattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
- pattern.addAll(BoundTokensTestUtil.literalsForString("IT.java"));
+ pattern.add(ZERO_OR_MORE);
+ pattern.addAll(literalsForString("IT.java"));
- List path = BoundTokensTestUtil.literalsForString("src");
+ // path: src
+ List path = literalsForString("src");
assertFalse(underTest.match(pattern, path));
}
+ /**
+ * Attempts to match a {@code List<BoundToken>} pattern leading with a '?' matching token, ending with a '?'
+ * matching token, and with a single '?' token in the middle.
+ */
@Test
public void testLiteralsWithExactlyOne() throws Exception {
- // pattern: "?tart?IT.jav*"
-
+ // pattern: "?tart?IT.jav?"
List pattern = new ArrayList<>();
- pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
- pattern.addAll(BoundTokensTestUtil.literalsForString("tart"));
- pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
- pattern.addAll(BoundTokensTestUtil.literalsForString("IT.jav"));
- pattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
+ pattern.add(EXACTLY_ONE);
+ pattern.addAll(literalsForString("tart"));
+ pattern.add(EXACTLY_ONE);
+ pattern.addAll(literalsForString("IT.jav"));
+ pattern.add(EXACTLY_ONE);
// path: startXIT.java (sanity, should pass)
- List path = BoundTokensTestUtil.literalsForString("startXIT.java");
- assertTrue(underTest.match(pattern, path));
-
- // path: startXIT.jav (sanity, should pass)
- path = BoundTokensTestUtil.literalsForString("startXIT.jav");
+ List path = literalsForString("startXIT.java");
assertTrue(underTest.match(pattern, path));
// path: strtXIT.java (first literal 'tart' doesn't match)
- path = BoundTokensTestUtil.literalsForString("strtXIT.java");
+ path = literalsForString("strtXIT.java");
assertFalse(underTest.match(pattern, path));
// path: startXITT.java (middle literal 'IT.jav' doesn't match)
- path = BoundTokensTestUtil.literalsForString("startXITT.java");
+ path = literalsForString("startXITT.java");
+ assertFalse(underTest.match(pattern, path));
+
+ // path: startXIT.jav (last token '?' doesn't match - missing character in path)
+ path = literalsForString("startXIT.jav");
+ assertFalse(underTest.match(pattern, path));
+
+ // path: startXIT.javaa (last literal 'a' in path doesn't match)
+ path = literalsForString("startXIT.javaa");
assertFalse(underTest.match(pattern, path));
}
+ /**
+ * Attempts to match a {@code List<BoundToken>} pattern leading with a '*' matching token, ending with a '*'
+ * matching token, and with a single '*' token in the middle.
+ */
@Test
public void testLiteralsWithZeroPlus() throws Exception {
// pattern: "*tart*IT.jav*"
List pattern = new ArrayList<>();
- pattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
- pattern.addAll(BoundTokensTestUtil.literalsForString("tart"));
- pattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
- pattern.addAll(BoundTokensTestUtil.literalsForString("IT.jav"));
- pattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
+ pattern.add(ZERO_OR_MORE);
+ pattern.addAll(literalsForString("tart"));
+ pattern.add(ZERO_OR_MORE);
+ pattern.addAll(literalsForString("IT.jav"));
+ pattern.add(ZERO_OR_MORE);
// path: startXIT.java (sanity, should pass)
- List path = BoundTokensTestUtil.literalsForString("startXIT.java");
+ List path = literalsForString("startXIT.java");
assertTrue(underTest.match(pattern, path));
// path: startXIT.jav (sanity, should pass)
- path = BoundTokensTestUtil.literalsForString("startXIT.jav");
+ path = literalsForString("startXIT.jav");
assertTrue(underTest.match(pattern, path));
// path: tartXIT.java (sanity, should pass)
- path = BoundTokensTestUtil.literalsForString("tartXIT.java");
+ path = literalsForString("tartXIT.java");
assertTrue(underTest.match(pattern, path));
// path: tartXIT.jav (sanity, should pass)
- path = BoundTokensTestUtil.literalsForString("tartXIT.jav");
+ path = literalsForString("tartXIT.jav");
assertTrue(underTest.match(pattern, path));
// path: strtXIT.java (first literal 'tart' doesn't match)
- path = BoundTokensTestUtil.literalsForString("strtXIT.java");
+ path = literalsForString("strtXIT.java");
assertFalse(underTest.match(pattern, path));
// path: startXITT.java (middle literal 'IT.jav' doesn't match)
- path = BoundTokensTestUtil.literalsForString("startXITT.java");
+ path = literalsForString("startXITT.java");
assertFalse(underTest.match(pattern, path));
}
-
+ /**
+ * Attempts various path matches against a pattern that contains three matching '?' tokens, at the
+ * beginning, middle, and end of the pattern.
+ */
@Test
public void testMultipleSingleCharacterTokens() throws Exception {
// pattern: "?tart?IT.jav?"
List pattern = new ArrayList<>();
- pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
- pattern.addAll(BoundTokensTestUtil.literalsForString("tart"));
- pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
- pattern.addAll(BoundTokensTestUtil.literalsForString("IT.jav"));
- pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
+ pattern.add(EXACTLY_ONE);
+ pattern.addAll(literalsForString("tart"));
+ pattern.add(EXACTLY_ONE);
+ pattern.addAll(literalsForString("IT.jav"));
+ pattern.add(EXACTLY_ONE);
// path: startXIT.java (sanity, should pass)
- List path = BoundTokensTestUtil.literalsForString("startXIT.java");
+ List path = literalsForString("startXIT.java");
assertTrue(underTest.match(pattern, path));
// path: FootartXIT.java (too many characters for first token)
- path = BoundTokensTestUtil.literalsForString("FootartXIT.java");
+ path = literalsForString("FootartXIT.java");
assertFalse(underTest.match(pattern, path));
// path: tartXIT.java (no characters for first token)
- path = BoundTokensTestUtil.literalsForString("tartXIT.java");
+ path = literalsForString("tartXIT.java");
assertFalse(underTest.match(pattern, path));
// path: StartItUpIT.java (too many characters for middle token)
- path = BoundTokensTestUtil.literalsForString("StartItUpIT.java");
+ path = literalsForString("StartItUpIT.java");
assertFalse(underTest.match(pattern, path));
// path: StartIT.java (no characters for middle token)
- path = BoundTokensTestUtil.literalsForString("StartIT.java");
+ path = literalsForString("StartIT.java");
assertFalse(underTest.match(pattern, path));
// path: StartXIT.jav (no characters for last token)
- path = BoundTokensTestUtil.literalsForString("StartXIT.jav");
+ path = literalsForString("StartXIT.jav");
assertFalse(underTest.match(pattern, path));
// path: StartXIT.javaa (too many characters for last token)
- path = BoundTokensTestUtil.literalsForString("StartXIT.javaa");
+ path = literalsForString("StartXIT.javaa");
assertFalse(underTest.match(pattern, path));
}
+ /**
+ * Attempts to match a path against a pattern containing a single matching token '?' in the middle.
+ */
@Test
public void testMatchLiteralFirstExactlyOneNoMatch() throws Exception {
// pattern: "Start?IT.java"
List pattern = new ArrayList<>();
- pattern.addAll(BoundTokensTestUtil.literalsForString("Start"));
- pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
- pattern.addAll(BoundTokensTestUtil.literalsForString("IT.java"));
+ pattern.addAll(literalsForString("Start"));
+ pattern.add(EXACTLY_ONE);
+ pattern.addAll(literalsForString("IT.java"));
// path: startXIT.java (sanity, should pass)
- List path = BoundTokensTestUtil.literalsForString("StartXIT.java");
+ List path = literalsForString("StartXIT.java");
assertTrue(underTest.match(pattern, path));
// path: StartFooIT.java (won't match)
- path = BoundTokensTestUtil.literalsForString("StartFooIT.java");
+ path = literalsForString("StartFooIT.java");
assertFalse(underTest.match(pattern, path));
}
- @Test
- public void testMatchLiteralFirstExactlyOne() throws Exception {
- // pattern: "Start?IT.java"
-
- List pattern = new ArrayList<>();
- pattern.addAll(BoundTokensTestUtil.literalsForString("Start"));
- pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
- pattern.addAll(BoundTokensTestUtil.literalsForString("IT.java"));
-
- // path: StartXIT.java
- List path = BoundTokensTestUtil.literalsForString("StartXIT.java");
-
- assertTrue(underTest.match(pattern, path));
- }
-
+ /**
+ * Attempts to match a path against a pattern containing a single matching token '*' in the middle.
+ */
@Test
public void testMatchLiteralFirstZeroPlus() throws Exception {
// pattern: "Start*IT.java"
List pattern = new ArrayList<>();
- pattern.addAll(BoundTokensTestUtil.literalsForString("Start"));
- pattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
- pattern.addAll(BoundTokensTestUtil.literalsForString("IT.java"));
+ pattern.addAll(literalsForString("Start"));
+ pattern.add(ZERO_OR_MORE);
+ pattern.addAll(literalsForString("IT.java"));
- // path: StartCarIT.java
- List path = BoundTokensTestUtil.literalsForString("StartCarIT.java");
+ // path: StartCarIT.java ('*' should match 'Car')
+ List path = literalsForString("StartCarIT.java");
+ assertTrue(underTest.match(pattern, path));
+ // path: StartIT.java ('*' should match zero characters)
+ path = literalsForString("StartIT.java");
assertTrue(underTest.match(pattern, path));
}
@@ -272,13 +324,13 @@ public void testMatchTokenFirst() throws Exception {
// pattern: "*File*IT.java"
List pattern = new ArrayList<>();
- pattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
- pattern.addAll(BoundTokensTestUtil.literalsForString("File"));
- pattern.add(BoundTokensTestUtil.ZERO_OR_MORE);
- pattern.addAll(BoundTokensTestUtil.literalsForString("IT.java"));
+ pattern.add(ZERO_OR_MORE);
+ pattern.addAll(literalsForString("File"));
+ pattern.add(ZERO_OR_MORE);
+ pattern.addAll(literalsForString("IT.java"));
// path: UnixFileSmallIT.java
- List path = BoundTokensTestUtil.literalsForString("UnixFileSmallIT.java");
+ List path = literalsForString("UnixFileSmallIT.java");
assertTrue(underTest.match(pattern, path));
}
@@ -287,17 +339,53 @@ public void testMatchTokenFirst() throws Exception {
public void testMatchConsecutiveMatchTokens() throws Exception {
// pattern: "Foo??.java"
List pattern = new ArrayList<>();
- pattern.addAll(BoundTokensTestUtil.literalsForString("Foo"));
- pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
- pattern.add(BoundTokensTestUtil.EXACTLY_ONE);
- pattern.addAll(BoundTokensTestUtil.literalsForString(".java"));
+ pattern.addAll(literalsForString("Foo"));
+ pattern.add(EXACTLY_ONE);
+ pattern.add(EXACTLY_ONE);
+ pattern.addAll(literalsForString(".java"));
// path: FooIT.java
- List path = BoundTokensTestUtil.literalsForString("FooIT.java");
+ List path = literalsForString("FooIT.java");
assertTrue(underTest.match(pattern, path));
}
+ /**
+ * Attempt to match a directory against the directory match token '**'
+ */
+ @Test
+ public void testMatchZeroPlusAndLiteral() throws Exception {
+ // pattern: "**"
+ List pattern = DIR_L;
+
+ // path: "src"
+ List path = literalsForString("src");
+
+ assertTrue(underTest.match(pattern, path));
+ }
+
+ /**
+ * Ensures that a pattern like 'Foo**IT.java' - while almost certainly a mistake by the person who created the
+ * pattern - is a valid pattern, and makes sure it matches.
+ */
+ @Test
+ public void testMatchMultipleZeroPlusTokens() throws Exception {
+ // pattern: "Foo**IT.java"
+ List pattern = new ArrayList<>();
+ pattern.addAll(literalsForString("Foo"));
+ pattern.add(ZERO_OR_MORE);
+ pattern.add(ZERO_OR_MORE);
+ pattern.addAll(literalsForString("IT.java"));
+
+ // path: "FooIT.java" should match - '**' matches zero characters
+ List path = literalsForString("FooIT.java");
+ assertTrue(underTest.match(pattern, path));
+
+ // path: "FooBarBazIT.java" should match - '**' matches "BarBaz"
+ path = literalsForString("FooBarBazIT.java");
+ assertTrue(underTest.match(pattern, path));
+ }
+
@Test
public void testFindNextToken() throws Exception {
assertEquals(19, underTest.findNextToken("src/test/resources/*IT.java", 0));
@@ -376,6 +464,24 @@ public void testRightAnchorBar() throws Exception {
assertEquals(8, underTest.findRightAnchor(pattern, path, 4, 1));
}
+ /**
+ * Ensures that both representations of the match token '**' - a single BoundToken containing a DIRECTORY, or two
+ * consecutive BoundTokens containing a ZERO_OR_MORE_CHARACTERS - are considered a "directory match" token
+ * by the ExpressionMatcher.
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testIsDirectoryMatch() throws Exception {
+ List directory = Arrays.asList(new BoundToken(Token.DIRECTORY, Token.DIRECTORY.getTokenString()));
+ List consecutiveZeroOrMore = Arrays.asList(
+ new BoundToken(Token.ZERO_OR_MORE_CHARACTERS, Token.ZERO_OR_MORE_CHARACTERS.getTokenString()),
+ new BoundToken(Token.ZERO_OR_MORE_CHARACTERS, Token.ZERO_OR_MORE_CHARACTERS.getTokenString()));
+
+ assertTrue(underTest.isDirectoryMatchToken(directory));
+ assertTrue(underTest.isDirectoryMatchToken(consecutiveZeroOrMore));
+ }
+
void assertListsEqual(List expected, List actual) {
assertExpectedCount(expected.size(), actual);