diff --git a/src/main/java/io/deephaven/csv/CsvSpecs.java b/src/main/java/io/deephaven/csv/CsvSpecs.java
index fdb4ed66..a11718a7 100644
--- a/src/main/java/io/deephaven/csv/CsvSpecs.java
+++ b/src/main/java/io/deephaven/csv/CsvSpecs.java
@@ -319,7 +319,7 @@ default Builder putParserForIndex(int index, Parser> parser) {
* 7-bit ASCII. The default is '{@value #defaultQuote}' For example:
*
*
- * 123,"hello, there",456,
+ * 123,"hello, there",456
*
*
* Would be read as the three fields:
@@ -337,6 +337,29 @@ default Builder putParserForIndex(int index, Parser> parser) {
*/
Builder quote(char quote);
+ /**
+ * The escape character (used when you want field delimiters or quote characters to be interpreted as literal
+ * text, or you want to add the Java-style escape sequences \b, \t, \n, \r, or \f). Typically set to the
+ * backslash character ('\'). Must be 7-bit ASCII. We do not decode Java octal or Unicode escape sequences 0xx or
+ * uxxxx. A literal carriage return, a literal newline, or a non-ASCII character cannot be escaped; the reader
+ * reports an error if one of them follows the escape character. Any other escaped character is taken literally.
+ * The default is null, interpreted as unset. For example, with the escape character set to '\':
+ *
+ * <pre>
+ * 123,hello\, there\n,456
+ * </pre>
+ *
+ * Would be read as the three fields:
+ *
+ * <ul>
+ * <li>123
+ * <li>hello, there\n (where \n is the newline character)
+ * <li>456
+ * </ul>
+ *
+ * @param escape The escape character, or null to leave escaping disabled.
+ * @return self after modifying the escape property.
+ */
+ Builder escape(Character escape);
+
/**
* Whether to trim leading and trailing blanks from non-quoted values. The default is {@code true}.
*
@@ -399,6 +422,9 @@ void check() {
final List problems = new ArrayList<>();
check7BitAscii("quote", quote(), problems);
check7BitAscii("delimiter", delimiter(), problems);
+ if (escape() != null) {
+ check7BitAscii("escape", escape(), problems);
+ }
checkNonnegative("skipRows", skipRows(), problems);
checkNonnegative("skipHeaderRows", skipHeaderRows(), problems);
checkNonnegative("numRows", numRows(), problems);
@@ -699,7 +725,6 @@ public char delimiter() {
return defaultDelimiter;
}
-
private static final char defaultQuote = '"';
/**
@@ -712,6 +737,17 @@ public char quote() {
return defaultQuote;
}
+ /**
+ * See {@link Builder#escape}. Unless the caller sets it, this returns {@code null}, which is interpreted as "no
+ * escape character configured".
+ *
+ * @return The caller-specified escape character, or null if none.
+ */
+ @Default
+ @Nullable
+ public Character escape() {
+ return null;
+ }
+
/**
* See {@link Builder#ignoreSurroundingSpaces}.
*
diff --git a/src/main/java/io/deephaven/csv/reading/CsvReader.java b/src/main/java/io/deephaven/csv/reading/CsvReader.java
index 04610758..56560881 100644
--- a/src/main/java/io/deephaven/csv/reading/CsvReader.java
+++ b/src/main/java/io/deephaven/csv/reading/CsvReader.java
@@ -116,11 +116,15 @@ private static boolean needsUtf8Encoding(final Charset charset) {
private static Result delimitedReadLogic(
final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory)
throws CsvReaderException {
- // These two have already been validated by CsvSpecs to be 7-bit ASCII.
+ final byte IllegalUtf8 = (byte) 0xff;
+
+ // These three have already been validated by CsvSpecs to be 7-bit ASCII.
final byte quoteAsByte = (byte) specs.quote();
final byte delimiterAsByte = (byte) specs.delimiter();
+ final byte escapeCharAsByte = specs.escape() == null ? IllegalUtf8 : (byte) specs.escape().charValue();
final CellGrabber grabber =
- new DelimitedCellGrabber(stream, quoteAsByte, delimiterAsByte, specs.ignoreSurroundingSpaces(),
+ new DelimitedCellGrabber(stream, quoteAsByte, escapeCharAsByte, delimiterAsByte,
+ specs.ignoreSurroundingSpaces(),
specs.trim());
// For an "out" parameter
final MutableObject firstDataRowHolder = new MutableObject<>();
diff --git a/src/main/java/io/deephaven/csv/reading/cells/DelimitedCellGrabber.java b/src/main/java/io/deephaven/csv/reading/cells/DelimitedCellGrabber.java
index d235f2da..7c2857f6 100644
--- a/src/main/java/io/deephaven/csv/reading/cells/DelimitedCellGrabber.java
+++ b/src/main/java/io/deephaven/csv/reading/cells/DelimitedCellGrabber.java
@@ -21,6 +21,11 @@ public final class DelimitedCellGrabber implements CellGrabber {
private final InputStream inputStream;
/** The configured CSV quote character (typically '"'). Must be 7-bit ASCII. */
private final byte quoteChar;
+ /**
+ * The configured CSV escape character. Must be 7-bit ASCII. If configured to null in CsvSpecs, we set it to the
+ * illegal UTF-8 byte 0xff so it has no effect.
+ */
+ private final byte escapeChar;
/** The configured CVS field delimiter (typically ','). Must be 7-bit ASCII. */
private final byte fieldDelimiter;
/** Whether to trim leading and trailing blanks from non-quoted values. */
@@ -40,7 +45,8 @@ public final class DelimitedCellGrabber implements CellGrabber {
* buffer[] array. But we can't do that when the input cell spans more than one buffer[] chunk, or when the input
* cell does not exactly represent the output. This latter case can happen for example when an escaped quote ("")
* needs to be returned as a single quotation mark ("). So if our input is hello""there, then we can't directly
- * return a slice of the input array, because actually we need hello"there (one quotation mark, not two).
+ * return a slice of the input array, because actually we need hello"there (one quotation mark, not two). Another
+ * case where this can happen is when the escape character is enabled and we encounter an escape like \, or \n.
*/
private final GrowableByteBuffer spillBuffer;
/**
@@ -56,6 +62,8 @@ public final class DelimitedCellGrabber implements CellGrabber {
*
* @param inputStream The input, represented as UTF-8 bytes.
* @param quoteChar The configured quote char. Typically "
+ * @param escapeChar The configured escape char. Pass the illegal UTF-8 byte 0xff (our representation of 'none')
+ * to disable escaping; if the feature is desired, it is typically set to \
* @param fieldDelimiter The configured field delimiter. Typically ,
* @param ignoreSurroundingSpaces Whether to ignore surrounding spaces
* @param trim Whether to trim spaces inside quoted values.
@@ -63,11 +71,13 @@ public final class DelimitedCellGrabber implements CellGrabber {
public DelimitedCellGrabber(
final InputStream inputStream,
final byte quoteChar,
+ final byte escapeChar,
final byte fieldDelimiter,
final boolean ignoreSurroundingSpaces,
final boolean trim) {
this.inputStream = inputStream;
this.quoteChar = quoteChar;
+ this.escapeChar = escapeChar;
this.fieldDelimiter = fieldDelimiter;
this.ignoreSurroundingSpaces = ignoreSurroundingSpaces;
this.trim = trim;
@@ -131,10 +141,16 @@ private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastIn
}
prevCharWasCarriageReturn = false;
}
- if (ch != quoteChar) {
+ if (ch != quoteChar && ch != escapeChar) {
// Ordinary character. Note: in quoted mode we will gladly eat field and line separators.
continue;
}
+
+ if (ch == escapeChar) {
+ processEscapeChar();
+ continue;
+ }
+
// This character is a quote char. It could be the end of the cell, or it could be an escaped
// quote char (e.g. ""). The way to tell is to peek ahead at the next character.
if (!tryEnsureMore()) {
@@ -264,10 +280,74 @@ private void finishField(final ByteSlice dest, final MutableBoolean lastInRow,
++physicalRowNum;
return;
}
+ if (ch == escapeChar) {
+ ++offset;
+ processEscapeChar();
+ continue;
+ }
+
++offset;
}
}
+ /**
+ * Handles one escape sequence: spills the pending input (which, per the comment below, ends with the escape
+ * character), reads the character that follows the escape character, and stores its translation (see
+ * {@link #transformEscapedChar}) in the spill buffer in the slot that held the escape character.
+ *
+ * NOTE(review): this relies on {@code offset} already pointing just past the escape character when this method
+ * is called. finishField increments offset before calling; confirm the same holds for processQuotedMode.
+ *
+ * @throws CsvReaderException if the input ends right after the escape character, or if transformEscapedChar
+ * rejects the escaped character (non-ASCII, carriage return, or newline).
+ */
+ private void processEscapeChar() throws CsvReaderException {
+ // Spill data up to and including the escape character into the spill buffer. The escape character's slot
+ // serves as a placeholder: below, we overwrite it with the (possibly transformed) escaped character.
+ spillRange();
+
+ // This character is an escape character. In practice, it is used either to make the next metacharacter
+ // (like the quote or field separator) an ordinary character, or to provide a C-style special character like
+ // newline or tab.
+ // However, it can't appear as the last character of the input.
+ if (!tryEnsureMore()) {
+ throw new CsvReaderException("The escape character cannot be the last character of the input");
+ }
+
+ // Consume the next char (the escaped character). Potentially transform it if it is one of the C escapes:
+ // characters b, t, n etc. transformEscapedChar throws for non-ASCII bytes, carriage return, and newline.
+ final byte nextChar = buffer[offset++];
+ final byte nextCharTransformed = transformEscapedChar(nextChar);
+
+ // Replace the placeholder character (the last byte in the spill buffer) with the transformed character.
+ spillBuffer.data()[spillBuffer.size() - 1] = nextCharTransformed;
+
+ // Advance the spill buffer's notion of "next start position" so it skips the escaped character.
+ startOffset = offset;
+ }
+
+
+    /**
+     * Maps the character that follows the escape character to the byte it stands for. Only the Java-style
+     * single-letter escapes are interpreted; the octal (0xx) and Unicode (uxxxx) escape sequences are not.
+     *
+     * @param nextChar The character following the escape character.
+     * @return The control character \b, \t, \n, \r, or \f when {@code nextChar} is b, t, n, r, or f respectively;
+     *         otherwise {@code nextChar} itself, unchanged.
+     * @throws CsvReaderException if {@code nextChar} is a non-ASCII byte, a carriage return, or a newline.
+     */
+    private static byte transformEscapedChar(byte nextChar) throws CsvReaderException {
+        // Java bytes are signed, so anything outside 7-bit ASCII shows up as a negative value.
+        final boolean isAscii = nextChar >= 0;
+        if (!isAscii) {
+            throw new CsvReaderException("Can't escape a non-ASCII character");
+        }
+
+        final byte translated;
+        switch (nextChar) {
+            case '\r':
+            case '\n':
+                // A literal line terminator may not follow the escape character.
+                throw new CsvReaderException("Can't escape a carriage return or newline");
+            case 'b':
+                translated = '\b';
+                break;
+            case 't':
+                translated = '\t';
+                break;
+            case 'n':
+                translated = '\n';
+                break;
+            case 'r':
+                translated = '\r';
+                break;
+            case 'f':
+                translated = '\f';
+                break;
+            default:
+                // Everything else (delimiter, quote, the escape character itself, ...) is taken literally.
+                translated = nextChar;
+                break;
+        }
+        return translated;
+    }
+
/** @return true if there are more characters. */
private boolean tryEnsureMore() throws CsvReaderException {
if (offset != size) {
diff --git a/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java b/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java
index 5538f21a..1bf4fbfc 100644
--- a/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java
+++ b/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java
@@ -23,7 +23,7 @@ public class FixedCellGrabber implements CellGrabber {
*/
public static CellGrabber makeLineGrabber(InputStream stream) {
final byte IllegalUtf8 = (byte) 0xff;
- return new DelimitedCellGrabber(stream, IllegalUtf8, IllegalUtf8, true, false);
+ return new DelimitedCellGrabber(stream, IllegalUtf8, IllegalUtf8, IllegalUtf8, true, false);
}
private final CellGrabber lineGrabber;
diff --git a/src/test/java/io/deephaven/csv/EscapeTest.java b/src/test/java/io/deephaven/csv/EscapeTest.java
new file mode 100644
index 00000000..c7a107bf
--- /dev/null
+++ b/src/test/java/io/deephaven/csv/EscapeTest.java
@@ -0,0 +1,158 @@
+package io.deephaven.csv;
+
+import io.deephaven.csv.parsers.Parser;
+import io.deephaven.csv.reading.CsvReader;
+import io.deephaven.csv.testutil.*;
+import io.deephaven.csv.util.CsvReaderException;
+import org.assertj.core.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import java.io.InputStream;
+import java.lang.reflect.Array;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.stream.Stream;
+
+/**
+ * Tests for the optional escape character of {@link CsvSpecs}: how it interacts with field delimiters, quote
+ * characters and the C-style escapes (b, t, n, r, f), and which uses of it are rejected with an error.
+ */
+public class EscapeTest {
+    /**
+     * Reads a one-row input with backtick as the quote character and the supplied escape character (null meaning
+     * "not configured"), then compares the parsed cells with the expected values.
+     */
+    @ParameterizedTest
+    @MethodSource("provideTuplesForEscapeTest")
+    public void escapeTest(String input, Character escape, Object[] expectedValues) throws CsvReaderException {
+        final CsvSpecs specs = CsvTestUtil.defaultCsvBuilder().hasHeaderRow(false).quote('`').escape(escape).build();
+        assertSingleRow(specs, input, expectedValues);
+    }
+
+    private static Stream<Arguments> provideTuplesForEscapeTest() {
+        // Note for Java readability we use unusual characters for quote and escape.
+        // Namely: quote is ` (backtick) and escape is | (vertical bar).
+        return Stream.of(
+                // Cases that are not surrounded with quotes
+
+                // Deep|,haven,42 with no escape configured comes through as "Deep|", "haven", 42
+                Arguments.of("Deep|,haven,42\n", null, new Object[] {"Deep|", "haven", 42}),
+                // Deep|,haven,42 with escape configured as | comes through as "Deep,haven", 42
+                Arguments.of("Deep|,haven,42\n", '|', new Object[] {"Deep,haven", 42}),
+                // Deephave|n,42 with no escape configured comes through as "Deephave|n", 42
+                Arguments.of("Deephave|n,42\n", null, new Object[] {"Deephave|n", 42}),
+                // Deephave|n,42 with escape configured as | comes through as "Deephave\n", with \n being newline
+                Arguments.of("Deephave|n,42\n", '|', new Object[] {"Deephave\n", 42}),
+
+                // Cases that are surrounded with quotes
+
+                // `Deep,haven`,42 with no escape configured comes through as "Deep,haven", 42
+                // because quotation marks are another way to escape the field separator.
+                Arguments.of("`Deep,haven`,42\n", null, new Object[] {"Deep,haven", 42}),
+                // `Deep|,haven`,42 with escape configured as | also comes through as "Deep,haven", 42
+                // because the escape is processed even inside quotes.
+                Arguments.of("`Deep|,haven`,42\n", '|', new Object[] {"Deep,haven", 42}),
+                // `Deephave|n`,42 with an unrelated escape character (*) configured comes through as
+                // "Deephave|n", 42 because | is then an ordinary character
+                Arguments.of("`Deephave|n`,42\n", '*', new Object[] {"Deephave|n", 42}),
+                // `Deephave|n`,42 with escape configured as | comes through as "Deephave\n", with \n being newline
+                Arguments.of("`Deephave|n`,42\n", '|', new Object[] {"Deephave\n", 42}),
+
+                // C style escapes
+
+                // Without escape configured, C-style escapes are not special
+                Arguments.of("Deep|b|r|n|t|fhaven,42\n", null, new Object[] {"Deep|b|r|n|t|fhaven", 42}),
+                // With escape configured, C-style escapes are special
+                Arguments.of("Deep|b|r|n|t|fhaven,42\n", '|', new Object[] {"Deep\b\r\n\t\fhaven", 42}),
+                // Surrounding with quotes, without escape configured, C-style escapes are not special
+                Arguments.of("`Deep|b|r|n|t|fhaven`,42\n", null, new Object[] {"Deep|b|r|n|t|fhaven", 42}),
+                // Surrounding with quotes, with escape configured, C-style escapes are special
+                Arguments.of("`Deep|b|r|n|t|fhaven`,42\n", '|', new Object[] {"Deep\b\r\n\t\fhaven", 42}),
+
+                // Quotation mark in the middle of unquoted text
+
+                // Without escape configured: a quotation mark in the middle is passed through
+                Arguments.of("Deep`haven,42\n", null, new Object[] {"Deep`haven", 42}),
+                // With escape configured: a quotation mark in the middle is still passed through (not that interesting)
+                Arguments.of("Deep`haven,42\n", '|', new Object[] {"Deep`haven", 42}),
+                // Without escape configured: an escaped quotation mark in the middle just passes through the | and the
+                // `
+                Arguments.of("Deep|`haven,42\n", null, new Object[] {"Deep|`haven", 42}),
+                // With escape configured: an escaped quotation mark in the middle passes through the `
+                Arguments.of("Deep|`haven,42\n", '|', new Object[] {"Deep`haven", 42}),
+
+                // Getting a quotation mark in the middle of quoted text
+
+                // Without escape configured: a double quotation mark in the middle is passed through as a single quote
+                Arguments.of("`Deep``haven`,42\n", null, new Object[] {"Deep`haven", 42}),
+                // With escape configured: a double quotation mark in the middle is still passed through (same as above)
+                Arguments.of("`Deep``haven`,42\n", '|', new Object[] {"Deep`haven", 42}),
+                // With escape configured: a single escaped quotation mark in the middle passes through the quote
+                Arguments.of("`Deep|`haven`,42\n", '|', new Object[] {"Deep`haven", 42}));
+    }
+
+    /**
+     * Reads a one-row input with | as the escape character and the supplied quote character, then compares the
+     * parsed cells with the expected values.
+     */
+    @ParameterizedTest
+    @MethodSource("provideTuplesForQuoteTest")
+    public void choiceOfQuoteTest(String input, char quote, Object[] expectedValues) throws CsvReaderException {
+        final CsvSpecs specs = CsvTestUtil.defaultCsvBuilder().hasHeaderRow(false).quote(quote).escape('|').build();
+        assertSingleRow(specs, input, expectedValues);
+    }
+
+    private static Stream<Arguments> provideTuplesForQuoteTest() {
+        return Stream.of(
+                // `Deep|`haven`,42 comes through as Deep`haven, 42
+                Arguments.of("`Deep|`haven`,42\n", '`', new Object[] {"Deep`haven", 42}),
+                // *Deep|*haven*,42 comes through as Deep*haven, 42
+                Arguments.of("*Deep|*haven*,42\n", '*', new Object[] {"Deep*haven", 42}),
+                // nDeephave|nn,42 comes through as Deephave\n, 42 where \n is the newline.
+                // This demonstrates the fanciful example that you *can* use 'n' as a quote character, but
+                // you should know that |n will translate to newline, not escape your quote character.
+                Arguments.of("nDeephave|nn,42\n", 'n', new Object[] {"Deephave\n", 42}),
+                // nDeephavennn,42 comes through as Deephaven, 42.
+                // Following up to the above, this shows if you use 'n' as a quote character
+                // and you want it in your data, you need to double it.
+                Arguments.of("nDeephavennn,42\n", 'n', new Object[] {"Deephaven", 42}));
+    }
+
+    /**
+     * Reads an input that misuses the escape character | and checks that reading fails with a message containing
+     * the given fragment.
+     */
+    @ParameterizedTest
+    @MethodSource("provideTuplesForErroneousUseOfEscapeTest")
+    public void erroneousUseOfEscape(String input, String exceptionFragment) throws CsvReaderException {
+        final Charset charset = StandardCharsets.UTF_8;
+        final CsvSpecs specs = CsvTestUtil.defaultCsvBuilder().hasHeaderRow(false).escape('|').build();
+        final InputStream stream = CsvTestUtil.toInputStream(input, charset);
+
+        Assertions.assertThatThrownBy(() -> {
+            CsvReader.read(specs, stream, charset, CsvTestUtil.makeMySinkFactory());
+        }).hasMessageContaining(exceptionFragment);
+    }
+
+    private static Stream<Arguments> provideTuplesForErroneousUseOfEscapeTest() {
+        return Stream.of(
+                // Last character of input cannot be escape
+                Arguments.of("hello|", "The escape character cannot be the last character of the input"),
+                // Cannot escape carriage return
+                Arguments.of("hello|\r", "Can't escape a carriage return or newline"),
+                // Cannot escape newline
+                Arguments.of("hello|\n", "Can't escape a carriage return or newline"),
+                // Cannot escape non-ASCII
+                Arguments.of("hello|❤", "Can't escape a non-ASCII character"));
+    }
+
+    /**
+     * Reads {@code input} as UTF-8 with the given specs and asserts that the result has exactly one row whose
+     * cells, column by column, equal {@code expectedValues}.
+     *
+     * @param specs The reader configuration under test.
+     * @param input The CSV text to read.
+     * @param expectedValues The expected cell of the single row, one entry per column.
+     * @throws CsvReaderException if reading fails.
+     */
+    private static void assertSingleRow(CsvSpecs specs, String input, Object[] expectedValues)
+            throws CsvReaderException {
+        final Charset charset = StandardCharsets.UTF_8;
+        final InputStream stream = CsvTestUtil.toInputStream(input, charset);
+        final CsvReader.Result result = CsvReader.read(specs, stream, charset, CsvTestUtil.makeMySinkFactory());
+
+        Assertions.assertThat(result.numRows()).isEqualTo(1);
+        Assertions.assertThat(result.numCols()).isEqualTo(expectedValues.length);
+
+        for (int i = 0; i != result.numCols(); ++i) {
+            // Column data is a typed array; fetch the only row reflectively so any element type works.
+            final Object array = result.columns()[i].data();
+            final Object element0 = Array.get(array, 0);
+            Assertions.assertThat(element0).isEqualTo(expectedValues[i]);
+        }
+    }
+}