diff --git a/src/main/java/io/deephaven/csv/CsvSpecs.java b/src/main/java/io/deephaven/csv/CsvSpecs.java index fdb4ed66..a11718a7 100644 --- a/src/main/java/io/deephaven/csv/CsvSpecs.java +++ b/src/main/java/io/deephaven/csv/CsvSpecs.java @@ -319,7 +319,7 @@ default Builder putParserForIndex(int index, Parser parser) { * 7-bit ASCII. The default is '{@value #defaultQuote}' For example: * *
-         * 123,"hello, there",456,
+         * 123,"hello, there",456
          * 
* * Would be read as the three fields: @@ -337,6 +337,29 @@ default Builder putParserForIndex(int index, Parser parser) { */ Builder quote(char quote); + /** + * The escape character (used when you want field or line delimiters to be interpreted as literal text, or you + * want to add the Java-style escape sequences \b, \t, \n, \r, or \f). Typically set to the backslash character + * ('\'). Must be 7-bit ASCII. We do not decode Java octal or Unicode escape sequences 0xx or uxxxx. The default + * is null, interpreted as unset. For example, with the escape character set to '\': + * + *
+         * 123,hello\, there\n,456
+         * 
+ * + * Would be read as the three fields: + * + * + * + * @param escape The escape property. + * @return self after modifying the escape property. + */ + Builder escape(Character escape); + /** * Whether to trim leading and trailing blanks from non-quoted values. The default is {@code true}. * @@ -399,6 +422,9 @@ void check() { final List problems = new ArrayList<>(); check7BitAscii("quote", quote(), problems); check7BitAscii("delimiter", delimiter(), problems); + if (escape() != null) { + check7BitAscii("escape", escape(), problems); + } checkNonnegative("skipRows", skipRows(), problems); checkNonnegative("skipHeaderRows", skipHeaderRows(), problems); checkNonnegative("numRows", numRows(), problems); @@ -699,7 +725,6 @@ public char delimiter() { return defaultDelimiter; } - private static final char defaultQuote = '"'; /** @@ -712,6 +737,17 @@ public char quote() { return defaultQuote; } + /** + * See {@link Builder#escape}. + * + * @return The caller-specified escape character, or null if none. + */ + @Default + @Nullable + public Character escape() { + return null; + } + /** * See {@link Builder#ignoreSurroundingSpaces}. * diff --git a/src/main/java/io/deephaven/csv/reading/CsvReader.java b/src/main/java/io/deephaven/csv/reading/CsvReader.java index 04610758..56560881 100644 --- a/src/main/java/io/deephaven/csv/reading/CsvReader.java +++ b/src/main/java/io/deephaven/csv/reading/CsvReader.java @@ -116,11 +116,15 @@ private static boolean needsUtf8Encoding(final Charset charset) { private static Result delimitedReadLogic( final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory) throws CsvReaderException { - // These two have already been validated by CsvSpecs to be 7-bit ASCII. + final byte IllegalUtf8 = (byte) 0xff; + + // These three have already been validated by CsvSpecs to be 7-bit ASCII. 
final byte quoteAsByte = (byte) specs.quote(); final byte delimiterAsByte = (byte) specs.delimiter(); + final byte escapeCharAsByte = specs.escape() == null ? IllegalUtf8 : (byte) specs.escape().charValue(); final CellGrabber grabber = - new DelimitedCellGrabber(stream, quoteAsByte, delimiterAsByte, specs.ignoreSurroundingSpaces(), + new DelimitedCellGrabber(stream, quoteAsByte, escapeCharAsByte, delimiterAsByte, + specs.ignoreSurroundingSpaces(), specs.trim()); // For an "out" parameter final MutableObject firstDataRowHolder = new MutableObject<>(); diff --git a/src/main/java/io/deephaven/csv/reading/cells/DelimitedCellGrabber.java b/src/main/java/io/deephaven/csv/reading/cells/DelimitedCellGrabber.java index d235f2da..7c2857f6 100644 --- a/src/main/java/io/deephaven/csv/reading/cells/DelimitedCellGrabber.java +++ b/src/main/java/io/deephaven/csv/reading/cells/DelimitedCellGrabber.java @@ -21,6 +21,11 @@ public final class DelimitedCellGrabber implements CellGrabber { private final InputStream inputStream; /** The configured CSV quote character (typically '"'). Must be 7-bit ASCII. */ private final byte quoteChar; + /** + * The configured CSV escape character. Must be 7-bit ASCII. If configured to null in CsvSpecs, we set it to the + * illegal UTF-8 byte 0xff so it has no effect. + */ + private final byte escapeChar; /** The configured CVS field delimiter (typically ','). Must be 7-bit ASCII. */ private final byte fieldDelimiter; /** Whether to trim leading and trailing blanks from non-quoted values. */ @@ -40,7 +45,8 @@ public final class DelimitedCellGrabber implements CellGrabber { * buffer[] array. But we can't do that when the input cell spans more than one buffer[] chunk, or when the input * cell does not exactly represent the output. This latter case can happen for example when an escaped quote ("") * needs to be returned as a single quotation mark ("). 
So if our input is hello""there, then we can't directly - * return a slice of the input array, because actually we need hello"there (one quotation mark, not two). + * return a slice of the input array, because actually we need hello"there (one quotation mark, not two). Another + * case where this can happen is when the escape character is enabled and we encounter an escape like \, or \n. */ private final GrowableByteBuffer spillBuffer; /** @@ -56,6 +62,8 @@ public final class DelimitedCellGrabber implements CellGrabber { * * @param inputStream The input, represented as UTF-8 bytes. * @param quoteChar The configured quote char. Typically " + * @param escapeChar The configured escape char. Defaults to our representation of 'none' but if the feature is + * desired, is typically set to \ * @param fieldDelimiter The configured field delimiter. Typically , * @param ignoreSurroundingSpaces Whether to ignore surrounding spaces * @param trim Whether to trim spaces inside quoted values. @@ -63,11 +71,13 @@ public final class DelimitedCellGrabber implements CellGrabber { public DelimitedCellGrabber( final InputStream inputStream, final byte quoteChar, + final byte escapeChar, final byte fieldDelimiter, final boolean ignoreSurroundingSpaces, final boolean trim) { this.inputStream = inputStream; this.quoteChar = quoteChar; + this.escapeChar = escapeChar; this.fieldDelimiter = fieldDelimiter; this.ignoreSurroundingSpaces = ignoreSurroundingSpaces; this.trim = trim; @@ -131,10 +141,16 @@ private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastIn } prevCharWasCarriageReturn = false; } - if (ch != quoteChar) { + if (ch != quoteChar && ch != escapeChar) { // Ordinary character. Note: in quoted mode we will gladly eat field and line separators. continue; } + + if (ch == escapeChar) { + processEscapeChar(); + continue; + } + // This character is a quote char. It could be the end of the cell, or it could be an escaped // quote char (e.g. ""). 
The way to tell is to peek ahead at the next character. if (!tryEnsureMore()) { @@ -264,10 +280,74 @@ private void finishField(final ByteSlice dest, final MutableBoolean lastInRow, ++physicalRowNum; return; } + if (ch == escapeChar) { + ++offset; + processEscapeChar(); + continue; + } + ++offset; } } + private void processEscapeChar() throws CsvReaderException { + // Spill data up to and including the escape character into the spill buffer. + // Below, we will replace the escape character with the transformed escaped character. + spillRange(); + + // This character is an escape character. In practice, it is used either to make the next + // metacharacter like the quote or field separator normal, or to provide a C-style special character like + // newline or tab. + // However, it can't appear as the last character of the input. + if (!tryEnsureMore()) { + throw new CsvReaderException("The escape character cannot be the last character of the input"); + } + + // Consume the next char (the escaped character). Potentially transform it if it is one of the C escapes: + // characters b, t, n etc + final byte nextChar = buffer[offset++]; + final byte nextCharTransformed = transformEscapedChar(nextChar); + + // Replace the placeholder character with the transformed character + spillBuffer.data()[spillBuffer.size() - 1] = nextCharTransformed; + + // Advance the spill buffer's notion of "next start position" so it skips the escaped character. + startOffset = offset; + } + + + /** + * Interpret the set of character escapes supported by Java. We do not currently interpret the octal 0xx or Unicode + * escape sequences uxxxx + * + * @param nextChar The character following the escape character. + * @return If one of (b, t, n, r, f), that value transformed to (\b, \t, \n, \r, \f). Otherwise, the value is + * returned unchanged. + * @throws CsvReaderException if passed a non-ASCII character, carriage return, or newline. 
+ */ + private static byte transformEscapedChar(byte nextChar) throws CsvReaderException { + if (nextChar < 0) { + throw new CsvReaderException("Can't escape a non-ASCII character"); + } + if (nextChar == '\r' || nextChar == '\n') { + throw new CsvReaderException("Can't escape a carriage return or newline"); + } + switch (nextChar) { + case 'b': + return '\b'; + case 't': + return '\t'; + case 'n': + return '\n'; + case 'r': + return '\r'; + case 'f': + return '\f'; + default: + return nextChar; + } + } + /** @return true if there are more characters. */ private boolean tryEnsureMore() throws CsvReaderException { if (offset != size) { diff --git a/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java b/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java index 5538f21a..1bf4fbfc 100644 --- a/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java +++ b/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java @@ -23,7 +23,7 @@ public class FixedCellGrabber implements CellGrabber { */ public static CellGrabber makeLineGrabber(InputStream stream) { final byte IllegalUtf8 = (byte) 0xff; - return new DelimitedCellGrabber(stream, IllegalUtf8, IllegalUtf8, true, false); + return new DelimitedCellGrabber(stream, IllegalUtf8, IllegalUtf8, IllegalUtf8, true, false); } private final CellGrabber lineGrabber; diff --git a/src/test/java/io/deephaven/csv/EscapeTest.java b/src/test/java/io/deephaven/csv/EscapeTest.java new file mode 100644 index 00000000..c7a107bf --- /dev/null +++ b/src/test/java/io/deephaven/csv/EscapeTest.java @@ -0,0 +1,158 @@ +package io.deephaven.csv; + +import io.deephaven.csv.parsers.Parser; +import io.deephaven.csv.reading.CsvReader; +import io.deephaven.csv.testutil.*; +import io.deephaven.csv.util.CsvReaderException; +import org.assertj.core.api.Assertions; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import 
org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.InputStream; +import java.lang.reflect.Array; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.stream.Stream; + +public class EscapeTest { + @ParameterizedTest + @MethodSource("provideTuplesForEscapeTest") + public void escapeTest(String input, Character escape, Object[] expectedValues) throws CsvReaderException { + final Charset charset = StandardCharsets.UTF_8; + final CsvSpecs specs = CsvTestUtil.defaultCsvBuilder().hasHeaderRow(false).quote('`').escape(escape).build(); + final InputStream stream = CsvTestUtil.toInputStream(input, charset); + CsvReader.Result result = CsvReader.read(specs, stream, charset, CsvTestUtil.makeMySinkFactory()); + + Assertions.assertThat(result.numRows()).isEqualTo(1); + Assertions.assertThat(result.numCols()).isEqualTo(expectedValues.length); + + for (int i = 0; i != result.numCols(); ++i) { + final Object array = result.columns()[i].data(); + final Object element0 = Array.get(array, 0); + Assertions.assertThat(element0).isEqualTo(expectedValues[i]); + } + } + + private static Stream provideTuplesForEscapeTest() { + // Note for Java readability we use unusual characters for quote and escape. + // Namely: quote is ` (backtick) and escape is | (vertical bar). 
+ return Stream.of( + // Cases that are not surrounded with quotes + + // Deep|,haven,42 with no escape configured comes through as "Deep|", "haven", 42 + Arguments.of("Deep|,haven,42\n", null, new Object[] {"Deep|", "haven", 42}), + // Deep|,haven,42 with escape configured as | comes through as "Deep,haven", 42 + Arguments.of("Deep|,haven,42\n", '|', new Object[] {"Deep,haven", 42}), + // Deephave|n,42 with no escape configured comes through as "Deephave|n", 42 + Arguments.of("Deephave|n,42\n", null, new Object[] {"Deephave|n", 42}), + // Deephave|n,42 with escape configured as | comes through as "Deephave\n", with \n being newline + Arguments.of("Deephave|n,42\n", '|', new Object[] {"Deephave\n", 42}), + + // Cases that are surrounded with quotes + + // `Deep,haven`,42 with no escape configured comes through as "Deep,haven", 42 + // because quotation marks are another way to escape the field separator. + Arguments.of("`Deep,haven`,42\n", null, new Object[] {"Deep,haven", 42}), + // `Deep|,haven`,42 with escape configured as | also comes through as "Deep,haven", 42 + // because the escape is processed even inside quotes. 
+ Arguments.of("`Deep|,haven`,42\n", '|', new Object[] {"Deep,haven", 42}), + // `Deephave|n`,42 with escape configured as * (a character absent from the input) comes through as "Deephave|n", 42 + Arguments.of("`Deephave|n`,42\n", '*', new Object[] {"Deephave|n", 42}), + // `Deephave|n`,42 with escape configured as | comes through as "Deephave\n", with \n being newline + Arguments.of("`Deephave|n`,42\n", '|', new Object[] {"Deephave\n", 42}), + + // C style escapes + + // Without escape configured, C-style escapes are not special + Arguments.of("Deep|b|r|n|t|fhaven,42\n", null, new Object[] {"Deep|b|r|n|t|fhaven", 42}), + // With escape configured, C-style escapes are special + Arguments.of("Deep|b|r|n|t|fhaven,42\n", '|', new Object[] {"Deep\b\r\n\t\fhaven", 42}), + // Surrounding with quotes, without escape configured, C-style escapes are not special + Arguments.of("`Deep|b|r|n|t|fhaven`,42\n", null, new Object[] {"Deep|b|r|n|t|fhaven", 42}), + // Surrounding with quotes, with escape configured, C-style escapes are special + Arguments.of("`Deep|b|r|n|t|fhaven`,42\n", '|', new Object[] {"Deep\b\r\n\t\fhaven", 42}), + + // Quotation mark in the middle of unquoted text + + // Without escape configured: a quotation mark in the middle is passed through + Arguments.of("Deep`haven,42\n", null, new Object[] {"Deep`haven", 42}), + // With escape configured: a quotation mark in the middle is still passed through (not that interesting) + Arguments.of("Deep`haven,42\n", '|', new Object[] {"Deep`haven", 42}), + // Without escape configured: an escaped quotation mark in the middle just passes through the | and the + // ` + Arguments.of("Deep|`haven,42\n", null, new Object[] {"Deep|`haven", 42}), + // With escape configured: an escaped quotation mark in the middle passes through the ` + Arguments.of("Deep|`haven,42\n", '|', new Object[] {"Deep`haven", 42}), + + // Getting a quotation mark in the middle of quoted text + + // Without escape configured: a double quotation mark in the middle is passed through as a 
single quote + Arguments.of("`Deep``haven`,42\n", null, new Object[] {"Deep`haven", 42}), + // With escape configured: a double quotation mark in the middle is still passed through (same as above) + Arguments.of("`Deep``haven`,42\n", '|', new Object[] {"Deep`haven", 42}), + // With escape configured: a single escaped quotation mark in the middle passes through the quote + Arguments.of("`Deep|`haven`,42\n", '|', new Object[] {"Deep`haven", 42})); + } + + @ParameterizedTest + @MethodSource("provideTuplesForQuoteTest") + public void choiceOfQuoteTest(String input, char quote, Object[] expectedValues) throws CsvReaderException { + final Charset charset = StandardCharsets.UTF_8; + final CsvSpecs specs = CsvTestUtil.defaultCsvBuilder().hasHeaderRow(false).quote(quote).escape('|').build(); + final InputStream stream = CsvTestUtil.toInputStream(input, charset); + CsvReader.Result result = CsvReader.read(specs, stream, charset, CsvTestUtil.makeMySinkFactory()); + + Assertions.assertThat(result.numRows()).isEqualTo(1); + Assertions.assertThat(result.numCols()).isEqualTo(expectedValues.length); + + for (int i = 0; i != result.numCols(); ++i) { + final Object array = result.columns()[i].data(); + final Object element0 = Array.get(array, 0); + Assertions.assertThat(element0).isEqualTo(expectedValues[i]); + } + } + + private static Stream provideTuplesForQuoteTest() { + return Stream.of( + // `Deep|`haven`,42 comes through as Deep`haven, 42 + Arguments.of("`Deep|`haven`,42\n", '`', new Object[] {"Deep`haven", 42}), + // *Deep|*haven*,42 comes through as Deep*haven, 42 + Arguments.of("*Deep|*haven*,42\n", '*', new Object[] {"Deep*haven", 42}), + // nDeephave|nn,42 comes through as Deephave\n, 42 where n is the newline. + // This demonstrates the fanciful example that you *can* use 'n' as a quote character, but + // you should know that \n will translate to newline, not escape your quote character. 
+ Arguments.of("nDeephave|nn,42\n", 'n', new Object[] {"Deephave\n", 42}), + // nDeephavennn,42 comes through as Deephaven, 42. + // Following up to the above, this shows if you use 'n' as a quote character + // and you want it in your data, you need to double it. + Arguments.of("nDeephavennn,42\n", 'n', new Object[] {"Deephaven", 42})); + } + + @ParameterizedTest + @MethodSource("provideTuplesForErroneousUseOfEscapeTest") + public void erroneousUseOfEscape(String input, String exceptionFragment) throws CsvReaderException { + final Charset charset = StandardCharsets.UTF_8; + final CsvSpecs specs = CsvTestUtil.defaultCsvBuilder().hasHeaderRow(false).escape('|').build(); + final InputStream stream = CsvTestUtil.toInputStream(input, charset); + + Assertions.assertThatThrownBy(() -> { + CsvReader.read(specs, stream, charset, CsvTestUtil.makeMySinkFactory()); + }).hasMessageContaining(exceptionFragment); + } + + private static Stream provideTuplesForErroneousUseOfEscapeTest() { + return Stream.of( + // Last character of input cannot be escape + Arguments.of("hello|", "The escape character cannot be the last character of the input"), + // Cannot escape carriage return + Arguments.of("hello|\r", "Can't escape a carriage return or newline"), + // Cannot escape newline + Arguments.of("hello|\n", "Can't escape a carriage return or newline"), + // Cannot escape non-ASCII + Arguments.of("hello|❤", "Can't escape a non-ASCII character")); + } +}