Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 38 additions & 2 deletions src/main/java/io/deephaven/csv/CsvSpecs.java
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ default Builder putParserForIndex(int index, Parser<?> parser) {
* 7-bit ASCII. The default is '{@value #defaultQuote}' For example:
*
* <pre>
* 123,"hello, there",456,
* 123,"hello, there",456
* </pre>
*
* Would be read as the three fields:
Expand All @@ -337,6 +337,29 @@ default Builder putParserForIndex(int index, Parser<?> parser) {
*/
Builder quote(char quote);

/**
 * The escape character. It is used when you want a metacharacter such as the field delimiter or the quote
 * character to be interpreted as literal text, or when you want to add one of the Java-style escape sequences
 * \b, \t, \n, \r, or \f. Typically set to the backslash character ('\'). Must be 7-bit ASCII. Java octal escapes
 * (the escape character followed by octal digits) and Java Unicode escapes (the escape character followed by 'u'
 * and four hex digits) are not decoded. A raw carriage return, a raw newline, or a non-ASCII character cannot be
 * escaped, and the escape character cannot be the last character of the input; each of these is reported as an
 * error. The default is null, interpreted as unset. For example, with the escape character set to '\':
 *
 * <pre>
 * 123,hello\, there\n,456
 * </pre>
 *
 * Would be read as the three fields:
 *
 * <ul>
 * <li>123
 * <li>hello, there\n (where \n is the newline character)
 * <li>456
 * </ul>
 *
 * @param escape The escape character, or null to leave escape processing unset.
 * @return self after modifying the escape property.
 */
Builder escape(Character escape);

/**
* Whether to trim leading and trailing blanks from non-quoted values. The default is {@code true}.
*
Expand Down Expand Up @@ -399,6 +422,9 @@ void check() {
final List<String> problems = new ArrayList<>();
check7BitAscii("quote", quote(), problems);
check7BitAscii("delimiter", delimiter(), problems);
if (escape() != null) {
check7BitAscii("escape", escape(), problems);
}
checkNonnegative("skipRows", skipRows(), problems);
checkNonnegative("skipHeaderRows", skipHeaderRows(), problems);
checkNonnegative("numRows", numRows(), problems);
Expand Down Expand Up @@ -699,7 +725,6 @@ public char delimiter() {
return defaultDelimiter;
}


private static final char defaultQuote = '"';

/**
Expand All @@ -712,6 +737,17 @@ public char quote() {
return defaultQuote;
}

/**
 * See {@link Builder#escape}. This is the value used when the caller does not specify an escape character: null,
 * which is interpreted as "unset".
 *
 * @return The caller-specified escape character, or null if none.
 */
@Default
@Nullable
public Character escape() {
    return null;
}

/**
* See {@link Builder#ignoreSurroundingSpaces}.
*
Expand Down
8 changes: 6 additions & 2 deletions src/main/java/io/deephaven/csv/reading/CsvReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -116,11 +116,15 @@ private static boolean needsUtf8Encoding(final Charset charset) {
private static Result delimitedReadLogic(
final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory)
throws CsvReaderException {
// These two have already been validated by CsvSpecs to be 7-bit ASCII.
final byte IllegalUtf8 = (byte) 0xff;

// These three have already been validated by CsvSpecs to be 7-bit ASCII.
final byte quoteAsByte = (byte) specs.quote();
final byte delimiterAsByte = (byte) specs.delimiter();
final byte escapeCharAsByte = specs.escape() == null ? IllegalUtf8 : (byte) specs.escape().charValue();
final CellGrabber grabber =
new DelimitedCellGrabber(stream, quoteAsByte, delimiterAsByte, specs.ignoreSurroundingSpaces(),
new DelimitedCellGrabber(stream, quoteAsByte, escapeCharAsByte, delimiterAsByte,
specs.ignoreSurroundingSpaces(),
specs.trim());
// For an "out" parameter
final MutableObject<byte[][]> firstDataRowHolder = new MutableObject<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ public final class DelimitedCellGrabber implements CellGrabber {
private final InputStream inputStream;
/** The configured CSV quote character (typically '"'). Must be 7-bit ASCII. */
private final byte quoteChar;
/**
* The configured CSV escape character. Must be 7-bit ASCII. If configured to null in CsvSpecs, we set it to the
* illegal UTF-8 byte 0xff so it has no effect.
*/
private final byte escapeChar;
/** The configured CSV field delimiter (typically ','). Must be 7-bit ASCII. */
private final byte fieldDelimiter;
/** Whether to trim leading and trailing blanks from non-quoted values. */
Expand All @@ -40,7 +45,8 @@ public final class DelimitedCellGrabber implements CellGrabber {
* buffer[] array. But we can't do that when the input cell spans more than one buffer[] chunk, or when the input
* cell does not exactly represent the output. This latter case can happen for example when an escaped quote ("")
* needs to be returned as a single quotation mark ("). So if our input is hello""there, then we can't directly
* return a slice of the input array, because actually we need hello"there (one quotation mark, not two).
* return a slice of the input array, because actually we need hello"there (one quotation mark, not two). Another
* case where this can happen is when the escape character is enabled and we encounter an escape like \, or \n.
*/
private final GrowableByteBuffer spillBuffer;
/**
Expand All @@ -56,18 +62,22 @@ public final class DelimitedCellGrabber implements CellGrabber {
*
* @param inputStream The input, represented as UTF-8 bytes.
* @param quoteChar The configured quote char. Typically "
* @param escapeChar The configured escape char. Pass the illegal UTF-8 byte 0xff (our representation of 'none') to
* disable the feature; when the feature is desired, it is typically set to \
* @param fieldDelimiter The configured field delimiter. Typically ,
* @param ignoreSurroundingSpaces Whether to ignore surrounding spaces
* @param trim Whether to trim spaces inside quoted values.
*/
public DelimitedCellGrabber(
final InputStream inputStream,
final byte quoteChar,
final byte escapeChar,
final byte fieldDelimiter,
final boolean ignoreSurroundingSpaces,
final boolean trim) {
this.inputStream = inputStream;
this.quoteChar = quoteChar;
this.escapeChar = escapeChar;
this.fieldDelimiter = fieldDelimiter;
this.ignoreSurroundingSpaces = ignoreSurroundingSpaces;
this.trim = trim;
Expand Down Expand Up @@ -131,10 +141,16 @@ private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastIn
}
prevCharWasCarriageReturn = false;
}
if (ch != quoteChar) {
if (ch != quoteChar && ch != escapeChar) {
// Ordinary character. Note: in quoted mode we will gladly eat field and line separators.
continue;
}

if (ch == escapeChar) {
processEscapeChar();
continue;
}

// This character is a quote char. It could be the end of the cell, or it could be an escaped
// quote char (e.g. ""). The way to tell is to peek ahead at the next character.
if (!tryEnsureMore()) {
Expand Down Expand Up @@ -264,10 +280,74 @@ private void finishField(final ByteSlice dest, final MutableBoolean lastInRow,
++physicalRowNum;
return;
}
if (ch == escapeChar) {
++offset;
processEscapeChar();
continue;
}

++offset;
}
}

/**
 * Handles a single escape sequence: the escape character that was just consumed plus the one byte that follows
 * it. The pair is collapsed into a single output byte in the spill buffer, and {@code startOffset} is moved past
 * the escaped byte so that it is not copied a second time.
 *
 * <p>
 * This method relies on {@code offset} already pointing just past the escape character when it is called (the
 * unquoted-mode caller increments {@code offset} immediately before the call).
 *
 * @throws CsvReaderException if the input ends right after the escape character, or if the escaped byte is
 *         rejected by {@code transformEscapedChar} (non-ASCII, carriage return, or newline).
 */
private void processEscapeChar() throws CsvReaderException {
    // Spill data up to and including the escape character into the spill buffer.
    // Below, we will replace the escape character with the transformed escaped character.
    spillRange();

    // This character is an escape character. In practice, it is used either to make the next
    // metacharacter like the quote or field separator normal, or to provide a C-style special character like
    // newline or tab.
    // However, it can't appear as the last character of the input.
    if (!tryEnsureMore()) {
        throw new CsvReaderException("The escape character cannot be the last character of the input");
    }

    // Consume the next char (the escaped character). Potentially transform it if it is one of the C escapes:
    // characters b, t, n etc
    final byte nextChar = buffer[offset++];
    final byte nextCharTransformed = transformEscapedChar(nextChar);

    // Replace the placeholder character (the escape character, which is the last byte spilled above) with the
    // transformed character
    spillBuffer.data()[spillBuffer.size() - 1] = nextCharTransformed;

    // Advance the spill buffer's notion of "next start position" so it skips the escaped character.
    startOffset = offset;
}


/**
 * Translates the byte that follows the escape character into the byte it denotes. Only the single-letter Java
 * character escapes are interpreted; Java octal escapes and Java Unicode escapes are not.
 *
 * @param nextChar The byte immediately following the escape character.
 * @return The control character (\b, \t, \n, \r, \f) when {@code nextChar} is one of (b, t, n, r, f); otherwise
 *         {@code nextChar} itself, unchanged.
 * @throws CsvReaderException if {@code nextChar} is a non-ASCII byte, a carriage return, or a newline.
 */
private static byte transformEscapedChar(byte nextChar) throws CsvReaderException {
    // Bytes with the high bit set are negative in Java; they belong to multi-byte UTF-8 sequences.
    if (nextChar < 0) {
        throw new CsvReaderException("Can't escape a non-ASCII character");
    }
    final boolean isLineTerminator = nextChar == '\n' || nextChar == '\r';
    if (isLineTerminator) {
        throw new CsvReaderException("Can't escape a carriage return or newline");
    }

    // Parallel tables: the letter at position i stands for the control character at position i.
    final String escapeLetters = "btnrf";
    final String controlChars = "\b\t\n\r\f";
    final int position = escapeLetters.indexOf(nextChar);
    if (position < 0) {
        // Not a recognized escape letter: the escaped byte stands for itself.
        return nextChar;
    }
    return (byte) controlChars.charAt(position);
}

/** @return true if there are more characters. */
private boolean tryEnsureMore() throws CsvReaderException {
if (offset != size) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ public class FixedCellGrabber implements CellGrabber {
*/
public static CellGrabber makeLineGrabber(InputStream stream) {
final byte IllegalUtf8 = (byte) 0xff;
return new DelimitedCellGrabber(stream, IllegalUtf8, IllegalUtf8, true, false);
return new DelimitedCellGrabber(stream, IllegalUtf8, IllegalUtf8, IllegalUtf8, true, false);
}

private final CellGrabber lineGrabber;
Expand Down
158 changes: 158 additions & 0 deletions src/test/java/io/deephaven/csv/EscapeTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
package io.deephaven.csv;

import io.deephaven.csv.parsers.Parser;
import io.deephaven.csv.reading.CsvReader;
import io.deephaven.csv.testutil.*;
import io.deephaven.csv.util.CsvReaderException;
import org.assertj.core.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import org.junit.jupiter.params.provider.ValueSource;

import java.io.InputStream;
import java.lang.reflect.Array;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.stream.Stream;

/**
 * Tests for the optional escape character of {@link CsvSpecs}: escaping metacharacters (field delimiter, quote),
 * the C-style escapes (b, t, n, r, f), the interaction between the escape character and the choice of quote
 * character, and the error cases.
 *
 * <p>
 * For Java readability the tests use unusual characters: the quote is usually ` (backtick) and the escape is
 * usually | (vertical bar).
 */
public class EscapeTest {
    /**
     * Reads a one-row input with quote set to backtick and the given escape character, and checks the parsed
     * values.
     *
     * @param input The CSV text (a single row).
     * @param escape The escape character to configure, or null for none.
     * @param expectedValues The expected value of each column, in order.
     */
    @ParameterizedTest
    @MethodSource("provideTuplesForEscapeTest")
    public void escapeTest(String input, Character escape, Object[] expectedValues) throws CsvReaderException {
        final CsvSpecs specs = CsvTestUtil.defaultCsvBuilder()
                .hasHeaderRow(false)
                .quote('`')
                .escape(escape)
                .build();
        assertSingleRow(specs, input, expectedValues);
    }

    private static Stream<Arguments> provideTuplesForEscapeTest() {
        // Note for Java readability we use unusual characters for quote and escape.
        // Namely: quote is ` (backtick) and escape is | (vertical bar).
        return Stream.of(
                // Cases that are not surrounded with quotes

                // Deep|,haven,42 with no escape configured comes through as "Deep|", "haven", 42
                Arguments.of("Deep|,haven,42\n", null, new Object[] {"Deep|", "haven", 42}),
                // Deep|,haven,42 with escape configured as | comes through as "Deep,haven", 42
                Arguments.of("Deep|,haven,42\n", '|', new Object[] {"Deep,haven", 42}),
                // Deephave|n,42 with no escape configured comes through as "Deephave|n", 42
                Arguments.of("Deephave|n,42\n", null, new Object[] {"Deephave|n", 42}),
                // Deephave|n,42 with escape configured as | comes through as "Deephave\n", with \n being newline
                Arguments.of("Deephave|n,42\n", '|', new Object[] {"Deephave\n", 42}),

                // Cases that are surrounded with quotes

                // `Deep,haven`,42 with no escape configured comes through as "Deep,haven", 42
                // because quotation marks are another way to escape the field separator.
                Arguments.of("`Deep,haven`,42\n", null, new Object[] {"Deep,haven", 42}),
                // `Deep|,haven`,42 with escape configured as | also comes through as "Deep,haven", 42
                // because the escape is processed even inside quotes.
                Arguments.of("`Deep|,haven`,42\n", '|', new Object[] {"Deep,haven", 42}),
                // `Deephave|n`,42 with escape configured as * (a character that does not occur in the input)
                // comes through as "Deephave|n", 42 because | is then just an ordinary character.
                Arguments.of("`Deephave|n`,42\n", '*', new Object[] {"Deephave|n", 42}),
                // `Deephave|n`,42 with escape configured as | comes through as "Deephave\n", with \n being newline
                Arguments.of("`Deephave|n`,42\n", '|', new Object[] {"Deephave\n", 42}),

                // C style escapes

                // Without escape configured, C-style escapes are not special
                Arguments.of("Deep|b|r|n|t|fhaven,42\n", null, new Object[] {"Deep|b|r|n|t|fhaven", 42}),
                // With escape configured, C-style escapes are special
                Arguments.of("Deep|b|r|n|t|fhaven,42\n", '|', new Object[] {"Deep\b\r\n\t\fhaven", 42}),
                // Surrounding with quotes, without escape configured, C-style escapes are not special
                Arguments.of("`Deep|b|r|n|t|fhaven`,42\n", null, new Object[] {"Deep|b|r|n|t|fhaven", 42}),
                // Surrounding with quotes, with escape configured, C-style escapes are special
                Arguments.of("`Deep|b|r|n|t|fhaven`,42\n", '|', new Object[] {"Deep\b\r\n\t\fhaven", 42}),

                // Quotation mark in the middle of unquoted text

                // Without escape configured: a quotation mark in the middle is passed through
                Arguments.of("Deep`haven,42\n", null, new Object[] {"Deep`haven", 42}),
                // With escape configured: a quotation mark in the middle is still passed through (not that
                // interesting)
                Arguments.of("Deep`haven,42\n", '|', new Object[] {"Deep`haven", 42}),
                // Without escape configured: an escaped quotation mark in the middle just passes through the |
                // and the `
                Arguments.of("Deep|`haven,42\n", null, new Object[] {"Deep|`haven", 42}),
                // With escape configured: an escaped quotation mark in the middle passes through the `
                Arguments.of("Deep|`haven,42\n", '|', new Object[] {"Deep`haven", 42}),

                // Getting a quotation mark in the middle of quoted text

                // Without escape configured: a double quotation mark in the middle is passed through as a single
                // quote
                Arguments.of("`Deep``haven`,42\n", null, new Object[] {"Deep`haven", 42}),
                // With escape configured: a double quotation mark in the middle is still passed through (same as
                // above)
                Arguments.of("`Deep``haven`,42\n", '|', new Object[] {"Deep`haven", 42}),
                // With escape configured: a single escaped quotation mark in the middle passes through the quote
                Arguments.of("`Deep|`haven`,42\n", '|', new Object[] {"Deep`haven", 42}));
    }

    /**
     * Reads a one-row input with escape set to | and the given quote character, and checks the parsed values.
     *
     * @param input The CSV text (a single row).
     * @param quote The quote character to configure.
     * @param expectedValues The expected value of each column, in order.
     */
    @ParameterizedTest
    @MethodSource("provideTuplesForQuoteTest")
    public void choiceOfQuoteTest(String input, char quote, Object[] expectedValues) throws CsvReaderException {
        final CsvSpecs specs = CsvTestUtil.defaultCsvBuilder()
                .hasHeaderRow(false)
                .quote(quote)
                .escape('|')
                .build();
        assertSingleRow(specs, input, expectedValues);
    }

    private static Stream<Arguments> provideTuplesForQuoteTest() {
        return Stream.of(
                // `Deep|`haven`,42 comes through as Deep`haven, 42
                Arguments.of("`Deep|`haven`,42\n", '`', new Object[] {"Deep`haven", 42}),
                // *Deep|*haven*,42 comes through as Deep*haven, 42
                Arguments.of("*Deep|*haven*,42\n", '*', new Object[] {"Deep*haven", 42}),
                // nDeephave|nn,42 comes through as Deephave\n, 42 where \n is the newline.
                // This demonstrates the fanciful example that you *can* use 'n' as a quote character, but
                // you should know that escape-then-n (here |n) will translate to newline, not escape your quote
                // character.
                Arguments.of("nDeephave|nn,42\n", 'n', new Object[] {"Deephave\n", 42}),
                // nDeephavennn,42 comes through as Deephaven, 42.
                // Following up to the above, this shows if you use 'n' as a quote character
                // and you want it in your data, you need to double it.
                Arguments.of("nDeephavennn,42\n", 'n', new Object[] {"Deephaven", 42}));
    }

    /**
     * Reads an invalid input with escape set to | (and the default quote) and checks that reading fails with a
     * message containing the expected fragment.
     *
     * @param input The malformed CSV text.
     * @param exceptionFragment Text that the failure message must contain.
     */
    @ParameterizedTest
    @MethodSource("provideTuplesForErroneousUseOfEscapeTest")
    public void erroneousUseOfEscape(String input, String exceptionFragment) throws CsvReaderException {
        final Charset charset = StandardCharsets.UTF_8;
        final CsvSpecs specs = CsvTestUtil.defaultCsvBuilder().hasHeaderRow(false).escape('|').build();
        final InputStream stream = CsvTestUtil.toInputStream(input, charset);

        Assertions.assertThatThrownBy(() -> {
            CsvReader.read(specs, stream, charset, CsvTestUtil.makeMySinkFactory());
        }).hasMessageContaining(exceptionFragment);
    }

    private static Stream<Arguments> provideTuplesForErroneousUseOfEscapeTest() {
        return Stream.of(
                // Last character of input cannot be escape
                Arguments.of("hello|", "The escape character cannot be the last character of the input"),
                // Cannot escape carriage return
                Arguments.of("hello|\r", "Can't escape a carriage return or newline"),
                // Cannot escape newline
                Arguments.of("hello|\n", "Can't escape a carriage return or newline"),
                // Cannot escape non-ASCII
                Arguments.of("hello|❤", "Can't escape a non-ASCII character"));
    }

    /**
     * Shared verification for the positive tests: parses {@code input} as UTF-8 with {@code specs}, then asserts
     * that exactly one row was read, that the column count matches, and that the first element of every column
     * equals the corresponding expected value.
     *
     * @param specs The fully built CSV specs to read with.
     * @param input The CSV text (a single row).
     * @param expectedValues The expected value of each column, in order.
     * @throws CsvReaderException if reading the input fails.
     */
    private static void assertSingleRow(final CsvSpecs specs, final String input, final Object[] expectedValues)
            throws CsvReaderException {
        final Charset charset = StandardCharsets.UTF_8;
        final InputStream stream = CsvTestUtil.toInputStream(input, charset);
        final CsvReader.Result result = CsvReader.read(specs, stream, charset, CsvTestUtil.makeMySinkFactory());

        Assertions.assertThat(result.numRows()).isEqualTo(1);
        Assertions.assertThat(result.numCols()).isEqualTo(expectedValues.length);

        for (int i = 0; i != result.numCols(); ++i) {
            // Column data is a typed array (element type not known here), so read element 0 reflectively.
            final Object array = result.columns()[i].data();
            final Object element0 = Array.get(array, 0);
            Assertions.assertThat(element0).isEqualTo(expectedValues[i]);
        }
    }
}