From a405d487441bf65bf2adc5618f12fd19529b0003 Mon Sep 17 00:00:00 2001 From: Corey Kosak Date: Wed, 14 Jan 2026 19:41:37 -0500 Subject: [PATCH] fix: DH-21413: fail if there's non-whitespace material after a cell's closing quote --- .../reading/cells/DelimitedCellGrabber.java | 32 ++++++++++++------- .../java/io/deephaven/csv/CsvReaderTest.java | 31 ++++++++++++------ 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/src/main/java/io/deephaven/csv/reading/cells/DelimitedCellGrabber.java b/src/main/java/io/deephaven/csv/reading/cells/DelimitedCellGrabber.java index 5c3145f9..d235f2da 100644 --- a/src/main/java/io/deephaven/csv/reading/cells/DelimitedCellGrabber.java +++ b/src/main/java/io/deephaven/csv/reading/cells/DelimitedCellGrabber.java @@ -155,25 +155,35 @@ private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastIn ++offset; startOffset = offset; } - // We got out of the quoted string. Consume any trailing matter after the quote and before the - // field - // delimiter. Hopefully that trailing matter is just whitespace, but we shall see. - finishField(dest, lastInRow, endOfInput); - - // From this point on, note that dest is a slice that may point to the underlying input buffer - // or the spill buffer. Take care from this point on to not disturb the input (e.g. by reading - // the next chunk) or the spill buffer. // The easiest way to make all the above logic run smoothly is to let the final quotation mark - // (which will unconditionally be there) and subsequent whitespace (if any) into the field. - // Then we can simply trim it back out now. + // (which will unconditionally be there) and subsequent matter (if any) into the field. + // Then we can simply trim it back out, making sure that what we are trimming is only whitespace. + // After trimming, we will see if the expected number of chars matches the actual number of chars. + // The -1 here is because the number of characters processed includes the closing quote already. + final int expectedSize = spillBuffer.size() + offset - startOffset - 1; + finishField(dest, lastInRow, endOfInput); + + // Trim away any trailing whitespace while (dest.begin() != dest.end() && RangeTests.isSpaceOrTab(dest.back())) { dest.setEnd(dest.end() - 1); } + + final String exceptionMessage = "Logic error: final non-whitespace in field is not quoteChar"; + + // Trim away the final quote char if (dest.begin() == dest.end() || dest.back() != quoteChar) { - throw new RuntimeException("Logic error: final non-whitespace in field is not quoteChar"); + throw new RuntimeException(exceptionMessage); } dest.setEnd(dest.end() - 1); + + // Ensure we have the expected number of chars. The above logic can get misled if there are multiple + // closing quotes, as in the input "hello there"junk". + // The quote at the end of 'there' is the real closing quote; the remainder of the text is trash and should + // be rejected. + if (dest.size() != expectedSize) { + throw new RuntimeException(exceptionMessage); + } } /** diff --git a/src/test/java/io/deephaven/csv/CsvReaderTest.java b/src/test/java/io/deephaven/csv/CsvReaderTest.java index c1941cbf..a66af5c0 100644 --- a/src/test/java/io/deephaven/csv/CsvReaderTest.java +++ b/src/test/java/io/deephaven/csv/CsvReaderTest.java @@ -20,6 +20,7 @@ import java.io.InputStream; import java.math.BigDecimal; +import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.time.Duration; import java.time.Instant; @@ -1064,9 +1065,13 @@ public void quotingSuccessfulEdgeCases() throws CsvReaderException { // # + "####\n" // ## - + "######\n"; + + "######\n" + // hello + + "#hello#\n" + // hello + + "#hello#\t\t \t\t\n"; - final ColumnSet expected = ColumnSet.of(Column.ofRefs("Values", null, "#", "##")); + final ColumnSet expected = ColumnSet.of(Column.ofRefs("Values", null, "#", "##", "hello", "hello")); CsvTestUtil.invokeTests(CsvTestUtil.defaultCsvBuilder().quote('#').build(), input, expected); } @@ -1081,14 +1086,22 @@ public void quotingFailingEdgeCases() { .hasRootCauseMessage("Cell did not have closing quote character"); } - @Test - public void quotingExcessMaterial() { - final String input = "" + "Val1,Val2\n" + "#hello#junk,there\n"; // invalid - + @ParameterizedTest + @ValueSource(strings = { + // trailing matter after closing quote (here the quote is the hashmark) + "#hello#junk,there\n", + // trailing matter after closing quote, but there's another quote at the end. still an error + "#hello#junk#,there\n" + }) + public void quotingExcessNonwhitespaceMaterial(String input) { Assertions.assertThatThrownBy( - () -> CsvTestUtil.invokeTests(CsvTestUtil.defaultCsvBuilder().quote('#').build(), input, - ColumnSet.NONE)) - .hasRootCauseMessage("Logic error: final non-whitespace in field is not quoteChar"); + () -> { + final CsvSpecs specs = CsvTestUtil.defaultCsvBuilder().hasHeaderRow(false).quote('#').build(); + final Charset charset = StandardCharsets.UTF_8; + final InputStream stream = CsvTestUtil.toInputStream(input, charset); + CsvReader.read(specs, stream, charset, CsvTestUtil.makeMySinkFactory()); + }) + .hasMessage("Logic error: final non-whitespace in field is not quoteChar"); } @Test