From c0e8db667e0bec8e39c7153ba06aff0f4e100550 Mon Sep 17 00:00:00 2001
From: digi-scrypt
Date: Fri, 5 Jun 2026 20:37:48 +0530
Subject: [PATCH] fix surrogate pair byte counting in array read

The char[] overload of getEncodedCharLength looped while "i < length"
instead of "i < offset + length", so a read with a non-zero offset
measured the wrong slice of the buffer. It also measured every char
against the lastChar field, which is only updated after the whole
array read, so a low surrogate inside the buffer was never paired
with the high surrogate directly in front of it.

Changes:

- getEncodedCharLength(int) now takes the previous char as an explicit
  second argument instead of reading the lastChar field.
- The char[] overload seeds "previous" from lastChar and advances it
  for each element, and iterates over [offset, offset + length).
- read() passes lastChar as the previous char.
- read(char[], int, int) now counts bytes inside the branch that
  handles a successful read, before lastChar is overwritten with the
  last char of the buffer, and no longer calls the helper with the
  EOF return value as a length.
- Adds a CSVParserTest case with a delimiter that contains a
  supplementary (surrogate pair) character and checks the byte
  position of the second record.

---
 .../commons/csv/ExtendedBufferedReader.java   | 18 +++++++++-------
 .../org/apache/commons/csv/CSVParserTest.java | 21 +++++++++++++++++++
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
index 889b58edc..1a5910727 100644
--- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
+++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
@@ -109,8 +109,10 @@ long getBytesRead() {
 
     private long getEncodedCharLength(final char[] buf, final int offset, final int length) throws CharacterCodingException {
         int len = 0;
-        for (int i = offset; i < length; i++) {
-            len += getEncodedCharLength(buf[i]);
+        int previous = lastChar;
+        for (int i = offset; i < offset + length; i++) {
+            len += getEncodedCharLength(buf[i], previous);
+            previous = buf[i];
         }
         return len;
     }
@@ -140,9 +142,9 @@ private long getEncodedCharLength(final char[] buf, final int offset, final int
      * @return the byte length of the character.
      * @throws CharacterCodingException if the character cannot be encoded.
      */
-    private int getEncodedCharLength(final int current) throws CharacterCodingException {
+    private int getEncodedCharLength(final int current, final int previous) throws CharacterCodingException {
         final char cChar = (char) current;
-        final char lChar = (char) lastChar;
+        final char lChar = (char) previous;
         if (!Character.isSurrogate(cChar)) {
             return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit();
         }
@@ -205,7 +207,7 @@ public int read() throws IOException {
             lineNumber++;
         }
         if (encoder != null) {
-            this.bytesRead += getEncodedCharLength(current);
+            this.bytesRead += getEncodedCharLength(current, lastChar);
         }
         lastChar = current;
         position++;
@@ -229,13 +231,13 @@ public int read(final char[] buf, final int offset, final int length) throws IOE
                     lineNumber++;
                 }
             }
+            if (encoder != null) {
+                this.bytesRead += getEncodedCharLength(buf, offset, len);
+            }
             lastChar = buf[offset + len - 1];
         } else if (len == EOF) {
             lastChar = EOF;
         }
-        if (encoder != null) {
-            this.bytesRead += getEncodedCharLength(buf, offset, len);
-        }
         position += len;
         return len;
     }
diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java
index 8b1527c42..bb0232db8 100644
--- a/src/test/java/org/apache/commons/csv/CSVParserTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java
@@ -666,6 +666,27 @@ void testGetBytePositionMultiCharacterDelimiter() throws IOException {
         }
     }
 
+    @Test
+    void testGetBytePositionMultiCharacterDelimiterWithSupplementaryChar() throws IOException {
+        // Delimiter holds a 4-byte (surrogate pair) character; the delimiter tail is consumed through
+        // the char[] read path, where the surrogate halves must be paired with the correct neighbor.
+        final String code = "aa[😀]bb\ncc[😀]dd\n";
+        final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[😀]").get();
+        try (CSVParser parser = CSVParser.builder()
+                .setReader(new StringReader(code))
+                .setFormat(format)
+                .setCharset(StandardCharsets.UTF_8)
+                .setTrackBytes(true)
+                .get()) {
+            final Iterator<CSVRecord> it = parser.iterator();
+            final CSVRecord first = it.next();
+            final CSVRecord second = it.next();
+            assertEquals(0, first.getBytePosition());
+            // "aa[😀]bb\n" -> 2 + 1 + 4 + 1 + 2 + 1 = 11 bytes in UTF-8
+            assertEquals(11, second.getBytePosition());
+        }
+    }
+
     @Test
     void testGetBytePositionWithCharacterOffsetAndMultiBytePrefix() throws Exception {
         final String row0 = "é,x\n";