Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,10 @@ long getBytesRead() {

private long getEncodedCharLength(final char[] buf, final int offset, final int length) throws CharacterCodingException {
int len = 0;
for (int i = offset; i < length; i++) {
len += getEncodedCharLength(buf[i]);
int previous = lastChar;
for (int i = offset; i < offset + length; i++) {
len += getEncodedCharLength(buf[i], previous);
previous = buf[i];
}
return len;
}
Expand Down Expand Up @@ -140,9 +142,9 @@ private long getEncodedCharLength(final char[] buf, final int offset, final int
* @return the byte length of the character.
* @throws CharacterCodingException if the character cannot be encoded.
*/
private int getEncodedCharLength(final int current) throws CharacterCodingException {
private int getEncodedCharLength(final int current, final int previous) throws CharacterCodingException {
final char cChar = (char) current;
final char lChar = (char) lastChar;
final char lChar = (char) previous;
if (!Character.isSurrogate(cChar)) {
return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit();
}
Expand Down Expand Up @@ -205,7 +207,7 @@ public int read() throws IOException {
lineNumber++;
}
if (encoder != null) {
this.bytesRead += getEncodedCharLength(current);
this.bytesRead += getEncodedCharLength(current, lastChar);
}
lastChar = current;
position++;
Expand All @@ -229,13 +231,13 @@ public int read(final char[] buf, final int offset, final int length) throws IOE
lineNumber++;
}
}
if (encoder != null) {
this.bytesRead += getEncodedCharLength(buf, offset, len);
}
lastChar = buf[offset + len - 1];
} else if (len == EOF) {
lastChar = EOF;
}
if (encoder != null) {
this.bytesRead += getEncodedCharLength(buf, offset, len);
}
position += len;
return len;
}
Expand Down
21 changes: 21 additions & 0 deletions src/test/java/org/apache/commons/csv/CSVParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,27 @@ void testGetBytePositionMultiCharacterDelimiter() throws IOException {
}
}

/**
 * Checks record byte positions when a multi-character delimiter contains a supplementary character.
 *
 * @throws IOException if parsing the input fails.
 */
@Test
void testGetBytePositionMultiCharacterDelimiterWithSupplementaryChar() throws IOException {
    // The delimiter embeds a surrogate-pair character that takes 4 bytes in UTF-8. Its tail is consumed through
    // the char[] read path, so each surrogate half has to be matched with the right neighboring char.
    final String delimiter = "[😀]";
    final String input = "aa[😀]bb\ncc[😀]dd\n";
    final CSVFormat csvFormat = CSVFormat.DEFAULT.builder().setDelimiter(delimiter).get();
    try (CSVParser csvParser = CSVParser.builder()
            .setReader(new StringReader(input))
            .setFormat(csvFormat)
            .setCharset(StandardCharsets.UTF_8)
            .setTrackBytes(true)
            .get()) {
        final Iterator<CSVRecord> records = csvParser.iterator();
        final CSVRecord firstRecord = records.next();
        final CSVRecord secondRecord = records.next();
        // The first record starts at the very beginning of the input.
        assertEquals(0, firstRecord.getBytePosition());
        // First line in UTF-8: "aa" (2) + "[" (1) + emoji (4) + "]" (1) + "bb" (2) + "\n" (1) = 11 bytes.
        assertEquals(11, secondRecord.getBytePosition());
    }
}

@Test
void testGetBytePositionWithCharacterOffsetAndMultiBytePrefix() throws Exception {
final String row0 = "é,x\n";
Expand Down