diff --git a/roboflow/util/folderparser.py b/roboflow/util/folderparser.py
index 047cdaee..80f64db4 100644
--- a/roboflow/util/folderparser.py
+++ b/roboflow/util/folderparser.py
@@ -1,3 +1,4 @@
+import csv
 import json
 import os
 import re
@@ -324,25 +325,45 @@ def _read_jsonl(path):
 
 
 def _parseAnnotationCSV(filename):
-    # TODO: use a proper CSV library?
+    """Parse an annotation CSV file into the structure used by the folder parser.
+
+    ``_classes.csv`` files are treated as multi-label classification tables and return
+    ``{"type": "multilabel_csv", "rows": [...], "headers": [...]}``. Any other CSV returns
+    ``{"headers": <raw header line>, "lines": [{"file_name": ..., "line": <raw line>}]}``.
+    An empty file returns ``{"headers": "", "lines": []}``.
+    """
+    # Open in text mode so line endings are normalized to \n (matches legacy behaviour).
     with open(filename) as f:
-        lines = f.readlines()
-    headers = [h.strip() for h in lines[0].split(",")]
+        raw_lines = f.readlines()
+
+    # Parse every physical line on its own so rows[i] always describes raw_lines[i].
+    # A single csv.reader over the whole file lets an unbalanced quote swallow the lines
+    # that follow it, after which zip() below would pair file names with the wrong text.
+    rows = [next(csv.reader([raw_line]), []) for raw_line in raw_lines]
+    if not rows:
+        return {"headers": "", "lines": []}
+
+    headers = [h.strip() for h in rows[0]]
+
     # Multi-label classification csv typically named _classes.csv
     if os.path.basename(filename) == "_classes.csv":
         parsed_lines = []
-        for line in lines[1:]:
-            parts = [p.strip() for p in line.split(",")]
+        for parts in rows[1:]:
+            parts = [p.strip() for p in parts]
+            if not parts:
+                continue
             file_name = parts[0]
             labels = [headers[i] for i, v in enumerate(parts[1:], start=1) if v == "1"]
             parsed_lines.append({"file_name": file_name, "labels": labels})
         return {"type": "multilabel_csv", "rows": parsed_lines, "headers": headers}
-    header_line = lines[0]
-    lines = [{"file_name": ld.split(",")[0].strip(), "line": ld} for ld in lines[1:]]
-    return {
-        "headers": header_line,
-        "lines": lines,
-    }
+
+    # For regular CSV, preserve raw lines so callers can reconstruct verbatim CSV text
+    # for upload, but use csv.reader to correctly extract file_name (handles quoted commas).
+    header_line = raw_lines[0]
+    lines = [
+        {"file_name": row[0].strip() if row else "", "line": raw_line} for raw_line, row in zip(raw_lines[1:], rows[1:])
+    ]
+    return {"headers": header_line, "lines": lines}
 
 
 def _guessAnnotationFileFormat(parsed, extension):
diff --git a/tests/util/test_folderparser.py b/tests/util/test_folderparser.py
index 4f9ddb5b..8a67fc86 100644
--- a/tests/util/test_folderparser.py
+++ b/tests/util/test_folderparser.py
@@ -205,6 +205,42 @@ def test_coco_root_annotation_matches_images_in_subdirs(self):
         self.assertEqual(len(ann_data["annotations"]), 1, "Should have one annotation")
         self.assertEqual(ann_data["annotations"][0]["bbox"], [10, 20, 100, 200])
 
+    def test_parse_csv_quoted_filename(self):
+        """_parseAnnotationCSV must handle filenames containing commas (RFC 4180 quoting)."""
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False, newline="") as f:
+            f.write("img_fName,class_label\n")
+            f.write('"image,with,commas.jpg",cat\n')
+            f.write("normal.jpg,dog\n")
+            tmppath = f.name
+        try:
+            parsed = folderparser._parseAnnotationCSV(tmppath)
+            names = [ld["file_name"] for ld in parsed["lines"]]
+            self.assertEqual(names[0], "image,with,commas.jpg")
+            self.assertEqual(names[1], "normal.jpg")
+        finally:
+            os.unlink(tmppath)
+
+    def test_parse_multilabel_csv_quoted_filename(self):
+        """_parseAnnotationCSV must handle quoted filenames in _classes.csv format."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            classes_csv = os.path.join(tmpdir, "_classes.csv")
+            with open(classes_csv, "w") as f:
+                f.write("filename,cat,dog\n")
+                f.write('"image,with,commas.jpg",1,0\n')
+                f.write("normal.jpg,0,1\n")
+            parsed = folderparser._parseAnnotationCSV(classes_csv)
+            self.assertEqual(parsed["type"], "multilabel_csv")
+            rows = {r["file_name"]: r["labels"] for r in parsed["rows"]}
+            self.assertEqual(rows["image,with,commas.jpg"], ["cat"])
+            self.assertEqual(rows["normal.jpg"], ["dog"])
+
+    def test_parse_csv_unbalanced_quote_keeps_lines_aligned(self):
+        """A stray quote must not make later rows drop out or pair with the wrong raw line."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            csv_path = os.path.join(tmpdir, "annotations.csv")
+            with open(csv_path, "w") as f:
+                f.write("img_fName,class_label\n")
+                f.write('"broken.jpg,cat\n')
+                f.write("normal.jpg,dog\n")
+            parsed = folderparser._parseAnnotationCSV(csv_path)
+            self.assertEqual(len(parsed["lines"]), 2)
+            self.assertEqual(parsed["lines"][1]["file_name"], "normal.jpg")
+            self.assertEqual(parsed["lines"][1]["line"], "normal.jpg,dog\n")
+
 
 def _assertJsonMatchesFile(actual, filename):
     with open(filename) as file: