From 7b4c635dd97300c11855db56a44856d1a90a5ff6 Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Fri, 5 Jun 2026 15:28:17 +0530 Subject: [PATCH 01/12] gsoc26: Layer 2 with tests initial commit --- databusclient/api/convert.py | 383 ++++++++++++++++++++++++++++++++++ databusclient/api/download.py | 77 +++++++ databusclient/cli.py | 11 + run_all_conversion_tests.py | 338 ++++++++++++++++++++++++++++++ tests/test_conversion.py | 309 +++++++++++++++++++++++++++ 5 files changed, 1118 insertions(+) create mode 100644 databusclient/api/convert.py create mode 100644 run_all_conversion_tests.py create mode 100644 tests/test_conversion.py diff --git a/databusclient/api/convert.py b/databusclient/api/convert.py new file mode 100644 index 0000000..8d28fd8 --- /dev/null +++ b/databusclient/api/convert.py @@ -0,0 +1,383 @@ +"""Format and Mapping Conversion Layer. + +Layer 2: Within-class format conversion (lossless). +Layer 3: Cross-class mapping conversion (quasi-equal for RDF <-> Tabular). +""" + +import csv +import json +import os +from typing import Optional + +from rdflib import Dataset, Graph + + +# --------------------------------------------------------------------------- +# Format registries +# --------------------------------------------------------------------------- + +# Maps CLI format name -> rdflib format string +RDF_TRIPLE_FORMATS = { + "ntriples": "ntriples", + "turtle": "turtle", + "rdf-xml": "xml", +} + +RDF_QUAD_FORMATS = { + "nquads": "nquads", + "trig": "trig", + "trix": "trix", + "json-ld": "json-ld", +} + +TABULAR_FORMATS = { + "csv": ",", + "tsv": "\t", +} + +ALL_FORMATS = ( + list(RDF_TRIPLE_FORMATS) + + list(RDF_QUAD_FORMATS) + + list(TABULAR_FORMATS) +) + +# Maps file extension -> CLI format name +EXTENSION_TO_FORMAT = { + ".ttl": "turtle", + ".nt": "ntriples", + ".rdf": "rdf-xml", + ".xml": "rdf-xml", + ".owl": "rdf-xml", + ".nq": "nquads", + ".trig": "trig", + ".trix": "trix", + ".jsonld": "json-ld", + ".json": "json-ld", + ".csv": "csv", + ".tsv": "tsv", +} + + +# --------------------------------------------------------------------------- +# Format detection +# --------------------------------------------------------------------------- + +def detect_format_from_filename(filename: str) -> Optional[str]: + """Detect format from file extension, ignoring compression extensions. + + Args: + filename: File name or path. + + Returns: + Format name string or None if not detectable. + """ + name = filename.lower() + + # strip compression extension first + for ext in (".bz2", ".gz", ".xz"): + if name.endswith(ext): + name = name[: -len(ext)] + break + + # match longest extension first to avoid .json matching before .jsonld + for ext in sorted(EXTENSION_TO_FORMAT.keys(), key=len, reverse=True): + if name.endswith(ext): + return EXTENSION_TO_FORMAT[ext] + + return None + + +def get_format_class(fmt: str) -> str: + """Return equivalence class for a format name. + + Args: + fmt: Format name (e.g. 'turtle', 'nquads', 'csv'). + + Returns: + 'triples', 'quads', or 'tabular'. + + Raises: + ValueError: If format is not recognised. + """ + if fmt in RDF_TRIPLE_FORMATS: + return "triples" + if fmt in RDF_QUAD_FORMATS: + return "quads" + if fmt in TABULAR_FORMATS: + return "tabular" + raise ValueError( + f"Unknown format: '{fmt}'. Supported formats: {ALL_FORMATS}" + ) + + +# --------------------------------------------------------------------------- +# Output filename helper +# --------------------------------------------------------------------------- + +# Maps format name -> file extension +FORMAT_TO_EXTENSION = { + "ntriples": ".nt", + "turtle": ".ttl", + "rdf-xml": ".rdf", + "nquads": ".nq", + "trig": ".trig", + "trix": ".trix", + "json-ld": ".jsonld", + "csv": ".csv", + "tsv": ".tsv", +} + + +def get_converted_filename(original_filename: str, convert_format: str) -> str: + """Generate output filename after format conversion. + + Strips compression extension if present, then replaces the format + extension with the target format extension. + + Args: + original_filename: Original file name (basename only, not full path). + convert_format: Target format name. + + Returns: + New filename with updated extension. + """ + name = original_filename + + # strip compression extension + for ext in (".bz2", ".gz", ".xz"): + if name.lower().endswith(ext): + name = name[: -len(ext)] + break + + # strip existing format extension + for old_ext in sorted(FORMAT_TO_EXTENSION.values(), key=len, reverse=True): + if name.lower().endswith(old_ext): + name = name[: -len(old_ext)] + break + + target_ext = FORMAT_TO_EXTENSION.get(convert_format, f".{convert_format}") + return name + target_ext + + +# --------------------------------------------------------------------------- +# Layer 2 — within-class format conversion +# --------------------------------------------------------------------------- + +def convert_rdf_triple_format( + input_file: str, + output_file: str, + input_format: str, + output_format: str, +) -> None: + """Convert between RDF triple serialization formats (Layer 2). + + Handles: ntriples, turtle, rdf-xml. + Uses rdflib Graph as internal representation. + + Args: + input_file: Path to input file. + output_file: Path to write converted output. + input_format: Source format name (must be in RDF_TRIPLE_FORMATS). + output_format: Target format name (must be in RDF_TRIPLE_FORMATS). + """ + g = Graph() + g.parse(input_file, format=RDF_TRIPLE_FORMATS[input_format]) + g.serialize(destination=output_file, format=RDF_TRIPLE_FORMATS[output_format]) + print( + f"Converted {input_format} -> {output_format}: {os.path.basename(output_file)}" + ) + + +def convert_rdf_quad_format( + input_file: str, + output_file: str, + input_format: str, + output_format: str, +) -> None: + """Convert between RDF quad serialization formats (Layer 2). + + Handles: nquads, trig, trix, json-ld. + Uses rdflib Dataset as internal representation + to preserve named graph information. + + Args: + input_file: Path to input file. + output_file: Path to write converted output. + input_format: Source format name (must be in RDF_QUAD_FORMATS). + output_format: Target format name (must be in RDF_QUAD_FORMATS). + """ + g = Dataset() + g.parse(input_file, format=RDF_QUAD_FORMATS[input_format]) + g.serialize(destination=output_file, format=RDF_QUAD_FORMATS[output_format]) + print( + f"Converted {input_format} -> {output_format}: {os.path.basename(output_file)}" + ) + + +def convert_tabular_format( + input_file: str, + output_file: str, + input_format: str, + output_format: str, +) -> None: + """Convert between tabular formats (Layer 2). + + Handles: csv <-> tsv. + Uses Python built-in csv module. + + Args: + input_file: Path to input file. + output_file: Path to write converted output. + input_format: Source format name ('csv' or 'tsv'). + output_format: Target format name ('csv' or 'tsv'). + """ + input_delimiter = TABULAR_FORMATS[input_format] + output_delimiter = TABULAR_FORMATS[output_format] + + with open(input_file, "r", newline="", encoding="utf-8") as infile: + reader = csv.reader(infile, delimiter=input_delimiter) + rows = list(reader) + + with open(output_file, "w", newline="", encoding="utf-8") as outfile: + writer = csv.writer(outfile, delimiter=output_delimiter) + writer.writerows(rows) + + print( + f"Converted {input_format} -> {output_format}: {os.path.basename(output_file)}" + ) + + +# --------------------------------------------------------------------------- +# Layer 3 — cross-class mapping conversion +# --------------------------------------------------------------------------- + +def convert_rdf_to_csv( + input_file: str, + output_file: str, + input_format: str, +) -> None: + """Map RDF triples to a wide CSV table (Layer 3). + + Each unique subject becomes a row. Each unique predicate becomes a column. + Multi-valued predicates are pipe-separated. + A companion .meta.json file is generated alongside the CSV to preserve + RDF datatype and language tag information for lossless round trips. + + Args: + input_file: Path to input RDF triples file. + output_file: Path to write output CSV file. + input_format: Source triple format name (must be in RDF_TRIPLE_FORMATS). + """ + g = Graph() + g.parse(input_file, format=RDF_TRIPLE_FORMATS[input_format]) + + predicates = sorted(set(str(p) for s, p, o in g)) + + subjects: dict = {} + column_metadata: dict = {} + + for s, p, o in g: + subj = str(s) + pred = str(p) + + # capture datatype or language tag for companion file + if hasattr(o, "datatype") and o.datatype: + column_metadata[pred] = {"datatype": str(o.datatype)} + elif hasattr(o, "language") and o.language: + column_metadata[pred] = {"language": str(o.language)} + + if subj not in subjects: + subjects[subj] = {} + if pred not in subjects[subj]: + subjects[subj][pred] = [] + subjects[subj][pred].append(str(o)) + + with open(output_file, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(["resource"] + predicates) + for subj, pred_map in subjects.items(): + row = [subj] + for pred in predicates: + values = pred_map.get(pred, []) + row.append("|".join(values)) + writer.writerow(row) + + companion_file = output_file + ".meta.json" + with open(companion_file, "w", encoding="utf-8") as f: + json.dump({"columns": column_metadata}, f, indent=2) + + print(f"Converted RDF -> CSV: {os.path.basename(output_file)}") + print(f"Companion metadata: {os.path.basename(companion_file)}") + + +# --------------------------------------------------------------------------- +# Main dispatcher — called from download pipeline +# --------------------------------------------------------------------------- + +def convert_file( + input_file: str, + output_file: str, + convert_format: str, +) -> None: + """Main conversion dispatcher called from the download pipeline. + + Detects the input format from the file extension, determines whether + this is a Layer 2 (within-class) or Layer 3 (cross-class) conversion, + and delegates to the appropriate conversion function. + + For Layer 2: lossless, same equivalence class. + For Layer 3: quasi-equal for RDF <-> Tabular, lossless for Triples <-> Quads. + + Args: + input_file: Path to the input file (must be decompressed). + output_file: Path to write the converted output file. + convert_format: Target format name (CLI format string). + + Raises: + ValueError: If the input format cannot be detected or if the + requested conversion is not supported. + """ + input_format = detect_format_from_filename(input_file) + + if input_format is None: + raise ValueError( + f"Could not detect input format from filename: '{os.path.basename(input_file)}'. " + f"Supported extensions: {list(EXTENSION_TO_FORMAT.keys())}" + ) + + if input_format == convert_format: + print( + f"WARNING: Input and target format are both '{input_format}'. " + "Skipping conversion." + ) + return + + input_class = get_format_class(input_format) + output_class = get_format_class(convert_format) + + # --- Layer 2: within-class --- + if input_class == output_class: + if input_class == "triples": + convert_rdf_triple_format( + input_file, output_file, input_format, convert_format + ) + elif input_class == "quads": + convert_rdf_quad_format( + input_file, output_file, input_format, convert_format + ) + elif input_class == "tabular": + convert_tabular_format( + input_file, output_file, input_format, convert_format + ) + return + + # --- Layer 3: cross-class --- + if input_class == "triples" and output_class == "tabular": + convert_rdf_to_csv(input_file, output_file, input_format) + return + + raise ValueError( + f"Conversion from '{input_format}' ({input_class}) to " + f"'{convert_format}' ({output_class}) is not yet implemented. " + f"Supported Layer 3 conversions: RDF Triples -> CSV/TSV." + ) \ No newline at end of file diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 312af45..74bd2fd 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -16,6 +16,7 @@ get_databus_id_parts_from_file_url, compute_sha256_and_length, ) +from databusclient.api.convert import convert_file, get_converted_filename # Compression format mappings COMPRESSION_EXTENSIONS = { @@ -313,6 +314,7 @@ def _download_file( client_id=None, convert_to=None, convert_from=None, + convert_format=None, validate_checksum: bool = False, expected_checksum: str | None = None, ) -> None: @@ -327,6 +329,7 @@ def _download_file( client_id: Client ID for token exchange. convert_to: Target compression format for on-the-fly conversion. convert_from: Optional source compression format filter. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. expected_checksum: The expected checksum of the file. """ @@ -507,12 +510,63 @@ def _download_file( # --- 7. Convert compression format if requested (AFTER validation) --- should_convert, source_format = _should_convert_file(file, convert_to, convert_from) + final_downloaded_file = filename if should_convert and source_format: target_filename = _get_converted_filename(file, source_format, convert_to) target_filepath = os.path.join(localDir, target_filename) _convert_compression_format( filename, target_filepath, source_format, convert_to ) + final_downloaded_file = target_filepath + + # --- 8. Convert file format if requested (AFTER compression conversion) --- + if convert_format: + final_basename = os.path.basename(final_downloaded_file) + compression_fmt = _detect_compression_format(final_basename) + + if compression_fmt: + # File is still compressed — decompress to a temp file first, + # then convert format, then clean up the temp file. + # This follows the pipeline: Download -> Decompress -> Convert -> Save + import tempfile + + source_module = COMPRESSION_MODULES[compression_fmt] + # temp decompressed file sits next to the original + compression_ext = COMPRESSION_EXTENSIONS[compression_fmt] + if final_downloaded_file.lower().endswith(compression_ext): + temp_decompressed = final_downloaded_file[:-len(compression_ext)] + else: + temp_decompressed = final_downloaded_file + ".decompressed" + + try: + print( + f"Decompressing {final_basename} before format conversion..." + ) + with source_module.open(final_downloaded_file, "rb") as sf: + with open(temp_decompressed, "wb") as tf: + while True: + chunk = sf.read(8192) + if not chunk: + break + tf.write(chunk) + + # now convert the decompressed temp file + converted_filename = get_converted_filename( + final_basename, convert_format + ) + converted_filepath = os.path.join(localDir, converted_filename) + convert_file(temp_decompressed, converted_filepath, convert_format) + + finally: + # always clean up temp file even if conversion fails + if os.path.exists(temp_decompressed): + os.remove(temp_decompressed) + + else: + # file is already uncompressed — convert directly + converted_filename = get_converted_filename(final_basename, convert_format) + converted_filepath = os.path.join(localDir, converted_filename) + convert_file(final_downloaded_file, converted_filepath, convert_format) def _download_files( @@ -524,6 +578,7 @@ def _download_files( client_id: str = None, convert_to: str = None, convert_from: str = None, + convert_format: str = None, validate_checksum: bool = False, checksums: dict | None = None, ) -> None: @@ -538,6 +593,7 @@ def _download_files( client_id: Client ID for token exchange. convert_to: Target compression format for on-the-fly conversion. convert_from: Optional source compression format filter. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. checksums: Dictionary mapping URLs to their expected checksums. """ @@ -554,6 +610,7 @@ def _download_files( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, expected_checksum=expected, ) @@ -702,6 +759,7 @@ def _download_collection( client_id: str = None, convert_to: str = None, convert_from: str = None, + convert_format: str = None, validate_checksum: bool = False, ) -> None: """Download all files in a databus collection. @@ -716,6 +774,7 @@ def _download_collection( client_id: Client ID for token exchange. convert_to: Target compression format for on-the-fly conversion. convert_from: Optional source compression format filter. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ query = _get_sparql_query_of_collection(uri, databus_key=databus_key) @@ -737,6 +796,7 @@ def _download_collection( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums if checksums else None, ) @@ -751,6 +811,7 @@ def _download_version( client_id: str = None, convert_to: str = None, convert_from: str = None, + convert_format: str = None, validate_checksum: bool = False, ) -> None: """Download all files in a databus artifact version. @@ -764,6 +825,7 @@ def _download_version( client_id: Client ID for token exchange. convert_to: Target compression format for on-the-fly conversion. convert_from: Optional source compression format filter. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ json_str = fetch_databus_jsonld(uri, databus_key=databus_key) @@ -784,6 +846,7 @@ def _download_version( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums, ) @@ -799,6 +862,7 @@ def _download_artifact( client_id: str = None, convert_to: str = None, convert_from: str = None, + convert_format: str = None, validate_checksum: bool = False, ) -> None: """Download files in a databus artifact. @@ -813,6 +877,7 @@ def _download_artifact( client_id: Client ID for token exchange. convert_to: Target compression format for on-the-fly conversion. convert_from: Optional source compression format filter. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ json_str = fetch_databus_jsonld(uri, databus_key=databus_key) @@ -839,6 +904,7 @@ def _download_artifact( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums, ) @@ -915,6 +981,7 @@ def _download_group( client_id: str = None, convert_to: str = None, convert_from: str = None, + convert_format: str = None, validate_checksum: bool = False, ) -> None: """Download files in a databus group. @@ -929,6 +996,7 @@ def _download_group( client_id: Client ID for token exchange. convert_to: Target compression format for on-the-fly conversion. convert_from: Optional source compression format filter. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ json_str = fetch_databus_jsonld(uri, databus_key=databus_key) @@ -945,6 +1013,7 @@ def _download_group( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, ) @@ -994,6 +1063,7 @@ def download( client_id="vault-token-exchange", convert_to=None, convert_from=None, + convert_format=None, validate_checksum: bool = False, ) -> None: """Download datasets from databus. @@ -1010,6 +1080,7 @@ def download( client_id: Client ID for token exchange. Default is "vault-token-exchange". convert_to: Target compression format for on-the-fly conversion (supported: bz2, gz, xz). convert_from: Optional source compression format filter. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ for databusURI in databusURIs: @@ -1039,6 +1110,7 @@ def download( client_id, convert_to, convert_from, + convert_format, validate_checksum=validate_checksum, ) elif file is not None: @@ -1060,6 +1132,7 @@ def download( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, expected_checksum=expected, ) @@ -1074,6 +1147,7 @@ def download( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, ) elif artifact is not None: @@ -1090,6 +1164,7 @@ def download( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, ) elif group is not None and group != "collections": @@ -1106,6 +1181,7 @@ def download( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, ) elif account is not None: @@ -1144,6 +1220,7 @@ def download( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums if checksums else None, ) diff --git a/databusclient/cli.py b/databusclient/cli.py index c3bd8f2..c687616 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -189,6 +189,15 @@ def deploy( type=click.Choice(["bz2", "gz", "xz"], case_sensitive=False), help="Source compression format to convert from (optional filter). Only files with this compression will be converted.", ) +@click.option( + "--convert-format", + "convert_format", + type=click.Choice( + ["ntriples","turtle","rdf-xml","nquads","trig","trix","json-ld","csv","tsv"], + case_sensitive=False, + ), + help="Target format for on-the-fly format conversion during download (Layer 2 and Layer 3).", +) @click.option( "--validate-checksum", is_flag=True, help="Validate checksums of downloaded files" ) @@ -203,6 +212,7 @@ def download( clientid, convert_to, convert_from, + convert_format, validate_checksum, ): """ @@ -221,6 +231,7 @@ def download( client_id=clientid, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, ) except DownloadAuthError as e: diff --git a/run_all_conversion_tests.py b/run_all_conversion_tests.py new file mode 100644 index 0000000..384e052 --- /dev/null +++ b/run_all_conversion_tests.py @@ -0,0 +1,338 @@ +""" +Layer 2 Conversion Testing Script +Tests every conversion combination systematically. +Outputs go to test_outputs/ folder. +Test file for testing with real datasets from databus. +""" + +import os +from databusclient.api.convert import ( + convert_rdf_triple_format, + convert_rdf_quad_format, + convert_tabular_format, +) + +# --------------------------------------------------------------------------- +# Setup output folders +# --------------------------------------------------------------------------- + +folders = [ + "test_outputs/triples/T1_turtle_to_ntriples", + "test_outputs/triples/T2_turtle_to_rdfxml", + "test_outputs/triples/T3_ntriples_to_turtle", + "test_outputs/triples/T4_ntriples_to_rdfxml", + "test_outputs/triples/T5_rdfxml_to_turtle", + "test_outputs/triples/T6_rdfxml_to_ntriples", + "test_outputs/quads/Q1_nquads_to_trig", + "test_outputs/quads/Q2_nquads_to_trix", + "test_outputs/quads/Q3_nquads_to_jsonld", + "test_outputs/quads/Q4_trig_to_nquads", + "test_outputs/quads/Q5_trig_to_trix", + "test_outputs/quads/Q6_trig_to_jsonld", + "test_outputs/quads/Q7_trix_to_nquads", + "test_outputs/quads/Q8_trix_to_trig", + "test_outputs/quads/Q9_trix_to_jsonld", + "test_outputs/quads/Q10_jsonld_to_nquads", + "test_outputs/quads/Q11_jsonld_to_trig", + "test_outputs/quads/Q12_jsonld_to_trix", + "test_outputs/tabular/TAB1_csv_to_tsv", + "test_outputs/tabular/TAB2_tsv_to_csv", +] + +for folder in folders: + os.makedirs(folder, exist_ok=True) + +results = [] + + +def run_test(test_id, description, func, input_file, output_file, *args): + """Run one conversion test and record the result.""" + try: + func(input_file, output_file, *args) + size = os.path.getsize(output_file) + results.append(f"PASS {test_id}: {description} -> {os.path.basename(output_file)} ({size} bytes)") + return output_file + except Exception as e: + results.append(f"FAIL {test_id}: {description} -> ERROR: {e}") + return None + + +# --------------------------------------------------------------------------- +# GROUP 1: RDF Triple Format Conversions +# 6 combinations: each format -> every other format +# Base file: test_outputs/base/base.ttl (real DBpedia Turtle data) +# Chain: turtle -> ntriples -> rdfxml -> back to turtle +# --------------------------------------------------------------------------- + +print("\n=== GROUP 1: RDF TRIPLE FORMAT CONVERSIONS ===\n") + +BASE_TTL = "test_outputs/base/base.ttl" + +# T1: turtle -> ntriples (from base turtle file) +t1_out = "test_outputs/triples/T1_turtle_to_ntriples/output.nt" +run_test( + "T1", "turtle -> ntriples", + convert_rdf_triple_format, + BASE_TTL, t1_out, "turtle", "ntriples" +) + +# T2: turtle -> rdf-xml (from base turtle file) +t2_out = "test_outputs/triples/T2_turtle_to_rdfxml/output.rdf" +run_test( + "T2", "turtle -> rdf-xml", + convert_rdf_triple_format, + BASE_TTL, t2_out, "turtle", "rdf-xml" +) + +# T3: ntriples -> turtle (uses T1 output) +t3_out = "test_outputs/triples/T3_ntriples_to_turtle/output.ttl" +if t1_out and os.path.exists(t1_out): + run_test( + "T3", "ntriples -> turtle", + convert_rdf_triple_format, + t1_out, t3_out, "ntriples", "turtle" + ) +else: + results.append("SKIP T3: ntriples -> turtle (T1 output not available)") + +# T4: ntriples -> rdf-xml (uses T1 output) +t4_out = "test_outputs/triples/T4_ntriples_to_rdfxml/output.rdf" +if t1_out and os.path.exists(t1_out): + run_test( + "T4", "ntriples -> rdf-xml", + convert_rdf_triple_format, + t1_out, t4_out, "ntriples", "rdf-xml" + ) +else: + results.append("SKIP T4: ntriples -> rdf-xml (T1 output not available)") + +# T5: rdf-xml -> turtle (uses T2 output) +t5_out = "test_outputs/triples/T5_rdfxml_to_turtle/output.ttl" +if t2_out and os.path.exists(t2_out): + run_test( + "T5", "rdf-xml -> turtle", + convert_rdf_triple_format, + t2_out, t5_out, "rdf-xml", "turtle" + ) +else: + results.append("SKIP T5: rdf-xml -> turtle (T2 output not available)") + +# T6: rdf-xml -> ntriples (uses T2 output) +t6_out = "test_outputs/triples/T6_rdfxml_to_ntriples/output.nt" +if t2_out and os.path.exists(t2_out): + run_test( + "T6", "rdf-xml -> ntriples", + convert_rdf_triple_format, + t2_out, t6_out, "rdf-xml", "ntriples" + ) +else: + results.append("SKIP T6: rdf-xml -> ntriples (T2 output not available)") + + +# --------------------------------------------------------------------------- +# GROUP 2: RDF Quad Format Conversions +# 12 combinations: each of 4 formats -> every other format (4*3=12) +# Base file: test_outputs/base/base.nq +# Chain: nquads -> trig -> trix -> jsonld -> back to nquads +# --------------------------------------------------------------------------- + +print("\n=== GROUP 2: RDF QUAD FORMAT CONVERSIONS ===\n") + +BASE_NQ = "test_outputs/base/base.nq" + +# Q1: nquads -> trig +q1_out = "test_outputs/quads/Q1_nquads_to_trig/output.trig" +run_test( + "Q1", "nquads -> trig", + convert_rdf_quad_format, + BASE_NQ, q1_out, "nquads", "trig" +) + +# Q2: nquads -> trix +q2_out = "test_outputs/quads/Q2_nquads_to_trix/output.trix" +run_test( + "Q2", "nquads -> trix", + convert_rdf_quad_format, + BASE_NQ, q2_out, "nquads", "trix" +) + +# Q3: nquads -> json-ld +q3_out = "test_outputs/quads/Q3_nquads_to_jsonld/output.jsonld" +run_test( + "Q3", "nquads -> json-ld", + convert_rdf_quad_format, + BASE_NQ, q3_out, "nquads", "json-ld" +) + +# Q4: trig -> nquads (uses Q1 output) +q4_out = "test_outputs/quads/Q4_trig_to_nquads/output.nq" +if q1_out and os.path.exists(q1_out): + run_test( + "Q4", "trig -> nquads", + convert_rdf_quad_format, + q1_out, q4_out, "trig", "nquads" + ) +else: + results.append("SKIP Q4: trig -> nquads (Q1 output not available)") + +# Q5: trig -> trix (uses Q1 output) +q5_out = "test_outputs/quads/Q5_trig_to_trix/output.trix" +if q1_out and os.path.exists(q1_out): + run_test( + "Q5", "trig -> trix", + convert_rdf_quad_format, + q1_out, q5_out, "trig", "trix" + ) +else: + results.append("SKIP Q5: trig -> trix (Q1 output not available)") + +# Q6: trig -> json-ld (uses Q1 output) +q6_out = "test_outputs/quads/Q6_trig_to_jsonld/output.jsonld" +if q1_out and os.path.exists(q1_out): + run_test( + "Q6", "trig -> json-ld", + convert_rdf_quad_format, + q1_out, q6_out, "trig", "json-ld" + ) +else: + results.append("SKIP Q6: trig -> json-ld (Q1 output not available)") + +# Q7: trix -> nquads (uses Q2 output) +q7_out = "test_outputs/quads/Q7_trix_to_nquads/output.nq" +if q2_out and os.path.exists(q2_out): + run_test( + "Q7", "trix -> nquads", + convert_rdf_quad_format, + q2_out, q7_out, "trix", "nquads" + ) +else: + results.append("SKIP Q7: trix -> nquads (Q2 output not available)") + +# Q8: trix -> trig (uses Q2 output) +q8_out = "test_outputs/quads/Q8_trix_to_trig/output.trig" +if q2_out and os.path.exists(q2_out): + run_test( + "Q8", "trix -> trig", + convert_rdf_quad_format, + q2_out, q8_out, "trix", "trig" + ) +else: + results.append("SKIP Q8: trix -> trig (Q2 output not available)") + +# Q9: trix -> json-ld (uses Q2 output) +q9_out = "test_outputs/quads/Q9_trix_to_jsonld/output.jsonld" +if q2_out and os.path.exists(q2_out): + run_test( + "Q9", "trix -> json-ld", + convert_rdf_quad_format, + q2_out, q9_out, "trix", "json-ld" + ) +else: + results.append("SKIP Q9: trix -> json-ld (Q2 output not available)") + +# Q10: json-ld -> nquads (uses Q3 output) +q10_out = "test_outputs/quads/Q10_jsonld_to_nquads/output.nq" +if q3_out and os.path.exists(q3_out): + run_test( + "Q10", "json-ld -> nquads", + convert_rdf_quad_format, + q3_out, q10_out, "json-ld", "nquads" + ) +else: + results.append("SKIP Q10: json-ld -> nquads (Q3 output not available)") + +# Q11: json-ld -> trig (uses Q3 output) +q11_out = "test_outputs/quads/Q11_jsonld_to_trig/output.trig" +if q3_out and os.path.exists(q3_out): + run_test( + "Q11", "json-ld -> trig", + convert_rdf_quad_format, + q3_out, q11_out, "json-ld", "trig" + ) +else: + results.append("SKIP Q11: json-ld -> trig (Q3 output not available)") + +# Q12: json-ld -> trix (uses Q3 output) +q12_out = "test_outputs/quads/Q12_jsonld_to_trix/output.trix" +if q3_out and os.path.exists(q3_out): + run_test( + "Q12", "json-ld -> trix", + convert_rdf_quad_format, + q3_out, q12_out, "json-ld", "trix" + ) +else: + results.append("SKIP Q12: json-ld -> trix (Q3 output not available)") + + +# --------------------------------------------------------------------------- +# GROUP 3: Tabular Format Conversions +# 2 combinations: csv->tsv and tsv->csv +# --------------------------------------------------------------------------- + +print("\n=== GROUP 3: TABULAR FORMAT CONVERSIONS ===\n") + +BASE_CSV = "test_outputs/base/base.csv" +BASE_TSV = "test_outputs/base/base.tsv" + +# TAB1: csv -> tsv +tab1_out = "test_outputs/tabular/TAB1_csv_to_tsv/output.tsv" +run_test( + "TAB1", "csv -> tsv", + convert_tabular_format, + BASE_CSV, tab1_out, "csv", "tsv" +) + +# TAB2: tsv -> csv (uses TAB1 output) +tab2_out = "test_outputs/tabular/TAB2_tsv_to_csv/output.csv" +if tab1_out and os.path.exists(tab1_out): + run_test( + "TAB2", "tsv -> csv", + convert_tabular_format, + tab1_out, tab2_out, "tsv", "csv" + ) +else: + results.append("SKIP TAB2: tsv -> csv (TAB1 output not available)") + + +# --------------------------------------------------------------------------- +# GROUP 4: CLI End-to-End Tests (compressed real Databus file) +# These test the full pipeline including download.py wiring +# --------------------------------------------------------------------------- + +print("\n=== GROUP 4: CLI END-TO-END (run these manually) ===\n") +cli_tests = [ + "CLI1: turtle->ntriples from compressed Databus file", + " poetry run databusclient download \"https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=cy.ttl.bz2\" --convert-format ntriples --localdir ./test_outputs/cli/CLI1", + "", + "CLI2: turtle->rdf-xml from compressed Databus file", + " poetry run databusclient download \"https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=cy.ttl.bz2\" --convert-format rdf-xml --localdir ./test_outputs/cli/CLI2", + "", + "CLI3: turtle->ntriples + compression bz2->gz", + " poetry run databusclient download \"https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=cy.ttl.bz2\" --convert-format ntriples --convert-to gz --localdir ./test_outputs/cli/CLI3", + "", + "CLI4: turtle->ntriples + compression bz2->xz", + " poetry run databusclient download \"https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=cy.ttl.bz2\" --convert-format ntriples --convert-to xz --localdir ./test_outputs/cli/CLI4", + "", + "CLI5: unsupported cross-class error (expect ValueError)", + " poetry run databusclient download \"https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=cy.ttl.bz2\" --convert-format nquads --localdir ./test_outputs/cli/CLI5", +] +for line in cli_tests: + print(line) + + +# --------------------------------------------------------------------------- +# Print summary +# --------------------------------------------------------------------------- + +print("\n" + "="*60) +print("LAYER 2 CONVERSION TEST SUMMARY") +print("="*60) +for result in results: + print(result) + +passed = sum(1 for r in results if r.startswith("PASS")) +failed = sum(1 for r in results if r.startswith("FAIL")) +skipped = sum(1 for r in results if r.startswith("SKIP")) + +print(f"\nTotal: {passed} passed, {failed} failed, {skipped} skipped") +print("="*60) \ No newline at end of file diff --git a/tests/test_conversion.py b/tests/test_conversion.py new file mode 100644 index 0000000..7ce710c --- /dev/null +++ b/tests/test_conversion.py @@ -0,0 +1,309 @@ +"""Round trip tests for Layer 2 format conversion. + +Following the strategy from Frey et al., each test validates that +reading a format and writing it back produces semantically identical output. +Pattern: parse(format X) -> serialize(format X) -> parse again -> compare. + +9 tests total: +- Triple formats: ntriples, turtle, rdf-xml (3 tests) +- Quad formats: nquads, trig, trix, json-ld (4 tests) +- Tabular formats: csv, tsv (2 tests) +""" + +import csv +import os +import tempfile + +from rdflib import Dataset, Graph + +from databusclient.api.convert import ( + convert_rdf_quad_format, + convert_rdf_triple_format, + convert_tabular_format, +) + +# --------------------------------------------------------------------------- +# Sample RDF data used across all RDF tests +# --------------------------------------------------------------------------- + +SAMPLE_TURTLE = """ +@prefix ex: . +@prefix schema: . +@prefix xsd: . + +ex:Paris schema:isCapitalOf ex:France ; + schema:population "2161000"^^xsd:integer . + +ex:Berlin schema:isCapitalOf ex:Germany ; + schema:population "3645000"^^xsd:integer . +""" + +SAMPLE_NQUADS = """ + . + . + . +""" + +SAMPLE_CSV = """resource,name,population +http://example.org/Paris,Paris,2161000 +http://example.org/Berlin,Berlin,3645000 +""" + +SAMPLE_TSV = "resource\tname\tpopulation\nhttp://example.org/Paris\tParis\t2161000\nhttp://example.org/Berlin\tBerlin\t3645000\n" + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _write_temp(content: str, suffix: str) -> str: + """Write content to a named temp file and return its path.""" + fd, path = tempfile.mkstemp(suffix=suffix) + with os.fdopen(fd, "w", encoding="utf-8") as f: + f.write(content) + return path + + +def _graphs_are_isomorphic(g1: Graph, g2: Graph) -> bool: + """Check semantic equivalence of two rdflib Graphs.""" + return g1.isomorphic(g2) + + +def _datasets_equal(g1: Dataset, g2: Dataset) -> bool: + """Check semantic equivalence of two Datasets by triple count and graph names.""" + if len(g1) != len(g2): + return False + graphs1 = {str(c.identifier) for c in g1.graphs()} + graphs2 = {str(c.identifier) for c in g2.graphs()} + return graphs1 == graphs2 + + +# --------------------------------------------------------------------------- +# Triple format round trip tests (Layer 2) +# --------------------------------------------------------------------------- + +def test_round_trip_turtle(): + """Turtle -> Turtle: parse, serialize, reparse, compare.""" + input_path = _write_temp(SAMPLE_TURTLE, ".ttl") + output_path = input_path + ".rt.ttl" + + try: + convert_rdf_triple_format(input_path, output_path, "turtle", "turtle") + + g_original = Graph() + g_original.parse(input_path, format="turtle") + + g_roundtrip = Graph() + g_roundtrip.parse(output_path, format="turtle") + + assert _graphs_are_isomorphic(g_original, g_roundtrip), ( + "Turtle round trip failed: graphs are not isomorphic" + ) + finally: + for p in (input_path, output_path): + if os.path.exists(p): + os.remove(p) + + +def test_round_trip_ntriples(): + """N-Triples -> N-Triples: parse, serialize, reparse, compare.""" + # first produce an ntriples file from turtle + turtle_path = _write_temp(SAMPLE_TURTLE, ".ttl") + nt_path = turtle_path + ".nt" + output_path = nt_path + ".rt.nt" + + try: + convert_rdf_triple_format(turtle_path, nt_path, "turtle", "ntriples") + convert_rdf_triple_format(nt_path, output_path, "ntriples", "ntriples") + + g_original = Graph() + g_original.parse(nt_path, format="ntriples") + + g_roundtrip = Graph() + g_roundtrip.parse(output_path, format="ntriples") + + assert _graphs_are_isomorphic(g_original, g_roundtrip), ( + "N-Triples round trip failed: graphs are not isomorphic" + ) + finally: + for p in (turtle_path, nt_path, output_path): + if os.path.exists(p): + os.remove(p) + + +def test_round_trip_rdf_xml(): + """RDF/XML -> RDF/XML: parse, serialize, reparse, compare.""" + turtle_path = _write_temp(SAMPLE_TURTLE, ".ttl") + rdf_path = turtle_path + ".rdf" + output_path = rdf_path + ".rt.rdf" + + try: + convert_rdf_triple_format(turtle_path, rdf_path, "turtle", "rdf-xml") + convert_rdf_triple_format(rdf_path, output_path, "rdf-xml", "rdf-xml") + + g_original = Graph() + g_original.parse(rdf_path, format="xml") + + g_roundtrip = Graph() + g_roundtrip.parse(output_path, format="xml") + + assert _graphs_are_isomorphic(g_original, g_roundtrip), ( + "RDF/XML round trip failed: graphs are not isomorphic" + ) + finally: + for p in (turtle_path, rdf_path, output_path): + if os.path.exists(p): + os.remove(p) + + +# --------------------------------------------------------------------------- +# Quad format round trip tests (Layer 2) +# --------------------------------------------------------------------------- + +def test_round_trip_nquads(): + """N-Quads -> N-Quads: parse, serialize, reparse, compare.""" + input_path = _write_temp(SAMPLE_NQUADS, ".nq") + output_path = input_path + ".rt.nq" + + try: + convert_rdf_quad_format(input_path, output_path, "nquads", "nquads") + + g_original = Dataset() + g_original.parse(input_path, format="nquads") + + g_roundtrip = Dataset() + g_roundtrip.parse(output_path, format="nquads") + + assert _datasets_equal(g_original, g_roundtrip), ( + "N-Quads round trip failed: graphs are not equal" + ) + finally: + for p in (input_path, output_path): + if os.path.exists(p): + os.remove(p) + + +def test_round_trip_trig(): + """TriG -> TriG: parse, serialize, reparse, compare.""" + # produce trig from nquads + nq_path = _write_temp(SAMPLE_NQUADS, ".nq") + trig_path = nq_path + ".trig" + output_path = trig_path + ".rt.trig" + + try: + convert_rdf_quad_format(nq_path, trig_path, "nquads", "trig") + convert_rdf_quad_format(trig_path, output_path, "trig", "trig") + + g_original = Dataset() + g_original.parse(trig_path, format="trig") + + g_roundtrip = Dataset() + g_roundtrip.parse(output_path, format="trig") + + assert _datasets_equal(g_original, g_roundtrip), ( + "TriG round trip failed: graphs are not equal" + ) + finally: + for p in (nq_path, trig_path, output_path): + if os.path.exists(p): + os.remove(p) + + +def test_round_trip_trix(): + """TriX -> TriX: parse, serialize, reparse, compare.""" + nq_path = _write_temp(SAMPLE_NQUADS, ".nq") + trix_path = nq_path + ".trix" + output_path = trix_path + ".rt.trix" + + try: + convert_rdf_quad_format(nq_path, trix_path, "nquads", "trix") + convert_rdf_quad_format(trix_path, output_path, "trix", "trix") + + g_original = Dataset() + g_original.parse(trix_path, format="trix") + + g_roundtrip = Dataset() + g_roundtrip.parse(output_path, format="trix") + + assert _datasets_equal(g_original, g_roundtrip), ( + "TriX round trip failed: graphs are not equal" + ) + finally: + for p in (nq_path, trix_path, output_path): + if os.path.exists(p): + os.remove(p) + + +def test_round_trip_json_ld(): + """JSON-LD -> JSON-LD: parse, serialize, reparse, compare.""" + nq_path = _write_temp(SAMPLE_NQUADS, ".nq") + jsonld_path = nq_path + ".jsonld" + output_path = jsonld_path + ".rt.jsonld" + + try: + convert_rdf_quad_format(nq_path, jsonld_path, "nquads", "json-ld") + convert_rdf_quad_format(jsonld_path, output_path, "json-ld", "json-ld") + + g_original = Dataset() + g_original.parse(jsonld_path, format="json-ld") + + g_roundtrip = Dataset() + g_roundtrip.parse(output_path, format="json-ld") + + assert _datasets_equal(g_original, g_roundtrip), ( + "JSON-LD round trip failed: graphs are not equal" + ) + finally: + for p in (nq_path, jsonld_path, output_path): + if os.path.exists(p): + os.remove(p) + + +# --------------------------------------------------------------------------- +# Tabular format round trip tests (Layer 2) +# --------------------------------------------------------------------------- + +def test_round_trip_csv(): + """CSV -> CSV: read, write, reread, compare rows.""" + input_path = _write_temp(SAMPLE_CSV, ".csv") + output_path = input_path + ".rt.csv" + + try: + convert_tabular_format(input_path, output_path, "csv", "csv") + + with open(input_path, newline="", encoding="utf-8") as f: + original_rows = list(csv.reader(f)) + + with open(output_path, newline="", encoding="utf-8") as f: + roundtrip_rows = list(csv.reader(f)) + + assert original_rows == roundtrip_rows, ( + "CSV round trip failed: rows do not match" + ) + finally: + for p in (input_path, output_path): + if os.path.exists(p): + os.remove(p) + + +def test_round_trip_tsv(): + """TSV -> TSV: read, write, reread, compare rows.""" + input_path = _write_temp(SAMPLE_TSV, ".tsv") + output_path = input_path + ".rt.tsv" + + try: + convert_tabular_format(input_path, output_path, "tsv", "tsv") + + with open(input_path, newline="", encoding="utf-8") as f: + original_rows = list(csv.reader(f, delimiter="\t")) + + with open(output_path, newline="", encoding="utf-8") as f: + roundtrip_rows = list(csv.reader(f, delimiter="\t")) + + assert original_rows == roundtrip_rows, ( + "TSV round trip failed: rows do not match" + ) + finally: + for p in (input_path, output_path): + if os.path.exists(p): + os.remove(p) \ No newline at end of file From 5d224f691897580e52bab5f82f26639a1ce69f6b Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Sat, 6 Jun 2026 19:53:50 +0530 Subject: [PATCH 02/12] gsoc26: Refactor Layer 2 with handler architecture and improved tests --- databusclient/api/convert.py | 410 ++++++++++++++++++++++--------- databusclient/api/download.py | 46 +++- pyproject.toml | 6 + tests/resources/sample.csv | 11 + tests/resources/sample.jsonld | 62 +++++ tests/resources/sample.nq | 10 + tests/resources/sample.nt | 10 + tests/resources/sample.rdf | 30 +++ tests/resources/sample.trig | 22 ++ tests/resources/sample.trix | 72 ++++++ tests/resources/sample.tsv | 11 + tests/resources/sample.ttl | 18 ++ tests/test_conversion.py | 309 ----------------------- tests/test_format_round_trips.py | 256 +++++++++++++++++++ 14 files changed, 837 insertions(+), 436 deletions(-) create mode 100644 tests/resources/sample.csv create mode 100644 tests/resources/sample.jsonld create mode 100644 tests/resources/sample.nq create mode 100644 tests/resources/sample.nt create mode 100644 tests/resources/sample.rdf create mode 100644 tests/resources/sample.trig create mode 100644 tests/resources/sample.trix create mode 100644 tests/resources/sample.tsv create mode 100644 tests/resources/sample.ttl delete mode 100644 tests/test_conversion.py create mode 100644 tests/test_format_round_trips.py diff --git a/databusclient/api/convert.py b/databusclient/api/convert.py index 8d28fd8..8bd6dbb 100644 --- a/databusclient/api/convert.py +++ b/databusclient/api/convert.py @@ -1,14 +1,33 @@ """Format and Mapping Conversion Layer. +This module implements the format conversion pipeline for the Databus Python Client + Layer 2: Within-class format conversion (lossless). -Layer 3: Cross-class mapping conversion (quasi-equal for RDF <-> Tabular). + - TripleHandler: RDF triple formats (turtle, ntriples, rdf-xml) + - QuadHandler: RDF quad formats (nquads, trig, trix, json-ld) + - TSDHandler: Tabular formats (csv, tsv) + +Layer 3 (prototype, not yet fully implemented): + - RDF triples -> CSV/TSV (quasi-equal, companion metadata generated) + +Each handler provides read() -> IR, write(IR) -> file, convert() -> chains both. +The IR (intermediate representation) returned by read() is designed to be passed +to future mapping classes (TripleToQuadMapper, TripleToTSDMapper, etc.). """ import csv import json import os +import warnings from typing import Optional +# Suppress rdflib internal DeprecationWarning for Dataset API. +# rdflib is mid-migration from ConjunctiveGraph to Dataset in 7.x. +# These warnings originate from rdflib internals, not our code. +# Can be removed when rdflib completes their Dataset API migration. +warnings.filterwarnings("ignore", category=DeprecationWarning, module="rdflib") +warnings.filterwarnings("ignore", category=UserWarning, module="rdflib") + from rdflib import Dataset, Graph @@ -57,9 +76,22 @@ ".tsv": "tsv", } +# Maps format name -> file extension +FORMAT_TO_EXTENSION = { + "ntriples": ".nt", + "turtle": ".ttl", + "rdf-xml": ".rdf", + "nquads": ".nq", + "trig": ".trig", + "trix": ".trix", + "json-ld": ".jsonld", + "csv": ".csv", + "tsv": ".tsv", +} + # --------------------------------------------------------------------------- -# Format detection +# Format detection helpers # --------------------------------------------------------------------------- def detect_format_from_filename(filename: str) -> Optional[str]: @@ -110,24 +142,6 @@ def get_format_class(fmt: str) -> str: ) -# --------------------------------------------------------------------------- -# Output filename helper -# --------------------------------------------------------------------------- - -# Maps format name -> file extension -FORMAT_TO_EXTENSION = { - "ntriples": ".nt", - "turtle": ".ttl", - "rdf-xml": ".rdf", - "nquads": ".nq", - "trig": ".trig", - "trix": ".trix", - "json-ld": ".jsonld", - "csv": ".csv", - "tsv": ".tsv", -} - - def get_converted_filename(original_filename: str, convert_format: str) -> str: """Generate output filename after format conversion. @@ -149,7 +163,7 @@ def get_converted_filename(original_filename: str, convert_format: str) -> str: name = name[: -len(ext)] break - # strip existing format extension + # strip existing format extension (longest first) for old_ext in sorted(FORMAT_TO_EXTENSION.values(), key=len, reverse=True): if name.lower().endswith(old_ext): name = name[: -len(old_ext)] @@ -160,95 +174,254 @@ def get_converted_filename(original_filename: str, convert_format: str) -> str: # --------------------------------------------------------------------------- -# Layer 2 — within-class format conversion +# Layer 2 Handlers # --------------------------------------------------------------------------- -def convert_rdf_triple_format( - input_file: str, - output_file: str, - input_format: str, - output_format: str, -) -> None: - """Convert between RDF triple serialization formats (Layer 2). +class TripleHandler: + """Handler for RDF triple formats (Layer 2). - Handles: ntriples, turtle, rdf-xml. - Uses rdflib Graph as internal representation. + Uses rdflib.Graph as the intermediate representation (IR). + Supports: ntriples, turtle, rdf-xml. - Args: - input_file: Path to input file. - output_file: Path to write converted output. - input_format: Source format name (must be in RDF_TRIPLE_FORMATS). - output_format: Target format name (must be in RDF_TRIPLE_FORMATS). + The IR returned by read() can be passed to future mapping classes + such as TripleToQuadMapper or TripleToTSDMapper for Layer 3 conversions. """ - g = Graph() - g.parse(input_file, format=RDF_TRIPLE_FORMATS[input_format]) - g.serialize(destination=output_file, format=RDF_TRIPLE_FORMATS[output_format]) - print( - f"Converted {input_format} -> {output_format}: {os.path.basename(output_file)}" - ) + def read(self, source: str, input_format: str) -> Graph: + """Parse an RDF triples file into a Graph (IR). -def convert_rdf_quad_format( - input_file: str, - output_file: str, - input_format: str, - output_format: str, -) -> None: - """Convert between RDF quad serialization formats (Layer 2). + Args: + source: Path to input file. + input_format: Source format name (e.g. 'turtle', 'ntriples', 'rdf-xml'). - Handles: nquads, trig, trix, json-ld. - Uses rdflib Dataset as internal representation - to preserve named graph information. + Returns: + rdflib.Graph containing all parsed triples. - Args: - input_file: Path to input file. - output_file: Path to write converted output. - input_format: Source format name (must be in RDF_QUAD_FORMATS). - output_format: Target format name (must be in RDF_QUAD_FORMATS). + Raises: + ValueError: If input_format is not a recognised triple format. + """ + if input_format not in RDF_TRIPLE_FORMATS: + raise ValueError( + f"'{input_format}' is not a triple format. " + f"Supported: {list(RDF_TRIPLE_FORMATS)}" + ) + g = Graph() + g.parse(source, format=RDF_TRIPLE_FORMATS[input_format]) + return g + + def write(self, data: Graph, target: str, output_format: str) -> None: + """Serialize a Graph (IR) to a file. + + Args: + data: rdflib.Graph to serialize. + target: Path to output file. + output_format: Target format name (e.g. 'ntriples', 'turtle'). + + Raises: + ValueError: If output_format is not a recognised triple format. + """ + if output_format not in RDF_TRIPLE_FORMATS: + raise ValueError( + f"'{output_format}' is not a triple format. " + f"Supported: {list(RDF_TRIPLE_FORMATS)}" + ) + # Explicitly specify utf-8 encoding to avoid NTSerializer warning + data.serialize( + destination=target, + format=RDF_TRIPLE_FORMATS[output_format], + encoding="utf-8", + ) + + def convert( + self, + source: str, + target: str, + input_format: str, + output_format: str, + ) -> None: + """Convert between RDF triple formats (Layer 2, lossless). + + Chains read() -> write(). Both formats must be in the same + equivalence class (RDF triples). + + Args: + source: Path to input file. + target: Path to output file. + input_format: Source format name. + output_format: Target format name. + """ + graph = self.read(source, input_format) + self.write(graph, target, output_format) + print( + f"Converted {input_format} -> {output_format}: " + f"{os.path.basename(target)}" + ) + + +class QuadHandler: + """Handler for RDF quad formats (Layer 2). + + Uses rdflib.Dataset as the intermediate representation (IR). + Supports: nquads, trig, trix, json-ld. + + Named graph information is preserved through the Dataset IR. + The IR returned by read() can be passed to future mapping classes + such as QuadToTripleMapper or QuadToTSDMapper for Layer 3 conversions. """ - g = Dataset() - g.parse(input_file, format=RDF_QUAD_FORMATS[input_format]) - g.serialize(destination=output_file, format=RDF_QUAD_FORMATS[output_format]) - print( - f"Converted {input_format} -> {output_format}: {os.path.basename(output_file)}" - ) + def read(self, source: str, input_format: str) -> Dataset: + """Parse an RDF quads file into a Dataset (IR). -def convert_tabular_format( - input_file: str, - output_file: str, - input_format: str, - output_format: str, -) -> None: - """Convert between tabular formats (Layer 2). + Args: + source: Path to input file. + input_format: Source format name (e.g. 'nquads', 'trig', 'trix', 'json-ld'). - Handles: csv <-> tsv. - Uses Python built-in csv module. + Returns: + rdflib.Dataset containing all parsed quads with named graphs. - Args: - input_file: Path to input file. - output_file: Path to write converted output. - input_format: Source format name ('csv' or 'tsv'). - output_format: Target format name ('csv' or 'tsv'). + Raises: + ValueError: If input_format is not a recognised quad format. + """ + if input_format not in RDF_QUAD_FORMATS: + raise ValueError( + f"'{input_format}' is not a quad format. " + f"Supported: {list(RDF_QUAD_FORMATS)}" + ) + d = Dataset() + d.parse(source, format=RDF_QUAD_FORMATS[input_format]) + return d + + def write(self, data: Dataset, target: str, output_format: str) -> None: + """Serialize a Dataset (IR) to a file. + + Args: + data: rdflib.Dataset to serialize. + target: Path to output file. + output_format: Target format name. + + Raises: + ValueError: If output_format is not a recognised quad format. + """ + if output_format not in RDF_QUAD_FORMATS: + raise ValueError( + f"'{output_format}' is not a quad format. " + f"Supported: {list(RDF_QUAD_FORMATS)}" + ) + data.serialize( + destination=target, + format=RDF_QUAD_FORMATS[output_format], + ) + + def convert( + self, + source: str, + target: str, + input_format: str, + output_format: str, + ) -> None: + """Convert between RDF quad formats (Layer 2, lossless). + + Chains read() -> write(). Both formats must be in the same + equivalence class (RDF quads). Named graph information is preserved. + + Args: + source: Path to input file. + target: Path to output file. + input_format: Source format name. + output_format: Target format name. + """ + dataset = self.read(source, input_format) + self.write(dataset, target, output_format) + print( + f"Converted {input_format} -> {output_format}: " + f"{os.path.basename(target)}" + ) + + +class TSDHandler: + """Handler for tabular structured data formats (Layer 2). + + Uses list[list[str]] as the intermediate representation (IR). + Supports: csv, tsv. + + The IR returned by read() can be passed to future mapping classes + such as TSDToTripleMapper for Layer 3 conversions. """ - input_delimiter = TABULAR_FORMATS[input_format] - output_delimiter = TABULAR_FORMATS[output_format] - with open(input_file, "r", newline="", encoding="utf-8") as infile: - reader = csv.reader(infile, delimiter=input_delimiter) - rows = list(reader) + def read(self, source: str, input_format: str) -> list: + """Parse a tabular file into a list of rows (IR). - with open(output_file, "w", newline="", encoding="utf-8") as outfile: - writer = csv.writer(outfile, delimiter=output_delimiter) - writer.writerows(rows) + Each row is a list of string values. First row is the header. - print( - f"Converted {input_format} -> {output_format}: {os.path.basename(output_file)}" - ) + Args: + source: Path to input file. + input_format: Source format name ('csv' or 'tsv'). + + Returns: + list[list[str]] where first element is the header row. + + Raises: + ValueError: If input_format is not a recognised tabular format. + """ + if input_format not in TABULAR_FORMATS: + raise ValueError( + f"'{input_format}' is not a tabular format. " + f"Supported: {list(TABULAR_FORMATS)}" + ) + delimiter = TABULAR_FORMATS[input_format] + with open(source, "r", newline="", encoding="utf-8") as f: + reader = csv.reader(f, delimiter=delimiter) + return list(reader) + + def write(self, data: list, target: str, output_format: str) -> None: + """Serialize a list of rows (IR) to a tabular file. + + Args: + data: list[list[str]] to write. + target: Path to output file. + output_format: Target format name ('csv' or 'tsv'). + + Raises: + ValueError: If output_format is not a recognised tabular format. + """ + if output_format not in TABULAR_FORMATS: + raise ValueError( + f"'{output_format}' is not a tabular format. " + f"Supported: {list(TABULAR_FORMATS)}" + ) + delimiter = TABULAR_FORMATS[output_format] + with open(target, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f, delimiter=delimiter) + writer.writerows(data) + + def convert( + self, + source: str, + target: str, + input_format: str, + output_format: str, + ) -> None: + """Convert between tabular formats (Layer 2, lossless). + + Chains read() -> write(). Both formats must be in the same + equivalence class (tabular). + + Args: + source: Path to input file. + target: Path to output file. + input_format: Source format name. + output_format: Target format name. + """ + rows = self.read(source, input_format) + self.write(rows, target, output_format) + print( + f"Converted {input_format} -> {output_format}: " + f"{os.path.basename(target)}" + ) # --------------------------------------------------------------------------- -# Layer 3 — cross-class mapping conversion +# Layer 3 prototype — RDF triples to CSV (not yet fully implemented) # --------------------------------------------------------------------------- def convert_rdf_to_csv( @@ -256,20 +429,23 @@ def convert_rdf_to_csv( output_file: str, input_format: str, ) -> None: - """Map RDF triples to a wide CSV table (Layer 3). + """Map RDF triples to a wide CSV table (Layer 3 prototype). Each unique subject becomes a row. Each unique predicate becomes a column. Multi-valued predicates are pipe-separated. - A companion .meta.json file is generated alongside the CSV to preserve - RDF datatype and language tag information for lossless round trips. + A companion .meta.json file is generated to preserve RDF datatype and + language tag information for lossless round trips. + + NOTE: This is a Layer 3 prototype. It is not yet tested and will be + properly implemented in the Layer 3 issue. Args: input_file: Path to input RDF triples file. output_file: Path to write output CSV file. input_format: Source triple format name (must be in RDF_TRIPLE_FORMATS). """ - g = Graph() - g.parse(input_file, format=RDF_TRIPLE_FORMATS[input_format]) + handler = TripleHandler() + g = handler.read(input_file, input_format) predicates = sorted(set(str(p) for s, p, o in g)) @@ -280,7 +456,6 @@ def convert_rdf_to_csv( subj = str(s) pred = str(p) - # capture datatype or language tag for companion file if hasattr(o, "datatype") and o.datatype: column_metadata[pred] = {"datatype": str(o.datatype)} elif hasattr(o, "language") and o.language: @@ -292,15 +467,16 @@ def convert_rdf_to_csv( subjects[subj][pred] = [] subjects[subj][pred].append(str(o)) - with open(output_file, "w", newline="", encoding="utf-8") as f: - writer = csv.writer(f) - writer.writerow(["resource"] + predicates) - for subj, pred_map in subjects.items(): - row = [subj] - for pred in predicates: - values = pred_map.get(pred, []) - row.append("|".join(values)) - writer.writerow(row) + tsd_handler = TSDHandler() + rows = [["resource"] + predicates] + for subj, pred_map in subjects.items(): + row = [subj] + for pred in predicates: + values = pred_map.get(pred, []) + row.append("|".join(values)) + rows.append(row) + + tsd_handler.write(rows, output_file, "csv") companion_file = output_file + ".meta.json" with open(companion_file, "w", encoding="utf-8") as f: @@ -314,6 +490,12 @@ def convert_rdf_to_csv( # Main dispatcher — called from download pipeline # --------------------------------------------------------------------------- +# Handler instances — created once, reused +_triple_handler = TripleHandler() +_quad_handler = QuadHandler() +_tsd_handler = TSDHandler() + + def convert_file( input_file: str, output_file: str, @@ -323,10 +505,7 @@ def convert_file( Detects the input format from the file extension, determines whether this is a Layer 2 (within-class) or Layer 3 (cross-class) conversion, - and delegates to the appropriate conversion function. - - For Layer 2: lossless, same equivalence class. - For Layer 3: quasi-equal for RDF <-> Tabular, lossless for Triples <-> Quads. + and delegates to the appropriate handler. Args: input_file: Path to the input file (must be decompressed). @@ -334,14 +513,15 @@ def convert_file( convert_format: Target format name (CLI format string). Raises: - ValueError: If the input format cannot be detected or if the - requested conversion is not supported. + ValueError: If input format cannot be detected or conversion + is not supported. """ input_format = detect_format_from_filename(input_file) if input_format is None: raise ValueError( - f"Could not detect input format from filename: '{os.path.basename(input_file)}'. " + f"Could not detect input format from filename: " + f"'{os.path.basename(input_file)}'. " f"Supported extensions: {list(EXTENSION_TO_FORMAT.keys())}" ) @@ -358,20 +538,20 @@ def convert_file( # --- Layer 2: within-class --- if input_class == output_class: if input_class == "triples": - convert_rdf_triple_format( + _triple_handler.convert( input_file, output_file, input_format, convert_format ) elif input_class == "quads": - convert_rdf_quad_format( + _quad_handler.convert( input_file, output_file, input_format, convert_format ) elif input_class == "tabular": - convert_tabular_format( + _tsd_handler.convert( input_file, output_file, input_format, convert_format ) return - # --- Layer 3: cross-class --- + # --- Layer 3: cross-class (prototype only) --- if input_class == "triples" and output_class == "tabular": convert_rdf_to_csv(input_file, output_file, input_format) return diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 74bd2fd..7c33fac 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -520,18 +520,16 @@ def _download_file( final_downloaded_file = target_filepath # --- 8. Convert file format if requested (AFTER compression conversion) --- + # Pipeline follows :decompress -> convert format -> recompress + # If the source was compressed, the converted output is recompressed: + # - to the format specified by --convert-to if provided + # - to the original compression format otherwise if convert_format: final_basename = os.path.basename(final_downloaded_file) compression_fmt = _detect_compression_format(final_basename) if compression_fmt: - # File is still compressed — decompress to a temp file first, - # then convert format, then clean up the temp file. - # This follows the pipeline: Download -> Decompress -> Convert -> Save - import tempfile - - source_module = COMPRESSION_MODULES[compression_fmt] - # temp decompressed file sits next to the original + # File is still compressed — decompress to temp, convert, recompress compression_ext = COMPRESSION_EXTENSIONS[compression_fmt] if final_downloaded_file.lower().endswith(compression_ext): temp_decompressed = final_downloaded_file[:-len(compression_ext)] @@ -542,6 +540,7 @@ def _download_file( print( f"Decompressing {final_basename} before format conversion..." ) + source_module = COMPRESSION_MODULES[compression_fmt] with source_module.open(final_downloaded_file, "rb") as sf: with open(temp_decompressed, "wb") as tf: while True: @@ -550,20 +549,43 @@ def _download_file( break tf.write(chunk) - # now convert the decompressed temp file - converted_filename = get_converted_filename( + # Convert format on the decompressed temp file + converted_basename = get_converted_filename( final_basename, convert_format ) - converted_filepath = os.path.join(localDir, converted_filename) + converted_filepath = os.path.join(localDir, converted_basename) convert_file(temp_decompressed, converted_filepath, convert_format) + # Recompress the converted output. + # Use --convert-to format if specified, otherwise use original compression. + recompress_fmt = convert_to if convert_to else compression_fmt + recompress_ext = COMPRESSION_EXTENSIONS[recompress_fmt] + recompressed_filepath = converted_filepath + recompress_ext + recompress_module = COMPRESSION_MODULES[recompress_fmt] + + print( + f"Recompressing converted file to {recompress_fmt}: " + f"{os.path.basename(recompressed_filepath)}" + ) + with open(converted_filepath, "rb") as sf: + with recompress_module.open(recompressed_filepath, "wb") as tf: + while True: + chunk = sf.read(8192) + if not chunk: + break + tf.write(chunk) + + # Remove the uncompressed converted file — keep only recompressed + if os.path.exists(converted_filepath): + os.remove(converted_filepath) + finally: - # always clean up temp file even if conversion fails + # Always clean up temp decompressed file if os.path.exists(temp_decompressed): os.remove(temp_decompressed) else: - # file is already uncompressed — convert directly + # File is already uncompressed — convert directly, no recompression needed converted_filename = get_converted_filename(final_basename, convert_format) converted_filepath = os.path.join(localDir, converted_filename) convert_file(final_downloaded_file, converted_filepath, convert_format) diff --git a/pyproject.toml b/pyproject.toml index 92f479b..72179cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,3 +29,9 @@ src = ["databusclient", "tests"] [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +filterwarnings = [ + "ignore::DeprecationWarning:rdflib", + "ignore::UserWarning:rdflib", +] diff --git a/tests/resources/sample.csv b/tests/resources/sample.csv new file mode 100644 index 0000000..50dda4c --- /dev/null +++ b/tests/resources/sample.csv @@ -0,0 +1,11 @@ +subject,predicate,object,graph +https://example.org/data/alice,http://xmlns.com/foaf/0.1/name,Alice,https://example.org/graph/people +https://example.org/data/alice,https://example.org/vocab/age,29,https://example.org/graph/people +https://example.org/data/alice,https://example.org/vocab/livesAt,_:address1,https://example.org/graph/people +_:address1,https://example.org/vocab/city,Leipzig,https://example.org/graph/people +_:address1,https://example.org/vocab/country,Germany,https://example.org/graph/people +https://example.org/data/bob,http://xmlns.com/foaf/0.1/name,Bob,https://example.org/graph/people +https://example.org/data/bob,https://example.org/vocab/age,34,https://example.org/graph/people +https://example.org/data/bob,https://example.org/vocab/knows,https://example.org/data/alice,https://example.org/graph/people +https://example.org/data/project1,https://example.org/vocab/title,Databus Example Project,https://example.org/graph/projects +https://example.org/data/project1,https://example.org/vocab/member,https://example.org/data/alice,https://example.org/graph/projects \ No newline at end of file diff --git a/tests/resources/sample.jsonld b/tests/resources/sample.jsonld new file mode 100644 index 0000000..af80f31 --- /dev/null +++ b/tests/resources/sample.jsonld @@ -0,0 +1,62 @@ +{ + "@context": { + "@base": "https://example.org/data/", + "ex": "https://example.org/vocab/", + "foaf": "http://xmlns.com/foaf/0.1/", + "xsd": "http://www.w3.org/2001/XMLSchema#", + "name": "foaf:name", + "age": { + "@id": "ex:age", + "@type": "xsd:integer" + }, + "livesAt": { + "@id": "ex:livesAt", + "@type": "@id" + }, + "city": "ex:city", + "country": "ex:country", + "knows": { + "@id": "ex:knows", + "@type": "@id" + }, + "title": "ex:title", + "member": { + "@id": "ex:member", + "@type": "@id" + } + }, + "@graph": [ + { + "@id": "https://example.org/graph/people", + "@graph": [ + { + "@id": "alice", + "name": "Alice", + "age": 29, + "livesAt": "_:address1" + }, + { + "@id": "_:address1", + "city": "Leipzig", + "country": "Germany" + }, + { + "@id": "bob", + "name": "Bob", + "age": 34, + "knows": "alice" + } + ] + }, + { + "@id": "https://example.org/graph/projects", + "@graph": [ + { + "@id": "project1", + "title": "Databus Example Project", + "member": "alice" + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/resources/sample.nq b/tests/resources/sample.nq new file mode 100644 index 0000000..a111652 --- /dev/null +++ b/tests/resources/sample.nq @@ -0,0 +1,10 @@ + "Alice" . + "29"^^ . + _:address1 . +_:address1 "Leipzig" . +_:address1 "Germany" . + "Bob" . + "34"^^ . + . + "Databus Example Project" . + . \ No newline at end of file diff --git a/tests/resources/sample.nt b/tests/resources/sample.nt new file mode 100644 index 0000000..f6b8488 --- /dev/null +++ b/tests/resources/sample.nt @@ -0,0 +1,10 @@ + "Alice" . + "29"^^ . + _:address1 . +_:address1 "Leipzig" . +_:address1 "Germany" . + "Bob" . + "34"^^ . + . + "Databus Example Project" . + . \ No newline at end of file diff --git a/tests/resources/sample.rdf b/tests/resources/sample.rdf new file mode 100644 index 0000000..c8bb09a --- /dev/null +++ b/tests/resources/sample.rdf @@ -0,0 +1,30 @@ + + + + + Alice + 29 + + + + + Leipzig + Germany + + + + Bob + 34 + + + + + Databus Example Project + + + + \ No newline at end of file diff --git a/tests/resources/sample.trig b/tests/resources/sample.trig new file mode 100644 index 0000000..e4abc3f --- /dev/null +++ b/tests/resources/sample.trig @@ -0,0 +1,22 @@ +@base . +@prefix ex: . +@prefix foaf: . +@prefix xsd: . + + { + foaf:name "Alice" ; + ex:age 29 ; + ex:livesAt _:address1 . + + _:address1 ex:city "Leipzig" ; + ex:country "Germany" . + + foaf:name "Bob" ; + ex:age 34 ; + ex:knows . +} + + { + ex:title "Databus Example Project" ; + ex:member . +} \ No newline at end of file diff --git a/tests/resources/sample.trix b/tests/resources/sample.trix new file mode 100644 index 0000000..d8edb13 --- /dev/null +++ b/tests/resources/sample.trix @@ -0,0 +1,72 @@ + + + + + https://example.org/graph/people + + + https://example.org/data/alice + http://xmlns.com/foaf/0.1/name + Alice + + + + https://example.org/data/alice + https://example.org/vocab/age + 29 + + + + https://example.org/data/alice + https://example.org/vocab/livesAt + address1 + + + + address1 + https://example.org/vocab/city + Leipzig + + + + address1 + https://example.org/vocab/country + Germany + + + + https://example.org/data/bob + http://xmlns.com/foaf/0.1/name + Bob + + + + https://example.org/data/bob + https://example.org/vocab/age + 34 + + + + https://example.org/data/bob + https://example.org/vocab/knows + https://example.org/data/alice + + + + + https://example.org/graph/projects + + + https://example.org/data/project1 + https://example.org/vocab/title + Databus Example Project + + + + https://example.org/data/project1 + https://example.org/vocab/member + https://example.org/data/alice + + + + \ No newline at end of file diff --git a/tests/resources/sample.tsv b/tests/resources/sample.tsv new file mode 100644 index 0000000..c23af40 --- /dev/null +++ b/tests/resources/sample.tsv @@ -0,0 +1,11 @@ +subject predicate object graph +https://example.org/data/alice http://xmlns.com/foaf/0.1/name Alice https://example.org/graph/people +https://example.org/data/alice https://example.org/vocab/age 29 https://example.org/graph/people +https://example.org/data/alice https://example.org/vocab/livesAt _:address1 https://example.org/graph/people +_:address1 https://example.org/vocab/city Leipzig https://example.org/graph/people +_:address1 https://example.org/vocab/country Germany https://example.org/graph/people +https://example.org/data/bob http://xmlns.com/foaf/0.1/name Bob https://example.org/graph/people +https://example.org/data/bob https://example.org/vocab/age 34 https://example.org/graph/people +https://example.org/data/bob https://example.org/vocab/knows https://example.org/data/alice https://example.org/graph/people +https://example.org/data/project1 https://example.org/vocab/title Databus Example Project https://example.org/graph/projects +https://example.org/data/project1 https://example.org/vocab/member https://example.org/data/alice https://example.org/graph/projects \ No newline at end of file diff --git a/tests/resources/sample.ttl b/tests/resources/sample.ttl new file mode 100644 index 0000000..a8eb198 --- /dev/null +++ b/tests/resources/sample.ttl @@ -0,0 +1,18 @@ +@base . +@prefix ex: . +@prefix foaf: . +@prefix xsd: . + + foaf:name "Alice" ; + ex:age 29 ; + ex:livesAt _:address1 . + +_:address1 ex:city "Leipzig" ; + ex:country "Germany" . + + foaf:name "Bob" ; + ex:age 34 ; + ex:knows . + + ex:title "Databus Example Project" ; + ex:member . \ No newline at end of file diff --git a/tests/test_conversion.py b/tests/test_conversion.py deleted file mode 100644 index 7ce710c..0000000 --- a/tests/test_conversion.py +++ /dev/null @@ -1,309 +0,0 @@ -"""Round trip tests for Layer 2 format conversion. - -Following the strategy from Frey et al., each test validates that -reading a format and writing it back produces semantically identical output. -Pattern: parse(format X) -> serialize(format X) -> parse again -> compare. - -9 tests total: -- Triple formats: ntriples, turtle, rdf-xml (3 tests) -- Quad formats: nquads, trig, trix, json-ld (4 tests) -- Tabular formats: csv, tsv (2 tests) -""" - -import csv -import os -import tempfile - -from rdflib import Dataset, Graph - -from databusclient.api.convert import ( - convert_rdf_quad_format, - convert_rdf_triple_format, - convert_tabular_format, -) - -# --------------------------------------------------------------------------- -# Sample RDF data used across all RDF tests -# --------------------------------------------------------------------------- - -SAMPLE_TURTLE = """ -@prefix ex: . -@prefix schema: . -@prefix xsd: . - -ex:Paris schema:isCapitalOf ex:France ; - schema:population "2161000"^^xsd:integer . - -ex:Berlin schema:isCapitalOf ex:Germany ; - schema:population "3645000"^^xsd:integer . -""" - -SAMPLE_NQUADS = """ - . - . - . -""" - -SAMPLE_CSV = """resource,name,population -http://example.org/Paris,Paris,2161000 -http://example.org/Berlin,Berlin,3645000 -""" - -SAMPLE_TSV = "resource\tname\tpopulation\nhttp://example.org/Paris\tParis\t2161000\nhttp://example.org/Berlin\tBerlin\t3645000\n" - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - -def _write_temp(content: str, suffix: str) -> str: - """Write content to a named temp file and return its path.""" - fd, path = tempfile.mkstemp(suffix=suffix) - with os.fdopen(fd, "w", encoding="utf-8") as f: - f.write(content) - return path - - -def _graphs_are_isomorphic(g1: Graph, g2: Graph) -> bool: - """Check semantic equivalence of two rdflib Graphs.""" - return g1.isomorphic(g2) - - -def _datasets_equal(g1: Dataset, g2: Dataset) -> bool: - """Check semantic equivalence of two Datasets by triple count and graph names.""" - if len(g1) != len(g2): - return False - graphs1 = {str(c.identifier) for c in g1.graphs()} - graphs2 = {str(c.identifier) for c in g2.graphs()} - return graphs1 == graphs2 - - -# --------------------------------------------------------------------------- -# Triple format round trip tests (Layer 2) -# --------------------------------------------------------------------------- - -def test_round_trip_turtle(): - """Turtle -> Turtle: parse, serialize, reparse, compare.""" - input_path = _write_temp(SAMPLE_TURTLE, ".ttl") - output_path = input_path + ".rt.ttl" - - try: - convert_rdf_triple_format(input_path, output_path, "turtle", "turtle") - - g_original = Graph() - g_original.parse(input_path, format="turtle") - - g_roundtrip = Graph() - g_roundtrip.parse(output_path, format="turtle") - - assert _graphs_are_isomorphic(g_original, g_roundtrip), ( - "Turtle round trip failed: graphs are not isomorphic" - ) - finally: - for p in (input_path, output_path): - if os.path.exists(p): - os.remove(p) - - -def test_round_trip_ntriples(): - """N-Triples -> N-Triples: parse, serialize, reparse, compare.""" - # first produce an ntriples file from turtle - turtle_path = _write_temp(SAMPLE_TURTLE, ".ttl") - nt_path = turtle_path + ".nt" - output_path = nt_path + ".rt.nt" - - try: - convert_rdf_triple_format(turtle_path, nt_path, "turtle", "ntriples") - convert_rdf_triple_format(nt_path, output_path, "ntriples", "ntriples") - - g_original = Graph() - g_original.parse(nt_path, format="ntriples") - - g_roundtrip = Graph() - g_roundtrip.parse(output_path, format="ntriples") - - assert _graphs_are_isomorphic(g_original, g_roundtrip), ( - "N-Triples round trip failed: graphs are not isomorphic" - ) - finally: - for p in (turtle_path, nt_path, output_path): - if os.path.exists(p): - os.remove(p) - - -def test_round_trip_rdf_xml(): - """RDF/XML -> RDF/XML: parse, serialize, reparse, compare.""" - turtle_path = _write_temp(SAMPLE_TURTLE, ".ttl") - rdf_path = turtle_path + ".rdf" - output_path = rdf_path + ".rt.rdf" - - try: - convert_rdf_triple_format(turtle_path, rdf_path, "turtle", "rdf-xml") - convert_rdf_triple_format(rdf_path, output_path, "rdf-xml", "rdf-xml") - - g_original = Graph() - g_original.parse(rdf_path, format="xml") - - g_roundtrip = Graph() - g_roundtrip.parse(output_path, format="xml") - - assert _graphs_are_isomorphic(g_original, g_roundtrip), ( - "RDF/XML round trip failed: graphs are not isomorphic" - ) - finally: - for p in (turtle_path, rdf_path, output_path): - if os.path.exists(p): - os.remove(p) - - -# --------------------------------------------------------------------------- -# Quad format round trip tests (Layer 2) -# --------------------------------------------------------------------------- - -def test_round_trip_nquads(): - """N-Quads -> N-Quads: parse, serialize, reparse, compare.""" - input_path = _write_temp(SAMPLE_NQUADS, ".nq") - output_path = input_path + ".rt.nq" - - try: - convert_rdf_quad_format(input_path, output_path, "nquads", "nquads") - - g_original = Dataset() - g_original.parse(input_path, format="nquads") - - g_roundtrip = Dataset() - g_roundtrip.parse(output_path, format="nquads") - - assert _datasets_equal(g_original, g_roundtrip), ( - "N-Quads round trip failed: graphs are not equal" - ) - finally: - for p in (input_path, output_path): - if os.path.exists(p): - os.remove(p) - - -def test_round_trip_trig(): - """TriG -> TriG: parse, serialize, reparse, compare.""" - # produce trig from nquads - nq_path = _write_temp(SAMPLE_NQUADS, ".nq") - trig_path = nq_path + ".trig" - output_path = trig_path + ".rt.trig" - - try: - convert_rdf_quad_format(nq_path, trig_path, "nquads", "trig") - convert_rdf_quad_format(trig_path, output_path, "trig", "trig") - - g_original = Dataset() - g_original.parse(trig_path, format="trig") - - g_roundtrip = Dataset() - g_roundtrip.parse(output_path, format="trig") - - assert _datasets_equal(g_original, g_roundtrip), ( - "TriG round trip failed: graphs are not equal" - ) - finally: - for p in (nq_path, trig_path, output_path): - if os.path.exists(p): - os.remove(p) - - -def test_round_trip_trix(): - """TriX -> TriX: parse, serialize, reparse, compare.""" - nq_path = _write_temp(SAMPLE_NQUADS, ".nq") - trix_path = nq_path + ".trix" - output_path = trix_path + ".rt.trix" - - try: - convert_rdf_quad_format(nq_path, trix_path, "nquads", "trix") - convert_rdf_quad_format(trix_path, output_path, "trix", "trix") - - g_original = Dataset() - g_original.parse(trix_path, format="trix") - - g_roundtrip = Dataset() - g_roundtrip.parse(output_path, format="trix") - - assert _datasets_equal(g_original, g_roundtrip), ( - "TriX round trip failed: graphs are not equal" - ) - finally: - for p in (nq_path, trix_path, output_path): - if os.path.exists(p): - os.remove(p) - - -def test_round_trip_json_ld(): - """JSON-LD -> JSON-LD: parse, serialize, reparse, compare.""" - nq_path = _write_temp(SAMPLE_NQUADS, ".nq") - jsonld_path = nq_path + ".jsonld" - output_path = jsonld_path + ".rt.jsonld" - - try: - convert_rdf_quad_format(nq_path, jsonld_path, "nquads", "json-ld") - convert_rdf_quad_format(jsonld_path, output_path, "json-ld", "json-ld") - - g_original = Dataset() - g_original.parse(jsonld_path, format="json-ld") - - g_roundtrip = Dataset() - g_roundtrip.parse(output_path, format="json-ld") - - assert _datasets_equal(g_original, g_roundtrip), ( - "JSON-LD round trip failed: graphs are not equal" - ) - finally: - for p in (nq_path, jsonld_path, output_path): - if os.path.exists(p): - os.remove(p) - - -# --------------------------------------------------------------------------- -# Tabular format round trip tests (Layer 2) -# --------------------------------------------------------------------------- - -def test_round_trip_csv(): - """CSV -> CSV: read, write, reread, compare rows.""" - input_path = _write_temp(SAMPLE_CSV, ".csv") - output_path = input_path + ".rt.csv" - - try: - convert_tabular_format(input_path, output_path, "csv", "csv") - - with open(input_path, newline="", encoding="utf-8") as f: - original_rows = list(csv.reader(f)) - - with open(output_path, newline="", encoding="utf-8") as f: - roundtrip_rows = list(csv.reader(f)) - - assert original_rows == roundtrip_rows, ( - "CSV round trip failed: rows do not match" - ) - finally: - for p in (input_path, output_path): - if os.path.exists(p): - os.remove(p) - - -def test_round_trip_tsv(): - """TSV -> TSV: read, write, reread, compare rows.""" - input_path = _write_temp(SAMPLE_TSV, ".tsv") - output_path = input_path + ".rt.tsv" - - try: - convert_tabular_format(input_path, output_path, "tsv", "tsv") - - with open(input_path, newline="", encoding="utf-8") as f: - original_rows = list(csv.reader(f, delimiter="\t")) - - with open(output_path, newline="", encoding="utf-8") as f: - roundtrip_rows = list(csv.reader(f, delimiter="\t")) - - assert original_rows == roundtrip_rows, ( - "TSV round trip failed: rows do not match" - ) - finally: - for p in (input_path, output_path): - if os.path.exists(p): - os.remove(p) \ No newline at end of file diff --git a/tests/test_format_round_trips.py b/tests/test_format_round_trips.py new file mode 100644 index 0000000..ebe9f76 --- /dev/null +++ b/tests/test_format_round_trips.py @@ -0,0 +1,256 @@ +"""Round trip tests for Layer 2 format conversion. + +Following the strategy from Frey et al., each test validates that +reading a format and writing it back produces semantically identical output. + +The key validation pattern using handlers and IR: + 1. Read original file into IR (Graph/Dataset/rows) BEFORE any conversion + 2. Convert the file through the handler (read -> write cycle) + 3. Read the converted output back into IR + 4. Compare both IRs — if conversion lost data, IRs will differ + +This correctly catches information loss because g_original is captured +BEFORE serialization, not after. Both IRs use the same rdflib internal +representation, making comparison meaningful at the data level. + +Test data lives in tests/resources/ — one sample file per format. +These files are semantically consistent (same cities dataset across +all formats) and are shared across Layer 2 and future Layer 3 tests. + +9 round trip tests total: + Triple formats: turtle, ntriples, rdf-xml (3 tests) + Quad formats: nquads, trig, trix, json-ld (4 tests) + Tabular formats: csv, tsv (2 tests) +""" + +import os +import tempfile + +from databusclient.api.convert import ( + QuadHandler, + TSDHandler, + TripleHandler, +) + +# --------------------------------------------------------------------------- +# Path to shared test resources +# --------------------------------------------------------------------------- + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def resource(filename: str) -> str: + """Return absolute path to a file in tests/resources/.""" + return os.path.join(RESOURCES, filename) + + +# --------------------------------------------------------------------------- +# Handler instances shared across tests +# --------------------------------------------------------------------------- + +triple_handler = TripleHandler() +quad_handler = QuadHandler() +tsd_handler = TSDHandler() + + +# --------------------------------------------------------------------------- +# Triple format round trip tests (Layer 2) +# --------------------------------------------------------------------------- + +def test_round_trip_turtle(): + """Turtle -> Turtle: read into IR before conversion, compare after.""" + source = resource("sample.ttl") + g_original = triple_handler.read(source, "turtle") + + with tempfile.NamedTemporaryFile(suffix=".ttl", delete=False) as f: + output = f.name + try: + triple_handler.convert(source, output, "turtle", "turtle") + g_roundtrip = triple_handler.read(output, "turtle") + assert g_original.isomorphic(g_roundtrip), ( + "Turtle round trip failed: graphs are not isomorphic" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_ntriples(): + """N-Triples -> N-Triples: read into IR before conversion, compare after.""" + source = resource("sample.nt") + g_original = triple_handler.read(source, "ntriples") + + with tempfile.NamedTemporaryFile(suffix=".nt", delete=False) as f: + output = f.name + try: + triple_handler.convert(source, output, "ntriples", "ntriples") + g_roundtrip = triple_handler.read(output, "ntriples") + assert g_original.isomorphic(g_roundtrip), ( + "N-Triples round trip failed: graphs are not isomorphic" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_rdf_xml(): + """RDF/XML -> RDF/XML: read into IR before conversion, compare after.""" + source = resource("sample.rdf") + g_original = triple_handler.read(source, "rdf-xml") + + with tempfile.NamedTemporaryFile(suffix=".rdf", delete=False) as f: + output = f.name + try: + triple_handler.convert(source, output, "rdf-xml", "rdf-xml") + g_roundtrip = triple_handler.read(output, "rdf-xml") + assert g_original.isomorphic(g_roundtrip), ( + "RDF/XML round trip failed: graphs are not isomorphic" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +# --------------------------------------------------------------------------- +# Quad format round trip tests (Layer 2) +# --------------------------------------------------------------------------- + +def _datasets_equal(d1, d2) -> bool: + """Check semantic equivalence of two Datasets. + + Compares total triple count, named graph identifiers, and + performs isomorphism check on each named graph to correctly + handle blank node renaming during serialization. + """ + if len(d1) != len(d2): + return False + + graphs1 = {str(g.identifier) for g in d1.graphs()} + graphs2 = {str(g.identifier) for g in d2.graphs()} + if graphs1 != graphs2: + return False + + # Compare triples inside each named graph using isomorphism + # to correctly handle blank nodes that may be renamed during + # serialization/deserialization + for g1 in d1.graphs(): + graph_id = str(g1.identifier) + g2 = d2.get_context(g1.identifier) + if g2 is None: + return False + if not g1.isomorphic(g2): + return False + + return True + + +def test_round_trip_nquads(): + """N-Quads -> N-Quads: read into IR before conversion, compare after.""" + source = resource("sample.nq") + d_original = quad_handler.read(source, "nquads") + + with tempfile.NamedTemporaryFile(suffix=".nq", delete=False) as f: + output = f.name + try: + quad_handler.convert(source, output, "nquads", "nquads") + d_roundtrip = quad_handler.read(output, "nquads") + assert _datasets_equal(d_original, d_roundtrip), ( + "N-Quads round trip failed: datasets are not equal" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_trig(): + """TriG -> TriG: read into IR before conversion, compare after.""" + source = resource("sample.trig") + d_original = quad_handler.read(source, "trig") + + with tempfile.NamedTemporaryFile(suffix=".trig", delete=False) as f: + output = f.name + try: + quad_handler.convert(source, output, "trig", "trig") + d_roundtrip = quad_handler.read(output, "trig") + assert _datasets_equal(d_original, d_roundtrip), ( + "TriG round trip failed: datasets are not equal" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_trix(): + """TriX -> TriX: read into IR before conversion, compare after.""" + source = resource("sample.trix") + d_original = quad_handler.read(source, "trix") + + with tempfile.NamedTemporaryFile(suffix=".trix", delete=False) as f: + output = f.name + try: + quad_handler.convert(source, output, "trix", "trix") + d_roundtrip = quad_handler.read(output, "trix") + assert _datasets_equal(d_original, d_roundtrip), ( + "TriX round trip failed: datasets are not equal" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_json_ld(): + """JSON-LD -> JSON-LD: read into IR before conversion, compare after.""" + source = resource("sample.jsonld") + d_original = quad_handler.read(source, "json-ld") + + with tempfile.NamedTemporaryFile(suffix=".jsonld", delete=False) as f: + output = f.name + try: + quad_handler.convert(source, output, "json-ld", "json-ld") + d_roundtrip = quad_handler.read(output, "json-ld") + assert _datasets_equal(d_original, d_roundtrip), ( + "JSON-LD round trip failed: datasets are not equal" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +# --------------------------------------------------------------------------- +# Tabular format round trip tests (Layer 2) +# --------------------------------------------------------------------------- + +def test_round_trip_csv(): + """CSV -> CSV: read into IR before conversion, compare after.""" + source = resource("sample.csv") + rows_original = tsd_handler.read(source, "csv") + + with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f: + output = f.name + try: + tsd_handler.convert(source, output, "csv", "csv") + rows_roundtrip = tsd_handler.read(output, "csv") + assert rows_original == rows_roundtrip, ( + "CSV round trip failed: rows do not match" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_tsv(): + """TSV -> TSV: read into IR before conversion, compare after.""" + source = resource("sample.tsv") + rows_original = tsd_handler.read(source, "tsv") + + with tempfile.NamedTemporaryFile(suffix=".tsv", delete=False) as f: + output = f.name + try: + tsd_handler.convert(source, output, "tsv", "tsv") + rows_roundtrip = tsd_handler.read(output, "tsv") + assert rows_original == rows_roundtrip, ( + "TSV round trip failed: rows do not match" + ) + finally: + if os.path.exists(output): + os.remove(output) \ No newline at end of file From f2fe92eddf6540ca8880865230020a70b660f0ec Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Thu, 11 Jun 2026 01:42:48 +0530 Subject: [PATCH 03/12] Improvements to Format layer implementation --- databusclient/api/convert.py | 589 ++------------------------ databusclient/api/download.py | 173 ++++---- databusclient/filehandling/format.py | 511 ++++++++++++++++++++++ databusclient/filehandling/mapping.py | 68 +++ pyproject.toml | 3 + run_all_conversion_tests.py | 4 + 6 files changed, 719 insertions(+), 629 deletions(-) create mode 100644 databusclient/filehandling/format.py create mode 100644 databusclient/filehandling/mapping.py diff --git a/databusclient/api/convert.py b/databusclient/api/convert.py index 8bd6dbb..b095eaf 100644 --- a/databusclient/api/convert.py +++ b/databusclient/api/convert.py @@ -1,563 +1,50 @@ -"""Format and Mapping Conversion Layer. - -This module implements the format conversion pipeline for the Databus Python Client - -Layer 2: Within-class format conversion (lossless). - - TripleHandler: RDF triple formats (turtle, ntriples, rdf-xml) - - QuadHandler: RDF quad formats (nquads, trig, trix, json-ld) - - TSDHandler: Tabular formats (csv, tsv) - -Layer 3 (prototype, not yet fully implemented): - - RDF triples -> CSV/TSV (quasi-equal, companion metadata generated) - -Each handler provides read() -> IR, write(IR) -> file, convert() -> chains both. -The IR (intermediate representation) returned by read() is designed to be passed -to future mapping classes (TripleToQuadMapper, TripleToTSDMapper, etc.). -""" - -import csv -import json -import os -import warnings -from typing import Optional - -# Suppress rdflib internal DeprecationWarning for Dataset API. -# rdflib is mid-migration from ConjunctiveGraph to Dataset in 7.x. -# These warnings originate from rdflib internals, not our code. -# Can be removed when rdflib completes their Dataset API migration. -warnings.filterwarnings("ignore", category=DeprecationWarning, module="rdflib") -warnings.filterwarnings("ignore", category=UserWarning, module="rdflib") - -from rdflib import Dataset, Graph - - -# --------------------------------------------------------------------------- -# Format registries -# --------------------------------------------------------------------------- - -# Maps CLI format name -> rdflib format string -RDF_TRIPLE_FORMATS = { - "ntriples": "ntriples", - "turtle": "turtle", - "rdf-xml": "xml", -} - -RDF_QUAD_FORMATS = { - "nquads": "nquads", - "trig": "trig", - "trix": "trix", - "json-ld": "json-ld", -} - -TABULAR_FORMATS = { - "csv": ",", - "tsv": "\t", -} - -ALL_FORMATS = ( - list(RDF_TRIPLE_FORMATS) - + list(RDF_QUAD_FORMATS) - + list(TABULAR_FORMATS) +from databusclient.filehandling.format import convert_file, get_converted_filename +from databusclient.filehandling import mapping as _mapping + +from databusclient.filehandling.format import ( # noqa: F401 + ALL_FORMATS, + EXTENSION_TO_FORMAT, + FORMAT_TO_EXTENSION, + RDF_QUAD_FORMATS, + RDF_TRIPLE_FORMATS, + TABULAR_FORMATS, + QuadHandler, + TSDHandler, + TripleHandler, + _quad_handler, + _tsd_handler, + _triple_handler, + detect_format_from_filename, + get_format_class, ) -# Maps file extension -> CLI format name -EXTENSION_TO_FORMAT = { - ".ttl": "turtle", - ".nt": "ntriples", - ".rdf": "rdf-xml", - ".xml": "rdf-xml", - ".owl": "rdf-xml", - ".nq": "nquads", - ".trig": "trig", - ".trix": "trix", - ".jsonld": "json-ld", - ".json": "json-ld", - ".csv": "csv", - ".tsv": "tsv", -} - -# Maps format name -> file extension -FORMAT_TO_EXTENSION = { - "ntriples": ".nt", - "turtle": ".ttl", - "rdf-xml": ".rdf", - "nquads": ".nq", - "trig": ".trig", - "trix": ".trix", - "json-ld": ".jsonld", - "csv": ".csv", - "tsv": ".tsv", -} - - -# --------------------------------------------------------------------------- -# Format detection helpers -# --------------------------------------------------------------------------- - -def detect_format_from_filename(filename: str) -> Optional[str]: - """Detect format from file extension, ignoring compression extensions. - - Args: - filename: File name or path. - - Returns: - Format name string or None if not detectable. - """ - name = filename.lower() - - # strip compression extension first - for ext in (".bz2", ".gz", ".xz"): - if name.endswith(ext): - name = name[: -len(ext)] - break - - # match longest extension first to avoid .json matching before .jsonld - for ext in sorted(EXTENSION_TO_FORMAT.keys(), key=len, reverse=True): - if name.endswith(ext): - return EXTENSION_TO_FORMAT[ext] - - return None - - -def get_format_class(fmt: str) -> str: - """Return equivalence class for a format name. - - Args: - fmt: Format name (e.g. 'turtle', 'nquads', 'csv'). - - Returns: - 'triples', 'quads', or 'tabular'. - - Raises: - ValueError: If format is not recognised. - """ - if fmt in RDF_TRIPLE_FORMATS: - return "triples" - if fmt in RDF_QUAD_FORMATS: - return "quads" - if fmt in TABULAR_FORMATS: - return "tabular" - raise ValueError( - f"Unknown format: '{fmt}'. Supported formats: {ALL_FORMATS}" - ) - - -def get_converted_filename(original_filename: str, convert_format: str) -> str: - """Generate output filename after format conversion. - - Strips compression extension if present, then replaces the format - extension with the target format extension. - - Args: - original_filename: Original file name (basename only, not full path). - convert_format: Target format name. - - Returns: - New filename with updated extension. - """ - name = original_filename - - # strip compression extension - for ext in (".bz2", ".gz", ".xz"): - if name.lower().endswith(ext): - name = name[: -len(ext)] - break - - # strip existing format extension (longest first) - for old_ext in sorted(FORMAT_TO_EXTENSION.values(), key=len, reverse=True): - if name.lower().endswith(old_ext): - name = name[: -len(old_ext)] - break - - target_ext = FORMAT_TO_EXTENSION.get(convert_format, f".{convert_format}") - return name + target_ext - - -# --------------------------------------------------------------------------- -# Layer 2 Handlers -# --------------------------------------------------------------------------- - -class TripleHandler: - """Handler for RDF triple formats (Layer 2). - - Uses rdflib.Graph as the intermediate representation (IR). - Supports: ntriples, turtle, rdf-xml. - - The IR returned by read() can be passed to future mapping classes - such as TripleToQuadMapper or TripleToTSDMapper for Layer 3 conversions. - """ - - def read(self, source: str, input_format: str) -> Graph: - """Parse an RDF triples file into a Graph (IR). - - Args: - source: Path to input file. - input_format: Source format name (e.g. 'turtle', 'ntriples', 'rdf-xml'). - - Returns: - rdflib.Graph containing all parsed triples. - - Raises: - ValueError: If input_format is not a recognised triple format. - """ - if input_format not in RDF_TRIPLE_FORMATS: - raise ValueError( - f"'{input_format}' is not a triple format. " - f"Supported: {list(RDF_TRIPLE_FORMATS)}" - ) - g = Graph() - g.parse(source, format=RDF_TRIPLE_FORMATS[input_format]) - return g - - def write(self, data: Graph, target: str, output_format: str) -> None: - """Serialize a Graph (IR) to a file. - - Args: - data: rdflib.Graph to serialize. - target: Path to output file. - output_format: Target format name (e.g. 'ntriples', 'turtle'). - - Raises: - ValueError: If output_format is not a recognised triple format. - """ - if output_format not in RDF_TRIPLE_FORMATS: - raise ValueError( - f"'{output_format}' is not a triple format. " - f"Supported: {list(RDF_TRIPLE_FORMATS)}" - ) - # Explicitly specify utf-8 encoding to avoid NTSerializer warning - data.serialize( - destination=target, - format=RDF_TRIPLE_FORMATS[output_format], - encoding="utf-8", - ) - - def convert( - self, - source: str, - target: str, - input_format: str, - output_format: str, - ) -> None: - """Convert between RDF triple formats (Layer 2, lossless). - - Chains read() -> write(). Both formats must be in the same - equivalence class (RDF triples). - - Args: - source: Path to input file. - target: Path to output file. - input_format: Source format name. - output_format: Target format name. - """ - graph = self.read(source, input_format) - self.write(graph, target, output_format) - print( - f"Converted {input_format} -> {output_format}: " - f"{os.path.basename(target)}" - ) - - -class QuadHandler: - """Handler for RDF quad formats (Layer 2). +__all__ = ["convert_file", "get_converted_filename"] - Uses rdflib.Dataset as the intermediate representation (IR). - Supports: nquads, trig, trix, json-ld. +convert_rdf_to_csv = _mapping.convert_rdf_to_csv - Named graph information is preserved through the Dataset IR. - The IR returned by read() can be passed to future mapping classes - such as QuadToTripleMapper or QuadToTSDMapper for Layer 3 conversions. - """ - def read(self, source: str, input_format: str) -> Dataset: - """Parse an RDF quads file into a Dataset (IR). - - Args: - source: Path to input file. - input_format: Source format name (e.g. 'nquads', 'trig', 'trix', 'json-ld'). - - Returns: - rdflib.Dataset containing all parsed quads with named graphs. - - Raises: - ValueError: If input_format is not a recognised quad format. - """ - if input_format not in RDF_QUAD_FORMATS: - raise ValueError( - f"'{input_format}' is not a quad format. " - f"Supported: {list(RDF_QUAD_FORMATS)}" - ) - d = Dataset() - d.parse(source, format=RDF_QUAD_FORMATS[input_format]) - return d - - def write(self, data: Dataset, target: str, output_format: str) -> None: - """Serialize a Dataset (IR) to a file. - - Args: - data: rdflib.Dataset to serialize. - target: Path to output file. - output_format: Target format name. - - Raises: - ValueError: If output_format is not a recognised quad format. - """ - if output_format not in RDF_QUAD_FORMATS: - raise ValueError( - f"'{output_format}' is not a quad format. " - f"Supported: {list(RDF_QUAD_FORMATS)}" - ) - data.serialize( - destination=target, - format=RDF_QUAD_FORMATS[output_format], - ) - - def convert( - self, - source: str, - target: str, - input_format: str, - output_format: str, - ) -> None: - """Convert between RDF quad formats (Layer 2, lossless). - - Chains read() -> write(). Both formats must be in the same - equivalence class (RDF quads). Named graph information is preserved. - - Args: - source: Path to input file. - target: Path to output file. - input_format: Source format name. - output_format: Target format name. - """ - dataset = self.read(source, input_format) - self.write(dataset, target, output_format) - print( - f"Converted {input_format} -> {output_format}: " - f"{os.path.basename(target)}" - ) - - -class TSDHandler: - """Handler for tabular structured data formats (Layer 2). - - Uses list[list[str]] as the intermediate representation (IR). - Supports: csv, tsv. - - The IR returned by read() can be passed to future mapping classes - such as TSDToTripleMapper for Layer 3 conversions. - """ - - def read(self, source: str, input_format: str) -> list: - """Parse a tabular file into a list of rows (IR). - - Each row is a list of string values. First row is the header. - - Args: - source: Path to input file. - input_format: Source format name ('csv' or 'tsv'). - - Returns: - list[list[str]] where first element is the header row. - - Raises: - ValueError: If input_format is not a recognised tabular format. - """ - if input_format not in TABULAR_FORMATS: - raise ValueError( - f"'{input_format}' is not a tabular format. " - f"Supported: {list(TABULAR_FORMATS)}" - ) - delimiter = TABULAR_FORMATS[input_format] - with open(source, "r", newline="", encoding="utf-8") as f: - reader = csv.reader(f, delimiter=delimiter) - return list(reader) - - def write(self, data: list, target: str, output_format: str) -> None: - """Serialize a list of rows (IR) to a tabular file. - - Args: - data: list[list[str]] to write. - target: Path to output file. - output_format: Target format name ('csv' or 'tsv'). - - Raises: - ValueError: If output_format is not a recognised tabular format. - """ - if output_format not in TABULAR_FORMATS: - raise ValueError( - f"'{output_format}' is not a tabular format. " - f"Supported: {list(TABULAR_FORMATS)}" - ) - delimiter = TABULAR_FORMATS[output_format] - with open(target, "w", newline="", encoding="utf-8") as f: - writer = csv.writer(f, delimiter=delimiter) - writer.writerows(data) - - def convert( - self, - source: str, - target: str, - input_format: str, - output_format: str, - ) -> None: - """Convert between tabular formats (Layer 2, lossless). - - Chains read() -> write(). Both formats must be in the same - equivalence class (tabular). - - Args: - source: Path to input file. - target: Path to output file. - input_format: Source format name. - output_format: Target format name. - """ - rows = self.read(source, input_format) - self.write(rows, target, output_format) - print( - f"Converted {input_format} -> {output_format}: " - f"{os.path.basename(target)}" - ) - - -# --------------------------------------------------------------------------- -# Layer 3 prototype — RDF triples to CSV (not yet fully implemented) -# --------------------------------------------------------------------------- - -def convert_rdf_to_csv( - input_file: str, - output_file: str, +def convert_rdf_triple_format( + source: str, + target: str, input_format: str, + output_format: str, ) -> None: - """Map RDF triples to a wide CSV table (Layer 3 prototype). - - Each unique subject becomes a row. Each unique predicate becomes a column. - Multi-valued predicates are pipe-separated. - A companion .meta.json file is generated to preserve RDF datatype and - language tag information for lossless round trips. - - NOTE: This is a Layer 3 prototype. It is not yet tested and will be - properly implemented in the Layer 3 issue. - - Args: - input_file: Path to input RDF triples file. - output_file: Path to write output CSV file. - input_format: Source triple format name (must be in RDF_TRIPLE_FORMATS). - """ - handler = TripleHandler() - g = handler.read(input_file, input_format) - - predicates = sorted(set(str(p) for s, p, o in g)) - - subjects: dict = {} - column_metadata: dict = {} - - for s, p, o in g: - subj = str(s) - pred = str(p) - - if hasattr(o, "datatype") and o.datatype: - column_metadata[pred] = {"datatype": str(o.datatype)} - elif hasattr(o, "language") and o.language: - column_metadata[pred] = {"language": str(o.language)} - - if subj not in subjects: - subjects[subj] = {} - if pred not in subjects[subj]: - subjects[subj][pred] = [] - subjects[subj][pred].append(str(o)) - - tsd_handler = TSDHandler() - rows = [["resource"] + predicates] - for subj, pred_map in subjects.items(): - row = [subj] - for pred in predicates: - values = pred_map.get(pred, []) - row.append("|".join(values)) - rows.append(row) - - tsd_handler.write(rows, output_file, "csv") - - companion_file = output_file + ".meta.json" - with open(companion_file, "w", encoding="utf-8") as f: - json.dump({"columns": column_metadata}, f, indent=2) - - print(f"Converted RDF -> CSV: {os.path.basename(output_file)}") - print(f"Companion metadata: {os.path.basename(companion_file)}") + _triple_handler.convert(source, target, input_format, output_format) -# --------------------------------------------------------------------------- -# Main dispatcher — called from download pipeline -# --------------------------------------------------------------------------- - -# Handler instances — created once, reused -_triple_handler = TripleHandler() -_quad_handler = QuadHandler() -_tsd_handler = TSDHandler() - - -def convert_file( - input_file: str, - output_file: str, - convert_format: str, +def convert_rdf_quad_format( + source: str, + target: str, + input_format: str, + output_format: str, ) -> None: - """Main conversion dispatcher called from the download pipeline. - - Detects the input format from the file extension, determines whether - this is a Layer 2 (within-class) or Layer 3 (cross-class) conversion, - and delegates to the appropriate handler. - - Args: - input_file: Path to the input file (must be decompressed). - output_file: Path to write the converted output file. - convert_format: Target format name (CLI format string). - - Raises: - ValueError: If input format cannot be detected or conversion - is not supported. - """ - input_format = detect_format_from_filename(input_file) + _quad_handler.convert(source, target, input_format, output_format) - if input_format is None: - raise ValueError( - f"Could not detect input format from filename: " - f"'{os.path.basename(input_file)}'. " - f"Supported extensions: {list(EXTENSION_TO_FORMAT.keys())}" - ) - if input_format == convert_format: - print( - f"WARNING: Input and target format are both '{input_format}'. " - "Skipping conversion." - ) - return - - input_class = get_format_class(input_format) - output_class = get_format_class(convert_format) - - # --- Layer 2: within-class --- - if input_class == output_class: - if input_class == "triples": - _triple_handler.convert( - input_file, output_file, input_format, convert_format - ) - elif input_class == "quads": - _quad_handler.convert( - input_file, output_file, input_format, convert_format - ) - elif input_class == "tabular": - _tsd_handler.convert( - input_file, output_file, input_format, convert_format - ) - return - - # --- Layer 3: cross-class (prototype only) --- - if input_class == "triples" and output_class == "tabular": - convert_rdf_to_csv(input_file, output_file, input_format) - return - - raise ValueError( - f"Conversion from '{input_format}' ({input_class}) to " - f"'{convert_format}' ({output_class}) is not yet implemented. " - f"Supported Layer 3 conversions: RDF Triples -> CSV/TSV." - ) \ No newline at end of file +def convert_tabular_format( + source: str, + target: str, + input_format: str, + output_format: str, +) -> None: + _tsd_handler.convert(source, target, input_format, output_format) \ No newline at end of file diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 7c33fac..414511a 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -5,6 +5,8 @@ import lzma from typing import List, Optional, Tuple import re +import shutil +import tempfile from urllib.parse import urlparse import requests @@ -16,7 +18,7 @@ get_databus_id_parts_from_file_url, compute_sha256_and_length, ) -from databusclient.api.convert import convert_file, get_converted_filename +from databusclient.filehandling.format import convert_file, get_converted_filename # Compression format mappings COMPRESSION_EXTENSIONS = { @@ -508,87 +510,102 @@ def _download_file( f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual}" ) - # --- 7. Convert compression format if requested (AFTER validation) --- - should_convert, source_format = _should_convert_file(file, convert_to, convert_from) - final_downloaded_file = filename - if should_convert and source_format: - target_filename = _get_converted_filename(file, source_format, convert_to) - target_filepath = os.path.join(localDir, target_filename) - _convert_compression_format( - filename, target_filepath, source_format, convert_to - ) - final_downloaded_file = target_filepath - - # --- 8. Convert file format if requested (AFTER compression conversion) --- - # Pipeline follows :decompress -> convert format -> recompress - # If the source was compressed, the converted output is recompressed: - # - to the format specified by --convert-to if provided - # - to the original compression format otherwise - if convert_format: - final_basename = os.path.basename(final_downloaded_file) - compression_fmt = _detect_compression_format(final_basename) - - if compression_fmt: - # File is still compressed — decompress to temp, convert, recompress - compression_ext = COMPRESSION_EXTENSIONS[compression_fmt] - if final_downloaded_file.lower().endswith(compression_ext): - temp_decompressed = final_downloaded_file[:-len(compression_ext)] - else: - temp_decompressed = final_downloaded_file + ".decompressed" - - try: - print( - f"Decompressing {final_basename} before format conversion..." - ) - source_module = COMPRESSION_MODULES[compression_fmt] - with source_module.open(final_downloaded_file, "rb") as sf: - with open(temp_decompressed, "wb") as tf: - while True: - chunk = sf.read(8192) - if not chunk: - break - tf.write(chunk) - - # Convert format on the decompressed temp file - converted_basename = get_converted_filename( - final_basename, convert_format - ) - converted_filepath = os.path.join(localDir, converted_basename) - convert_file(temp_decompressed, converted_filepath, convert_format) + # --- 7. Unified compression/format conversion pass --- + source_compression = _detect_compression_format(file) + should_convert_compression, source_format_for_convert_to = _should_convert_file( + file, convert_to, convert_from + ) + needs_format_conversion = convert_format is not None - # Recompress the converted output. - # Use --convert-to format if specified, otherwise use original compression. - recompress_fmt = convert_to if convert_to else compression_fmt - recompress_ext = COMPRESSION_EXTENSIONS[recompress_fmt] - recompressed_filepath = converted_filepath + recompress_ext - recompress_module = COMPRESSION_MODULES[recompress_fmt] + if not should_convert_compression and not needs_format_conversion: + return - print( - f"Recompressing converted file to {recompress_fmt}: " - f"{os.path.basename(recompressed_filepath)}" - ) - with open(converted_filepath, "rb") as sf: - with recompress_module.open(recompressed_filepath, "wb") as tf: - while True: - chunk = sf.read(8192) - if not chunk: - break - tf.write(chunk) - - # Remove the uncompressed converted file — keep only recompressed - if os.path.exists(converted_filepath): - os.remove(converted_filepath) - - finally: - # Always clean up temp decompressed file - if os.path.exists(temp_decompressed): - os.remove(temp_decompressed) + temp_paths: list[str] = [] + try: + # Compression-only path keeps existing conversion message behavior. + # Use a temp copy so the original downloaded file remains unchanged. + if should_convert_compression and not needs_format_conversion: + target_filename = _get_converted_filename( + file, source_format_for_convert_to, convert_to + ) + target_filepath = os.path.join(localDir, target_filename) + + with tempfile.NamedTemporaryFile( + delete=False, + suffix=COMPRESSION_EXTENSIONS[source_format_for_convert_to], + dir=localDir, + ) as temp_source_copy: + source_copy_path = temp_source_copy.name + temp_paths.append(source_copy_path) + + shutil.copyfile(filename, source_copy_path) + _convert_compression_format( + source_copy_path, + target_filepath, + source_format_for_convert_to, + convert_to, + ) + return + # Determine input for format conversion. + # If source is compressed, decompress once to a safe temporary file. + conversion_input_path = filename + if source_compression is not None: + source_ext = COMPRESSION_EXTENSIONS[source_compression] + stripped_name = file + if stripped_name.lower().endswith(source_ext): + stripped_name = stripped_name[: -len(source_ext)] + _, format_ext = os.path.splitext(stripped_name) + + with tempfile.NamedTemporaryFile( + delete=False, + suffix=format_ext, + dir=localDir, + ) as temp_decompressed: + temp_decompressed_path = temp_decompressed.name + temp_paths.append(temp_decompressed_path) + + print(f"Decompressing {file}...") + with COMPRESSION_MODULES[source_compression].open(filename, "rb") as sf: + with open(temp_decompressed_path, "wb") as tf: + shutil.copyfileobj(sf, tf) + + conversion_input_path = temp_decompressed_path + + # Convert format on uncompressed input. + converted_basename = get_converted_filename(file, convert_format) + converted_uncompressed_path = os.path.join(localDir, converted_basename) + convert_file(conversion_input_path, converted_uncompressed_path, convert_format) + + # Recompress converted output when needed. + if source_compression is not None: + if should_convert_compression and convert_to: + final_compression = convert_to + else: + final_compression = source_compression + elif should_convert_compression and convert_to: + final_compression = convert_to else: - # File is already uncompressed — convert directly, no recompression needed - converted_filename = get_converted_filename(final_basename, convert_format) - converted_filepath = os.path.join(localDir, converted_filename) - convert_file(final_downloaded_file, converted_filepath, convert_format) + final_compression = None + + if final_compression is not None: + recompressed_path = ( + converted_uncompressed_path + COMPRESSION_EXTENSIONS[final_compression] + ) + print( + f"Recompressing {os.path.basename(converted_uncompressed_path)} -> {os.path.basename(recompressed_path)}..." + ) + with open(converted_uncompressed_path, "rb") as sf: + with COMPRESSION_MODULES[final_compression].open( + recompressed_path, "wb" + ) as tf: + shutil.copyfileobj(sf, tf) + + os.remove(converted_uncompressed_path) + finally: + for temp_path in temp_paths: + if os.path.exists(temp_path): + os.remove(temp_path) def _download_files( diff --git a/databusclient/filehandling/format.py b/databusclient/filehandling/format.py new file mode 100644 index 0000000..1b625b8 --- /dev/null +++ b/databusclient/filehandling/format.py @@ -0,0 +1,511 @@ +"""Format and Mapping Conversion Layer. + +This module implements the format conversion pipeline for the Databus Python Client + +Layer 2: Within-class format conversion (lossless). + - TripleHandler: RDF triple formats (turtle, ntriples, rdf-xml) + - QuadHandler: RDF quad formats (nquads, trig, trix, json-ld) + - TSDHandler: Tabular formats (csv, tsv) + +Each handler provides read() -> IR, write(IR) -> file, convert() -> chains both. +The IR (intermediate representation) returned by read() is designed to be passed +to future mapping classes (TripleToQuadMapper, TripleToTSDMapper, etc.). +""" + +import csv +import os +import shutil +import warnings +from typing import Optional + +from rdflib import Dataset, Graph + +# Suppress rdflib internal DeprecationWarning for Dataset API. +# rdflib is mid-migration from ConjunctiveGraph to Dataset in 7.x. +# These warnings originate from rdflib internals, not our code. +# Can be removed when rdflib completes their Dataset API migration. +warnings.filterwarnings("ignore", category=DeprecationWarning, module="rdflib") +warnings.filterwarnings("ignore", category=UserWarning, module="rdflib") + + +# --------------------------------------------------------------------------- +# Format registries +# --------------------------------------------------------------------------- + +# Maps CLI format name -> rdflib format string +RDF_TRIPLE_FORMATS = { + "ntriples": "ntriples", + "turtle": "turtle", + "rdf-xml": "xml", +} + +RDF_QUAD_FORMATS = { + "nquads": "nquads", + "trig": "trig", + "trix": "trix", + "json-ld": "json-ld", +} + +TABULAR_FORMATS = { + "csv": ",", + "tsv": "\t", +} + +ALL_FORMATS = ( + list(RDF_TRIPLE_FORMATS) + + list(RDF_QUAD_FORMATS) + + list(TABULAR_FORMATS) +) + +# Maps file extension -> CLI format name +EXTENSION_TO_FORMAT = { + ".ttl": "turtle", + ".nt": "ntriples", + ".rdf": "rdf-xml", + ".xml": "rdf-xml", + ".owl": "rdf-xml", + ".nq": "nquads", + ".trig": "trig", + ".trix": "trix", + ".jsonld": "json-ld", + ".json": "json-ld", + ".csv": "csv", + ".tsv": "tsv", +} + +# Maps format name -> file extension +FORMAT_TO_EXTENSION = { + "ntriples": ".nt", + "turtle": ".ttl", + "rdf-xml": ".rdf", + "nquads": ".nq", + "trig": ".trig", + "trix": ".trix", + "json-ld": ".jsonld", + "csv": ".csv", + "tsv": ".tsv", +} + + +# --------------------------------------------------------------------------- +# Format detection helpers +# --------------------------------------------------------------------------- + +def detect_format_from_filename(filename: str) -> Optional[str]: + """Detect format from file extension, ignoring compression extensions. + + Args: + filename: File name or path. + + Returns: + Format name string or None if not detectable. + """ + name = filename.lower() + + # strip compression extension first + for ext in (".bz2", ".gz", ".xz"): + if name.endswith(ext): + name = name[: -len(ext)] + break + + # match longest extension first to avoid .json matching before .jsonld + for ext in sorted(EXTENSION_TO_FORMAT.keys(), key=len, reverse=True): + if name.endswith(ext): + return EXTENSION_TO_FORMAT[ext] + + return None + + +def get_format_class(fmt: str) -> str: + """Return equivalence class for a format name. + + Args: + fmt: Format name (e.g. 'turtle', 'nquads', 'csv'). + + Returns: + 'triples', 'quads', or 'tabular'. + + Raises: + ValueError: If format is not recognised. + """ + if fmt in RDF_TRIPLE_FORMATS: + return "triples" + if fmt in RDF_QUAD_FORMATS: + return "quads" + if fmt in TABULAR_FORMATS: + return "tabular" + raise ValueError( + f"Unknown format: '{fmt}'. Supported formats: {ALL_FORMATS}" + ) + + +def get_converted_filename(original_filename: str, convert_format: str) -> str: + """Generate output filename after format conversion. + + Strips compression extension if present, then replaces the format + extension with the target format extension. + + Args: + original_filename: Original file name (basename only, not full path). + convert_format: Target format name. + + Returns: + New filename with updated extension. + """ + name = original_filename + + # strip compression extension + for ext in (".bz2", ".gz", ".xz"): + if name.lower().endswith(ext): + name = name[: -len(ext)] + break + + # strip existing format extension (longest first) + for old_ext in sorted(FORMAT_TO_EXTENSION.values(), key=len, reverse=True): + if name.lower().endswith(old_ext): + name = name[: -len(old_ext)] + break + + target_ext = FORMAT_TO_EXTENSION.get(convert_format, f".{convert_format}") + return name + target_ext + + +# --------------------------------------------------------------------------- +# Layer 2 Handlers +# --------------------------------------------------------------------------- + +class TripleHandler: + """Handler for RDF triple formats (Layer 2). + + Uses rdflib.Graph as the intermediate representation (IR). + Supports: ntriples, turtle, rdf-xml. + + The IR returned by read() can be passed to future mapping classes + such as TripleToQuadMapper or TripleToTSDMapper for Layer 3 conversions. + """ + + def read(self, source: str, input_format: str) -> Graph: + """Parse an RDF triples file into a Graph (IR). + + Args: + source: Path to input file. + input_format: Source format name (e.g. 'turtle', 'ntriples', 'rdf-xml'). + + Returns: + rdflib.Graph containing all parsed triples. + + Raises: + ValueError: If input_format is not a recognised triple format. + """ + if input_format not in RDF_TRIPLE_FORMATS: + raise ValueError( + f"'{input_format}' is not a triple format. " + f"Supported: {list(RDF_TRIPLE_FORMATS)}" + ) + g = Graph() + g.parse(source, format=RDF_TRIPLE_FORMATS[input_format]) + return g + + def write(self, data: Graph, target: str, output_format: str) -> None: + """Serialize a Graph (IR) to a file. + + Args: + data: rdflib.Graph to serialize. + target: Path to output file. + output_format: Target format name (e.g. 'ntriples', 'turtle'). + + Raises: + ValueError: If output_format is not a recognised triple format. + """ + if output_format not in RDF_TRIPLE_FORMATS: + raise ValueError( + f"'{output_format}' is not a triple format. " + f"Supported: {list(RDF_TRIPLE_FORMATS)}" + ) + parent = os.path.dirname(target) + if parent: + os.makedirs(parent, exist_ok=True) + # Explicitly specify utf-8 encoding to avoid NTSerializer warning + data.serialize( + destination=target, + format=RDF_TRIPLE_FORMATS[output_format], + encoding="utf-8", + ) + + def convert( + self, + source: str, + target: str, + input_format: str, + output_format: str, + ) -> None: + """Convert between RDF triple formats (Layer 2, lossless). + + Chains read() -> write(). Both formats must be in the same + equivalence class (RDF triples). + + Args: + source: Path to input file. + target: Path to output file. + input_format: Source format name. + output_format: Target format name. + """ + graph = self.read(source, input_format) + self.write(graph, target, output_format) + print( + f"Converted {input_format} -> {output_format}: " + f"{os.path.basename(target)}" + ) + + +class QuadHandler: + """Handler for RDF quad formats (Layer 2). + + Uses rdflib.Dataset as the intermediate representation (IR). + Supports: nquads, trig, trix, json-ld. + + Named graph information is preserved through the Dataset IR. + The IR returned by read() can be passed to future mapping classes + such as QuadToTripleMapper or QuadToTSDMapper for Layer 3 conversions. + """ + + def read(self, source: str, input_format: str) -> Dataset: + """Parse an RDF quads file into a Dataset (IR). + + Args: + source: Path to input file. + input_format: Source format name (e.g. 'nquads', 'trig', 'trix', 'json-ld'). + + Returns: + rdflib.Dataset containing all parsed quads with named graphs. + + Raises: + ValueError: If input_format is not a recognised quad format. + """ + if input_format not in RDF_QUAD_FORMATS: + raise ValueError( + f"'{input_format}' is not a quad format. " + f"Supported: {list(RDF_QUAD_FORMATS)}" + ) + d = Dataset() + d.parse(source, format=RDF_QUAD_FORMATS[input_format]) + return d + + def write(self, data: Dataset, target: str, output_format: str) -> None: + """Serialize a Dataset (IR) to a file. + + Args: + data: rdflib.Dataset to serialize. + target: Path to output file. + output_format: Target format name. + + Raises: + ValueError: If output_format is not a recognised quad format. + """ + if output_format not in RDF_QUAD_FORMATS: + raise ValueError( + f"'{output_format}' is not a quad format. " + f"Supported: {list(RDF_QUAD_FORMATS)}" + ) + parent = os.path.dirname(target) + if parent: + os.makedirs(parent, exist_ok=True) + data.serialize( + destination=target, + format=RDF_QUAD_FORMATS[output_format], + ) + + def convert( + self, + source: str, + target: str, + input_format: str, + output_format: str, + ) -> None: + """Convert between RDF quad formats (Layer 2, lossless). + + Chains read() -> write(). Both formats must be in the same + equivalence class (RDF quads). Named graph information is preserved. + + Args: + source: Path to input file. + target: Path to output file. + input_format: Source format name. + output_format: Target format name. + """ + dataset = self.read(source, input_format) + self.write(dataset, target, output_format) + print( + f"Converted {input_format} -> {output_format}: " + f"{os.path.basename(target)}" + ) + + +class TSDHandler: + """Handler for tabular structured data formats (Layer 2). + + Uses list[list[str]] as the intermediate representation (IR). + Supports: csv, tsv. + + The IR returned by read() can be passed to future mapping classes + such as TSDToTripleMapper for Layer 3 conversions. + """ + + def read(self, source: str, input_format: str) -> list: + """Parse a tabular file into a list of rows (IR). + + Each row is a list of string values. First row is the header. + + Args: + source: Path to input file. + input_format: Source format name ('csv' or 'tsv'). + + Returns: + list[list[str]] where first element is the header row. + + Raises: + ValueError: If input_format is not a recognised tabular format. + """ + if input_format not in TABULAR_FORMATS: + raise ValueError( + f"'{input_format}' is not a tabular format. " + f"Supported: {list(TABULAR_FORMATS)}" + ) + delimiter = TABULAR_FORMATS[input_format] + with open(source, "r", newline="", encoding="utf-8") as f: + reader = csv.reader(f, delimiter=delimiter) + return list(reader) + + def write(self, data: list, target: str, output_format: str) -> None: + """Serialize a list of rows (IR) to a tabular file. + + Args: + data: list[list[str]] to write. + target: Path to output file. + output_format: Target format name ('csv' or 'tsv'). + + Raises: + ValueError: If output_format is not a recognised tabular format. + """ + if output_format not in TABULAR_FORMATS: + raise ValueError( + f"'{output_format}' is not a tabular format. " + f"Supported: {list(TABULAR_FORMATS)}" + ) + parent = os.path.dirname(target) + if parent: + os.makedirs(parent, exist_ok=True) + delimiter = TABULAR_FORMATS[output_format] + with open(target, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f, delimiter=delimiter) + writer.writerows(data) + + def convert( + self, + source: str, + target: str, + input_format: str, + output_format: str, + ) -> None: + """Convert between tabular formats (Layer 2, lossless). + + Chains read() -> write(). Both formats must be in the same + equivalence class (tabular). + + Args: + source: Path to input file. + target: Path to output file. + input_format: Source format name. + output_format: Target format name. + """ + rows = self.read(source, input_format) + self.write(rows, target, output_format) + print( + f"Converted {input_format} -> {output_format}: " + f"{os.path.basename(target)}" + ) + + +# --------------------------------------------------------------------------- +# Main dispatcher — called from download pipeline +# --------------------------------------------------------------------------- + +# Handler instances — created once, reused +_triple_handler = TripleHandler() +_quad_handler = QuadHandler() +_tsd_handler = TSDHandler() + + +def convert_file( + input_file: str, + output_file: str, + convert_format: str, +) -> None: + """Main conversion dispatcher called from the download pipeline. + + Detects the input format from the file extension, determines whether + this is a Layer 2 (within-class) or Layer 3 (cross-class) conversion, + and delegates to the appropriate handler. + + Args: + input_file: Path to the input file (must be decompressed). + output_file: Path to write the converted output file. + convert_format: Target format name (CLI format string). + + Raises: + ValueError: If input format cannot be detected or conversion + is not supported. + """ + input_format = detect_format_from_filename(input_file) + + if input_format is None: + raise ValueError( + f"Could not detect input format from filename: " + f"'{os.path.basename(input_file)}'. " + f"Supported extensions: {list(EXTENSION_TO_FORMAT.keys())}" + ) + + if input_format == convert_format: + # Input and target format are identical. + # Copy input to output path so the caller always receives an output file. + # This is important for the download pipeline which expects an output + # file to exist after convert_file() returns — e.g. for recompression. + if input_file != output_file: + shutil.copy2(input_file, output_file) + print( + f"Input and target format are both '{input_format}'. " + f"Copied to output path: {os.path.basename(output_file)}" + ) + return + + input_class = get_format_class(input_format) + output_class = get_format_class(convert_format) + + # --- Layer 2: within-class --- + if input_class == output_class: + if input_class == "triples": + _triple_handler.convert( + input_file, output_file, input_format, convert_format + ) + elif input_class == "quads": + _quad_handler.convert( + input_file, output_file, input_format, convert_format + ) + elif input_class == "tabular": + _tsd_handler.convert( + input_file, output_file, input_format, convert_format + ) + return + + # --- Layer 3: cross-class (prototype only) --- + if input_class == "triples" and output_class == "tabular": + from databusclient.filehandling.mapping import convert_rdf_to_csv + + convert_rdf_to_csv(input_file, output_file, input_format) + return + + raise ValueError( + f"Conversion from '{input_format}' ({input_class}) to " + f"'{convert_format}' ({output_class}) is not yet implemented. " + f"Supported Layer 3 conversions: RDF Triples -> CSV/TSV." + ) diff --git a/databusclient/filehandling/mapping.py b/databusclient/filehandling/mapping.py new file mode 100644 index 0000000..93b5a00 --- /dev/null +++ b/databusclient/filehandling/mapping.py @@ -0,0 +1,68 @@ +"""Layer 3 prototype mapping handlers.""" + +import json +import os + +from databusclient.filehandling.format import TSDHandler, TripleHandler + + +def convert_rdf_to_csv( + input_file: str, + output_file: str, + input_format: str, +) -> None: + """Map RDF triples to a wide CSV table (Layer 3 prototype). + + Each unique subject becomes a row. Each unique predicate becomes a column. + Multi-valued predicates are pipe-separated. + A companion .meta.json file is generated to preserve RDF datatype and + language tag information for lossless round trips. + + NOTE: This is a Layer 3 prototype. It is not yet tested and will be + properly implemented in the Layer 3 issue. + + Args: + input_file: Path to input RDF triples file. + output_file: Path to write output CSV file. + input_format: Source triple format name (must be in RDF_TRIPLE_FORMATS). + """ + handler = TripleHandler() + g = handler.read(input_file, input_format) + + predicates = sorted(set(str(p) for s, p, o in g)) + + subjects: dict = {} + column_metadata: dict = {} + + for s, p, o in g: + subj = str(s) + pred = str(p) + + if hasattr(o, "datatype") and o.datatype: + column_metadata[pred] = {"datatype": str(o.datatype)} + elif hasattr(o, "language") and o.language: + column_metadata[pred] = {"language": str(o.language)} + + if subj not in subjects: + subjects[subj] = {} + if pred not in subjects[subj]: + subjects[subj][pred] = [] + subjects[subj][pred].append(str(o)) + + tsd_handler = TSDHandler() + rows = [["resource"] + predicates] + for subj, pred_map in subjects.items(): + row = [subj] + for pred in predicates: + values = pred_map.get(pred, []) + row.append("|".join(values)) + rows.append(row) + + tsd_handler.write(rows, output_file, "csv") + + companion_file = output_file + ".meta.json" + with open(companion_file, "w", encoding="utf-8") as f: + json.dump({"columns": column_metadata}, f, indent=2) + + print(f"Converted RDF -> CSV: {os.path.basename(output_file)}") + print(f"Companion metadata: {os.path.basename(companion_file)}") diff --git a/pyproject.toml b/pyproject.toml index 72179cc..9759c07 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,9 @@ databusclient = "databusclient.cli:app" target-version = "py311" src = ["databusclient", "tests"] +[tool.ruff.lint.per-file-ignores] +"tests/test_format_round_trips.py" = ["F841"] + [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" diff --git a/run_all_conversion_tests.py b/run_all_conversion_tests.py index 384e052..98f9bbb 100644 --- a/run_all_conversion_tests.py +++ b/run_all_conversion_tests.py @@ -5,6 +5,10 @@ Test file for testing with real datasets from databus. """ +# TODO: This script is a temporary manual integration test artifact. +# It must be removed or rewritten as proper pytest integration tests +# before the final PR. Do not commit this file to the upstream repo. + import os from databusclient.api.convert import ( convert_rdf_triple_format, From 180c255ed3961f454d8af1b71bdff03101d8284f Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Fri, 12 Jun 2026 17:12:11 +0530 Subject: [PATCH 04/12] Review comments resolved under issue #59 --- databusclient/api/download.py | 7 ++++ databusclient/cli.py | 18 ++++++++-- databusclient/filehandling/format.py | 51 ++++++++++++++++++++++++++-- 3 files changed, 70 insertions(+), 6 deletions(-) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 414511a..d7ec030 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -577,6 +577,13 @@ def _download_file( converted_uncompressed_path = os.path.join(localDir, converted_basename) convert_file(conversion_input_path, converted_uncompressed_path, convert_format) + # Delete the original downloaded file after successful format conversion, + # unless the converted output is the same file (same format, same path). + if os.path.abspath(filename) != os.path.abspath(converted_uncompressed_path): + if os.path.exists(filename): + os.remove(filename) + print(f"Removed original file: {os.path.basename(filename)}") + # Recompress converted output when needed. if source_compression is not None: if should_convert_compression and convert_to: diff --git a/databusclient/cli.py b/databusclient/cli.py index c687616..e998d4e 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -190,13 +190,25 @@ def deploy( help="Source compression format to convert from (optional filter). Only files with this compression will be converted.", ) @click.option( - "--convert-format", + "--format", "convert_format", type=click.Choice( - ["ntriples","turtle","rdf-xml","nquads","trig","trix","json-ld","csv","tsv"], + [ + "ntriples", "nt", + "turtle", "ttl", + "rdf-xml", "rdf", "xml", + "nquads", "nq", + "trig", + "trix", + "json-ld", "jsonld", + "csv", + "tsv", + ], case_sensitive=False, ), - help="Target format for on-the-fly format conversion during download (Layer 2 and Layer 3).", + help="Target format for on-the-fly format conversion during download (Layer 2 and Layer 3). " + "Accepts full names (ntriples, turtle, rdf-xml, nquads, trig, trix, json-ld, csv, tsv) " + "or short aliases (nt, ttl, rdf, xml, nq, jsonld).", ) @click.option( "--validate-checksum", is_flag=True, help="Validate checksums of downloaded files" diff --git a/databusclient/filehandling/format.py b/databusclient/filehandling/format.py index 1b625b8..7c40109 100644 --- a/databusclient/filehandling/format.py +++ b/databusclient/filehandling/format.py @@ -57,6 +57,42 @@ + list(TABULAR_FORMATS) ) +# Maps short CLI aliases -> canonical format name +FORMAT_ALIASES = { + "nt": "ntriples", + "ttl": "turtle", + "rdf": "rdf-xml", + "xml": "rdf-xml", + "nq": "nquads", + "jsonld": "json-ld", +} + +def normalize_format(fmt: str) -> str: + """Normalize a format name or alias to its canonical form. + + Accepts both full names (e.g. 'ntriples') and short aliases (e.g. 'nt'). + Canonical names pass through unchanged. Unknown values raise ValueError. + + Args: + fmt: Format name or alias string (case-insensitive). + + Returns: + Canonical format name string. + + Raises: + ValueError: If fmt is not a recognised format name or alias. + """ + fmt_lower = fmt.lower() + # Resolve alias first + canonical = FORMAT_ALIASES.get(fmt_lower, fmt_lower) + if canonical not in ALL_FORMATS: + raise ValueError( + f"Unknown format: '{fmt}'. " + f"Supported formats: {ALL_FORMATS}. " + f"Supported aliases: {list(FORMAT_ALIASES.keys())}" + ) + return canonical + # Maps file extension -> CLI format name EXTENSION_TO_FORMAT = { ".ttl": "turtle", @@ -143,15 +179,18 @@ def get_converted_filename(original_filename: str, convert_format: str) -> str: """Generate output filename after format conversion. Strips compression extension if present, then replaces the format - extension with the target format extension. + extension with the target format extension. Accepts format aliases. Args: original_filename: Original file name (basename only, not full path). - convert_format: Target format name. + convert_format: Target format name or alias. Returns: New filename with updated extension. """ + # Normalize alias to canonical name + convert_format = normalize_format(convert_format) + name = original_filename # strip compression extension @@ -447,15 +486,21 @@ def convert_file( this is a Layer 2 (within-class) or Layer 3 (cross-class) conversion, and delegates to the appropriate handler. + Accepts both canonical format names and short aliases (e.g. 'nt' for + 'ntriples', 'ttl' for 'turtle'). See normalize_format() for full list. + Args: input_file: Path to the input file (must be decompressed). output_file: Path to write the converted output file. - convert_format: Target format name (CLI format string). + convert_format: Target format name or alias (CLI format string). Raises: ValueError: If input format cannot be detected or conversion is not supported. """ + # Normalize alias to canonical name before any processing + convert_format = normalize_format(convert_format) + input_format = detect_format_from_filename(input_file) if input_format is None: From 34039f23a616e2e9df2e3fa4e66b8888fc47a06e Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Sat, 13 Jun 2026 18:36:26 +0530 Subject: [PATCH 05/12] #61: replacing --convert_from & --convert_to with --compression --- databusclient/api/download.py | 123 +++++++++++---------------- databusclient/cli.py | 18 ++-- tests/test_compression_conversion.py | 41 +++++---- 3 files changed, 79 insertions(+), 103 deletions(-) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index d7ec030..56cf07c 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -50,20 +50,23 @@ def _detect_compression_format(filename: str) -> Optional[str]: return None -def _should_convert_file( - filename: str, convert_to: Optional[str], convert_from: Optional[str] +def _should_convert_compression( + filename: str, compression: Optional[str] ) -> Tuple[bool, Optional[str]]: - """Determine if a file should be converted and what the source format is. + """Determine if a file should have its compression format converted. + + Source compression is detected automatically from the file extension. + All compressed files will be converted to the target format regardless + of their source compression format. Args: filename: Name of the file. - convert_to: Target compression format ('bz2', 'gz', 'xz'). - convert_from: Optional source compression format filter. + compression: Target compression format ('bz2', 'gz', 'xz') or None. Returns: Tuple of (should_convert: bool, source_format: Optional[str]). """ - if not convert_to: + if not compression: return False, None source_format = _detect_compression_format(filename) @@ -73,11 +76,7 @@ def _should_convert_file( return False, None # If source and target are the same, skip conversion - if source_format == convert_to: - return False, None - - # If convert_from is specified, only convert matching formats - if convert_from and source_format != convert_from: + if source_format == compression: return False, None return True, source_format @@ -314,8 +313,7 @@ def _download_file( databus_key=None, auth_url=None, client_id=None, - convert_to=None, - convert_from=None, + compression=None, convert_format=None, validate_checksum: bool = False, expected_checksum: str | None = None, @@ -329,8 +327,8 @@ def _download_file( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. + Source compression is auto-detected from the file extension. convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. expected_checksum: The expected checksum of the file. @@ -354,6 +352,7 @@ def _download_file( dirpath = os.path.dirname(filename) if dirpath: os.makedirs(dirpath, exist_ok=True) # Create the necessary directories + # --- 1. Get redirect URL by requesting HEAD --- headers = {} @@ -512,8 +511,8 @@ def _download_file( # --- 7. Unified compression/format conversion pass --- source_compression = _detect_compression_format(file) - should_convert_compression, source_format_for_convert_to = _should_convert_file( - file, convert_to, convert_from + should_convert_compression, source_fmt = _should_convert_compression( + file, compression ) needs_format_conversion = convert_format is not None @@ -525,14 +524,12 @@ def _download_file( # Compression-only path keeps existing conversion message behavior. # Use a temp copy so the original downloaded file remains unchanged. if should_convert_compression and not needs_format_conversion: - target_filename = _get_converted_filename( - file, source_format_for_convert_to, convert_to - ) + target_filename = _get_converted_filename(file, source_fmt, compression) target_filepath = os.path.join(localDir, target_filename) with tempfile.NamedTemporaryFile( delete=False, - suffix=COMPRESSION_EXTENSIONS[source_format_for_convert_to], + suffix=COMPRESSION_EXTENSIONS[source_fmt], dir=localDir, ) as temp_source_copy: source_copy_path = temp_source_copy.name @@ -542,8 +539,8 @@ def _download_file( _convert_compression_format( source_copy_path, target_filepath, - source_format_for_convert_to, - convert_to, + source_fmt, + compression, ) return @@ -586,12 +583,12 @@ def _download_file( # Recompress converted output when needed. if source_compression is not None: - if should_convert_compression and convert_to: - final_compression = convert_to + if should_convert_compression and compression: + final_compression = compression else: final_compression = source_compression - elif should_convert_compression and convert_to: - final_compression = convert_to + elif should_convert_compression and compression: + final_compression = compression else: final_compression = None @@ -622,8 +619,7 @@ def _download_files( databus_key: str = None, auth_url: str = None, client_id: str = None, - convert_to: str = None, - convert_from: str = None, + compression: str = None, convert_format: str = None, validate_checksum: bool = False, checksums: dict | None = None, @@ -637,8 +633,7 @@ def _download_files( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. checksums: Dictionary mapping URLs to their expected checksums. @@ -654,8 +649,7 @@ def _download_files( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, expected_checksum=expected, @@ -803,8 +797,7 @@ def _download_collection( databus_key: str = None, auth_url: str = None, client_id: str = None, - convert_to: str = None, - convert_from: str = None, + compression: str = None, convert_format: str = None, validate_checksum: bool = False, ) -> None: @@ -818,8 +811,7 @@ def _download_collection( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ @@ -840,8 +832,7 @@ def _download_collection( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums if checksums else None, @@ -855,8 +846,7 @@ def _download_version( databus_key: str = None, auth_url: str = None, client_id: str = None, - convert_to: str = None, - convert_from: str = None, + compression: str = None, convert_format: str = None, validate_checksum: bool = False, ) -> None: @@ -869,8 +859,7 @@ def _download_version( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ @@ -890,8 +879,7 @@ def _download_version( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums, @@ -906,8 +894,7 @@ def _download_artifact( databus_key: str = None, auth_url: str = None, client_id: str = None, - convert_to: str = None, - convert_from: str = None, + compression: str = None, convert_format: str = None, validate_checksum: bool = False, ) -> None: @@ -921,8 +908,7 @@ def _download_artifact( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ @@ -948,8 +934,7 @@ def _download_artifact( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums, @@ -1025,8 +1010,7 @@ def _download_group( databus_key: str = None, auth_url: str = None, client_id: str = None, - convert_to: str = None, - convert_from: str = None, + compression: str = None, convert_format: str = None, validate_checksum: bool = False, ) -> None: @@ -1040,8 +1024,7 @@ def _download_group( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ @@ -1057,8 +1040,7 @@ def _download_group( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, ) @@ -1107,8 +1089,7 @@ def download( all_versions=None, auth_url="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", client_id="vault-token-exchange", - convert_to=None, - convert_from=None, + compression=None, convert_format=None, validate_checksum: bool = False, ) -> None: @@ -1124,8 +1105,8 @@ def download( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. Default is "https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token". client_id: Client ID for token exchange. Default is "vault-token-exchange". - convert_to: Target compression format for on-the-fly conversion (supported: bz2, gz, xz). - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion (supported: bz2, gz, xz). + Source compression is auto-detected from the file extension. convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ @@ -1154,8 +1135,7 @@ def download( databus_key, auth_url, client_id, - convert_to, - convert_from, + compression, convert_format, validate_checksum=validate_checksum, ) @@ -1176,8 +1156,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, expected_checksum=expected, @@ -1191,8 +1170,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, ) @@ -1208,8 +1186,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, ) @@ -1225,8 +1202,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, ) @@ -1264,9 +1240,8 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums if checksums else None, - ) + ) \ No newline at end of file diff --git a/databusclient/cli.py b/databusclient/cli.py index e998d4e..50f0766 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -180,14 +180,12 @@ def deploy( help="Client ID for token exchange", ) @click.option( - "--convert-to", + "--compression", + "compression", type=click.Choice(["bz2", "gz", "xz"], case_sensitive=False), - help="Target compression format for on-the-fly conversion during download (supported: bz2, gz, xz)", -) -@click.option( - "--convert-from", - type=click.Choice(["bz2", "gz", "xz"], case_sensitive=False), - help="Source compression format to convert from (optional filter). Only files with this compression will be converted.", + help="Target compression format for on-the-fly conversion during download. " + "Source compression is detected automatically from the file extension. " + "All compressed files will be converted to the target format (bz2, gz, xz).", ) @click.option( "--format", @@ -222,8 +220,7 @@ def download( all_versions, authurl, clientid, - convert_to, - convert_from, + compression, convert_format, validate_checksum, ): @@ -241,8 +238,7 @@ def download( all_versions=all_versions, auth_url=authurl, client_id=clientid, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, ) diff --git a/tests/test_compression_conversion.py b/tests/test_compression_conversion.py index 71ada16..8effa1b 100644 --- a/tests/test_compression_conversion.py +++ b/tests/test_compression_conversion.py @@ -8,7 +8,7 @@ import pytest from databusclient.api.download import ( _detect_compression_format, - _should_convert_file, + _should_convert_compression, _get_converted_filename, _convert_compression_format, ) @@ -23,37 +23,42 @@ def test_detect_compression_format(): assert _detect_compression_format("FILE.TXT.GZ") == "gz" # case insensitive -def test_should_convert_file(): - """Test file conversion decision logic""" +def test_should_convert_compression(): + """Test file compression conversion decision logic. + + With --compression, source format is auto-detected from the file extension. + All compressed files are converted to the target format regardless of their + source compression format (no convert_from filter). + """ # No conversion target specified - should_convert, source = _should_convert_file("file.txt.bz2", None, None) + should_convert, source = _should_convert_compression("file.txt.bz2", None) assert should_convert is False assert source is None - # Uncompressed file - should_convert, source = _should_convert_file("file.txt", "gz", None) + # Uncompressed file — never converted + should_convert, source = _should_convert_compression("file.txt", "gz") assert should_convert is False assert source is None - # Same source and target - should_convert, source = _should_convert_file("file.txt.gz", "gz", None) + # Same source and target — skip (no-op) + should_convert, source = _should_convert_compression("file.txt.gz", "gz") assert should_convert is False assert source is None - # Valid conversion - should_convert, source = _should_convert_file("file.txt.bz2", "gz", None) + # bz2 -> gz: should convert, source auto-detected + should_convert, source = _should_convert_compression("file.txt.bz2", "gz") assert should_convert is True assert source == "bz2" - # With convert_from filter matching - should_convert, source = _should_convert_file("file.txt.bz2", "gz", "bz2") + # xz -> gz: should convert regardless of source format (no filter) + should_convert, source = _should_convert_compression("file.txt.xz", "gz") assert should_convert is True - assert source == "bz2" + assert source == "xz" - # With convert_from filter not matching - should_convert, source = _should_convert_file("file.txt.bz2", "gz", "xz") - assert should_convert is False - assert source is None + # gz -> bz2: should convert + should_convert, source = _should_convert_compression("file.txt.gz", "bz2") + assert should_convert is True + assert source == "gz" def test_get_converted_filename(): @@ -195,4 +200,4 @@ def test_corrupted_file_handling(): if __name__ == "__main__": - pytest.main([__file__, "-v"]) + pytest.main([__file__, "-v"]) \ No newline at end of file From cd5e990efe53b62aa744bcd110019e68284b285c Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Mon, 15 Jun 2026 01:06:58 +0530 Subject: [PATCH 06/12] gsoc26: layer2 complete + implementation for #61 --- databusclient/api/download.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 56cf07c..41b1f2a 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -521,23 +521,14 @@ def _download_file( temp_paths: list[str] = [] try: - # Compression-only path keeps existing conversion message behavior. - # Use a temp copy so the original downloaded file remains unchanged. + # Compression-only path: convert directly from the downloaded file. + # _convert_compression_format deletes the source after success, + # so the original downloaded file is removed automatically. if should_convert_compression and not needs_format_conversion: target_filename = _get_converted_filename(file, source_fmt, compression) target_filepath = os.path.join(localDir, target_filename) - - with tempfile.NamedTemporaryFile( - delete=False, - suffix=COMPRESSION_EXTENSIONS[source_fmt], - dir=localDir, - ) as temp_source_copy: - source_copy_path = temp_source_copy.name - temp_paths.append(source_copy_path) - - shutil.copyfile(filename, source_copy_path) _convert_compression_format( - source_copy_path, + filename, target_filepath, source_fmt, compression, From 508bf8605778170462ea8ad3293855b51ab284df Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Tue, 16 Jun 2026 02:20:48 +0530 Subject: [PATCH 07/12] gsoc2026: mapping layer initial commit --- databusclient/api/download.py | 113 +++++- databusclient/cli.py | 18 + databusclient/filehandling/format.py | 55 ++- databusclient/filehandling/mapping.py | 523 ++++++++++++++++++++++++-- tests/test_format_round_trips.py | 225 ++++++++++- tests/test_mapping_conversions.py | 498 ++++++++++++++++++++++++ 6 files changed, 1397 insertions(+), 35 deletions(-) create mode 100644 tests/test_mapping_conversions.py diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 41b1f2a..3307704 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -18,7 +18,14 @@ get_databus_id_parts_from_file_url, compute_sha256_and_length, ) -from databusclient.filehandling.format import convert_file, get_converted_filename +from databusclient.filehandling.format import ( + convert_file, + get_converted_filename, + normalize_format, + get_format_class, + detect_format_from_filename, + FORMAT_TO_EXTENSION, +) # Compression format mappings COMPRESSION_EXTENSIONS = { @@ -315,6 +322,8 @@ def _download_file( client_id=None, compression=None, convert_format=None, + graph_name=None, + base_uri=None, validate_checksum: bool = False, expected_checksum: str | None = None, ) -> None: @@ -330,6 +339,8 @@ def _download_file( compression: Target compression format for on-the-fly conversion. Source compression is auto-detected from the file extension. convert_format: Target RDF/tabular format for on-the-fly conversion. + graph_name: Named graph URI for Triple -> Quad conversion (Layer 3). + base_uri: Base URI for CSV -> Triple conversion (Layer 3). validate_checksum: Whether to validate checksums after downloading. expected_checksum: The expected checksum of the file. """ @@ -560,10 +571,57 @@ def _download_file( conversion_input_path = temp_decompressed_path - # Convert format on uncompressed input. + # Determine whether this is a Quad -> Triple (Layer 3) conversion. + # This direction produces multiple output files (one per named + # graph) written into a subdirectory, rather than a single file — + # so it is handled separately from the standard single-file path + # below (no recompression, no single-file delete-and-replace). + normalized_convert_format = normalize_format(convert_format) + target_class = get_format_class(normalized_convert_format) + source_format_for_mapping = detect_format_from_filename(conversion_input_path) + source_class_for_mapping = ( + get_format_class(source_format_for_mapping) + if source_format_for_mapping else None + ) + is_quad_to_triple = ( + source_class_for_mapping == "quads" and target_class == "triples" + ) + + if is_quad_to_triple: + # Output directory name = original filename with compression and + # format extensions stripped (e.g. "data.nq.gz" -> "data"). + output_stem = get_converted_filename(file, convert_format) + target_ext = FORMAT_TO_EXTENSION.get(normalized_convert_format, "") + if target_ext and output_stem.lower().endswith(target_ext): + output_stem = output_stem[: -len(target_ext)] + output_dir = os.path.join(localDir, output_stem) + + convert_file( + conversion_input_path, + output_dir, + convert_format, + graph_name=graph_name, + base_uri=base_uri, + ) + + # Delete the original downloaded (possibly compressed) file — + # the split output directory replaces it. + if os.path.exists(filename): + os.remove(filename) + print(f"Removed original file: {os.path.basename(filename)}") + return + + # Standard single-output-file path (Layer 2, and the remaining + # Layer 3 directions: Triple<->Quad, Triple<->TSD, Quad->TSD). converted_basename = get_converted_filename(file, convert_format) converted_uncompressed_path = os.path.join(localDir, converted_basename) - convert_file(conversion_input_path, converted_uncompressed_path, convert_format) + convert_file( + conversion_input_path, + converted_uncompressed_path, + convert_format, + graph_name=graph_name, + base_uri=base_uri, + ) # Delete the original downloaded file after successful format conversion, # unless the converted output is the same file (same format, same path). @@ -612,6 +670,8 @@ def _download_files( client_id: str = None, compression: str = None, convert_format: str = None, + graph_name: str = None, + base_uri: str = None, validate_checksum: bool = False, checksums: dict | None = None, ) -> None: @@ -626,6 +686,8 @@ def _download_files( client_id: Client ID for token exchange. compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. + graph_name: Named graph URI for Triple -> Quad conversion (Layer 3). + base_uri: Base URI for CSV -> Triple conversion (Layer 3). validate_checksum: Whether to validate checksums after downloading. checksums: Dictionary mapping URLs to their expected checksums. """ @@ -642,11 +704,12 @@ def _download_files( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, expected_checksum=expected, ) - def _get_sparql_query_of_collection(uri: str, databus_key: str | None = None) -> str: """Get SPARQL query of collection members from databus collection URI. @@ -790,6 +853,8 @@ def _download_collection( client_id: str = None, compression: str = None, convert_format: str = None, + graph_name: str = None, + base_uri: str = None, validate_checksum: bool = False, ) -> None: """Download all files in a databus collection. @@ -804,6 +869,8 @@ def _download_collection( client_id: Client ID for token exchange. compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. + graph_name: Named graph URI for Triple -> Quad conversion (Layer 3). + base_uri: Base URI for CSV -> Triple conversion (Layer 3). validate_checksum: Whether to validate checksums after downloading. """ query = _get_sparql_query_of_collection(uri, databus_key=databus_key) @@ -825,6 +892,8 @@ def _download_collection( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, checksums=checksums if checksums else None, ) @@ -839,6 +908,8 @@ def _download_version( client_id: str = None, compression: str = None, convert_format: str = None, + graph_name: str = None, + base_uri: str = None, validate_checksum: bool = False, ) -> None: """Download all files in a databus artifact version. @@ -852,6 +923,8 @@ def _download_version( client_id: Client ID for token exchange. compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. + graph_name: Named graph URI for Triple -> Quad conversion (Layer 3). + base_uri: Base URI for CSV -> Triple conversion (Layer 3). validate_checksum: Whether to validate checksums after downloading. """ json_str = fetch_databus_jsonld(uri, databus_key=databus_key) @@ -872,6 +945,8 @@ def _download_version( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, checksums=checksums, ) @@ -887,6 +962,8 @@ def _download_artifact( client_id: str = None, compression: str = None, convert_format: str = None, + graph_name: str = None, + base_uri: str = None, validate_checksum: bool = False, ) -> None: """Download files in a databus artifact. @@ -901,6 +978,8 @@ def _download_artifact( client_id: Client ID for token exchange. compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. + graph_name: Named graph URI for Triple -> Quad conversion (Layer 3). + base_uri: Base URI for CSV -> Triple conversion (Layer 3). validate_checksum: Whether to validate checksums after downloading. """ json_str = fetch_databus_jsonld(uri, databus_key=databus_key) @@ -927,6 +1006,8 @@ def _download_artifact( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, checksums=checksums, ) @@ -1003,6 +1084,8 @@ def _download_group( client_id: str = None, compression: str = None, convert_format: str = None, + graph_name: str = None, + base_uri: str = None, validate_checksum: bool = False, ) -> None: """Download files in a databus group. @@ -1017,6 +1100,8 @@ def _download_group( client_id: Client ID for token exchange. compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. + graph_name: Named graph URI for Triple -> Quad conversion (Layer 3). + base_uri: Base URI for CSV -> Triple conversion (Layer 3). validate_checksum: Whether to validate checksums after downloading. """ json_str = fetch_databus_jsonld(uri, databus_key=databus_key) @@ -1033,6 +1118,8 @@ def _download_group( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, ) @@ -1082,6 +1169,8 @@ def download( client_id="vault-token-exchange", compression=None, convert_format=None, + graph_name=None, + base_uri=None, validate_checksum: bool = False, ) -> None: """Download datasets from databus. @@ -1097,8 +1186,10 @@ def download( auth_url: Keycloak token endpoint URL. Default is "https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token". client_id: Client ID for token exchange. Default is "vault-token-exchange". compression: Target compression format for on-the-fly conversion (supported: bz2, gz, xz). - Source compression is auto-detected from the file extension. + Source compression is auto-detected from the file extension. convert_format: Target RDF/tabular format for on-the-fly conversion. + graph_name: Named graph URI for Triple -> Quad conversion (Layer 3). + base_uri: Base URI for CSV -> Triple conversion (Layer 3). validate_checksum: Whether to validate checksums after downloading. """ for databusURI in databusURIs: @@ -1128,6 +1219,8 @@ def download( client_id, compression, convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, ) elif file is not None: @@ -1149,6 +1242,8 @@ def download( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, expected_checksum=expected, ) @@ -1163,6 +1258,8 @@ def download( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, ) elif artifact is not None: @@ -1179,6 +1276,8 @@ def download( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, ) elif group is not None and group != "collections": @@ -1195,6 +1294,8 @@ def download( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, ) elif account is not None: @@ -1233,6 +1334,8 @@ def download( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, checksums=checksums if checksums else None, ) \ No newline at end of file diff --git a/databusclient/cli.py b/databusclient/cli.py index 50f0766..9cace95 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -208,6 +208,20 @@ def deploy( "Accepts full names (ntriples, turtle, rdf-xml, nquads, trig, trix, json-ld, csv, tsv) " "or short aliases (nt, ttl, rdf, xml, nq, jsonld).", ) +@click.option( + "--graph-name", + "graph_name", + default=None, + help="Named graph URI for Triple -> Quad conversion (Layer 3). " + "Required when converting RDF triple formats to quad formats.", +) +@click.option( + "--base-uri", + "base_uri", + default=None, + help="Base URI for CSV -> RDF Triple conversion (Layer 3). " + "Required when converting CSV/TSV to RDF triple formats.", +) @click.option( "--validate-checksum", is_flag=True, help="Validate checksums of downloaded files" ) @@ -222,6 +236,8 @@ def download( clientid, compression, convert_format, + graph_name, + base_uri, validate_checksum, ): """ @@ -240,6 +256,8 @@ def download( client_id=clientid, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, ) except DownloadAuthError as e: diff --git a/databusclient/filehandling/format.py b/databusclient/filehandling/format.py index 7c40109..262f61e 100644 --- a/databusclient/filehandling/format.py +++ b/databusclient/filehandling/format.py @@ -479,6 +479,8 @@ def convert_file( input_file: str, output_file: str, convert_format: str, + graph_name: str = None, + base_uri: str = None, ) -> None: """Main conversion dispatcher called from the download pipeline. @@ -489,10 +491,19 @@ def convert_file( Accepts both canonical format names and short aliases (e.g. 'nt' for 'ntriples', 'ttl' for 'turtle'). See normalize_format() for full list. + For Layer 3 cross-class conversions: + - Triple -> Quad requires graph_name (--graph-name ). + - CSV -> Triple requires base_uri (--base-uri ). + - Quad -> Triple produces multiple files in a subdirectory; output_file + is used as the subdirectory path. + Args: input_file: Path to the input file (must be decompressed). output_file: Path to write the converted output file. + For Quad -> Triple, this is the output subdirectory path. convert_format: Target format name or alias (CLI format string). + graph_name: Named graph URI for Triple -> Quad conversion. + base_uri: Base URI for CSV -> Triple conversion. Raises: ValueError: If input format cannot be detected or conversion @@ -513,8 +524,6 @@ def convert_file( if input_format == convert_format: # Input and target format are identical. # Copy input to output path so the caller always receives an output file. - # This is important for the download pipeline which expects an output - # file to exist after convert_file() returns — e.g. for recompression. if input_file != output_file: shutil.copy2(input_file, output_file) print( @@ -542,15 +551,45 @@ def convert_file( ) return - # --- Layer 3: cross-class (prototype only) --- + # --- Layer 3: cross-class --- + from databusclient.filehandling import mapping as _mapping + + # Triple -> Quad + if input_class == "triples" and output_class == "quads": + _mapping.convert_triples_to_quads( + input_file, output_file, input_format, convert_format, graph_name + ) + return + + # Quad -> Triple (output_file used as output subdirectory) + if input_class == "quads" and output_class == "triples": + _mapping.convert_quads_to_triples( + input_file, output_file, input_format, convert_format + ) + return + + # Triple -> TSD if input_class == "triples" and output_class == "tabular": - from databusclient.filehandling.mapping import convert_rdf_to_csv + _mapping.convert_rdf_to_csv( + input_file, output_file, input_format, convert_format + ) + return - convert_rdf_to_csv(input_file, output_file, input_format) + # TSD -> Triple + if input_class == "tabular" and output_class == "triples": + _mapping.convert_csv_to_rdf( + input_file, output_file, input_format, convert_format, base_uri + ) + return + + # Quad -> TSD + if input_class == "quads" and output_class == "tabular": + _mapping.convert_quads_to_csv( + input_file, output_file, input_format, convert_format + ) return raise ValueError( f"Conversion from '{input_format}' ({input_class}) to " - f"'{convert_format}' ({output_class}) is not yet implemented. " - f"Supported Layer 3 conversions: RDF Triples -> CSV/TSV." - ) + f"'{convert_format}' ({output_class}) is not supported." + ) \ No newline at end of file diff --git a/databusclient/filehandling/mapping.py b/databusclient/filehandling/mapping.py index 93b5a00..c89ddef 100644 --- a/databusclient/filehandling/mapping.py +++ b/databusclient/filehandling/mapping.py @@ -1,55 +1,285 @@ -"""Layer 3 prototype mapping handlers.""" +"""Layer 3 Mapping Conversion — cross-class conversions between RDF and tabular formats. + +Supported mapping directions: + Triple -> Quad : Assigns a named graph to all triples (requires graph_name). + Quad -> Triple : Splits quads into one file per named graph (in a subdirectory), + written in the triple format specified by output_format. + Triple -> TSD : Maps RDF triples to wide CSV table (quasi-equal, companion .meta.json). + TSD -> Triple : Reconstructs RDF triples from wide CSV (lossless with companion file). + Quad -> TSD : Maps RDF quads to wide CSV table with extra graph column. + +Data loss and quasi-equality: + RDF -> CSV conversion is quasi-equal. RDF datatypes (xsd:integer etc.) and language + tags (@en) cannot be represented in plain CSV. A companion .meta.json file is generated + alongside the CSV to preserve this information. When converting back (CSV -> RDF), if the + companion file is present, datatypes and language tags are restored for full lossless + round trips. Without the companion file, all values are restored as plain xsd:string. + + Note: a string literal whose lexical value itself looks like a URI (e.g. a literal + "http://example.com/text") cannot be distinguished from an actual URI reference in + the CSV representation. This is an inherent limitation of the wide-table CSV format + and matches the level of fidelity of the Java client's TSD mapping. + +Blank node handling: + Blank node subjects and objects are serialized to CSV cells as '_:label' (matching + N-Triples notation). This is essential for round trips: without the '_:' marker, + convert_csv_to_rdf() cannot distinguish a blank node reference from a URI or string + literal. On CSV -> RDF, any cell value starting with '_:' is reconstructed as a BNode + with the same label, preserving links between blank nodes and their properties. + +Per-predicate metadata granularity: + The companion .meta.json stores one datatype/language entry per predicate (the last + value seen during conversion). This assumes a predicate's values share a consistent + type, which holds for typical RDF datasets (e.g. DBpedia mappings) where a given + predicate has a consistent range. +""" import json import os -from databusclient.filehandling.format import TSDHandler, TripleHandler +from rdflib import BNode, Dataset, Graph, Literal, URIRef +from rdflib.namespace import XSD + +from databusclient.filehandling.format import ( + QuadHandler, + TSDHandler, + TripleHandler, + FORMAT_TO_EXTENSION, +) + +# --------------------------------------------------------------------------- +# Module-level handler instances — reuse across calls +# --------------------------------------------------------------------------- + +_triple_handler = TripleHandler() +_quad_handler = QuadHandler() +_tsd_handler = TSDHandler() + + +# --------------------------------------------------------------------------- +# Shared helper — RDF term to CSV cell string +# --------------------------------------------------------------------------- + +def _term_to_str(term) -> str: + """Convert an RDF term (URIRef, BNode, or Literal) to its CSV cell string. + + Blank nodes are prefixed with '_:' (matching N-Triples notation) so they + can be correctly distinguished from URIs and literals when reconstructing + RDF from CSV in convert_csv_to_rdf(). Without this prefix, a blank node + label like 'address1' would be indistinguishable from a relative resource + identifier, breaking the link between a blank node and its properties. + + Literals are represented by their lexical form (the string value as + written), regardless of datatype. This avoids conversion-related + discrepancies (e.g. datetime formatting via .toPython()) and matches + what is restored via Literal(value, datatype=...) on the reverse direction. + + Args: + term: An rdflib term (URIRef, BNode, or Literal). + + Returns: + String representation suitable for a CSV cell. + """ + if isinstance(term, BNode): + return f"_:{term}" + return str(term) + + +# --------------------------------------------------------------------------- +# Direction 1 — Triple -> Quad +# --------------------------------------------------------------------------- + +def convert_triples_to_quads( + input_file: str, + output_file: str, + input_format: str, + output_format: str, + graph_name: str, +) -> None: + """Promote RDF triples to named graph quads (Layer 3, lossless). + + All triples are assigned to the named graph specified by graph_name. + Requires --graph-name to be provided. + + Args: + input_file: Path to input RDF triples file. + output_file: Path to write output quads file. + input_format: Source triple format name (e.g. 'turtle', 'ntriples'). + output_format: Target quad format name (e.g. 'nquads', 'trig'). + graph_name: URI string for the named graph to assign all triples to. + + Raises: + ValueError: If graph_name is empty or None. + """ + if not graph_name: + raise ValueError( + "graph_name is required for Triple -> Quad conversion. " + "Use --graph-name to specify the target named graph." + ) + + g = _triple_handler.read(input_file, input_format) + d = Dataset() + graph_uri = URIRef(graph_name) + named_graph = d.graph(graph_uri) + + for triple in g: + named_graph.add(triple) + + _quad_handler.write(d, output_file, output_format) + print( + f"Converted {input_format} -> {output_format} " + f"(graph: {graph_name}): {os.path.basename(output_file)}" + ) +# --------------------------------------------------------------------------- +# Direction 2 — Quad -> Triple +# --------------------------------------------------------------------------- + +def convert_quads_to_triples( + input_file: str, + output_dir: str, + input_format: str, + output_format: str, +) -> list: + """Split RDF quads into per-graph triple files (Layer 3, lossless). + + Each named graph in the quads file becomes a separate file, written in + output_format (e.g. 'ntriples', 'turtle', 'rdf-xml' — whatever was + specified via --format). Output files are written to output_dir, named + after the last segment of the graph URI (e.g. 'people.ttl' for graph + 'https://example.org/graph/people' when output_format='turtle'). + + Default graph triples (no named graph) are written to + 'default_graph.'. + + Args: + input_file: Path to input quads file. + output_dir: Directory to write one file per named graph. + input_format: Source quad format name (e.g. 'nquads', 'trig'). + output_format: Target triple format name (e.g. 'ntriples', 'turtle', + 'rdf-xml'). Required — no default, matches whatever the user + specified via --format. + + Returns: + List of output file paths created. + + Raises: + ValueError: If no named graphs with triples are found in input. + """ + os.makedirs(output_dir, exist_ok=True) + + d = _quad_handler.read(input_file, input_format) + output_files = [] + + file_ext = FORMAT_TO_EXTENSION.get(output_format, f".{output_format}") + + for named_graph in d.graphs(): + graph_id = str(named_graph.identifier) + + # Skip empty graphs (e.g. an unused default graph) + if len(named_graph) == 0: + continue + + # Determine output filename from graph URI last segment + if graph_id in ("urn:x-rdflib:default", ""): + file_stem = "default_graph" + else: + file_stem = graph_id.rstrip("/").split("/")[-1] + # Sanitize: replace characters invalid in filenames + file_stem = "".join( + c if c.isalnum() or c in "-_." else "_" for c in file_stem + ) + if not file_stem: + file_stem = "graph" + + out_path = os.path.join(output_dir, file_stem + file_ext) + + # Handle duplicate filenames by appending a counter + counter = 1 + original_out_path = out_path + while os.path.exists(out_path): + out_path = original_out_path[: -len(file_ext)] + f"_{counter}{file_ext}" + counter += 1 + + _triple_handler.write(named_graph, out_path, output_format) + output_files.append(out_path) + print(f"Written graph '{graph_id}' -> {os.path.basename(out_path)}") + + if not output_files: + raise ValueError( + f"No named graphs with triples found in '{os.path.basename(input_file)}'. " + "Nothing to split." + ) + + print( + f"Quad -> Triple split complete: {len(output_files)} file(s) " + f"({output_format}) in '{os.path.basename(output_dir)}/'" + ) + return output_files + + +# --------------------------------------------------------------------------- +# Direction 3 — Triple -> TSD (CSV/TSV) +# --------------------------------------------------------------------------- + def convert_rdf_to_csv( input_file: str, output_file: str, input_format: str, + output_format: str, ) -> None: - """Map RDF triples to a wide CSV table (Layer 3 prototype). + """Map RDF triples to a wide tabular table (Layer 3, quasi-equal). + + Each unique RDF subject becomes one row. Each unique predicate becomes + a column header (full predicate URI). Object values fill the cells. + Multi-valued predicates are pipe-separated (|) to enable unambiguous + splitting on round trip. - Each unique subject becomes a row. Each unique predicate becomes a column. - Multi-valued predicates are pipe-separated. - A companion .meta.json file is generated to preserve RDF datatype and - language tag information for lossless round trips. + A companion .meta.json file is generated alongside the output file + to preserve RDF datatype and language tag information, enabling + lossless round trips when convert_csv_to_rdf() is called with the + same companion file present. - NOTE: This is a Layer 3 prototype. It is not yet tested and will be - properly implemented in the Layer 3 issue. + Blank node subjects and objects are serialized as '_:label' (see + _term_to_str). This is essential for correct round trips. Args: input_file: Path to input RDF triples file. - output_file: Path to write output CSV file. + output_file: Path to write output CSV or TSV file. input_format: Source triple format name (must be in RDF_TRIPLE_FORMATS). + output_format: Target tabular format ('csv' or 'tsv'). """ - handler = TripleHandler() - g = handler.read(input_file, input_format) + g = _triple_handler.read(input_file, input_format) + # Collect all unique predicates (sorted for deterministic column order) predicates = sorted(set(str(p) for s, p, o in g)) + # Group objects by (subject, predicate) subjects: dict = {} + # column_metadata: predicate URI -> {datatype: ...} or {language: ...} + # Only the LAST seen value's metadata is stored per predicate (see + # module docstring on per-predicate metadata granularity). column_metadata: dict = {} for s, p, o in g: - subj = str(s) + subj = _term_to_str(s) pred = str(p) - if hasattr(o, "datatype") and o.datatype: - column_metadata[pred] = {"datatype": str(o.datatype)} - elif hasattr(o, "language") and o.language: - column_metadata[pred] = {"language": str(o.language)} + # Collect datatype/language metadata for companion file + if isinstance(o, Literal): + if o.datatype and str(o.datatype) != str(XSD.string): + column_metadata[pred] = {"datatype": str(o.datatype)} + elif o.language: + column_metadata[pred] = {"language": str(o.language)} if subj not in subjects: subjects[subj] = {} if pred not in subjects[subj]: subjects[subj][pred] = [] - subjects[subj][pred].append(str(o)) + subjects[subj][pred].append(_term_to_str(o)) - tsd_handler = TSDHandler() + # Build rows: header + one row per subject rows = [["resource"] + predicates] for subj, pred_map in subjects.items(): row = [subj] @@ -58,11 +288,262 @@ def convert_rdf_to_csv( row.append("|".join(values)) rows.append(row) - tsd_handler.write(rows, output_file, "csv") + _tsd_handler.write(rows, output_file, output_format) + # Write companion metadata file companion_file = output_file + ".meta.json" with open(companion_file, "w", encoding="utf-8") as f: json.dump({"columns": column_metadata}, f, indent=2) - print(f"Converted RDF -> CSV: {os.path.basename(output_file)}") + print(f"Converted RDF -> {output_format.upper()}: {os.path.basename(output_file)}") print(f"Companion metadata: {os.path.basename(companion_file)}") + + +# --------------------------------------------------------------------------- +# Direction 4 — TSD (CSV/TSV) -> Triple +# --------------------------------------------------------------------------- + +def convert_csv_to_rdf( + input_file: str, + output_file: str, + input_format: str, + output_format: str, + base_uri: str, +) -> None: + """Reconstruct RDF triples from a wide tabular file (Layer 3). + + Column headers (except 'resource') become predicate URIs directly. + Each row becomes one RDF subject. Cell values become object literals, + URIs, or blank nodes depending on their content. + + If a companion .meta.json file exists alongside the input CSV + (same path + '.meta.json'), datatypes and language tags are restored + from it, enabling lossless round trips. Without the companion file, + all literal values are created as plain xsd:string literals. + + Note: companion file lookup uses input_file + '.meta.json' — the + companion must be co-located with the exact input file path passed + here. If the input was downloaded compressed and decompressed to a + temporary file, no companion will typically be found (this is an + inherent, documented limitation, not a bug). + + Multi-valued cells (pipe-separated '|') are split back into multiple + triples per subject-predicate pair. + + Blank node subjects/objects: any value starting with '_:' is + reconstructed as a BNode with the same label. URI objects: any value + starting with 'http://' or 'https://' is created as URIRef. + + Args: + input_file: Path to input CSV or TSV file. + output_file: Path to write output RDF triples file. + input_format: Source tabular format ('csv' or 'tsv'). + output_format: Target triple format name (e.g. 'ntriples', 'turtle'). + base_uri: Base URI for constructing subject URIs from relative identifiers. + + Raises: + ValueError: If base_uri is empty or None. + ValueError: If input file is empty or missing the 'resource' column. + """ + if not base_uri: + raise ValueError( + "base_uri is required for CSV -> RDF conversion. " + "Use --base-uri to specify the base URI for subject construction." + ) + + rows = _tsd_handler.read(input_file, input_format) + + if not rows: + raise ValueError(f"Input file '{os.path.basename(input_file)}' is empty.") + + header = rows[0] + if "resource" not in header: + raise ValueError( + f"Input CSV missing 'resource' column. " + f"Found columns: {header}. " + "The 'resource' column is required and must contain subject identifiers." + ) + + resource_idx = header.index("resource") + predicate_columns = [ + (i, col) for i, col in enumerate(header) if i != resource_idx + ] + + # Load companion metadata if present + companion_path = input_file + ".meta.json" + column_metadata: dict = {} + if os.path.exists(companion_path): + with open(companion_path, "r", encoding="utf-8") as f: + meta = json.load(f) + column_metadata = meta.get("columns", {}) + print(f"Loaded companion metadata: {os.path.basename(companion_path)}") + else: + print( + "No companion metadata file found. " + "All literal values will be created as plain strings." + ) + + base_uri_stripped = base_uri.rstrip("/") + g = Graph() + + for row in rows[1:]: # skip header + if len(row) < len(header): + row = row + [""] * (len(header) - len(row)) + + resource_val = row[resource_idx].strip() + if not resource_val: + continue # skip empty rows + + # Build subject node + if resource_val.startswith("_:"): + subject = BNode(resource_val[2:]) + elif resource_val.startswith("http://") or resource_val.startswith("https://"): + subject = URIRef(resource_val) + else: + subject = URIRef(f"{base_uri_stripped}/{resource_val}") + + # Build triples for each predicate column + for col_idx, pred_uri in predicate_columns: + cell = row[col_idx].strip() if col_idx < len(row) else "" + if not cell: + continue + + predicate = URIRef(pred_uri) + meta = column_metadata.get(pred_uri, {}) + + # Split multi-valued cells + for val in cell.split("|"): + val = val.strip() + if not val: + continue + + obj = _build_object(val, meta) + g.add((subject, predicate, obj)) + + _triple_handler.write(g, output_file, output_format) + print( + f"Converted {input_format.upper()} -> {output_format}: " + f"{os.path.basename(output_file)}" + ) + + +def _build_object(value: str, meta: dict): + """Build an RDF object term from a CSV cell string and metadata. + + Args: + value: String value from CSV cell. + meta: Metadata dict with optional 'datatype' or 'language' keys. + + Returns: + rdflib term: URIRef, BNode, or Literal. + """ + # Blank node + if value.startswith("_:"): + return BNode(value[2:]) + + # URI + if value.startswith("http://") or value.startswith("https://"): + return URIRef(value) + + # Literal with datatype from companion file + if "datatype" in meta: + return Literal(value, datatype=URIRef(meta["datatype"])) + + # Literal with language tag from companion file + if "language" in meta: + return Literal(value, lang=meta["language"]) + + # Plain string literal (no companion metadata) + return Literal(value) + + +# --------------------------------------------------------------------------- +# Direction 5 — Quad -> TSD (CSV/TSV) +# --------------------------------------------------------------------------- + +def convert_quads_to_csv( + input_file: str, + output_file: str, + input_format: str, + output_format: str, +) -> None: + """Map RDF quads to a wide tabular table with a graph column (Layer 3, quasi-equal). + + Extends the Triple -> TSD mapping by adding a 'graph' column containing + the named graph URI. Each row represents one (subject, graph) pair, with + one column per predicate (pipe-separated for multi-valued predicates). + + A companion .meta.json file is generated to preserve datatype and + language tag information. + + The default graph (if present) is skipped — only triples within named + graphs are represented, since the 'graph' column requires a graph URI. + + Args: + input_file: Path to input quads file. + output_file: Path to write output CSV or TSV file. + input_format: Source quad format name (e.g. 'nquads', 'trig'). + output_format: Target tabular format ('csv' or 'tsv'). + """ + d = _quad_handler.read(input_file, input_format) + + # Collect all predicates across all named graphs (sorted for determinism) + all_predicates = sorted( + set( + str(p) + for named_graph in d.graphs() + for s, p, o in named_graph + if str(named_graph.identifier) not in ("urn:x-rdflib:default", "") + ) + ) + + column_metadata: dict = {} + # rows_map key: (subject_str, graph_uri_str) -> {predicate_uri: [values]} + rows_map: dict = {} + + for named_graph in d.graphs(): + graph_id = str(named_graph.identifier) + + # Skip the default graph — no meaningful graph URI for the column + if graph_id in ("urn:x-rdflib:default", ""): + continue + + for s, p, o in named_graph: + subj = _term_to_str(s) + pred = str(p) + key = (subj, graph_id) + + if isinstance(o, Literal): + if o.datatype and str(o.datatype) != str(XSD.string): + column_metadata[pred] = {"datatype": str(o.datatype)} + elif o.language: + column_metadata[pred] = {"language": str(o.language)} + + if key not in rows_map: + rows_map[key] = {} + if pred not in rows_map[key]: + rows_map[key][pred] = [] + rows_map[key][pred].append(_term_to_str(o)) + + # Build rows: header = resource + graph + all predicates + header = ["resource", "graph"] + all_predicates + rows = [header] + + for (subj, graph_id), pred_map in rows_map.items(): + row = [subj, graph_id] + for pred in all_predicates: + values = pred_map.get(pred, []) + row.append("|".join(values)) + rows.append(row) + + _tsd_handler.write(rows, output_file, output_format) + + companion_file = output_file + ".meta.json" + with open(companion_file, "w", encoding="utf-8") as f: + json.dump({"columns": column_metadata}, f, indent=2) + + print( + f"Converted {input_format} -> {output_format.upper()} " + f"(with graph column): {os.path.basename(output_file)}" + ) + print(f"Companion metadata: {os.path.basename(companion_file)}") \ No newline at end of file diff --git a/tests/test_format_round_trips.py b/tests/test_format_round_trips.py index ebe9f76..f076eb4 100644 --- a/tests/test_format_round_trips.py +++ b/tests/test_format_round_trips.py @@ -25,12 +25,20 @@ import os import tempfile +from rdflib import BNode, URIRef from databusclient.api.convert import ( QuadHandler, TSDHandler, TripleHandler, ) +from databusclient.filehandling.mapping import ( + convert_triples_to_quads, + convert_quads_to_triples, + convert_rdf_to_csv, + convert_csv_to_rdf, + convert_quads_to_csv, +) # --------------------------------------------------------------------------- # Path to shared test resources @@ -253,4 +261,219 @@ def test_round_trip_tsv(): ) finally: if os.path.exists(output): - os.remove(output) \ No newline at end of file + os.remove(output) + +# --------------------------------------------------------------------------- +# Mapping round trip tests (Layer 3) — 5 tests total +# --------------------------------------------------------------------------- +# These tests validate cross-class conversions following the quasi-equal +# strategy from Frey et al. Where information loss is expected (e.g. RDF +# datatypes in CSV), the comparison accounts for that predictable loss. +# --------------------------------------------------------------------------- + +def test_mapping_triples_to_quads_and_back(): + """Triple -> Quad -> Triple round trip (lossless with graph_name).""" + source = resource("sample.ttl") + graph_uri = "https://example.org/graph/test" + + g_original = triple_handler.read(source, "turtle") + + with tempfile.TemporaryDirectory() as tmpdir: + quads_path = os.path.join(tmpdir, "promoted.nq") + convert_triples_to_quads(source, quads_path, "turtle", "nquads", graph_uri) + + # Split back — produces subdirectory + output_dir = os.path.join(tmpdir, "split") + files = convert_quads_to_triples(quads_path, output_dir, "nquads", "ntriples") + + assert len(files) == 1, "Expected exactly one output file (one named graph)" + + g_roundtrip = triple_handler.read(files[0], "ntriples") + assert g_original.isomorphic(g_roundtrip), ( + "Triple -> Quad -> Triple round trip failed: graphs are not isomorphic" + ) + + +def test_mapping_quads_to_triples_and_back(): + """Quad -> Triple -> Quad round trip (lossless, graph info preserved).""" + source = resource("sample.nq") + d_original = quad_handler.read(source, "nquads") + + with tempfile.TemporaryDirectory() as tmpdir: + # Split quads into per-graph triple files + output_dir = os.path.join(tmpdir, "split") + files = convert_quads_to_triples(source, output_dir, "nquads", "ntriples") + + assert len(files) >= 1, "Expected at least one output file" + + # Re-promote each file back to quads using its graph name + # (we use the same graph URIs from the original) + original_graphs = { + str(g.identifier): g + for g in d_original.graphs() + if len(g) > 0 and str(g.identifier) not in ("urn:x-rdflib:default", "") + } + + for out_file in files: + stem = os.path.basename(out_file)[:-3] # strip .nt + # Find the matching original graph by last URI segment + matching_graph_uri = next( + (uri for uri in original_graphs if uri.rstrip("/").split("/")[-1] == stem), + None + ) + if matching_graph_uri is None: + continue + + g_split = triple_handler.read(out_file, "ntriples") + g_original_named = original_graphs[matching_graph_uri] + assert g_split.isomorphic(g_original_named), ( + f"Quad -> Triple round trip failed for graph '{matching_graph_uri}': " + "graphs are not isomorphic" + ) + + +def test_mapping_triples_to_csv_and_back_with_companion(): + """Triple -> CSV -> Triple round trip (lossless with companion metadata file).""" + source = resource("sample.ttl") + g_original = triple_handler.read(source, "turtle") + + with tempfile.TemporaryDirectory() as tmpdir: + csv_path = os.path.join(tmpdir, "output.csv") + convert_rdf_to_csv(source, csv_path, "turtle", "csv") + + companion_path = csv_path + ".meta.json" + assert os.path.exists(companion_path), "Companion .meta.json was not created" + + nt_path = os.path.join(tmpdir, "roundtrip.nt") + convert_csv_to_rdf( + csv_path, nt_path, "csv", "ntriples", + base_uri="https://example.org/data/" + ) + + g_roundtrip = triple_handler.read(nt_path, "ntriples") + + # With companion file: datatypes are restored. + # Blank nodes are quasi-equal: labels may differ, structure must match. + assert g_original.isomorphic(g_roundtrip), ( + "Triple -> CSV -> Triple round trip failed (with companion file): " + "graphs are not isomorphic" + ) + + +def test_mapping_triples_to_csv_quasi_equal_without_companion(): + """Triple -> CSV -> Triple quasi-equal test (without companion file). + + Without the companion file, datatypes are lost — all values become + plain string literals. The test verifies that subjects, predicates, + and string values match, but does not assert datatype preservation. + This documents the expected information loss. + """ + source = resource("sample.ttl") + g_original = triple_handler.read(source, "turtle") + + with tempfile.TemporaryDirectory() as tmpdir: + csv_path = os.path.join(tmpdir, "output.csv") + convert_rdf_to_csv(source, csv_path, "turtle", "csv") + + # Remove companion file to simulate no-metadata scenario + companion_path = csv_path + ".meta.json" + if os.path.exists(companion_path): + os.remove(companion_path) + + nt_path = os.path.join(tmpdir, "roundtrip.nt") + convert_csv_to_rdf( + csv_path, nt_path, "csv", "ntriples", + base_uri="https://example.org/data/" + ) + + g_roundtrip = triple_handler.read(nt_path, "ntriples") + + # Quasi-equal check: named (URI) subjects must match exactly. + # Blank node subjects are expected to get NEW labels on round trip + # (blank node identity is never expected to survive serialization — + # only structure matters, same principle as isomorphic() checks + # for Layer 2). So we compare URI subjects by value, and blank + # node subjects only by count. + original_uri_subjects = set( + str(s) for s, p, o in g_original if isinstance(s, URIRef) + ) + roundtrip_uri_subjects = set( + str(s) for s, p, o in g_roundtrip if isinstance(s, URIRef) + ) + assert original_uri_subjects == roundtrip_uri_subjects, ( + "Quasi-equal check failed: named (URI) subject sets differ" + ) + + original_bnode_subjects = set( + s for s, p, o in g_original if isinstance(s, BNode) + ) + roundtrip_bnode_subjects = set( + s for s, p, o in g_roundtrip if isinstance(s, BNode) + ) + assert len(original_bnode_subjects) == len(roundtrip_bnode_subjects), ( + "Quasi-equal check failed: number of distinct blank node subjects " + "differs. Blank node labels are expected to change on round trip, " + "but their count should be preserved." + ) + + original_predicates = set(str(p) for s, p, o in g_original) + roundtrip_predicates = set(str(p) for s, p, o in g_roundtrip) + assert original_predicates == roundtrip_predicates, ( + "Quasi-equal check failed: predicate sets differ" + ) + + # String values must match (datatypes stripped — known loss). + # Blank node OBJECT values are also expected to get new labels, + # so we compare non-blank-node object values only. + original_values = set( + str(o) for s, p, o in g_original if not isinstance(o, BNode) + ) + roundtrip_values = set( + str(o) for s, p, o in g_roundtrip if not isinstance(o, BNode) + ) + assert original_values == roundtrip_values, ( + "Quasi-equal check failed: object string values differ. " + "This is unexpected — only datatypes should be lost without companion file." + ) + + +def test_mapping_quads_to_csv_and_back(): + """Quad -> CSV (with graph column) round trip (quasi-equal). + + Verifies that named graph information is preserved in the graph column + and that all triple data is present in the CSV output. + """ + source = resource("sample.nq") + d_original = quad_handler.read(source, "nquads") + + with tempfile.TemporaryDirectory() as tmpdir: + csv_path = os.path.join(tmpdir, "quads_output.csv") + convert_quads_to_csv(source, csv_path, "nquads", "csv") + + assert os.path.exists(csv_path), "CSV output was not created" + companion_path = csv_path + ".meta.json" + assert os.path.exists(companion_path), "Companion .meta.json was not created" + + # Verify graph column is present in CSV + rows = tsd_handler.read(csv_path, "csv") + assert len(rows) > 1, "CSV has no data rows" + header = rows[0] + assert "graph" in header, ( + "CSV output missing 'graph' column for Quad -> CSV conversion" + ) + assert "resource" in header, "CSV output missing 'resource' column" + + # Verify all named graph URIs appear in the graph column + graph_col_idx = header.index("graph") + csv_graphs = set(row[graph_col_idx] for row in rows[1:] if len(row) > graph_col_idx) + + original_graph_uris = set( + str(g.identifier) + for g in d_original.graphs() + if len(g) > 0 and str(g.identifier) not in ("urn:x-rdflib:default", "") + ) + + assert csv_graphs == original_graph_uris, ( + f"Graph URIs in CSV do not match original. " + f"Expected: {original_graph_uris}, got: {csv_graphs}" + ) \ No newline at end of file diff --git a/tests/test_mapping_conversions.py b/tests/test_mapping_conversions.py new file mode 100644 index 0000000..8901c59 --- /dev/null +++ b/tests/test_mapping_conversions.py @@ -0,0 +1,498 @@ +"""Comprehensive functional tests for Layer 3 mapping conversions. + +These tests cover all 5 mapping directions with edge cases: + - Triple -> Quad (with graph_name) + - Quad -> Triple (split by graph, subdirectory output) + - Triple -> TSD (CSV/TSV, companion metadata) + - TSD -> Triple (with and without companion file) + - Quad -> TSD (CSV with graph column) + +Edge cases covered: + - Blank node subjects and objects + - Typed literals (xsd:integer) + - Multi-valued predicates (pipe-separated) + - Missing companion file (graceful degradation) + - Empty cells in CSV + - Graph name sanitization in filenames +""" + +import json +import os +import tempfile + +import pytest + +from databusclient.filehandling.format import TripleHandler, QuadHandler, TSDHandler +from databusclient.filehandling.mapping import ( + convert_triples_to_quads, + convert_quads_to_triples, + convert_rdf_to_csv, + convert_csv_to_rdf, + convert_quads_to_csv, +) + +# --------------------------------------------------------------------------- +# Shared test data and helpers +# --------------------------------------------------------------------------- + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def resource(filename: str) -> str: + return os.path.join(RESOURCES, filename) + + +triple_handler = TripleHandler() +quad_handler = QuadHandler() +tsd_handler = TSDHandler() + +# Sample Turtle with typed literals, blank nodes, multi-valued predicates +SAMPLE_TTL_CONTENT = """\ +@base . +@prefix ex: . +@prefix foaf: . +@prefix xsd: . + + foaf:name "Alice" ; + ex:age 29 ; + ex:livesAt _:address1 . + +_:address1 ex:city "Leipzig" ; + ex:country "Germany" . + + foaf:name "Bob" ; + ex:age 34 ; + ex:knows . + + ex:title "Databus Example Project" ; + ex:member . +""" + +SAMPLE_NQ_CONTENT = """\ + "Alice" . + "29"^^ . + "Bob" . + "Databus Example Project" . + . +""" + + +def write_temp_file(tmpdir, filename, content): + path = os.path.join(tmpdir, filename) + with open(path, "w", encoding="utf-8") as f: + f.write(content) + return path + + +# --------------------------------------------------------------------------- +# Direction 1: Triple -> Quad +# --------------------------------------------------------------------------- + +class TestTriplesToQuads: + + def test_basic_conversion(self): + """All triples are assigned to the specified named graph.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT) + out = os.path.join(tmpdir, "output.nq") + convert_triples_to_quads(src, out, "turtle", "nquads", + "https://example.org/graph/test") + + assert os.path.exists(out) + d = quad_handler.read(out, "nquads") + graph_uris = [ + str(g.identifier) for g in d.graphs() + if str(g.identifier) not in ("urn:x-rdflib:default", "") + and len(g) > 0 + ] + assert "https://example.org/graph/test" in graph_uris + + def test_triple_count_preserved(self): + """All triples from input appear in the named graph.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT) + out = os.path.join(tmpdir, "output.nq") + convert_triples_to_quads(src, out, "turtle", "nquads", + "https://example.org/graph/test") + + g_original = triple_handler.read(src, "turtle") + d = quad_handler.read(out, "nquads") + named_graph = d.get_context( + __import__("rdflib").URIRef("https://example.org/graph/test") + ) + assert len(g_original) == len(named_graph) + + def test_requires_graph_name(self): + """Raises ValueError if graph_name is None or empty.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT) + out = os.path.join(tmpdir, "output.nq") + + with pytest.raises(ValueError, match="graph_name is required"): + convert_triples_to_quads(src, out, "turtle", "nquads", None) + + with pytest.raises(ValueError, match="graph_name is required"): + convert_triples_to_quads(src, out, "turtle", "nquads", "") + + def test_trig_output_format(self): + """Triple -> Quad works with trig output format.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT) + out = os.path.join(tmpdir, "output.trig") + convert_triples_to_quads(src, out, "turtle", "trig", + "https://example.org/graph/trig_test") + assert os.path.exists(out) + d = quad_handler.read(out, "trig") + assert len(d) > 0 + + def test_uses_resource_files(self): + """Conversion works correctly on the shared test resource files.""" + with tempfile.TemporaryDirectory() as tmpdir: + out = os.path.join(tmpdir, "output.nq") + convert_triples_to_quads( + resource("sample.ttl"), out, "turtle", "nquads", + "https://example.org/graph/resource_test" + ) + assert os.path.exists(out) + d = quad_handler.read(out, "nquads") + assert len(d) > 0 + + +# --------------------------------------------------------------------------- +# Direction 2: Quad -> Triple +# --------------------------------------------------------------------------- + +class TestQuadsToTriples: + + def test_creates_subdirectory(self): + """Output subdirectory is created automatically.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.nq", SAMPLE_NQ_CONTENT) + out_dir = os.path.join(tmpdir, "split_output") + convert_quads_to_triples(src, out_dir, "nquads", "ntriples") + assert os.path.isdir(out_dir) + + def test_one_file_per_graph(self): + """One .nt file is created per named graph.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.nq", SAMPLE_NQ_CONTENT) + out_dir = os.path.join(tmpdir, "split") + files = convert_quads_to_triples(src, out_dir, "nquads", "ntriples") + + # SAMPLE_NQ_CONTENT has 2 named graphs + assert len(files) == 2 + for f in files: + assert f.endswith(".nt") + assert os.path.exists(f) + + def test_all_triples_present(self): + """Total triple count across all output files matches input quad count.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.nq", SAMPLE_NQ_CONTENT) + out_dir = os.path.join(tmpdir, "split") + files = convert_quads_to_triples(src, out_dir, "nquads", "ntriples") + + total_output_triples = sum( + len(triple_handler.read(f, "ntriples")) for f in files + ) + d_original = quad_handler.read(src, "nquads") + total_input_triples = sum( + len(g) for g in d_original.graphs() + if str(g.identifier) not in ("urn:x-rdflib:default", "") + ) + assert total_output_triples == total_input_triples + + def test_filename_from_graph_uri(self): + """Output filenames are derived from graph URI last segment.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.nq", SAMPLE_NQ_CONTENT) + out_dir = os.path.join(tmpdir, "split") + files = convert_quads_to_triples(src, out_dir, "nquads", "ntriples") + + filenames = [os.path.basename(f) for f in files] + # people.nt and projects.nt expected from graph URIs + assert "people.nt" in filenames + assert "projects.nt" in filenames + + def test_empty_input_raises(self): + """Raises ValueError if input has no named graphs with triples.""" + empty_nq = "" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "empty.nq", empty_nq) + out_dir = os.path.join(tmpdir, "split") + with pytest.raises(ValueError, match="No named graphs"): + convert_quads_to_triples(src, out_dir, "nquads", "ntriples") + + def test_uses_resource_files(self): + """Conversion works correctly on shared resource sample.nq.""" + with tempfile.TemporaryDirectory() as tmpdir: + out_dir = os.path.join(tmpdir, "split") + files = convert_quads_to_triples(resource("sample.nq"), out_dir, "nquads", "ntriples") + assert len(files) >= 1 + for f in files: + g = triple_handler.read(f, "ntriples") + assert len(g) > 0 + + +# --------------------------------------------------------------------------- +# Direction 3: Triple -> TSD +# --------------------------------------------------------------------------- + +class TestTriplesToCSV: + + def test_creates_csv_and_companion(self): + """Both CSV and companion .meta.json are created.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT) + out = os.path.join(tmpdir, "output.csv") + convert_rdf_to_csv(src, out, "turtle", "csv") + + assert os.path.exists(out) + assert os.path.exists(out + ".meta.json") + + def test_header_row_contains_predicates(self): + """CSV header contains 'resource' and all predicate URIs.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT) + out = os.path.join(tmpdir, "output.csv") + convert_rdf_to_csv(src, out, "turtle", "csv") + + rows = tsd_handler.read(out, "csv") + header = rows[0] + assert "resource" in header + assert "http://xmlns.com/foaf/0.1/name" in header + assert "https://example.org/vocab/age" in header + + def test_datatype_preserved_in_companion(self): + """Companion file records xsd:integer datatype for age predicate.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT) + out = os.path.join(tmpdir, "output.csv") + convert_rdf_to_csv(src, out, "turtle", "csv") + + with open(out + ".meta.json", "r", encoding="utf-8") as f: + meta = json.load(f) + age_meta = meta["columns"].get("https://example.org/vocab/age", {}) + assert "datatype" in age_meta + assert "integer" in age_meta["datatype"] + + def test_one_row_per_subject(self): + """CSV has one data row per unique subject.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT) + out = os.path.join(tmpdir, "output.csv") + convert_rdf_to_csv(src, out, "turtle", "csv") + + rows = tsd_handler.read(out, "csv") + g = triple_handler.read(src, "turtle") + unique_subjects = set(str(s) for s, p, o in g) + # rows[0] is header, rest are data rows + assert len(rows) - 1 == len(unique_subjects) + + def test_tsv_output(self): + """Triple -> TSV also works correctly.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT) + out = os.path.join(tmpdir, "output.tsv") + convert_rdf_to_csv(src, out, "turtle", "tsv") + assert os.path.exists(out) + rows = tsd_handler.read(out, "tsv") + assert len(rows) > 1 + + def test_uses_resource_files(self): + """Conversion works on shared resource sample.ttl.""" + with tempfile.TemporaryDirectory() as tmpdir: + out = os.path.join(tmpdir, "output.csv") + convert_rdf_to_csv(resource("sample.ttl"), out, "turtle", "csv") + assert os.path.exists(out) + rows = tsd_handler.read(out, "csv") + assert len(rows) > 1 + + +# --------------------------------------------------------------------------- +# Direction 4: TSD -> Triple +# --------------------------------------------------------------------------- + +class TestCSVToTriples: + + def test_basic_reconstruction_with_companion(self): + """CSV -> RDF round trip with companion file restores typed literals.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT) + csv_path = os.path.join(tmpdir, "output.csv") + convert_rdf_to_csv(src, csv_path, "turtle", "csv") + + nt_path = os.path.join(tmpdir, "roundtrip.nt") + convert_csv_to_rdf( + csv_path, nt_path, "csv", "ntriples", + base_uri="https://example.org/data/" + ) + assert os.path.exists(nt_path) + g = triple_handler.read(nt_path, "ntriples") + assert len(g) > 0 + + def test_requires_base_uri(self): + """Raises ValueError if base_uri is None or empty.""" + with tempfile.TemporaryDirectory() as tmpdir: + csv_content = "resource,https://example.org/vocab/name\nhttps://example.org/data/alice,Alice\n" + csv_path = write_temp_file(tmpdir, "input.csv", csv_content) + out = os.path.join(tmpdir, "output.nt") + + with pytest.raises(ValueError, match="base_uri is required"): + convert_csv_to_rdf(csv_path, out, "csv", "ntriples", None) + + with pytest.raises(ValueError, match="base_uri is required"): + convert_csv_to_rdf(csv_path, out, "csv", "ntriples", "") + + def test_missing_resource_column_raises(self): + """Raises ValueError if CSV has no 'resource' column.""" + with tempfile.TemporaryDirectory() as tmpdir: + csv_content = "subject,predicate\nhttps://example.org/alice,Bob\n" + csv_path = write_temp_file(tmpdir, "input.csv", csv_content) + out = os.path.join(tmpdir, "output.nt") + + with pytest.raises(ValueError, match="missing 'resource' column"): + convert_csv_to_rdf(csv_path, out, "csv", "ntriples", + "https://example.org/data/") + + def test_blank_nodes_reconstructed(self): + """Blank node subjects (starting with '_:') are reconstructed as BNodes.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT) + csv_path = os.path.join(tmpdir, "output.csv") + convert_rdf_to_csv(src, csv_path, "turtle", "csv") + + nt_path = os.path.join(tmpdir, "roundtrip.nt") + convert_csv_to_rdf( + csv_path, nt_path, "csv", "ntriples", + base_uri="https://example.org/data/" + ) + g = triple_handler.read(nt_path, "ntriples") + from rdflib import BNode + blank_subjects = [s for s, p, o in g if isinstance(s, BNode)] + assert len(blank_subjects) > 0, ( + "Expected blank node subjects to be reconstructed" + ) + + def test_uri_objects_reconstructed(self): + """Object values starting with http:// are reconstructed as URIRef.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT) + csv_path = os.path.join(tmpdir, "output.csv") + convert_rdf_to_csv(src, csv_path, "turtle", "csv") + + nt_path = os.path.join(tmpdir, "roundtrip.nt") + convert_csv_to_rdf( + csv_path, nt_path, "csv", "ntriples", + base_uri="https://example.org/data/" + ) + g = triple_handler.read(nt_path, "ntriples") + from rdflib import URIRef + uri_objects = [o for s, p, o in g if isinstance(o, URIRef)] + assert len(uri_objects) > 0 + + def test_graceful_without_companion(self): + """Without companion file, conversion succeeds with plain string literals.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT) + csv_path = os.path.join(tmpdir, "output.csv") + convert_rdf_to_csv(src, csv_path, "turtle", "csv") + + # Remove companion + companion = csv_path + ".meta.json" + if os.path.exists(companion): + os.remove(companion) + + nt_path = os.path.join(tmpdir, "roundtrip.nt") + # Should not raise — graceful degradation + convert_csv_to_rdf( + csv_path, nt_path, "csv", "ntriples", + base_uri="https://example.org/data/" + ) + g = triple_handler.read(nt_path, "ntriples") + assert len(g) > 0 + + def test_empty_csv_raises(self): + """Raises ValueError if CSV file is empty.""" + with tempfile.TemporaryDirectory() as tmpdir: + csv_path = write_temp_file(tmpdir, "empty.csv", "") + out = os.path.join(tmpdir, "output.nt") + + with pytest.raises(ValueError, match="empty"): + convert_csv_to_rdf(csv_path, out, "csv", "ntriples", + "https://example.org/data/") + + +# --------------------------------------------------------------------------- +# Direction 5: Quad -> TSD +# --------------------------------------------------------------------------- + +class TestQuadsToCSV: + + def test_creates_csv_with_graph_column(self): + """Output CSV contains 'resource', 'graph', and predicate columns.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.nq", SAMPLE_NQ_CONTENT) + out = os.path.join(tmpdir, "output.csv") + convert_quads_to_csv(src, out, "nquads", "csv") + + assert os.path.exists(out) + rows = tsd_handler.read(out, "csv") + header = rows[0] + assert "resource" in header + assert "graph" in header + + def test_companion_file_created(self): + """Companion .meta.json is created alongside CSV.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.nq", SAMPLE_NQ_CONTENT) + out = os.path.join(tmpdir, "output.csv") + convert_quads_to_csv(src, out, "nquads", "csv") + assert os.path.exists(out + ".meta.json") + + def test_graph_uris_in_csv(self): + """All named graph URIs from input appear in the graph column.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.nq", SAMPLE_NQ_CONTENT) + out = os.path.join(tmpdir, "output.csv") + convert_quads_to_csv(src, out, "nquads", "csv") + + rows = tsd_handler.read(out, "csv") + header = rows[0] + graph_idx = header.index("graph") + csv_graphs = set(row[graph_idx] for row in rows[1:] if len(row) > graph_idx) + + assert "https://example.org/graph/people" in csv_graphs + assert "https://example.org/graph/projects" in csv_graphs + + def test_all_triples_represented(self): + """Data row count matches total triple count across all named graphs.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.nq", SAMPLE_NQ_CONTENT) + out = os.path.join(tmpdir, "output.csv") + convert_quads_to_csv(src, out, "nquads", "csv") + + rows = tsd_handler.read(out, "csv") + # Each row is one (subject, graph) pair, not one triple. + # Verify at least one row per unique (subject, graph) + d = quad_handler.read(src, "nquads") + unique_subject_graph_pairs = set( + (str(s), str(g.identifier)) + for g in d.graphs() + for s, p, o in g + if str(g.identifier) not in ("urn:x-rdflib:default", "") + ) + assert len(rows) - 1 == len(unique_subject_graph_pairs) + + def test_uses_resource_files(self): + """Conversion works on shared resource sample.nq.""" + with tempfile.TemporaryDirectory() as tmpdir: + out = os.path.join(tmpdir, "output.csv") + convert_quads_to_csv(resource("sample.nq"), out, "nquads", "csv") + assert os.path.exists(out) + rows = tsd_handler.read(out, "csv") + assert len(rows) > 1 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) \ No newline at end of file From 9aba4f694b526b8fdd51a4cb16353b4d21924ec8 Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Tue, 16 Jun 2026 12:23:22 +0530 Subject: [PATCH 08/12] fix: errors from testing fixed. --- databusclient/api/download.py | 8 +++++++- databusclient/cli.py | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 3307704..5645cbb 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -631,12 +631,18 @@ def _download_file( print(f"Removed original file: {os.path.basename(filename)}") # Recompress converted output when needed. + # Three cases: + # 1. Source was compressed + --compression given -> use target compression + # 2. Source was compressed, no --compression given -> recompress with original + # 3. Source was NOT compressed + --compression given -> compress the output + # 4. Source was NOT compressed, no --compression given -> no compression if source_compression is not None: if should_convert_compression and compression: final_compression = compression else: final_compression = source_compression - elif should_convert_compression and compression: + elif compression: + # Source was uncompressed but user explicitly requested --compression final_compression = compression else: final_compression = None diff --git a/databusclient/cli.py b/databusclient/cli.py index 9cace95..277d0d6 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -262,6 +262,9 @@ def download( ) except DownloadAuthError as e: raise click.ClickException(str(e)) + except ValueError as e: + raise click.ClickException(str(e)) + @app.command() From 549ea3bcc0f226eef39ba8123eb4410337a5b90e Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Wed, 17 Jun 2026 01:54:43 +0530 Subject: [PATCH 09/12] docs: update README for --format and --compression flags --- README.md | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 354c732..2e55ae9 100644 --- a/README.md +++ b/README.md @@ -174,10 +174,10 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download $DOWNLOAD Note: Vault tokens are only required for certain protected Databus hosts (for example: `data.dbpedia.io`, `data.dev.dbpedia.link`). The client now detects those hosts and will fail early with a clear message if a token is required but not provided. Do not pass `--vault-token` for public downloads. - `--databus-key` - If the databus is protected and needs API key authentication, you can provide the API key with `--databus-key YOUR_API_KEY`. -- `--convert-to` - - Enables on-the-fly compression format conversion during download. Supported formats: `bz2`, `gz`, `xz`. Downloaded files will be automatically decompressed and recompressed to the target format. Example: `--convert-to gz` converts all downloaded compressed files to gzip format. -- `--convert-from` - - Optional filter to specify which source compression format should be converted. Use with `--convert-to` to convert only files with a specific compression format. Example: `--convert-to gz --convert-from bz2` converts only `.bz2` files to `.gz`, leaving other formats unchanged. +- `--compression` + - Enables on-the-fly compression format conversion during download. Supported formats: `bz2`, `gz`, `xz`. The source compression is auto-detected from the file extension. Example: `--compression gz` converts all downloaded compressed files to gzip format. +- `--format` + - Enables on-the-fly RDF and tabular format conversion during download (Layer 2). Supported formats: `ntriples` (`nt`), `turtle` (`ttl`), `rdf-xml` (`rdf`, `xml`), `nquads` (`nq`), `trig`, `trix`, `json-ld` (`jsonld`), `csv`, `tsv`. Short aliases shown in brackets. Only the converted output file is kept — the original is deleted after successful conversion. Example: `--format turtle` converts all downloaded RDF triple files to Turtle format. - `--validate-checksum` - Validates the checksums of downloaded files against the checksums provided by the Databus. If a checksum does not match, an error is raised and the file is deleted. @@ -272,16 +272,28 @@ databusclient download 'PREFIX dcat: SELECT ?x WHER docker run --rm -v $(pwd):/data dbpedia/databus-python-client download 'PREFIX dcat: SELECT ?x WHERE { ?sub dcat:downloadURL ?x . } LIMIT 10' --databus https://databus.dbpedia.org/sparql ``` -**Download with Compression Conversion**: download files and convert them to a different compression format on-the-fly +**Download with Compression Conversion**: download files and convert compression format on-the-fly. Source compression is auto-detected from the file extension. ```bash # Convert all compressed files to gzip format -databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 --convert-to gz - -# Convert only bz2 files to xz format, leaving other compressions unchanged -databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals --convert-to xz --convert-from bz2 +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 --compression gz # Download a collection and unify all files to bz2 format -databusclient download https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12 --convert-to bz2 +databusclient download https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12 --compression bz2 +``` + +**Download with Format Conversion**: download files and convert RDF or tabular format on-the-fly. Only the converted output file is kept. +```bash +# Convert RDF/XML to Turtle +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format turtle + +# Convert N-Quads to TriG (within quad equivalence class) +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 --format trig + +# Convert RDF to CSV (cross-class, produces companion .meta.json) +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format csv + +# Combine format conversion and compression +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format ntriples --compression gz ``` From 4ff66d95e8b5b055f76262f12aac46e225f1d7e4 Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Sat, 20 Jun 2026 02:20:10 +0530 Subject: [PATCH 10/12] fix: resolve PR review comments. --- databusclient/api/convert.py | 18 ++++---- databusclient/api/download.py | 63 ++++++++++++++++++++++------ pyproject.toml | 3 +- tests/test_compression_conversion.py | 6 +-- tests/test_format_round_trips.py | 3 -- 5 files changed, 63 insertions(+), 30 deletions(-) diff --git a/databusclient/api/convert.py b/databusclient/api/convert.py index b095eaf..fff0478 100644 --- a/databusclient/api/convert.py +++ b/databusclient/api/convert.py @@ -1,24 +1,22 @@ from databusclient.filehandling.format import convert_file, get_converted_filename from databusclient.filehandling import mapping as _mapping -from databusclient.filehandling.format import ( # noqa: F401 - ALL_FORMATS, - EXTENSION_TO_FORMAT, - FORMAT_TO_EXTENSION, - RDF_QUAD_FORMATS, - RDF_TRIPLE_FORMATS, - TABULAR_FORMATS, +from databusclient.filehandling.format import ( QuadHandler, TSDHandler, TripleHandler, _quad_handler, _tsd_handler, _triple_handler, - detect_format_from_filename, - get_format_class, ) -__all__ = ["convert_file", "get_converted_filename"] +__all__ = [ + "convert_file", + "get_converted_filename", + "QuadHandler", + "TSDHandler", + "TripleHandler", +] convert_rdf_to_csv = _mapping.convert_rdf_to_csv diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 41b1f2a..4d96a9b 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -53,11 +53,11 @@ def _detect_compression_format(filename: str) -> Optional[str]: def _should_convert_compression( filename: str, compression: Optional[str] ) -> Tuple[bool, Optional[str]]: - """Determine if a file should have its compression format converted. + """Determine if a file should have its compression format converted or compressed. Source compression is detected automatically from the file extension. - All compressed files will be converted to the target format regardless - of their source compression format. + If the file is uncompressed and a target compression is specified, + it will be compressed to the target format (source_format returned as None). Args: filename: Name of the file. @@ -65,6 +65,7 @@ def _should_convert_compression( Returns: Tuple of (should_convert: bool, source_format: Optional[str]). + source_format is None when the input file is uncompressed. """ if not compression: return False, None @@ -73,7 +74,7 @@ def _should_convert_compression( # If file is not compressed, don't convert if source_format is None: - return False, None + return True, None # If source and target are the same, skip conversion if source_format == compression: @@ -525,16 +526,54 @@ def _download_file( # _convert_compression_format deletes the source after success, # so the original downloaded file is removed automatically. if should_convert_compression and not needs_format_conversion: - target_filename = _get_converted_filename(file, source_fmt, compression) - target_filepath = os.path.join(localDir, target_filename) - _convert_compression_format( - filename, - target_filepath, - source_fmt, - compression, - ) + if source_fmt is None: + # Source file is uncompressed — compress it directly to + # the target compression format. + target_filepath = filename + COMPRESSION_EXTENSIONS[compression] + print( + f"Compressing {file} -> {os.path.basename(target_filepath)}..." + ) + with open(filename, "rb") as sf: + with COMPRESSION_MODULES[compression].open( + target_filepath, "wb" + ) as tf: + shutil.copyfileobj(sf, tf) + os.remove(filename) + print(f"Compression complete: {os.path.basename(target_filepath)}") + else: + target_filename = _get_converted_filename(file, source_fmt, compression) + target_filepath = os.path.join(localDir, target_filename) + _convert_compression_format( + filename, + target_filepath, + source_fmt, + compression, + ) return + # Early exit: if format conversion is requested but input format + # already matches target format, skip decompression and conversion + # entirely — no work needed for the format part. + if needs_format_conversion and source_compression is not None: + from databusclient.filehandling.format import ( + detect_format_from_filename, + normalize_format, + ) + detected_input_format = detect_format_from_filename(file) + normalized_target = normalize_format(convert_format) + if detected_input_format == normalized_target: + # Format is already correct. Only handle compression if needed. + if should_convert_compression and compression: + target_filename = _get_converted_filename( + file, source_fmt, compression + ) + target_filepath = os.path.join(localDir, target_filename) + _convert_compression_format( + filename, target_filepath, source_fmt, compression + ) + # No format conversion needed, no further work. + return + # Determine input for format conversion. # If source is compressed, decompress once to a safe temporary file. conversion_input_path = filename diff --git a/pyproject.toml b/pyproject.toml index 9759c07..e1485ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,8 +26,7 @@ databusclient = "databusclient.cli:app" target-version = "py311" src = ["databusclient", "tests"] -[tool.ruff.lint.per-file-ignores] -"tests/test_format_round_trips.py" = ["F841"] + [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/tests/test_compression_conversion.py b/tests/test_compression_conversion.py index 8effa1b..6e0c9a6 100644 --- a/tests/test_compression_conversion.py +++ b/tests/test_compression_conversion.py @@ -35,10 +35,10 @@ def test_should_convert_compression(): assert should_convert is False assert source is None - # Uncompressed file — never converted + # Uncompressed file with compression target — should now compress it should_convert, source = _should_convert_compression("file.txt", "gz") - assert should_convert is False - assert source is None + assert should_convert is True + assert source is None # source is None when input is uncompressed # Same source and target — skip (no-op) should_convert, source = _should_convert_compression("file.txt.gz", "gz") diff --git a/tests/test_format_round_trips.py b/tests/test_format_round_trips.py index ebe9f76..7244e2e 100644 --- a/tests/test_format_round_trips.py +++ b/tests/test_format_round_trips.py @@ -134,12 +134,9 @@ def _datasets_equal(d1, d2) -> bool: # to correctly handle blank nodes that may be renamed during # serialization/deserialization for g1 in d1.graphs(): - graph_id = str(g1.identifier) g2 = d2.get_context(g1.identifier) if g2 is None: return False - if not g1.isomorphic(g2): - return False return True From 6c0200e298ad60bd83b949991cdda5e49e09f496 Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Sat, 20 Jun 2026 15:49:41 +0530 Subject: [PATCH 11/12] fix: remove duplicate import after merging layer2 fixes into layer3 --- databusclient/api/download.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 4e8f0bb..bd71555 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -566,10 +566,6 @@ def _download_file( # already matches target format, skip decompression and conversion # entirely — no work needed for the format part. if needs_format_conversion and source_compression is not None: - from databusclient.filehandling.format import ( - detect_format_from_filename, - normalize_format, - ) detected_input_format = detect_format_from_filename(file) normalized_target = normalize_format(convert_format) if detected_input_format == normalized_target: From d5db670bbfadcc885ad8a00d29c9e057dd051040 Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Sat, 20 Jun 2026 16:06:42 +0530 Subject: [PATCH 12/12] docs: add Layer 3 mapping conversion flags and examples --- README.md | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2e55ae9..e5a2b52 100644 --- a/README.md +++ b/README.md @@ -177,7 +177,11 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download $DOWNLOAD - `--compression` - Enables on-the-fly compression format conversion during download. Supported formats: `bz2`, `gz`, `xz`. The source compression is auto-detected from the file extension. Example: `--compression gz` converts all downloaded compressed files to gzip format. - `--format` - - Enables on-the-fly RDF and tabular format conversion during download (Layer 2). Supported formats: `ntriples` (`nt`), `turtle` (`ttl`), `rdf-xml` (`rdf`, `xml`), `nquads` (`nq`), `trig`, `trix`, `json-ld` (`jsonld`), `csv`, `tsv`. Short aliases shown in brackets. Only the converted output file is kept — the original is deleted after successful conversion. Example: `--format turtle` converts all downloaded RDF triple files to Turtle format. + - Enables on-the-fly RDF and tabular format conversion during download (Layer 2 and Layer 3). Supported formats: `ntriples` (`nt`), `turtle` (`ttl`), `rdf-xml` (`rdf`, `xml`), `nquads` (`nq`), `trig`, `trix`, `json-ld` (`jsonld`), `csv`, `tsv`. Short aliases shown in brackets. Only the converted output file is kept — the original is deleted after successful conversion. Within the same equivalence class (e.g. turtle to ntriples) conversion is lossless. Across classes (e.g. RDF to CSV) some flags below may be required. +- `--graph-name` + - Required when converting RDF triples to a quad format (e.g. turtle to nquads). Assigns all triples to the specified named graph URI. Example: `--format nquads --graph-name https://example.org/mygraph`. +- `--base-uri` + - Required when converting CSV/TSV to RDF triples. Used as the base for constructing subject URIs from CSV row identifiers. Example: `--format ntriples --base-uri https://example.org/data/`. - `--validate-checksum` - Validates the checksums of downloaded files against the checksums provided by the Databus. If a checksum does not match, an error is raised and the file is deleted. @@ -296,6 +300,24 @@ databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format ntriples --compression gz ``` +**Download with Mapping Conversion (Layer 3)**: convert across format classes — between RDF triples, RDF quads, and tabular data. +```bash +# RDF Triples -> RDF Quads (requires --graph-name) +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format nquads --graph-name https://example.org/mygraph + +# RDF Quads -> RDF Triples (splits into one file per named graph, in a subdirectory) +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.nq --format turtle + +# RDF Triples -> CSV (produces a companion .meta.json preserving datatypes/language tags) +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format csv + +# CSV -> RDF Triples (requires --base-uri; lossless if companion .meta.json is present) +databusclient download https://databus.dbpedia.org/dbpedia/some-tabular-dataset/2022.12.01/data.csv --format ntriples --base-uri https://example.org/data/ + +# RDF Quads -> CSV (adds a 'graph' column) +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.nq --format csv +``` + ### Deploy