From e13a168e4ac5c0e5df271ebee06526d664f7e291 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 10 Jun 2026 12:40:16 -0700 Subject: [PATCH 1/6] enrich: OC material/object-type concept overlay onto the wide (#272, fixes #260) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Overlays Eric Kansa's OC PQG concept mappings onto the unified wide: p__has_material_category / p__has_sample_object_type are REPLACED for OC pids — OC wins unconditionally (RY decision 2026-06-10, #272). Mints IdentifiedConcept rows for URIs the frozen export never had (e.g. otheranthropogenicmaterial — the correct #260 value, absent entirely). - scripts/enrich_wide_with_oc_concepts.py: deterministic single-pass DuckDB overlay; ordered-list preservation; hard-fails on dup pids/row_ids and unresolved OC concept refs; emits .manifest.json (input shas, counts). - scripts/validate_oc_concept_enrichment.py: INDEPENDENT trust gate — re-derives expected URI lists from (src, oc) with its own SQL; non-overlay rows must be byte-identical; minted set must be exactly the missing URIs; #260 sentinel. - tests/test_oc_concept_enrichment.py: 13 fixture tests incl. unconditional-win, order preservation, determinism (bit-identical), validator tamper-detection, hard-failure modes. - validate_frontend_derived.py: #260 sentinel parameterized by data vintage (--sentinel-material); default now the post-#272 corrected value. - Makefile: oc-wide / enrich / validate-enrich / all-272 chain; CI runs both fixture suites. - DATA_PROVENANCE.md: Stage 3 split into 3a (thumbnails) + 3b (OC concepts). Scope (documented): overlay only — ~75K new OC records not ingested; p__has_context_category untouched. Both follow-ups tracked in #272. 
Co-Authored-By: Claude Fable 5 --- .github/workflows/pipeline-tests.yml | 8 +- DATA_PROVENANCE.md | 10 +- Makefile | 39 ++- scripts/enrich_wide_with_oc_concepts.py | 287 ++++++++++++++++++++ scripts/validate_frontend_derived.py | 20 +- scripts/validate_oc_concept_enrichment.py | 197 ++++++++++++++ tests/test_oc_concept_enrichment.py | 307 ++++++++++++++++++++++ 7 files changed, 854 insertions(+), 14 deletions(-) create mode 100644 scripts/enrich_wide_with_oc_concepts.py create mode 100644 scripts/validate_oc_concept_enrichment.py create mode 100644 tests/test_oc_concept_enrichment.py diff --git a/.github/workflows/pipeline-tests.yml b/.github/workflows/pipeline-tests.yml index 5fb0164..3bb3f91 100644 --- a/.github/workflows/pipeline-tests.yml +++ b/.github/workflows/pipeline-tests.yml @@ -7,7 +7,10 @@ on: paths: - "scripts/build_frontend_derived.py" - "scripts/validate_frontend_derived.py" + - "scripts/enrich_wide_with_oc_concepts.py" + - "scripts/validate_oc_concept_enrichment.py" - "tests/test_frontend_derived.py" + - "tests/test_oc_concept_enrichment.py" - "scripts/requirements.txt" - "Makefile" - ".github/workflows/pipeline-tests.yml" @@ -16,7 +19,10 @@ on: paths: - "scripts/build_frontend_derived.py" - "scripts/validate_frontend_derived.py" + - "scripts/enrich_wide_with_oc_concepts.py" + - "scripts/validate_oc_concept_enrichment.py" - "tests/test_frontend_derived.py" + - "tests/test_oc_concept_enrichment.py" workflow_dispatch: jobs: @@ -33,4 +39,4 @@ jobs: # builds tiny synthetic wides (WKB BLOB + DuckDB GEOMETRY), runs the real # builder + algebraic validator, asserts the contract. Exits non-zero on # any failure -> PR is blocked. 
- run: python -m pytest tests/test_frontend_derived.py -q + run: python -m pytest tests/test_frontend_derived.py tests/test_oc_concept_enrichment.py -q diff --git a/DATA_PROVENANCE.md b/DATA_PROVENANCE.md index 4b795ce..b492695 100644 --- a/DATA_PROVENANCE.md +++ b/DATA_PROVENANCE.md @@ -17,8 +17,11 @@ STAGE 0/1 export_client → JSONL → GeoParquet STAGE 2 pqg/pqg/sql_converter.py (export → base PQG; 7-stage DuckDB SQL) → narrow (…_narrow.parquet, ~844MB, 106M rows) and wide (…_wide.parquet, ~282MB, 20M rows) ▼ -STAGE 3 sidecar/enrichment merge (LEFT JOIN by pid) ← Eric's independently-maintained OC PQG (GCS) - scripts/enrich_wide_with_oc_thumbnails.py → isamples_202604_wide.parquet (+47K thumbnails) +STAGE 3 sidecar/enrichment merges (LEFT JOIN by pid) ← Eric's independently-maintained OC PQG (GCS) + 3a scripts/enrich_wide_with_oc_thumbnails.py → isamples_202604_wide.parquet (+47K thumbnails) + 3b scripts/enrich_wide_with_oc_concepts.py → isamples_202606_wide.parquet (#272: OC material/ + object-type concepts REPLACE the frozen export's for OC pids — OC wins unconditionally; + gate: scripts/validate_oc_concept_enrichment.py) ▼ STAGE 4 wide → frontend derived files (NOW SCRIPTED: scripts/build_frontend_derived.py) → wide_h3 · h3_summary_res4/6/8 · samples_map_lite · sample_facets_v2 · facet_summaries · facet_cross_filter @@ -36,7 +39,8 @@ DuckDB-WASM in the browser (explorer.qmd; parquet URLs ~L767-781) |---|---|---|---| | **0/1 Export** | Solr API → `isamples_export_*_geo.parquet` | `export_client` `ExportClient.perform_full_download()` (`export_client.py:423-469`) → `write_geoparquet_from_json_lines()`; schema `SOURCE_COLUMNS` (`duckdb_utilities.py:9-42`, incl. 
`keywords: STRUCT(keyword VARCHAR)[]` — **text only, no URI**, L17) | ❌ API offline; **frozen** | | **2 Base PQG** | export → `*_narrow.parquet` / `*_wide.parquet` | `pqg/pqg/sql_converter.py` `convert_isamples_sql(input, output, wide=…)` (CLI `python pqg/sql_converter.py in.parquet out.parquet [--wide]`); 7 stages, decomposes nested structs → nodes+edges; site dedupe by rounded lat/lon+label | ✅ scripted (exact prod invocation not recorded — gap) | -| **3 Sidecar merge** | base wide + Eric's OC PQG → `isamples_202604_wide.parquet` | `scripts/enrich_wide_with_oc_thumbnails.py` — `LEFT JOIN` OC `(pid, thumbnail_url)` into wide (`COALESCE`). **This is the precedent for merging ANY per-source supplement (incl. concept URIs) by pid.** Drift check: `scripts/check_oc_pqg_drift.py` (detects only; no mirror) | ⚠️ merge scripted; OC mirror + R2 upload manual | +| **3a Sidecar: thumbnails** | base wide + Eric's OC PQG → `isamples_202604_wide.parquet` | `scripts/enrich_wide_with_oc_thumbnails.py` — `LEFT JOIN` OC `(pid, thumbnail_url)` into wide (`COALESCE`). Drift check: `scripts/check_oc_pqg_drift.py` (detects only; no mirror) | ⚠️ merge scripted; OC mirror + R2 upload manual | +| **3b Sidecar: OC concepts (#272)** | 3a wide + Eric's OC **wide** → `isamples_202606_wide.parquet` | `scripts/enrich_wide_with_oc_concepts.py` — REPLACES `p__has_material_category` / `p__has_sample_object_type` for OC pids with OC's ordered concept lists (**OC wins unconditionally** — RY decision 2026-06-10, #272); mints `IdentifiedConcept` rows for URIs the frozen export never had (e.g. `otheranthropogenicmaterial`, the #260 fix); deterministic; emits `.manifest.json`. Independent gate: `scripts/validate_oc_concept_enrichment.py` (re-derives from inputs; non-overlay rows must be byte-identical). Scope: overlay only — ~75K OC records absent from the frozen export are NOT ingested (follow-up); `p__has_context_category` untouched (follow-up). 
| ✅ merge + gate scripted (`make all-272`); R2 upload manual | | **4 Frontend derived** | wide → 7 explorer files | The 6 map/facet files (`wide_h3`, `h3_summary_res4/6/8`, `samples_map_lite`, `sample_facets_v2`, `facet_summaries`, `facet_cross_filter`) ← **`scripts/build_frontend_derived.py`** (deterministic; geometry-agnostic; emits a manifest). `vocab_labels.parquet` ← `scripts/build_vocab_labels.py` (SKOS TTLs). Gated by `scripts/validate_frontend_derived.py` (algebraic + `--wide` semantic re-derivation) + `tests/test_frontend_derived.py` (fixtures, CI). | ✅ scripted; facet/map files semantic-tested; wide_h3 column-smoke-tested | | **5 Publish** | files → R2 + Worker | Worker `workers/data-isamples-org/src/index.js` (`wrangler deploy`); immutable cache for `isamples_\d{6}_*.parquet`; `/current/.parquet` → 302 via `current/manifest.json`. Bucket `isamples-ry` | ⚠️ Worker scripted; **file upload + manifest update are manual** | diff --git a/Makefile b/Makefile index bde1ddd..ce49483 100644 --- a/Makefile +++ b/Makefile @@ -2,31 +2,42 @@ # # make test # fast fixture tests (no network, no big data) — the CI gate # make wide # download + checksum the canonical wide parquet -# make derived # build the derived files from $(WIDE) into $(OUTDIR) +# make oc-wide # download + checksum Eric's OC PQG wide (concept source of truth, #272) +# make enrich # overlay OC material/object-type concepts onto $(WIDE) -> $(ENRICHED) +# make validate-enrich # independent trust gate for the enrichment (non-zero exit on failure) +# make derived # build the derived files from $(DERIVED_WIDE) into $(OUTDIR) # make validate # algebraic trust gate over the built files (non-zero exit on failure) -# make all # wide -> derived -> validate +# make all # wide -> derived -> validate (no enrichment) +# make all-272 # wide+oc-wide -> enrich -> validate-enrich -> derived -> validate # # Override on the command line, e.g.: -# make all WIDE_URL=https://data.isamples.org/isamples_202604_wide.parquet 
TAG=isamples_202606 +# make all-272 TAG=isamples_202606 # # Requirements: python with `pip install -r scripts/requirements.txt`, plus # network access on first run (DuckDB pulls the h3 community extension). PY ?= python WIDE_URL ?= https://data.isamples.org/isamples_202604_wide.parquet +OC_WIDE_URL ?= https://storage.googleapis.com/opencontext-parquet/oc_isamples_pqg_wide.parquet OUTDIR ?= build/derived WIDE ?= $(OUTDIR)/wide.parquet +OC_WIDE ?= $(OUTDIR)/oc_wide.parquet TAG ?= isamples_dev +ENRICHED ?= $(OUTDIR)/$(TAG)_wide.parquet +# derived files build from the plain wide by default; all-272 overrides to the enriched one +DERIVED_WIDE ?= $(WIDE) BUILD := scripts/build_frontend_derived.py VALIDATE := scripts/validate_frontend_derived.py +ENRICH := scripts/enrich_wide_with_oc_concepts.py +VALIDATE_ENRICH := scripts/validate_oc_concept_enrichment.py -.PHONY: help test wide derived validate all clean +.PHONY: help test wide oc-wide enrich validate-enrich derived validate all all-272 clean help: @grep -E '^# make' Makefile | sed 's/^# / /' # Fast, deterministic fixture tests — the gate a human (or CI) runs without any AI. 
test: - $(PY) -m pytest tests/test_frontend_derived.py -q + $(PY) -m pytest tests/test_frontend_derived.py tests/test_oc_concept_enrichment.py -q wide: $(WIDE) $(WIDE): @@ -34,13 +45,29 @@ $(WIDE): curl -fSL -o $(WIDE) "$(WIDE_URL)" @echo "sha256: $$(shasum -a 256 $(WIDE) | cut -d' ' -f1) $(WIDE)" +oc-wide: $(OC_WIDE) +$(OC_WIDE): + @mkdir -p $(OUTDIR) + curl -fSL -o $(OC_WIDE) "$(OC_WIDE_URL)" + @echo "sha256: $$(shasum -a 256 $(OC_WIDE) | cut -d' ' -f1) $(OC_WIDE)" + +enrich: $(WIDE) $(OC_WIDE) + $(PY) $(ENRICH) --src $(WIDE) --oc-wide $(OC_WIDE) --out $(ENRICHED) + +validate-enrich: + $(PY) $(VALIDATE_ENRICH) --src $(WIDE) --oc-wide $(OC_WIDE) --out $(ENRICHED) + derived: $(WIDE) - $(PY) $(BUILD) --wide $(WIDE) --outdir $(OUTDIR) --tag $(TAG) --skip wide_h3 + $(PY) $(BUILD) --wide $(DERIVED_WIDE) --outdir $(OUTDIR) --tag $(TAG) --skip wide_h3 validate: $(PY) $(VALIDATE) --dir $(OUTDIR) --tag $(TAG) all: wide derived validate +# Full #272 chain: enrich the wide with OC concepts, gate it, then build+gate derived. +all-272: wide oc-wide enrich validate-enrich + $(MAKE) derived validate DERIVED_WIDE=$(ENRICHED) TAG=$(TAG) + clean: rm -rf $(OUTDIR) diff --git a/scripts/enrich_wide_with_oc_concepts.py b/scripts/enrich_wide_with_oc_concepts.py new file mode 100644 index 0000000..75eb6b1 --- /dev/null +++ b/scripts/enrich_wide_with_oc_concepts.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python3 +"""Overlay OpenContext material/object-type concepts onto the unified wide parquet. + +Issue #272 (fixes #260): the unified wide derives from the FROZEN iSamples +Central export, whose vocab mappings for OpenContext samples are stale or wrong +(e.g. a ceramic carrying [anthropogenicmetal, biogenicnonorganicmaterial, rock]). +Eric Kansa's independently-maintained OC PQG carries the corrected mappings. 
+ +POLICY (RY decision, 2026-06-10, #272): **OC wins unconditionally for OC pids.** +For every MaterialSampleRecord pid present in the OC wide, this script REPLACES +`p__has_material_category` and `p__has_sample_object_type` with the OC values — +including when OC is less specific (root-only). The frozen export's "specific" +values for OC samples are exactly the class proven untrustworthy in #260. + +WHAT IT DOES (single DuckDB session, several SQL steps, deterministic): + 1. Extract per-pid ORDERED concept-URI lists for both dimensions from the OC + wide (array order is preserved — the frontend builder picks the first + non-root concept by array order). + 2. Map URIs -> target row_ids via the src wide's IdentifiedConcept rows + (duplicate-URI concepts resolve to MIN(row_id), deterministically). + 3. Mint NEW IdentifiedConcept rows for URIs the src wide lacks (e.g. + `otheranthropogenicmaterial` — absent from the frozen export entirely). + New row_ids = max(src row_id) + dense rank by URI (deterministic). + Labels/scheme metadata are carried from the OC concept rows. + 4. Write src rows with the two p__ columns replaced for overlay pids + (all other rows and columns byte-identical), UNION the new concept rows. An overlay pid is an OC pid with at least one concept ref; a dimension OC leaves empty becomes NULL. NOTE(review): an OC pid with no refs in either dimension keeps its src values, yet the validator appears to expect NULL there — confirm intended. + 5. Emit a {out}.manifest.json (inputs' sha256, counts, argv, git SHA). + +WHAT IT DOES *NOT* DO (scope, documented in #272): + - OC samples absent from the src wide (~75K new records) are NOT ingested — + overlay only. New-record ingestion is a follow-up. + - `p__has_context_category` is untouched (unverified against OC; follow-up). 
+ +HARD FAILURES (refuses to write): + - duplicate pids among OC MaterialSampleRecords (overlay grain would be wrong) + - any OC concept reference that does not resolve to an OC IdentifiedConcept row + - duplicate row_ids among src entity rows (entity grain would be wrong) + +Usage: + python scripts/enrich_wide_with_oc_concepts.py \ + --src isamples_202604_wide.parquet \ + --oc-wide oc_isamples_pqg_wide_2026-06-09.parquet \ + --out isamples_202606_wide.parquet + +Validate with scripts/validate_oc_concept_enrichment.py (independent re-derivation). +""" +import argparse +import hashlib +import json +import os +import subprocess +import sys +import time + +import duckdb + +# the two overlay dimensions, as wide column names (read from the OC wide, replaced in the src wide) +DIMS = ["p__has_material_category", "p__has_sample_object_type"] + + +def sha256_file(path, _bufsize=1 << 20):  # hex SHA-256 of the file at path, streamed in _bufsize-byte chunks + h = hashlib.sha256() + with open(path, "rb") as f: + for chunk in iter(lambda: f.read(_bufsize), b""): + h.update(chunk) + return h.hexdigest() + + +def git_sha():  # HEAD commit of the checkout containing this script, or None + try: + return subprocess.check_output( + ["git", "rev-parse", "HEAD"], + cwd=os.path.dirname(os.path.abspath(__file__)), + stderr=subprocess.DEVNULL).decode().strip() + except Exception:  # best-effort: any git failure just leaves the manifest's git_sha null + return None + + +def log(msg, t0):  # print msg prefixed with the seconds elapsed since t0, flushed immediately + print(f"[{time.time()-t0:6.1f}s] {msg}", flush=True) + + +def main():  # CLI entry point: validate inputs, build the overlay, write the enriched parquet and its manifest + ap = argparse.ArgumentParser() + ap.add_argument("--src", required=True, help="unified wide parquet (frozen-export lineage)") + ap.add_argument("--oc-wide", required=True, help="Eric's OC PQG wide parquet (source of truth for OC concepts)") + ap.add_argument("--out", required=True, help="output enriched wide parquet") + ap.add_argument("--no-manifest", action="store_true") + args = ap.parse_args() + + for fp in (args.src, args.oc_wide): + if not os.path.exists(fp): + sys.exit(f"FATAL: missing input {fp}") + if os.path.abspath(args.out) in (os.path.abspath(args.src), os.path.abspath(args.oc_wide)): + sys.exit("FATAL: --out must not overwrite an input") + + t0 = 
time.time() + con = duckdb.connect() + SRC = f"read_parquet('{args.src}')" + OC = f"read_parquet('{args.oc_wide}')" + + # ---- schema contract: src must carry both p__ columns; capture full column list + src_cols = [(r[0], r[1]) for r in con.sql(f"DESCRIBE SELECT * FROM {SRC}").fetchall()] + src_colnames = [c for c, _ in src_cols] + for d in DIMS: + if d not in src_colnames: + sys.exit(f"FATAL: src wide lacks required column {d}") + p_types = {c: t for c, t in src_cols if c in DIMS} + + # ---- grain checks (hard-fail before any writing) ---------------------- + n_dup_src_rowid = con.sql( + f"SELECT COUNT(*) FROM (SELECT row_id FROM {SRC} GROUP BY row_id HAVING COUNT(*)>1)").fetchone()[0] + n_dup_oc_pid = con.sql( + f"SELECT COUNT(*) FROM (SELECT pid FROM {OC} WHERE otype='MaterialSampleRecord' " + f"GROUP BY pid HAVING COUNT(*)>1)").fetchone()[0] + if n_dup_src_rowid or n_dup_oc_pid: + sys.exit(f"FATAL: non-unique keys — src duplicate row_ids={n_dup_src_rowid}, " + f"OC duplicate MSR pids={n_dup_oc_pid}. Refusing to write.") + + # ---- 1. OC per-pid ORDERED URI lists, both dims ------------------------ + # WITH ORDINALITY preserves OC's array order; every rid MUST resolve to an + # OC IdentifiedConcept row (unresolved refs are a hard error, not a drop). 
+ con.execute(f""" + CREATE TEMP TABLE oc_ref AS + SELECT s.pid, 'p__has_material_category' AS dim, u.rid, u.ord + FROM {OC} s, UNNEST(s.p__has_material_category) WITH ORDINALITY AS u(rid, ord) + WHERE s.otype='MaterialSampleRecord' + UNION ALL + SELECT s.pid, 'p__has_sample_object_type' AS dim, u.rid, u.ord + FROM {OC} s, UNNEST(s.p__has_sample_object_type) WITH ORDINALITY AS u(rid, ord) + WHERE s.otype='MaterialSampleRecord'; + + CREATE TEMP TABLE oc_concepts AS + SELECT row_id, pid AS uri, label, scheme_name, scheme_uri + FROM {OC} WHERE otype='IdentifiedConcept'; + """) + n_unresolved = con.sql(""" + SELECT COUNT(*) FROM oc_ref r LEFT JOIN oc_concepts c ON c.row_id = r.rid + WHERE c.row_id IS NULL""").fetchone()[0] + if n_unresolved: + sys.exit(f"FATAL: {n_unresolved} OC concept references do not resolve to an " + f"OC IdentifiedConcept row. OC file is internally inconsistent; refusing to write.") + + con.execute(""" + CREATE TEMP TABLE oc_uri_lists AS + SELECT r.pid, r.dim, list(c.uri ORDER BY r.ord, c.uri) AS uris + FROM oc_ref r JOIN oc_concepts c ON c.row_id = r.rid + GROUP BY r.pid, r.dim; + """) + log("OC ordered URI lists extracted", t0) + + # ---- 2. URI -> target row_id (existing concepts; MIN(row_id) on dup URIs) + con.execute(f""" + CREATE TEMP TABLE src_concept_map AS + SELECT pid AS uri, MIN(row_id) AS row_id + FROM {SRC} WHERE otype='IdentifiedConcept' GROUP BY pid; + """) + + # ---- 3. 
mint rows for missing URIs (deterministic ids: max + rank by URI) + max_row_id = con.sql(f"SELECT COALESCE(MAX(row_id), 0) FROM {SRC}").fetchone()[0] + con.execute(f""" + CREATE TEMP TABLE new_concepts AS + WITH missing AS ( + SELECT DISTINCT u.uri + FROM (SELECT DISTINCT unnest(uris) AS uri FROM oc_uri_lists) u + LEFT JOIN src_concept_map m ON m.uri = u.uri + WHERE m.uri IS NULL + ), + meta AS ( + -- deterministic metadata per URI (OC may carry duplicate concept rows) + SELECT uri, MIN(label) AS label, MIN(scheme_name) AS scheme_name, MIN(scheme_uri) AS scheme_uri + FROM oc_concepts GROUP BY uri + ) + SELECT {max_row_id} + ROW_NUMBER() OVER (ORDER BY missing.uri) AS row_id, + missing.uri, meta.label, meta.scheme_name, meta.scheme_uri + FROM missing JOIN meta ON meta.uri = missing.uri; + + CREATE TEMP TABLE uri_map AS + SELECT uri, row_id FROM src_concept_map + UNION ALL + SELECT uri, row_id FROM new_concepts; + """) + n_new = con.sql("SELECT COUNT(*) FROM new_concepts").fetchone()[0] + log(f"new IdentifiedConcept rows to mint: {n_new}", t0) + + # ---- 4. per-pid mapped row_id lists ------------------------------------ + con.execute(""" + CREATE TEMP TABLE overlay AS + SELECT l.pid, + MAX(CASE WHEN l.dim='p__has_material_category' THEN ids END) AS mat_ids, + MAX(CASE WHEN l.dim='p__has_sample_object_type' THEN ids END) AS obj_ids + FROM ( + SELECT ol.pid, ol.dim, list(m.row_id ORDER BY u.ord, m.row_id) AS ids + FROM oc_uri_lists ol, + UNNEST(ol.uris) WITH ORDINALITY AS u(uri, ord) + JOIN uri_map m ON m.uri = u.uri + GROUP BY ol.pid, ol.dim + ) l + GROUP BY l.pid; + """) + n_overlay = con.sql("SELECT COUNT(*) FROM overlay").fetchone()[0] + n_match = con.sql(f""" + SELECT COUNT(*) FROM overlay o + JOIN {SRC} s ON s.pid = o.pid AND s.otype='MaterialSampleRecord'""").fetchone()[0] + log(f"overlay pids={n_overlay:,} matched in src={n_match:,} " + f"unmatched (new OC records, NOT ingested)={n_overlay-n_match:,}", t0) + + # ---- 5. 
write: replaced src rows + minted concept rows ------------------ + # OC WINS UNCONDITIONALLY for matched MaterialSampleRecord pids — the two + # p__ columns become the OC-derived lists (NOT COALESCE). + new_concept_select = ", ".join( + { + "row_id": f"n.row_id::{dict(src_cols)['row_id']} AS row_id", + "pid": "n.uri AS pid", + "otype": "'IdentifiedConcept' AS otype", + "label": "n.label AS label", + "scheme_name": "n.scheme_name AS scheme_name", + "scheme_uri": "n.scheme_uri AS scheme_uri", + }.get(c, f"NULL::{t} AS {c}") + for c, t in src_cols + ) + replace_exprs = ", ".join( + f"(CASE WHEN ov.pid IS NOT NULL AND s.otype='MaterialSampleRecord' " + f"THEN ov.{alias}::{p_types[col]} ELSE s.{col} END) AS {col}" + for col, alias in [("p__has_material_category", "mat_ids"), + ("p__has_sample_object_type", "obj_ids")]) + con.execute(f""" + COPY ( + SELECT s.* REPLACE ({replace_exprs}) + FROM {SRC} s LEFT JOIN overlay ov ON ov.pid = s.pid + UNION ALL BY NAME + SELECT {new_concept_select} FROM new_concepts n + ORDER BY row_id + ) TO '{args.out}' (FORMAT PARQUET, COMPRESSION ZSTD) + """) + log(f"wrote {args.out}", t0) + + # ---- post-write accounting ---------------------------------------------- + OUT = f"read_parquet('{args.out}')" + n_src, n_out = (con.sql(f"SELECT COUNT(*) FROM {SRC}").fetchone()[0], + con.sql(f"SELECT COUNT(*) FROM {OUT}").fetchone()[0]) + if n_out != n_src + n_new: + sys.exit(f"FATAL: row count {n_out} != src {n_src} + new concepts {n_new}") + changed = con.sql(f""" + SELECT + COUNT(*) FILTER (WHERE s.p__has_material_category IS DISTINCT FROM o.p__has_material_category), + COUNT(*) FILTER (WHERE s.p__has_sample_object_type IS DISTINCT FROM o.p__has_sample_object_type) + FROM {SRC} s JOIN {OUT} o ON o.row_id = s.row_id + WHERE s.otype='MaterialSampleRecord'""").fetchone() + log(f"rows={n_out:,} (src {n_src:,} + {n_new} minted concepts) " + f"material changed={changed[0]:,} object_type changed={changed[1]:,}", t0) + + if not args.no_manifest: + 
manifest = { + "script": os.path.basename(__file__), + "argv": sys.argv, + "git_sha": git_sha(), + "duckdb_version": duckdb.__version__, + "policy": "OC wins unconditionally for OC pids (#272, RY 2026-06-10)", + "dims": DIMS, + "inputs": { + "src": {"path": args.src, "bytes": os.path.getsize(args.src), + "sha256": sha256_file(args.src)}, + "oc_wide": {"path": args.oc_wide, "bytes": os.path.getsize(args.oc_wide), + "sha256": sha256_file(args.oc_wide)}, + }, + "counts": { + "src_rows": n_src, "out_rows": n_out, + "overlay_pids": n_overlay, "overlay_matched": n_match, + "overlay_unmatched_new_oc_records": n_overlay - n_match, + "minted_concepts": n_new, + "material_changed": changed[0], "object_type_changed": changed[1], + }, + "output": {"path": args.out, "bytes": os.path.getsize(args.out), + "sha256": sha256_file(args.out)}, + } + mpath = args.out + ".manifest.json" + with open(mpath, "w") as fh: + json.dump(manifest, fh, indent=2) + log(f"manifest → {mpath}", t0) + + log("done", t0) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/validate_frontend_derived.py b/scripts/validate_frontend_derived.py index 4cd8ad7..9976e26 100755 --- a/scripts/validate_frontend_derived.py +++ b/scripts/validate_frontend_derived.py @@ -26,8 +26,16 @@ def sha256_file(path, _b=1 << 20): return h.hexdigest() MATERIAL_ROOT = "https://w3id.org/isample/vocabulary/material/1.0/material" -PID_K = "ark:/28722/k2p55x96j" # #260 sentinel: must not flip under #271 selection -PID_K_EXPECTED = "https://w3id.org/isample/vocabulary/material/1.0/anthropogenicmetal" +PID_K = "ark:/28722/k2p55x96j" # #260 sentinel (a ceramic) +# Expected sentinel material BY DATA VINTAGE: +# pre-#272 wides (frozen-export lineage, e.g. 202601/202604): the export's +# (wrong) value — anthropogenicmetal — because #271 only fixed SELECTION, +# not the data. The check then asserts selection didn't regress. 
+# post-#272 wides (OC-concept-enriched, 202606+): the corrected value from +# Eric's OC PQG — otheranthropogenicmaterial (see #260/#272). +# Default = the enriched expectation; validate a legacy build with +# --sentinel-material https://w3id.org/isample/vocabulary/material/1.0/anthropogenicmetal +PID_K_EXPECTED = "https://w3id.org/isample/vocabulary/material/1.0/otheranthropogenicmaterial" EXPECTED_SCHEMA = { "facets": [("pid", "VARCHAR"), ("source", "VARCHAR"), ("material", "VARCHAR"), @@ -46,6 +54,9 @@ def main(): "(re-derive and diff the written files against a fresh build)") ap.add_argument("--min-rows", type=int, default=1_000_000, help="floor for the non-empty sanity check (use 1 for fixtures)") + ap.add_argument("--sentinel-material", default=PID_K_EXPECTED, + help="expected material URI for the #260 sentinel pid; default is the " + "post-#272 (OC-enriched) value — override for pre-enrichment builds") a = ap.parse_args() def f(name, attr): @@ -80,12 +91,13 @@ def scalar(sql): check("material root absent", scalar(f"SELECT COUNT(*) FROM {F} WHERE material='{MATERIAL_ROOT}'") == 0, "facets rows with bare root material (want 0)") - # --- 2. #260 sentinel preserved (skip when the pid isn't in this dataset, e.g. fixtures) --- + # --- 2. #260 sentinel (skip when the pid isn't in this dataset, e.g. fixtures) --- row = con.sql(f"SELECT material FROM {F} WHERE pid='{PID_K}'").fetchone() if row is None: info.append(f"sentinel {PID_K} not present (N/A for this dataset)") else: - check(f"sentinel {PID_K} preserved", row[0] == PID_K_EXPECTED, f"got {row}") + check(f"sentinel {PID_K} == expected vintage value", row[0] == a.sentinel_material, + f"got {row}, expected {a.sentinel_material} (wrong --sentinel-material for this data vintage?)") # --- 3. 
PID uniqueness (browser relies on one row per pid) --- check("facets pid unique", scalar(f"SELECT COUNT(*) FROM (SELECT pid FROM {F} GROUP BY pid HAVING COUNT(*)>1)") == 0, diff --git a/scripts/validate_oc_concept_enrichment.py b/scripts/validate_oc_concept_enrichment.py new file mode 100644 index 0000000..c601018 --- /dev/null +++ b/scripts/validate_oc_concept_enrichment.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python3 +"""Independent trust gate for the OC concept enrichment (#272). + +Validates an enriched wide AGAINST ITS INPUTS — it re-derives the expected +result from (src, oc-wide) with its own SQL (deliberately NOT importing the +enrichment script) and asserts the written output matches. Exits non-zero on +any failure so it can gate a publish. + +What a wrong output looks like and which check catches it: + - an OC pid kept the frozen export's junk materials -> overlay-applied + - URI list order scrambled (changes facet selection) -> overlay-applied (order-sensitive) + - a non-OC row was modified -> non-overlay untouched + - rows dropped/duplicated -> row accounting / grain + - minted concept missing, wrong id, or extra rows -> minted concepts exact + - #260 ceramic still "anthropogenic metal" -> sentinel + +Usage: + python scripts/validate_oc_concept_enrichment.py \ + --src isamples_202604_wide.parquet \ + --oc-wide oc_isamples_pqg_wide_2026-06-09.parquet \ + --out isamples_202606_wide.parquet +""" +import argparse +import hashlib +import json +import os +import sys + +import duckdb + +SENTINEL_PID = "ark:/28722/k2p55x96j" # #260: ceramic, must carry otheranthropogenicmaterial +SENTINEL_MATERIAL = "https://w3id.org/isample/vocabulary/material/1.0/otheranthropogenicmaterial" +DIMS = [("p__has_material_category", "mat"), ("p__has_sample_object_type", "obj")] + + +def sha256_file(path, _b=1 << 20): + h = hashlib.sha256() + with open(path, "rb") as f: + for chunk in iter(lambda: f.read(_b), b""): + h.update(chunk) + return h.hexdigest() + + +def main(): + ap = 
argparse.ArgumentParser() + ap.add_argument("--src", required=True) + ap.add_argument("--oc-wide", required=True) + ap.add_argument("--out", required=True) + a = ap.parse_args() + + con = duckdb.connect() + SRC = f"read_parquet('{a.src}')" + OC = f"read_parquet('{a.oc_wide}')" + OUT = f"read_parquet('{a.out}')" + + R, info = [], [] + def check(name, passed, detail=""): + R.append((name, bool(passed), detail)) + def scalar(sql): + return con.sql(sql).fetchone()[0] + + # ---- expected per-pid ORDERED URI lists from OC (independent derivation) + con.execute(f""" + CREATE TEMP TABLE exp_oc AS + SELECT s.pid, + (SELECT list(c.pid ORDER BY u.ord, c.pid) + FROM UNNEST(s.p__has_material_category) WITH ORDINALITY AS u(rid, ord) + JOIN {OC} c ON c.row_id=u.rid AND c.otype='IdentifiedConcept') AS mat_uris, + (SELECT list(c.pid ORDER BY u.ord, c.pid) + FROM UNNEST(s.p__has_sample_object_type) WITH ORDINALITY AS u(rid, ord) + JOIN {OC} c ON c.row_id=u.rid AND c.otype='IdentifiedConcept') AS obj_uris + FROM {OC} s WHERE s.otype='MaterialSampleRecord'; + """) + + # ---- actual per-pid URI lists in OUT (resolve out's row_ids -> out's concepts) + con.execute(f""" + CREATE TEMP TABLE out_concepts AS + SELECT row_id, pid AS uri FROM {OUT} WHERE otype='IdentifiedConcept'; + CREATE TEMP TABLE act_out AS + SELECT s.pid, + (SELECT list(c.uri ORDER BY u.ord, c.uri) + FROM UNNEST(s.p__has_material_category) WITH ORDINALITY AS u(rid, ord) + JOIN out_concepts c ON c.row_id=u.rid) AS mat_uris, + (SELECT list(c.uri ORDER BY u.ord, c.uri) + FROM UNNEST(s.p__has_sample_object_type) WITH ORDINALITY AS u(rid, ord) + JOIN out_concepts c ON c.row_id=u.rid) AS obj_uris + FROM {OUT} s + WHERE s.otype='MaterialSampleRecord' AND s.pid IN (SELECT pid FROM exp_oc); + """) + + # --- 1. 
overlay applied, order-sensitive, both dims --------------------- + n_expected_in_src = scalar(f""" + SELECT COUNT(*) FROM exp_oc e + JOIN {SRC} s ON s.pid=e.pid AND s.otype='MaterialSampleRecord'""") + bad_overlay = scalar(""" + SELECT COUNT(*) FROM exp_oc e JOIN act_out o ON o.pid=e.pid + WHERE e.mat_uris IS DISTINCT FROM o.mat_uris + OR e.obj_uris IS DISTINCT FROM o.obj_uris""") + n_act = scalar("SELECT COUNT(*) FROM act_out") + check("overlay applied (OC == OUT, ordered, both dims)", bad_overlay == 0, + f"{bad_overlay} overlay pids differ from OC expectation") + check("all matched OC pids present in OUT", n_act == n_expected_in_src, + f"act={n_act} expected={n_expected_in_src}") + + # --- 2. non-overlay rows byte-identical --------------------------------- + cols = ", ".join(r[0] for r in con.sql(f"DESCRIBE SELECT * FROM {SRC}").fetchall()) + untouched_src = (f"SELECT {cols} FROM {SRC} WHERE pid NOT IN (SELECT pid FROM exp_oc) " + f"OR otype <> 'MaterialSampleRecord'") + untouched_out = (f"SELECT {cols} FROM {OUT} WHERE row_id <= (SELECT MAX(row_id) FROM {SRC}) " + f"AND (pid NOT IN (SELECT pid FROM exp_oc) OR otype <> 'MaterialSampleRecord')") + diff = scalar(f"""SELECT + (SELECT COUNT(*) FROM (({untouched_src}) EXCEPT ALL ({untouched_out}))) + + (SELECT COUNT(*) FROM (({untouched_out}) EXCEPT ALL ({untouched_src})))""") + check("non-overlay rows untouched", diff == 0, f"{diff} row diffs outside the overlay") + + # --- 3. 
minted concepts: exactly the missing URIs, ids beyond src max ---- + max_src = scalar(f"SELECT COALESCE(MAX(row_id),0) FROM {SRC}") + minted_diff = scalar(f""" + WITH oc_uris AS ( + SELECT DISTINCT unnest(mat_uris) AS uri FROM exp_oc + UNION SELECT DISTINCT unnest(obj_uris) FROM exp_oc), + missing AS ( + SELECT uri FROM oc_uris + WHERE uri NOT IN (SELECT pid FROM {SRC} WHERE otype='IdentifiedConcept') + AND uri IS NOT NULL), + minted AS (SELECT pid AS uri FROM {OUT} WHERE row_id > {max_src}) + SELECT + (SELECT COUNT(*) FROM (SELECT uri FROM missing EXCEPT SELECT uri FROM minted)) + + (SELECT COUNT(*) FROM (SELECT uri FROM minted EXCEPT SELECT uri FROM missing))""") + check("minted concepts == exactly the missing URIs", minted_diff == 0, + f"{minted_diff} URI mismatches between minted rows and missing set") + bad_minted_type = scalar( + f"SELECT COUNT(*) FROM {OUT} WHERE row_id > {max_src} AND otype <> 'IdentifiedConcept'") + check("minted rows are IdentifiedConcept", bad_minted_type == 0, f"{bad_minted_type} bad otype") + + # --- 4. grain + accounting ---------------------------------------------- + n_src, n_out = scalar(f"SELECT COUNT(*) FROM {SRC}"), scalar(f"SELECT COUNT(*) FROM {OUT}") + n_minted = scalar(f"SELECT COUNT(*) FROM {OUT} WHERE row_id > {max_src}") + check("row accounting (out == src + minted)", n_out == n_src + n_minted, + f"out={n_out:,} src={n_src:,} minted={n_minted}") + dup = scalar(f"SELECT COUNT(*) FROM (SELECT row_id FROM {OUT} GROUP BY row_id HAVING COUNT(*)>1)") + check("out row_id unique", dup == 0, f"{dup} duplicate row_ids") + + # --- 5. 
every referenced concept resolves in OUT ------------------------- + dangling = scalar(f""" + SELECT COUNT(*) FROM ( + SELECT u.rid FROM {OUT} s, UNNEST(s.p__has_material_category) AS u(rid) + WHERE s.otype='MaterialSampleRecord' + UNION ALL + SELECT u.rid FROM {OUT} s, UNNEST(s.p__has_sample_object_type) AS u(rid) + WHERE s.otype='MaterialSampleRecord') refs + LEFT JOIN out_concepts c ON c.row_id = refs.rid + WHERE c.row_id IS NULL""") + check("no dangling concept references in OUT", dangling == 0, f"{dangling} dangling refs") + + # --- 6. #260 sentinel ---------------------------------------------------- + row = con.sql(f"SELECT mat_uris FROM act_out WHERE pid='{SENTINEL_PID}'").fetchone() + if row is None: + info.append(f"sentinel {SENTINEL_PID} not present (N/A for this dataset)") + else: + check(f"sentinel {SENTINEL_PID} == [{SENTINEL_MATERIAL.rsplit('/',1)[1]}]", + row[0] == [SENTINEL_MATERIAL], f"got {row[0]}") + + # --- 7. manifest integrity (if present) ---------------------------------- + mpath = a.out + ".manifest.json" + if os.path.exists(mpath): + try: + man = json.load(open(mpath)) + except Exception as e: + man = None + check("manifest parses", False, f"unreadable: {e}") + if man: + check("manifest output sha256 matches file", + man.get("output", {}).get("sha256") == sha256_file(a.out), + "out file does not match its manifest") + for key, path in (("src", a.src), ("oc_wide", a.oc_wide)): + msha = man.get("inputs", {}).get(key, {}).get("sha256") + if msha: + check(f"manifest input sha256 matches --{key.replace('_','-')}", + msha == sha256_file(path), f"{key} sha mismatch") + else: + info.append("no .manifest.json next to --out (manifest verification skipped)") + + print(f"\n{'CHECK':<52} {'RESULT':<6} DETAIL\n" + "-" * 100) + ok = True + for name, passed, detail in R: + ok = ok and passed + print(f"{name:<52} {'PASS' if passed else 'FAIL':<6} {detail}") + print("-" * 100) + for line in info: + print(" info:", line) + print("\n" + ("ALL CHECKS PASS" if 
ok else "FAILURES PRESENT")) + sys.exit(0 if ok else 1) + + +if __name__ == "__main__": + main() diff --git a/tests/test_oc_concept_enrichment.py b/tests/test_oc_concept_enrichment.py new file mode 100644 index 0000000..3908954 --- /dev/null +++ b/tests/test_oc_concept_enrichment.py @@ -0,0 +1,307 @@ +"""Fast, AI-free fixture tests for the OC concept enrichment (#272, fixes #260). + +Builds tiny synthetic src-wide + oc-wide parquet pairs, runs the real +enrichment script + independent validator against them, and asserts the +contract — especially the production cases: + - OC wins UNCONDITIONALLY (incl. OC root-only replacing src "specifics") + - array ORDER preserved (frontend picks first non-root by order) + - concepts missing from src are minted deterministically (#260's + otheranthropogenicmaterial was absent from the frozen export entirely) + - non-OC rows and non-overlay columns byte-identical + - determinism: same inputs -> bit-identical output + - validator FAILS on tampered output (overlay reverted; concept row dropped) + - hard failure on duplicate OC pids / unresolved OC concept refs + +Run: pytest tests/test_oc_concept_enrichment.py -q (needs: duckdb) +""" +import hashlib +import json +import os +import subprocess +import sys + +import duckdb +import pytest + +HERE = os.path.dirname(os.path.abspath(__file__)) +REPO = os.path.dirname(HERE) +ENRICH = os.path.join(REPO, "scripts", "enrich_wide_with_oc_concepts.py") +VALIDATE = os.path.join(REPO, "scripts", "validate_oc_concept_enrichment.py") + +MAT = "https://w3id.org/isample/vocabulary/material/1.0/" +OBJ = "https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/" +ROOT = MAT + "material" + +# ---- src wide fixture ------------------------------------------------------- +# concepts present in src (row_id, uri). NOTE: 'otheranthropogenicmaterial' +# is deliberately ABSENT (mirrors the frozen export). 
+SRC_CONCEPTS = [ + (101, ROOT), + (102, MAT + "anthropogenicmetal"), + (103, MAT + "rock"), + (104, MAT + "biogenicnonorganicmaterial"), + (105, OBJ + "artifact"), + (106, OBJ + "othersolidobject"), +] +# (row_id, pid, mat_ids, obj_ids) — src MaterialSampleRecords +SRC_SAMPLES = [ + # the #260 shape: ceramic with the junk trio; OC will correct it + (1, "ark:/28722/k2p55x96j", [102, 104, 103], [105]), + # OC will replace specifics with ROOT-only (unconditional-win case) + (2, "ark:/28722/rootonly", [103, 104], [105]), + # OC sample whose URI list ORDER differs from src order + (3, "ark:/28722/order", [103, 102], [106]), + # NOT in OC -> must remain byte-identical + (4, "igsn:NONOC1", [103], [106]), +] +# a non-MSR entity row that shares NOTHING with the overlay -> untouched +SRC_OTHER = [(50, "event-1", "SamplingEvent")] + +# ---- oc wide fixture -------------------------------------------------------- +OC_CONCEPTS = [ + (9001, MAT + "otheranthropogenicmaterial", "Other anthropogenic material", + "iSamples Materials Vocabulary", MAT + "materialsvocabulary"), + (9002, ROOT, "Material", "iSamples Materials Vocabulary", MAT + "materialsvocabulary"), + (9003, MAT + "rock", "Rock", "iSamples Materials Vocabulary", MAT + "materialsvocabulary"), + (9004, OBJ + "artifact", "Artifact", None, None), + (9005, MAT + "organicmaterial", "Organic material", None, None), +] +# (pid, mat_ids, obj_ids) +OC_SAMPLES = [ + ("ark:/28722/k2p55x96j", [9001], [9004]), # the #260 correction + ("ark:/28722/rootonly", [9002], [9004]), # OC root-only wins + ("ark:/28722/order", [9003, 9001, 9005], [9004]), # order must survive + ("ark:/28722/newrecord", [9001], [9004]), # NOT in src -> not ingested +] + +SRC_NULL_COLS = [ # (name, sqltype) — non-overlay columns carried in the fixture + ("label", "VARCHAR"), ("description", "VARCHAR"), ("thumbnail_url", "VARCHAR"), + ("scheme_name", "VARCHAR"), ("scheme_uri", "VARCHAR"), + ("p__has_context_category", "BIGINT[]"), ("p__keywords", "BIGINT[]"), +] 
+ + +def _arr(xs, t="BIGINT[]"): + return f"NULL::{t}" if xs is None else "[" + ",".join(str(x) for x in xs) + f"]::{t}" + + +def _null_cols(overrides=None): + o = overrides or {} + return ", ".join( + f"{o[c]} AS {c}" if c in o else f"NULL::{t} AS {c}" for c, t in SRC_NULL_COLS) + + +def build_src(path): + con = duckdb.connect() + rows = [] + for rid, uri in SRC_CONCEPTS: + rows.append( + f"SELECT {rid}::BIGINT AS row_id, '{uri}' AS pid, 'IdentifiedConcept' AS otype, " + f"NULL::VARCHAR AS n, NULL::BIGINT[] AS p__has_material_category, " + f"NULL::BIGINT[] AS p__has_sample_object_type, " + + _null_cols({"label": f"'{uri}'"})) + for rid, pid, mats, objs in SRC_SAMPLES: + rows.append( + f"SELECT {rid}::BIGINT, '{pid}', 'MaterialSampleRecord', 'TEST', " + f"{_arr(mats)}, {_arr(objs)}, " + + _null_cols({"label": f"'label {pid}'", "description": f"'desc {pid}'", + "thumbnail_url": f"'https://t/{rid}.jpg'", + "p__keywords": "[101]::BIGINT[]"})) + for rid, pid, otype in SRC_OTHER: + rows.append( + f"SELECT {rid}::BIGINT, '{pid}', '{otype}', NULL, " + f"NULL::BIGINT[], NULL::BIGINT[], " + _null_cols()) + con.execute(f"COPY ({' UNION ALL '.join(rows)}) TO '{path}' (FORMAT PARQUET)") + con.close() + + +def build_oc(path, samples=None, concepts=None, extra_msr_sql=None): + con = duckdb.connect() + rows = [] + for rid, uri, label, sname, suri in (concepts or OC_CONCEPTS): + rows.append( + f"SELECT {rid}::INTEGER AS row_id, '{uri}' AS pid, 'IdentifiedConcept' AS otype, " + f"{'NULL' if label is None else repr(label)}::VARCHAR AS label, " + f"{'NULL' if sname is None else repr(sname)}::VARCHAR AS scheme_name, " + f"{'NULL' if suri is None else repr(suri)}::VARCHAR AS scheme_uri, " + f"NULL::INTEGER[] AS p__has_material_category, NULL::INTEGER[] AS p__has_sample_object_type") + for pid, mats, objs in (samples or OC_SAMPLES): + rows.append( + f"SELECT NULL::INTEGER, '{pid}', 'MaterialSampleRecord', NULL::VARCHAR, " + f"NULL::VARCHAR, NULL::VARCHAR, {_arr(mats, 'INTEGER[]')}, 
{_arr(objs, 'INTEGER[]')}") + if extra_msr_sql: + rows.append(extra_msr_sql) + con.execute(f"COPY ({' UNION ALL '.join(rows)}) TO '{path}' (FORMAT PARQUET)") + con.close() + + +def run_enrich(src, oc, out, no_manifest=False): + cmd = [sys.executable, ENRICH, "--src", src, "--oc-wide", oc, "--out", out] + if no_manifest: + cmd.append("--no-manifest") + return subprocess.run(cmd, capture_output=True, text=True) + + +def run_validate(src, oc, out): + return subprocess.run( + [sys.executable, VALIDATE, "--src", src, "--oc-wide", oc, "--out", out], + capture_output=True, text=True) + + +def mats_of(out, pid): + con = duckdb.connect() + r = con.sql(f""" + SELECT (SELECT list(c.pid ORDER BY u.ord) + FROM UNNEST(s.p__has_material_category) WITH ORDINALITY u(rid, ord) + JOIN read_parquet('{out}') c ON c.row_id=u.rid AND c.otype='IdentifiedConcept') + FROM read_parquet('{out}') s + WHERE s.pid='{pid}' AND s.otype='MaterialSampleRecord'""").fetchone() + con.close() + return r[0] if r else None + + +@pytest.fixture +def pair(tmp_path): + src, oc, out = (str(tmp_path / n) for n in ("src.parquet", "oc.parquet", "out.parquet")) + build_src(src) + build_oc(oc) + return src, oc, out + + +def test_overlay_corrects_260_shape(pair): + src, oc, out = pair + r = run_enrich(src, oc, out) + assert r.returncode == 0, r.stderr + r.stdout + assert mats_of(out, "ark:/28722/k2p55x96j") == [MAT + "otheranthropogenicmaterial"] + + +def test_oc_root_only_wins_unconditionally(pair): + src, oc, out = pair + assert run_enrich(src, oc, out).returncode == 0 + # src had [rock, biogenic...]; OC says root-only -> root-only WINS (#272 policy) + assert mats_of(out, "ark:/28722/rootonly") == [ROOT] + + +def test_uri_order_preserved(pair): + src, oc, out = pair + assert run_enrich(src, oc, out).returncode == 0 + assert mats_of(out, "ark:/28722/order") == [ + MAT + "rock", MAT + "otheranthropogenicmaterial", MAT + "organicmaterial"] + + +def test_non_oc_rows_untouched_and_new_records_not_ingested(pair): + 
src, oc, out = pair + assert run_enrich(src, oc, out).returncode == 0 + con = duckdb.connect() + # non-OC sample + SamplingEvent rows byte-identical + diff = con.sql(f""" + SELECT (SELECT COUNT(*) FROM ((SELECT * FROM read_parquet('{src}') WHERE pid IN ('igsn:NONOC1','event-1')) + EXCEPT ALL (SELECT * FROM read_parquet('{out}') WHERE pid IN ('igsn:NONOC1','event-1')))) + + (SELECT COUNT(*) FROM ((SELECT * FROM read_parquet('{out}') WHERE pid IN ('igsn:NONOC1','event-1')) + EXCEPT ALL (SELECT * FROM read_parquet('{src}') WHERE pid IN ('igsn:NONOC1','event-1')))) + """).fetchone()[0] + assert diff == 0 + # OC-only sample NOT ingested + n = con.sql(f"SELECT COUNT(*) FROM read_parquet('{out}') WHERE pid='ark:/28722/newrecord'").fetchone()[0] + assert n == 0 + con.close() + + +def test_minted_concepts_deterministic_ids_and_metadata(pair): + src, oc, out = pair + assert run_enrich(src, oc, out).returncode == 0 + con = duckdb.connect() + minted = con.sql(f""" + SELECT row_id, pid, label, scheme_name FROM read_parquet('{out}') + WHERE row_id > 106 ORDER BY row_id""").fetchall() + con.close() + # missing URIs sorted: organicmaterial < otheranthropogenicmaterial + assert [m[1] for m in minted] == [MAT + "organicmaterial", MAT + "otheranthropogenicmaterial"] + assert [m[0] for m in minted] == [107, 108] + assert minted[1][2] == "Other anthropogenic material" + assert minted[1][3] == "iSamples Materials Vocabulary" + + +def test_determinism_bit_identical(pair): + src, oc, out = pair + out2 = out.replace("out.parquet", "out2.parquet") + assert run_enrich(src, oc, out, no_manifest=True).returncode == 0 + assert run_enrich(src, oc, out2, no_manifest=True).returncode == 0 + h = lambda p: hashlib.sha256(open(p, "rb").read()).hexdigest() + assert h(out) == h(out2) + + +def test_manifest_written_with_counts(pair): + src, oc, out = pair + assert run_enrich(src, oc, out).returncode == 0 + man = json.load(open(out + ".manifest.json")) + c = man["counts"] + assert c["overlay_pids"] == 4 
and c["overlay_matched"] == 3 + assert c["overlay_unmatched_new_oc_records"] == 1 + assert c["minted_concepts"] == 2 + assert man["inputs"]["src"]["sha256"] and man["output"]["sha256"] + + +def test_validator_passes_on_good_output(pair): + src, oc, out = pair + assert run_enrich(src, oc, out).returncode == 0 + r = run_validate(src, oc, out) + assert r.returncode == 0, r.stdout + r.stderr + + +def test_validator_fails_on_reverted_overlay(pair, tmp_path): + """Adversary: a 'rebuild' that silently kept the src junk values must FAIL.""" + src, oc, out = pair + assert run_enrich(src, oc, out).returncode == 0 + tampered = str(tmp_path / "tampered.parquet") + con = duckdb.connect() + con.execute(f""" + COPY ( + SELECT o.* REPLACE ( + (CASE WHEN o.pid='ark:/28722/k2p55x96j' AND o.otype='MaterialSampleRecord' + THEN [102,104,103]::BIGINT[] ELSE o.p__has_material_category END) AS p__has_material_category) + FROM read_parquet('{out}') o ORDER BY row_id + ) TO '{tampered}' (FORMAT PARQUET, COMPRESSION ZSTD)""") + con.close() + r = run_validate(src, oc, tampered) + assert r.returncode != 0 + assert "overlay applied" in r.stdout + + +def test_validator_fails_on_dropped_minted_concept(pair, tmp_path): + src, oc, out = pair + assert run_enrich(src, oc, out).returncode == 0 + tampered = str(tmp_path / "dropped.parquet") + con = duckdb.connect() + con.execute(f"""COPY (SELECT * FROM read_parquet('{out}') WHERE row_id IS NULL OR row_id <> 108 + ORDER BY row_id) TO '{tampered}' (FORMAT PARQUET, COMPRESSION ZSTD)""") + con.close() + r = run_validate(src, oc, tampered) + assert r.returncode != 0 + + +def test_hard_fail_on_duplicate_oc_pids(pair): + src, oc, out = pair + dup_oc = oc.replace("oc.parquet", "oc_dup.parquet") + build_oc(dup_oc, samples=OC_SAMPLES + [("ark:/28722/k2p55x96j", [9003], [9004])]) + r = run_enrich(src, dup_oc, out) + assert r.returncode != 0 + assert "duplicate" in (r.stderr + r.stdout) + assert not os.path.exists(out) + + +def 
test_hard_fail_on_unresolved_oc_concept_ref(pair): + src, oc, out = pair + bad_oc = oc.replace("oc.parquet", "oc_bad.parquet") + build_oc(bad_oc, samples=[("ark:/28722/k2p55x96j", [99999], [9004])]) + r = run_enrich(src, bad_oc, out) + assert r.returncode != 0 + assert "resolve" in (r.stderr + r.stdout) + assert not os.path.exists(out) + + +def test_refuses_to_overwrite_input(pair): + src, oc, _ = pair + r = run_enrich(src, oc, src) + assert r.returncode != 0 + assert "overwrite" in (r.stderr + r.stdout) From 55dc100d6c72d8fed15fd584d7397d235d5d4e88 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 10 Jun 2026 12:47:53 -0700 Subject: [PATCH 2/6] =?UTF-8?q?enrich:=20close=20Codex=20round-1=20finding?= =?UTF-8?q?s=20=E2=80=94=20validator=20set/hash=20gates=20+=20legacy=20sen?= =?UTF-8?q?tinel?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - validator: ALL src rows now compared on ALL non-replaced columns (keyed row_id hash join — scales to 20.7M, no full-table EXCEPT) [BLOCKER 1 + perf 4] - validator: overlay pid SET equality + distinctness; sentinel absence is a FAILURE when present in inputs (was a silent N/A) [BLOCKER 2] - validator: minted rows must carry OC label/scheme metadata - Makefile: legacy chain passes --sentinel-material (pre-#272 value); all-272 clears it to use the enriched default [MAJOR 3] - enrich: document []->NULL normalization (pqg #8 convention) [MINOR 5] - tests: +3 — both Codex attacks reproduced (verified to fool the OLD validator, caught by the fixed one) + empty-array normalization pin Co-Authored-By: Claude Fable 5 --- Makefile | 10 ++- scripts/enrich_wide_with_oc_concepts.py | 5 ++ scripts/validate_oc_concept_enrichment.py | 87 +++++++++++++++++------ tests/test_oc_concept_enrichment.py | 59 +++++++++++++++ 4 files changed, 138 insertions(+), 23 deletions(-) diff --git a/Makefile b/Makefile index ce49483..03b2386 100644 --- a/Makefile +++ b/Makefile @@ -60,14 +60,20 @@ validate-enrich: 
derived: $(WIDE) $(PY) $(BUILD) --wide $(DERIVED_WIDE) --outdir $(OUTDIR) --tag $(TAG) --skip wide_h3 +# Sentinel expectation tracks data vintage: the plain (non-enriched) chain +# validates a frozen-export wide -> legacy value; the all-272 chain overrides +# to the OC-corrected default baked into the validator. +LEGACY_SENTINEL := https://w3id.org/isample/vocabulary/material/1.0/anthropogenicmetal +SENTINEL_FLAG ?= --sentinel-material $(LEGACY_SENTINEL) + validate: - $(PY) $(VALIDATE) --dir $(OUTDIR) --tag $(TAG) + $(PY) $(VALIDATE) --dir $(OUTDIR) --tag $(TAG) $(SENTINEL_FLAG) all: wide derived validate # Full #272 chain: enrich the wide with OC concepts, gate it, then build+gate derived. all-272: wide oc-wide enrich validate-enrich - $(MAKE) derived validate DERIVED_WIDE=$(ENRICHED) TAG=$(TAG) + $(MAKE) derived validate DERIVED_WIDE=$(ENRICHED) TAG=$(TAG) SENTINEL_FLAG= clean: rm -rf $(OUTDIR) diff --git a/scripts/enrich_wide_with_oc_concepts.py b/scripts/enrich_wide_with_oc_concepts.py index 75eb6b1..09e6201 100644 --- a/scripts/enrich_wide_with_oc_concepts.py +++ b/scripts/enrich_wide_with_oc_concepts.py @@ -31,6 +31,11 @@ overlay only. New-record ingestion is a follow-up. - `p__has_context_category` is untouched (unverified against OC; follow-up). +NORMALIZATION: an EMPTY OC array (`[]`) becomes NULL in the output — this is +deliberate and matches the wide-format convention that p__* columns are NULL +when no relationship exists (pqg issue #8). In practice all 1.11M OC +MaterialSampleRecords carry non-empty arrays for both dims. 
+ HARD FAILURES (refuses to write): - duplicate pids among OC MaterialSampleRecords (overlay grain would be wrong) - any OC concept reference that does not resolve to an OC IdentifiedConcept row diff --git a/scripts/validate_oc_concept_enrichment.py b/scripts/validate_oc_concept_enrichment.py index c601018..2df3417 100644 --- a/scripts/validate_oc_concept_enrichment.py +++ b/scripts/validate_oc_concept_enrichment.py @@ -88,30 +88,55 @@ def scalar(sql): WHERE s.otype='MaterialSampleRecord' AND s.pid IN (SELECT pid FROM exp_oc); """) - # --- 1. overlay applied, order-sensitive, both dims --------------------- - n_expected_in_src = scalar(f""" - SELECT COUNT(*) FROM exp_oc e - JOIN {SRC} s ON s.pid=e.pid AND s.otype='MaterialSampleRecord'""") + # --- 1. overlay applied, order-sensitive, both dims; pid SET equality ---- + # (Codex round-1 BLOCKER: an inner join + count let a duplicated-pid / + # dropped-sentinel output pass. Use set EXCEPTs, not counts.) bad_overlay = scalar(""" SELECT COUNT(*) FROM exp_oc e JOIN act_out o ON o.pid=e.pid WHERE e.mat_uris IS DISTINCT FROM o.mat_uris OR e.obj_uris IS DISTINCT FROM o.obj_uris""") - n_act = scalar("SELECT COUNT(*) FROM act_out") check("overlay applied (OC == OUT, ordered, both dims)", bad_overlay == 0, f"{bad_overlay} overlay pids differ from OC expectation") - check("all matched OC pids present in OUT", n_act == n_expected_in_src, - f"act={n_act} expected={n_expected_in_src}") - - # --- 2. 
non-overlay rows byte-identical --------------------------------- - cols = ", ".join(r[0] for r in con.sql(f"DESCRIBE SELECT * FROM {SRC}").fetchall()) - untouched_src = (f"SELECT {cols} FROM {SRC} WHERE pid NOT IN (SELECT pid FROM exp_oc) " - f"OR otype <> 'MaterialSampleRecord'") - untouched_out = (f"SELECT {cols} FROM {OUT} WHERE row_id <= (SELECT MAX(row_id) FROM {SRC}) " - f"AND (pid NOT IN (SELECT pid FROM exp_oc) OR otype <> 'MaterialSampleRecord')") - diff = scalar(f"""SELECT - (SELECT COUNT(*) FROM (({untouched_src}) EXCEPT ALL ({untouched_out}))) + - (SELECT COUNT(*) FROM (({untouched_out}) EXCEPT ALL ({untouched_src})))""") - check("non-overlay rows untouched", diff == 0, f"{diff} row diffs outside the overlay") + pid_set_diff = scalar(f""" + WITH expected AS ( + SELECT e.pid FROM exp_oc e + JOIN {SRC} s ON s.pid=e.pid AND s.otype='MaterialSampleRecord') + SELECT (SELECT COUNT(*) FROM (SELECT pid FROM expected EXCEPT SELECT pid FROM act_out)) + + (SELECT COUNT(*) FROM (SELECT pid FROM act_out EXCEPT SELECT pid FROM expected))""") + check("overlay pid SET == (OC ∩ src) pid SET", pid_set_diff == 0, + f"{pid_set_diff} pids differ between expected and actual overlay sets") + dup_overlay_pid = scalar( + "SELECT COUNT(*) FROM (SELECT pid FROM act_out GROUP BY pid HAVING COUNT(*)>1)") + check("overlay pids distinct in OUT", dup_overlay_pid == 0, + f"{dup_overlay_pid} duplicated overlay pids in OUT") + + # --- 2. ALL src rows present + ALL non-replaced columns identical -------- + # (Codex round-1 BLOCKER: comparing only non-overlay rows let an output + # null label/thumbnail/geometry on overlay rows and still pass. Compare + # EVERY src row on EVERY column except the two replaced arrays — keyed by + # row_id with row hashes, not full-table EXCEPT, so it scales to 20.7M.) 
+ keep_cols = [r[0] for r in con.sql(f"DESCRIBE SELECT * FROM {SRC}").fetchall() + if r[0] not in ("p__has_material_category", "p__has_sample_object_type")] + keep_expr = ", ".join(keep_cols) + missing_rows = scalar(f""" + SELECT COUNT(*) FROM {SRC} s LEFT JOIN {OUT} o ON o.row_id = s.row_id + WHERE o.row_id IS NULL""") + check("every src row_id present in OUT", missing_rows == 0, f"{missing_rows} src rows missing") + col_diff = scalar(f""" + WITH sh AS (SELECT row_id, hash(ROW({keep_expr})) AS h FROM {SRC}), + oh AS (SELECT row_id, hash(ROW({keep_expr})) AS h FROM {OUT}) + SELECT COUNT(*) FROM sh JOIN oh ON oh.row_id = sh.row_id WHERE sh.h <> oh.h""") + check("all non-replaced columns identical (every src row)", col_diff == 0, + f"{col_diff} rows differ outside the two replaced arrays") + # the two replaced arrays: must equal src for everything that is NOT an + # overlay MaterialSampleRecord (overlay rows are covered by check 1). + arr_diff = scalar(f""" + SELECT COUNT(*) FROM {SRC} s JOIN {OUT} o ON o.row_id = s.row_id + WHERE NOT (s.otype='MaterialSampleRecord' AND s.pid IN (SELECT pid FROM exp_oc)) + AND (s.p__has_material_category IS DISTINCT FROM o.p__has_material_category + OR s.p__has_sample_object_type IS DISTINCT FROM o.p__has_sample_object_type)""") + check("replaced arrays untouched outside the overlay", arr_diff == 0, + f"{arr_diff} non-overlay rows had their concept arrays modified") # --- 3. 
minted concepts: exactly the missing URIs, ids beyond src max ---- max_src = scalar(f"SELECT COALESCE(MAX(row_id),0) FROM {SRC}") @@ -132,6 +157,18 @@ def scalar(sql): bad_minted_type = scalar( f"SELECT COUNT(*) FROM {OUT} WHERE row_id > {max_src} AND otype <> 'IdentifiedConcept'") check("minted rows are IdentifiedConcept", bad_minted_type == 0, f"{bad_minted_type} bad otype") + bad_minted_meta = scalar(f""" + WITH oc_meta AS ( + SELECT pid AS uri, MIN(label) AS label, MIN(scheme_name) AS scheme_name, + MIN(scheme_uri) AS scheme_uri + FROM {OC} WHERE otype='IdentifiedConcept' GROUP BY pid) + SELECT COUNT(*) FROM {OUT} o JOIN oc_meta m ON m.uri = o.pid + WHERE o.row_id > {max_src} + AND (o.label IS DISTINCT FROM m.label + OR o.scheme_name IS DISTINCT FROM m.scheme_name + OR o.scheme_uri IS DISTINCT FROM m.scheme_uri)""") + check("minted rows carry OC label/scheme metadata", bad_minted_meta == 0, + f"{bad_minted_meta} minted rows with wrong metadata") # --- 4. grain + accounting ---------------------------------------------- n_src, n_out = scalar(f"SELECT COUNT(*) FROM {SRC}"), scalar(f"SELECT COUNT(*) FROM {OUT}") @@ -154,12 +191,20 @@ def scalar(sql): check("no dangling concept references in OUT", dangling == 0, f"{dangling} dangling refs") # --- 6. #260 sentinel ---------------------------------------------------- + # N/A ONLY if the pid is absent from the INPUTS (fixtures). If src+oc both + # carry it, its absence from the overlay output is a FAILURE, not a skip + # (Codex round-1: a dropped sentinel row was silently 'N/A'). 
+ in_inputs = scalar(f""" + SELECT COUNT(*) FROM {SRC} s + WHERE s.pid='{SENTINEL_PID}' AND s.otype='MaterialSampleRecord' + AND s.pid IN (SELECT pid FROM exp_oc)""") row = con.sql(f"SELECT mat_uris FROM act_out WHERE pid='{SENTINEL_PID}'").fetchone() - if row is None: - info.append(f"sentinel {SENTINEL_PID} not present (N/A for this dataset)") + if not in_inputs and row is None: + info.append(f"sentinel {SENTINEL_PID} not present in inputs (N/A for this dataset)") else: check(f"sentinel {SENTINEL_PID} == [{SENTINEL_MATERIAL.rsplit('/',1)[1]}]", - row[0] == [SENTINEL_MATERIAL], f"got {row[0]}") + row is not None and row[0] == [SENTINEL_MATERIAL], + f"got {row[0] if row else 'MISSING ROW'}") # --- 7. manifest integrity (if present) ---------------------------------- mpath = a.out + ".manifest.json" diff --git a/tests/test_oc_concept_enrichment.py b/tests/test_oc_concept_enrichment.py index 3908954..127ab77 100644 --- a/tests/test_oc_concept_enrichment.py +++ b/tests/test_oc_concept_enrichment.py @@ -300,6 +300,65 @@ def test_hard_fail_on_unresolved_oc_concept_ref(pair): assert not os.path.exists(out) +def test_validator_fails_on_nulled_overlay_columns(pair, tmp_path): + """Codex round-1 BLOCKER: keep the OC URI lists but wreck a popup-facing + column (label) on overlay rows — must FAIL, not pass as 'overlay ok'.""" + src, oc, out = pair + assert run_enrich(src, oc, out).returncode == 0 + tampered = str(tmp_path / "nulled.parquet") + con = duckdb.connect() + con.execute(f""" + COPY ( + SELECT o.* REPLACE ( + (CASE WHEN o.pid LIKE 'ark:/28722/%' AND o.otype='MaterialSampleRecord' + THEN NULL ELSE o.label END) AS label) + FROM read_parquet('{out}') o ORDER BY row_id + ) TO '{tampered}' (FORMAT PARQUET, COMPRESSION ZSTD)""") + con.close() + r = run_validate(src, oc, tampered) + assert r.returncode != 0 + assert "non-replaced columns identical" in r.stdout + + +def test_validator_fails_on_duplicated_pid_replacing_sentinel(pair, tmp_path): + """Codex round-1 BLOCKER: 
replace the sentinel row with a duplicate of + another overlay pid (unique row_id) — counts stay equal; sets must not.""" + src, oc, out = pair + assert run_enrich(src, oc, out).returncode == 0 + tampered = str(tmp_path / "dup_pid.parquet") + con = duckdb.connect() + # drop the sentinel row entirely; insert a full clone of another overlay + # row (correct arrays!) reusing the sentinel's row_id. Counts all balance; + # only SET checks notice. + con.execute(f""" + COPY ( + SELECT * FROM read_parquet('{out}') + WHERE NOT (pid='ark:/28722/k2p55x96j' AND otype='MaterialSampleRecord') + UNION ALL + SELECT o.* REPLACE (1::BIGINT AS row_id) + FROM read_parquet('{out}') o + WHERE o.pid='ark:/28722/order' AND o.otype='MaterialSampleRecord' + ORDER BY row_id + ) TO '{tampered}' (FORMAT PARQUET, COMPRESSION ZSTD)""") + con.close() + r = run_validate(src, oc, tampered) + assert r.returncode != 0 + + +def test_empty_oc_array_normalizes_to_null(pair, tmp_path): + """Documented normalization: OC `[]` -> NULL in the output (pqg issue #8 + convention). 
All real OC rows are non-empty; this pins the edge behavior.""" + src, oc, out = pair + empty_oc = str(tmp_path / "oc_empty.parquet") + build_oc(empty_oc, samples=[("ark:/28722/k2p55x96j", [], [9004])]) + assert run_enrich(src, empty_oc, out).returncode == 0 + con = duckdb.connect() + mats = con.sql(f"""SELECT p__has_material_category FROM read_parquet('{out}') + WHERE pid='ark:/28722/k2p55x96j' AND otype='MaterialSampleRecord'""").fetchone()[0] + con.close() + assert mats is None + + def test_refuses_to_overwrite_input(pair): src, oc, _ = pair r = run_enrich(src, oc, src) From 77a183c2bef8e21571ae739b367df5369ad39514 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 10 Jun 2026 12:53:57 -0700 Subject: [PATCH 3/6] =?UTF-8?q?enrich:=20close=20Codex=20round-2=20finding?= =?UTF-8?q?s=20=E2=80=94=20exact=20minted-row=20gate,=20OC=20grain,=20make?= =?UTF-8?q?=20-j=20safety?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - validator: minted rows now compared FULL-ROW against a re-derived expectation (deterministic ids max(src)+rank(uri); NULLs everywhere except pid/otype/label/scheme) — shifted-id and smuggled-column outputs now fail - enricher + validator: hard-reject duplicate OC IdentifiedConcept row_ids (one reference must never fan out into several URIs) - Makefile: $(ENRICHED) is a real file target; validate-enrich depends on it; all/all-272 use ordered sub-makes — safe under make -j - tests: +3 regression (both round-2 attacks verified to fool the previous gate; dup-concept-row_id input rejected by both scripts) Co-Authored-By: Claude Fable 5 --- Makefile | 16 ++++-- scripts/enrich_wide_with_oc_concepts.py | 10 +++- scripts/validate_oc_concept_enrichment.py | 62 +++++++++++++++-------- tests/test_oc_concept_enrichment.py | 59 +++++++++++++++++++++ 4 files changed, 120 insertions(+), 27 deletions(-) diff --git a/Makefile b/Makefile index 03b2386..da59d62 100644 --- a/Makefile +++ b/Makefile @@ -51,10 +51,12 @@ 
$(OC_WIDE): curl -fSL -o $(OC_WIDE) "$(OC_WIDE_URL)" @echo "sha256: $$(shasum -a 256 $(OC_WIDE) | cut -d' ' -f1) $(OC_WIDE)" -enrich: $(WIDE) $(OC_WIDE) +# real file dependency so `make -j` orders enrich before validate-enrich +enrich: $(ENRICHED) +$(ENRICHED): $(WIDE) $(OC_WIDE) $(PY) $(ENRICH) --src $(WIDE) --oc-wide $(OC_WIDE) --out $(ENRICHED) -validate-enrich: +validate-enrich: $(ENRICHED) $(PY) $(VALIDATE_ENRICH) --src $(WIDE) --oc-wide $(OC_WIDE) --out $(ENRICHED) derived: $(WIDE) @@ -69,11 +71,15 @@ SENTINEL_FLAG ?= --sentinel-material $(LEGACY_SENTINEL) validate: $(PY) $(VALIDATE) --dir $(OUTDIR) --tag $(TAG) $(SENTINEL_FLAG) -all: wide derived validate +# ordered sub-makes: safe under `make -j` (derived must finish before validate) +all: wide + $(MAKE) derived + $(MAKE) validate # Full #272 chain: enrich the wide with OC concepts, gate it, then build+gate derived. -all-272: wide oc-wide enrich validate-enrich - $(MAKE) derived validate DERIVED_WIDE=$(ENRICHED) TAG=$(TAG) SENTINEL_FLAG= +all-272: validate-enrich + $(MAKE) derived DERIVED_WIDE=$(ENRICHED) TAG=$(TAG) + $(MAKE) validate TAG=$(TAG) SENTINEL_FLAG= clean: rm -rf $(OUTDIR) diff --git a/scripts/enrich_wide_with_oc_concepts.py b/scripts/enrich_wide_with_oc_concepts.py index 09e6201..993a025 100644 --- a/scripts/enrich_wide_with_oc_concepts.py +++ b/scripts/enrich_wide_with_oc_concepts.py @@ -118,9 +118,15 @@ def main(): n_dup_oc_pid = con.sql( f"SELECT COUNT(*) FROM (SELECT pid FROM {OC} WHERE otype='MaterialSampleRecord' " f"GROUP BY pid HAVING COUNT(*)>1)").fetchone()[0] - if n_dup_src_rowid or n_dup_oc_pid: + # duplicate OC concept row_ids would fan ONE reference out into SEVERAL + # URIs through the resolve join (Codex round-2) — input-integrity failure. 
+ n_dup_oc_crid = con.sql( + f"SELECT COUNT(*) FROM (SELECT row_id FROM {OC} WHERE otype='IdentifiedConcept' " + f"AND row_id IS NOT NULL GROUP BY row_id HAVING COUNT(*)>1)").fetchone()[0] + if n_dup_src_rowid or n_dup_oc_pid or n_dup_oc_crid: sys.exit(f"FATAL: non-unique keys — src duplicate row_ids={n_dup_src_rowid}, " - f"OC duplicate MSR pids={n_dup_oc_pid}. Refusing to write.") + f"OC duplicate MSR pids={n_dup_oc_pid}, " + f"OC duplicate concept row_ids={n_dup_oc_crid}. Refusing to write.") # ---- 1. OC per-pid ORDERED URI lists, both dims ------------------------ # WITH ORDINALITY preserves OC's array order; every rid MUST resolve to an diff --git a/scripts/validate_oc_concept_enrichment.py b/scripts/validate_oc_concept_enrichment.py index 2df3417..cfcb453 100644 --- a/scripts/validate_oc_concept_enrichment.py +++ b/scripts/validate_oc_concept_enrichment.py @@ -59,6 +59,15 @@ def check(name, passed, detail=""): def scalar(sql): return con.sql(sql).fetchone()[0] + # ---- input integrity: duplicate OC concept row_ids would fan one + # reference into several URIs through every resolve join below (Codex + # round-2) — reject the inputs before deriving expectations from them. + n_dup_oc_crid = scalar( + f"SELECT COUNT(*) FROM (SELECT row_id FROM {OC} WHERE otype='IdentifiedConcept' " + f"AND row_id IS NOT NULL GROUP BY row_id HAVING COUNT(*)>1)") + check("OC concept row_ids unique (input integrity)", n_dup_oc_crid == 0, + f"{n_dup_oc_crid} duplicated OC IdentifiedConcept row_ids") + # ---- expected per-pid ORDERED URI lists from OC (independent derivation) con.execute(f""" CREATE TEMP TABLE exp_oc AS @@ -138,9 +147,25 @@ def scalar(sql): check("replaced arrays untouched outside the overlay", arr_diff == 0, f"{arr_diff} non-overlay rows had their concept arrays modified") - # --- 3. minted concepts: exactly the missing URIs, ids beyond src max ---- + # --- 3. 
minted concepts: EXACT full-row expectation, re-derived ---------- + # (Codex round-2 MAJOR: URI-set + metadata spot checks let shifted row_ids + # and smuggled column values pass. Re-derive the complete expected minted + # rows — deterministic ids max(src)+rank(uri), every other column NULL + # except pid/otype/label/scheme — and demand exact equality, all columns.) max_src = scalar(f"SELECT COALESCE(MAX(row_id),0) FROM {SRC}") - minted_diff = scalar(f""" + src_schema = [(r[0], r[1]) for r in con.sql(f"DESCRIBE SELECT * FROM {SRC}").fetchall()] + exp_minted_cols = ", ".join( + { + "row_id": f"({max_src} + ROW_NUMBER() OVER (ORDER BY uri))::{dict(src_schema)['row_id']} AS row_id", + "pid": "uri AS pid", + "otype": "'IdentifiedConcept' AS otype", + "label": "label", + "scheme_name": "scheme_name", + "scheme_uri": "scheme_uri", + }.get(c, f"NULL::{t} AS {c}") + for c, t in src_schema) + con.execute(f""" + CREATE TEMP TABLE expected_minted AS WITH oc_uris AS ( SELECT DISTINCT unnest(mat_uris) AS uri FROM exp_oc UNION SELECT DISTINCT unnest(obj_uris) FROM exp_oc), @@ -148,27 +173,24 @@ def scalar(sql): SELECT uri FROM oc_uris WHERE uri NOT IN (SELECT pid FROM {SRC} WHERE otype='IdentifiedConcept') AND uri IS NOT NULL), - minted AS (SELECT pid AS uri FROM {OUT} WHERE row_id > {max_src}) - SELECT - (SELECT COUNT(*) FROM (SELECT uri FROM missing EXCEPT SELECT uri FROM minted)) + - (SELECT COUNT(*) FROM (SELECT uri FROM minted EXCEPT SELECT uri FROM missing))""") - check("minted concepts == exactly the missing URIs", minted_diff == 0, - f"{minted_diff} URI mismatches between minted rows and missing set") - bad_minted_type = scalar( - f"SELECT COUNT(*) FROM {OUT} WHERE row_id > {max_src} AND otype <> 'IdentifiedConcept'") - check("minted rows are IdentifiedConcept", bad_minted_type == 0, f"{bad_minted_type} bad otype") - bad_minted_meta = scalar(f""" - WITH oc_meta AS ( + meta AS ( SELECT pid AS uri, MIN(label) AS label, MIN(scheme_name) AS scheme_name, MIN(scheme_uri) AS 
scheme_uri FROM {OC} WHERE otype='IdentifiedConcept' GROUP BY pid) - SELECT COUNT(*) FROM {OUT} o JOIN oc_meta m ON m.uri = o.pid - WHERE o.row_id > {max_src} - AND (o.label IS DISTINCT FROM m.label - OR o.scheme_name IS DISTINCT FROM m.scheme_name - OR o.scheme_uri IS DISTINCT FROM m.scheme_uri)""") - check("minted rows carry OC label/scheme metadata", bad_minted_meta == 0, - f"{bad_minted_meta} minted rows with wrong metadata") + SELECT {exp_minted_cols} + FROM (SELECT m.uri, t.label, t.scheme_name, t.scheme_uri + FROM missing m JOIN meta t ON t.uri = m.uri); + """) + all_cols = ", ".join(c for c, _ in src_schema) + minted_exact_diff = scalar(f""" + SELECT (SELECT COUNT(*) FROM ( + (SELECT {all_cols} FROM expected_minted) + EXCEPT (SELECT {all_cols} FROM {OUT} WHERE row_id > {max_src}))) + + (SELECT COUNT(*) FROM ( + (SELECT {all_cols} FROM {OUT} WHERE row_id > {max_src}) + EXCEPT (SELECT {all_cols} FROM expected_minted)))""") + check("minted rows EXACTLY match re-derived expectation (all columns)", + minted_exact_diff == 0, f"{minted_exact_diff} full-row mismatches among minted rows") # --- 4. grain + accounting ---------------------------------------------- n_src, n_out = scalar(f"SELECT COUNT(*) FROM {SRC}"), scalar(f"SELECT COUNT(*) FROM {OUT}") diff --git a/tests/test_oc_concept_enrichment.py b/tests/test_oc_concept_enrichment.py index 127ab77..37789cb 100644 --- a/tests/test_oc_concept_enrichment.py +++ b/tests/test_oc_concept_enrichment.py @@ -359,6 +359,65 @@ def test_empty_oc_array_normalizes_to_null(pair, tmp_path): assert mats is None +def test_validator_fails_on_shifted_minted_row_ids(pair, tmp_path): + """Codex round-2 MAJOR: move minted rows to arbitrary ids (107/108 -> + 1007/1008), repoint overlay arrays at them — URI resolution still correct, + but the deterministic-id contract is broken. 
Must FAIL.""" + src, oc, out = pair + assert run_enrich(src, oc, out).returncode == 0 + tampered = str(tmp_path / "shifted.parquet") + con = duckdb.connect() + con.execute(f""" + COPY ( + SELECT o.* REPLACE ( + (CASE WHEN o.row_id IN (107,108) THEN o.row_id + 900 ELSE o.row_id END) AS row_id, + list_transform(o.p__has_material_category, + x -> CASE WHEN x IN (107,108) THEN x + 900 ELSE x END) AS p__has_material_category, + list_transform(o.p__has_sample_object_type, + x -> CASE WHEN x IN (107,108) THEN x + 900 ELSE x END) AS p__has_sample_object_type) + FROM read_parquet('{out}') o ORDER BY row_id + ) TO '{tampered}' (FORMAT PARQUET, COMPRESSION ZSTD)""") + con.close() + r = run_validate(src, oc, tampered) + assert r.returncode != 0 + + +def test_validator_fails_on_smuggled_minted_columns(pair, tmp_path): + """Codex round-2 MAJOR: minted rows must be NULL outside + pid/otype/label/scheme — smuggled thumbnail_url must FAIL.""" + src, oc, out = pair + assert run_enrich(src, oc, out).returncode == 0 + tampered = str(tmp_path / "smuggled.parquet") + con = duckdb.connect() + con.execute(f""" + COPY ( + SELECT o.* REPLACE ( + (CASE WHEN o.row_id > 106 THEN 'https://evil/x.jpg' ELSE o.thumbnail_url END) AS thumbnail_url) + FROM read_parquet('{out}') o ORDER BY row_id + ) TO '{tampered}' (FORMAT PARQUET, COMPRESSION ZSTD)""") + con.close() + r = run_validate(src, oc, tampered) + assert r.returncode != 0 + assert "EXACTLY match" in r.stdout + + +def test_hard_fail_on_duplicate_oc_concept_row_ids(pair): + """Codex round-2 MAJOR: a duplicated OC concept row_id fans one reference + into several URIs — both enricher and validator must reject the input.""" + src, oc, out = pair + dup_oc = oc.replace("oc.parquet", "oc_dupconcept.parquet") + build_oc(dup_oc, concepts=OC_CONCEPTS + [ + (9003, MAT + "soil", "Soil", None, None)]) # row_id 9003 duplicated, different URI + r = run_enrich(src, dup_oc, out) + assert r.returncode != 0 + assert "concept row_ids" in (r.stderr + r.stdout) + 
assert not os.path.exists(out) + # validator must also reject it as an input, given any output + assert run_enrich(src, oc, out).returncode == 0 + rv = run_validate(src, dup_oc, out) + assert rv.returncode != 0 + + def test_refuses_to_overwrite_input(pair): src, oc, _ = pair r = run_enrich(src, oc, src) From d92438cccb6590a1fe0b76de4a7632649ce055ce Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 10 Jun 2026 12:58:28 -0700 Subject: [PATCH 4/6] =?UTF-8?q?enrich:=20close=20Codex=20round-3=20finding?= =?UTF-8?q?s=20=E2=80=94=20validator=20input-integrity=20parity?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - validator: reject unresolved OC concept refs (inner joins were silently dropping them from the expectation — standalone runs could pass a wrong output) and duplicate OC MSR pids (grain parity with the enricher) - validator: minted expectation uses NOT EXISTS (NULL-pid src concept made NOT IN evaluate UNKNOWN -> false failure) - tests: +3 regression for all three findings Co-Authored-By: Claude Fable 5 --- scripts/validate_oc_concept_enrichment.py | 29 +++++++++++++-- tests/test_oc_concept_enrichment.py | 43 +++++++++++++++++++++++ 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/scripts/validate_oc_concept_enrichment.py b/scripts/validate_oc_concept_enrichment.py index cfcb453..a13f24c 100644 --- a/scripts/validate_oc_concept_enrichment.py +++ b/scripts/validate_oc_concept_enrichment.py @@ -67,6 +67,26 @@ def scalar(sql): f"AND row_id IS NOT NULL GROUP BY row_id HAVING COUNT(*)>1)") check("OC concept row_ids unique (input integrity)", n_dup_oc_crid == 0, f"{n_dup_oc_crid} duplicated OC IdentifiedConcept row_ids") + n_dup_oc_pid = scalar( + f"SELECT COUNT(*) FROM (SELECT pid FROM {OC} WHERE otype='MaterialSampleRecord' " + f"GROUP BY pid HAVING COUNT(*)>1)") + check("OC MSR pids unique (input integrity)", n_dup_oc_pid == 0, + f"{n_dup_oc_pid} duplicated OC MaterialSampleRecord pids") + # unresolved 
OC concept refs: the inner joins below would silently DROP + # them from the expectation (Codex round-3) — the gate must reject the + # input the same way the enricher does, since it may run standalone. + n_unresolved = scalar(f""" + SELECT COUNT(*) FROM ( + SELECT u.rid FROM {OC} s, UNNEST(s.p__has_material_category) AS u(rid) + WHERE s.otype='MaterialSampleRecord' + UNION ALL + SELECT u.rid FROM {OC} s, UNNEST(s.p__has_sample_object_type) AS u(rid) + WHERE s.otype='MaterialSampleRecord') refs + LEFT JOIN (SELECT row_id FROM {OC} WHERE otype='IdentifiedConcept') c + ON c.row_id = refs.rid + WHERE c.row_id IS NULL""") + check("all OC concept refs resolve (input integrity)", n_unresolved == 0, + f"{n_unresolved} dangling OC concept references") # ---- expected per-pid ORDERED URI lists from OC (independent derivation) con.execute(f""" @@ -170,9 +190,12 @@ def scalar(sql): SELECT DISTINCT unnest(mat_uris) AS uri FROM exp_oc UNION SELECT DISTINCT unnest(obj_uris) FROM exp_oc), missing AS ( - SELECT uri FROM oc_uris - WHERE uri NOT IN (SELECT pid FROM {SRC} WHERE otype='IdentifiedConcept') - AND uri IS NOT NULL), + -- NOT EXISTS, not NOT IN: a NULL-pid src concept would make NOT IN + -- evaluate UNKNOWN and silently empty this set (Codex round-3 MINOR) + SELECT uri FROM oc_uris u + WHERE u.uri IS NOT NULL + AND NOT EXISTS (SELECT 1 FROM {SRC} s + WHERE s.otype='IdentifiedConcept' AND s.pid = u.uri)), meta AS ( SELECT pid AS uri, MIN(label) AS label, MIN(scheme_name) AS scheme_name, MIN(scheme_uri) AS scheme_uri diff --git a/tests/test_oc_concept_enrichment.py b/tests/test_oc_concept_enrichment.py index 37789cb..389be27 100644 --- a/tests/test_oc_concept_enrichment.py +++ b/tests/test_oc_concept_enrichment.py @@ -418,6 +418,49 @@ def test_hard_fail_on_duplicate_oc_concept_row_ids(pair): assert rv.returncode != 0 +def test_validator_rejects_unresolved_oc_refs_standalone(pair, tmp_path): + """Codex round-3 MAJOR: the validator's inner joins silently DROPPED + dangling OC 
refs from the expectation — an output omitting them passed. + The gate must reject such input even when run standalone.""" + src, oc, out = pair + assert run_enrich(src, oc, out).returncode == 0 # good output from good oc + bad_oc = str(tmp_path / "oc_dangling.parquet") + build_oc(bad_oc, samples=[("ark:/28722/k2p55x96j", [9001, 99999], [9004])]) + r = run_validate(src, bad_oc, out) + assert r.returncode != 0 + assert "concept refs resolve" in r.stdout + + +def test_validator_rejects_duplicate_oc_msr_pids(pair, tmp_path): + """Codex round-3 MAJOR: validator must match the enricher's input grain.""" + src, oc, out = pair + assert run_enrich(src, oc, out).returncode == 0 + dup_oc = str(tmp_path / "oc_duppid.parquet") + build_oc(dup_oc, samples=OC_SAMPLES + [("ark:/28722/k2p55x96j", [9001], [9004])]) + r = run_validate(src, dup_oc, out) + assert r.returncode != 0 + assert "MSR pids unique" in r.stdout + + +def test_null_pid_src_concept_does_not_break_minted_expectation(pair, tmp_path): + """Codex round-3 MINOR: a NULL-pid IdentifiedConcept in src must not + empty the minted expectation (NOT IN NULL trap) — good output still passes.""" + src, oc, out = pair + src2 = str(tmp_path / "src_nullpid.parquet") + con = duckdb.connect() + con.execute(f""" + COPY ( + SELECT * FROM read_parquet('{src}') + UNION ALL + SELECT s.* REPLACE (999::BIGINT AS row_id, NULL::VARCHAR AS pid) + FROM read_parquet('{src}') s WHERE s.row_id = 101 + ) TO '{src2}' (FORMAT PARQUET, COMPRESSION ZSTD)""") + con.close() + assert run_enrich(src2, oc, out).returncode == 0 + r = run_validate(src2, oc, out) + assert r.returncode == 0, r.stdout + r.stderr + + def test_refuses_to_overwrite_input(pair): src, oc, _ = pair r = run_enrich(src, oc, src) From 4f4e1dbc9e0dca6c596a11664f9d5c318183beb1 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 10 Jun 2026 13:00:34 -0700 Subject: [PATCH 5/6] =?UTF-8?q?enrich:=20Codex=20round-4=20NIT=20=E2=80=94?= 
=?UTF-8?q?=20derived=20target=20depends=20on=20$(DERIVED=5FWIDE)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Fable 5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index da59d62..b4fff9c 100644 --- a/Makefile +++ b/Makefile @@ -59,7 +59,7 @@ $(ENRICHED): $(WIDE) $(OC_WIDE) validate-enrich: $(ENRICHED) $(PY) $(VALIDATE_ENRICH) --src $(WIDE) --oc-wide $(OC_WIDE) --out $(ENRICHED) -derived: $(WIDE) +derived: $(DERIVED_WIDE) $(PY) $(BUILD) --wide $(DERIVED_WIDE) --outdir $(OUTDIR) --tag $(TAG) --skip wide_h3 # Sentinel expectation tracks data vintage: the plain (non-enriched) chain From 8125afc1d28670f4ccb2da7c90735ae889e55da3 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 10 Jun 2026 13:01:54 -0700 Subject: [PATCH 6/6] explorer: read the 202606 OC-concept-enriched data files (#272) All tagged data URLs 202601->202606; wide_url pinned to the explicit versioned file (popups read corrected OC material/object-type from it). current/wide.parquet alias stays on the previous wide until the production cutover decision. Co-Authored-By: Claude Fable 5 --- explorer.qmd | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/explorer.qmd b/explorer.qmd index 6448f77..770c11c 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -12,8 +12,8 @@ format: include-in-header: text: | - - + + --- @@ -764,18 +764,20 @@ R2_BASE = (() => { // default and absolute overrides (http://localhost:8099/data) pass through. return raw.startsWith('/') ? 
new URL(raw, location.origin).href : raw; })() -h3_res4_url = `${R2_BASE}/isamples_202601_h3_summary_res4.parquet` -h3_res6_url = `${R2_BASE}/isamples_202601_h3_summary_res6.parquet` -h3_res8_url = `${R2_BASE}/isamples_202601_h3_summary_res8.parquet` -lite_url = `${R2_BASE}/isamples_202601_samples_map_lite.parquet` -// Stable alias that 302-redirects to the current enriched wide parquet -// (isamples_YYYYMM_wide.parquet). Gets OpenContext thumbnails populated. -wide_url = `${R2_BASE}/current/wide.parquet` +h3_res4_url = `${R2_BASE}/isamples_202606_h3_summary_res4.parquet` +h3_res6_url = `${R2_BASE}/isamples_202606_h3_summary_res6.parquet` +h3_res8_url = `${R2_BASE}/isamples_202606_h3_summary_res8.parquet` +lite_url = `${R2_BASE}/isamples_202606_samples_map_lite.parquet` +// Explicit versioned wide (#272: OC concept-enriched — popups read material/ +// object-type from this file). The stable alias `current/wide.parquet` still +// points at the previous wide until the production cutover flips the manifest; +// pinning the version here keeps staging and prod each self-consistent. +wide_url = `${R2_BASE}/isamples_202606_wide.parquet` // v2 carries object_type alongside material and context (URI-string columns). -facets_url = `${R2_BASE}/isamples_202601_sample_facets_v2.parquet` -facet_summaries_url = `${R2_BASE}/isamples_202601_facet_summaries.parquet` +facets_url = `${R2_BASE}/isamples_202606_sample_facets_v2.parquet` +facet_summaries_url = `${R2_BASE}/isamples_202606_facet_summaries.parquet` // Pre-aggregated single-filter cache for fast cross-filtered facet counts. -cross_filter_url = `${R2_BASE}/isamples_202601_facet_cross_filter.parquet` +cross_filter_url = `${R2_BASE}/isamples_202606_facet_cross_filter.parquet` // SKOS prefLabels for Material / Sampled Feature / Specimen Type URIs. // ~60 KB lookup; falls back to URI tail if a URI isn't covered. vocab_labels_url = `${R2_BASE}/vocab_labels.parquet`