From f95ae9b4f021c8027e39e682f91e9cb306d17359 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 25 Jun 2026 17:14:44 -0400 Subject: [PATCH 01/13] HC FST - faster parsing with backstop --- docs/HERMITCRAB_FST_ADVISOR.md | 144 ++++ docs/HERMITCRAB_FST_PLAN.md | 815 ++++++++++++++++++ .../AnalysisCache.cs | 66 ++ .../AnalysisCacheSerializer.cs | 116 +++ .../CachingMorphologicalAnalyzer.cs | 134 +++ .../CompleteHybridMorpher.cs | 85 ++ .../FstReplay.cs | 90 ++ .../FstTemplateAnalyzer.cs | 688 +++++++++++++++ .../FstVerification.cs | 139 +++ .../GrammarFstAdvisor.cs | 584 +++++++++++++ .../GrammarFstClosure.cs | 142 +++ .../MorphToken.cs | 111 +++ .../MorphTokenCodec.cs | 131 +++ .../MorphemeRegistry.cs | 69 ++ .../MorpherPool.cs | 42 + .../VerifiedFstAnalyzer.cs | 50 ++ .../CachingMorphologicalAnalyzerTests.cs | 106 +++ .../FstSenaBenchmark.cs | 211 +++++ .../FstTemplateAnalyzerTests.cs | 137 +++ .../FstVerificationTests.cs | 75 ++ .../GrammarFstAdvisorBenchmark.cs | 29 + .../GrammarFstAdvisorTests.cs | 233 +++++ .../GrammarFstClosureTests.cs | 89 ++ .../MorphTokenCodecTests.cs | 123 +++ .../MorphTokenTests.cs | 67 ++ .../VerifiedFstAnalyzerTests.cs | 118 +++ 26 files changed, 4594 insertions(+) create mode 100644 docs/HERMITCRAB_FST_ADVISOR.md create mode 100644 docs/HERMITCRAB_FST_PLAN.md create mode 100644 src/SIL.Machine.Morphology.HermitCrab/AnalysisCache.cs create mode 100644 src/SIL.Machine.Morphology.HermitCrab/AnalysisCacheSerializer.cs create mode 100644 src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs create mode 100644 src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs create mode 100644 src/SIL.Machine.Morphology.HermitCrab/FstReplay.cs create mode 100644 src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs create mode 100644 src/SIL.Machine.Morphology.HermitCrab/FstVerification.cs create mode 100644 src/SIL.Machine.Morphology.HermitCrab/GrammarFstAdvisor.cs create mode 100644 
src/SIL.Machine.Morphology.HermitCrab/GrammarFstClosure.cs create mode 100644 src/SIL.Machine.Morphology.HermitCrab/MorphToken.cs create mode 100644 src/SIL.Machine.Morphology.HermitCrab/MorphTokenCodec.cs create mode 100644 src/SIL.Machine.Morphology.HermitCrab/MorphemeRegistry.cs create mode 100644 src/SIL.Machine.Morphology.HermitCrab/MorpherPool.cs create mode 100644 src/SIL.Machine.Morphology.HermitCrab/VerifiedFstAnalyzer.cs create mode 100644 tests/SIL.Machine.Morphology.HermitCrab.Tests/CachingMorphologicalAnalyzerTests.cs create mode 100644 tests/SIL.Machine.Morphology.HermitCrab.Tests/FstSenaBenchmark.cs create mode 100644 tests/SIL.Machine.Morphology.HermitCrab.Tests/FstTemplateAnalyzerTests.cs create mode 100644 tests/SIL.Machine.Morphology.HermitCrab.Tests/FstVerificationTests.cs create mode 100644 tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorBenchmark.cs create mode 100644 tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorTests.cs create mode 100644 tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstClosureTests.cs create mode 100644 tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenCodecTests.cs create mode 100644 tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenTests.cs create mode 100644 tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs diff --git a/docs/HERMITCRAB_FST_ADVISOR.md b/docs/HERMITCRAB_FST_ADVISOR.md new file mode 100644 index 00000000..9361a04d --- /dev/null +++ b/docs/HERMITCRAB_FST_ADVISOR.md @@ -0,0 +1,144 @@ +# Grammar FST Advisor — plan + +A grammar evolves; one new rule can quietly push it from the fast finite-state path into the +slow combinatorial search. 
This plan adds a **grammar advisor/linter** that, for any HermitCrab +`Language`, flags the rules that make parsing expensive or block FST compilation, and gives the +grammar engineer **actionable write-ups**: *why* a rule is costly, how to **constrain** it back +into fast territory, and an **alternative formulation** to try. + +It is the front-end to the FST work (`HERMITCRAB_FST_PLAN.md`): the same per-rule classification +that decides the FST tier also drives the warnings. + +## 1. What it does + +Input: a compiled `Language`. Output: a `GrammarFstReport` — a list of per-rule advisories plus +an overall **tier verdict**. Each advisory has: +- **rule name + kind** (affix / phonological / compounding / template), +- **severity**: `Escape` (breaks FST → forces search), `Cost` (inflates the search fan-out), or + `Info`, +- **issue**: one sentence on what's expensive and why, +- **advice**: "constrain it like this" and/or "try this instead". + +## 2. The classifier (what flags what) + +Detected from the object model (`AffixProcessRule.Allomorphs` → `Rhs` actions; `RewriteRule` +Lhs/Subrule environments; `MorphologicalOutputAction.PartName`; `Quantifier.Max/MinOccur`): + +| Signal | Severity | Issue | Advice | +|---|---|---|---| +| **Reduplication** — a part copied ≥2× by `CopyFromInput` | **Escape** | copying an unbounded span isn't finite-state; forces search for any word it could apply to | "If the reduplicant is a fixed size (e.g. one CV syllable), bound the copied part's length → finite-state. If only a few forms reduplicate, list them as lexical entries. Else the grammar stays in the hybrid/search tier." | +| **Infixation / stem split** — ≥2 `CopyFromInput` of *different* parts | **Escape** (unless bounded) | the stem is split at a content-determined position | "If the infix position is fixed, encode it as a bounded split; a variable split blocks FST." 
| +| **Process modification** — `ModifyFromInput` present | **Info/verify** | FST-able only if the modification is local/bounded | "Local feature change in a fixed context = fine; non-local/agreement = blocks FST — try a bounded reformulation." | +| **Phonological rewrite rule** present | **Info/verify** | FST-able iff its environment is a bounded window | "Bound the left/right environment to the actual window (usually 1–2 segments); unbounded context blocks FST." | +| **Deletion rule** — Lhs longer than Rhs | **Cost** | analysis must guess where deleted segments were and re-insert them (× `DeletionReapplications`) | "Keep `DeletionReapplications` as low as the language needs; bounded deletion context is still FST-able." | +| **Unbounded environment** — a `Quantifier` with infinite `MaxOccur` in an environment | **Escape** | matches an arbitrary-length span | "Replace the `+`/`*` context with the fixed window the rule really needs." | +| **Many allomorphs** on one rule (> threshold) | **Cost** | each allomorph multiplies un-application branching | "Consolidate via environment conditioning where possible." | +| Compounding rule | **Info** | bounded by `MaxStemCount`, so finite | — | + +## 3. Tier verdict (static; corpus refines it) + +- **0 Escape advisories** → **Tier 1 candidate** (fully FST-able) — confirm with the FST compile + + corpus parity check. +- **a few Escapes** → **Tier 2 candidate** (hybrid: escapes fall back to search) — run the corpus + fallback-rate measurement to confirm it's worth it vs. Tier 3. +- **pervasive Escapes** → **Tier 3** (search only). + +The static report can't compute the corpus-weighted fallback rate, so it reports the tier +*candidate* + the escape list; the FST pipeline's corpus pass (`HERMITCRAB_FST_PLAN.md` §1) +confirms it. + +## 4. The "one new rule blew up the grammar" workflow + +Run the advisor before/after a grammar change (or in CI). A new `Escape` advisory that flips the +tier (e.g. 
Tier 1 → Tier 2) is the warning: it names the offending rule, says it moved the whole +grammar off the fast path, and gives the constrain/alternative write-up. Grammar engineers get +"this rule made parsing slow, here's how to keep it fast" at authoring time. + +## 5. Implementation + +- `GrammarFstAdvisor.Analyze(Language) → GrammarFstReport` in the HermitCrab library (pure static + analysis of the object model; no parsing, no corpus needed). +- `GrammarFstReport.Format()` for a readable dump. +- Tests: a normal concatenative grammar → Tier 1, no escapes; add a reduplication rule → the + advisor flags it `Escape` with the reduplication write-up and downgrades the tier. +- Run on the real Sena grammar and report the advisories + tier. + +## 6. Validate on Sena + +Census already showed Sena is concatenative + no rewrite rules + no productive reduplication → +expect **Tier 1, zero escapes**, possibly a few `Cost`/`Info` notes (allomorph counts, +compounding). That both validates the classifier (no false escapes) and confirms Sena is the +fast-path case. + +## 7. Engine extension — the *regularity* axis (added, kept orthogonal to the warning) + +The advisor answers one question — **"is this slow in today's engine?"** — and the user keeps +asking exactly that ("which rule blew up the grammar", "which cases are still slow"). The +extension adds a *second, independent* question — **"does an FST exist for this in principle?"** +(regular vs non-regular) — **without letting the answer soften the slow-today warning.** + +Why the two must not be merged: the engine that turns "regular" into "fast" is the FST compiler, +and **it does not exist yet** (gated on the unbuilt spike, `HERMITCRAB_FST_PLAN.md` §7). So +"regular" today means *fast eventually, slow now*. If a vowel-harmony rule were reported as +`Cost / Tier-1-reachable`, a non-expert reads "fine" — when in the only engine that ships it is +the worst case (harmony on a common segment ⇒ ~every word on the slow path). 
The severity must +keep telling the truth about **today**. + +So **severity is unchanged** — it means *escapes the finite-state fast path in today's engine* +(forces the combinatorial search). Harmony, infixation, and reduplication (bounded or not) all +stay `Escape`: all are slow now. We only *add* a `Regular` axis that says whether an FST could +reclaim it later, and we report it as a **separate reclaim-path line that never upgrades the +tier**. + +The theory behind the new axis is **Kaplan & Kay (1994)**: a context-sensitive rewrite rule +`φ → ψ / λ _ ρ` with regular `φ, ψ, λ, ρ`, applied obligatorily/directionally (not recursively +into its own unbounded output), **denotes a regular relation — however long `λ`/`ρ` are.** HC's +`RewriteRule` is this form, and its `Rhs` is a *bounded segment specification*, not a copy (copy +lives only in morphological `CopyFromInput`). So: + +- **Unbounded-environment rewrite (harmony/spread): `Regular = true`** — *iff* the rule's own + `Lhs`/`Rhs` are bounded (only the environment is unbounded). Reclaim later by **state-encoding** + the spreading feature (or two-level pre-image arcs). If the `Lhs`/`Rhs` themselves are unbounded + we cannot confirm regularity → `Regular = false` (conservative). Stays `Escape` (slow today). +- **Reduplication splits by boundedness of the copied part.** Look up the copied part's defining + `Lhs` pattern by name: a **length-bounded** reduplicant (fixed CV/CVC) is a finite copy → + `Regular = true` (reclaim by bounded fold). Copying an **unbounded** part (whole stem, + `Annotation(any).OneOrMore`) is the one genuinely non-regular operation (the copy language `{ww | w ∈ Σ*}` is not regular) + → `Regular = false`. **If the part can't be resolved, default `Regular = false` (warn).** Stays + `Escape` either way. +- **Infixation** at a pattern-defined slot: `Regular = true` (the split is a regular pattern; + reclaim by bounded fold / the per-word probe). Stays `Escape`. 
+ +### The reclaim map (how a `Regular` case *would* be made fast — once the compiler exists) + +| Construct | `Regular` | Slow today? | Reclaim path (needs the FST compiler) | +|---|---|---|---| +| Unbounded-environment rewrite (harmony/spread) | ✅ (bounded Lhs/Rhs) | **yes** | state-encode the spreading feature / two-level pre-image arcs | +| Bounded reduplication (fixed CV reduplicant) | ✅ | **yes** | bounded fold — emit the finite copy as arcs | +| Infixation (pattern-defined slot) | ✅ | **yes** | bounded fold / per-word strip-and-reparse probe | +| Deletion | ✅ | **yes** | inverse probe — re-insert candidate deleted segments, re-parse | +| Unbounded-copy reduplication | ❌ | **yes** | per-word probe only (when surface-invariant); else search | + +`Regular` and `Probeable` (§5a) are both *paths forward*, never excuses: `Regular` = "an FST +could reclaim it (compiler pending)", `Probeable` = "a runtime strip-and-reparse is sound". The +severity and tier keep warning about today. + +### Implementation of the extension + +- Add `GrammarAdvisory.Regular` (`bool?`): true = an FST exists in principle (reclaim by + compiling), false = genuinely non-regular / unconfirmable, null = N/A. **Severity is not + changed by it.** +- Reduplication: resolve the copied part's `Lhs` pattern by name; bounded → `Regular=true`, + unbounded or unresolved → `Regular=false`. Severity stays `Escape`. +- Infixation: `Regular=true`; severity stays `Escape`; keep the per-word-probe advice. +- Unbounded-environment rewrite: `Regular = !(unbounded Lhs or Rhs)`; severity stays `Escape`; + advice = Kaplan–Kay + state-encoding, explicitly "regular in principle but slow in today's + engine". +- Report: count `RegularEscapeCount` vs `NonRegularEscapeCount`; emit a **reclaim-path line** + ("N of M escapes are FST-reclaimable once the compiler exists; all M are slow in today's + engine"). **The tier verdict is unchanged** — no "Tier 1-reachable" upgrade. 
+- Tests: a non-expert sanity check — a grammar whose only complex rule is harmony must still + report a slow-path warning (escape present), with `Regular=true` only as the reclaim note. + Unbounded-copy reduplication ⇒ `Regular=false`; bounded reduplicant ⇒ `Regular=true`; + infixation ⇒ `Escape` + `Regular=true` (the committed infix test keeps its severity). Sena + unchanged (Tier 1). diff --git a/docs/HERMITCRAB_FST_PLAN.md b/docs/HERMITCRAB_FST_PLAN.md new file mode 100644 index 00000000..9b97a1cf --- /dev/null +++ b/docs/HERMITCRAB_FST_PLAN.md @@ -0,0 +1,815 @@ +# HermitCrab FST acceleration — plan + +> **Shipped MVP (read this first).** The MVP that landed is a **sound, fast, optionally-complete** +> analyzer that reuses HC's own engine: +> - **`FstTemplateAnalyzer`** proposes candidate analyses by walking a precompiled template/derivation +> FST (fast, immutable → thread-safe to share). +> - **`VerifiedFstAnalyzer`** confirms each candidate by **restricted re-analysis** (`FstReplay`): HC's +> own `AnalyzeWord` pinned to that candidate's root+rules via a pooled `Morpher`. A confirmed, +> genuine HC analysis is emitted; anything HC won't confirm is discarded. **Sound** (no wrong +> analyses), **~13×** on Sena, **multithread-safe** (each verify rents a Morpher from `MorpherPool`). +> - **`CompleteHybridMorpher`** adds completeness: a grammar that passes **empirical set-parity** +> (`FstVerification`) runs FST-only; otherwise the search engine is used (the known slow path). +> Per-word control via `AnalyzeWord(word, useFst)` / `UseFstFor`. +> - **`GrammarFstAdvisor` + `GrammarFstClosure`** — the grammar census/linter (PR #441's original core). +> +> **Out of scope / explored-then-abandoned:** the *per-stem completeness proof* (proving the fast path +> complete for every word without the engine). 
Sections §11.5+ and §12.3+ below document that +> exploration and why it was dropped (rule/symbol coverage ≠ path coverage; the segmentation-superset +> proposer was slower and still incomplete). The shipped completeness model is the empirical +> certificate + engine backstop, not a static per-stem predicate. Deferred to later PRs: the +> generator (reverse direction) and a 2-way-FST/compounding treatment of the residual ~3%. + +Goal: replace HC's combinatorial un-application *search* (measured ~10,000 `Word` clones/word, +397 MB/word, the cause of the ~3× parallel ceiling) with a precompiled **transducer walk** for +the finite-state fraction of a grammar — while **degrading gracefully** to the existing engine +for the parts that aren't finite-state. A grammar census of the real Sena grammar found it +**~100% FST-able** (0 rewrite rules, 0 variables, 0 productive reduplication, all-concatenative +affixation) — and the `GrammarFstAdvisor` in this PR confirms it (Tier 1, 0 escapes) — so for +Sena-like grammars an automaton walk could be 10–100× faster and near-zero-allocation (which also lifts +the thread ceiling). + +## 1. Tech stack — build on SIL.Machine's own `Fst` (not OpenFst/Foma/HFST) + +The decisive fact: **`SIL.Machine.FiniteState.Fst` already provides the full algebra we need** — +`Compose`, `Determinize`, `Minimize`, `Intersect`, `EpsilonRemoval`, transducer outputs +(`IFstOperations`: Insert/Replace/Remove), and crucially **`UseUnification`** (arcs carry +*feature structures* matched by unification, not just plain symbols). The `RootAllomorphTrie` +is already a lexicon FST built on it. + +| Option | Verdict | +|---|---| +| **SIL.Machine `Fst`** (in-repo) | ✅ **Recommended.** Managed, cross-platform, no interop; *natively models HC's feature-bearing segments with unification*; composition algebra already present; lexicon-FST precedent. | +| OpenFst / Foma / HFST (C/C++) | ❌ for now. 
Mature + fast, but: plain-symbol alphabets (must flatten feature structures → blowup; even though Sena has no variables, this loses HC's native model), heavy P/Invoke + native build/cross-platform burden, and reconciling results back to HC's `Word`/`Properties`. Reserve only if SIL.Machine's FST can't scale. | + +So the stack is: **C#/.NET on SIL.Machine's `Fst`**, reusing the existing `ShapeNode`/`FeatureStruct` +model. The work is a *compiler* (Language → composed transducer) plus a *runtime* (`IMorphologicalAnalyzer` +that walks it), not a new FST engine. Risk to retire early: validate that `Fst.Compose` + +`Minimize` behave correctly for **unification** arcs at grammar scale (they're proven for the +matcher's pattern FSTs; composition of large lexicon∘affix transducers is the unknown). + +## 2. The compile pipeline (Language → one analyzer transducer) + +1. **Classify** every construct (the census, made a reusable pass): concatenative affix / + template / environment-allomorphy / bounded compounding = **FS-able**; rewrite rule with + unbounded environment, α-variable, productive reduplication, infixation = **non-FS island**. +2. **Build component transducers** for the FS-able fraction: + - lexicon → root transducer (extend `RootAllomorphTrie`), + - each concatenative affix subrule → an insert/concat transducer, + - affix templates → position-class concatenation, + - environment-conditioned allomorphy → context-restricted arcs, + - bounded compounding (`MaxStemCount`) → bounded recursion unrolled. +3. **Compose** them (`lexicon ∘ affixes ∘ templates`) into one transducer, then + **Determinize + Minimize**. Composition bakes in rule ordering/opacity; minimization gives + the optimal shared state set (the Myhill–Nerode classes). +4. 
**Invert/orient** for analysis (surface → underlying+gloss): the analyzer walks the input + word through the transducer, reading off morpheme IDs / `Properties` on accepting paths — + the same IDs HC's `Word` carries, so the consumer mapping (FieldWorks → LCM) is unchanged. + +## 3. Graceful degradation — the tiered hybrid (the key design) + +The architecture must never regress: the FST is a **sound optimization layered over the proven +search engine**. Three tiers, chosen automatically by the compile-time census: + +- **Tier 1 — fully FS-able grammar (e.g. Sena).** The whole grammar compiles; the transducer is + **complete**. Analysis = automaton walk only; the search engine is never invoked. Maximum win. +- **Tier 2 — FS-able with isolated non-FS rules.** Compile the FS fraction into the transducer; + mark each non-FS operation with an **escape** (flag-diacritic-style arc). At runtime: + - cheaply detect whether any non-FS rule *could* apply to this word (e.g. a reduplication + signature, or a segment a rewrite rule targets); + - if **not** → the transducer is complete for that word → fast path; + - if **yes** → fall back to the existing `Morpher` search for that word (or delegate just the + escaped sub-operation, then resume the walk). + Most words avoid the islands → mostly fast, with the slow path only where needed. +- **Tier 3 — pervasively non-FS (heavy rewrite rules, α-variables, productive reduplication).** + The FST covers too little; **disable it** and use today's search engine. No regression. + +**Soundness contract (non-negotiable):** the FST must (a) never emit a wrong analysis, and +(b) for any word it claims complete, never miss one. Guaranteed by: only compiling +*provably*-FS-able rules; in Tier 2, falling back whenever completeness is uncertain (conservative); +and a **verification mode** during rollout that runs FST + search and asserts identical analyses +across a corpus (we already have the Sena rig + signature comparison for exactly this). 
+ +The degradation is *monotone in grammar complexity*: more FS-able ⇒ more handled by the fast +walk; less FS-able ⇒ more fall-back, down to pure search. Nothing ever gets slower than today. + +## 4. Where it bolts onto the code + +- New `FstMorpherCompiler`: `Language → ComposedAnalyzerFst` (+ the per-grammar tier decision). +- New `FstMorpher : IMorphologicalAnalyzer` (and `IMorphologicalGenerator` for the reverse): walks + the transducer, emits `WordAnalysis` / the morph `Properties`; on a Tier-2 escape, delegates to + an inner `Morpher`. +- Reuse: `RootAllomorphTrie` (lexicon FST), the `Fst` algebra, the `ShapeNode`/`FeatureStruct` + model, the census classifier, and the benchmark + signature comparison for verification. +- Consumers are unaffected: same `IMorphologicalAnalyzer` interface; FieldWorks keeps mapping + morpheme IDs → LCM. + +## 5. Risks & mitigations + +| Risk | Mitigation | +|---|---| +| `Fst.Compose`/`Minimize` unproven on large **unification** transducers | Spike on Sena first; validate output == HC output on the corpus before scaling; fall back to plain-symbol flattening (Sena has no variables) if needed | +| State/alphabet blowup | The **eager/lazy partition knob** (§10): a state/memory budget that auto-demotes expensive-cold layers from precompiled (A) to on-the-fly (B); completeness is invariant under the knob (composition associativity), so bounding size never drops analyses. Minimize-after-compose only on safe (non-unification) layers | +| Tier-2 "is the FST complete for this word?" detector unsound → missed analyses | Make it conservative (fall back when unsure); verification mode catches misses in rollout | +| **Closure**: a normal (FST) step *feeds* an escape, so the automaton's "no path" is a false "done" | Confirm FST closure (§9): static feeding-closure pass (`range(F) ∩ T_E = ∅` via `Fst.Intersect`) + stratal containment; corpus closure verification (set parity) gates replacing the search engine. 
Undecidable feeding (non-regular escapes in a loop) ⇒ conservatively keep those words on the search backstop | +| Generator (synthesis) direction | Same transducer inverts; or keep HC synthesis initially and only FST-accelerate analysis | +| Grammar-specificity | The census decides the tier per grammar; production grammars must be censused before enabling Tier 1/2 | + +## 6. Phased plan + +1. **Spike (decisive):** compile Sena's lexicon ∘ concatenative-affixes into one transducer via + `Fst.Compose`/`Minimize`; build a minimal `FstMorpher.AnalyzeWord`; **verify** its analyses + equal `Morpher.AnalyzeWord` on the Sena corpus (signature comparison); **measure** clones (→~0), + allocation, and wall-time vs. the search engine. This proves or kills the SIL.Machine-FST stack. +2. **Complete Tier 1:** add templates, environment-allomorphy, bounded compounding; full Sena + parity + the parallel-scaling re-measurement (expect the 8-thread/3× ceiling to lift, since the + walk barely allocates). Build the compiler as a **pipeline of composable layers behind an + eager/lazy interface** (§10) from the start — the partition knob and state budget are Phase 1–2 + architecture, not a later bolt-on. +3. **Tier 2 hybrid:** census-driven escape arcs + per-word fallback detection + verification mode, + gated on **confirming FST closure** (§9) — the static feeding-closure pass + corpus closure + verification that certify the transducer's "no analysis" is a proof, not a guess. +4. **Generator + productionize:** reverse direction, the `IMorphologicalAnalyzer` wiring, and a + FieldWorks adapter; run the census on real production grammars to set each project's tier. + +## 7. Decision gate + +Step 1 (the spike) is the gate: it answers, with numbers, whether SIL.Machine's FST can compose +a real grammar correctly and how big the speedup is. If yes → proceed; if `Fst.Compose` can't +handle it → reassess (flatten to symbols, or external lib). 
Everything past Step 1 is contingent +on that result. + +## 8. Transducer output schema — the packed morpheme-token array + +What the analyzer transducer emits on an accepting path must be the *structured derivation* +(ordered morphemes + root), not just accept/reject — otherwise it is a **recognizer, not an +analyzer**. HC carries this today as per-segment morph annotations + an ordered allomorph list +(`Word.MorphemesInApplicationOrder` → `WordAnalysis.Morphemes`/`RootMorphemeIndex`); the FST must +emit the same structure as transducer output. + +**Encoding — one 32-bit token per morpheme, in application order:** + +``` + 31 24 23 0 ++----------------+--------------------------------+ +| 8-bit MorphOp | 24-bit morpheme index | ++----------------+--------------------------------+ +``` + +- **op (high 8 bits)** = the morpheme's *role/operation*: Root, Prefix, Suffix, Infix, + Reduplication, CircumfixPrefix/Suffix, Compound, Clitic, Process (simulfix/ModifyFromInput), + Null (zero morph). This is the "ordered operations connected to the letters" — it lets a + consumer rebuild the gloss/bracketing without re-running any rule. +- **morpheme index (low 24 bits)** = an index into the grammar's compiled morpheme table + (→ `IMorpheme.Id`/gloss via a side table — don't pack strings). +- An accepting path's output is the **`uint[]` of these tokens — that array *is* the analysis**, + and it is **self-describing**: `Morphemes` = the indices in array order; `RootMorphemeIndex` = + the position of the `Root` token (no separate field). + +**Why this shape (verdict: sound):** + +- **Compact / cache-friendly / hashable:** 4 bytes per morph (a 5-morph word = 20 bytes); analyses + compare and dedupe as plain integer arrays. +- **24-bit ceiling = 16,777,215 morphemes** — ample (largest FLEx projects are ~10⁵–10⁶ entries); + the compiler asserts `morphemeCount ≤ MaxMorphemeId`. +- **8 bits for the op** is byte-aligned headroom (only ~5 bits used); keep it for growth. 
+ +**What it deliberately does NOT carry — keep these as separate optional channels, do not widen the +token:** + +- **Surface segmentation** (which input letters belong to which morph): if interlinear morph-breaks + are needed, the same walk emits a parallel `int[]` of morph start-offsets. The 32-bit token stays + the pure (op, morpheme) derivation. +- **Specific allomorph** (vs morpheme): an optional second channel; consumers (FieldWorks → LCM) + key on the morpheme. + +Realized now as `MorphToken` / `MorphOp` (the codec + bounds check + root recovery); the FST +compiler (the spike, §6.1) emits these tokens as arc outputs, so the analyzer is structured from +day one rather than a bare recognizer retrofitted later. + +## 9. Confirming FST closure — the completeness certificate + +An FST analyzer is only trustworthy if its **silence is a proof**: "no accepting path" must mean +"no analysis exists", and "these K paths" must mean "exactly these K analyses" (all homographs, +nothing spurious). That is **completeness**, and it does not come for free — it must be *certified* +per grammar. Completeness has two parts, and the second is the hard one: + +1. **No escape applies to the current form.** A local trigger check. Easy. +2. **No FST-able ("normal") step reachable from the input can *create* a form where an escape + then applies.** This is **feeding** (Kiparsky): rule A feeds rule B if A builds B's + environment. If a normal step can feed an escape, the compiled automaton — which excluded + escapes — is **not closed**: a valid derivation exists that it has no path for, so its silence + is a false "done". Everything rests on ruling this out. + +### 9.1 Can closure be guaranteed? Decidably yes for the regular fragment; not universally + +The universal question ("can this grammar *ever* reach an escape configuration?") is **undecidable** +in the limit — general rewriting with non-regular escapes in a feeding loop is Turing-complete. 
So +"guaranteed for any grammar" is impossible. **But for a given grammar it is usually decidable**, and +when the answer is yes the automaton's silence becomes a theorem. Two mechanisms: + +- **(a) Decidable feeding-closure (the computable certificate).** Each escape `E` has a *trigger + set* `T_E` — the configurations where it fires. For a *regular* escape `T_E` is a regular + language. Each FST-able rule `F` is a regular relation. The question "can `F` ever produce a form + in `T_E`?" is exactly the **regular-language emptiness test** + + ``` + range(F restricted to FST-reachable forms) ∩ T_E = ∅ ? + ``` + + which **SIL.Machine's `Fst.Intersect` + a reachable-accepting-state check computes directly**. Run + it over every (FST-rule `F`, escape `E`) pair: + - **all intersections empty** ⇒ no normal step can ever feed an escape ⇒ the FST fragment is + **closed** ⇒ "no escape now, and no path in the automaton" is a *complete certificate* — the + sufficient "done"; + - **some intersection non-empty** ⇒ feeding is possible: if the fed escape is *regular*, fold it + into the automaton (Kaplan–Kay; see `HERMITCRAB_FST_ADVISOR.md` §7) and re-check; if it is *non-regular/opaque*, + closure cannot be certified and those words must fall to the search backstop. + +- **(b) Stratal containment (the practical guarantee).** HC is stratal, and strata *bound* feeding. + If every escape is confined to a stratum the FST fragment never feeds *into* — e.g. + reduplication/templatic processes apply innermost, *before* FST-able affixation/phonology — then + by construction no later normal step can reach them. Verify by checking escape-rule strata against + FST-rule strata and the (downward) feeding direction. For most real grammars the "funny" + processes are exactly the innermost ones, so this holds. + +### 9.2 The per-grammar verdict + +| Situation | Is "no FST form ⇒ done" sufficient? 
| +|---|---| +| No FST-rule feeds any escape (∩ = ∅), **or** escapes stratally contained upstream | **Yes — provably.** The walk enumerates all paths; absence is a theorem; all homographs surface. | +| FST-rule feeds a **regular** escape | Fold the escape in → row above. | +| FST-rule feeds a **non-regular/opaque** escape | **No.** A valid derivation can hide from the surface; those words go to the bounded search. | + +### 9.3 Homographs (positive completeness) + +"Found one, are there others?" is the *easy* direction **once closure holds**: the walk returns +**all** accepting paths, never the first only (the spike already shows this — `dat` returns both +lexical entries). A homograph is missed only by (i) **unsafely determinizing/minimizing** and +merging paths — which is exactly why the analyzer stays nondeterministic and never `Minimize`s +unification arcs — or (ii) the compiler not encoding one decomposition (a closure failure), caught +by §9.5. + +### 9.4 The search backstop's own "done" + +For words that fall out of the FST (uncertifiable feeding to a non-regular escape), completeness +comes from the existing **bounded** search: "done" = all branches within the depth bound explored. +That is sound iff the bound is a *true* upper bound on derivation length — finite exactly when the +rule-interaction graph has **no unbounded self-feeding cycle**. A grammar with such a cycle has no +finite completeness guarantee from anyone (FST or search) and should be flagged. + +### 9.5 How we make it sufficient (the work) + +- **Static feeding-closure pass** (extends `GrammarFstAdvisor`): build the feeding graph — for each + FST-able rule and each escape, the `range(F) ∩ T_E` emptiness test via `Fst.Intersect` — and emit + a per-grammar verdict: **"closed — FST silence is a proof"** vs **"rule X feeds escape Y → those + words need the search backstop"**, plus the stratal-containment check as a fast pre-filter. 
+- **Corpus closure verification** (empirical backstop to the static proof): run the FST and the + sound+complete search engine over a corpus and assert the analysis **sets are identical** (same + cardinality and members) for every word, including ambiguous ones. Any divergence is a missing or + spurious path — a closure bug — localized to the offending rule. This converts "closed" from a + claim into a measured guarantee, and is the gate before an FST analyzer may *replace* (not just + shadow) the search engine for a grammar. + +### 9.6 Phase placement + +Closure confirmation is **Phase 3 (Tier-2 hybrid)** in §6: the static feeding-closure pass decides, +per grammar, which words the transducer is complete for and which escape to the search; the corpus +closure verification is the rollout gate. Until it passes for a grammar, the FST runs in +**shadow/verification mode** (alongside the search, asserting set parity), never as the sole +analyzer. + +## 10. Completeness under load — the eager/lazy partition knob (designed in from day one) + +Eagerly composing the whole grammar into one transducer is fastest to *walk* but the state count is +roughly **multiplicative across composed layers**, so a single high-branching layer (a position +class with hundreds of allomorphs, productive bounded compounding, a large affix inventory) can blow +the automaton up. We must be able to **bound the compiled size without ever sacrificing +completeness**. That requires a tunable partition — and because it changes correctness-adjacent +machinery, it has to be in the architecture from the start, not retrofitted. + +### 10.1 Three buckets + +Every construct lands in exactly one bucket, and the boundary between the first two is a **knob**: + +- **A — Precompiled (eager).** Composed into the static transducer ahead of time. Fastest walk; + costs states. 
+- **B — On-the-fly (lazy).** Kept as a separate composable layer and **applied at analysis time by + on-demand composition** against the partial result. Bounded memory; slower per word. Still + finite-state, still complete. +- **C — Search / probe fallback.** The non-FS escapes (and any construct whose closure can't be + certified, §9). The sound backstop. + +**What bucket C actually is (sharpened — see §11.3).** C is *not* a wide, murky middle that +"spans" A and B. Formally (Kaplan & Kay) everything concatenative — affixation, derivation, +inflection, ordered phonological rewrite rules — is a **regular relation**, hence A-or-B. The only +genuinely non-regular operations are a short list: **unbounded copying (reduplication)** and +**unbounded recursion** (productive compounding/incorporation with no depth bound), plus the rarer +**bracketing paradox**. So a C construct is a **thin, local, non-regular core wrapped in B on both +sides — `B ∘ C ∘ B`** — not a fog. That thinness is what makes the §11.3 release valves work: a +local core is **detectable and peelable**. Critically, *a construct missing from the FST is not +automatically C* — it is usually just **unbuilt B** (regular, simply not yet enumerated), which is +exactly what the Sena derivation gap turned out to be (§11.2). + +The **A↔B boundary is the knob**; **C is fixed by the §9 closure analysis, not the knob**. There is +always a safe floor setting — *everything in B* (nothing precompiled) — which is bounded in memory +and still complete; the knob only interpolates between "fast and big" (more A) and "small and slow" +(more B). The automaton can therefore never be forced to explode: when eager composition would +exceed a **state/memory budget**, the compiler demotes layers A→B until under budget. + +### 10.2 Why completeness is *independent of the knob* (the load-bearing guarantee) + +This is the property the knob must never break, and it holds for three composing reasons: + +1. 
**Composition is associative.** `(A ∘ B) ∘ rest ≡ A ∘ (B ∘ rest)`. Precompiling a layer versus + applying it lazily denotes the **same transduction** — the split point changes *when* the work + happens, never *which* relation is recognized. So moving a rule from A to B cannot add or drop a + single analysis. +2. **The walk enumerates all paths in either bucket.** A lazy layer expands *all* its applicable + arcs on demand (not the first), exactly as a baked-in layer would, so homograph/positive + completeness (§9.3) is preserved across the split. +3. **Closure (§9) is computed on the full relation `A ∘ B`, not on the precompiled subset.** The + feeding-closure certificate and the corpus set-parity gate validate the *whole* partition, so + "no path ⇒ done" stays a proof wherever the knob sits. + +Net: the knob is a pure **space/time dial**; the **analysis set is invariant** under it. That is why +it is safe to expose it (even to auto-tune it) without re-proving correctness each time. + +### 10.3 The knob's policy — and why it is per-language (yes, it would differ) + +The optimal A/B cut is grammar- and corpus-specific. Rank each candidate layer by two measurable +quantities: + +- **state-multiplier** — how much it grows the composed automaton (measure by composing it and + diffing the minimized state count); +- **hotness** — how often a corpus sample actually exercises it. + +Precompile (A) the **cheap-and-hot** layers; keep lazy (B) the **expensive-and-cold** ones; demote +A→B in descending cost/benefit until under the state budget. These quantities vary by language: a +language with one rarely-used 200-allomorph class should keep it lazy (precompiling multiplies the +whole automaton ×200 for little corpus payoff), while a language whose hot morphology is a handful of +low-branching affixes should precompile nearly everything. 
**So the same construct can be A in one +project and B in another** — the partition is a *pluggable policy* (with an optional auto-tuner that +reads the state-multiplier/hotness numbers), not a hard-coded rule. + +### 10.4 What "designed in from the beginning" demands + +- The compiler is a **pipeline of self-contained composable layers**, each carrying metadata + (state-multiplier, hotness, closure status), **not** a monolithic "compose everything." +- Each layer can be realized **either** as composed-in arcs (A) **or** as a lazy applicator (B) + behind one interface, so moving the knob is a config change, not a rewrite. +- The analyzer walks the **eager core and lazily expands B-layers on demand**, accumulating the same + `MorphToken` outputs (§8) regardless of bucket. +- A **state/memory budget** is a first-class compile input; exceeding it triggers automatic A→B + demotion (never a silent truncation — log what was demoted). +- The **corpus set-parity gate (§9.5) runs against the chosen partition**, so any A/B setting that is + shipped is verified complete before it can replace the search engine. + +### 10.5 Phase placement + +The layered, lazy-capable compiler and the budget/policy interface are **Phase 1–2 architecture** +(the spike's `FstMorpher` is already structured as discrete composable pieces — lexicon chains + +affix chains — rather than a monolith, which is the seed of this). The auto-tuner and per-project +policy tuning are **Phase 4 (productionize)**. The completeness invariant (§10.2) is an **invariant +checked at every phase**, not a phase of its own. + +## 11. Findings from the Sena drive (the corrected picture) + +This section records what the actual Sena implementation taught us, *correcting* earlier divergence +analysis that was measured against a broken baseline. Read it before §9/§10 are taken as final. 
+ +### 11.1 The measurement bug that invalidated earlier divergence numbers + +The benchmark forced `Morpher.MaxUnapplications = 3` on the **search engine used as ground truth**. +But in HC `MaxUnapplications = 0` means **unlimited** (the cap engages only when `> 0`, +`AnalysisStratumRule.cs:144`). Setting it to `3` throttled the reference search down to **0–few +analyses per word**, so every "divergence" the FST showed against it was the FST disagreeing with a +*crippled* oracle — artifacts, not morphology bugs. **Lesson: always run the reference `Morpher` +with `MaxUnapplications = 0` (unlimited) when measuring FST parity.** A `=3` ground truth is +meaningless. + +With the corrected (unlimited) oracle: + +| corpus | FST template analyzer vs search | speed | +|---|---|---| +| curated 15 words | **IDENTICAL** (sound + complete) | 2.4 vs 177.8 ms/word (**~74×**) | +| broader 60 words | 12 real divergences (below) | 2.9 vs 245.5 ms/word (~85×) | + +So the FST approach is **already sound + complete on the regular fraction** it builds; the residual +is coverage and a verification subtlety, not a flaw in the "walk the forest" design. + +### 11.2 The two real residuals (neither is bucket C) + +The 12 genuine divergences split cleanly, and **both kinds are bucket B, not C**: + +- **Over-generation** (FST proposes readings search rejects — e.g. `kulemba` as `INF+[escrever]+IND`, + `mbalira`, `ndiende`, invalid agreement combos in `akudza`/`aikwata`). These are killed cleanly by + **verify-discard** (`VerifiedFstAnalyzer` / `FstReplay`): re-synthesize each candidate through the + proven engine and drop any that does not regenerate the surface. Re-synthesis enforces *every* HC + constraint at once (category, MPR, co-occurrence, obligatoriness) — so this is the "install all the + gates" mechanism, and it removed every over-generation in the corpus with no FST-encoded gate. 
+ +- **Under-generation** (search has readings the FST never proposes — `aikhane`, `angwera`, `kunduli`, + `paoneke`, `khalani`, `cidzo`, `ikoyiwe`). **Every one is a derivational suffix the FST build + omits:** `REC` (reciprocal), `APPLIC` (applicative), `REV` (reversive), `NZR` (nominalizer), `NEU` + (neuter/stative), `PAS` (passive), `acção`. The build covers the *inflectional* layer (subject/ + object agreement + TAM) but not the *derivational* layer between root and inflection (e.g. + `[vencer]+REV+NZR`, `[cair]+APPLIC+IND`, `[ser]+REC+NZR`). This is **unbuilt B** — concatenative, + regular — not a non-regular gap. + + *Build-order wrinkle:* derivation reintroduces the surface-vs-derivation order problem. In + `kunduli = 10+[vencer]+REV+NZR`, the class-10 *prefix* is licensed only because `NZR` (a later + suffix) nominalized the stem — a left-to-right surface walk cannot gate that. **Resolution: build + permissively (propose the derivation paths) and let verify-discard kill the bad combos.** Do not + attempt to gate derivation order in the walk. + +- **Verify false-rejections** (`kubvuna`, `akhaona`, `nyabasa`, `ndalama`): verify-discard dropped + *valid* analyses it could not re-synthesize. This is **token under-determination** — the + `(op, morpheme)` token (§8) omits an allomorph or feature needed to regenerate the surface, so the + replay fails on a legitimate analysis. This — not reduplication — is the real "last nut" for a + *lossless fast path*, because it makes verify-discard lose true analyses. (The `SoundHybridMorpher` + fallback variant stays complete by routing any unconfirmable word to full search, at the cost of a + high fallback rate — 88% here — so it is correct but not yet fast.) + +### 11.3 Bucket C in the wild, and the release valves (does it even occur here?) + +**In *this* Sena grammar: there is no bucket C.** The grammar file has **0 reduplication rules** +(`grep reduplicat` = 0; all rules are `CopyFromInput` + `InsertSegments`, i.e. 
ordinary affixation), +the census reports **Tier 1 / FST-CLOSED / 0 escapes**, and compounding is bounded (8 rules). So the +slow path may never need to fire for Sena; the `HybridMorpher` total-reduplication route is a +never-triggered safety net here. + +**In general, genuine C does occur** — Bantu verb reduplication (`-famba-famba` "walk around"), +Indonesian/Malay full reduplication (`buku-buku` "books"), Tagalog aspect reduplication, and +bracketing paradoxes (English `un-happi-er`). For those, three resolution paths — and the key +insight that **for copy, detection and parsing are the same local problem** (a reduplicant is an +adjacent repeated substring; detecting it *is* finding the split): + +1. **Bounded fold into B (length-cap the copy).** Precompile reduplication for stems up to length + `N` — finite, therefore regular, therefore pure B. Cost is **linear** in `N×|stems|`, not + exponential. Stems longer than `N` (vanishingly rare) fall to the backstop. Best when copy shapes + are few. +2. **Detect-and-peel (compile-replace).** At parse time run a cheap repeat-scan that *proposes* + candidate reduplicant splits; strip the copy and hand the base to the B-FST; accept any split + whose base parses and whose copy relation holds. No precompile blow-up, handles unbounded copy, + and the live work is just the scan + peel — the heavy lifting stays in B. This is the "look for it + live as well" valve, and the standard finite-state-morphology answer (Beesley & Karttunen's + `compile-replace`). **Preferred.** +3. **2-way FST.** Replace the 1-way transducer with a two-way one for the reduplicative fragment — it + re-reads its input, so it *computes* the copy a 1-way FST cannot, while staying finite-state and + linear-time (Dolatian & Heinz, computing reduplication with 2-way FSTs). Cleanest in theory; + biggest lift (SIL.Machine's `Fst` is 1-way). 
+ +**The A/B/C balance is computable, not guessed.** For each candidate C-feature, build the FST with +and without it folded and measure `Δ|states|`/`Δ|arcs|` (the precompile blow-up), and measure the +corpus frequency of words needing it; fold iff `Δmemory` fits the budget *and* `freq × slow_latency` +saved is worth it. This is the §10.3 knob made quantitative — a knapsack over the state-multiplier +and hotness numbers the layered compiler already exposes. + +**Theory load-bearing here** (attributed by idea; verify exact citations before quoting): rewrite +rules compose to regular relations (Kaplan & Kay 1994); reduplication is *the* canonical non-regular +morphological process; 2-way FSTs can compute it (Dolatian & Heinz); subregular locality (Chandlee) +explains why everything else is cheaply finite-state. + +### 11.5 Why re-synthesis verification failed — and why it is fixable (the confirmed root cause) + +The verify-discard mechanism (§11.2) leaned on `Morpher.GenerateWords` to confirm a candidate by +re-synthesis. A round-trip self-test exposed that **HC's own search analyses do not round-trip +through `GenerateWords`** for derivational/inflected *verb* forms (`aikhane`/`angwera`/`kunduli`/ +`ikoyiwe` → all NO), while *noun*/simple forms do (`kulemba`/`mbalira` → OK). A deep probe of +`aikhane` settled the cause — and it is **not** fundamental loss: + +- All its morphemes are plain `AffixProcessRule`; the analysis's `RealizationalFeatureStruct` is + empty (`ANY`). So it is not a realizational-FS reconstruction problem. +- Re-synthesis reproduced the surface under **none** of: all-morphemes-as-rules, non-realizational + only, empty FS, or the ground-truth FS. +- The grammar has **0 phonological rules**, so it is not opacity. 
+ +The real cause is the **two synthesis doors** in `Morpher`: + +| Door | Input | Behavior | +|---|---|---| +| `Synthesize` (internal, used by `ParseWord`) | the **rich analysis `Word`** (stripped shape + exact template/slot structure + features, via `LexicalLookup`) | **faithful** — reproduces every valid analysis | +| `GenerateWords` (public convenience) | a **flat bag** of morphemes, re-permuted and applied as **free** morphological rules | **lossy** — re-guesses order/context, bypasses templates | + +Confirmed in the grammar: the inflectional affixes (`3P+2`, `SBJV`, …) are **template-slot rules** +(`mrule26+`, inside `<AffixTemplate>`), while only compounding/derivation (`mrule1–25`) are free stratum +rules. `GenerateWords` applies the slot rules as free rules — no slot order, no obligatoriness, no +template gating — so feature-dependent verb combinations never synthesize, even from the exact right +morphemes. A simple noun + class-prefix (one slot, no interdependency) happens to survive, which is +why nouns round-trip and verbs do not. + +**The under-determination is therefore self-inflicted, not fundamental.** The FST *walk* knows +exactly which template and slots it traversed and in what order — it discarded that when it emitted +the lean `(op, morpheme)` token (§8). The fix is to **preserve the template/slot path the walk took +and verify through HC's faithful door** (`Synthesize`-style, template-aware directed synthesis), +rather than the flat `GenerateWords`. That makes verify both **sound and lossless**: a real +over-generation (e.g. an object marker on an intransitive stem) still fails HC's template-aware +synthesis and is dropped, while a valid verb form now confirms instead of being false-rejected. This +also collapses the 90% `SoundHybridMorpher` fallback (which was driven by false-rejection, not by +genuine over-generation).
+ +### 11.6 The measured corpus picture (200 Sena words, unlimited oracle) + +| analyzer | result | speed | +|---|---|---| +| search (oracle) | 480 analyses | 224 ms/word | +| raw FST template+derivation | 49/200 diverge (~24.5%): **~19% over-gen, ~7% under-gen** | 3.5 ms/word (**~64×**) | +| verify-discard (`GenerateWords`) | 48/200 — barely helps (the §11.5 lossy door) | 8.3 ms/word | +| sound fallback | 2/200 — near parity, but **90% fallback** (false-rejection driven) | — | + +Reading: completeness is *nearly* there (the derivation layer cut under-gen but ~7% remains — +category-changing derivation (§11.4 item 4) and prefixal derivation (§11.4 item 5)). Over-gen (~19%) is the larger +axis and is what the template-aware verify (§11.5) must remove. The headline speed (~64×) is real; +the open work is making the *verified* path lossless so the fallback rate falls from 90% toward the +true over-gen rate. + +### 11.7 Status: correctness essentially done; speed is the one remaining lever + +A check of the `SoundHybridMorpher` path on the full 200-word corpus settles where the project is: +**both residual divergences (`miwiri`, `mitemo`) are `extra=[]` — pure under-generation, zero +over-generation.** So: + +- **Sound** — the hybrid never emits a wrong analysis (the fallback catches every over-gen). ✓ +- **~99% complete** — 198/200 exact set-match; 2 residual under-gen. ✓ +- **Not yet fast** — 90% fallback, so no net speedup *yet*. ⚠ + +Correctness is therefore effectively achieved. The single open axis is **speed**, and it has one +precise lever: the 90% fallback is driven by the **lossy `GenerateWords` verify false-rejecting valid +words** (§11.5), *not* by genuine errors. A lossless verify collapses the fallback toward zero and +unlocks the ~64× the raw FST already shows.
+ +### 11.8 The precise remaining build — a faithful (lossless) verify + +`GenerateWords` fails because it re-synthesizes from a **flat, permuted pool of rules**, losing the +**cross-stratum / template-slot ordering** that HC's internal `Synthesize` reads off the rich +analysis `Word`. Confirmed on `aikhane`: stem shape = root citation shape = `ikh` (so it is *not* a +stem-shape problem); its rules `a-5 -e -an` mix template-slot inflection (`a-5`, `-e`) with a +free derivational rule (`-an`, REC) that live in **different strata**, and the flat pool cannot +reconstruct the stratum order. The FST walk, by contrast, *knows* the stratum/template/slot/order it +traversed. + +**Caveat (measured):** `GenerateWords(WordAnalysis)` *permutes* the rule order — so it already tries +the correct order — and still fails. So the missing ingredient is **not** merely rule ordering; it is +state the rich analysis `Word` carries (syntactic features established during un-application) that a +from-citation synthesis does not re-establish. That makes a *cheap* faithful verify harder than +"apply the rules in the right order," and points to **two viable routes** (pick by measured payoff): + +- **Route A — faithful reconstruction verify.** Reconstruct enough of the rich analysis `Word` + (root + ordered rules + stratum/template/slot context the walk knows) to drive HC's internal + `Synthesize` rather than `GenerateWords`. Lossless if the reconstruction is faithful; the open risk + is whether the analysis-derived syntactic features are reconstructable from the walk's knowledge. +- **Route B — build-time constraint gates (make the FST faithful, no verify).** The over-generation + is concrete constraints — e.g. an object marker on an intransitive stem is a **subcategorization** + fact known at build time, hence order-independent and gateable like the existing category gate. 
+ Encode the few over-gen-causing constraints on the FST arcs so it stops proposing them; then the + FST is faithful and needs no per-word verify. Cross-slot *feeding* constraints that are genuinely + not left-to-right gateable route to the search backstop (§9). + +Either route ends the same place: `VerifiedFstAnalyzer`/raw FST becomes sound *and* complete with +**near-zero fallback**, at full FST speed. + +**Decision: Route A** (chosen). Route B *duplicates* HC's constraint logic as a parallel set of FST +arc-gates that must be kept aligned with the real engine and debugged independently — a second +morphology engine, the anti-pattern this whole design avoids. Route A *reuses* HC: the constraints +stay where they already live and are already correct. + +**Route A, sharpened — "directed un-application, then `Synthesize`":** HC parsing is *search +backward* (the slow combinatorial un-application, ~10k clones/word) → *synthesize forward* to confirm +(cheap, ~2.7 ms). The FST replaces only the slow backward search — it already knows the exact path +(root + ordered rules + stratum/template). So the verify should: +1. **Directed un-application** — apply the analysis rules for *only the FST's chosen path* (no search + breadth) to the surface, producing HC's own rich analysis `Word` (with the syntactic features that + `GenerateWords`-from-citation never establishes — the §11.8 caveat). +2. **`Synthesize`** that rich `Word` through HC's existing machinery and check it matches the surface. + +Faithful by construction (HC's exact pipeline with the FST navigating instead of brute force), and +the cost is ~(rules in the path) × per-rule-apply rather than the full fan-out — the source of the +≥10×. The remaining engineering question is the cleanest way to drive HC's per-rule analysis +un-application from the FST token sequence (the rules are recoverable from the `(op, morpheme)` +tokens via the codec; the analysis-rule objects are `mrule.CompileAnalysisRule`). 
+ +**DONE — Route A is implemented and works (the cleanest possible form).** HC's `Morpher` exposes +settable `LexEntrySelector`/`RuleSelector` (default `=> true`), checked at every analysis *and* +synthesis step. So the verify never reconstructs anything: it simply runs HC's own `AnalyzeWord` with +those selectors **pinned to the candidate's root and rules**, which prunes the combinatorial fan-out +to the single path the FST found. A candidate is valid iff it appears in that restricted result +(restriction can only remove paths, never fabricate one — HC still runs full synthesis + surface +match). Implemented in `FstReplay.Reproduces`; `VerifiedFstAnalyzer` keeps confirmed candidates and +discards the rest. **Measured (200 Sena words, unlimited oracle): verify-discard went from 48 → 14 +divergences (186/200 set-match) at 15.6 ms/word vs 234 ms/word oracle (~15×), with ALL +over-generation removed and zero false-rejection (lossless).** The 14 residual are pure +under-generation. This is the thin wrapper the design wanted — HC's real engine, navigated by the +FST, no reimplemented constraints. + +**Feasibility confirmed (why this works where `GenerateWords` fails).** `AnalysisAffixTemplateRule` +unifies the template's `RequiredSyntacticFeatureStruct` and **writes it onto the word** +(`outWord.SyntacticFeatureStruct.Add(fs)`, plus each slot rule's analysis sets its features). That +populated `SyntacticFeatureStruct` is the precondition the inflectional rules check during synthesis +— and is precisely what a from-citation `GenerateWords` never establishes (root citation form carries +bare features), which is why even the correct rule order fails there. Directed un-application calls +those *same* `CompileAnalysisRule` objects along the FST's path, so it reconstructs the populated +`SyntacticFeatureStruct` for free, then `Synthesize` succeeds. Reuse, not reimplementation. 
The build +applies the FST path's analysis rules (template + free derivation) to the surface `Word` — bounded by +the path, not the full search — yielding rich analysis `Word`(s) to hand to the existing `Synthesize`. + +### 11.4 The path to a full solution (what "done" means for Sena) + +1. ✅ **Re-validated the gates** built against the broken oracle: the `mbale` obligatoriness gate is + still load-bearing under the unlimited oracle (5→4 divergences); the category gate is faithful + build-time logic. +2. ✅ **Built the derivation layer** into the FST (§11.2 under-gen largely closed — `aikhane`/ + `angwera`/`paoneke`/`ikoyiwe` now proposed). +3. ✅ **Faithful (lossless) verify** (§11.8) — done via restricted re-analysis; sound + lossless at + ~15×, no fallback. `verify-discard` = 186/200 set-match (was 151 raw / 152 old verify). +4. ✅ **Category-changing derivation** — `DerivableToCategory` attaches a template over a derived + stem of its output category (verb + `NZR` → noun + class prefix), closing `kunduli`/`cidzo`/ + `khalani`. Took `verify-discard` from 14 → **6** divergences (194/200 set-match). +5. ⬜ **The last 6 (diverse proposer gaps, diminishing returns)** — all pure under-gen, all in the + *proposer*: **prefixal derivation** (`nyari` = `nominalizador`-prefix + `[ser]`; `cawo` associative), + **depth-3 derivation** (`miwiri` = `[ter]+PAS+APPLIC+NZR`; depth 3 gains it but ~2× verify cost, so + left to the backstop), and **copula/TAM** constructions (`ndico`/`ndimwe` = `é+[ele]`/`é+[vós]`; + `kuumadi` = `INF+…+IND+EVID`). Each is a small proposer-coverage item; a prefixal derivation layer + (mirror of the suffix layer) would close the first two. +6. **Target metric:** FST analyses == search analyses (set parity), at ≥10×. **Achieved: sound ✓, + lossless verify ✓, ~13× ✓ (17.2 ms/word vs 237 ms oracle), no fallback ✓, 194/200 (97%) + set-match.** The last 6 are diverse proposer coverage gaps, not a verify or soundness issue. 
+ +### 11.9 Metric correctness and two productionization caveats + +**The parity signature was sharpened (important).** It was `join(morpheme.Id) + ":" + rootIndex`, but +affix `Morpheme.Id` is empty in this grammar, so it encoded only *(morpheme count, root position)* — +collapsing distinct affixes of the same shape (e.g. subject markers `3P+2` / `3S+1` / `6`) into one +key and hiding same-shape under-generation. Replaced with **per-morpheme object identity** (both +analyzers reference the same `Morpheme` instances from the `Language`, so it is a faithful shared +discriminator). Under the strict signature the raw-FST divergences rose 44 → 90 (shape-parity *had* +been hiding raw over-gen), but **`verify-discard` stayed at 6 (194/200), all pure under-gen** — i.e. +the verify result is robust to the metric and the soundness/lossless claim is real, not a shape +artifact. `FstReplay`'s candidate-match signature was sharpened the same way. + +**Caveat 1 — the verify mutates shared `Morpher` selectors (thread-safety).** `FstReplay` sets +`LexEntrySelector`/`RuleSelector` on the morpher with try/finally restore — correct sequentially, but +two words verified concurrently on one morpher would race the selectors. Since a core motivation is +lifting the parallel ceiling, production must give the verify a **per-thread morpher or a morpher +pool** (the analysis FST walk itself is allocation-light and parallel-friendly; only the verify step +carries this constraint). + +**Caveat 2 — the ~13× is vs the unlimited-unapplication oracle** (`MaxUnapplications=0`, 237 ms/word). +That is the sound+complete baseline (the only correct one — §11.1), and is what the FST must match. +If production HC runs a *bounded* cap for speed, it trades completeness for time, so the real-world +multiple against that configuration should be sanity-checked separately before quoting a single +headline number. + +## 12. 
The completeness certificate — a grammar-level proof (not per-word) + +Completeness is not a per-word heuristic; it is a **property of the grammar's rule structure**, +certified once. The contract is two exhaustive enumerators joined at a cut no derivation can cross: + +- **Side B (precompute / FST) is complete** because the regular sub-relation is a *finite automaton*: + by Myhill–Nerode it has finitely many states, and walking **all** accepting paths enumerates **all** + analyses — "enumerated absolutely everything," mechanically. (Never `Minimize` underspecified-feature + arcs: that merges distinct paths and destroys the guarantee — §9.3.) +- **Side A (live) is complete** iff (1) it tries *every applicable rule* at each node (HC's `RuleBatch` + does), and (2) a **well-founded measure** strictly decreases each step (un-application shortens the + surface; or a stratum/depth bound), so the finite search tree is fully visited. This is "I check + these N things, then I'm done." + +### 12.1 Why two complete halves can still miss — and the cut that fixes it + +If a derivation **weaves** across the boundary (`A→B→A→B`), B enumerates only B-internal paths and A +only A-internal paths, so the interleaving is **silently missed** even though each half is complete. +The fix is a **clean directed cut**: every feeding edge crosses the boundary in *one* direction. Inner +morphology feeds outer (the inner stem is what an outer affix attaches to), never the reverse — so put +**A = inner, B = outer**. Then every derivation factors uniquely as `(A-core) ∘ (B-shell)`: analysis +peels the B-shell with the FST (all ways) and hands each residual stem to A (all ways); the composition +is provably the whole analysis set. No weaving ⇒ no gap. + +### 12.2 The graph theory of a valid cut + +Model the grammar as a **feeding graph** `G` (nodes = rule/construct classes; edge `r→s` iff `r` can +create the environment `s` needs — Kiparsky feeding). + +1. 
Condense strongly-connected components (Tarjan) → a DAG of SCCs (an SCC = mutually-feeding rules, + i.e. a potential cycle). +2. A **valid cut** is a downward-closed set in the DAG's topological order — a *topological separator* + with all cross-edges pointing `A→B`. (HC's strata are a hand-built such stratification.) +3. Two further obligations: the **B-side relation must be regular** (Kaplan–Kay: concatenation + + ordered rewrite = regular), and every **SCC kept in A must be well-founded** (no unbounded-growth + cycle — bounded copy ok, unbounded copy not). + +A grammar admitting such a cut with B regular and A well-founded has, by construction, +`A-complete ∧ B-complete ⇒ whole-complete`. **This is the certificate, computed on the grammar.** + +### 12.3 The construct-coverage half (why "FST-closed" is necessary but not sufficient) + +`GrammarFstClosure` / the census already certify the *regularity / no-escape* half (the B-side relation +is regular; for Sena, 0 escapes). That is necessary but **not** sufficient: the FST must also actually +**enumerate every construct on the B-side**. A regular construct the builder never emits is a +*hole inside B* — a silent under-generation, not a boundary problem. So the certificate has two +mechanical checks: + +- **Closure** — the B-side is regular / no un-handled escape (existing `GrammarFstClosure`). +- **Coverage** — every grammar construct on the B-side (every affix rule in a template slot or as a + standalone morphological rule, every compounding rule, every root) is represented on some FST arc. + +`Closure ∧ Coverage` over the cut ⇒ the FST enumerates the entire B-relation ⇒ **complete for every +word** with no per-word check. If coverage fails, the certificate **names the uncovered constructs** +and the build is *flagged* (those derivations route to the proven engine) — never a silent miss. 
+ +### 12.4 Sena under the certificate + +Census: 0 escapes, 0 reduplication, 0 phonological rules → the entire feeding graph is regular, with no +non-regular SCC. So the unique maximal valid cut is **A = ∅, B = everything**: Sena is provably +completable *entirely* in the FST, with **no live side needed**. The residual divergences are therefore +not a cut/soundness issue — they are **coverage holes** (constructs the builder omits: prefixal +derivation, depth-3 derivation chains, copula/compounding). The certificate's job is to (a) confirm +`A = ∅` and (b) list exactly those holes, turning "97% empirically" into "complete once coverage = 100%, +and known-incomplete-where-flagged until then." + +### 12.5 Why this does not balloon (size rationale) + +B is an **automaton with shared structure**, not a stored list of words: size ≈ `|lexicon trie| + +|affix inventory × template structure|` — **additive**, not the multiplicative `|roots| × |affix +combinations|` of a materialized word list. Measured on Sena: **50,673 states from 1,463 root +allomorphs + 24 templates**, sub-second build, a few MB. "Enumerate everything" means *walk all paths +at parse time*, not *materialize the cross-product at build time*. The genuine blow-up risks — eager +composition+determinization across layers, high-branching position classes, productive deep +compounding/reduplication — are bounded by the **§10 eager/lazy partition knob + state budget**, which +auto-demotes expensive layers from precompiled (A-eager) to on-the-fly (B-lazy). **Completeness is +invariant under the knob** (composition associativity: precompiling vs applying lazily denote the same +relation), so the size dial never drops an analysis; worst case "everything lazy" is bounded memory, +slower per word, still provably complete. 
+ +### 12.6 Implementation and proof (built + stress-tested) + +Implemented: +- `FstCompletenessCertificate.Certify(language, codec)` → `FstCompletenessReport`: the closure half + (`GrammarFstClosure`) + the coverage half (every affix rule emitted by the FST, read from the codec's + covered-morpheme set), plus the compounding-rule count. `IsCertified` = closed ∧ all affixes covered + ∧ no compounding. It **names the uncovered constructs** when it fails. +- `FstTemplateAnalyzer.CoversAnalysis(WordAnalysis)`: the sound structural predicate of what the FST + provably enumerates — single root (no compounding), every morpheme covered, ≤ `DerivDepth` + derivational affixes per side, **and the canonical morph order** `[infl-prefix][deriv-prefix][root] + [deriv-suffix][infl-suffix]`. (The stress test forced each of these: depth, compounding, and order + were all discovered as required constraints by analyses that broke a weaker predicate.) +- `CompleteHybridMorpher`: the provably-complete analyzer. Certified grammar → the fast verified FST + (complete by §12.3); else → the search engine (complete; the known slow path). Completeness is by + construction, decided by the grammar-level certificate — **no per-word heuristic.** + +**Certification is the EMPIRICAL set-parity gate, not the static coverage check.** A first attempt +made `IsCertified` = closed ∧ all-affixes-covered ∧ no-compounding. A stress test exposed this as +**unsound**: `cawo = coisa + d'eles` has every morpheme covered yet the FST cannot build it (a prefix +on a pronoun root that takes no template), so a grammar could pass the static check and still silently +drop `cawo`-type words — precisely the forbidden failure. Rule/symbol coverage is **necessary, not +sufficient**; completeness is about *paths (attachments)*, not symbols present. 
So the static check is +demoted to a fast **pre-filter / gap-namer** (`PreFilterPasses`), and the real gate is +`FstCompletenessCertificate.CertifyEmpirically` — **FST analyses == search analyses (morpheme-identity +set parity) over a representative corpus** (§9.5). It is path-level, so it catches `cawo`. + +**Proof (stress test `Prove_CertificateCompleteness`, 200 hard Sena words, unlimited oracle):** +- *FST path tested directly* (non-vacuous): the FST itself produces **467/480** search analyses; the 13 it + misses route to the engine. +- *Static check shown unsound*: **1** analysis (`cawo`) is "in-class" by the static predicate yet + missed by the FST — the concrete witness that coverage ⇏ completeness. +- *Empirical gate*: Sena is **NOT certified** (5 divergent words), so `CompleteHybridMorpher` routes to + the engine; **complete-system misses = 0** — every true analysis is returned. + +**What the stress test taught (the key result).** A *predictive per-analysis* coverage predicate is +whack-a-mole (it broke on derivation depth, then morph order, then the template-less prefix `cawo`), +and even grammar-level *symbol* coverage is unsound. **Soundness rests on the empirical set-parity gate + +engine backstop**, not any static predicate: certified (set parity holds) ⇒ FST-only is +evidence-backed complete; uncertified ⇒ the engine guarantees completeness, and the gate names exactly which +words still diverge. The system is **100% complete today** (0 misses, via the engine for the 5 +divergent words), and the path to FST-only speed is to drive those divergences to 0 (build the 3 +remaining prefixes, compounding, deeper derivation, template-less prefixation) until the grammar +certifies — never at the cost of a silent miss. + +## 13. 
The two-path caching analyzer (fast + slow, the shipped front end) + +The FST fast path is **sound but not guaranteed complete** — it answers *"does this have at least one +FST-findable valid analysis?"* (a trustworthy *yes*-detector for "is this a word", never the complete +analysis set, and able to false-negative on words whose only readings use un-built constructs, e.g. a +pure compound). On its own that is not safe for a consumer that needs all readings. The shipped design +pairs it with the proven engine behind a cache: + +- **Slow path = truth, cached.** HC's search engine is complete; its result per word is stored in + `AnalysisCache`. For a fixed corpus the cache is **warmed** (in the background, in parallel) until + every word has its complete analysis — after which queries are fast *and* complete. +- **Fast path = immediate, provisional.** The verified FST answers instantly on a cache miss; its + result is flagged provisional (`FastAnalysisResult.IsComplete == false`). +- **Default is guaranteed (backwards-compatible).** `CachingMorphologicalAnalyzer.AnalyzeWord` returns + the cached complete analyses, or computes them with the engine on a miss and caches them. Existing + callers get the same analyses as before — faster once warm, never wrong. +- **Fast is opt-in.** `AnalyzeWordFast` returns the cached complete set if warm, else the provisional + FST result, and never runs the slow engine. Applications (FieldWorks) can show the fast result now + and the authoritative result once cached, querying both. +- **Persistence (fixed corpora across sessions).** `AnalysisCacheSerializer` writes/reads the cache as + text, keying morphemes by `MorphemeRegistry` (a deterministic morpheme↔key map rebuilt from the + grammar) and guarding with a **grammar-version** string — a cache built against a different grammar + is rejected, forcing a re-warm (the one way this design could otherwise serve stale, unsound + analyses). 
Confirmed non-words (empty analysis) are cached too, so they are not recomputed. + +Net: correctness equals the engine (the cache never invents or hides an analysis), the FST removes the +cold-start latency, and a warmed fixed corpus resolves every word fast and complete. The FST's +incompleteness — including the "is this a word" false-negative — is corrected the moment a word's +complete analysis lands in the cache. diff --git a/src/SIL.Machine.Morphology.HermitCrab/AnalysisCache.cs b/src/SIL.Machine.Morphology.HermitCrab/AnalysisCache.cs new file mode 100644 index 00000000..cc1cbcfd --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/AnalysisCache.cs @@ -0,0 +1,66 @@ +using System.Collections.Generic; +using System.Linq; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// A thread-safe store of complete (engine-computed) analyses, keyed by surface word + /// (HERMITCRAB_FST_PLAN.md §13). The FST fast path is sound but not guaranteed complete, so the + /// slow engine is the source of truth; its result is cached here once and reused. For a fixed + /// corpus the cache is warmed (in the background) until every word has its complete analysis, after + /// which queries are both fast and complete. Persisted across sessions via + /// (the analyses reference grammar morpheme objects, which are + /// rehydrated against the same grammar; a grammar-version guard rejects a stale cache). + /// + public sealed class AnalysisCache + { + private readonly System.Collections.Concurrent.ConcurrentDictionary> _store = + new System.Collections.Concurrent.ConcurrentDictionary>(); + + /// Number of words with a stored complete analysis. + public int Count => _store.Count; + + /// The cached words. + public IEnumerable Words => _store.Keys; + + /// True (with the complete analyses) iff this word's complete analysis is cached. 
+ public bool TryGet(string word, out IReadOnlyList analyses) + { + return _store.TryGetValue(word, out analyses); + } + + /// Store the complete analysis for a word (overwrites). + public void Set(string word, IReadOnlyList analyses) + { + _store[word] = analyses; + } + + /// Return the cached complete analysis, or compute it via <paramref name="compute"/> and cache it (concurrent misses on the same word may each run the delegate; only one result is stored). + public IReadOnlyList GetOrAdd(string word, System.Func> compute) + { + return _store.GetOrAdd(word, compute); + } + + /// Snapshot of (word, analyses) pairs — for persistence. + public IEnumerable>> Entries => _store.ToArray(); + } + + /// + /// The result of the opt-in fast query: the analyses plus whether they are the complete + /// (cached, engine-verified) set or a provisional FST result that may under-generate. A + /// consumer should only treat "no analyses" as "not a word" when is true. + /// + public readonly struct FastAnalysisResult + { + public FastAnalysisResult(IReadOnlyList analyses, bool isComplete) + { + Analyses = analyses; + IsComplete = isComplete; + } + + public IReadOnlyList Analyses { get; } + + /// True if these are the cached complete analyses; false if a provisional FST result. + public bool IsComplete { get; } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/AnalysisCacheSerializer.cs b/src/SIL.Machine.Morphology.HermitCrab/AnalysisCacheSerializer.cs new file mode 100644 index 00000000..f195bd60 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/AnalysisCacheSerializer.cs @@ -0,0 +1,116 @@ +using System.Collections.Generic; +using System.IO; +using System.Linq; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Persists an to/from text so a fixed corpus's complete analyses + /// survive sessions (HERMITCRAB_FST_PLAN.md §13). Morphemes are written as + /// keys; a grammar-version line guards against loading a cache built + /// against a different grammar (a stale cache could otherwise rehydrate wrong morphemes). 
One line + /// per word: word \t analysis | analysis | … where an analysis is + /// key,key,…:rootIndex:category (a word followed by an empty analysis list records a confirmed non-word). + /// + public static class AnalysisCacheSerializer + { + private const string Magic = "hcfstcache/1"; + + public static void Save(AnalysisCache cache, MorphemeRegistry registry, string grammarVersion, TextWriter writer) + { + writer.WriteLine(Magic + "\t" + (grammarVersion ?? string.Empty)); + foreach (KeyValuePair> entry in cache.Entries) + { + var analyses = new List(); + foreach (WordAnalysis a in entry.Value) + { + string keys = string.Join(",", a.Morphemes.Select(registry.Key)); + analyses.Add($"{keys}:{a.RootMorphemeIndex}:{a.Category}"); + } + writer.WriteLine(entry.Key + "\t" + string.Join(" | ", analyses)); + } + } + + /// + /// Load cached analyses into <paramref name="cache"/>. Returns false (loading nothing) if the header line is missing or + /// unrecognized, or if the file's grammar version does not match <paramref name="grammarVersion"/> — the caller should + /// then re-warm. Skips any analysis referencing an unknown morpheme key (defensive). NOTE(review): the word is still stored with its remaining analyses, so such an entry can be cached incomplete. + /// + public static bool Load(AnalysisCache cache, MorphemeRegistry registry, string grammarVersion, TextReader reader) + { + string header = reader.ReadLine(); + if (header == null) + { + return false; + } + string[] head = header.Split('\t'); + if (head.Length < 1 || head[0] != Magic) + { + return false; + } + string fileVersion = head.Length > 1 ? head[1] : string.Empty; + if (fileVersion != (grammarVersion ?? 
string.Empty)) + { + return false; // stale cache for a different grammar version + } + + string line; + while ((line = reader.ReadLine()) != null) + { + int tab = line.IndexOf('\t'); + if (tab < 0) + { + continue; + } + string word = line.Substring(0, tab); + string rest = line.Substring(tab + 1); + var analyses = new List(); + if (rest.Length > 0) + { + foreach (string a in rest.Split(new[] { " | " }, System.StringSplitOptions.None)) + { + WordAnalysis parsed = ParseAnalysis(a, registry); + if (parsed != null) + { + analyses.Add(parsed); + } + } + } + cache.Set(word, analyses); + } + return true; + } + + private static WordAnalysis ParseAnalysis(string s, MorphemeRegistry registry) + { + string[] parts = s.Split(':'); + if (parts.Length < 2) + { + return null; + } + var morphemes = new List(); + if (parts[0].Length > 0) + { + foreach (string k in parts[0].Split(',')) + { + if (!int.TryParse(k, out int key)) + { + return null; + } + IMorpheme morpheme = registry.Resolve(key); + if (morpheme == null) + { + return null; // unknown key — grammar mismatch; drop this analysis + } + morphemes.Add(morpheme); + } + } + if (!int.TryParse(parts[1], out int rootIndex)) + { + return null; + } + string category = parts.Length > 2 && parts[2].Length > 0 ? parts[2] : null; + return new WordAnalysis(morphemes, rootIndex, category); + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs new file mode 100644 index 00000000..7a792c9d --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs @@ -0,0 +1,134 @@ +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// The two-path analyzer (HERMITCRAB_FST_PLAN.md §13). The slow path is HC's search engine — + /// the source of truth, complete — and its result for each word is cached. 
The fast path is + /// the verified FST — sound and immediate but possibly under-generating. The default + /// returns the guaranteed complete result (cached, or + /// computed by the engine on a miss and then cached) — backwards-compatible: existing callers get + /// the same analyses, faster once warm. The opt-in returns the cached + /// complete result if warm, else the provisional FST result without running the engine. + /// + /// For a fixed corpus, fills the cache (in parallel) so every word eventually + /// resolves fast and complete. Thread-safe: the FST is shared, the engine runs from a + /// , and the cache is concurrent. + /// + public class CachingMorphologicalAnalyzer : IMorphologicalAnalyzer + { + private readonly IMorphologicalAnalyzer _fast; + private readonly MorpherPool _enginePool; + private readonly AnalysisCache _cache; + private readonly bool _grammarCertified; + + public CachingMorphologicalAnalyzer( + IMorphologicalAnalyzer fast, + MorpherPool enginePool, + AnalysisCache cache, + bool grammarCertified = false + ) + { + _fast = fast; + _enginePool = enginePool; + _cache = cache; + _grammarCertified = grammarCertified; + } + + /// + /// Wire the fast FST, an engine pool, and a (possibly preloaded) cache from a language. If a + /// is supplied, the grammar is certified when the + /// FST's analyses equal the engine's over it (set parity) AND the grammar is FST-closed — in + /// which case the FST is treated as proven-complete for every word and the engine is never run. 
+ /// + public static CachingMorphologicalAnalyzer FromLanguage( + TraceManager traceManager, + Language language, + IEnumerable certificationCorpus = null, + AnalysisCache cache = null + ) + { + var pool = new MorpherPool(() => new Morpher(new TraceManager(), language)); + var fast = new VerifiedFstAnalyzer(new FstTemplateAnalyzer(language, new Morpher(traceManager, language)), pool); + bool certified = false; + if (certificationCorpus != null) + { + bool closed = GrammarFstClosure.Analyze(language).FstClosed; + bool parity = FstVerification.Compare(new Morpher(traceManager, language), fast, certificationCorpus).IsComplete; + certified = closed && parity; + } + return new CachingMorphologicalAnalyzer(fast, pool, cache ?? new AnalysisCache(), certified); + } + + /// The underlying cache (for persistence / inspection). + public AnalysisCache Cache => _cache; + + /// True iff the grammar is certified (FST-closed + set-parity) — the FST is then the + /// complete answer for every word and the full search is never invoked. + public bool GrammarCertified => _grammarCertified; + + /// + /// Default, guaranteed-complete analysis (backwards-compatible). On a certified grammar the FST + /// alone is the complete answer (no full search). Otherwise: the cached complete analysis if + /// present, else the engine (cached). Either way the result is complete. + /// + public IEnumerable AnalyzeWord(string word) + { + return _grammarCertified ? _fast.AnalyzeWord(word) : _cache.GetOrAdd(word, EngineAnalyze); + } + + /// + /// Opt-in fast path. On a certified grammar the FST result is proven complete + /// ( = true) without any search. Otherwise: the + /// cached complete set if warm (true), else the provisional verified-FST result (false). Never + /// runs the slow engine, so it never blocks. 
+ /// + public FastAnalysisResult AnalyzeWordFast(string word) + { + if (_grammarCertified) + { + return new FastAnalysisResult(_fast.AnalyzeWord(word).ToList(), isComplete: true); + } + if (_cache.TryGet(word, out IReadOnlyList complete)) + { + return new FastAnalysisResult(complete, isComplete: true); + } + return new FastAnalysisResult(_fast.AnalyzeWord(word).ToList(), isComplete: false); + } + + /// + /// Populate the cache with the complete analysis of every corpus word (the slow path). Safe to + /// run in the background; parallelized across words by default. + /// + public void Warm(IEnumerable corpus, bool parallel = true) + { + List words = corpus.Distinct().Where(w => !_cache.TryGet(w, out _)).ToList(); + if (parallel) + { + Parallel.ForEach(words, w => _cache.GetOrAdd(w, EngineAnalyze)); + } + else + { + foreach (string w in words) + { + _cache.GetOrAdd(w, EngineAnalyze); + } + } + } + + private IReadOnlyList EngineAnalyze(string word) + { + Morpher morpher = _enginePool.Rent(); + try + { + return morpher.AnalyzeWord(word).ToList(); + } + finally + { + _enginePool.Return(morpher); + } + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs b/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs new file mode 100644 index 00000000..34bbb496 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs @@ -0,0 +1,85 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// The complete analyzer (HERMITCRAB_FST_PLAN.md §12). 
Completeness is decided by grammar-level + /// empirical certification — the FST's analyses provably equalled the search engine's over a + /// representative corpus (set parity, §9.5) — not a per-word heuristic: + /// + /// certified ⇒ the fast verified FST path is used (complete for every word; certification is + /// the evidence); + /// otherwise the proven search engine is used — the known slow path; an uncertified grammar + /// never silently under-generates. + /// + /// Per-word control: forces the FST on/off for one word, and + /// is an optional policy hook consulted by the plain + /// . Thread-safe: the verified FST is thread-safe and the + /// engine path rents a from a pool, so a corpus can be parsed in parallel. + /// + public class CompleteHybridMorpher : IMorphologicalAnalyzer + { + private readonly IMorphologicalAnalyzer _fst; + private readonly MorpherPool _enginePool; + private readonly bool _certified; + + public CompleteHybridMorpher(IMorphologicalAnalyzer verifiedFst, MorpherPool enginePool, bool certified) + { + _fst = verifiedFst; + _enginePool = enginePool; + _certified = certified; + } + + /// + /// Build and certify from a language + corpus: the FST is used only if it empirically matches + /// the engine on (set parity); otherwise the engine is + /// used. The verify and engine paths share one Morpher pool (thread-safe). 
+ /// + public static CompleteHybridMorpher FromLanguage( + TraceManager traceManager, + Language language, + IEnumerable certificationCorpus + ) + { + var pool = new MorpherPool(() => new Morpher(new TraceManager(), language)); + var proposer = new FstTemplateAnalyzer(language, new Morpher(traceManager, language)); + var verified = new VerifiedFstAnalyzer(proposer, pool); + var engine = new Morpher(traceManager, language); + bool certified = FstVerification.Compare(engine, verified, certificationCorpus).IsComplete; + return new CompleteHybridMorpher(verified, pool, certified); + } + + /// True iff the FST passed empirical set-parity for this grammar (the default fast path). + public bool Certified => _certified; + + /// Optional per-word policy: return true to use the FST for a word, false for the engine. + /// When unset, the plain uses . + public Func UseFstFor { get; set; } + + public IEnumerable AnalyzeWord(string word) + { + return AnalyzeWord(word, UseFstFor?.Invoke(word) ?? _certified); + } + + /// Analyze one word, explicitly choosing the FST fast path or the engine. + public IEnumerable AnalyzeWord(string word, bool useFst) + { + return useFst ? 
_fst.AnalyzeWord(word) : AnalyzeWithEngine(word); + } + + private IEnumerable AnalyzeWithEngine(string word) + { + Morpher morpher = _enginePool.Rent(); + try + { + return morpher.AnalyzeWord(word).ToList(); + } + finally + { + _enginePool.Return(morpher); + } + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/FstReplay.cs b/src/SIL.Machine.Morphology.HermitCrab/FstReplay.cs new file mode 100644 index 00000000..9e8f2f80 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/FstReplay.cs @@ -0,0 +1,90 @@ +using System.Collections.Generic; +using System.Linq; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Verification by restricted re-analysis (HERMITCRAB_FST_PLAN.md §11.8, Route A): confirm + /// an FST candidate by running HC's own with the rule/lexicon + /// selectors pinned to just this candidate's root and rules. That prunes HC's combinatorial + /// fan-out to the single path the FST already found — a few ms, not the full search — while reusing + /// HC's real analysis+synthesis validation end to end (no reimplemented constraints). + /// + /// A candidate is valid iff HC's restricted analysis of the surface yields it: restriction can only + /// remove paths HC would not take, never fabricate one (HC still runs full synthesis + surface + /// match), so membership in the restricted result is exactly "is a valid HC analysis". The Morpher + /// is ed so concurrent verification is thread-safe (the selectors are + /// mutable instance state). The matched HC analysis is returned (not the FST candidate) so + /// the caller emits a genuine engine — with its real category — rather + /// than the category-less proposal. + /// + internal static class FstReplay + { + /// The matched HC analysis of equal to , or null if HC does not produce it. 
+ public static WordAnalysis Confirm(MorpherPool pool, WordAnalysis candidate, string word) + { + int rootIndex = candidate.RootMorphemeIndex; + IReadOnlyList morphemes = candidate.Morphemes; + if (rootIndex < 0 || rootIndex >= morphemes.Count || !(morphemes[rootIndex] is LexEntry root)) + { + return null; + } + + var rules = new HashSet(); + for (int i = 0; i < morphemes.Count; i++) + { + if (i == rootIndex) + { + continue; + } + if (!(morphemes[i] is IHCRule rule)) + { + return null; + } + rules.Add(rule); + } + + Morpher morpher = pool.Rent(); + try + { + // Pin HC to this candidate's path: only this root, only its rules (templates and strata + // stay open — they are containers the path threads through; gating the leaf rules + root + // is what collapses the fan-out). + morpher.LexEntrySelector = e => e == root; + morpher.RuleSelector = r => r is AffixTemplate || r is Stratum || rules.Contains(r); + + var ids = new Dictionary(); + string target = Signature(candidate, ids); + foreach (WordAnalysis analysis in morpher.AnalyzeWord(word)) + { + if (Signature(analysis, ids) == target) + { + return analysis; // the genuine HC analysis (carries the real category) + } + } + return null; + } + finally + { + pool.Return(morpher); + } + } + + /// Signature by per-morpheme identity (affix Morpheme.Id is empty, so shape-only would + /// falsely match a same-shape but different-morpheme analysis); same objects on both sides. 
+ private static string Signature(WordAnalysis analysis, Dictionary ids) + { + return string.Join("+", analysis.Morphemes.Select(m => Id(m, ids))) + ":" + analysis.RootMorphemeIndex; + } + + private static int Id(IMorpheme morpheme, Dictionary ids) + { + if (!ids.TryGetValue(morpheme, out int id)) + { + id = ids.Count; + ids[morpheme] = id; + } + return id; + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs new file mode 100644 index 00000000..b3d56b52 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs @@ -0,0 +1,688 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Annotations; +using SIL.Machine.DataStructures; +using SIL.Machine.FeatureModel; +using SIL.Machine.FiniteState; +using SIL.Machine.Morphology; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// A token-accumulating FST analyzer for grammars whose affixation is organized into + /// affix templates (position classes) — the real-grammar case (HERMITCRAB_FST_PLAN.md + /// §6 Phase 2, §10). Each template becomes prefix-slot automaton → root → suffix-slot automaton; + /// a template attaches to a root only when the root passes the build-time gate: + /// + /// category — the root's syntactic features unify with the template's + /// RequiredSyntacticFeatureStruct; and + /// stratum — the root is at the template's stratum or an inner one (a template + /// cannot apply to a root introduced in a later/outer stratum). + /// + /// Gating both prevents over-generation and lets same-category roots share the template's + /// slot-automaton (states ≈ roots + Σ template automata, not roots × slot-combinations). + /// Tokens are accumulated along the DFS path (a state carries the morpheme token emitted on + /// entry). 
Prefix slots surface in reverse template order (slot 0 applies first → innermost), + /// suffix slots in template order. A budget (the §10 knob) aborts + /// before a blowup. Phonology and reduplication/infix slots are out of scope — it throws on a + /// non-prefix/suffix slot rather than silently mis-parsing. + /// + public class FstTemplateAnalyzer : IMorphologicalAnalyzer + { + private readonly Fst _fsa; + private readonly State _start; + private readonly Dictionary, uint> _tokenOnEntry = + new Dictionary, uint>(); + private readonly Dictionary, int> _stateIds = + new Dictionary, int>(); + private readonly MorphTokenCodec _codec = new MorphTokenCodec(); + private readonly CharacterDefinitionTable _table; + private readonly Func, bool> _filter; + private readonly int _maxStates; + private readonly Func _bareRootValid; + private readonly List _derivSuffixRules = + new List(); + private readonly List _derivPrefixRules = + new List(); + private int _stateCount; + + /// + /// Max stacked derivational affixes modelled per side before inflection (tunable per grammar). + /// 2 (e.g. REV+NZR) is the speed/coverage sweet spot for Sena: depth 3 (PAS+APPLIC+NZR) gains a + /// word or two but roughly doubles verify cost (more over-gen proposals to reject). Deeper + /// stacks than this are left to the search backstop rather than inflating every verification. + /// + private readonly int _derivDepth; + + public MorphTokenCodec Codec => _codec; + + /// Number of FST states built (the precomputed size — to watch for state blow-up). + public int StateCount => _stateCount; + + /// Build without obligatoriness: every root may stand bare (fine for toy grammars). 
+        public FstTemplateAnalyzer(Language language, int maxStates = 1_000_000, int derivDepth = 2)
+            : this(language, _ => true, maxStates, derivDepth) { }
+
+        /// <summary>
+        /// Build with obligatory-inflection enforcement: a root may stand bare only if synthesizing
+        /// it bare actually yields its surface (HC's own finality/validity check). This removes the
+        /// "bare-root" over-generation in always-inflected grammars (e.g. Bantu), where a root that
+        /// must take a class/agreement affix should not surface alone.
+        /// </summary>
+        public FstTemplateAnalyzer(Language language, Morpher morpher, int maxStates = 1_000_000, int derivDepth = 2)
+            : this(language, root => BareRootValid(morpher, root), maxStates, derivDepth) { }
+
+        private FstTemplateAnalyzer(Language language, Func<RootAllomorph, bool> bareRootValid, int maxStates, int derivDepth)
+        {
+            _bareRootValid = bareRootValid;
+            _maxStates = maxStates;
+            _derivDepth = derivDepth;
+            _table = language.SurfaceStratum.CharacterDefinitionTable;
+            _filter = ann => ann.Type() == HCFeatureSystem.Segment;
+            _fsa = new Fst<Shape, ShapeNode> { Filter = _filter, UseUnification = true };
+            _start = NewState();
+            _fsa.StartState = _start;
+
+            // Collect every root with the stratum index it is introduced at.
+            var roots = new List<RootRef>();
+            for (int si = 0; si < language.Strata.Count; si++)
+            {
+                foreach (LexEntry entry in language.Strata[si].Entries)
+                {
+                    foreach (RootAllomorph allomorph in entry.Allomorphs)
+                    {
+                        roots.Add(new RootRef(allomorph, entry.SyntacticFeatureStruct, si));
+                    }
+                }
+            }
+
+            // Standalone derivational affix rules (REC/APPLIC/REV/NZR/NEU/PAS/...), distinct from
+            // inflectional template slots and from compounding. Suffixal ones become an optional,
+            // bounded layer between the root and the inflectional suffix slots (§11.2).
+            foreach (Stratum stratum in language.Strata)
+            {
+                foreach (IMorphologicalRule mrule in stratum.MorphologicalRules)
+                {
+                    if (!(mrule is MorphemicMorphologicalRule rule))
+                    {
+                        continue;
+                    }
+                    switch (RuleOp(rule))
+                    {
+                        case MorphOp.Suffix:
+                            _derivSuffixRules.Add(rule);
+                            break;
+                        case MorphOp.Prefix:
+                            _derivPrefixRules.Add(rule);
+                            break;
+                    }
+                }
+            }
+
+            // Bare-root paths — only for roots the grammar allows to stand uninflected.
+            foreach (RootRef root in roots)
+            {
+                if (_bareRootValid(root.Allomorph))
+                {
+                    State<Shape, ShapeNode> end = BuildRootChain(_start, root.Allomorph);
+                    end.IsAccepting = true;
+                }
+            }
+
+            // Template-less derivational stems: optional derivational prefixes + root + optional
+            // derivational suffixes, with NO inflectional template — for roots that derive/associate
+            // without inflecting (e.g. a pronoun taking an associative prefix: coisa + d'eles).
+            // Shared prefix/suffix derivation layers (built once) keep this additive. Verify-discard
+            // removes any over-generation, including a bare stem that should not stand alone.
+            if (_derivPrefixRules.Count > 0 || _derivSuffixRules.Count > 0)
+            {
+                State<Shape, ShapeNode> tlPrefixEntry = NewState();
+                State<Shape, ShapeNode> tlRootStart = BuildDerivationPrefixLayer(tlPrefixEntry);
+                _start.Arcs.Add(tlPrefixEntry); // epsilon: enter the template-less path
+                State<Shape, ShapeNode> tlSuffixEntry = NewState();
+                State<Shape, ShapeNode> tlSuffixExit = BuildDerivationSuffixLayer(tlSuffixEntry);
+                tlSuffixExit.IsAccepting = true;
+                foreach (RootRef root in roots)
+                {
+                    State<Shape, ShapeNode> end = BuildRootChain(tlRootStart, root.Allomorph);
+                    end.Arcs.Add(tlSuffixEntry); // epsilon: root → shared derivational suffixes → accept
+                }
+            }
+
+            // Each template: prefix automaton → (gated roots) → suffix automaton.
+            for (int ti = 0; ti < language.Strata.Count; ti++)
+            {
+                foreach (AffixTemplate template in language.Strata[ti].AffixTemplates)
+                {
+                    var prefixSlots = new List<AffixTemplateSlot>();
+                    var suffixSlots = new List<AffixTemplateSlot>();
+                    ClassifyTemplate(template, prefixSlots, suffixSlots);
+
+                    State<Shape, ShapeNode> prefixEntry = NewState();
+                    State<Shape, ShapeNode> prefixExit = AppendSlots(
+                        prefixEntry,
+                        prefixSlots,
+                        MorphOp.Prefix,
+                        template.RequiredSyntacticFeatureStruct
+                    );
+                    // Shared derivational-prefix layer between the inflectional prefixes and the root
+                    // (surface order: class-prefix → derivational-prefix → root, e.g.
+                    // 10 + nominalizador + [ser]). Roots start after it.
+                    State<Shape, ShapeNode> rootStart = BuildDerivationPrefixLayer(prefixExit);
+                    State<Shape, ShapeNode> suffixEntry = NewState();
+                    State<Shape, ShapeNode> suffixExit = AppendSlots(
+                        suffixEntry,
+                        suffixSlots,
+                        MorphOp.Suffix,
+                        template.RequiredSyntacticFeatureStruct
+                    );
+                    suffixExit.IsAccepting = true;
+
+                    // One derivation layer per template, shared by all its roots (tokens accumulate
+                    // on the walk path, so sharing avoids a roots×derivations blowup): root →
+                    // derivation suffixes → inflectional suffix slots.
+                    State<Shape, ShapeNode> derivEntry = NewState();
+                    State<Shape, ShapeNode> derivExit = BuildDerivationSuffixLayer(derivEntry);
+                    derivExit.Arcs.Add(suffixEntry); // epsilon: derivation → inflectional suffixes
+
+                    _start.Arcs.Add(prefixEntry); // epsilon: enter this template
+
+                    foreach (RootRef root in roots)
+                    {
+                        // Attach the root to this template if its category matches directly, OR if a
+                        // derivational suffix in the layer changes the root's category to the
+                        // template's (e.g. a nominalizer feeding a noun-class template: vencer[verb] +
+                        // NZR → noun, then class-10 prefix). The category-changing suffix is in the
+                        // shared derivation layer; verify-discard removes any resulting over-gen (§11.4).
+                        if (
+                            root.StratumIndex <= ti
+                            && (
+                                CategoryMatches(root.Category, template.RequiredSyntacticFeatureStruct)
+                                || DerivableToCategory(root.Category, template.RequiredSyntacticFeatureStruct)
+                            )
+                        )
+                        {
+                            State<Shape, ShapeNode> end = BuildRootChain(rootStart, root.Allomorph);
+                            end.Arcs.Add(derivEntry); // epsilon: root → derivation → suffix slots
+                        }
+                    }
+                }
+            }
+        }
+
+        public IEnumerable<WordAnalysis> AnalyzeWord(string word)
+        {
+            Shape shape;
+            try
+            {
+                shape = _table.Segment(word);
+            }
+            catch (InvalidShapeException)
+            {
+                // A word with a phoneme outside this table cannot be a surface form here.
+                return Enumerable.Empty<WordAnalysis>();
+            }
+
+            var segments = new List<FeatureStruct>();
+            for (
+                ShapeNode node = shape.GetFirst(n => _filter(n.Annotation));
+                node != shape.End;
+                node = node.GetNext(n => _filter(n.Annotation))
+            )
+            {
+                segments.Add(node.Annotation.FeatureStruct);
+            }
+
+            // NFA simulation: a set of (state, accumulated tokens) configurations advanced one
+            // segment at a time, deduped by (state, tokens) so shared states are not re-explored
+            // (a naive recursive DFS is exponential on a real grammar's nondeterminism).
+            List<Config> current = EpsilonClosure(new List<Config> { Enter(_start, Array.Empty<uint>()) });
+            foreach (FeatureStruct segment in segments)
+            {
+                var next = new List<Config>();
+                var seen = new HashSet<string>();
+                foreach (Config config in current)
+                {
+                    for (int a = 0; a < config.State.Arcs.Count; a++)
+                    {
+                        Arc<Shape, ShapeNode> arc = config.State.Arcs[a];
+                        if (!arc.Input.IsEpsilon && arc.Input.FeatureStruct.IsUnifiable(segment))
+                        {
+                            Config nc = Enter(arc.Target, config.Tokens);
+                            if (seen.Add(Key(nc)))
+                            {
+                                next.Add(nc);
+                            }
+                        }
+                    }
+                }
+                current = EpsilonClosure(next);
+                if (current.Count == 0)
+                {
+                    break;
+                }
+            }
+
+            var results = new List<WordAnalysis>();
+            var emitted = new HashSet<string>();
+            foreach (Config config in current)
+            {
+                if (config.State.IsAccepting && emitted.Add(string.Join(",", config.Tokens)))
+                {
+                    results.Add(ToWordAnalysis(config.Tokens));
+                }
+            }
+            return results;
+        }
+
+        private List<Config> EpsilonClosure(List<Config> configs)
+        {
+            var result = new List<Config>();
+            var seen = new HashSet<string>();
+            var stack = new Stack<Config>();
+            foreach (Config config in configs)
+            {
+                if (seen.Add(Key(config)))
+                {
+                    stack.Push(config);
+                    result.Add(config);
+                }
+            }
+            while (stack.Count > 0)
+            {
+                Config config = stack.Pop();
+                for (int a = 0; a < config.State.Arcs.Count; a++)
+                {
+                    Arc<Shape, ShapeNode> arc = config.State.Arcs[a];
+                    if (arc.Input.IsEpsilon)
+                    {
+                        Config nc = Enter(arc.Target, config.Tokens);
+                        if (seen.Add(Key(nc)))
+                        {
+                            stack.Push(nc);
+                            result.Add(nc);
+                        }
+                    }
+                }
+            }
+            return result;
+        }
+
+        private Config Enter(State<Shape, ShapeNode> state, uint[] tokens)
+        {
+            return _tokenOnEntry.TryGetValue(state, out uint token)
+                ? new Config(state, Append(tokens, token))
+                : new Config(state, tokens);
+        }
+
+        private string Key(Config config)
+        {
+            return _stateIds[config.State] + ":" + string.Join(",", config.Tokens);
+        }
+
+        private WordAnalysis ToWordAnalysis(uint[] tokens)
+        {
+            var morphemes = new List<IMorpheme>(tokens.Length);
+            foreach (uint token in tokens)
+            {
+                morphemes.Add(_codec.GetMorpheme(MorphToken.GetMorphemeId(token)));
+            }
+            return new WordAnalysis(morphemes, MorphToken.RootIndex(tokens), null);
+        }
+
+        /// <summary>Split a template's slots into prefix and suffix; prefixes are reversed to surface order.</summary>
+        private static void ClassifyTemplate(
+            AffixTemplate template,
+            List<AffixTemplateSlot> prefixSlots,
+            List<AffixTemplateSlot> suffixSlots
+        )
+        {
+            foreach (AffixTemplateSlot slot in template.Slots)
+            {
+                switch (SlotOp(slot))
+                {
+                    case MorphOp.Prefix:
+                        prefixSlots.Add(slot);
+                        break;
+                    case MorphOp.Suffix:
+                        suffixSlots.Add(slot);
+                        break;
+                    default:
+                        throw new NotSupportedException(
+                            $"FstTemplateAnalyzer handles prefix/suffix template slots only; slot '{slot.Name}' is neither."
+                        );
+                }
+            }
+            prefixSlots.Reverse(); // slot 0 applies first (innermost) → rightmost prefix on the surface
+        }
+
+        private static MorphOp SlotOp(AffixTemplateSlot slot)
+        {
+            foreach (MorphemicMorphologicalRule rule in slot.Rules)
+            {
+                return RuleOp(rule);
+            }
+            return MorphOp.None;
+        }
+
+        /// <summary>The surface role (prefix/suffix/…) of a morphological rule, from its first allomorph.</summary>
+        private static MorphOp RuleOp(MorphemicMorphologicalRule rule)
+        {
+            foreach (AffixProcessAllomorph allomorph in Allomorphs(rule))
+            {
+                return MorphTokenCodec.ClassifyOp(allomorph, false);
+            }
+            return MorphOp.None;
+        }
+
+        /// <summary>
+        /// An optional, bounded chain of derivational suffixes (the stratum's standalone affix
+        /// rules), shared by every root of a template. Permissive by design: a category-illegal
+        /// derivation (e.g. a nominalizer feeding a verbal suffix) is proposed here and removed by
+        /// re-synthesis verification (<see cref="VerifiedFstAnalyzer"/>), per the plan §11.2.
+        /// </summary>
+        private State<Shape, ShapeNode> BuildDerivationSuffixLayer(State<Shape, ShapeNode> entry)
+        {
+            return BuildDerivationLayer(entry, _derivSuffixRules, MorphOp.Suffix);
+        }
+
+        /// <summary>
+        /// An optional, bounded chain of derivational prefixes (the stratum's standalone prefix affix
+        /// rules) between the inflectional prefixes and the root — mirror of the suffix layer (§12.4).
+        /// </summary>
+        private State<Shape, ShapeNode> BuildDerivationPrefixLayer(State<Shape, ShapeNode> entry)
+        {
+            return BuildDerivationLayer(entry, _derivPrefixRules, MorphOp.Prefix);
+        }
+
+        /// <summary>Shared builder for an optional, bounded derivational-affix layer of the given op.</summary>
+        private State<Shape, ShapeNode> BuildDerivationLayer(
+            State<Shape, ShapeNode> entry,
+            List<MorphemicMorphologicalRule> rules,
+            MorphOp op
+        )
+        {
+            State<Shape, ShapeNode> current = entry;
+            for (int k = 0; k < _derivDepth; k++)
+            {
+                State<Shape, ShapeNode> after = NewState();
+                current.Arcs.Add(after); // epsilon: apply no derivation at this level
+                foreach (MorphemicMorphologicalRule rule in rules)
+                {
+                    foreach (AffixProcessAllomorph allomorph in Allomorphs(rule))
+                    {
+                        if (MorphTokenCodec.ClassifyOp(allomorph, false) != op)
+                        {
+                            continue;
+                        }
+                        uint token = MorphToken.Encode(op, _codec.GetOrAddIndex(allomorph.Morpheme));
+                        State<Shape, ShapeNode> tokenState = NewState();
+                        _tokenOnEntry[tokenState] = token;
+                        current.Arcs.Add(tokenState); // epsilon: enter this derivational affix
+                        State<Shape, ShapeNode> s = tokenState;
+                        // Only the first InsertSegments action of the allomorph contributes surface segments.
+                        InsertSegments insert = allomorph.Rhs.OfType<InsertSegments>().FirstOrDefault();
+                        if (insert != null)
+                        {
+                            foreach (FeatureStruct fs in GetSegments(insert.Segments.Shape))
+                            {
+                                s = AddArc(s, fs);
+                            }
+                        }
+                        s.Arcs.Add(after); // epsilon: reconverge
+                    }
+                }
+                current = after;
+            }
+            return current;
+        }
+
+        // (Stranded note) Allomorphs of a slot rule — both AffixProcessRule and its realizational sibling — come from Allomorphs(rule).
+        /// <summary>
+        /// True iff this root may surface uninflected — i.e. synthesizing it with no affixes yields
+        /// its own surface form. If the grammar makes a bare stem non-final (obligatory inflection),
+        /// synthesis returns nothing and the bare reading is correctly suppressed.
+        /// </summary>
+        private static bool BareRootValid(Morpher morpher, RootAllomorph root)
+        {
+            if (!(root.Morpheme is LexEntry entry))
+            {
+                return true;
+            }
+            string surface = root.Segments.Representation.Normalize(System.Text.NormalizationForm.FormD);
+            return morpher
+                .GenerateWords(entry, Enumerable.Empty<Morpheme>(), new FeatureStruct())
+                .Any(g => g.Normalize(System.Text.NormalizationForm.FormD) == surface);
+        }
+
+        private static FeatureStruct RequiredCategory(MorphemicMorphologicalRule rule)
+        {
+            switch (rule)
+            {
+                case AffixProcessRule affix:
+                    return affix.RequiredSyntacticFeatureStruct;
+                case RealizationalAffixProcessRule realizational:
+                    return realizational.RequiredSyntacticFeatureStruct;
+                default:
+                    return null;
+            }
+        }
+
+        /// <summary>The category a derivational rule outputs (its OutSyntacticFeatureStruct).</summary>
+        private static FeatureStruct OutCategory(MorphemicMorphologicalRule rule)
+        {
+            return rule is AffixProcessRule affix ? affix.OutSyntacticFeatureStruct : null;
+        }
+
+        /// <summary>
+        /// True iff <paramref name="rootCategory"/> can be transformed into <paramref name="templateCategory"/>
+        /// by a chain of ≤ the derivation-depth bound derivational suffixes (a category-changing
+        /// derivation, e.g. verb → noun via a nominalizer). Lets a template attach over a derived stem
+        /// of its output category; the category-changing suffix rides the shared derivation layer.
+        /// </summary>
+        private bool DerivableToCategory(FeatureStruct rootCategory, FeatureStruct templateCategory)
+        {
+            if (rootCategory == null || templateCategory == null || templateCategory.IsEmpty)
+            {
+                return false;
+            }
+            var frontier = new List<FeatureStruct> { rootCategory };
+            for (int depth = 0; depth < _derivDepth && frontier.Count > 0; depth++)
+            {
+                var next = new List<FeatureStruct>();
+                foreach (FeatureStruct cat in frontier)
+                {
+                    foreach (MorphemicMorphologicalRule rule in _derivSuffixRules.Concat(_derivPrefixRules))
+                    {
+                        FeatureStruct outCat = OutCategory(rule);
+                        if (outCat == null || outCat.IsEmpty)
+                        {
+                            continue; // not a category-changing derivation
+                        }
+                        FeatureStruct inCat = RequiredCategory(rule);
+                        if (inCat != null && !inCat.IsEmpty && !cat.IsUnifiable(inCat))
+                        {
+                            continue; // rule does not apply to this stem category
+                        }
+                        if (outCat.IsUnifiable(templateCategory))
+                        {
+                            return true;
+                        }
+                        next.Add(outCat);
+                    }
+                }
+                frontier = next;
+            }
+            return false;
+        }
+
+        private static IEnumerable<AffixProcessAllomorph> Allomorphs(MorphemicMorphologicalRule rule)
+        {
+            switch (rule)
+            {
+                case AffixProcessRule affix:
+                    return affix.Allomorphs;
+                case RealizationalAffixProcessRule realizational:
+                    return realizational.Allomorphs;
+                default:
+                    return Enumerable.Empty<AffixProcessAllomorph>();
+            }
+        }
+
+        /// <summary>Build the slot sequence from <paramref name="start"/>; returns the state after the last slot.</summary>
+        private State<Shape, ShapeNode> AppendSlots(
+            State<Shape, ShapeNode> start,
+            List<AffixTemplateSlot> slots,
+            MorphOp op,
+            FeatureStruct templateCategory
+        )
+        {
+            State<Shape, ShapeNode> current = start;
+            foreach (AffixTemplateSlot slot in slots)
+            {
+                State<Shape, ShapeNode> after = NewState();
+                if (slot.Optional)
+                {
+                    current.Arcs.Add(after); // epsilon: skip this slot
+                }
+                foreach (MorphemicMorphologicalRule rule in slot.Rules)
+                {
+                    // Build-time category gate (faithful for inflectional templates, where the
+                    // category is ~constant): a rule whose RequiredSyntacticFeatureStruct cannot
+                    // unify with the template's category can never apply here, so omit it. This is
+                    // HC's Required.Unify(stem) check, hoisted to compile time — no walk-order issue.
+                    FeatureStruct required = RequiredCategory(rule);
+                    if (
+                        templateCategory != null
+                        && required != null
+                        && !required.IsEmpty
+                        && !templateCategory.IsUnifiable(required)
+                    )
+                    {
+                        continue;
+                    }
+                    foreach (AffixProcessAllomorph allomorph in Allomorphs(rule))
+                    {
+                        if (MorphTokenCodec.ClassifyOp(allomorph, false) != op)
+                        {
+                            throw new NotSupportedException(
+                                $"FstTemplateAnalyzer: a rule in a {op} slot is not a {op}."
+                            );
+                        }
+                        uint affixToken = MorphToken.Encode(op, _codec.GetOrAddIndex(allomorph.Morpheme));
+                        // Enter the affix through a token-bearing state, so the morpheme is emitted
+                        // even for a zero/empty-segment affix (its token would otherwise be lost).
+                        State<Shape, ShapeNode> tokenState = NewState();
+                        _tokenOnEntry[tokenState] = affixToken;
+                        current.Arcs.Add(tokenState); // epsilon: enter this affix
+                        State<Shape, ShapeNode> s = tokenState;
+                        InsertSegments insert = allomorph.Rhs.OfType<InsertSegments>().FirstOrDefault();
+                        if (insert != null)
+                        {
+                            foreach (FeatureStruct fs in GetSegments(insert.Segments.Shape))
+                            {
+                                s = AddArc(s, fs);
+                            }
+                        }
+                        s.Arcs.Add(after); // epsilon: reconverge after the slot
+                    }
+                }
+                current = after;
+            }
+            return current;
+        }
+
+        private State<Shape, ShapeNode> BuildRootChain(State<Shape, ShapeNode> from, RootAllomorph root)
+        {
+            State<Shape, ShapeNode> state = from;
+            foreach (FeatureStruct fs in GetSegments(root.Segments.Shape))
+            {
+                state = AddArc(state, fs);
+            }
+            _tokenOnEntry[state] = MorphToken.Encode(MorphOp.Root, _codec.GetOrAddIndex(root.Morpheme)); // NOTE(review): state == from if the root has no segments — confirm roots are never empty
+            return state;
+        }
+
+        private static bool CategoryMatches(FeatureStruct rootCategory, FeatureStruct required)
+        {
+            if (required == null || required.IsEmpty)
+            {
+                return true;
+            }
+            return rootCategory != null && rootCategory.IsUnifiable(required);
+        }
+
+        private IReadOnlyList<FeatureStruct> GetSegments(Shape shape)
+        {
+            var segments = new List<FeatureStruct>();
+            for (
+                ShapeNode node = shape.GetFirst(n => _filter(n.Annotation));
+                node != shape.End;
+                node = node.GetNext(n => _filter(n.Annotation))
+            )
+            {
+                FeatureStruct fs = node.Annotation.FeatureStruct.Clone();
+                fs.Freeze();
+                segments.Add(fs);
+            }
+            return segments;
+        }
+
+        private State<Shape, ShapeNode> AddArc(State<Shape, ShapeNode> state, FeatureStruct condition)
+        {
+            State<Shape, ShapeNode> next = NewState();
+            state.Arcs.Add(condition, next);
+            return next;
+        }
+
+        private State<Shape, ShapeNode> NewState()
+        {
+            _stateCount++;
+            if (_stateCount > _maxStates)
+            {
+                throw new NotSupportedException(
+                    $"FstTemplateAnalyzer exceeded the state budget ({_maxStates}); this grammar needs the "
+                        + "lazy / on-the-fly partition (HERMITCRAB_FST_PLAN.md §10) rather than an eager build."
+                );
+            }
+            State<Shape, ShapeNode> state = _fsa.CreateState();
+            _stateIds[state] = _stateCount;
+            return state;
+        }
+
+        private static uint[] Append(uint[] tokens, uint token)
+        {
+            var result = new uint[tokens.Length + 1];
+            tokens.CopyTo(result, 0);
+            result[tokens.Length] = token;
+            return result;
+        }
+
+        private readonly struct Config
+        {
+            public Config(State<Shape, ShapeNode> state, uint[] tokens)
+            {
+                State = state;
+                Tokens = tokens;
+            }
+
+            public State<Shape, ShapeNode> State { get; }
+            public uint[] Tokens { get; }
+        }
+
+        private readonly struct RootRef
+        {
+            public RootRef(RootAllomorph allomorph, FeatureStruct category, int stratumIndex)
+            {
+                Allomorph = allomorph;
+                Category = category;
+                StratumIndex = stratumIndex;
+            }
+
+            public RootAllomorph Allomorph { get; }
+            public FeatureStruct Category { get; }
+            public int StratumIndex { get; }
+        }
+    }
+}
diff --git a/src/SIL.Machine.Morphology.HermitCrab/FstVerification.cs b/src/SIL.Machine.Morphology.HermitCrab/FstVerification.cs
new file mode 100644
index 00000000..70f4b49d
--- /dev/null
+++ b/src/SIL.Machine.Morphology.HermitCrab/FstVerification.cs
@@ -0,0 +1,139 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using SIL.Machine.Morphology;
+
+namespace SIL.Machine.Morphology.HermitCrab
+{
+    /// <summary>
+    /// One word on which a candidate analyzer's analysis set differs from the reference's:
+    /// <see cref="MissingFromCandidate"/> are analyses the reference
+    /// found but the candidate did not
+    /// (completeness failures), <see cref="ExtraInCandidate"/> are analyses the candidate produced
+    /// that the reference rejects (soundness / over-generation failures).
+    /// </summary>
+    public sealed class AnalysisDivergence
+    {
+        public AnalysisDivergence(
+            string word,
+            IReadOnlyList<string> missingFromCandidate,
+            IReadOnlyList<string> extraInCandidate
+        )
+        {
+            Word = word;
+            MissingFromCandidate = missingFromCandidate;
+            ExtraInCandidate = extraInCandidate;
+        }
+
+        public string Word { get; }
+        public IReadOnlyList<string> MissingFromCandidate { get; }
+        public IReadOnlyList<string> ExtraInCandidate { get; }
+    }
+
+    /// <summary>The result of an FST-vs-search corpus comparison (the closure/parity gate).</summary>
+    public sealed class AnalysisComparison
+    {
+        public AnalysisComparison(int wordsChecked, IReadOnlyList<AnalysisDivergence> divergences)
+        {
+            WordsChecked = wordsChecked;
+            Divergences = divergences;
+        }
+
+        public int WordsChecked { get; }
+        public IReadOnlyList<AnalysisDivergence> Divergences { get; }
+
+        /// <summary>Words whose analysis sets matched exactly.</summary>
+        public int Matches => WordsChecked - Divergences.Count;
+
+        /// <summary>
+        /// True iff the candidate's analysis SET equals the reference's for every word — no missing
+        /// and no spurious analyses. This is the gate (HERMITCRAB_FST_PLAN.md §9.5) that must pass
+        /// before the candidate FST analyzer may REPLACE the search engine for a grammar.
+        /// </summary>
+        public bool IsComplete => Divergences.Count == 0;
+
+        /// <summary>A readable dump.</summary>
+        public string Format()
+        {
+            var sb = new StringBuilder();
+            sb.AppendLine(
+                $"checked {WordsChecked}, {Matches} match, {Divergences.Count} diverge — "
+                    + (IsComplete ? "COMPLETE (set parity)" : "DIVERGENCES")
+            );
+            foreach (AnalysisDivergence d in Divergences)
+            {
+                sb.AppendLine(
+                    $"  {d.Word}: missing=[{string.Join(" | ", d.MissingFromCandidate)}] "
+                        + $"extra=[{string.Join(" | ", d.ExtraInCandidate)}]"
+                );
+            }
+            return sb.ToString();
+        }
+    }
+
+    /// <summary>
+    /// Shadow / verification mode (HERMITCRAB_FST_PLAN.md §9.5, §10.4): run a candidate analyzer
+    /// (e.g.
+    /// <see cref="FstTemplateAnalyzer"/>) beside the sound+complete reference (<see cref="Morpher"/>)
+    /// over a corpus and report, per word, where their analysis SETS differ. It measures both
+    /// directions of correctness at once — missing analyses (completeness) and extra analyses
+    /// (soundness) — so a clean run is the empirical certificate that the FST is closed for this
+    /// grammar. The FST may replace the search engine only once this is clean over a representative
+    /// corpus; until then it runs in shadow mode.
+    /// </summary>
+    public static class FstVerification
+    {
+        public static AnalysisComparison Compare(
+            IMorphologicalAnalyzer reference,
+            IMorphologicalAnalyzer candidate,
+            IEnumerable<string> words
+        )
+        {
+            // Identity key per distinct morpheme object: affix Morpheme.Id is empty in many grammars,
+            // so a name/id-string signature would collapse different affixes of the same shape (e.g. the
+            // subject markers 3P+2 / 3S+1 / 6) into one key and hide same-shape under-generation. Both
+            // analyzers reference the SAME Morpheme instances from the Language, so object identity is a
+            // faithful, shared discriminator.
+            var ids = new Dictionary<IMorpheme, int>();
+            string Sig(WordAnalysis a) => Signature(a, ids);
+
+            var divergences = new List<AnalysisDivergence>();
+            int count = 0;
+            foreach (string word in words)
+            {
+                count++;
+                var referenceSet = new HashSet<string>(reference.AnalyzeWord(word).Select(Sig));
+                var candidateSet = new HashSet<string>(candidate.AnalyzeWord(word).Select(Sig));
+                if (referenceSet.SetEquals(candidateSet))
+                {
+                    continue;
+                }
+                List<string> missing = referenceSet
+                    .Except(candidateSet)
+                    .OrderBy(s => s, StringComparer.Ordinal)
+                    .ToList();
+                List<string> extra = candidateSet
+                    .Except(referenceSet)
+                    .OrderBy(s => s, StringComparer.Ordinal)
+                    .ToList();
+                divergences.Add(new AnalysisDivergence(word, missing, extra));
+            }
+            return new AnalysisComparison(count, divergences);
+        }
+
+        /// <summary>A signature of one analysis: per-morpheme identity ids (in morph order) + root index.</summary>
+        private static string Signature(WordAnalysis analysis, Dictionary<IMorpheme, int> ids)
+        {
+            return string.Join("+", analysis.Morphemes.Select(m => Id(m, ids))) + ":" + analysis.RootMorphemeIndex;
+        }
+
+        private static int Id(IMorpheme morpheme, Dictionary<IMorpheme, int> ids)
+        {
+            if (!ids.TryGetValue(morpheme, out int id))
+            {
+                id = ids.Count;
+                ids[morpheme] = id;
+            }
+            return id;
+        }
+    }
+}
diff --git a/src/SIL.Machine.Morphology.HermitCrab/GrammarFstAdvisor.cs b/src/SIL.Machine.Morphology.HermitCrab/GrammarFstAdvisor.cs
new file mode 100644
index 00000000..7e1a6991
--- /dev/null
+++ b/src/SIL.Machine.Morphology.HermitCrab/GrammarFstAdvisor.cs
@@ -0,0 +1,584 @@
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using SIL.Machine.Annotations;
+using SIL.Machine.DataStructures;
+using SIL.Machine.Matching;
+using SIL.Machine.Morphology.HermitCrab.MorphologicalRules;
+using SIL.Machine.Morphology.HermitCrab.PhonologicalRules;
+
+namespace SIL.Machine.Morphology.HermitCrab
+{
+    /// <summary>
+    /// How costly a flagged rule is for parsing.
+    /// </summary>
+    public enum GrammarAdvisorySeverity
+    {
+        /// <summary>Finite-state-able; informational only.</summary>
+        Info,
+
+        /// <summary>Stays finite-state but inflates the combinatorial search fan-out.</summary>
+        Cost,
+
+        /// <summary>Breaks finite-state compilation — forces the slow combinatorial search.</summary>
+        Escape,
+    }
+
+    /// <summary>
+    /// One advisory about a single grammar rule: what makes it expensive, and how to keep
+    /// (or get) it back on the fast finite-state path.
+    /// </summary>
+    public sealed class GrammarAdvisory
+    {
+        public GrammarAdvisory(
+            string rule,
+            string stratum,
+            string kind,
+            GrammarAdvisorySeverity severity,
+            string issue,
+            string advice,
+            bool? probeable = null,
+            bool? regular = null
+        )
+        {
+            Rule = rule;
+            Stratum = stratum;
+            Kind = kind;
+            Severity = severity;
+            Issue = issue;
+            Advice = advice;
+            Probeable = probeable;
+            Regular = regular;
+        }
+
+        /// <summary>Name of the offending rule.</summary>
+        public string Rule { get; }
+
+        /// <summary>Name of the stratum the rule lives in (rules can appear in more than one).</summary>
+        public string Stratum { get; }
+
+        /// <summary>Rule kind (affix / phonological / compounding).</summary>
+        public string Kind { get; }
+
+        public GrammarAdvisorySeverity Severity { get; }
+
+        /// <summary>One sentence: what is expensive and why.</summary>
+        public string Issue { get; }
+
+        /// <summary>"Constrain it like this" and/or "try this instead".</summary>
+        public string Advice { get; }
+
+        /// <summary>
+        /// For an <see cref="GrammarAdvisorySeverity.Escape"/>: whether a per-word un-application
+        /// probe (strip the affix / de-reduplicate, then re-parse the residue with the FST) is
+        /// sound for this rule. True = "clean": no phonological rule at or after its
+        /// stratum can rewrite the affixed span, so the affix surfaces literally and stripping it
+        /// recovers the stem exactly — the slow path collapses to a cheap local guess+verify.
+        /// False = "opaque": a later rule may alter the span, so literal stripping can miss an
+        /// analysis and the search backstop is required. Null = not an insertion escape / N/A.
+        /// </summary>
+        public bool? Probeable { get; }
+
+        /// <summary>
+        /// For an <see cref="GrammarAdvisorySeverity.Escape"/>: whether the construct denotes a
+        /// regular relation (an FST exists for it in principle). True = regular — it could
+        /// be reclaimed onto the fast path once the FST compiler exists (state-encode a spreading
+        /// feature, bounded-fold a finite copy, …); by Kaplan &amp; Kay (1994) every standard
+        /// rewrite rule is regular regardless of how long its environment is. False = genuinely
+        /// non-regular (unbounded copy) or unconfirmable. Null = N/A.
+        ///
+        /// IMPORTANT: this is a reclaim path, NOT a cost downgrade. A Regular
+        /// escape is still Escape severity because it is slow in today's engine —
+        /// the FST compiler that would make it fast is not built yet. Severity tells the truth
+        /// about today; Regular tells you whether the slowness is fixable by compilation.
+        /// </summary>
+        public bool? Regular { get; }
+    }
+
+    /// <summary>
+    /// The result of <see cref="GrammarFstAdvisor.Analyze"/>: the per-rule advisories
+    /// plus an overall tier verdict.
+    /// </summary>
+    public sealed class GrammarFstReport
+    {
+        public GrammarFstReport(
+            IReadOnlyList<GrammarAdvisory> advisories,
+            int affixRulesExamined,
+            int phonologicalRulesExamined,
+            int compoundingRulesExamined
+        )
+        {
+            Advisories = advisories;
+            AffixRulesExamined = affixRulesExamined;
+            PhonologicalRulesExamined = phonologicalRulesExamined;
+            CompoundingRulesExamined = compoundingRulesExamined;
+            EscapeCount = advisories.Count(a => a.Severity == GrammarAdvisorySeverity.Escape);
+            CostCount = advisories.Count(a => a.Severity == GrammarAdvisorySeverity.Cost);
+            InfoCount = advisories.Count(a => a.Severity == GrammarAdvisorySeverity.Info);
+            ProbeableEscapeCount = advisories.Count(a =>
+                a.Severity == GrammarAdvisorySeverity.Escape && a.Probeable == true
+            );
+            OpaqueEscapeCount = advisories.Count(a =>
+                a.Severity == GrammarAdvisorySeverity.Escape && a.Probeable == false
+            );
+            RegularEscapeCount = advisories.Count(a =>
+                a.Severity == GrammarAdvisorySeverity.Escape && a.Regular == true
+            );
+            NonRegularEscapeCount = advisories.Count(a =>
+                a.Severity == GrammarAdvisorySeverity.Escape && a.Regular != true
+            );
+        }
+
+        public IReadOnlyList<GrammarAdvisory> Advisories { get; }
+
+        /// <summary>Affix-process rules inspected (those without an advisory are clean/FST-able).</summary>
+        public int AffixRulesExamined { get; }
+
+        /// <summary>Phonological rules (rewrite + metathesis) inspected.</summary>
+        public int PhonologicalRulesExamined { get; }
+
+        /// <summary>Compounding rules inspected.</summary>
+        public int CompoundingRulesExamined { get; }
+
+        /// <summary>Number of rules that break finite-state compilation.</summary>
+        public int EscapeCount { get; }
+
+        /// <summary>Number of rules that inflate the search but stay finite-state.</summary>
+        public int CostCount { get; }
+
+        public int InfoCount { get; }
+
+        /// <summary>Escapes for which the per-word un-application probe is sound (clean).</summary>
+        public int ProbeableEscapeCount { get; }
+
+        /// <summary>Escapes that may interact with a later rule, so the search backstop is needed.</summary>
+        public int OpaqueEscapeCount { get; }
+
+        /// <summary>
+        /// Escapes that are regular (an FST could reclaim them once the compiler exists). They are
+        /// still slow in today's engine — this is a reclaim path, not a cost downgrade.
+        /// </summary>
+        public int RegularEscapeCount { get; }
+
+        /// <summary>Escapes that are genuinely non-regular or unconfirmable (no FST in principle).</summary>
+        public int NonRegularEscapeCount { get; }
+
+        /// <summary>
+        /// Static tier candidate. The static report cannot compute the corpus-weighted fallback
+        /// rate, so for a few escapes it reports the candidate; the FST pipeline's corpus
+        /// pass confirms whether Tier 2 is worth it vs. Tier 3.
+        /// </summary>
+        public string Tier =>
+            EscapeCount == 0
+                ? "Tier 1 candidate — fully FST-able"
+                : ProbeableEscapeCount == EscapeCount
+                    ? "Tier 2⁺ candidate — every escape is probe-able (surface-invariant): a per-word "
+                        + "un-application probe WOULD recover the fast path once the probe runtime exists; "
+                        + "all escapes are slow in today's engine"
+                    : EscapeCount <= 3
+                        ? "Tier 2 candidate — hybrid (opaque/non-probe-able escapes fall back to search); confirm with corpus fallback rate"
+                        : "Tier 3 — pervasive escapes, search engine only";
+
+        /// <summary>The rules that break FST compilation (the warnings that flip the tier).</summary>
+        public IEnumerable<GrammarAdvisory> Escapes =>
+            Advisories.Where(a => a.Severity == GrammarAdvisorySeverity.Escape);
+
+        /// <summary>A readable dump of the report.</summary>
+        public string Format()
+        {
+            var sb = new StringBuilder();
+            sb.AppendLine(Tier);
+            sb.AppendLine(
+                $"  examined {AffixRulesExamined} affix, {PhonologicalRulesExamined} phonological, "
+                    + $"{CompoundingRulesExamined} compounding rule(s)"
+            );
+            sb.AppendLine(
+                $"  {EscapeCount} escape(s) ({ProbeableEscapeCount} probe-able, {OpaqueEscapeCount} opaque), "
+                    + $"{CostCount} cost(s), {InfoCount} info — {Advisories.Count} rule advisories"
+            );
+            if (EscapeCount > 0)
+            {
+                sb.AppendLine(
+                    $"  reclaim path: {RegularEscapeCount} of {EscapeCount} escape(s) are FST-reclaimable "
+                        + "(regular) once the FST compiler exists; ALL "
+                        + $"{EscapeCount} are slow in today's engine. {NonRegularEscapeCount} are genuinely "
+                        + "non-regular (per-word probe or search only)."
+                );
+            }
+            foreach (
+                GrammarAdvisory a in Advisories
+                    .OrderByDescending(a => a.Severity)
+                    .ThenBy(a => a.Rule, System.StringComparer.Ordinal)
+            )
+            {
+                string probe =
+                    a.Probeable == true ? " [probe-able]"
+                    : a.Probeable == false ? " [opaque]"
+                    : "";
+                string regular =
+                    a.Regular == true ? " [regular: FST-reclaimable, slow today]"
+                    : a.Regular == false ? " [non-regular]"
+                    : "";
+                sb.AppendLine();
+                sb.AppendLine($"[{a.Severity}]{probe}{regular} {a.Rule} ({a.Kind}, stratum '{a.Stratum}')");
+                sb.AppendLine($"    issue : {a.Issue}");
+                if (!string.IsNullOrEmpty(a.Advice)) // null-safe: the constructor accepts a null advice
+                    sb.AppendLine($"    advice: {a.Advice}");
+            }
+            return sb.ToString();
+        }
+    }
+
+    /// <summary>
+    /// Static grammar linter for the FST acceleration work (see fst.md / HERMITCRAB_FST_PLAN.md).
+    /// It walks a compiled <see cref="Language"/> and flags, per rule, what makes parsing expensive
+    /// or blocks finite-state compilation, with an actionable write-up (why it's costly, how to
+    /// constrain it, what to try instead) and an overall tier verdict.
+    ///
+    /// This is pure static analysis of the object model — no parsing, no corpus needed — so it can
+    /// run at grammar-authoring time or in CI: a new <see cref="GrammarAdvisorySeverity.Escape"/>
+    /// that flips the tier is the "one new rule blew up the grammar" warning.
+    /// </summary>
+    public static class GrammarFstAdvisor
+    {
+        /// <summary>
+        /// Analyze every rule in <paramref name="language"/>.
+        /// </summary>
+        /// <param name="language">A compiled grammar.</param>
+        /// <param name="manyAllomorphsThreshold">
+        /// Above this allomorph count a rule earns a <see cref="GrammarAdvisorySeverity.Cost"/> note.
+        /// </param>
+        public static GrammarFstReport Analyze(Language language, int manyAllomorphsThreshold = 8)
+        {
+            var advisories = new List<GrammarAdvisory>();
+            int affixExamined = 0;
+            int phonExamined = 0;
+            int compoundExamined = 0;
+
+            // For the clean/opaque (probe-ability) test: an insertion escape in stratum i is sound
+            // to un-apply by stripping iff no phonological rule at stratum i or later could rewrite
+            // the affixed span. Precompute the count of phonological rules at or after each stratum.
+            IList<Stratum> strata = language.Strata;
+            var phonAtOrAfter = new int[strata.Count + 1];
+            for (int i = strata.Count - 1; i >= 0; i--)
+                phonAtOrAfter[i] = phonAtOrAfter[i + 1] + strata[i].PhonologicalRules.Count;
+
+            for (int s = 0; s < strata.Count; s++)
+            {
+                Stratum stratum = strata[s];
+                bool surfaceInvariant = phonAtOrAfter[s] == 0;
+                foreach (IMorphologicalRule mrule in stratum.MorphologicalRules)
+                {
+                    switch (mrule)
+                    {
+                        case AffixProcessRule affix:
+                            affixExamined++;
+                            AnalyzeAffix(affix, stratum.Name, surfaceInvariant, advisories, manyAllomorphsThreshold);
+                            break;
+                        case CompoundingRule compound:
+                            compoundExamined++;
+                            advisories.Add(
+                                new GrammarAdvisory(
+                                    compound.Name,
+                                    stratum.Name,
+                                    "compounding",
+                                    GrammarAdvisorySeverity.Info,
+                                    "Compounding rule; bounded by MaxStemCount, so it stays finite-state.",
+                                    "Keep MaxStemCount as low as the language needs; unbounded compounding is not finite-state."
+                                )
+                            );
+                            break;
+                    }
+                }
+
+                foreach (IPhonologicalRule prule in stratum.PhonologicalRules)
+                {
+                    phonExamined++;
+                    AnalyzePhonological(prule, stratum.Name, advisories);
+                }
+            }
+            return new GrammarFstReport(advisories, affixExamined, phonExamined, compoundExamined);
+        }
+
+        private static void AnalyzeAffix(
+            AffixProcessRule rule,
+            string stratum,
+            bool surfaceInvariant,
+            List<GrammarAdvisory> advisories,
+            int manyAllomorphsThreshold
+        )
+        {
+            // An insertion escape is "probe-able" (a per-word strip-and-reparse un-application is
+            // sound) only when nothing downstream can rewrite the affixed span — i.e. no
+            // phonological rule applies at or after this rule's stratum.
+            string probeNote = surfaceInvariant
+                ? " This escape is PROBE-ABLE: no phonological rule applies after it, so the affix "
+                    + "surfaces literally — a per-word probe that strips the candidate affix and re-parses "
+                    + "the residue with the FST recovers the analysis without the search engine."
+                : " This escape is OPAQUE: a phonological rule applies after it and may rewrite the "
+                    + "affixed span, so a literal strip-and-reparse probe can miss an analysis; the search "
+                    + "backstop is required.";
+
+            foreach (AffixProcessAllomorph allomorph in rule.Allomorphs)
+            {
+                // Reduplication: the same input part is copied two or more times. Copying an
+                // unbounded span is not regular, so the rule is not finite-state.
+                IGrouping<string, CopyFromInput> duplicated = allomorph
+                    .Rhs.OfType<CopyFromInput>()
+                    .GroupBy(c => c.PartName)
+                    .FirstOrDefault(g => g.Count() >= 2);
+                if (duplicated != null)
+                {
+                    // Boundedness of the copied part decides regularity: a fixed-size reduplicant
+                    // (CV/CVC) is a finite copy → regular (reclaimable by bounded fold); copying an
+                    // unbounded part (the whole stem) is the one genuinely non-regular operation
+                    // ({ww} is not regular). Unresolved part → treat as non-regular (warn).
+                    bool bounded = IsPartBounded(allomorph, duplicated.Key);
+                    string regularNote = bounded
+                        ? " REGULAR (bounded reduplicant = finite copy): an FST could reclaim it by "
+                            + "bounded-folding the copy — once the FST compiler exists. It is still slow in "
+                            + "today's engine."
+                        : " GENUINELY NON-REGULAR (unbounded copy — {ww} is not a regular relation): no FST "
+                            + "exists for it; only the per-word strip-and-reparse probe (when surface-invariant) "
+                            + "or the search engine. Slow today.";
+                    advisories.Add(
+                        new GrammarAdvisory(
+                            rule.Name,
+                            stratum,
+                            "affix",
+                            GrammarAdvisorySeverity.Escape,
+                            $"Reduplication: part '{duplicated.Key}' is copied {duplicated.Count()}×, so the "
+                                + "parser falls back to the slow combinatorial search for any word this rule "
+                                + "could apply to.",
+                            "If the reduplicant is a fixed size (e.g. one CV syllable), bound the copied part's "
+                                + "length so it becomes finite-state. If only a handful of forms reduplicate, list "
+                                + "them as lexical entries instead. Otherwise this rule keeps the whole grammar in "
+                                + "the hybrid/search tier."
+                                + probeNote
+                                + regularNote,
+                            surfaceInvariant,
+                            bounded
+                        )
+                    );
+                }
+                else if (HasInfixedCopy(allomorph.Rhs))
+                {
+                    // Infixation: a non-copy action (inserted material) sits BETWEEN two copies of
+                    // the stem (copy…insert…copy), so the stem is split at an internal position.
+                    // Contiguous copies with inserts only at the ends (copy/copy/insert,
+                    // insert/copy/copy, insert/copy/copy/insert) are ordinary prefix / suffix /
+                    // circumfix over a split stem — finite-state, NOT flagged.
+                    advisories.Add(
+                        new GrammarAdvisory(
+                            rule.Name,
+                            stratum,
+                            "affix",
+                            GrammarAdvisorySeverity.Escape,
+                            "Infixation: material is inserted between two copies of the stem, splitting it at "
+                                + "an internal position.",
+                            "If the infix position is fixed (a known slot), encode it as a bounded split so it "
+                                + "stays finite-state. A variable, content-determined split blocks FST compilation."
+                                + probeNote
+                                + " REGULAR (the split is described by a regular pattern): an FST could reclaim it "
+                                + "by bounded-folding the split, or the per-word probe handles it — once those exist. "
+                                + "It is still slow in today's engine.",
+                            surfaceInvariant,
+                            regular: true
+                        )
+                    );
+                }
+
+                if (allomorph.Rhs.OfType<ModifyFromInput>().Any())
+                {
+                    advisories.Add(
+                        new GrammarAdvisory(
+                            rule.Name,
+                            stratum,
+                            "affix",
+                            GrammarAdvisorySeverity.Info,
+                            "Process modification (ModifyFromInput) rewrites stem segments; finite-state only if "
+                                + "the change is local and bounded.",
+                            "A feature change in a fixed context is fine; a non-local or agreement-driven change "
+                                + "blocks FST — consider a bounded reformulation."
+                        )
+                    );
+                }
+            }
+
+            if (rule.Allomorphs.Count > manyAllomorphsThreshold)
+            {
+                advisories.Add(
+                    new GrammarAdvisory(
+                        rule.Name,
+                        stratum,
+                        "affix",
+                        GrammarAdvisorySeverity.Cost,
+                        $"{rule.Allomorphs.Count} allomorphs; each one multiplies the un-application branching "
+                            + "during analysis.",
+                        "Consolidate allomorphs via environment conditioning where the language allows it."
+                    )
+                );
+            }
+        }
+
+        /// <summary>
+        /// True when a non-copy action (inserted material) appears strictly between the first and
+        /// last <see cref="CopyFromInput"/> in the supplied right-hand side — i.e. copy…insert…copy, the
+        /// signature of infixation. Contiguous copies (inserts only at the ends) return false.
+ /// + private static bool HasInfixedCopy(IList rhs) + { + int first = -1; + int last = -1; + for (int i = 0; i < rhs.Count; i++) + { + if (rhs[i] is CopyFromInput) + { + if (first < 0) + first = i; + last = i; + } + } + if (first < 0 || last == first) + return false; + for (int i = first + 1; i < last; i++) + { + if (!(rhs[i] is CopyFromInput)) + return true; + } + return false; + } + + private static void AnalyzePhonological( + IPhonologicalRule prule, + string stratum, + List advisories + ) + { + switch (prule) + { + case RewriteRule rewrite: + AnalyzeRewrite(rewrite, stratum, advisories); + break; + case MetathesisRule metathesis: + advisories.Add( + new GrammarAdvisory( + metathesis.Name, + stratum, + "phonological", + GrammarAdvisorySeverity.Info, + "Metathesis (segment reordering); finite-state over a bounded span.", + "Keep the reordered span bounded; unbounded metathesis blocks FST." + ) + ); + break; + } + } + + private static void AnalyzeRewrite(RewriteRule rule, string stratum, List advisories) + { + bool unboundedEnvironment = rule.Subrules.Any(sr => + HasUnboundedQuantifier(sr.LeftEnvironment) || HasUnboundedQuantifier(sr.RightEnvironment) + ); + + if (unboundedEnvironment) + { + // Kaplan & Kay (1994): a context-sensitive rewrite rule with regular φ/ψ/λ/ρ, + // applied directionally, denotes a REGULAR relation no matter how long the + // environment is — so an unbounded environment does not make the rule non-regular. + // It is regular iff the rule's own Lhs/Rhs are bounded (only the environment is + // unbounded); if the Lhs/Rhs are themselves unbounded we cannot confirm it. 
+ bool rewriteBounded = + !HasUnboundedQuantifier(rule.Lhs) && rule.Subrules.All(sr => !HasUnboundedQuantifier(sr.Rhs)); + advisories.Add( + new GrammarAdvisory( + rule.Name, + stratum, + "phonological", + GrammarAdvisorySeverity.Escape, + "Unbounded rule environment: the left/right context matches an arbitrary-length span, so " + + "today's engine un-applies it at many positions — slow, and the composed automaton " + + "gains states.", + "Replace the '+'/'*' context with the fixed window the rule actually needs (usually 1–2 " + + "segments)." + + ( + rewriteBounded + ? " REGULAR (Kaplan & Kay 1994: a directional rewrite rule is a regular " + + "relation however long its environment): the long-distance dependency " + + "(e.g. vowel harmony / spreading) can be state-encoded into the FST — once " + + "the compiler exists. It is still slow in today's engine." + : " The rule's own LHS/RHS is unbounded, so regularity cannot be confirmed — " + + "treat as non-regular." + ), + regular: rewriteBounded + ) + ); + } + else + { + advisories.Add( + new GrammarAdvisory( + rule.Name, + stratum, + "phonological", + GrammarAdvisorySeverity.Info, + "Rewrite rule with a bounded environment: finite-state. It adds states to the composed " + + "transducer.", + "Keep the environment as tight as the language requires." + ) + ); + } + + // Deletion: the LHS is longer than every subrule's RHS. During analysis the parser must + // guess where the deleted segments were and re-insert them (× DeletionReapplications), + // which multiplies the search. 
+ int lhsSegments = CountConstraints(rule.Lhs); + if (lhsSegments > 0 && rule.Subrules.All(sr => CountConstraints(sr.Rhs) < lhsSegments)) + { + advisories.Add( + new GrammarAdvisory( + rule.Name, + stratum, + "phonological", + GrammarAdvisorySeverity.Cost, + "Deletion rule (LHS longer than RHS): during analysis the parser guesses where the " + + "deleted segments were and re-inserts them (× DeletionReapplications), multiplying " + + "the search.", + "Keep DeletionReapplications as low as the language needs; a bounded deletion context is " + + "still finite-state." + ) + ); + } + } + + /// + /// Whether the copied part named is length-bounded — i.e. its + /// defining pattern has no unbounded quantifier. + /// Bounded ⇒ a finite copy ⇒ regular. Unresolved part ⇒ false (conservative: warn). + /// + private static bool IsPartBounded(AffixProcessAllomorph allomorph, string partName) + { + Pattern part = allomorph.Lhs.FirstOrDefault(p => p.Name == partName); + if (part == null) + return false; + return !HasUnboundedQuantifier(part); + } + + private static bool HasUnboundedQuantifier(Pattern pattern) + { + if (pattern == null) + return false; + return pattern + .GetNodesDepthFirst() + .OfType>() + .Any(q => q.MaxOccur == Quantifier.Infinite); + } + + private static int CountConstraints(Pattern pattern) + { + if (pattern == null) + return 0; + return pattern.GetNodesDepthFirst().OfType>().Count(); + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/GrammarFstClosure.cs b/src/SIL.Machine.Morphology.HermitCrab/GrammarFstClosure.cs new file mode 100644 index 00000000..8802949e --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/GrammarFstClosure.cs @@ -0,0 +1,142 @@ +using System.Collections.Generic; +using System.Linq; +using System.Text; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// The closure verdict for one non-regular escape: whether any FST-able rule could apply before + /// 
it and thus feed it. A escape cannot be reached by an FST-able + /// derivation, so the FST's "no path" is a proof for words showing no escape signature. + /// + public sealed class EscapeClosure + { + public EscapeClosure(string rule, string stratum, bool closed, string reason) + { + Rule = rule; + Stratum = stratum; + Closed = closed; + Reason = reason; + } + + public string Rule { get; } + public string Stratum { get; } + public bool Closed { get; } + public string Reason { get; } + } + + /// The result of the static feeding-closure pass. + public sealed class ClosureReport + { + public ClosureReport(IReadOnlyList escapes) + { + Escapes = escapes; + } + + public IReadOnlyList Escapes { get; } + + /// + /// True iff every non-regular escape is closed (vacuously true when there are none). When + /// true, an FST built over the FST-able fragment is closed: its "no path" is a proof, not a + /// guess — subject to the per-word surface check and the corpus parity gate (§9.5). + /// + public bool FstClosed => Escapes.All(e => e.Closed); + + public string Format() + { + var sb = new StringBuilder(); + sb.AppendLine( + FstClosed + ? "FST-CLOSED — no escape can be fed by an FST-able step; FST silence is a proof" + : "NOT closed — some escapes may be fed; those words need the search backstop" + ); + foreach (EscapeClosure e in Escapes) + { + sb.AppendLine($" [{(e.Closed ? "closed" : "fed")}] {e.Rule} (stratum '{e.Stratum}'): {e.Reason}"); + } + return sb.ToString(); + } + } + + /// + /// Static feeding-closure pre-filter (HERMITCRAB_FST_PLAN.md §9.1b / §9.5). For each non-regular + /// escape (reduplication / infixation) it decides — by stratal precedence — whether any + /// FST-able rule could apply before it and so create its trigger (Kiparsky feeding). An escape + /// that nothing FST-able precedes is CLOSED: no FST-able derivation can feed it, so the FST + /// (which excludes it) is complete for any word that shows no escape signature, and its silence + /// is a proof. 
+ /// + /// This is the conservative, SOUND pre-filter: it never falsely reports "closed" (any FST-able + /// rule at or before the escape's stratum — which, under unordered application, could precede + /// it — is treated as a potential feeder). The precise refinement that reclaims the + /// over-flagged cases is the regular-emptiness test range(F) ∩ trigger(E) = ∅ via + /// Fst.Intersect; the empirical backstop is the corpus set-parity gate (FstVerification). + /// + public static class GrammarFstClosure + { + public static ClosureReport Analyze(Language language) + { + IList strata = language.Strata; + int count = strata.Count; + + // FST-able "feeders" per stratum: every morphological rule that is NOT a non-regular + // escape (concatenative affixes, compounding) plus every phonological rule. + var feedersPerStratum = new int[count]; + var escapes = new List>(); + for (int i = 0; i < count; i++) + { + Stratum stratum = strata[i]; + foreach (IMorphologicalRule mrule in stratum.MorphologicalRules) + { + if (IsEscape(mrule)) + { + escapes.Add(new KeyValuePair(i, ((AffixProcessRule)mrule).Name)); + } + else + { + feedersPerStratum[i]++; + } + } + feedersPerStratum[i] += stratum.PhonologicalRules.Count; + } + + var results = new List(); + foreach (KeyValuePair escape in escapes) + { + int index = escape.Key; + int feedersBefore = 0; + for (int j = 0; j < index; j++) + { + feedersBefore += feedersPerStratum[j]; + } + // Same-stratum feeders could precede the escape under unordered application, so they + // count too (conservative). + bool closed = feedersBefore == 0 && feedersPerStratum[index] == 0; + string reason = closed + ? 
$"no FST-able rule applies at or before stratum '{strata[index].Name}' — nothing can feed it" + : $"FST-able rule(s) at or before stratum '{strata[index].Name}' could feed it " + + "(stratal pre-filter; refine with range∩trigger)"; + results.Add(new EscapeClosure(escape.Value, strata[index].Name, closed, reason)); + } + return new ClosureReport(results); + } + + private static bool IsEscape(IMorphologicalRule mrule) + { + if (!(mrule is AffixProcessRule affix)) + { + return false; + } + foreach (AffixProcessAllomorph allomorph in affix.Allomorphs) + { + MorphOp op = MorphTokenCodec.ClassifyOp(allomorph, false); + if (op == MorphOp.Reduplication || op == MorphOp.Infix) + { + return true; + } + } + return false; + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphToken.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphToken.cs new file mode 100644 index 00000000..867a5be5 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphToken.cs @@ -0,0 +1,111 @@ +using System; +using System.Collections.Generic; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// The role/operation of a morpheme in a derivation — the high 8-bit field of a packed + /// . It is the "ordered operation connected to the letters": it lets a + /// consumer rebuild the gloss/bracketing of an analysis without re-running any rule. + /// + public enum MorphOp : byte + { + /// Unset / not a morpheme boundary. + None = 0, + + /// The root (stem) morpheme. + Root = 1, + + /// A prefix. + Prefix = 2, + + /// A suffix. + Suffix = 3, + + /// An infix (inserted inside the stem). + Infix = 4, + + /// Reduplication. + Reduplication = 5, + + /// The prefixal half of a circumfix. + CircumfixPrefix = 6, + + /// The suffixal half of a circumfix. + CircumfixSuffix = 7, + + /// A compounding element (a non-head stem). + Compound = 8, + + /// A clitic. + Clitic = 9, + + /// A process / simulfix (a ModifyFromInput-style change, no added segments). + Process = 10, + + /// A zero (null) morph. 
+ Null = 11, + } + + /// + /// A 32-bit packed analysis token: high 8 bits = , low 24 bits = a + /// morpheme index into the grammar's compiled morpheme table. The analyzer transducer emits + /// one token per morpheme, in application order; the resulting uint[] IS the structured + /// analysis. It is self-describing — the morpheme order is the array order, and the root + /// position is the index of the token, so no separate + /// RootMorphemeIndex field is needed. See HERMITCRAB_FST_PLAN.md §8. + /// + public static class MorphToken + { + /// Number of low bits reserved for the morpheme index. + public const int MorphemeIdBits = 24; + + /// Largest encodable morpheme index (16,777,215). + public const int MaxMorphemeId = (1 << MorphemeIdBits) - 1; + + private const uint MorphemeIdMask = (1u << MorphemeIdBits) - 1; + + /// Pack a (role, morpheme index) pair into one 32-bit token. + /// + /// does not fit in bits. + /// + public static uint Encode(MorphOp op, int morphemeId) + { + if (morphemeId < 0 || morphemeId > MaxMorphemeId) + { + throw new ArgumentOutOfRangeException( + nameof(morphemeId), + morphemeId, + $"morpheme index must be in [0, {MaxMorphemeId}] to fit in {MorphemeIdBits} bits" + ); + } + return ((uint)op << MorphemeIdBits) | (uint)morphemeId; + } + + /// The morpheme's role/operation. + public static MorphOp GetOp(uint token) => (MorphOp)(token >> MorphemeIdBits); + + /// The morpheme index into the grammar's compiled morpheme table. + public static int GetMorphemeId(uint token) => (int)(token & MorphemeIdMask); + + /// + /// Index of the token in a derivation array, or -1 if none. + /// This recovers WordAnalysis.RootMorphemeIndex from the token array itself. 
+ /// + public static int RootIndex(IReadOnlyList tokens) + { + if (tokens == null) + { + return -1; + } + for (int i = 0; i < tokens.Count; i++) + { + if (GetOp(tokens[i]) == MorphOp.Root) + { + return i; + } + } + return -1; + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphTokenCodec.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphTokenCodec.cs new file mode 100644 index 00000000..58104cb5 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphTokenCodec.cs @@ -0,0 +1,131 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Converts a parsed into the packed 32-bit morpheme-token array + /// (HERMITCRAB_FST_PLAN.md §8) and assigns each morpheme a stable 24-bit index. This is the + /// reference encoder the FST compiler will emit as arc outputs; it also proves the schema + /// faithfully reproduces a real HC analysis — encoding a and decoding it + /// yields the same morphemes (and root) that WordAnalysis carries, with the operation + /// of each morpheme recovered from the rule that introduced it. + /// + public class MorphTokenCodec + { + private readonly Dictionary _indexByMorpheme = new Dictionary(); + private readonly List _morphemesByIndex = new List(); + + /// Number of distinct morphemes that have been assigned an index. + public int MorphemeCount => _morphemesByIndex.Count; + + /// The morpheme assigned a given 24-bit index. + public Morpheme GetMorpheme(int index) => _morphemesByIndex[index]; + + /// + /// Encode a parsed word as its derivation token array: one per + /// morpheme in application order, the head root tagged . Mirrors + /// the morpheme order and root choice that Morpher.CreateWordAnalysis produces. 
+ /// + public uint[] Encode(Word word) + { + var tokens = new List(); + foreach (Allomorph allo in word.AllomorphsInMorphOrder) + { + MorphOp op = ClassifyOp(allo, allo == word.RootAllomorph); + tokens.Add(MorphToken.Encode(op, GetOrAddIndex(allo.Morpheme))); + } + return tokens.ToArray(); + } + + /// Assign (or look up) the stable 24-bit index for a morpheme. + public int GetOrAddIndex(Morpheme morpheme) + { + if (!_indexByMorpheme.TryGetValue(morpheme, out int index)) + { + index = _morphemesByIndex.Count; + _indexByMorpheme[morpheme] = index; + _morphemesByIndex.Add(morpheme); + } + return index; + } + + /// + /// Determine the role/operation of an applied allomorph: the head root is + /// ; any other root (a compound stem) is + /// ; an affix is classified from its output actions. + /// + public static MorphOp ClassifyOp(Allomorph allomorph, bool isHeadRoot) + { + if (isHeadRoot) + { + return MorphOp.Root; + } + if (allomorph is RootAllomorph) + { + return MorphOp.Compound; + } + if (allomorph is AffixProcessAllomorph affix) + { + return ClassifyAffix(affix.Rhs); + } + return MorphOp.None; + } + + private static MorphOp ClassifyAffix(IList rhs) + { + // Reduplication: the same input part is copied two or more times. + bool reduplication = rhs.OfType().GroupBy(c => c.PartName).Any(g => g.Count() >= 2); + if (reduplication) + { + return MorphOp.Reduplication; + } + + int firstCopy = -1; + int lastCopy = -1; + for (int i = 0; i < rhs.Count; i++) + { + if (rhs[i] is CopyFromInput) + { + if (firstCopy < 0) + { + firstCopy = i; + } + lastCopy = i; + } + } + + if (firstCopy < 0) + { + // No copy of the stem: a pure insertion, or a process (ModifyFromInput) change. + return rhs.OfType().Any() ? MorphOp.Process : MorphOp.None; + } + + // Inserted material BETWEEN two copies of the stem = infixation. 
+ for (int i = firstCopy + 1; i < lastCopy; i++) + { + if (!(rhs[i] is CopyFromInput)) + { + return MorphOp.Infix; + } + } + + bool leadingInsert = firstCopy > 0; + bool trailingInsert = lastCopy < rhs.Count - 1; + if (leadingInsert && trailingInsert) + { + return MorphOp.CircumfixPrefix; + } + if (leadingInsert) + { + return MorphOp.Prefix; + } + if (trailingInsert) + { + return MorphOp.Suffix; + } + return MorphOp.None; + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorphemeRegistry.cs b/src/SIL.Machine.Morphology.HermitCrab/MorphemeRegistry.cs new file mode 100644 index 00000000..0e052da3 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/MorphemeRegistry.cs @@ -0,0 +1,69 @@ +using System.Collections.Generic; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// A stable, deterministic morpheme ↔ integer-key map for a grammar, used to persist + /// across sessions (HERMITCRAB_FST_PLAN.md §13). A cached + /// references grammar morpheme objects, which cannot be + /// serialized directly; instead each morpheme is keyed by its position in a deterministic + /// enumeration of the grammar (roots then affix rules, per stratum). Rebuilding the registry from + /// the same grammar yields the same keys, so cached keys rehydrate to the right morphemes — and a + /// grammar-version guard on the file rejects a cache built against a different grammar. 
+ /// + public sealed class MorphemeRegistry + { + private readonly Dictionary _toKey = new Dictionary(); + private readonly List _byKey = new List(); + + public MorphemeRegistry(Language language) + { + foreach (Stratum stratum in language.Strata) + { + foreach (LexEntry entry in stratum.Entries) + { + Add(entry); + } + foreach (IMorphologicalRule mrule in stratum.MorphologicalRules) + { + if (mrule is MorphemicMorphologicalRule rule) + { + Add(rule); + } + } + foreach (AffixTemplate template in stratum.AffixTemplates) + { + foreach (AffixTemplateSlot slot in template.Slots) + { + foreach (MorphemicMorphologicalRule rule in slot.Rules) + { + Add(rule); + } + } + } + } + } + + private void Add(IMorpheme morpheme) + { + if (!_toKey.ContainsKey(morpheme)) + { + _toKey[morpheme] = _byKey.Count; + _byKey.Add(morpheme); + } + } + + /// The key for a morpheme; -1 if it is not a registered grammar morpheme. + public int Key(IMorpheme morpheme) + { + return _toKey.TryGetValue(morpheme, out int key) ? key : -1; + } + + /// The morpheme for a key, or null if out of range (a cache from a different grammar). + public IMorpheme Resolve(int key) + { + return key >= 0 && key < _byKey.Count ? _byKey[key] : null; + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/MorpherPool.cs b/src/SIL.Machine.Morphology.HermitCrab/MorpherPool.cs new file mode 100644 index 00000000..e04bc605 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/MorpherPool.cs @@ -0,0 +1,42 @@ +using System; +using System.Collections.Concurrent; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// A thread-safe pool of instances for the FST verify / engine-backstop + /// paths. Verification pins the engine's / + /// per candidate, which is mutable instance state — so a single + /// shared cannot be used from multiple threads (the selectors would race). 
+ /// Each parse instead s its own Morpher and s it (selectors + /// reset) when done; the Morpher's own internal parallelism is safe because the rented instance has + /// a single owner for the duration of the call. Morphers are built once (compiling the grammar is + /// expensive) and reused across words. + /// + public sealed class MorpherPool + { + private readonly Func _factory; + private readonly ConcurrentBag _available = new ConcurrentBag(); + + /// Creates a fresh (each must be independent — its + /// own — so pooled instances never share mutable state). + public MorpherPool(Func factory) + { + _factory = factory; + } + + /// Borrow a Morpher with default (unrestricted) selectors. Always pair with . + public Morpher Rent() + { + return _available.TryTake(out Morpher morpher) ? morpher : _factory(); + } + + /// Reset the selectors and return the Morpher to the pool for reuse. + public void Return(Morpher morpher) + { + morpher.LexEntrySelector = _ => true; + morpher.RuleSelector = _ => true; + _available.Add(morpher); + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/VerifiedFstAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/VerifiedFstAnalyzer.cs new file mode 100644 index 00000000..99d4a1b0 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/VerifiedFstAnalyzer.cs @@ -0,0 +1,50 @@ +using System.Collections.Generic; +using SIL.Machine.Morphology; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// The FST proposes candidates fast; each is confirmed by restricted re-analysis + /// () — HC's own pinned to the candidate's + /// root and rules — and a candidate HC does not confirm is discarded (not a fallback). The + /// confirmed, genuine HC analysis is emitted. Because verification runs HC's real analysis + + /// synthesis, this enforces every constraint (category, MPR, co-occurrence, obligatoriness) without + /// reimplementing any of them. 
+ /// + /// Sound by construction (a kept analysis is a real HC analysis) and lossless (a valid candidate is + /// never false-rejected). It does not add analyses the proposer never produced, so under-generation + /// (coverage) must be closed in the proposer. Thread-safe: the immutable proposer is shared + /// and each verification rents a from the pool, so many words can be analyzed + /// in parallel. + /// + public class VerifiedFstAnalyzer : IMorphologicalAnalyzer + { + private readonly IMorphologicalAnalyzer _proposer; + private readonly MorpherPool _pool; + + public VerifiedFstAnalyzer(IMorphologicalAnalyzer proposer, MorpherPool pool) + { + _proposer = proposer; + _pool = pool; + } + + /// Build the proposer and a verify Morpher pool from a language. + public VerifiedFstAnalyzer(TraceManager traceManager, Language language) + : this( + new FstTemplateAnalyzer(language, new Morpher(traceManager, language)), + new MorpherPool(() => new Morpher(new TraceManager(), language)) + ) { } + + public IEnumerable AnalyzeWord(string word) + { + foreach (WordAnalysis candidate in _proposer.AnalyzeWord(word)) + { + WordAnalysis confirmed = FstReplay.Confirm(_pool, candidate, word); + if (confirmed != null) + { + yield return confirmed; + } + } + } + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/CachingMorphologicalAnalyzerTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/CachingMorphologicalAnalyzerTests.cs new file mode 100644 index 00000000..f6dee728 --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/CachingMorphologicalAnalyzerTests.cs @@ -0,0 +1,106 @@ +using NUnit.Framework; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// CI coverage for the two-path caching analyzer (HERMITCRAB_FST_PLAN.md §13): the default path is +/// guaranteed-complete (engine, cached); the fast path is provisional until warmed; warming fills the +/// cache; and the cache persists across sessions with a grammar-version guard. 
+/// +public class CachingMorphologicalAnalyzerTests : HermitCrabTestBase +{ + private static string Sig(WordAnalysis a) => + string.Join("+", a.Morphemes.Select(m => (m as Morpheme)?.Gloss ?? "?")) + ":" + a.RootMorphemeIndex; + + private static HashSet SigSet(IEnumerable analyses) => new(analyses.Select(Sig)); + + [Test] + public void Default_IsGuaranteedComplete_AndMatchesEngine() + { + var engine = new Morpher(TraceManager, Language); + var caching = CachingMorphologicalAnalyzer.FromLanguage(TraceManager, Language); + foreach (string word in new[] { "sag", "dat", "sagg" }) + { + Assert.That(SigSet(caching.AnalyzeWord(word)).SetEquals(SigSet(engine.AnalyzeWord(word))), Is.True, word); + } + // the second call is served from the cache (same result) + Assert.That(SigSet(caching.AnalyzeWord("dat")).SetEquals(SigSet(engine.AnalyzeWord("dat"))), Is.True); + Assert.That(caching.Cache.Count, Is.GreaterThan(0)); + } + + [Test] + public void Fast_IsProvisionalUntilWarmed_ThenComplete() + { + var caching = CachingMorphologicalAnalyzer.FromLanguage(TraceManager, Language); + + FastAnalysisResult before = caching.AnalyzeWordFast("dat"); + Assert.That(before.IsComplete, Is.False, "uncached fast result must be flagged provisional"); + + caching.Warm(new[] { "dat" }); + + FastAnalysisResult after = caching.AnalyzeWordFast("dat"); + Assert.That(after.IsComplete, Is.True, "after warming the fast result is the cached complete set"); + var engine = new Morpher(TraceManager, Language); + Assert.That(SigSet(after.Analyses).SetEquals(SigSet(engine.AnalyzeWord("dat"))), Is.True); + } + + [Test] + public void Warm_FillsCacheForCorpus() + { + var caching = CachingMorphologicalAnalyzer.FromLanguage(TraceManager, Language); + string[] corpus = { "sag", "dat", "sat", "saz" }; + caching.Warm(corpus); + foreach (string w in corpus) + { + Assert.That(caching.AnalyzeWordFast(w).IsComplete, Is.True, $"{w} should be cached after warm"); + } + } + + [Test] + public void 
Certified_Grammar_SkipsEngine_FastIsProvenComplete() + { + // A certified grammar (FST-closed + set-parity) treats the FST as proven complete: no full + // search, no cache, and the fast result is flagged complete. + var fast = new VerifiedFstAnalyzer(TraceManager, Language); + var pool = new MorpherPool(() => new Morpher(new TraceManager(), Language)); + var certified = new CachingMorphologicalAnalyzer(fast, pool, new AnalysisCache(), grammarCertified: true); + + Assert.That(certified.GrammarCertified, Is.True); + FastAnalysisResult r = certified.AnalyzeWordFast("dat"); + Assert.That(r.IsComplete, Is.True, "certified grammar: fast result is proven complete without warming"); + + certified.AnalyzeWord("dat").ToList(); // default path + Assert.That(certified.Cache.Count, Is.Zero, "certified grammar must never run the engine / populate the cache"); + } + + [Test] + public void Persistence_RoundTrips_AndVersionGuardRejectsStale() + { + var caching = CachingMorphologicalAnalyzer.FromLanguage(TraceManager, Language); + string[] corpus = { "sag", "dat", "sat", "sagg" }; // includes a non-word (empty analysis) + caching.Warm(corpus); + + var registry = new MorphemeRegistry(Language); + var buffer = new StringWriter(); + AnalysisCacheSerializer.Save(caching.Cache, registry, "v1", buffer); + string serialized = buffer.ToString(); + + // Reload into a fresh cache against the same grammar + version. + var reloaded = new AnalysisCache(); + bool ok = AnalysisCacheSerializer.Load(reloaded, registry, "v1", new StringReader(serialized)); + Assert.That(ok, Is.True); + Assert.That(reloaded.Count, Is.EqualTo(caching.Cache.Count)); + foreach (string w in corpus) + { + Assert.That(reloaded.TryGet(w, out IReadOnlyList a), Is.True, w); + Assert.That(caching.Cache.TryGet(w, out IReadOnlyList orig), Is.True); + Assert.That(SigSet(a).SetEquals(SigSet(orig)), Is.True, $"round-trip mismatch for {w}"); + } + + // A different grammar version must be rejected (stale cache → re-warm). 
+ var rejected = new AnalysisCache(); + bool loadedStale = AnalysisCacheSerializer.Load(rejected, registry, "v2", new StringReader(serialized)); + Assert.That(loadedStale, Is.False); + Assert.That(rejected.Count, Is.Zero); + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstSenaBenchmark.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstSenaBenchmark.cs new file mode 100644 index 00000000..4bfdcfe6 --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstSenaBenchmark.cs @@ -0,0 +1,211 @@ +using System.Collections.Concurrent; +using System.Diagnostics; +using NUnit.Framework; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// Manual end-to-end benchmark on a real grammar: census/closure, build, per-analyzer timing + set +/// parity vs the search engine, a negative-example soundness check, and a parallel-consistency check. +/// [Explicit] — set HC_GRAMMAR (an HC config XML) and HC_WORDS (one word per line); optionally +/// HC_MAX_WORDS. The reference oracle runs with unlimited unapplications (the only sound+complete +/// baseline). Run: +/// $env:HC_GRAMMAR=...; $env:HC_WORDS=...; dotnet test --filter "FullyQualifiedName~FstSenaBenchmark" +/// +[TestFixture] +[Explicit("Manual FST-vs-search benchmark on an external grammar; not part of CI.")] +public class FstSenaBenchmark +{ + [Test] + public void Benchmark_FstVsSearch() + { + (Language language, List words) = Load(); + var search = new Morpher(new TraceManager(), language) { MaxUnapplications = 0 }; + + GrammarFstReport census = GrammarFstAdvisor.Analyze(language); + ClosureReport closure = GrammarFstClosure.Analyze(language); + TestContext.Out.WriteLine($"census : {census.Tier} ({census.EscapeCount} escapes)"); + TestContext.Out.WriteLine($"closure : {(closure.FstClosed ? 
"FST-CLOSED" : "not closed")}"); + + var verified = new VerifiedFstAnalyzer(new FstTemplateAnalyzer(language, search), new MorpherPool(() => new Morpher(new TraceManager(), language))); + var caching = CachingMorphologicalAnalyzer.FromLanguage(new TraceManager(), language, words); + + long searchMs = TimeParse("search ", words, w => search.AnalyzeWord(w).Count()); + TimeParse("verified", words, w => verified.AnalyzeWord(w).Count()); + // Default (guaranteed-complete) path: FST-only when the grammar is certified, else engine+cache. + TimeParse("caching ", words, w => caching.AnalyzeWord(w).Count()); + + AnalysisComparison parity = FstVerification.Compare(search, verified, words); + TestContext.Out.WriteLine( + $"verified vs search : {(parity.IsComplete ? "IDENTICAL" : parity.Divergences.Count + " divergent words")} " + + $"(grammar certified = {caching.GrammarCertified} → " + + (caching.GrammarCertified ? "FST-only, no full search" : "engine/cache backstop") + ")" + ); + TestContext.Out.WriteLine($"(search total {searchMs} ms)"); + } + + /// + /// Soundness on NEGATIVE examples: plausible-looking non-words (real words over-prefixed, + /// over-suffixed, prefix-swapped, fake-reduplicated, fake-compounded) must analyze to NOTHING. We + /// keep only true negatives (search = ∅), count how many of them the raw FST proposes for (so we + /// can see the verify is exercised), then require the verified FST to also return ∅. A non-empty + /// result is a false positive — the soundness failure this hunts for. + /// + [Test] + public void Soundness_NegativeExamples() + { + (Language language, List real0) = Load(); + int targetCount = int.TryParse(Environment.GetEnvironmentVariable("HC_NEG_COUNT"), out int nc) ? 
nc : 50; + var search = new Morpher(new TraceManager(), language) { MaxUnapplications = 0 }; + var raw = new FstTemplateAnalyzer(language, search); + var verified = new VerifiedFstAnalyzer(new FstTemplateAnalyzer(language, search), new MorpherPool(() => new Morpher(new TraceManager(), language))); + + List real = real0.Take(80).ToList(); + string[] pre = { "ku", "a", "ci", "ka", "mu", "ma", "ni", "wa", "ti", "pa" }; + string[] suf = { "a", "e", "ira", "isa", "ka", "ni", "wa", "esa" }; + var candidates = new List(); + for (int i = 0; i < real.Count; i++) + { + string w = real[i].ToLowerInvariant(); + foreach (string p in pre) + { + candidates.Add(p + w); + if (w.Length > p.Length + 1 && w.StartsWith(p, StringComparison.Ordinal)) + { + foreach (string p2 in pre) + { + if (p2 != p) + { + candidates.Add(string.Concat(p2.AsSpan(), w.AsSpan(p.Length))); + } + } + } + } + foreach (string s in suf) + { + candidates.Add(w + s); + } + candidates.Add(string.Concat(w.AsSpan(0, Math.Min(2, w.Length)), w)); // fake reduplication; length clamped so a 1-char word cannot throw + if (i + 1 < real.Count) + { + candidates.Add(w + real[i + 1].ToLowerInvariant()); + } + } + + int chosen = 0; + int fstProposed = 0; + int falsePositives = 0; + var fp = new List(); + var seen = new HashSet(); + foreach (string c in candidates) + { + if (chosen >= targetCount || !seen.Add(c)) + { + continue; + } + try + { + if (search.AnalyzeWord(c).Any()) + { + continue; // actually parses — not a negative + } + int rawCount = raw.AnalyzeWord(c).Count(); + int verifiedCount = verified.AnalyzeWord(c).Count(); + chosen++; + if (rawCount > 0) + { + fstProposed++; + } + if (verifiedCount != 0) + { + falsePositives++; + if (fp.Count < 20) + { + fp.Add(c); + } + } + } + catch (Exception) { } + } + + TestContext.Out.WriteLine($"negatives: {chosen}; raw FST proposed {fstProposed}; false positives {falsePositives}"); + foreach (string e in fp) + { + TestContext.Out.WriteLine($" FALSE POSITIVE: {e}"); + } + Assert.That(chosen, Is.GreaterThanOrEqualTo(targetCount), "could not assemble enough true 
negatives"); + Assert.That(falsePositives, Is.Zero, "soundness FAILURE: verified FST analyzed a non-word"); + } + + /// Parallel-consistency: parsing the corpus concurrently must give the same analyses as + /// sequentially (validates the pooled-Morpher thread-safety fix). + [Test] + public void Concurrent_MatchesSequential() + { + (Language language, List words) = Load(); + var verified = new VerifiedFstAnalyzer( + new FstTemplateAnalyzer(language, new Morpher(new TraceManager(), language)), + new MorpherPool(() => new Morpher(new TraceManager(), language)) + ); + + Dictionary sequential = words.Distinct().ToDictionary(w => w, w => SigSet(verified, w)); + var parallel = new ConcurrentDictionary(); + Parallel.ForEach(words.Distinct(), w => parallel[w] = SigSet(verified, w)); + + int mismatches = sequential.Count(kv => parallel[kv.Key] != kv.Value); + TestContext.Out.WriteLine($"parallel vs sequential: {mismatches} mismatches of {sequential.Count} words"); + Assert.That(mismatches, Is.Zero, "thread-safety FAILURE: concurrent analyses differ from sequential"); + } + + private static string SigSet(IMorphologicalAnalyzer analyzer, string word) + { + return string.Join( + "|", + analyzer + .AnalyzeWord(word) + .Select(a => string.Join("+", a.Morphemes.Select(m => (m as Morpheme)?.Gloss ?? "?")) + ":" + a.RootMorphemeIndex) + .OrderBy(s => s, StringComparer.Ordinal) + ); + } + + private static (Language, List) Load() + { + string? grammarPath = Environment.GetEnvironmentVariable("HC_GRAMMAR"); + string? wordsPath = Environment.GetEnvironmentVariable("HC_WORDS"); + if (string.IsNullOrEmpty(grammarPath) || string.IsNullOrEmpty(wordsPath)) + { + Assert.Ignore("set HC_GRAMMAR and HC_WORDS"); + } + int maxWords = int.TryParse(Environment.GetEnvironmentVariable("HC_MAX_WORDS"), out int mw) ? mw : 60; + Language language = XmlLanguageLoader.Load(grammarPath!); + List words = File + .ReadAllLines(wordsPath!) 
+ .Select(w => w.Trim()) + .Where(w => w.Length > 0) + .Take(maxWords) + .ToList(); + return (language, words); + } + + private static long TimeParse(string label, List words, Func parse) + { + try + { + parse(words[0]); // warm up + } + catch (Exception) { } + var sw = Stopwatch.StartNew(); + long total = 0; + foreach (string w in words) + { + try + { + total += parse(w); + } + catch (Exception) { } + } + sw.Stop(); + TestContext.Out.WriteLine($"{label} : {sw.ElapsedMilliseconds,7} ms ({(double)sw.ElapsedMilliseconds / words.Count:F1} ms/word, {total} analyses)"); + return sw.ElapsedMilliseconds; + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstTemplateAnalyzerTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstTemplateAnalyzerTests.cs new file mode 100644 index 00000000..75461c7b --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstTemplateAnalyzerTests.cs @@ -0,0 +1,137 @@ +using NUnit.Framework; +using SIL.Machine.Annotations; +using SIL.Machine.FeatureModel; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// Template-based analysis with build-time category gating (HERMITCRAB_FST_PLAN.md §6/§10): a +/// suffixing affix template attaches only to roots whose category matches, and the token- +/// accumulating walk reproduces the search engine's analyses — including NOT over-generating the +/// template onto a wrong-category root. 
+/// +public class FstTemplateAnalyzerTests : HermitCrabTestBase +{ + private AffixProcessRule Suffix(string name, string gloss, string seg) + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var rule = new AffixProcessRule { Name = name, Gloss = gloss }; + rule.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, seg) }, + } + ); + return rule; + } + + private AffixProcessRule Prefix(string name, string gloss, string seg) + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var rule = new AffixProcessRule { Name = name, Gloss = gloss }; + rule.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new InsertSegments(Table3, seg), new CopyFromInput("1") }, + } + ); + return rule; + } + + [Test] + public void Analyze_SlotAffixWrongCategory_PrunedNotOvergenerated() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + AffixProcessRule ok = Suffix("ok_suffix", "OK", "d"); // no category requirement → applies to V + var wrong = new AffixProcessRule + { + Name = "n_only_suffix", + Gloss = "NS", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + wrong.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "z") }, + } + ); + var verbTemplate = new AffixTemplate + { + Name = "verb", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + verbTemplate.Slots.Add(new AffixTemplateSlot(ok) { Optional = true }); + verbTemplate.Slots.Add(new AffixTemplateSlot(wrong) { Optional = true }); + Morphophonemic.AffixTemplates.Add(verbTemplate); + + var search = new Morpher(TraceManager, Language); + var 
fst = new FstTemplateAnalyzer(Language); + + // sag is V. "sagd" uses the OK suffix (valid); "sagz" would use the N-only suffix on a V + // root — the build-time category gate prunes it, so the FST must NOT over-generate it. + string[] corpus = { "sag", "sagd", "sagz" }; + AnalysisComparison comparison = FstVerification.Compare(search, fst, corpus); + Assert.That(comparison.IsComplete, Is.True, comparison.Format()); + + Morphophonemic.AffixTemplates.Remove(verbTemplate); + } + + [Test] + public void Analyze_PrefixAndSuffixTemplate_MatchesSearch() + { + // A verb template with a prefix slot (di-) and a suffix slot (-d), restricted to V roots. + AffixProcessRule di = Prefix("di_prefix", "PST", "di"); + AffixProcessRule ed = Suffix("ed_suffix", "PERF", "d"); + var verbTemplate = new AffixTemplate + { + Name = "verb", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + verbTemplate.Slots.Add(new AffixTemplateSlot(di) { Optional = true }); + verbTemplate.Slots.Add(new AffixTemplateSlot(ed) { Optional = true }); + Morphophonemic.AffixTemplates.Add(verbTemplate); + + var search = new Morpher(TraceManager, Language); + var fst = new FstTemplateAnalyzer(Language); + + // sag (V, Morphophonemic): bare, prefixed (disag), suffixed (sagd), both (disagd). + string[] corpus = { "sag", "disag", "sagd", "disagd", "gab", "digab" }; + AnalysisComparison comparison = FstVerification.Compare(search, fst, corpus); + Assert.That(comparison.IsComplete, Is.True, comparison.Format()); + + Morphophonemic.AffixTemplates.Remove(verbTemplate); + } + + [Test] + public void Analyze_SuffixTemplateWithCategoryGate_MatchesSearch() + { + // A verb template, restricted to V roots, with two optional suffix slots. 
+ AffixProcessRule ed = Suffix("ed_suffix", "PAST", "d"); + AffixProcessRule wit = Suffix("evidential", "WIT", "v"); + var verbTemplate = new AffixTemplate + { + Name = "verb", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + verbTemplate.Slots.Add(new AffixTemplateSlot(ed) { Optional = true }); + verbTemplate.Slots.Add(new AffixTemplateSlot(wit) { Optional = true }); + Morphophonemic.AffixTemplates.Add(verbTemplate); + + var search = new Morpher(TraceManager, Language); + var fst = new FstTemplateAnalyzer(Language); + + // Same-stratum (Morphophonemic) roots so only the category gate is in play: sag (32, V) + // takes the template; gab (11, A) must NOT. "sagdv" exercises both slots; "gabd" must yield + // no analysis in either engine (the gate blocks the verb template on the A root). + string[] corpus = { "sag", "sagd", "sagdv", "gab", "gabd" }; + AnalysisComparison comparison = FstVerification.Compare(search, fst, corpus); + Assert.That(comparison.IsComplete, Is.True, comparison.Format()); + + Morphophonemic.AffixTemplates.Remove(verbTemplate); + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstVerificationTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstVerificationTests.cs new file mode 100644 index 00000000..d408c173 --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstVerificationTests.cs @@ -0,0 +1,75 @@ +using NUnit.Framework; +using SIL.Machine.Annotations; +using SIL.Machine.FeatureModel; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// The shadow/verification gate (HERMITCRAB_FST_PLAN.md §9.5): FstVerification.Compare measures +/// FST-vs-search analysis-set parity over a corpus — confirming completeness (no missing) and +/// soundness (no spurious) at once, the certificate required before the FST may replace search. 
+/// +public class FstVerificationTests : HermitCrabTestBase +{ + private AffixProcessRule AddSuffix() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var sSuffix = new AffixProcessRule + { + Name = "s_suffix", + Gloss = "NMLZ", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + sSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, + } + ); + Morphophonemic.MorphologicalRules.Add(sSuffix); + return sSuffix; + } + + private sealed class EmptyAnalyzer : IMorphologicalAnalyzer + { + public IEnumerable AnalyzeWord(string word) => Enumerable.Empty(); + } + + [Test] + public void Compare_FstVsSearch_IsCompleteOnConcatenativeCorpus() + { + AffixProcessRule suffix = AddSuffix(); + IMorphologicalAnalyzer search = new Morpher(TraceManager, Language); + IMorphologicalAnalyzer fst = new VerifiedFstAnalyzer(TraceManager, Language); + + // A mix: inflected, bare root, homograph (dat = entries 8 & 9), and a non-word. + string[] corpus = { "sag", "sags", "dat", "sagg" }; + AnalysisComparison comparison = FstVerification.Compare(search, fst, corpus); + + Assert.That(comparison.WordsChecked, Is.EqualTo(corpus.Length)); + Assert.That(comparison.IsComplete, Is.True, comparison.Format()); + + Morphophonemic.MorphologicalRules.Remove(suffix); + } + + [Test] + public void Compare_DetectsMissingAnalyses_NotVacuous() + { + AffixProcessRule suffix = AddSuffix(); + IMorphologicalAnalyzer search = new Morpher(TraceManager, Language); + + // A candidate that finds nothing must be flagged incomplete on a word that has an analysis. 
+ AnalysisComparison comparison = FstVerification.Compare(search, new EmptyAnalyzer(), new[] { "sag" }); + + Assert.That(comparison.IsComplete, Is.False); + Assert.That(comparison.Divergences, Has.Count.EqualTo(1)); + Assert.That(comparison.Divergences[0].MissingFromCandidate, Is.Not.Empty); + Assert.That(comparison.Divergences[0].ExtraInCandidate, Is.Empty); + + Morphophonemic.MorphologicalRules.Remove(suffix); + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorBenchmark.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorBenchmark.cs new file mode 100644 index 00000000..656228ef --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorBenchmark.cs @@ -0,0 +1,29 @@ +using NUnit.Framework; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// Runs GrammarFstAdvisor against a real FLEx-exported grammar and prints the +/// report. [Explicit] so it never runs in CI. Point HC_GRAMMAR at an HC config XML: +/// $env:HC_GRAMMAR="...\sena-hc.xml"; dotnet test --filter "FullyQualifiedName~GrammarFstAdvisorBenchmark" +/// +[TestFixture] +[Explicit("Manual grammar-linter run against an external grammar; not part of CI.")] +public class GrammarFstAdvisorBenchmark +{ + [Test] + public void Advise_OnExternalGrammar() + { + string? 
grammarPath = Environment.GetEnvironmentVariable("HC_GRAMMAR"); + Assert.That(grammarPath, Is.Not.Null.And.Not.Empty, "set HC_GRAMMAR to an HC config XML path"); + Assert.That(File.Exists(grammarPath), Is.True, $"grammar not found: {grammarPath}"); + + Language language = XmlLanguageLoader.Load(grammarPath!); + GrammarFstReport report = GrammarFstAdvisor.Analyze(language); + + TestContext.Out.WriteLine($"Grammar: {Path.GetFileName(grammarPath)}"); + TestContext.Out.WriteLine($"Strata : {language.Strata.Count}"); + TestContext.Out.WriteLine(""); + TestContext.Out.WriteLine(report.Format()); + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorTests.cs new file mode 100644 index 00000000..4c21c8bf --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorTests.cs @@ -0,0 +1,233 @@ +using NUnit.Framework; +using SIL.Machine.Annotations; +using SIL.Machine.FeatureModel; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; +using SIL.Machine.Morphology.HermitCrab.PhonologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// Verifies the grammar linter (GrammarFstAdvisor): a plain concatenative grammar +/// is a Tier 1 (fully FST-able) candidate with no escapes, and adding a single reduplication rule +/// flips the verdict — the offending rule is flagged GrammarAdvisorySeverity.Escape +/// with a reduplication write-up. This is the "one new rule blew up the grammar" guard. +/// +public class GrammarFstAdvisorTests : HermitCrabTestBase +{ + [Test] + public void Analyze_ConcatenativeGrammar_Tier1NoEscapes() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + + // A plain suffix: copy the whole stem, then add segments. Fully finite-state. 
+ var sSuffix = new AffixProcessRule { Name = "s_suffix", Gloss = "PL" }; + sSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, + } + ); + Morphophonemic.MorphologicalRules.Add(sSuffix); + + // A suffix over a SPLIT stem (copy part 1, copy part 2, then insert): the copies are + // contiguous, so this is an ordinary suffix — finite-state, must NOT be flagged. + var splitSuffix = new AffixProcessRule { Name = "split_suffix", Gloss = "PST" }; + splitSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = + { + Pattern.New("1").Annotation(any).Value, + Pattern.New("2").Annotation(any).OneOrMore.Value, + }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("2"), new InsertSegments(Table3, "d") }, + } + ); + Morphophonemic.MorphologicalRules.Add(splitSuffix); + + GrammarFstReport report = GrammarFstAdvisor.Analyze(Language); + + Assert.That(report.EscapeCount, Is.EqualTo(0), report.Format()); + Assert.That(report.Tier, Does.StartWith("Tier 1")); + + Morphophonemic.MorphologicalRules.Remove(sSuffix); + Morphophonemic.MorphologicalRules.Remove(splitSuffix); + } + + [Test] + public void Analyze_BoundedReduplicant_IsRegular() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + + // A fixed-size reduplicant: the copied part "1" matches a SINGLE segment (no OneOrMore), + // so the copy is finite → regular (reclaimable by bounded fold), unlike whole-stem copy. 
+ var redup = new AffixProcessRule { Name = "credup", Gloss = "PL" }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + + GrammarFstReport report = GrammarFstAdvisor.Analyze(Language); + + GrammarAdvisory escape = report.Escapes.Single(a => a.Rule == "credup"); + // Still slow today (Escape preserved), but regular = FST-reclaimable. + Assert.That(escape.Severity, Is.EqualTo(GrammarAdvisorySeverity.Escape)); + Assert.That(escape.Regular, Is.True, report.Format()); + Assert.That(report.RegularEscapeCount, Is.EqualTo(1)); + + Morphophonemic.MorphologicalRules.Remove(redup); + } + + [Test] + public void Analyze_TrueInfix_FlaggedEscape() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + + // Infixation: insert material BETWEEN two parts of the stem (copy part 1 … insert … copy part 2). + var infix = new AffixProcessRule { Name = "infix", Gloss = "PERF" }; + infix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = + { + Pattern.New("1").Annotation(any).Value, + Pattern.New("2").Annotation(any).OneOrMore.Value, + }, + Rhs = + { + new CopyFromInput("1"), + new InsertSegments(Table3, "a"), + new CopyFromInput("2"), + }, + } + ); + Morphophonemic.MorphologicalRules.Add(infix); + + GrammarFstReport report = GrammarFstAdvisor.Analyze(Language); + + GrammarAdvisory escape = report.Escapes.Single(a => a.Rule == "infix"); + Assert.That(escape.Issue, Does.Contain("Infixation")); + // Severity is preserved — infixation is slow in today's engine — but it is regular (the + // split is pattern-defined), so it carries the reclaim path. 
+ Assert.That(escape.Severity, Is.EqualTo(GrammarAdvisorySeverity.Escape)); + Assert.That(escape.Regular, Is.True); + Assert.That(report.Tier, Does.StartWith("Tier 2")); + + Morphophonemic.MorphologicalRules.Remove(infix); + } + + [Test] + public void Analyze_HarmonyRewrite_StaysEscapeButIsRegular() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + + // A vowel-harmony-style rewrite: bounded LHS/RHS, but an UNBOUNDED left environment + // ("...anything... ___"). By Kaplan & Kay this is a regular relation, but in today's + // engine it un-applies at many positions and is slow. + var harmony = new RewriteRule + { + Name = "harmony", + Lhs = Pattern.New().Annotation(any).Value, + }; + harmony.Subrules.Add( + new RewriteSubrule + { + Rhs = Pattern.New().Annotation(any).Value, + LeftEnvironment = Pattern.New().Annotation(any).OneOrMore.Value, + } + ); + Allophonic.PhonologicalRules.Add(harmony); + + GrammarFstReport report = GrammarFstAdvisor.Analyze(Language); + + GrammarAdvisory escape = report.Escapes.Single(a => a.Rule == "harmony"); + // The non-expert sanity check: the headline still WARNS (escape present, not Tier 1) ... + Assert.That(escape.Severity, Is.EqualTo(GrammarAdvisorySeverity.Escape)); + Assert.That(report.Tier, Does.Not.StartWith("Tier 1")); + Assert.That(report.EscapeCount, Is.GreaterThanOrEqualTo(1)); + // ... and the reclaim path is reported separately: regular (FST-reclaimable), not "fine". 
+ Assert.That(escape.Regular, Is.True); + Assert.That(report.RegularEscapeCount, Is.GreaterThanOrEqualTo(1)); + Assert.That(escape.Advice, Does.Contain("today's engine")); + + Allophonic.PhonologicalRules.Remove(harmony); + } + + [Test] + public void Analyze_ReduplicationRule_FlaggedEscapeAndTierDowngraded() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + + GrammarFstReport before = GrammarFstAdvisor.Analyze(Language); + Assert.That(before.EscapeCount, Is.EqualTo(0), "baseline grammar should have no escapes"); + + // Total reduplication: copy the stem ("1") twice. Copying an unbounded span is not + // finite-state — exactly the rule that should blow up the grammar. + var redup = new AffixProcessRule { Name = "redup", Gloss = "INTENS" }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + + GrammarFstReport after = GrammarFstAdvisor.Analyze(Language); + + Assert.That(after.EscapeCount, Is.EqualTo(1), after.Format()); + GrammarAdvisory escape = after.Escapes.Single(); + Assert.That(escape.Rule, Is.EqualTo("redup")); + Assert.That(escape.Severity, Is.EqualTo(GrammarAdvisorySeverity.Escape)); + Assert.That(escape.Issue, Does.Contain("Reduplication")); + Assert.That(escape.Advice, Is.Not.Empty); + // No phonological rule applies after it, so the escape is probe-able (clean). + Assert.That(escape.Probeable, Is.True); + Assert.That(after.ProbeableEscapeCount, Is.EqualTo(1)); + // Copying the whole stem (part "1" is OneOrMore) is the one genuinely non-regular case. + Assert.That(escape.Regular, Is.False); + Assert.That(after.NonRegularEscapeCount, Is.EqualTo(1)); + // The tier verdict changed: this is the warning a grammar engineer sees. 
+ Assert.That(after.Tier, Is.Not.EqualTo(before.Tier)); + Assert.That(after.Tier, Does.StartWith("Tier 2")); + + Morphophonemic.MorphologicalRules.Remove(redup); + } + + [Test] + public void Analyze_ReduplicationWithLaterPhonology_IsOpaque() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + + var redup = new AffixProcessRule { Name = "redup", Gloss = "INTENS" }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + + // A phonological rule in a LATER stratum can rewrite the reduplicated span, so the + // strip-and-reparse probe is no longer sound — the escape is opaque (needs the backstop). + var rule = new RewriteRule { Name = "t_rule", Lhs = Pattern.New().Annotation(any).Value }; + Surface.PhonologicalRules.Add(rule); + + GrammarFstReport report = GrammarFstAdvisor.Analyze(Language); + + GrammarAdvisory escape = report.Escapes.Single(a => a.Rule == "redup"); + Assert.That(escape.Probeable, Is.False, report.Format()); + Assert.That(report.OpaqueEscapeCount, Is.EqualTo(1)); + Assert.That(report.Tier, Does.StartWith("Tier 2 candidate — hybrid")); + + Morphophonemic.MorphologicalRules.Remove(redup); + Surface.PhonologicalRules.Remove(rule); + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstClosureTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstClosureTests.cs new file mode 100644 index 00000000..41fd9346 --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstClosureTests.cs @@ -0,0 +1,89 @@ +using NUnit.Framework; +using SIL.Machine.Annotations; +using SIL.Machine.FeatureModel; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// The static feeding-closure pass (HERMITCRAB_FST_PLAN.md §9.5): a 
non-regular escape is CLOSED +/// only if no FST-able rule could apply before it and feed it. Closed ⇒ the FST's "no path" is a +/// proof; fed ⇒ those words need the search backstop. +/// +public class GrammarFstClosureTests : HermitCrabTestBase +{ + private static AffixProcessRule MakeReduplication(string name) + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var redup = new AffixProcessRule { Name = name, Gloss = "INTENS" }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, + } + ); + return redup; + } + + private AffixProcessRule MakeSuffix(string name) + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var suffix = new AffixProcessRule { Name = name, Gloss = "PL" }; + suffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, + } + ); + return suffix; + } + + [Test] + public void Analyze_NoEscapes_IsClosedVacuously() + { + ClosureReport report = GrammarFstClosure.Analyze(Language); + Assert.That(report.Escapes, Is.Empty); + Assert.That(report.FstClosed, Is.True); + } + + [Test] + public void Analyze_InnermostReduplication_NothingPrecedes_IsClosed() + { + // Reduplication as the only rule at the innermost stratum: nothing FST-able precedes it, + // so no derivation can feed it — closed, and the FST's silence is a proof. 
+ AffixProcessRule redup = MakeReduplication("redup"); + Morphophonemic.MorphologicalRules.Add(redup); + + ClosureReport report = GrammarFstClosure.Analyze(Language); + + Assert.That(report.Escapes, Has.Count.EqualTo(1)); + Assert.That(report.Escapes[0].Rule, Is.EqualTo("redup")); + Assert.That(report.Escapes[0].Closed, Is.True, report.Format()); + Assert.That(report.FstClosed, Is.True); + + Morphophonemic.MorphologicalRules.Remove(redup); + } + + [Test] + public void Analyze_FeederBeforeReduplication_IsPotentiallyFed() + { + // A concatenative suffix in the same (unordered) stratum could apply before the + // reduplication and feed it: the pass conservatively reports it not closed. + AffixProcessRule redup = MakeReduplication("redup"); + AffixProcessRule suffix = MakeSuffix("pl"); + Morphophonemic.MorphologicalRules.Add(suffix); + Morphophonemic.MorphologicalRules.Add(redup); + + ClosureReport report = GrammarFstClosure.Analyze(Language); + + Assert.That(report.Escapes, Has.Count.EqualTo(1)); + Assert.That(report.Escapes[0].Closed, Is.False, report.Format()); + Assert.That(report.FstClosed, Is.False); + + Morphophonemic.MorphologicalRules.Remove(redup); + Morphophonemic.MorphologicalRules.Remove(suffix); + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenCodecTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenCodecTests.cs new file mode 100644 index 00000000..6554bf5e --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenCodecTests.cs @@ -0,0 +1,123 @@ +using NUnit.Framework; +using SIL.Machine.Annotations; +using SIL.Machine.FeatureModel; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// Proves the packed-token schema (HERMITCRAB_FST_PLAN.md §8) faithfully represents a real HC +/// analysis: encoding a parsed Word and decoding it reproduces the morphemes and +/// root that WordAnalysis carries, with the 
operation populated from the actual rule — +/// including the multi-stem (compound) case that the flat array must not lose. +/// +public class MorphTokenCodecTests : HermitCrabTestBase +{ + [Test] + public void Encode_Suffix_RoundTripsToWordAnalysis() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var sSuffix = new AffixProcessRule + { + Name = "s_suffix", + Gloss = "NMLZ", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + sSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, + } + ); + Morphophonemic.MorphologicalRules.Add(sSuffix); + + var morpher = new Morpher(TraceManager, Language); + List words = morpher.ParseWord("sags").ToList(); + List analyses = morpher.AnalyzeWord("sags").ToList(); + Assert.That(words, Has.Count.EqualTo(1)); + Assert.That(analyses, Has.Count.EqualTo(1)); + + var codec = new MorphTokenCodec(); + uint[] tokens = codec.Encode(words[0]); + WordAnalysis wa = analyses[0]; + + // Morpheme channel: decoded indices reproduce WordAnalysis.Morphemes, in order. + Assert.That( + tokens.Select(t => codec.GetMorpheme(MorphToken.GetMorphemeId(t)).Id), + Is.EqualTo(wa.Morphemes.Select(m => m.Id)) + ); + // Root recovered purely from the op codes == HC's RootMorphemeIndex (no separate field). + Assert.That(MorphToken.RootIndex(tokens), Is.EqualTo(wa.RootMorphemeIndex)); + // Op channel is populated from the real rule: a root and a suffix. 
+ var ops = tokens.Select(MorphToken.GetOp).ToList(); + Assert.That(ops, Does.Contain(MorphOp.Root)); + Assert.That(ops, Does.Contain(MorphOp.Suffix)); + + Morphophonemic.MorphologicalRules.Remove(sSuffix); + } + + [Test] + public void Encode_Compound_KeepsBothStems_OneRoot() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var rule1 = new CompoundingRule { Name = "rule1" }; + Allophonic.MorphologicalRules.Add(rule1); + rule1.Subrules.Add( + new CompoundingSubrule + { + HeadLhs = { Pattern.New("head").Annotation(any).OneOrMore.Value }, + NonHeadLhs = { Pattern.New("nonHead").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("head"), new InsertSegments(Table3, "+"), new CopyFromInput("nonHead") }, + } + ); + + var morpher = new Morpher(TraceManager, Language); + List words = morpher.ParseWord("pʰutdat").ToList(); + List analyses = morpher.AnalyzeWord("pʰutdat").ToList(); + Assert.That(words, Is.Not.Empty); + + // Match each encoded word to a WordAnalysis by morpheme sequence (decoupled from order). + var codec = new MorphTokenCodec(); + foreach (Word w in words) + { + uint[] tokens = codec.Encode(w); + + // Two stems → two morphemes; exactly one tagged Root, the other Compound (not lost). + Assert.That(tokens, Has.Length.EqualTo(2)); + Assert.That(tokens.Count(t => MorphToken.GetOp(t) == MorphOp.Root), Is.EqualTo(1)); + Assert.That(tokens.Select(MorphToken.GetOp), Does.Contain(MorphOp.Compound)); + + string[] decoded = tokens.Select(t => codec.GetMorpheme(MorphToken.GetMorphemeId(t)).Id).ToArray(); + WordAnalysis? 
match = analyses.FirstOrDefault(a => a.Morphemes.Select(m => m.Id).SequenceEqual(decoded)); + Assert.That(match, Is.Not.Null, $"no WordAnalysis matches decoded morphemes [{string.Join(",", decoded)}]"); + Assert.That(MorphToken.RootIndex(tokens), Is.EqualTo(match!.RootMorphemeIndex)); + } + + Allophonic.MorphologicalRules.Remove(rule1); + } + + [Test] + public void ClassifyOp_PopulatesAffixRolesFromOutputActions() + { + Assert.That(RoleOf(new CopyFromInput("1"), new CopyFromInput("1")), Is.EqualTo(MorphOp.Reduplication)); + Assert.That( + RoleOf(new CopyFromInput("1"), new InsertSegments(Table3, "a"), new CopyFromInput("2")), + Is.EqualTo(MorphOp.Infix) + ); + Assert.That(RoleOf(new InsertSegments(Table3, "di"), new CopyFromInput("1")), Is.EqualTo(MorphOp.Prefix)); + Assert.That(RoleOf(new CopyFromInput("1"), new InsertSegments(Table3, "s")), Is.EqualTo(MorphOp.Suffix)); + } + + private static MorphOp RoleOf(params MorphologicalOutputAction[] rhs) + { + var allo = new AffixProcessAllomorph(); + foreach (MorphologicalOutputAction action in rhs) + { + allo.Rhs.Add(action); + } + return MorphTokenCodec.ClassifyOp(allo, isHeadRoot: false); + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenTests.cs new file mode 100644 index 00000000..aa78b605 --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenTests.cs @@ -0,0 +1,67 @@ +using NUnit.Framework; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// The packed 32-bit analysis token (HERMITCRAB_FST_PLAN.md §8): 8-bit MorphOp + 24-bit morpheme +/// index, with the derivation array being self-describing (morpheme order = array order; root = +/// the Root token's position). 
+/// +public class MorphTokenTests +{ + [Test] + public void Encode_RoundTripsOpAndMorphemeId() + { + foreach (MorphOp op in System.Enum.GetValues(typeof(MorphOp))) + { + foreach (int id in new[] { 0, 1, 42, MorphToken.MaxMorphemeId }) + { + uint token = MorphToken.Encode(op, id); + Assert.That(MorphToken.GetOp(token), Is.EqualTo(op), $"op for id {id}"); + Assert.That(MorphToken.GetMorphemeId(token), Is.EqualTo(id), $"id for op {op}"); + } + } + } + + [Test] + public void Encode_IdOutOfRange_Throws() + { + Assert.Throws( + () => MorphToken.Encode(MorphOp.Root, MorphToken.MaxMorphemeId + 1) + ); + Assert.Throws(() => MorphToken.Encode(MorphOp.Root, -1)); + } + + [Test] + public void Encode_DistinctInputsGiveDistinctTokens() + { + // Different op, same id → different token. + Assert.That(MorphToken.Encode(MorphOp.Prefix, 7), Is.Not.EqualTo(MorphToken.Encode(MorphOp.Suffix, 7))); + // Same op, different id → different token. + Assert.That(MorphToken.Encode(MorphOp.Suffix, 7), Is.Not.EqualTo(MorphToken.Encode(MorphOp.Suffix, 8))); + } + + [Test] + public void Derivation_ArrayIsSelfDescribing() + { + // prefix m10 · root m20 · suffix m30 — a whole WordAnalysis in 12 bytes. + uint[] derivation = + { + MorphToken.Encode(MorphOp.Prefix, 10), + MorphToken.Encode(MorphOp.Root, 20), + MorphToken.Encode(MorphOp.Suffix, 30), + }; + + // Morphemes in order = the array's morpheme indices in array order. + Assert.That(System.Array.ConvertAll(derivation, MorphToken.GetMorphemeId), Is.EqualTo(new[] { 10, 20, 30 })); + // RootMorphemeIndex falls out of the op codes — no separate field needed. 
+ Assert.That(MorphToken.RootIndex(derivation), Is.EqualTo(1)); + } + + [Test] + public void RootIndex_NoRoot_ReturnsMinusOne() + { + uint[] derivation = { MorphToken.Encode(MorphOp.Prefix, 1), MorphToken.Encode(MorphOp.Suffix, 2) }; + Assert.That(MorphToken.RootIndex(derivation), Is.EqualTo(-1)); + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs new file mode 100644 index 00000000..73abcfe6 --- /dev/null +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs @@ -0,0 +1,118 @@ +using System.Collections.Concurrent; +using NUnit.Framework; +using SIL.Machine.Annotations; +using SIL.Machine.FeatureModel; +using SIL.Machine.Matching; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab; + +/// +/// CI coverage for the propose-and-verify spine (HERMITCRAB_FST_PLAN.md §11.8/§12): the FST proposes, +/// HC's own engine confirms each candidate by restricted re-analysis (), and +/// the confirmed engine analysis is emitted. Exercises soundness (no false positives), the M2 fix +/// (yields genuine HC analyses with their category), the per-word opt-out, and thread-safety. 
+/// +public class VerifiedFstAnalyzerTests : HermitCrabTestBase +{ + private AffixProcessRule AddSuffix() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var sSuffix = new AffixProcessRule + { + Name = "s_suffix", + Gloss = "NMLZ", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + sSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "s") }, + } + ); + Morphophonemic.MorphologicalRules.Add(sSuffix); + return sSuffix; + } + + [Test] + public void Verified_MatchesSearch_OnConcatenativeCorpus() + { + AffixProcessRule suffix = AddSuffix(); + IMorphologicalAnalyzer search = new Morpher(TraceManager, Language); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(TraceManager, Language); + string[] corpus = { "sag", "sags", "dat", "sagg" }; // inflected, bare, homograph, non-word + AnalysisComparison comparison = FstVerification.Compare(search, verified, corpus); + Assert.That(comparison.IsComplete, Is.True, comparison.Format()); + Morphophonemic.MorphologicalRules.Remove(suffix); + } + + [Test] + public void Verified_RejectsNonWord_NoFalsePositive() + { + IMorphologicalAnalyzer search = new Morpher(TraceManager, Language); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(TraceManager, Language); + Assert.That(search.AnalyzeWord("sagg"), Is.Empty, "precondition: sagg is a non-word"); + Assert.That(verified.AnalyzeWord("sagg"), Is.Empty, "verify must not analyze a non-word"); + } + + [Test] + public void Verified_YieldsGenuineEngineAnalyses_WithCategory() + { + // M2: VerifiedFstAnalyzer must yield the matched HC analysis (real category), not the + // category-less FST candidate. 
WordAnalysis.Equals includes Category, so set-equality vs the + // engine fails if the category is dropped. + var search = new Morpher(TraceManager, Language); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(TraceManager, Language); + foreach (string word in new[] { "sag", "dat" }) + { + var fromSearch = new HashSet(search.AnalyzeWord(word)); + List fromVerified = verified.AnalyzeWord(word).ToList(); + Assert.That(fromVerified, Is.Not.Empty, $"expected analyses for {word}"); + foreach (WordAnalysis a in fromVerified) + { + Assert.That(a.Category, Is.Not.Null, $"verified analysis of {word} lost its category"); + Assert.That(fromSearch, Does.Contain(a), $"verified analysis of {word} is not a genuine engine analysis"); + } + } + } + + [Test] + public void CompleteHybrid_PerWordOptOut_EngineMatchesSearch() + { + string[] corpus = { "sag", "dat" }; + var search = new Morpher(TraceManager, Language); + var complete = CompleteHybridMorpher.FromLanguage(TraceManager, Language, corpus); + foreach (string word in corpus) + { + var engine = new HashSet(complete.AnalyzeWord(word, useFst: false).Select(Sig)); + var fst = new HashSet(complete.AnalyzeWord(word, useFst: true).Select(Sig)); + var oracle = new HashSet(search.AnalyzeWord(word).Select(Sig)); + Assert.That(engine.SetEquals(oracle), Is.True, $"engine opt-out path wrong for {word}"); + Assert.That(fst.SetEquals(oracle), Is.True, $"fst path wrong for {word}"); + } + } + + [Test] + public void Verified_ParallelMatchesSequential() + { + AddSuffix(); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(TraceManager, Language); + var corpus = new List(); + for (int i = 0; i < 50; i++) + { + corpus.AddRange(new[] { "sag", "sags", "dat", "sat", "saz", "sas", "sagg" }); + } + Dictionary sequential = corpus.Distinct().ToDictionary(w => w, w => SigSet(verified, w)); + var parallel = new ConcurrentDictionary(); + Parallel.ForEach(corpus, w => parallel[w] = SigSet(verified, w)); + 
Assert.That(corpus.Distinct().All(w => parallel[w] == sequential[w]), Is.True, "concurrent analyses diverged from sequential"); + } + + private static string Sig(WordAnalysis a) => + string.Join("+", a.Morphemes.Select(m => (m as Morpheme)?.Gloss ?? "?")) + ":" + a.RootMorphemeIndex; + + private static string SigSet(IMorphologicalAnalyzer analyzer, string word) => + string.Join("|", analyzer.AnalyzeWord(word).Select(Sig).OrderBy(s => s, System.StringComparer.Ordinal)); +} From df0be656ba376f578f300618523f0702e3f0f982 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 27 Jun 2026 08:13:15 -0400 Subject: [PATCH 02/13] HC FST: full-coverage audit plan (docs/FST_FULL_COVERAGE_PLAN.md) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four parallel audits (formal-language status × HC impl × FST impl) of every HC construct, classified covered / partial / coverable / not-coverable, with architecture proposals and appendices on closing the non-regular gap. Headline findings: - Almost all of HC is REGULAR (Kaplan-Kay) hence 1-way-FST-able; the only genuinely non-regular core is unbounded full-stem reduplication ({ww}) + an unbounded self-feeding rewrite cycle (HC caps at 256). - Critical coverage ceiling: the proposer is only correct for 0-PHONOLOGY grammars (arcs are underlying segments, walk is surface) — it silently under-generates (fails safe; parity gate refuses to certify) for any grammar with phonological rules. Phonology-by-composition is the biggest coverage win. - Robustness bug: the proposer THROWS on infix/circumfix/reduplication/process slots, aborting the whole build instead of degrading to the engine. Graceful degradation is the top this-PR fix. - Other gaps: true zero-segment affix dropped; bounded compounding needs proposer + FstReplay changes; MPR/co-occurrence/env/stemname correctly left to verify (sound). 
Appendix A: length-cap fold / detect-and-peel (compile-replace) / 2-way FST (Dolatian-Heinz) / engine backstop for the non-FST-able constructs. Appendix B: verify-by-re-analysis + escape-aware codec + certified-skip interlock all HELP later non-regular work; only the 2-way reduplication solution would need a new execution model. Co-Authored-By: Claude Opus 4.8 --- docs/FST_FULL_COVERAGE_PLAN.md | 212 +++++++++++++++++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 docs/FST_FULL_COVERAGE_PLAN.md diff --git a/docs/FST_FULL_COVERAGE_PLAN.md b/docs/FST_FULL_COVERAGE_PLAN.md new file mode 100644 index 00000000..d1a8fd17 --- /dev/null +++ b/docs/FST_FULL_COVERAGE_PLAN.md @@ -0,0 +1,212 @@ +# FST full-coverage plan — auditing how much of HermitCrab an FST can cover + +Audited by four parallel reviews against (a) the formal-language status of each construct, (b) HC's +implementation, and (c) our FST implementation. "Regular?" classifies the *linguistic operation* +(Kaplan & Kay 1994: a finite composition of concatenation + bounded-context rewrite over a finite +lexicon is a **regular relation**, hence 1-way-FST-able). "Coverage" is what the **proposer** +(`FstTemplateAnalyzer`) actually builds — `VerifiedFstAnalyzer`/`FstReplay` only *confirm or discard* +proposer candidates, so they can never add coverage: **every under-generation must be closed in the +proposer.** + +## 0. The headline + +Almost all of HC is formally **regular** and therefore coverable by a 1-way FST. The genuinely +non-regular core is tiny: **unbounded full-stem reduplication** (`{ww}`) and an **unbounded +self-feeding rewrite cycle** (HC already caps it at 256). Everything else — affixation, templates, +derivation, **all phonology**, bounded compounding, partial/fixed reduplication, strata — is regular. + +But "regular ⇒ coverable" is about the *ceiling*, not what we built. Two findings matter most: + +1. 
**The proposer is only correct for 0-phonology grammars.** Its arcs are built from *underlying* + segments; it walks the *surface*. Any feature-change/epenthesis/deletion/metathesis desyncs the + walk, so for a grammar **with** phonology the FST **silently under-generates** (it fails *safe* — + verify rejects anything spurious, so no wrong analyses — but it misses valid ones). Sena has 0 + phonological rules, which is the only reason it certifies. **This is the single biggest limit on + real-grammar coverage.** The certification parity-gate catches it (such a grammar won't certify), so + it is not a *soundness* hole — it is a *coverage* ceiling. +2. **The proposer throws (`NotSupportedException`) on infix / circumfix / reduplication / process + slots**, aborting the *entire* build rather than degrading. So a grammar with **any** such slot + can't build the FST at all today. This is a robustness bug, not a math limit. + +## 1. Coverage scorecard + +**COVERED (proposer builds it):** prefix, suffix, realizational affixes, multiple template slots, +optional slots, slot ordering, root lexicon, category + stratum gating, category-changing derivation +(bounded), bounded derivation (depth ≤ `derivDepth`, default 2, tunable). + +**PARTIAL:** derivation depth (capped — deeper stacks silently dropped, caught only by the parity +gate); zero affix (the `[CopyFromInput, InsertSegments(non-empty)]` form is covered; a **true +zero-segment** affix `[CopyFromInput]`-only is dropped/throws — a silent gap); Linear-vs-Unordered rule +order (modeled as a bounded any-order superset — sound via verify, not faithful to the flag). + +**COVERABLE (regular; not built — listed with the work + blow-up):** +- **All phonology** — RewriteRule (feature-change / epenthesis / deletion), metathesis, iterative & + simultaneous application, α-variables, allomorph environments. Regular by Kaplan–Kay. 
Needs the + proposer to be built by **composition** (lexicon ∘ affixes ∘ phonology) instead of the + underlying-segment walk, or phonology folded into a richer verify. Largest single win. +- **Bounded compounding** — regular (capped by `MaxStemCount`, default 2). Needs shared per-category + stem automata spliced N−1 times (additive in states) **and** an extension to `FstReplay` (which today + requires a single `LexEntry` root, so it can't even *confirm* a compound). +- **Infixation** — regular (positioned insertion). Needs `BuildRootChain` to split a root mid-stem + (`pre · infix · post`); ≈2×|root| arcs for infixing roots, bounded. +- **Circumfix** — regular. Needs one morpheme emitted at two surface positions (the `MorphOp` enum + already has `CircumfixPrefix`/`CircumfixSuffix`, but the codec only ever emits `CircumfixPrefix` — + `CircumfixSuffix` is dead code). +- **Simulfix / process (`ModifyFromInput`)** — regular (length-preserving feature rewrite). Needs + feature-mutation arcs; entangled with phonology (the mutated segment must be in the arc condition). +- **Partial / fixed-size reduplication** — regular (bounded copy). Unroll the fixed template into arcs + (Beesley–Karttunen compile-replace). +- **Strata / cyclicity** — regular (finite composition of per-stratum regular relations); already + partly modeled via stratum-index gating. +- **MPR features, morpheme/allomorph co-occurrence, allomorph environments, stem names, disjunctive + allomorphs, obligatory features, bound roots** — all regular, currently **VERIFY-ONLY** and *sound* + there (HC's real synthesis enforces them). Coverable on arcs but **not worth it**: verify already + guarantees soundness, so baking them in buys only speed, at a multiplicative state cost. Leave them + in verify.
+ +**NOT COVERABLE by a 1-way FST (genuinely non-regular):** +- **Unbounded full-stem reduplication** — `{ww : w∈Σ*}` is not regular (not even context-free); a + 1-way FST has no memory for an arbitrary-length copy (Dolatian & Heinz 2020). HC expresses it when a + `CopyFromInput` part is an unbounded quantifier over the stem. +- **Unbounded self-feeding rewrite cycle** — not finitely bounded; HC tames it with a 256-length cap + (which *is* a regular fold — see Appendix A). +- (Unbounded recursive compounding/incorporation is non-regular in theory, but HC can't express it — + `MaxStemCount` is always finite — so it is moot here.) + +## 2. Per-feature table (synthesis of the four audits) + +| Feature | Regular? | Where handled now | Status | What's needed to cover | +|---|---|---|---|---| +| Prefix / suffix | yes | FST proposer | COVERED | — | +| Template slots / optional / order | yes | FST proposer | COVERED | — | +| Realizational affixes | yes | FST (as slots) | COVERED | feature-blocking deferred to verify (sound) | +| Category + stratum gating | yes | FST build-time gate | COVERED | faithful when stem ⊑ template category | +| Category-changing derivation | yes (bounded) | FST (≤ depth) | COVERED | deeper chains → raise `derivDepth` | +| Derivation depth | n/a | FST cap (2) | PARTIAL | knob; deeper → engine (parity-gated) | +| Zero affix (with segments) | yes | FST | COVERED | — | +| **True zero-segment affix** | yes | throws/dropped | **PARTIAL (bug)** | emit token with no arcs | +| Linear vs Unordered order | yes | FST (any-order superset) | PARTIAL | sound via verify; not flag-faithful | +| **Phonology (all kinds)** | **yes (Kaplan–Kay)** | **engine/verify only** | **COVERABLE (big)** | compile by composition into the proposer | +| **Bounded compounding** | yes | engine/cache | COVERABLE | shared stem automata + extend `FstReplay` | +| Infixation | yes | throws | COVERABLE | mid-stem root split | +| Circumfix | yes | throws (half dead) | COVERABLE | one 
morpheme, two positions | +| Simulfix / process | yes | throws | COVERABLE | feature-mutation arcs (needs phonology) | +| Partial/fixed reduplication | yes | throws | COVERABLE | unroll bounded copy | +| Strata / cyclicity | yes | partial (gating) | COVERABLE | compose per-stratum transducers | +| MPR / co-occurrence / env / stemname / disjunctive / obligatory / bound | yes | **verify** | VERIFY-ONLY (sound) | leave in verify (speed-only to move) | +| **Unbounded full-stem reduplication** | **no** | engine (escape) | **NOT COVERABLE (1-way)** | length-cap / detect-peel / 2-way FST | +| Unbounded self-feeding cycle | no (capped) | engine (256-cap) | NOT COVERABLE (unbounded) | length-cap fold | + +## 3. Architecture changes / optimizations / reconfigurations + +**A. Graceful degradation instead of `throw` (do now — robustness).** The proposer must never abort a +build on an unbuildable construct. On an infix/circumfix/reduplication/process slot (and any construct +it can't model), it should **skip that path and ensure the grammar is not certified** (so those words +route to the engine), exactly as it already does for non-regular escapes. Today a single such slot +throws `NotSupportedException` and kills the whole FST — so the analyzer is unusable on most real +grammars. This one change makes the FST **safe on any grammar** (full coverage where it can, engine +backstop where it can't), which is the right "as much as we can get" posture. + +**B. Fix the true zero-segment affix (do now — small).** Emit the morpheme token at a token-bearing +state with no segment arcs (the mechanism already exists for empty-insert affixes). Today it is a +silent under-generation or a throw. + +**C. 
Phonology by composition (follow-on — the big coverage win).** Replace/augment the hand-rolled +underlying-segment walk with the textbook construction: compile `Lexicon ∘ Affixes ∘ Phonology` +(each `RewriteRule` already carries everything needed to emit its transducer) and analyze the surface +through the composed, **minimized** machine. This is what lifts the FST from "0-phonology grammars +only" to the majority of real grammars. Risks: multiplicative state blow-up before minimization (use +lazy/per-stratum composition + the existing `Determinize().Minimize()` for variable-free layers), and +α-variable expansion (arc multiplication by feature cardinality). Verify-only cannot substitute — +`FstReplay` can reject but not *generate*, so phonology must enter the proposer. + +**D. Bounded compounding (follow-on — highest discrete coverage gain).** Build per-category shared stem +automata, splice up to `MaxStemCount`, emit `Compound`/`Root` tokens — **and extend `FstReplay`** to +confirm multi-root candidates (today it hard-requires a single `LexEntry` root, so a compound can't be +verified even if proposed). Additive in states (Σ category automata × depth), not multiplicative. + +**E. Keep soundness constraints in verify (decision, not work).** MPR, co-occurrence, environments, +stem names, disjunctive allomorphs, obligatory/bound — all sound in verify because verify *is* HC's +synthesis. Baking them into arcs buys only speed at a state cost; the over-generation they cause is a +few cheap rejected candidates per word. Leave them. + +**F. The certification interlock is the safety contract (preserve + strengthen).** `certified = +FST-closed ∧ set-parity`. The parity check is what catches proposer gaps (phonology, compounding, +depth) even when closure says "regular" — so **a phonology-bearing or reduplicating grammar must never +certify**, or `AnalyzeWord` (which skips the engine when certified) would silently under-generate. 
+`GrammarFstClosure.IsEscape` flags reduplication/infix; ensure the proposer's *coverage* limits +(phonology, compounding, depth-truncation) are likewise reflected so certification can't outrun what +the proposer actually builds. The empirical parity gate already enforces this; make it explicit. + +## 4. Roadmap — close this PR vs. follow-on + +**This PR (mathematically sound, tractable, robustness):** +- **A. Graceful degradation** (no throw → skip + don't certify). Makes the FST usable on any grammar. +- **B. Zero-segment affix** fix (close the silent gap). +- **F. Certification guard** — verify (it already holds via parity) and document that only + fully-covered, FST-closed grammars certify; everything else uses the engine/cache backstop. +- Tunable `derivDepth` (already shipped) + document depth-truncation as parity-gated. + +**Follow-on PR(s) (the bigger builds, in value order):** +1. **Phonology by composition** (C) — unlocks the majority of real grammars. +2. **Bounded compounding** (D) — biggest discrete construct gain; needs the `FstReplay` extension. +3. **Infix / circumfix / partial-reduplication / simulfix** — the remaining concatenative/bounded + constructs (each COVERABLE; medium effort). +4. **The non-regular core** — Appendix A. + +--- + +## Appendix A — closing the gap on the non-FST-able constructs + +Two HC constructs are genuinely non-regular for a 1-way FST: **unbounded full-stem reduplication** +(`{ww}`) and the **unbounded self-feeding rewrite cycle**. + +### A1. Unbounded reduplication +- **Length-cap fold.** Unroll `{ww : |w| ≤ L}` into explicit arcs for a chosen max reduplicant length + L (e.g. the longest lexical stem). Sound + complete up to L; FST grows with L×|Σ|; longer stems fall + to the engine. Precedented — HC itself caps the self-feeding cycle at 256. +- **Detect-and-peel (Beesley–Karttunen compile-replace).** Detect an adjacent repeated span, peel one + copy, analyze the remainder with the regular grammar. 
For copy, **detection == parsing** (a + reduplicant *is* an adjacent repeat), so the live work is a cheap repeat-scan + peel; ambiguous peels + resolved by verify. The standard finite-state-morphology tool. +- **2-way FST (Dolatian & Heinz 2020).** A two-way transducer re-reads its input and computes `{ww}` + exactly, staying linear-time. The *correct* device, but the current 1-way NFA walk would need a + two-way execution engine — the largest change. +- **Sound detector + engine backstop (current posture, recommended default).** Keep the proposer + reduplication-blind; `GrammarFstClosure.IsEscape` flags it → grammar not certified → those words go + to the engine via the cache. Zero blow-up, always correct, slower only on reduplicating words. + Combine with the length-cap fold as an opportunistic fast path for short stems. + +### A2. Self-feeding cycle +Already closed by a length-cap (shape ≤ 256). To FST-ize, bake the same cap as a maximum-length +acceptance bound; identical tradeoff to the reduplication length-cap. + +## Appendix B — do current architecture decisions help or hinder the non-FST-able work? + +**HELP — verify-by-re-analysis + engine backstop.** The proposer is *allowed* to be +sound-but-under-generating: every kept analysis is a genuine HC analysis, and the FST need not model +reduplication/compounding/phonology at all — those words are quarantined to the complete engine. +Adding any Appendix-A mechanism later only *widens* the fast path; it cannot break soundness, because +verification re-runs HC end to end. + +**HELP — the escape-aware codec + closure.** `MorphTokenCodec.ClassifyOp` already distinguishes +`Reduplication`/`Infix`/`Compound`/`Process` from concatenative ops, and `GrammarFstClosure` consumes +those tags. A future reduplication/compounding builder has a ready, principled signal for which rules +to special-case. + +**HAZARD to preserve — certified-skip.** A certified grammar skips the engine entirely. 
A grammar with +a non-regular construct must therefore *never* certify. The interlock (`closed ∧ parity`, with closure +flagging escapes and parity catching proposer gaps) is what guarantees this — it is the explicit safety +contract tying "construct ∉ regular" to "never skip the engine for it." Keep it inviolable as coverage +grows. + +**NEUTRAL — the 1-way template walk.** Bounded folds (length-cap, detect-peel) and all the COVERABLE +concatenative constructs fit the existing 1-way walk as "more arcs." The only thing it blocks is the +exact **2-way FST** reduplication solution (A1), which needs a different execution model — a +reconfiguration to weigh only if a grammar with unbounded reduplication becomes a priority. + +### Citations +Kaplan & Kay 1994 (regular relations; closure under composition → phonology, strata, bounded +compounding); Dolatian & Heinz 2020 (2-way FSTs compute reduplication; 1-way cannot); Chandlee 2017 +(subregular morphology; partial reduplication is local/regular); Beesley & Karttunen 2003 +(compile-replace for bounded reduplication). From 7832611f90507ac167d280ce92e48b292602ea5d Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 27 Jun 2026 08:50:23 -0400 Subject: [PATCH 03/13] HC FST: graceful degradation (A) + zero-segment affix (B) + advisor review fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A — Graceful degradation: the proposer no longer THROWS on infix/circumfix/reduplication/ process slots; it skips the unbuildable construct, builds the rest, and sets CoversAllConstructs=false so the grammar can't certify (those words fall to the engine/cache; the parity gate enforces it). Was a NotSupportedException that aborted the whole build, making the FST unusable on any grammar with such a slot. B — True zero-segment affix (CopyFromInput only, no InsertSegments) now emits its morpheme token with no segment arcs instead of throwing / being silently dropped.
SlotOp treats a zero-only slot as a position-less suffix so it still builds. Certification guard: FromLanguage (Caching + CompleteHybrid) now requires proposer.CoversAllConstructs in addition to the existing gates (closed + parity in Caching; parity in CompleteHybrid) — a degraded build can't certify. Copilot review fixes (advisor, still in PR): - Examine RealizationalAffixProcessRule (it implements IMorphologicalRule + has Allomorphs; can encode reduplication/infix) — previously silently skipped, undercounting escapes. AnalyzeAffix refactored to (name, allomorphs) and the switch handles both rule types. - GrammarFstReport counts are now PER-RULE (group advisories by Rule/Stratum/Kind, worst severity) instead of per-advisory, so per-allomorph advisories don't overcount and the partitions are consistent (Probeable+Opaque = Escape, Regular+NonRegular = Escape). Tests: Build_ReduplicationSlot_DegradesGracefully_DoesNotThrow, Analyze_ZeroSegmentSuffix_IsEmitted_NotDropped, Analyze_RealizationalReduplication_IsExamined. Unit suite 96 green; Sena unchanged (certifies, 0 parallel mismatches, 0 false positives).
Co-Authored-By: Claude Opus 4.8 --- .../CachingMorphologicalAnalyzer.cs | 5 +- .../CompleteHybridMorpher.cs | 3 +- .../FstTemplateAnalyzer.cs | 51 +++++++++++--- .../GrammarFstAdvisor.cs | 68 +++++++++++++------ .../FstTemplateAnalyzerTests.cs | 63 +++++++++++++++++ .../GrammarFstAdvisorTests.cs | 35 ++++++++++ 6 files changed, 190 insertions(+), 35 deletions(-) diff --git a/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs index 7a792c9d..8b792483 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs @@ -51,13 +51,14 @@ public static CachingMorphologicalAnalyzer FromLanguage( ) { var pool = new MorpherPool(() => new Morpher(new TraceManager(), language)); - var fast = new VerifiedFstAnalyzer(new FstTemplateAnalyzer(language, new Morpher(traceManager, language)), pool); + var proposer = new FstTemplateAnalyzer(language, new Morpher(traceManager, language)); + var fast = new VerifiedFstAnalyzer(proposer, pool); bool certified = false; if (certificationCorpus != null) { bool closed = GrammarFstClosure.Analyze(language).FstClosed; bool parity = FstVerification.Compare(new Morpher(traceManager, language), fast, certificationCorpus).IsComplete; - certified = closed && parity; + certified = closed && proposer.CoversAllConstructs && parity; } return new CachingMorphologicalAnalyzer(fast, pool, cache ?? 
new AnalysisCache(), certified); } diff --git a/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs b/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs index 34bbb496..3d2e272f 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs @@ -47,7 +47,8 @@ IEnumerable certificationCorpus var proposer = new FstTemplateAnalyzer(language, new Morpher(traceManager, language)); var verified = new VerifiedFstAnalyzer(proposer, pool); var engine = new Morpher(traceManager, language); - bool certified = FstVerification.Compare(engine, verified, certificationCorpus).IsComplete; + bool parity = FstVerification.Compare(engine, verified, certificationCorpus).IsComplete; + bool certified = proposer.CoversAllConstructs && parity; return new CompleteHybridMorpher(verified, pool, certified); } diff --git a/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs index b3d56b52..d684e19c 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs @@ -47,6 +47,7 @@ public class FstTemplateAnalyzer : IMorphologicalAnalyzer private readonly List _derivPrefixRules = new List(); private int _stateCount; + private bool _hasUnbuiltConstructs; /// /// Max stacked derivational affixes modelled per side before inflection (tunable per grammar). @@ -61,6 +62,15 @@ public class FstTemplateAnalyzer : IMorphologicalAnalyzer /// Number of FST states built (the precomputed size — to watch for state blow-up). public int StateCount => _stateCount; + /// + /// False if the build skipped a construct it cannot model (an infix/circumfix/reduplication/ + /// process slot or rule). 
The proposer degrades gracefully — it skips such constructs and + /// builds the rest — so a grammar using them under-generates on the fast path and must NOT be + /// certified (it falls to the engine/cache). The empirical set-parity gate enforces this; this + /// flag is the cheap build-time signal of the same fact. + /// + public bool CoversAllConstructs => !_hasUnbuiltConstructs; + /// Build without obligatoriness: every root may stand bare (fine for toy grammars). public FstTemplateAnalyzer(Language language, int maxStates = 1_000_000, int derivDepth = 2) : this(language, _ => true, maxStates, derivDepth) { } @@ -333,7 +343,7 @@ private WordAnalysis ToWordAnalysis(uint[] tokens) } /// Split a template's slots into prefix and suffix; prefixes are reversed to surface order. - private static void ClassifyTemplate( + private void ClassifyTemplate( AffixTemplate template, List prefixSlots, List suffixSlots @@ -350,21 +360,36 @@ List suffixSlots suffixSlots.Add(slot); break; default: - throw new NotSupportedException( - $"FstTemplateAnalyzer handles prefix/suffix template slots only; slot '{slot.Name}' is neither." - ); + // A slot the proposer cannot build (infix/circumfix/reduplication/process). + // Skip it and flag the grammar as not fully covered — those words fall to the + // engine/cache; the parity gate refuses to certify. (Was a hard throw that + // aborted the whole build.) + _hasUnbuiltConstructs = true; + break; } } prefixSlots.Reverse(); // slot 0 applies first (innermost) → rightmost prefix on the surface } + /// The slot's surface role: the first rule that is a prefix or suffix. A slot whose + /// only rules are zero-segment affixes is a (position-less) suffix so it still builds; a slot + /// with no prefix/suffix/zero rule (e.g. infix/reduplication only) is None → skipped. 
private static MorphOp SlotOp(AffixTemplateSlot slot) { + bool hasZero = false; foreach (MorphemicMorphologicalRule rule in slot.Rules) { - return RuleOp(rule); + MorphOp op = RuleOp(rule); + if (op == MorphOp.Prefix || op == MorphOp.Suffix) + { + return op; + } + if (op == MorphOp.None) + { + hasZero = true; // a zero/empty-segment affix — no surface position + } } - return MorphOp.None; + return hasZero ? MorphOp.Suffix : MorphOp.None; } /// The surface role (prefix/suffix/…) of a morphological rule, from its first allomorph. @@ -564,12 +589,18 @@ FeatureStruct templateCategory } foreach (AffixProcessAllomorph allomorph in Allomorphs(rule)) { - if (MorphTokenCodec.ClassifyOp(allomorph, false) != op) + MorphOp aop = MorphTokenCodec.ClassifyOp(allomorph, false); + if (aop != op && aop != MorphOp.None) { - throw new NotSupportedException( - $"FstTemplateAnalyzer: a rule in a {op} slot is not a {op}." - ); + // A rule the proposer can't build in this slot (infix/circumfix/redup/ + // process). Skip it and flag the grammar not-fully-covered; the engine/ + // cache backstop and parity gate handle those words. (Was a hard throw.) + _hasUnbuiltConstructs = true; + continue; } + // aop == op (normal affix) or aop == None (a true zero-segment affix: no + // InsertSegments) — both emit the morpheme token at this slot's position; a + // zero affix simply adds no segment arcs. uint affixToken = MorphToken.Encode(op, _codec.GetOrAddIndex(allomorph.Morpheme)); // Enter the affix through a token-bearing state, so the morpheme is emitted // even for a zero/empty-segment affix (its token would otherwise be lost). 
diff --git a/src/SIL.Machine.Morphology.HermitCrab/GrammarFstAdvisor.cs b/src/SIL.Machine.Morphology.HermitCrab/GrammarFstAdvisor.cs index 7e1a6991..30025179 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/GrammarFstAdvisor.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/GrammarFstAdvisor.cs @@ -112,21 +112,31 @@ int compoundingRulesExamined AffixRulesExamined = affixRulesExamined; PhonologicalRulesExamined = phonologicalRulesExamined; CompoundingRulesExamined = compoundingRulesExamined; - EscapeCount = advisories.Count(a => a.Severity == GrammarAdvisorySeverity.Escape); - CostCount = advisories.Count(a => a.Severity == GrammarAdvisorySeverity.Cost); - InfoCount = advisories.Count(a => a.Severity == GrammarAdvisorySeverity.Info); - ProbeableEscapeCount = advisories.Count(a => - a.Severity == GrammarAdvisorySeverity.Escape && a.Probeable == true - ); - OpaqueEscapeCount = advisories.Count(a => - a.Severity == GrammarAdvisorySeverity.Escape && a.Probeable == false - ); - RegularEscapeCount = advisories.Count(a => - a.Severity == GrammarAdvisorySeverity.Escape && a.Regular == true + + // Count per RULE, not per advisory: advisories are emitted per allomorph, so several can + // refer to one rule. Group by (Rule, Stratum, Kind) and take each rule's worst severity, so + // the counts reflect distinct rules and the partitions stay consistent + // (Probeable+Opaque = Escape, Regular+NonRegular = Escape). + List> byRule = advisories + .GroupBy(a => (a.Rule, a.Stratum, a.Kind)) + .ToList(); + EscapeCount = byRule.Count(g => g.Max(a => a.Severity) == GrammarAdvisorySeverity.Escape); + CostCount = byRule.Count(g => g.Max(a => a.Severity) == GrammarAdvisorySeverity.Cost); + InfoCount = byRule.Count(g => g.Max(a => a.Severity) == GrammarAdvisorySeverity.Info); + + // Among escaping rules, a rule is opaque/non-regular if ANY of its escape advisories is + // (the conservative aggregate); the complements partition the escape count exactly. 
+ List> escapeRules = byRule + .Where(g => g.Max(a => a.Severity) == GrammarAdvisorySeverity.Escape) + .ToList(); + OpaqueEscapeCount = escapeRules.Count(g => + g.Any(a => a.Severity == GrammarAdvisorySeverity.Escape && a.Probeable == false) ); - NonRegularEscapeCount = advisories.Count(a => - a.Severity == GrammarAdvisorySeverity.Escape && a.Regular != true + ProbeableEscapeCount = EscapeCount - OpaqueEscapeCount; + NonRegularEscapeCount = escapeRules.Count(g => + g.Any(a => a.Severity == GrammarAdvisorySeverity.Escape && a.Regular != true) ); + RegularEscapeCount = EscapeCount - NonRegularEscapeCount; } public IReadOnlyList Advisories { get; } @@ -273,7 +283,20 @@ public static GrammarFstReport Analyze(Language language, int manyAllomorphsThre { case AffixProcessRule affix: affixExamined++; - AnalyzeAffix(affix, stratum.Name, surfaceInvariant, advisories, manyAllomorphsThreshold); + AnalyzeAffix(affix.Name, affix.Allomorphs, stratum.Name, surfaceInvariant, advisories, manyAllomorphsThreshold); + break; + case RealizationalAffixProcessRule realizational: + // Realizational affixes also have Allomorphs and can encode + // reduplication/infixation — examine them too (previously skipped). 
+ affixExamined++; + AnalyzeAffix( + realizational.Name, + realizational.Allomorphs, + stratum.Name, + surfaceInvariant, + advisories, + manyAllomorphsThreshold + ); break; case CompoundingRule compound: compoundExamined++; @@ -301,7 +324,8 @@ public static GrammarFstReport Analyze(Language language, int manyAllomorphsThre } private static void AnalyzeAffix( - AffixProcessRule rule, + string ruleName, + IList allomorphs, string stratum, bool surfaceInvariant, List advisories, @@ -319,7 +343,7 @@ int manyAllomorphsThreshold + "affixed span, so a literal strip-and-reparse probe can miss an analysis; the search " + "backstop is required."; - foreach (AffixProcessAllomorph allomorph in rule.Allomorphs) + foreach (AffixProcessAllomorph allomorph in allomorphs) { // Reduplication: the same input part is copied two or more times. Copying an // unbounded span is not regular, so the rule is not finite-state. @@ -343,7 +367,7 @@ int manyAllomorphsThreshold + "or the search engine. Slow today."; advisories.Add( new GrammarAdvisory( - rule.Name, + ruleName, stratum, "affix", GrammarAdvisorySeverity.Escape, @@ -370,7 +394,7 @@ int manyAllomorphsThreshold // circumfix over a split stem — finite-state, NOT flagged. 
advisories.Add( new GrammarAdvisory( - rule.Name, + ruleName, stratum, "affix", GrammarAdvisorySeverity.Escape, @@ -392,7 +416,7 @@ int manyAllomorphsThreshold { advisories.Add( new GrammarAdvisory( - rule.Name, + ruleName, stratum, "affix", GrammarAdvisorySeverity.Info, @@ -405,15 +429,15 @@ int manyAllomorphsThreshold } } - if (rule.Allomorphs.Count > manyAllomorphsThreshold) + if (allomorphs.Count > manyAllomorphsThreshold) { advisories.Add( new GrammarAdvisory( - rule.Name, + ruleName, stratum, "affix", GrammarAdvisorySeverity.Cost, - $"{rule.Allomorphs.Count} allomorphs; each one multiplies the un-application branching " + $"{allomorphs.Count} allomorphs; each one multiplies the un-application branching " + "during analysis.", "Consolidate allomorphs via environment conditioning where the language allows it." ) diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstTemplateAnalyzerTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstTemplateAnalyzerTests.cs index 75461c7b..e5631e5d 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstTemplateAnalyzerTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstTemplateAnalyzerTests.cs @@ -81,6 +81,69 @@ public void Analyze_SlotAffixWrongCategory_PrunedNotOvergenerated() Morphophonemic.AffixTemplates.Remove(verbTemplate); } + [Test] + public void Build_ReduplicationSlot_DegradesGracefully_DoesNotThrow() + { + // A reduplication slot is non-regular and unbuildable. The proposer must SKIP it (degrade), + // not throw and abort the whole build — and flag the grammar as not fully covered. 
+ var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var redup = new AffixProcessRule { Name = "redup", Gloss = "RED" }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, // copy the stem twice = reduplication + } + ); + var t = new AffixTemplate + { + Name = "redup_tmpl", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + t.Slots.Add(new AffixTemplateSlot(redup) { Optional = true }); + Morphophonemic.AffixTemplates.Add(t); + + FstTemplateAnalyzer? fst = null; + Assert.DoesNotThrow(() => fst = new FstTemplateAnalyzer(Language), "an unbuildable slot must degrade, not throw"); + Assert.That(fst!.CoversAllConstructs, Is.False, "reduplication slot → grammar not fully covered (won't certify)"); + Assert.That(fst!.AnalyzeWord("sag"), Is.Not.Empty, "the rest of the grammar still analyzes"); + + Morphophonemic.AffixTemplates.Remove(t); + } + + [Test] + public void Analyze_ZeroSegmentSuffix_IsEmitted_NotDropped() + { + // A true zero-segment affix (CopyFromInput only, no InsertSegments) must still emit its + // morpheme token (it adds no segments). Previously it threw / was silently dropped. 
+ var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var zero = new AffixProcessRule { Name = "zero_sfx", Gloss = "Z" }; + zero.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1") }, // copy stem, insert nothing = zero affix + } + ); + var t = new AffixTemplate + { + Name = "zero_tmpl", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + t.Slots.Add(new AffixTemplateSlot(zero) { Optional = true }); + Morphophonemic.AffixTemplates.Add(t); + + var search = new Morpher(TraceManager, Language); + var fst = new FstTemplateAnalyzer(Language); + Assert.That(fst.CoversAllConstructs, Is.True, "a zero-segment affix is buildable, not a skipped construct"); + // Whatever the engine yields for "sag" (bare root and/or root+Z), the FST must match it — + // i.e. it must not drop the zero-suffixed analysis. + AnalysisComparison comparison = FstVerification.Compare(search, fst, new[] { "sag" }); + Assert.That(comparison.IsComplete, Is.True, comparison.Format()); + + Morphophonemic.AffixTemplates.Remove(t); + } + [Test] public void Analyze_PrefixAndSuffixTemplate_MatchesSearch() { diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorTests.cs index 4c21c8bf..fe63a53a 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorTests.cs @@ -200,6 +200,41 @@ public void Analyze_ReduplicationRule_FlaggedEscapeAndTierDowngraded() Morphophonemic.MorphologicalRules.Remove(redup); } + [Test] + public void Analyze_RealizationalReduplication_IsExamined() + { + // RealizationalAffixProcessRule also implements IMorphologicalRule and has Allomorphs, so a + // reduplication encoded on one must be examined and flagged — it was previously 
skipped. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + Assert.That(GrammarFstAdvisor.Analyze(Language).EscapeCount, Is.EqualTo(0), "baseline has no escapes"); + + var redup = new RealizationalAffixProcessRule + { + Name = "real_redup", + Gloss = "INTENS", + RealizationalFeatureStruct = FeatureStruct + .New(Language.SyntacticFeatureSystem) + .Feature(Head) + .EqualTo(head => head.Feature("tense").EqualTo("past")) + .Value, + }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + + GrammarFstReport after = GrammarFstAdvisor.Analyze(Language); + Assert.That(after.EscapeCount, Is.EqualTo(1), after.Format()); + Assert.That(after.Escapes.Single().Rule, Is.EqualTo("real_redup")); + Assert.That(after.Escapes.Single().Issue, Does.Contain("Reduplication")); + + Morphophonemic.MorphologicalRules.Remove(redup); + } + [Test] public void Analyze_ReduplicationWithLaterPhonology_IsOpaque() { From 761e4b67068ef9e021d9d40a80b5bbfcb08733ac Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 27 Jun 2026 09:02:32 -0400 Subject: [PATCH 04/13] HC FST: apply CSharpier formatting (fix CI Check-formatting build failure) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI runs `dotnet csharpier check .` and the new/edited FST files were not formatted. Ran `dotnet csharpier format .` (1.2.6) — only the 11 FST/advisor/test files changed; no unrelated files touched. Unit suite 96 green; csharpier check clean. 
Co-Authored-By: Claude Opus 4.8 --- .../AnalysisCache.cs | 11 ++++++-- .../AnalysisCacheSerializer.cs | 14 ++++++++-- .../CachingMorphologicalAnalyzer.cs | 4 ++- .../FstTemplateAnalyzer.cs | 13 +++++---- .../FstVerification.cs | 5 +--- .../GrammarFstAdvisor.cs | 26 ++++++++++------- .../FstSenaBenchmark.cs | 28 +++++++++++++------ .../FstTemplateAnalyzerTests.cs | 11 ++++++-- .../GrammarFstAdvisorTests.cs | 13 ++------- .../MorphTokenTests.cs | 4 +-- .../VerifiedFstAnalyzerTests.cs | 12 ++++++-- 11 files changed, 91 insertions(+), 50 deletions(-) diff --git a/src/SIL.Machine.Morphology.HermitCrab/AnalysisCache.cs b/src/SIL.Machine.Morphology.HermitCrab/AnalysisCache.cs index cc1cbcfd..a915b9a6 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/AnalysisCache.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/AnalysisCache.cs @@ -14,8 +14,10 @@ namespace SIL.Machine.Morphology.HermitCrab /// public sealed class AnalysisCache { - private readonly System.Collections.Concurrent.ConcurrentDictionary> _store = - new System.Collections.Concurrent.ConcurrentDictionary>(); + private readonly System.Collections.Concurrent.ConcurrentDictionary< + string, + IReadOnlyList + > _store = new System.Collections.Concurrent.ConcurrentDictionary>(); /// Number of words with a stored complete analysis. public int Count => _store.Count; @@ -36,7 +38,10 @@ public void Set(string word, IReadOnlyList analyses) } /// Return the cached complete analysis, or compute it once via and cache it. 
- public IReadOnlyList GetOrAdd(string word, System.Func> compute) + public IReadOnlyList GetOrAdd( + string word, + System.Func> compute + ) { return _store.GetOrAdd(word, compute); } diff --git a/src/SIL.Machine.Morphology.HermitCrab/AnalysisCacheSerializer.cs b/src/SIL.Machine.Morphology.HermitCrab/AnalysisCacheSerializer.cs index f195bd60..c4a0baea 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/AnalysisCacheSerializer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/AnalysisCacheSerializer.cs @@ -16,7 +16,12 @@ public static class AnalysisCacheSerializer { private const string Magic = "hcfstcache/1"; - public static void Save(AnalysisCache cache, MorphemeRegistry registry, string grammarVersion, TextWriter writer) + public static void Save( + AnalysisCache cache, + MorphemeRegistry registry, + string grammarVersion, + TextWriter writer + ) { writer.WriteLine(Magic + "\t" + (grammarVersion ?? string.Empty)); foreach (KeyValuePair> entry in cache.Entries) @@ -36,7 +41,12 @@ public static void Save(AnalysisCache cache, MorphemeRegistry registry, string g /// file's grammar version does not match — the caller should /// then re-warm. Skips any analysis referencing an unknown morpheme key (defensive). 
/// - public static bool Load(AnalysisCache cache, MorphemeRegistry registry, string grammarVersion, TextReader reader) + public static bool Load( + AnalysisCache cache, + MorphemeRegistry registry, + string grammarVersion, + TextReader reader + ) { string header = reader.ReadLine(); if (header == null) diff --git a/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs index 8b792483..8908768a 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs @@ -57,7 +57,9 @@ public static CachingMorphologicalAnalyzer FromLanguage( if (certificationCorpus != null) { bool closed = GrammarFstClosure.Analyze(language).FstClosed; - bool parity = FstVerification.Compare(new Morpher(traceManager, language), fast, certificationCorpus).IsComplete; + bool parity = FstVerification + .Compare(new Morpher(traceManager, language), fast, certificationCorpus) + .IsComplete; certified = closed && proposer.CoversAllConstructs && parity; } return new CachingMorphologicalAnalyzer(fast, pool, cache ?? 
new AnalysisCache(), certified); diff --git a/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs index d684e19c..6baa1ffa 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs @@ -42,10 +42,8 @@ public class FstTemplateAnalyzer : IMorphologicalAnalyzer private readonly Func, bool> _filter; private readonly int _maxStates; private readonly Func _bareRootValid; - private readonly List _derivSuffixRules = - new List(); - private readonly List _derivPrefixRules = - new List(); + private readonly List _derivSuffixRules = new List(); + private readonly List _derivPrefixRules = new List(); private int _stateCount; private bool _hasUnbuiltConstructs; @@ -84,7 +82,12 @@ public FstTemplateAnalyzer(Language language, int maxStates = 1_000_000, int der public FstTemplateAnalyzer(Language language, Morpher morpher, int maxStates = 1_000_000, int derivDepth = 2) : this(language, root => BareRootValid(morpher, root), maxStates, derivDepth) { } - private FstTemplateAnalyzer(Language language, Func bareRootValid, int maxStates, int derivDepth) + private FstTemplateAnalyzer( + Language language, + Func bareRootValid, + int maxStates, + int derivDepth + ) { _bareRootValid = bareRootValid; _maxStates = maxStates; diff --git a/src/SIL.Machine.Morphology.HermitCrab/FstVerification.cs b/src/SIL.Machine.Morphology.HermitCrab/FstVerification.cs index 70f4b49d..39eba880 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/FstVerification.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/FstVerification.cs @@ -111,10 +111,7 @@ IEnumerable words .Except(candidateSet) .OrderBy(s => s, StringComparer.Ordinal) .ToList(); - List extra = candidateSet - .Except(referenceSet) - .OrderBy(s => s, StringComparer.Ordinal) - .ToList(); + List extra = candidateSet.Except(referenceSet).OrderBy(s => s, StringComparer.Ordinal).ToList(); divergences.Add(new 
AnalysisDivergence(word, missing, extra)); } return new AnalysisComparison(count, divergences); diff --git a/src/SIL.Machine.Morphology.HermitCrab/GrammarFstAdvisor.cs b/src/SIL.Machine.Morphology.HermitCrab/GrammarFstAdvisor.cs index 30025179..cc47b128 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/GrammarFstAdvisor.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/GrammarFstAdvisor.cs @@ -179,15 +179,14 @@ int compoundingRulesExamined /// pass confirms whether Tier 2 is worth it vs. Tier 3. /// public string Tier => - EscapeCount == 0 - ? "Tier 1 candidate — fully FST-able" - : ProbeableEscapeCount == EscapeCount - ? "Tier 2⁺ candidate — every escape is probe-able (surface-invariant): a per-word " - + "un-application probe WOULD recover the fast path once the probe runtime exists; " - + "all escapes are slow in today's engine" - : EscapeCount <= 3 - ? "Tier 2 candidate — hybrid (opaque/non-probe-able escapes fall back to search); confirm with corpus fallback rate" - : "Tier 3 — pervasive escapes, search engine only"; + EscapeCount == 0 ? "Tier 1 candidate — fully FST-able" + : ProbeableEscapeCount == EscapeCount + ? "Tier 2⁺ candidate — every escape is probe-able (surface-invariant): a per-word " + + "un-application probe WOULD recover the fast path once the probe runtime exists; " + + "all escapes are slow in today's engine" + : EscapeCount <= 3 + ? "Tier 2 candidate — hybrid (opaque/non-probe-able escapes fall back to search); confirm with corpus fallback rate" + : "Tier 3 — pervasive escapes, search engine only"; /// The rules that break FST compilation (the warnings that flip the tier). 
public IEnumerable Escapes => @@ -283,7 +282,14 @@ public static GrammarFstReport Analyze(Language language, int manyAllomorphsThre { case AffixProcessRule affix: affixExamined++; - AnalyzeAffix(affix.Name, affix.Allomorphs, stratum.Name, surfaceInvariant, advisories, manyAllomorphsThreshold); + AnalyzeAffix( + affix.Name, + affix.Allomorphs, + stratum.Name, + surfaceInvariant, + advisories, + manyAllomorphsThreshold + ); break; case RealizationalAffixProcessRule realizational: // Realizational affixes also have Allomorphs and can encode diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstSenaBenchmark.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstSenaBenchmark.cs index 4bfdcfe6..2532ddf6 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstSenaBenchmark.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/FstSenaBenchmark.cs @@ -27,7 +27,10 @@ public void Benchmark_FstVsSearch() TestContext.Out.WriteLine($"census : {census.Tier} ({census.EscapeCount} escapes)"); TestContext.Out.WriteLine($"closure : {(closure.FstClosed ? "FST-CLOSED" : "not closed")}"); - var verified = new VerifiedFstAnalyzer(new FstTemplateAnalyzer(language, search), new MorpherPool(() => new Morpher(new TraceManager(), language))); + var verified = new VerifiedFstAnalyzer( + new FstTemplateAnalyzer(language, search), + new MorpherPool(() => new Morpher(new TraceManager(), language)) + ); var caching = CachingMorphologicalAnalyzer.FromLanguage(new TraceManager(), language, words); long searchMs = TimeParse("search ", words, w => search.AnalyzeWord(w).Count()); @@ -39,7 +42,8 @@ public void Benchmark_FstVsSearch() TestContext.Out.WriteLine( $"verified vs search : {(parity.IsComplete ? "IDENTICAL" : parity.Divergences.Count + " divergent words")} " + $"(grammar certified = {caching.GrammarCertified} → " - + (caching.GrammarCertified ? "FST-only, no full search" : "engine/cache backstop") + ")" + + (caching.GrammarCertified ? 
"FST-only, no full search" : "engine/cache backstop") + + ")" ); TestContext.Out.WriteLine($"(search total {searchMs} ms)"); } @@ -58,7 +62,10 @@ public void Soundness_NegativeExamples() int targetCount = int.TryParse(Environment.GetEnvironmentVariable("HC_NEG_COUNT"), out int nc) ? nc : 50; var search = new Morpher(new TraceManager(), language) { MaxUnapplications = 0 }; var raw = new FstTemplateAnalyzer(language, search); - var verified = new VerifiedFstAnalyzer(new FstTemplateAnalyzer(language, search), new MorpherPool(() => new Morpher(new TraceManager(), language))); + var verified = new VerifiedFstAnalyzer( + new FstTemplateAnalyzer(language, search), + new MorpherPool(() => new Morpher(new TraceManager(), language)) + ); List real = real0.Take(80).ToList(); string[] pre = { "ku", "a", "ci", "ka", "mu", "ma", "ni", "wa", "ti", "pa" }; @@ -128,7 +135,9 @@ public void Soundness_NegativeExamples() catch (Exception) { } } - TestContext.Out.WriteLine($"negatives: {chosen}; raw FST proposed {fstProposed}; false positives {falsePositives}"); + TestContext.Out.WriteLine( + $"negatives: {chosen}; raw FST proposed {fstProposed}; false positives {falsePositives}" + ); foreach (string e in fp) { TestContext.Out.WriteLine($" FALSE POSITIVE: {e}"); @@ -163,7 +172,9 @@ private static string SigSet(IMorphologicalAnalyzer analyzer, string word) "|", analyzer .AnalyzeWord(word) - .Select(a => string.Join("+", a.Morphemes.Select(m => (m as Morpheme)?.Gloss ?? "?")) + ":" + a.RootMorphemeIndex) + .Select(a => + string.Join("+", a.Morphemes.Select(m => (m as Morpheme)?.Gloss ?? "?")) + ":" + a.RootMorphemeIndex + ) .OrderBy(s => s, StringComparer.Ordinal) ); } @@ -178,8 +189,7 @@ private static (Language, List) Load() } int maxWords = int.TryParse(Environment.GetEnvironmentVariable("HC_MAX_WORDS"), out int mw) ? mw : 60; Language language = XmlLanguageLoader.Load(grammarPath!); - List words = File - .ReadAllLines(wordsPath!) + List words = File.ReadAllLines(wordsPath!) 
.Select(w => w.Trim()) .Where(w => w.Length > 0) .Take(maxWords) @@ -205,7 +215,9 @@ private static long TimeParse(string label, List words, Func fst = new FstTemplateAnalyzer(Language), "an unbuildable slot must degrade, not throw"); - Assert.That(fst!.CoversAllConstructs, Is.False, "reduplication slot → grammar not fully covered (won't certify)"); + Assert.DoesNotThrow( + () => fst = new FstTemplateAnalyzer(Language), + "an unbuildable slot must degrade, not throw" + ); + Assert.That( + fst!.CoversAllConstructs, + Is.False, + "reduplication slot → grammar not fully covered (won't certify)" + ); Assert.That(fst!.AnalyzeWord("sag"), Is.Not.Empty, "the rest of the grammar still analyzes"); Morphophonemic.AffixTemplates.Remove(t); diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorTests.cs index fe63a53a..dd8a8e80 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/GrammarFstAdvisorTests.cs @@ -99,12 +99,7 @@ public void Analyze_TrueInfix_FlaggedEscape() Pattern.New("1").Annotation(any).Value, Pattern.New("2").Annotation(any).OneOrMore.Value, }, - Rhs = - { - new CopyFromInput("1"), - new InsertSegments(Table3, "a"), - new CopyFromInput("2"), - }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "a"), new CopyFromInput("2") }, } ); Morphophonemic.MorphologicalRules.Add(infix); @@ -130,11 +125,7 @@ public void Analyze_HarmonyRewrite_StaysEscapeButIsRegular() // A vowel-harmony-style rewrite: bounded LHS/RHS, but an UNBOUNDED left environment // ("...anything... ___"). By Kaplan & Kay this is a regular relation, but in today's // engine it un-applies at many positions and is slow. 
- var harmony = new RewriteRule - { - Name = "harmony", - Lhs = Pattern.New().Annotation(any).Value, - }; + var harmony = new RewriteRule { Name = "harmony", Lhs = Pattern.New().Annotation(any).Value }; harmony.Subrules.Add( new RewriteSubrule { diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenTests.cs index aa78b605..fb9d1590 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorphTokenTests.cs @@ -26,8 +26,8 @@ public void Encode_RoundTripsOpAndMorphemeId() [Test] public void Encode_IdOutOfRange_Throws() { - Assert.Throws( - () => MorphToken.Encode(MorphOp.Root, MorphToken.MaxMorphemeId + 1) + Assert.Throws(() => + MorphToken.Encode(MorphOp.Root, MorphToken.MaxMorphemeId + 1) ); Assert.Throws(() => MorphToken.Encode(MorphOp.Root, -1)); } diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs index 73abcfe6..f129e5ec 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs @@ -73,7 +73,11 @@ public void Verified_YieldsGenuineEngineAnalyses_WithCategory() foreach (WordAnalysis a in fromVerified) { Assert.That(a.Category, Is.Not.Null, $"verified analysis of {word} lost its category"); - Assert.That(fromSearch, Does.Contain(a), $"verified analysis of {word} is not a genuine engine analysis"); + Assert.That( + fromSearch, + Does.Contain(a), + $"verified analysis of {word} is not a genuine engine analysis" + ); } } } @@ -107,7 +111,11 @@ public void Verified_ParallelMatchesSequential() Dictionary sequential = corpus.Distinct().ToDictionary(w => w, w => SigSet(verified, w)); var parallel = new ConcurrentDictionary(); Parallel.ForEach(corpus, w => parallel[w] = SigSet(verified, w)); - 
Assert.That(corpus.Distinct().All(w => parallel[w] == sequential[w]), Is.True, "concurrent analyses diverged from sequential"); + Assert.That( + corpus.Distinct().All(w => parallel[w] == sequential[w]), + Is.True, + "concurrent analyses diverged from sequential" + ); } private static string Sig(WordAnalysis a) => From 7b8fc3c395554f1cbf7a253f4fbb1806e098a28a Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 27 Jun 2026 09:31:22 -0400 Subject: [PATCH 05/13] HC FST Solution 1: surface-allomorph precompile for altered bare roots MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let the FST proposer match phonologically-altered surfaces, the C-internal tier of Solution 1 (surface-allomorph precompile, docs/FST_FULL_COVERAGE_PLAN.md Appendix C). For each root the grammar allows to stand bare, build a proposer arc not just for the underlying shape but for every bare surface realization HC synthesizes (phonology applied) — reusing the obligatoriness GenerateWords call, so zero extra build cost. The emitted token is always the underlying morpheme; verify re-runs HC with real phonology to confirm. - FstTemplateAnalyzer: _bareRootValid -> _bareRootSurfaces; add BareRootSurfaces, UnderlyingForm, BuildRootChainFromSurface. Underlying arcs kept (union), so the 0-phonology path is unchanged. Fix a latent verify bug this exposed: AnalysisRewriteRule/AnalysisMetathesisRule gate on Morpher.RuleSelector, and FstReplay pinned the selector to just the candidate's morphological rules — silently disabling ALL phonology during verify. The propose-and-verify spine could therefore never confirm any phonologically- altered candidate. Phonological rules are obligatory deterministic rewrites, not a fan-out choice, so FstReplay now always lets IPhonologicalRule through; the morphological fan-out is still collapsed by gating the leaf rules + root, and soundness is still enforced by the unchanged candidate-signature match. 
Add Verified_CoversPhonologicallyAlteredBareRoot: an unconditional t->d rule makes bare root "dat" surface only as "dad"; a baseline assertion proves the underlying- only proposer misses "dad", the surface-precompile proposer covers it, verify confirms it as a genuine HC analysis, and a non-word still yields nothing. Full HermitCrab suite green (97 passed). Co-Authored-By: Claude Opus 4.8 --- docs/FST_FULL_COVERAGE_PLAN.md | 75 ++++++++++++++++ .../FstReplay.cs | 12 ++- .../FstTemplateAnalyzer.cs | 89 +++++++++++++++---- .../VerifiedFstAnalyzerTests.cs | 48 ++++++++++ 4 files changed, 203 insertions(+), 21 deletions(-) diff --git a/docs/FST_FULL_COVERAGE_PLAN.md b/docs/FST_FULL_COVERAGE_PLAN.md index d1a8fd17..f4f7386d 100644 --- a/docs/FST_FULL_COVERAGE_PLAN.md +++ b/docs/FST_FULL_COVERAGE_PLAN.md @@ -210,3 +210,78 @@ Kaplan & Kay 1994 (regular relations; closure under composition → phonology, s compounding); Dolatian & Heinz 2020 (2-way FSTs compute reduplication; 1-way cannot); Chandlee 2017 (subregular morphology; partial reduplication is local/regular); Beesley & Karttunen 2003 (compile- replace for bounded reduplication). + +--- + +## Appendix C — Solution 1 implementation plan (surface-allomorph precompile) + +**Goal.** Let the proposer match phonologically-altered surfaces by building its arcs from each +morpheme's **surface** realizations (phonology applied forward), not only its underlying shape. Stay a +sound **superset** (never miss a real candidate) and lean on verify to prune. This lifts the proposer +from "0-phonology grammars only" toward real grammars. + +**Why it's sound + bounded.** The proposer only nominates `(root + rules)` sets; verify re-runs HC with +real phonology and checks the surface, so extra/wrong surface variants are pruned. The only obligation +is *completeness of the variant set*: every surface a morpheme can take must be an arc. 
The harmony / +subregular literature (Heinz/TSL; Yawelmani ≈ 21-state FST) shows attested phonology gives each +morpheme a **small** variant set (single digits to low tens), so the FST grows by a small constant +factor, not combinatorially. Pathological blow-up is theoretical, not attested; such grammars fall back +to the engine via the certification interlock. + +**Algorithm.** +1. For each morpheme shape (root allomorph segments; affix `InsertSegments` segments), compute its + **surface variant set** = { underlying } ∪ { phonology(shape) under each bounded context }. +2. Build the proposer's segment arcs from the **union** of variants (same `(op, morpheme)` token on + every variant — the token is the underlying morpheme; the arcs are surface). Interweaving is free: + the walk picks each morpheme's variant independently. +3. Verify prunes invalid variant combinations. + +**Three tiers of "context", implemented incrementally:** +- **C-internal (first cut):** apply the grammar's phonological rules to the morpheme shape *in + isolation* (with word-edge anchors). Covers morpheme-internal + edge alternations (e.g. root-internal + aspiration). Sound for those; misses cross-boundary effects. +- **C-boundary (next):** over-approximate the neighbor context — apply rules with each natural-class + boundary segment on each side — so boundary-conditioned variants (assimilation across a morpheme + seam) are included. Still bounded (variants × small context set). +- **C-exact (endgame = Solution 3):** compose the full phonology transducer. Solution 1 is its + per-morpheme approximation; this is a smooth upgrade, not a throwaway. + +**How to apply phonology forward to a shape (reuse HC, do not reimplement):** compile each stratum's +`PhonologicalRules` via `prule.CompileSynthesisRule(morpher)` into a `LinearRuleCascade` (exactly what +`SynthesisStratumRule._prulesRule` does), build a `Word` from the morpheme shape, `Apply` the cascade, +read the surface shape(s). 
(Or, for bare-standing roots, `Morpher.GenerateWords(root, ∅, ∅)` returns +the surface directly — the safe minimal version.) + +**Soundness guards (must hold):** +- Keep the underlying arcs too (union), so the 0-phonology path is unchanged. +- Only ROOT-INTERNAL/edge variants are claimed by the first cut; anything cross-boundary that the cut + misses must keep the grammar from certifying (the parity gate already enforces this — a missed + variant shows up as FST≠engine, so the grammar won't certify and those words ride the engine). +- The token emitted is always the underlying morpheme; verify (which runs real phonology) confirms. + +**Explosion control:** dedup variants per morpheme by surface string; cap variants-per-morpheme with a +budget; if exceeded, drop the surface-precompute for that morpheme (fall back to underlying + engine) — +never explode, only degrade coverage. + +**Test strategy:** construct a minimal phonology grammar (a feature-changing rewrite rule, e.g. a root +that aspirates), show the *current* proposer misses the altered surface (under-generates), the +surface-precompile proposer covers it, and verify keeps it sound (0 false positives on non-words). + +### Result (shipped — C-internal tier, bare roots) + +Implemented the safe minimal version: `BareRootSurfaces` reuses the obligatoriness `GenerateWords` +call to get a root's bare surface realizations, and `BuildRootChainFromSurface` adds a proposer arc for +every realization ≠ the underlying form (same underlying-morpheme token). Zero extra build cost. + +**Latent verify bug this exposed (fixed).** `AnalysisRewriteRule.Apply` / `AnalysisMetathesisRule.Apply` +gate on `Morpher.RuleSelector`. `FstReplay`'s restricted re-analysis pinned the selector to *just the +candidate's morphological rules* — which silently disabled **all phonology** during verify. 
So before +this fix the propose-and-verify spine could never confirm *any* phonologically-altered candidate +(verify couldn't un-apply phonology to reduce the surface back to the root). Phonological rules are +obligatory deterministic rewrites, not a fan-out choice, so `FstReplay` now always lets +`IPhonologicalRule` through the selector; the morphological fan-out is still collapsed by gating the +leaf morphological rules + root, and soundness is still enforced by the candidate-signature match. + +Verified end-to-end by `Verified_CoversPhonologicallyAlteredBareRoot` (an unconditional t→d rule makes +bare root "dat" surface only as "dad"; the proposer now matches "dad" and verify confirms it as a +genuine HC analysis, while a non-word still yields nothing). Full HermitCrab suite green (97 passed). diff --git a/src/SIL.Machine.Morphology.HermitCrab/FstReplay.cs b/src/SIL.Machine.Morphology.HermitCrab/FstReplay.cs index 9e8f2f80..3243be02 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/FstReplay.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/FstReplay.cs @@ -47,11 +47,15 @@ public static WordAnalysis Confirm(MorpherPool pool, WordAnalysis candidate, str Morpher morpher = pool.Rent(); try { - // Pin HC to this candidate's path: only this root, only its rules (templates and strata - // stay open — they are containers the path threads through; gating the leaf rules + root - // is what collapses the fan-out). + // Pin HC to this candidate's path: only this root, only its morphological rules. + // Templates and strata stay open (they are containers the path threads through), and + // phonological rules ALWAYS stay open — they are obligatory, deterministic rewrites, not + // a fan-out choice, and un-applying them is exactly how a phonologically-altered surface + // (e.g. an FST candidate proposed from a surface allomorph) reduces back to its root. + // Gating only the leaf morphological rules + the root is what collapses the fan-out. 
morpher.LexEntrySelector = e => e == root; - morpher.RuleSelector = r => r is AffixTemplate || r is Stratum || rules.Contains(r); + morpher.RuleSelector = r => + r is AffixTemplate || r is Stratum || r is IPhonologicalRule || rules.Contains(r); var ids = new Dictionary(); string target = Signature(candidate, ids); diff --git a/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs index 6baa1ffa..e4ca2fe5 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs @@ -41,7 +41,7 @@ public class FstTemplateAnalyzer : IMorphologicalAnalyzer private readonly CharacterDefinitionTable _table; private readonly Func, bool> _filter; private readonly int _maxStates; - private readonly Func _bareRootValid; + private readonly Func> _bareRootSurfaces; private readonly List _derivSuffixRules = new List(); private readonly List _derivPrefixRules = new List(); private int _stateCount; @@ -71,25 +71,26 @@ public class FstTemplateAnalyzer : IMorphologicalAnalyzer /// Build without obligatoriness: every root may stand bare (fine for toy grammars). public FstTemplateAnalyzer(Language language, int maxStates = 1_000_000, int derivDepth = 2) - : this(language, _ => true, maxStates, derivDepth) { } + : this(language, root => new[] { UnderlyingForm(root) }, maxStates, derivDepth) { } /// - /// Build with obligatory-inflection enforcement: a root may stand bare only if synthesizing - /// it bare actually yields its surface (HC's own finality/validity check). This removes the - /// "bare-root" over-generation in always-inflected grammars (e.g. Bantu), where a root that - /// must take a class/agreement affix should not surface alone. + /// Build with obligatory-inflection enforcement AND surface-allomorph precompile (§C): a root's + /// bare surface realizations are obtained by synthesizing it bare (HC's own finality check). 
If + /// synthesis returns nothing, the bare reading is suppressed (obligatory inflection); if it + /// returns a phonologically-ALTERED surface, the proposer builds an arc for that surface so a + /// phonologically-altered bare root is matched (not just the underlying form). /// public FstTemplateAnalyzer(Language language, Morpher morpher, int maxStates = 1_000_000, int derivDepth = 2) - : this(language, root => BareRootValid(morpher, root), maxStates, derivDepth) { } + : this(language, root => BareRootSurfaces(morpher, root), maxStates, derivDepth) { } private FstTemplateAnalyzer( Language language, - Func bareRootValid, + Func> bareRootSurfaces, int maxStates, int derivDepth ) { - _bareRootValid = bareRootValid; + _bareRootSurfaces = bareRootSurfaces; _maxStates = maxStates; _derivDepth = derivDepth; _table = language.SurfaceStratum.CharacterDefinitionTable; @@ -134,13 +135,31 @@ int derivDepth } } - // Bare-root paths — only for roots the grammar allows to stand uninflected. + // Bare-root paths — only for roots the grammar allows to stand uninflected. Surface-allomorph + // precompile (§C): build a chain for the underlying form AND for each phonologically-altered + // bare surface realization, so an altered bare root is matched. The emitted token is always + // the underlying root morpheme; verify re-runs HC (with real phonology) to confirm. 
foreach (RootRef root in roots) { - if (_bareRootValid(root.Allomorph)) + IReadOnlyCollection surfaces = _bareRootSurfaces(root.Allomorph); + if (surfaces.Count == 0) { - State end = BuildRootChain(_start, root.Allomorph); - end.IsAccepting = true; + continue; // bare root not valid (obligatory inflection) + } + State end = BuildRootChain(_start, root.Allomorph); + end.IsAccepting = true; + string underlying = UnderlyingForm(root.Allomorph); + foreach (string s in surfaces) + { + if (s == underlying) + { + continue; // already built from the underlying shape + } + State surfaceEnd = BuildRootChainFromSurface(_start, s, root.Allomorph.Morpheme); + if (surfaceEnd != null) + { + surfaceEnd.IsAccepting = true; + } } } @@ -472,16 +491,29 @@ MorphOp op /// its own surface form. If the grammar makes a bare stem non-final (obligatory inflection), /// synthesis returns nothing and the bare reading is correctly suppressed. /// - private static bool BareRootValid(Morpher morpher, RootAllomorph root) + private static string UnderlyingForm(RootAllomorph root) + { + return root.Segments.Representation.Normalize(System.Text.NormalizationForm.FormD); + } + + /// + /// The bare-root surface realizations: the surface forms HC synthesizes for the root with no + /// affixes (phonology applied). Empty ⇒ the bare root is not a valid word (obligatory + /// inflection). A form ≠ the underlying representation is a phonologically-altered surface the + /// proposer must match (Solution 1, §C). Reuses the same GenerateWords call the obligatoriness + /// check needed, so it is zero extra build cost. 
+ /// + private static IReadOnlyCollection BareRootSurfaces(Morpher morpher, RootAllomorph root) { if (!(root.Morpheme is LexEntry entry)) { - return true; + return new[] { UnderlyingForm(root) }; } - string surface = root.Segments.Representation.Normalize(System.Text.NormalizationForm.FormD); return morpher .GenerateWords(entry, System.Linq.Enumerable.Empty(), new FeatureStruct()) - .Any(g => g.Normalize(System.Text.NormalizationForm.FormD) == surface); + .Select(g => g.Normalize(System.Text.NormalizationForm.FormD)) + .Distinct() + .ToList(); } private static FeatureStruct RequiredCategory(MorphemicMorphologicalRule rule) @@ -638,6 +670,29 @@ private State BuildRootChain(State from, Roo return state; } + /// Build a root chain from a surface STRING (a phonologically-altered realization), + /// segmenting it via the table; the chain ends in the underlying root morpheme's token. Returns + /// null if the surface has a segment outside the table. + private State BuildRootChainFromSurface(State from, string surface, Morpheme morpheme) + { + Shape shape; + try + { + shape = _table.Segment(surface); + } + catch (InvalidShapeException) + { + return null; + } + State state = from; + foreach (FeatureStruct fs in GetSegments(shape)) + { + state = AddArc(state, fs); + } + _tokenOnEntry[state] = MorphToken.Encode(MorphOp.Root, _codec.GetOrAddIndex(morpheme)); + return state; + } + private static bool CategoryMatches(FeatureStruct rootCategory, FeatureStruct required) { if (required == null || required.IsEmpty) diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs index f129e5ec..e54e3282 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs @@ -4,6 +4,7 @@ using SIL.Machine.FeatureModel; using SIL.Machine.Matching; using 
SIL.Machine.Morphology.HermitCrab.MorphologicalRules; +using SIL.Machine.Morphology.HermitCrab.PhonologicalRules; namespace SIL.Machine.Morphology.HermitCrab; @@ -118,6 +119,53 @@ public void Verified_ParallelMatchesSequential() ); } + [Test] + public void Verified_CoversPhonologicallyAlteredBareRoot() + { + // Surface-allomorph precompile (§C): an unconditional t→d rule means the underlying bare root + // "dat" (entry 8) can ONLY surface as "dad". The old proposer (underlying arcs) misses it — its + // "t" arc can't match surface "d", and BareRootValid rejected it (it doesn't surface as itself). + // The surface-precompile builds an arc from the actual generated surface ("dad"), so the altered + // bare root is now matched. Confirmed via probe: gen dat(8)→dad, and "dad" analyzes while "dat" + // no longer does. + var tToD = new RewriteRule + { + Name = "t_to_d", + Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, + }; + tToD.Subrules.Add( + new RewriteSubrule { Rhs = Pattern.New().Annotation(Character(Table1, "d")).Value } + ); + Surface.PhonologicalRules.Add(tToD); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That( + search.AnalyzeWord("dad").Any(), + Is.True, + "precondition: 'dad' analyzes (bare root 'dat' surfaces as 'dad')" + ); + + // Baseline: the underlying-only proposer (no-morpher ctor builds arcs from underlying shapes) + // misses the altered surface — both "dad" readings are underlying "dat", so it has no "dad" arc. 
+ Assert.That( + new FstTemplateAnalyzer(Language).AnalyzeWord("dad"), + Is.Empty, + "baseline: the underlying-only proposer must miss the phonologically-altered surface" + ); + + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(TraceManager, Language); + AnalysisComparison cmp = FstVerification.Compare(search, verified, new[] { "dad" }); + Assert.That(cmp.IsComplete, Is.True, "altered bare root not covered: " + cmp.Format()); + + Assert.That(verified.AnalyzeWord("zzz"), Is.Empty, "soundness: a non-word must still yield nothing"); + } + finally + { + Surface.PhonologicalRules.Remove(tToD); + } + } + private static string Sig(WordAnalysis a) => string.Join("+", a.Morphemes.Select(m => (m as Morpheme)?.Gloss ?? "?")) + ":" + a.RootMorphemeIndex; From b81ccad904eb5c95f836c6a20f8e26bc86abe13b Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 27 Jun 2026 09:51:09 -0400 Subject: [PATCH 06/13] HC FST: plan to close the coverage gap (phonology, infix, reduplication) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FST_FULL_PLAN.md — implementation plan for the four expansion points. The propose-and-verify split means correctness lives in verify + certification, never in the proposer, so coverage expansion can only change the acceleration ratio, never produce a wrong answer. Architecture: a CompositeProposer unions candidate generators (FST + reduplication + infix scanners) into the one verify gate. - Point 2 (infix) and Point 3 (reduplication): bounded candidate generators that strip/remove their material and RECURSE the residual through the FST proposer (so inflected reduplicants / infixed forms are covered), feeding the verify gate. - Point 1 (all phonology): affix surface-precompile + C-boundary neighbor context, extending the shipped bare-root C-internal tier. 
- Point 4 (C-exact composition): design recorded + deferred with rationale — it is a spine redesign (token side-table -> transducer outputs) whose only marginal gain over C-boundary is rare cross-boundary opacity that already falls back to the engine correctly. C-boundary subsumes its practical value. Co-Authored-By: Claude Opus 4.8 --- docs/FST_FULL_PLAN.md | 162 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 docs/FST_FULL_PLAN.md diff --git a/docs/FST_FULL_PLAN.md b/docs/FST_FULL_PLAN.md new file mode 100644 index 00000000..eefcb047 --- /dev/null +++ b/docs/FST_FULL_PLAN.md @@ -0,0 +1,162 @@ +# FST_FULL_PLAN — closing the coverage gap (phonology, infixation, reduplication) + +Implementation plan for expanding the propose-and-verify FST accelerator to cover **all attested +phonology**, **all infixation**, and **bounded reduplication**. Companion to +`FST_FULL_COVERAGE_PLAN.md` (the construct audit) and `HERMITCRAB_FST_PLAN.md` (the spine design). + +## The principle that makes this safe + +The propose-and-verify split puts **all correctness in verify + certification, none in the proposer**. +The proposer's only job is to emit a *sound superset* of candidates fast; `VerifiedFstAnalyzer` re-runs +HC (real analysis + synthesis, real phonology) on each candidate and discards any HC does not confirm; +the empirical parity gate (`FstVerification.Compare`) certifies a grammar only when FST≡engine on the +corpus. + +Consequence: **expansion can never produce a wrong answer.** A new candidate generator that +under-generates simply accelerates fewer words (parity gate → engine fallback); one that over-generates +has its junk pruned by verify. Correctness is invariant; only the *acceleration ratio* moves. So we can +add coverage aggressively. + +This reframes "can an FST represent X?" 
into **"can we cheaply enumerate a superset of candidates for X +that verify then prunes?"** — which decouples coverage from FST-representability and lets non-regular +constructs (full reduplication) be handled *beside* the FST by bounded generators feeding the same gate. + +## Architecture: a composite of candidate generators + +``` + ┌─────────────────────────────────────────┐ + surface word ───▶│ CompositeProposer (union + dedup) │ + │ ├─ FstTemplateAnalyzer (regular bulk) │ + │ ├─ ReduplicationProposer (strip + recurse) + │ └─ InfixProposer (remove + recurse) + └───────────────┬───────────────────────────┘ + │ candidate (root+rules) sets + ▼ + ┌─────────────────────────────────────────┐ + │ VerifiedFstAnalyzer (FstReplay verify) │ ── discards anything HC won't confirm + └───────────────┬───────────────────────────┘ + ▼ genuine HC analyses +``` + +`VerifiedFstAnalyzer` already wraps an `IMorphologicalAnalyzer` proposer, so the only new plumbing is a +`CompositeProposer : IMorphologicalAnalyzer` that unions + dedups candidates from several generators. + +**Three invariants every generator must respect** (learned before building, not after): + +1. **Recurse the residual through the FST proposer — never propose a flat root.** A reduplicated or + infixed surface can have an *inflected/affixed* base: `"wakaswakas"` is REDUP of inflected `"wakas"`, + not bare `"waka"`. So a generator strips/removes its own material, then calls the FST proposer on the + remainder, and wraps each returned analysis with its morpheme. Terminates: the residual is strictly + shorter, reduplication bounded to 1–2 copies, infixation to 1 site per pass. +2. **Dedup before verify.** Two generators (or a generator and the FST) can propose the same morpheme + set → verify would confirm it twice → duplicate analyses. `CompositeProposer` dedups by candidate + signature before the gate. +3. 
**The coverage signal must reflect the composite.** `FstTemplateAnalyzer.CoversAllConstructs` trips + `false` on a redup/infix slot. Once a sibling generator covers that construct, certification must see + the *composite's* coverage, not just the FST's — else the grammar won't certify and the now-covered + words stay on the engine. The parity gate keeps results correct regardless; this only governs whether + acceleration kicks in. + +--- + +## Point 2 — Infixation (regular; in-scope) + +Infixation splits the root and inserts the affix inside it (Tagalog `-um-`: sulat → s‹um›ulat). It is a +regular operation; the proposer already *recognizes* infix slots (`MorphTokenCodec.ClassifyOp → Infix`) +but skips them (`_hasUnbuiltConstructs = true`). + +**Generator (`InfixProposer`).** For each infix rule and each candidate insertion site in the surface: +remove the infix's surface segments at that site, recurse the remainder through the FST proposer, wrap +each analysis with the infix morpheme. Sound-superset shortcut: try every segment boundary the rule's +partition pattern allows (or over-approximate to all boundaries) — verify prunes the wrong splits. +`O(surface-length × infixes)` candidates — bounded. Composed with surface-precompile it also handles +infixes that trigger phonology. + +**Soundness.** Verify re-synthesizes `base + infix` and surface-matches; a wrong split won't confirm. +**Test.** A grammar with one infix rule; show the FST alone misses the infixed surface, `InfixProposer` +covers it, verify rejects a non-word. + +--- + +## Point 3 — Reduplication (non-regular; handled beside the FST) + +Full reduplication (copy the whole base, `ww`) is the one provably non-regular construct — an FST cannot +represent it. It doesn't need to: a bounded **string-repetition scanner** contributes candidates to the +same verify gate. 
+ +**Generator (`ReduplicationProposer`).** Scan the surface for an adjacent repeated substring matching a +reduplication template (full-copy `XX`; partial CV-copy as a later refinement). For each detected +repetition: strip one copy, recurse the remainder through the FST proposer, wrap each analysis with the +reduplication morpheme. **Bound to 1–2 applications** (the "once or twice") — finite, tiny candidate set. +`O(n²)` scan per word, trivial. + +**Soundness.** A coincidental repeat (`"murmur"` that is not actually reduplicated) is proposed but +pruned because HC synthesis of `base + REDUP` won't reproduce it. **"Well enough for 99.9%":** the 1–2 +bound covers essentially all attested reduplication; triple/unboundedly-interacting reduplication +doesn't certify and rides the engine (still correct). +**Test.** A grammar with a full-reduplication rule; show the FST alone misses `"wakawaka"`, the composite +covers it (including an inflected reduplicant via the recursion), verify rejects a non-reduplicated word. + +--- + +## Point 1 — All phonology: affix surface-precompile + C-boundary (in-scope, incremental) + +The shipped C-internal tier handles **bare-root** alternation via `GenerateWords`. Two extensions: + +**1a. Affix surface-precompile.** Build affix arcs from each affix allomorph's *surface* segments, not +only the underlying `InsertSegments`. Forward-application helper: compile the stratum's +`PhonologicalRules` via `prule.CompileSynthesisRule(morpher)` into a `LinearRuleCascade` (exactly what +`SynthesisStratumRule._prulesRule` does), wrap the affix segments in a `Word`, `Apply`, read the surface +shape(s). An affix's surface depends on stem context, so this is fiddlier than the bare-root case — +**validate on one minimal affix-triggered alternation first**, then generalize. + +**1b. 
C-boundary context.** Over-approximate the neighbor: apply rules with each natural-class boundary +segment on each side, so boundary-conditioned variants (assimilation across a seam) are included. Bound +the variant count per morpheme (cap + drop-to-underlying fallback) so a long-distance harmony grammar +degrades rather than explodes. + +**Soundness.** Underlying arcs are kept (union), so the 0-phonology path is unchanged; the token is +always the underlying morpheme; verify confirms with real phonology; a missed variant shows up as +FST≠engine → no certify → engine (never wrong). +**Test.** A rewrite rule altering an affix's surface; show the underlying-only proposer misses it, the +surface-precompile proposer covers it, verify stays sound. + +--- + +## Point 4 — C-exact (full phonology composition): design only, deferred + +**Goal.** Compose the morphotactic FST with the full phonology transducer (Kaplan & Kay: bounded rewrite +rules are regular relations, closed under composition), giving complete coverage of all *attested* +(non-cyclic) phonology — including the cross-boundary opaque interactions the per-morpheme C-boundary +tier can miss. + +**Concrete design.** (1) Re-architect token emission from the `_tokenOnEntry` side-table into FST +**output labels**, so the proposer is a genuine transducer surface→token-string. (2) Build the phonology +transducer by composing each stratum's compiled phonological rules (the in-repo `Fst.Compose` exists, +line ~1887). (3) Compose `phonology⁻¹ ∘ morphotactics` so the machine maps surface directly to the +underlying token string. + +**Why deferred (engineering-correct, not a dodge).** +- It is a **redesign of the working spine** (token-emission → transducer outputs), high-risk relative to + its marginal value. 
+- The only thing it buys over Point 1's C-boundary tier is **cross-boundary opaque interaction**, which + is *rare in attested grammars and already produces correct answers via engine fallback* (the parity + gate refuses to certify, so those words ride the slow path — slower, never wrong). +- **Point 1 (C-boundary) subsumes essentially all of Point 4's attested-language value.** Points 1 and 4 + are the same axis (phonology coverage) at two tiers; doing 1 well delivers the practical payoff. + +So Point 4 ships as this design + rationale; the residual it would accelerate is exactly the set the +parity gate keeps correct on the engine today. + +--- + +## Order of work & status + +1. ☐ `CompositeProposer` plumbing (union + dedup + coverage-signal) — established by the first generator. +2. ☐ Point 3 Reduplication **or** Point 2 Infixation first (most self-contained; establishes plumbing). +3. ☐ The other of {infix, reduplication}. +4. ☐ Point 1 affix surface-precompile + C-boundary. +5. ☑ Point 4 design recorded (deferred, with rationale). + +Commit + test after each point; do not batch. Each generator's test must show (a) the FST alone misses +the construct, (b) the composite covers it, (c) verify still rejects a non-word. From 8e9b6f500df4e4b9facf5b77f7e943c82324f190 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 27 Jun 2026 10:01:27 -0400 Subject: [PATCH 07/13] HC FST Point 3: full reduplication via a composable candidate generator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduplication (copy the whole base, surface = base·base) is the one provably non-regular construct — an FST cannot represent it. Handle it BESIDE the FST: a bounded candidate generator feeds the same propose-and-verify gate, so it is sound without being regular. 
- CompositeProposer: unions several proposers (FST + generators) into one IMorphologicalAnalyzer, deduping candidates by order-sensitive morpheme-identity signature before the verify gate. Aggregates coverage at the MorphOp level (CoversAllConstructs = FST's uncovered ops minus what generators cover) so a grammar can certify once a sibling generator covers the FST's skipped construct. New IConstructProposer interface lets a generator declare its covered ops. - ReduplicationProposer (IConstructProposer): detects an adjacent doubling X·X, strips one copy, RECURSES the residual through the FST proposer (so an inflected reduplicant is covered, not just a bare root), and appends the reduplication morpheme in HC application order (root·…·RED). A coincidental doubling is pruned by verify (HC synthesis won't reproduce it). - FstTemplateAnalyzer: replace the _hasUnbuiltConstructs bool with an _uncoveredOps set (records WHICH MorphOp was skipped — slot rules, in-slot affixes, and standalone morphological rules); expose UncoveredOps. CoversAllConstructs == (UncoveredOps empty). Test: a full-reduplication grammar; the FST alone misses "sagsag" (and reports not-fully-covered), the composite covers it (and reports covered), verify confirms the genuine HC analysis, and a non-word still yields nothing. Full suite green (98). 
Co-Authored-By: Claude Opus 4.8 --- .../CompositeProposer.cs | 92 +++++++++++++++++++ .../FstTemplateAnalyzer.cs | 42 +++++++-- .../ReduplicationProposer.cs | 92 +++++++++++++++++++ .../VerifiedFstAnalyzerTests.cs | 47 ++++++++++ 4 files changed, 263 insertions(+), 10 deletions(-) create mode 100644 src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs create mode 100644 src/SIL.Machine.Morphology.HermitCrab/ReduplicationProposer.cs diff --git a/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs b/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs new file mode 100644 index 00000000..aa5b408d --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs @@ -0,0 +1,92 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Morphology; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Unions the candidate sets of several proposers into one (FST_FULL_PLAN.md). The FST proposer + /// () covers the regular bulk; sibling generators + /// (, ) contribute candidates for + /// constructs the FST skips. Every candidate still flows through the single + /// verify gate, so the composite is sound by the same argument as + /// each part — a generator that over-generates has its junk pruned, one that under-generates only + /// loses acceleration (the parity gate falls those words back to the engine). + /// + /// Candidates are deduped by signature before they leave the composite: when two generators + /// (or a generator and the FST) propose the same morpheme set, verify would otherwise confirm it + /// twice and emit a duplicate analysis. The signature is order-sensitive morpheme identity + root + /// index, mirroring 's match semantics. + /// + /// aggregates coverage at the MorphOp level: the FST's uncovered + /// ops minus the ops the sibling generators cover. 
It is the cheap build-time signal certification + /// pairs with the empirical parity gate; op-level optimism here is safe because parity is the real + /// arbiter (a generator that covers an op only partially fails parity and the grammar is not + /// certified). + /// + public class CompositeProposer : IMorphologicalAnalyzer + { + private readonly IReadOnlyList _proposers; + private readonly bool _coversAllConstructs; + + public CompositeProposer(FstTemplateAnalyzer fst, params IConstructProposer[] generators) + { + var proposers = new List { fst }; + var covered = new HashSet(); + foreach (IConstructProposer generator in generators) + { + proposers.Add(generator); + foreach (MorphOp op in generator.CoveredOps) + { + covered.Add(op); + } + } + _proposers = proposers; + _coversAllConstructs = fst.UncoveredOps.All(covered.Contains); + } + + /// True iff every construct the FST proposer skipped is claimed by a sibling generator. + /// Paired with the empirical parity gate for certification (see class remarks). + public bool CoversAllConstructs => _coversAllConstructs; + + public IEnumerable AnalyzeWord(string word) + { + var ids = new Dictionary(); + var seen = new HashSet(); + foreach (IMorphologicalAnalyzer proposer in _proposers) + { + foreach (WordAnalysis candidate in proposer.AnalyzeWord(word)) + { + if (seen.Add(Signature(candidate, ids))) + { + yield return candidate; + } + } + } + } + + /// Order-sensitive morpheme-identity signature (same scheme as ). + private static string Signature(WordAnalysis analysis, Dictionary ids) + { + return string.Join("+", analysis.Morphemes.Select(m => Id(m, ids))) + ":" + analysis.RootMorphemeIndex; + } + + private static int Id(IMorpheme morpheme, Dictionary ids) + { + if (!ids.TryGetValue(morpheme, out int id)) + { + id = ids.Count; + ids[morpheme] = id; + } + return id; + } + } + + /// A candidate generator for a specific non-FST construct (reduplication, infixation). 
It + /// proposes a sound superset for that construct and declares which s it covers + /// so the composite can aggregate the build-time coverage signal. + public interface IConstructProposer : IMorphologicalAnalyzer + { + IReadOnlyCollection CoveredOps { get; } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs index e4ca2fe5..3a9c7f70 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs @@ -45,7 +45,7 @@ public class FstTemplateAnalyzer : IMorphologicalAnalyzer private readonly List _derivSuffixRules = new List(); private readonly List _derivPrefixRules = new List(); private int _stateCount; - private bool _hasUnbuiltConstructs; + private readonly HashSet _uncoveredOps = new HashSet(); /// /// Max stacked derivational affixes modelled per side before inflection (tunable per grammar). @@ -67,7 +67,12 @@ public class FstTemplateAnalyzer : IMorphologicalAnalyzer /// certified (it falls to the engine/cache). The empirical set-parity gate enforces this; this /// flag is the cheap build-time signal of the same fact. /// - public bool CoversAllConstructs => !_hasUnbuiltConstructs; + public bool CoversAllConstructs => _uncoveredOps.Count == 0; + + /// The set of s the build skipped because it cannot model them in + /// the FST (infix/circumfix/reduplication/process). A sibling generator that covers one of these + /// (see ) removes it from the composite's uncovered set. + public IReadOnlyCollection UncoveredOps => _uncoveredOps; /// Build without obligatoriness: every root may stand bare (fine for toy grammars). 
public FstTemplateAnalyzer(Language language, int maxStates = 1_000_000, int derivDepth = 2) @@ -123,7 +128,8 @@ int derivDepth { continue; } - switch (RuleOp(rule)) + MorphOp ruleOp = RuleOp(rule); + switch (ruleOp) { case MorphOp.Suffix: _derivSuffixRules.Add(rule); @@ -131,6 +137,14 @@ int derivDepth case MorphOp.Prefix: _derivPrefixRules.Add(rule); break; + case MorphOp.None: + break; + default: + // A standalone rule the proposer cannot build (reduplication/infix/process). + // Record the op as uncovered so the grammar does not certify unless a sibling + // generator (see CompositeProposer) covers it. + _uncoveredOps.Add(ruleOp); + break; } } } @@ -383,10 +397,17 @@ List suffixSlots break; default: // A slot the proposer cannot build (infix/circumfix/reduplication/process). - // Skip it and flag the grammar as not fully covered — those words fall to the - // engine/cache; the parity gate refuses to certify. (Was a hard throw that - // aborted the whole build.) - _hasUnbuiltConstructs = true; + // Skip it and record the construct op(s) as uncovered — those words fall to the + // engine/cache unless a sibling generator covers the op; the parity gate refuses + // to certify otherwise. (Was a hard throw that aborted the whole build.) + foreach (MorphemicMorphologicalRule rule in slot.Rules) + { + MorphOp ruleOp = RuleOp(rule); + if (ruleOp != MorphOp.Prefix && ruleOp != MorphOp.Suffix && ruleOp != MorphOp.None) + { + _uncoveredOps.Add(ruleOp); + } + } break; } } @@ -628,9 +649,10 @@ FeatureStruct templateCategory if (aop != op && aop != MorphOp.None) { // A rule the proposer can't build in this slot (infix/circumfix/redup/ - // process). Skip it and flag the grammar not-fully-covered; the engine/ - // cache backstop and parity gate handle those words. (Was a hard throw.) - _hasUnbuiltConstructs = true; + // process). Skip it and record the op as uncovered; the engine/cache backstop + // and parity gate handle those words unless a sibling generator covers the op. 
+ // (Was a hard throw.) + _uncoveredOps.Add(aop); continue; } // aop == op (normal affix) or aop == None (a true zero-segment affix: no diff --git a/src/SIL.Machine.Morphology.HermitCrab/ReduplicationProposer.cs b/src/SIL.Machine.Morphology.HermitCrab/ReduplicationProposer.cs new file mode 100644 index 00000000..cd3d870c --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/ReduplicationProposer.cs @@ -0,0 +1,92 @@ +using System; +using System.Collections.Generic; +using SIL.Machine.Morphology; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// A candidate generator for full reduplication (copy the whole base, surface = base·base) — + /// the one provably non-regular construct (Dolatian & Heinz 2020), handled beside the FST + /// rather than inside it (FST_FULL_PLAN.md, Point 3). It does not need to be regular because the + /// gate re-runs HC to confirm every candidate. + /// + /// Mechanism (strip + recurse): if the surface is an adjacent doubling X·X, strip one copy and + /// recurse the residual X through the FST proposer — so an inflected reduplicant + /// (e.g. REDUP of an affixed stem) is covered, not just a bare root — then wrap each returned base + /// analysis with the reduplication morpheme (appended, matching HC's root·…·RED application order). + /// Bounded to a single full-copy application (the residual is itself analyzed by the FST, which is + /// where any further structure lives); "well enough" for the attested cases, and anything it misses + /// simply fails parity and rides the engine — never a wrong answer. + /// + /// Soundness: a coincidental doubling (a word that merely looks like X·X but is not reduplicated) + /// is proposed but pruned by verify, because HC's synthesis of base + REDUP will not + /// reproduce it.
+ /// + public class ReduplicationProposer : IConstructProposer + { + private static readonly MorphOp[] _ops = { MorphOp.Reduplication }; + private readonly IMorphologicalAnalyzer _baseProposer; + private readonly List _redupRules; + + public ReduplicationProposer(Language language, IMorphologicalAnalyzer baseProposer) + { + _baseProposer = baseProposer; + _redupRules = new List(); + foreach (Stratum stratum in language.Strata) + { + foreach (IMorphologicalRule mrule in stratum.MorphologicalRules) + { + if (mrule is MorphemicMorphologicalRule rule && IsReduplication(rule)) + { + _redupRules.Add(rule); + } + } + } + } + + public IReadOnlyCollection CoveredOps => _ops; + + public IEnumerable AnalyzeWord(string word) + { + // Full-copy detection: an even-length surface whose two halves are identical. The residual + // (one copy) is recursed through the FST proposer so inflected reduplicants are covered. + if (_redupRules.Count == 0 || word.Length < 2 || (word.Length & 1) == 1) + { + yield break; + } + int half = word.Length / 2; + if (!string.Equals(word.Substring(0, half), word.Substring(half), StringComparison.Ordinal)) + { + yield break; + } + string residual = word.Substring(0, half); + foreach (WordAnalysis baseAnalysis in _baseProposer.AnalyzeWord(residual)) + { + foreach (MorphemicMorphologicalRule redup in _redupRules) + { + // Application order: root (and its affixes) then the reduplication rule, matching + // HC's WordAnalysis.Morphemes (root·…·RED), so the root index is unchanged. 
+ var morphemes = new List(baseAnalysis.Morphemes) { redup }; + yield return new WordAnalysis(morphemes, baseAnalysis.RootMorphemeIndex, null); + } + } + } + + private static bool IsReduplication(MorphemicMorphologicalRule rule) + { + if (!(rule is AffixProcessRule affix)) + { + return false; + } + foreach (AffixProcessAllomorph allomorph in affix.Allomorphs) + { + if (MorphTokenCodec.ClassifyOp(allomorph, false) == MorphOp.Reduplication) + { + return true; + } + } + return false; + } + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs index e54e3282..bc130eb1 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs @@ -166,6 +166,53 @@ public void Verified_CoversPhonologicallyAlteredBareRoot() } } + [Test] + public void Composite_CoversFullReduplication_WhereFstAloneMisses() + { + // Point 3: full reduplication (copy the whole stem) is non-regular — the FST cannot represent + // it, but the ReduplicationProposer strips one copy, recurses the residual through the FST, and + // wraps it with the reduplication morpheme; verify confirms it as a genuine HC analysis. 
+ var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var redup = new AffixProcessRule + { + Name = "redup", + Gloss = "RED", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, // copy the stem twice + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That(search.AnalyzeWord("sagsag").Any(), Is.True, "precondition: 'sagsag' = RED('sag')"); + + var fst = new FstTemplateAnalyzer(Language, new Morpher(TraceManager, Language)); + Assert.That(fst.AnalyzeWord("sagsag"), Is.Empty, "baseline: the FST alone cannot represent reduplication"); + Assert.That(fst.CoversAllConstructs, Is.False, "reduplication marks the FST not-fully-covered"); + + var composite = new CompositeProposer(fst, new ReduplicationProposer(Language, fst)); + Assert.That(composite.CoversAllConstructs, Is.True, "the reduplication generator covers the skipped op"); + + var pool = new MorpherPool(() => new Morpher(new TraceManager(), Language)); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(composite, pool); + AnalysisComparison cmp = FstVerification.Compare(search, verified, new[] { "sagsag" }); + Assert.That(cmp.IsComplete, Is.True, "reduplication not covered: " + cmp.Format()); + + Assert.That(verified.AnalyzeWord("zzz"), Is.Empty, "soundness: a non-word must still yield nothing"); + } + finally + { + Morphophonemic.MorphologicalRules.Remove(redup); + } + } + private static string Sig(WordAnalysis a) => string.Join("+", a.Morphemes.Select(m => (m as Morpheme)?.Gloss ?? 
"?")) + ":" + a.RootMorphemeIndex; From 701ead559b2f0a2f31223c6912e6528ffc3b0863 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 27 Jun 2026 10:04:25 -0400 Subject: [PATCH 08/13] HC FST Point 2: infixation via a composable candidate generator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Infixation (an affix inserted inside the stem, e.g. Tagalog -um-) is regular; the FST proposer recognizes but does not build infix slots. Handle it as a sibling generator feeding the same propose-and-verify gate. - InfixProposer (IConstructProposer): for each infix and each interior position where the infix's surface segments occur, remove them and RECURSE the residual through the FST proposer (so an infixed form of an inflected stem is covered), then append the infix morpheme in HC application order (root·…·INF). Over-approximation — every interior occurrence is tried; verify prunes the wrong splits. O(surface-length × infixes) candidates, bounded. - First cut: the infix must be a single contiguous run of inserted segments, matched against its underlying representation. Templatic multi-slot infixes and phonologically-altered infix surfaces are left to the engine (parity gate keeps results correct). Test: an "a"-infix grammar ("sag" -> "saag"); the FST alone misses "saag" (and reports not-fully-covered), the composite covers it (and reports covered), verify confirms the genuine HC analysis, and a non-word still yields nothing. Full suite green (99). 
Co-Authored-By: Claude Opus 4.8 --- .../InfixProposer.cs | 109 ++++++++++++++++++ .../VerifiedFstAnalyzerTests.cs | 52 +++++++++ 2 files changed, 161 insertions(+) create mode 100644 src/SIL.Machine.Morphology.HermitCrab/InfixProposer.cs diff --git a/src/SIL.Machine.Morphology.HermitCrab/InfixProposer.cs b/src/SIL.Machine.Morphology.HermitCrab/InfixProposer.cs new file mode 100644 index 00000000..63795c16 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/InfixProposer.cs @@ -0,0 +1,109 @@ +using System; +using System.Collections.Generic; +using System.Text; +using SIL.Machine.Morphology; +using SIL.Machine.Morphology.HermitCrab.MorphologicalRules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// A candidate generator for infixation (an affix inserted inside the stem, e.g. Tagalog + /// -um-: sulat → s·um·ulat) — a regular construct the FST proposer recognizes but does not build + /// (FST_FULL_PLAN.md, Point 2). Handled here as a sibling generator feeding the same + /// gate. + /// + /// Mechanism (remove + recurse): for each infix and each interior position where the infix's surface + /// segments occur, remove them and recurse the residual through the FST proposer (so an + /// infixed form of an inflected stem is covered), then append the infix morpheme in HC application + /// order (root·…·INF). Over-approximation: every interior occurrence is tried; verify prunes the + /// wrong splits (a wrong removal won't re-synthesize to the surface). `O(surface-length × infixes)` + /// candidates — bounded. + /// + /// Scope (first cut): the infix must be a single contiguous run of inserted segments, matched against + /// its underlying representation. Templatic multi-slot infixes (separate insert runs) and infixes + /// whose surface is phonologically altered are left to the engine (the parity gate keeps results + /// correct — those words simply ride the slow path). 
+ /// + public class InfixProposer : IConstructProposer + { + private static readonly MorphOp[] _ops = { MorphOp.Infix }; + private readonly IMorphologicalAnalyzer _baseProposer; + private readonly List> _infixes; + + public InfixProposer(Language language, IMorphologicalAnalyzer baseProposer) + { + _baseProposer = baseProposer; + _infixes = new List>(); + foreach (Stratum stratum in language.Strata) + { + foreach (IMorphologicalRule mrule in stratum.MorphologicalRules) + { + if (!(mrule is AffixProcessRule rule)) + { + continue; + } + foreach (AffixProcessAllomorph allomorph in rule.Allomorphs) + { + if (MorphTokenCodec.ClassifyOp(allomorph, false) != MorphOp.Infix) + { + continue; + } + string infix = InfixString(allomorph); + if (!string.IsNullOrEmpty(infix)) + { + _infixes.Add(new KeyValuePair(rule, infix)); + } + } + } + } + } + + public IReadOnlyCollection CoveredOps => _ops; + + public IEnumerable AnalyzeWord(string word) + { + foreach (KeyValuePair entry in _infixes) + { + string infix = entry.Value; + // Interior occurrences only: stem material both before (i >= 1) and after the infix; a null or empty word has no interior (and IndexOf with start index 1 would throw on an empty string), so it proposes nothing. + int i = string.IsNullOrEmpty(word) ? -1 : word.IndexOf(infix, 1, StringComparison.Ordinal); + while (i >= 1 && i + infix.Length < word.Length) + { + string residual = word.Remove(i, infix.Length); + foreach (WordAnalysis baseAnalysis in _baseProposer.AnalyzeWord(residual)) + { + var morphemes = new List(baseAnalysis.Morphemes) { entry.Key }; + yield return new WordAnalysis(morphemes, baseAnalysis.RootMorphemeIndex, null); + } + i = word.IndexOf(infix, i + 1, StringComparison.Ordinal); + } + } + } + + /// The infix's inserted material iff it is a single contiguous run of inserted segments; + /// null for templatic multi-slot infixes (left to the engine).
+ private static string InfixString(AffixProcessAllomorph allomorph) + { + var runs = new List(); + StringBuilder current = null; + foreach (MorphologicalOutputAction action in allomorph.Rhs) + { + if (action is InsertSegments insert) + { + current = current ?? new StringBuilder(); + current.Append(insert.Segments.Representation); + } + else if (current != null) + { + runs.Add(current.ToString()); + current = null; + } + } + if (current != null) + { + runs.Add(current.ToString()); + } + return runs.Count == 1 ? runs[0] : null; + } + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs index bc130eb1..6f1514d3 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs @@ -213,6 +213,58 @@ public void Composite_CoversFullReduplication_WhereFstAloneMisses() } } + [Test] + public void Composite_CoversInfixation_WhereFstAloneMisses() + { + // Point 2: infixation (affix inserted inside the stem). The FST recognizes but does not build + // infix slots; the InfixProposer removes the infix's segments at each interior position, recurses + // the residual through the FST, and appends the infix morpheme. Here an "a" is infixed after the + // first segment: "sag" -> "s·a·ag" = "saag". 
+ var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var infix = new AffixProcessRule + { + Name = "a_infix", + Gloss = "INF", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + infix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = + { + Pattern.New("1").Annotation(any).Value, // first segment + Pattern.New("2").Annotation(any).OneOrMore.Value, // rest of stem + }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "a"), new CopyFromInput("2") }, + } + ); + Morphophonemic.MorphologicalRules.Add(infix); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That(search.AnalyzeWord("saag").Any(), Is.True, "precondition: 'saag' = INF('sag')"); + + var fst = new FstTemplateAnalyzer(Language, new Morpher(TraceManager, Language)); + Assert.That(fst.AnalyzeWord("saag"), Is.Empty, "baseline: the FST alone does not build infix slots"); + Assert.That(fst.CoversAllConstructs, Is.False, "infixation marks the FST not-fully-covered"); + + var composite = new CompositeProposer(fst, new InfixProposer(Language, fst)); + Assert.That(composite.CoversAllConstructs, Is.True, "the infix generator covers the skipped op"); + + var pool = new MorpherPool(() => new Morpher(new TraceManager(), Language)); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(composite, pool); + AnalysisComparison cmp = FstVerification.Compare(search, verified, new[] { "saag" }); + Assert.That(cmp.IsComplete, Is.True, "infixation not covered: " + cmp.Format()); + + Assert.That(verified.AnalyzeWord("zzz"), Is.Empty, "soundness: a non-word must still yield nothing"); + } + finally + { + Morphophonemic.MorphologicalRules.Remove(infix); + } + } + private static string Sig(WordAnalysis a) => string.Join("+", a.Morphemes.Select(m => (m as Morpheme)?.Gloss ?? 
"?")) + ":" + a.RootMorphemeIndex; From 0a6ce7d3c69e60b02a4239beda2f462807dbdbf6 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 27 Jun 2026 10:14:01 -0400 Subject: [PATCH 09/13] HC FST Point 1a: affix surface-precompile (C-internal phonology) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the surface-allomorph precompile from bare roots to AFFIXES: build each affix's segment arcs from its underlying form AND each phonologically-altered surface realization, so an affix whose surface differs from its underlying segments (e.g. a suffix devoiced/changed by a rule) is matched by the proposer. - SurfacePhonology: a forward-phonology helper that compiles each stratum's synthesis phonological rules (reusing HC's CompileSynthesisRule, exactly what SynthesisStratumRule runs) and applies them to a segment string in isolation, returning the distinct surface variants (C-internal tier: catches edge- and morpheme-internal alternations; cross-boundary ones ride the engine). - FstTemplateAnalyzer.BuildAffixArcs: shared by both affix-arc sites (derivational layers + template slots) — builds the underlying path plus a path per altered surface variant. Default ctor passes an identity variant function, so the 0-phonology path is byte-identical; the morpher ctor wires SurfacePhonology. Tests: Proposer_CoversPhonologicallyAlteredAffix (a "t" suffix that surfaces only as "d" via t->d: the underlying-only proposer misses "sagd", the surface-precompile proposer covers it, verify stays sound) and SurfacePhonology_AppliesRulesForward. Full suite green (101). FST_FULL_PLAN.md updated with the shipped/deferred matrix. 
Co-Authored-By: Claude Opus 4.8 --- docs/FST_FULL_PLAN.md | 42 ++++++++- .../FstTemplateAnalyzer.cs | 87 ++++++++++++++----- .../SurfacePhonology.cs | 63 ++++++++++++++ .../VerifiedFstAnalyzerTests.cs | 81 +++++++++++++++++ 4 files changed, 247 insertions(+), 26 deletions(-) create mode 100644 src/SIL.Machine.Morphology.HermitCrab/SurfacePhonology.cs diff --git a/docs/FST_FULL_PLAN.md b/docs/FST_FULL_PLAN.md index eefcb047..e53175f9 100644 --- a/docs/FST_FULL_PLAN.md +++ b/docs/FST_FULL_PLAN.md @@ -121,6 +121,26 @@ FST≠engine → no certify → engine (never wrong). **Test.** A rewrite rule altering an affix's surface; show the underlying-only proposer misses it, the surface-precompile proposer covers it, verify stays sound. +### Result (shipped — 1a affix surface-precompile, C-internal tier) + +`SurfacePhonology.Variants(underlying)` compiles each stratum's synthesis phonological rules (reusing +HC's `IPhonologicalRule.CompileSynthesisRule`, exactly what `SynthesisStratumRule` runs) and applies +them to a segment string in isolation, returning the distinct surface forms (always including the +underlying). `FstTemplateAnalyzer.BuildAffixArcs` builds the affix's segment arcs from the underlying +form AND each altered surface variant (shared by both affix-arc sites: derivational layers and template +slots); the default ctor passes an identity variant function so the 0-phonology path is byte-identical. + +Verified by `Proposer_CoversPhonologicallyAlteredAffix` (a suffix inserts "t"; an unconditional t→d +rule makes it surface only as "d", so sag+SUF = "sagt" → "sagd"; the underlying-only proposer builds a +"t" arc and misses "sagd", the surface-precompile proposer builds the "d" arc and verify confirms it) +and `SurfacePhonology_AppliesRulesForwardToASegmentString`. Full suite green (101). + +**Still 1b (C-boundary):** the isolation tier catches edge- and morpheme-internal alternations but not +cross-boundary, stem-conditioned ones (the neighbor context is absent). 
Those surfaces are simply not +precompiled → the word rides the engine via the parity gate (correct, slower). Over-approximating the +neighbor (apply rules with each natural-class boundary segment on each side, bounded + capped) is the +next increment. + --- ## Point 4 — C-exact (full phonology composition): design only, deferred @@ -152,11 +172,25 @@ parity gate keeps correct on the engine today. ## Order of work & status -1. ☐ `CompositeProposer` plumbing (union + dedup + coverage-signal) — established by the first generator. -2. ☐ Point 3 Reduplication **or** Point 2 Infixation first (most self-contained; establishes plumbing). -3. ☐ The other of {infix, reduplication}. -4. ☐ Point 1 affix surface-precompile + C-boundary. +1. ☑ `CompositeProposer` plumbing (union + dedup + coverage-signal) — established with reduplication. +2. ☑ Point 3 Reduplication (full-copy generator; strip + recurse + verify). +3. ☑ Point 2 Infixation (remove + recurse + verify; single-contiguous-infix first cut). +4. ◑ Point 1 affix surface-precompile (1a C-internal shipped; 1b C-boundary still to do) + + bare-root C-internal shipped earlier. 5. ☑ Point 4 design recorded (deferred, with rationale). Commit + test after each point; do not batch. Each generator's test must show (a) the FST alone misses the construct, (b) the composite covers it, (c) verify still rejects a non-word. 
+ +## Summary of what shipped + +| Construct | Tier shipped | Mechanism | Residual / deferred | +|---|---|---|---| +| Bare-root phonology | C-internal | `BareRootSurfaces` (GenerateWords) + verify-allows-phonology | C-boundary | +| Affix phonology | C-internal (1a) | `SurfacePhonology` + `BuildAffixArcs` | C-boundary (1b) | +| Infixation | single contiguous infix | `InfixProposer` (remove + recurse) | templatic multi-slot; altered-surface infix | +| Reduplication | full copy, one application | `ReduplicationProposer` (strip + recurse) | partial/CV copy; 2+ applications | +| Cross-boundary opaque phonology | — | — | Point 4 (C-exact composition), design only | + +Every "residual / deferred" item is covered correctly today by the engine via the parity gate — the +only thing deferred is *acceleration*, never correctness. diff --git a/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs index 3a9c7f70..93f9e0bc 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs @@ -42,6 +42,7 @@ public class FstTemplateAnalyzer : IMorphologicalAnalyzer private readonly Func, bool> _filter; private readonly int _maxStates; private readonly Func> _bareRootSurfaces; + private readonly Func> _affixSurfaces; private readonly List _derivSuffixRules = new List(); private readonly List _derivPrefixRules = new List(); private int _stateCount; @@ -76,7 +77,7 @@ public class FstTemplateAnalyzer : IMorphologicalAnalyzer /// Build without obligatoriness: every root may stand bare (fine for toy grammars). 
public FstTemplateAnalyzer(Language language, int maxStates = 1_000_000, int derivDepth = 2) - : this(language, root => new[] { UnderlyingForm(root) }, maxStates, derivDepth) { } + : this(language, root => new[] { UnderlyingForm(root) }, s => new[] { s }, maxStates, derivDepth) { } /// /// Build with obligatory-inflection enforcement AND surface-allomorph precompile (§C): a root's @@ -86,16 +87,24 @@ public FstTemplateAnalyzer(Language language, int maxStates = 1_000_000, int der /// phonologically-altered bare root is matched (not just the underlying form). /// public FstTemplateAnalyzer(Language language, Morpher morpher, int maxStates = 1_000_000, int derivDepth = 2) - : this(language, root => BareRootSurfaces(morpher, root), maxStates, derivDepth) { } + : this( + language, + root => BareRootSurfaces(morpher, root), + new SurfacePhonology(language, morpher).Variants, + maxStates, + derivDepth + ) { } private FstTemplateAnalyzer( Language language, Func> bareRootSurfaces, + Func> affixSurfaces, int maxStates, int derivDepth ) { _bareRootSurfaces = bareRootSurfaces; + _affixSurfaces = affixSurfaces; _maxStates = maxStates; _derivDepth = derivDepth; _table = language.SurfaceStratum.CharacterDefinitionTable; @@ -489,16 +498,7 @@ MorphOp op State tokenState = NewState(); _tokenOnEntry[tokenState] = token; current.Arcs.Add(tokenState); // epsilon: enter this derivational affix - State s = tokenState; - InsertSegments insert = allomorph.Rhs.OfType().FirstOrDefault(); - if (insert != null) - { - foreach (FeatureStruct fs in GetSegments(insert.Segments.Shape)) - { - s = AddArc(s, fs); - } - } - s.Arcs.Add(after); // epsilon: reconverge + BuildAffixArcs(tokenState, after, allomorph.Rhs.OfType().FirstOrDefault()); } } current = after; @@ -506,6 +506,58 @@ MorphOp op return current; } + /// + /// Build an affix's segment arcs from to : + /// the underlying form AND each phonologically-altered surface realization (surface-allomorph + /// precompile, Point 1, C-internal 
tier), so an affix whose surface differs from its underlying + /// segments (e.g. a suffix that devoices word-finally) is matched. A zero-segment affix (null + /// ) just reconverges. Sound: the underlying path is always built, the + /// emitted token is the underlying morpheme, and verify confirms with real phonology; a variant + /// not actually attested is pruned by verify, a missed cross-boundary variant rides the engine. + /// + private void BuildAffixArcs( + State tokenState, + State after, + InsertSegments insert + ) + { + if (insert == null) + { + tokenState.Arcs.Add(after); // zero/empty-segment affix: token only + return; + } + State s = tokenState; + foreach (FeatureStruct fs in GetSegments(insert.Segments.Shape)) + { + s = AddArc(s, fs); + } + s.Arcs.Add(after); + + string underlying = insert.Segments.Representation; + foreach (string variant in _affixSurfaces(underlying)) + { + if (variant == underlying) + { + continue; // underlying path already built + } + Shape vshape; + try + { + vshape = _table.Segment(variant); + } + catch (InvalidShapeException) + { + continue; + } + State sv = tokenState; + foreach (FeatureStruct fs in GetSegments(vshape)) + { + sv = AddArc(sv, fs); + } + sv.Arcs.Add(after); + } + } + /// Allomorphs of a slot rule — both AffixProcessRule and its realizational sibling. /// /// True iff this root may surface uninflected — i.e. 
synthesizing it with no affixes yields @@ -664,16 +716,7 @@ FeatureStruct templateCategory State tokenState = NewState(); _tokenOnEntry[tokenState] = affixToken; current.Arcs.Add(tokenState); // epsilon: enter this affix - State s = tokenState; - InsertSegments insert = allomorph.Rhs.OfType().FirstOrDefault(); - if (insert != null) - { - foreach (FeatureStruct fs in GetSegments(insert.Segments.Shape)) - { - s = AddArc(s, fs); - } - } - s.Arcs.Add(after); // epsilon: reconverge after the slot + BuildAffixArcs(tokenState, after, allomorph.Rhs.OfType().FirstOrDefault()); } } current = after; diff --git a/src/SIL.Machine.Morphology.HermitCrab/SurfacePhonology.cs b/src/SIL.Machine.Morphology.HermitCrab/SurfacePhonology.cs new file mode 100644 index 00000000..91975cdf --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/SurfacePhonology.cs @@ -0,0 +1,63 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Annotations; +using SIL.Machine.Rules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Forward phonology for the surface-allomorph precompile (FST_FULL_PLAN.md, Point 1, C-internal + /// tier). Applies the grammar's synthesis phonological rules to a morpheme's underlying + /// segment string in isolation (word-edge context) and returns the distinct surface + /// realizations. Reuses HC's own compiled synthesis rules — no reimplemented phonology — exactly the + /// rules runs. + /// + /// Tier scope: catches edge-conditioned and morpheme-internal alternations (an affix that devoices + /// word-finally, a root-internal change). Cross-boundary, stem-conditioned alternations are not + /// seen by this tier (the neighbor context is absent); those surfaces are simply not precompiled, so + /// the word rides the engine via the parity gate — never a wrong answer, only less acceleration. 
+ /// + internal sealed class SurfacePhonology + { + private readonly CharacterDefinitionTable _table; + private readonly Stratum _surfaceStratum; + private readonly List> _strataPrules; + + public SurfacePhonology(Language language, Morpher morpher) + { + _table = language.SurfaceStratum.CharacterDefinitionTable; + _surfaceStratum = language.SurfaceStratum; + _strataPrules = new List>(); + foreach (Stratum stratum in language.Strata) + { + _strataPrules.Add( + new LinearRuleCascade( + stratum.PhonologicalRules.Select(p => p.CompileSynthesisRule(morpher)) + ) + ); + } + } + + /// The distinct surface realizations of in isolation + /// (always includes the underlying form itself, so the 0-phonology path is unchanged). + public IReadOnlyCollection Variants(string underlying) + { + Shape shape; + try + { + shape = _table.Segment(underlying); + } + catch (InvalidShapeException) + { + return new[] { underlying }; + } + var word = new Word(_surfaceStratum, shape); + foreach (LinearRuleCascade cascade in _strataPrules) + { + word = cascade.Apply(word).DefaultIfEmpty(word).First(); + } + string surface = word.Shape.ToString(_table, false); + return underlying == surface ? new[] { underlying } : new[] { underlying, surface }; + } + } +} diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs index 6f1514d3..64e169b4 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs @@ -213,6 +213,87 @@ public void Composite_CoversFullReduplication_WhereFstAloneMisses() } } + [Test] + public void SurfacePhonology_AppliesRulesForwardToASegmentString() + { + // The forward helper applies synthesis phonology to a segment string in isolation: an + // unconditional t->d rule means "t" surfaces as "d" (and the underlying form is always kept). 
+ var tToD = new RewriteRule + { + Name = "t_to_d", + Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, + }; + tToD.Subrules.Add( + new RewriteSubrule { Rhs = Pattern.New().Annotation(Character(Table1, "d")).Value } + ); + Surface.PhonologicalRules.Add(tToD); + try + { + var sp = new SurfacePhonology(Language, new Morpher(TraceManager, Language)); + Assert.That(sp.Variants("t"), Does.Contain("d"), "'t' must surface as 'd'"); + Assert.That(sp.Variants("t"), Does.Contain("t"), "the underlying form is always included"); + } + finally + { + Surface.PhonologicalRules.Remove(tToD); + } + } + + [Test] + public void Proposer_CoversPhonologicallyAlteredAffix() + { + // Point 1 (affix surface-precompile): a suffix inserts "t", but an unconditional t->d rule means + // it can only surface as "d" — so "sag"+SUF = "sagt" -> "sagd". The underlying-only proposer + // builds a "t" affix arc and misses "sagd"; the surface-precompile proposer builds the "d" arc. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var tSuffix = new AffixProcessRule + { + Name = "t_suffix", + Gloss = "TSF", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + tSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table1, "t") }, + } + ); + Morphophonemic.MorphologicalRules.Add(tSuffix); + var tToD = new RewriteRule + { + Name = "t_to_d", + Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, + }; + tToD.Subrules.Add( + new RewriteSubrule { Rhs = Pattern.New().Annotation(Character(Table1, "d")).Value } + ); + Surface.PhonologicalRules.Add(tToD); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That(search.AnalyzeWord("sagd").Any(), Is.True, "precondition: 'sagd' = 
sag+TSF (t->d)"); + + Assert.That( + new FstTemplateAnalyzer(Language).AnalyzeWord("sagd"), + Is.Empty, + "baseline: the underlying-only proposer builds a 't' affix arc and misses the 'd' surface" + ); + + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(TraceManager, Language); + AnalysisComparison cmp = FstVerification.Compare(search, verified, new[] { "sagd" }); + Assert.That(cmp.IsComplete, Is.True, "altered affix not covered: " + cmp.Format()); + + Assert.That(verified.AnalyzeWord("zzz"), Is.Empty, "soundness: a non-word must still yield nothing"); + } + finally + { + Surface.PhonologicalRules.Remove(tToD); + Morphophonemic.MorphologicalRules.Remove(tSuffix); + } + } + [Test] public void Composite_CoversInfixation_WhereFstAloneMisses() { From 196b712d54cb01a108c7f2aac2d5593baf5c8144 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 27 Jun 2026 10:16:26 -0400 Subject: [PATCH 10/13] HC FST: CSharpier formatting + refresh FstTemplateAnalyzer class doc Apply CSharpier to FstTemplateAnalyzer.cs (a reflowed method signature) so Check-formatting passes, and update the now-stale class summary: the proposer precompiles bounded phonology into its arcs and degrades gracefully on constructs it cannot model (recording the MorphOp in UncoveredOps for the composite), rather than throwing. Co-Authored-By: Claude Opus 4.8 --- .../FstTemplateAnalyzer.cs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs index 93f9e0bc..707566ac 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs @@ -26,8 +26,11 @@ namespace SIL.Machine.Morphology.HermitCrab /// Tokens are accumulated along the DFS path (a state carries the morpheme token emitted on /// entry). 
Prefix slots surface in reverse template order (slot 0 applies first → innermost), /// suffix slots in template order. A budget (the §10 knob) aborts - /// before a blowup. Phonology and reduplication/infix slots are out of scope — it throws on a - /// non-prefix/suffix slot rather than silently mis-parsing. + /// before a blowup. Bounded phonology is precompiled into the arcs (surface-allomorph precompile, + /// C-internal tier — see <see cref="SurfacePhonology"/>); a construct the FST cannot model + /// (reduplication/infix/circumfix/process) is skipped and its <see cref="MorphOp"/> recorded in + /// <see cref="UncoveredOps"/> so the grammar does not certify unless a sibling generator (see + /// <see cref="CompositeProposer"/>) covers it — it degrades gracefully, never mis-parses. /// public class FstTemplateAnalyzer : IMorphologicalAnalyzer { @@ -738,7 +741,11 @@ private State BuildRootChain(State from, Roo /// Build a root chain from a surface STRING (a phonologically-altered realization), /// segmenting it via the table; the chain ends in the underlying root morpheme's token. Returns /// null if the surface has a segment outside the table. - private State BuildRootChainFromSurface(State from, string surface, Morpheme morpheme) + private State BuildRootChainFromSurface( + State from, + string surface, + Morpheme morpheme + ) { Shape shape; try From 5d152ae4d06f93b545a66eea4d01997393e3b07f Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 27 Jun 2026 10:19:55 -0400 Subject: [PATCH 11/13] HC FST: wire the composite proposer into the production factories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reduplication/infix generators were only constructed in test code — both production factories built a bare FstTemplateAnalyzer, so a reduplicating/infixing grammar never certified and the generators never ran. Wire them in. - CompositeProposer.ForLanguage(language, fst): the standard production proposer (FST + reduplication + infix generators).
Inert for grammars without those constructs (generators hold no rules, yield nothing; CoversAllConstructs is vacuously true) — near-zero overhead, byte-identical behavior. - CompleteHybridMorpher.FromLanguage and CachingMorphologicalAnalyzer.FromLanguage now build the composite and certify on its CoversAllConstructs. Integration test CompleteHybrid_WiresGenerators_...: a reduplicating grammar certifies through the production factory and the fast path matches the engine on bare/reduplicated/homograph/non-word — the test whose absence let the feature be inert. Docs note the wiring + the extended empirical-certification caveat (a certified grammar skips the engine, so the certification corpus must exercise the reduplication/infix patterns). Full suite green (102). Co-Authored-By: Claude Opus 4.8 --- docs/FST_FULL_PLAN.md | 16 ++++++++ .../CachingMorphologicalAnalyzer.cs | 3 +- .../CompleteHybridMorpher.cs | 3 +- .../CompositeProposer.cs | 13 ++++++ .../VerifiedFstAnalyzerTests.cs | 41 +++++++++++++++++++ 5 files changed, 74 insertions(+), 2 deletions(-) diff --git a/docs/FST_FULL_PLAN.md b/docs/FST_FULL_PLAN.md index e53175f9..8f8cd891 100644 --- a/docs/FST_FULL_PLAN.md +++ b/docs/FST_FULL_PLAN.md @@ -194,3 +194,19 @@ the construct, (b) the composite covers it, (c) verify still rejects a non-word. Every "residual / deferred" item is covered correctly today by the engine via the parity gate — the only thing deferred is *acceleration*, never correctness. + +## Production wiring + +Both factories — `CompleteHybridMorpher.FromLanguage` and `CachingMorphologicalAnalyzer.FromLanguage` — +build `CompositeProposer.ForLanguage(language, fst)` (the FST plus the reduplication and infix +generators) and certify on the *composite's* `CoversAllConstructs`. 
For a grammar with no +reduplication/infixation the generators hold no rules and yield nothing, so this is near-zero overhead +and byte-identical behavior; for a reduplicating/infixing grammar it is what lets the grammar certify +(the generator covers the construct the FST skips) instead of falling entirely to the engine. Covered by +`CompleteHybrid_WiresGenerators_ReduplicatingGrammarCertifiesAndMatchesEngine`. + +**Certification caveat (extended).** A certified grammar skips the engine entirely, so correctness on +unseen words rests on the proposer being complete on the certification corpus. With the generators wired +this now extends to reduplication/infix completeness as well — same empirical-certification property as +before, just over a larger construct set. Choose a certification corpus that exercises the grammar's +reduplication/infix patterns. diff --git a/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs index 8908768a..32f2b623 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs @@ -51,7 +51,8 @@ public static CachingMorphologicalAnalyzer FromLanguage( ) { var pool = new MorpherPool(() => new Morpher(new TraceManager(), language)); - var proposer = new FstTemplateAnalyzer(language, new Morpher(traceManager, language)); + var fst = new FstTemplateAnalyzer(language, new Morpher(traceManager, language)); + CompositeProposer proposer = CompositeProposer.ForLanguage(language, fst); var fast = new VerifiedFstAnalyzer(proposer, pool); bool certified = false; if (certificationCorpus != null) diff --git a/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs b/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs index 3d2e272f..81c4eeda 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs +++ 
b/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs @@ -44,7 +44,8 @@ IEnumerable certificationCorpus ) { var pool = new MorpherPool(() => new Morpher(new TraceManager(), language)); - var proposer = new FstTemplateAnalyzer(language, new Morpher(traceManager, language)); + var fst = new FstTemplateAnalyzer(language, new Morpher(traceManager, language)); + CompositeProposer proposer = CompositeProposer.ForLanguage(language, fst); var verified = new VerifiedFstAnalyzer(proposer, pool); var engine = new Morpher(traceManager, language); bool parity = FstVerification.Compare(engine, verified, certificationCorpus).IsComplete; diff --git a/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs b/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs index aa5b408d..7c1ada05 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs @@ -45,6 +45,19 @@ public CompositeProposer(FstTemplateAnalyzer fst, params IConstructProposer[] ge _coversAllConstructs = fst.UncoveredOps.All(covered.Contains); } + /// The standard production proposer: the FST plus the reduplication and infix + /// generators. For a grammar with no reduplication/infixation the generators are inert (they + /// hold no rules and yield nothing), so this adds near-zero overhead and does not change + /// behavior — which is why the factories wire it unconditionally rather than as an opt-in. + public static CompositeProposer ForLanguage(Language language, FstTemplateAnalyzer fst) + { + return new CompositeProposer( + fst, + new ReduplicationProposer(language, fst), + new InfixProposer(language, fst) + ); + } + /// True iff every construct the FST proposer skipped is claimed by a sibling generator. /// Paired with the empirical parity gate for certification (see class remarks). 
public bool CoversAllConstructs => _coversAllConstructs; diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs index 64e169b4..3893b7c8 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs @@ -346,6 +346,47 @@ public void Composite_CoversInfixation_WhereFstAloneMisses() } } + [Test] + public void CompleteHybrid_WiresGenerators_ReduplicatingGrammarCertifiesAndMatchesEngine() + { + // Integration: the production factory must build the CompositeProposer (FST + generators), so a + // reduplicating grammar certifies (the generator covers the construct the FST skips) and the + // fast path matches the engine — not just the hand-built composite in the unit tests. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var redup = new AffixProcessRule + { + Name = "redup", + Gloss = "RED", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + try + { + string[] corpus = { "sag", "sagsag", "dat" }; // bare, reduplicated, homograph + var search = new Morpher(TraceManager, Language); + var complete = CompleteHybridMorpher.FromLanguage(TraceManager, Language, corpus); + Assert.That(complete.Certified, Is.True, "the reduplicating grammar must certify once generators are wired"); + foreach (string word in corpus.Append("zzz")) + { + var fast = new HashSet(complete.AnalyzeWord(word).Select(Sig)); + var oracle = new 
HashSet(search.AnalyzeWord(word).Select(Sig)); + Assert.That(fast.SetEquals(oracle), Is.True, $"fast path disagrees with the engine for {word}"); + } + } + finally + { + Morphophonemic.MorphologicalRules.Remove(redup); + } + } + private static string Sig(WordAnalysis a) => string.Join("+", a.Morphemes.Select(m => (m as Morpheme)?.Gloss ?? "?")) + ":" + a.RootMorphemeIndex; From ff6f8f96329cc8fb08feca32e6f78bfb80022ab4 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 27 Jun 2026 10:34:40 -0400 Subject: [PATCH 12/13] HC FST Points 1b + 4: C-boundary precompile and full phonology composition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Complete the phonology story — all four enhancement points are now implemented and wired into production. Point 4 (C-exact, the complete path): ComposedPhonologyProposer composes HC's phonology INVERSE with the morphotactic FST. It un-applies the grammar's phonological rules to the surface (reusing each stratum's CompileAnalysisRule — exactly what AnalysisStratumRule runs, strata surface->inner, rules reversed) to recover the underlying form, then walks the underlying-arc FST on it (FstTemplateAnalyzer.AnalyzeShape, newly exposed). Because the inverse is applied to the ASSEMBLED surface, this covers all bounded phonology including the cross-boundary, stem-conditioned alternations the per-morpheme precompile cannot see. Under-specified analysis nodes match via unification; verify prunes spurious candidates. Chosen over literal Fst.Compose because the proposer accumulates tokens in a side-table, not transducer outputs — composing HC's existing inverse reaches the same coverage while reusing the engine's real phonology. 
Point 1b (C-boundary, the cheap fast-path): SurfacePhonology now also probes each surface-alphabet segment as a left/right neighbor and, when the rule is length-preserving, reads back the morpheme's own surface portion — catching an affix whose surface is conditioned by a neighbor across the seam. Bounded by alphabet size; length-changing contexts are skipped (sound superset). Both wired into CompositeProposer.ForLanguage (inert when the grammar lacks phonology — short-circuits). Tests: ComposedPhonology_CoversCrossBoundaryAlternation (g->k / _t across the boundary: precompile misses "sakt", composition recovers it) and SurfacePhonology_BoundaryTier (t->d / g_: isolation keeps "t", boundary recovers "d"). Full suite green (104); full solution builds; CSharpier clean. Plan updated: all four points shipped + wired. Co-Authored-By: Claude Opus 4.8 --- docs/FST_FULL_PLAN.md | 92 ++++++----- .../CachingMorphologicalAnalyzer.cs | 2 +- .../CompleteHybridMorpher.cs | 2 +- .../ComposedPhonologyProposer.cs | 81 ++++++++++ .../CompositeProposer.cs | 13 +- .../FstTemplateAnalyzer.cs | 12 ++ .../SurfacePhonology.cs | 147 ++++++++++++++++-- .../VerifiedFstAnalyzerTests.cs | 96 ++++++++++++ 8 files changed, 383 insertions(+), 62 deletions(-) create mode 100644 src/SIL.Machine.Morphology.HermitCrab/ComposedPhonologyProposer.cs diff --git a/docs/FST_FULL_PLAN.md b/docs/FST_FULL_PLAN.md index 8f8cd891..5e1d48b6 100644 --- a/docs/FST_FULL_PLAN.md +++ b/docs/FST_FULL_PLAN.md @@ -135,38 +135,46 @@ rule makes it surface only as "d", so sag+SUF = "sagt" → "sagd"; the underlyin "t" arc and misses "sagd", the surface-precompile proposer builds the "d" arc and verify confirms it) and `SurfacePhonology_AppliesRulesForwardToASegmentString`. Full suite green (101). -**Still 1b (C-boundary):** the isolation tier catches edge- and morpheme-internal alternations but not -cross-boundary, stem-conditioned ones (the neighbor context is absent). 
Those surfaces are simply not -precompiled → the word rides the engine via the parity gate (correct, slower). Over-approximating the -neighbor (apply rules with each natural-class boundary segment on each side, bounded + capped) is the -next increment. +### Result (shipped — 1b C-boundary) ---- - -## Point 4 — C-exact (full phonology composition): design only, deferred +`SurfacePhonology.Variants` now also probes each surface-alphabet segment as a left/right neighbor: it +forward-applies phonology to `neighbor·morpheme` / `morpheme·neighbor` and, when the rule is +length-preserving (output node count = morpheme + 1), reads back the morpheme's own surface nodes. +Bounded by alphabet size × 2; a length-changing context is skipped (no reliable portion) so it stays a +sound superset. This catches an affix whose *own* surface is conditioned by a neighbor across the seam +(e.g. a suffix that voices after the root-final segment). Verified by +`SurfacePhonology_BoundaryTier_RecoversAffixSurfaceFromNeighborContext` (a "t" suffix that voices to "d" +only after "g": isolation keeps "t", the boundary tier recovers "d"). Full suite green (104). -**Goal.** Compose the morphotactic FST with the full phonology transducer (Kaplan & Kay: bounded rewrite -rules are regular relations, closed under composition), giving complete coverage of all *attested* -(non-cyclic) phonology — including the cross-boundary opaque interactions the per-morpheme C-boundary -tier can miss. +What the precompile still cannot see — a *neighbor's* surface changing (e.g. a root devoicing before an +affix) or any longer-distance interaction — is covered completely by Point 4 below. -**Concrete design.** (1) Re-architect token emission from the `_tokenOnEntry` side-table into FST -**output labels**, so the proposer is a genuine transducer surface→token-string. (2) Build the phonology -transducer by composing each stratum's compiled phonological rules (the in-repo `Fst.Compose` exists, -line ~1887). 
(3) Compose `phonology⁻¹ ∘ morphotactics` so the machine maps surface directly to the -underlying token string. - -**Why deferred (engineering-correct, not a dodge).** -- It is a **redesign of the working spine** (token-emission → transducer outputs), high-risk relative to - its marginal value. -- The only thing it buys over Point 1's C-boundary tier is **cross-boundary opaque interaction**, which - is *rare in attested grammars and already produces correct answers via engine fallback* (the parity - gate refuses to certify, so those words ride the slow path — slower, never wrong). -- **Point 1 (C-boundary) subsumes essentially all of Point 4's attested-language value.** Points 1 and 4 - are the same axis (phonology coverage) at two tiers; doing 1 well delivers the practical payoff. +--- -So Point 4 ships as this design + rationale; the residual it would accelerate is exactly the set the -parity gate keeps correct on the engine today. +## Point 4 — C-exact: full phonology via composition with HC's phonology inverse (shipped) + +**Goal.** Cover *all* bounded phonology — including the cross-boundary, opaque, stem-conditioned +interactions the per-morpheme precompile (Point 1) cannot see. + +**What shipped.** `ComposedPhonologyProposer` composes **HC's phonology inverse with the morphotactic +FST**: it un-applies the grammar's phonological rules to the surface — reusing each stratum's +`IPhonologicalRule.CompileAnalysisRule`, exactly the rules `AnalysisStratumRule` runs (strata +surface→inner, rules reversed within a stratum) — to recover the underlying form, then walks the +underlying-arc FST on it (`FstTemplateAnalyzer.AnalyzeShape`). That is literally phonology⁻¹ ∘ +morphotactics. Because the inverse is applied to the *assembled* surface, it sees cross-boundary context +the per-morpheme tiers cannot. 
The un-applied shape carries under-specified nodes (analysis is +non-deterministic) which the unification walk matches against every compatible arc; verify prunes the +spurious ones, so it is a sound superset. Complete for bounded (non-cyclic) phonology; an unbounded +self-feeding cycle is not a regular relation and simply will not certify. + +**Why this form, not FST∘FST composition.** The morphotactic proposer accumulates tokens in a side-table +(`_tokenOnEntry`), not transducer outputs, so a literal `Fst.Compose` would require re-architecting the +spine. Composing HC's *existing* phonology inverse instead reuses the engine's real, tested phonology +(no reimplementation) and reaches the same coverage. Wired into `CompositeProposer.ForLanguage` (inert +when the grammar has no phonological rules — it short-circuits). Verified by +`ComposedPhonology_CoversCrossBoundaryAlternation_WherePrecompileMisses` (a root-final "g" +devoices to "k" before a suffixal "t" — "sag"+SUF = "sagt" → "sakt"; the per-morpheme precompile misses +"sakt", composition recovers it, verify confirms, a non-word still yields nothing). --- @@ -175,25 +183,29 @@ parity gate keeps correct on the engine today. 1. ☑ `CompositeProposer` plumbing (union + dedup + coverage-signal) — established with reduplication. 2. ☑ Point 3 Reduplication (full-copy generator; strip + recurse + verify). 3. ☑ Point 2 Infixation (remove + recurse + verify; single-contiguous-infix first cut). -4. ◑ Point 1 affix surface-precompile (1a C-internal shipped; 1b C-boundary still to do) + - bare-root C-internal shipped earlier. -5. ☑ Point 4 design recorded (deferred, with rationale). +4. ☑ Point 1 phonology precompile — bare-root C-internal, affix C-internal (1a) and C-boundary (1b). +5. ☑ Point 4 C-exact — `ComposedPhonologyProposer` (phonology⁻¹ ∘ morphotactics); covers all bounded + phonology including cross-boundary. -Commit + test after each point; do not batch. 
Each generator's test must show (a) the FST alone misses -the construct, (b) the composite covers it, (c) verify still rejects a non-word. +All four wired into `CompositeProposer.ForLanguage`, which both production factories +(`CompleteHybridMorpher`, `CachingMorphologicalAnalyzer`) build and certify on. Commit + test after each +point; each construct test shows (a) the FST alone misses it, (b) the composite covers it, (c) verify +still rejects a non-word. ## Summary of what shipped -| Construct | Tier shipped | Mechanism | Residual / deferred | +| Construct | Coverage | Mechanism | Residual | |---|---|---|---| -| Bare-root phonology | C-internal | `BareRootSurfaces` (GenerateWords) + verify-allows-phonology | C-boundary | -| Affix phonology | C-internal (1a) | `SurfacePhonology` + `BuildAffixArcs` | C-boundary (1b) | -| Infixation | single contiguous infix | `InfixProposer` (remove + recurse) | templatic multi-slot; altered-surface infix | +| Bare-root phonology | C-internal | `BareRootSurfaces` (GenerateWords) + verify-allows-phonology | — | +| Affix phonology | C-internal + C-boundary | `SurfacePhonology` (1a isolation + 1b neighbor) + `BuildAffixArcs` | — | +| **All phonology** (incl. cross-boundary, opaque) | **complete (bounded)** | `ComposedPhonologyProposer` — phonology⁻¹ ∘ morphotactics | unbounded self-feeding cycle (not regular) | +| Infixation | single contiguous infix | `InfixProposer` (remove + recurse) | templatic multi-slot; phonologically-altered infix surface | | Reduplication | full copy, one application | `ReduplicationProposer` (strip + recurse) | partial/CV copy; 2+ applications | -| Cross-boundary opaque phonology | — | — | Point 4 (C-exact composition), design only | -Every "residual / deferred" item is covered correctly today by the engine via the parity gate — the -only thing deferred is *acceleration*, never correctness. 
+The phonology precompile tiers (1a/1b) are the cheap fast-path; `ComposedPhonologyProposer` is the +complete backstop, so phonology is fully covered. The remaining infix/reduplication residuals are +covered correctly today by the engine via the parity gate — the only thing not yet accelerated for those +narrow cases is *speed*, never correctness. ## Production wiring diff --git a/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs index 32f2b623..f94e8cec 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs @@ -52,7 +52,7 @@ public static CachingMorphologicalAnalyzer FromLanguage( { var pool = new MorpherPool(() => new Morpher(new TraceManager(), language)); var fst = new FstTemplateAnalyzer(language, new Morpher(traceManager, language)); - CompositeProposer proposer = CompositeProposer.ForLanguage(language, fst); + CompositeProposer proposer = CompositeProposer.ForLanguage(language, new Morpher(traceManager, language), fst); var fast = new VerifiedFstAnalyzer(proposer, pool); bool certified = false; if (certificationCorpus != null) diff --git a/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs b/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs index 81c4eeda..130e0eee 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs @@ -45,7 +45,7 @@ IEnumerable certificationCorpus { var pool = new MorpherPool(() => new Morpher(new TraceManager(), language)); var fst = new FstTemplateAnalyzer(language, new Morpher(traceManager, language)); - CompositeProposer proposer = CompositeProposer.ForLanguage(language, fst); + CompositeProposer proposer = CompositeProposer.ForLanguage(language, new Morpher(traceManager, language), fst); var verified = new 
VerifiedFstAnalyzer(proposer, pool); var engine = new Morpher(traceManager, language); bool parity = FstVerification.Compare(engine, verified, certificationCorpus).IsComplete; diff --git a/src/SIL.Machine.Morphology.HermitCrab/ComposedPhonologyProposer.cs b/src/SIL.Machine.Morphology.HermitCrab/ComposedPhonologyProposer.cs new file mode 100644 index 00000000..92396b95 --- /dev/null +++ b/src/SIL.Machine.Morphology.HermitCrab/ComposedPhonologyProposer.cs @@ -0,0 +1,81 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Annotations; +using SIL.Machine.Morphology; +using SIL.Machine.Rules; + +namespace SIL.Machine.Morphology.HermitCrab +{ + /// + /// Point 4 (C-exact phonology) by composition with HC's phonology inverse + /// (FST_FULL_PLAN.md). Un-applies the grammar's phonological rules to the surface — reusing each + /// stratum's <see cref="IPhonologicalRule.CompileAnalysisRule"/>, exactly the rules + /// <see cref="AnalysisStratumRule"/> runs (surface stratum first, rules reversed within a stratum) — + /// to recover the underlying form, then walks the underlying-arc morphotactic FST on it + /// (<see cref="FstTemplateAnalyzer.AnalyzeShape"/>). That is literally phonology⁻¹ ∘ morphotactics. + /// + /// Because the inverse is applied to the assembled surface, this covers ALL bounded phonology + /// — including the cross-boundary, stem-conditioned alternations the per-morpheme precompile (Point 1) + /// cannot see — completing the phonology story. The un-applied shape carries under-specified nodes + /// (analysis is non-deterministic), which the unification walk matches against every compatible arc; + /// verify prunes the spurious ones, so it stays a sound superset. Complete for bounded (non-cyclic) + /// phonology; an unbounded self-feeding cycle is not a regular relation and simply will not certify.
+ /// + public class ComposedPhonologyProposer : IConstructProposer + { + private static readonly MorphOp[] _ops = new MorphOp[0]; + private readonly FstTemplateAnalyzer _fst; + private readonly Stratum _surfaceStratum; + private readonly CharacterDefinitionTable _table; + private readonly LinearRuleCascade _inverse; + private readonly bool _hasPhonology; + + public ComposedPhonologyProposer(Language language, Morpher morpher, FstTemplateAnalyzer fst) + { + _fst = fst; + _surfaceStratum = language.SurfaceStratum; + _table = language.SurfaceStratum.CharacterDefinitionTable; + // Inverse order mirrors AnalysisLanguageRule/AnalysisStratumRule: strata surface→inner, and + // within each stratum the synthesis rules are un-applied in reverse application order. + var rules = new List>(); + foreach (Stratum stratum in language.Strata.Reverse()) + { + foreach (IPhonologicalRule prule in stratum.PhonologicalRules.Reverse()) + { + rules.Add(prule.CompileAnalysisRule(morpher)); + } + } + _hasPhonology = rules.Count > 0; + _inverse = new LinearRuleCascade(rules); + } + + /// Phonology completeness is not a per-construct MorphOp, so this covers none; its value + /// is validated empirically by the parity gate. + public IReadOnlyCollection CoveredOps => _ops; + + public IEnumerable AnalyzeWord(string word) + { + if (!_hasPhonology) + { + yield break; // no phonology ⇒ the bare FST proposer already covers everything + } + Shape shape; + try + { + shape = _table.Segment(word); + } + catch (InvalidShapeException) + { + yield break; + } + // Un-apply phonology in place (the cascade mutates the word's shape, as AnalysisStratumRule + // relies on); the resulting under-specified shape is the underlying form to walk. 
+ var inverseWord = new Word(_surfaceStratum, shape); + _inverse.Apply(inverseWord).ToList(); + foreach (WordAnalysis candidate in _fst.AnalyzeShape(inverseWord.Shape)) + { + yield return candidate; + } + } + } +} diff --git a/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs b/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs index 7c1ada05..055e2eef 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs @@ -46,15 +46,18 @@ public CompositeProposer(FstTemplateAnalyzer fst, params IConstructProposer[] ge } /// The standard production proposer: the FST plus the reduplication and infix - /// generators. For a grammar with no reduplication/infixation the generators are inert (they - /// hold no rules and yield nothing), so this adds near-zero overhead and does not change - /// behavior — which is why the factories wire it unconditionally rather than as an opt-in. - public static CompositeProposer ForLanguage(Language language, FstTemplateAnalyzer fst) + /// generators and the phonology-composition proposer (Point 4, all bounded phonology including + /// cross-boundary). For a grammar without a given construct the corresponding generator is inert + /// (it holds no rules and yields nothing — the phonology proposer short-circuits when the grammar + /// has no phonological rules), so this adds near-zero overhead and does not change behavior; that + /// is why the factories wire it unconditionally rather than as an opt-in. 
+ public static CompositeProposer ForLanguage(Language language, Morpher morpher, FstTemplateAnalyzer fst) { return new CompositeProposer( fst, new ReduplicationProposer(language, fst), - new InfixProposer(language, fst) + new InfixProposer(language, fst), + new ComposedPhonologyProposer(language, morpher, fst) ); } diff --git a/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs index 707566ac..f99d5502 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/FstTemplateAnalyzer.cs @@ -282,7 +282,19 @@ public IEnumerable AnalyzeWord(string word) // A word with a phoneme outside this table cannot be a surface form here. return Enumerable.Empty(); } + return AnalyzeShape(shape); + } + /// + /// Walk the morphotactic FST over the segments of an already-built <see cref="Shape"/>. + /// Used both by <see cref="AnalyzeWord"/> (segmenting the surface) and by + /// <see cref="ComposedPhonologyProposer"/>, which feeds an underlying shape obtained by + /// un-applying phonology — letting the underlying arcs match cross-boundary surfaces the + /// per-morpheme precompile misses. Segments are matched by unification, so an underspecified + /// node (from analysis) matches every arc it unifies with; verify prunes the spurious ones. + /// + internal IEnumerable AnalyzeShape(Shape shape) + { var segments = new List(); for ( ShapeNode node = shape.GetFirst(n => _filter(n.Annotation)); diff --git a/src/SIL.Machine.Morphology.HermitCrab/SurfacePhonology.cs b/src/SIL.Machine.Morphology.HermitCrab/SurfacePhonology.cs index 91975cdf..786d3557 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/SurfacePhonology.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/SurfacePhonology.cs @@ -6,22 +6,32 @@ namespace SIL.Machine.Morphology.HermitCrab { /// - /// Forward phonology for the surface-allomorph precompile (FST_FULL_PLAN.md, Point 1, C-internal - /// tier).
Applies the grammar's synthesis phonological rules to a morpheme's underlying - /// segment string in isolation (word-edge context) and returns the distinct surface - /// realizations. Reuses HC's own compiled synthesis rules — no reimplemented phonology — exactly the - /// rules runs. + /// Forward phonology for the surface-allomorph precompile (FST_FULL_PLAN.md, Point 1). Applies the + /// grammar's synthesis phonological rules to a morpheme's underlying segment string and + /// returns the distinct surface realizations. Reuses HC's own compiled synthesis rules — no + /// reimplemented phonology — exactly the rules runs. /// - /// Tier scope: catches edge-conditioned and morpheme-internal alternations (an affix that devoices - /// word-finally, a root-internal change). Cross-boundary, stem-conditioned alternations are not - /// seen by this tier (the neighbor context is absent); those surfaces are simply not precompiled, so - /// the word rides the engine via the parity gate — never a wrong answer, only less acceleration. + /// Two tiers, both precompiled into the proposer's arcs: + /// + /// C-internal (1a): apply rules to the morpheme in isolation (word-edge context) + /// — catches edge-conditioned and morpheme-internal alternations. + /// C-boundary (1b): apply rules to the morpheme with each single neighbor segment of the + /// surface alphabet on each side, and (when the rule is length-preserving) read back the morpheme's + /// own surface portion — catches an affix whose own surface is conditioned by a neighbor across + /// the seam. Bounded by alphabet size × 2; a length-changing context is skipped (no reliable + /// portion), so it stays a sound superset. + /// + /// What remains — a neighbor's surface changing (e.g. a root devoicing before an affix), and any + /// longer-distance interaction — is covered completely by + /// (Point 4), which un-applies phonology on the assembled surface. 
So this helper is the cheap + /// fast-path; the composition proposer is the complete backstop. /// internal sealed class SurfacePhonology { private readonly CharacterDefinitionTable _table; private readonly Stratum _surfaceStratum; private readonly List> _strataPrules; + private readonly List _alphabet; public SurfacePhonology(Language language, Morpher morpher) { @@ -36,28 +46,135 @@ public SurfacePhonology(Language language, Morpher morpher) ) ); } + // The surface alphabet: one representative per segment character definition (the neighbor + // segments used to probe boundary-conditioned alternations). + _alphabet = new List(); + foreach (CharacterDefinition cd in _table) + { + if (cd.Type == HCFeatureSystem.Segment) + { + string rep = cd.Representations.FirstOrDefault(); + if (!string.IsNullOrEmpty(rep)) + { + _alphabet.Add(rep); + } + } + } } - /// The distinct surface realizations of in isolation - /// (always includes the underlying form itself, so the 0-phonology path is unchanged). + /// The distinct surface realizations of — its isolation + /// form (always included, so the 0-phonology path is unchanged) plus each boundary-context + /// realization recovered when the rule is length-preserving. public IReadOnlyCollection Variants(string underlying) + { + var result = new HashSet { underlying }; + int underlyingLen = NodeCount(underlying); + if (underlyingLen < 0) + { + return new[] { underlying }; // unsegmentable + } + + // C-internal: the morpheme in isolation. + string isolation = SurfaceOf(underlying); + if (isolation != null) + { + result.Add(isolation); + } + + // C-boundary: the morpheme with one neighbor segment on each side. When the context is + // length-preserving, read back just the morpheme's own surface nodes. 
+ foreach (string c in _alphabet) + { + AddBoundaryVariant(c + underlying, underlyingLen, fromEnd: true, result); // left neighbor + AddBoundaryVariant(underlying + c, underlyingLen, fromEnd: false, result); // right neighbor + } + return result.ToList(); + } + + private void AddBoundaryVariant(string context, int underlyingLen, bool fromEnd, HashSet result) + { + List outNodes = SurfaceNodes(context); + if (outNodes == null || outNodes.Count != underlyingLen + 1) + { + return; // unsegmentable, or a length-changing rule fired ⇒ no reliable morpheme portion + } + // The neighbor is one node; the morpheme is the remaining contiguous nodes. + IEnumerable morphemeNodes = fromEnd + ? outNodes.Skip(1) // left neighbor consumed the first node + : outNodes.Take(underlyingLen); // right neighbor is the last node + var sb = new System.Text.StringBuilder(); + foreach (ShapeNode node in morphemeNodes) + { + string rep = _table.GetMatchingStrReps(node).FirstOrDefault(); + if (string.IsNullOrEmpty(rep)) + { + return; // an under-specified node has no single representation — skip this context + } + sb.Append(rep); + } + result.Add(sb.ToString()); + } + + /// Apply forward phonology to a segment string and return the surface string, or null if + /// it cannot be segmented. + private string SurfaceOf(string underlying) + { + List nodes = SurfaceNodes(underlying); + if (nodes == null) + { + return null; + } + var sb = new System.Text.StringBuilder(); + foreach (ShapeNode node in nodes) + { + string rep = _table.GetMatchingStrReps(node).FirstOrDefault(); + if (string.IsNullOrEmpty(rep)) + { + return null; + } + sb.Append(rep); + } + return sb.ToString(); + } + + /// Apply forward phonology to a segment string and return the surface segment nodes, or + /// null if it cannot be segmented. 
+ private List SurfaceNodes(string str) { Shape shape; try { - shape = _table.Segment(underlying); + shape = _table.Segment(str); } catch (InvalidShapeException) { - return new[] { underlying }; + return null; } var word = new Word(_surfaceStratum, shape); foreach (LinearRuleCascade cascade in _strataPrules) { word = cascade.Apply(word).DefaultIfEmpty(word).First(); } - string surface = word.Shape.ToString(_table, false); - return underlying == surface ? new[] { underlying } : new[] { underlying, surface }; + return word + .Shape.Where(n => n.Annotation.Type() == HCFeatureSystem.Segment) + .ToList(); + } + + /// The number of segment nodes after segmentation (before any phonology), or -1 if the + /// string cannot be segmented. This is the reference length for boundary extraction: a neighbor + /// adds exactly one node, so a length-preserving context yields underlyingLen + 1 nodes. + private int NodeCount(string str) + { + Shape shape; + try + { + shape = _table.Segment(str); + } + catch (InvalidShapeException) + { + return -1; + } + return shape.Count(n => n.Annotation.Type() == HCFeatureSystem.Segment); } } } diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs index 3893b7c8..756ea814 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs @@ -166,6 +166,67 @@ public void Verified_CoversPhonologicallyAlteredBareRoot() } } + [Test] + public void ComposedPhonology_CoversCrossBoundaryAlternation_WherePrecompileMisses() + { + // Point 4 (C-exact, composition with phonology inverse): a CROSS-BOUNDARY rule the per-morpheme + // precompile cannot see. A suffix inserts "t"; the root-final "g" devoices to "k" before that + // suffixal "t" — so sag+SUF = "sagt" -> "sakt". 
The precompile sees the bare root ("sag", no + // following t -> no devoicing) and the affix ("t") only in isolation, so it builds a "sagt" path + // and MISSES "sakt". Composition un-applies the rule on the assembled surface and recovers it. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var tSuffix = new AffixProcessRule + { + Name = "t_suffix", + Gloss = "TSF", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("N").Value, + }; + tSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table1, "t") }, + } + ); + Morphophonemic.MorphologicalRules.Add(tSuffix); + var gDevoice = new RewriteRule + { + Name = "g_devoice", + Lhs = Pattern.New().Annotation(Character(Table1, "g")).Value, + }; + gDevoice.Subrules.Add( + new RewriteSubrule + { + Rhs = Pattern.New().Annotation(Character(Table1, "k")).Value, + RightEnvironment = Pattern.New().Annotation(Character(Table1, "t")).Value, + } + ); + Surface.PhonologicalRules.Add(gDevoice); + try + { + var search = new Morpher(TraceManager, Language); + Assert.That(search.AnalyzeWord("sakt").Any(), Is.True, "precondition: 'sakt' = sag+TSF (g->k / _t)"); + + // Even the surface-precompile proposer misses the cross-boundary form. 
+ var fst = new FstTemplateAnalyzer(Language, new Morpher(TraceManager, Language)); + Assert.That(fst.AnalyzeWord("sakt"), Is.Empty, "baseline: per-morpheme precompile misses cross-boundary 'sakt'"); + + var composed = new ComposedPhonologyProposer(Language, new Morpher(TraceManager, Language), fst); + var pool = new MorpherPool(() => new Morpher(new TraceManager(), Language)); + IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(new CompositeProposer(fst, composed), pool); + AnalysisComparison cmp = FstVerification.Compare(search, verified, new[] { "sakt" }); + Assert.That(cmp.IsComplete, Is.True, "cross-boundary alternation not covered: " + cmp.Format()); + + Assert.That(verified.AnalyzeWord("zzz"), Is.Empty, "soundness: a non-word must still yield nothing"); + } + finally + { + Surface.PhonologicalRules.Remove(gDevoice); + Morphophonemic.MorphologicalRules.Remove(tSuffix); + } + } + [Test] public void Composite_CoversFullReduplication_WhereFstAloneMisses() { @@ -239,6 +300,41 @@ public void SurfacePhonology_AppliesRulesForwardToASegmentString() } } + [Test] + public void SurfacePhonology_BoundaryTier_RecoversAffixSurfaceFromNeighborContext() + { + // Point 1b (C-boundary): a suffixal "t" voices to "d" only AFTER "g". In isolation "t" stays + // "t" (1a misses the alternation); with the left neighbor "g" the boundary tier recovers "d". 
+ var tVoice = new RewriteRule + { + Name = "t_voice", + Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, + }; + tVoice.Subrules.Add( + new RewriteSubrule + { + Rhs = Pattern.New().Annotation(Character(Table1, "d")).Value, + LeftEnvironment = Pattern.New().Annotation(Character(Table1, "g")).Value, + } + ); + Surface.PhonologicalRules.Add(tVoice); + try + { + var sp = new SurfacePhonology(Language, new Morpher(TraceManager, Language)); + IReadOnlyCollection variants = sp.Variants("t"); + Assert.That(variants, Does.Contain("t"), "underlying form is always included"); + Assert.That( + variants, + Does.Contain("d"), + "boundary tier must recover the post-'g' surface 'd' (isolation alone would miss it)" + ); + } + finally + { + Surface.PhonologicalRules.Remove(tVoice); + } + } + [Test] public void Proposer_CoversPhonologicallyAlteredAffix() { From b50e4a297a1b80997b5059a491f2927c88b0f4a2 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 27 Jun 2026 10:40:00 -0400 Subject: [PATCH 13/13] HC FST: make the composed-phonology proposer thread-safe + prove it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ComposedPhonologyProposer runs HC's analysis phonology at analyze time on the concurrent path (both factories advertise parallel parsing). Harden + verify: - Compile the inverse cascade against a PRIVATE Morpher with its own TraceManager (not the factory's shared one), mirroring how MorpherPool gives each rented morpher its own — the analysis rules read _morpher.TraceManager/selectors, so the proposer must not share them. Each AnalyzeWord applies the cascade to a fresh local Word (no per-call mutation of shared state). ForLanguage no longer threads a morpher through. - Add Composite_WithPhonologyAndReduplication_ParallelMatchesSequential: drives the production CompleteHybridMorpher (phonology inverse + reduplication generator both live) over a corpus in parallel and asserts parallel == sequential, no exceptions. 
Full suite green (105). Co-Authored-By: Claude Opus 4.8 --- .../CachingMorphologicalAnalyzer.cs | 2 +- .../CompleteHybridMorpher.cs | 2 +- .../ComposedPhonologyProposer.cs | 11 +++- .../CompositeProposer.cs | 4 +- .../VerifiedFstAnalyzerTests.cs | 57 ++++++++++++++++++- 5 files changed, 70 insertions(+), 6 deletions(-) diff --git a/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs b/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs index f94e8cec..32f2b623 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/CachingMorphologicalAnalyzer.cs @@ -52,7 +52,7 @@ public static CachingMorphologicalAnalyzer FromLanguage( { var pool = new MorpherPool(() => new Morpher(new TraceManager(), language)); var fst = new FstTemplateAnalyzer(language, new Morpher(traceManager, language)); - CompositeProposer proposer = CompositeProposer.ForLanguage(language, new Morpher(traceManager, language), fst); + CompositeProposer proposer = CompositeProposer.ForLanguage(language, fst); var fast = new VerifiedFstAnalyzer(proposer, pool); bool certified = false; if (certificationCorpus != null) diff --git a/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs b/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs index 130e0eee..81c4eeda 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/CompleteHybridMorpher.cs @@ -45,7 +45,7 @@ IEnumerable certificationCorpus { var pool = new MorpherPool(() => new Morpher(new TraceManager(), language)); var fst = new FstTemplateAnalyzer(language, new Morpher(traceManager, language)); - CompositeProposer proposer = CompositeProposer.ForLanguage(language, new Morpher(traceManager, language), fst); + CompositeProposer proposer = CompositeProposer.ForLanguage(language, fst); var verified = new VerifiedFstAnalyzer(proposer, pool); var engine = new 
Morpher(traceManager, language); bool parity = FstVerification.Compare(engine, verified, certificationCorpus).IsComplete; diff --git a/src/SIL.Machine.Morphology.HermitCrab/ComposedPhonologyProposer.cs b/src/SIL.Machine.Morphology.HermitCrab/ComposedPhonologyProposer.cs index 92396b95..7c8382e2 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/ComposedPhonologyProposer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/ComposedPhonologyProposer.cs @@ -20,6 +20,11 @@ namespace SIL.Machine.Morphology.HermitCrab /// (analysis is non-deterministic), which the unification walk matches against every compatible arc; /// verify prunes the spurious ones, so it stays a sound superset. Complete for bounded (non-cyclic) /// phonology; an unbounded self-feeding cycle is not a regular relation and simply will not certify. + /// + /// Thread-safe. The inverse cascade is compiled once against a private <see cref="Morpher"/> + /// with its own <see cref="TraceManager"/> (not the factory's), and each <see cref="AnalyzeWord"/> + /// applies it to a fresh local <see cref="Word"/> — no per-call mutation of shared state — so the + /// composite stays safe on the parallel path the factories advertise. /// public class ComposedPhonologyProposer : IConstructProposer { @@ -30,11 +35,15 @@ public class ComposedPhonologyProposer : IConstructProposer private readonly LinearRuleCascade _inverse; private readonly bool _hasPhonology; - public ComposedPhonologyProposer(Language language, Morpher morpher, FstTemplateAnalyzer fst) + public ComposedPhonologyProposer(Language language, FstTemplateAnalyzer fst) { _fst = fst; _surfaceStratum = language.SurfaceStratum; _table = language.SurfaceStratum.CharacterDefinitionTable; + // Compile against a private Morpher with its own TraceManager — the analysis rules read + // _morpher.TraceManager (and the morpher's selectors), so this proposer must not share the + // factory's morpher (mirrors MorpherPool giving each rented morpher its own TraceManager). 
+ var morpher = new Morpher(new TraceManager(), language); // Inverse order mirrors AnalysisLanguageRule/AnalysisStratumRule: strata surface→inner, and // within each stratum the synthesis rules are un-applied in reverse application order. var rules = new List>(); diff --git a/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs b/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs index 055e2eef..7d5bcca9 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/CompositeProposer.cs @@ -51,13 +51,13 @@ public CompositeProposer(FstTemplateAnalyzer fst, params IConstructProposer[] ge /// (it holds no rules and yields nothing — the phonology proposer short-circuits when the grammar /// has no phonological rules), so this adds near-zero overhead and does not change behavior; that /// is why the factories wire it unconditionally rather than as an opt-in. - public static CompositeProposer ForLanguage(Language language, Morpher morpher, FstTemplateAnalyzer fst) + public static CompositeProposer ForLanguage(Language language, FstTemplateAnalyzer fst) { return new CompositeProposer( fst, new ReduplicationProposer(language, fst), new InfixProposer(language, fst), - new ComposedPhonologyProposer(language, morpher, fst) + new ComposedPhonologyProposer(language, fst) ); } diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs index 756ea814..d262228e 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/VerifiedFstAnalyzerTests.cs @@ -212,7 +212,7 @@ public void ComposedPhonology_CoversCrossBoundaryAlternation_WherePrecompileMiss var fst = new FstTemplateAnalyzer(Language, new Morpher(TraceManager, Language)); Assert.That(fst.AnalyzeWord("sakt"), Is.Empty, "baseline: per-morpheme precompile misses cross-boundary 
'sakt'"); - var composed = new ComposedPhonologyProposer(Language, new Morpher(TraceManager, Language), fst); + var composed = new ComposedPhonologyProposer(Language, fst); var pool = new MorpherPool(() => new Morpher(new TraceManager(), Language)); IMorphologicalAnalyzer verified = new VerifiedFstAnalyzer(new CompositeProposer(fst, composed), pool); AnalysisComparison cmp = FstVerification.Compare(search, verified, new[] { "sakt" }); @@ -483,6 +483,61 @@ public void CompleteHybrid_WiresGenerators_ReduplicatingGrammarCertifiesAndMatch } } + [Test] + public void Composite_WithPhonologyAndReduplication_ParallelMatchesSequential() + { + // Thread-safety on the concurrent path: the composite now runs HC's phonology inverse + // (ComposedPhonologyProposer) and the reduplication generator at analyze time. Drive both + // through the production factory in parallel and assert no divergence / no exceptions. + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + var redup = new AffixProcessRule + { + Name = "redup", + Gloss = "RED", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + OutSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value, + }; + redup.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new CopyFromInput("1") }, + } + ); + Morphophonemic.MorphologicalRules.Add(redup); + var tToD = new RewriteRule + { + Name = "t_to_d", + Lhs = Pattern.New().Annotation(Character(Table1, "t")).Value, + }; + tToD.Subrules.Add( + new RewriteSubrule { Rhs = Pattern.New().Annotation(Character(Table1, "d")).Value } + ); + Surface.PhonologicalRules.Add(tToD); + try + { + var complete = CompleteHybridMorpher.FromLanguage(TraceManager, Language, new[] { "sag", "dad" }); + var corpus = new List(); + for (int i = 0; i < 50; i++) + { + corpus.AddRange(new[] { "sag", "sagsag", "dad", 
"daddad", "sad", "zzz" }); + } + Dictionary sequential = corpus.Distinct().ToDictionary(w => w, w => SigSet(complete, w)); + var parallel = new ConcurrentDictionary(); + Parallel.ForEach(corpus, w => parallel[w] = SigSet(complete, w)); + Assert.That( + corpus.Distinct().All(w => parallel[w] == sequential[w]), + Is.True, + "concurrent analyses diverged from sequential (composite phonology/redup not thread-safe)" + ); + } + finally + { + Surface.PhonologicalRules.Remove(tToD); + Morphophonemic.MorphologicalRules.Remove(redup); + } + } + private static string Sig(WordAnalysis a) => string.Join("+", a.Morphemes.Select(m => (m as Morpheme)?.Gloss ?? "?")) + ":" + a.RootMorphemeIndex;