diff --git a/README.md b/README.md index d354466..05e19d4 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,7 @@ flowchart LR | **Apache-2.0, end to end** | Every runtime dep is OSI-approved permissive. No PolyForm, BSL, Commons Clause, Elastic v2, GPL, or AGPL. You can fork, embed, and ship commercial products on top without a license-review detour. | | **Local-first, offline-capable** | `codehub analyze --offline` opens zero sockets. Your code never leaves your machine. No telemetry. | | **Deterministic indexing** | Identical inputs produce a byte-identical graph hash. Reproducible. Auditable. Cacheable in CI. | +| **First-party source only** | `analyze` honors the repo's `.gitignore` (nested files included) and always skips dependency installs, virtualenvs, build output, and tool caches — `node_modules`, `.venv`/`venv`, `__pycache__`, `dist`/`build`/`target`, `.next`/`.nuxt`/`.turbo`, `.mypy_cache`/`.pytest_cache`/`.ruff_cache`, `coverage`, and similar. Exclusion is decided once at scan time (`HARDCODED_IGNORES` in `packages/ingestion/src/pipeline/gitignore.ts`), so every retrieval surface — `query`, `context`, `impact`, `sql`, `pack` — inherits it. Ambiguous names that are often real source (`vendor`, `env`, `out`, `bin`) are left to your `.gitignore`, which supports `!`-negation a hardcoded rule can't. | | **MCP-native** | Works out-of-the-box with Claude Code, Cursor, Codex, Windsurf, OpenCode. The MCP server is the primary interface; CLI exists for scripts and CI. | | **Single-file embedded storage** | One `store.sqlite` file holds everything — symbols, edges, embeddings, BM25 (FTS5) + HNSW traversal, and the temporal views (cochanges, summaries) — via Node's built-in `node:sqlite`. No daemon, no database to operate, and **zero native storage bindings** (ADR 0019 removed both `@ladybugdb/core` and `@duckdb/node-api`). 
| | **15 languages at GA** | TypeScript, JavaScript, Python, Go, Rust, Java, C#, C, C++, Ruby, Kotlin, Swift, PHP, Dart, COBOL — tree-sitter for the first 14 plus a regex provider for fixed-format COBOL. | diff --git a/packages/ingestion/src/pipeline/gitignore.test.ts b/packages/ingestion/src/pipeline/gitignore.test.ts index 5b5c159..92b62b7 100644 --- a/packages/ingestion/src/pipeline/gitignore.test.ts +++ b/packages/ingestion/src/pipeline/gitignore.test.ts @@ -12,7 +12,53 @@ import { mkdtemp, rm } from "node:fs/promises"; import { tmpdir } from "node:os"; import path from "node:path"; import { test } from "node:test"; -import { loadGitignoreChain, parseGitignore, shouldIgnore } from "./gitignore.js"; +import { + HARDCODED_IGNORES, + loadGitignoreChain, + parseGitignore, + shouldIgnore, +} from "./gitignore.js"; + +test("HARDCODED_IGNORES covers the well-known dependency/virtualenv/build/cache dirs", () => { + // Operator contract: venv + node_modules + similar are always excluded, + // even with no .gitignore present. Guards against silent regression of the + // list. Plain `venv` is as load-bearing as `.venv` (both are real + // virtualenv layouts). + const required = [ + "node_modules", + "bower_components", + ".venv", + "venv", + "__pycache__", + ".tox", + ".mypy_cache", + ".pytest_cache", + ".ruff_cache", + "dist", + "build", + "target", + ".next", + ".nuxt", + ".turbo", + "coverage", + ".git", + ]; + const set = new Set(HARDCODED_IGNORES); + for (const name of required) { + assert.ok(set.has(name), `HARDCODED_IGNORES must contain "${name}"`); + } + // `vendor` is intentionally NOT hardcoded — it is ambiguous (real vendored + // first-party source lives under a vendor/ path in this very repo). A + // hardcoded ignore cannot be re-included via .gitignore !negation, so we + // leave vendor exclusion to the repo's own .gitignore. 
+ assert.ok(!set.has("vendor"), "vendor must NOT be hardcoded (left to .gitignore)"); + // The list is exact path segments — no globs, no slashes, no duplicates. + assert.equal(set.size, HARDCODED_IGNORES.length, "HARDCODED_IGNORES must not contain duplicates"); + for (const name of HARDCODED_IGNORES) { + assert.ok(!name.includes("/"), `"${name}" must be a bare directory segment, not a path`); + assert.ok(!name.includes("*"), `"${name}" must be a literal name, not a glob`); + } +}); test("loadGitignoreChain: root file only — returns a single-entry map", async () => { const repo = await mkdtemp(path.join(tmpdir(), "och-gi-root-")); diff --git a/packages/ingestion/src/pipeline/gitignore.ts b/packages/ingestion/src/pipeline/gitignore.ts index d95f8a9..c8690c8 100644 --- a/packages/ingestion/src/pipeline/gitignore.ts +++ b/packages/ingestion/src/pipeline/gitignore.ts @@ -221,20 +221,59 @@ async function loadDir( } } -/** Hardcoded directory names we always skip, even absent a `.gitignore`. */ +/** + * Hardcoded directory names we always skip, even absent a `.gitignore`. + * + * The scan walker tests each directory entry's *name* against this set + * (see `phases/scan.ts` — `hardcoded.has(name)`), so a bare name like + * `node_modules` or `venv` excludes that directory at ANY depth, not just + * the repo root. Entries are exact path segments, not globs. + * + * Scope: dependency installs, language/tool virtualenvs, build output, and + * tool caches — directories that never hold first-party source. We + * deliberately omit from this list ambiguous names that are commonly real source/config + * directories: `env` (often a config module, not only a virtualenv), + * `out`/`bin`/`obj` (frequently first-party), and `vendor` (Go/PHP/Ruby use + * it for third-party deps, but it is also a common name for vendored + * first-party source — this repo keeps source at + * `packages/ingestion/src/pipeline/phases/vendor/`).
Those are left to a + * repo's own `.gitignore`, which supports `!negation` for re-inclusion; a + * hardcoded ignore cannot be overridden. + */ export const HARDCODED_IGNORES: readonly string[] = [ - "node_modules", + // Version-control metadata. ".git", ".svn", ".hg", - "dist", - "build", - "target", + // OpenCodeHub's own index / meta directory. META_DIR_NAME, + // JavaScript / TypeScript dependencies, package-manager stores, and caches. + "node_modules", + "bower_components", + ".pnpm-store", + ".yarn", + // Python virtualenvs, bytecode caches, and tool caches. ".venv", + "venv", "__pycache__", + ".tox", + ".mypy_cache", + ".pytest_cache", + ".ruff_cache", + // Build / compiler output. + "dist", + "build", + "target", + // Framework build output, bundler + build-tool caches. ".next", ".nuxt", ".turbo", + ".gradle", + ".parcel-cache", + ".cache", + // Test / coverage output. "coverage", + // Editor / IDE settings (no first-party source). + ".idea", + ".vscode", ]; diff --git a/packages/ingestion/src/pipeline/phases/scan.test.ts b/packages/ingestion/src/pipeline/phases/scan.test.ts index 6993062..bdb3e2f 100644 --- a/packages/ingestion/src/pipeline/phases/scan.test.ts +++ b/packages/ingestion/src/pipeline/phases/scan.test.ts @@ -7,6 +7,7 @@ import path from "node:path"; import { after, before, describe, it } from "node:test"; import { promisify } from "node:util"; import { KnowledgeGraph } from "@opencodehub/core-types"; +import { HARDCODED_IGNORES } from "../gitignore.js"; import type { PipelineContext } from "../types.js"; import { scanPhase } from "./scan.js"; @@ -59,6 +60,114 @@ describe("scanPhase", () => { assert.ok(!rels.includes("blob.bin"), "binary files must be skipped"); }); + it("skips every HARDCODED_IGNORES directory at the repo root and nested", async () => { + // Build a repo where each hardcoded-ignore name appears both at the root + // and one level deep, each holding a source file the scan would otherwise + // pick up. 
None of those files may appear in the scan output. + const fixture = await mkdtemp(path.join(tmpdir(), "och-scan-hardcoded-")); + try { + await fs.writeFile(path.join(fixture, "real.ts"), "export const R = 1;\n"); + for (const name of HARDCODED_IGNORES) { + // Root-level: ${name}/leaf.ts + const rootDir = path.join(fixture, name); + await fs.mkdir(rootDir, { recursive: true }); + await fs.writeFile(path.join(rootDir, "leaf.ts"), "export const X = 1;\n"); + // Nested: src/${name}/leaf.ts — proves per-segment matching at depth. + const nestedDir = path.join(fixture, "src", name); + await fs.mkdir(nestedDir, { recursive: true }); + await fs.writeFile(path.join(nestedDir, "leaf.ts"), "export const Y = 2;\n"); + } + const ctx: PipelineContext = { + repoPath: fixture, + options: { skipGit: true }, + graph: new KnowledgeGraph(), + phaseOutputs: new Map(), + }; + const out = await scanPhase.run(ctx, new Map()); + const rels = out.files.map((f) => f.relPath); + // The one legitimate source file survives. + assert.ok(rels.includes("real.ts"), "first-party source must be kept"); + // No kept path may traverse any hardcoded-ignore directory, at any depth. + for (const name of HARDCODED_IGNORES) { + const offenders = rels.filter((r) => r.split("/").includes(name)); + assert.deepEqual( + offenders, + [], + `no scanned path may pass through "${name}/" — found: ${offenders.join(", ")}`, + ); + } + } finally { + await rm(fixture, { recursive: true, force: true }); + } + }); + + it("excludes venv/ and node_modules/ specifically, at root and nested", async () => { + // Regression guard for the operator requirement: virtualenvs (.venv AND + // the bare `venv` name) and node_modules must never enter the index.
+ const fixture = await mkdtemp(path.join(tmpdir(), "och-scan-venv-")); + try { + const layouts = [ + "venv/lib/site.py", + ".venv/lib/site.py", + "node_modules/pkg/index.js", + "backend/venv/lib/dep.py", + "frontend/node_modules/pkg/index.js", + ]; + for (const rel of layouts) { + const abs = path.join(fixture, rel); + await fs.mkdir(path.dirname(abs), { recursive: true }); + await fs.writeFile(abs, "x\n"); + } + await fs.writeFile(path.join(fixture, "app.py"), "print('hi')\n"); + const ctx: PipelineContext = { + repoPath: fixture, + options: { skipGit: true }, + graph: new KnowledgeGraph(), + phaseOutputs: new Map(), + }; + const out = await scanPhase.run(ctx, new Map()); + const rels = out.files.map((f) => f.relPath); + assert.ok(rels.includes("app.py"), "first-party source must be kept"); + for (const seg of ["venv", ".venv", "node_modules"]) { + assert.ok( + !rels.some((r) => r.split("/").includes(seg)), + `"${seg}/" content must never appear in scan output`, + ); + } + } finally { + await rm(fixture, { recursive: true, force: true }); + } + }); + + it("excludes a user-.gitignore'd directory end-to-end through the scan phase", async () => { + // .gitignore honoring on analyze: a directory the repo's own .gitignore + // excludes must not be scanned, even though it is not a hardcoded ignore. 
+ const fixture = await mkdtemp(path.join(tmpdir(), "och-scan-gitignore-")); + try { + await fs.writeFile(path.join(fixture, ".gitignore"), "generated/\nsecret.key\n"); + await fs.writeFile(path.join(fixture, "main.ts"), "export const M = 1;\n"); + await fs.writeFile(path.join(fixture, "secret.key"), "shh\n"); + await fs.mkdir(path.join(fixture, "generated", "deep"), { recursive: true }); + await fs.writeFile(path.join(fixture, "generated", "deep", "g.ts"), "export const G = 1;\n"); + const ctx: PipelineContext = { + repoPath: fixture, + options: { skipGit: true }, + graph: new KnowledgeGraph(), + phaseOutputs: new Map(), + }; + const out = await scanPhase.run(ctx, new Map()); + const rels = out.files.map((f) => f.relPath); + assert.ok(rels.includes("main.ts"), "tracked source must be kept"); + assert.ok(!rels.includes("secret.key"), ".gitignore file pattern must be honored"); + assert.ok( + !rels.some((r) => r.startsWith("generated/")), + ".gitignore directory pattern must be honored at scan time", + ); + } finally { + await rm(fixture, { recursive: true, force: true }); + } + }); + it("emits deterministic sha256 for each file", async () => { const one = await scanPhase.run(makeCtx(), new Map()); const two = await scanPhase.run(makeCtx(), new Map());