Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ flowchart LR
| **Apache-2.0, end to end** | Every runtime dep is OSI-approved permissive. No PolyForm, BSL, Commons Clause, Elastic v2, GPL, or AGPL. You can fork, embed, and ship commercial products on top without a license-review detour. |
| **Local-first, offline-capable** | `codehub analyze --offline` opens zero sockets. Your code never leaves your machine. No telemetry. |
| **Deterministic indexing** | Identical inputs produce a byte-identical graph hash. Reproducible. Auditable. Cacheable in CI. |
| **First-party source only** | `analyze` honors the repo's `.gitignore` (nested files included) and always skips dependency installs, virtualenvs, build output, and tool caches — `node_modules`, `.venv`/`venv`, `__pycache__`, `dist`/`build`/`target`, `.next`/`.nuxt`/`.turbo`, `.mypy_cache`/`.pytest_cache`/`.ruff_cache`, `coverage`, and similar. Exclusion is decided once at scan time (`HARDCODED_IGNORES` in `packages/ingestion/src/pipeline/gitignore.ts`), so every retrieval surface — `query`, `context`, `impact`, `sql`, `pack` — inherits it. Ambiguous names that are often real source (`vendor`, `env`, `out`, `bin`) are left to your `.gitignore`, which supports `!`-negation for re-inclusion; a hardcoded rule cannot be overridden. |
| **MCP-native** | Works out-of-the-box with Claude Code, Cursor, Codex, Windsurf, OpenCode. The MCP server is the primary interface; CLI exists for scripts and CI. |
| **Single-file embedded storage** | One `store.sqlite` file holds everything — symbols, edges, embeddings, BM25 (FTS5) + HNSW traversal, and the temporal views (cochanges, summaries) — via Node's built-in `node:sqlite`. No daemon, no database to operate, and **zero native storage bindings** (ADR 0019 removed both `@ladybugdb/core` and `@duckdb/node-api`). |
| **15 languages at GA** | TypeScript, JavaScript, Python, Go, Rust, Java, C#, C, C++, Ruby, Kotlin, Swift, PHP, Dart, COBOL — tree-sitter for the first 14 plus a regex provider for fixed-format COBOL. |
Expand Down
48 changes: 47 additions & 1 deletion packages/ingestion/src/pipeline/gitignore.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,53 @@ import { mkdtemp, rm } from "node:fs/promises";
import { tmpdir } from "node:os";
import path from "node:path";
import { test } from "node:test";
import { loadGitignoreChain, parseGitignore, shouldIgnore } from "./gitignore.js";
import {
HARDCODED_IGNORES,
loadGitignoreChain,
parseGitignore,
shouldIgnore,
} from "./gitignore.js";

test("HARDCODED_IGNORES covers the well-known dependency/virtualenv/build/cache dirs", () => {
  const hardcoded = new Set(HARDCODED_IGNORES);

  // Operator contract: these directories are excluded unconditionally, with
  // or without a .gitignore. Pinning the names here catches an accidental
  // removal from the list. Both `.venv` and the bare `venv` are listed
  // because both are real virtualenv layouts.
  const mustHave = [
    "node_modules",
    "bower_components",
    ".venv",
    "venv",
    "__pycache__",
    ".tox",
    ".mypy_cache",
    ".pytest_cache",
    ".ruff_cache",
    "dist",
    "build",
    "target",
    ".next",
    ".nuxt",
    ".turbo",
    "coverage",
    ".git",
  ];
  mustHave.forEach((dir) => {
    assert.ok(hardcoded.has(dir), `HARDCODED_IGNORES must contain "${dir}"`);
  });

  // `vendor` must stay OUT of the hardcoded list: the name is ambiguous, and
  // a hardcoded ignore cannot be re-included through a .gitignore `!`
  // negation. Excluding vendor directories is the repo's own .gitignore's job.
  assert.ok(!hardcoded.has("vendor"), "vendor must NOT be hardcoded (left to .gitignore)");

  // Shape invariants: every entry is a unique, literal, single path segment.
  assert.equal(
    hardcoded.size,
    HARDCODED_IGNORES.length,
    "HARDCODED_IGNORES must not contain duplicates",
  );
  HARDCODED_IGNORES.forEach((entry) => {
    assert.ok(!entry.includes("/"), `"${entry}" must be a bare directory segment, not a path`);
    assert.ok(!entry.includes("*"), `"${entry}" must be a literal name, not a glob`);
  });
});

test("loadGitignoreChain: root file only — returns a single-entry map", async () => {
const repo = await mkdtemp(path.join(tmpdir(), "och-gi-root-"));
Expand Down
49 changes: 44 additions & 5 deletions packages/ingestion/src/pipeline/gitignore.ts
Original file line number Diff line number Diff line change
Expand Up @@ -221,20 +221,59 @@ async function loadDir(
}
}

/**
 * Hardcoded directory names we always skip, even absent a `.gitignore`.
 *
 * The scan walker tests each directory entry's *name* against this set
 * (see `phases/scan.ts` — `hardcoded.has(name)`), so a bare name like
 * `node_modules` or `venv` excludes that directory at ANY depth, not just
 * the repo root. Entries are exact path segments, not globs, and each name
 * must appear exactly once.
 *
 * Scope: dependency installs, language/tool virtualenvs, build output, and
 * tool caches — directories that never hold first-party source. We
 * deliberately exclude ambiguous names that are commonly real source/config
 * directories: `env` (often a config module, not only a virtualenv),
 * `out`/`bin`/`obj` (frequently first-party), and `vendor` (Go/PHP/Ruby use
 * it for third-party deps, but it is also a common name for vendored
 * first-party source — this repo keeps source at
 * `packages/ingestion/src/pipeline/phases/vendor/`). Those are left to a
 * repo's own `.gitignore`, which supports `!negation` for re-inclusion; a
 * hardcoded ignore cannot be overridden.
 */
export const HARDCODED_IGNORES: readonly string[] = [
  // Version-control metadata.
  ".git",
  ".svn",
  ".hg",
  // OpenCodeHub's own index / meta directory.
  META_DIR_NAME,
  // JavaScript / TypeScript dependencies, package-manager stores, and caches.
  "node_modules",
  "bower_components",
  ".pnpm-store",
  ".yarn",
  // Python virtualenvs, bytecode caches, and tool caches.
  ".venv",
  "venv",
  "__pycache__",
  ".tox",
  ".mypy_cache",
  ".pytest_cache",
  ".ruff_cache",
  // Build / compiler output.
  "dist",
  "build",
  "target",
  // Framework build output, bundler + build-tool caches.
  ".next",
  ".nuxt",
  ".turbo",
  ".gradle",
  ".parcel-cache",
  ".cache",
  // Test / coverage output.
  "coverage",
  // Editor / IDE settings (no first-party source).
  ".idea",
  ".vscode",
];
109 changes: 109 additions & 0 deletions packages/ingestion/src/pipeline/phases/scan.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import path from "node:path";
import { after, before, describe, it } from "node:test";
import { promisify } from "node:util";
import { KnowledgeGraph } from "@opencodehub/core-types";
import { HARDCODED_IGNORES } from "../gitignore.js";
import type { PipelineContext } from "../types.js";
import { scanPhase } from "./scan.js";

Expand Down Expand Up @@ -59,6 +60,114 @@ describe("scanPhase", () => {
assert.ok(!rels.includes("blob.bin"), "binary files must be skipped");
});

it("skips every HARDCODED_IGNORES directory at the repo root and nested", async () => {
  // Fixture layout: one legitimate file at the root, plus — for every
  // hardcoded-ignore name — a `leaf.ts` directly under `<name>/` and another
  // under `src/<name>/`. The scan must return the legitimate file and none of
  // the leaves.
  const repoRoot = await mkdtemp(path.join(tmpdir(), "och-scan-hardcoded-"));
  const passesThrough = (rel: string, segment: string): boolean =>
    rel.split("/").includes(segment);
  try {
    await fs.writeFile(path.join(repoRoot, "real.ts"), "export const R = 1;\n");
    for (const ignored of HARDCODED_IGNORES) {
      const placements: Array<[string, string]> = [
        // Root-level occurrence: <name>/leaf.ts
        [path.join(repoRoot, ignored), "export const X = 1;\n"],
        // Nested occurrence: src/<name>/leaf.ts — exercises matching at depth.
        [path.join(repoRoot, "src", ignored), "export const Y = 2;\n"],
      ];
      for (const [dir, contents] of placements) {
        await fs.mkdir(dir, { recursive: true });
        await fs.writeFile(path.join(dir, "leaf.ts"), contents);
      }
    }

    const ctx: PipelineContext = {
      repoPath: repoRoot,
      options: { skipGit: true },
      graph: new KnowledgeGraph(),
      phaseOutputs: new Map(),
    };
    const result = await scanPhase.run(ctx, new Map());
    const scanned = result.files.map((file) => file.relPath);

    // Positive control: real source is still picked up.
    assert.ok(scanned.includes("real.ts"), "first-party source must be kept");

    // Negative check: no returned path has an ignored name as any segment.
    for (const ignored of HARDCODED_IGNORES) {
      const offenders = scanned.filter((rel) => passesThrough(rel, ignored));
      assert.deepEqual(
        offenders,
        [],
        `no scanned path may pass through "${ignored}/" — found: ${offenders.join(", ")}`,
      );
    }
  } finally {
    await rm(repoRoot, { recursive: true, force: true });
  }
});

it("excludes venv/ and node_modules/ specifically, at root and nested", async () => {
  // Regression guard for the operator requirement that virtualenvs (both
  // `.venv` and the bare `venv`) and `node_modules` never reach the index,
  // whether they sit at the repo root or inside a sub-project.
  const repoRoot = await mkdtemp(path.join(tmpdir(), "och-scan-venv-"));
  try {
    const dependencyFiles = [
      "venv/lib/site.py",
      ".venv/lib/site.py",
      "node_modules/pkg/index.js",
      "backend/venv/lib/dep.py",
      "frontend/node_modules/pkg/index.js",
    ];
    for (const rel of dependencyFiles) {
      const target = path.join(repoRoot, rel);
      await fs.mkdir(path.dirname(target), { recursive: true });
      await fs.writeFile(target, "x\n");
    }
    // The single first-party file that must survive the scan.
    await fs.writeFile(path.join(repoRoot, "app.py"), "print('hi')\n");

    const ctx: PipelineContext = {
      repoPath: repoRoot,
      options: { skipGit: true },
      graph: new KnowledgeGraph(),
      phaseOutputs: new Map(),
    };
    const result = await scanPhase.run(ctx, new Map());
    const scanned = result.files.map((file) => file.relPath);

    assert.ok(scanned.includes("app.py"), "first-party source must be kept");
    for (const seg of ["venv", ".venv", "node_modules"]) {
      const leaked = scanned.some((rel) => rel.split("/").includes(seg));
      assert.ok(!leaked, `"${seg}/" content must never appear in scan output`);
    }
  } finally {
    await rm(repoRoot, { recursive: true, force: true });
  }
});

it("excludes a user-.gitignore'd directory end-to-end through the scan phase", async () => {
  // End-to-end .gitignore check: `generated/` and `secret.key` are not in the
  // hardcoded list, so they can only be excluded if the scan phase honors the
  // repo's own .gitignore.
  const repoRoot = await mkdtemp(path.join(tmpdir(), "och-scan-gitignore-"));
  try {
    const rootFiles: Array<[string, string]> = [
      [".gitignore", "generated/\nsecret.key\n"],
      ["main.ts", "export const M = 1;\n"],
      ["secret.key", "shh\n"],
    ];
    for (const [fileName, contents] of rootFiles) {
      await fs.writeFile(path.join(repoRoot, fileName), contents);
    }
    const deepDir = path.join(repoRoot, "generated", "deep");
    await fs.mkdir(deepDir, { recursive: true });
    await fs.writeFile(path.join(deepDir, "g.ts"), "export const G = 1;\n");

    const ctx: PipelineContext = {
      repoPath: repoRoot,
      options: { skipGit: true },
      graph: new KnowledgeGraph(),
      phaseOutputs: new Map(),
    };
    const result = await scanPhase.run(ctx, new Map());
    const scanned = result.files.map((file) => file.relPath);

    assert.ok(scanned.includes("main.ts"), "tracked source must be kept");
    assert.ok(!scanned.includes("secret.key"), ".gitignore file pattern must be honored");
    const nothingUnderGenerated = scanned.every((rel) => !rel.startsWith("generated/"));
    assert.ok(
      nothingUnderGenerated,
      ".gitignore directory pattern must be honored at scan time",
    );
  } finally {
    await rm(repoRoot, { recursive: true, force: true });
  }
});

it("emits deterministic sha256 for each file", async () => {
const one = await scanPhase.run(makeCtx(), new Map());
const two = await scanPhase.run(makeCtx(), new Map());
Expand Down
Loading