bm25 — Skillopedia

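# bm25

Ranked content search over any text corpus. One CLI, in-memory BM25 index per process, with a session-local disk cache so repeat invocations against the same corpus load in tens of milliseconds instead of rebuilding.

## Setup

```bash
uv pip install --system --break-system-packages bm25s
```

Install is sub-second on a warm uv cache. That's the entire dependency.

## Usage

```bash
BM25=/mnt/skills/user/bm25/scripts/bm25.py

# Local directory
python3 $BM25 ./repo 'csrf middleware'

# Multiple queries against the same in-memory index (build once, query many)
python3 $BM25 ./repo 'csrf middleware' 'session backend' 'queryset filter'

# Cloned GitHub repo via tarball (one HTTP call)
python3 $BM25 'github.com/django/django' 'atomic transaction'
python3 $BM25 'github.com/django/django@stable/5.0.x' 'atomic transaction'

# Project knowledge or uploads
python3 $BM25 project 'RAG scaling laws'
python3 $BM25 uploads 'tax loss harvesting'

# Filters
python3 $BM25 ./repo 'auth flow' --exclude 'tests/*' --exclude '*/tests/*'
python3 $BM25 ./repo 'config' --include '*.py' --include '*.toml'

# Interactive (REPL — single corpus, many queries)
python3 $BM25 ./repo --interactive

# JSON output for piping
python3 $BM25 ./repo 'auth flow' --json
```

## Corpus types

| Spec | Meaning |
|------|---------|
| `./path` or `/abs/path` | Local directory |
| `uploads` | `/mnt/user-data/uploads/` |
| `project` | `/mnt/project/` |
| `github.com/owner/repo[@ref]` | Tarball fetch via GitHub API (`GH_TOKEN` used if set) |

## Options

| Option | Default | Description |
|--------|---------|-------------|
| `--top-k N` | 10 | Results per query |
| `--include GLOB` | (auto) | Repeatable. If set, only files matc…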

)\n\n\ndef resolve_corpus(spec: str) -> Path:\n \"\"\"Resolve a CORPUS spec to a local directory. Downloads tarballs for github.com/... specs.\"\"\"\n if spec == 'uploads':\n return Path('/mnt/user-data/uploads')\n if spec == 'project':\n return Path('/mnt/project')\n m = GH_URL_RE.match(spec)\n if m:\n owner, repo, ref = m.group(1), m.group(2), m.group(3) or 'main'\n tmpdir = Path(tempfile.mkdtemp(prefix=f'bm25-{repo}-'))\n tar_path = tmpdir / f'{repo}.tar.gz'\n url = f'https://api.github.com/repos/{owner}/{repo}/tarball/{ref}'\n sys.stderr.write(f\"[bm25] fetching {url} ...\\n\")\n req = urllib.request.Request(url)\n token = os.environ.get('GH_TOKEN')\n if token:\n req.add_header('Authorization', f'token {token}')\n req.add_header('User-Agent', 'muninn-raven')\n with urllib.request.urlopen(req) as r, open(tar_path, 'wb') as f:\n while chunk := r.read(1 \u003c\u003c 20):\n f.write(chunk)\n extract_dir = tmpdir / 'extract'\n extract_dir.mkdir()\n with tarfile.open(tar_path) as tar:\n tar.extractall(extract_dir)\n # tarball top-level is owner-repo-sha; descend into it\n subs = list(extract_dir.iterdir())\n if len(subs) == 1 and subs[0].is_dir():\n return subs[0]\n return extract_dir\n p = Path(spec).expanduser().resolve()\n if not p.is_dir():\n sys.stderr.write(f\"[bm25] corpus not found or not a directory: {spec}\\n\")\n sys.exit(2)\n return p\n\n\n# ---------- index ----------\n\nCACHE_ROOT = Path('/home/claude/.bm25-cache')\n\n\ndef cache_key(corpus_root: Path, include, exclude, max_bytes: int) -> str:\n \"\"\"Stable 16-hex hash of the inputs that fully determine an index.\n\n Same key → identical index. 
Changing any of: corpus root path,\n include/exclude globs, or max_bytes → new key → cache miss → rebuild.\n \"\"\"\n h = hashlib.sha256()\n h.update(str(corpus_root.resolve()).encode())\n h.update(b'|')\n h.update(','.join(sorted(include)).encode())\n h.update(b'|')\n h.update(','.join(sorted(exclude)).encode())\n h.update(b'|')\n h.update(str(max_bytes).encode())\n return h.hexdigest()[:16]\n\n\nclass CorpusIndex:\n \"\"\"In-memory BM25 index over a corpus. Optional session-local disk cache.\n\n The cache lives at /home/claude/.bm25-cache/\u003ckey>/ which is ephemeral\n (dies with the container session). The key is a hash of the index-\n determining inputs (corpus path + filters + size cap), so changing any\n of those naturally invalidates. Same key + same session → load in\n ~100–300ms instead of rebuilding in seconds.\n \"\"\"\n\n def __init__(self, root: Path, paths, docs, retriever, build_s: float, source: str):\n self.root = root\n self.paths = paths\n self.docs = docs\n self.retriever = retriever\n self.build_s = build_s\n self.source = source # 'built' or 'cache'\n\n @classmethod\n def build(cls, corpus_root: Path, include, exclude, max_bytes: int):\n \"\"\"Walk the corpus, tokenize, index. 
Returns a new CorpusIndex.\"\"\"\n t0 = time.time()\n paths, docs = [], []\n for rel, text in discover_files(corpus_root, include, exclude, max_bytes):\n paths.append(rel)\n docs.append(text)\n tokens = bm25s.tokenize(docs, stopwords=None, show_progress=False)\n retriever = bm25s.BM25()\n retriever.index(tokens, show_progress=False)\n return cls(corpus_root, paths, docs, retriever, time.time() - t0, source='built')\n\n def save(self, cache_dir: Path):\n \"\"\"Persist to disk so the next invocation can load instead of rebuild.\"\"\"\n cache_dir.mkdir(parents=True, exist_ok=True)\n self.retriever.save(str(cache_dir / 'bm25'))\n with open(cache_dir / 'corpus.pkl', 'wb') as f:\n pickle.dump({'paths': self.paths, 'docs': self.docs}, f)\n with open(cache_dir / 'manifest.json', 'w') as f:\n json.dump({\n 'corpus_root': str(self.root),\n 'files_count': len(self.paths),\n 'built_at': datetime.now(timezone.utc).isoformat(),\n 'build_s': round(self.build_s, 3),\n }, f, indent=2)\n\n @classmethod\n def load(cls, cache_dir: Path, corpus_root: Path):\n \"\"\"Try to load a cached index. 
Returns CorpusIndex on success, None on any failure.\"\"\"\n try:\n mf_path = cache_dir / 'manifest.json'\n if not mf_path.exists():\n return None\n t0 = time.time()\n retriever = bm25s.BM25.load(str(cache_dir / 'bm25'))\n with open(cache_dir / 'corpus.pkl', 'rb') as f:\n corp = pickle.load(f)\n with open(mf_path) as f:\n manifest = json.load(f)\n idx = cls(corpus_root, corp['paths'], corp['docs'], retriever,\n time.time() - t0, source='cache')\n idx.manifest = manifest\n return idx\n except Exception:\n return None\n\n def query(self, q: str, k: int = 10):\n tokens = bm25s.tokenize([q], stopwords=None, show_progress=False)\n idxs, scores = self.retriever.retrieve(tokens, k=min(k, len(self.docs)), show_progress=False)\n return [(int(i), float(s)) for i, s in zip(idxs[0], scores[0])]\n\n\ndef get_or_build_index(corpus_root: Path, include, exclude, max_bytes: int, use_cache: bool):\n \"\"\"Cache-aware index lookup. Returns (CorpusIndex, cache_dir_or_None, key).\"\"\"\n key = cache_key(corpus_root, include, exclude, max_bytes)\n cache_dir = CACHE_ROOT / key\n if use_cache:\n loaded = CorpusIndex.load(cache_dir, corpus_root)\n if loaded is not None:\n return loaded, cache_dir, key\n idx = CorpusIndex.build(corpus_root, include, exclude, max_bytes)\n if use_cache:\n try:\n idx.save(cache_dir)\n except Exception as e:\n sys.stderr.write(f\"[bm25] warn: failed to write cache ({e}); continuing\\n\")\n return idx, (cache_dir if use_cache else None), key\n\n\n# ---------- snippet extraction ----------\n\ndef best_snippet(doc: str, q: str, lines: int = 3) -> str:\n \"\"\"Pick the best matching span — line containing the most query tokens, with context.\"\"\"\n if lines \u003c= 0:\n return ''\n q_terms = {t.lower() for t in re.findall(r'\\w+', q) if len(t) > 1}\n if not q_terms:\n return doc[:200] + ('...' 
if len(doc) > 200 else '')\n doc_lines = doc.splitlines()\n best_i, best_score = 0, -1\n for i, line in enumerate(doc_lines):\n line_terms = {t.lower() for t in re.findall(r'\\w+', line)}\n hits = len(line_terms & q_terms)\n if hits > best_score:\n best_score = hits\n best_i = i\n half = max(0, lines // 2)\n lo = max(0, best_i - half)\n hi = min(len(doc_lines), best_i + half + 1)\n return '\\n'.join(doc_lines[lo:hi])\n\n\n# ---------- output ----------\n\ndef print_results(idx: CorpusIndex, q: str, hits, snippet_lines: int, as_json: bool):\n if as_json:\n out = {'query': q, 'results': [\n {'path': idx.paths[i], 'score': s, 'snippet': best_snippet(idx.docs[i], q, snippet_lines)}\n for i, s in hits\n ]}\n print(json.dumps(out, indent=2))\n return\n print(f\"\\nQUERY: {q}\")\n print(\"-\" * 70)\n for rank, (i, s) in enumerate(hits, 1):\n path = idx.paths[i]\n snip = best_snippet(idx.docs[i], q, snippet_lines)\n snip_disp = ''\n if snip:\n snip_disp = '\\n ' + snip.replace('\\n', '\\n ')\n print(f\" {rank}. 
{s:6.2f} {path}{snip_disp}\")\n\n\n# ---------- CLI ----------\n\ndef main():\n ap = argparse.ArgumentParser(prog='bm25', description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)\n ap.add_argument('corpus', help='directory, \"uploads\", \"project\", or github.com/owner/repo[@ref]')\n ap.add_argument('queries', nargs='*', help='one or more queries to run')\n ap.add_argument('--top-k', type=int, default=10, help='results per query (default 10)')\n ap.add_argument('--include', action='append', default=[], help='glob to include (repeat for multiple)')\n ap.add_argument('--exclude', action='append', default=[], help='glob to exclude (repeat for multiple)')\n ap.add_argument('--snippet-lines', type=int, default=3, help='lines of snippet context (0 = none)')\n ap.add_argument('--max-file-bytes', type=int, default=2_000_000, help='skip files larger than this')\n ap.add_argument('--json', action='store_true', help='machine-readable output')\n ap.add_argument('--interactive', '-i', action='store_true', help='REPL: query, q, query, q, ... 
(one corpus, many queries)')\n ap.add_argument('--stats', action='store_true', help='print discover/index timings')\n ap.add_argument('--no-cache', action='store_true',\n help='bypass the session-local index cache at /home/claude/.bm25-cache/')\n args = ap.parse_args()\n\n if not args.queries and not args.interactive:\n ap.error('provide queries as positional args, or use --interactive')\n\n root = resolve_corpus(args.corpus)\n idx, cache_dir, key = get_or_build_index(\n root, args.include, args.exclude, args.max_file_bytes,\n use_cache=not args.no_cache,\n )\n if idx.source == 'cache':\n manifest = getattr(idx, 'manifest', {})\n built_at = manifest.get('built_at', '?')\n sys.stderr.write(f\"[bm25] cache HIT {key} ({len(idx.docs)} files, \"\n f\"loaded in {idx.build_s:.2f}s, built {built_at})\\n\")\n else:\n if cache_dir is not None:\n sys.stderr.write(f\"[bm25] cache MISS {key} → built {len(idx.docs)} files in {idx.build_s:.2f}s, saved to {cache_dir}\\n\")\n else:\n sys.stderr.write(f\"[bm25] no-cache: built {len(idx.docs)} files in {idx.build_s:.2f}s\\n\")\n if args.stats:\n print(json.dumps({\n 'files': len(idx.docs), 'source': idx.source,\n 'build_or_load_s': round(idx.build_s, 3),\n 'cache_key': key, 'cache_dir': str(cache_dir) if cache_dir else None,\n }, indent=2))\n\n for q in args.queries:\n hits = idx.query(q, k=args.top_k)\n print_results(idx, q, hits, args.snippet_lines, args.json)\n\n if args.interactive:\n sys.stderr.write(\"[bm25] interactive mode. 
type query, blank line to exit.\\n\")\n while True:\n try:\n q = input('bm25> ').strip()\n except (EOFError, KeyboardInterrupt):\n break\n if not q:\n break\n hits = idx.query(q, k=args.top_k)\n print_results(idx, q, hits, args.snippet_lines, args.json)\n\n\nif __name__ == '__main__':\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":14268,"content_sha256":"bac0b3fe2475ad04e1b310fa4d1ae74853e2a4619871497dd9c545d608105a0b"}],"content_json":{"type":"doc","content":[{"type":"heading","attrs":{"level":1},"content":[{"text":"bm25","type":"text"}]},{"type":"paragraph","content":[{"text":"Ranked content search over any text corpus. One CLI, in-memory BM25 index per process, with a session-local disk cache so repeat invocations against the same corpus load in tens of milliseconds instead of rebuilding.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Setup","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"uv pip install --system --break-system-packages bm25s","type":"text"}]},{"type":"paragraph","content":[{"text":"Install is sub-second on a warm uv cache. 
That's the entire dependency.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Usage","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"BM25=/mnt/skills/user/bm25/scripts/bm25.py\n\n# Local directory\npython3 $BM25 ./repo 'csrf middleware'\n\n# Multiple queries against the same in-memory index (build once, query many)\npython3 $BM25 ./repo 'csrf middleware' 'session backend' 'queryset filter'\n\n# Cloned GitHub repo via tarball (one HTTP call)\npython3 $BM25 'github.com/django/django' 'atomic transaction'\npython3 $BM25 'github.com/django/django@stable/5.0.x' 'atomic transaction'\n\n# Project knowledge or uploads\npython3 $BM25 project 'RAG scaling laws'\npython3 $BM25 uploads 'tax loss harvesting'\n\n# Filters\npython3 $BM25 ./repo 'auth flow' --exclude 'tests/*' --exclude '*/tests/*'\npython3 $BM25 ./repo 'config' --include '*.py' --include '*.toml'\n\n# Interactive (REPL — single corpus, many queries)\npython3 $BM25 ./repo --interactive\n\n# JSON output for piping\npython3 $BM25 ./repo 'auth flow' --json","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Corpus types","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Spec","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Meaning","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./path","type":"text","marks":[{"type":"code_inline"}]},{"text":" or 
","type":"text"},{"text":"/abs/path","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Local directory","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"uploads","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"/mnt/user-data/uploads/","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"project","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"/mnt/project/","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"github.com/owner/repo[@ref]","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tarball fetch via GitHub API (","type":"text"},{"text":"GH_TOKEN","type":"text","marks":[{"type":"code_inline"}]},{"text":" used if 
set)","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Options","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Option","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Default","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Description","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--top-k N","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"10","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Results per query","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--include GLOB","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"(auto)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Repeatable. 
If set, only files matching one of these globs are indexed","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--exclude GLOB","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph"}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Repeatable. Skip files matching these globs","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--snippet-lines N","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"3","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Lines of snippet context per hit (0 = none)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--max-file-bytes N","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"2,000,000","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Skip files larger than 
this","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--json","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph"}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Machine-readable output","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--interactive","type":"text","marks":[{"type":"code_inline"}]},{"text":" / ","type":"text"},{"text":"-i","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph"}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"REPL mode for ad-hoc querying within one session","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--stats","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph"}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Print discover + index timings as 
JSON","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--no-cache","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph"}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Bypass the session-local index cache; build in-memory only","type":"text"}]}]}]}]},{"type":"paragraph","content":[{"text":"With no ","type":"text"},{"text":"--include","type":"text","marks":[{"type":"code_inline"}]},{"text":", a default set of text/code extensions is indexed (Python, JS/TS, Go, Rust, Markdown, JSON, YAML, etc.). Standard noise dirs are skipped unconditionally: ","type":"text"},{"text":".git","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"node_modules","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"__pycache__","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":".venv","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"dist","type":"text","marks":[{"type":"code_inline"}]},{"text":", etc.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"When to use bm25","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Question 
shape","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tool","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"\"Find lines matching ","type":"text"},{"text":"class.*Error","type":"text","marks":[{"type":"code_inline"}]},{"text":"\"","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"grep","type":"text","marks":[{"type":"code_inline"}]},{"text":" / ripgrep","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"\"Show me where ","type":"text"},{"text":"parse_input","type":"text","marks":[{"type":"code_inline"}]},{"text":" is defined\"","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"tree-sitting","type":"text","marks":[{"type":"code_inline"}]},{"text":" (","type":"text"},{"text":"find:","type":"text","marks":[{"type":"code_inline"}]},{"text":"/","type":"text"},{"text":"source:","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"\"Which files are about CSRF handling?\"","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"bm25","type":"text","marks":[{"type":"strong"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"\"Rank these docs by relevance to 
'rate limiting strategies'\"","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"bm25","type":"text","marks":[{"type":"strong"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"\"What's the implementation of the atomic transaction context manager?\"","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"bm25","type":"text","marks":[{"type":"strong"}]},{"text":", then ","type":"text"},{"text":"tree-sitting source:","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"\"Find code by natural-language concept (in a code repo)\"","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"searching-codebases","type":"text","marks":[{"type":"code_inline"}]},{"text":" (which has its own TF-IDF mode)","type":"text"}]}]}]}]},{"type":"paragraph","content":[{"text":"The boundary with ","type":"text"},{"text":"searching-codebases","type":"text","marks":[{"type":"code_inline"}]},{"text":": that skill is code-specific (routes between regex and TF-IDF, expands via tree-sitting AST). ","type":"text"},{"text":"bm25","type":"text","marks":[{"type":"code_inline"}]},{"text":" is the simpler general-purpose tool — any corpus, no AST awareness, no routing. 
Prefer ","type":"text"},{"text":"searching-codebases","type":"text","marks":[{"type":"code_inline"}]},{"text":" for code; reach for ","type":"text"},{"text":"bm25","type":"text","marks":[{"type":"code_inline"}]},{"text":" when the corpus is mixed (docs + code), non-code (notes, transcripts, PDFs converted to text), or when you specifically want BM25's length-normalized scoring.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Design notes","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Session-local disk cache","type":"text","marks":[{"type":"strong"}]},{"text":" at ","type":"text"},{"text":"/home/claude/.bm25-cache/\u003ckey>/","type":"text","marks":[{"type":"code_inline"}]},{"text":". The key is a hash of ","type":"text"},{"text":"(resolved_corpus_path, include_globs, exclude_globs, max_file_bytes)","type":"text","marks":[{"type":"code_inline"}]},{"text":" — any change invalidates naturally. First invocation builds and saves; subsequent invocations against the same corpus and filters load in tens of milliseconds. The cache lives in ","type":"text"},{"text":"/home/claude","type":"text","marks":[{"type":"code_inline"}]},{"text":", which is ephemeral, so it expires at the session boundary — same lifetime as the corpus state itself, no cross-session staleness. ~5–35MB per cached index, depending on corpus size.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--no-cache","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" bypasses both load and save — useful only if you've mutated the corpus mid-session (rare) or want to confirm a rebuild matches.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Reuse within a single invocation.","type":"text","marks":[{"type":"strong"}]},{"text":" The retriever stays in memory between queries in one process. 
Passing multiple queries positionally, or using ","type":"text"},{"text":"--interactive","type":"text","marks":[{"type":"code_inline"}]},{"text":", amortizes any rebuild cost across queries.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"No AST awareness.","type":"text","marks":[{"type":"strong"}]},{"text":" Chunking is per-file. For symbol-level results in code, combine with ","type":"text"},{"text":"tree-sitting","type":"text","marks":[{"type":"code_inline"}]},{"text":" queries on the same paths.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Tokenizer.","type":"text","marks":[{"type":"strong"}]},{"text":" Default ","type":"text"},{"text":"bm25s.tokenize","type":"text","marks":[{"type":"code_inline"}]},{"text":" with stopwords disabled — over a small Django sample, AST-derived token streams (identifiers/strings/ comments only) gave near-identical rankings, so we don't bother.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Output format","type":"text"}]},{"type":"paragraph","content":[{"text":"Default (human-readable):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"QUERY: csrf middleware\n----------------------------------------------------------------------\n 1. 5.51 django/core/checks/security/csrf.py\n def _csrf_middleware():\n return \"django.middleware.csrf.CsrfViewMiddleware\" in settings.MIDDLEWARE\n 2. 
5.34 docs/howto/csrf.txt\n ...","type":"text"}]},{"type":"paragraph","content":[{"text":"--json","type":"text","marks":[{"type":"code_inline"}]},{"text":" produces ","type":"text"},{"text":"{\"query\": ..., \"results\": [{\"path\", \"score\", \"snippet\"}, ...]}","type":"text","marks":[{"type":"code_inline"}]},{"text":".","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Architecture","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"bm25.py CLI\n ├── resolve_corpus(spec) → local Path (downloads tarball if github.com/...)\n ├── cache_key(...) → 16-hex sha256 of inputs that determine the index\n ├── CorpusIndex.load(cache_dir) → returns cached index if present, else None\n ├── CorpusIndex.build(...) → walks files, tokenizes, indexes with bm25s\n ├── CorpusIndex.save(cache_dir) → persists to /home/claude/.bm25-cache/\u003ckey>/\n ├── query(q, k) → ranked (doc_idx, score) pairs\n └── best_snippet(doc, q, lines) → pick line w/ most query-term hits + context","type":"text"}]},{"type":"paragraph","content":[{"text":"Cache contents per directory:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"bm25/","type":"text","marks":[{"type":"code_inline"}]},{"text":" — bm25s.BM25.save() output (NumPy arrays + vocab)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"corpus.pkl","type":"text","marks":[{"type":"code_inline"}]},{"text":" — pickled ","type":"text"},{"text":"{paths, docs}","type":"text","marks":[{"type":"code_inline"}]},{"text":" so we can render snippets without re-reading the source files","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"manifest.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" — corpus root, files count, built_at timestamp","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"No network 
beyond optional tarball fetch on ","type":"text"},{"text":"github.com/...","type":"text","marks":[{"type":"code_inline"}]},{"text":" corpora. No state outside ","type":"text"},{"text":"/home/claude/","type":"text","marks":[{"type":"code_inline"}]},{"text":", which is ephemeral.","type":"text"}]}]},"metadata":{"date":"2026-06-05","name":"bm25","author":"@skillopedia","source":{"stars":124,"repo_name":"claude-skills","origin_url":"https://github.com/oaustegard/claude-skills/blob/HEAD/bm25/SKILL.md","repo_owner":"oaustegard","body_sha256":"95e484d1c089e9cfcd0c090190077224393054f0e87bf4a80c173b54559dea3b","cluster_key":"59a097210782ef9d6d06a8e3374d887880204bc990cccf1a49fe9250b6f45f96","clean_bundle":{"format":"clean-skill-bundle-v1","source":"oaustegard/claude-skills/bm25/SKILL.md","attachments":[{"id":"8ba8ecc5-8fba-5ae5-bf17-a09759a04c1b","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/8ba8ecc5-8fba-5ae5-bf17-a09759a04c1b/attachment.md","path":"CHANGELOG.md","size":2302,"sha256":"43efb075cd6b777eba982963509fd238e369f4592cc2adbc58e68a42ca2d983f","contentType":"text/markdown; charset=utf-8"},{"id":"897a291a-60d7-58fe-904d-e0748a37f04c","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/897a291a-60d7-58fe-904d-e0748a37f04c/attachment.md","path":"README.md","size":1377,"sha256":"ce78f5d361847fafa420725cf65aa1b33794f7d7011cc1ada67b7b1b015a026d","contentType":"text/markdown; charset=utf-8"},{"id":"7b07eac9-3aa6-52fb-8772-8c975ce7519d","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/7b07eac9-3aa6-52fb-8772-8c975ce7519d/attachment.py","path":"scripts/bm25.py","size":14268,"sha256":"bac0b3fe2475ad04e1b310fa4d1ae74853e2a4619871497dd9c545d608105a0b","contentType":"text/x-python; 
charset=utf-8"}],"bundle_sha256":"7d5e42753df8d359024d10622dce93b58164d008b61496bb3c6e5b979555ea7d","attachment_count":3,"text_attachments":3,"attachment_storage":"skillopedia-attachments-v1","binary_attachments":0,"excluded_attachments":[]},"cluster_size":1,"skill_md_path":"bm25/SKILL.md","import_metadata":{"date":"2026-06-05","author":"@skillopedia","version":"v1","category":"web-development","category_label":"Web"},"exact_dupes_collapsed_into_this":0},"version":"v1","category":"web-development","metadata":{"version":"0.1.2"},"import_tag":"clean-skills-v1","description":"Ranked content search over any text corpus using BM25 (via xhluca/bm25s). Corpus-agnostic: works on cloned repos, project knowledge stores, uploaded files/archives, and any local directory. Builds an in-memory index per process and keeps a session-local disk cache (bypass with --no-cache) so repeat invocations against the same corpus and filters load instead of rebuilding; nothing persists across sessions. Use when you need ranked multi-word content search beyond grep, or when picking the \"most relevant files for these terms\" across a corpus. Triggers on \"rank these documents\", \"search this corpus\", \"find content about X\", \"which files are most about Y\", or multi-word concept queries against a known body of text."}},"renderedAt":1782980289500}

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.