searching-codebases — Skillopedia

Searching Codebases Find code in any codebase by pattern or concept. One entry point, two search strategies, automatic routing. Prerequisites tree-sitting (for structural context expansion) installs automatically when the flag is used. Primary Command SOURCE is any of: - Local directory path - GitHub URL (downloads tarball automatically) - (uses ) - (uses ) - Path to a or archive Search Modes Regex mode (patterns, identifiers, literal text): Semantic mode (concepts, natural language): Auto-detection: short queries and code-like tokens → regex. Multi-word natural language → semantic. Override…

, content, flags=re.MULTILINE)\n\n for i in range(1, len(sections), 2):\n filename = sections[i].strip()\n body = sections[i + 1] if i + 1 \u003c len(sections) else \"\"\n\n if len(body.strip()) > 20:\n map_dir = str(Path(rel_path).parent)\n source_path = f\"{map_dir}/{filename}\" if map_dir != \".\" else filename\n chunks.append(Chunk(\n file=source_path, line=1, kind=\"map_entry\",\n name=filename, text=f\"{filename} {body}\",\n ))\n\n return chunks\n\n\ndef _extract_js_ts(filepath: str, rel_path: str) -> list[Chunk]:\n \"\"\"Extract functions, classes, and interfaces from JS/TS/TSX/JSX files.\n\n Covers: function declarations, arrow functions assigned to const/let/var,\n class declarations, interface/type declarations, exported members,\n and JSDoc comments preceding any of the above.\n \"\"\"\n try:\n with open(filepath, \"r\", errors=\"replace\") as f:\n content = f.read()\n except OSError:\n return []\n\n chunks = []\n\n # Module-level JSDoc or leading block comment\n mod_match = re.match(r'\\s*(/\\*\\*[\\s\\S]*?\\*/)', content)\n if mod_match:\n doc = mod_match.group(1).strip('/* \\n')\n if len(doc) > 20:\n chunks.append(Chunk(\n file=rel_path, line=1, kind=\"module_doc\",\n name=Path(rel_path).stem, text=doc,\n ))\n\n # --- Pattern 1: function/class/interface declarations ---\n # Captures: export? async? function name(...), class Name, interface Name, type Name\n decl_pattern = re.compile(\n r'(/\\*\\*[\\s\\S]*?\\*/\\s*)?' # optional JSDoc (group 1)\n r'^[ \\t]*(export\\s+(?:default\\s+)?)?' # optional export (group 2)\n r'(async\\s+)?' 
# optional async (group 3)\n r'(function\\*?|class|interface|type|enum)\\s+' # keyword (group 4)\n r'(\\w+)' # name (group 5)\n r'([^\\n]*)', # rest of line (group 6)\n re.MULTILINE\n )\n\n for match in decl_pattern.finditer(content):\n jsdoc = match.group(1) or \"\"\n jsdoc_clean = re.sub(r'[/*]', ' ', jsdoc).strip()\n keyword = match.group(4)\n name = match.group(5)\n rest = match.group(6).strip()\n line_num = content[:match.start()].count('\\n') + 1\n\n kind_map = {\n 'function': 'function', 'class': 'class',\n 'interface': 'class', 'type': 'class', 'enum': 'class',\n }\n # Strip trailing * from function*\n kind = kind_map.get(keyword.rstrip('*'), 'function')\n\n text_parts = [name, f\"{keyword} {name}{rest}\"]\n if jsdoc_clean:\n text_parts.append(jsdoc_clean)\n\n chunks.append(Chunk(\n file=rel_path, line=line_num, kind=kind,\n name=name, text=\" \".join(text_parts),\n ))\n\n # --- Pattern 2: arrow functions assigned to variables ---\n # const myFunc = (...) => { ... }\n # const myFunc: Type = (...) => ...\n arrow_pattern = re.compile(\n r'(/\\*\\*[\\s\\S]*?\\*/\\s*)?' # optional JSDoc\n r'^[ \\t]*(export\\s+(?:default\\s+)?)?' # optional export\n r'(const|let|var)\\s+' # binding keyword\n r'(\\w+)' # name (group 4)\n r'([^=]*?)\\s*=' # type annotation etc\n r'\\s*(?:async\\s+)?' 
# optional async\n r'\$[^)]*\$\\s*(?::\\s*[^=]+?)?\\s*=>', # arrow function signature\n re.MULTILINE\n )\n\n for match in arrow_pattern.finditer(content):\n jsdoc = match.group(1) or \"\"\n jsdoc_clean = re.sub(r'[/*]', ' ', jsdoc).strip()\n name = match.group(4)\n line_num = content[:match.start()].count('\\n') + 1\n\n # Skip if already captured by declaration pattern (unlikely but safe)\n text_parts = [name, match.group(0).split('=>')[0].strip()]\n if jsdoc_clean:\n text_parts.append(jsdoc_clean)\n\n chunks.append(Chunk(\n file=rel_path, line=line_num, kind=\"function\",\n name=name, text=\" \".join(text_parts),\n ))\n\n # --- Pattern 3: class methods (inside class bodies) ---\n method_pattern = re.compile(\n r'(/\\*\\*[\\s\\S]*?\\*/\\s*)?' # optional JSDoc\n r'^[ \\t]+((?:static|async|get|set|private|protected|public|readonly)\\s+)*'\n r'(\\w+)\\s*\\(', # method name + open paren\n re.MULTILINE\n )\n\n for match in method_pattern.finditer(content):\n jsdoc = match.group(1) or \"\"\n jsdoc_clean = re.sub(r'[/*]', ' ', jsdoc).strip()\n name = match.group(3)\n line_num = content[:match.start()].count('\\n') + 1\n\n # Skip common false positives\n if name in ('if', 'for', 'while', 'switch', 'catch', 'return',\n 'require', 'import', 'console', 'throw', 'new',\n 'typeof', 'instanceof', 'delete', 'void', 'yield',\n 'await', 'super', 'this'):\n continue\n\n text_parts = [name]\n if jsdoc_clean:\n text_parts.append(jsdoc_clean)\n\n chunks.append(Chunk(\n file=rel_path, line=line_num, kind=\"function\",\n name=name, text=\" \".join(text_parts),\n ))\n\n return chunks\n\n\ndef _extract_yaml(filepath: str, rel_path: str) -> list[Chunk]:\n \"\"\"Extract top-level keys and commented sections from YAML files.\n\n YAML files often contain infrastructure config, CI/CD pipelines,\n Kubernetes manifests, docker-compose services, etc. 
The top-level\n keys and their comments are the most searchable units.\n \"\"\"\n try:\n with open(filepath, \"r\", errors=\"replace\") as f:\n lines = f.readlines()\n except OSError:\n return []\n\n # Skip very large YAML files (likely generated, e.g. lockfiles)\n if len(lines) > 2000:\n return []\n\n chunks = []\n current_key = None\n current_lines = []\n current_start = 1\n comment_buffer = []\n\n for i, line in enumerate(lines, 1):\n stripped = line.strip()\n\n # Accumulate comment lines preceding a key\n if stripped.startswith('#'):\n comment_buffer.append(stripped.lstrip('#').strip())\n continue\n\n # Top-level key: no leading whitespace, ends with ':'\n top_key_match = re.match(r'^([a-zA-Z_][\\w.-]*)\\s*:', line)\n if top_key_match:\n # Flush previous section\n if current_key and current_lines:\n text = \" \".join(current_lines)\n if len(text) > 15:\n chunks.append(Chunk(\n file=rel_path, line=current_start, kind=\"section\",\n name=current_key, text=text,\n ))\n\n current_key = top_key_match.group(1)\n current_lines = [current_key]\n current_start = i\n\n # Include preceding comments as searchable text\n if comment_buffer:\n current_lines.extend(comment_buffer)\n comment_buffer = []\n\n elif current_key and stripped:\n # Nested content: include values and inline comments\n # Strip YAML syntax noise but keep identifiers and values\n clean = re.sub(r'^\\s*-\\s*', '', stripped)\n clean = clean.split('#')[0].strip() # strip inline comments into separate add\n inline_comment = stripped.split('#')[1].strip() if '#' in stripped else \"\"\n if clean:\n current_lines.append(clean)\n if inline_comment:\n current_lines.append(inline_comment)\n else:\n comment_buffer = []\n\n # Flush last section\n if current_key and current_lines:\n text = \" \".join(current_lines)\n if len(text) > 15:\n chunks.append(Chunk(\n file=rel_path, line=current_start, kind=\"section\",\n name=current_key, text=text,\n ))\n\n # Also create a file-level chunk with the whole file as context\n # 
(useful for small config files like docker-compose.yml)\n if len(lines) \u003c 100:\n all_text = \" \".join(\n l.strip() for l in lines\n if l.strip() and not l.strip().startswith('---')\n )\n if len(all_text) > 30:\n chunks.append(Chunk(\n file=rel_path, line=1, kind=\"module_doc\",\n name=Path(rel_path).stem, text=all_text,\n ))\n\n return chunks\n\n\n# ── Index ────────────────────────────────────────────────────────\n\nSKIP_DIRS = {\n '.git', 'node_modules', '__pycache__', '.venv', 'venv', 'dist',\n 'build', '.next', '.mypy_cache', '.pytest_cache', '.tox', '.eggs',\n '.ruff_cache', 'target', 'coverage', '.coverage',\n}\n\nEXTRACTORS = {\n '.py': _extract_python,\n '.md': _extract_markdown,\n '.js': _extract_js_ts,\n '.jsx': _extract_js_ts,\n '.ts': _extract_js_ts,\n '.tsx': _extract_js_ts,\n '.mjs': _extract_js_ts,\n '.mts': _extract_js_ts,\n '.yaml': _extract_yaml,\n '.yml': _extract_yaml,\n}\n\n\n@dataclass\nclass Index:\n \"\"\"TF-IDF index over code chunks.\"\"\"\n chunks: list[Chunk] = field(default_factory=list)\n vectorizer: Optional[TfidfVectorizer] = None\n matrix: Optional[object] = None\n build_time_ms: float = 0\n repo_path: str = \"\"\n\n def build(self, repo_path: str, skip_dirs: set[str] = None):\n \"\"\"Walk repo, extract chunks, build TF-IDF matrix.\"\"\"\n t0 = time.monotonic()\n self.repo_path = str(repo_path)\n skip = skip_dirs or SKIP_DIRS\n self.chunks = []\n\n for root, dirs, files in os.walk(repo_path):\n dirs[:] = [d for d in dirs if d not in skip and not d.startswith('.')]\n\n for fname in files:\n fpath = os.path.join(root, fname)\n rel = os.path.relpath(fpath, repo_path)\n ext = Path(fname).suffix.lower()\n\n if fname == \"_MAP.md\":\n self.chunks.extend(_extract_map_entries(fpath, rel))\n elif ext in EXTRACTORS:\n self.chunks.extend(EXTRACTORS[ext](fpath, rel))\n\n if not self.chunks:\n print(\"WARNING: No chunks extracted\", file=sys.stderr)\n return self\n\n self.vectorizer = TfidfVectorizer(\n ngram_range=(1, 2),\n 
sublinear_tf=True,\n max_df=0.80,\n min_df=2,\n stop_words=\"english\",\n max_features=50000,\n token_pattern=r'(?u)\\b[a-zA-Z_]\\w{1,}\\b',\n )\n self.matrix = self.vectorizer.fit_transform([c.text for c in self.chunks])\n self.build_time_ms = (time.monotonic() - t0) * 1000\n return self\n\n def search(self, query: str, top_k: int = 5, min_score: float = 0.01\n ) -> list[tuple[Chunk, float]]:\n \"\"\"Semantic search: rank chunks by cosine similarity to query.\"\"\"\n if self.vectorizer is None or self.matrix is None:\n return []\n\n q_vec = self.vectorizer.transform([query])\n scores = cosine_similarity(q_vec, self.matrix).flatten()\n indices = scores.argsort()[::-1]\n\n results = []\n for idx in indices:\n if scores[idx] \u003c min_score:\n break\n results.append((self.chunks[idx], float(scores[idx])))\n if len(results) >= top_k:\n break\n return results\n\n def search_grouped(self, query: str, top_k: int = 10, min_score: float = 0.01\n ) -> dict[str, list[tuple[Chunk, float]]]:\n \"\"\"Search and group by file — useful for feeding targets to grep.\"\"\"\n results = self.search(query, top_k=top_k * 3, min_score=min_score)\n grouped = {}\n for chunk, score in results:\n grouped.setdefault(chunk.file, []).append((chunk, score))\n sorted_files = sorted(\n grouped.items(), key=lambda x: x[1][0][1], reverse=True\n )\n return dict(sorted_files[:top_k])\n\n def stats(self) -> dict:\n if not self.chunks:\n return {\"chunks\": 0}\n kinds = {}\n for c in self.chunks:\n kinds[c.kind] = kinds.get(c.kind, 0) + 1\n return {\n \"chunks\": len(self.chunks),\n \"files\": len(set(c.file for c in self.chunks)),\n \"vocabulary\": len(self.vectorizer.get_feature_names_out()) if self.vectorizer else 0,\n \"build_ms\": round(self.build_time_ms),\n \"kinds\": kinds,\n }\n\n\n# ── Output formatting ────────────────────────────────────────────\n\ndef _sanitize(text: str) -> str:\n \"\"\"Strip HTML entities and normalize whitespace.\"\"\"\n import html\n return 
html.unescape(text).replace('\\n', ' ').strip()\n\n\ndef _format_results(results: list[tuple[Chunk, float]]) -> str:\n lines = []\n for chunk, score in results:\n preview = _sanitize(chunk.text[:140])\n if len(chunk.text) > 140:\n preview += \"...\"\n lines.append(f\" {score:.3f} [{chunk.kind}] {chunk.name} {chunk.loc}\")\n lines.append(f\" {preview}\")\n lines.append(\"\")\n return \"\\n\".join(lines)\n\n\ndef _format_grouped(grouped: dict[str, list[tuple[Chunk, float]]]) -> str:\n lines = []\n for filepath, hits in grouped.items():\n lines.append(f\"\\n {filepath}\")\n for chunk, score in hits:\n name = _sanitize(chunk.name)\n lines.append(\n f\" {score:.3f} {chunk.kind:12s} {name} :{chunk.line}\"\n )\n return \"\\n\".join(lines)\n\n\ndef _format_for_grep(grouped: dict[str, list[tuple[Chunk, float]]],\n repo_path: str) -> str:\n lines = [\"# Files ranked by relevance (use with grep -n -A25):\"]\n for filepath, hits in grouped.items():\n best_score = hits[0][1]\n names = [_sanitize(h[0].name) for h in hits[:3]]\n lines.append(f\"# score={best_score:.3f} matches: {', '.join(names)}\")\n lines.append(os.path.join(repo_path, filepath))\n return \"\\n\".join(lines)\n\n\n# ── CLI ──────────────────────────────────────────────────────────\n\ndef main():\n if len(sys.argv) \u003c 3:\n print(__doc__)\n sys.exit(1)\n\n cmd = sys.argv[1]\n repo_path = sys.argv[2]\n\n if cmd == \"index\":\n idx = Index()\n idx.build(repo_path)\n print(json.dumps(idx.stats(), indent=2))\n\n elif cmd == \"search\":\n if len(sys.argv) \u003c 4:\n print(\"Usage: code_rag.py search /path/to/repo \\\"query\\\" [options]\",\n file=sys.stderr)\n sys.exit(1)\n\n query = sys.argv[3]\n top_k = 5\n grouped = \"--grouped\" in sys.argv\n rg = \"--rg\" in sys.argv\n\n for i, arg in enumerate(sys.argv):\n if arg == \"--top\" and i + 1 \u003c len(sys.argv):\n top_k = int(sys.argv[i + 1])\n\n idx = Index()\n idx.build(repo_path)\n stats = idx.stats()\n print(f\"Indexed {stats['chunks']} chunks from 
{stats['files']} files \"\n f\"({stats['vocabulary']} features, {stats['build_ms']}ms)\",\n file=sys.stderr)\n\n if grouped or rg:\n results = idx.search_grouped(query, top_k=top_k)\n if rg:\n print(_format_for_grep(results, repo_path))\n else:\n print(_format_grouped(results))\n else:\n results = idx.search(query, top_k=top_k)\n if results:\n print(_format_results(results))\n else:\n print(\" No results above threshold.\", file=sys.stderr)\n\n else:\n print(f\"Unknown command: {cmd}\", file=sys.stderr)\n sys.exit(1)\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":22048,"content_sha256":"46d721c89eb960be71feb6fac3970e58dd9d12a266b72dfbdc9ac1c0975a2e10"},{"filename":"scripts/context.py","content":"\"\"\"\nExpand search match lines into full structural context (functions/classes).\n\nUses tree-sitting's AST cache for symbol boundaries. Scans the repo once\non first expand call (~700ms), then all expansions are sub-millisecond.\nFalls back to a fixed-size context window if tree-sitting is unavailable.\n\"\"\"\n\nimport os\nimport sys\nfrom dataclasses import dataclass\nfrom typing import List, Optional\n\n\n@dataclass\nclass CodeContext:\n \"\"\"A structural code unit containing a match.\"\"\"\n file_path: str\n start_line: int\n end_line: int\n match_line: int\n node_type: str # \"function\", \"class\", \"method\"\n name: str\n source: str\n language: Optional[str] = None\n signature: Optional[str] = None\n\n\n# Lazily initialized tree-sitting cache\n_cache = None\n_cache_root = None\n\n\ndef _ensure_cache(search_root: str):\n \"\"\"Scan the repo with tree-sitting on first call. 
No-op on subsequent calls.\"\"\"\n global _cache, _cache_root\n if _cache is not None and _cache_root == search_root:\n return _cache\n\n try:\n ts_scripts = \"/mnt/skills/user/tree-sitting/scripts\"\n if ts_scripts not in sys.path:\n sys.path.insert(0, ts_scripts)\n from engine import CodeCache\n\n _cache = CodeCache()\n _cache.scan(search_root)\n _cache_root = search_root\n return _cache\n except Exception as e:\n print(f\"tree-sitting unavailable ({e}), using window fallback\",\n file=sys.stderr)\n return None\n\n\ndef expand_match(file_path: str, line_number: int, search_root: str,\n signatures_only: bool = True) -> Optional[CodeContext]:\n \"\"\"\n Expand a match at file:line into its containing function/class.\n\n Uses tree-sitting AST data for structural boundaries. Falls back to\n a context window around the match if tree-sitting is unavailable.\n\n Args:\n file_path: Absolute path to matched file\n line_number: 1-indexed line number of the match\n search_root: Root directory of the codebase\n signatures_only: Return only signature, not full body\n \"\"\"\n cache = _ensure_cache(search_root)\n if cache is not None:\n return _expand_from_ast(file_path, line_number, search_root,\n cache, signatures_only)\n return _expand_window(file_path, line_number)\n\n\ndef _expand_from_ast(file_path: str, line_number: int, search_root: str,\n cache, signatures_only: bool) -> Optional[CodeContext]:\n \"\"\"Expand using tree-sitting's parsed AST symbols.\"\"\"\n relpath = os.path.relpath(file_path, search_root)\n\n # Get symbols for this file from the cache\n entry = cache.files.get(relpath)\n if not entry or not entry.symbols:\n return _expand_window(file_path, line_number)\n\n # Find the innermost symbol containing this line\n containing = None\n for sym in entry.symbols:\n if sym.line \u003c= line_number \u003c= sym.end_line:\n # Prefer the most specific (innermost) match\n if containing is None:\n containing = sym\n else:\n if sym.line >= containing.line:\n containing = 
sym\n # Also check children (methods within classes)\n for child in getattr(sym, 'children', []):\n if child.line \u003c= line_number \u003c= child.end_line:\n if containing is None or child.line >= containing.line:\n containing = child\n\n if not containing:\n # No containing symbol — try nearest preceding symbol\n best = None\n for sym in entry.symbols:\n if sym.line \u003c= line_number:\n if best is None or sym.line > best.line:\n best = sym\n containing = best\n\n if not containing:\n return _expand_window(file_path, line_number)\n\n start_line = containing.line\n end_line = containing.end_line or start_line\n\n try:\n with open(file_path, \"r\") as f:\n lines = f.readlines()\n except (FileNotFoundError, PermissionError):\n return None\n\n # Ensure end_line doesn't exceed file\n end_line = min(end_line, len(lines))\n\n # Trim trailing blanks\n while end_line > start_line and not lines[end_line - 1].strip():\n end_line -= 1\n\n source = \"\".join(lines[start_line - 1:end_line])\n\n name = containing.name or \"\"\n kind = containing.kind or \"function\"\n # Normalize kind to node_type\n kind_map = {\n \"class\": \"class\", \"struct\": \"class\", \"interface\": \"class\",\n \"enum\": \"class\", \"trait\": \"class\",\n \"method\": \"method\", \"impl_method\": \"method\",\n \"function\": \"function\", \"func\": \"function\",\n }\n node_type = kind_map.get(kind, kind)\n\n signature = None\n if signatures_only:\n sig = containing.signature or \"\"\n if sig:\n signature = sig\n else:\n # Use first line as signature fallback\n first_line = lines[start_line - 1].rstrip() if start_line \u003c= len(lines) else name\n signature = first_line\n\n return CodeContext(\n file_path=file_path, start_line=start_line, end_line=end_line,\n match_line=line_number, node_type=node_type, name=name,\n source=source, language=entry.lang, signature=signature,\n )\n\n\ndef _expand_window(file_path: str, line_number: int,\n context: int = 10) -> Optional[CodeContext]:\n \"\"\"Fallback: return 
a fixed window around the match.\"\"\"\n try:\n with open(file_path, \"r\") as f:\n lines = f.readlines()\n except (FileNotFoundError, PermissionError):\n return None\n\n start = max(1, line_number - context)\n end = min(len(lines), line_number + context)\n source = \"\".join(lines[start - 1:end])\n\n ext = os.path.splitext(file_path)[1].lower()\n ext_to_lang = {\n \".py\": \"python\", \".js\": \"javascript\", \".ts\": \"typescript\",\n \".go\": \"go\", \".rs\": \"rust\", \".rb\": \"ruby\", \".java\": \"java\",\n \".c\": \"c\", \".cpp\": \"cpp\", \".cs\": \"csharp\",\n }\n\n return CodeContext(\n file_path=file_path, start_line=start, end_line=end,\n match_line=line_number, node_type=\"context\", name=\"\",\n source=source, language=ext_to_lang.get(ext),\n )\n\n\ndef deduplicate_contexts(contexts: List[CodeContext]) -> List[CodeContext]:\n \"\"\"Remove duplicate expansions (same function from multiple match lines).\"\"\"\n seen = set()\n unique = []\n for ctx in contexts:\n key = (ctx.file_path, ctx.start_line, ctx.end_line)\n if key not in seen:\n seen.add(key)\n unique.append(ctx)\n return unique\n","content_type":"text/x-python; charset=utf-8","language":"python","size":6550,"content_sha256":"1d8a414ed2cb3debc1a5c104cd655287c942051f5263193b48474684cda10d0c"},{"filename":"scripts/ngram_index.py","content":"\"\"\"\nInverted index for fast regex search using sparse n-grams.\n\nArchitecture:\n- Index maps n-gram hashes → sets of file IDs\n- File IDs map back to file paths\n- Query decomposes regex into literals → covering n-grams → posting list intersection\n- Candidates verified by actual regex match (ripgrep or Python re)\n\"\"\"\n\nimport os\nimport re\nimport sre_parse\nimport struct\nimport subprocess\nimport time\nimport zlib\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set, Tuple\n\nfrom sparse_ngrams import (\n FrequencyWeights,\n build_all,\n build_covering,\n compute_weights,\n ngram_hash,\n ngram_text,\n weight_crc32,\n)\n\n# 
Default extensions to index (source code)\nDEFAULT_EXTENSIONS = {\n \".py\", \".js\", \".jsx\", \".ts\", \".tsx\", \".mjs\", \".mts\",\n \".go\", \".rs\", \".rb\", \".java\", \".c\", \".h\", \".cpp\", \".hpp\", \".cc\",\n \".cs\", \".php\", \".swift\", \".kt\", \".scala\", \".lua\", \".zig\",\n \".sh\", \".bash\", \".zsh\", \".fish\",\n \".html\", \".css\", \".scss\", \".less\",\n \".json\", \".yaml\", \".yml\", \".toml\", \".xml\", \".md\", \".txt\", \".rst\",\n \".sql\", \".graphql\", \".proto\",\n \".dockerfile\", \".env\", \".ini\", \".cfg\", \".conf\",\n \".r\", \".R\", \".jl\", \".ex\", \".exs\", \".erl\", \".hrl\",\n \".vim\", \".el\", \".clj\", \".cljs\", \".ml\", \".mli\", \".hs\",\n}\n\n# Default directories to skip\nDEFAULT_SKIP_DIRS = {\n \".git\", \"node_modules\", \"__pycache__\", \".venv\", \"venv\",\n \"dist\", \"build\", \".next\", \".cache\", \"target\", \"vendor\",\n \".tox\", \".mypy_cache\", \".pytest_cache\", \"coverage\",\n \".idea\", \".vscode\", \".eclipse\",\n}\n\n# Max file size to index (skip giant generated files)\nMAX_FILE_SIZE = 1_000_000 # 1MB\n\n\n# @lat: [[code-intelligence#N-gram Indexing]]\nclass NgramIndex:\n \"\"\"\n Sparse n-gram inverted index for a directory of source files.\n\n Build once, query many times. 
Candidate files from index lookup\n are verified with actual regex matching for correctness.\n \"\"\"\n\n def __init__(self, weight_fn=None):\n # n-gram hash → set of file IDs\n self.postings: Dict[int, Set[int]] = {}\n # file ID → file path\n self.files: Dict[int, str] = {}\n # file path → file ID (reverse lookup)\n self._path_to_id: Dict[str, int] = {}\n # next file ID\n self._next_id: int = 0\n # weight function\n self._weight_fn = weight_fn or weight_crc32\n # frequency weights (trained from corpus)\n self._freq_weights: Optional[FrequencyWeights] = None\n # stats\n self.stats = {\n \"files_indexed\": 0,\n \"files_skipped\": 0,\n \"total_ngrams\": 0,\n \"unique_ngrams\": 0,\n \"index_time_ms\": 0,\n \"total_bytes\": 0,\n }\n\n def _assign_id(self, path: str) -> int:\n \"\"\"Assign a numeric ID to a file path.\"\"\"\n if path in self._path_to_id:\n return self._path_to_id[path]\n fid = self._next_id\n self._next_id += 1\n self.files[fid] = path\n self._path_to_id[path] = fid\n return fid\n\n def _should_index(self, path: str, skip_dirs: Set[str]) -> bool:\n \"\"\"Check if a file should be indexed.\"\"\"\n p = Path(path)\n\n # Check directory exclusions\n for part in p.parts:\n if part in skip_dirs:\n return False\n\n # Check extension\n suffix = p.suffix.lower()\n if suffix and suffix not in DEFAULT_EXTENSIONS:\n # Also accept extensionless files (Makefile, Dockerfile, etc.)\n if suffix:\n return False\n\n # Check if it's a Makefile/Dockerfile/etc (no extension)\n name = p.name.lower()\n known_names = {\n \"makefile\", \"dockerfile\", \"vagrantfile\", \"gemfile\",\n \"rakefile\", \"procfile\", \"brewfile\", \"justfile\",\n }\n if not suffix and name not in known_names:\n return False\n\n return True\n\n def build(\n self,\n root: str,\n skip_dirs: Optional[Set[str]] = None,\n use_frequency_weights: bool = True,\n verbose: bool = False,\n ):\n \"\"\"\n Build the index from all source files under root.\n\n If use_frequency_weights is True, makes two passes:\n 1. 
Train frequency table on all file contents\n 2. Index using frequency-based weights (rare pairs = high weight)\n \"\"\"\n skip = skip_dirs or DEFAULT_SKIP_DIRS\n t_start = time.monotonic()\n\n # Collect files\n file_paths = []\n for dirpath, dirnames, filenames in os.walk(root):\n # Prune skip dirs\n dirnames[:] = [d for d in dirnames if d not in skip]\n for fname in filenames:\n fpath = os.path.join(dirpath, fname)\n if self._should_index(fpath, skip):\n try:\n size = os.path.getsize(fpath)\n if size \u003c= MAX_FILE_SIZE and size > 0:\n file_paths.append(fpath)\n else:\n self.stats[\"files_skipped\"] += 1\n except OSError:\n self.stats[\"files_skipped\"] += 1\n\n if verbose:\n print(f\"Found {len(file_paths)} files to index\")\n\n # Pass 1: train frequency weights (optional)\n if use_frequency_weights:\n if verbose:\n print(\"Pass 1: Training frequency weights...\")\n fw = FrequencyWeights()\n for fpath in file_paths:\n try:\n with open(fpath, \"rb\") as f:\n data = f.read()\n fw.train(data)\n self.stats[\"total_bytes\"] += len(data)\n except (OSError, UnicodeDecodeError):\n pass\n fw.freeze()\n self._freq_weights = fw\n self._weight_fn = fw.weight\n if verbose:\n print(f\" Trained on {self.stats['total_bytes']:,} bytes\")\n else:\n # Single pass, count bytes as we go\n pass\n\n # Pass 2 (or only pass): build index\n if verbose:\n print(\"Building sparse n-gram index...\")\n total_ngrams = 0\n for i, fpath in enumerate(file_paths):\n try:\n with open(fpath, \"rb\") as f:\n data = f.read()\n\n if not use_frequency_weights:\n self.stats[\"total_bytes\"] += len(data)\n\n fid = self._assign_id(fpath)\n weights = compute_weights(data, self._weight_fn)\n ngrams = build_all(weights)\n\n for start, end in ngrams:\n h = ngram_hash(data, start, end)\n if h not in self.postings:\n self.postings[h] = set()\n self.postings[h].add(fid)\n total_ngrams += 1\n\n self.stats[\"files_indexed\"] += 1\n\n if verbose and (i + 1) % 500 == 0:\n print(f\" Indexed {i + 1}/{len(file_paths)} 
files...\")\n\n except (OSError, UnicodeDecodeError):\n self.stats[\"files_skipped\"] += 1\n\n elapsed_ms = (time.monotonic() - t_start) * 1000\n self.stats[\"total_ngrams\"] = total_ngrams\n self.stats[\"unique_ngrams\"] = len(self.postings)\n self.stats[\"index_time_ms\"] = round(elapsed_ms, 1)\n\n if verbose:\n print(f\"Index built: {self.stats['files_indexed']} files, \"\n f\"{self.stats['unique_ngrams']:,} unique n-grams, \"\n f\"{elapsed_ms:.0f}ms\")\n\n def _query_literal(self, literal: bytes) -> Optional[Set[int]]:\n \"\"\"\n Query the index with a literal byte string.\n Returns set of candidate file IDs, or None if no n-grams extracted.\n \"\"\"\n if len(literal) \u003c 3:\n # Too short for meaningful n-gram lookup\n return None\n\n weights = compute_weights(literal, self._weight_fn)\n covering = build_covering(weights)\n\n if not covering:\n return None\n\n # Intersect posting lists for all covering n-grams\n candidates = None\n for start, end in covering:\n h = ngram_hash(literal, start, end)\n posting = self.postings.get(h, set())\n if candidates is None:\n candidates = set(posting)\n else:\n candidates &= posting\n # Early termination: empty intersection\n if candidates is not None and not candidates:\n return set()\n\n return candidates\n\n def _eval_plan(self, plan: \"QueryPlan\") -> Optional[Set[int]]:\n \"\"\"\n Evaluate a query plan tree against the index.\n\n AND → intersect posting lists\n OR → union posting lists\n LITERAL → look up covering n-grams\n \"\"\"\n if plan.op == \"literal\":\n return self._query_literal(plan.literal)\n elif plan.op == \"and\":\n result = None\n for child in plan.children:\n child_ids = self._eval_plan(child)\n if child_ids is not None:\n if result is None:\n result = set(child_ids)\n else:\n result &= child_ids\n if not result:\n return set()\n return result\n elif plan.op == \"or\":\n result = set()\n for child in plan.children:\n child_ids = self._eval_plan(child)\n if child_ids is None:\n # This branch can't be 
indexed — any file could match\n # So the whole OR is unbounded\n return None\n result |= child_ids\n return result\n return None\n\n def search(\n self,\n pattern: str,\n root: str,\n max_results: int = 100,\n verbose: bool = False,\n ) -> List[dict]:\n \"\"\"\n Search for a regex pattern. Returns list of matches.\n\n Each match: {\"file\": path, \"line\": line_number, \"text\": line_text}\n\n Pipeline:\n 1. Parse regex into query plan tree (preserving AND/OR structure)\n 2. Evaluate plan against index (intersect/union posting lists)\n 3. Verify candidates with ripgrep (or Python re fallback)\n \"\"\"\n start = time.monotonic()\n\n # Build query plan from regex\n plan = extract_query_plan(pattern)\n\n if verbose:\n print(f\"Pattern: {pattern}\")\n print(f\"Query plan: {plan}\")\n\n # Evaluate plan against index\n candidate_ids: Optional[Set[int]] = None\n if plan is not None:\n candidate_ids = self._eval_plan(plan)\n\n if candidate_ids is None:\n # No usable plan — fall back to scanning all files\n candidate_files = list(self.files.values())\n if verbose:\n print(f\"No usable literals — scanning all {len(candidate_files)} files\")\n else:\n candidate_files = [self.files[fid] for fid in candidate_ids if fid in self.files]\n if verbose:\n reduction = (1 - len(candidate_files) / max(len(self.files), 1)) * 100\n print(f\"Index narrowed to {len(candidate_files)}/{len(self.files)} \"\n f\"files ({reduction:.0f}% reduction)\")\n\n # Verify with actual regex matching\n matches = verify_candidates(pattern, candidate_files, root, max_results, verbose)\n\n elapsed_ms = (time.monotonic() - start) * 1000\n if verbose:\n print(f\"Search completed: {len(matches)} matches in {elapsed_ms:.0f}ms\")\n\n return matches\n\n\nclass QueryPlan:\n \"\"\"\n Tree structure representing how to combine index lookups.\n \n AND nodes: all children must match (intersect posting lists)\n OR nodes: any child can match (union posting lists)\n LITERAL nodes: leaf — look up in index\n \"\"\"\n 
__slots__ = (\"op\", \"children\", \"literal\")\n\n def __init__(self, op: str, children=None, literal: bytes = None):\n self.op = op # \"and\", \"or\", \"literal\"\n self.children = children or []\n self.literal = literal\n\n def __repr__(self):\n if self.op == \"literal\":\n return f\"LIT({self.literal!r})\"\n kids = \", \".join(repr(c) for c in self.children)\n return f\"{self.op.upper()}({kids})\"\n\n\ndef extract_query_plan(pattern: str) -> Optional[QueryPlan]:\n \"\"\"\n Parse a regex and produce a QueryPlan tree preserving AND/OR structure.\n\n Sequential literals → AND (intersect)\n Alternations (a|b) → OR (union)\n \"\"\"\n try:\n parsed = sre_parse.parse(pattern)\n except Exception:\n encoded = pattern.encode(\"utf-8\")\n if len(encoded) >= 3:\n return QueryPlan(\"literal\", literal=encoded)\n return None\n\n plan = _build_plan(parsed)\n return _simplify(plan)\n\n\ndef _build_plan(parsed) -> Optional[QueryPlan]:\n \"\"\"Recursively build a query plan from parsed regex.\"\"\"\n parts = [] # AND children from sequential processing\n current = bytearray()\n\n def flush():\n nonlocal current\n if current:\n parts.append(QueryPlan(\"literal\", literal=bytes(current)))\n current = bytearray()\n\n for op, av in parsed:\n if op == sre_parse.LITERAL:\n current.append(av)\n elif op == sre_parse.AT:\n pass # anchors\n elif op == sre_parse.SUBPATTERN:\n if av[3] is not None:\n sub = _build_plan(av[3])\n if sub:\n flush()\n parts.append(sub)\n elif op == sre_parse.BRANCH:\n flush()\n branches = []\n for branch in av[1]:\n bp = _build_plan(branch)\n if bp:\n branches.append(bp)\n if branches:\n if len(branches) == 1:\n parts.append(branches[0])\n else:\n parts.append(QueryPlan(\"or\", children=branches))\n else:\n flush()\n\n flush()\n\n if not parts:\n return None\n if len(parts) == 1:\n return parts[0]\n return QueryPlan(\"and\", children=parts)\n\n\ndef _simplify(plan: Optional[QueryPlan]) -> Optional[QueryPlan]:\n \"\"\"Flatten nested AND(AND(...)) and OR(OR(...)) 
nodes.\"\"\"\n if plan is None:\n return None\n if plan.op == \"literal\":\n return plan\n\n new_children = []\n for child in plan.children:\n s = _simplify(child)\n if s is None:\n continue\n # Flatten: AND(AND(a,b), c) → AND(a,b,c)\n if s.op == plan.op:\n new_children.extend(s.children)\n else:\n new_children.append(s)\n\n if not new_children:\n return None\n if len(new_children) == 1:\n return new_children[0]\n return QueryPlan(plan.op, children=new_children)\n\n\ndef extract_literals(pattern: str) -> List[bytes]:\n \"\"\"\n Legacy interface: extract flat list of AND-literals from a pattern.\n For simple patterns without alternation — all literals must be present.\n \"\"\"\n plan = extract_query_plan(pattern)\n if plan is None:\n return []\n lits = []\n _collect_and_literals(plan, lits)\n return lits\n\n\ndef _collect_and_literals(plan: QueryPlan, out: list):\n \"\"\"Collect literals from AND nodes only (conservative: no OR).\"\"\"\n if plan.op == \"literal\":\n out.append(plan.literal)\n elif plan.op == \"and\":\n for child in plan.children:\n _collect_and_literals(child, out)\n # OR nodes are skipped — can't AND their children\n\n\ndef verify_candidates(\n pattern: str,\n candidate_files: List[str],\n root: str,\n max_results: int = 100,\n verbose: bool = False,\n) -> List[dict]:\n \"\"\"\n Verify candidate files by actually running the regex.\n Uses ripgrep if available, falls back to Python re.\n \"\"\"\n if not candidate_files:\n return []\n\n # Try ripgrep first\n rg = _find_ripgrep()\n if rg:\n return _verify_ripgrep(rg, pattern, candidate_files, max_results, verbose)\n else:\n return _verify_python(pattern, candidate_files, max_results, verbose)\n\n\ndef _find_ripgrep() -> Optional[str]:\n \"\"\"Find ripgrep binary.\"\"\"\n for name in [\"rg\", \"ripgrep\"]:\n try:\n result = subprocess.run(\n [\"which\", name], capture_output=True, text=True\n )\n if result.returncode == 0:\n return result.stdout.strip()\n except FileNotFoundError:\n pass\n return 
None\n\n\ndef _verify_ripgrep(\n rg_path: str,\n pattern: str,\n files: List[str],\n max_results: int,\n verbose: bool,\n) -> List[dict]:\n \"\"\"Verify candidates using ripgrep for speed.\"\"\"\n matches = []\n\n # ripgrep can accept file list via stdin with --files-from\n # But simpler: pass files as arguments (may hit ARG_MAX for huge lists)\n # For large lists, batch them\n BATCH_SIZE = 500\n for i in range(0, len(files), BATCH_SIZE):\n batch = files[i : i + BATCH_SIZE]\n cmd = [\n rg_path,\n \"--no-heading\",\n \"--with-filename\",\n \"--line-number\",\n \"--color=never\",\n \"--max-count=50\", # limit per file\n \"-e\", pattern,\n ] + batch\n\n try:\n result = subprocess.run(\n cmd, capture_output=True, text=True, timeout=30\n )\n for line in result.stdout.splitlines():\n # Format: file:line:text — but file paths could contain colons\n # Use --line-number to guarantee line field is numeric\n parts = line.split(\":\", 2)\n if len(parts) >= 3 and parts[1].isdigit():\n matches.append({\n \"file\": parts[0],\n \"line\": int(parts[1]),\n \"text\": parts[2],\n })\n if len(matches) >= max_results:\n return matches\n except (subprocess.TimeoutExpired, subprocess.SubprocessError) as e:\n if verbose:\n print(f\"ripgrep error: {e}\")\n\n return matches\n\n\ndef _verify_python(\n pattern: str,\n files: List[str],\n max_results: int,\n verbose: bool,\n) -> List[dict]:\n \"\"\"Verify candidates using Python re (fallback).\"\"\"\n matches = []\n try:\n compiled = re.compile(pattern)\n except re.error as e:\n if verbose:\n print(f\"Invalid regex: {e}\")\n return []\n\n for fpath in files:\n try:\n with open(fpath, \"r\", errors=\"replace\") as f:\n for line_num, line in enumerate(f, 1):\n if compiled.search(line):\n matches.append({\n \"file\": fpath,\n \"line\": line_num,\n \"text\": line.rstrip(),\n })\n if len(matches) >= max_results:\n return matches\n except OSError:\n pass\n\n return matches\n\n\ndef _brute_force_search(\n pattern: str,\n root: str,\n skip_dirs: 
Set[str],\n max_results: int = 100,\n) -> List[dict]:\n \"\"\"\n Brute-force search (no index) for benchmarking comparison.\n Walks all files and matches with ripgrep or Python re.\n \"\"\"\n rg = _find_ripgrep()\n if rg:\n cmd = [\n rg,\n \"--no-heading\",\n \"--line-number\",\n \"--color=never\",\n ]\n for d in skip_dirs:\n cmd.extend([\"--glob\", f\"!{d}\"])\n cmd.extend([\"-e\", pattern, root])\n\n try:\n result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)\n matches = []\n for line in result.stdout.splitlines():\n parts = line.split(\":\", 2)\n if len(parts) >= 3 and parts[1].isdigit():\n matches.append({\n \"file\": parts[0],\n \"line\": int(parts[1]),\n \"text\": parts[2],\n })\n if len(matches) >= max_results:\n break\n return matches\n except (subprocess.TimeoutExpired, subprocess.SubprocessError):\n pass\n\n # Fallback: Python re over all files\n try:\n compiled = re.compile(pattern)\n except re.error:\n return []\n\n matches = []\n for dirpath, dirnames, filenames in os.walk(root):\n dirnames[:] = [d for d in dirnames if d not in skip_dirs]\n for fname in filenames:\n fpath = os.path.join(dirpath, fname)\n try:\n size = os.path.getsize(fpath)\n if size > MAX_FILE_SIZE or size == 0:\n continue\n with open(fpath, \"r\", errors=\"replace\") as f:\n for line_num, line in enumerate(f, 1):\n if compiled.search(line):\n matches.append({\n \"file\": fpath,\n \"line\": line_num,\n \"text\": line.rstrip(),\n })\n if len(matches) >= max_results:\n return matches\n except (OSError, UnicodeDecodeError):\n pass\n return matches\n","content_type":"text/x-python; charset=utf-8","language":"python","size":21371,"content_sha256":"64ec4d3188d84fcd05e0ea826275f519f2ed0e47a562c252404c88824ae2c71e"},{"filename":"scripts/resolve.py","content":"\"\"\"\nResolve code sources to a local directory path.\n\nHandles: GitHub URLs, local directories, uploaded archives,\nuploaded files, project knowledge files.\n\"\"\"\n\nimport os\nimport shutil\nimport 
subprocess\nimport tarfile\nimport tempfile\nimport urllib.request\nimport zipfile\nfrom pathlib import Path\nfrom typing import Optional\n\n\nWORK_DIR = \"/home/claude/code-search-workspace\"\n\n\ndef resolve(source: str, branch: str = \"main\") -> str:\n \"\"\"\n Resolve a source to a local directory path.\n\n Args:\n source: GitHub URL, local path, or \"uploads\" / \"project\"\n branch: Git branch for GitHub URLs\n\n Returns:\n Absolute path to a directory containing the code\n \"\"\"\n # GitHub URL\n if source.startswith((\"http://\", \"https://\")):\n return _resolve_github(source, branch)\n\n # Explicit \"uploads\" keyword\n if source.lower() in (\"uploads\", \"uploaded\"):\n return _resolve_uploads()\n\n # Explicit \"project\" keyword\n if source.lower() in (\"project\", \"project-knowledge\"):\n return _resolve_project()\n\n # Archive file path\n p = os.path.expanduser(source)\n if os.path.isfile(p):\n return _resolve_archive(p)\n\n # Local directory\n if os.path.isdir(p):\n return os.path.abspath(p)\n\n raise FileNotFoundError(f\"Cannot resolve source: {source}\")\n\n\ndef _resolve_github(url: str, branch: str) -> str:\n \"\"\"Download a GitHub repo tarball and extract it.\"\"\"\n url = url.rstrip(\"/\")\n if url.endswith(\".git\"):\n url = url[:-4]\n\n parts = url.replace(\"https://github.com/\", \"\").split(\"/\")\n if len(parts) \u003c 2:\n raise ValueError(f\"Cannot parse GitHub URL: {url}\")\n\n owner, repo = parts[0], parts[1]\n dest = os.path.join(WORK_DIR, f\"{owner}-{repo}\")\n\n # Clean previous download\n if os.path.exists(dest):\n shutil.rmtree(dest)\n os.makedirs(dest, exist_ok=True)\n\n tarball_url = f\"https://codeload.github.com/{owner}/{repo}/tar.gz/{branch}\"\n tar_path = os.path.join(dest, \"repo.tar.gz\")\n\n for attempt in range(3):\n try:\n urllib.request.urlretrieve(tarball_url, tar_path)\n with tarfile.open(tar_path) as tf:\n tf.extractall(dest)\n os.remove(tar_path)\n # Find extracted directory\n for entry in os.listdir(dest):\n ep 
= os.path.join(dest, entry)\n if os.path.isdir(ep):\n return ep\n except Exception as e:\n if attempt == 2:\n raise RuntimeError(f\"Failed to download {owner}/{repo}@{branch}: {e}\")\n continue\n\n return dest\n\n\ndef _resolve_uploads() -> str:\n \"\"\"Use uploaded files from /mnt/user-data/uploads/.\"\"\"\n uploads = \"/mnt/user-data/uploads\"\n if not os.path.isdir(uploads):\n raise FileNotFoundError(\"No uploads directory found\")\n\n entries = os.listdir(uploads)\n if not entries:\n raise FileNotFoundError(\"No uploaded files found\")\n\n # If there's a single archive, extract it\n archives = [e for e in entries if e.endswith((\".zip\", \".tar.gz\", \".tgz\", \".tar\"))]\n if len(archives) == 1 and len(entries) == 1:\n return _resolve_archive(os.path.join(uploads, archives[0]))\n\n # Otherwise use the uploads directory as-is\n return uploads\n\n\ndef _resolve_project() -> str:\n \"\"\"Use project knowledge files from /mnt/project/.\"\"\"\n project = \"/mnt/project\"\n if not os.path.isdir(project):\n raise FileNotFoundError(\"No project directory found\")\n return project\n\n\ndef _resolve_archive(path: str) -> str:\n \"\"\"Extract an archive to a temp directory.\"\"\"\n dest = os.path.join(WORK_DIR, \"extracted\")\n if os.path.exists(dest):\n shutil.rmtree(dest)\n os.makedirs(dest)\n\n if path.endswith(\".zip\"):\n with zipfile.ZipFile(path) as zf:\n zf.extractall(dest)\n elif path.endswith((\".tar.gz\", \".tgz\")):\n with tarfile.open(path) as tf:\n tf.extractall(dest)\n elif path.endswith(\".tar\"):\n with tarfile.open(path) as tf:\n tf.extractall(dest)\n else:\n raise ValueError(f\"Unknown archive format: {path}\")\n\n # If archive contained a single directory, return that\n entries = os.listdir(dest)\n if len(entries) == 1 and os.path.isdir(os.path.join(dest, entries[0])):\n return os.path.join(dest, entries[0])\n return dest\n\n\ndef count_files(root: str, skip_dirs: set = None) -> int:\n \"\"\"Quick file count for deciding whether indexing is 
worthwhile.\"\"\"\n skip = skip_dirs or {\".git\", \"node_modules\", \"__pycache__\", \".venv\", \"venv\",\n \"dist\", \"build\", \".next\", \"target\", \"vendor\"}\n count = 0\n for _, dirs, files in os.walk(root):\n dirs[:] = [d for d in dirs if d not in skip]\n count += len(files)\n return count\n","content_type":"text/x-python; charset=utf-8","language":"python","size":4768,"content_sha256":"06a61ce2dacb80327ae0817d7b6222660047f8414944c2b2852ad81b44eb2af7"},{"filename":"scripts/search.py","content":"#!/usr/bin/env python3\n\"\"\"\nUnified code search: regex (n-gram indexed) and semantic (TF-IDF).\n\nUsage:\n # Auto-detect query type\n python search.py /path/to/repo \"def handle_error\"\n python search.py /path/to/repo \"retry logic with backoff\"\n\n # Explicit mode\n python search.py /path/to/repo \"class.*Error\" --regex\n python search.py /path/to/repo \"error handling\" --semantic\n\n # Multiple queries\n python search.py /path/to/repo \"def test_\" \"import os\" \"TODO|FIXME\"\n\n # GitHub repo\n python search.py https://github.com/org/repo \"authentication flow\"\n\n # Expand to full function bodies via tree-sitting AST\n python search.py /path/to/repo \"query\" --expand\n\n # Benchmark regex search: indexed vs brute-force\n python search.py /path/to/repo \"pattern\" --benchmark\n\n # JSON output\n python search.py /path/to/repo \"query\" --json\n\"\"\"\n\nimport argparse\nimport json\nimport os\nimport re\nimport subprocess\nimport sys\nimport time\n\n# Add script directory to path\nsys.path.insert(0, os.path.dirname(__file__))\n\nfrom resolve import resolve, count_files\n\n\n# Regex metacharacters that signal \"this is a regex, not natural language\"\n_REGEX_META = {'*', '+', '?', '[', ']', '(', ')', '{', '}', '|', '^', '
$'
, '\\\\', '.'}\n# Only flag as regex if the \"exotic\" ones appear (not just . or parens)\n_STRONG_REGEX_META = {'*', '+', '?', '[', ']', '{', '}', '^', '
$'
, '\\\\', '|'}\n\n\n# @lat: [[code-intelligence#Multi-Modal Search]]\ndef detect_mode(query: str) -> str:\n \"\"\"\n Heuristic: is this query a regex/literal or a conceptual search?\n\n Returns \"regex\" or \"semantic\".\n \"\"\"\n # Explicit regex markers\n if any(c in query for c in _STRONG_REGEX_META):\n return \"regex\"\n\n # Short queries with code-like tokens → regex (literal search)\n words = query.split()\n if len(words) \u003c= 3:\n # Looks like code: contains underscores, dots, camelCase, parens\n if any(c in query for c in \"_.()\"):\n return \"regex\"\n # Single identifier\n if len(words) == 1:\n return \"regex\"\n\n # Multi-word queries without code markers → semantic\n if len(words) >= 3:\n return \"semantic\"\n\n return \"regex\"\n\n\ndef search_regex(root: str, queries: list, expand: bool = False,\n benchmark: bool = False, verbose: bool = False,\n skip_dirs: set = None) -> dict:\n \"\"\"\n Regex/literal search using sparse n-gram index.\n\n Returns {query: [matches]} where each match has file, line, text,\n and optionally context (expanded function).\n \"\"\"\n from ngram_index import NgramIndex, _brute_force_search\n\n # Build index\n index = NgramIndex()\n file_count = count_files(root, skip_dirs)\n\n # Skip indexing for tiny codebases — just use ripgrep directly\n if file_count \u003c 20 and not benchmark:\n if verbose:\n print(f\"Small codebase ({file_count} files), using direct search\", file=sys.stderr)\n results = {}\n for q in queries:\n from ngram_index import _brute_force_search, DEFAULT_SKIP_DIRS\n matches = _brute_force_search(q, root, skip_dirs or DEFAULT_SKIP_DIRS)\n results[q] = _maybe_expand(matches, root, expand)\n return results\n\n index.build(root, skip_dirs=skip_dirs,\n use_frequency_weights=True, verbose=verbose)\n\n if verbose:\n s = index.stats\n print(f\"Index: {s['files_indexed']} files, {s['unique_ngrams']:,} n-grams, \"\n f\"{s['index_time_ms']:.0f}ms\", file=sys.stderr)\n\n results = {}\n for q in queries:\n if 
benchmark:\n _run_benchmark(index, q, root, skip_dirs or set())\n else:\n matches = index.search(q, root, max_results=500, verbose=verbose)\n results[q] = _maybe_expand(matches, root, expand)\n\n return results\n\n\ndef search_semantic(root: str, queries: list, expand: bool = False,\n verbose: bool = False, skip_dirs: set = None) -> dict:\n \"\"\"\n Semantic search using TF-IDF over code chunks.\n\n Returns {query: [matches]} with file, line, text, score.\n \"\"\"\n # Ensure sklearn is available\n try:\n from code_rag import Index\n except ImportError:\n subprocess.run(\n [\"uv\", \"pip\", \"install\", \"scikit-learn\", \"--system\"],\n capture_output=True,\n )\n from code_rag import Index\n\n index = Index()\n index.build(root, skip_dirs=skip_dirs)\n\n if verbose:\n s = index.stats()\n print(f\"TF-IDF index: {s.get('chunks', 0)} chunks, {s.get('vocabulary', 0)} terms, \"\n f\"{s.get('build_ms', 0)}ms\", file=sys.stderr)\n\n results = {}\n for q in queries:\n hits = index.search(q, top_k=20)\n matches = []\n for chunk, score in hits:\n matches.append({\n \"file\": os.path.join(root, chunk.file) if not os.path.isabs(chunk.file) else chunk.file,\n \"line\": chunk.line,\n \"text\": chunk.text[:200],\n \"score\": round(score, 4),\n \"kind\": chunk.kind,\n \"name\": chunk.name,\n })\n results[q] = matches\n\n return results\n\n\ndef _maybe_expand(matches: list, root: str, expand: bool) -> list:\n \"\"\"Optionally expand matches to full function context.\"\"\"\n if not expand:\n return matches\n\n from context import expand_match, deduplicate_contexts\n\n contexts = []\n for m in matches:\n ctx = expand_match(m[\"file\"], m[\"line\"], root, signatures_only=False)\n if ctx:\n m[\"context\"] = {\n \"name\": ctx.name,\n \"type\": ctx.node_type,\n \"start_line\": ctx.start_line,\n \"end_line\": ctx.end_line,\n \"source\": ctx.source,\n }\n return matches\n\n\ndef _run_benchmark(index, pattern, root, skip_dirs):\n \"\"\"Compare indexed search vs brute-force ripgrep.\"\"\"\n 
from ngram_index import _brute_force_search\n\n t0 = time.monotonic()\n indexed = index.search(pattern, root, max_results=5000, verbose=False)\n t_idx = (time.monotonic() - t0) * 1000\n\n t0 = time.monotonic()\n brute = _brute_force_search(pattern, root, skip_dirs, max_results=5000)\n t_brute = (time.monotonic() - t0) * 1000\n\n idx_files = {m[\"file\"] for m in indexed}\n brute_files = {m[\"file\"] for m in brute}\n missed = brute_files - idx_files\n\n print(f\"\\n{'='*60}\")\n print(f\"BENCHMARK: '{pattern}'\")\n print(f\"{'='*60}\")\n print(f\" Indexed: {t_idx:8.1f}ms ({len(indexed)} matches, {len(idx_files)} files)\")\n print(f\" Brute rg: {t_brute:8.1f}ms ({len(brute)} matches, {len(brute_files)} files)\")\n if t_brute > 0:\n print(f\" Speedup: {t_brute / max(t_idx, 0.1):.1f}x\")\n if missed:\n print(f\" ⚠ Missed: {len(missed)} files\")\n elif not (idx_files - brute_files):\n print(f\" ✓ Results match\")\n\n\ndef format_results(results: dict, root: str, output_json: bool = False) -> str:\n \"\"\"Format search results for display.\"\"\"\n if output_json:\n # Make paths relative\n for q, matches in results.items():\n for m in matches:\n try:\n m[\"file\"] = os.path.relpath(m[\"file\"], root)\n except ValueError:\n pass\n return json.dumps(results, indent=2)\n\n lines = []\n for query, matches in results.items():\n if len(results) > 1:\n lines.append(f\"\\n--- {query} ---\")\n\n if not matches:\n lines.append(\"No matches found.\")\n continue\n\n lines.append(f\"{len(matches)} match{'es' if len(matches) != 1 else ''}\")\n\n for m in matches[:30]:\n try:\n rel = os.path.relpath(m[\"file\"], root)\n except ValueError:\n rel = m[\"file\"]\n\n if \"score\" in m:\n lines.append(f\" {rel}:{m['line']} [{m['score']:.3f}] {m.get('name', '')}\")\n else:\n text = m[\"text\"][:150].rstrip()\n lines.append(f\" {rel}:{m['line']}: {text}\")\n\n if \"context\" in m:\n ctx = m[\"context\"]\n lines.append(f\" → {ctx['type']} {ctx['name']} \"\n f\"(lines 
{ctx['start_line']}-{ctx['end_line']})\")\n\n if len(matches) > 30:\n lines.append(f\" ... and {len(matches) - 30} more\")\n\n return \"\\n\".join(lines)\n\n\ndef main():\n parser = argparse.ArgumentParser(description=\"Unified code search\")\n parser.add_argument(\"source\", help=\"Path, GitHub URL, 'uploads', or 'project'\")\n parser.add_argument(\"queries\", nargs=\"+\", help=\"Search queries\")\n parser.add_argument(\"--regex\", action=\"store_true\", help=\"Force regex mode\")\n parser.add_argument(\"--semantic\", action=\"store_true\", help=\"Force semantic mode\")\n parser.add_argument(\"--expand\", action=\"store_true\", help=\"Expand to full function bodies\")\n parser.add_argument(\"--benchmark\", action=\"store_true\", help=\"Benchmark indexed vs brute-force\")\n parser.add_argument(\"--branch\", default=\"main\", help=\"Git branch for GitHub URLs\")\n parser.add_argument(\"--skip\", default=None, help=\"Comma-separated directories to skip\")\n parser.add_argument(\"--json\", action=\"store_true\", help=\"JSON output\")\n parser.add_argument(\"-v\", \"--verbose\", action=\"store_true\")\n\n args = parser.parse_args()\n\n # Resolve source\n root = resolve(args.source, args.branch)\n if args.verbose:\n print(f\"Resolved: {root} ({count_files(root)} files)\", file=sys.stderr)\n\n skip_dirs = None\n if args.skip:\n skip_dirs = set(args.skip.split(\",\"))\n\n # Route queries\n all_results = {}\n for query in args.queries:\n if args.regex:\n mode = \"regex\"\n elif args.semantic:\n mode = \"semantic\"\n else:\n mode = detect_mode(query)\n\n if args.verbose:\n print(f\"Query: '{query}' → {mode} mode\", file=sys.stderr)\n\n # Batch by mode for efficiency (one index build per mode)\n regex_queries = []\n semantic_queries = []\n for query in args.queries:\n if args.regex:\n regex_queries.append(query)\n elif args.semantic:\n semantic_queries.append(query)\n else:\n mode = detect_mode(query)\n if mode == \"regex\":\n regex_queries.append(query)\n else:\n 
semantic_queries.append(query)\n\n if regex_queries:\n results = search_regex(root, regex_queries, expand=args.expand,\n benchmark=args.benchmark, verbose=args.verbose,\n skip_dirs=skip_dirs)\n all_results.update(results)\n\n if semantic_queries:\n results = search_semantic(root, semantic_queries, expand=args.expand,\n verbose=args.verbose, skip_dirs=skip_dirs)\n all_results.update(results)\n\n if not args.benchmark:\n print(format_results(all_results, root, args.json))\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":10971,"content_sha256":"d1fc135aaf7ada59c0e139d125246df36654a96e79977f038cf4161165aa22fd"},{"filename":"scripts/sparse_ngrams.py","content":"\"\"\"\nSparse N-gram extraction for fast regex search indexing.\n\nBased on the approach described by Cursor (2026): variable-length n-grams\nselected deterministically via a weight function over character pairs.\n\nTwo modes:\n- build_all: Extract ALL valid sparse n-grams (used at index time)\n- build_covering: Extract MINIMAL covering set (used at query time)\n\nA sparse n-gram is a substring where the character-pair weights at both\nboundary positions are strictly greater than all interior weights.\n\"\"\"\n\nimport zlib\nfrom typing import List, Tuple, Optional\nfrom collections import Counter\n\n\ndef weight_crc32(a: int, b: int) -> int:\n \"\"\"CRC32-based weight for a character pair. Deterministic, uniform.\"\"\"\n return zlib.crc32(bytes([a, b])) & 0xFFFFFFFF\n\n\nclass FrequencyWeights:\n \"\"\"\n Frequency-based weight function: rare character pairs get HIGH weights,\n common pairs get LOW weights. 
This produces longer n-grams at rare\n boundaries (more selective posting lists) and shorter n-grams at common\n boundaries (acceptable since they appear everywhere anyway).\n \"\"\"\n\n def __init__(self):\n self._freq: dict[tuple[int, int], int] = {}\n self._max_freq: int = 1\n self._frozen = False\n\n def train(self, data: bytes):\n \"\"\"Accumulate character pair frequencies from training data.\"\"\"\n if self._frozen:\n raise RuntimeError(\"Cannot train after freezing\")\n for i in range(len(data) - 1):\n pair = (data[i], data[i + 1])\n self._freq[pair] = self._freq.get(pair, 0) + 1\n\n def freeze(self):\n \"\"\"Finalize the frequency table. Converts frequencies to weights.\"\"\"\n if self._freq:\n self._max_freq = max(self._freq.values())\n self._frozen = True\n\n def weight(self, a: int, b: int) -> int:\n \"\"\"\n Weight for a character pair. Higher = rarer.\n Uses inverted frequency: rare pairs get high weights.\n Falls back to CRC32 for unseen pairs (treated as very rare).\n \"\"\"\n if not self._frozen:\n raise RuntimeError(\"Must freeze() before computing weights\")\n freq = self._freq.get((a, b), 0)\n if freq == 0:\n # Unseen pair = very rare = high weight\n return self._max_freq + weight_crc32(a, b) % (self._max_freq // 2 + 1)\n # Invert: rare = high weight\n return self._max_freq - freq + 1\n\n def save(self) -> bytes:\n \"\"\"Serialize frequency table.\"\"\"\n import json\n data = {\n \"freq\": {f\"{a},{b}\": c for (a, b), c in self._freq.items()},\n \"max_freq\": self._max_freq,\n }\n return json.dumps(data).encode()\n\n @classmethod\n def load(cls, raw: bytes) -> \"FrequencyWeights\":\n \"\"\"Deserialize frequency table.\"\"\"\n import json\n data = json.loads(raw)\n w = cls()\n w._freq = {\n (int(k.split(\",\")[0]), int(k.split(\",\")[1])): v\n for k, v in data[\"freq\"].items()\n }\n w._max_freq = data[\"max_freq\"]\n w._frozen = True\n return w\n\n\ndef compute_weights(\n text: bytes, weight_fn=weight_crc32\n) -> List[int]:\n \"\"\"Compute 
weights for all consecutive character pairs in text.\"\"\"\n if len(text) \u003c 2:\n return []\n return [weight_fn(text[i], text[i + 1]) for i in range(len(text) - 1)]\n\n\ndef build_all(weights: List[int]) -> List[Tuple[int, int]]:\n \"\"\"\n Extract ALL valid sparse n-grams from a weight sequence.\n\n Uses a monotone stack algorithm (O(n) amortized).\n\n Returns list of (start_pair_pos, end_pair_pos) where each n-gram\n spans characters [start_pair_pos, end_pair_pos + 2) in the original text.\n\n A sparse n-gram from pair position a to pair position b is valid iff\n w[a] > w[k] and w[b] > w[k] for all a \u003c k \u003c b.\n \"\"\"\n n = len(weights)\n if n == 0:\n return []\n if n == 1:\n return [(0, 0)]\n\n ngrams = []\n # Monotone decreasing stack of pair positions\n stack: List[int] = []\n\n for i in range(n):\n # Pop positions dominated by current weight\n while stack and weights[i] >= weights[stack[-1]]:\n j = stack.pop()\n # (j, i) is valid: j and i are both >= weights[j],\n # and everything between j and i on the stack was already\n # popped (so had weight \u003c w[j] \u003c w[i])\n ngrams.append((j, i))\n\n # Adjacent stack entry to current position forms valid n-gram\n if stack:\n ngrams.append((stack[-1], i))\n\n stack.append(i)\n\n return ngrams\n\n\ndef build_covering(weights: List[int]) -> List[Tuple[int, int]]:\n \"\"\"\n Extract the MINIMAL covering set of sparse n-grams.\n\n Used at query time: produces the fewest, longest n-grams needed\n to look up in the index. 
Any document containing the query text\n must contain all of these n-grams.\n\n Greedy: from each position, jump to the farthest valid endpoint\n (the first position with weight >= current).\n \"\"\"\n n = len(weights)\n if n == 0:\n return []\n if n == 1:\n return [(0, 0)]\n\n ngrams = []\n i = 0\n\n while i \u003c n:\n # Find the first position j > i where w[j] >= w[i]\n j = i + 1\n while j \u003c n and weights[j] \u003c weights[i]:\n j += 1\n\n if j >= n:\n # No higher weight found — take the highest remaining position\n # as the endpoint (best available boundary)\n if i \u003c n - 1:\n best = i + 1\n for k in range(i + 2, n):\n if weights[k] > weights[best]:\n best = k\n ngrams.append((i, best))\n i = best\n else:\n # At the last position, nothing more to cover\n break\n else:\n ngrams.append((i, j))\n i = j\n\n return ngrams\n\n\ndef ngram_text(text: bytes, start: int, end: int) -> bytes:\n \"\"\"\n Extract the n-gram substring from text given pair positions.\n Pair position p corresponds to characters text[p:p+2].\n N-gram from pair a to pair b spans text[a:b+2].\n \"\"\"\n return text[start : end + 2]\n\n\ndef ngram_hash(text: bytes, start: int, end: int) -> int:\n \"\"\"Hash an n-gram for use as index key. Uses CRC32 for speed.\"\"\"\n return zlib.crc32(text[start : end + 2]) & 0xFFFFFFFF\n","content_type":"text/x-python; charset=utf-8","language":"python","size":6312,"content_sha256":"0460cc648e296dfd3f35997780ce3d95fa6bb0653da9a7a7ffd7bd674f241046"}],"content_json":{"type":"doc","content":[{"type":"heading","attrs":{"level":1},"content":[{"text":"Searching Codebases","type":"text"}]},{"type":"paragraph","content":[{"text":"Find code in any codebase by pattern or concept. 
One entry point, two search strategies, automatic routing.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Prerequisites","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"uv tool install ripgrep","type":"text"}]},{"type":"paragraph","content":[{"text":"tree-sitting (for structural context expansion) installs automatically when the ","type":"text"},{"text":"--expand","type":"text","marks":[{"type":"code_inline"}]},{"text":" flag is used.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Primary Command","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"SKILL_DIR=/mnt/skills/user/searching-codebases\n\npython3 $SKILL_DIR/scripts/search.py SOURCE \"query1\" [\"query2\" ...] [OPTIONS]","type":"text"}]},{"type":"paragraph","content":[{"text":"SOURCE is any of:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Local directory path","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"GitHub URL (downloads tarball automatically)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"uploads","type":"text","marks":[{"type":"code_inline"}]},{"text":" (uses ","type":"text"},{"text":"/mnt/user-data/uploads/","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"project","type":"text","marks":[{"type":"code_inline"}]},{"text":" (uses ","type":"text"},{"text":"/mnt/project/","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Path to a ","type":"text"},{"text":".zip","type":"text","marks":[{"type":"code_inline"}]},{"text":" or 
","type":"text"},{"text":".tar.gz","type":"text","marks":[{"type":"code_inline"}]},{"text":" archive","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Search Modes","type":"text"}]},{"type":"paragraph","content":[{"text":"Regex mode","type":"text","marks":[{"type":"strong"}]},{"text":" (patterns, identifiers, literal text):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python3 $SKILL_DIR/scripts/search.py ./repo \"def handle_error\"\npython3 $SKILL_DIR/scripts/search.py ./repo \"class.*Exception\" --regex\npython3 $SKILL_DIR/scripts/search.py ./repo \"TODO|FIXME|HACK\"","type":"text"}]},{"type":"paragraph","content":[{"text":"Semantic mode","type":"text","marks":[{"type":"strong"}]},{"text":" (concepts, natural language):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python3 $SKILL_DIR/scripts/search.py ./repo \"retry logic with backoff\" --semantic\npython3 $SKILL_DIR/scripts/search.py ./repo \"authentication flow\"\npython3 $SKILL_DIR/scripts/search.py ./repo \"error handling strategy\"","type":"text"}]},{"type":"paragraph","content":[{"text":"Auto-detection: short queries and code-like tokens → regex. Multi-word natural language → semantic. 
Override with ","type":"text"},{"text":"--regex","type":"text","marks":[{"type":"code_inline"}]},{"text":" or ","type":"text"},{"text":"--semantic","type":"text","marks":[{"type":"code_inline"}]},{"text":".","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Options","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--regex","type":"text","marks":[{"type":"code_inline"}]},{"text":" / ","type":"text"},{"text":"--semantic","type":"text","marks":[{"type":"code_inline"}]},{"text":": Force search mode","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--expand","type":"text","marks":[{"type":"code_inline"}]},{"text":": Return full function bodies via tree-sitting AST context","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--benchmark","type":"text","marks":[{"type":"code_inline"}]},{"text":": Compare indexed regex vs brute-force ripgrep","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--branch NAME","type":"text","marks":[{"type":"code_inline"}]},{"text":": Git branch for GitHub URLs (default: main)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--skip DIRS","type":"text","marks":[{"type":"code_inline"}]},{"text":": Comma-separated directories to skip","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--json","type":"text","marks":[{"type":"code_inline"}]},{"text":": Machine-readable output","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"-v","type":"text","marks":[{"type":"code_inline"}]},{"text":": Show index stats and query routing decisions","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"How It Works","type":"text"}]},{"type":"paragraph","content":[{"text":"Regex 
search","type":"text","marks":[{"type":"strong"}]},{"text":" builds a sparse n-gram inverted index over all files. Queries are decomposed into literal fragments, looked up in the index to identify candidate files (typically 90-99% reduction), then verified with ripgrep. Frequency-weighted n-grams make rare character sequences more selective.","type":"text"}]},{"type":"paragraph","content":[{"text":"Semantic search","type":"text","marks":[{"type":"strong"}]},{"text":" builds a TF-IDF index over code chunks (functions, classes, structural entries). Queries are ranked by cosine similarity.","type":"text"}]},{"type":"paragraph","content":[{"text":"Context expansion","type":"text","marks":[{"type":"strong"}]},{"text":" (","type":"text"},{"text":"--expand","type":"text","marks":[{"type":"code_inline"}]},{"text":") uses tree-sitting's AST cache to identify function/class boundaries, returning complete structural units rather than line fragments. On first use, tree-sitting scans the repo (~700ms for 250 files); subsequent expansions are sub-millisecond.","type":"text"}]},{"type":"paragraph","content":[{"text":"Small codebases","type":"text","marks":[{"type":"strong"}]},{"text":" (\u003c 20 files) skip indexing entirely — direct ripgrep is faster when there's nothing to narrow.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Mixed Queries","type":"text"}]},{"type":"paragraph","content":[{"text":"Multiple queries can use different modes in a single invocation. 
Each query is auto-routed independently, and indexes are built once per mode:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python3 $SKILL_DIR/scripts/search.py ./repo \\\n \"class.*Error\" \\\n \"error recovery strategy\" \\\n \"def retry\"","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Dependencies","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"tree-sitting","type":"text","marks":[{"type":"strong"}]},{"text":": Provides AST-based context expansion for ","type":"text"},{"text":"--expand","type":"text","marks":[{"type":"code_inline"}]},{"text":". Not required — search works without it, just with less structural context in results.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"ripgrep","type":"text","marks":[{"type":"strong"}]},{"text":": Required for regex verification. Install via ","type":"text"},{"text":"uv tool install ripgrep","type":"text","marks":[{"type":"code_inline"}]},{"text":".","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"scikit-learn","type":"text","marks":[{"type":"strong"}]},{"text":": Required for semantic mode. 
Installs automatically.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"When to Use","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Known target","type":"text","marks":[{"type":"strong"}]},{"text":": \"where is the retry logic?\", \"find all error handlers\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Pattern matching","type":"text","marks":[{"type":"strong"}]},{"text":": regex across large codebases with indexed speedup","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Concept search","type":"text","marks":[{"type":"strong"}]},{"text":": \"authentication flow\", \"database connection pooling\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Cross-reference","type":"text","marks":[{"type":"strong"}]},{"text":": find all callers/users of a specific function","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"When NOT to Use","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"First encounter","type":"text","marks":[{"type":"strong"}]},{"text":": \"what does this repo do?\" → use exploring-codebases","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Repos under ~10 files","type":"text","marks":[{"type":"strong"}]},{"text":": just read them directly","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Exact symbol lookup","type":"text","marks":[{"type":"strong"}]},{"text":": ","type":"text"},{"text":"find_symbol('ClassName')","type":"text","marks":[{"type":"code_inline"}]},{"text":" via tree-sitting is simpler","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Structural 
overview","type":"text","marks":[{"type":"strong"}]},{"text":": use tree-sitting's ","type":"text"},{"text":"tree_overview()","type":"text","marks":[{"type":"code_inline"}]},{"text":" / ","type":"text"},{"text":"dir_overview()","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Files","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"scripts/search.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Entry point, query routing, output formatting","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"scripts/resolve.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Input source resolution (GitHub, uploads, archives)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"scripts/context.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" — tree-sitting-based AST context expansion","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"scripts/ngram_index.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Sparse n-gram inverted index, regex decomposition","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"scripts/sparse_ngrams.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Core n-gram algorithms, frequency weights","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"scripts/code_rag.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" — TF-IDF semantic search over code 
chunks","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}}]},"metadata":{"date":"2026-06-05","name":"searching-codebases","author":"@skillopedia","source":{"stars":124,"repo_name":"claude-skills","origin_url":"https://github.com/oaustegard/claude-skills/blob/HEAD/searching-codebases/SKILL.md","repo_owner":"oaustegard","body_sha256":"e0ab8228045cd8b3bf5c771f1b0d38f3ceb5ceb0bc102a8a0905200481b4b6a2","cluster_key":"2d33203d10273d6bc936eb1dd3938297ce55f26b6d551be12373458a68e24558","clean_bundle":{"format":"clean-skill-bundle-v1","source":"oaustegard/claude-skills/searching-codebases/SKILL.md","attachments":[{"id":"82728e6f-a15f-5dc7-87da-df43fe0bb6b5","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/82728e6f-a15f-5dc7-87da-df43fe0bb6b5/attachment.md","path":"CHANGELOG.md","size":1155,"sha256":"ef5b53d6d60c94cea7a01dfece7c194b12b2db44b8e92bb56e1f4a38459b4728","contentType":"text/markdown; charset=utf-8"},{"id":"ae247ad4-15ae-5aca-a4ee-66799bdc27c9","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/ae247ad4-15ae-5aca-a4ee-66799bdc27c9/attachment.md","path":"README.md","size":1172,"sha256":"93da16097f130c6a605603daad96d45ce62c6b5750802017970064e3a5e2d507","contentType":"text/markdown; charset=utf-8"},{"id":"b0b9d1dd-f47b-535b-84e7-25c24388f7e7","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/b0b9d1dd-f47b-535b-84e7-25c24388f7e7/attachment.py","path":"scripts/code_rag.py","size":22048,"sha256":"46d721c89eb960be71feb6fac3970e58dd9d12a266b72dfbdc9ac1c0975a2e10","contentType":"text/x-python; charset=utf-8"},{"id":"441ca273-16cc-527a-b5b0-00508d94942f","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/441ca273-16cc-527a-b5b0-00508d94942f/attachment.py","path":"scripts/context.py","size":6550,"sha256":"1d8a414ed2cb3debc1a5c104cd655287c942051f5263193b48474684cda10d0c","contentType":"text/x-python; 
charset=utf-8"},{"id":"618b6f4a-77ff-56f9-863e-9a4ca9dbe718","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/618b6f4a-77ff-56f9-863e-9a4ca9dbe718/attachment.py","path":"scripts/ngram_index.py","size":21371,"sha256":"64ec4d3188d84fcd05e0ea826275f519f2ed0e47a562c252404c88824ae2c71e","contentType":"text/x-python; charset=utf-8"},{"id":"400b6395-7865-56c2-bd10-c41a9a96b0c6","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/400b6395-7865-56c2-bd10-c41a9a96b0c6/attachment.py","path":"scripts/resolve.py","size":4768,"sha256":"06a61ce2dacb80327ae0817d7b6222660047f8414944c2b2852ad81b44eb2af7","contentType":"text/x-python; charset=utf-8"},{"id":"dec5213b-70ed-58e8-a616-b81ed666e4a7","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/dec5213b-70ed-58e8-a616-b81ed666e4a7/attachment.py","path":"scripts/search.py","size":10971,"sha256":"d1fc135aaf7ada59c0e139d125246df36654a96e79977f038cf4161165aa22fd","contentType":"text/x-python; charset=utf-8"},{"id":"a7f23169-72a0-5d59-b634-6c643a69c349","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/a7f23169-72a0-5d59-b634-6c643a69c349/attachment.py","path":"scripts/sparse_ngrams.py","size":6312,"sha256":"0460cc648e296dfd3f35997780ce3d95fa6bb0653da9a7a7ffd7bd674f241046","contentType":"text/x-python; charset=utf-8"}],"bundle_sha256":"3eb531e6e845a629348855e951df2aa739f6dbee6bb63dba567fc2f25dca5b65","attachment_count":8,"text_attachments":8,"attachment_storage":"skillopedia-attachments-v1","binary_attachments":0,"excluded_attachments":[]},"cluster_size":1,"skill_md_path":"searching-codebases/SKILL.md","import_metadata":{"date":"2026-06-05","author":"@skillopedia","version":"v1","category":"data-analytics","category_label":"Data"},"exact_dupes_collapsed_into_this":0},"version":"v1","category":"data-analytics","metadata":{"version":"2.0.0"},"import_tag":"clean-skills-v1","description":"Find code by regex pattern or natural language concept in any codebase. 
Auto-routes between n-gram indexed regex search (2-20x faster than ripgrep) and TF-IDF semantic search. Expands results to full functions via tree-sitting AST data. Accepts GitHub URLs, local directories, uploaded files/archives, or project knowledge. Use when asked to find implementations, search for patterns, or answer \"where is X\" / \"how does Y work\" about code. Triggers on \"search this repo\", \"find where X is\", \"grep for\", \"what handles Y\", regex patterns, or natural-language questions about code. This is the convergent \"find X\" skill — for first-encounter orientation, use exploring-codebases instead."}},"renderedAt":1782986527747}

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.