github-research — Skillopedia

# GitHub Research Skill

## Trigger

Activate this skill when the user wants to:

- "Find repos for [topic]", "GitHub research on [topic]"
- "Analyze open-source code for [topic]"
- "Find implementations of [paper/technique]"
- "Which repos implement [algorithm]?"

Also activate it when the user invokes the skill's slash command.

## Overview

This skill systematically discovers, evaluates, and deeply analyzes GitHub repositories related to a research topic. It reads deep-research output (paper database, phase reports, code references) and produces an actionable integration blueprint for reusing open-source code.

## Installation

: — scripts, references,…

, text, re.MULTILINE):\n heading = match.group(1).strip()\n if len(heading) > 3:\n results.append({\n \"type\": \"keyword\",\n \"value\": heading.lower(),\n \"frequency\": 1,\n \"source\": \"paper_titles\",\n })\n\n return results\n\n\ndef main():\n parser = argparse.ArgumentParser(\n description=\"Extract GitHub URLs, paper refs, and keywords from deep-research output.\"\n )\n parser.add_argument(\"--research-dir\", required=True,\n help=\"Path to deep-research output directory\")\n parser.add_argument(\"--output\", required=True,\n help=\"Output JSONL file path\")\n args = parser.parse_args()\n\n research_dir = Path(args.research_dir).resolve()\n if not research_dir.is_dir():\n print(f\"[error] research dir not found: {research_dir}\", file=sys.stderr)\n sys.exit(1)\n\n print(f\"[info] scanning {research_dir} ...\", file=sys.stderr)\n\n # 1. Extract GitHub URLs from all markdown files\n github_refs = scan_md_files_for_urls(research_dir)\n print(f\" GitHub URLs: {len(github_refs)}\", file=sys.stderr)\n\n # 2. Parse paper_db.jsonl\n papers = parse_paper_db(research_dir)\n print(f\" Papers: {len(papers)}\", file=sys.stderr)\n\n # 3. Extract keywords from papers\n keywords = extract_keywords(papers)\n\n # 4. 
Extract themes from synthesis/report headings\n themes = extract_synthesis_themes(research_dir)\n keywords.extend(themes)\n\n # Deduplicate keywords by value\n seen_kw: set[str] = set()\n deduped_kw: list[dict] = []\n for kw in keywords:\n if kw[\"value\"] not in seen_kw:\n seen_kw.add(kw[\"value\"])\n deduped_kw.append(kw)\n keywords = deduped_kw\n\n print(f\" Keywords: {len(keywords)}\", file=sys.stderr)\n\n # Write output\n output_path = Path(args.output)\n output_path.parent.mkdir(parents=True, exist_ok=True)\n with open(output_path, \"w\", encoding=\"utf-8\") as f:\n for record in github_refs:\n f.write(json.dumps(record, ensure_ascii=False) + \"\\n\")\n for record in papers:\n f.write(json.dumps(record, ensure_ascii=False) + \"\\n\")\n for record in keywords:\n f.write(json.dumps(record, ensure_ascii=False) + \"\\n\")\n\n # Summary stats\n unique_repos = len({r[\"repo_id\"] for r in github_refs})\n print(\n f\"Extracted {unique_repos} GitHub URLs, \"\n f\"{len(papers)} papers, {len(keywords)} keywords \"\n f\"-> {output_path}\",\n file=sys.stderr,\n )\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":10894,"content_sha256":"71708bddd8c8c8c7010b9f2069f0ac7a1171600dcbf222b298367b3c02651b4a"},{"filename":"scripts/find_implementations.py","content":"#!/usr/bin/env python3\n\"\"\"Search cloned repo for specific implementations (classes, functions, algorithms).\n\nUsage:\n python find_implementations.py --repo-dir ./repos/owner_name --patterns \"class Transformer\" \"def train\" --output matches.jsonl\n python find_implementations.py --repo-dir ./repos/owner_name --keywords \"attention\" \"embedding\" --output matches.jsonl\n\"\"\"\n\nimport argparse\nimport json\nimport os\nimport re\nimport sys\nfrom pathlib import Path\n\n\n# ---------------------------------------------------------------------------\n# Constants\n# 
---------------------------------------------------------------------------\n\nSKIP_DIRS: frozenset[str] = frozenset({\n \".git\", \"node_modules\", \"__pycache__\", \".venv\", \"venv\",\n \".tox\", \".mypy_cache\", \".pytest_cache\", \".eggs\", \"dist\", \"build\",\n \".next\", \".nuxt\",\n})\n\nTEST_INDICATORS: tuple[str, ...] = (\n \"/test/\", \"/tests/\", \"test_\", \"_test.py\", \".test.\", \".spec.\",\n)\n\nMAX_FILE_SIZE: int = 1_048_576 # 1 MB\n\n\n# ---------------------------------------------------------------------------\n# Helpers\n# ---------------------------------------------------------------------------\n\ndef is_binary(filepath: Path) -> bool:\n \"\"\"Check if a file is likely binary by reading the first 8 KB.\"\"\"\n try:\n with open(filepath, \"rb\") as f:\n chunk = f.read(8192)\n return b\"\\x00\" in chunk\n except OSError:\n return True\n\n\ndef is_test_path(rel_path: str) -> bool:\n \"\"\"Return True if the relative path looks like a test file.\"\"\"\n rel_lower = rel_path.lower()\n for indicator in TEST_INDICATORS:\n if indicator in rel_lower:\n return True\n # Also check basename prefix\n basename = os.path.basename(rel_lower)\n if basename.startswith(\"test_\"):\n return True\n return False\n\n\ndef classify_pattern(pattern: str) -> tuple[str, re.Pattern[str]]:\n \"\"\"Classify a search pattern and return (match_type, compiled_regex).\n\n Patterns starting with \"class \" or \"def \" get special match types.\n Other patterns are compiled as-is (user-supplied regex).\n \"\"\"\n if pattern.startswith(\"class \"):\n name = pattern[6:].strip()\n return \"class\", re.compile(r\"class\\s+\" + re.escape(name))\n elif pattern.startswith(\"def \"):\n name = pattern[4:].strip()\n return \"function\", re.compile(r\"def\\s+\" + re.escape(name))\n else:\n try:\n return \"keyword\", re.compile(pattern)\n except re.error:\n # Fall back to escaped literal if pattern is not valid regex\n return \"keyword\", re.compile(re.escape(pattern))\n\n\ndef 
read_lines(filepath: Path) -> list[str] | None:\n \"\"\"Read all lines from a text file. Returns None on decode failure.\"\"\"\n try:\n with open(filepath, \"r\", encoding=\"utf-8\", errors=\"strict\") as f:\n return f.readlines()\n except (OSError, UnicodeDecodeError):\n return None\n\n\n# ---------------------------------------------------------------------------\n# Search logic\n# ---------------------------------------------------------------------------\n\ndef search_file(\n filepath: Path,\n rel_path: str,\n patterns: list[tuple[str, re.Pattern[str]]],\n keywords: list[str],\n context: int,\n repo_id: str,\n) -> list[dict]:\n \"\"\"Search a single file for patterns and keywords. Returns match dicts.\"\"\"\n lines = read_lines(filepath)\n if lines is None:\n return []\n\n results: list[dict] = []\n\n # --- Pattern search (regex) ---\n for match_type, regex in patterns:\n for i, line in enumerate(lines):\n if regex.search(line):\n start = max(0, i - context)\n end = min(len(lines), i + context + 1)\n results.append({\n \"repo_id\": repo_id,\n \"file_path\": rel_path,\n \"line_number\": i + 1,\n \"match_type\": match_type,\n \"matched_text\": line.rstrip(\"\\n\"),\n \"context_before\": [\n l.rstrip(\"\\n\") for l in lines[start:i]\n ],\n \"context_after\": [\n l.rstrip(\"\\n\") for l in lines[i + 1:end]\n ],\n })\n\n # --- Keyword search (case-insensitive substring) ---\n for kw in keywords:\n kw_lower = kw.lower()\n for i, line in enumerate(lines):\n if kw_lower in line.lower():\n start = max(0, i - context)\n end = min(len(lines), i + context + 1)\n results.append({\n \"repo_id\": repo_id,\n \"file_path\": rel_path,\n \"line_number\": i + 1,\n \"match_type\": \"keyword\",\n \"matched_text\": line.rstrip(\"\\n\"),\n \"context_before\": [\n l.rstrip(\"\\n\") for l in lines[start:i]\n ],\n \"context_after\": [\n l.rstrip(\"\\n\") for l in lines[i + 1:end]\n ],\n })\n\n return results\n\n\n# 
---------------------------------------------------------------------------\n# Main\n# ---------------------------------------------------------------------------\n\ndef main() -> None:\n parser = argparse.ArgumentParser(\n description=\"Search cloned repo for specific implementations \"\n \"(classes, functions, algorithms)\",\n )\n parser.add_argument(\n \"--repo-dir\", required=True,\n help=\"Path to the cloned repository\",\n )\n parser.add_argument(\n \"--patterns\", nargs=\"+\", default=None,\n help='Regex patterns (e.g. \"class Transformer\" \"def train\")',\n )\n parser.add_argument(\n \"--keywords\", nargs=\"+\", default=None,\n help=\"Simple keyword search terms (case-insensitive substring match)\",\n )\n parser.add_argument(\n \"--output\", required=True,\n help=\"Output JSONL file path\",\n )\n parser.add_argument(\n \"--context\", type=int, default=5,\n help=\"Lines of context before and after each match (default: 5)\",\n )\n parser.add_argument(\n \"--skip-tests\", action=\"store_true\",\n help=\"Skip test files and test directories\",\n )\n parser.add_argument(\n \"--repo-id\", default=None,\n help=\"Repository identifier (owner/name); inferred from dir name if omitted\",\n )\n\n args = parser.parse_args()\n\n if not args.patterns and not args.keywords:\n parser.error(\"Provide at least one of --patterns or --keywords\")\n\n repo_dir = Path(args.repo_dir).resolve()\n if not repo_dir.is_dir():\n print(f\"Error: repo directory not found: {repo_dir}\", file=sys.stderr)\n sys.exit(1)\n\n # Infer repo_id\n repo_id: str = args.repo_id or \"\"\n if not repo_id:\n dir_name = repo_dir.name\n repo_id = dir_name.replace(\"_\", \"/\", 1) if \"_\" in dir_name else dir_name\n\n # Classify patterns\n classified_patterns: list[tuple[str, re.Pattern[str]]] = []\n if args.patterns:\n for p in args.patterns:\n match_type, regex = classify_pattern(p)\n classified_patterns.append((match_type, regex))\n print(f\" Pattern: '{p}' -> type={match_type}\", file=sys.stderr)\n\n 
keyword_list: list[str] = args.keywords or []\n if keyword_list:\n print(f\" Keywords: {keyword_list}\", file=sys.stderr)\n\n # Walk and search\n total_matches = 0\n files_with_matches = 0\n files_scanned = 0\n files_skipped = 0\n\n output_path = Path(args.output)\n output_path.parent.mkdir(parents=True, exist_ok=True)\n\n print(f\"Searching {repo_dir} ...\", file=sys.stderr)\n\n with open(output_path, \"w\", encoding=\"utf-8\") as out_f:\n for dirpath, dirnames, filenames in os.walk(repo_dir):\n # Filter directories in-place to prune traversal\n dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS]\n\n for fname in filenames:\n filepath = Path(dirpath) / fname\n rel_path = str(filepath.relative_to(repo_dir)).replace(os.sep, \"/\")\n\n # Skip test files if requested\n if args.skip_tests and is_test_path(rel_path):\n files_skipped += 1\n continue\n\n # Skip files that are too large\n try:\n size = filepath.stat().st_size\n except OSError:\n continue\n if size > MAX_FILE_SIZE:\n files_skipped += 1\n continue\n\n # Skip binary files\n if is_binary(filepath):\n files_skipped += 1\n continue\n\n files_scanned += 1\n matches = search_file(\n filepath, rel_path,\n classified_patterns, keyword_list,\n args.context, repo_id,\n )\n\n if matches:\n files_with_matches += 1\n for m in matches:\n out_f.write(json.dumps(m, ensure_ascii=False) + \"\\n\")\n total_matches += 1\n\n print(\n f\"Found {total_matches} matches across {files_with_matches} files \"\n f\"({files_scanned} scanned, {files_skipped} skipped) -> {args.output}\",\n file=sys.stderr,\n )\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":9370,"content_sha256":"446e075b7433ebcf2cda8b4849b4cc6c09ade6bb8fe6c0fd7ad479eba166298b"},{"filename":"scripts/repo_db.py","content":"#!/usr/bin/env python3\n\"\"\"JSONL GitHub repo database management.\n\nSubcommands: merge, filter, score, search, tag, stats, export, rank.\nDeduplication by exact repo_id 
(owner/name) match.\n\nUsage:\n python repo_db.py merge --inputs a.jsonl b.jsonl --output merged.jsonl\n python repo_db.py filter --input db.jsonl --output filtered.jsonl --min-stars 100\n python repo_db.py score --input db.jsonl --output scored.jsonl\n python repo_db.py search --input db.jsonl --query \"transformer\"\n python repo_db.py tag --input db.jsonl --ids owner/name --tags impl baseline\n python repo_db.py stats --input db.jsonl\n python repo_db.py export --input db.jsonl --format markdown\n python repo_db.py rank --input db.jsonl --output ranked.jsonl --by composite_score\n\"\"\"\n\nimport argparse\nimport csv\nimport io\nimport json\nimport math\nimport os\nimport re\nimport sys\nfrom datetime import datetime, timezone\n\n\n# -- I/O helpers --------------------------------------------------------------\n\ndef load_jsonl(path: str) -> list[dict]:\n \"\"\"Load records from a JSONL file.\"\"\"\n records = []\n if not os.path.exists(path):\n return records\n with open(path, encoding=\"utf-8\") as f:\n for line in f:\n line = line.strip()\n if line:\n records.append(json.loads(line))\n return records\n\n\ndef save_jsonl(records: list[dict], path: str):\n \"\"\"Save records to a JSONL file.\"\"\"\n os.makedirs(os.path.dirname(path) or \".\", exist_ok=True)\n with open(path, \"w\", encoding=\"utf-8\") as f:\n for rec in records:\n f.write(json.dumps(rec, ensure_ascii=False) + \"\\n\")\n\n\ndef deduplicate(records: list[dict]) -> list[dict]:\n \"\"\"Remove duplicate repos by exact repo_id match. 
Later entries win.\"\"\"\n seen: dict[str, int] = {}\n for i, rec in enumerate(records):\n rid = rec.get(\"repo_id\", \"\")\n if rid:\n seen[rid] = i\n # Preserve order of last occurrence\n indices = sorted(seen.values())\n return [records[i] for i in indices]\n\n\n# -- Scoring ------------------------------------------------------------------\n\ndef _sigmoid(x: float) -> float:\n \"\"\"Sigmoid with steepness 5 centered at 0.5.\"\"\"\n return 1.0 / (1.0 + math.exp(-5.0 * (x - 0.5)))\n\n\ndef _parse_iso(dt_str: str | None) -> datetime | None:\n \"\"\"Parse an ISO-8601 datetime string.\"\"\"\n if not dt_str:\n return None\n # Handle trailing Z and optional fractional seconds\n dt_str = dt_str.replace(\"Z\", \"+00:00\")\n try:\n return datetime.fromisoformat(dt_str)\n except (ValueError, TypeError):\n return None\n\n\ndef compute_activity_score(rec: dict, now: datetime) -> float:\n \"\"\"Compute activity score in [0, 1].\"\"\"\n pushed = _parse_iso(rec.get(\"pushed_at\"))\n if pushed is None:\n return _sigmoid(0.0)\n\n days_since_push = max((now - pushed).days, 0)\n recent_push = 1.0 if days_since_push \u003c 90 else 0.0\n has_recent_commits = 1.0 if days_since_push \u003c 180 else 0.0\n\n stars = max(rec.get(\"stars\", 0), 1)\n open_issues = rec.get(\"open_issues\", 0)\n ratio = min(open_issues / stars, 1.0)\n issues_component = 1.0 - ratio # lower ratio is better\n\n raw = recent_push * 0.4 + has_recent_commits * 0.3 + issues_component * 0.3\n return _sigmoid(raw)\n\n\ndef compute_quality_score(rec: dict) -> float:\n \"\"\"Compute quality score in [0, 1].\"\"\"\n stars = rec.get(\"stars\", 0)\n forks = rec.get(\"forks\", 0)\n\n # Normalize log components: log(x+1) / log(100001) gives ~0-1 for 0-100k\n max_log = math.log(100001)\n star_comp = min(math.log(stars + 1) / max_log, 1.0) * 0.3\n fork_comp = min(math.log(forks + 1) / max_log, 1.0) * 0.2\n\n has_license = 0.15 if rec.get(\"license\") else 0.0\n has_readme = 0.15 if rec.get(\"readme_excerpt\") else 0.0\n 
not_archived = 0.2 if not rec.get(\"archived\", False) else 0.0\n\n raw = star_comp + fork_comp + has_license + has_readme + not_archived\n return min(raw, 1.0)\n\n\ndef score_record(rec: dict, now: datetime) -> dict:\n \"\"\"Compute all scores for a single record and return updated copy.\"\"\"\n rec = dict(rec)\n rec[\"activity_score\"] = round(compute_activity_score(rec, now), 4)\n rec[\"quality_score\"] = round(compute_quality_score(rec), 4)\n\n relevance = rec.get(\"relevance_score\", 0.0) or 0.0\n quality = rec[\"quality_score\"]\n activity = rec[\"activity_score\"]\n rec[\"composite_score\"] = round(\n relevance * 0.4 + quality * 0.35 + activity * 0.25, 4\n )\n return rec\n\n\n# -- Subcommand implementations -----------------------------------------------\n\ndef cmd_merge(args):\n \"\"\"Merge multiple JSONL files with deduplication by repo_id.\"\"\"\n all_records = []\n for path in args.inputs:\n records = load_jsonl(path)\n print(f\"Loaded {len(records)} from {path}\", file=sys.stderr)\n all_records.extend(records)\n\n merged = deduplicate(all_records)\n save_jsonl(merged, args.output)\n print(f\"Merged: {len(all_records)} -> {len(merged)} unique repos -> {args.output}\",\n file=sys.stderr)\n\n\ndef cmd_filter(args):\n \"\"\"Filter repos by various criteria.\"\"\"\n records = load_jsonl(args.input)\n kept = []\n\n for rec in records:\n if args.min_stars is not None and rec.get(\"stars\", 0) \u003c args.min_stars:\n continue\n if args.min_score is not None and rec.get(\"composite_score\", 0.0) \u003c args.min_score:\n continue\n if args.language and rec.get(\"language\", \"\").lower() != args.language.lower():\n continue\n if args.not_archived and rec.get(\"archived\", False):\n continue\n kept.append(rec)\n\n # Sort by composite_score descending\n kept.sort(key=lambda r: -(r.get(\"composite_score\", 0.0) or 0.0))\n\n if args.max_repos and args.max_repos > 0 and len(kept) > args.max_repos:\n kept = kept[:args.max_repos]\n\n save_jsonl(kept, args.output)\n 
print(f\"Filtered: {len(records)} -> {len(kept)} repos -> {args.output}\",\n file=sys.stderr)\n\n\ndef cmd_score(args):\n \"\"\"Compute composite scores for all repos.\"\"\"\n records = load_jsonl(args.input)\n now = datetime.now(timezone.utc)\n\n scored = [score_record(rec, now) for rec in records]\n save_jsonl(scored, args.output)\n print(f\"Scored {len(scored)} repos -> {args.output}\", file=sys.stderr)\n\n\ndef cmd_search(args):\n \"\"\"Search repos by keyword match in a field.\"\"\"\n records = load_jsonl(args.input)\n query_lower = args.query.lower()\n results = []\n\n for rec in records:\n value = rec.get(args.field, \"\")\n if isinstance(value, list):\n value = \" \".join(str(v) for v in value)\n if query_lower in str(value).lower():\n results.append(rec)\n\n for rec in results:\n print(json.dumps(rec, ensure_ascii=False))\n print(f\"Found {len(results)} matches\", file=sys.stderr)\n\n\ndef cmd_tag(args):\n \"\"\"Add tags to specific repos. Supports 'relevance:0.85' format.\"\"\"\n records = load_jsonl(args.input)\n id_set = set(args.ids)\n tagged = 0\n\n # Separate relevance assignments from plain tags\n plain_tags = []\n relevance_val = None\n for t in args.tags:\n if t.startswith(\"relevance:\"):\n try:\n relevance_val = float(t.split(\":\", 1)[1])\n except ValueError:\n plain_tags.append(t)\n else:\n plain_tags.append(t)\n\n for rec in records:\n rid = rec.get(\"repo_id\", \"\")\n if rid in id_set:\n if plain_tags:\n existing = rec.get(\"tags\", [])\n rec[\"tags\"] = sorted(set(existing + plain_tags))\n if relevance_val is not None:\n rec[\"relevance_score\"] = relevance_val\n tagged += 1\n\n save_jsonl(records, args.input)\n msg_parts = []\n if plain_tags:\n msg_parts.append(f\"tags={plain_tags}\")\n if relevance_val is not None:\n msg_parts.append(f\"relevance_score={relevance_val}\")\n print(f\"Tagged {tagged} repos with {', '.join(msg_parts)}\", file=sys.stderr)\n\n\ndef cmd_stats(args):\n \"\"\"Compute and print JSON summary statistics.\"\"\"\n 
records = load_jsonl(args.input)\n if not records:\n print(json.dumps({\"total\": 0}, indent=2))\n return\n\n languages: dict[str, int] = {}\n sources: dict[str, int] = {}\n tags_dist: dict[str, int] = {}\n total_stars = 0\n total_forks = 0\n archived_count = 0\n with_readme = 0\n with_papers = 0\n scored_count = 0\n score_sum = 0.0\n\n for rec in records:\n lang = rec.get(\"language\") or \"Unknown\"\n languages[lang] = languages.get(lang, 0) + 1\n\n src = rec.get(\"source\", \"unknown\")\n sources[src] = sources.get(src, 0) + 1\n\n total_stars += rec.get(\"stars\", 0)\n total_forks += rec.get(\"forks\", 0)\n\n if rec.get(\"archived\"):\n archived_count += 1\n if rec.get(\"readme_excerpt\"):\n with_readme += 1\n if rec.get(\"paper_ids\"):\n with_papers += 1\n\n cs = rec.get(\"composite_score\", 0.0) or 0.0\n if cs > 0:\n scored_count += 1\n score_sum += cs\n\n for tag in rec.get(\"tags\", []):\n tags_dist[tag] = tags_dist.get(tag, 0) + 1\n\n stats = {\n \"total\": len(records),\n \"archived\": archived_count,\n \"with_readme\": with_readme,\n \"with_papers\": with_papers,\n \"total_stars\": total_stars,\n \"total_forks\": total_forks,\n \"avg_stars\": round(total_stars / len(records), 1),\n \"avg_composite_score\": round(score_sum / scored_count, 4) if scored_count else 0.0,\n \"languages\": dict(sorted(languages.items(), key=lambda x: -x[1])[:15]),\n \"sources\": sources,\n \"tags\": tags_dist,\n }\n print(json.dumps(stats, indent=2))\n\n\ndef cmd_export(args):\n \"\"\"Export database in csv, jsonl, or markdown format.\"\"\"\n records = load_jsonl(args.input)\n if not records:\n print(\"No records to export.\", file=sys.stderr)\n return\n\n fmt = args.format\n\n if fmt == \"csv\":\n output = _export_csv(records)\n elif fmt == \"jsonl\":\n output = \"\\n\".join(json.dumps(r, ensure_ascii=False) for r in records) + \"\\n\"\n elif fmt == \"markdown\":\n output = _export_markdown(records)\n else:\n print(f\"Unknown format: {fmt}\", file=sys.stderr)\n return\n\n if 
args.output:\n with open(args.output, \"w\", encoding=\"utf-8\") as f:\n f.write(output)\n print(f\"Exported {len(records)} repos to {args.output}\", file=sys.stderr)\n else:\n print(output, end=\"\")\n\n\ndef _export_csv(records: list[dict]) -> str:\n fields = [\n \"repo_id\", \"name\", \"owner\", \"stars\", \"forks\", \"language\", \"license\",\n \"composite_score\", \"quality_score\", \"activity_score\", \"relevance_score\",\n \"topics\", \"tags\", \"archived\", \"source\",\n ]\n buf = io.StringIO()\n writer = csv.DictWriter(buf, fieldnames=fields, extrasaction=\"ignore\")\n writer.writeheader()\n for rec in records:\n row = dict(rec)\n if isinstance(row.get(\"topics\"), list):\n row[\"topics\"] = \"; \".join(row[\"topics\"])\n if isinstance(row.get(\"tags\"), list):\n row[\"tags\"] = \"; \".join(row[\"tags\"])\n writer.writerow(row)\n return buf.getvalue()\n\n\ndef _export_markdown(records: list[dict]) -> str:\n lines = [\"| Repo | Stars | Language | Score | Description |\",\n \"|------|------:|----------|------:|-------------|\"]\n for rec in records:\n rid = rec.get(\"repo_id\", \"\")\n url = rec.get(\"url\", f\"https://github.com/{rid}\")\n stars = rec.get(\"stars\", 0)\n lang = rec.get(\"language\", \"\")\n score = rec.get(\"composite_score\", 0.0) or 0.0\n desc = (rec.get(\"description\") or \"\")[:80]\n desc = desc.replace(\"|\", \"\\\\|\")\n lines.append(f\"| [{rid}]({url}) | {stars} | {lang} | {score:.3f} | {desc} |\")\n return \"\\n\".join(lines) + \"\\n\"\n\n\ndef cmd_rank(args):\n \"\"\"Rank repos by a given field descending.\"\"\"\n records = load_jsonl(args.input)\n # Map CLI choice to actual record field name\n field_map = {\n \"composite_score\": \"composite_score\",\n \"stars\": \"stars\",\n \"updated\": \"updated_at\",\n }\n field = field_map.get(args.by, args.by)\n\n def sort_key(r):\n val = r.get(field, 0)\n if val is None:\n return \"\"\n # For date strings, lexicographic sort works with ISO format\n if isinstance(val, str):\n return val\n 
return val\n\n records.sort(key=sort_key, reverse=True)\n\n for i, rec in enumerate(records):\n rec[\"rank\"] = i + 1\n\n save_jsonl(records, args.output)\n print(f\"Ranked {len(records)} repos by {field} -> {args.output}\", file=sys.stderr)\n\n\n# -- CLI -----------------------------------------------------------------------\n\ndef main():\n parser = argparse.ArgumentParser(\n description=\"JSONL GitHub repo database management tool\"\n )\n sub = parser.add_subparsers(dest=\"command\", required=True)\n\n # merge\n p = sub.add_parser(\"merge\", help=\"Merge multiple JSONL files with dedup by repo_id\")\n p.add_argument(\"--inputs\", nargs=\"+\", required=True, help=\"Input JSONL files\")\n p.add_argument(\"--output\", required=True, help=\"Output JSONL file\")\n\n # filter\n p = sub.add_parser(\"filter\", help=\"Filter repos by stars, score, language, etc.\")\n p.add_argument(\"--input\", required=True, help=\"Input JSONL file\")\n p.add_argument(\"--output\", required=True, help=\"Output JSONL file\")\n p.add_argument(\"--min-stars\", type=int, default=None, help=\"Minimum star count\")\n p.add_argument(\"--min-score\", type=float, default=None, help=\"Minimum composite score\")\n p.add_argument(\"--max-repos\", type=int, default=None, help=\"Max repos to keep (0=unlimited)\")\n p.add_argument(\"--language\", default=None, help=\"Filter by primary language\")\n p.add_argument(\"--not-archived\", action=\"store_true\", help=\"Exclude archived repos\")\n\n # score\n p = sub.add_parser(\"score\", help=\"Compute composite scores for all repos\")\n p.add_argument(\"--input\", required=True, help=\"Input JSONL file\")\n p.add_argument(\"--output\", required=True, help=\"Output scored JSONL file\")\n\n # search\n p = sub.add_parser(\"search\", help=\"Search repos by keyword in a field\")\n p.add_argument(\"--input\", required=True, help=\"Input JSONL file\")\n p.add_argument(\"--query\", required=True, help=\"Search query string\")\n p.add_argument(\"--field\", 
default=\"description\",\n choices=[\"description\", \"topics\", \"name\"],\n help=\"Field to search (default: description)\")\n\n # tag\n p = sub.add_parser(\"tag\", help=\"Add tags to repos; supports 'relevance:0.85' format\")\n p.add_argument(\"--input\", required=True, help=\"Input JSONL file (modified in-place)\")\n p.add_argument(\"--ids\", nargs=\"+\", required=True, help=\"Repo IDs (owner/name)\")\n p.add_argument(\"--tags\", nargs=\"+\", required=True,\n help=\"Tags to add; use 'relevance:0.85' to set relevance_score\")\n\n # stats\n p = sub.add_parser(\"stats\", help=\"Print JSON summary statistics\")\n p.add_argument(\"--input\", required=True, help=\"Input JSONL file\")\n\n # export\n p = sub.add_parser(\"export\", help=\"Export database to csv, jsonl, or markdown\")\n p.add_argument(\"--input\", required=True, help=\"Input JSONL file\")\n p.add_argument(\"--format\", choices=[\"csv\", \"jsonl\", \"markdown\"], default=\"jsonl\",\n help=\"Output format (default: jsonl)\")\n p.add_argument(\"--output\", \"-o\", default=None, help=\"Output file (default: stdout)\")\n\n # rank\n p = sub.add_parser(\"rank\", help=\"Rank repos by a field descending\")\n p.add_argument(\"--input\", required=True, help=\"Input JSONL file\")\n p.add_argument(\"--output\", required=True, help=\"Output ranked JSONL file\")\n p.add_argument(\"--by\", default=\"composite_score\",\n choices=[\"composite_score\", \"stars\", \"updated\"],\n help=\"Field to rank by (default: composite_score)\")\n\n args = parser.parse_args()\n\n dispatch = {\n \"merge\": cmd_merge,\n \"filter\": cmd_filter,\n \"score\": cmd_score,\n \"search\": cmd_search,\n \"tag\": cmd_tag,\n \"stats\": cmd_stats,\n \"export\": cmd_export,\n \"rank\": cmd_rank,\n }\n dispatch[args.command](args)\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; 
charset=utf-8","language":"python","size":16253,"content_sha256":"af961e458a77c980d15d93932641880840779d9cc12d98c2dbf8a4a07f48f374"},{"filename":"scripts/repo_metadata.py","content":"#!/usr/bin/env python3\n\"\"\"Fetch detailed metadata for GitHub repos via ``gh api``.\n\nRetrieves main repo info and language breakdown, then maps the response\nto the repo_db schema. Can enrich existing JSONL records or create new\nones from scratch.\n\"\"\"\n\nimport argparse\nimport json\nimport subprocess\nimport sys\nimport time\n\n\n# ---------------------------------------------------------------------------\n# GitHub helpers\n# ---------------------------------------------------------------------------\n\ndef gh_api(endpoint: str) -> dict | None:\n \"\"\"Call ``gh api`` and return parsed JSON, or *None* on failure.\"\"\"\n try:\n proc = subprocess.run(\n [\"gh\", \"api\", endpoint],\n capture_output=True, text=True, timeout=30,\n )\n if proc.returncode == 0:\n return json.loads(proc.stdout)\n stderr = proc.stderr.strip()\n if \"404\" in stderr or \"Not Found\" in stderr:\n print(f\" 404 — repo not found\", file=sys.stderr)\n elif \"rate limit\" in stderr.lower() or \"403\" in stderr:\n print(f\" Rate limited. Consider increasing --delay.\", file=sys.stderr)\n else:\n print(f\" gh api error ({proc.returncode}): {stderr[:120]}\", file=sys.stderr)\n except FileNotFoundError:\n print(\"Error: 'gh' CLI not found. 
Please install GitHub CLI.\", file=sys.stderr)\n sys.exit(1)\n except (subprocess.TimeoutExpired, json.JSONDecodeError) as exc:\n print(f\" gh api exception: {exc}\", file=sys.stderr)\n return None\n\n\n# ---------------------------------------------------------------------------\n# Language breakdown\n# ---------------------------------------------------------------------------\n\ndef compute_languages_pct(lang_data: dict) -> dict[str, float]:\n \"\"\"Convert raw byte counts to percentages (0–100).\"\"\"\n total = sum(lang_data.values())\n if total == 0:\n return {}\n return {lang: round(count / total * 100, 2) for lang, count in lang_data.items()}\n\n\n# ---------------------------------------------------------------------------\n# Record building\n# ---------------------------------------------------------------------------\n\ndef build_record(repo_id: str, meta: dict, lang_data: dict) -> dict:\n \"\"\"Build a repo_db-schema record from GitHub API responses.\"\"\"\n license_info = meta.get(\"license\") or {}\n languages_pct = compute_languages_pct(lang_data)\n\n return {\n \"repo_id\": repo_id,\n \"github_url\": meta.get(\"html_url\", f\"https://github.com/{repo_id}\"),\n \"description\": meta.get(\"description\", \"\"),\n \"homepage\": meta.get(\"homepage\", \"\"),\n \"stars\": meta.get(\"stargazers_count\", 0),\n \"forks_count\": meta.get(\"forks_count\", 0),\n \"open_issues_count\": meta.get(\"open_issues_count\", 0),\n \"watchers_count\": meta.get(\"watchers_count\", 0),\n \"default_branch\": meta.get(\"default_branch\", \"\"),\n \"license\": license_info.get(\"spdx_id\") or license_info.get(\"name\", \"\"),\n \"language\": meta.get(\"language\", \"\"),\n \"languages_pct\": languages_pct,\n \"topics\": meta.get(\"topics\", []),\n \"is_fork\": meta.get(\"fork\", False),\n \"is_archived\": meta.get(\"archived\", False),\n \"size_kb\": meta.get(\"size\", 0),\n \"created_at\": meta.get(\"created_at\", \"\"),\n \"updated_at\": meta.get(\"updated_at\", \"\"),\n 
\"pushed_at\": meta.get(\"pushed_at\", \"\"),\n \"owner_type\": (meta.get(\"owner\") or {}).get(\"type\", \"\"),\n \"network_count\": meta.get(\"network_count\", 0),\n \"subscribers_count\": meta.get(\"subscribers_count\", 0),\n }\n\n\ndef enrich_record(existing: dict, fresh: dict) -> dict:\n \"\"\"Merge *fresh* metadata into an *existing* record.\n\n Fresh values overwrite existing ones, except we preserve fields\n that exist only in the original (e.g. ``source``, ``paper_ids``).\n \"\"\"\n merged = dict(existing)\n merged.update(fresh)\n # Preserve provenance fields from the original record.\n for key in (\"source\", \"paper_ids\", \"paper_titles\", \"is_official\", \"framework\"):\n if key in existing:\n merged[key] = existing[key]\n return merged\n\n\n# ---------------------------------------------------------------------------\n# I/O helpers\n# ---------------------------------------------------------------------------\n\ndef load_jsonl(path: str) -> list[dict]:\n \"\"\"Load a JSONL file into a list of dicts.\"\"\"\n records: list[dict] = []\n with open(path, encoding=\"utf-8\") as f:\n for line in f:\n line = line.strip()\n if not line:\n continue\n try:\n records.append(json.loads(line))\n except json.JSONDecodeError:\n continue\n return records\n\n\ndef write_jsonl(records: list[dict], path: str) -> None:\n \"\"\"Write a list of dicts as JSONL.\"\"\"\n with open(path, \"w\", encoding=\"utf-8\") as fout:\n for rec in records:\n fout.write(json.dumps(rec, ensure_ascii=False) + \"\\n\")\n\n\n# ---------------------------------------------------------------------------\n# Main\n# ---------------------------------------------------------------------------\n\ndef main() -> None:\n parser = argparse.ArgumentParser(\n description=\"Fetch detailed GitHub repo metadata via gh api.\",\n )\n source = parser.add_mutually_exclusive_group(required=True)\n source.add_argument(\"--repos\", nargs=\"+\", metavar=\"OWNER/NAME\",\n help=\"One or more repos as owner/name\")\n 
source.add_argument(\"--input\", metavar=\"FILE\",\n help=\"JSONL file with existing repo records to enrich\")\n parser.add_argument(\"--output\", required=True, help=\"Output JSONL file path\")\n parser.add_argument(\"--delay\", type=float, default=0.5,\n help=\"Seconds between API requests per repo (default: 0.5)\")\n\n args = parser.parse_args()\n\n # Determine which repos to process.\n existing_records: dict[str, dict] = {} # keyed by repo_id\n\n if args.input:\n for rec in load_jsonl(args.input):\n rid = rec.get(\"repo_id\")\n if rid:\n existing_records[rid] = rec\n repo_ids = list(existing_records.keys())\n else:\n repo_ids = args.repos or []\n\n if not repo_ids:\n print(\"No repos to process.\", file=sys.stderr)\n sys.exit(0)\n\n total = len(repo_ids)\n results: list[dict] = []\n\n for idx, repo_id in enumerate(repo_ids, 1):\n print(f\"Fetching metadata: {idx}/{total} — {repo_id}\", file=sys.stderr)\n\n meta = gh_api(f\"/repos/{repo_id}\")\n if meta is None:\n print(f\" Skipping {repo_id} (metadata unavailable)\", file=sys.stderr)\n # Keep the original record if enriching.\n if repo_id in existing_records:\n results.append(existing_records[repo_id])\n continue\n\n time.sleep(args.delay)\n\n lang_data = gh_api(f\"/repos/{repo_id}/languages\") or {}\n\n fresh = build_record(repo_id, meta, lang_data)\n\n if repo_id in existing_records:\n record = enrich_record(existing_records[repo_id], fresh)\n else:\n record = fresh\n\n results.append(record)\n\n if idx \u003c total:\n time.sleep(args.delay)\n\n write_jsonl(results, args.output)\n print(f\"Done. 
{len(results)}/{total} record(s) written to {args.output}\",\n file=sys.stderr)\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":7398,"content_sha256":"2b202904e679e10a89289a01f756e75d87f8b2f9db944a9bf2e425b87fb17649"},{"filename":"scripts/repo_readme_fetch.py","content":"#!/usr/bin/env python3\n\"\"\"Fetch README content for GitHub repos without cloning.\n\nUses ``gh api`` to retrieve the README of one or more repositories,\ndecodes the base64 payload, and writes the results as JSONL.\n\"\"\"\n\nimport argparse\nimport base64\nimport json\nimport subprocess\nimport sys\nimport time\n\n\ndef gh_api(endpoint: str) -> dict | None:\n \"\"\"Call ``gh api`` and return parsed JSON, or *None* on failure.\"\"\"\n try:\n proc = subprocess.run(\n [\"gh\", \"api\", endpoint],\n capture_output=True, text=True, timeout=30,\n )\n if proc.returncode == 0:\n return json.loads(proc.stdout)\n # Distinguish 404 (no README) from other errors.\n if \"404\" in proc.stderr or \"Not Found\" in proc.stderr:\n return None\n print(f\" gh api error ({proc.returncode}): {proc.stderr.strip()[:120]}\",\n file=sys.stderr)\n except FileNotFoundError:\n print(\" Error: 'gh' CLI not found. 
Please install GitHub CLI.\", file=sys.stderr)\n sys.exit(1)\n except (subprocess.TimeoutExpired, json.JSONDecodeError) as exc:\n print(f\" gh api exception: {exc}\", file=sys.stderr)\n return None\n\n\ndef fetch_readme(repo_id: str, max_chars: int) -> dict | None:\n \"\"\"Return a dict with repo_id, readme_text, readme_length or *None*.\"\"\"\n data = gh_api(f\"/repos/{repo_id}/readme\")\n if data is None:\n return None\n\n content_b64 = data.get(\"content\", \"\")\n try:\n raw = base64.b64decode(content_b64).decode(\"utf-8\", errors=\"replace\")\n except Exception:\n raw = \"\"\n\n truncated = raw[:max_chars]\n return {\n \"repo_id\": repo_id,\n \"readme_text\": truncated,\n \"readme_length\": len(raw),\n }\n\n\ndef load_repo_ids_from_jsonl(path: str) -> list[str]:\n \"\"\"Read repo_id values from a JSONL file.\"\"\"\n ids: list[str] = []\n with open(path, encoding=\"utf-8\") as f:\n for line in f:\n line = line.strip()\n if not line:\n continue\n try:\n obj = json.loads(line)\n rid = obj.get(\"repo_id\")\n if rid:\n ids.append(rid)\n except json.JSONDecodeError:\n continue\n return ids\n\n\ndef main() -> None:\n parser = argparse.ArgumentParser(\n description=\"Fetch README content for GitHub repos without cloning.\",\n )\n source = parser.add_mutually_exclusive_group(required=True)\n source.add_argument(\"--repos\", nargs=\"+\", metavar=\"OWNER/NAME\",\n help=\"One or more repos as owner/name\")\n source.add_argument(\"--input\", metavar=\"FILE\",\n help=\"JSONL file with repo_id fields\")\n parser.add_argument(\"--output\", required=True, help=\"Output JSONL file path\")\n parser.add_argument(\"--max-chars\", type=int, default=5000,\n help=\"Max characters to keep from README (default: 5000)\")\n\n args = parser.parse_args()\n\n repo_ids: list[str] = args.repos if args.repos else load_repo_ids_from_jsonl(args.input)\n\n if not repo_ids:\n print(\"No repos to process.\", file=sys.stderr)\n sys.exit(0)\n\n total = len(repo_ids)\n results: list[dict] = []\n\n for 
idx, repo_id in enumerate(repo_ids, 1):\n print(f\"Fetching README: {idx}/{total} — {repo_id}\", file=sys.stderr)\n record = fetch_readme(repo_id, max_chars=args.max_chars)\n if record:\n results.append(record)\n else:\n print(f\" No README found for {repo_id}\", file=sys.stderr)\n if idx \u003c total:\n time.sleep(0.5)\n\n with open(args.output, \"w\", encoding=\"utf-8\") as fout:\n for rec in results:\n fout.write(json.dumps(rec, ensure_ascii=False) + \"\\n\")\n\n print(f\"Done. {len(results)}/{total} README(s) written to {args.output}\",\n file=sys.stderr)\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":3886,"content_sha256":"99cb2db4987c3b31c19b24fb6e68588cd724dbb9e7ab7523a47806572742da2f"},{"filename":"scripts/search_github_code.py","content":"#!/usr/bin/env python3\n\"\"\"Search GitHub code for specific function/class implementations via `gh api`.\n\nSearches code using GitHub's code search API, groups results by repo, and\noutputs both code-level matches (stderr) and repo-level JSONL (--output).\n\"\"\"\n\nimport argparse\nimport json\nimport math\nimport shutil\nimport subprocess\nimport sys\nimport time\nfrom pathlib import Path\n\nRESULTS_PER_PAGE = 30\nDEFAULT_MAX_RESULTS = 50\n\n\ndef check_gh_installed():\n \"\"\"Verify gh CLI is available.\"\"\"\n if not shutil.which(\"gh\"):\n print(\"[error] gh CLI not found. 
Install it: https://cli.github.com/\", file=sys.stderr)\n sys.exit(1)\n\n\ndef gh_api(endpoint: str, params: dict | None = None) -> dict:\n \"\"\"Call gh api and return parsed JSON.\"\"\"\n if params:\n from urllib.parse import urlencode\n endpoint = f\"{endpoint}?{urlencode(params)}\"\n cmd = [\"gh\", \"api\", endpoint]\n try:\n result = subprocess.run(\n cmd,\n capture_output=True,\n text=True,\n timeout=30,\n )\n except FileNotFoundError:\n print(\"[error] gh CLI not found\", file=sys.stderr)\n sys.exit(1)\n except subprocess.TimeoutExpired:\n print(\"[error] gh api call timed out\", file=sys.stderr)\n sys.exit(1)\n\n if result.returncode != 0:\n stderr = result.stderr.strip()\n if \"rate limit\" in stderr.lower() or \"403\" in stderr:\n print(f\"[error] GitHub API rate limit hit: {stderr}\", file=sys.stderr)\n sys.exit(1)\n print(f\"[error] gh api failed (exit {result.returncode}): {stderr}\", file=sys.stderr)\n sys.exit(1)\n\n try:\n return json.loads(result.stdout)\n except json.JSONDecodeError as e:\n print(f\"[error] invalid JSON from gh api: {e}\", file=sys.stderr)\n sys.exit(1)\n\n\ndef build_code_query(query: str, language: str | None, filename: str | None) -> str:\n \"\"\"Build GitHub code search query with qualifiers.\"\"\"\n parts = [query]\n if language:\n parts.append(f\"language:{language}\")\n if filename:\n parts.append(f\"filename:{filename}\")\n return \" \".join(parts)\n\n\ndef search_code(query: str, max_results: int) -> list[dict]:\n \"\"\"Search GitHub code with pagination.\"\"\"\n total_pages = math.ceil(max_results / RESULTS_PER_PAGE)\n all_items = []\n\n for page in range(1, total_pages + 1):\n print(f\"[info] fetching page {page}/{total_pages} ...\", file=sys.stderr)\n\n params = {\n \"q\": query,\n \"per_page\": str(min(RESULTS_PER_PAGE, max_results - len(all_items))),\n \"page\": str(page),\n }\n\n data = gh_api(\"/search/code\", params)\n\n total_count = data.get(\"total_count\", 0)\n if page == 1:\n print(f\"[info] total matching 
files: {total_count}\", file=sys.stderr)\n\n items = data.get(\"items\", [])\n if not items:\n print(\"[info] no more results\", file=sys.stderr)\n break\n\n all_items.extend(items)\n\n if len(all_items) >= max_results:\n all_items = all_items[:max_results]\n break\n\n if data.get(\"incomplete_results\", False):\n print(\"[warn] GitHub returned incomplete results\", file=sys.stderr)\n\n # Code search is heavily rate limited: 10 req/min\n if page \u003c total_pages:\n time.sleep(6)\n\n return all_items\n\n\ndef extract_code_match(item: dict) -> dict:\n \"\"\"Extract a code match record from a search result item.\"\"\"\n repo = item.get(\"repository\", {})\n owner_obj = repo.get(\"owner\", {})\n full_name = repo.get(\"full_name\", \"\")\n\n # text_matches may be present if Accept header includes text-match\n text_fragments = []\n for tm in item.get(\"text_matches\", []):\n fragment = tm.get(\"fragment\", \"\")\n if fragment:\n text_fragments.append(fragment)\n\n return {\n \"repo_id\": full_name,\n \"file_path\": item.get(\"path\", \"\"),\n \"file_url\": item.get(\"html_url\", \"\"),\n \"file_name\": item.get(\"name\", \"\"),\n \"matched_text\": \"\\n---\\n\".join(text_fragments) if text_fragments else \"\",\n \"context\": {\n \"sha\": item.get(\"sha\", \"\"),\n \"score\": item.get(\"score\", 0),\n },\n }\n\n\ndef build_repo_record(repo_id: str, repo_data: dict) -> dict:\n \"\"\"Build a minimal repo_db-compatible record from code search results.\"\"\"\n owner = repo_id.split(\"/\")[0] if \"/\" in repo_id else \"\"\n name = repo_id.split(\"/\")[1] if \"/\" in repo_id else repo_id\n\n return {\n \"repo_id\": repo_id,\n \"url\": f\"https://github.com/{repo_id}\",\n \"name\": name,\n \"owner\": owner,\n \"description\": repo_data.get(\"description\", \"\"),\n \"stars\": 0,\n \"forks\": 0,\n \"language\": \"\",\n \"license\": \"\",\n \"topics\": [],\n \"created_at\": \"\",\n \"updated_at\": \"\",\n \"pushed_at\": \"\",\n \"open_issues\": 0,\n \"default_branch\": 
\"main\",\n \"archived\": False,\n \"source\": \"code_search\",\n \"code_matches\": repo_data.get(\"files\", []),\n \"languages_pct\": {},\n \"readme_excerpt\": \"\",\n \"paper_ids\": [],\n \"paper_titles\": [],\n \"relevance_score\": 0.0,\n \"quality_score\": 0.0,\n \"activity_score\": 0.0,\n \"composite_score\": 0.0,\n \"tags\": [],\n \"analyzed\": False,\n \"local_path\": None,\n }\n\n\ndef main():\n parser = argparse.ArgumentParser(\n description=\"Search GitHub code for function/class implementations via gh API.\"\n )\n parser.add_argument(\"--query\", required=True, help=\"Code search query string\")\n parser.add_argument(\"--language\", default=None, help=\"Filter by programming language\")\n parser.add_argument(\"--filename\", default=None, help=\"Filter by filename pattern\")\n parser.add_argument(\"--max-results\", type=int, default=DEFAULT_MAX_RESULTS, help=\"Max code results to fetch (default: 50)\")\n parser.add_argument(\"--output\", required=True, help=\"Output JSONL file for repo-level records\")\n args = parser.parse_args()\n\n check_gh_installed()\n\n full_query = build_code_query(args.query, args.language, args.filename)\n print(f\"[info] code search query: {full_query}\", file=sys.stderr)\n print(f\"[info] max results: {args.max_results}\", file=sys.stderr)\n\n items = search_code(full_query, args.max_results)\n\n # Process matches and group by repo\n code_matches = []\n repos: dict[str, dict] = {} # repo_id -> {description, file_count, files}\n\n for item in items:\n match = extract_code_match(item)\n code_matches.append(match)\n\n rid = match[\"repo_id\"]\n if rid not in repos:\n repo_obj = item.get(\"repository\", {})\n repos[rid] = {\n \"description\": repo_obj.get(\"description\") or \"\",\n \"file_count\": 0,\n \"files\": [],\n }\n repos[rid][\"file_count\"] += 1\n repos[rid][\"files\"].append(match[\"file_path\"])\n\n # Print code matches to stderr, grouped by repo\n print(f\"\\n{'=' * 60}\", file=sys.stderr)\n print(f\"Code matches: 
{len(code_matches)} files in {len(repos)} repos\", file=sys.stderr)\n print(f\"{'=' * 60}\", file=sys.stderr)\n\n for rid in sorted(repos.keys()):\n info = repos[rid]\n print(f\"\\n {rid} ({info['file_count']} files)\", file=sys.stderr)\n if info[\"description\"]:\n print(f\" {info['description'][:100]}\", file=sys.stderr)\n for fpath in info[\"files\"][:5]:\n print(f\" - {fpath}\", file=sys.stderr)\n if len(info[\"files\"]) > 5:\n print(f\" ... and {len(info['files']) - 5} more\", file=sys.stderr)\n\n # Write repo-level JSONL to --output (for feeding into repo_db merge)\n output_path = Path(args.output)\n output_path.parent.mkdir(parents=True, exist_ok=True)\n with open(output_path, \"w\", encoding=\"utf-8\") as f:\n for rid, rdata in sorted(repos.items()):\n record = build_repo_record(rid, rdata)\n f.write(json.dumps(record, ensure_ascii=False) + \"\\n\")\n\n print(f\"\\n[info] wrote {len(repos)} repo records to {output_path}\", file=sys.stderr)\n\n # Also write code-level matches alongside (with .code suffix)\n code_output = output_path.with_suffix(\".code.jsonl\")\n with open(code_output, \"w\", encoding=\"utf-8\") as f:\n for match in code_matches:\n f.write(json.dumps(match, ensure_ascii=False) + \"\\n\")\n\n print(f\"[info] wrote {len(code_matches)} code matches to {code_output}\", file=sys.stderr)\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":8507,"content_sha256":"9ebe5bdbf3d72dacc07d376c87c4b4aa2063bfd8e217d8147afece79445a586d"},{"filename":"scripts/search_github.py","content":"#!/usr/bin/env python3\n\"\"\"Search GitHub repositories via `gh api` with multiple query strategies.\n\nBuilds qualified search queries and paginates through results, outputting\nrepo_db-compatible JSONL records.\n\"\"\"\n\nimport argparse\nimport json\nimport math\nimport shutil\nimport subprocess\nimport sys\nimport time\nfrom pathlib import Path\n\nRESULTS_PER_PAGE = 30\nDEFAULT_MAX_RESULTS = 100\n\n\ndef 
check_gh_installed():\n \"\"\"Verify gh CLI is available.\"\"\"\n if not shutil.which(\"gh\"):\n print(\"[error] gh CLI not found. Install it: https://cli.github.com/\", file=sys.stderr)\n sys.exit(1)\n\n\ndef gh_api(endpoint: str, params: dict | None = None) -> dict:\n \"\"\"Call gh api and return parsed JSON.\"\"\"\n if params:\n from urllib.parse import urlencode\n endpoint = f\"{endpoint}?{urlencode(params)}\"\n cmd = [\"gh\", \"api\", endpoint]\n try:\n result = subprocess.run(\n cmd,\n capture_output=True,\n text=True,\n timeout=30,\n )\n except FileNotFoundError:\n print(\"[error] gh CLI not found\", file=sys.stderr)\n sys.exit(1)\n except subprocess.TimeoutExpired:\n print(\"[error] gh api call timed out\", file=sys.stderr)\n sys.exit(1)\n\n if result.returncode != 0:\n stderr = result.stderr.strip()\n # Check for rate limit\n if \"rate limit\" in stderr.lower() or \"403\" in stderr:\n print(f\"[error] GitHub API rate limit hit: {stderr}\", file=sys.stderr)\n sys.exit(1)\n print(f\"[error] gh api failed (exit {result.returncode}): {stderr}\", file=sys.stderr)\n sys.exit(1)\n\n try:\n return json.loads(result.stdout)\n except json.JSONDecodeError as e:\n print(f\"[error] invalid JSON from gh api: {e}\", file=sys.stderr)\n sys.exit(1)\n\n\ndef build_query(query: str, language: str | None, min_stars: int | None, topic: str | None) -> str:\n \"\"\"Build GitHub search query string with qualifiers.\"\"\"\n parts = [query]\n if language:\n parts.append(f\"language:{language}\")\n if min_stars is not None and min_stars > 0:\n parts.append(f\"stars:>={min_stars}\")\n if topic:\n parts.append(f\"topic:{topic}\")\n return \" \".join(parts)\n\n\ndef extract_license(license_obj) -> str:\n \"\"\"Extract license identifier from GitHub API license object.\"\"\"\n if not license_obj or not isinstance(license_obj, dict):\n return \"\"\n return license_obj.get(\"spdx_id\") or license_obj.get(\"name\") or \"\"\n\n\ndef map_repo(item: dict) -> dict:\n \"\"\"Map a GitHub API 
repository item to repo_db schema.\"\"\"\n owner_obj = item.get(\"owner\") or {}\n return {\n \"repo_id\": item.get(\"full_name\", \"\"),\n \"url\": item.get(\"html_url\", \"\"),\n \"name\": item.get(\"name\", \"\"),\n \"owner\": owner_obj.get(\"login\", \"\"),\n \"description\": item.get(\"description\") or \"\",\n \"stars\": item.get(\"stargazers_count\", 0),\n \"forks\": item.get(\"forks_count\", 0),\n \"language\": item.get(\"language\") or \"\",\n \"license\": extract_license(item.get(\"license\")),\n \"topics\": item.get(\"topics\", []),\n \"created_at\": item.get(\"created_at\", \"\"),\n \"updated_at\": item.get(\"updated_at\", \"\"),\n \"pushed_at\": item.get(\"pushed_at\", \"\"),\n \"open_issues\": item.get(\"open_issues_count\", 0),\n \"default_branch\": item.get(\"default_branch\", \"main\"),\n \"archived\": item.get(\"archived\", False),\n \"source\": \"search_github\",\n # Default fields for repo_db compatibility\n \"languages_pct\": {},\n \"readme_excerpt\": \"\",\n \"paper_ids\": [],\n \"paper_titles\": [],\n \"relevance_score\": 0.0,\n \"quality_score\": 0.0,\n \"activity_score\": 0.0,\n \"composite_score\": 0.0,\n \"tags\": [],\n \"analyzed\": False,\n \"local_path\": None,\n }\n\n\ndef check_rate_limit():\n \"\"\"Check GitHub API rate limit and warn if approaching it.\"\"\"\n try:\n result = subprocess.run(\n [\"gh\", \"api\", \"rate_limit\"],\n capture_output=True,\n text=True,\n timeout=10,\n )\n if result.returncode == 0:\n data = json.loads(result.stdout)\n search = data.get(\"resources\", {}).get(\"search\", {})\n remaining = search.get(\"remaining\", -1)\n limit = search.get(\"limit\", -1)\n reset_ts = search.get(\"reset\", 0)\n\n if remaining >= 0:\n if remaining \u003c= 3:\n reset_time = time.strftime(\"%H:%M:%S\", time.localtime(reset_ts))\n print(\n f\"[warn] GitHub search API rate limit: {remaining}/{limit} remaining, \"\n f\"resets at {reset_time}\",\n file=sys.stderr,\n )\n else:\n print(f\"[info] rate limit: {remaining}/{limit} 
search requests remaining\", file=sys.stderr)\n except (subprocess.TimeoutExpired, json.JSONDecodeError, OSError):\n pass # Non-critical, skip\n\n\ndef search_repos(\n query: str,\n sort: str,\n max_results: int,\n) -> list[dict]:\n \"\"\"Search GitHub repositories with pagination.\"\"\"\n total_pages = math.ceil(max_results / RESULTS_PER_PAGE)\n all_repos = []\n seen_ids = set()\n\n for page in range(1, total_pages + 1):\n print(f\"[info] fetching page {page}/{total_pages} ...\", file=sys.stderr)\n\n params = {\n \"q\": query,\n \"sort\": sort,\n \"order\": \"desc\",\n \"per_page\": str(min(RESULTS_PER_PAGE, max_results - len(all_repos))),\n \"page\": str(page),\n }\n\n data = gh_api(\"/search/repositories\", params)\n\n total_count = data.get(\"total_count\", 0)\n if page == 1:\n print(f\"[info] total matching repos: {total_count}\", file=sys.stderr)\n\n items = data.get(\"items\", [])\n if not items:\n print(\"[info] no more results\", file=sys.stderr)\n break\n\n for item in items:\n repo = map_repo(item)\n if repo[\"repo_id\"] not in seen_ids:\n seen_ids.add(repo[\"repo_id\"])\n all_repos.append(repo)\n\n if len(all_repos) >= max_results:\n break\n\n if len(all_repos) >= max_results:\n break\n\n # Incomplete results means GitHub truncated\n if data.get(\"incomplete_results\", False):\n print(\"[warn] GitHub returned incomplete results (query too broad?)\", file=sys.stderr)\n\n # Respect rate limit: sleep 2s between pages for search API\n if page \u003c total_pages:\n time.sleep(2)\n\n return all_repos\n\n\ndef main():\n parser = argparse.ArgumentParser(\n description=\"Search GitHub repositories via gh API with flexible query strategies.\"\n )\n parser.add_argument(\"--query\", required=True, help=\"Search query string\")\n parser.add_argument(\"--language\", default=None, help=\"Filter by programming language\")\n parser.add_argument(\"--min-stars\", type=int, default=None, help=\"Minimum star count\")\n parser.add_argument(\n \"--sort\",\n 
choices=[\"stars\", \"updated\", \"best-match\"],\n default=\"best-match\",\n help=\"Sort order (default: best-match)\",\n )\n parser.add_argument(\"--max-results\", type=int, default=DEFAULT_MAX_RESULTS, help=\"Max results to fetch (default: 100)\")\n parser.add_argument(\"--topic\", default=None, help=\"Filter by GitHub topic\")\n parser.add_argument(\"--output\", required=True, help=\"Output JSONL file path\")\n args = parser.parse_args()\n\n check_gh_installed()\n\n # Build the qualified query\n full_query = build_query(args.query, args.language, args.min_stars, args.topic)\n print(f\"[info] search query: {full_query}\", file=sys.stderr)\n print(f\"[info] sort: {args.sort}, max: {args.max_results}\", file=sys.stderr)\n\n # Check rate limit before starting\n check_rate_limit()\n\n # Handle \"best-match\" -> empty sort for GitHub API (relevance is default)\n sort_param = \"\" if args.sort == \"best-match\" else args.sort\n\n repos = search_repos(full_query, sort_param, args.max_results)\n\n # Write output\n output_path = Path(args.output)\n output_path.parent.mkdir(parents=True, exist_ok=True)\n with open(output_path, \"w\", encoding=\"utf-8\") as f:\n for repo in repos:\n f.write(json.dumps(repo, ensure_ascii=False) + \"\\n\")\n\n # Summary stats\n print(f\"\\n[info] results summary:\", file=sys.stderr)\n print(f\" repos found: {len(repos)}\", file=sys.stderr)\n if repos:\n languages = {}\n for r in repos:\n lang = r[\"language\"] or \"unknown\"\n languages[lang] = languages.get(lang, 0) + 1\n top_langs = sorted(languages.items(), key=lambda x: -x[1])[:5]\n print(f\" top languages: {', '.join(f'{l}({c})' for l, c in top_langs)}\", file=sys.stderr)\n\n star_counts = [r[\"stars\"] for r in repos]\n print(f\" stars range: {min(star_counts)} - {max(star_counts)}\", file=sys.stderr)\n\n print(f\"[info] written to {output_path}\", file=sys.stderr)\n\n # Check rate limit after\n check_rate_limit()\n\n\nif __name__ == \"__main__\":\n 
main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":9109,"content_sha256":"a7277a5e8c0b0563a2b156c0b6b558599588d92a90334b28be2f2d123b9248ce"},{"filename":"scripts/search_paperswithcode.py","content":"#!/usr/bin/env python3\n\"\"\"Search Papers With Code for paper-to-repo mappings.\n\nUses the Papers With Code public API (https://paperswithcode.com/api/v1/)\nto find GitHub repositories linked to research papers. Supports lookup by\nsingle arXiv ID, multiple IDs, a file of IDs, or keyword search.\n\nSelf-contained: stdlib only (urllib for HTTP).\n\nUsage:\n python search_paperswithcode.py --arxiv-id 2301.12345 --output repos.jsonl\n python search_paperswithcode.py --query \"multi-agent\" --output repos.jsonl\n python search_paperswithcode.py --arxiv-ids-file papers.txt --output repos.jsonl\n python search_paperswithcode.py --arxiv-ids 2301.12345 2305.67890 --output repos.jsonl\n\"\"\"\n\nimport argparse\nimport json\nimport sys\nimport time\nimport urllib.error\nimport urllib.parse\nimport urllib.request\nfrom pathlib import Path\n\nPWC_API = \"https://paperswithcode.com/api/v1\"\nUSER_AGENT = \"github-research/1.0\"\nREQUEST_DELAY = 1.0 # seconds between requests\n\n\n# ---------------------------------------------------------------------------\n# HTTP helpers\n# ---------------------------------------------------------------------------\n\ndef pwc_request(url: str) -> dict | list | None:\n \"\"\"Make a GET request to the Papers With Code API with retry logic.\n\n Returns parsed JSON or None on failure.\n \"\"\"\n headers = {\"User-Agent\": USER_AGENT, \"Accept\": \"application/json\"}\n req = urllib.request.Request(url, headers=headers)\n\n for attempt in range(3):\n try:\n with urllib.request.urlopen(req, timeout=30) as resp:\n return json.loads(resp.read().decode(\"utf-8\"))\n except urllib.error.HTTPError as e:\n if e.code == 404:\n return None\n if e.code == 429:\n wait = 2 ** (attempt + 1)\n print(f\"[warn] rate limited, waiting 
{wait}s ...\", file=sys.stderr)\n time.sleep(wait)\n continue\n print(f\"[warn] HTTP {e.code} for {url}\", file=sys.stderr)\n if attempt \u003c 2:\n time.sleep(1)\n continue\n return None\n except (urllib.error.URLError, OSError) as e:\n print(f\"[warn] request failed: {e}\", file=sys.stderr)\n if attempt \u003c 2:\n time.sleep(1)\n continue\n return None\n return None\n\n\ndef paginate_results(url: str) -> list[dict]:\n \"\"\"Follow ``next`` links and collect all ``results`` items.\"\"\"\n items: list[dict] = []\n current_url: str | None = url\n while current_url:\n data = pwc_request(current_url)\n if not data or not isinstance(data, dict):\n break\n items.extend(data.get(\"results\", []))\n current_url = data.get(\"next\")\n if current_url:\n time.sleep(REQUEST_DELAY)\n return items\n\n\n# ---------------------------------------------------------------------------\n# GitHub URL parsing\n# ---------------------------------------------------------------------------\n\ndef parse_repo_url(url: str) -> tuple[str, str]:\n \"\"\"Parse a GitHub URL into (owner, name). 
Returns (\"\", \"\") if not GitHub.\"\"\"\n if not url or \"github.com\" not in url:\n return \"\", \"\"\n url = url.rstrip(\"/\")\n for prefix in (\"https://github.com/\", \"http://github.com/\"):\n if url.startswith(prefix):\n path = url[len(prefix):]\n parts = path.split(\"/\")\n if len(parts) >= 2 and parts[0] and parts[1]:\n name = parts[1].removesuffix(\".git\")\n return parts[0], name\n return \"\", \"\"\n\n\n# ---------------------------------------------------------------------------\n# Repo record mapping\n# ---------------------------------------------------------------------------\n\ndef map_to_repo_record(\n repo: dict,\n paper_title: str = \"\",\n arxiv_id: str = \"\",\n) -> dict | None:\n \"\"\"Map a PWC repository entry to the repo_db schema.\n\n Returns None if the URL is not a valid GitHub repo.\n \"\"\"\n url = repo.get(\"url\", \"\")\n owner, name = parse_repo_url(url)\n if not owner or not name:\n return None\n\n repo_id = f\"{owner}/{name}\"\n paper_ids = [arxiv_id] if arxiv_id else []\n paper_titles = [paper_title] if paper_title else []\n\n return {\n \"repo_id\": repo_id,\n \"url\": f\"https://github.com/{repo_id}\",\n \"name\": name,\n \"owner\": owner,\n \"description\": repo.get(\"description\") or \"\",\n \"stars\": repo.get(\"stars\", 0) or 0,\n \"forks\": 0,\n \"language\": repo.get(\"framework\") or \"\",\n \"license\": \"\",\n \"topics\": [],\n \"created_at\": \"\",\n \"updated_at\": \"\",\n \"pushed_at\": \"\",\n \"open_issues\": 0,\n \"default_branch\": \"main\",\n \"archived\": False,\n \"source\": \"paperswithcode\",\n \"paper_ids\": paper_ids,\n \"paper_titles\": paper_titles,\n \"is_official\": repo.get(\"is_official\", False),\n # Default fields for repo_db compatibility\n \"languages_pct\": {},\n \"readme_excerpt\": \"\",\n \"relevance_score\": 0.0,\n \"quality_score\": 0.0,\n \"activity_score\": 0.0,\n \"composite_score\": 0.0,\n \"tags\": [],\n \"analyzed\": False,\n \"local_path\": None,\n }\n\n\n# 
---------------------------------------------------------------------------\n# Core lookup functions\n# ---------------------------------------------------------------------------\n\ndef lookup_paper_by_arxiv(arxiv_id: str) -> dict | None:\n \"\"\"Look up a paper on PWC by arXiv ID. Returns first matching paper or None.\"\"\"\n clean_id = arxiv_id.strip()\n # Remove version suffix if present (e.g. 2301.12345v2 -> 2301.12345)\n if \"v\" in clean_id and clean_id[-1].isdigit():\n base, _, ver = clean_id.rpartition(\"v\")\n if ver.isdigit():\n clean_id = base\n\n url = f\"{PWC_API}/papers/?arxiv_id={urllib.parse.quote(clean_id, safe='')}\"\n data = pwc_request(url)\n if not data or not isinstance(data, dict):\n return None\n\n results = data.get(\"results\", [])\n return results[0] if results else None\n\n\ndef fetch_paper_repos(paper_id: str) -> list[dict]:\n \"\"\"Fetch all repositories linked to a paper by its PWC paper ID.\"\"\"\n url = f\"{PWC_API}/papers/{urllib.parse.quote(paper_id, safe='')}/repositories/\"\n return paginate_results(url)\n\n\ndef search_papers_by_query(query: str, max_papers: int = 50) -> list[dict]:\n \"\"\"Search PWC papers by keyword query.\"\"\"\n papers: list[dict] = []\n page = 1\n\n while len(papers) \u003c max_papers:\n params = urllib.parse.urlencode({\"q\": query, \"page\": page})\n url = f\"{PWC_API}/papers/?{params}\"\n data = pwc_request(url)\n if not data or not isinstance(data, dict):\n break\n\n results = data.get(\"results\", [])\n if not results:\n break\n\n papers.extend(results)\n\n if data.get(\"next\"):\n page += 1\n time.sleep(REQUEST_DELAY)\n else:\n break\n\n return papers[:max_papers]\n\n\n# ---------------------------------------------------------------------------\n# Processing functions\n# ---------------------------------------------------------------------------\n\ndef process_arxiv_id(arxiv_id: str) -> tuple[list[dict], str]:\n \"\"\"Process a single arXiv ID: look up paper, fetch repos.\n\n Returns 
(repo_records, paper_title).\n \"\"\"\n clean_id = arxiv_id.strip()\n if not clean_id:\n return [], \"\"\n\n print(f\" arXiv:{clean_id} ...\", file=sys.stderr)\n paper = lookup_paper_by_arxiv(clean_id)\n if not paper:\n print(f\" not found on PWC\", file=sys.stderr)\n return [], \"\"\n\n paper_id = paper.get(\"id\", \"\")\n paper_title = paper.get(\"title\", \"\")\n repo_count = paper.get(\"repository_count\", 0)\n\n print(f\" {paper_title[:80]}\", file=sys.stderr)\n print(f\" repos: {repo_count}\", file=sys.stderr)\n\n if not paper_id or repo_count == 0:\n return [], paper_title\n\n time.sleep(REQUEST_DELAY)\n raw_repos = fetch_paper_repos(paper_id)\n\n records = []\n for repo in raw_repos:\n record = map_to_repo_record(repo, paper_title=paper_title, arxiv_id=clean_id)\n if record:\n records.append(record)\n\n print(f\" mapped {len(records)} GitHub repos\", file=sys.stderr)\n return records, paper_title\n\n\ndef process_query(query: str) -> list[dict]:\n \"\"\"Search PWC by keyword and fetch repos for matching papers.\"\"\"\n print(f\"[info] searching PWC for: {query}\", file=sys.stderr)\n papers = search_papers_by_query(query)\n print(f\"[info] found {len(papers)} papers\", file=sys.stderr)\n\n all_records: list[dict] = []\n papers_with_repos = 0\n\n for i, paper in enumerate(papers):\n paper_id = paper.get(\"id\", \"\")\n paper_title = paper.get(\"title\", \"\")\n arxiv_id = paper.get(\"arxiv_id\", \"\") or \"\"\n repo_count = paper.get(\"repository_count\", 0)\n\n if not paper_id or repo_count == 0:\n continue\n\n print(\n f\" [{i + 1}/{len(papers)}] {paper_title[:60]} ({repo_count} repos)\",\n file=sys.stderr,\n )\n\n time.sleep(REQUEST_DELAY)\n raw_repos = fetch_paper_repos(paper_id)\n\n for repo in raw_repos:\n record = map_to_repo_record(\n repo, paper_title=paper_title, arxiv_id=arxiv_id\n )\n if record:\n all_records.append(record)\n\n papers_with_repos += 1\n time.sleep(REQUEST_DELAY)\n\n print(f\"[info] {papers_with_repos} papers had repos\", 
file=sys.stderr)\n return all_records\n\n\ndef deduplicate_repos(records: list[dict]) -> list[dict]:\n \"\"\"Deduplicate repos by repo_id, merging paper_ids and paper_titles.\"\"\"\n seen: dict[str, dict] = {}\n for record in records:\n rid = record[\"repo_id\"]\n if rid in seen:\n existing = seen[rid]\n # Merge paper references\n for pid in record.get(\"paper_ids\", []):\n if pid and pid not in existing[\"paper_ids\"]:\n existing[\"paper_ids\"].append(pid)\n for pt in record.get(\"paper_titles\", []):\n if pt and pt not in existing[\"paper_titles\"]:\n existing[\"paper_titles\"].append(pt)\n # Keep higher star count\n if record.get(\"stars\", 0) > existing.get(\"stars\", 0):\n existing[\"stars\"] = record[\"stars\"]\n # Prefer official repos\n if record.get(\"is_official\", False):\n existing[\"is_official\"] = True\n else:\n seen[rid] = record\n return list(seen.values())\n\n\n# ---------------------------------------------------------------------------\n# Main\n# ---------------------------------------------------------------------------\n\ndef main() -> None:\n parser = argparse.ArgumentParser(\n description=\"Search Papers With Code for paper-to-repo mappings.\"\n )\n parser.add_argument(\"--arxiv-id\", default=None,\n help=\"Single arXiv ID to look up\")\n parser.add_argument(\"--arxiv-ids\", nargs=\"+\", default=None,\n help=\"Multiple arXiv IDs\")\n parser.add_argument(\"--arxiv-ids-file\", default=None,\n help=\"File with one arXiv ID per line\")\n parser.add_argument(\"--query\", default=None,\n help=\"Search papers by keyword\")\n parser.add_argument(\"--output\", required=True,\n help=\"Output JSONL file path\")\n args = parser.parse_args()\n\n # Validate: at least one input source required\n if not any([args.arxiv_id, args.arxiv_ids, args.arxiv_ids_file, args.query]):\n parser.error(\n \"At least one of --arxiv-id, --arxiv-ids, --arxiv-ids-file, \"\n \"or --query is required\"\n )\n\n # Collect all arXiv IDs from all sources\n all_arxiv_ids: list[str] 
= []\n if args.arxiv_id:\n all_arxiv_ids.append(args.arxiv_id)\n if args.arxiv_ids:\n all_arxiv_ids.extend(args.arxiv_ids)\n if args.arxiv_ids_file:\n ids_file = Path(args.arxiv_ids_file)\n if not ids_file.exists():\n print(f\"[error] arXiv IDs file not found: {ids_file}\", file=sys.stderr)\n sys.exit(1)\n try:\n for line in ids_file.read_text(encoding=\"utf-8\").splitlines():\n line = line.strip()\n if line and not line.startswith(\"#\"):\n all_arxiv_ids.append(line)\n except OSError as e:\n print(f\"[error] cannot read {ids_file}: {e}\", file=sys.stderr)\n sys.exit(1)\n\n all_records: list[dict] = []\n papers_found = 0\n\n # Process arXiv IDs\n if all_arxiv_ids:\n print(\n f\"[info] looking up {len(all_arxiv_ids)} arXiv IDs on Papers With Code ...\",\n file=sys.stderr,\n )\n for arxiv_id in all_arxiv_ids:\n records, title = process_arxiv_id(arxiv_id)\n if title:\n papers_found += 1\n all_records.extend(records)\n time.sleep(REQUEST_DELAY)\n\n # Process keyword query\n if args.query:\n query_records = process_query(args.query)\n all_records.extend(query_records)\n\n # Deduplicate\n deduped = deduplicate_repos(all_records)\n\n # Write output\n output_path = Path(args.output)\n output_path.parent.mkdir(parents=True, exist_ok=True)\n with open(output_path, \"w\", encoding=\"utf-8\") as f:\n for record in deduped:\n f.write(json.dumps(record, ensure_ascii=False) + \"\\n\")\n\n # Summary\n total_paper_links = sum(len(r.get(\"paper_ids\", [])) for r in deduped)\n official = sum(1 for r in deduped if r.get(\"is_official\", False))\n\n print(f\"\\n[info] results summary:\", file=sys.stderr)\n print(f\" repos found: {len(deduped)}\", file=sys.stderr)\n print(f\" linked to papers: {total_paper_links}\", file=sys.stderr)\n if official:\n print(f\" official implementations: {official}\", file=sys.stderr)\n print(f\"Found {len(deduped)} repos linked to {papers_found} papers\", file=sys.stderr)\n print(f\"[info] written to {output_path}\", file=sys.stderr)\n\n\nif __name__ == 
\"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":14075,"content_sha256":"e670a6cf7361011a7518aab39f1c2842b218638f44ac0f8d7ab0de25d29c411a"}],"content_json":{"type":"doc","content":[{"type":"heading","attrs":{"level":1},"content":[{"text":"GitHub Research Skill","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Trigger","type":"text"}]},{"type":"paragraph","content":[{"text":"Activate this skill when the user wants to:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\"Find repos for [topic]\", \"GitHub research on [topic]\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\"Analyze open-source code for [topic]\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\"Find implementations of [paper/technique]\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\"Which repos implement [algorithm]?\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Uses ","type":"text"},{"text":"/github-research \u003cdeep-research-output-dir>","type":"text","marks":[{"type":"code_inline"}]},{"text":" slash command","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Overview","type":"text"}]},{"type":"paragraph","content":[{"text":"This skill systematically discovers, evaluates, and deeply analyzes GitHub repositories related to a research topic. 
It reads ","type":"text"},{"text":"deep-research","type":"text","marks":[{"type":"strong"}]},{"text":" output (paper database, phase reports, code references) and produces an actionable integration blueprint for reusing open-source code.","type":"text"}]},{"type":"paragraph","content":[{"text":"Installation","type":"text","marks":[{"type":"strong"}]},{"text":": ","type":"text"},{"text":"~/.claude/skills/github-research/","type":"text","marks":[{"type":"code_inline"}]},{"text":" — scripts, references, and this skill definition. ","type":"text"},{"text":"Output","type":"text","marks":[{"type":"strong"}]},{"text":": ","type":"text"},{"text":"./github-research-output/{slug}/","type":"text","marks":[{"type":"code_inline"}]},{"text":" relative to the current working directory. ","type":"text"},{"text":"Input","type":"text","marks":[{"type":"strong"}]},{"text":": A deep-research output directory (containing ","type":"text"},{"text":"paper_db.jsonl","type":"text","marks":[{"type":"code_inline"}]},{"text":", phase reports, ","type":"text"},{"text":"code_repos.md","type":"text","marks":[{"type":"code_inline"}]},{"text":", etc.)","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"6-Phase Pipeline","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"Phase 1: Intake → Extract refs, URLs, keywords from deep-research output\nPhase 2: Discovery → Multi-source broad GitHub search (50-200 repos)\nPhase 3: Filtering → Score & rank → select top 15-30 repos\nPhase 4: Deep Dive → Clone & deeply analyze top 8-15 repos (code reading)\nPhase 5: Analysis → Per-repo reports + cross-repo comparison\nPhase 6: Blueprint → Integration/reuse plan for research topic","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Output Directory Structure","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"github-research-output/{slug}/\n├── repo_db.jsonl # Master repo 
database\n├── phase1_intake/\n│ ├── extracted_refs.jsonl # URLs, keywords, paper-repo links\n│ └── intake_summary.md\n├── phase2_discovery/\n│ ├── search_results/ # Raw JSONL from each search\n│ └── discovery_log.md\n├── phase3_filtering/\n│ ├── ranked_repos.jsonl # Scored & ranked subset\n│ └── filtering_report.md\n├── phase4_deep_dive/\n│ ├── repos/ # Cloned repos (shallow)\n│ ├── analyses/ # Per-repo analysis .md files\n│ └── deep_dive_summary.md\n├── phase5_analysis/\n│ ├── comparison_matrix.md # Cross-repo comparison\n│ ├── technique_map.md # Paper concept → code mapping\n│ └── analysis_report.md\n└── phase6_blueprint/\n ├── integration_plan.md # How to combine repos\n ├── reuse_catalog.md # Reusable components catalog\n ├── final_report.md # Complete compiled report\n └── blueprint_summary.md","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Scripts Reference","type":"text"}]},{"type":"paragraph","content":[{"text":"All scripts are Python 3, stdlib-only, located in ","type":"text"},{"text":"~/.claude/skills/github-research/scripts/","type":"text","marks":[{"type":"code_inline"}]},{"text":".","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Script","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Purpose","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Key 
Flags","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"extract_research_refs.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Parse deep-research output for GitHub URLs, paper refs, keywords","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--research-dir","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--output","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"search_github.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Search GitHub repos via ","type":"text"},{"text":"gh api","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--query","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--language","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--min-stars","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--sort","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--max-results","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--topic","type":"text","marks":[{"type":"code_inline"}]},{"text":", 
","type":"text"},{"text":"--output","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"search_github_code.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Search GitHub code for implementations","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--query","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--language","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--filename","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--max-results","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--output","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"search_paperswithcode.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Search Papers With Code for paper→repo mappings","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--paper-title","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--arxiv-id","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--query","type":"text","marks":[{"type":"code_inline"}]},{"text":", 
","type":"text"},{"text":"--output","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"repo_db.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"JSONL repo database management","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"subcommands: ","type":"text"},{"text":"merge","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"filter","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"score","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"search","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"tag","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"stats","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"export","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"rank","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"repo_metadata.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Fetch detailed metadata via ","type":"text"},{"text":"gh 
api","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--repos","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--input","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--output","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--delay","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"clone_repo.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Shallow-clone repos for analysis","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--repo","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--output-dir","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--depth","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--branch","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"analyze_repo_structure.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Map file tree, key files, LOC 
stats","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--repo-dir","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--output","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"extract_dependencies.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Extract and parse dependency files","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--repo-dir","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--output","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"find_implementations.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Search cloned repo for specific code patterns","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--repo-dir","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--patterns","type":"text","marks":[{"type":"code_inline"}]},{"text":", 
","type":"text"},{"text":"--output","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"repo_readme_fetch.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Fetch README without cloning","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--repos","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--input","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--output","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--max-chars","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"compare_repos.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Generate comparison matrix across repos","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--input","type":"text","marks":[{"type":"code_inline"}]},{"text":", 
","type":"text"},{"text":"--output","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"compile_github_report.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Assemble final report from all phases","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--topic-dir","type":"text","marks":[{"type":"code_inline"}]}]}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Phase 1: Intake","type":"text"}]},{"type":"paragraph","content":[{"text":"Goal","type":"text","marks":[{"type":"strong"}]},{"text":": Extract all relevant references, URLs, and keywords from the deep-research output.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Steps","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Create output directory structure","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"SLUG=$(echo \"$TOPIC\" | tr '[:upper:]' '[:lower:]' | tr ' ' '-' | tr -cd 'a-z0-9-')\nmkdir -p github-research-output/$SLUG/{phase1_intake,phase2_discovery/search_results,phase3_filtering,phase4_deep_dive/{repos,analyses},phase5_analysis,phase6_blueprint}","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Extract references from deep-research 
output","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/extract_research_refs.py \\\n --research-dir \u003cdeep-research-output-dir> \\\n --output github-research-output/$SLUG/phase1_intake/extracted_refs.jsonl","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Review extracted refs","type":"text","marks":[{"type":"strong"}]},{"text":": Read the generated JSONL. Note:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"GitHub URLs found directly in reports","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Paper titles and arxiv IDs (for Papers With Code lookup)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Research keywords and themes (for GitHub search queries)","type":"text"}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Write intake summary","type":"text","marks":[{"type":"strong"}]},{"text":": Create ","type":"text"},{"text":"phase1_intake/intake_summary.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" with:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Number of direct GitHub URLs found","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Number of papers with potential code links","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Key research themes extracted","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Planned search queries for Phase 
2","type":"text"}]}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Checkpoint","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"extracted_refs.jsonl","type":"text","marks":[{"type":"code_inline"}]},{"text":" exists with entries","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"intake_summary.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" written","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Search strategy documented","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Phase 2: Discovery","type":"text"}]},{"type":"paragraph","content":[{"text":"Goal","type":"text","marks":[{"type":"strong"}]},{"text":": Cast a wide net to find 50-200 candidate repos from multiple sources.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Steps","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Search by direct URLs","type":"text","marks":[{"type":"strong"}]},{"text":": Any GitHub URLs from Phase 1 → fetch metadata:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/repo_metadata.py \\\n --repos owner1/name1 owner2/name2 ... 
\\\n --output github-research-output/$SLUG/phase2_discovery/search_results/direct_urls.jsonl","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Search Papers With Code","type":"text","marks":[{"type":"strong"}]},{"text":": For each paper with an arxiv ID:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/search_paperswithcode.py \\\n --arxiv-id 2401.12345 \\\n --output github-research-output/$SLUG/phase2_discovery/search_results/pwc_2401.12345.jsonl","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Search GitHub by keywords","type":"text","marks":[{"type":"strong"}]},{"text":" (3-8 queries based on research themes):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/search_github.py \\\n --query \"multi-agent LLM coordination\" \\\n --min-stars 10 --sort stars --max-results 50 \\\n --output github-research-output/$SLUG/phase2_discovery/search_results/gh_query1.jsonl","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Search GitHub code","type":"text","marks":[{"type":"strong"}]},{"text":" (for specific implementations):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/search_github_code.py \\\n --query \"class MultiAgentOrchestrator\" \\\n --language python --max-results 30 \\\n --output github-research-output/$SLUG/phase2_discovery/search_results/code_query1.jsonl","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Fetch READMEs","type":"text","marks":[{"type":"strong"}]},{"text":" for repos that lack descriptions:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python 
~/.claude/skills/github-research/scripts/repo_readme_fetch.py \\\n --input \u003crepos.jsonl> \\\n --output github-research-output/$SLUG/phase2_discovery/search_results/readmes.jsonl","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Merge all results","type":"text","marks":[{"type":"strong"}]},{"text":" into master database:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/repo_db.py merge \\\n --inputs github-research-output/$SLUG/phase2_discovery/search_results/*.jsonl \\\n --output github-research-output/$SLUG/repo_db.jsonl","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Write discovery log","type":"text","marks":[{"type":"strong"}]},{"text":": Create ","type":"text"},{"text":"phase2_discovery/discovery_log.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" with search queries used, results per source, total unique repos found.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Rate Limits","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"GitHub search API: 30 requests/minute (authenticated)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Papers With Code API: No strict limit but be respectful (1 req/sec)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Add ","type":"text"},{"text":"--delay 1.0","type":"text","marks":[{"type":"code_inline"}]},{"text":" to batch operations when needed","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Checkpoint","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"repo_db.jsonl","type":"text","marks":[{"type":"code_inline"}]},{"text":" populated with 50-200 
repos","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"discovery_log.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" with search details","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Phase 3: Filtering","type":"text"}]},{"type":"paragraph","content":[{"text":"Goal","type":"text","marks":[{"type":"strong"}]},{"text":": Score and rank repos, select top 15-30 for deeper analysis.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Steps","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Enrich metadata","type":"text","marks":[{"type":"strong"}]},{"text":" for all repos:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/repo_metadata.py \\\n --input github-research-output/$SLUG/repo_db.jsonl \\\n --output github-research-output/$SLUG/repo_db.jsonl \\\n --delay 0.5","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Score repos","type":"text","marks":[{"type":"strong"}]},{"text":" (quality + activity scores):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/repo_db.py score \\\n --input github-research-output/$SLUG/repo_db.jsonl \\\n --output github-research-output/$SLUG/repo_db.jsonl","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"LLM relevance scoring","type":"text","marks":[{"type":"strong"}]},{"text":": Read through the top ~50 repos (by quality_score) and assign ","type":"text"},{"text":"relevance_score","type":"text","marks":[{"type":"code_inline"}]},{"text":" (0.0-1.0) based 
on:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Direct relevance to research topic","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Implementation completeness","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Code quality signals (from README, description)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Update the relevance scores:","type":"text"}]}]}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/repo_db.py tag \\\n --input github-research-output/$SLUG/repo_db.jsonl \\\n --ids owner/name --tags \"relevance:0.85\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Compute composite scores and rank","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/repo_db.py score \\\n --input github-research-output/$SLUG/repo_db.jsonl \\\n --output github-research-output/$SLUG/repo_db.jsonl\npython ~/.claude/skills/github-research/scripts/repo_db.py rank \\\n --input github-research-output/$SLUG/repo_db.jsonl \\\n --output github-research-output/$SLUG/phase3_filtering/ranked_repos.jsonl \\\n --by composite_score","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Select top repos","type":"text","marks":[{"type":"strong"}]},{"text":": Filter to top 15-30:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/repo_db.py filter \\\n --input github-research-output/$SLUG/phase3_filtering/ranked_repos.jsonl \\\n --output 
github-research-output/$SLUG/phase3_filtering/ranked_repos.jsonl \\\n --max-repos 30 --not-archived","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Write filtering report","type":"text","marks":[{"type":"strong"}]},{"text":": Create ","type":"text"},{"text":"phase3_filtering/filtering_report.md","type":"text","marks":[{"type":"code_inline"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Stats before/after filtering","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Score distributions","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Top 30 repos with scores and rationale","type":"text"}]}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Scoring Formula","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"activity_score = sigmoid((days_since_push \u003c 90) * 0.4 + has_recent_commits * 0.3 + open_issues_ratio * 0.3)\nquality_score = normalize(log(stars+1) * 0.3 + log(forks+1) * 0.2 + has_license * 0.15 + has_readme * 0.15 + not_archived * 0.2)\ncomposite_score = relevance * 0.4 + quality * 0.35 + activity * 0.25","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Checkpoint","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"ranked_repos.jsonl","type":"text","marks":[{"type":"code_inline"}]},{"text":" with 15-30 repos","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"filtering_report.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" with scoring details","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Phase 4: Deep 
Dive","type":"text"}]},{"type":"paragraph","content":[{"text":"Goal","type":"text","marks":[{"type":"strong"}]},{"text":": Clone and deeply analyze the top 8-15 repos.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Steps","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Select repos for deep dive","type":"text","marks":[{"type":"strong"}]},{"text":": Take top 8-15 from ranked list.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Clone each repo","type":"text","marks":[{"type":"strong"}]},{"text":" (shallow):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/clone_repo.py \\\n --repo owner/name \\\n --output-dir github-research-output/$SLUG/phase4_deep_dive/repos/","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Analyze structure","type":"text","marks":[{"type":"strong"}]},{"text":" for each cloned repo:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/analyze_repo_structure.py \\\n --repo-dir github-research-output/$SLUG/phase4_deep_dive/repos/name/ \\\n --output github-research-output/$SLUG/phase4_deep_dive/analyses/name_structure.json","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Extract dependencies","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/extract_dependencies.py \\\n --repo-dir github-research-output/$SLUG/phase4_deep_dive/repos/name/ \\\n --output 
github-research-output/$SLUG/phase4_deep_dive/analyses/name_deps.json","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Find implementations","type":"text","marks":[{"type":"strong"}]},{"text":": Search for key algorithms/concepts from research:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/find_implementations.py \\\n --repo-dir github-research-output/$SLUG/phase4_deep_dive/repos/name/ \\\n --patterns \"class Transformer\" \"def forward\" \"attention\" \\\n --output github-research-output/$SLUG/phase4_deep_dive/analyses/name_impls.jsonl","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Deep code reading","type":"text","marks":[{"type":"strong"}]},{"text":": For each repo, READ the key source files identified by structure analysis. Write a per-repo analysis in ","type":"text"},{"text":"phase4_deep_dive/analyses/{name}_analysis.md","type":"text","marks":[{"type":"code_inline"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Architecture overview","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Key algorithms implemented","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Code quality assessment","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"API / interface design","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Dependencies and requirements","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Strengths and limitations","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Reusability assessment (how easy to extract 
components)","type":"text"}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Write deep dive summary","type":"text","marks":[{"type":"strong"}]},{"text":": ","type":"text"},{"text":"phase4_deep_dive/deep_dive_summary.md","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"IMPORTANT: Actually Read Code","type":"text"}]},{"type":"paragraph","content":[{"text":"Do NOT just summarize READMEs. You must:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Read the main source files (entry points, core modules)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Understand the actual implementation approach","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Identify specific functions/classes that implement research concepts","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Note code patterns, design decisions, and trade-offs","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Checkpoint","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Repos cloned in ","type":"text"},{"text":"repos/","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Per-repo analysis files in ","type":"text"},{"text":"analyses/","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"deep_dive_summary.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" written","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Phase 5: 
Analysis","type":"text"}]},{"type":"paragraph","content":[{"text":"Goal","type":"text","marks":[{"type":"strong"}]},{"text":": Cross-repo comparison and technique-to-code mapping.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Steps","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Generate comparison matrix","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/compare_repos.py \\\n --input github-research-output/$SLUG/phase4_deep_dive/analyses/ \\\n --output github-research-output/$SLUG/phase5_analysis/comparison.json","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Write comparison matrix","type":"text","marks":[{"type":"strong"}]},{"text":": Create ","type":"text"},{"text":"phase5_analysis/comparison_matrix.md","type":"text","marks":[{"type":"code_inline"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Table comparing repos across dimensions (language, LOC, stars, framework, license, tests)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Dependency overlap analysis","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Strengths/weaknesses per repo","type":"text"}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Write technique map","type":"text","marks":[{"type":"strong"}]},{"text":": Create ","type":"text"},{"text":"phase5_analysis/technique_map.md","type":"text","marks":[{"type":"code_inline"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Map each 
paper concept / research technique → specific repo + file + function","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Identify gaps (techniques with no implementation found)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Note alternative implementations of the same concept","type":"text"}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Write analysis report","type":"text","marks":[{"type":"strong"}]},{"text":": ","type":"text"},{"text":"phase5_analysis/analysis_report.md","type":"text","marks":[{"type":"code_inline"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Executive summary of findings","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Key insights from code analysis","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Recommendations for which repos to use for which purposes","type":"text"}]}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Checkpoint","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"comparison_matrix.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" with repo comparison table","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"technique_map.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" mapping concepts to code","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"analysis_report.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" with findings","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Phase 6: 
Blueprint","type":"text"}]},{"type":"paragraph","content":[{"text":"Goal","type":"text","marks":[{"type":"strong"}]},{"text":": Produce an actionable integration and reuse plan.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Steps","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Write integration plan","type":"text","marks":[{"type":"strong"}]},{"text":": ","type":"text"},{"text":"phase6_blueprint/integration_plan.md","type":"text","marks":[{"type":"code_inline"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Recommended architecture for combining repos","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Step-by-step integration approach","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Dependency resolution strategy","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Potential conflicts and how to resolve them","type":"text"}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Write reuse catalog","type":"text","marks":[{"type":"strong"}]},{"text":": ","type":"text"},{"text":"phase6_blueprint/reuse_catalog.md","type":"text","marks":[{"type":"code_inline"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"For each reusable component: source repo, file path, function/class, what it does, how to extract it","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"License compatibility matrix","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Effort estimates (easy/medium/hard to 
integrate)","type":"text"}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Compile final report","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python ~/.claude/skills/github-research/scripts/compile_github_report.py \\\n --topic-dir github-research-output/$SLUG/","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Write blueprint summary","type":"text","marks":[{"type":"strong"}]},{"text":": ","type":"text"},{"text":"phase6_blueprint/blueprint_summary.md","type":"text","marks":[{"type":"code_inline"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"One-page executive summary","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Top 5 repos and why","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Recommended next steps","type":"text"}]}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Checkpoint","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"integration_plan.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" complete","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"reuse_catalog.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" with component catalog","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"final_report.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" compiled","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"blueprint_summary.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" as executive 
summary","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Quality Conventions","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Repos are ranked by composite score","type":"text","marks":[{"type":"strong"}]},{"text":": ","type":"text"},{"text":"relevance × 0.4 + quality × 0.35 + activity × 0.25","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Deep dive requires reading actual code","type":"text","marks":[{"type":"strong"}]},{"text":", not just READMEs","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Integration blueprint must map paper concepts → specific code files/functions","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Incremental saves","type":"text","marks":[{"type":"strong"}]},{"text":": Each phase writes to disk immediately","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Checkpoint recovery","type":"text","marks":[{"type":"strong"}]},{"text":": Can resume from any phase by checking what outputs exist","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"All scripts are stdlib-only Python","type":"text","marks":[{"type":"strong"}]},{"text":" — no pip installs needed","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"gh","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" CLI is required","type":"text","marks":[{"type":"strong"}]},{"text":" for GitHub API access (must be authenticated)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Deduplication","type":"text","marks":[{"type":"strong"}]},{"text":" by 
","type":"text"},{"text":"repo_id","type":"text","marks":[{"type":"code_inline"}]},{"text":" (owner/name) across all searches","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Rate limit awareness","type":"text","marks":[{"type":"strong"}]},{"text":": Respect GitHub search API limits (30 req/min)","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Error Handling","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"If ","type":"text"},{"text":"gh","type":"text","marks":[{"type":"code_inline"}]},{"text":" is not installed: warn user and provide installation instructions","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"If a repo is archived/deleted: skip gracefully, note in log","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"If clone fails: skip, note in log, continue with remaining repos","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"If Papers With Code API is down: skip, rely on GitHub search only","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Always write partial progress to disk so work is not lost","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"References","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"See ","type":"text"},{"text":"references/phase-guide.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" for detailed phase execution guidance","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Deep-research skill: ","type":"text"},{"text":"~/.claude/skills/deep-research/SKILL.md","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Paper 
database pattern: ","type":"text"},{"text":"~/.claude/skills/deep-research/scripts/paper_db.py","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"hr","attrs":{"markup":"---"}}]},"metadata":{"date":"2026-06-05","name":"github-research","author":"@skillopedia","source":{"stars":92,"repo_name":"agent-research-skills","origin_url":"https://github.com/lingzhi227/agent-research-skills/blob/HEAD/skills/github-research/SKILL.md","repo_owner":"lingzhi227","body_sha256":"464a1301589ef938d3e3c87d324fa56a59a0a636c8ba45ea194ebbe9a66761b5","cluster_key":"9f15d50ebd38737e57a1bbfb32578f45cb1bcdb1d23621b95ad129ea1f722b48","clean_bundle":{"format":"clean-skill-bundle-v1","source":"lingzhi227/agent-research-skills/skills/github-research/SKILL.md","attachments":[{"id":"88dabd1f-a5c7-5c92-b82f-7cc1c6b0fbf4","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/88dabd1f-a5c7-5c92-b82f-7cc1c6b0fbf4/attachment.md","path":"references/phase-guide.md","size":7637,"sha256":"e6234e428d1201b1044c5f9ba094ad972164ed78a062e10749a13569cafd18d6","contentType":"text/markdown; charset=utf-8"},{"id":"7ebe7555-0eef-54d4-aea4-680cce89066a","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/7ebe7555-0eef-54d4-aea4-680cce89066a/attachment.py","path":"scripts/analyze_repo_structure.py","size":7662,"sha256":"83d0456dcad35937522038a44f766900d63c33670814c64a13c531d6c750eac9","contentType":"text/x-python; charset=utf-8"},{"id":"1dbcb821-211d-5f1a-b2e2-6300a2fee49c","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/1dbcb821-211d-5f1a-b2e2-6300a2fee49c/attachment.py","path":"scripts/clone_repo.py","size":3517,"sha256":"a33381461ae46e86b0790791f84a79268edcb2bd258a1cb1c932accb0356157a","contentType":"text/x-python; 
charset=utf-8"},{"id":"806875cf-035d-5525-a08c-7d8f5e1d86a4","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/806875cf-035d-5525-a08c-7d8f5e1d86a4/attachment.py","path":"scripts/compare_repos.py","size":16339,"sha256":"d1fbd135b9ae15112f06f55b6f3dc947a5b8f5a8fc34826e6345c5878e9a4bba","contentType":"text/x-python; charset=utf-8"},{"id":"127bb919-a25e-5e0c-8ad6-60fd76bceb82","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/127bb919-a25e-5e0c-8ad6-60fd76bceb82/attachment.py","path":"scripts/compile_github_report.py","size":20175,"sha256":"88cd50d797b1159e842b8ff7af9f714f8894f4849dc854e6bc168ea493d2fc07","contentType":"text/x-python; charset=utf-8"},{"id":"bf66fa57-f729-5a6b-946c-143fa2fed4db","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/bf66fa57-f729-5a6b-946c-143fa2fed4db/attachment.py","path":"scripts/extract_dependencies.py","size":18102,"sha256":"11486d7a710acff3d4617577318afec0264a318b2d5442bd058345688d0a34c1","contentType":"text/x-python; charset=utf-8"},{"id":"64fad565-d430-52d9-8d57-e31f1aea8469","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/64fad565-d430-52d9-8d57-e31f1aea8469/attachment.py","path":"scripts/extract_research_refs.py","size":10894,"sha256":"71708bddd8c8c8c7010b9f2069f0ac7a1171600dcbf222b298367b3c02651b4a","contentType":"text/x-python; charset=utf-8"},{"id":"44b66545-ccde-51e6-aae7-ab34163bcdf5","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/44b66545-ccde-51e6-aae7-ab34163bcdf5/attachment.py","path":"scripts/find_implementations.py","size":9370,"sha256":"446e075b7433ebcf2cda8b4849b4cc6c09ade6bb8fe6c0fd7ad479eba166298b","contentType":"text/x-python; charset=utf-8"},{"id":"823a2cd9-8365-524f-af1d-3de9ab31c47b","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/823a2cd9-8365-524f-af1d-3de9ab31c47b/attachment.py","path":"scripts/repo_db.py","size":16253,"sha256":"af961e458a77c980d15d93932641880840779d9cc12d98c2dbf8a4a07f48f374","contentType":"text/x-python; 
charset=utf-8"},{"id":"762cc3f5-ad49-5b72-a783-0701a3a0a147","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/762cc3f5-ad49-5b72-a783-0701a3a0a147/attachment.py","path":"scripts/repo_metadata.py","size":7398,"sha256":"2b202904e679e10a89289a01f756e75d87f8b2f9db944a9bf2e425b87fb17649","contentType":"text/x-python; charset=utf-8"},{"id":"07d3edd1-5adb-5450-9049-19d8ca674e0b","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/07d3edd1-5adb-5450-9049-19d8ca674e0b/attachment.py","path":"scripts/repo_readme_fetch.py","size":3886,"sha256":"99cb2db4987c3b31c19b24fb6e68588cd724dbb9e7ab7523a47806572742da2f","contentType":"text/x-python; charset=utf-8"},{"id":"26cd40aa-d40b-5b47-8db9-04f07c1fa0b7","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/26cd40aa-d40b-5b47-8db9-04f07c1fa0b7/attachment.py","path":"scripts/search_github.py","size":9109,"sha256":"a7277a5e8c0b0563a2b156c0b6b558599588d92a90334b28be2f2d123b9248ce","contentType":"text/x-python; charset=utf-8"},{"id":"d889357f-090a-589a-a850-bcbb880c14a3","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/d889357f-090a-589a-a850-bcbb880c14a3/attachment.py","path":"scripts/search_github_code.py","size":8507,"sha256":"9ebe5bdbf3d72dacc07d376c87c4b4aa2063bfd8e217d8147afece79445a586d","contentType":"text/x-python; charset=utf-8"},{"id":"08773369-679e-5c75-84cc-e14108a0e610","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/08773369-679e-5c75-84cc-e14108a0e610/attachment.py","path":"scripts/search_paperswithcode.py","size":14075,"sha256":"e670a6cf7361011a7518aab39f1c2842b218638f44ac0f8d7ab0de25d29c411a","contentType":"text/x-python; 
charset=utf-8"}],"bundle_sha256":"8c89350ccfc9be4f94b970db72e847a3e56ad62e081c04b4a6e58439963c5670","attachment_count":14,"text_attachments":14,"attachment_storage":"skillopedia-attachments-v1","binary_attachments":0,"excluded_attachments":[]},"cluster_size":1,"skill_md_path":"skills/github-research/SKILL.md","import_metadata":{"date":"2026-06-05","author":"@skillopedia","version":"v1","category":"integrations-apis","category_label":"Integrations"},"exact_dupes_collapsed_into_this":0},"version":"v1","category":"integrations-apis","import_tag":"clean-skills-v1","description":"Explore and analyze GitHub repositories related to a research topic. Reads deep-research output, discovers repos from multiple sources, deeply analyzes code, and produces integration blueprints.","argument-hint":["deep-research-output-dir"]}},"renderedAt":1782979279102}

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.