mapping-documents — Skillopedia

Mapping Documents Generate `_MAP.md` files providing hierarchical document structure with semantic annotations. Maps show section summaries, typed claims (result/definition/method/caveat/open-question), symbol definitions, and cross-section dependencies — all anchored to page numbers. The structural analog to `mapping-codebases`: tree-sitter parses code via grammar, docmap parses documents via font analysis + LLM extraction. Installation Generate Maps API key resolution: `--api-key` flag > `ANTHROPIC_API_KEY` env > `API_KEY` env. Output Artifacts Four files, forming a three-layer progressive-disclosure stack: | File | Purpose | When to read | |------|---------|-----…

, '', raw)\n return json.loads(raw)\n except json.JSONDecodeError:\n print(f\" ⚠ JSON parse failed for §{section.id}\", file=sys.stderr)\n return {\"summary\": section.title, \"claims\": [], \"symbols\": [], \"dependencies\": []}\n except Exception as e:\n print(f\" ⚠ API error for §{section.id}: {e}\", file=sys.stderr)\n return {\"summary\": section.title, \"claims\": [], \"symbols\": [], \"dependencies\": []}\n\n\ndef extract_semantics(sections: list[Section], genre: str, api_key: str,\n max_workers: int = 4) -> dict[str, dict]:\n \"\"\"Run semantic extraction on all sections in parallel.\"\"\"\n results = {}\n targets = [s for s in sections\n if len(s.text.strip()) > 100 and s.id != 'references' and s.level \u003c= 3]\n\n print(f\" Extracting semantics for {len(targets)} sections \"\n f\"({max_workers} workers)...\", file=sys.stderr)\n\n with ThreadPoolExecutor(max_workers=max_workers) as pool:\n futures = {pool.submit(_semantic_extract, s, genre, api_key): s for s in targets}\n for future in as_completed(futures):\n section = futures[future]\n result = future.result()\n results[section.id] = result\n print(f\" ✓ §{section.id} — {result.get('summary', '')[:60]}\", file=sys.stderr)\n\n return results\n\n\n# ---------------------------------------------------------------------------\n# 3. 
OUTPUT GENERATION\n# ---------------------------------------------------------------------------\n\ndef _normalize_symbol(s: str) -> str:\n \"\"\"Normalize Unicode variants for symbol deduplication.\"\"\"\n # Normalize Unicode (NFKC collapses compatibility chars)\n s = unicodedata.normalize('NFKC', s)\n # Normalize common arrow/dash variants\n s = s.replace('→', '->').replace('−', '-').replace('·', '*')\n # Collapse whitespace\n s = re.sub(r'\\s+', ' ', s).strip()\n return s\n\n\ndef generate_map(sections: list[Section], semantics: dict, metadata: dict) -> str:\n \"\"\"Generate _MAP.md — progressive disclosure document map.\"\"\"\n lines = []\n title = metadata.get('title', metadata.get('file', 'Document'))\n lines.append(f\"# {title}\")\n lines.append(f\"*{metadata.get('pages', '?')} pages*\\n\")\n\n # TOC\n lines.append(\"## Contents\\n\")\n for s in sections:\n if s.level > 3 or s.id == 'references':\n continue\n indent = \" \" * s.level\n sem = semantics.get(s.id, {})\n summary = sem.get('summary', '')\n page_ref = f\"p.{s.page_start}\" + (f\"–{s.page_end}\" if s.page_end > s.page_start else \"\")\n if summary:\n lines.append(f\"{indent}- **§{s.id}** {s.title} ({page_ref}) — {summary}\")\n else:\n lines.append(f\"{indent}- **§{s.id}** {s.title} ({page_ref})\")\n\n # Detailed sections\n lines.append(\"\\n---\\n\")\n lines.append(\"## Sections\\n\")\n\n for s in sections:\n if s.level > 3:\n continue\n sem = semantics.get(s.id, {})\n level_marker = \"#\" * min(s.level + 3, 6)\n page_ref = f\"p.{s.page_start}\" + (f\"–{s.page_end}\" if s.page_end > s.page_start else \"\")\n\n lines.append(f\"{level_marker} §{s.id} {s.title} ({page_ref})\\n\")\n if sem.get('summary'):\n lines.append(f\"{sem['summary']}\\n\")\n\n claims = sem.get('claims', [])\n if claims:\n lines.append(\"**Key points:**\")\n for c in claims:\n lines.append(f\"- [{c.get('type', 'claim')}] {c['text']} (p.{c.get('page', '?')})\")\n lines.append(\"\")\n\n symbols = [sym for sym in sem.get('symbols', 
[]) if sym.get('defined_here')]\n if symbols:\n lines.append(\"**Defines:**\")\n for sym in symbols:\n lines.append(f\"- `{sym['symbol']}` — {sym['meaning']} (p.{sym.get('page', '?')})\")\n lines.append(\"\")\n\n deps = sem.get('dependencies', [])\n if deps:\n lines.append(f\"*Depends on: {', '.join(deps)}*\\n\")\n\n if s.equations:\n lines.append(f\"*Equations: ({'), ('.join(sorted(set(l for l, _ in s.equations)))})*\")\n if s.figures:\n lines.append(f\"*Figures: {', '.join(sorted(set(l for l, _, _ in s.figures)))}*\")\n if s.tables:\n lines.append(f\"*Tables: {', '.join(sorted(set(l for l, _, _ in s.tables)))}*\")\n lines.append(\"\")\n\n return '\\n'.join(lines)\n\n\ndef generate_symbols(semantics: dict) -> list[dict]:\n \"\"\"Generate symbols.json — flat index with deduplication.\"\"\"\n seen = {}\n for sec_id, sem in semantics.items():\n for sym in sem.get('symbols', []):\n key = _normalize_symbol(sym['symbol'])\n if key not in seen:\n seen[key] = {\n 'symbol': sym['symbol'],\n 'meaning': sym['meaning'],\n 'defined_in': sec_id if sym.get('defined_here') else None,\n 'defined_at_page': sym.get('page') if sym.get('defined_here') else None,\n 'used_in': [sec_id],\n }\n else:\n if sec_id not in seen[key]['used_in']:\n seen[key]['used_in'].append(sec_id)\n if sym.get('defined_here') and not seen[key]['defined_in']:\n seen[key]['defined_in'] = sec_id\n seen[key]['defined_at_page'] = sym.get('page')\n return sorted(seen.values(), key=lambda s: s['symbol'])\n\n\ndef generate_anchors(sections: list[Section], semantics: dict) -> list[dict]:\n \"\"\"Generate anchors.json — every claim with its page reference.\"\"\"\n anchors = []\n for sec_id, sem in semantics.items():\n for i, claim in enumerate(sem.get('claims', [])):\n anchors.append({\n 'id': f\"{sec_id}.c{i}\",\n 'section': sec_id,\n 'type': claim.get('type', 'claim'),\n 'text': claim['text'],\n 'page': claim.get('page'),\n })\n return anchors\n\n\ndef generate_usage_snippet(stem: str, out_dir: str, metadata: 
dict,\n semantics: dict) -> str:\n \"\"\"Generate a snippet for CLAUDE.md / project instructions.\n\n This is the glue between the map and the agent's instruction file.\n Designed to be pasted into CLAUDE.md, AGENTS.md, or a Claude.ai\n project knowledge file.\n \"\"\"\n title = metadata.get('title', stem)\n pages = metadata.get('pages', '?')\n n_symbols = len(generate_symbols(semantics)) if semantics else 0\n n_claims = sum(len(sem.get('claims', [])) for sem in semantics.values())\n\n # Use out_dir as the relative path prefix\n rel = out_dir.rstrip('/')\n\n lines = [\n f\"## Reference: {title}\",\n f\"\",\n f\"Source document: `{rel}/{stem}.pdf` ({pages} pages)\",\n f\"\",\n f\"### How to use the document map\",\n f\"\",\n f\"A semantic map of this document is available at three levels of detail:\",\n f\"\",\n f\"1. **This file** — hand-curated invariants and guidance (you are here)\",\n f\"2. **`{rel}/{stem}_MAP.md`** — machine-generated section map with typed\",\n f\" claims, symbol definitions, and dependencies. Read this for any question\",\n f\" about what the document says. Replaces reading the raw PDF in most cases.\",\n f\"3. **`{rel}/{stem}.pdf`** — the original. 
Read only when you need exact\",\n f\" wording, figures, or proofs.\",\n f\"\",\n ]\n\n if n_symbols > 0 or n_claims > 0:\n lines.append(f\"### Querying the indexes\")\n lines.append(f\"\")\n lines.append(f\"Two JSON indexes support programmatic lookup:\")\n lines.append(f\"\")\n\n if n_symbols > 0:\n lines.extend([\n f\"**Symbol lookup** (`{rel}/{stem}.symbols.json` — {n_symbols} symbols):\",\n f\"```bash\",\n f\"# Find where a symbol is defined\",\n f\"python3 -c \\\"import json; [print(f'Defined in §{{s[\\\\\\\"defined_in\\\\\\\"]}} p.{{s[\\\\\\\"defined_at_page\\\\\\\"]}}: {{s[\\\\\\\"meaning\\\\\\\"]}}') for s in json.load(open('{rel}/{stem}.symbols.json')) if 'QUERY' in s['symbol']]\\\"\",\n f\"```\",\n f\"\",\n ])\n\n if n_claims > 0:\n lines.extend([\n f\"**Claim queries** (`{rel}/{stem}.anchors.json` — {n_claims} claims):\",\n f\"```bash\",\n f\"# List all caveats\",\n f\"python3 -c \\\"import json; [print(f'p.{{c[\\\\\\\"page\\\\\\\"]}} {{c[\\\\\\\"text\\\\\\\"]}}') for c in json.load(open('{rel}/{stem}.anchors.json')) if c['type'] == 'caveat']\\\"\",\n f\"\",\n f\"# All claims in a specific section\",\n f\"python3 -c \\\"import json; [print(f'[{{c[\\\\\\\"type\\\\\\\"]}}] {{c[\\\\\\\"text\\\\\\\"]}}') for c in json.load(open('{rel}/{stem}.anchors.json')) if c['section'] == 'SECTION_ID']\\\"\",\n f\"```\",\n f\"\",\n ])\n\n lines.extend([\n f\"\u003c!-- Generated by mapping-documents v0.1.2 -->\",\n f\"\u003c!-- Paste into CLAUDE.md, AGENTS.md, or Claude.ai project knowledge -->\",\n ])\n\n return '\\n'.join(lines)\n\n\n# ---------------------------------------------------------------------------\n# 4. 
CLI\n# ---------------------------------------------------------------------------\n\ndef main():\n parser = argparse.ArgumentParser(\n description=\"docmap — tree-sitter for documents\",\n formatter_class=argparse.RawDescriptionHelpFormatter,\n )\n parser.add_argument('pdf', help='Path to PDF file')\n parser.add_argument('--genre', choices=list(GENRE_PROMPTS.keys()),\n default='paper', help='Document genre (default: paper)')\n parser.add_argument('--structure-only', action='store_true',\n help='Skip LLM semantic extraction')\n parser.add_argument('--out', default='.', help='Output directory')\n parser.add_argument('--api-key', help='Anthropic API key (or set ANTHROPIC_API_KEY / API_KEY)')\n parser.add_argument('--model', default='claude-sonnet-4-6',\n help='Model for semantic extraction')\n parser.add_argument('--workers', type=int, default=4,\n help='Parallel workers for semantic extraction')\n parser.add_argument('-v', '--verbose', action='store_true')\n parser.add_argument('--no-usage-snippet', action='store_true',\n help='Skip generating the CLAUDE.md usage snippet')\n args = parser.parse_args()\n\n pdf_path = args.pdf\n if not os.path.exists(pdf_path):\n print(f\"Error: {pdf_path} not found\", file=sys.stderr)\n sys.exit(1)\n\n stem = Path(pdf_path).stem\n out_dir = Path(args.out)\n out_dir.mkdir(parents=True, exist_ok=True)\n map_path = out_dir / f\"{stem}_MAP.md\"\n symbols_path = out_dir / f\"{stem}.symbols.json\"\n anchors_path = out_dir / f\"{stem}.anchors.json\"\n\n # Pass 1: Structure\n print(f\"Parsing structure: {pdf_path}\", file=sys.stderr)\n sections, metadata = parse_structure(pdf_path)\n print(f\" Found {len(sections)} sections across {metadata['pages']} pages\", file=sys.stderr)\n\n if args.verbose:\n for s in sections:\n print(f\" {' ' * s.level}§{s.id} {s.title} \"\n f\"(p.{s.page_start}–{s.page_end}, {len(s.text)} chars)\", file=sys.stderr)\n\n # Pass 2: Semantics\n semantics = {}\n if not args.structure_only:\n api_key = (args.api_key\n or 
os.environ.get('ANTHROPIC_API_KEY')\n or os.environ.get('API_KEY'))\n if not api_key:\n print(\"Warning: no API key. Use --api-key or set ANTHROPIC_API_KEY. \"\n \"Falling back to structure-only.\", file=sys.stderr)\n else:\n semantics = extract_semantics(sections, args.genre, api_key,\n max_workers=args.workers)\n\n # Pass 3: Generate outputs\n print(f\"Generating outputs in {out_dir}/\", file=sys.stderr)\n\n map_content = generate_map(sections, semantics, metadata)\n map_path.write_text(map_content)\n print(f\" ✓ {map_path} ({len(map_content):,} chars)\", file=sys.stderr)\n\n if semantics:\n symbols = generate_symbols(semantics)\n symbols_path.write_text(json.dumps(symbols, indent=2))\n print(f\" ✓ {symbols_path} ({len(symbols)} symbols)\", file=sys.stderr)\n\n anchors = generate_anchors(sections, semantics)\n anchors_path.write_text(json.dumps(anchors, indent=2))\n print(f\" ✓ {anchors_path} ({len(anchors)} claims)\", file=sys.stderr)\n\n # Pass 4: Usage snippet for CLAUDE.md / project instructions\n if not args.no_usage_snippet:\n snippet_path = out_dir / f\"{stem}_USAGE.md\"\n snippet = generate_usage_snippet(stem, str(args.out), metadata, semantics)\n snippet_path.write_text(snippet)\n print(f\" ✓ {snippet_path} (paste into CLAUDE.md or project instructions)\",\n file=sys.stderr)\n\n print(\"Done.\", file=sys.stderr)\n\n\nif __name__ == '__main__':\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":27519,"content_sha256":"d95674c322236e8633a7ab01ad04a9a1065eff21a60d940dc07238e62355380f"}],"content_json":{"type":"doc","content":[{"type":"heading","attrs":{"level":1},"content":[{"text":"Mapping Documents","type":"text"}]},{"type":"paragraph","content":[{"text":"Generate ","type":"text"},{"text":"_MAP.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" files providing hierarchical document structure with semantic annotations. 
Maps show section summaries, typed claims (result/definition/method/caveat/open-question), symbol definitions, and cross-section dependencies — all anchored to page numbers.","type":"text"}]},{"type":"paragraph","content":[{"text":"The structural analog to ","type":"text"},{"text":"mapping-codebases","type":"text","marks":[{"type":"code_inline"}]},{"text":": tree-sitter parses code via grammar, docmap parses documents via font analysis + LLM extraction.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Installation","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"pip install pdfplumber anthropic --break-system-packages -q","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Generate Maps","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Full run (structure + semantic extraction via Claude API)\npython /mnt/skills/user/mapping-documents/scripts/docmap.py paper.pdf \\\n --out docs/ --genre paper --workers 4\n\n# Structure only (no API calls, no cost)\npython /mnt/skills/user/mapping-documents/scripts/docmap.py paper.pdf \\\n --out docs/ --structure-only","type":"text"}]},{"type":"paragraph","content":[{"text":"API key resolution: ","type":"text"},{"text":"--api-key","type":"text","marks":[{"type":"code_inline"}]},{"text":" flag > ","type":"text"},{"text":"ANTHROPIC_API_KEY","type":"text","marks":[{"type":"code_inline"}]},{"text":" env > ","type":"text"},{"text":"API_KEY","type":"text","marks":[{"type":"code_inline"}]},{"text":" env.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Output Artifacts","type":"text"}]},{"type":"paragraph","content":[{"text":"Four files, forming a three-layer progressive-disclosure stack:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"CLAUDE.md / project instructions ← curated invariants (you write this)\n ↕ 
(_USAGE.md bridges the gap)\n_MAP.md + JSON indexes ← navigable document map (docmap generates)\n ↕\nraw PDF ← the source document","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"File","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Purpose","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"When to read","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"{stem}_USAGE.md","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Snippet for pasting into CLAUDE.md / AGENTS.md / project knowledge. Describes the reading order and JSON query patterns.","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Once, at setup","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"{stem}_MAP.md","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Section map: TOC with summaries, typed claims, defined symbols, dependencies. 
All page-anchored.","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Any question about what the document says","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"{stem}.symbols.json","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Flat symbol index: where defined, where used, what it means.","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"\"Where is X defined?\"","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"{stem}.anchors.json","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Every claim: section ID, type, text, page number.","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"\"What caveats exist?\" / \"What does §3 claim?\"","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"After Generating: Wire It Up","type":"text"}]},{"type":"paragraph","content":[{"text":"Generating the map is step 1. 
Step 2 is telling the agent the map exists.","type":"text"}]},{"type":"paragraph","content":[{"text":"For a code repo (CLAUDE.md / AGENTS.md):","type":"text","marks":[{"type":"strong"}]}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Paste the generated usage snippet into your agent instructions\ncat docs/paper_USAGE.md >> CLAUDE.md","type":"text"}]},{"type":"paragraph","content":[{"text":"For Claude.ai project knowledge:","type":"text","marks":[{"type":"strong"}]},{"text":" Upload ","type":"text"},{"text":"_MAP.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" as a project knowledge file, or paste the ","type":"text"},{"text":"_USAGE.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" content into project instructions.","type":"text"}]},{"type":"paragraph","content":[{"text":"The ","type":"text"},{"text":"_USAGE.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" snippet includes copy-pasteable query commands for the JSON indexes. 
Replace ","type":"text"},{"text":"QUERY","type":"text","marks":[{"type":"code_inline"}]},{"text":" and ","type":"text"},{"text":"SECTION_ID","type":"text","marks":[{"type":"code_inline"}]},{"text":" placeholders with actual values.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Navigate Via Maps","type":"text"}]},{"type":"paragraph","content":[{"text":"After generating and wiring up, use the map for navigation — read ","type":"text"},{"text":"_MAP.md","type":"text","marks":[{"type":"code_inline"}]},{"text":", not the raw PDF.","type":"text"}]},{"type":"paragraph","content":[{"text":"Workflow:","type":"text","marks":[{"type":"strong"}]}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Read ","type":"text"},{"text":"_USAGE.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" block in CLAUDE.md for orientation","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Read top-level TOC in ","type":"text"},{"text":"_MAP.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" for structure and section summaries","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Drill into relevant sections for typed claims and symbol definitions","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Query ","type":"text"},{"text":".symbols.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" for \"where is X defined?\" lookups","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Query ","type":"text"},{"text":".anchors.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" for claim filtering by type or section","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Read the raw PDF only when exact wording or figures are 
needed","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Querying the JSON indexes:","type":"text","marks":[{"type":"strong"}]}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Symbol lookup\npython3 -c \"import json; [print(f'§{s[\\\"defined_in\\\"]} p.{s[\\\"defined_at_page\\\"]}') \\\n for s in json.load(open('docs/paper.symbols.json')) if 'edl' in s['symbol']]\"\n\n# All caveats in the document\npython3 -c \"import json; [print(f'p.{c[\\\"page\\\"]} {c[\\\"text\\\"]}') \\\n for c in json.load(open('docs/paper.anchors.json')) if c['type'] == 'caveat']\"\n\n# All claims in a section\npython3 -c \"import json; [print(f'[{c[\\\"type\\\"]}] {c[\\\"text\\\"]}') \\\n for c in json.load(open('docs/paper.anchors.json')) if c['section'] == '4.3']\"","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Genre Support","type":"text"}]},{"type":"paragraph","content":[{"text":"Genre controls the claim taxonomy used in semantic extraction.","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Genre","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Claim types","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Best for","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"paper","type":"text","marks":[{"type":"code_inline"}]},{"text":" (default)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"definition, result, method, claim, 
caveat, open-question","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Academic papers, arXiv preprints","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"spec","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"requirement, definition, constraint, example, note","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"RFCs, API specs, technical standards","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"legal","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"definition, obligation, right, exception, condition, reference","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Contracts, policy documents, regulations","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Limitations (v0.1.x)","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"PDF-only.","type":"text","marks":[{"type":"strong"}]},{"text":" No DOCX, HTML, or plain text input yet.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Single-column layout assumed.","type":"text","marks":[{"type":"strong"}]},{"text":" Two-column papers may mis-order text within 
sections.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"No caching.","type":"text","marks":[{"type":"strong"}]},{"text":" Re-running re-extracts everything.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"No citation cross-referencing.","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Genre must be specified manually.","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Semantic extraction can hallucinate.","type":"text","marks":[{"type":"strong"}]},{"text":" Every claim is page-anchored, but the page number comes from the LLM. Verify critical claims against the source.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"CLI Reference","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"python docmap.py paper.pdf [options]\n\nOptions:\n --genre {paper,spec,legal} Claim taxonomy (default: paper)\n --structure-only Skip LLM pass (free, fast)\n --out DIR Output directory (default: .)\n --api-key KEY Anthropic API key\n --model MODEL Model (default: claude-sonnet-4-6)\n --workers N Parallel workers (default: 4)\n --no-usage-snippet Skip _USAGE.md generation\n -v Verbose structural 
parsing","type":"text"}]},{"type":"hr","attrs":{"markup":"---"}}]},"metadata":{"date":"2026-06-05","name":"mapping-documents","author":"@skillopedia","source":{"stars":124,"repo_name":"claude-skills","origin_url":"https://github.com/oaustegard/claude-skills/blob/HEAD/mapping-documents/SKILL.md","repo_owner":"oaustegard","body_sha256":"f51c3a86ae92b5234ad547d0a9d7aa97ea323e2f3797c1c49db6e91087a4ac9f","cluster_key":"313cfc4a1e2405bd890dc475eb16a1343bd318d4060a31b897fdab7061c97de9","clean_bundle":{"format":"clean-skill-bundle-v1","source":"oaustegard/claude-skills/mapping-documents/SKILL.md","attachments":[{"id":"dec3ea9f-fe8b-5618-8614-ff61790a2223","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/dec3ea9f-fe8b-5618-8614-ff61790a2223/attachment.md","path":"CHANGELOG.md","size":875,"sha256":"6aba047157e83cfc52d2544b2a5e9d6e0cc2a347ca339f1314ce142ec216596d","contentType":"text/markdown; charset=utf-8"},{"id":"dc8722e6-39a9-5f4f-92ea-f5cfceb8bc48","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/dc8722e6-39a9-5f4f-92ea-f5cfceb8bc48/attachment.md","path":"README.md","size":630,"sha256":"ee4aada703e2c9059c7108281517d53ce467c4b628585ac5fb93c4bda95fa803","contentType":"text/markdown; charset=utf-8"},{"id":"209c184e-6e4b-576b-bdcd-5a9d6ed6c7c3","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/209c184e-6e4b-576b-bdcd-5a9d6ed6c7c3/attachment.py","path":"scripts/docmap.py","size":27519,"sha256":"d95674c322236e8633a7ab01ad04a9a1065eff21a60d940dc07238e62355380f","contentType":"text/x-python; 
charset=utf-8"}],"bundle_sha256":"6ecddbc372d9e485487baa3880fb19d2cec7033497288d89a9870391d6673676","attachment_count":3,"text_attachments":3,"attachment_storage":"skillopedia-attachments-v1","binary_attachments":0,"excluded_attachments":[]},"cluster_size":1,"skill_md_path":"mapping-documents/SKILL.md","import_metadata":{"date":"2026-06-05","author":"@skillopedia","version":"v1","category":"documents-office","category_label":"Documents"},"exact_dupes_collapsed_into_this":0},"version":"v1","category":"documents-office","metadata":{"version":"0.1.2"},"import_tag":"clean-skills-v1","description":"Generate navigable semantic maps from PDF documents. Extracts section structure via font analysis, then runs LLM extraction per section for claims, symbols, and dependencies — all page-anchored. Produces _MAP.md (progressive disclosure), .symbols.json (definition index), .anchors.json (claim references), and a _USAGE.md snippet for CLAUDE.md. Use when analyzing papers, specs, or legal docs; when asked to \"map this document\", \"index this PDF\", \"what does this paper say\"; or when a coding agent needs grounded reference material from a PDF source. Analogous to mapping-codebases but for prose documents."}},"renderedAt":1782986511517}

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.