Skill Creator Purpose Use this skill to turn a vague capability request or an existing rough skill into a reliable, reusable agent skill. A good skill has clear trigger metadata, concise operational instructions, and only the resources needed to execute the workflow. Workflow 1. Capture the user's intent before writing. Identify what the skill should enable, when it should trigger, expected outputs, tools or dependencies, and examples of realistic user requests. 2. Inspect any existing skill files or source materials. Separate core workflow instructions from long references, scripts, assets,…

, name):\n return False, f\"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)\"\n if name.startswith('-') or name.endswith('-') or '--' in name:\n return False, f\"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens\"\n # Check name length (max 64 characters per spec)\n if len(name) > 64:\n return False, f\"Name is too long ({len(name)} characters). Maximum is 64 characters.\"\n\n # Extract and validate description\n description = frontmatter.get('description', '')\n if not isinstance(description, str):\n return False, f\"Description must be a string, got {type(description).__name__}\"\n description = description.strip()\n if description:\n # Check for angle brackets\n if '\u003c' in description or '>' in description:\n return False, \"Description cannot contain angle brackets (\u003c or >)\"\n # Check description length (max 1024 characters per spec)\n if len(description) > 1024:\n return False, f\"Description is too long ({len(description)} characters). Maximum is 1024 characters.\"\n\n # Validate compatibility field if present (optional)\n compatibility = frontmatter.get('compatibility', '')\n if compatibility:\n if not isinstance(compatibility, str):\n return False, f\"Compatibility must be a string, got {type(compatibility).__name__}\"\n if len(compatibility) > 500:\n return False, f\"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters.\"\n\n return True, \"Skill is valid!\"\n\nif __name__ == \"__main__\":\n if len(sys.argv) != 2:\n print(\"Usage: python quick_validate.py \u003cskill_directory>\")\n sys.exit(1)\n \n valid, message = validate_skill(sys.argv[1])\n print(message)\n sys.exit(0 if valid else 1)","content_type":"text/x-python; charset=utf-8","language":"python","size":3972,"content_sha256":"67cf5703402013936c8fb75ad6a1afecd8841d45cc5e606b634eb05825fde365"},{"filename":"scripts/run_eval.py","content":"#!/usr/bin/env python3\n\"\"\"Run trigger evaluation for a skill description.\n\nTests whether a skill's description causes Claude to trigger (read the skill)\nfor a set of queries. Outputs results as JSON.\n\"\"\"\n\nimport argparse\nimport json\nimport os\nimport select\nimport subprocess\nimport sys\nimport time\nimport uuid\nfrom concurrent.futures import ProcessPoolExecutor, as_completed\nfrom pathlib import Path\n\nfrom scripts.utils import parse_skill_md\n\n\ndef find_project_root() -> Path:\n \"\"\"Find the project root by walking up from cwd looking for .claude/.\n\n Mimics how Claude Code discovers its project root, so the command file\n we create ends up where claude -p will look for it.\n \"\"\"\n current = Path.cwd()\n for parent in [current, *current.parents]:\n if (parent / \".claude\").is_dir():\n return parent\n return current\n\n\ndef run_single_query(\n query: str,\n skill_name: str,\n skill_description: str,\n timeout: int,\n project_root: str,\n model: str | None = None,\n) -> bool:\n \"\"\"Run a single query and return whether the skill was triggered.\n\n Creates a command file in .claude/commands/ so it appears in Claude's\n available_skills list, then runs `claude -p` with the raw query.\n Uses --include-partial-messages to detect triggering early from\n stream events (content_block_start) rather than waiting for the\n full assistant message, which only arrives after tool execution.\n \"\"\"\n unique_id = uuid.uuid4().hex[:8]\n clean_name = f\"{skill_name}-skill-{unique_id}\"\n project_commands_dir = Path(project_root) / \".claude\" / \"commands\"\n command_file = project_commands_dir / f\"{clean_name}.md\"\n\n try:\n project_commands_dir.mkdir(parents=True, exist_ok=True)\n # Use YAML block scalar to avoid breaking on quotes in description\n indented_desc = \"\\n \".join(skill_description.split(\"\\n\"))\n command_content = (\n f\"---\\n\"\n f\"description: |\\n\"\n f\" {indented_desc}\\n\"\n f\"---\\n\\n\"\n f\"# {skill_name}\\n\\n\"\n f\"This skill handles: {skill_description}\\n\"\n )\n command_file.write_text(command_content)\n\n cmd = [\n \"claude\",\n \"-p\", query,\n \"--output-format\", \"stream-json\",\n \"--verbose\",\n \"--include-partial-messages\",\n ]\n if model:\n cmd.extend([\"--model\", model])\n\n # Remove CLAUDECODE env var to allow nesting claude -p inside a\n # Claude Code session. The guard is for interactive terminal conflicts;\n # programmatic subprocess usage is safe.\n env = {k: v for k, v in os.environ.items() if k != \"CLAUDECODE\"}\n\n process = subprocess.Popen(\n cmd,\n stdout=subprocess.PIPE,\n stderr=subprocess.DEVNULL,\n cwd=project_root,\n env=env,\n )\n\n triggered = False\n start_time = time.time()\n buffer = \"\"\n # Track state for stream event detection\n pending_tool_name = None\n accumulated_json = \"\"\n\n try:\n while time.time() - start_time \u003c timeout:\n if process.poll() is not None:\n remaining = process.stdout.read()\n if remaining:\n buffer += remaining.decode(\"utf-8\", errors=\"replace\")\n break\n\n ready, _, _ = select.select([process.stdout], [], [], 1.0)\n if not ready:\n continue\n\n chunk = os.read(process.stdout.fileno(), 8192)\n if not chunk:\n break\n buffer += chunk.decode(\"utf-8\", errors=\"replace\")\n\n while \"\\n\" in buffer:\n line, buffer = buffer.split(\"\\n\", 1)\n line = line.strip()\n if not line:\n continue\n\n try:\n event = json.loads(line)\n except json.JSONDecodeError:\n continue\n\n # Early detection via stream events\n if event.get(\"type\") == \"stream_event\":\n se = event.get(\"event\", {})\n se_type = se.get(\"type\", \"\")\n\n if se_type == \"content_block_start\":\n cb = se.get(\"content_block\", {})\n if cb.get(\"type\") == \"tool_use\":\n tool_name = cb.get(\"name\", \"\")\n if tool_name in (\"Skill\", \"Read\"):\n pending_tool_name = tool_name\n accumulated_json = \"\"\n else:\n return False\n\n elif se_type == \"content_block_delta\" and pending_tool_name:\n delta = se.get(\"delta\", {})\n if delta.get(\"type\") == \"input_json_delta\":\n accumulated_json += delta.get(\"partial_json\", \"\")\n if clean_name in accumulated_json:\n return True\n\n elif se_type in (\"content_block_stop\", \"message_stop\"):\n if pending_tool_name:\n return clean_name in accumulated_json\n if se_type == \"message_stop\":\n return False\n\n # Fallback: full assistant message\n elif event.get(\"type\") == \"assistant\":\n message = event.get(\"message\", {})\n for content_item in message.get(\"content\", []):\n if content_item.get(\"type\") != \"tool_use\":\n continue\n tool_name = content_item.get(\"name\", \"\")\n tool_input = content_item.get(\"input\", {})\n if tool_name == \"Skill\" and clean_name in tool_input.get(\"skill\", \"\"):\n triggered = True\n elif tool_name == \"Read\" and clean_name in tool_input.get(\"file_path\", \"\"):\n triggered = True\n return triggered\n\n elif event.get(\"type\") == \"result\":\n return triggered\n finally:\n # Clean up process on any exit path (return, exception, timeout)\n if process.poll() is None:\n process.kill()\n process.wait()\n\n return triggered\n finally:\n if command_file.exists():\n command_file.unlink()\n\n\ndef run_eval(\n eval_set: list[dict],\n skill_name: str,\n description: str,\n num_workers: int,\n timeout: int,\n project_root: Path,\n runs_per_query: int = 1,\n trigger_threshold: float = 0.5,\n model: str | None = None,\n) -> dict:\n \"\"\"Run the full eval set and return results.\"\"\"\n results = []\n\n with ProcessPoolExecutor(max_workers=num_workers) as executor:\n future_to_info = {}\n for item in eval_set:\n for run_idx in range(runs_per_query):\n future = executor.submit(\n run_single_query,\n item[\"query\"],\n skill_name,\n description,\n timeout,\n str(project_root),\n model,\n )\n future_to_info[future] = (item, run_idx)\n\n query_triggers: dict[str, list[bool]] = {}\n query_items: dict[str, dict] = {}\n for future in as_completed(future_to_info):\n item, _ = future_to_info[future]\n query = item[\"query\"]\n query_items[query] = item\n if query not in query_triggers:\n query_triggers[query] = []\n try:\n query_triggers[query].append(future.result())\n except Exception as e:\n print(f\"Warning: query failed: {e}\", file=sys.stderr)\n query_triggers[query].append(False)\n\n for query, triggers in query_triggers.items():\n item = query_items[query]\n trigger_rate = sum(triggers) / len(triggers)\n should_trigger = item[\"should_trigger\"]\n if should_trigger:\n did_pass = trigger_rate >= trigger_threshold\n else:\n did_pass = trigger_rate \u003c trigger_threshold\n results.append({\n \"query\": query,\n \"should_trigger\": should_trigger,\n \"trigger_rate\": trigger_rate,\n \"triggers\": sum(triggers),\n \"runs\": len(triggers),\n \"pass\": did_pass,\n })\n\n passed = sum(1 for r in results if r[\"pass\"])\n total = len(results)\n\n return {\n \"skill_name\": skill_name,\n \"description\": description,\n \"results\": results,\n \"summary\": {\n \"total\": total,\n \"passed\": passed,\n \"failed\": total - passed,\n },\n }\n\n\ndef main():\n parser = argparse.ArgumentParser(description=\"Run trigger evaluation for a skill description\")\n parser.add_argument(\"--eval-set\", required=True, help=\"Path to eval set JSON file\")\n parser.add_argument(\"--skill-path\", required=True, help=\"Path to skill directory\")\n parser.add_argument(\"--description\", default=None, help=\"Override description to test\")\n parser.add_argument(\"--num-workers\", type=int, default=10, help=\"Number of parallel workers\")\n parser.add_argument(\"--timeout\", type=int, default=30, help=\"Timeout per query in seconds\")\n parser.add_argument(\"--runs-per-query\", type=int, default=3, help=\"Number of runs per query\")\n parser.add_argument(\"--trigger-threshold\", type=float, default=0.5, help=\"Trigger rate threshold\")\n parser.add_argument(\"--model\", default=None, help=\"Model to use for claude -p (default: user's configured model)\")\n parser.add_argument(\"--verbose\", action=\"store_true\", help=\"Print progress to stderr\")\n args = parser.parse_args()\n\n eval_set = json.loads(Path(args.eval_set).read_text())\n skill_path = Path(args.skill_path)\n\n if not (skill_path / \"SKILL.md\").exists():\n print(f\"Error: No SKILL.md found at {skill_path}\", file=sys.stderr)\n sys.exit(1)\n\n name, original_description, content = parse_skill_md(skill_path)\n description = args.description or original_description\n project_root = find_project_root()\n\n if args.verbose:\n print(f\"Evaluating: {description}\", file=sys.stderr)\n\n output = run_eval(\n eval_set=eval_set,\n skill_name=name,\n description=description,\n num_workers=args.num_workers,\n timeout=args.timeout,\n project_root=project_root,\n runs_per_query=args.runs_per_query,\n trigger_threshold=args.trigger_threshold,\n model=args.model,\n )\n\n if args.verbose:\n summary = output[\"summary\"]\n print(f\"Results: {summary['passed']}/{summary['total']} passed\", file=sys.stderr)\n for r in output[\"results\"]:\n status = \"PASS\" if r[\"pass\"] else \"FAIL\"\n rate_str = f\"{r['triggers']}/{r['runs']}\"\n print(f\" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}\", file=sys.stderr)\n\n print(json.dumps(output, indent=2))\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":11464,"content_sha256":"43e3b8f80dbf69c343967ba77e268fae991d9fa3ed68b32a0ff02532cd48657f"},{"filename":"scripts/run_loop.py","content":"#!/usr/bin/env python3\n\"\"\"Run the eval + improve loop until all pass or max iterations reached.\n\nCombines run_eval.py and improve_description.py in a loop, tracking history\nand returning the best description found. Supports train/test split to prevent\noverfitting.\n\"\"\"\n\nimport argparse\nimport json\nimport random\nimport sys\nimport tempfile\nimport time\nimport webbrowser\nfrom pathlib import Path\n\nfrom scripts.generate_report import generate_html\nfrom scripts.improve_description import improve_description\nfrom scripts.run_eval import find_project_root, run_eval\nfrom scripts.utils import parse_skill_md\n\n\ndef split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]:\n \"\"\"Split eval set into train and test sets, stratified by should_trigger.\"\"\"\n random.seed(seed)\n\n # Separate by should_trigger\n trigger = [e for e in eval_set if e[\"should_trigger\"]]\n no_trigger = [e for e in eval_set if not e[\"should_trigger\"]]\n\n # Shuffle each group\n random.shuffle(trigger)\n random.shuffle(no_trigger)\n\n # Calculate split points\n n_trigger_test = max(1, int(len(trigger) * holdout))\n n_no_trigger_test = max(1, int(len(no_trigger) * holdout))\n\n # Split\n test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test]\n train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:]\n\n return train_set, test_set\n\n\ndef run_loop(\n eval_set: list[dict],\n skill_path: Path,\n description_override: str | None,\n num_workers: int,\n timeout: int,\n max_iterations: int,\n runs_per_query: int,\n trigger_threshold: float,\n holdout: float,\n model: str,\n verbose: bool,\n live_report_path: Path | None = None,\n log_dir: Path | None = None,\n) -> dict:\n \"\"\"Run the eval + improvement loop.\"\"\"\n project_root = find_project_root()\n name, original_description, content = parse_skill_md(skill_path)\n current_description = description_override or original_description\n\n # Split into train/test if holdout > 0\n if holdout > 0:\n train_set, test_set = split_eval_set(eval_set, holdout)\n if verbose:\n print(f\"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})\", file=sys.stderr)\n else:\n train_set = eval_set\n test_set = []\n\n history = []\n exit_reason = \"unknown\"\n\n for iteration in range(1, max_iterations + 1):\n if verbose:\n print(f\"\\n{'='*60}\", file=sys.stderr)\n print(f\"Iteration {iteration}/{max_iterations}\", file=sys.stderr)\n print(f\"Description: {current_description}\", file=sys.stderr)\n print(f\"{'='*60}\", file=sys.stderr)\n\n # Evaluate train + test together in one batch for parallelism\n all_queries = train_set + test_set\n t0 = time.time()\n all_results = run_eval(\n eval_set=all_queries,\n skill_name=name,\n description=current_description,\n num_workers=num_workers,\n timeout=timeout,\n project_root=project_root,\n runs_per_query=runs_per_query,\n trigger_threshold=trigger_threshold,\n model=model,\n )\n eval_elapsed = time.time() - t0\n\n # Split results back into train/test by matching queries\n train_queries_set = {q[\"query\"] for q in train_set}\n train_result_list = [r for r in all_results[\"results\"] if r[\"query\"] in train_queries_set]\n test_result_list = [r for r in all_results[\"results\"] if r[\"query\"] not in train_queries_set]\n\n train_passed = sum(1 for r in train_result_list if r[\"pass\"])\n train_total = len(train_result_list)\n train_summary = {\"passed\": train_passed, \"failed\": train_total - train_passed, \"total\": train_total}\n train_results = {\"results\": train_result_list, \"summary\": train_summary}\n\n if test_set:\n test_passed = sum(1 for r in test_result_list if r[\"pass\"])\n test_total = len(test_result_list)\n test_summary = {\"passed\": test_passed, \"failed\": test_total - test_passed, \"total\": test_total}\n test_results = {\"results\": test_result_list, \"summary\": test_summary}\n else:\n test_results = None\n test_summary = None\n\n history.append({\n \"iteration\": iteration,\n \"description\": current_description,\n \"train_passed\": train_summary[\"passed\"],\n \"train_failed\": train_summary[\"failed\"],\n \"train_total\": train_summary[\"total\"],\n \"train_results\": train_results[\"results\"],\n \"test_passed\": test_summary[\"passed\"] if test_summary else None,\n \"test_failed\": test_summary[\"failed\"] if test_summary else None,\n \"test_total\": test_summary[\"total\"] if test_summary else None,\n \"test_results\": test_results[\"results\"] if test_results else None,\n # For backward compat with report generator\n \"passed\": train_summary[\"passed\"],\n \"failed\": train_summary[\"failed\"],\n \"total\": train_summary[\"total\"],\n \"results\": train_results[\"results\"],\n })\n\n # Write live report if path provided\n if live_report_path:\n partial_output = {\n \"original_description\": original_description,\n \"best_description\": current_description,\n \"best_score\": \"in progress\",\n \"iterations_run\": len(history),\n \"holdout\": holdout,\n \"train_size\": len(train_set),\n \"test_size\": len(test_set),\n \"history\": history,\n }\n live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))\n\n if verbose:\n def print_eval_stats(label, results, elapsed):\n pos = [r for r in results if r[\"should_trigger\"]]\n neg = [r for r in results if not r[\"should_trigger\"]]\n tp = sum(r[\"triggers\"] for r in pos)\n pos_runs = sum(r[\"runs\"] for r in pos)\n fn = pos_runs - tp\n fp = sum(r[\"triggers\"] for r in neg)\n neg_runs = sum(r[\"runs\"] for r in neg)\n tn = neg_runs - fp\n total = tp + tn + fp + fn\n precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0\n recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0\n accuracy = (tp + tn) / total if total > 0 else 0.0\n print(f\"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)\", file=sys.stderr)\n for r in results:\n status = \"PASS\" if r[\"pass\"] else \"FAIL\"\n rate_str = f\"{r['triggers']}/{r['runs']}\"\n print(f\" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}\", file=sys.stderr)\n\n print_eval_stats(\"Train\", train_results[\"results\"], eval_elapsed)\n if test_summary:\n print_eval_stats(\"Test \", test_results[\"results\"], 0)\n\n if train_summary[\"failed\"] == 0:\n exit_reason = f\"all_passed (iteration {iteration})\"\n if verbose:\n print(f\"\\nAll train queries passed on iteration {iteration}!\", file=sys.stderr)\n break\n\n if iteration == max_iterations:\n exit_reason = f\"max_iterations ({max_iterations})\"\n if verbose:\n print(f\"\\nMax iterations reached ({max_iterations}).\", file=sys.stderr)\n break\n\n # Improve the description based on train results\n if verbose:\n print(f\"\\nImproving description...\", file=sys.stderr)\n\n t0 = time.time()\n # Strip test scores from history so improvement model can't see them\n blinded_history = [\n {k: v for k, v in h.items() if not k.startswith(\"test_\")}\n for h in history\n ]\n new_description = improve_description(\n skill_name=name,\n skill_content=content,\n current_description=current_description,\n eval_results=train_results,\n history=blinded_history,\n model=model,\n log_dir=log_dir,\n iteration=iteration,\n )\n improve_elapsed = time.time() - t0\n\n if verbose:\n print(f\"Proposed ({improve_elapsed:.1f}s): {new_description}\", file=sys.stderr)\n\n current_description = new_description\n\n # Find the best iteration by TEST score (or train if no test set)\n if test_set:\n best = max(history, key=lambda h: h[\"test_passed\"] or 0)\n best_score = f\"{best['test_passed']}/{best['test_total']}\"\n else:\n best = max(history, key=lambda h: h[\"train_passed\"])\n best_score = f\"{best['train_passed']}/{best['train_total']}\"\n\n if verbose:\n print(f\"\\nExit reason: {exit_reason}\", file=sys.stderr)\n print(f\"Best score: {best_score} (iteration {best['iteration']})\", file=sys.stderr)\n\n return {\n \"exit_reason\": exit_reason,\n \"original_description\": original_description,\n \"best_description\": best[\"description\"],\n \"best_score\": best_score,\n \"best_train_score\": f\"{best['train_passed']}/{best['train_total']}\",\n \"best_test_score\": f\"{best['test_passed']}/{best['test_total']}\" if test_set else None,\n \"final_description\": current_description,\n \"iterations_run\": len(history),\n \"holdout\": holdout,\n \"train_size\": len(train_set),\n \"test_size\": len(test_set),\n \"history\": history,\n }\n\n\ndef main():\n parser = argparse.ArgumentParser(description=\"Run eval + improve loop\")\n parser.add_argument(\"--eval-set\", required=True, help=\"Path to eval set JSON file\")\n parser.add_argument(\"--skill-path\", required=True, help=\"Path to skill directory\")\n parser.add_argument(\"--description\", default=None, help=\"Override starting description\")\n parser.add_argument(\"--num-workers\", type=int, default=10, help=\"Number of parallel workers\")\n parser.add_argument(\"--timeout\", type=int, default=30, help=\"Timeout per query in seconds\")\n parser.add_argument(\"--max-iterations\", type=int, default=5, help=\"Max improvement iterations\")\n parser.add_argument(\"--runs-per-query\", type=int, default=3, help=\"Number of runs per query\")\n parser.add_argument(\"--trigger-threshold\", type=float, default=0.5, help=\"Trigger rate threshold\")\n parser.add_argument(\"--holdout\", type=float, default=0.4, help=\"Fraction of eval set to hold out for testing (0 to disable)\")\n parser.add_argument(\"--model\", required=True, help=\"Model for improvement\")\n parser.add_argument(\"--verbose\", action=\"store_true\", help=\"Print progress to stderr\")\n parser.add_argument(\"--report\", default=\"auto\", help=\"Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)\")\n parser.add_argument(\"--results-dir\", default=None, help=\"Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here\")\n args = parser.parse_args()\n\n eval_set = json.loads(Path(args.eval_set).read_text())\n skill_path = Path(args.skill_path)\n\n if not (skill_path / \"SKILL.md\").exists():\n print(f\"Error: No SKILL.md found at {skill_path}\", file=sys.stderr)\n sys.exit(1)\n\n name, _, _ = parse_skill_md(skill_path)\n\n # Set up live report path\n if args.report != \"none\":\n if args.report == \"auto\":\n timestamp = time.strftime(\"%Y%m%d_%H%M%S\")\n live_report_path = Path(tempfile.gettempdir()) / f\"skill_description_report_{skill_path.name}_{timestamp}.html\"\n else:\n live_report_path = Path(args.report)\n # Open the report immediately so the user can watch\n live_report_path.write_text(\"\u003chtml>\u003cbody>\u003ch1>Starting optimization loop...\u003c/h1>\u003cmeta http-equiv='refresh' content='5'>\u003c/body>\u003c/html>\")\n webbrowser.open(str(live_report_path))\n else:\n live_report_path = None\n\n # Determine output directory (create before run_loop so logs can be written)\n if args.results_dir:\n timestamp = time.strftime(\"%Y-%m-%d_%H%M%S\")\n results_dir = Path(args.results_dir) / timestamp\n results_dir.mkdir(parents=True, exist_ok=True)\n else:\n results_dir = None\n\n log_dir = results_dir / \"logs\" if results_dir else None\n\n output = run_loop(\n eval_set=eval_set,\n skill_path=skill_path,\n description_override=args.description,\n num_workers=args.num_workers,\n timeout=args.timeout,\n max_iterations=args.max_iterations,\n runs_per_query=args.runs_per_query,\n trigger_threshold=args.trigger_threshold,\n holdout=args.holdout,\n model=args.model,\n verbose=args.verbose,\n live_report_path=live_report_path,\n log_dir=log_dir,\n )\n\n # Save JSON output\n json_output = json.dumps(output, indent=2)\n print(json_output)\n if results_dir:\n (results_dir / \"results.json\").write_text(json_output)\n\n # Write final HTML report (without auto-refresh)\n if live_report_path:\n live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name))\n print(f\"\\nReport: {live_report_path}\", file=sys.stderr)\n\n if results_dir and live_report_path:\n (results_dir / \"report.html\").write_text(generate_html(output, auto_refresh=False, skill_name=name))\n\n if results_dir:\n print(f\"Results saved to: {results_dir}\", file=sys.stderr)\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":13605,"content_sha256":"7bd6f674203168520517eec94c55f493c0d154339b061b4d7c0f0dad187d0f21"},{"filename":"scripts/utils.py","content":"\"\"\"Shared utilities for skill-creator scripts.\"\"\"\n\nfrom pathlib import Path\n\n\n\ndef parse_skill_md(skill_path: Path) -> tuple[str, str, str]:\n \"\"\"Parse a SKILL.md file, returning (name, description, full_content).\"\"\"\n content = (skill_path / \"SKILL.md\").read_text()\n lines = content.split(\"\\n\")\n\n if lines[0].strip() != \"---\":\n raise ValueError(\"SKILL.md missing frontmatter (no opening ---)\")\n\n end_idx = None\n for i, line in enumerate(lines[1:], start=1):\n if line.strip() == \"---\":\n end_idx = i\n break\n\n if end_idx is None:\n raise ValueError(\"SKILL.md missing frontmatter (no closing ---)\")\n\n name = \"\"\n description = \"\"\n frontmatter_lines = lines[1:end_idx]\n i = 0\n while i \u003c len(frontmatter_lines):\n line = frontmatter_lines[i]\n if line.startswith(\"name:\"):\n name = line[len(\"name:\"):].strip().strip('\"').strip(\"'\")\n elif line.startswith(\"description:\"):\n value = line[len(\"description:\"):].strip()\n # Handle YAML multiline indicators (>, |, >-, |-)\n if value in (\">\", \"|\", \">-\", \"|-\"):\n continuation_lines: list[str] = []\n i += 1\n while i \u003c len(frontmatter_lines) and (frontmatter_lines[i].startswith(\" \") or frontmatter_lines[i].startswith(\"\\t\")):\n continuation_lines.append(frontmatter_lines[i].strip())\n i += 1\n description = \" \".join(continuation_lines)\n continue\n else:\n description = value.strip('\"').strip(\"'\")\n i += 1\n\n return name, description, content\n","content_type":"text/x-python; charset=utf-8","language":"python","size":1661,"content_sha256":"3af8ae62c40c73ab712207436a0d9a981e845f25c5a7040229eb189cc8e45bb1"}],"content_json":{"type":"doc","content":[{"type":"heading","attrs":{"level":1},"content":[{"text":"Skill Creator","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Purpose","type":"text"}]},{"type":"paragraph","content":[{"text":"Use this skill to turn a vague capability request or an existing rough skill into a reliable, reusable agent skill. A good skill has clear trigger metadata, concise operational instructions, and only the resources needed to execute the workflow.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Workflow","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Capture the user's intent before writing. Identify what the skill should enable, when it should trigger, expected outputs, tools or dependencies, and examples of realistic user requests.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Inspect any existing skill files or source materials. Separate core workflow instructions from long references, scripts, assets, examples, and eval data.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Draft ","type":"text"},{"text":"SKILL.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" with frontmatter that includes a clear ","type":"text"},{"text":"name","type":"text","marks":[{"type":"code_inline"}]},{"text":" and a description that states both what the skill does and when to use it.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Keep the main body concise. Put detailed schemas, examples, policy, or framework-specific material into bundled references when the information is not always needed.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Add scripts only when deterministic execution or repeated code generation is useful. Test scripts before treating them as part of the skill.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Add realistic eval prompts. For objective skills, define assertions that can be checked against transcripts and produced files.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Run skill-vs-baseline evaluations when possible, grade the results, compare failure modes, and revise the skill.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Package or present the final skill only after validation catches no frontmatter or naming issues.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Writing Rules","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Use imperative instructions for the agent that will consume the skill.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Put trigger guidance in the description, not only in the body.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Avoid surprise behavior, hidden side effects, malware, or broad permissions unrelated to the user's intent.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Avoid standalone README, changelog, or install-guide clutter unless the content is a necessary runtime reference.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Keep resource paths explicit so an agent knows what to load and when.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Included Tools And References","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Use ","type":"text"},{"text":"scripts/quick_validate.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" to validate skill structure.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Use ","type":"text"},{"text":"scripts/run_eval.py","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"scripts/run_loop.py","type":"text","marks":[{"type":"code_inline"}]},{"text":", and related scripts for skill evaluation loops.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Use ","type":"text"},{"text":"scripts/improve_description.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" when improving trigger accuracy.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Use ","type":"text"},{"text":"eval-viewer/generate_review.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" and ","type":"text"},{"text":"eval-viewer/viewer.html","type":"text","marks":[{"type":"code_inline"}]},{"text":" to review benchmark outputs.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Use ","type":"text"},{"text":"references/schemas.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" for JSON schemas used by eval, grading, benchmark, comparison, and analysis files.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Use bundled agent prompts when delegating grading, comparison, or post-hoc analysis.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Completion Criteria","type":"text"}]},{"type":"paragraph","content":[{"text":"A rewritten skill is ready when the frontmatter is valid, the triggering description is specific, the body is short enough to load comfortably, all referenced bundled files exist, scripts have been smoke-tested where practical, and at least a small set of realistic eval prompts exists or the user has explicitly skipped evals.","type":"text"}]},{"type":"hr","attrs":{"markup":"---"}}]},"metadata":{"date":"2026-06-05","name":"skill-creator","tags":["skills","evaluation","agents","authoring"],"author":"@skillopedia","source":{"stars":145114,"repo_name":"skills","origin_url":"https://github.com/anthropics/skills/blob/HEAD/skills/skill-creator/SKILL.md","repo_owner":"anthropics","body_sha256":"89590d9c27d91db98ab198f184c1505406ec0b7cf403e038fcee19bb694a9786","cluster_key":"eca09455adc0435974f2a7d865d85fc9c3e2fd62f7a519e5e9d7389b4f9b3a24","clean_bundle":{"format":"clean-skill-bundle-v1","source":"anthropics/skills/skills/skill-creator/SKILL.md","attachments":[{"id":"f21bbcea-5cc9-5b9e-b2cf-78e8aacb2a8c","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/f21bbcea-5cc9-5b9e-b2cf-78e8aacb2a8c/attachment.md","path":"agents/analyzer.md","size":10376,"sha256":"bf68f4cac5a56c673a928c2e6d619586c5b93ea364026ab37547772cb45a663a","contentType":"text/markdown; charset=utf-8"},{"id":"e664d0f0-68aa-5f63-96cc-a374dd025074","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/e664d0f0-68aa-5f63-96cc-a374dd025074/attachment.md","path":"agents/comparator.md","size":7287,"sha256":"fe1fc9787c495d864c5d6eada47396478572325fde1b33a96d78bf4b849b7a3e","contentType":"text/markdown; charset=utf-8"},{"id":"e02b794d-b4e0-50a9-adb5-1d99b28bb4bc","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/e02b794d-b4e0-50a9-adb5-1d99b28bb4bc/attachment.md","path":"agents/grader.md","size":9049,"sha256":"57134da0c1a4eea33fbd74a1c9c44aa814f07d6bc64de303edb586f941e5d21a","contentType":"text/markdown; charset=utf-8"},{"id":"b63cea53-5749-5d2d-895d-90d911b7f4ae","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/b63cea53-5749-5d2d-895d-90d911b7f4ae/attachment.html","path":"assets/eval_review.html","size":7058,"sha256":"ce477dcc74dc1c0d1d3352646a79167b5a63634e936b1019160025065974e452","contentType":"text/html; charset=utf-8"},{"id":"85d5d3cd-e3b1-5262-90e2-ab5562055db2","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/85d5d3cd-e3b1-5262-90e2-ab5562055db2/attachment.py","path":"eval-viewer/generate_review.py","size":16365,"sha256":"fc9d1b9243fe5ab6012ebd579bd76d0035de1b79fd3b969de114defab26478fb","contentType":"text/x-python; charset=utf-8"},{"id":"5046e0f7-1fd4-5ae2-b4cb-ffce17eb2604","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/5046e0f7-1fd4-5ae2-b4cb-ffce17eb2604/attachment.html","path":"eval-viewer/viewer.html","size":44998,"sha256":"a53213426ee1100441d701a3a0d49cda7a842f992d2c36463f4d3cc0258575fa","contentType":"text/html; charset=utf-8"},{"id":"701cb442-a479-52c5-94cc-32d9eeb37319","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/701cb442-a479-52c5-94cc-32d9eeb37319/attachment.md","path":"references/schemas.md","size":12061,"sha256":"8e8876180a8989b406a4d3edddf875b04cdfd5805cc8616686d552b11ce4455f","contentType":"text/markdown; charset=utf-8"},{"id":"95eadc5b-e30e-5dd4-b17d-853c842d1795","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/95eadc5b-e30e-5dd4-b17d-853c842d1795/attachment.py","path":"scripts/__init__.py","size":0,"sha256":"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855","contentType":"text/x-python; charset=utf-8"},{"id":"9ec41378-f8f9-52bd-a6c3-302e0d87424a","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/9ec41378-f8f9-52bd-a6c3-302e0d87424a/attachment.py","path":"scripts/aggregate_benchmark.py","size":14386,"sha256":"123ef128ea5ccc01a4b1ac212ef5567f21e9c13d3d240609780beeb3200c49aa","contentType":"text/x-python; charset=utf-8"},{"id":"250d782c-05ec-546e-8f16-93b6cf40a5df","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/250d782c-05ec-546e-8f16-93b6cf40a5df/attachment.py","path":"scripts/generate_report.py","size":12847,"sha256":"13df7118a3c50c83c4c3250a606d5f2b20b25a3d44cbc392b3d669ec75281453","contentType":"text/x-python; charset=utf-8"},{"id":"1fe8a081-d2aa-58b4-bb48-f1e43a06dced","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/1fe8a081-d2aa-58b4-bb48-f1e43a06dced/attachment.py","path":"scripts/improve_description.py","size":11116,"sha256":"87d864570220b699fac52da309d2d6efdb060647bfebc74f768128e646accf80","contentType":"text/x-python; charset=utf-8"},{"id":"84a44a9d-d5ae-57d8-a184-4b7d1f4de133","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/84a44a9d-d5ae-57d8-a184-4b7d1f4de133/attachment.py","path":"scripts/package_skill.py","size":4234,"sha256":"1a33059b0db1ef73375d46d513e5ea81369d2e8838c970597b0d52ddef8d1c0f","contentType":"text/x-python; charset=utf-8"},{"id":"c07c83f8-3037-5f22-a770-534db66ee976","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/c07c83f8-3037-5f22-a770-534db66ee976/attachment.py","path":"scripts/quick_validate.py","size":3972,"sha256":"67cf5703402013936c8fb75ad6a1afecd8841d45cc5e606b634eb05825fde365","contentType":"text/x-python; charset=utf-8"},{"id":"2719bb53-f557-5e5e-85bb-e25a2ce09563","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/2719bb53-f557-5e5e-85bb-e25a2ce09563/attachment.py","path":"scripts/run_eval.py","size":11464,"sha256":"43e3b8f80dbf69c343967ba77e268fae991d9fa3ed68b32a0ff02532cd48657f","contentType":"text/x-python; charset=utf-8"},{"id":"4668b2f3-ab4d-5a8f-a05c-ffd1cd9be0bd","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/4668b2f3-ab4d-5a8f-a05c-ffd1cd9be0bd/attachment.py","path":"scripts/run_loop.py","size":13605,"sha256":"7bd6f674203168520517eec94c55f493c0d154339b061b4d7c0f0dad187d0f21","contentType":"text/x-python; charset=utf-8"},{"id":"491a65a6-05dc-5b94-866b-649ac1dbfd38","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/491a65a6-05dc-5b94-866b-649ac1dbfd38/attachment.py","path":"scripts/utils.py","size":1661,"sha256":"3af8ae62c40c73ab712207436a0d9a981e845f25c5a7040229eb189cc8e45bb1","contentType":"text/x-python; charset=utf-8"}],"bundle_sha256":"7aa173714c0222babefdcbe5f1006e53dba1c77bcbff1e0396ee200c7a62b65a","attachment_count":16,"text_attachments":16,"attachment_storage":"skillopedia-attachments-v1","binary_attachments":0,"excluded_attachments":[]},"cluster_size":17,"skill_md_path":"skills/skill-creator/SKILL.md","import_metadata":{"date":"2026-06-05","author":"@skillopedia","version":"v1","category":"ai-agent-development","category_label":"AI"},"exact_dupes_collapsed_into_this":16},"version":"v1","category":"ai-agent-development","import_tag":"clean-skills-v1","description":"Create, improve, evaluate, package, and benchmark agent skills. Use when a user wants to draft a new skill, rewrite an existing skill, add bundled resources, run skill evals, compare baseline vs skill behavior, or optimize triggering descriptions."}},"renderedAt":1782987139948}

Skill Creator Purpose Use this skill to turn a vague capability request or an existing rough skill into a reliable, reusable agent skill. A good skill has clear trigger metadata, concise operational instructions, and only the resources needed to execute the workflow. Workflow 1. Capture the user's intent before writing. Identify what the skill should enable, when it should trigger, expected outputs, tools or dependencies, and examples of realistic user requests. 2. Inspect any existing skill files or source materials. Separate core workflow instructions from long references, scripts, assets,…