Evals — Skillopedia

Customization Before executing, check for user customizations at: If this directory exists, load and apply any PREFERENCES.md, configurations, or resources found there. These override default behavior. If the directory does not exist, proceed with skill defaults. 🚨 MANDATORY: Voice Notification (REQUIRED BEFORE ANY ACTION) You MUST send this notification BEFORE doing anything else when this skill is invoked. 1. Send voice notification : 2. Output text notification : This is not optional. Execute this curl command immediately upon skill invocation. Evals - AI Agent Evaluation Framework Compre…

);\n if (!pattern.test(String(actual))) return false;\n } else if (actual !== expected) {\n return false;\n }\n }\n }\n\n return true;\n });\n\n checks.push({\n check: `required.${req.tool}`,\n passed: !!matchingCall,\n details: matchingCall\n ? `Found: ${JSON.stringify(matchingCall.params).slice(0, 100)}`\n : `Not found in ${toolCalls.length} tool calls`,\n });\n }\n }\n\n // Check forbidden tool calls\n if (params.forbidden) {\n for (const forbidden of params.forbidden) {\n const found = toolCalls.some(tc => tc.name === forbidden);\n checks.push({\n check: `forbidden.${forbidden}`,\n passed: !found,\n details: found ? 'Found (should not exist)' : 'Not found (correct)',\n });\n }\n }\n\n // Check sequence (tools must be called in order)\n if (params.sequence) {\n const toolOrder = toolCalls.map(tc => tc.name);\n let seqIndex = 0;\n\n for (const tool of toolOrder) {\n if (seqIndex \u003c params.sequence.length && tool === params.sequence[seqIndex]) {\n seqIndex++;\n }\n }\n\n const sequenceComplete = seqIndex === params.sequence.length;\n checks.push({\n check: 'sequence',\n passed: sequenceComplete,\n details: sequenceComplete\n ? `Sequence complete: ${params.sequence.join(' → ')}`\n : `Incomplete: found ${seqIndex}/${params.sequence.length} in order`,\n });\n }\n\n // Check max calls\n if (params.max_calls !== undefined) {\n const withinLimit = toolCalls.length \u003c= params.max_calls;\n checks.push({\n check: 'max_calls',\n passed: withinLimit,\n details: `${toolCalls.length} calls (max: ${params.max_calls})`,\n });\n }\n\n const passCount = checks.filter(c => c.passed).length;\n const score = checks.length > 0 ? 
passCount / checks.length : 1;\n const passed = passCount === checks.length;\n\n return this.createResult(score, passed, performance.now() - start, {\n reasoning: `${passCount}/${checks.length} tool call checks passed`,\n details: {\n checks,\n total_tool_calls: toolCalls.length,\n tool_call_summary: toolCalls.map(tc => tc.name),\n },\n });\n }\n}\n\nregisterGrader('tool_calls', ToolCallVerificationGrader);\n","content_type":"text/typescript; charset=utf-8","language":"typescript","size":3712,"content_sha256":"89a2fcbb69b65aac61f1fdbf263d20916ff69836f9319af3c41a599ca0a97041"},{"filename":"Graders/index.ts","content":"/**\n * Graders Index\n * Central export for all grader types\n */\n\n// Base\nexport * from './Base.ts';\n\n// Code-based graders\nexport * from './CodeBased/index.ts';\n\n// Model-based graders\nexport * from './ModelBased/index.ts';\n\n// Note: Human graders require separate implementation\n// See Graders/Human/ for review workflow\n","content_type":"text/typescript; charset=utf-8","language":"typescript","size":322,"content_sha256":"bd54c998c7436d459b841abcfd5706a295c213a276cf3a02e50c9e445a8b2ab9"},{"filename":"Graders/ModelBased/index.ts","content":"/**\n * Model-Based Graders Index\n * LLM-powered graders for nuanced evaluation\n */\n\n// Import to register graders\nimport './LLMRubric.ts';\nimport './NaturalLanguageAssert.ts';\nimport './PairwiseComparison.ts';\n\nexport { LLMRubricGrader } from './LLMRubric.ts';\nexport { NaturalLanguageAssertGrader } from './NaturalLanguageAssert.ts';\nexport { PairwiseComparisonGrader } from './PairwiseComparison.ts';\n","content_type":"text/typescript; charset=utf-8","language":"typescript","size":403,"content_sha256":"f6f7e06a5eed2767882a26d9003b54c22dc0be0b9d39dfc57f10212288309774"},{"filename":"Graders/ModelBased/LLMRubric.ts","content":"/**\n * LLM Rubric Grader\n * Score output against a detailed rubric using an LLM judge\n */\n\nimport { BaseGrader, registerGrader, type GraderContext } from 
'../Base.ts';\nimport type { GraderConfig, GraderResult, LLMRubricParams } from '../../Types/index.ts';\nimport { inference, type InferenceLevel } from '../../../PAI/Tools/Inference';\nimport { readFileSync, existsSync } from 'fs';\n\nexport class LLMRubricGrader extends BaseGrader {\n type = 'llm_rubric' as const;\n category = 'model_based' as const;\n\n async grade(context: GraderContext): Promise\u003cGraderResult> {\n const start = performance.now();\n const params = this.config.params as LLMRubricParams;\n\n // Load rubric\n let rubric = params.rubric;\n if (existsSync(params.rubric)) {\n rubric = readFileSync(params.rubric, 'utf-8');\n }\n\n const scale = params.scale ?? '1-5';\n // Map model preference to inference level (default to standard/Sonnet)\n const levelMap: Record\u003cstring, InferenceLevel> = {\n 'claude-haiku-4-5-20251001': 'fast',\n 'claude-sonnet-4-20250514': 'standard',\n 'claude-opus-4-20250514': 'smart',\n };\n const level: InferenceLevel = levelMap[params.judge_model ?? ''] ?? 'standard';\n\n // Build prompt\n const systemPrompt = this.buildSystemPrompt(scale, params.reasoning_first ?? 
true);\n const userPrompt = this.buildUserPrompt(rubric, params.assertions, context);\n\n try {\n const result = await inference({\n systemPrompt,\n userPrompt,\n level,\n timeout: 30000,\n });\n\n if (!result.success) {\n throw new Error(result.error || 'Inference failed');\n }\n\n const text = result.output;\n const { score, reasoning, assertion_results } = this.parseResponse(text, scale, params.assertions);\n\n const passed = this.scoreToPassed(score, scale);\n\n return this.createResult(score, passed, performance.now() - start, {\n reasoning,\n details: {\n assertion_results,\n inference_level: level,\n scale,\n raw_response: text,\n },\n });\n } catch (e) {\n return this.createResult(0, false, performance.now() - start, {\n reasoning: `LLM judge error: ${e}`,\n });\n }\n }\n\n private buildSystemPrompt(scale: string, reasoningFirst: boolean): string {\n const scaleInstructions = {\n '1-5': 'Score from 1 (very poor) to 5 (excellent)',\n '1-10': 'Score from 1 (very poor) to 10 (excellent)',\n 'pass-fail': 'Determine if the output PASSES or FAILS the criteria',\n }[scale];\n\n const format = reasoningFirst\n ? `First explain your reasoning, then provide your score. Format:\nREASONING: \u003cyour detailed analysis>\nSCORE: \u003cyour score>`\n : `Provide your score first, then explain. Format:\nSCORE: \u003cyour score>\nREASONING: \u003cyour explanation>`;\n\n return `You are an expert evaluator assessing AI-generated output against quality criteria.\n\n${scaleInstructions}\n\n${format}\n\nBe objective and fair. Consider both strengths and weaknesses.`;\n }\n\n private buildUserPrompt(\n rubric: string,\n assertions: string[] | undefined,\n context: GraderContext\n ): string {\n let prompt = `## Evaluation Rubric\n\n${rubric}\n\n## Output to Evaluate\n\n${context.output}\n`;\n\n if (assertions?.length) {\n prompt += `\n## Specific Assertions to Check\n\nFor each assertion, determine if it is TRUE or FALSE:\n\n${assertions.map((a, i) => `${i + 1}. 
${a}`).join('\\n')}\n\nAfter the main evaluation, provide assertion results in this format:\nASSERTIONS:\n${assertions.map((_, i) => `${i + 1}. TRUE/FALSE`).join('\\n')}\n`;\n }\n\n if (context.reference) {\n prompt += `\n## Reference Output (for comparison)\n\n${context.reference}\n`;\n }\n\n prompt += `\n## Your Evaluation\n\nEvaluate the output against the rubric and provide your assessment.`;\n\n return prompt;\n }\n\n private parseResponse(\n text: string,\n scale: string,\n assertions?: string[]\n ): { score: number; reasoning: string; assertion_results?: boolean[] } {\n // Extract score\n let score = 0;\n const scoreMatch = text.match(/SCORE:\\s*(\\d+(?:\\.\\d+)?|PASS|FAIL)/i);\n\n if (scoreMatch) {\n if (scale === 'pass-fail') {\n score = scoreMatch[1].toUpperCase() === 'PASS' ? 1 : 0;\n } else if (scale === '1-5') {\n score = (parseFloat(scoreMatch[1]) - 1) / 4; // Normalize to 0-1\n } else if (scale === '1-10') {\n score = (parseFloat(scoreMatch[1]) - 1) / 9; // Normalize to 0-1\n }\n }\n\n // Extract reasoning\n const reasoningMatch = text.match(/REASONING:\\s*([\\s\\S]*?)(?=SCORE:|ASSERTIONS:|$)/i);\n const reasoning = reasoningMatch?.[1]?.trim() ?? 
text;\n\n // Extract assertion results\n let assertion_results: boolean[] | undefined;\n if (assertions?.length) {\n const assertionsMatch = text.match(/ASSERTIONS:\\s*([\\s\\S]*?)$/i);\n if (assertionsMatch) {\n assertion_results = assertions.map((_, i) => {\n const lineMatch = assertionsMatch[1].match(new RegExp(`${i + 1}\\\\.\\\\s*(TRUE|FALSE)`, 'i'));\n return lineMatch?.[1]?.toUpperCase() === 'TRUE';\n });\n }\n }\n\n return { score: Math.max(0, Math.min(1, score)), reasoning, assertion_results };\n }\n\n private scoreToPassed(score: number, scale: string): boolean {\n if (scale === 'pass-fail') return score >= 0.5;\n // For 1-5 and 1-10, pass if score is above middle\n return score >= 0.5;\n }\n}\n\nregisterGrader('llm_rubric', LLMRubricGrader);\n","content_type":"text/typescript; charset=utf-8","language":"typescript","size":5446,"content_sha256":"dfc8b6977197f14df4ca1200ddc00108d526b438702030da57240054ea454cff"},{"filename":"Graders/ModelBased/NaturalLanguageAssert.ts","content":"/**\n * Natural Language Assertion Grader\n * Check if specific assertions are true about the output\n */\n\nimport { BaseGrader, registerGrader, type GraderContext } from '../Base.ts';\nimport type { GraderConfig, GraderResult, NaturalLanguageAssertParams } from '../../Types/index.ts';\nimport { inference, type InferenceLevel } from '../../../PAI/Tools/Inference';\n\nexport class NaturalLanguageAssertGrader extends BaseGrader {\n type = 'natural_language_assert' as const;\n category = 'model_based' as const;\n\n async grade(context: GraderContext): Promise\u003cGraderResult> {\n const start = performance.now();\n const params = this.config.params as NaturalLanguageAssertParams;\n\n if (!params?.assertions?.length) {\n return this.createResult(0, false, performance.now() - start, {\n reasoning: 'No assertions configured',\n });\n }\n\n // Map model preference to inference level (default to standard/Sonnet)\n const levelMap: Record\u003cstring, InferenceLevel> = {\n 
'claude-haiku-4-5-20251001': 'fast',\n 'claude-sonnet-4-20250514': 'standard',\n 'claude-opus-4-20250514': 'smart',\n };\n const level: InferenceLevel = levelMap[params.judge_model ?? ''] ?? 'standard';\n const requireAll = params.require_all ?? true;\n\n const systemPrompt = `You are an assertion checker. For each assertion, determine if it is TRUE or FALSE based on the given output.\n\nBe strict and literal. If you cannot clearly verify an assertion, mark it FALSE.\n\nRespond in this exact format for each assertion:\n1. TRUE/FALSE: \u003cbrief explanation>\n2. TRUE/FALSE: \u003cbrief explanation>\n...`;\n\n const userPrompt = `## Output to Check\n\n${context.output}\n\n## Tool Calls Made (for context)\n\n${context.transcript.tool_calls.map(tc => `- ${tc.name}(${JSON.stringify(tc.params)})`).join('\\n') || 'None'}\n\n## Assertions to Verify\n\n${params.assertions.map((a, i) => `${i + 1}. ${a}`).join('\\n')}\n\nCheck each assertion against the output and tool calls.`;\n\n try {\n const result = await inference({\n systemPrompt,\n userPrompt,\n level,\n timeout: 30000,\n });\n\n if (!result.success) {\n throw new Error(result.error || 'Inference failed');\n }\n\n const text = result.output;\n const results = this.parseResults(text, params.assertions);\n\n const passCount = results.filter(r => r.passed).length;\n const score = passCount / params.assertions.length;\n\n const passed = requireAll\n ? 
passCount === params.assertions.length\n : passCount > 0;\n\n return this.createResult(score, passed, performance.now() - start, {\n reasoning: `${passCount}/${params.assertions.length} assertions passed`,\n details: {\n results,\n require_all: requireAll,\n inference_level: level,\n },\n });\n } catch (e) {\n return this.createResult(0, false, performance.now() - start, {\n reasoning: `LLM assertion check error: ${e}`,\n });\n }\n }\n\n private parseResults(\n text: string,\n assertions: string[]\n ): { assertion: string; passed: boolean; explanation: string }[] {\n return assertions.map((assertion, i) => {\n const pattern = new RegExp(`${i + 1}\\\\.\\\\s*(TRUE|FALSE):\\\\s*(.*)`, 'i');\n const match = text.match(pattern);\n\n if (match) {\n return {\n assertion,\n passed: match[1].toUpperCase() === 'TRUE',\n explanation: match[2].trim(),\n };\n }\n\n // Try to find by content if numbered format didn't work\n const containsTrue = text.toLowerCase().includes(`assertion ${i + 1}`) &&\n text.toLowerCase().includes('true');\n\n return {\n assertion,\n passed: containsTrue,\n explanation: 'Could not parse result',\n };\n });\n }\n}\n\nregisterGrader('natural_language_assert', NaturalLanguageAssertGrader);\n","content_type":"text/typescript; charset=utf-8","language":"typescript","size":3842,"content_sha256":"05c1f65df58e798947973cfd2a4b706670306ea26a3cec55de9a8e1401c00a3a"},{"filename":"Graders/ModelBased/PairwiseComparison.ts","content":"/**\n * Pairwise Comparison Grader\n * Compare output against a reference with position swapping to reduce bias\n */\n\nimport { BaseGrader, registerGrader, type GraderContext } from '../Base.ts';\nimport type { GraderConfig, GraderResult, PairwiseComparisonParams } from '../../Types/index.ts';\nimport { inference, type InferenceLevel } from '../../../PAI/Tools/Inference';\nimport { readFileSync, existsSync } from 'fs';\n\nexport class PairwiseComparisonGrader extends BaseGrader {\n type = 'pairwise_comparison' as const;\n category = 
'model_based' as const;\n\n async grade(context: GraderContext): Promise\u003cGraderResult> {\n const start = performance.now();\n const params = this.config.params as PairwiseComparisonParams;\n\n // Load reference\n let reference = params.reference;\n if (existsSync(params.reference)) {\n reference = readFileSync(params.reference, 'utf-8');\n }\n\n if (!reference) {\n return this.createResult(0, false, performance.now() - start, {\n reasoning: 'No reference output available',\n });\n }\n\n // Map model preference to inference level (default to standard/Sonnet)\n const levelMap: Record\u003cstring, InferenceLevel> = {\n 'claude-haiku-4-5-20251001': 'fast',\n 'claude-sonnet-4-20250514': 'standard',\n 'claude-opus-4-20250514': 'smart',\n };\n const level: InferenceLevel = levelMap[params.judge_model ?? ''] ?? 'standard';\n const positionSwap = params.position_swap ?? true;\n\n // Run comparison(s)\n const results: { position: string; winner: 'A' | 'B' | 'tie'; reasoning: string }[] = [];\n\n // First comparison: Output = A, Reference = B\n const result1 = await this.compare(context.output, reference, level, params.criteria);\n results.push({ position: 'output_first', ...result1 });\n\n if (positionSwap) {\n // Second comparison: Reference = A, Output = B\n const result2 = await this.compare(reference, context.output, level, params.criteria);\n // Flip winner since positions are swapped\n const flippedWinner = result2.winner === 'A' ? 'B' : result2.winner === 'B' ? 
'A' : 'tie';\n results.push({\n position: 'reference_first',\n winner: flippedWinner as 'A' | 'B' | 'tie',\n reasoning: result2.reasoning,\n });\n }\n\n // Aggregate results\n const outputWins = results.filter(r => r.winner === 'A').length;\n const referenceWins = results.filter(r => r.winner === 'B').length;\n const ties = results.filter(r => r.winner === 'tie').length;\n\n let score: number;\n let aggregateWinner: string;\n\n if (outputWins > referenceWins) {\n score = 1.0;\n aggregateWinner = 'output';\n } else if (referenceWins > outputWins) {\n score = 0.0;\n aggregateWinner = 'reference';\n } else {\n score = 0.5;\n aggregateWinner = 'tie';\n }\n\n // For the score, also consider partial wins\n if (positionSwap && results.length === 2) {\n score = (outputWins + ties * 0.5) / 2;\n }\n\n const passed = score >= 0.5;\n\n return this.createResult(score, passed, performance.now() - start, {\n reasoning: `${aggregateWinner} wins (output: ${outputWins}, reference: ${referenceWins}, ties: ${ties})`,\n details: {\n results,\n position_swap: positionSwap,\n inference_level: level,\n criteria: params.criteria,\n },\n });\n }\n\n private async compare(\n outputA: string,\n outputB: string,\n level: InferenceLevel,\n criteria?: string[]\n ): Promise\u003c{ winner: 'A' | 'B' | 'tie'; reasoning: string }> {\n const criteriaText = criteria?.length\n ? `Focus on these criteria:\\n${criteria.map(c => `- ${c}`).join('\\n')}`\n : 'Consider overall quality, accuracy, clarity, and helpfulness.';\n\n const systemPrompt = `You are comparing two outputs to determine which is better.\n\n${criteriaText}\n\nRespond in this format:\nREASONING: \u003cyour analysis comparing A and B>\nWINNER: A or B or TIE\n\nBe objective. 
Consider both outputs fairly.`;\n\n const userPrompt = `## Output A\n\n${outputA}\n\n## Output B\n\n${outputB}\n\nCompare these outputs and determine which is better.`;\n\n try {\n const result = await inference({\n systemPrompt,\n userPrompt,\n level,\n timeout: 30000,\n });\n\n if (!result.success) {\n throw new Error(result.error || 'Inference failed');\n }\n\n const text = result.output;\n\n const winnerMatch = text.match(/WINNER:\\s*(A|B|TIE)/i);\n const reasoningMatch = text.match(/REASONING:\\s*([\\s\\S]*?)(?=WINNER:|$)/i);\n\n const winner = winnerMatch?.[1]?.toUpperCase() === 'A' ? 'A'\n : winnerMatch?.[1]?.toUpperCase() === 'B' ? 'B'\n : 'tie';\n\n return {\n winner,\n reasoning: reasoningMatch?.[1]?.trim() ?? text,\n };\n } catch (e) {\n return {\n winner: 'tie',\n reasoning: `Comparison error: ${e}`,\n };\n }\n }\n}\n\nregisterGrader('pairwise_comparison', PairwiseComparisonGrader);\n","content_type":"text/typescript; charset=utf-8","language":"typescript","size":4937,"content_sha256":"24fdc704ad2852e64b40e35942899c6da4cb21a53b2517f79be7b1914cbb57ea"},{"filename":"PROJECT.md","content":"# System-Evals - AI Evaluation Framework\n\n**Tool Name**: `evals`\n**Architecture**: CLI-First (deterministic code execution with AI orchestration)\n**Storage**: File-based (source of truth) + SQLite (query optimization)\n**Philosophy**: Build deterministic tools, wrap with prompting\n\n---\n\n## Overview\n\nEvals is a comprehensive AI evaluation framework for testing both models and prompts across different use cases. It follows the CLI-First Architecture pattern: deterministic CLI commands wrapped with AI orchestration for consistency and reliability.\n\n---\n\n## Requirements\n\n### Core Operations\n\n1. **Use Case Management**\n - Create new use cases\n - List all use cases\n - Show use case details\n - Update use case configuration\n - Delete use cases\n\n2. 
**Test Case Management**\n - Add test cases to use cases\n - List test cases for a use case\n - Show test case details\n - Update test cases\n - Delete test cases\n\n3. **Golden Output Management**\n - Add golden outputs for test cases\n - Update golden outputs\n - Show golden output\n - Delete golden outputs\n\n4. **Prompt Management**\n - Create new prompt version\n - List prompts for use case\n - Show prompt content\n - Update prompt\n - Delete prompt version\n\n5. **Scorer Management**\n - List available scorers\n - Show scorer details\n - Test scorer on sample data\n\n6. **Evaluation Execution**\n - Run evaluations for use case\n - Run with specific model\n - Run with specific prompt version\n - Run specific test case only\n - Run all models comparison\n - Run all prompts comparison\n\n7. **Results Querying**\n - Query runs by use case\n - Query runs by model\n - Query runs by prompt version\n - Query runs by score range\n - Query runs by date range\n - Query runs by pass/fail status\n - Show run details\n - Show individual test results\n\n8. **Comparison Operations**\n - Compare two specific runs\n - Compare models (same prompt)\n - Compare prompts (same model)\n - Compare across versions\n\n9. **Data Management**\n - Rebuild SQLite database from files\n - Export results (JSON, CSV, Markdown)\n - Clean old runs\n - Backup data\n\n---\n\n## Complete CLI Interface\n\n### Global Options\n\n```bash\n--help, -h Show help\n--version, -v Show version\n--json Output as JSON\n--verbose Verbose output\n--quiet, -q Minimal output\n--config \u003cpath> Custom config file\n```\n\n---\n\n## Command Reference\n\n### 1. 
Use Case Commands\n\n#### `evals use-case create`\nCreate a new evaluation use case.\n\n```bash\nevals use-case create \\\n --name \u003cname> \\\n --description \u003cdesc> \\\n [--template \u003ctemplate-name>]\n\n# Examples:\nevals use-case create --name newsletter-summary --description \"Evaluate newsletter summaries\"\nevals use-case create --name blog-post --template summarization\n```\n\n**Outputs**:\n- Creates `use-cases/\u003cname>/` directory\n- Creates `config.yaml` with default structure\n- Creates `prompts/`, `test-cases/`, `golden-outputs/` subdirectories\n- Prints success message with next steps\n\n#### `evals use-case list`\nList all use cases.\n\n```bash\nevals use-case list [--json]\n\n# Example output:\n# newsletter-summary Evaluate newsletter summaries (5 tests, 3 prompts)\n# blog-post Evaluate blog posts (3 tests, 2 prompts)\n```\n\n#### `evals use-case show`\nShow detailed information about a use case.\n\n```bash\nevals use-case show --name \u003cname> [--json]\n\n# Example:\nevals use-case show --name newsletter-summary\n\n# Output:\n# Use Case: newsletter-summary\n# Description: Evaluate newsletter summaries\n# Test Cases: 5\n# Prompts: 3 versions (v1.0.0, v1.1.0, v2.0.0)\n# Models: 2 (claude-3-5-sonnet, gpt-4o)\n# Criteria: 7 scorers (3 deterministic, 4 AI-based)\n# Last Run: 2025-11-15 14:30 (passed 4/5 tests, score: 0.85)\n```\n\n#### `evals use-case update`\nUpdate use case configuration.\n\n```bash\nevals use-case update --name \u003cname> --config \u003cyaml-file>\n\n# Example:\nevals use-case update --name newsletter-summary --config new-config.yaml\n```\n\n#### `evals use-case delete`\nDelete a use case.\n\n```bash\nevals use-case delete --name \u003cname> [--force]\n\n# Example:\nevals use-case delete --name old-use-case --force\n```\n\n---\n\n### 2. 
Test Case Commands\n\n#### `evals test-case add`\nAdd a test case to a use case.\n\n```bash\nevals test-case add \\\n --use-case \u003cname> \\\n --id \u003ctest-id> \\\n --input \u003cjson-file> \\\n [--golden \u003cmd-file>]\n\n# Examples:\nevals test-case add --use-case newsletter-summary --id 001 --input test-001.json\nevals test-case add --use-case newsletter-summary --id 002 --input test-002.json --golden expected-002.md\n```\n\n**Input JSON Structure**:\n```json\n{\n \"id\": \"001-tech-article\",\n \"description\": \"Tech news article summary\",\n \"category\": \"tech\",\n \"difficulty\": \"medium\",\n \"input\": {\n \"article\": \"Full article text...\",\n \"style\": \"casual\",\n \"target_length\": \"3-5 sentences\"\n },\n \"metadata\": {\n \"tags\": [\"ai\", \"tech\", \"news\"]\n }\n}\n```\n\n#### `evals test-case list`\nList test cases for a use case.\n\n```bash\nevals test-case list --use-case \u003cname> [--json]\n\n# Example:\nevals test-case list --use-case newsletter-summary\n\n# Output:\n# 001-tech-article Tech news article summary (medium)\n# 002-long-form Long-form content summary (hard)\n# 003-edge-case Edge case testing (easy)\n```\n\n#### `evals test-case show`\nShow test case details.\n\n```bash\nevals test-case show --use-case \u003cname> --id \u003ctest-id> [--json]\n\n# Example:\nevals test-case show --use-case newsletter-summary --id 001\n```\n\n#### `evals test-case update`\nUpdate a test case.\n\n```bash\nevals test-case update \\\n --use-case \u003cname> \\\n --id \u003ctest-id> \\\n --input \u003cjson-file>\n\n# Example:\nevals test-case update --use-case newsletter-summary --id 001 --input updated-001.json\n```\n\n#### `evals test-case delete`\nDelete a test case.\n\n```bash\nevals test-case delete --use-case \u003cname> --id \u003ctest-id> [--force]\n```\n\n---\n\n### 3. 
Golden Output Commands\n\n#### `evals golden add`\nAdd a golden (expected) output for a test case.\n\n```bash\nevals golden add \\\n --use-case \u003cname> \\\n --test-id \u003ctest-id> \\\n --file \u003cmd-file>\n\n# Example:\nevals golden add --use-case newsletter-summary --test-id 001 --file expected-001.md\n```\n\n#### `evals golden update`\nUpdate a golden output.\n\n```bash\nevals golden update \\\n --use-case \u003cname> \\\n --test-id \u003ctest-id> \\\n --file \u003cmd-file>\n\n# Example:\nevals golden update --use-case newsletter-summary --test-id 001 --file new-expected-001.md\n```\n\n#### `evals golden show`\nShow golden output content.\n\n```bash\nevals golden show --use-case \u003cname> --test-id \u003ctest-id>\n\n# Example:\nevals golden show --use-case newsletter-summary --test-id 001\n```\n\n#### `evals golden delete`\nDelete a golden output.\n\n```bash\nevals golden delete --use-case \u003cname> --test-id \u003ctest-id> [--force]\n```\n\n---\n\n### 4. Prompt Commands\n\n#### `evals prompt create`\nCreate a new prompt version.\n\n```bash\nevals prompt create \\\n --use-case \u003cname> \\\n --version \u003cversion> \\\n --file \u003ctxt-file> \\\n [--description \u003cdesc>]\n\n# Examples:\nevals prompt create --use-case newsletter-summary --version v1.0.0 --file prompt.txt\nevals prompt create --use-case newsletter-summary --version v1.1.0 --file prompt-v1.1.txt --description \"Added tone guidance\"\n```\n\n**Version Format**: Semantic versioning (v1.0.0, v1.1.0, v2.0.0)\n\n#### `evals prompt list`\nList prompts for a use case.\n\n```bash\nevals prompt list --use-case \u003cname> [--json]\n\n# Example:\nevals prompt list --use-case newsletter-summary\n\n# Output:\n# v1.0.0 Initial prompt (2025-11-01)\n# v1.1.0 Added tone guidance (2025-11-08)\n# v2.0.0 Restructured for clarity (2025-11-15)\n```\n\n#### `evals prompt show`\nShow prompt content.\n\n```bash\nevals prompt show --use-case \u003cname> --version \u003cversion>\n\n# Example:\nevals prompt 
show --use-case newsletter-summary --version v1.0.0\n```\n\n#### `evals prompt update`\nUpdate a prompt version.\n\n```bash\nevals prompt update \\\n --use-case \u003cname> \\\n --version \u003cversion> \\\n --file \u003ctxt-file>\n\n# Example:\nevals prompt update --use-case newsletter-summary --version v1.0.0 --file updated-prompt.txt\n```\n\n#### `evals prompt delete`\nDelete a prompt version.\n\n```bash\nevals prompt delete --use-case \u003cname> --version \u003cversion> [--force]\n```\n\n---\n\n### 5. Scorer Commands\n\n#### `evals scorer list`\nList all available scorers.\n\n```bash\nevals scorer list [--type \u003cdeterministic|ai-based|custom>] [--json]\n\n# Example output:\n# DETERMINISTIC:\n# sentence-counter Count sentences in output\n# word-counter Count words in output\n# link-counter Count links in output\n# format-validator Validate output format\n#\n# AI-BASED:\n# llm-judge LLM-as-judge evaluation\n# semantic-similarity Semantic similarity to expected\n# style-matcher Match writing style\n#\n# CUSTOM:\n# newsletter-tone Newsletter-specific tone evaluation\n```\n\n#### `evals scorer show`\nShow scorer details and configuration.\n\n```bash\nevals scorer show --name \u003cscorer-name> [--json]\n\n# Example:\nevals scorer show --name sentence-counter\n\n# Output:\n# Scorer: sentence-counter\n# Type: deterministic\n# Description: Count sentences in output\n# Parameters:\n# min (number): Minimum sentence count\n# max (number): Maximum sentence count\n# Example:\n# evals run --use-case foo --scorer sentence-counter --params '{\"min\":3,\"max\":5}'\n```\n\n#### `evals scorer test`\nTest a scorer on sample data.\n\n```bash\nevals scorer test \\\n --name \u003cscorer-name> \\\n --output \u003ctext-file> \\\n [--expected \u003cexpected-file>] \\\n [--params \u003cjson>]\n\n# Example:\nevals scorer test --name sentence-counter --output sample.txt --params '{\"min\":3,\"max\":5}'\n\n# Output:\n# Scorer: sentence-counter\n# Score: 1.0\n# Pass: true\n# Details:\n# 
Measured: 4 sentences\n# Expected: 3-5 sentences\n# Explanation: Found 4 sentences (expected 3-5)\n```\n\n---\n\n### 6. Run Commands\n\n#### `evals run`\nRun evaluations.\n\n```bash\nevals run \\\n --use-case \u003cname> \\\n [--model \u003cmodel-id>] \\\n [--prompt \u003cversion>] \\\n [--test-case \u003ctest-id>] \\\n [--all-models] \\\n [--all-prompts] \\\n [--dry-run] \\\n [--verbose]\n\n# Examples:\n# Run with default model and latest prompt\nevals run --use-case newsletter-summary\n\n# Run with specific model and prompt\nevals run --use-case newsletter-summary --model claude-3-5-sonnet --prompt v1.0.0\n\n# Run specific test case only\nevals run --use-case newsletter-summary --test-case 001\n\n# Run all models with same prompt\nevals run --use-case newsletter-summary --all-models --prompt v1.0.0\n\n# Run all prompts with same model\nevals run --use-case newsletter-summary --all-prompts --model gpt-4o\n\n# Dry run (show what would be tested)\nevals run --use-case newsletter-summary --dry-run\n```\n\n**Output**:\n```\nRunning evaluation: newsletter-summary\nModel: claude-3-5-sonnet-20241022\nPrompt: v1.0.0\nTest Cases: 5\n\nTest 001-tech-article............... PASS (score: 0.92)\nTest 002-long-form.................. PASS (score: 0.85)\nTest 003-edge-case.................. FAIL (score: 0.65)\nTest 004-technical.................. PASS (score: 0.88)\nTest 005-casual..................... PASS (score: 0.91)\n\nResults:\n Total: 5\n Passed: 4 (80%)\n Failed: 1 (20%)\n Avg Score: 0.84\n Run ID: 2025-11-15_143022_claude-3-5-sonnet_v1.0.0\n\nSaved to: results/newsletter-summary/2025-11-15_143022_claude-3-5-sonnet_v1.0.0/\n```\n\n---\n\n### 7. 
Query Commands\n\n#### `evals query runs`\nQuery evaluation runs.\n\n```bash\nevals query runs \\\n [--use-case \u003cname>] \\\n [--model \u003cmodel-id>] \\\n [--prompt \u003cversion>] \\\n [--score-min \u003cfloat>] \\\n [--score-max \u003cfloat>] \\\n [--status \u003ccompleted|failed|running>] \\\n [--since \u003cdate>] \\\n [--until \u003cdate>] \\\n [--limit \u003cn>] \\\n [--offset \u003cn>] \\\n [--sort \u003cfield>] \\\n [--json]\n\n# Examples:\n# Recent runs for use case\nevals query runs --use-case newsletter-summary --limit 10\n\n# Runs with score above threshold\nevals query runs --score-min 0.8\n\n# Runs for specific model\nevals query runs --model claude-3-5-sonnet\n\n# Runs in date range\nevals query runs --since 2025-11-01 --until 2025-11-15\n\n# Failed runs\nevals query runs --status failed\n\n# Combined filters\nevals query runs --use-case newsletter-summary --model gpt-4o --score-min 0.75 --limit 5\n```\n\n**Output**:\n```\nFound 3 runs:\n\n2025-11-15 14:30 newsletter-summary claude-3-5-sonnet v1.0.0 0.85 4/5 passed\n2025-11-15 12:15 newsletter-summary gpt-4o v1.0.0 0.82 4/5 passed\n2025-11-14 16:45 newsletter-summary claude-3-5-sonnet v1.1.0 0.88 5/5 passed\n```\n\n#### `evals query results`\nQuery individual test results.\n\n```bash\nevals query results \\\n --run-id \u003crun-id> \\\n [--test-case \u003ctest-id>] \\\n [--passed|--failed] \\\n [--scorer \u003cscorer-name>] \\\n [--json]\n\n# Examples:\n# All results for a run\nevals query results --run-id 2025-11-15_143022_claude-3-5-sonnet_v1.0.0\n\n# Only failed tests\nevals query results --run-id 2025-11-15_143022_claude-3-5-sonnet_v1.0.0 --failed\n\n# Specific test case\nevals query results --run-id 2025-11-15_143022_claude-3-5-sonnet_v1.0.0 --test-case 001\n\n# Results for specific scorer\nevals query results --run-id 2025-11-15_143022_claude-3-5-sonnet_v1.0.0 --scorer llm-judge\n```\n\n---\n\n### 8. 
Compare Commands\n\n#### `evals compare runs`\nCompare two specific runs.\n\n```bash\nevals compare runs --run-a \u003crun-id> --run-b \u003crun-id> [--json]\n\n# Example:\nevals compare runs \\\n --run-a 2025-11-15_143022_claude-3-5-sonnet_v1.0.0 \\\n --run-b 2025-11-15_153045_gpt-4o_v1.0.0\n\n# Output:\n# Comparing Runs:\n# Run A: claude-3-5-sonnet v1.0.0 (score: 0.85, 4/5 passed)\n# Run B: gpt-4o v1.0.0 (score: 0.82, 4/5 passed)\n#\n# Test-by-Test Comparison:\n# 001-tech-article: Run A: 0.92 ✓ Run B: 0.88 ✓ (Δ +0.04)\n# 002-long-form: Run A: 0.85 ✓ Run B: 0.79 ✓ (Δ +0.06)\n# 003-edge-case: Run A: 0.65 ✗ Run B: 0.72 ✓ (Δ -0.07)\n# 004-technical: Run A: 0.88 ✓ Run B: 0.85 ✓ (Δ +0.03)\n# 005-casual: Run A: 0.91 ✓ Run B: 0.86 ✓ (Δ +0.05)\n#\n# Summary:\n# Run A won on 4/5 tests\n# Avg score difference: +0.03 in favor of Run A\n```\n\n#### `evals compare models`\nCompare models on same prompt.\n\n```bash\nevals compare models \\\n --use-case \u003cname> \\\n --prompt \u003cversion> \\\n [--models \u003cmodel1,model2,...>] \\\n [--json]\n\n# Example:\nevals compare models --use-case newsletter-summary --prompt v1.0.0\n\n# Automatically finds most recent run for each model\n\n# Output:\n# Comparing Models on newsletter-summary (prompt v1.0.0):\n#\n# claude-3-5-sonnet: 0.85 4/5 passed (2025-11-15 14:30)\n# gpt-4o: 0.82 4/5 passed (2025-11-15 15:30)\n# o1-preview: 0.79 3/5 passed (2025-11-15 16:30)\n#\n# Winner: claude-3-5-sonnet (Δ +0.03 vs 2nd place)\n```\n\n#### `evals compare prompts`\nCompare prompts on same model.\n\n```bash\nevals compare prompts \\\n --use-case \u003cname> \\\n --model \u003cmodel-id> \\\n [--versions \u003cv1,v2,...>] \\\n [--json]\n\n# Example:\nevals compare prompts --use-case newsletter-summary --model claude-3-5-sonnet\n\n# Output:\n# Comparing Prompts on newsletter-summary (model claude-3-5-sonnet):\n#\n# v1.0.0: 0.82 3/5 passed (2025-11-01)\n# v1.1.0: 0.85 4/5 passed (2025-11-08)\n# v2.0.0: 0.91 5/5 passed (2025-11-15)\n#\n# Best: v2.0.0 
(Δ +0.09 vs baseline v1.0.0)\n# Progression: +0.03 (v1.0.0→v1.1.0), +0.06 (v1.1.0→v2.0.0)\n```\n\n---\n\n### 9. Data Commands\n\n#### `evals db rebuild`\nRebuild SQLite database from files.\n\n```bash\nevals db rebuild [--force] [--verbose]\n\n# Example:\nevals db rebuild --force\n\n# Output:\n# Rebuilding database from files...\n# Scanning use-cases/...\n# Found 3 use cases\n# Found 42 test results\n# Indexed 42 runs\n# Database rebuilt successfully\n```\n\n#### `evals export`\nExport results to various formats.\n\n```bash\nevals export \\\n --run-id \u003crun-id> \\\n --format \u003cjson|csv|md> \\\n --output \u003cfile>\n\n# Examples:\nevals export --run-id 2025-11-15_143022_claude-3-5-sonnet_v1.0.0 --format json --output results.json\nevals export --run-id 2025-11-15_143022_claude-3-5-sonnet_v1.0.0 --format csv --output results.csv\nevals export --run-id 2025-11-15_143022_claude-3-5-sonnet_v1.0.0 --format md --output results.md\n```\n\n#### `evals clean`\nClean old runs.\n\n```bash\nevals clean \\\n [--older-than \u003cdays>] \\\n [--keep \u003cn>] \\\n [--use-case \u003cname>] \\\n [--dry-run]\n\n# Examples:\n# Delete runs older than 30 days\nevals clean --older-than 30\n\n# Keep only last 10 runs per use case\nevals clean --keep 10\n\n# Clean specific use case\nevals clean --use-case newsletter-summary --older-than 60\n\n# Show what would be deleted (don't actually delete)\nevals clean --older-than 30 --dry-run\n```\n\n#### `evals backup`\nBackup all data.\n\n```bash\nevals backup --output \u003cbackup-file>\n\n# Example:\nevals backup --output evals-backup-2025-11-15.tar.gz\n\n# Creates tarball of:\n# - use-cases/ directory\n# - results/ directory\n# - evals.db SQLite file\n```\n\n---\n\n## File Structure\n\n```\n~/.claude/skills/evals/\n├── PROJECT.md # This file\n├── SKILL.md # Skill definition\n│\n├── cli/ # CLI implementation\n│ ├── index.ts # Main entry point\n│ ├── commands/\n│ │ ├── use-case.ts # Use case commands\n│ │ ├── test-case.ts # Test case 
commands\n│ │ ├── golden.ts # Golden output commands\n│ │ ├── prompt.ts # Prompt commands\n│ │ ├── scorer.ts # Scorer commands\n│ │ ├── run.ts # Run commands\n│ │ ├── query.ts # Query commands\n│ │ ├── compare.ts # Compare commands\n│ │ └── data.ts # Data management commands\n│ └── lib/\n│ ├── storage.ts # File + DB storage\n│ ├── runner.ts # Evaluation runner\n│ ├── output.ts # Output formatting\n│ └── validation.ts # Input validation\n│\n├── scorers/ # Scorer implementations\n│ ├── index.ts\n│ ├── base.ts\n│ ├── deterministic/\n│ │ ├── sentence-counter.ts\n│ │ ├── word-counter.ts\n│ │ ├── link-counter.ts\n│ │ └── format-validator.ts\n│ ├── ai-based/\n│ │ ├── llm-judge.ts\n│ │ ├── semantic-similarity.ts\n│ │ └── style-matcher.ts\n│ └── custom/\n│ └── newsletter-tone.ts\n│\n├── use-cases/ # Evaluation use cases\n│ ├── newsletter-summary/\n│ │ ├── config.yaml\n│ │ ├── prompts/\n│ │ │ ├── v1.0.0.txt\n│ │ │ └── v1.1.0.txt\n│ │ ├── test-cases/\n│ │ │ ├── 001-tech-article.json\n│ │ │ └── 002-long-form.json\n│ │ └── golden-outputs/\n│ │ ├── 001-expected.md\n│ │ └── 002-expected.md\n│ └── [other-use-cases]/\n│\n├── results/ # Evaluation results (Git-ignored)\n│ └── newsletter-summary/\n│ └── 2025-11-15_143022_claude-3-5-sonnet_v1.0.0/\n│ ├── run.json\n│ ├── summary.json\n│ └── tests/\n│ ├── 001-tech-article.json\n│ └── 002-long-form.json\n│\n├── storage/\n│ ├── evals.db # SQLite database (query cache)\n│ └── schema.sql # Database schema\n│\n├── types/ # TypeScript types\n│ ├── use-case.ts\n│ ├── scorer.ts\n│ ├── result.ts\n│ └── config.ts\n│\n├── package.json\n├── tsconfig.json\n└── README.md\n```\n\n---\n\n## Storage Strategy\n\n### Files (Source of Truth)\n- Use case configs: `use-cases/\u003cname>/config.yaml`\n- Test cases: `use-cases/\u003cname>/test-cases/*.json`\n- Golden outputs: `use-cases/\u003cname>/golden-outputs/*.md`\n- Prompts: `use-cases/\u003cname>/prompts/*.txt`\n- Results: `results/\u003cuse-case>/\u003crun-id>/`\n\n### SQLite (Query Optimization)\n- 
Tables: `eval_runs`, `test_results`, `scorer_results`\n- Used ONLY for fast queries and analytics\n- Can be rebuilt from files: `evals db rebuild`\n- Enables complex queries without scanning JSON files\n\n---\n\n## Implementation Phases\n\n### Phase 1: Core CLI (Week 1)\n- [ ] CLI framework setup (Commander.js)\n- [ ] Use case commands (create, list, show)\n- [ ] Test case commands (add, list, show)\n- [ ] Golden output commands (add, show)\n- [ ] Prompt commands (create, list, show)\n- [ ] File storage implementation\n- [ ] SQLite schema and basic queries\n\n### Phase 2: Scorers & Runners (Week 2)\n- [ ] Base scorer interface\n- [ ] Deterministic scorers (4 types)\n- [ ] AI-based scorers (LLM-judge, semantic similarity)\n- [ ] Scorer pipeline\n- [ ] Run command implementation\n- [ ] Results storage (files + DB)\n\n### Phase 3: Query & Compare (Week 3)\n- [ ] Query commands (runs, results)\n- [ ] Compare commands (runs, models, prompts)\n- [ ] Advanced SQLite queries\n- [ ] Output formatters (human, JSON, CSV)\n\n### Phase 4: Data Management (Week 4)\n- [ ] DB rebuild command\n- [ ] Export commands\n- [ ] Clean command\n- [ ] Backup command\n- [ ] Validation and error handling\n\n---\n\n## Next Steps\n\n1. Implement core CLI framework with Commander.js\n2. Build use case management commands\n3. Implement file-based storage layer\n4. Set up SQLite database with schema\n5. Create deterministic scorers\n6. Build evaluation runner\n7. 
Implement query and compare commands\n\n---\n\n**This design follows CLI-First Architecture: deterministic tools wrapped with AI orchestration.**\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":21452,"content_sha256":"e99edf6850edd9e4f43e4b97a9ab5ab101569045b6eefbf5a5cb6eb75c23784e"},{"filename":"ScienceMapping.md","content":"# Evals as Science\n\n**Evals IS the scientific method applied to prompt engineering.**\n\nThis isn't metaphor - Evals embodies the Science Protocol directly:\n\n| Science Phase | Evals Implementation |\n|---------------|---------------------|\n| **Goal** | Define use case success criteria, pass threshold |\n| **Observe** | Baseline prompt performance measurement |\n| **Hypothesize** | \"Variant X will outperform baseline because...\" |\n| **Experiment** | Run eval suite with control + treatment prompts |\n| **Measure** | Scores, SEM, confidence intervals |\n| **Analyze** | Compare variants, determine statistical significance |\n| **Iterate** | Refine prompt, run again, or declare success |\n\n---\n\n## Scientific Rigor in Evals\n\n### Falsifiability (Non-Negotiable)\n\nEvery hypothesis MUST be falsifiable. 
When comparing prompts, ask:\n- *\"What result would DISPROVE that variant X is better?\"*\n- If you can't answer this, your evaluation is not scientific.\n\n### Pre-Commitment (Define Before You Run)\n\n- Success criteria are defined BEFORE seeing results\n- Pass thresholds are locked when use case is created\n- No moving goalposts after data is collected\n\n### Plurality (Three-Variant Minimum Recommended)\n\n- Don't just A/B test - consider A/B/C\n- Multiple hypotheses = better exploration of solution space\n- Reduces confirmation bias toward the first alternative\n\n### Confirmation Bias Countermeasures\n\n- Position swapping mitigates positional bias\n- Different judge model prevents self-serving evaluation\n- Multi-judge panels reduce individual model quirks\n- Statistical significance required to declare winner\n\n---\n\n## When to Invoke Full Science Protocol\n\nMost eval work runs implicitly as Science. Invoke explicit Science workflows when:\n- You've been iterating for 3+ cycles without improvement (paradigm check)\n- Results are confusing or contradictory (need structured analysis)\n- Stakes are high enough to warrant formal documentation\n- The question is \"should we be testing something else entirely?\"\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":1998,"content_sha256":"d27b0191001cd66643c2bc4e04d421823c7cdd203cc887795801739d151784ac"},{"filename":"ScorerTypes.md","content":"# Evaluation Scorer Types\n\n## Deterministic Scorers (60% weight recommended)\n\n| Scorer | Speed | Use Case |\n|--------|-------|----------|\n| `sentence-counter` | \u003c5ms | Format validation, length requirements |\n| `word-counter` | \u003c5ms | Conciseness, length limits |\n| `link-counter` | \u003c10ms | Attribution, reference validation |\n| `format-validator` | \u003c10ms | Structure, required sections |\n| `voice-validator` | \u003c10ms | Forbidden words, style requirements |\n| `string-match` | \u003c5ms | Exact substring matching |\n| 
`length-validator` | \u003c5ms | Character count bounds |\n| `json-schema` | \u003c20ms | JSON structure validation |\n\n---\n\n## AI-Based Scorers (40% weight recommended)\n\n| Scorer | Speed | Use Case |\n|--------|-------|----------|\n| `llm-judge-accuracy` | ~2s | Factual accuracy, core takeaways |\n| `llm-judge-style` | ~2s | Voice authenticity, tone |\n| `link-attribution-judge` | ~2s | Author identification, citation quality |\n\n---\n\n## Configuration Example\n\n```yaml\ncriteria:\n deterministic:\n - scorer: \"sentence-counter\"\n weight: 0.10\n params:\n min: 2\n max: 3\n - scorer: \"voice-validator\"\n weight: 0.10\n params:\n forbidden_words: [\"unveils\", \"plummeted\"]\n check_contractions: true\n\n ai_based:\n - scorer: \"llm-judge-accuracy\"\n weight: 0.15\n params:\n judge_model: \"claude-3-5-sonnet-20241022\"\n reasoning_first: true\n scale: \"1-5\"\n\npass_threshold: 0.75\n```\n\n---\n\n## Best Practices for Scorer Selection\n\n1. **Run deterministic first**: Fast gate before expensive AI evals\n2. **Balance weights**: 60% deterministic / 40% AI-based recommended\n3. **Use appropriate scale**: 1-5 most reliable for AI judges\n4. 
**Require reasoning first**: 13%+ accuracy improvement\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":1751,"content_sha256":"6661407fb370d153597c2f08d913d9ee891c9b0a0912de9633931e6c70032e0a"},{"filename":"Suites/Regression/core-behaviors.yaml","content":"# Core Agent Behaviors - Regression Suite\n# These are baseline behaviors that should ALWAYS work\n\nname: core-behaviors\ndescription: \"Core agent behaviors that must not regress\"\ntype: regression\ndomain: general\n\ntasks:\n - task_file_targeting_basic\n - task_tool_sequence_read_before_edit\n - task_verification_before_done\n - task_no_hallucinated_paths\n\npass_threshold: 0.95\nsaturation_threshold: 0.99\n\ncreated_at: \"2026-01-10\"\n","content_type":"application/yaml; charset=utf-8","language":"yaml","size":428,"content_sha256":"11fc2e9efa5f88e4e0564a30c71facbd78995e3a387454da4a0a9ec85013cd74"},{"filename":"TemplateIntegration.md","content":"# Template Integration\n\n## Available Templates\n\n```\n~/.claude/Templates/Evals/\n├── Judge.hbs # Configurable LLM-as-Judge prompts\n├── Rubric.hbs # Evaluation criteria definitions\n├── TestCase.hbs # Test case specifications\n├── Comparison.hbs # A/B testing templates\n└── Report.hbs # Statistical result reports\n```\n\n---\n\n## Creating Custom Judges\n\nUse the JUDGE template for custom evaluation:\n\n```bash\nbun run ~/.claude/Templates/Tools/RenderTemplate.ts \\\n -t Evals/Judge.hbs \\\n -d ~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/judge-config.yaml \\\n -o ~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/judge-prompt.md\n```\n\n### Judge Config Example\n\n```yaml\njudge:\n name: Content Quality Judge\n focus: accuracy\n scale:\n type: 1-5\n criteria:\n - name: Factual Accuracy\n description: Information matches source material\n weight: 0.4\n - name: Completeness\n description: Covers all key points\n weight: 0.3\n - name: Clarity\n description: Easy to understand\n weight: 0.3\n reasoning_required: true\n 
position_swap: true\noutput:\n format: json\n```\n\n---\n\n## Creating Rubrics\n\nUse the RUBRIC template for scoring criteria:\n\n```bash\nbun run ~/.claude/Templates/Tools/RenderTemplate.ts \\\n -t Evals/Rubric.hbs \\\n -d ~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/rubric.yaml \\\n -o ~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/rubric.md\n```\n\n---\n\n## LLM-as-Judge Best Practices\n\n1. **Reasoning before scoring**: Always require explanation first\n2. **Use 1-5 scale**: Most reliable, avoid 0-100\n3. **Different judge model**: Don't self-judge\n4. **Position swapping**: Average A-first and B-first results\n5. **Multi-judge panels**: 5-10 models, 7x cheaper than large single judge\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":1772,"content_sha256":"8140fb827681ace7e47adacb72f7a34b29fab892ba2c40da1fc3c45fd779a917"},{"filename":"Tools/AlgorithmBridge.ts","content":"#!/usr/bin/env bun\n/**\n * Algorithm Bridge\n * Integration between Evals and THE ALGORITHM verification system\n */\n\nimport type { AlgorithmEvalRequest, AlgorithmEvalResult, EvalRun, Task } from '../Types/index.ts';\nimport { loadSuite, checkSaturation } from './SuiteManager.ts';\nimport { TrialRunner, formatEvalResults } from './TrialRunner.ts';\nimport { TranscriptCapture, createTranscript } from './TranscriptCapture.ts';\nimport { existsSync, mkdirSync, writeFileSync, readFileSync } from 'fs';\nimport { join } from 'path';\nimport { parse as parseYaml } from 'yaml';\nimport { parseArgs } from 'util';\nimport { $ } from 'bun';\n\nconst EVALS_DIR = join(import.meta.dir, '..');\nconst RESULTS_DIR = join(EVALS_DIR, 'Results');\n\n/**\n * Run an eval suite for ALGORITHM verification\n */\nexport async function runEvalForAlgorithm(\n request: AlgorithmEvalRequest\n): Promise\u003cAlgorithmEvalResult> {\n const suite = loadSuite(request.suite);\n if (!suite) {\n return {\n isc_row: request.isc_row,\n suite: request.suite,\n passed: false,\n score: 
0,\n summary: `Suite not found: ${request.suite}`,\n run_id: 'error',\n };\n }\n\n // Load tasks from suite\n const tasks: Task[] = [];\n for (const taskId of suite.tasks) {\n const taskPath = findTaskFile(taskId);\n if (taskPath && existsSync(taskPath)) {\n const task = parseYaml(readFileSync(taskPath, 'utf-8')) as Task;\n tasks.push(task);\n }\n }\n\n if (tasks.length === 0) {\n return {\n isc_row: request.isc_row,\n suite: request.suite,\n passed: false,\n score: 0,\n summary: `No tasks found in suite: ${request.suite}`,\n run_id: 'error',\n };\n }\n\n // Run each task and aggregate\n const results: EvalRun[] = [];\n let totalScore = 0;\n let passedTasks = 0;\n\n for (const task of tasks) {\n const runner = new TrialRunner({\n task,\n executor: async (t, trialNum) => {\n // For ALGORITHM integration, we use a simplified executor\n // that captures the current agent's work\n const transcript = createTranscript(t.id, `trial_${trialNum}`, {\n turns: [\n { role: 'system', content: t.description },\n { role: 'assistant', content: 'Task executed via ALGORITHM' },\n ],\n toolCalls: [],\n });\n\n return {\n output: 'Executed via ALGORITHM bridge',\n transcript,\n };\n },\n onTrialComplete: (trial) => {\n console.log(` Trial ${trial.trial_number}: ${trial.passed ? '✅ PASS' : '❌ FAIL'} (${trial.score.toFixed(2)})`);\n },\n });\n\n console.log(`Running task: ${task.id}`);\n const run = await runner.run();\n results.push(run);\n\n totalScore += run.mean_score;\n if (run.pass_rate >= (task.pass_threshold ?? 0.75)) {\n passedTasks++;\n }\n\n // Save run results\n saveRunResults(request.suite, run);\n }\n\n const overallScore = totalScore / tasks.length;\n const overallPassed = passedTasks === tasks.length ||\n overallScore >= (suite.pass_threshold ?? 
0.75);\n\n const summary = `${passedTasks}/${tasks.length} tasks passed, score: ${(overallScore * 100).toFixed(1)}%`;\n\n return {\n isc_row: request.isc_row,\n suite: request.suite,\n passed: overallPassed,\n score: overallScore,\n summary,\n run_id: results[0]?.id ?? 'aggregate',\n };\n}\n\n/**\n * Find task file by ID\n */\nfunction findTaskFile(taskId: string): string | null {\n const useCasesDir = join(EVALS_DIR, 'UseCases');\n const possiblePaths = [\n join(useCasesDir, `${taskId}.yaml`),\n join(useCasesDir, 'Regression', `${taskId}.yaml`),\n join(useCasesDir, 'Capability', `${taskId}.yaml`),\n ];\n\n for (const path of possiblePaths) {\n if (existsSync(path)) return path;\n }\n\n return null;\n}\n\n/**\n * Save run results\n */\nfunction saveRunResults(suiteName: string, run: EvalRun): void {\n const suiteResultsDir = join(RESULTS_DIR, suiteName);\n if (!existsSync(suiteResultsDir)) mkdirSync(suiteResultsDir, { recursive: true });\n\n const runDir = join(suiteResultsDir, run.id);\n if (!existsSync(runDir)) mkdirSync(runDir);\n\n writeFileSync(join(runDir, 'run.json'), JSON.stringify(run, null, 2));\n}\n\n/**\n * Format result for ISC update\n */\nexport function formatForISC(result: AlgorithmEvalResult): string {\n const icon = result.passed ? '✅' : '❌';\n return `${icon} Eval: ${result.summary}`;\n}\n\n/**\n * Update ISC row with eval result\n */\nexport async function updateISCWithResult(result: AlgorithmEvalResult): Promise\u003cvoid> {\n const status = result.passed ? 
'DONE' : 'BLOCKED';\n\n await $`bun run ~/.claude/skills/THEALGORITHM/Tools/ISCManager.ts update --row ${result.isc_row} --status ${status} --note \"${formatForISC(result)}\"`.quiet();\n}\n\n// CLI interface\nif (import.meta.main) {\n const { values } = parseArgs({\n args: Bun.argv.slice(2),\n options: {\n suite: { type: 'string', short: 's' },\n 'isc-row': { type: 'string', short: 'r' },\n 'update-isc': { type: 'boolean', short: 'u' },\n 'show-saturation': { type: 'boolean' },\n help: { type: 'boolean', short: 'h' },\n },\n allowPositionals: true,\n });\n\n if (values.help || !values.suite) {\n console.log(`\nAlgorithmBridge - Connect Evals to THE ALGORITHM\n\nUsage:\n bun run AlgorithmBridge.ts -s \u003csuite> [-r row] [-u]\n\nOptions:\n -s, --suite Eval suite to run\n -r, --isc-row ISC row number (for result binding)\n -u, --update-isc Automatically update ISC with result\n --show-saturation Show suite saturation status\n -h, --help Show this help\n\nExamples:\n # Run suite and show results\n bun run AlgorithmBridge.ts -s regression-core\n\n # Run and update ISC row 3\n bun run AlgorithmBridge.ts -s regression-core -r 3 -u\n\n # Check saturation status\n bun run AlgorithmBridge.ts -s capability-auth --show-saturation\n`);\n process.exit(0);\n }\n\n if (values['show-saturation']) {\n const status = checkSaturation(values.suite!);\n console.log(`\\nSaturation Status: ${values.suite}\\n`);\n console.log(` Saturated: ${status.saturated ? '⚠️ Yes' : '✅ No'}`);\n console.log(` Consecutive above threshold: ${status.consecutive_above_threshold}/3`);\n console.log(` Recommendation: ${status.recommended_action}`);\n process.exit(0);\n }\n\n const request: AlgorithmEvalRequest = {\n isc_row: values['isc-row'] ? 
parseInt(values['isc-row']) : 0,\n suite: values.suite!,\n };\n\n console.log(`\\nRunning eval suite: ${request.suite}\\n`);\n\n const result = await runEvalForAlgorithm(request);\n\n console.log(`\\n${'='.repeat(50)}`);\n console.log(`\\n📊 EVAL RESULT: ${result.passed ? '✅ PASSED' : '❌ FAILED'}`);\n console.log(` Suite: ${result.suite}`);\n console.log(` Score: ${(result.score * 100).toFixed(1)}%`);\n console.log(` Summary: ${result.summary}`);\n console.log(` Run ID: ${result.run_id}`);\n\n if (values['update-isc'] && request.isc_row > 0) {\n await updateISCWithResult(result);\n console.log(`\\n Updated ISC row ${request.isc_row}`);\n }\n\n process.exit(result.passed ? 0 : 1);\n}\n","content_type":"text/typescript; charset=utf-8","language":"typescript","size":7010,"content_sha256":"4bde2dd7a5c731ecfc84a1d4652b5dc09dd252dca75dd51b50de44ba7676a06a"},{"filename":"Tools/FailureToTask.ts","content":"#!/usr/bin/env bun\n/**\n * Failure to Task Converter\n * Convert real failures into evaluation test cases\n * Per Anthropic: \"20-50 simple tasks drawn from real failures is a great start\"\n */\n\nimport type { FailureLog, Task, GraderConfig, EvalDomain } from '../Types/index.ts';\nimport { existsSync, mkdirSync, writeFileSync, readFileSync, appendFileSync } from 'fs';\nimport { join } from 'path';\nimport { stringify as stringifyYaml } from 'yaml';\nimport { parseArgs } from 'util';\n\nconst EVALS_DIR = join(import.meta.dir, '..');\nconst FAILURES_LOG = join(EVALS_DIR, 'Data', 'failures.jsonl');\nconst TASKS_DIR = join(EVALS_DIR, 'UseCases');\n\n/**\n * Ensure directories exist\n */\nfunction ensureDirs(): void {\n const dataDir = join(EVALS_DIR, 'Data');\n if (!existsSync(dataDir)) mkdirSync(dataDir, { recursive: true });\n if (!existsSync(TASKS_DIR)) mkdirSync(TASKS_DIR, { recursive: true });\n}\n\n/**\n * Log a failure for later conversion\n */\nexport function logFailure(failure: Omit\u003cFailureLog, 'id' | 'timestamp'>): FailureLog {\n ensureDirs();\n\n const 
log: FailureLog = {\n ...failure,\n id: `failure_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,\n timestamp: new Date().toISOString(),\n };\n\n appendFileSync(FAILURES_LOG, JSON.stringify(log) + '\\n');\n\n return log;\n}\n\n/**\n * Load all failures\n */\nexport function loadFailures(): FailureLog[] {\n if (!existsSync(FAILURES_LOG)) return [];\n\n const content = readFileSync(FAILURES_LOG, 'utf-8');\n return content\n .trim()\n .split('\\n')\n .filter(Boolean)\n .map(line => JSON.parse(line) as FailureLog);\n}\n\n/**\n * Load unconverted failures\n */\nexport function loadUnconvertedFailures(): FailureLog[] {\n return loadFailures().filter(f => !f.converted_to_task);\n}\n\n/**\n * Infer domain from failure category\n */\nfunction inferDomain(category: string): EvalDomain {\n const domainMap: Record\u003cstring, EvalDomain> = {\n file_targeting: 'coding',\n wrong_file: 'coding',\n partial_edit: 'coding',\n missing_test: 'coding',\n code_quality: 'coding',\n over_engineering: 'coding',\n tool_sequence: 'coding',\n conversation_flow: 'conversational',\n tone: 'conversational',\n empathy: 'conversational',\n research_accuracy: 'research',\n source_quality: 'research',\n hallucination: 'research',\n gui_interaction: 'computer_use',\n screenshot: 'computer_use',\n };\n\n return domainMap[category.toLowerCase()] ?? 
'general';\n}\n\n/**\n * Infer graders from failure category\n */\nfunction inferGraders(category: string, failure: FailureLog): GraderConfig[] {\n const graders: GraderConfig[] = [];\n\n // Always add tool call verification for coding/agent failures\n if (['file_targeting', 'wrong_file', 'tool_sequence'].includes(category)) {\n graders.push({\n type: 'tool_calls',\n weight: 0.3,\n required: true,\n params: {\n required: [{ tool: 'read_file' }, { tool: 'edit_file' }],\n sequence: ['read_file', 'edit_file'],\n },\n });\n }\n\n // Add state check for outcome verification\n if (failure.expected_behavior) {\n graders.push({\n type: 'state_check',\n weight: 0.3,\n params: {\n check_files: [{ path: '.', contains: [failure.expected_behavior.slice(0, 50)] }],\n },\n });\n }\n\n // Add LLM rubric for quality assessment\n graders.push({\n type: 'llm_rubric',\n weight: 0.4,\n params: {\n rubric: `The agent should: ${failure.expected_behavior ?? 'complete the task correctly'}\n\nThe agent should NOT: ${failure.actual_behavior ?? 
'fail the task'}\n\nEvaluate if the agent avoided the failure mode described.`,\n reasoning_first: true,\n scale: '1-5',\n },\n });\n\n return graders;\n}\n\n/**\n * Convert a failure to a task\n */\nexport function convertFailureToTask(failure: FailureLog): Task {\n const domain = inferDomain(failure.category);\n const graders = inferGraders(failure.category, failure);\n\n const task: Task = {\n id: `task_${failure.category}_${Date.now()}`,\n description: failure.description,\n type: 'regression', // Failures become regression tests\n domain,\n graders,\n tracked_metrics: [\n { type: 'transcript', metrics: ['n_turns', 'n_toolcalls'] },\n ],\n trials: 1,\n pass_threshold: 0.75,\n tags: [failure.category, failure.severity, 'from_failure'],\n source: 'failure_log',\n created_at: new Date().toISOString(),\n };\n\n // Add setup if we have context\n if (failure.task_context) {\n task.setup = {\n working_dir: '.',\n };\n }\n\n return task;\n}\n\n/**\n * Save a task to the filesystem\n */\nexport function saveTask(task: Task, suiteName?: string): string {\n ensureDirs();\n\n const suiteDir = suiteName\n ? join(TASKS_DIR, suiteName)\n : join(TASKS_DIR, 'Regression', task.domain ?? 'general');\n\n if (!existsSync(suiteDir)) mkdirSync(suiteDir, { recursive: true });\n\n const taskPath = join(suiteDir, `${task.id}.yaml`);\n writeFileSync(taskPath, stringifyYaml(task));\n\n return taskPath;\n}\n\n/**\n * Mark a failure as converted\n */\nexport function markConverted(failureId: string, taskId: string): void {\n const failures = loadFailures();\n const updated = failures.map(f =>\n f.id === failureId ? 
{ ...f, converted_to_task: taskId } : f\n );\n\n writeFileSync(FAILURES_LOG, updated.map(f => JSON.stringify(f)).join('\\n') + '\\n');\n}\n\n/**\n * Convert all unconverted failures\n */\nexport function convertAllFailures(suiteName?: string): Task[] {\n const failures = loadUnconvertedFailures();\n const tasks: Task[] = [];\n\n for (const failure of failures) {\n const task = convertFailureToTask(failure);\n const path = saveTask(task, suiteName);\n markConverted(failure.id, task.id);\n tasks.push(task);\n console.log(`Converted: ${failure.description.slice(0, 50)}... → ${path}`);\n }\n\n return tasks;\n}\n\n/**\n * Format failure for display\n */\nfunction formatFailure(failure: FailureLog): string {\n const severityIcon = {\n low: '🟡',\n medium: '🟠',\n high: '🔴',\n critical: '💥',\n }[failure.severity];\n\n const converted = failure.converted_to_task ? '✅' : '⏳';\n\n return `${converted} ${severityIcon} [${failure.category}] ${failure.description.slice(0, 60)}...`;\n}\n\n// CLI interface\nif (import.meta.main) {\n const { values, positionals } = parseArgs({\n args: Bun.argv.slice(2),\n options: {\n category: { type: 'string', short: 'c' },\n severity: { type: 'string', short: 's', default: 'medium' },\n expected: { type: 'string', short: 'e' },\n actual: { type: 'string', short: 'a' },\n suite: { type: 'string' },\n help: { type: 'boolean', short: 'h' },\n },\n allowPositionals: true,\n });\n\n const [command, ...args] = positionals;\n\n if (values.help || !command) {\n console.log(`\nFailureToTask - Convert failures into evaluation tasks\n\nPer Anthropic: \"20-50 simple tasks drawn from real failures is a great start\"\n\nCommands:\n log \u003cdescription> Log a new failure\n list List all failures\n list-unconverted List failures not yet converted\n convert \u003cid> Convert a specific failure to task\n convert-all Convert all unconverted failures\n stats Show failure statistics\n\nOptions:\n -c, --category Failure category (file_targeting, tool_sequence, 
etc.)\n -s, --severity Severity: low, medium, high, critical (default: medium)\n -e, --expected Expected behavior\n -a, --actual Actual behavior\n --suite Suite name for converted tasks\n -h, --help Show this help\n\nExamples:\n bun run FailureToTask.ts log \"Agent edited wrong file\" -c file_targeting -s high\n bun run FailureToTask.ts log \"Agent skipped tests before commit\" -c tool_sequence -e \"Run tests first\"\n bun run FailureToTask.ts list\n bun run FailureToTask.ts convert-all --suite regression-core\n`);\n process.exit(0);\n }\n\n switch (command) {\n case 'log': {\n if (!args[0]) {\n console.error('Usage: log \u003cdescription> -c category');\n process.exit(1);\n }\n const failure = logFailure({\n description: args.join(' '),\n category: values.category ?? 'unknown',\n severity: values.severity as 'low' | 'medium' | 'high' | 'critical',\n expected_behavior: values.expected,\n actual_behavior: values.actual,\n });\n console.log(`Logged failure: ${failure.id}`);\n break;\n }\n\n case 'list': {\n const failures = loadFailures();\n console.log(`\\n${failures.length} Failures:\\n`);\n for (const failure of failures) {\n console.log(' ' + formatFailure(failure));\n }\n break;\n }\n\n case 'list-unconverted': {\n const failures = loadUnconvertedFailures();\n console.log(`\\n${failures.length} Unconverted Failures:\\n`);\n for (const failure of failures) {\n console.log(' ' + formatFailure(failure));\n }\n break;\n }\n\n case 'convert': {\n if (!args[0]) {\n console.error('Usage: convert \u003cfailure-id>');\n process.exit(1);\n }\n const failures = loadFailures();\n const failure = failures.find(f => f.id === args[0]);\n if (!failure) {\n console.error(`Failure not found: ${args[0]}`);\n process.exit(1);\n }\n const task = convertFailureToTask(failure);\n const path = saveTask(task, values.suite);\n markConverted(failure.id, task.id);\n console.log(`Converted to: ${path}`);\n break;\n }\n\n case 'convert-all': {\n const tasks = 
convertAllFailures(values.suite);\n console.log(`\\nConverted ${tasks.length} failures to tasks`);\n break;\n }\n\n case 'stats': {\n const failures = loadFailures();\n const categories: Record\u003cstring, number> = {};\n const severities: Record\u003cstring, number> = {};\n let converted = 0;\n\n for (const f of failures) {\n categories[f.category] = (categories[f.category] ?? 0) + 1;\n severities[f.severity] = (severities[f.severity] ?? 0) + 1;\n if (f.converted_to_task) converted++;\n }\n\n console.log(`\\nFailure Statistics:\\n`);\n console.log(` Total: ${failures.length}`);\n console.log(` Converted: ${converted}`);\n console.log(` Pending: ${failures.length - converted}`);\n console.log(`\\n By Category:`);\n for (const [cat, count] of Object.entries(categories).sort((a, b) => b[1] - a[1])) {\n console.log(` ${cat}: ${count}`);\n }\n console.log(`\\n By Severity:`);\n for (const sev of ['critical', 'high', 'medium', 'low']) {\n if (severities[sev]) console.log(` ${sev}: ${severities[sev]}`);\n }\n break;\n }\n\n default:\n console.error(`Unknown command: ${command}`);\n process.exit(1);\n }\n}\n","content_type":"text/typescript; charset=utf-8","language":"typescript","size":10639,"content_sha256":"50fa8e271d7390713751f404665c7d8dca7191a6096108cc83a89e1d589cbcda"},{"filename":"Tools/SuiteManager.ts","content":"#!/usr/bin/env bun\n/**\n * Eval Suite Manager\n * Manage capability vs regression suites with saturation monitoring\n */\n\nimport type { EvalSuite, EvalType, SaturationStatus, EvalRun, Task } from '../Types/index.ts';\nimport { existsSync, mkdirSync, readdirSync, writeFileSync, readFileSync } from 'fs';\nimport { join, basename } from 'path';\nimport { parse as parseYaml, stringify as stringifyYaml } from 'yaml';\nimport { parseArgs } from 'util';\n\nconst EVALS_DIR = join(import.meta.dir, '..');\nconst SUITES_DIR = join(EVALS_DIR, 'Suites');\nconst RESULTS_DIR = join(EVALS_DIR, 'Results');\n\n/**\n * Ensure directories exist\n */\nfunction 
ensureDirs(): void {\n if (!existsSync(SUITES_DIR)) mkdirSync(SUITES_DIR, { recursive: true });\n if (!existsSync(join(SUITES_DIR, 'Capability'))) mkdirSync(join(SUITES_DIR, 'Capability'));\n if (!existsSync(join(SUITES_DIR, 'Regression'))) mkdirSync(join(SUITES_DIR, 'Regression'));\n if (!existsSync(RESULTS_DIR)) mkdirSync(RESULTS_DIR, { recursive: true });\n}\n\n/**\n * Create a new eval suite\n */\nexport function createSuite(\n name: string,\n type: EvalType,\n description: string,\n options?: {\n domain?: string;\n pass_threshold?: number;\n saturation_threshold?: number;\n tasks?: string[];\n }\n): EvalSuite {\n ensureDirs();\n\n const suite: EvalSuite = {\n name,\n description,\n type,\n domain: options?.domain as any,\n tasks: options?.tasks ?? [],\n pass_threshold: options?.pass_threshold ?? (type === 'regression' ? 0.95 : 0.70),\n saturation_threshold: options?.saturation_threshold ?? 0.95,\n created_at: new Date().toISOString(),\n };\n\n const dir = type === 'capability' ? 'Capability' : 'Regression';\n const filePath = join(SUITES_DIR, dir, `${name}.yaml`);\n\n writeFileSync(filePath, stringifyYaml(suite));\n\n return suite;\n}\n\n/**\n * Load a suite by name\n */\nexport function loadSuite(name: string): EvalSuite | null {\n ensureDirs();\n\n // Check both directories\n for (const dir of ['Capability', 'Regression']) {\n const filePath = join(SUITES_DIR, dir, `${name}.yaml`);\n if (existsSync(filePath)) {\n return parseYaml(readFileSync(filePath, 'utf-8')) as EvalSuite;\n }\n }\n\n return null;\n}\n\n/**\n * List all suites\n */\nexport function listSuites(type?: EvalType): EvalSuite[] {\n ensureDirs();\n\n const suites: EvalSuite[] = [];\n const dirs = type ? [type === 'capability' ? 
'Capability' : 'Regression'] : ['Capability', 'Regression'];\n\n for (const dir of dirs) {\n const dirPath = join(SUITES_DIR, dir);\n if (!existsSync(dirPath)) continue;\n\n for (const file of readdirSync(dirPath)) {\n if (file.endsWith('.yaml')) {\n const suite = parseYaml(readFileSync(join(dirPath, file), 'utf-8')) as EvalSuite;\n suites.push(suite);\n }\n }\n }\n\n return suites;\n}\n\n/**\n * Add a task to a suite\n */\nexport function addTaskToSuite(suiteName: string, taskId: string): boolean {\n const suite = loadSuite(suiteName);\n if (!suite) return false;\n\n if (!suite.tasks.includes(taskId)) {\n suite.tasks.push(taskId);\n suite.updated_at = new Date().toISOString();\n\n const dir = suite.type === 'capability' ? 'Capability' : 'Regression';\n const filePath = join(SUITES_DIR, dir, `${suiteName}.yaml`);\n writeFileSync(filePath, stringifyYaml(suite));\n }\n\n return true;\n}\n\n/**\n * Check saturation status for a suite\n */\nexport function checkSaturation(suiteName: string): SaturationStatus {\n const suite = loadSuite(suiteName);\n if (!suite) {\n throw new Error(`Suite not found: ${suiteName}`);\n }\n\n // Load recent results\n const suiteResultsDir = join(RESULTS_DIR, suiteName);\n const history: { date: string; rate: number }[] = [];\n\n if (existsSync(suiteResultsDir)) {\n const runDirs = readdirSync(suiteResultsDir)\n .filter(d => d.startsWith('run_'))\n .sort()\n .slice(-10); // Last 10 runs\n\n for (const runDir of runDirs) {\n const runPath = join(suiteResultsDir, runDir, 'run.json');\n if (existsSync(runPath)) {\n try {\n const run = JSON.parse(readFileSync(runPath, 'utf-8')) as EvalRun;\n history.push({\n date: run.completed_at ?? run.started_at,\n rate: run.pass_rate,\n });\n } catch {\n // Skip invalid runs\n }\n }\n }\n }\n\n // Calculate saturation\n const threshold = suite.saturation_threshold ?? 
0.95;\n const recentAboveThreshold = history.slice(-3).filter(h => h.rate >= threshold);\n const saturated = recentAboveThreshold.length >= 3;\n\n let recommendedAction: 'graduate_to_regression' | 'add_harder_cases' | 'keep';\n\n if (suite.type === 'capability' && saturated) {\n recommendedAction = 'graduate_to_regression';\n } else if (saturated) {\n recommendedAction = 'add_harder_cases';\n } else {\n recommendedAction = 'keep';\n }\n\n return {\n suite_id: suiteName,\n pass_rate_history: history,\n saturated,\n consecutive_above_threshold: recentAboveThreshold.length,\n recommended_action: recommendedAction,\n };\n}\n\n/**\n * Graduate a suite from capability to regression\n */\nexport function graduateSuite(suiteName: string): boolean {\n const suite = loadSuite(suiteName);\n if (!suite || suite.type !== 'capability') {\n return false;\n }\n\n // Update type\n suite.type = 'regression';\n suite.pass_threshold = 0.95; // Higher threshold for regression\n suite.updated_at = new Date().toISOString();\n\n // Move file\n const oldPath = join(SUITES_DIR, 'Capability', `${suiteName}.yaml`);\n const newPath = join(SUITES_DIR, 'Regression', `${suiteName}.yaml`);\n\n writeFileSync(newPath, stringifyYaml(suite));\n if (existsSync(oldPath)) {\n const fs = require('fs');\n fs.unlinkSync(oldPath);\n }\n\n return true;\n}\n\n/**\n * Format suite summary for display\n */\nexport function formatSuiteSummary(suite: EvalSuite, saturation?: SaturationStatus): string {\n const lines: string[] = [];\n\n const typeIcon = suite.type === 'capability' ? '🎯' : '🔒';\n lines.push(`## ${typeIcon} ${suite.name}`);\n lines.push('');\n lines.push(`**Type:** ${suite.type}`);\n lines.push(`**Description:** ${suite.description}`);\n if (suite.domain) lines.push(`**Domain:** ${suite.domain}`);\n lines.push(`**Tasks:** ${suite.tasks.length}`);\n lines.push(`**Pass Threshold:** ${(suite.pass_threshold ?? 
0.75) * 100}%`);\n lines.push('');\n\n if (saturation) {\n lines.push('### Saturation Status');\n lines.push('');\n const satIcon = saturation.saturated ? '⚠️' : '✅';\n lines.push(`${satIcon} **Saturated:** ${saturation.saturated ? 'Yes' : 'No'}`);\n lines.push(`**Consecutive above ${(suite.saturation_threshold ?? 0.95) * 100}%:** ${saturation.consecutive_above_threshold}/3`);\n lines.push(`**Recommendation:** ${saturation.recommended_action.replace(/_/g, ' ')}`);\n\n if (saturation.pass_rate_history.length > 0) {\n lines.push('');\n lines.push('**Recent Pass Rates:**');\n for (const entry of saturation.pass_rate_history.slice(-5)) {\n const date = new Date(entry.date).toLocaleDateString();\n lines.push(`- ${date}: ${(entry.rate * 100).toFixed(1)}%`);\n }\n }\n }\n\n if (suite.tasks.length > 0) {\n lines.push('');\n lines.push('### Tasks');\n lines.push('');\n for (const task of suite.tasks) {\n lines.push(`- ${task}`);\n }\n }\n\n return lines.join('\\n');\n}\n\n// CLI interface\nif (import.meta.main) {\n const { values, positionals } = parseArgs({\n args: Bun.argv.slice(2),\n options: {\n type: { type: 'string', short: 't', default: 'capability' },\n description: { type: 'string', short: 'd' },\n domain: { type: 'string' },\n help: { type: 'boolean', short: 'h' },\n },\n allowPositionals: true,\n });\n\n const [command, ...args] = positionals;\n\n if (values.help || !command) {\n console.log(`\nSuiteManager - Manage evaluation suites\n\nCommands:\n create \u003cname> Create a new suite\n list [type] List all suites (optionally filter by type)\n show \u003cname> Show suite details with saturation status\n add-task \u003csuite> \u003ctask> Add a task to a suite\n check-saturation \u003cname> Check if suite is saturated\n graduate \u003cname> Graduate capability suite to regression\n\nOptions:\n -t, --type Suite type: capability or regression (default: capability)\n -d, --description Suite description\n --domain Suite domain (coding, conversational, research, 
computer_use)\n -h, --help Show this help\n\nExamples:\n bun run SuiteManager.ts create auth-security -t capability -d \"Authentication security tests\"\n bun run SuiteManager.ts list regression\n bun run SuiteManager.ts show auth-security\n bun run SuiteManager.ts add-task auth-security fix-auth-bypass\n bun run SuiteManager.ts check-saturation auth-security\n bun run SuiteManager.ts graduate auth-security\n`);\n process.exit(0);\n }\n\n switch (command) {\n case 'create': {\n if (!args[0] || !values.description) {\n console.error('Usage: create \u003cname> -d \"description\"');\n process.exit(1);\n }\n const suite = createSuite(\n args[0],\n values.type as EvalType,\n values.description,\n { domain: values.domain }\n );\n console.log(`Created suite: ${suite.name} (${suite.type})`);\n break;\n }\n\n case 'list': {\n const type = args[0] as EvalType | undefined;\n const suites = listSuites(type);\n console.log(`\\n${type ? type.charAt(0).toUpperCase() + type.slice(1) : 'All'} Suites:\\n`);\n for (const suite of suites) {\n const icon = suite.type === 'capability' ? 
'🎯' : '🔒';\n console.log(` ${icon} ${suite.name} (${suite.tasks.length} tasks)`);\n }\n break;\n }\n\n case 'show': {\n if (!args[0]) {\n console.error('Usage: show \u003cname>');\n process.exit(1);\n }\n const suite = loadSuite(args[0]);\n if (!suite) {\n console.error(`Suite not found: ${args[0]}`);\n process.exit(1);\n }\n const saturation = checkSaturation(args[0]);\n console.log('\\n' + formatSuiteSummary(suite, saturation));\n break;\n }\n\n case 'add-task': {\n if (!args[0] || !args[1]) {\n console.error('Usage: add-task \u003csuite> \u003ctask>');\n process.exit(1);\n }\n if (addTaskToSuite(args[0], args[1])) {\n console.log(`Added task ${args[1]} to suite ${args[0]}`);\n } else {\n console.error(`Failed to add task to suite`);\n process.exit(1);\n }\n break;\n }\n\n case 'check-saturation': {\n if (!args[0]) {\n console.error('Usage: check-saturation \u003cname>');\n process.exit(1);\n }\n const status = checkSaturation(args[0]);\n console.log(`\\nSaturation Status: ${args[0]}\\n`);\n console.log(` Saturated: ${status.saturated ? 
'⚠️ Yes' : '✅ No'}`);\n console.log(` Consecutive above threshold: ${status.consecutive_above_threshold}/3`);\n console.log(` Recommendation: ${status.recommended_action}`);\n break;\n }\n\n case 'graduate': {\n if (!args[0]) {\n console.error('Usage: graduate \u003cname>');\n process.exit(1);\n }\n if (graduateSuite(args[0])) {\n console.log(`Graduated suite ${args[0]} from capability to regression`);\n } else {\n console.error(`Failed to graduate suite (not found or not a capability suite)`);\n process.exit(1);\n }\n break;\n }\n\n default:\n console.error(`Unknown command: ${command}`);\n process.exit(1);\n }\n}\n","content_type":"text/typescript; charset=utf-8","language":"typescript","size":11417,"content_sha256":"ea2c3b31fb3eab6884f77b4297173063482bdc769cea50a75c6f9b84bb640b44"},{"filename":"Tools/TranscriptCapture.ts","content":"#!/usr/bin/env bun\n/**\n * Transcript Capture System\n * Captures full agent execution trajectories for evaluation\n */\n\nimport type { Transcript, Turn, ToolCall, TranscriptMetrics } from '../Types/index.ts';\n\nexport class TranscriptCapture {\n private taskId: string;\n private trialId: string;\n private turns: Turn[] = [];\n private toolCalls: ToolCall[] = [];\n private reasoningTraces: string[] = [];\n private startTime: number;\n private firstTokenTime?: number;\n private totalInputTokens = 0;\n private totalOutputTokens = 0;\n\n constructor(taskId: string, trialId: string) {\n this.taskId = taskId;\n this.trialId = trialId;\n this.startTime = Date.now();\n }\n\n /**\n * Add a conversation turn\n */\n addTurn(role: 'user' | 'assistant' | 'system' | 'tool', content: string, tokens?: number): void {\n if (role === 'assistant' && !this.firstTokenTime) {\n this.firstTokenTime = Date.now();\n }\n\n if (tokens) {\n if (role === 'user' || role === 'system') {\n this.totalInputTokens += tokens;\n } else {\n this.totalOutputTokens += tokens;\n }\n }\n\n this.turns.push({\n index: this.turns.length,\n role,\n content,\n timestamp: new 
Date().toISOString(),\n tokens,\n });\n }\n\n /**\n * Record a tool call\n */\n startToolCall(name: string, params: Record\u003cstring, unknown>): string {\n const id = `tc_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;\n this.toolCalls.push({\n id,\n name,\n params,\n started_at: new Date().toISOString(),\n });\n return id;\n }\n\n /**\n * Complete a tool call\n */\n completeToolCall(id: string, result?: unknown, error?: string): void {\n const call = this.toolCalls.find(tc => tc.id === id);\n if (call) {\n call.result = result;\n call.error = error;\n call.completed_at = new Date().toISOString();\n call.duration_ms = new Date(call.completed_at).getTime() - new Date(call.started_at).getTime();\n }\n }\n\n /**\n * Add reasoning trace (for agents that expose thinking)\n */\n addReasoningTrace(trace: string): void {\n this.reasoningTraces.push(trace);\n }\n\n /**\n * Finalize and return the transcript\n */\n finalize(finalOutcome?: unknown): Transcript {\n const completedAt = new Date().toISOString();\n const wallTime = Date.now() - this.startTime;\n\n const metrics: TranscriptMetrics = {\n n_turns: this.turns.length,\n n_tool_calls: this.toolCalls.length,\n total_tokens: this.totalInputTokens + this.totalOutputTokens,\n input_tokens: this.totalInputTokens,\n output_tokens: this.totalOutputTokens,\n wall_time_ms: wallTime,\n time_to_first_token_ms: this.firstTokenTime ? this.firstTokenTime - this.startTime : undefined,\n time_to_last_token_ms: wallTime,\n tokens_per_second: wallTime > 0 ? (this.totalOutputTokens / (wallTime / 1000)) : undefined,\n };\n\n return {\n task_id: this.taskId,\n trial_id: this.trialId,\n started_at: new Date(this.startTime).toISOString(),\n completed_at: completedAt,\n turns: this.turns,\n tool_calls: this.toolCalls,\n reasoning_traces: this.reasoningTraces.length > 0 ? 
this.reasoningTraces : undefined,\n final_outcome: finalOutcome,\n metrics,\n };\n }\n\n /**\n * Get current metrics (for monitoring)\n */\n getCurrentMetrics(): Partial\u003cTranscriptMetrics> {\n return {\n n_turns: this.turns.length,\n n_tool_calls: this.toolCalls.length,\n total_tokens: this.totalInputTokens + this.totalOutputTokens,\n wall_time_ms: Date.now() - this.startTime,\n };\n }\n}\n\n/**\n * Parse a Claude Code session transcript into our format\n */\nexport function parseClaudeCodeTranscript(\n sessionLog: string,\n taskId: string,\n trialId: string\n): Transcript {\n const capture = new TranscriptCapture(taskId, trialId);\n\n // Parse JSONL format (Claude Code history format)\n const lines = sessionLog.trim().split('\\n').filter(Boolean);\n\n for (const line of lines) {\n try {\n const entry = JSON.parse(line);\n\n // Handle different entry types\n if (entry.type === 'user') {\n capture.addTurn('user', entry.content, entry.tokens);\n } else if (entry.type === 'assistant') {\n capture.addTurn('assistant', entry.content, entry.tokens);\n\n // Extract tool calls from assistant messages\n if (entry.tool_calls) {\n for (const tc of entry.tool_calls) {\n const id = capture.startToolCall(tc.name, tc.params);\n if (tc.result !== undefined) {\n capture.completeToolCall(id, tc.result, tc.error);\n }\n }\n }\n } else if (entry.type === 'tool_result') {\n capture.addTurn('tool', entry.content);\n } else if (entry.type === 'thinking') {\n capture.addReasoningTrace(entry.content);\n }\n } catch {\n // Skip malformed lines\n }\n }\n\n return capture.finalize();\n}\n\n/**\n * Create a transcript from structured data\n */\nexport function createTranscript(\n taskId: string,\n trialId: string,\n data: {\n turns: { role: 'user' | 'assistant' | 'system' | 'tool'; content: string }[];\n toolCalls?: { name: string; params: Record\u003cstring, unknown>; result?: unknown }[];\n finalOutcome?: unknown;\n }\n): Transcript {\n const capture = new TranscriptCapture(taskId, 
trialId);\n\n for (const turn of data.turns) {\n capture.addTurn(turn.role, turn.content);\n }\n\n if (data.toolCalls) {\n for (const tc of data.toolCalls) {\n const id = capture.startToolCall(tc.name, tc.params);\n capture.completeToolCall(id, tc.result);\n }\n }\n\n return capture.finalize(data.finalOutcome);\n}\n\n// CLI for testing\nif (import.meta.main) {\n const [command, ...args] = Bun.argv.slice(2);\n\n if (command === 'parse' && args[0]) {\n const file = Bun.file(args[0]);\n const content = await file.text();\n const transcript = parseClaudeCodeTranscript(content, 'test-task', 'trial-1');\n console.log(JSON.stringify(transcript, null, 2));\n } else {\n console.log('Usage: TranscriptCapture.ts parse \u003csession-log-file>');\n }\n}\n","content_type":"text/typescript; charset=utf-8","language":"typescript","size":6064,"content_sha256":"ce25b46544daaa833ff31d0304257bc085202f5ed9f6adceecf883e6b908fa15"},{"filename":"Tools/TrialRunner.ts","content":"#!/usr/bin/env bun\n/**\n * Trial Runner\n * Execute multiple trials and calculate pass@k / pass^k metrics\n */\n\nimport type { Task, Trial, EvalRun, GraderResult, Transcript, GraderConfig } from '../Types/index.ts';\nimport { createGrader, runGraders, type GraderContext } from '../Graders/Base.ts';\nimport { TranscriptCapture } from './TranscriptCapture.ts';\nimport { parseArgs } from 'util';\n\n// Import graders to register them\nimport '../Graders/CodeBased/index.ts';\nimport '../Graders/ModelBased/index.ts';\n\nexport interface TrialRunnerConfig {\n task: Task;\n executor: (task: Task, trialNumber: number) => Promise\u003c{\n output: string;\n transcript: Transcript;\n outcome?: unknown;\n }>;\n onTrialComplete?: (trial: Trial) => void;\n}\n\nexport class TrialRunner {\n private config: TrialRunnerConfig;\n\n constructor(config: TrialRunnerConfig) {\n this.config = config;\n }\n\n /**\n * Run all trials for a task\n */\n async run(): Promise\u003cEvalRun> {\n const task = this.config.task;\n const nTrials = 
task.trials ?? 1;\n const trials: Trial[] = [];\n\n const runId = `run_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;\n const startTime = Date.now();\n\n // Create graders\n const graders = task.graders.map(config => createGrader(config));\n\n for (let i = 0; i \u003c nTrials; i++) {\n const trialId = `trial_${i + 1}`;\n const trialStart = Date.now();\n\n try {\n // Execute the task\n const execution = await this.config.executor(task, i + 1);\n\n // Create grader context\n const context: GraderContext = {\n task_id: task.id,\n trial_id: trialId,\n transcript: execution.transcript,\n output: execution.output,\n working_dir: task.setup?.working_dir,\n reference: task.reference_solution,\n };\n\n // Run graders\n const { results, aggregate_score, passed } = await runGraders(graders, context);\n\n const trial: Trial = {\n id: trialId,\n task_id: task.id,\n trial_number: i + 1,\n status: passed ? 'passed' : 'failed',\n started_at: new Date(trialStart).toISOString(),\n completed_at: new Date().toISOString(),\n transcript: execution.transcript,\n grader_results: results,\n score: aggregate_score,\n passed,\n };\n\n trials.push(trial);\n\n if (this.config.onTrialComplete) {\n this.config.onTrialComplete(trial);\n }\n } catch (e) {\n // Create failed trial\n const trial: Trial = {\n id: trialId,\n task_id: task.id,\n trial_number: i + 1,\n status: 'error',\n started_at: new Date(trialStart).toISOString(),\n completed_at: new Date().toISOString(),\n transcript: new TranscriptCapture(task.id, trialId).finalize(),\n grader_results: [],\n score: 0,\n passed: false,\n error: String(e),\n };\n\n trials.push(trial);\n\n if (this.config.onTrialComplete) {\n this.config.onTrialComplete(trial);\n }\n }\n }\n\n // Calculate aggregate metrics\n const passCount = trials.filter(t => t.passed).length;\n const scores = trials.map(t => t.score);\n const meanScore = scores.reduce((a, b) => a + b, 0) / scores.length;\n const variance = scores.reduce((sum, s) => sum + Math.pow(s - 
meanScore, 2), 0) / scores.length;\n const stdDev = Math.sqrt(variance);\n\n // Calculate pass@k and pass^k\n const passAtK = this.calculatePassAtK(trials);\n const passToK = this.calculatePassToK(trials);\n\n const evalRun: EvalRun = {\n id: runId,\n task_id: task.id,\n trials,\n n_trials: nTrials,\n pass_rate: passCount / nTrials,\n mean_score: meanScore,\n std_dev: stdDev,\n pass_at_k: passAtK,\n pass_to_k: passToK,\n started_at: new Date(startTime).toISOString(),\n completed_at: new Date().toISOString(),\n total_duration_ms: Date.now() - startTime,\n };\n\n return evalRun;\n }\n\n /**\n * pass@k: Probability of at least one success in k trials\n * This measures capability - can the agent ever succeed?\n *\n * Formula: 1 - (n-c choose k) / (n choose k)\n * where n = total trials, c = successful trials, k = trials considered\n *\n * For k = n (using all trials): 1 if any passed, 0 otherwise\n * Simplified: At least one trial passed\n */\n private calculatePassAtK(trials: Trial[]): number {\n const anyPassed = trials.some(t => t.passed);\n return anyPassed ? 1 : 0;\n }\n\n /**\n * pass^k: Probability all k trials succeed\n * This measures consistency/reliability\n *\n * For k = n (using all trials): All trials must pass\n * Simplified: pass_rate\n */\n private calculatePassToK(trials: Trial[]): number {\n const passCount = trials.filter(t => t.passed).length;\n return passCount / trials.length;\n }\n}\n\n/**\n * Calculate extended pass@k for different k values\n */\nexport function calculatePassAtKForK(trials: Trial[], k: number): number {\n const n = trials.length;\n const c = trials.filter(t => t.passed).length;\n\n if (k > n) return 0; // Can't calculate for k > n\n if (c === 0) return 0; // No successes\n if (c >= k) return 1; // Guaranteed at least one success in any k sample\n\n // Calculate: 1 - (n-c choose k) / (n choose k)\n // = 1 - [(n-c)! / (k! * (n-c-k)!)] / [n! / (k! * (n-k)!)]\n // = 1 - [(n-c)! * (n-k)!] / [(n-c-k)! 
* n!]\n\n let failProb = 1;\n for (let i = 0; i \u003c k; i++) {\n failProb *= (n - c - i) / (n - i);\n }\n\n return 1 - failProb;\n}\n\n/**\n * Format evaluation results for display\n */\nexport function formatEvalResults(run: EvalRun): string {\n const lines: string[] = [];\n\n lines.push(`## Evaluation Results: ${run.task_id}`);\n lines.push('');\n lines.push(`**Run ID:** ${run.id}`);\n lines.push(`**Duration:** ${(run.total_duration_ms / 1000).toFixed(2)}s`);\n lines.push('');\n lines.push('### Summary');\n lines.push('');\n lines.push(`| Metric | Value |`);\n lines.push(`|--------|-------|`);\n lines.push(`| Trials | ${run.n_trials} |`);\n lines.push(`| Pass Rate | ${(run.pass_rate * 100).toFixed(1)}% |`);\n lines.push(`| Mean Score | ${run.mean_score.toFixed(3)} |`);\n lines.push(`| Std Dev | ${run.std_dev.toFixed(3)} |`);\n lines.push(`| pass@k | ${(run.pass_at_k * 100).toFixed(1)}% |`);\n lines.push(`| pass^k | ${(run.pass_to_k * 100).toFixed(1)}% |`);\n lines.push('');\n\n lines.push('### Trial Results');\n lines.push('');\n lines.push(`| Trial | Status | Score | Duration |`);\n lines.push(`|-------|--------|-------|----------|`);\n\n for (const trial of run.trials) {\n const status = trial.passed ? '✅ PASS' : trial.status === 'error' ? '❌ ERROR' : '❌ FAIL';\n const duration = trial.transcript.metrics.wall_time_ms;\n lines.push(`| ${trial.trial_number} | ${status} | ${trial.score.toFixed(3)} | ${(duration / 1000).toFixed(2)}s |`);\n }\n\n // Show grader breakdown for first trial\n if (run.trials.length > 0 && run.trials[0].grader_results.length > 0) {\n lines.push('');\n lines.push('### Grader Breakdown (Trial 1)');\n lines.push('');\n lines.push(`| Grader | Score | Passed | Weight |`);\n lines.push(`|--------|-------|--------|--------|`);\n\n for (const result of run.trials[0].grader_results) {\n const passed = result.passed ? 
'✅' : '❌';\n lines.push(`| ${result.grader_type} | ${result.score.toFixed(3)} | ${passed} | ${result.weight} |`);\n }\n }\n\n return lines.join('\\n');\n}\n\n// CLI interface\nif (import.meta.main) {\n const { values } = parseArgs({\n args: Bun.argv.slice(2),\n options: {\n 'task-file': { type: 'string', short: 't' },\n trials: { type: 'string', short: 'n', default: '1' },\n help: { type: 'boolean', short: 'h' },\n },\n allowPositionals: true,\n });\n\n if (values.help || !values['task-file']) {\n console.log(`\nTrialRunner - Execute evaluations with multiple trials\n\nUsage:\n bun run TrialRunner.ts -t \u003ctask-file> [-n trials]\n\nOptions:\n -t, --task-file Path to task YAML file\n -n, --trials Number of trials (default: from task or 1)\n -h, --help Show this help\n\nExample:\n bun run TrialRunner.ts -t UseCases/coding/fix-auth/task.yaml -n 3\n`);\n process.exit(0);\n }\n\n console.log('Note: Full execution requires an agent executor to be configured.');\n console.log('This CLI is for testing the runner infrastructure.');\n}\n","content_type":"text/typescript; charset=utf-8","language":"typescript","size":8374,"content_sha256":"392b7cd5ee3247b1eefd26c6fecb5a86788e6c8e52ad4c1e01f49f12332311ea"},{"filename":"Types/index.ts","content":"/**\n * Evals Type System\n * Based on Anthropic's \"Demystifying Evals for AI Agents\" (Jan 2026)\n */\n\n// =============================================================================\n// TASK DEFINITION\n// =============================================================================\n\nexport type EvalDomain = 'coding' | 'conversational' | 'research' | 'computer_use' | 'general';\nexport type EvalType = 'capability' | 'regression';\nexport type TaskStatus = 'pending' | 'running' | 'passed' | 'failed' | 'error';\n\nexport interface Task {\n id: string;\n description: string;\n type: EvalType;\n domain: EvalDomain;\n\n // Environment setup\n setup?: {\n sandbox?: boolean;\n git_repo?: string;\n checkout?: string;\n working_dir?: 
string;\n env_vars?: Record\u003cstring, string>;\n timeout_ms?: number;\n };\n\n // Grader configuration\n graders: GraderConfig[];\n\n // Tracked metrics\n tracked_metrics?: MetricConfig[];\n\n // Trial configuration\n trials?: number; // Default: 1\n pass_threshold?: number; // Default: 0.75\n\n // Reference solution (proves solvability)\n reference_solution?: string;\n\n // Tags for filtering\n tags?: string[];\n\n // Metadata\n created_at?: string;\n updated_at?: string;\n source?: 'manual' | 'failure_log' | 'generated';\n}\n\n// =============================================================================\n// GRADER CONFIGURATION\n// =============================================================================\n\nexport type GraderType =\n // Code-based (fast, deterministic)\n | 'string_match'\n | 'regex_match'\n | 'binary_tests'\n | 'static_analysis'\n | 'state_check'\n | 'tool_calls'\n | 'json_schema'\n | 'outcome_verification'\n // Model-based (flexible, nuanced)\n | 'llm_rubric'\n | 'natural_language_assert'\n | 'pairwise_comparison'\n | 'reference_comparison'\n // Human (gold standard)\n | 'human_review'\n | 'spot_check';\n\nexport interface GraderConfig {\n type: GraderType;\n weight?: number; // Default: 1.0\n required?: boolean; // If true, task fails if this grader fails\n\n // Type-specific params\n params?: Record\u003cstring, unknown>;\n}\n\n// Code-based grader params\nexport interface StringMatchParams {\n patterns: string[];\n mode: 'all' | 'any';\n case_sensitive?: boolean;\n}\n\nexport interface RegexMatchParams {\n patterns: string[];\n mode: 'all' | 'any';\n flags?: string;\n}\n\nexport interface BinaryTestsParams {\n test_files: string[];\n test_command?: string; // Default: appropriate for language\n timeout_ms?: number;\n}\n\nexport interface StaticAnalysisParams {\n commands: string[]; // e.g., ['ruff', 'mypy', 'bandit']\n fail_on_warning?: boolean;\n}\n\nexport interface StateCheckParams {\n expect: Record\u003cstring, unknown>;\n 
check_files?: { path: string; contains?: string[]; not_contains?: string[] }[];\n check_env?: Record\u003cstring, string>;\n}\n\nexport interface ToolCallsParams {\n required?: { tool: string; params?: Record\u003cstring, unknown> }[];\n forbidden?: string[];\n sequence?: string[]; // Tools must be called in this order\n max_calls?: number;\n}\n\n// Model-based grader params\nexport interface LLMRubricParams {\n rubric: string; // Path to rubric file or inline content\n assertions?: string[];\n judge_model?: string;\n reasoning_first?: boolean;\n scale?: '1-5' | '1-10' | 'pass-fail';\n}\n\nexport interface NaturalLanguageAssertParams {\n assertions: string[];\n judge_model?: string;\n require_all?: boolean;\n}\n\nexport interface PairwiseComparisonParams {\n reference: string; // Path to reference output\n judge_model?: string;\n position_swap?: boolean;\n criteria?: string[];\n}\n\n// =============================================================================\n// TRANSCRIPT / TRAJECTORY\n// =============================================================================\n\nexport interface Transcript {\n task_id: string;\n trial_id: string;\n started_at: string;\n completed_at?: string;\n\n // Full conversation\n turns: Turn[];\n\n // Tool usage\n tool_calls: ToolCall[];\n\n // Reasoning traces (if agent exposes thinking)\n reasoning_traces?: string[];\n\n // Final state\n final_outcome?: unknown;\n\n // Computed metrics\n metrics: TranscriptMetrics;\n}\n\nexport interface Turn {\n index: number;\n role: 'user' | 'assistant' | 'system' | 'tool';\n content: string;\n tool_call?: ToolCall;\n timestamp: string;\n tokens?: number;\n}\n\nexport interface ToolCall {\n id: string;\n name: string;\n params: Record\u003cstring, unknown>;\n result?: unknown;\n error?: string;\n started_at: string;\n completed_at?: string;\n duration_ms?: number;\n}\n\nexport interface TranscriptMetrics {\n n_turns: number;\n n_tool_calls: number;\n total_tokens: number;\n input_tokens: 
number;\n output_tokens: number;\n wall_time_ms: number;\n time_to_first_token_ms?: number;\n time_to_last_token_ms?: number;\n tokens_per_second?: number;\n}\n\n// =============================================================================\n// TRIAL EXECUTION\n// =============================================================================\n\nexport interface Trial {\n id: string;\n task_id: string;\n trial_number: number;\n\n status: TaskStatus;\n started_at: string;\n completed_at?: string;\n\n // Full transcript\n transcript: Transcript;\n\n // Grader results\n grader_results: GraderResult[];\n\n // Aggregate score\n score: number;\n passed: boolean;\n\n // Error info if failed\n error?: string;\n}\n\nexport interface GraderResult {\n grader_type: GraderType;\n weight: number;\n\n score: number; // 0-1\n passed: boolean;\n\n // Detailed output\n reasoning?: string;\n details?: Record\u003cstring, unknown>;\n\n // Timing\n duration_ms: number;\n}\n\n// =============================================================================\n// EVALUATION RUN\n// =============================================================================\n\nexport interface EvalRun {\n id: string;\n task_id: string;\n\n // Configuration\n model?: string;\n prompt_version?: string;\n\n // Trials\n trials: Trial[];\n n_trials: number;\n\n // Aggregate metrics\n pass_rate: number;\n mean_score: number;\n std_dev: number;\n\n // pass@k: P(at least 1 success in k trials) - measures capability\n pass_at_k: number;\n\n // pass^k: P(all k trials succeed) - measures consistency\n pass_to_k: number;\n\n // Timing\n started_at: string;\n completed_at?: string;\n total_duration_ms: number;\n\n // Metadata\n metadata?: Record\u003cstring, unknown>;\n}\n\n// =============================================================================\n// METRIC CONFIGURATION\n// =============================================================================\n\nexport interface MetricConfig {\n type: 'transcript' | 
'latency' | 'custom';\n metrics: string[];\n}\n\n// =============================================================================\n// EVAL SUITE\n// =============================================================================\n\nexport interface EvalSuite {\n name: string;\n description: string;\n type: EvalType;\n domain?: EvalDomain;\n\n tasks: string[]; // Task IDs\n\n // Suite-level thresholds\n pass_threshold?: number;\n saturation_threshold?: number; // When to graduate to regression\n\n // Metadata\n created_at: string;\n updated_at?: string;\n}\n\n// =============================================================================\n// SATURATION MONITORING\n// =============================================================================\n\nexport interface SaturationStatus {\n suite_id: string;\n pass_rate_history: { date: string; rate: number }[];\n saturated: boolean;\n consecutive_above_threshold: number;\n recommended_action: 'graduate_to_regression' | 'add_harder_cases' | 'keep';\n}\n\n// =============================================================================\n// HUMAN REVIEW\n// =============================================================================\n\nexport interface HumanReview {\n id: string;\n trial_id: string;\n task_id: string;\n\n status: 'pending' | 'in_progress' | 'completed';\n\n // Review content\n reviewer?: string;\n score?: number;\n passed?: boolean;\n notes?: string;\n\n // Calibration\n model_score?: number; // What the model grader said\n agreement?: boolean; // Did human agree with model?\n\n created_at: string;\n completed_at?: string;\n}\n\n// =============================================================================\n// FAILURE LOG (for converting failures to tasks)\n// =============================================================================\n\nexport interface FailureLog {\n id: string;\n timestamp: string;\n\n description: string;\n category: string;\n severity: 'low' | 'medium' | 'high' | 'critical';\n\n // 
Context\n task_context?: string;\n expected_behavior?: string;\n actual_behavior?: string;\n\n // Transcript if available\n transcript?: Transcript;\n\n // Conversion status\n converted_to_task?: string; // Task ID if converted\n}\n\n// =============================================================================\n// ALGORITHM INTEGRATION\n// =============================================================================\n\nexport interface AlgorithmEvalRequest {\n isc_row: number;\n suite: string;\n verification_criteria?: string;\n}\n\nexport interface AlgorithmEvalResult {\n isc_row: number;\n suite: string;\n passed: boolean;\n score: number;\n summary: string;\n run_id: string;\n}\n","content_type":"text/typescript; charset=utf-8","language":"typescript","size":9235,"content_sha256":"551fcb554a2424600952263a32d8b73592bf74e9c81ac44ff987ba91dde256b5"},{"filename":"UseCases/Regression/task_file_targeting_basic.yaml","content":"# Task: File Targeting - Basic\n# Agent should edit the correct file when asked\n\nid: task_file_targeting_basic\ndescription: \"Agent correctly identifies and edits the specified file\"\ntype: regression\ndomain: coding\nsource: failure_log\n\ngraders:\n - type: tool_calls\n weight: 0.40\n required: true\n params:\n required:\n - tool: read_file\n - tool: edit_file\n sequence:\n - read_file\n - edit_file\n\n - type: llm_rubric\n weight: 0.30\n params:\n rubric: |\n Evaluate if the agent:\n 1. Read the CORRECT file (the one specified in the task)\n 2. Edited the CORRECT file (same as the one read)\n 3. 
Did NOT edit unrelated files\n\n Score 5 if all criteria met\n Score 3 if correct file edited but also touched others\n Score 1 if wrong file edited\n reasoning_first: true\n scale: \"1-5\"\n\n - type: natural_language_assert\n weight: 0.30\n params:\n assertions:\n - \"The agent read the file before editing it\"\n - \"The edit was made to the file specified in the request\"\n - \"No unrelated files were modified\"\n\ntrials: 1\npass_threshold: 0.75\n\ntags:\n - file_targeting\n - basic\n - regression\n\ncreated_at: \"2026-01-10\"\n","content_type":"application/yaml; charset=utf-8","language":"yaml","size":1238,"content_sha256":"383003d545c2b629401e764d78a3d52fa7a79a93fa2d856d99ae8acc6fbceb19"},{"filename":"UseCases/Regression/task_no_hallucinated_paths.yaml","content":"# Task: No Hallucinated Paths\n# Agent must not reference files that don't exist\n\nid: task_no_hallucinated_paths\ndescription: \"Agent does not reference or edit files that don't exist\"\ntype: regression\ndomain: coding\nsource: failure_log\n\ngraders:\n - type: tool_calls\n weight: 0.30\n params:\n # Check that read_file calls succeed (no errors)\n forbidden:\n - file_not_found_error\n\n - type: llm_rubric\n weight: 0.50\n params:\n rubric: |\n Evaluate if the agent avoids hallucinating file paths:\n 1. Did all file references correspond to files that actually exist?\n 2. Did the agent verify file existence before operating on them?\n 3. 
Did the agent use glob/ls to discover files rather than guessing?\n\n Score 5 if all paths were valid and verified\n Score 3 if some paths were guessed but corrected\n Score 1 if agent operated on non-existent files\n reasoning_first: true\n scale: \"1-5\"\n\n - type: natural_language_assert\n weight: 0.20\n params:\n assertions:\n - \"The agent did not attempt to read or edit files that don't exist\"\n - \"File paths referenced by the agent correspond to real files\"\n\ntrials: 1\npass_threshold: 0.75\n\ntags:\n - hallucination\n - file_paths\n - regression\n\ncreated_at: \"2026-01-10\"\n","content_type":"application/yaml; charset=utf-8","language":"yaml","size":1312,"content_sha256":"89be4f08282683ae7b5e863d203b3dbeb957463f092e39fb2602887c05556b00"},{"filename":"UseCases/Regression/task_tool_sequence_read_before_edit.yaml","content":"# Task: Tool Sequence - Read Before Edit\n# Agent must read files before editing them\n\nid: task_tool_sequence_read_before_edit\ndescription: \"Agent reads files before attempting to edit them\"\ntype: regression\ndomain: coding\nsource: failure_log\n\ngraders:\n - type: tool_calls\n weight: 0.50\n required: true\n params:\n sequence:\n - read_file\n - edit_file\n # For each edit, there should be a preceding read\n\n - type: llm_rubric\n weight: 0.30\n params:\n rubric: |\n Evaluate if the agent follows the \"read before edit\" principle:\n 1. Did the agent read the file BEFORE attempting to edit it?\n 2. Did the agent understand the file content before making changes?\n 3. 
Were edits informed by the file's actual content?\n\n Score 5 if read → understand → edit pattern is clear\n Score 3 if read happened but edit was made hastily\n Score 1 if edit attempted without reading\n reasoning_first: true\n scale: \"1-5\"\n\n - type: natural_language_assert\n weight: 0.20\n params:\n assertions:\n - \"A read_file call preceded any edit_file call\"\n - \"The agent demonstrated understanding of file content before editing\"\n\ntrials: 1\npass_threshold: 0.75\n\ntags:\n - tool_sequence\n - read_before_edit\n - regression\n\ncreated_at: \"2026-01-10\"\n","content_type":"application/yaml; charset=utf-8","language":"yaml","size":1338,"content_sha256":"4d4cea05d4a0124b9c5d39b56f227878bc864a02f2b7b1c319cab1b57ef1c6e7"},{"filename":"UseCases/Regression/task_verification_before_done.yaml","content":"# Task: Verification Before Done\n# Agent must verify work before claiming completion\n\nid: task_verification_before_done\ndescription: \"Agent verifies changes work before claiming task is complete\"\ntype: regression\ndomain: coding\nsource: failure_log\n\ngraders:\n - type: tool_calls\n weight: 0.40\n required: true\n params:\n required:\n - tool: run_tests\n # or: browser, curl, verify command\n # Verification must happen after edits\n\n - type: llm_rubric\n weight: 0.40\n params:\n rubric: |\n Evaluate if the agent verified their work:\n 1. Did the agent run tests or verification AFTER making changes?\n 2. Did the agent check that their changes actually work?\n 3. 
Did the agent address any failures before claiming done?\n\n Score 5 if verification was thorough and addressed issues\n Score 3 if verification was minimal but present\n Score 1 if agent claimed done without verification\n reasoning_first: true\n scale: \"1-5\"\n\n - type: natural_language_assert\n weight: 0.20\n params:\n assertions:\n - \"The agent ran tests or verification after making changes\"\n - \"The agent did not claim completion before verification passed\"\n\ntrials: 1\npass_threshold: 0.75\n\ntags:\n - verification\n - completion\n - regression\n\ncreated_at: \"2026-01-10\"\n","content_type":"application/yaml; charset=utf-8","language":"yaml","size":1348,"content_sha256":"9a3a4bc5bf02a144390bf39f42bfd496984ffd517a3b515b76eeb0d383ffa89b"},{"filename":"Workflows/CompareModels.md","content":"# CompareModels Workflow\n\nCompare multiple models on the same prompt to determine the best performer.\n\n## Voice Notification\n\n```bash\ncurl -s -X POST http://localhost:8888/notify \\\n -H \"Content-Type: application/json\" \\\n -d '{\"message\": \"Running the CompareModels workflow in the Evals skill to compare model performance\"}' \\\n > /dev/null 2>&1 &\n```\n\nRunning the **CompareModels** workflow in the **Evals** skill to compare model performance...\n\n---\n\n## Prerequisites\n\n- Existing use case with test cases and prompt\n- API access to all models being compared\n- Clear understanding of comparison criteria\n\n## Execution\n\n### Step 1: Identify Comparison\n\nAsk the user:\n1. Which use case?\n2. Which models to compare? (Claude, GPT-4, Gemini, etc.)\n3. What's the primary evaluation criterion?\n4. 
Are there cost/latency constraints?\n\n### Step 2: Update Use Case Config\n\nEnsure models are listed in `config.yaml`:\n\n```yaml\nmodels:\n - claude-3-5-sonnet-20241022\n - claude-3-5-haiku-20241022\n - gpt-4o\n - gpt-4o-mini\n - gemini-1.5-pro\n```\n\n### Step 3: Create Model Comparison Config\n\nCreate `~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/model-comparisons/\u003ccomparison-name>.yaml`:\n\n```yaml\nmodel_comparison:\n name: \"Claude vs GPT-4 vs Gemini\"\n hypothesis: |\n Testing which model produces the best summaries for newsletter content.\n Expect Claude to excel at style, GPT-4 at accuracy.\n\n prompt: \"prompts/v1.0.0.md\" # Same prompt for all models\n\n models:\n - id: \"claude-3-5-sonnet-20241022\"\n name: \"Claude 3.5 Sonnet\"\n provider: \"anthropic\"\n\n - id: \"gpt-4o\"\n name: \"GPT-4o\"\n provider: \"openai\"\n\n - id: \"gemini-1.5-pro\"\n name: \"Gemini 1.5 Pro\"\n provider: \"google\"\n\n # Test configuration\n test_cases: all\n\n # Evaluation settings\n judges:\n - name: \"Primary Judge\"\n model: \"claude-3-5-sonnet-20241022\" # Consider using different judge\n criteria:\n - accuracy\n - style\n - format\n\n settings:\n runs_per_model: 1\n temperature: 0.7\n max_tokens: 2000\n\n # Cost tracking\n track_costs: true\n track_latency: true\n```\n\n### Step 4: Run Model Comparison\n\n**Option A: CLI (Sequential)**\n\n```bash\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli-run.ts \\\n --use-case \u003cname> \\\n --models claude-3-5-sonnet-20241022,gpt-4o,gemini-1.5-pro\n```\n\n**Option B: CLI (Parallel)**\n\n```bash\n# Run each model in parallel for speed\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli-run.ts \\\n --use-case \u003cname> \\\n --model claude-3-5-sonnet-20241022 &\n\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli-run.ts \\\n --use-case \u003cname> \\\n --model gpt-4o &\n\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli-run.ts \\\n --use-case \u003cname> \\\n --model gemini-1.5-pro 
&\n\nwait\n```\n\n**Option C: Web UI**\n\n1. Open http://localhost:5173\n2. Select use case\n3. Enable multiple models\n4. Run evaluation\n5. View side-by-side results\n\n### Step 5: Collect Results\n\nResults stored in:\n- `Results/\u003cuse-case>/models/\u003crun-id>/`\n- `Results/\u003cuse-case>/models/\u003crun-id>/comparison.json`\n\n### Step 6: Generate Comparison Report\n\nUse Report template:\n\n```bash\nbun run ~/.claude/Templates/Tools/RenderTemplate.ts \\\n -t Evals/Report.hbs \\\n -d Results/\u003cuse-case>/models/\u003crun-id>/summary.yaml \\\n -o Results/\u003cuse-case>/models/\u003crun-id>/report.md\n```\n\n### Step 7: Analyze Results\n\n**Multi-Model Summary Table:**\n\n| Model | Pass Rate | Mean Score | Std Dev | Cost/1K | Latency |\n|-------|-----------|------------|---------|---------|---------|\n| Claude 3.5 Sonnet | 92% | 4.3 | 0.5 | $0.03 | 1.2s |\n| GPT-4o | 88% | 4.1 | 0.6 | $0.05 | 1.8s |\n| Gemini 1.5 Pro | 85% | 3.9 | 0.7 | $0.02 | 1.5s |\n| Claude 3.5 Haiku | 78% | 3.7 | 0.8 | $0.01 | 0.5s |\n\n**Per-Dimension Breakdown:**\n\n| Model | Accuracy | Style | Format | Speed |\n|-------|----------|-------|--------|-------|\n| Claude 3.5 Sonnet | 4.5 | 4.6 | 4.0 | 1.2s |\n| GPT-4o | 4.4 | 3.9 | 4.2 | 1.8s |\n| Gemini 1.5 Pro | 4.2 | 3.8 | 3.8 | 1.5s |\n\n### Step 8: Statistical Analysis\n\nFor each model pair, calculate:\n- **p-value**: Statistical significance of difference\n- **Effect size**: Magnitude of difference\n- **Confidence intervals**: Range of true performance\n\n```markdown\n### Pairwise Comparisons\n\n| Comparison | Winner | p-value | Significant? 
|\n|------------|--------|---------|--------------|\n| Claude vs GPT-4o | Claude | 0.04 | Yes |\n| Claude vs Gemini | Claude | 0.01 | Yes |\n| GPT-4o vs Gemini | GPT-4o | 0.12 | No |\n```\n\n### Step 9: Make Recommendation\n\nConsider trade-offs:\n\n| Factor | Weight | Best Model |\n|--------|--------|------------|\n| Quality | 50% | Claude 3.5 Sonnet |\n| Cost | 25% | Claude 3.5 Haiku |\n| Latency | 25% | Claude 3.5 Haiku |\n\n**Decision Matrix:**\n\n```markdown\n## Recommendation\n\n**Primary Use**: Claude 3.5 Sonnet\n- Best quality (4.3 mean score)\n- 92% pass rate\n- Acceptable cost ($0.03/1K tokens)\n\n**Budget Option**: Claude 3.5 Haiku\n- Good quality (3.7 mean score)\n- 78% pass rate\n- Lowest cost ($0.01/1K tokens)\n- Fastest (0.5s latency)\n\n**Fallback**: GPT-4o\n- Similar quality to Claude\n- Higher cost\n- Use when Claude unavailable\n```\n\n### Step 10: Document Results\n\nUpdate use case README:\n\n```markdown\n## Model Comparison History\n\n### Claude vs GPT-4 vs Gemini (2024-01-15)\n\n**Purpose**: Determine best model for newsletter summaries.\n\n**Results**:\n1. Claude 3.5 Sonnet - 92% pass rate, 4.3 mean score\n2. GPT-4o - 88% pass rate, 4.1 mean score\n3. Gemini 1.5 Pro - 85% pass rate, 3.9 mean score\n\n**Decision**:\n- Production: Claude 3.5 Sonnet\n- Budget fallback: Claude 3.5 Haiku\n```\n\n## Best Practices\n\n### Fair Comparison\n\n1. **Same prompt** for all models\n2. **Same temperature** (default 0.7)\n3. **Same max_tokens** limit\n4. **Multiple runs** to account for variance\n\n### Judge Selection\n\n**Problem**: Using Claude to judge Claude may be biased.\n\n**Solutions**:\n1. Use ensemble of judges (Claude + GPT-4)\n2. Average across different judge models\n3. 
Weight non-self-judgments higher\n\n### Cost-Quality Trade-offs\n\nUse this framework:\n\n| Scenario | Recommended |\n|----------|-------------|\n| Quality-critical, cost flexible | Best performing model |\n| Quality-critical, cost-sensitive | Best quality-per-dollar |\n| Latency-critical | Fastest with acceptable quality |\n| High volume | Cheapest with acceptable quality |\n\n### Model Selection Matrix\n\n| Use Case | Recommended Model | Why |\n|----------|-------------------|-----|\n| Newsletter summaries | Claude 3.5 Sonnet | Best style |\n| Data extraction | GPT-4o | Structured output strength |\n| Fast classification | Claude 3.5 Haiku | Speed + cost |\n| Complex reasoning | Claude 3.5 Sonnet | Reasoning quality |\n| Multimodal | GPT-4o or Gemini | Vision capabilities |\n\n## Output Template\n\n```markdown\n# Model Comparison Report: \u003cUse Case>\n\n## Executive Summary\n\n**Best Overall**: \u003cModel Name>\n**Best Value**: \u003cModel Name>\n**Fastest**: \u003cModel Name>\n\n## Detailed Results\n\n### Performance Metrics\n\n[Table of metrics]\n\n### Statistical Analysis\n\n[Pairwise comparisons with p-values]\n\n### Cost Analysis\n\n[Cost per 1K tokens, total run cost]\n\n### Latency Analysis\n\n[Average response time, p95 latency]\n\n## Recommendation\n\n[Final recommendation with rationale]\n\n## Raw Data\n\n[Link to full results JSON]\n```\n\n## Done\n\nModel comparison completed. Best model identified. 
Decision documented.\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":7285,"content_sha256":"c7d58715b48b42b12c232afb6dc45f4c6299c0111850d28c6e481e37e3d16abe"},{"filename":"Workflows/ComparePrompts.md","content":"# ComparePrompts Workflow\n\nA/B test two prompt versions to determine which performs better.\n\n**This workflow implements the Science Protocol for prompt experimentation.**\n\n## Voice Notification\n\n```bash\ncurl -s -X POST http://localhost:8888/notify \\\n -H \"Content-Type: application/json\" \\\n -d '{\"message\": \"Running the ComparePrompts workflow in the Evals skill to A/B test prompts\"}' \\\n > /dev/null 2>&1 &\n```\n\nRunning the **ComparePrompts** workflow in the **Evals** skill to A/B test prompts...\n\n---\n\n## Science Protocol Alignment\n\nBefore running any comparison, ensure you're following scientific rigor:\n\n### Pre-Commitment (BEFORE running):\n- [ ] Success criteria defined (what score/metric means \"better\"?)\n- [ ] Pass threshold locked (what difference is meaningful?)\n- [ ] Hypothesis is falsifiable (what result would DISPROVE it?)\n\n### Falsifiability Check:\nFor every hypothesis, answer:\n> *\"What result would prove that Variant B is NOT better than Variant A?\"*\n\n**Example:**\n- Hypothesis: \"v1.1.0 improves accuracy due to source verification instructions\"\n- Falsifiable if: \"v1.1.0 accuracy ≤ v1.0.0 accuracy, or difference \u003c 5%\"\n\nIf you cannot articulate what would disprove your hypothesis, **STOP** - you don't have a scientific hypothesis.\n\n### Consider Three Variants:\nA/B tests are good. 
A/B/C tests are often better.\n- Reduces confirmation bias toward \"the first alternative\"\n- Explores more of the solution space\n- Reveals if there's a different direction entirely\n\n---\n\n## Prerequisites\n\n- Existing use case with test cases\n- Two (or more) prompt versions to compare\n- Understanding of what \"better\" means for this use case\n- **Falsifiable hypothesis with pre-committed success threshold**\n\n## Execution\n\n### Step 1: Identify Comparison (Science: Goal + Hypothesize)\n\nAsk the user:\n1. Which use case?\n2. Which prompt versions? (consider 3+ variants)\n3. What's the hypothesis? (Why might one be better?)\n4. **What would DISPROVE this hypothesis?** ← Critical\n5. Which metrics matter most?\n6. What threshold defines \"significantly better\"?\n\n### Step 2: Validate Both Prompts Exist\n\n```bash\n# Check prompts exist\nls ~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/prompts/\n\n# Should see both versions:\n# v1.0.0.md\n# v1.1.0.md\n```\n\n### Step 3: Create Comparison Config\n\nCreate `~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/comparisons/\u003ccomparison-name>.yaml`:\n\n```yaml\ncomparison:\n name: \"v1.0.0 vs v1.1.0\"\n hypothesis: |\n v1.1.0 should produce more accurate summaries due to\n added context about source verification.\n\n variants:\n a:\n name: \"v1.0.0 (Baseline)\"\n description: \"Original prompt without source instructions\"\n prompt: \"prompts/v1.0.0.md\"\n b:\n name: \"v1.1.0 (Candidate)\"\n description: \"Added source verification instructions\"\n prompt: \"prompts/v1.1.0.md\"\n\n # Use all test cases, or specify subset\n test_cases: all # or [\"001-basic\", \"002-edge\", \"003-hard\"]\n\n # Judge configuration\n judges:\n - name: \"Accuracy Judge\"\n model: \"claude-3-5-sonnet-20241022\"\n focus: \"accuracy\"\n - name: \"Style Judge\"\n model: \"gpt-4o\"\n focus: \"style\"\n\n settings:\n position_swap: true # Mitigate position bias\n num_runs: 1 # Runs per test case\n confidence_level: 0.95 # 
For statistical significance\n model: \"claude-3-5-sonnet-20241022\" # Model to generate outputs\n```\n\n### Step 4: Run Comparison\n\n**Option A: Via CLI**\n\n```bash\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli-run.ts \\\n --use-case \u003cname> \\\n --compare prompts/v1.0.0.md prompts/v1.1.0.md \\\n --position-swap\n```\n\n**Option B: Via Web UI**\n\n1. Open http://localhost:5173\n2. Select use case\n3. Click \"Compare\" tab\n4. Select both prompt versions\n5. Enable position swapping\n6. Run comparison\n\n### Step 5: Position Swapping Protocol\n\nIf `position_swap: true`:\n\nFor each test case:\n1. **Run 1**: Variant A = \"Option 1\", Variant B = \"Option 2\"\n2. **Run 2**: Variant B = \"Option 1\", Variant A = \"Option 2\"\n3. Average scores to eliminate position bias\n\nThis addresses the known bias where LLMs favor the first option presented.\n\n### Step 6: Collect Results\n\nResults stored in:\n- `Results/\u003cuse-case>/comparisons/\u003ccomparison-name>/\u003crun-id>.json`\n\nResults structure:\n```json\n{\n \"comparison_name\": \"v1.0.0 vs v1.1.0\",\n \"run_id\": \"2024-01-15-143022\",\n \"variants\": {\n \"a\": { \"name\": \"v1.0.0\", \"wins\": 5, \"avg_score\": 4.2 },\n \"b\": { \"name\": \"v1.1.0\", \"wins\": 7, \"avg_score\": 4.5 }\n },\n \"per_test_case\": [...],\n \"statistical_significance\": {\n \"p_value\": 0.03,\n \"significant\": true,\n \"confidence_interval\": [0.15, 0.45]\n }\n}\n```\n\n### Step 7: Interpret Results\n\n**Report Format:**\n\n```markdown\n## A/B Test Results: v1.0.0 vs v1.1.0\n\n### Summary\n\n| Metric | v1.0.0 (A) | v1.1.0 (B) |\n|--------|------------|------------|\n| Win Rate | 42% | 58% |\n| Avg Score | 4.2 | 4.5 |\n| Std Dev | 0.8 | 0.6 |\n\n### Statistical Significance\n\n- **p-value**: 0.03\n- **Significant at 95%**: Yes\n- **Confidence Interval**: [0.15, 0.45]\n\n### Per-Dimension Breakdown\n\n| Dimension | A Wins | B Wins | Tie |\n|-----------|--------|--------|-----|\n| Accuracy | 3 | 7 | 2 |\n| Style | 
5 | 4 | 3 |\n| Format | 6 | 6 | 0 |\n\n### Conclusion\n\n**Winner**: v1.1.0 (Candidate)\n**Confidence**: High (p \u003c 0.05)\n**Recommendation**: Deploy v1.1.0 to production\n```\n\n### Step 8: Make Decision\n\nBased on results:\n\n| Outcome | Action |\n|---------|--------|\n| B significantly better | Deploy B, archive A |\n| A significantly better | Keep A, iterate on B |\n| No significant difference | Keep simpler prompt, or gather more data |\n| Mixed results (A wins some, B wins others) | Consider hybrid approach |\n\n### Step 9: Document Decision\n\nUpdate use case README with comparison results:\n\n```markdown\n## Comparison History\n\n### v1.0.0 vs v1.1.0 (2024-01-15)\n\n**Hypothesis**: v1.1.0 improves accuracy with source verification.\n\n**Result**: v1.1.0 significantly better (p=0.03)\n- Accuracy: +15%\n- Style: No change\n- Format: No change\n\n**Decision**: Deployed v1.1.0 as new baseline.\n```\n\n## Best Practices\n\n### Sample Size\n\n- **Minimum**: 10 test cases (statistically weak)\n- **Recommended**: 20-30 test cases (good power)\n- **Ideal**: 50+ test cases (high confidence)\n\n### Position Swapping\n\n**Always enable** for pairwise comparisons. 
Research shows LLMs have strong position bias (they prefer the first option).\n\n### Judge Selection\n\nUse a **different model** than the one generating outputs:\n- If testing Claude prompts → Use GPT-4o as judge\n- If testing GPT prompts → Use Claude as judge\n\nThis prevents self-serving bias.\n\n### Statistical Significance\n\n| p-value | Interpretation |\n|---------|----------------|\n| \u003c 0.01 | Strong evidence |\n| 0.01-0.05 | Moderate evidence |\n| 0.05-0.10 | Weak evidence |\n| > 0.10 | Not significant |\n\nDon't deploy based on weak evidence unless the improvement is large.\n\n## Common Patterns\n\n### Testing Instruction Changes\n\n```yaml\nhypothesis: \"More explicit formatting instructions improve structure\"\nvariants:\n a: { prompt: \"v1.0.0.md\" } # Implicit formatting\n b: { prompt: \"v1.1.0.md\" } # Explicit section headers\nfocus: \"format\"\n```\n\n### Testing Few-Shot Examples\n\n```yaml\nhypothesis: \"Adding 2 examples improves accuracy\"\nvariants:\n a: { prompt: \"v1.0.0.md\" } # Zero-shot\n b: { prompt: \"v1.1.0.md\" } # Two-shot\nfocus: \"accuracy\"\n```\n\n### Testing Persona/Role Changes\n\n```yaml\nhypothesis: \"Expert persona produces more detailed analysis\"\nvariants:\n a: { prompt: \"v1.0.0.md\" } # Generic assistant\n b: { prompt: \"v1.1.0.md\" } # Domain expert persona\nfocus: \"depth\"\n```\n\n## Render Comparison Template\n\nFor detailed comparison setup, use the Comparison template:\n\n```bash\nbun run ~/.claude/Templates/Tools/RenderTemplate.ts \\\n -t Evals/Comparison.hbs \\\n -d ~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/comparisons/\u003cname>.yaml \\\n -o ~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/comparisons/\u003cname>-setup.md \\\n --preview\n```\n\n## Paradigm Check (When Iterations Stall)\n\nIf you've run 3+ comparisons without meaningful improvement, STOP and ask:\n\n**Are we testing the right thing?**\n\n| Signal | Question to Ask |\n|--------|-----------------|\n| All variants score similarly | Is 
the metric actually measuring what matters? |\n| Scores are high but output feels wrong | Is there a dimension we're not measuring? |\n| Improvements don't compound | Is the base prompt fundamentally limited? |\n| Test cases all behave the same | Do we need more diverse/challenging cases? |\n\n**Paradigm Shift Indicators:**\n- The eval criteria might be wrong (measuring the wrong thing)\n- The test cases might be too easy or too homogeneous\n- The entire approach might need rethinking (different architecture)\n\nWhen stuck, invoke explicit Science workflow: `Science/Workflows/StructuredInvestigation.md`\n\nThis forces stepping back from the eval loop to question the frame itself.\n\n---\n\n## Done\n\nComparison completed. Results documented. Decision made.\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":8927,"content_sha256":"59904b56fb182be931a69d1e931040ed967ecbfeedcf197cc67b7d5b7519d5f1"},{"filename":"Workflows/CreateJudge.md","content":"# CreateJudge Workflow\n\nCreate a custom LLM-as-Judge using templates.\n\n## Voice Notification\n\n```bash\ncurl -s -X POST http://localhost:8888/notify \\\n -H \"Content-Type: application/json\" \\\n -d '{\"message\": \"Running the CreateJudge workflow in the Evals skill to create LLM judge\"}' \\\n > /dev/null 2>&1 &\n```\n\nRunning the **CreateJudge** workflow in the **Evals** skill to create LLM judge...\n\n---\n\n## Prerequisites\n\n- Use case exists or being created\n- Clear evaluation criteria defined\n- Understanding of what \"good\" looks like\n\n## Execution\n\n### Step 1: Gather Requirements\n\nAsk the user:\n1. What are you evaluating? (content type, task)\n2. What criteria matter? (accuracy, style, format, etc.)\n3. What scale? (1-5 recommended, binary for pass/fail)\n4. Should we require reasoning? 
(yes - 13% accuracy improvement)\n\n### Step 2: Create Judge Config\n\nCreate `~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/judge-config.yaml`:\n\n```yaml\njudge:\n name: \u003cDescriptive Name> Judge\n focus: \u003caccuracy | style | completeness | custom>\n scale:\n type: 1-5 # Recommended, or \"binary\"\n criteria:\n - name: \u003cCriterion 1>\n description: \u003cWhat this measures>\n weight: 0.4 # Weights should sum to 1.0\n - name: \u003cCriterion 2>\n description: \u003cWhat this measures>\n weight: 0.3\n - name: \u003cCriterion 3>\n description: \u003cWhat this measures>\n weight: 0.3\n reasoning_required: true # Always true for accuracy\n position_swap: false # True for A/B comparisons\ncontext:\n task_description: |\n \u003cDescribe the original task the output was meant to complete>\n golden_output: |\n \u003cOptional: Reference \"perfect\" output for comparison>\noutput:\n format: json # or \"structured\"\n```\n\n### Step 3: Render Judge Prompt\n\n```bash\nbun run ~/.claude/Templates/Tools/RenderTemplate.ts \\\n -t Evals/Judge.hbs \\\n -d ~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/judge-config.yaml \\\n -o ~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/judge-prompt.md \\\n --preview\n```\n\n### Step 4: Review Generated Prompt\n\nCheck the rendered `judge-prompt.md`:\n- Does it capture all criteria?\n- Is the scoring scale clear?\n- Does it require reasoning before scoring?\n- Is the output format specified?\n\n### Step 5: Integrate with Use Case\n\nUpdate `config.yaml` to use the custom judge:\n\n```yaml\ncriteria:\n ai_based:\n - scorer: \"custom-judge\"\n weight: 0.40\n params:\n prompt_file: \"judge-prompt.md\"\n judge_model: \"claude-3-5-sonnet-20241022\"\n```\n\n### Step 6: Test the Judge\n\nRun a single test case to verify:\n\n```bash\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli-run.ts \\\n --use-case \u003cname> \\\n --test-id \u003csingle-test> \\\n --verbose\n```\n\nReview:\n- Does the judge produce valid 
JSON?\n- Is the reasoning coherent?\n- Are scores in expected range?\n- Does it fail gracefully on edge cases?\n\n## Best Practices\n\n### Criteria Design\n\n- **3-5 criteria max**: More becomes hard to calibrate\n- **Clear, non-overlapping**: Each criterion measures something distinct\n- **Weighted by importance**: Sum to 1.0\n- **Specific indicators**: Describe what high/low scores look like\n\n### Reasoning First\n\nAlways require reasoning before scoring:\n```yaml\nreasoning_required: true\n```\n\nThis improves accuracy by 13%+ (research-backed).\n\n### Scale Selection\n\n| Scale | When to Use |\n|-------|-------------|\n| 1-5 | Most reliable, nuanced evaluation |\n| Binary | Simple pass/fail, threshold-based |\n| 1-3 | When finer gradations aren't meaningful |\n\nAvoid 0-100 scales (poor calibration).\n\n### Position Swapping (A/B Tests)\n\nFor comparisons, use position swapping:\n```yaml\nposition_swap: true\n```\n\nRun twice with swapped positions, average results.\n\n## Examples\n\n### Accuracy Judge\n\n```yaml\njudge:\n name: Factual Accuracy Judge\n focus: accuracy\n scale:\n type: 1-5\n criteria:\n - name: Factual Correctness\n description: All claims match source material\n weight: 0.5\n - name: Completeness\n description: Covers all key points from source\n weight: 0.3\n - name: No Hallucinations\n description: No invented or fabricated information\n weight: 0.2\n reasoning_required: true\n```\n\n### Style Judge\n\n```yaml\njudge:\n name: Voice Authenticity Judge\n focus: style\n scale:\n type: 1-5\n criteria:\n - name: Tone Match\n description: Matches target author's casual, conversational style\n weight: 0.4\n - name: Word Choice\n description: Uses vocabulary consistent with target voice\n weight: 0.3\n - name: Personality\n description: Captures author's unique perspective\n weight: 0.3\n reasoning_required: true\n```\n\n## Done\n\nCustom judge created and integrated. 
Run eval to test.\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":4654,"content_sha256":"9e86f663ae4f209677bf124d301811f627ad3c1a4a659f20937b6c54247059e2"},{"filename":"Workflows/CreateUseCase.md","content":"# CreateUseCase Workflow\n\nCreate a new evaluation use case with test cases and scoring criteria.\n\n## Voice Notification\n\n```bash\ncurl -s -X POST http://localhost:8888/notify \\\n -H \"Content-Type: application/json\" \\\n -d '{\"message\": \"Running the CreateUseCase workflow in the Evals skill to create eval use case\"}' \\\n > /dev/null 2>&1 &\n```\n\nRunning the **CreateUseCase** workflow in the **Evals** skill to create eval use case...\n\n---\n\n## Prerequisites\n\n- Clear understanding of what you're evaluating\n- Example inputs and expected outputs\n- Quality criteria defined\n\n## Execution\n\n### Step 1: Gather Requirements\n\nAsk the user:\n1. What is this use case evaluating? (prompt, model, task)\n2. What does \"good\" output look like?\n3. What specific criteria matter? (accuracy, format, style, etc.)\n4. 
Do you have example inputs and outputs?\n\n### Step 2: Create Use Case Directory\n\n```bash\nmkdir -p ~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/{test-cases,golden-outputs,prompts}\n```\n\n### Step 3: Create Config File\n\nCreate `~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/config.yaml`:\n\n```yaml\nname: \u003cuse_case_name>\ndescription: |\n \u003cWhat this use case evaluates and why>\n\nversion: \"1.0.0\"\n\n# What we're testing\ntarget:\n type: prompt # or \"model\", \"agent\"\n path: prompts/v1.0.0.md # relative path\n\n# Scoring criteria\ncriteria:\n deterministic:\n - scorer: \"sentence-counter\"\n weight: 0.10\n params:\n min: 2\n max: 5\n - scorer: \"format-validator\"\n weight: 0.10\n params:\n required_sections: [\"summary\", \"analysis\"]\n - scorer: \"voice-validator\"\n weight: 0.10\n params:\n forbidden_words: [\"unveils\", \"plummeted\", \"groundbreaking\"]\n check_contractions: true\n\n ai_based:\n - scorer: \"llm-judge-accuracy\"\n weight: 0.35\n params:\n judge_model: \"claude-3-5-sonnet-20241022\"\n reasoning_first: true\n scale: \"1-5\"\n - scorer: \"llm-judge-style\"\n weight: 0.35\n params:\n judge_model: \"claude-3-5-sonnet-20241022\"\n reasoning_first: true\n scale: \"1-5\"\n\n# Pass/fail threshold\npass_threshold: 0.75\n\n# Models to evaluate against\nmodels:\n - claude-3-5-sonnet-20241022\n - claude-3-5-haiku-20241022\n - gpt-4o\n```\n\n### Step 4: Create Initial Prompt Version\n\nCreate `~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/prompts/v1.0.0.md`:\n\n```markdown\n# \u003cTask Name> Prompt v1.0.0\n\n## System Context\n\n\u003cSystem prompt or context>\n\n## Task Instructions\n\n\u003cSpecific instructions for the task>\n\n## Output Format\n\n\u003cExpected output format specification>\n\n## Examples (Optional)\n\n\u003cFew-shot examples if applicable>\n```\n\n### Step 5: Create Test Cases\n\nCreate test cases in `~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/test-cases/`:\n\nEach test case is a 
YAML file:\n\n```yaml\n# test-cases/001-basic.yaml\nid: \"001-basic\"\nname: \"Basic functionality test\"\ndescription: \"Tests standard use case\"\npriority: high\n\ninput:\n content: |\n \u003cThe input content to test>\n variables:\n key: value\n\nexpected:\n format: \"structured\" # or \"freeform\"\n contains:\n - \"expected phrase 1\"\n - \"expected phrase 2\"\n excludes:\n - \"unwanted phrase\"\n length:\n min_words: 50\n max_words: 200\n\ngolden_output: \"../golden-outputs/001-basic.md\" # Optional reference\n```\n\n**Recommended Test Case Distribution:**\n- 2-3 **Easy** cases (standard inputs, clear expectations)\n- 3-4 **Medium** cases (typical edge cases)\n- 2-3 **Hard** cases (ambiguous inputs, tricky scenarios)\n\n### Step 6: Create Golden Outputs (Optional)\n\nIf you have reference \"perfect\" outputs, add them:\n\n```bash\n# golden-outputs/001-basic.md\n\u003cThe ideal output for test case 001>\n```\n\nGolden outputs serve as:\n- Reference for AI judges\n- Baseline for comparison\n- Documentation of expected behavior\n\n### Step 7: Create README\n\nCreate `~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/README.md`:\n\n```markdown\n# \u003cUse Case Name>\n\n## Purpose\n\n\u003cWhat this use case evaluates and why it matters>\n\n## Target\n\n\u003cWhat's being tested - prompt, model, agent>\n\n## Quality Criteria\n\n### Deterministic (30%)\n- **Sentence Count** (10%): 2-5 sentences per summary\n- **Format** (10%): Required sections present\n- **Voice** (10%): Matches target style\n\n### AI-Based (70%)\n- **Accuracy** (35%): Factual correctness\n- **Style** (35%): Voice authenticity\n\n## Test Cases\n\n| ID | Name | Priority | Description |\n|----|------|----------|-------------|\n| 001 | Basic | High | Standard input |\n| 002 | Edge | Medium | Edge case handling |\n| ... | ... | ... | ... 
|\n\n## Running Evaluations\n\n\\`\\`\\`bash\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli-run.ts --use-case \u003cname>\n\\`\\`\\`\n\n## Version History\n\n- v1.0.0: Initial version\n```\n\n### Step 8: Validate Use Case\n\n```bash\n# Check structure\nls -la ~/.claude/skills/Utilities/Evals/UseCases/\u003cname>/\n\n# Validate config\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli.ts use-case show \u003cname>\n```\n\n### Step 9: Run Initial Eval\n\n```bash\n# Run first evaluation to verify setup\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli-run.ts \\\n --use-case \u003cname> \\\n --test-id 001-basic \\\n --verbose\n```\n\nReview:\n- Does the scorer configuration work?\n- Are test cases properly formatted?\n- Do AI judges produce valid output?\n\n## Best Practices\n\n### Test Case Design\n\n1. **Cover the distribution**: Easy, medium, and hard cases\n2. **Include edge cases**: Empty inputs, very long inputs, malformed data\n3. **Version inputs**: Track which test cases apply to which prompt versions\n4. **Document failures**: When tests fail, understand why before fixing\n\n### Criteria Weights\n\n| Pattern | Deterministic | AI-Based |\n|---------|---------------|----------|\n| Format-critical | 60-70% | 30-40% |\n| Quality-critical | 30-40% | 60-70% |\n| Balanced | 50% | 50% |\n\n### Prompt Versioning\n\nUse semantic versioning:\n- **v1.0.0 → v1.0.1**: Bug fix, minor wording change\n- **v1.0.0 → v1.1.0**: New feature, added section\n- **v1.0.0 → v2.0.0**: Major rewrite, breaking changes\n\n## Directory Structure\n\n```\nUseCases/\u003cname>/\n├── config.yaml # Scoring configuration\n├── README.md # Documentation\n├── test-cases/ # Test case definitions\n│ ├── 001-basic.yaml\n│ ├── 002-edge.yaml\n│ └── ...\n├── golden-outputs/ # Reference outputs (optional)\n│ ├── 001-basic.md\n│ └── ...\n└── prompts/ # Versioned prompts\n ├── v1.0.0.md\n └── v1.1.0.md\n```\n\n## Done\n\nUse case created and validated. 
Ready to run evaluations.\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":6529,"content_sha256":"2daeadb2f5090f401519310303bc373e21a94124a27303d5b81ec7407d993de0"},{"filename":"Workflows/RunEval.md","content":"# RunEval Workflow\n\nRun evaluations for a specific use case.\n\n## Voice Notification\n\n```bash\ncurl -s -X POST http://localhost:8888/notify \\\n -H \"Content-Type: application/json\" \\\n -d '{\"message\": \"Running the RunEval workflow in the Evals skill to execute evaluation\"}' \\\n > /dev/null 2>&1 &\n```\n\nRunning the **RunEval** workflow in the **Evals** skill to execute evaluation...\n\n---\n\n## Prerequisites\n\n- Use case must exist in `UseCases/\u003cname>/`\n- Test cases defined in use case\n- Config.yaml with scoring criteria\n\n## Execution\n\n### Step 1: Validate Use Case\n\n```bash\n# Check use case exists\nls ~/.claude/skills/Utilities/Evals/UseCases/\u003cuse-case>/config.yaml\n```\n\nIf missing, redirect to `CreateUseCase.md` workflow.\n\n### Step 2: Check EvalServer Status\n\n```bash\n# Check if server is running\ncurl -s http://localhost:5173 > /dev/null 2>&1 && echo \"Running\" || echo \"Not running\"\n```\n\nIf not running, start it:\n```bash\ncd ~/.claude/skills/Utilities/Evals/EvalServer && bun run dev &\n```\n\n### Step 3: Run Evaluation\n\n**Option A: Web UI (Recommended)**\n1. Open http://localhost:5173\n2. Select use case from dropdown\n3. Choose model(s) to evaluate\n4. Click \"Run Evaluation\"\n5. 
Watch real-time streaming results\n\n**Option B: CLI**\n```bash\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli-run.ts \\\n --use-case \u003cname> \\\n --model claude-3-5-sonnet-20241022\n```\n\n### Step 4: Collect Results\n\nResults are stored in:\n- `Results/\u003cuse-case>/\u003crun-id>/results.json`\n- `EvalServer/storage/evals.db` (queryable)\n\n### Step 5: Report Summary\n\nUse structured response format:\n\n```markdown\n📋 SUMMARY: Evaluation completed for \u003cuse-case>\n\n📊 STATUS:\n| Metric | Value |\n|--------|-------|\n| Pass Rate | X% |\n| Mean Score | X.XX |\n| Failed Tests | X |\n\n📖 STORY EXPLANATION:\n1. Ran evaluation against \u003cN> test cases\n2. Deterministic scorers completed first\n3. AI judges evaluated accuracy and style\n4. Calculated weighted scores\n5. Compared against pass threshold\n6. \u003cKey finding 1>\n7. \u003cKey finding 2>\n8. \u003cRecommendation>\n\n🎯 COMPLETED: Evaluation finished with X% pass rate.\n```\n\n## Error Handling\n\n**If eval fails:**\n1. Check model API key is configured\n2. Verify test cases have valid inputs\n3. Check scorer configurations in config.yaml\n4. Review error logs in terminal\n\n## Done\n\nEvaluation complete. 
Results available in UI and files.\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":2339,"content_sha256":"f79611b6761f8d7a100a51cc6e31fc619b041c1c54faca66c5dc1c4c3ca52a03"},{"filename":"Workflows/ViewResults.md","content":"# ViewResults Workflow\n\nQuery and display evaluation results, generate reports, and track trends.\n\n## Voice Notification\n\n```bash\ncurl -s -X POST http://localhost:8888/notify \\\n -H \"Content-Type: application/json\" \\\n -d '{\"message\": \"Running the ViewResults workflow in the Evals skill to display eval results\"}' \\\n > /dev/null 2>&1 &\n```\n\nRunning the **ViewResults** workflow in the **Evals** skill to display eval results...\n\n---\n\n## Prerequisites\n\n- Evaluations have been run\n- Results exist in Results/ directory or SQLite database\n\n## Execution\n\n### Step 1: Identify Query\n\nAsk the user:\n1. Which use case?\n2. What time range? (latest, last week, specific run)\n3. What to show? (summary, details, comparison, trends)\n4. What format? 
(table, report, chart)\n\n### Step 2: Quick Status Check\n\n**Latest Results for Use Case:**\n\n```bash\n# Show most recent run\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli.ts results \\\n --use-case \u003cname> \\\n --latest\n```\n\n**All Recent Runs:**\n\n```bash\n# List last 10 runs\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli.ts results \\\n --use-case \u003cname> \\\n --limit 10\n```\n\n### Step 3: View Detailed Results\n\n**Single Run Details:**\n\n```bash\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli.ts results \\\n --run-id \u003crun-id> \\\n --verbose\n```\n\n**Per-Test-Case Breakdown:**\n\n```bash\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli.ts results \\\n --run-id \u003crun-id> \\\n --show-cases\n```\n\n### Step 4: Generate Report\n\n**Standard Report:**\n\n```bash\n# Generate markdown report\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli.ts report \\\n --run-id \u003crun-id> \\\n --output ~/.claude/skills/Utilities/Evals/Results/\u003cuse-case>/\u003crun-id>/report.md\n```\n\n**Using Report Template:**\n\n```bash\n# Render with template\nbun run ~/.claude/Templates/Tools/RenderTemplate.ts \\\n -t Evals/Report.hbs \\\n -d ~/.claude/skills/Utilities/Evals/Results/\u003cuse-case>/\u003crun-id>/results.yaml \\\n -o ~/.claude/skills/Utilities/Evals/Results/\u003cuse-case>/\u003crun-id>/report.md\n```\n\n### Step 5: Query Database\n\n**Direct SQLite Queries:**\n\n```bash\ncd ~/.claude/skills/Utilities/Evals/EvalServer\n\n# Recent runs by use case\nsqlite3 storage/evals.db \"\n SELECT run_id, model, pass_rate, mean_score, created_at\n FROM eval_runs\n WHERE use_case = '\u003cname>'\n ORDER BY created_at DESC\n LIMIT 10\n\"\n\n# Failed test cases\nsqlite3 storage/evals.db \"\n SELECT test_id, score, failure_reason\n FROM eval_results\n WHERE run_id = '\u003crun-id>' AND passed = 0\n\"\n\n# Score trends over time\nsqlite3 storage/evals.db \"\n SELECT date(created_at), avg(mean_score)\n FROM eval_runs\n 
WHERE use_case = '\u003cname>'\n GROUP BY date(created_at)\n ORDER BY created_at\n\"\n```\n\n### Step 6: Compare Runs\n\n**Two Runs Side-by-Side:**\n\n```bash\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli.ts compare \\\n --run-a \u003crun-id-1> \\\n --run-b \u003crun-id-2>\n```\n\n**Trend Analysis:**\n\n```bash\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli.ts trend \\\n --use-case \u003cname> \\\n --days 30\n```\n\n### Step 7: Report Summary\n\nUse structured response format:\n\n```markdown\n📋 SUMMARY: Evaluation results for \u003cuse-case>\n\n📊 STATUS:\n| Metric | Value |\n|--------|-------|\n| Run ID | \u003crun-id> |\n| Date | \u003cdate> |\n| Model | \u003cmodel> |\n| Pass Rate | X% |\n| Mean Score | X.XX |\n| Total Tests | N |\n| Passed | N |\n| Failed | N |\n\n📖 STORY EXPLANATION:\n1. Retrieved evaluation run from \u003cdate>\n2. \u003cN> test cases were evaluated\n3. Deterministic scorers ran first (format, length, voice)\n4. AI judges evaluated accuracy and style\n5. Weighted scores calculated\n6. \u003cPass rate>% passed the 0.75 threshold\n7. \u003cKey finding about top/bottom performers>\n8. 
\u003cRecommendation based on results>\n\n🎯 COMPLETED: Results retrieved for \u003cuse-case>, \u003cpass-rate>% pass rate.\n```\n\n## Query Patterns\n\n### By Time Range\n\n```bash\n# Last 24 hours\n--since \"24 hours ago\"\n\n# Last week\n--since \"7 days ago\"\n\n# Specific date range\n--from \"2024-01-01\" --to \"2024-01-15\"\n```\n\n### By Score Threshold\n\n```bash\n# Only failed runs\n--min-pass-rate 0 --max-pass-rate 0.74\n\n# Only excellent runs\n--min-pass-rate 0.90\n```\n\n### By Model\n\n```bash\n# Specific model\n--model claude-3-5-sonnet-20241022\n\n# Compare models\n--compare-models\n```\n\n### By Test Case\n\n```bash\n# Specific test\n--test-id 001-basic\n\n# All failures\n--failures-only\n```\n\n## Output Formats\n\n### Table (Default)\n\n```\n┌──────────┬────────────────────────────┬───────────┬────────────┐\n│ Run ID │ Model │ Pass Rate │ Mean Score │\n├──────────┼────────────────────────────┼───────────┼────────────┤\n│ abc123 │ claude-3-5-sonnet-20241022 │ 92% │ 4.3 │\n│ def456 │ gpt-4o │ 88% │ 4.1 │\n└──────────┴────────────────────────────┴───────────┴────────────┘\n```\n\n### JSON\n\n```bash\n--format json\n```\n\n```json\n{\n \"run_id\": \"abc123\",\n \"use_case\": \"newsletter_summaries\",\n \"model\": \"claude-3-5-sonnet-20241022\",\n \"summary\": {\n \"total_cases\": 12,\n \"passed\": 11,\n \"failed\": 1,\n \"pass_rate\": 0.917,\n \"mean_score\": 4.3,\n \"std_dev\": 0.5\n },\n \"per_test_case\": [...]\n}\n```\n\n### Markdown Report\n\n```bash\n--format markdown\n```\n\nUses Report.hbs template to generate full report.\n\n### CSV Export\n\n```bash\n--format csv --output results.csv\n```\n\nFor spreadsheet analysis.\n\n## Trend Analysis\n\n### Regression Detection\n\n```bash\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli.ts trend \\\n --use-case \u003cname> \\\n --detect-regression \\\n --threshold 0.10 # Alert if >10% drop\n```\n\n### Performance Over Time\n\n```\n📈 Trend: newsletter_summaries (last 30 days)\n\nDate | Pass Rate | 
Mean Score | Change\n-----------|-----------|------------|--------\n2024-01-15 | 92% | 4.3 | +5%\n2024-01-10 | 87% | 4.1 | -2%\n2024-01-05 | 89% | 4.2 | baseline\n\nTrend: ↑ Improving\nAlert: None\n```\n\n## Web UI Options\n\n### Dashboard View\n\n1. Open http://localhost:5173\n2. Select use case from sidebar\n3. View:\n - Latest run summary\n - Pass rate trend chart\n - Failing test cases\n - Model comparison\n\n### Run Details\n\n1. Click on specific run\n2. View:\n - Per-test-case scores\n - Judge reasoning\n - Output samples\n - Diff against baseline\n\n### Export Options\n\n- Download JSON\n- Export to CSV\n- Generate PDF report\n\n## Common Queries\n\n### \"How did the last eval go?\"\n\n```bash\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli.ts results \\\n --use-case \u003cname> \\\n --latest \\\n --summary\n```\n\n### \"Why did test X fail?\"\n\n```bash\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli.ts results \\\n --run-id \u003crun-id> \\\n --test-id \u003ctest-id> \\\n --verbose\n```\n\n### \"Is performance improving or declining?\"\n\n```bash\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli.ts trend \\\n --use-case \u003cname> \\\n --days 14\n```\n\n### \"Which model is best for this task?\"\n\n```bash\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli.ts compare \\\n --use-case \u003cname> \\\n --compare-models \\\n --recent\n```\n\n### \"Show me all failures this week\"\n\n```bash\nbun run ~/.claude/skills/Utilities/Evals/EvalServer/cli.ts results \\\n --use-case \u003cname> \\\n --since \"7 days ago\" \\\n --failures-only\n```\n\n## Done\n\nResults retrieved and reported. 
Use findings to guide prompt/model decisions.\n","content_type":"text/markdown; charset=utf-8","language":"markdown","size":7618,"content_sha256":"689764419b6fd2127dcb4cf7f1e88a8c1c6166515e2d6d8c234d30539a12b050"}],"content_json":{"type":"doc","content":[{"type":"heading","attrs":{"level":2},"content":[{"text":"Customization","type":"text"}]},{"type":"paragraph","content":[{"text":"Before executing, check for user customizations at:","type":"text","marks":[{"type":"strong"}]},{"text":" ","type":"text"},{"text":"~/.claude/PAI/USER/SKILLCUSTOMIZATIONS/Evals/","type":"text","marks":[{"type":"code_inline"}]}]},{"type":"paragraph","content":[{"text":"If this directory exists, load and apply any PREFERENCES.md, configurations, or resources found there. These override default behavior. If the directory does not exist, proceed with skill defaults.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"🚨 MANDATORY: Voice Notification (REQUIRED BEFORE ANY ACTION)","type":"text"}]},{"type":"paragraph","content":[{"text":"You MUST send this notification BEFORE doing anything else when this skill is invoked.","type":"text","marks":[{"type":"strong"}]}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Send voice notification","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"curl -s -X POST http://localhost:8888/notify \\\n -H \"Content-Type: application/json\" \\\n -d '{\"message\": \"Running the WORKFLOWNAME workflow in the Evals skill to ACTION\"}' \\\n > /dev/null 2>&1 &","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Output text notification","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"Running the 
**WorkflowName** workflow in the **Evals** skill to ACTION...","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"This is not optional. Execute this curl command immediately upon skill invocation.","type":"text","marks":[{"type":"strong"}]}]},{"type":"heading","attrs":{"level":1},"content":[{"text":"Evals - AI Agent Evaluation Framework","type":"text"}]},{"type":"paragraph","content":[{"text":"Comprehensive agent evaluation system based on Anthropic's \"Demystifying Evals for AI Agents\" (Jan 2026).","type":"text"}]},{"type":"paragraph","content":[{"text":"Key differentiator:","type":"text","marks":[{"type":"strong"}]},{"text":" Evaluates agent ","type":"text"},{"text":"workflows","type":"text","marks":[{"type":"em"}]},{"text":" (transcripts, tool calls, multi-turn conversations), not just single outputs.","type":"text"}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"When to Activate","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\"run evals\", \"test this agent\", \"evaluate\", \"check quality\", \"benchmark\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\"regression test\", \"capability test\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Compare agent behaviors across changes","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Validate agent workflows before deployment","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Verify ALGORITHM ISC rows","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Create new evaluation tasks from failures","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Core 
Concepts","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Three Grader Types","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Type","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Strengths","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Weaknesses","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Use For","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Code-based","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Fast, cheap, deterministic, reproducible","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Brittle, lacks nuance","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tests, state checks, tool verification","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Model-based","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Flexible, captures nuance, 
scalable","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Non-deterministic, expensive","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Quality rubrics, assertions, comparisons","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Human","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Gold standard, handles subjectivity","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Expensive, slow","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Calibration, spot checks, A/B testing","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Evaluation Types","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Type","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Pass 
Target","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Purpose","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Capability","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"~70%","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Stretch goals, measuring improvement potential","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Regression","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"~99%","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Quality gates, detecting backsliding","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Key Metrics","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"pass@k","type":"text","marks":[{"type":"strong"}]},{"text":": Probability of at least 1 success in k trials (measures capability)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"pass^k","type":"text","marks":[{"type":"strong"}]},{"text":": Probability all k trials succeed (measures consistency/reliability)","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Workflow 
Routing","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Request Pattern","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Route To","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Run eval, evaluate suite, run tests, benchmark","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Workflows/RunEval.md","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Compare models, model comparison, A/B test models","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Workflows/CompareModels.md","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Compare prompts, prompt comparison, test prompts","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Workflows/ComparePrompts.md","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Create judge, model grader, evaluation 
judge","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Workflows/CreateJudge.md","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Create use case, new eval, test case, create suite","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Workflows/CreateUseCase.md","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"View results, eval results, scores, pass rate","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Workflows/ViewResults.md","type":"text","marks":[{"type":"code_inline"}]}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"CLI Quick Reference","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Trigger","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tool","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Run 
suite","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tools/AlgorithmBridge.ts","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Log failure","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tools/FailureToTask.ts log","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Convert failures","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tools/FailureToTask.ts convert-all","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Create suite","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tools/SuiteManager.ts create","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Check saturation","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tools/SuiteManager.ts check-saturation","type":"text","marks":[{"type":"code_inline"}]}]}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Quick 
Reference","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"CLI Commands","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Run an eval suite\nbun run ~/.claude/skills/Utilities/Evals/Tools/AlgorithmBridge.ts -s \u003csuite>\n\n# Log a failure for later conversion\nbun run ~/.claude/skills/Utilities/Evals/Tools/FailureToTask.ts log \"description\" -c category -s severity\n\n# Convert failures to test tasks\nbun run ~/.claude/skills/Utilities/Evals/Tools/FailureToTask.ts convert-all\n\n# Manage suites\nbun run ~/.claude/skills/Utilities/Evals/Tools/SuiteManager.ts create \u003cname> -t capability -d \"description\"\nbun run ~/.claude/skills/Utilities/Evals/Tools/SuiteManager.ts list\nbun run ~/.claude/skills/Utilities/Evals/Tools/SuiteManager.ts check-saturation \u003cname>\nbun run ~/.claude/skills/Utilities/Evals/Tools/SuiteManager.ts graduate \u003cname>","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"ALGORITHM Integration","type":"text"}]},{"type":"paragraph","content":[{"text":"Evals is a verification method for THE ALGORITHM ISC rows:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Run eval and update ISC row\nbun run ~/.claude/skills/Utilities/Evals/Tools/AlgorithmBridge.ts -s regression-core -r 3 -u","type":"text"}]},{"type":"paragraph","content":[{"text":"ISC rows can specify eval verification:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"| # | What Ideal Looks Like | Verify |\n|---|----------------------|--------|\n| 1 | Auth bypass fixed | eval:auth-security |\n| 2 | Tests all pass | eval:regression |","type":"text"}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Available Graders","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Code-Based (Fast, 
Deterministic)","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Grader","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Use Case","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"string_match","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Exact substring matching","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"regex_match","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Pattern matching","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"binary_tests","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Run test files","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"static_analysis","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Lint, type-check, security 
scan","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"state_check","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Verify system state after execution","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"tool_calls","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Verify specific tools were called","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Model-Based (Nuanced)","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Grader","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Use Case","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"llm_rubric","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Score against detailed 
rubric","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"natural_language_assert","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Check assertions are true","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"pairwise_comparison","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Compare to reference with position swap","type":"text"}]}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Domain Patterns","type":"text"}]},{"type":"paragraph","content":[{"text":"Pre-configured grader stacks for common agent types:","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Domain","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Primary Graders","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"coding","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"binary_tests + static_analysis + tool_calls + 
llm_rubric","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"conversational","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"llm_rubric + natural_language_assert + state_check","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"research","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"llm_rubric + natural_language_assert + tool_calls","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"computer_use","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"state_check + tool_calls + llm_rubric","type":"text"}]}]}]}]},{"type":"paragraph","content":[{"text":"See ","type":"text"},{"text":"Data/DomainPatterns.yaml","type":"text","marks":[{"type":"code_inline"}]},{"text":" for full configurations.","type":"text"}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Task Schema (YAML)","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"yaml"},"content":[{"text":"task:\n id: \"fix-auth-bypass_1\"\n description: \"Fix authentication bypass when password is empty\"\n type: regression # or capability\n domain: coding\n\n graders:\n - type: binary_tests\n required: [test_empty_pw.py]\n weight: 0.30\n\n - type: tool_calls\n weight: 0.20\n params:\n sequence: 
[read_file, edit_file, run_tests]\n\n - type: llm_rubric\n weight: 0.50\n params:\n rubric: prompts/security_review.md\n\n trials: 3\n pass_threshold: 0.75","type":"text"}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Resource Index","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Resource","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Purpose","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Types/index.ts","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Core type definitions","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Graders/CodeBased/","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Deterministic graders","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Graders/ModelBased/","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"LLM-powered 
graders","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tools/TranscriptCapture.ts","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Capture agent trajectories","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tools/TrialRunner.ts","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Multi-trial execution with pass@k","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tools/SuiteManager.ts","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Suite management and saturation","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tools/FailureToTask.ts","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Convert failures to test 
tasks","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tools/AlgorithmBridge.ts","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"ALGORITHM integration","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Data/DomainPatterns.yaml","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Domain-specific grader configs","type":"text"}]}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Key Principles (from Anthropic)","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Start with 20-50 real failures","type":"text","marks":[{"type":"strong"}]},{"text":" - Don't overthink, capture what actually broke","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Unambiguous tasks","type":"text","marks":[{"type":"strong"}]},{"text":" - Two experts should reach identical verdicts","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Balanced problem sets","type":"text","marks":[{"type":"strong"}]},{"text":" - Test both \"should do\" AND \"should NOT do\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Grade outputs, not paths","type":"text","marks":[{"type":"strong"}]},{"text":" - Don't penalize valid creative 
solutions","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Calibrate LLM judges","type":"text","marks":[{"type":"strong"}]},{"text":" - Against human expert judgment","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Check transcripts regularly","type":"text","marks":[{"type":"strong"}]},{"text":" - Verify graders work correctly","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Monitor saturation","type":"text","marks":[{"type":"strong"}]},{"text":" - Graduate to regression when hitting 95%+","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Build infrastructure early","type":"text","marks":[{"type":"strong"}]},{"text":" - Evals shape how quickly you can adopt new models","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Related","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"ALGORITHM","type":"text","marks":[{"type":"strong"}]},{"text":": Evals is a verification method","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Science","type":"text","marks":[{"type":"strong"}]},{"text":": Evals implements scientific method","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Browser","type":"text","marks":[{"type":"strong"}]},{"text":": For visual verification 
graders","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}}]},"metadata":{"date":"2026-06-05","name":"Evals","author":"@skillopedia","source":{"stars":14561,"repo_name":"personal_ai_infrastructure","origin_url":"https://github.com/danielmiessler/personal_ai_infrastructure/blob/HEAD/Releases/v4.0.0/.claude/skills/Utilities/Evals/SKILL.md","repo_owner":"danielmiessler","body_sha256":"dce019d0182c8ec01af2602ba80c08d4d28c6323be17019d998a81c0ba177826","cluster_key":"d7c65b2e6cd70d2aa42f633ade5b5eb185c84d0756622b1d430c59c37ccbb492","clean_bundle":{"format":"clean-skill-bundle-v1","source":"danielmiessler/personal_ai_infrastructure/Releases/v4.0.0/.claude/skills/Utilities/Evals/SKILL.md","attachments":[{"id":"7176f291-bf43-5837-8c15-69541a53a6c1","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/7176f291-bf43-5837-8c15-69541a53a6c1/attachment.md","path":"BestPractices.md","size":1541,"sha256":"7a397e5a970a0072d69a29d2d0e36cce489bb7688df64c1cb8df4cf4528c7085","contentType":"text/markdown; charset=utf-8"},{"id":"4ff9b1eb-0697-5b4f-9ccc-cd50148c8d7d","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/4ff9b1eb-0697-5b4f-9ccc-cd50148c8d7d/attachment.md","path":"CLIReference.md","size":2267,"sha256":"dcc98d912a8f4e7038245dd44741635fa1d40377fed15acaabb25d08e8c14fca","contentType":"text/markdown; charset=utf-8"},{"id":"4f548f8a-8d31-5927-95cb-90b3f33a9c3d","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/4f548f8a-8d31-5927-95cb-90b3f33a9c3d/attachment.yaml","path":"Data/DomainPatterns.yaml","size":4631,"sha256":"4c3262e21548b1b6a47ccd58c8e00077a83509292b76bb42ba09722790afaff3","contentType":"application/yaml; charset=utf-8"},{"id":"b5f4beb0-f3c2-5228-979d-d3d10b79c7fb","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/b5f4beb0-f3c2-5228-979d-d3d10b79c7fb/attachment.ts","path":"Graders/Base.ts","size":3016,"sha256":"ce0922806f879e3436f9c146c82bb4400fa5df892866f806ef407e9dabaa482b","contentType":"text/typescript; 
charset=utf-8"},{"id":"e8fce0b1-1163-56b8-b15b-dbd4c4e6e91b","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/e8fce0b1-1163-56b8-b15b-dbd4c4e6e91b/attachment.ts","path":"Graders/CodeBased/BinaryTests.ts","size":2544,"sha256":"664ebf3915b5e974ff1b037a74f34d23d69b67a115b8a8f9d7f1e2e8c256afbf","contentType":"text/typescript; charset=utf-8"},{"id":"dc2a0561-2d61-53c2-ac34-379e587b5351","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/dc2a0561-2d61-53c2-ac34-379e587b5351/attachment.ts","path":"Graders/CodeBased/RegexMatch.ts","size":1952,"sha256":"e396e2f9b17bde92f8038d7a52df3eec04d3aee0815df68674958217ac665568","contentType":"text/typescript; charset=utf-8"},{"id":"e191e638-2203-5954-8988-5dbf702a4e0d","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/e191e638-2203-5954-8988-5dbf702a4e0d/attachment.ts","path":"Graders/CodeBased/StateCheck.ts","size":4855,"sha256":"20d068505e76e9dacd0f066504a5a78a80cc7bcfa4f8515e9df8c07d8b732582","contentType":"text/typescript; charset=utf-8"},{"id":"e86640d4-7129-50e7-a957-c847714f3b0c","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/e86640d4-7129-50e7-a957-c847714f3b0c/attachment.ts","path":"Graders/CodeBased/StaticAnalysis.ts","size":2966,"sha256":"da8e482cc8638fd2b8c5751d9b93632f8590d558d5b1f8a05eb980b0a99ccde4","contentType":"text/typescript; charset=utf-8"},{"id":"16d0bc5a-a4af-5f56-a9fb-abe89a51ddf6","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/16d0bc5a-a4af-5f56-a9fb-abe89a51ddf6/attachment.ts","path":"Graders/CodeBased/StringMatch.ts","size":1690,"sha256":"89b6d10bf45948775e915b011545a82b73c6a309b1a21490abb2ebfbe6a7b4bf","contentType":"text/typescript; charset=utf-8"},{"id":"5ecc67fb-94bf-59ef-beb3-5480c5ee554e","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/5ecc67fb-94bf-59ef-beb3-5480c5ee554e/attachment.ts","path":"Graders/CodeBased/ToolCallVerification.ts","size":3712,"sha256":"89a2fcbb69b65aac61f1fdbf263d20916ff69836f9319af3c41a599ca0a97041","contentType":"text/typescript; 
charset=utf-8"},{"id":"f9e64e2e-e56e-5389-b340-63e21e417480","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/f9e64e2e-e56e-5389-b340-63e21e417480/attachment.ts","path":"Graders/CodeBased/index.ts","size":615,"sha256":"5acb6beed43e8979089c2dc7156df2a531b2349a720d41ed33ba6afbcecc4c56","contentType":"text/typescript; charset=utf-8"},{"id":"940af72d-63b6-5c7a-a463-286b3723f52e","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/940af72d-63b6-5c7a-a463-286b3723f52e/attachment.ts","path":"Graders/ModelBased/LLMRubric.ts","size":5446,"sha256":"dfc8b6977197f14df4ca1200ddc00108d526b438702030da57240054ea454cff","contentType":"text/typescript; charset=utf-8"},{"id":"c3c4683b-069e-5b48-80d6-9f88f54c80dc","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/c3c4683b-069e-5b48-80d6-9f88f54c80dc/attachment.ts","path":"Graders/ModelBased/NaturalLanguageAssert.ts","size":3842,"sha256":"05c1f65df58e798947973cfd2a4b706670306ea26a3cec55de9a8e1401c00a3a","contentType":"text/typescript; charset=utf-8"},{"id":"739bf191-546e-5ca8-840f-13c7eab6749b","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/739bf191-546e-5ca8-840f-13c7eab6749b/attachment.ts","path":"Graders/ModelBased/PairwiseComparison.ts","size":4937,"sha256":"24fdc704ad2852e64b40e35942899c6da4cb21a53b2517f79be7b1914cbb57ea","contentType":"text/typescript; charset=utf-8"},{"id":"161a75bc-cd1b-57cf-80c1-5bd5241fa87b","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/161a75bc-cd1b-57cf-80c1-5bd5241fa87b/attachment.ts","path":"Graders/ModelBased/index.ts","size":403,"sha256":"f6f7e06a5eed2767882a26d9003b54c22dc0be0b9d39dfc57f10212288309774","contentType":"text/typescript; charset=utf-8"},{"id":"2743794d-b5f4-5688-9983-2179c66a93db","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/2743794d-b5f4-5688-9983-2179c66a93db/attachment.ts","path":"Graders/index.ts","size":322,"sha256":"bd54c998c7436d459b841abcfd5706a295c213a276cf3a02e50c9e445a8b2ab9","contentType":"text/typescript; 
charset=utf-8"},{"id":"1ce92d4d-3585-537a-ad89-f0df6c54c58b","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/1ce92d4d-3585-537a-ad89-f0df6c54c58b/attachment.md","path":"PROJECT.md","size":21452,"sha256":"e99edf6850edd9e4f43e4b97a9ab5ab101569045b6eefbf5a5cb6eb75c23784e","contentType":"text/markdown; charset=utf-8"},{"id":"bb8b5fec-82ee-55bb-a5ec-7df14c6f3f14","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/bb8b5fec-82ee-55bb-a5ec-7df14c6f3f14/attachment.md","path":"ScienceMapping.md","size":1998,"sha256":"d27b0191001cd66643c2bc4e04d421823c7cdd203cc887795801739d151784ac","contentType":"text/markdown; charset=utf-8"},{"id":"8e259f5c-05e6-5b30-9f31-1ae78a8c8edf","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/8e259f5c-05e6-5b30-9f31-1ae78a8c8edf/attachment.md","path":"ScorerTypes.md","size":1751,"sha256":"6661407fb370d153597c2f08d913d9ee891c9b0a0912de9633931e6c70032e0a","contentType":"text/markdown; charset=utf-8"},{"id":"69531259-3e5e-5d88-94a6-6206b3c4a5b9","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/69531259-3e5e-5d88-94a6-6206b3c4a5b9/attachment.yaml","path":"Suites/Regression/core-behaviors.yaml","size":428,"sha256":"11fc2e9efa5f88e4e0564a30c71facbd78995e3a387454da4a0a9ec85013cd74","contentType":"application/yaml; charset=utf-8"},{"id":"a30e126a-be97-554e-a9eb-2bde46e6afeb","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/a30e126a-be97-554e-a9eb-2bde46e6afeb/attachment.md","path":"TemplateIntegration.md","size":1772,"sha256":"8140fb827681ace7e47adacb72f7a34b29fab892ba2c40da1fc3c45fd779a917","contentType":"text/markdown; charset=utf-8"},{"id":"1740fe8a-9567-5baa-a442-9713a7c37710","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/1740fe8a-9567-5baa-a442-9713a7c37710/attachment.ts","path":"Tools/AlgorithmBridge.ts","size":7010,"sha256":"4bde2dd7a5c731ecfc84a1d4652b5dc09dd252dca75dd51b50de44ba7676a06a","contentType":"text/typescript; 
charset=utf-8"},{"id":"16ee1fe7-2275-5460-abbc-7de440b6a412","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/16ee1fe7-2275-5460-abbc-7de440b6a412/attachment.ts","path":"Tools/FailureToTask.ts","size":10639,"sha256":"50fa8e271d7390713751f404665c7d8dca7191a6096108cc83a89e1d589cbcda","contentType":"text/typescript; charset=utf-8"},{"id":"140541e7-c7db-54eb-a2a6-095bf5778c30","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/140541e7-c7db-54eb-a2a6-095bf5778c30/attachment.ts","path":"Tools/SuiteManager.ts","size":11417,"sha256":"ea2c3b31fb3eab6884f77b4297173063482bdc769cea50a75c6f9b84bb640b44","contentType":"text/typescript; charset=utf-8"},{"id":"6787475a-58d1-54fe-9282-18af9b3ed9d9","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/6787475a-58d1-54fe-9282-18af9b3ed9d9/attachment.ts","path":"Tools/TranscriptCapture.ts","size":6064,"sha256":"ce25b46544daaa833ff31d0304257bc085202f5ed9f6adceecf883e6b908fa15","contentType":"text/typescript; charset=utf-8"},{"id":"91809294-6b3f-5a49-b9db-fd172e94545c","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/91809294-6b3f-5a49-b9db-fd172e94545c/attachment.ts","path":"Tools/TrialRunner.ts","size":8374,"sha256":"392b7cd5ee3247b1eefd26c6fecb5a86788e6c8e52ad4c1e01f49f12332311ea","contentType":"text/typescript; charset=utf-8"},{"id":"727d36d7-26a8-5649-9c45-a098fca97c8b","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/727d36d7-26a8-5649-9c45-a098fca97c8b/attachment.ts","path":"Types/index.ts","size":9235,"sha256":"551fcb554a2424600952263a32d8b73592bf74e9c81ac44ff987ba91dde256b5","contentType":"text/typescript; charset=utf-8"},{"id":"03661722-705b-57a0-b365-70a1ae017e1a","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/03661722-705b-57a0-b365-70a1ae017e1a/attachment.yaml","path":"UseCases/Regression/task_file_targeting_basic.yaml","size":1238,"sha256":"383003d545c2b629401e764d78a3d52fa7a79a93fa2d856d99ae8acc6fbceb19","contentType":"application/yaml; 
charset=utf-8"},{"id":"e788086d-8f5d-55f5-8a2b-c1fc10758976","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/e788086d-8f5d-55f5-8a2b-c1fc10758976/attachment.yaml","path":"UseCases/Regression/task_no_hallucinated_paths.yaml","size":1312,"sha256":"89be4f08282683ae7b5e863d203b3dbeb957463f092e39fb2602887c05556b00","contentType":"application/yaml; charset=utf-8"},{"id":"4d6c7a54-c0cf-51aa-bc4e-22cfae60dc4d","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/4d6c7a54-c0cf-51aa-bc4e-22cfae60dc4d/attachment.yaml","path":"UseCases/Regression/task_tool_sequence_read_before_edit.yaml","size":1338,"sha256":"4d4cea05d4a0124b9c5d39b56f227878bc864a02f2b7b1c319cab1b57ef1c6e7","contentType":"application/yaml; charset=utf-8"},{"id":"ec7d5975-75a6-5a48-87f0-0829a5d717d8","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/ec7d5975-75a6-5a48-87f0-0829a5d717d8/attachment.yaml","path":"UseCases/Regression/task_verification_before_done.yaml","size":1348,"sha256":"9a3a4bc5bf02a144390bf39f42bfd496984ffd517a3b515b76eeb0d383ffa89b","contentType":"application/yaml; charset=utf-8"},{"id":"cbc85485-2391-5836-b163-009c7c406b92","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/cbc85485-2391-5836-b163-009c7c406b92/attachment.md","path":"Workflows/CompareModels.md","size":7285,"sha256":"c7d58715b48b42b12c232afb6dc45f4c6299c0111850d28c6e481e37e3d16abe","contentType":"text/markdown; charset=utf-8"},{"id":"f4c5a1a7-d804-5add-9fdf-97a69da1daec","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/f4c5a1a7-d804-5add-9fdf-97a69da1daec/attachment.md","path":"Workflows/ComparePrompts.md","size":8927,"sha256":"59904b56fb182be931a69d1e931040ed967ecbfeedcf197cc67b7d5b7519d5f1","contentType":"text/markdown; 
charset=utf-8"},{"id":"54ae9f1c-9138-5f87-b727-622ef0bb3fdd","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/54ae9f1c-9138-5f87-b727-622ef0bb3fdd/attachment.md","path":"Workflows/CreateJudge.md","size":4654,"sha256":"9e86f663ae4f209677bf124d301811f627ad3c1a4a659f20937b6c54247059e2","contentType":"text/markdown; charset=utf-8"},{"id":"2d788534-00e0-58b0-91b9-549591a6d3ff","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/2d788534-00e0-58b0-91b9-549591a6d3ff/attachment.md","path":"Workflows/CreateUseCase.md","size":6529,"sha256":"2daeadb2f5090f401519310303bc373e21a94124a27303d5b81ec7407d993de0","contentType":"text/markdown; charset=utf-8"},{"id":"61ded4dc-e023-55aa-8e8d-d37054682f70","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/61ded4dc-e023-55aa-8e8d-d37054682f70/attachment.md","path":"Workflows/RunEval.md","size":2339,"sha256":"f79611b6761f8d7a100a51cc6e31fc619b041c1c54faca66c5dc1c4c3ca52a03","contentType":"text/markdown; charset=utf-8"},{"id":"c87d243e-30d5-5200-9521-9eccb9e9be63","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/c87d243e-30d5-5200-9521-9eccb9e9be63/attachment.md","path":"Workflows/ViewResults.md","size":7618,"sha256":"689764419b6fd2127dcb4cf7f1e88a8c1c6166515e2d6d8c234d30539a12b050","contentType":"text/markdown; charset=utf-8"}],"bundle_sha256":"19941460056ea04e058e3539fc35d92410e91fac2da8d12e6a8d563a155c9e7a","attachment_count":37,"text_attachments":37,"attachment_storage":"skillopedia-attachments-v1","binary_attachments":0,"excluded_attachments":[]},"cluster_size":5,"skill_md_path":"Releases/v4.0.0/.claude/skills/Utilities/Evals/SKILL.md","import_metadata":{"date":"2026-06-05","author":"@skillopedia","version":"v1","category":"testing-qa","category_label":"Testing"},"exact_dupes_collapsed_into_this":4},"version":"v1","category":"testing-qa","import_tag":"clean-skills-v1","description":"Objective eval metrics via code/model/human graders with pass@k/pass^k scoring. 
USE WHEN eval, evaluate, test agent, benchmark, verify behavior, regression test, capability test, run eval, compare models, compare prompts, create judge, create use case, view results, failure to task, suite manager, transcript capture, trial runner."}},"renderedAt":1782979729949}

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.