embedding-strategies — Skillopedia

# Embedding Strategies

Guide to selecting and optimizing embedding models for vector search applications.

## When to Use This Skill

- Choosing embedding models for RAG
- Optimizing chunking strategies
- Fine-tuning embeddings for domains
- Comparing embedding model performance
- Reducing embedding dimensions
- Handling multilingual content

## Core Concepts

### 1. Embedding Model Comparison

| Model | Dimensions | Max Tokens | Best For |
|-------|------------|------------|----------|
| text-embedding-3-large | 3072 | 8191 | High accuracy |
| text-embedding-3-small | 1536 | 8191 | Cost-effective |
| voyage-…

\n) -> List[Tuple[str, str]]:\n \"\"\"Chunk markdown by headers, preserving hierarchy.\"\"\"\n lines = text.split('\\n')\n chunks = []\n current_header = \"\"\n current_content = []\n\n for line in lines:\n if re.match(headers_pattern, line, re.MULTILINE):\n if current_content:\n chunks.append((current_header, '\\n'.join(current_content)))\n current_header = line\n current_content = []\n else:\n current_content.append(line)\n\n if current_content:\n chunks.append((current_header, '\\n'.join(current_content)))\n\n return chunks\n\n\ndef recursive_character_splitter(\n text: str,\n chunk_size: int = 1000,\n chunk_overlap: int = 200,\n separators: List[str] = None\n) -> List[str]:\n \"\"\"LangChain-style recursive splitter.\"\"\"\n separators = separators or [\"\\n\\n\", \"\\n\", \". \", \" \", \"\"]\n\n def split_text(text: str, separators: List[str]) -> List[str]:\n if not text:\n return []\n\n separator = separators[0]\n remaining_separators = separators[1:]\n\n if separator == \"\":\n # Character-level split\n return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size - chunk_overlap)]\n\n splits = text.split(separator)\n chunks = []\n current_chunk = []\n current_length = 0\n\n for split in splits:\n split_length = len(split) + len(separator)\n\n if current_length + split_length > chunk_size and current_chunk:\n chunk_text = separator.join(current_chunk)\n\n # Recursively split if still too large\n if len(chunk_text) > chunk_size and remaining_separators:\n chunks.extend(split_text(chunk_text, remaining_separators))\n else:\n chunks.append(chunk_text)\n\n # Start new chunk with overlap\n overlap_splits = []\n overlap_length = 0\n for s in reversed(current_chunk):\n if overlap_length + len(s) \u003c= chunk_overlap:\n overlap_splits.insert(0, s)\n overlap_length += len(s)\n else:\n break\n current_chunk = overlap_splits\n current_length = overlap_length\n\n current_chunk.append(split)\n current_length += split_length\n\n if current_chunk:\n 
chunks.append(separator.join(current_chunk))\n\n return chunks\n\n return split_text(text, separators)\n```\n\n### Template 4: Domain-Specific Embedding Pipeline\n\n```python\nclass DomainEmbeddingPipeline:\n \"\"\"Pipeline for domain-specific embeddings.\"\"\"\n\n def __init__(\n self,\n embedding_model: str = \"text-embedding-3-small\",\n chunk_size: int = 512,\n chunk_overlap: int = 50,\n preprocessing_fn=None\n ):\n self.embedding_model = embedding_model\n self.chunk_size = chunk_size\n self.chunk_overlap = chunk_overlap\n self.preprocess = preprocessing_fn or self._default_preprocess\n\n def _default_preprocess(self, text: str) -> str:\n \"\"\"Default preprocessing.\"\"\"\n # Remove excessive whitespace\n text = re.sub(r'\\s+', ' ', text)\n # Remove special characters\n text = re.sub(r'[^\\w\\s.,!?-]', '', text)\n return text.strip()\n\n async def process_documents(\n self,\n documents: List[dict],\n id_field: str = \"id\",\n content_field: str = \"content\",\n metadata_fields: List[str] = None\n ) -> List[dict]:\n \"\"\"Process documents for vector storage.\"\"\"\n processed = []\n\n for doc in documents:\n content = doc[content_field]\n doc_id = doc[id_field]\n\n # Preprocess\n cleaned = self.preprocess(content)\n\n # Chunk\n chunks = chunk_by_tokens(\n cleaned,\n self.chunk_size,\n self.chunk_overlap\n )\n\n # Create embeddings\n embeddings = get_embeddings(chunks, self.embedding_model)\n\n # Create records\n for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):\n record = {\n \"id\": f\"{doc_id}_chunk_{i}\",\n \"document_id\": doc_id,\n \"chunk_index\": i,\n \"text\": chunk,\n \"embedding\": embedding\n }\n\n # Add metadata\n if metadata_fields:\n for field in metadata_fields:\n if field in doc:\n record[field] = doc[field]\n\n processed.append(record)\n\n return processed\n\n\n# Code-specific pipeline\nclass CodeEmbeddingPipeline:\n \"\"\"Specialized pipeline for code embeddings.\"\"\"\n\n def __init__(self, model: str = \"voyage-code-2\"):\n 
self.model = model\n\n def chunk_code(self, code: str, language: str) -> List[dict]:\n \"\"\"Chunk code by functions/classes.\"\"\"\n import tree_sitter\n\n # Parse with tree-sitter\n # Extract functions, classes, methods\n # Return chunks with context\n pass\n\n def embed_with_context(self, chunk: str, context: str) -> List[float]:\n \"\"\"Embed code with surrounding context.\"\"\"\n combined = f\"Context: {context}\\n\\nCode:\\n{chunk}\"\n return get_embedding(combined, model=self.model)\n```\n\n### Template 5: Embedding Quality Evaluation\n\n```python\nimport numpy as np\nfrom typing import List, Tuple\n\ndef evaluate_retrieval_quality(\n queries: List[str],\n relevant_docs: List[List[str]], # List of relevant doc IDs per query\n retrieved_docs: List[List[str]], # List of retrieved doc IDs per query\n k: int = 10\n) -> dict:\n \"\"\"Evaluate embedding quality for retrieval.\"\"\"\n\n def precision_at_k(relevant: set, retrieved: List[str], k: int) -> float:\n retrieved_k = retrieved[:k]\n relevant_retrieved = len(set(retrieved_k) & relevant)\n return relevant_retrieved / k\n\n def recall_at_k(relevant: set, retrieved: List[str], k: int) -> float:\n retrieved_k = retrieved[:k]\n relevant_retrieved = len(set(retrieved_k) & relevant)\n return relevant_retrieved / len(relevant) if relevant else 0\n\n def mrr(relevant: set, retrieved: List[str]) -> float:\n for i, doc in enumerate(retrieved):\n if doc in relevant:\n return 1 / (i + 1)\n return 0\n\n def ndcg_at_k(relevant: set, retrieved: List[str], k: int) -> float:\n dcg = sum(\n 1 / np.log2(i + 2) if doc in relevant else 0\n for i, doc in enumerate(retrieved[:k])\n )\n ideal_dcg = sum(1 / np.log2(i + 2) for i in range(min(len(relevant), k)))\n return dcg / ideal_dcg if ideal_dcg > 0 else 0\n\n metrics = {\n f\"precision@{k}\": [],\n f\"recall@{k}\": [],\n \"mrr\": [],\n f\"ndcg@{k}\": []\n }\n\n for relevant, retrieved in zip(relevant_docs, retrieved_docs):\n relevant_set = set(relevant)\n 
metrics[f\"precision@{k}\"].append(precision_at_k(relevant_set, retrieved, k))\n metrics[f\"recall@{k}\"].append(recall_at_k(relevant_set, retrieved, k))\n metrics[\"mrr\"].append(mrr(relevant_set, retrieved))\n metrics[f\"ndcg@{k}\"].append(ndcg_at_k(relevant_set, retrieved, k))\n\n return {name: np.mean(values) for name, values in metrics.items()}\n\n\ndef compute_embedding_similarity(\n embeddings1: np.ndarray,\n embeddings2: np.ndarray,\n metric: str = \"cosine\"\n) -> np.ndarray:\n \"\"\"Compute similarity matrix between embedding sets.\"\"\"\n if metric == \"cosine\":\n # Normalize\n norm1 = embeddings1 / np.linalg.norm(embeddings1, axis=1, keepdims=True)\n norm2 = embeddings2 / np.linalg.norm(embeddings2, axis=1, keepdims=True)\n return norm1 @ norm2.T\n elif metric == \"euclidean\":\n from scipy.spatial.distance import cdist\n return -cdist(embeddings1, embeddings2, metric='euclidean')\n elif metric == \"dot\":\n return embeddings1 @ embeddings2.T\n```\n\n## Best Practices\n\n### Do's\n- **Match model to use case** - Code vs prose vs multilingual\n- **Chunk thoughtfully** - Preserve semantic boundaries\n- **Normalize embeddings** - For cosine similarity\n- **Batch requests** - More efficient than one-by-one\n- **Cache embeddings** - Avoid recomputing\n\n### Don'ts\n- **Don't ignore token limits** - Truncation loses info\n- **Don't mix embedding models** - Incompatible spaces\n- **Don't skip preprocessing** - Garbage in, garbage out\n- **Don't over-chunk** - Lose context\n\n## Resources\n\n- [OpenAI Embeddings](https://platform.openai.com/docs/guides/embeddings)\n- [Sentence Transformers](https://www.sbert.net/)\n- [MTEB Benchmark](https://huggingface.co/spaces/mteb/leaderboard)\n---","attachment_filenames":["skill-report.json"],"attachments":[{"filename":"skill-report.json","content":"{\n \"schema_version\": \"2.0\",\n \"meta\": {\n \"generated_at\": \"2026-01-21T19:19:51.862Z\",\n \"slug\": \"wshobson-embedding-strategies\",\n \"source_url\": 
\"https://github.com/wshobson/agents/tree/main/plugins/llm-application-dev/skills/embedding-strategies\",\n \"source_ref\": \"main\",\n \"model\": \"claude\",\n \"analysis_version\": \"3.0.0\",\n \"source_type\": \"community\",\n \"content_hash\": \"03e6481e39240999e3eeb94103d7e2ae5393274624280107bfdc72389e0c2920\",\n \"tree_hash\": \"969b869548c758c248ce9c6f8ed58e36a849d2c674a24bc8ffdc353b342309a6\"\n },\n \"skill\": {\n \"name\": \"embedding-strategies\",\n \"description\": \"Select and optimize embedding models for semantic search and RAG applications. Use when choosing embedding models, implementing chunking strategies, or optimizing embedding quality for specific domains.\",\n \"summary\": \"Select and optimize embedding models for semantic search and RAG applications. Use when choosing embedding models, implementing chunking strategies, or optimizing embedding quality for specific domains.\",\n \"icon\": \"📦\",\n \"version\": \"1.0.0\",\n \"author\": \"wshobson\",\n \"license\": \"MIT\",\n \"tags\": [\n \"semantic-search\",\n \"vector-database\",\n \"rag\",\n \"embeddings\",\n \"text-embedding\"\n ],\n \"supported_tools\": [\n \"claude\",\n \"codex\",\n \"claude-code\"\n ],\n \"risk_factors\": []\n },\n \"security_audit\": {\n \"risk_level\": \"low\",\n \"is_blocked\": false,\n \"safe_to_publish\": true,\n \"summary\": \"All static findings are false positives. C2 keyword alerts triggered by hash hex strings. Weak crypto alerts from hash substrings. External command alerts from ASCII flow diagrams using arrows. Hardcoded URL alerts are legitimate documentation links. 
No malicious code, command execution, or data exfiltration patterns found.\",\n \"risk_factor_evidence\": [],\n \"critical_findings\": [],\n \"high_findings\": [],\n \"medium_findings\": [],\n \"low_findings\": [],\n \"dangerous_patterns\": [],\n \"files_scanned\": 2,\n \"total_lines\": 818,\n \"audit_model\": \"claude\",\n \"audited_at\": \"2026-01-21T19:19:51.863Z\",\n \"risk_factors\": []\n },\n \"content\": {\n \"user_title\": \"Optimize Embedding Models for Semantic Search\",\n \"value_statement\": \"Choosing the right embedding model and chunking strategy is critical for retrieval quality. This skill provides templates and best practices for implementing high-quality vector search pipelines.\",\n \"seo_keywords\": [\n \"Claude\",\n \"Codex\",\n \"Claude Code\",\n \"embedding models\",\n \"semantic search\",\n \"vector embeddings\",\n \"RAG\",\n \"chunking strategies\",\n \"text-embedding\",\n \"vector search\"\n ],\n \"actual_capabilities\": [\n \"Compare embedding models across dimensions, costs, and use cases\",\n \"Implement chunking strategies for optimal retrieval\",\n \"Build pipelines for OpenAI and local embedding models\",\n \"Evaluate embedding quality with precision, recall, and NDCG metrics\",\n \"Handle multilingual and domain-specific content\"\n ],\n \"limitations\": [\n \"Does not execute embedding API calls directly\",\n \"Does not provision vector database infrastructure\",\n \"Does not provide pre-trained embedding models\",\n \"Does not handle real-time embedding updates\"\n ],\n \"use_cases\": [\n {\n \"title\": \"Build RAG Systems\",\n \"description\": \"Implement retrieval-augmented generation by selecting appropriate embedding models and chunking strategies for your document corpus.\",\n \"target_user\": \"ML engineers building RAG applications\"\n },\n {\n \"title\": \"Optimize Semantic Search\",\n \"description\": \"Improve search relevance by choosing embedding models matched to your content type and implementing proper chunking and 
preprocessing.\",\n \"target_user\": \"Search engineers and data scientists\"\n },\n {\n \"title\": \"Create Embedding Pipelines\",\n \"description\": \"Build scalable pipelines that process documents, chunk content, generate embeddings, and prepare records for vector databases.\",\n \"target_user\": \"Backend developers and data engineers\"\n }\n ],\n \"prompt_templates\": [\n {\n \"title\": \"Select Embedding Model\",\n \"prompt\": \"I need to choose an embedding model for my [use case: code search / multilingual documents / legal contracts]. My priorities are [priority: accuracy / cost / speed]. I have [constraints: limit on dimensions / need open source / need API access]. Recommend 3 models with rationale.\",\n \"scenario\": \"Choosing the right embedding model\"\n },\n {\n \"title\": \"Implement Chunking Strategy\",\n \"prompt\": \"Help me implement chunking for my [data type: technical documentation / conversational data / code]. I need to handle [requirement: preserve context / maintain semantic boundaries / limit chunk size]. Provide Python code for [strategy: token-based / sentence-based / recursive character] chunking.\",\n \"scenario\": \"Implementing document chunking\"\n },\n {\n \"title\": \"Build Embedding Pipeline\",\n \"prompt\": \"Create a Python pipeline that [input: processes documents from source / generates embeddings / stores in vector database]. Include [feature: batching / progress tracking / metadata handling]. Use [model: OpenAI embeddings / sentence-transformers].\",\n \"scenario\": \"Building embedding pipelines\"\n },\n {\n \"title\": \"Evaluate Retrieval Quality\",\n \"prompt\": \"My embedding-based retrieval has [problem: low recall / inconsistent results / poor precision]. My setup uses [model details]. 
Analyze potential causes and suggest improvements for [metric: precision at k / recall / ndcg].\",\n \"scenario\": \"Diagnosing retrieval issues\"\n }\n ],\n \"output_examples\": [\n {\n \"input\": \"Recommend an embedding model for a legal document search system. I need high accuracy and can use API services.\",\n \"output\": [\n \"Recommended: text-embedding-3-large (3072 dimensions) or voyage-2 (1024 dimensions)\",\n \"text-embedding-3-large: Best accuracy, handles 8191 tokens, ideal for long legal clauses\",\n \"voyage-2: Specialized for legal/code, 1024 dimensions, 4000 token limit\",\n \"Consider chunking legal documents by section headers to preserve clause context\"\n ]\n },\n {\n \"input\": \"How should I chunk my technical documentation for a RAG system?\",\n \"output\": [\n \"Strategy: Use semantic chunking by headers combined with recursive character splitting\",\n \"Recommended chunk size: 512 tokens with 50 token overlap\",\n \"Preserve code examples as complete chunks\",\n \"Add context metadata linking chunks to original sections\"\n ]\n }\n ],\n \"best_practices\": [\n \"Match embedding model to content type: code, prose, or multilingual\",\n \"Normalize embeddings for reliable cosine similarity comparisons\",\n \"Use token overlap when chunking to preserve context across boundaries\"\n ],\n \"anti_patterns\": [\n \"Mixing different embedding models in the same index\",\n \"Ignoring token limits and truncating content mid-thought\",\n \"Skipping preprocessing, allowing noise to degrade embedding quality\"\n ],\n \"faq\": [\n {\n \"question\": \"What embedding model should I start with?\",\n \"answer\": \"Start with text-embedding-3-small for general use. It balances cost and quality. Switch to text-embedding-3-large if you need higher accuracy, or voyage-2 for code and legal content.\"\n },\n {\n \"question\": \"How do I choose chunk size?\",\n \"answer\": \"512 tokens is a good starting point for most use cases. 
Adjust based on your content complexity and model token limits. Overlap by 50 tokens to maintain context across chunks.\"\n },\n {\n \"question\": \"Can I use local embedding models?\",\n \"answer\": \"Yes. Sentence-transformers supports models like BAAI/bge-large-en-v1.5 and intfloat/multilingual-e5-large. These run locally and work well for open-source or offline scenarios.\"\n },\n {\n \"question\": \"How do I evaluate my embedding quality?\",\n \"answer\": \"Use precision@k, recall@k, MRR, and NDCG@k metrics. Test with known relevant documents and compare retrieved results against ground truth.\"\n },\n {\n \"question\": \"Should I normalize embeddings?\",\n \"answer\": \"Yes. Normalize embeddings before using cosine similarity. Most modern embedding models produce normalized vectors by default, but local models may require explicit normalization.\"\n },\n {\n \"question\": \"What preprocessing should I apply?\",\n \"answer\": \"Remove excessive whitespace, normalize unicode characters, and filter special characters. Keep content semantically meaningful. 
Domain-specific cleaning may be needed for code or structured data.\"\n }\n ]\n },\n \"file_structure\": [\n {\n \"name\": \"SKILL.md\",\n \"type\": \"file\",\n \"path\": \"SKILL.md\",\n \"lines\": 480\n }\n ]\n}\n","content_type":"application/json; charset=utf-8","language":"json","size":9077,"content_sha256":"8bb38e924cb4fdc1642b15750805d8470d44c1e253c4cd7327d38f5379bf6352"}],"content_json":{"type":"doc","content":[{"type":"heading","attrs":{"level":1},"content":[{"text":"Embedding Strategies","type":"text"}]},{"type":"paragraph","content":[{"text":"Guide to selecting and optimizing embedding models for vector search applications.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"When to Use This Skill","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Choosing embedding models for RAG","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Optimizing chunking strategies","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Fine-tuning embeddings for domains","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Comparing embedding model performance","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Reducing embedding dimensions","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Handling multilingual content","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Core Concepts","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"1. 
Embedding Model Comparison","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Model","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Dimensions","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Max Tokens","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Best For","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"text-embedding-3-large","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"3072","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"8191","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"High 
accuracy","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"text-embedding-3-small","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"1536","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"8191","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Cost-effective","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"voyage-2","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"1024","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"4000","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Code, 
legal","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"bge-large-en-v1.5","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"1024","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"512","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Open source","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"all-MiniLM-L6-v2","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"384","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"256","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Fast, 
lightweight","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"multilingual-e5-large","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"1024","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"512","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Multi-language","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"2. Embedding Pipeline","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"Document → Chunking → Preprocessing → Embedding Model → Vector\n ↓\n [Overlap, Size] [Clean, Normalize] [API/Local]","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Templates","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Template 1: OpenAI Embeddings","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"python"},"content":[{"text":"from openai import OpenAI\nfrom typing import List\nimport numpy as np\n\nclient = OpenAI()\n\ndef get_embeddings(\n texts: List[str],\n model: str = \"text-embedding-3-small\",\n dimensions: int = None\n) -> List[List[float]]:\n \"\"\"Get embeddings from OpenAI.\"\"\"\n # Handle batching for large lists\n batch_size = 100\n all_embeddings = []\n\n for i in range(0, len(texts), batch_size):\n batch = texts[i:i + batch_size]\n\n kwargs = {\"input\": batch, \"model\": model}\n if dimensions:\n kwargs[\"dimensions\"] = dimensions\n\n response = client.embeddings.create(**kwargs)\n embeddings = [item.embedding for item in response.data]\n all_embeddings.extend(embeddings)\n\n 
return all_embeddings\n\n\ndef get_embedding(text: str, **kwargs) -> List[float]:\n \"\"\"Get single embedding.\"\"\"\n return get_embeddings([text], **kwargs)[0]\n\n\n# Dimension reduction with OpenAI\ndef get_reduced_embedding(text: str, dimensions: int = 512) -> List[float]:\n \"\"\"Get embedding with reduced dimensions (Matryoshka).\"\"\"\n return get_embedding(\n text,\n model=\"text-embedding-3-small\",\n dimensions=dimensions\n )","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Template 2: Local Embeddings with Sentence Transformers","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"python"},"content":[{"text":"from sentence_transformers import SentenceTransformer\nfrom typing import List, Optional\nimport numpy as np\n\nclass LocalEmbedder:\n \"\"\"Local embedding with sentence-transformers.\"\"\"\n\n def __init__(\n self,\n model_name: str = \"BAAI/bge-large-en-v1.5\",\n device: str = \"cuda\"\n ):\n self.model = SentenceTransformer(model_name, device=device)\n\n def embed(\n self,\n texts: List[str],\n normalize: bool = True,\n show_progress: bool = False\n ) -> np.ndarray:\n \"\"\"Embed texts with optional normalization.\"\"\"\n embeddings = self.model.encode(\n texts,\n normalize_embeddings=normalize,\n show_progress_bar=show_progress,\n convert_to_numpy=True\n )\n return embeddings\n\n def embed_query(self, query: str) -> np.ndarray:\n \"\"\"Embed a query with BGE-style prefix.\"\"\"\n # BGE models benefit from query prefix\n if \"bge\" in self.model.get_sentence_embedding_dimension():\n query = f\"Represent this sentence for searching relevant passages: {query}\"\n return self.embed([query])[0]\n\n def embed_documents(self, documents: List[str]) -> np.ndarray:\n \"\"\"Embed documents for indexing.\"\"\"\n return self.embed(documents)\n\n\n# E5 model with instructions\nclass E5Embedder:\n def __init__(self, model_name: str = \"intfloat/multilingual-e5-large\"):\n self.model = 
SentenceTransformer(model_name)\n\n def embed_query(self, query: str) -> np.ndarray:\n return self.model.encode(f\"query: {query}\")\n\n def embed_document(self, document: str) -> np.ndarray:\n return self.model.encode(f\"passage: {document}\")","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Template 3: Chunking Strategies","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"python"},"content":[{"text":"from typing import List, Tuple\nimport re\n\ndef chunk_by_tokens(\n text: str,\n chunk_size: int = 512,\n chunk_overlap: int = 50,\n tokenizer=None\n) -> List[str]:\n \"\"\"Chunk text by token count.\"\"\"\n import tiktoken\n tokenizer = tokenizer or tiktoken.get_encoding(\"cl100k_base\")\n\n tokens = tokenizer.encode(text)\n chunks = []\n\n start = 0\n while start \u003c len(tokens):\n end = start + chunk_size\n chunk_tokens = tokens[start:end]\n chunk_text = tokenizer.decode(chunk_tokens)\n chunks.append(chunk_text)\n start = end - chunk_overlap\n\n return chunks\n\n\ndef chunk_by_sentences(\n text: str,\n max_chunk_size: int = 1000,\n min_chunk_size: int = 100\n) -> List[str]:\n \"\"\"Chunk text by sentences, respecting size limits.\"\"\"\n import nltk\n sentences = nltk.sent_tokenize(text)\n\n chunks = []\n current_chunk = []\n current_size = 0\n\n for sentence in sentences:\n sentence_size = len(sentence)\n\n if current_size + sentence_size > max_chunk_size and current_chunk:\n chunks.append(\" \".join(current_chunk))\n current_chunk = []\n current_size = 0\n\n current_chunk.append(sentence)\n current_size += sentence_size\n\n if current_chunk:\n chunks.append(\" \".join(current_chunk))\n\n return chunks\n\n\ndef chunk_by_semantic_sections(\n text: str,\n headers_pattern: str = r'^#{1,3}\\s+.+

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.