mirror of
https://github.com/wshobson/agents.git
synced 2026-03-18 09:37:15 +00:00
feat: add 5 new specialized agents with 20 skills
Add domain expert agents with comprehensive skill sets: - service-mesh-expert (cloud-infrastructure): Istio/Linkerd patterns, mTLS, observability - event-sourcing-architect (backend-development): CQRS, event stores, projections, sagas - vector-database-engineer (llm-application-dev): embeddings, similarity search, hybrid search - monorepo-architect (developer-essentials): Nx, Turborepo, Bazel, pnpm workspaces - threat-modeling-expert (security-scanning): STRIDE, attack trees, security requirements Update all documentation to reflect correct counts: - 67 plugins, 99 agents, 107 skills, 71 commands
This commit is contained in:
479
plugins/llm-application-dev/skills/embedding-strategies/SKILL.md
Normal file
479
plugins/llm-application-dev/skills/embedding-strategies/SKILL.md
Normal file
@@ -0,0 +1,479 @@
|
||||
---
|
||||
name: embedding-strategies
|
||||
description: Select and optimize embedding models for semantic search and RAG applications. Use when choosing embedding models, implementing chunking strategies, or optimizing embedding quality for specific domains.
|
||||
---
|
||||
|
||||
# Embedding Strategies
|
||||
|
||||
Guide to selecting and optimizing embedding models for vector search applications.
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
- Choosing embedding models for RAG
|
||||
- Optimizing chunking strategies
|
||||
- Fine-tuning embeddings for domains
|
||||
- Comparing embedding model performance
|
||||
- Reducing embedding dimensions
|
||||
- Handling multilingual content
|
||||
|
||||
## Core Concepts
|
||||
|
||||
### 1. Embedding Model Comparison
|
||||
|
||||
| Model | Dimensions | Max Tokens | Best For |
|
||||
|-------|------------|------------|----------|
|
||||
| **text-embedding-3-large** | 3072 | 8191 | High accuracy |
|
||||
| **text-embedding-3-small** | 1536 | 8191 | Cost-effective |
|
||||
| **voyage-2** | 1024 | 4000 | Code, legal |
|
||||
| **bge-large-en-v1.5** | 1024 | 512 | Open source |
|
||||
| **all-MiniLM-L6-v2** | 384 | 256 | Fast, lightweight |
|
||||
| **multilingual-e5-large** | 1024 | 512 | Multi-language |
|
||||
|
||||
### 2. Embedding Pipeline
|
||||
|
||||
```
|
||||
Document → Chunking → Preprocessing → Embedding Model → Vector
|
||||
↓
|
||||
[Overlap, Size] [Clean, Normalize] [API/Local]
|
||||
```
|
||||
|
||||
## Templates
|
||||
|
||||
### Template 1: OpenAI Embeddings
|
||||
|
||||
```python
|
||||
from openai import OpenAI
from typing import List, Optional
import numpy as np
|
||||
|
||||
client = OpenAI()
|
||||
|
||||
def get_embeddings(
    texts: List[str],
    model: str = "text-embedding-3-small",
    dimensions: Optional[int] = None
) -> List[List[float]]:
    """Get embeddings from OpenAI for a list of texts.

    Args:
        texts: Texts to embed; returned vectors are in the same order.
        model: OpenAI embedding model name.
        dimensions: Optional reduced dimensionality (supported by the
            Matryoshka-style text-embedding-3-* models).

    Returns:
        One embedding vector per input text.
    """
    # Handle batching for large lists: the API limits inputs per request.
    batch_size = 100
    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        kwargs = {"input": batch, "model": model}
        # Only pass `dimensions` when explicitly requested; models that do
        # not support the parameter would reject it.
        if dimensions:
            kwargs["dimensions"] = dimensions

        response = client.embeddings.create(**kwargs)
        all_embeddings.extend(item.embedding for item in response.data)

    return all_embeddings
|
||||
|
||||
|
||||
def get_embedding(text: str, **kwargs) -> List[float]:
    """Embed a single text by delegating to the batched helper."""
    vectors = get_embeddings([text], **kwargs)
    return vectors[0]
|
||||
|
||||
|
||||
# Dimension reduction with OpenAI
|
||||
def get_reduced_embedding(text: str, dimensions: int = 512) -> List[float]:
    """Embed *text* at a reduced dimensionality (Matryoshka truncation)."""
    return get_embedding(text, model="text-embedding-3-small", dimensions=dimensions)
|
||||
```
|
||||
|
||||
### Template 2: Local Embeddings with Sentence Transformers
|
||||
|
||||
```python
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from typing import List, Optional
|
||||
import numpy as np
|
||||
|
||||
class LocalEmbedder:
    """Local embedding with sentence-transformers.

    Wraps a SentenceTransformer model and applies model-family prompt
    conventions (e.g. the BGE retrieval query prefix).
    """

    def __init__(
        self,
        model_name: str = "BAAI/bge-large-en-v1.5",
        device: str = "cuda"
    ):
        # Keep the name: prompt conventions are decided by model family,
        # which is detected from the model name.
        self.model_name = model_name
        self.model = SentenceTransformer(model_name, device=device)

    def embed(
        self,
        texts: List[str],
        normalize: bool = True,
        show_progress: bool = False
    ) -> np.ndarray:
        """Embed texts; normalized vectors make dot product equal cosine."""
        return self.model.encode(
            texts,
            normalize_embeddings=normalize,
            show_progress_bar=show_progress,
            convert_to_numpy=True
        )

    def embed_query(self, query: str) -> np.ndarray:
        """Embed a query, adding the BGE retrieval prefix when applicable."""
        # BUG FIX: the original tested `"bge" in
        # self.model.get_sentence_embedding_dimension()`, which is an int
        # and raises TypeError. Detect the family from the model *name*.
        if "bge" in self.model_name.lower():
            query = f"Represent this sentence for searching relevant passages: {query}"
        return self.embed([query])[0]

    def embed_documents(self, documents: List[str]) -> np.ndarray:
        """Embed documents for indexing (no prefix needed)."""
        return self.embed(documents)
|
||||
|
||||
|
||||
# E5 model with instructions
|
||||
class E5Embedder:
    """E5 embeddings: queries and passages use distinct instruction prefixes."""

    def __init__(self, model_name: str = "intfloat/multilingual-e5-large"):
        self.model = SentenceTransformer(model_name)

    def embed_query(self, query: str) -> np.ndarray:
        """Embed a search query with the E5 query prefix."""
        prefixed = f"query: {query}"
        return self.model.encode(prefixed)

    def embed_document(self, document: str) -> np.ndarray:
        """Embed a corpus passage with the E5 passage prefix."""
        prefixed = f"passage: {document}"
        return self.model.encode(prefixed)
|
||||
```
|
||||
|
||||
### Template 3: Chunking Strategies
|
||||
|
||||
```python
|
||||
from typing import List, Tuple
|
||||
import re
|
||||
|
||||
def chunk_by_tokens(
    text: str,
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    tokenizer=None
) -> List[str]:
    """Chunk text into overlapping token windows.

    Args:
        text: Source text.
        chunk_size: Maximum tokens per chunk.
        chunk_overlap: Tokens shared between consecutive chunks; must be
            smaller than chunk_size so the window always advances.
        tokenizer: Object exposing encode()/decode(); defaults to
            tiktoken's cl100k_base encoding.

    Returns:
        List of decoded text chunks (empty list for empty input).

    Raises:
        ValueError: If chunk_overlap >= chunk_size.
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    if tokenizer is None:
        # Imported lazily so callers supplying their own tokenizer do not
        # need tiktoken installed.
        import tiktoken
        tokenizer = tiktoken.get_encoding("cl100k_base")

    tokens = tokenizer.encode(text)
    chunks = []

    start = 0
    while start < len(tokens):
        end = start + chunk_size
        chunks.append(tokenizer.decode(tokens[start:end]))
        if end >= len(tokens):
            # The final window consumed the tail; stepping back by the
            # overlap (as the original did) would emit a redundant chunk
            # wholly contained in the previous one.
            break
        start = end - chunk_overlap

    return chunks
|
||||
|
||||
|
||||
def chunk_by_sentences(
    text: str,
    max_chunk_size: int = 1000,
    min_chunk_size: int = 100
) -> List[str]:
    """Chunk text on sentence boundaries, respecting size limits.

    Args:
        text: Source text.
        max_chunk_size: Soft cap in characters; a chunk is flushed before
            a sentence that would push it past this limit.
        min_chunk_size: Minimum length for the final chunk; a shorter
            trailing chunk is merged into its predecessor instead of
            being emitted as a tiny fragment.

    Returns:
        List of chunks, each a space-joined run of whole sentences.
    """
    import nltk
    sentences = nltk.sent_tokenize(text)

    chunks = []
    current_chunk = []
    current_size = 0

    for sentence in sentences:
        sentence_size = len(sentence)

        # Flush before overflowing, but never emit an empty chunk.
        if current_size + sentence_size > max_chunk_size and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_size = 0

        current_chunk.append(sentence)
        current_size += sentence_size

    if current_chunk:
        tail = " ".join(current_chunk)
        # FIX: the original accepted min_chunk_size but never used it.
        # Merge an undersized trailing chunk into the previous one.
        if chunks and len(tail) < min_chunk_size:
            chunks[-1] = f"{chunks[-1]} {tail}"
        else:
            chunks.append(tail)

    return chunks
|
||||
|
||||
|
||||
def chunk_by_semantic_sections(
    text: str,
    headers_pattern: str = r'^#{1,3}\s+.+$'
) -> List[Tuple[str, str]]:
    """Split markdown into (header, body) sections at h1-h3 headers.

    Content before the first header is returned under an empty header;
    a header with no body lines before the next header is dropped.
    """
    sections: List[Tuple[str, str]] = []
    header = ""
    body: List[str] = []

    def flush() -> None:
        # Only emit a section that accumulated at least one body line.
        if body:
            sections.append((header, '\n'.join(body)))

    for line in text.split('\n'):
        if re.match(headers_pattern, line, re.MULTILINE):
            flush()
            header = line
            body = []
        else:
            body.append(line)

    flush()
    return sections
|
||||
|
||||
|
||||
def recursive_character_splitter(
    text: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    separators: List[str] = None
) -> List[str]:
    """LangChain-style recursive splitter.

    Splits on the coarsest separator first ("\n\n"), and recursively
    re-splits any over-sized chunk with the next finer separator, down to
    a raw character-level split. Consecutive chunks share roughly
    `chunk_overlap` characters of trailing context.
    """
    # Default hierarchy: paragraphs -> lines -> sentences -> words -> chars.
    separators = separators or ["\n\n", "\n", ". ", " ", ""]

    def split_text(text: str, separators: List[str]) -> List[str]:
        if not text:
            return []

        separator = separators[0]
        remaining_separators = separators[1:]

        if separator == "":
            # Character-level split
            # NOTE(review): assumes chunk_overlap < chunk_size; a step of
            # zero or less here would raise ValueError in range().
            return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size - chunk_overlap)]

        splits = text.split(separator)
        chunks = []
        current_chunk = []
        current_length = 0

        for split in splits:
            # Length accounting includes the separator that will rejoin
            # the pieces, so it slightly over-estimates the final chunk.
            split_length = len(split) + len(separator)

            if current_length + split_length > chunk_size and current_chunk:
                chunk_text = separator.join(current_chunk)

                # Recursively split if still too large
                if len(chunk_text) > chunk_size and remaining_separators:
                    chunks.extend(split_text(chunk_text, remaining_separators))
                else:
                    chunks.append(chunk_text)

                # Start new chunk with overlap: carry over as many trailing
                # pieces of the flushed chunk as fit within chunk_overlap.
                overlap_splits = []
                overlap_length = 0
                for s in reversed(current_chunk):
                    if overlap_length + len(s) <= chunk_overlap:
                        overlap_splits.insert(0, s)
                        overlap_length += len(s)
                    else:
                        break
                current_chunk = overlap_splits
                current_length = overlap_length

            current_chunk.append(split)
            current_length += split_length

        # Flush whatever remains after the last split.
        if current_chunk:
            chunks.append(separator.join(current_chunk))

        return chunks

    return split_text(text, separators)
|
||||
```
|
||||
|
||||
### Template 4: Domain-Specific Embedding Pipeline
|
||||
|
||||
```python
|
||||
class DomainEmbeddingPipeline:
    """Pipeline for domain-specific embeddings.

    Cleans, chunks, and embeds raw documents into flat records suitable
    for insertion into a vector store.
    """

    def __init__(
        self,
        embedding_model: str = "text-embedding-3-small",
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        preprocessing_fn=None
    ):
        self.embedding_model = embedding_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Callers may inject their own cleaner; otherwise use the default.
        self.preprocess = preprocessing_fn or self._default_preprocess

    def _default_preprocess(self, text: str) -> str:
        """Collapse whitespace and strip unusual punctuation."""
        collapsed = re.sub(r'\s+', ' ', text)
        cleaned = re.sub(r'[^\w\s.,!?-]', '', collapsed)
        return cleaned.strip()

    async def process_documents(
        self,
        documents: List[dict],
        id_field: str = "id",
        content_field: str = "content",
        metadata_fields: List[str] = None
    ) -> List[dict]:
        """Turn raw documents into embedded chunk records.

        Each input document yields one record per chunk, carrying the
        chunk text, its embedding, and any requested metadata fields.
        """
        records: List[dict] = []

        for document in documents:
            doc_id = document[id_field]
            cleaned = self.preprocess(document[content_field])

            pieces = chunk_by_tokens(cleaned, self.chunk_size, self.chunk_overlap)
            vectors = get_embeddings(pieces, self.embedding_model)

            # Metadata fields are copied verbatim when present on the doc.
            extra = {
                field: document[field]
                for field in (metadata_fields or [])
                if field in document
            }

            for index, (piece, vector) in enumerate(zip(pieces, vectors)):
                records.append({
                    "id": f"{doc_id}_chunk_{index}",
                    "document_id": doc_id,
                    "chunk_index": index,
                    "text": piece,
                    "embedding": vector,
                    **extra,
                })

        return records
|
||||
|
||||
|
||||
# Code-specific pipeline
|
||||
class CodeEmbeddingPipeline:
    """Specialized pipeline for code embeddings."""

    def __init__(self, model: str = "voyage-code-2"):
        # Embedding model used for all chunks (Voyage's code-tuned model).
        self.model = model

    def chunk_code(self, code: str, language: str) -> List[dict]:
        """Chunk code by functions/classes.

        NOTE(review): placeholder — the tree-sitter based splitter is not
        implemented, so this currently returns None (the annotated
        List[dict] is aspirational).
        """
        import tree_sitter

        # Parse with tree-sitter
        # Extract functions, classes, methods
        # Return chunks with context
        pass

    def embed_with_context(self, chunk: str, context: str) -> List[float]:
        """Embed code with surrounding context.

        The caller-supplied context is prepended so the embedding captures
        how the snippet is used, not just its body.
        """
        combined = f"Context: {context}\n\nCode:\n{chunk}"
        return get_embedding(combined, model=self.model)
|
||||
```
|
||||
|
||||
### Template 5: Embedding Quality Evaluation
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
from typing import List, Tuple
|
||||
|
||||
def evaluate_retrieval_quality(
    queries: List[str],
    relevant_docs: List[List[str]],  # List of relevant doc IDs per query
    retrieved_docs: List[List[str]],  # List of retrieved doc IDs per query
    k: int = 10
) -> dict:
    """Score retrieval with precision@k, recall@k, MRR, and nDCG@k.

    Metrics are computed per query and averaged across all queries.
    """

    def _precision(gold: set, ranked: List[str]) -> float:
        # Fraction of the top-k that is relevant (denominator is always k).
        hits = len(set(ranked[:k]) & gold)
        return hits / k

    def _recall(gold: set, ranked: List[str]) -> float:
        hits = len(set(ranked[:k]) & gold)
        return hits / len(gold) if gold else 0

    def _mrr(gold: set, ranked: List[str]) -> float:
        # Reciprocal rank of the first relevant hit; 0 when none is found.
        for position, doc in enumerate(ranked):
            if doc in gold:
                return 1 / (position + 1)
        return 0

    def _ndcg(gold: set, ranked: List[str]) -> float:
        dcg = sum(
            1 / np.log2(position + 2) if doc in gold else 0
            for position, doc in enumerate(ranked[:k])
        )
        ideal = sum(1 / np.log2(position + 2) for position in range(min(len(gold), k)))
        return dcg / ideal if ideal > 0 else 0

    per_query = {
        f"precision@{k}": [],
        f"recall@{k}": [],
        "mrr": [],
        f"ndcg@{k}": []
    }

    for gold_list, ranked in zip(relevant_docs, retrieved_docs):
        gold = set(gold_list)
        per_query[f"precision@{k}"].append(_precision(gold, ranked))
        per_query[f"recall@{k}"].append(_recall(gold, ranked))
        per_query["mrr"].append(_mrr(gold, ranked))
        per_query[f"ndcg@{k}"].append(_ndcg(gold, ranked))

    return {name: np.mean(values) for name, values in per_query.items()}
|
||||
|
||||
|
||||
def compute_embedding_similarity(
    embeddings1: np.ndarray,
    embeddings2: np.ndarray,
    metric: str = "cosine"
) -> np.ndarray:
    """Compute a pairwise similarity matrix between two embedding sets.

    Args:
        embeddings1: Array of shape (n, d).
        embeddings2: Array of shape (m, d).
        metric: "cosine", "euclidean" (negated distance so larger means
            more similar), or "dot".

    Returns:
        (n, m) similarity matrix.

    Raises:
        ValueError: For an unrecognized metric. (The original silently
            returned None, hiding typos at the call site.)
    """
    if metric == "cosine":
        # Normalize rows so the dot product equals cosine similarity.
        norm1 = embeddings1 / np.linalg.norm(embeddings1, axis=1, keepdims=True)
        norm2 = embeddings2 / np.linalg.norm(embeddings2, axis=1, keepdims=True)
        return norm1 @ norm2.T
    elif metric == "euclidean":
        from scipy.spatial.distance import cdist
        # Negate so that higher values always mean "more similar".
        return -cdist(embeddings1, embeddings2, metric='euclidean')
    elif metric == "dot":
        return embeddings1 @ embeddings2.T
    raise ValueError(f"Unknown similarity metric: {metric!r}")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Do's
|
||||
- **Match model to use case** - Code vs prose vs multilingual
|
||||
- **Chunk thoughtfully** - Preserve semantic boundaries
|
||||
- **Normalize embeddings** - For cosine similarity
|
||||
- **Batch requests** - More efficient than one-by-one
|
||||
- **Cache embeddings** - Avoid recomputing
|
||||
|
||||
### Don'ts
|
||||
- **Don't ignore token limits** - Truncation loses info
|
||||
- **Don't mix embedding models** - Incompatible spaces
|
||||
- **Don't skip preprocessing** - Garbage in, garbage out
|
||||
- **Don't over-chunk** - Lose context
|
||||
|
||||
## Resources
|
||||
|
||||
- [OpenAI Embeddings](https://platform.openai.com/docs/guides/embeddings)
|
||||
- [Sentence Transformers](https://www.sbert.net/)
|
||||
- [MTEB Benchmark](https://huggingface.co/spaces/mteb/leaderboard)
|
||||
@@ -0,0 +1,568 @@
|
||||
---
|
||||
name: hybrid-search-implementation
|
||||
description: Combine vector and keyword search for improved retrieval. Use when implementing RAG systems, building search engines, or when neither approach alone provides sufficient recall.
|
||||
---
|
||||
|
||||
# Hybrid Search Implementation
|
||||
|
||||
Patterns for combining vector similarity and keyword-based search.
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
- Building RAG systems with improved recall
|
||||
- Combining semantic understanding with exact matching
|
||||
- Handling queries with specific terms (names, codes)
|
||||
- Improving search for domain-specific vocabulary
|
||||
- When pure vector search misses keyword matches
|
||||
|
||||
## Core Concepts
|
||||
|
||||
### 1. Hybrid Search Architecture
|
||||
|
||||
```
|
||||
Query → ┬─► Vector Search ──► Candidates ─┐
|
||||
│ │
|
||||
└─► Keyword Search ─► Candidates ─┴─► Fusion ─► Results
|
||||
```
|
||||
|
||||
### 2. Fusion Methods
|
||||
|
||||
| Method | Description | Best For |
|
||||
|--------|-------------|----------|
|
||||
| **RRF** | Reciprocal Rank Fusion | General purpose |
|
||||
| **Linear** | Weighted sum of scores | Tunable balance |
|
||||
| **Cross-encoder** | Rerank with neural model | Highest quality |
|
||||
| **Cascade** | Filter then rerank | Efficiency |
|
||||
|
||||
## Templates
|
||||
|
||||
### Template 1: Reciprocal Rank Fusion
|
||||
|
||||
```python
|
||||
from typing import List, Dict, Tuple
|
||||
from collections import defaultdict
|
||||
|
||||
def reciprocal_rank_fusion(
    result_lists: List[List[Tuple[str, float]]],
    k: int = 60,
    weights: List[float] = None
) -> List[Tuple[str, float]]:
    """
    Combine multiple ranked lists using Reciprocal Rank Fusion.

    Args:
        result_lists: List of (doc_id, score) tuples per search method
        k: RRF constant (higher = more weight to lower ranks)
        weights: Optional weights per result list

    Returns:
        Fused ranking as (doc_id, score) tuples
    """
    list_weights = weights if weights is not None else [1.0] * len(result_lists)

    fused = defaultdict(float)
    for ranking, weight in zip(result_lists, list_weights):
        for position, (doc_id, _) in enumerate(ranking, start=1):
            # RRF: each appearance contributes weight * 1 / (k + rank).
            fused[doc_id] += weight * (1.0 / (k + position))

    return sorted(fused.items(), key=lambda item: item[1], reverse=True)
|
||||
|
||||
|
||||
def linear_combination(
    vector_results: List[Tuple[str, float]],
    keyword_results: List[Tuple[str, float]],
    alpha: float = 0.5
) -> List[Tuple[str, float]]:
    """
    Blend vector and keyword rankings with linear interpolation.

    Scores from each list are min-max normalized to [0, 1] before mixing,
    so BM25 and cosine scales can be combined meaningfully.

    Args:
        vector_results: (doc_id, similarity_score) from vector search
        keyword_results: (doc_id, bm25_score) from keyword search
        alpha: Weight for vector search (1-alpha for keyword)
    """

    def _minmax(results: List[Tuple[str, float]]) -> Dict[str, float]:
        if not results:
            return {}
        values = [score for _, score in results]
        lo, hi = min(values), max(values)
        # Guard the degenerate all-equal case to avoid division by zero.
        span = hi - lo if hi != lo else 1
        return {doc_id: (score - lo) / span for doc_id, score in results}

    from_vector = _minmax(vector_results)
    from_keyword = _minmax(keyword_results)

    # A doc missing from one list simply contributes 0 from that side.
    blended = {
        doc_id: alpha * from_vector.get(doc_id, 0) + (1 - alpha) * from_keyword.get(doc_id, 0)
        for doc_id in set(from_vector) | set(from_keyword)
    }

    return sorted(blended.items(), key=lambda item: item[1], reverse=True)
|
||||
```
|
||||
|
||||
### Template 2: PostgreSQL Hybrid Search
|
||||
|
||||
```python
|
||||
import asyncpg
|
||||
from typing import List, Dict, Optional
|
||||
import numpy as np
|
||||
|
||||
class PostgresHybridSearch:
    """Hybrid search with pgvector and full-text search.

    Fuses an HNSW cosine-distance ranking and a Postgres full-text
    ranking with Reciprocal Rank Fusion (RRF), entirely inside SQL.
    """

    def __init__(self, pool: asyncpg.Pool):
        self.pool = pool

    async def setup_schema(self):
        """Create the documents table plus vector (HNSW) and FTS (GIN) indexes."""
        async with self.pool.acquire() as conn:
            await conn.execute("""
                CREATE EXTENSION IF NOT EXISTS vector;

                CREATE TABLE IF NOT EXISTS documents (
                    id TEXT PRIMARY KEY,
                    content TEXT NOT NULL,
                    embedding vector(1536),
                    metadata JSONB DEFAULT '{}',
                    ts_content tsvector GENERATED ALWAYS AS (
                        to_tsvector('english', content)
                    ) STORED
                );

                -- Vector index (HNSW)
                CREATE INDEX IF NOT EXISTS documents_embedding_idx
                ON documents USING hnsw (embedding vector_cosine_ops);

                -- Full-text index (GIN)
                CREATE INDEX IF NOT EXISTS documents_fts_idx
                ON documents USING gin (ts_content);
            """)

    async def hybrid_search(
        self,
        query: str,
        query_embedding: List[float],
        limit: int = 10,
        vector_weight: float = 0.5,
        filter_metadata: Optional[Dict] = None
    ) -> List[Dict]:
        """
        Perform hybrid search combining vector and full-text.

        Uses RRF fusion for combining results.

        Args:
            query: Keyword query text.
            query_embedding: Query embedding (must match the column dims).
            limit: Number of fused results to return.
            vector_weight: Weight of the vector ranking in the fusion.
            filter_metadata: Optional equality filters on JSONB metadata.
        """
        async with self.pool.acquire() as conn:
            # FIX: vector_weight is bound as $4 up front. The original
            # appended it *after* the metadata filter values, so whenever a
            # filter was present, the SQL's $4 silently bound the first
            # filter value instead of the weight.
            params: list = [query_embedding, query, limit * 3, vector_weight]
            where_clause = "1=1"

            if filter_metadata:
                for key, value in filter_metadata.items():
                    # FIX: both key and value are bound as parameters; the
                    # original interpolated the key into the SQL string,
                    # which was injectable for attacker-controlled keys.
                    params.append(key)
                    key_idx = len(params)
                    params.append(value)
                    where_clause += f" AND metadata->>${key_idx}::text = ${key_idx + 1}"

            results = await conn.fetch(f"""
                WITH vector_search AS (
                    SELECT
                        id,
                        content,
                        metadata,
                        ROW_NUMBER() OVER (ORDER BY embedding <=> $1::vector) as vector_rank,
                        1 - (embedding <=> $1::vector) as vector_score
                    FROM documents
                    WHERE {where_clause}
                    ORDER BY embedding <=> $1::vector
                    LIMIT $3
                ),
                keyword_search AS (
                    SELECT
                        id,
                        content,
                        metadata,
                        ROW_NUMBER() OVER (ORDER BY ts_rank(ts_content, websearch_to_tsquery('english', $2)) DESC) as keyword_rank,
                        ts_rank(ts_content, websearch_to_tsquery('english', $2)) as keyword_score
                    FROM documents
                    WHERE ts_content @@ websearch_to_tsquery('english', $2)
                        AND {where_clause}
                    ORDER BY ts_rank(ts_content, websearch_to_tsquery('english', $2)) DESC
                    LIMIT $3
                )
                SELECT
                    COALESCE(v.id, k.id) as id,
                    COALESCE(v.content, k.content) as content,
                    COALESCE(v.metadata, k.metadata) as metadata,
                    v.vector_score,
                    k.keyword_score,
                    -- RRF fusion
                    COALESCE(1.0 / (60 + v.vector_rank), 0) * $4::float +
                    COALESCE(1.0 / (60 + k.keyword_rank), 0) * (1 - $4::float) as rrf_score
                FROM vector_search v
                FULL OUTER JOIN keyword_search k ON v.id = k.id
                ORDER BY rrf_score DESC
                -- $3 is limit * 3 (the per-method candidate pool), so this
                -- returns `limit` fused rows.
                LIMIT $3 / 3
            """, *params)

            return [dict(row) for row in results]

    async def search_with_rerank(
        self,
        query: str,
        query_embedding: List[float],
        limit: int = 10,
        rerank_candidates: int = 50
    ) -> List[Dict]:
        """Hybrid search followed by cross-encoder reranking.

        Args:
            query: Keyword query text.
            query_embedding: Query embedding.
            limit: Number of reranked results to return.
            rerank_candidates: Candidate pool size fetched before reranking.
        """
        from sentence_transformers import CrossEncoder

        # Get candidates from the fused hybrid ranking
        candidates = await self.hybrid_search(
            query, query_embedding, limit=rerank_candidates
        )

        if not candidates:
            return []

        # Rerank with cross-encoder.
        # NOTE(review): the model is loaded on every call; cache it on the
        # instance if this method is hot.
        model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

        pairs = [(query, c["content"]) for c in candidates]
        scores = model.predict(pairs)

        for candidate, score in zip(candidates, scores):
            candidate["rerank_score"] = float(score)

        # Sort by rerank score and return top results
        reranked = sorted(candidates, key=lambda x: x["rerank_score"], reverse=True)
        return reranked[:limit]
|
||||
```
|
||||
|
||||
### Template 3: Elasticsearch Hybrid Search
|
||||
|
||||
```python
|
||||
from elasticsearch import Elasticsearch
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
class ElasticsearchHybridSearch:
    """Hybrid search with Elasticsearch and dense vectors.

    Offers two strategies: a bool-should blend of BM25 and script-scored
    cosine similarity, and Elasticsearch 8.x native RRF sub-searches.
    """

    def __init__(
        self,
        es_client: Elasticsearch,
        index_name: str = "documents"
    ):
        self.es = es_client
        self.index_name = index_name

    def create_index(self, vector_dims: int = 1536):
        """Create index with dense vector and text fields.

        Args:
            vector_dims: Dimensionality of the embedding column; must match
                the embedding model used at ingest time.
        """
        mapping = {
            "mappings": {
                "properties": {
                    "content": {
                        "type": "text",
                        "analyzer": "english"
                    },
                    "embedding": {
                        "type": "dense_vector",
                        "dims": vector_dims,
                        "index": True,
                        # Cosine similarity for the kNN index.
                        "similarity": "cosine"
                    },
                    "metadata": {
                        "type": "object",
                        "enabled": True
                    }
                }
            }
        }
        # ignore=400 swallows "index already exists" errors.
        # NOTE(review): `ignore=` and `body=` are deprecated in recent
        # elasticsearch-py clients — confirm against the pinned version.
        self.es.indices.create(index=self.index_name, body=mapping, ignore=400)

    def hybrid_search(
        self,
        query: str,
        query_embedding: List[float],
        limit: int = 10,
        boost_vector: float = 1.0,
        boost_text: float = 1.0,
        filter: Optional[Dict] = None
    ) -> List[Dict]:
        """
        Hybrid search using Elasticsearch's built-in capabilities.

        Combines a script-scored cosine similarity clause with a BM25
        match clause in one bool/should query; per-clause boosts control
        the balance. Returns hits as plain dicts with id/content/metadata/score.
        """
        # Build the hybrid query
        search_body = {
            "size": limit,
            "query": {
                "bool": {
                    "should": [
                        # Vector search (kNN)
                        {
                            "script_score": {
                                "query": {"match_all": {}},
                                "script": {
                                    # "+ 1.0" keeps the script score
                                    # non-negative (ES requires >= 0).
                                    "source": f"cosineSimilarity(params.query_vector, 'embedding') * {boost_vector} + 1.0",
                                    "params": {"query_vector": query_embedding}
                                }
                            }
                        },
                        # Text search (BM25)
                        {
                            "match": {
                                "content": {
                                    "query": query,
                                    "boost": boost_text
                                }
                            }
                        }
                    ],
                    # A hit may come from either clause alone.
                    "minimum_should_match": 1
                }
            }
        }

        # Add filter if provided
        if filter:
            search_body["query"]["bool"]["filter"] = filter

        response = self.es.search(index=self.index_name, body=search_body)

        return [
            {
                "id": hit["_id"],
                "content": hit["_source"]["content"],
                "metadata": hit["_source"].get("metadata", {}),
                "score": hit["_score"]
            }
            for hit in response["hits"]["hits"]
        ]

    def hybrid_search_rrf(
        self,
        query: str,
        query_embedding: List[float],
        limit: int = 10,
        window_size: int = 100
    ) -> List[Dict]:
        """
        Hybrid search using Elasticsearch 8.x RRF.

        Runs BM25 and kNN as separate sub-searches and lets the server
        fuse the rankings with Reciprocal Rank Fusion.
        """
        search_body = {
            "size": limit,
            "sub_searches": [
                {
                    "query": {
                        "match": {
                            "content": query
                        }
                    }
                },
                {
                    "query": {
                        "knn": {
                            "field": "embedding",
                            "query_vector": query_embedding,
                            "k": window_size,
                            # Larger candidate pool trades latency for recall.
                            "num_candidates": window_size * 2
                        }
                    }
                }
            ],
            "rank": {
                "rrf": {
                    "window_size": window_size,
                    # Standard RRF constant (same k=60 as the literature).
                    "rank_constant": 60
                }
            }
        }

        response = self.es.search(index=self.index_name, body=search_body)

        return [
            {
                "id": hit["_id"],
                "content": hit["_source"]["content"],
                "score": hit["_score"]
            }
            for hit in response["hits"]["hits"]
        ]
|
||||
```
|
||||
|
||||
### Template 4: Custom Hybrid RAG Pipeline
|
||||
|
||||
```python
|
||||
from typing import List, Dict, Optional, Callable
|
||||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
class SearchResult:
    """A single retrieval hit, tagged with which search path produced it."""

    id: str
    content: str
    score: float
    source: str  # "vector", "keyword", "hybrid"
    # FIX: Optional[...] instead of the original bare `Dict = None`,
    # which was an invalid implicit-Optional annotation.
    metadata: Optional[Dict] = None
|
||||
|
||||
|
||||
class HybridRAGPipeline:
|
||||
"""Complete hybrid search pipeline for RAG."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vector_store,
|
||||
keyword_store,
|
||||
embedder,
|
||||
reranker=None,
|
||||
fusion_method: str = "rrf",
|
||||
vector_weight: float = 0.5
|
||||
):
|
||||
self.vector_store = vector_store
|
||||
self.keyword_store = keyword_store
|
||||
self.embedder = embedder
|
||||
self.reranker = reranker
|
||||
self.fusion_method = fusion_method
|
||||
self.vector_weight = vector_weight
|
||||
|
||||
async def search(
|
||||
self,
|
||||
query: str,
|
||||
top_k: int = 10,
|
||||
filter: Optional[Dict] = None,
|
||||
use_rerank: bool = True
|
||||
) -> List[SearchResult]:
|
||||
"""Execute hybrid search pipeline."""
|
||||
|
||||
# Step 1: Get query embedding
|
||||
query_embedding = self.embedder.embed(query)
|
||||
|
||||
# Step 2: Execute parallel searches
|
||||
vector_results, keyword_results = await asyncio.gather(
|
||||
self._vector_search(query_embedding, top_k * 3, filter),
|
||||
self._keyword_search(query, top_k * 3, filter)
|
||||
)
|
||||
|
||||
# Step 3: Fuse results
|
||||
if self.fusion_method == "rrf":
|
||||
fused = self._rrf_fusion(vector_results, keyword_results)
|
||||
else:
|
||||
fused = self._linear_fusion(vector_results, keyword_results)
|
||||
|
||||
# Step 4: Rerank if enabled
|
||||
if use_rerank and self.reranker:
|
||||
fused = await self._rerank(query, fused[:top_k * 2])
|
||||
|
||||
return fused[:top_k]
|
||||
|
||||
async def _vector_search(
|
||||
self,
|
||||
embedding: List[float],
|
||||
limit: int,
|
||||
filter: Dict
|
||||
) -> List[SearchResult]:
|
||||
results = await self.vector_store.search(embedding, limit, filter)
|
||||
return [
|
||||
SearchResult(
|
||||
id=r["id"],
|
||||
content=r["content"],
|
||||
score=r["score"],
|
||||
source="vector",
|
||||
metadata=r.get("metadata")
|
||||
)
|
||||
for r in results
|
||||
]
|
||||
|
||||
async def _keyword_search(
|
||||
self,
|
||||
query: str,
|
||||
limit: int,
|
||||
filter: Dict
|
||||
) -> List[SearchResult]:
|
||||
results = await self.keyword_store.search(query, limit, filter)
|
||||
return [
|
||||
SearchResult(
|
||||
id=r["id"],
|
||||
content=r["content"],
|
||||
score=r["score"],
|
||||
source="keyword",
|
||||
metadata=r.get("metadata")
|
||||
)
|
||||
for r in results
|
||||
]
|
||||
|
||||
def _rrf_fusion(
|
||||
self,
|
||||
vector_results: List[SearchResult],
|
||||
keyword_results: List[SearchResult]
|
||||
) -> List[SearchResult]:
|
||||
"""Fuse with RRF."""
|
||||
k = 60
|
||||
scores = {}
|
||||
content_map = {}
|
||||
|
||||
for rank, result in enumerate(vector_results):
|
||||
scores[result.id] = scores.get(result.id, 0) + 1 / (k + rank + 1)
|
||||
content_map[result.id] = result
|
||||
|
||||
for rank, result in enumerate(keyword_results):
|
||||
scores[result.id] = scores.get(result.id, 0) + 1 / (k + rank + 1)
|
||||
if result.id not in content_map:
|
||||
content_map[result.id] = result
|
||||
|
||||
sorted_ids = sorted(scores.keys(), key=lambda x: scores[x], reverse=True)
|
||||
|
||||
return [
|
||||
SearchResult(
|
||||
id=doc_id,
|
||||
content=content_map[doc_id].content,
|
||||
score=scores[doc_id],
|
||||
source="hybrid",
|
||||
metadata=content_map[doc_id].metadata
|
||||
)
|
||||
for doc_id in sorted_ids
|
||||
]
|
||||
|
||||
async def _rerank(
    self,
    query: str,
    results: List[SearchResult]
) -> List[SearchResult]:
    """Re-score candidates with the configured cross-encoder, best-first.

    Mutates each result's score in place, then returns a new sorted list.
    """
    if not results:
        return results

    candidate_pairs = [(query, candidate.content) for candidate in results]
    new_scores = self.reranker.predict(candidate_pairs)

    for candidate, new_score in zip(results, new_scores):
        candidate.score = float(new_score)

    return sorted(results, key=lambda hit: hit.score, reverse=True)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Do's
|
||||
- **Tune weights empirically** - Test on your data
|
||||
- **Use RRF for simplicity** - Works well without tuning
|
||||
- **Add reranking** - Significant quality improvement
|
||||
- **Log both scores** - Helps with debugging
|
||||
- **A/B test** - Measure real user impact
|
||||
|
||||
### Don'ts
|
||||
- **Don't assume one size fits all** - Different queries need different weights
|
||||
- **Don't skip keyword search** - Handles exact matches better
|
||||
- **Don't over-fetch** - Balance recall vs latency
|
||||
- **Don't ignore edge cases** - Empty results, single word queries
|
||||
|
||||
## Resources
|
||||
|
||||
- [RRF Paper](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf)
|
||||
- [Vespa Hybrid Search](https://blog.vespa.ai/improving-text-ranking-with-few-shot-prompting/)
|
||||
- [Cohere Rerank](https://docs.cohere.com/docs/reranking)
|
||||
@@ -0,0 +1,558 @@
|
||||
---
|
||||
name: similarity-search-patterns
|
||||
description: Implement efficient similarity search with vector databases. Use when building semantic search, implementing nearest neighbor queries, or optimizing retrieval performance.
|
||||
---
|
||||
|
||||
# Similarity Search Patterns
|
||||
|
||||
Patterns for implementing efficient similarity search in production systems.
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
- Building semantic search systems
|
||||
- Implementing RAG retrieval
|
||||
- Creating recommendation engines
|
||||
- Optimizing search latency
|
||||
- Scaling to millions of vectors
|
||||
- Combining semantic and keyword search
|
||||
|
||||
## Core Concepts
|
||||
|
||||
### 1. Distance Metrics
|
||||
|
||||
| Metric | Formula | Best For |
|
||||
|--------|---------|----------|
|
||||
| **Cosine** | 1 - (A·B)/(‖A‖‖B‖) | Normalized embeddings |
|
||||
| **Euclidean (L2)** | √Σ(a-b)² | Raw embeddings |
|
||||
| **Dot Product** | A·B | Magnitude matters |
|
||||
| **Manhattan (L1)** | Σ\|a-b\| | Sparse vectors |
|
||||
|
||||
### 2. Index Types
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────┐
|
||||
│ Index Types │
|
||||
├─────────────┬───────────────┬───────────────────┤
|
||||
│ Flat │ HNSW │ IVF+PQ │
|
||||
│ (Exact) │ (Graph-based) │ (Quantized) │
|
||||
├─────────────┼───────────────┼───────────────────┤
|
||||
│ O(n) search │ O(log n) │ O(√n) │
|
||||
│ 100% recall │ ~95-99% │ ~90-95% │
|
||||
│ Small data │ Medium-Large │ Very Large │
|
||||
└─────────────┴───────────────┴───────────────────┘
|
||||
```
|
||||
|
||||
## Templates
|
||||
|
||||
### Template 1: Pinecone Implementation
|
||||
|
||||
```python
|
||||
from pinecone import Pinecone, ServerlessSpec
|
||||
from typing import List, Dict, Optional
|
||||
import hashlib
|
||||
|
||||
class PineconeVectorStore:
    """Vector CRUD and search on a Pinecone serverless index.

    Creates the index on first use if it does not already exist. The
    cross-encoder used for reranking is loaded lazily and cached on the
    instance so repeated rerank calls do not reload the model.
    """

    def __init__(
        self,
        api_key: str,
        index_name: str,
        dimension: int = 1536,
        metric: str = "cosine"
    ):
        self.pc = Pinecone(api_key=api_key)

        # Create index if not exists
        if index_name not in self.pc.list_indexes().names():
            self.pc.create_index(
                name=index_name,
                dimension=dimension,
                metric=metric,
                spec=ServerlessSpec(cloud="aws", region="us-east-1")
            )

        self.index = self.pc.Index(index_name)
        # Lazily-initialized cross-encoder used by search_with_rerank.
        self._cross_encoder = None

    def upsert(
        self,
        vectors: List[Dict],
        namespace: str = ""
    ) -> int:
        """
        Upsert vectors in batches of 100.
        vectors: [{"id": str, "values": List[float], "metadata": dict}]
        Returns the number of vectors written.
        """
        batch_size = 100
        total = 0

        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i + batch_size]
            self.index.upsert(vectors=batch, namespace=namespace)
            total += len(batch)

        return total

    def search(
        self,
        query_vector: List[float],
        top_k: int = 10,
        namespace: str = "",
        filter: Optional[Dict] = None,
        include_metadata: bool = True
    ) -> List[Dict]:
        """Search for similar vectors; returns [{"id", "score", "metadata"}, ...]."""
        results = self.index.query(
            vector=query_vector,
            top_k=top_k,
            namespace=namespace,
            filter=filter,
            include_metadata=include_metadata
        )

        return [
            {
                "id": match.id,
                "score": match.score,
                "metadata": match.metadata
            }
            for match in results.matches
        ]

    def search_with_rerank(
        self,
        query: str,
        query_vector: List[float],
        top_k: int = 10,
        rerank_top_n: int = 50,
        namespace: str = ""
    ) -> List[Dict]:
        """Over-fetch rerank_top_n candidates, rerank them, return the top_k."""
        # Over-fetch for reranking
        initial_results = self.search(
            query_vector,
            top_k=rerank_top_n,
            namespace=namespace
        )

        # Rerank with cross-encoder
        reranked = self._rerank(query, initial_results)

        return reranked[:top_k]

    def _rerank(self, query: str, results: List[Dict]) -> List[Dict]:
        """Rerank results using a cached cross-encoder.

        NOTE(review): assumes each result's metadata carries a "text" field —
        confirm this matches the upsert path.
        """
        if self._cross_encoder is None:
            # Fix: the model was previously re-loaded on every call, which is
            # expensive; load once per instance and reuse.
            from sentence_transformers import CrossEncoder
            self._cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

        pairs = [(query, r["metadata"]["text"]) for r in results]
        scores = self._cross_encoder.predict(pairs)

        for result, score in zip(results, scores):
            result["rerank_score"] = float(score)

        return sorted(results, key=lambda x: x["rerank_score"], reverse=True)

    def delete(self, ids: List[str], namespace: str = ""):
        """Delete vectors by ID."""
        self.index.delete(ids=ids, namespace=namespace)

    def delete_by_filter(self, filter: Dict, namespace: str = ""):
        """Delete vectors matching a metadata filter."""
        self.index.delete(filter=filter, namespace=namespace)
|
||||
```
|
||||
|
||||
### Template 2: Qdrant Implementation
|
||||
|
||||
```python
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
class QdrantVectorStore:
    """Qdrant-backed store with INT8 quantization and filtered search."""

    def __init__(
        self,
        url: str = "localhost",
        port: int = 6333,
        collection_name: str = "documents",
        vector_size: int = 1536
    ):
        self.client = QdrantClient(url=url, port=port)
        self.collection_name = collection_name

        # Create collection if not exists
        collections = self.client.get_collections().collections
        if collection_name not in [c.name for c in collections]:
            self.client.create_collection(
                collection_name=collection_name,
                vectors_config=models.VectorParams(
                    size=vector_size,
                    distance=models.Distance.COSINE
                ),
                # Optional: enable quantization for memory efficiency
                quantization_config=models.ScalarQuantization(
                    scalar=models.ScalarQuantizationConfig(
                        type=models.ScalarType.INT8,
                        quantile=0.99,
                        always_ram=True
                    )
                )
            )

    def upsert(self, points: List[Dict]) -> int:
        """
        Upsert points.
        points: [{"id": str/int, "vector": List[float], "payload": dict}]
        Returns the number of points written.
        """
        qdrant_points = [
            models.PointStruct(
                id=p["id"],
                vector=p["vector"],
                payload=p.get("payload", {})
            )
            for p in points
        ]

        self.client.upsert(
            collection_name=self.collection_name,
            points=qdrant_points
        )
        return len(points)

    def search(
        self,
        query_vector: List[float],
        limit: int = 10,
        filter: Optional[models.Filter] = None,
        score_threshold: Optional[float] = None
    ) -> List[Dict]:
        """Search for similar vectors, optionally filtered and score-thresholded."""
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            limit=limit,
            query_filter=filter,
            score_threshold=score_threshold
        )

        return [
            {
                "id": r.id,
                "score": r.score,
                "payload": r.payload
            }
            for r in results
        ]

    def search_with_filter(
        self,
        query_vector: List[float],
        must_conditions: List[Dict] = None,
        should_conditions: List[Dict] = None,
        must_not_conditions: List[Dict] = None,
        limit: int = 10
    ) -> List[Dict]:
        """Search with complex filters.

        Each condition dict is {"key": <payload field>, "value": <exact match>}.
        Fix: should_conditions and must_not_conditions were previously accepted
        but silently ignored; they are now applied.
        """
        def to_field_conditions(specs: Optional[List[Dict]]):
            # Translate {"key", "value"} dicts into Qdrant exact-match conditions.
            return [
                models.FieldCondition(
                    key=spec["key"],
                    match=models.MatchValue(value=spec["value"])
                )
                for spec in (specs or [])
            ]

        must = to_field_conditions(must_conditions)
        should = to_field_conditions(should_conditions)
        must_not = to_field_conditions(must_not_conditions)

        if must or should or must_not:
            query_filter = models.Filter(
                must=must or None,
                should=should or None,
                must_not=must_not or None
            )
        else:
            query_filter = None

        return self.search(query_vector, limit=limit, filter=query_filter)

    def search_with_sparse(
        self,
        dense_vector: List[float],
        sparse_vector: Dict[int, float],
        limit: int = 10,
        dense_weight: float = 0.7
    ) -> List[Dict]:
        """Hybrid search with dense and sparse vectors.

        NOTE(review): only the dense leg is executed — sparse_vector and
        dense_weight are currently unused. True hybrid fusion requires the
        collection to be created with named dense + sparse vectors.
        """
        # Requires collection with named vectors
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=models.NamedVector(
                name="dense",
                vector=dense_vector
            ),
            limit=limit
        )
        return [{"id": r.id, "score": r.score, "payload": r.payload} for r in results]
|
||||
```
|
||||
|
||||
### Template 3: pgvector with PostgreSQL
|
||||
|
||||
```python
|
||||
import asyncpg
|
||||
from typing import List, Dict, Optional
|
||||
import numpy as np
|
||||
|
||||
class PgVectorStore:
    """PostgreSQL + pgvector document store with vector, filtered and hybrid search."""

    def __init__(self, connection_string: str):
        self.connection_string = connection_string

    @staticmethod
    def _vector_literal(vec) -> str:
        """Format an embedding in pgvector's text input form, e.g. '[0.1,0.2]'.

        asyncpg has no built-in codec for the vector type, so embeddings are
        passed as text and cast with ::vector in the SQL.
        """
        return "[" + ",".join(str(float(x)) for x in vec) + "]"

    async def init(self):
        """Create the connection pool, extension, table and HNSW index."""
        self.pool = await asyncpg.create_pool(self.connection_string)

        async with self.pool.acquire() as conn:
            # Enable extension
            await conn.execute("CREATE EXTENSION IF NOT EXISTS vector")

            # Create table
            await conn.execute("""
                CREATE TABLE IF NOT EXISTS documents (
                    id TEXT PRIMARY KEY,
                    content TEXT,
                    metadata JSONB,
                    embedding vector(1536)
                )
            """)

            # Create index (HNSW for better performance)
            await conn.execute("""
                CREATE INDEX IF NOT EXISTS documents_embedding_idx
                ON documents
                USING hnsw (embedding vector_cosine_ops)
                WITH (m = 16, ef_construction = 64)
            """)

    async def upsert(self, documents: List[Dict]):
        """Upsert documents with embeddings.

        Fix: metadata is JSON-encoded explicitly (asyncpg has no default
        dict -> jsonb codec) and embeddings are sent as pgvector text literals
        with explicit casts, instead of raw Python containers.
        """
        import json

        async with self.pool.acquire() as conn:
            await conn.executemany(
                """
                INSERT INTO documents (id, content, metadata, embedding)
                VALUES ($1, $2, $3::jsonb, $4::vector)
                ON CONFLICT (id) DO UPDATE SET
                    content = EXCLUDED.content,
                    metadata = EXCLUDED.metadata,
                    embedding = EXCLUDED.embedding
                """,
                [
                    (
                        doc["id"],
                        doc["content"],
                        json.dumps(doc.get("metadata", {})),
                        self._vector_literal(doc["embedding"])
                    )
                    for doc in documents
                ]
            )

    async def search(
        self,
        query_embedding: List[float],
        limit: int = 10,
        filter_metadata: Optional[Dict] = None
    ) -> List[Dict]:
        """Cosine-similarity search with optional exact-match metadata filters.

        Raises ValueError if a filter key is not identifier-like: keys are
        interpolated into the SQL text (values are always parameterized), so
        unrestricted keys would allow SQL injection.
        """
        query = """
            SELECT id, content, metadata,
                   1 - (embedding <=> $1::vector) as similarity
            FROM documents
        """

        params: list = [self._vector_literal(query_embedding)]

        if filter_metadata:
            conditions = []
            for key, value in filter_metadata.items():
                # Security fix: restrict interpolated keys to identifiers.
                if not key.isidentifier():
                    raise ValueError(f"Unsafe metadata filter key: {key!r}")
                params.append(value)
                conditions.append(f"metadata->>'{key}' = ${len(params)}")
            query += " WHERE " + " AND ".join(conditions)

        query += f" ORDER BY embedding <=> $1::vector LIMIT ${len(params) + 1}"
        params.append(limit)

        async with self.pool.acquire() as conn:
            rows = await conn.fetch(query, *params)

        return [
            {
                "id": row["id"],
                "content": row["content"],
                "metadata": row["metadata"],
                "score": row["similarity"]
            }
            for row in rows
        ]

    async def hybrid_search(
        self,
        query_embedding: List[float],
        query_text: str,
        limit: int = 10,
        vector_weight: float = 0.5
    ) -> List[Dict]:
        """Hybrid search: weighted blend of cosine similarity and ts_rank full-text score."""
        async with self.pool.acquire() as conn:
            rows = await conn.fetch(
                """
                WITH vector_results AS (
                    SELECT id, content, metadata,
                           1 - (embedding <=> $1::vector) as vector_score
                    FROM documents
                    ORDER BY embedding <=> $1::vector
                    LIMIT $3 * 2
                ),
                text_results AS (
                    SELECT id, content, metadata,
                           ts_rank(to_tsvector('english', content),
                                   plainto_tsquery('english', $2)) as text_score
                    FROM documents
                    WHERE to_tsvector('english', content) @@ plainto_tsquery('english', $2)
                    LIMIT $3 * 2
                )
                SELECT
                    COALESCE(v.id, t.id) as id,
                    COALESCE(v.content, t.content) as content,
                    COALESCE(v.metadata, t.metadata) as metadata,
                    COALESCE(v.vector_score, 0) * $4 +
                    COALESCE(t.text_score, 0) * (1 - $4) as combined_score
                FROM vector_results v
                FULL OUTER JOIN text_results t ON v.id = t.id
                ORDER BY combined_score DESC
                LIMIT $3
                """,
                self._vector_literal(query_embedding), query_text, limit, vector_weight
            )

        return [dict(row) for row in rows]
|
||||
```
|
||||
|
||||
### Template 4: Weaviate Implementation
|
||||
|
||||
```python
|
||||
import weaviate
|
||||
from weaviate.util import generate_uuid5
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
class WeaviateVectorStore:
    """Weaviate-backed store that manages its own schema and supplies vectors
    externally (the built-in vectorizer is disabled)."""

    def __init__(
        self,
        url: str = "http://localhost:8080",
        class_name: str = "Document"
    ):
        self.client = weaviate.Client(url=url)
        self.class_name = class_name
        self._ensure_schema()

    def _ensure_schema(self):
        """Create the class schema on first use; no-op if it already exists."""
        if self.client.schema.exists(self.class_name):
            return

        self.client.schema.create_class({
            "class": self.class_name,
            "vectorizer": "none",  # We provide vectors
            "properties": [
                {"name": "content", "dataType": ["text"]},
                {"name": "source", "dataType": ["string"]},
                {"name": "chunk_id", "dataType": ["int"]}
            ]
        })

    def upsert(self, documents: List[Dict]):
        """Batch-insert documents; UUIDs are derived deterministically from doc["id"]."""
        with self.client.batch as writer:
            writer.batch_size = 100

            for doc in documents:
                writer.add_data_object(
                    data_object={
                        "content": doc["content"],
                        "source": doc.get("source", ""),
                        "chunk_id": doc.get("chunk_id", 0)
                    },
                    class_name=self.class_name,
                    uuid=generate_uuid5(doc["id"]),
                    vector=doc["embedding"]
                )

    def search(
        self,
        query_vector: List[float],
        limit: int = 10,
        where_filter: Optional[Dict] = None
    ) -> List[Dict]:
        """Pure vector search; score is reported as 1 - distance."""
        builder = (
            self.client.query
            .get(self.class_name, ["content", "source", "chunk_id"])
            .with_near_vector({"vector": query_vector})
            .with_limit(limit)
            .with_additional(["distance", "id"])
        )

        if where_filter:
            builder = builder.with_where(where_filter)

        response = builder.do()

        hits = []
        for item in response["data"]["Get"][self.class_name]:
            extra = item["_additional"]
            hits.append({
                "id": extra["id"],
                "content": item["content"],
                "source": item["source"],
                "score": 1 - extra["distance"]
            })
        return hits

    def hybrid_search(
        self,
        query: str,
        query_vector: List[float],
        limit: int = 10,
        alpha: float = 0.5  # 0 = keyword, 1 = vector
    ) -> List[Dict]:
        """Hybrid search combining BM25 and vector scores via Weaviate's fusion."""
        response = (
            self.client.query
            .get(self.class_name, ["content", "source"])
            .with_hybrid(query=query, vector=query_vector, alpha=alpha)
            .with_limit(limit)
            .with_additional(["score"])
            .do()
        )

        hits = []
        for item in response["data"]["Get"][self.class_name]:
            hits.append({
                "content": item["content"],
                "source": item["source"],
                "score": item["_additional"]["score"]
            })
        return hits
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Do's
|
||||
- **Use appropriate index** - HNSW for most cases
|
||||
- **Tune parameters** - ef_search, nprobe for recall/speed
|
||||
- **Implement hybrid search** - Combine with keyword search
|
||||
- **Monitor recall** - Measure search quality
|
||||
- **Pre-filter when possible** - Reduce search space
|
||||
|
||||
### Don'ts
|
||||
- **Don't skip evaluation** - Measure before optimizing
|
||||
- **Don't over-index** - Start with flat, scale up
|
||||
- **Don't ignore latency** - P99 matters for UX
|
||||
- **Don't forget costs** - Vector storage adds up
|
||||
|
||||
## Resources
|
||||
|
||||
- [Pinecone Docs](https://docs.pinecone.io/)
|
||||
- [Qdrant Docs](https://qdrant.tech/documentation/)
|
||||
- [pgvector](https://github.com/pgvector/pgvector)
|
||||
- [Weaviate Docs](https://weaviate.io/developers/weaviate)
|
||||
521
plugins/llm-application-dev/skills/vector-index-tuning/SKILL.md
Normal file
521
plugins/llm-application-dev/skills/vector-index-tuning/SKILL.md
Normal file
@@ -0,0 +1,521 @@
|
||||
---
|
||||
name: vector-index-tuning
|
||||
description: Optimize vector index performance for latency, recall, and memory. Use when tuning HNSW parameters, selecting quantization strategies, or scaling vector search infrastructure.
|
||||
---
|
||||
|
||||
# Vector Index Tuning
|
||||
|
||||
Guide to optimizing vector indexes for production performance.
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
- Tuning HNSW parameters
|
||||
- Implementing quantization
|
||||
- Optimizing memory usage
|
||||
- Reducing search latency
|
||||
- Balancing recall vs speed
|
||||
- Scaling to billions of vectors
|
||||
|
||||
## Core Concepts
|
||||
|
||||
### 1. Index Type Selection
|
||||
|
||||
```
|
||||
Data Size Recommended Index
|
||||
────────────────────────────────────────
|
||||
< 10K vectors → Flat (exact search)
|
||||
10K - 1M → HNSW
|
||||
1M - 100M → HNSW + Quantization
|
||||
> 100M → IVF + PQ or DiskANN
|
||||
```
|
||||
|
||||
### 2. HNSW Parameters
|
||||
|
||||
| Parameter | Default | Effect |
|
||||
|-----------|---------|--------|
|
||||
| **M** | 16 | Connections per node, ↑ = better recall, more memory |
|
||||
| **efConstruction** | 100 | Build quality, ↑ = better index, slower build |
|
||||
| **efSearch** | 50 | Search quality, ↑ = better recall, slower search |
|
||||
|
||||
### 3. Quantization Types
|
||||
|
||||
```
|
||||
Full Precision (FP32): 4 bytes × dimensions
|
||||
Half Precision (FP16): 2 bytes × dimensions
|
||||
INT8 Scalar: 1 byte × dimensions
|
||||
Product Quantization: ~32-64 bytes total
|
||||
Binary: dimensions/8 bytes
|
||||
```
|
||||
|
||||
## Templates
|
||||
|
||||
### Template 1: HNSW Parameter Tuning
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
from typing import List, Tuple
|
||||
import time
|
||||
|
||||
def benchmark_hnsw_parameters(
    vectors: np.ndarray,
    queries: np.ndarray,
    ground_truth: np.ndarray,
    m_values: List[int] = [8, 16, 32, 64],
    ef_construction_values: List[int] = [64, 128, 256],
    ef_search_values: List[int] = [32, 64, 128, 256]
) -> List[dict]:
    """Grid-search HNSW build/search parameters.

    For every (M, ef_construction) pair an index is built once, then probed at
    each ef_search value. Reports build time, per-query latency, recall@10 and
    an approximate memory footprint per configuration.
    """
    import hnswlib

    rows: List[dict] = []
    num_points, dim = vectors.shape

    for m in m_values:
        for ef_construct in ef_construction_values:
            index = hnswlib.Index(space='cosine', dim=dim)
            index.init_index(max_elements=num_points, M=m, ef_construction=ef_construct)

            t0 = time.time()
            index.add_items(vectors)
            build_seconds = time.time() - t0

            # Rough footprint: raw fp32 vectors plus ~M*2 int32 edges per node.
            approx_bytes = index.element_count * (dim * 4 + m * 2 * 4)

            for ef_search in ef_search_values:
                index.set_ef(ef_search)

                t0 = time.time()
                labels, _ = index.knn_query(queries, k=10)
                elapsed = time.time() - t0

                rows.append({
                    "M": m,
                    "ef_construction": ef_construct,
                    "ef_search": ef_search,
                    "build_time_s": build_seconds,
                    "search_time_ms": elapsed * 1000 / len(queries),
                    "recall@10": calculate_recall(labels, ground_truth, k=10),
                    "memory_mb": approx_bytes / 1024 / 1024
                })

    return rows
|
||||
|
||||
|
||||
def calculate_recall(predictions: np.ndarray, ground_truth: np.ndarray, k: int) -> float:
    """Mean recall@k: fraction of true top-k neighbours recovered per query."""
    overlap = sum(
        len(set(found[:k]) & set(expected[:k]))
        for found, expected in zip(predictions, ground_truth)
    )
    return overlap / (len(predictions) * k)
|
||||
|
||||
|
||||
def recommend_hnsw_params(
    num_vectors: int,
    target_recall: float = 0.95,
    max_latency_ms: float = 10,
    available_memory_gb: float = 8
) -> dict:
    """Suggest HNSW build/search parameters for a corpus size and recall target.

    NOTE(review): max_latency_ms and available_memory_gb are accepted for API
    stability but do not currently influence the recommendation.
    """
    # Build-time parameters scale with corpus size.
    size_tiers = (
        (100_000, 16, 100),
        (1_000_000, 32, 200),
    )
    for ceiling, tier_m, tier_ef in size_tiers:
        if num_vectors < ceiling:
            m, ef_construction = tier_m, tier_ef
            break
    else:
        m, ef_construction = 48, 256

    # Search-time effort scales with the recall target.
    recall_tiers = ((0.99, 256), (0.95, 128))
    ef_search = next(
        (ef for floor, ef in recall_tiers if target_recall >= floor),
        64
    )

    return {
        "M": m,
        "ef_construction": ef_construction,
        "ef_search": ef_search,
        "notes": f"Estimated for {num_vectors:,} vectors, {target_recall:.0%} recall"
    }
|
||||
```
|
||||
|
||||
### Template 2: Quantization Strategies
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
from typing import Optional
|
||||
|
||||
class VectorQuantizer:
    """Quantization strategies for vector compression.

    All methods are stateless staticmethods; where a transform is invertible,
    the parameters needed for dequantization are returned alongside the codes.

    Fix: annotations use builtin ``tuple[...]`` because ``typing.Tuple`` is not
    imported in this module (the original raised NameError at class creation).
    """

    @staticmethod
    def scalar_quantize_int8(
        vectors: np.ndarray,
        min_val: Optional[float] = None,
        max_val: Optional[float] = None
    ) -> tuple[np.ndarray, dict]:
        """Scalar quantization to INT8 (uint8 codes 0-255).

        Returns (codes, params); params carries min/max/scale for
        dequantize_int8.
        """
        if min_val is None:
            min_val = vectors.min()
        if max_val is None:
            max_val = vectors.max()

        # Fix: guard degenerate (constant) inputs — a zero span would divide
        # by zero. scale=1 maps everything to code 0, which dequantizes back
        # to min_val exactly.
        span = max_val - min_val
        scale = 255.0 / span if span else 1.0

        # Scale to 0-255 range
        quantized = np.clip(
            np.round((vectors - min_val) * scale),
            0, 255
        ).astype(np.uint8)

        params = {"min_val": min_val, "max_val": max_val, "scale": scale}
        return quantized, params

    @staticmethod
    def dequantize_int8(
        quantized: np.ndarray,
        params: dict
    ) -> np.ndarray:
        """Invert scalar_quantize_int8 (lossy: one quantization step of error)."""
        return quantized.astype(np.float32) / params["scale"] + params["min_val"]

    @staticmethod
    def product_quantize(
        vectors: np.ndarray,
        n_subvectors: int = 8,
        n_centroids: int = 256
    ) -> tuple[np.ndarray, dict]:
        """Product quantization for aggressive compression.

        Splits each vector into n_subvectors slices and k-means-codes each
        slice with n_centroids centroids. Raises ValueError (not assert, which
        vanishes under -O) when the dimension is not divisible.
        """
        from sklearn.cluster import KMeans

        n, dim = vectors.shape
        if dim % n_subvectors != 0:
            raise ValueError(
                f"dimension {dim} is not divisible by n_subvectors {n_subvectors}"
            )
        subvector_dim = dim // n_subvectors

        codebooks = []
        codes = np.zeros((n, n_subvectors), dtype=np.uint8)

        for i in range(n_subvectors):
            start = i * subvector_dim
            end = (i + 1) * subvector_dim
            subvectors = vectors[:, start:end]

            kmeans = KMeans(n_clusters=n_centroids, random_state=42)
            codes[:, i] = kmeans.fit_predict(subvectors)
            codebooks.append(kmeans.cluster_centers_)

        params = {
            "codebooks": codebooks,
            "n_subvectors": n_subvectors,
            "subvector_dim": subvector_dim
        }
        return codes, params

    @staticmethod
    def binary_quantize(vectors: np.ndarray) -> np.ndarray:
        """Binary quantization (sign of each dimension), bit-packed into bytes."""
        # Convert to binary: positive = 1, negative = 0
        binary = (vectors > 0).astype(np.uint8)

        # Pack bits into bytes (dimension i -> bit i%8 of byte i//8).
        n, dim = vectors.shape
        packed_dim = (dim + 7) // 8

        packed = np.zeros((n, packed_dim), dtype=np.uint8)
        for i in range(dim):
            byte_idx = i // 8
            bit_idx = i % 8
            packed[:, byte_idx] |= (binary[:, i] << bit_idx)

        return packed
|
||||
|
||||
|
||||
def estimate_memory_usage(
    num_vectors: int,
    dimensions: int,
    quantization: str = "fp32",
    index_type: str = "hnsw",
    hnsw_m: int = 16
) -> dict:
    """Rough memory footprint for a vector index configuration.

    Returns vector storage, index overhead and totals in MB/GB. Unknown
    quantization names raise KeyError; unknown index types contribute zero
    overhead.
    """
    BYTES_PER_DIM = {
        "fp32": 4,
        "fp16": 2,
        "int8": 1,
        "pq": 0.05,  # Approximate
        "binary": 0.125
    }

    vector_bytes = num_vectors * dimensions * BYTES_PER_DIM[quantization]

    if index_type == "hnsw":
        # Each node has ~M*2 edges, each edge is 4 bytes (int32)
        index_bytes = num_vectors * hnsw_m * 2 * 4
    elif index_type == "ivf":
        # Inverted lists + centroids
        index_bytes = num_vectors * 8 + 65536 * dimensions * 4
    else:
        index_bytes = 0

    total_bytes = vector_bytes + index_bytes
    MB = 1024 * 1024

    return {
        "vector_storage_mb": vector_bytes / MB,
        "index_overhead_mb": index_bytes / MB,
        "total_mb": total_bytes / MB,
        "total_gb": total_bytes / MB / 1024
    }
|
||||
```
|
||||
|
||||
### Template 3: Qdrant Index Configuration
|
||||
|
||||
```python
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models
|
||||
|
||||
def create_optimized_collection(
    client: QdrantClient,
    collection_name: str,
    vector_size: int,
    num_vectors: int,
    optimize_for: str = "balanced"  # "recall", "speed", "memory"
) -> None:
    """Create a Qdrant collection tuned for a recall/speed/balanced/memory profile.

    NOTE(review): num_vectors is accepted but does not currently affect the
    chosen parameters.
    """

    def int8_quant(always_ram: bool):
        # INT8 scalar quantization at the 0.99 quantile.
        return models.ScalarQuantization(
            scalar=models.ScalarQuantizationConfig(
                type=models.ScalarType.INT8,
                quantile=0.99,
                always_ram=always_ram
            )
        )

    # Per-profile (hnsw, quantization, optimizer) settings.
    profiles = {
        "recall": (
            models.HnswConfigDiff(m=32, ef_construct=256),
            None,  # No quantization for max recall
            models.OptimizersConfigDiff(
                indexing_threshold=10000,
                memmap_threshold=50000
            )
        ),
        "speed": (
            models.HnswConfigDiff(m=16, ef_construct=64),
            int8_quant(True),
            models.OptimizersConfigDiff(
                indexing_threshold=5000,
                memmap_threshold=20000
            )
        ),
        "balanced": (
            models.HnswConfigDiff(m=16, ef_construct=128),
            int8_quant(False),
            models.OptimizersConfigDiff(
                indexing_threshold=20000,
                memmap_threshold=50000
            )
        ),
        "memory": (
            models.HnswConfigDiff(m=8, ef_construct=64),
            models.ProductQuantization(
                product=models.ProductQuantizationConfig(
                    compression=models.CompressionRatio.X16,
                    always_ram=False
                )
            ),
            models.OptimizersConfigDiff(
                indexing_threshold=50000,
                memmap_threshold=10000  # Use disk sooner
            )
        ),
    }

    hnsw_cfg, quant_cfg, opt_cfg = profiles[optimize_for]

    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=vector_size,
            distance=models.Distance.COSINE
        ),
        hnsw_config=hnsw_cfg,
        quantization_config=quant_cfg,
        optimizers_config=opt_cfg
    )
|
||||
|
||||
|
||||
def tune_search_parameters(
    client: QdrantClient,
    collection_name: str,
    target_recall: float = 0.95
) -> dict:
    """Pick Qdrant SearchParams for a recall target.

    Higher targets raise hnsw_ef and enable rescoring; the highest tier skips
    quantized vectors entirely. NOTE(review): client and collection_name are
    accepted but not consulted by the current heuristic.
    """
    if target_recall >= 0.99:
        return models.SearchParams(
            hnsw_ef=256,
            exact=False,
            quantization=models.QuantizationSearchParams(
                ignore=True,  # Don't use quantization for search
                rescore=True
            )
        )

    if target_recall >= 0.95:
        return models.SearchParams(
            hnsw_ef=128,
            exact=False,
            quantization=models.QuantizationSearchParams(
                ignore=False,
                rescore=True,
                oversampling=2.0
            )
        )

    return models.SearchParams(
        hnsw_ef=64,
        exact=False,
        quantization=models.QuantizationSearchParams(
            ignore=False,
            rescore=False
        )
    )
|
||||
```
|
||||
|
||||
### Template 4: Performance Monitoring
|
||||
|
||||
```python
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
import numpy as np
|
||||
|
||||
@dataclass
class SearchMetrics:
    """Latency/quality summary returned by VectorSearchMonitor.measure_search."""

    latency_p50_ms: float  # median per-query latency, milliseconds
    latency_p95_ms: float  # 95th-percentile latency, milliseconds
    latency_p99_ms: float  # 99th-percentile latency, milliseconds
    recall: float          # mean recall@k (0 when no ground-truth oracle is configured)
    qps: float             # queries per second over the benchmark run
|
||||
|
||||
|
||||
class VectorSearchMonitor:
    """Monitor vector search performance (latency percentiles, recall, QPS)."""

    def __init__(self, ground_truth_fn=None):
        # ground_truth_fn(query, k) -> exact top-k ids; when absent, recall is
        # reported as 0 rather than computed.
        self.latencies = []
        self.recalls = []
        self.ground_truth_fn = ground_truth_fn

    def measure_search(
        self,
        search_fn,
        query_vectors: np.ndarray,
        k: int = 10,
        num_iterations: int = 100
    ) -> SearchMetrics:
        """Benchmark search_fn over query_vectors.

        Returns latency percentiles (ms), recall (when a ground-truth oracle
        is configured) and QPS. QPS is derived from summed per-query latency,
        so it reflects serial throughput, not wall-clock concurrency.
        """
        samples = []

        for _ in range(num_iterations):
            for query in query_vectors:
                start = time.perf_counter()
                search_fn(query, k=k)  # result discarded; only the call is timed
                samples.append((time.perf_counter() - start) * 1000)

        samples = np.array(samples)
        total_queries = num_iterations * len(query_vectors)
        total_time = samples.sum() / 1000  # seconds

        return SearchMetrics(
            latency_p50_ms=np.percentile(samples, 50),
            latency_p95_ms=np.percentile(samples, 95),
            latency_p99_ms=np.percentile(samples, 99),
            recall=self._calculate_recall(search_fn, query_vectors, k) if self.ground_truth_fn else 0,
            qps=total_queries / total_time
        )

    def _calculate_recall(self, search_fn, queries: np.ndarray, k: int) -> float:
        """Mean recall@k against the ground-truth oracle.

        NOTE(review): assumes search_fn returns an iterable of hashable ids —
        confirm this matches the callable passed to measure_search.
        """
        if not self.ground_truth_fn:
            return 0

        matched = 0
        requested = 0

        for query in queries:
            found = set(search_fn(query, k=k))
            expected = set(self.ground_truth_fn(query, k=k))
            matched += len(found & expected)
            requested += k

        return matched / requested


def profile_index_build(
    build_fn,
    vectors: np.ndarray,
    batch_sizes: Optional[List[int]] = None
) -> dict:
    """Profile index build throughput across several batch sizes.

    Parameters
    ----------
    build_fn : callable
        ``build_fn(batch)`` inserts one batch of vectors into the index.
    vectors : np.ndarray
        Vectors to index, shape ``(n, dim)``.
    batch_sizes : list[int], optional
        Batch sizes to profile.  Defaults to ``[1000, 10000, 50000]``;
        a ``None`` sentinel replaces the original mutable default argument.

    Returns
    -------
    dict
        ``{batch_size: {"avg_batch_time_s": ..., "vectors_per_second": ...}}``
    """
    if batch_sizes is None:
        batch_sizes = [1000, 10000, 50000]

    results = {}

    for batch_size in batch_sizes:
        times = []
        for start in range(0, len(vectors), batch_size):
            batch = vectors[start:start + batch_size]
            t0 = time.perf_counter()
            build_fn(batch)
            times.append(time.perf_counter() - t0)

        # Computed once and reused (original called np.mean twice).
        avg_time = np.mean(times)
        results[batch_size] = {
            "avg_batch_time_s": avg_time,
            # NOTE(review): uses the nominal batch_size, so a short final
            # batch slightly overstates throughput — matches original intent.
            "vectors_per_second": batch_size / avg_time,
        }

    return results
```

## Best Practices

### Do's

- **Benchmark with real queries** - Synthetic workloads may not represent production traffic
- **Monitor recall continuously** - Recall can degrade with data drift
- **Start with defaults** - Tune only when measurements show the need
- **Use quantization** - Delivers significant memory savings
- **Consider tiered storage** - Separate hot and cold data

### Don'ts

- **Don't over-optimize early** - Profile first
- **Don't ignore build time** - Index updates have a cost
- **Don't forget reindexing** - Plan for maintenance
- **Don't skip warming** - Cold indexes are slow

## Resources

- [HNSW Paper](https://arxiv.org/abs/1603.09320)
- [Faiss Wiki](https://github.com/facebookresearch/faiss/wiki)
- [ANN Benchmarks](https://ann-benchmarks.com/)