style: format all files with prettier
@@ -20,9 +20,11 @@ Master comprehensive evaluation strategies for LLM applications, from automated

## Core Evaluation Types

### 1. Automated Metrics

Fast, repeatable, scalable evaluation using computed scores.

**Text Generation:**

- **BLEU**: N-gram overlap (translation)
- **ROUGE**: Recall-oriented (summarization)
- **METEOR**: Semantic similarity

@@ -30,21 +32,25 @@ Fast, repeatable, scalable evaluation using computed scores.

- **Perplexity**: Language model confidence

**Classification:**

- **Accuracy**: Percentage correct
- **Precision/Recall/F1**: Class-specific performance
- **Confusion Matrix**: Error patterns
- **AUC-ROC**: Ranking quality (see the sketch after this list)
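
The classification metrics above are not implemented later in this document; a minimal sketch using scikit-learn (the function name and the binary-classification setup are illustrative, not part of the original) might look like:

```python
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_recall_fscore_support,
    roc_auc_score,
)


def classification_metrics(y_true: list[int], y_pred: list[int], y_score: list[float]) -> dict:
    """Summarize binary classification quality; y_score holds positive-class probabilities."""
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
        "auc_roc": roc_auc_score(y_true, y_score),
    }
```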

**Retrieval (RAG):**

- **MRR**: Mean Reciprocal Rank
- **NDCG**: Normalized Discounted Cumulative Gain
- **Precision@K**: Relevant in top K
- **Recall@K**: Coverage in top K (see the sketch after this list)
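
Likewise, a plain-Python sketch of the retrieval metrics (helper names are illustrative; rankings are binary relevance lists ordered best-first):

```python
def mean_reciprocal_rank(rankings: list[list[int]]) -> float:
    """rankings[i][j] is 1 if the j-th result for query i is relevant, else 0."""
    total = 0.0
    for ranking in rankings:
        first_hit = next((idx + 1 for idx, rel in enumerate(ranking) if rel), None)
        total += 1.0 / first_hit if first_hit else 0.0
    return total / len(rankings)


def precision_at_k(ranking: list[int], k: int) -> float:
    """Fraction of the top-k results that are relevant."""
    return sum(ranking[:k]) / k


def recall_at_k(ranking: list[int], k: int, total_relevant: int) -> float:
    """Fraction of all relevant documents recovered in the top k."""
    return sum(ranking[:k]) / total_relevant if total_relevant else 0.0
```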

### 2. Human Evaluation

Manual assessment for quality aspects difficult to automate.

**Dimensions:**

- **Accuracy**: Factual correctness
- **Coherence**: Logical flow
- **Relevance**: Answers the question

@@ -53,9 +59,11 @@ Manual assessment for quality aspects difficult to automate.

- **Helpfulness**: Useful to the user

### 3. LLM-as-Judge

Use stronger LLMs to evaluate weaker model outputs.

**Approaches:**

- **Pointwise**: Score individual responses
- **Pairwise**: Compare two responses
- **Reference-based**: Compare to gold standard

@@ -134,6 +142,7 @@ results = await suite.evaluate(model=your_model, test_cases=test_cases)

## Automated Metrics Implementation

### BLEU Score

```python
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

@@ -149,6 +158,7 @@ def calculate_bleu(reference: str, hypothesis: str, **kwargs) -> float:
```
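
The diff elides the body of `calculate_bleu`; a minimal sketch in the same spirit, assuming whitespace tokenization and NLTK smoothing to avoid zero scores on short hypotheses, could be:

```python
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


def calculate_bleu(reference: str, hypothesis: str) -> float:
    """Sentence-level BLEU with method1 smoothing; tokenization is naive whitespace splitting."""
    smoothing = SmoothingFunction().method1
    return sentence_bleu(
        [reference.split()],
        hypothesis.split(),
        smoothing_function=smoothing,
    )
```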

### ROUGE Score

```python
from rouge_score import rouge_scorer

@@ -168,6 +178,7 @@ def calculate_rouge(reference: str, hypothesis: str, **kwargs) -> dict:
```
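
The `calculate_rouge` body is likewise elided; a sketch using the `rouge_score` package (returning only F-measures is an assumption):

```python
from rouge_score import rouge_scorer


def calculate_rouge(reference: str, hypothesis: str) -> dict:
    """F-measures for ROUGE-1, ROUGE-2, and ROUGE-L."""
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = scorer.score(reference, hypothesis)
    return {name: result.fmeasure for name, result in scores.items()}
```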

### BERTScore

```python
from bert_score import score

@@ -192,6 +203,7 @@ def calculate_bertscore(
```
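
A possible shape for the elided `calculate_bertscore` (batch inputs and mean aggregation are assumptions):

```python
from bert_score import score


def calculate_bertscore(references: list[str], candidates: list[str], lang: str = "en") -> dict:
    """Mean BERTScore precision, recall, and F1 over a batch of candidate/reference pairs."""
    precision, recall, f1 = score(candidates, references, lang=lang, verbose=False)
    return {
        "precision": precision.mean().item(),
        "recall": recall.mean().item(),
        "f1": f1.mean().item(),
    }
```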

### Custom Metrics

```python
def calculate_groundedness(response: str, context: str, **kwargs) -> float:
    """Check if response is grounded in provided context."""

@@ -232,6 +244,7 @@ def calculate_factuality(claim: str, sources: list[str], **kwargs) -> float:
```
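
The groundedness and factuality helpers are cut off by the hunk; a deliberately crude lexical-overlap sketch of `calculate_groundedness` (a production version would more likely use NLI or an LLM judge) could be:

```python
def calculate_groundedness(response: str, context: str) -> float:
    """Fraction of response tokens that also appear in the context (a lexical proxy only)."""
    response_tokens = set(response.lower().split())
    context_tokens = set(context.lower().split())
    if not response_tokens:
        return 0.0
    return len(response_tokens & context_tokens) / len(response_tokens)
```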

## LLM-as-Judge Patterns

### Single Output Evaluation

```python
from anthropic import Anthropic
from pydantic import BaseModel, Field

@@ -280,6 +293,7 @@ Provide ratings in JSON format:
```
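
Only the imports and the tail of the judge prompt survive the diff; a self-contained sketch of a pointwise judge (the prompt wording, the model ID, pydantic v2, and the assumption that the judge replies with bare JSON are all illustrative choices):

```python
from anthropic import Anthropic
from pydantic import BaseModel, Field


class JudgeRating(BaseModel):
    accuracy: int = Field(ge=1, le=5)
    coherence: int = Field(ge=1, le=5)
    relevance: int = Field(ge=1, le=5)
    reasoning: str


JUDGE_PROMPT = """Rate the response to the question below on accuracy, coherence, and relevance (1-5).

Question: {question}
Response: {response}

Provide ratings in JSON format:
{{"accuracy": <1-5>, "coherence": <1-5>, "relevance": <1-5>, "reasoning": "<short justification>"}}"""


def judge_response(question: str, response: str) -> JudgeRating:
    client = Anthropic()  # reads ANTHROPIC_API_KEY from the environment
    message = client.messages.create(
        model="claude-sonnet-4-5",  # placeholder; use whichever judge model you run
        max_tokens=500,
        messages=[{"role": "user", "content": JUDGE_PROMPT.format(question=question, response=response)}],
    )
    # Assumes the judge returns bare JSON; add extraction/retry logic for robustness.
    return JudgeRating.model_validate_json(message.content[0].text)
```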

### Pairwise Comparison

```python
from pydantic import BaseModel, Field
from typing import Literal

@@ -324,6 +338,7 @@ Answer with JSON:
```
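
For pairwise comparison, a sketch of the verdict schema and prompt (field names and wording are assumptions); the API call would mirror the pointwise judge above:

```python
from typing import Literal

from pydantic import BaseModel, Field


class PairwiseVerdict(BaseModel):
    winner: Literal["A", "B", "tie"]
    confidence: float = Field(ge=0, le=1)
    reasoning: str


PAIRWISE_PROMPT = """Compare two responses to the same question and decide which is better.

Question: {question}

Response A: {response_a}

Response B: {response_b}

Answer with JSON:
{{"winner": "A" | "B" | "tie", "confidence": <0-1>, "reasoning": "<short justification>"}}"""
```

Randomizing which response is labeled A across trials helps control position bias.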

### Reference-Based Evaluation

```python
class ReferenceEvaluation(BaseModel):
    semantic_similarity: float = Field(ge=0, le=1)

@@ -371,6 +386,7 @@ Respond in JSON:
```
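
The reference-based evaluator presumably pairs this schema with a comparison prompt; a sketch (the fields beyond `semantic_similarity` and the prompt wording are assumptions):

```python
from pydantic import BaseModel, Field


class ReferenceEvaluation(BaseModel):
    semantic_similarity: float = Field(ge=0, le=1)
    factual_consistency: float = Field(ge=0, le=1)
    completeness: float = Field(ge=0, le=1)
    reasoning: str


REFERENCE_PROMPT = """Compare the candidate answer against the gold reference.

Reference: {reference}

Candidate: {candidate}

Respond in JSON:
{{"semantic_similarity": <0-1>, "factual_consistency": <0-1>, "completeness": <0-1>, "reasoning": "..."}}"""
```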

## Human Evaluation Frameworks

### Annotation Guidelines

```python
from dataclasses import dataclass, field
from typing import Optional

@@ -412,6 +428,7 @@ class AnnotationTask:
```
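
The `AnnotationTask` definition is truncated; one plausible sketch matching the imports shown (field names are illustrative):

```python
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class AnnotationTask:
    task_id: str
    prompt: str
    response: str
    guidelines: str
    ratings: dict[str, int] = field(default_factory=dict)  # dimension name -> 1-5 score
    notes: Optional[str] = None
```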

### Inter-Rater Agreement

```python
from sklearn.metrics import cohen_kappa_score

@@ -444,6 +461,7 @@ def calculate_agreement(
```
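
A minimal sketch of the elided `calculate_agreement` helper using Cohen's kappa for two raters (for three or more raters, Fleiss' kappa or Krippendorff's alpha would be the usual choices; the parameter names are assumptions):

```python
from sklearn.metrics import cohen_kappa_score


def calculate_agreement(rater_a: list[int], rater_b: list[int]) -> float:
    """Cohen's kappa between two raters labeling the same items (1.0 = perfect agreement)."""
    return cohen_kappa_score(rater_a, rater_b)
```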

## A/B Testing

### Statistical Testing Framework

```python
from scipy import stats
import numpy as np

@@ -504,6 +522,7 @@ class ABTest:
```
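
The `ABTest` class body is elided; a functional sketch of the core significance test (Welch's t-test on per-example scores; the helper name and return shape are assumptions, not the original class):

```python
import numpy as np
from scipy import stats


def compare_variants(scores_a: list[float], scores_b: list[float], alpha: float = 0.05) -> dict:
    """Welch's t-test comparing per-example scores from variants A and B."""
    t_stat, p_value = stats.ttest_ind(scores_a, scores_b, equal_var=False)
    return {
        "mean_a": float(np.mean(scores_a)),
        "mean_b": float(np.mean(scores_b)),
        "t_statistic": float(t_stat),
        "p_value": float(p_value),
        "significant": bool(p_value < alpha),
    }
```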

## Regression Testing

### Regression Detection

```python
from dataclasses import dataclass

@@ -595,6 +614,7 @@ print(f"Mean score: {experiment_results.aggregate_metrics['qa']['mean']}")
```
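
The regression-detection code is also truncated; a sketch of a per-metric threshold check (the class name, tolerance value, and toy numbers are illustrative):

```python
from dataclasses import dataclass


@dataclass
class RegressionCheck:
    metric: str
    baseline: float
    candidate: float
    tolerance: float = 0.02  # allowed absolute drop before flagging a regression

    @property
    def regressed(self) -> bool:
        return (self.baseline - self.candidate) > self.tolerance


# Toy usage: flag metrics where the candidate model fell below the baseline.
checks = [RegressionCheck("rouge1", baseline=0.41, candidate=0.37)]
failures = [check.metric for check in checks if check.regressed]
```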

## Benchmarking

### Running Benchmarks

```python
from dataclasses import dataclass
import numpy as np
```
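
The benchmark runner is cut off at the imports; a sketch of what such a runner might look like (the names and the `prediction`/`reference` case format are assumptions):

```python
from dataclasses import dataclass

import numpy as np


@dataclass
class BenchmarkResult:
    name: str
    scores: list[float]

    def summary(self) -> dict:
        return {
            "mean": float(np.mean(self.scores)),
            "std": float(np.std(self.scores)),
            "n": len(self.scores),
        }


def run_benchmark(name: str, cases: list[dict], score_fn) -> BenchmarkResult:
    """score_fn(prediction, reference) -> float; each case holds 'prediction' and 'reference'."""
    scores = [score_fn(case["prediction"], case["reference"]) for case in cases]
    return BenchmarkResult(name, scores)
```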