feat(llm-application-dev): modernize to LangGraph and latest models v2.0.0

- Migrate from LangChain 0.x to LangChain 1.x/LangGraph patterns
- Update model references to Claude 4.5 and GPT-5.2
- Add Voyage AI as primary embedding recommendation
- Add structured outputs with Pydantic
- Replace deprecated initialize_agent() with StateGraph
- Fix security: use AST-based safe math instead of unsafe execution
- Add plugin.json and README.md for consistency
- Bump marketplace version to 1.3.3
This commit is contained in:
Seth Hobson
2026-01-19 15:43:25 -05:00
parent e827cc713a
commit 8be0e8ac7a
12 changed files with 1940 additions and 708 deletions

View File

@@ -64,34 +64,71 @@ Use stronger LLMs to evaluate weaker model outputs.
## Quick Start
```python
from llm_eval import EvaluationSuite, Metric
from dataclasses import dataclass
from typing import Callable
import numpy as np
# Define evaluation suite
@dataclass
class Metric:
    """A named scoring function applied to model predictions.

    The wrapped callable is invoked with keyword arguments
    (prediction, reference, context) and returns a score.
    """

    name: str      # label used as the key in aggregated results
    fn: Callable   # scoring callable

    @staticmethod
    def accuracy():
        """Built-in accuracy metric (delegates to calculate_accuracy)."""
        return Metric(name="accuracy", fn=calculate_accuracy)

    @staticmethod
    def bleu():
        """Built-in BLEU metric (delegates to calculate_bleu)."""
        return Metric(name="bleu", fn=calculate_bleu)

    @staticmethod
    def bertscore():
        """Built-in BERTScore metric (delegates to calculate_bertscore)."""
        return Metric(name="bertscore", fn=calculate_bertscore)

    @staticmethod
    def custom(name: str, fn: Callable):
        """Wrap an arbitrary scoring callable as a Metric."""
        return Metric(name=name, fn=fn)
class EvaluationSuite:
    """Runs a collection of metrics over a set of test cases.

    Each test case is a dict with an "input" key and optional
    "expected" / "context" keys; the model must expose an async
    predict(input) method.
    """

    def __init__(self, metrics: list[Metric]):
        self.metrics = metrics

    async def evaluate(self, model, test_cases: list[dict]) -> dict:
        """Score every test case with every metric.

        Returns a dict with per-metric mean scores under "metrics"
        and the per-case score lists under "raw_scores".
        """
        raw: dict = {metric.name: [] for metric in self.metrics}
        for case in test_cases:
            prediction = await model.predict(case["input"])
            for metric in self.metrics:
                raw[metric.name].append(
                    metric.fn(
                        prediction=prediction,
                        reference=case.get("expected"),
                        context=case.get("context"),
                    )
                )
        return {
            "metrics": {name: np.mean(scores) for name, scores in raw.items()},
            "raw_scores": raw,
        }
# Usage
suite = EvaluationSuite([
Metric.accuracy(),
Metric.bleu(),
Metric.bertscore(),
Metric.custom(name="groundedness", fn=check_groundedness)
Metric.custom("groundedness", check_groundedness)
])
# Prepare test cases
test_cases = [
{
"input": "What is the capital of France?",
"expected": "Paris",
"context": "France is a country in Europe. Paris is its capital."
},
# ... more test cases
]
# Run evaluation
results = suite.evaluate(
model=your_model,
test_cases=test_cases
)
print(f"Overall Accuracy: {results.metrics['accuracy']}")
print(f"BLEU Score: {results.metrics['bleu']}")
results = await suite.evaluate(model=your_model, test_cases=test_cases)
```
## Automated Metrics Implementation
@@ -100,7 +137,7 @@ print(f"BLEU Score: {results.metrics['bleu']}")
```python
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
def calculate_bleu(reference, hypothesis):
def calculate_bleu(reference: str, hypothesis: str, **kwargs) -> float:
"""Calculate BLEU score between reference and hypothesis."""
smoothie = SmoothingFunction().method4
@@ -109,21 +146,18 @@ def calculate_bleu(reference, hypothesis):
hypothesis.split(),
smoothing_function=smoothie
)
# Usage
bleu = calculate_bleu(
reference="The cat sat on the mat",
hypothesis="A cat is sitting on the mat"
)
```
### ROUGE Score
```python
from rouge_score import rouge_scorer
def calculate_rouge(reference, hypothesis):
def calculate_rouge(reference: str, hypothesis: str, **kwargs) -> dict:
"""Calculate ROUGE scores."""
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scorer = rouge_scorer.RougeScorer(
['rouge1', 'rouge2', 'rougeL'],
use_stemmer=True
)
scores = scorer.score(reference, hypothesis)
return {
@@ -137,8 +171,12 @@ def calculate_rouge(reference, hypothesis):
```python
from bert_score import score
def calculate_bertscore(references, hypotheses):
"""Calculate BERTScore using pre-trained BERT."""
def calculate_bertscore(
references: list[str],
hypotheses: list[str],
**kwargs
) -> dict:
"""Calculate BERTScore using pre-trained model."""
P, R, F1 = score(
hypotheses,
references,
@@ -155,44 +193,72 @@ def calculate_bertscore(references, hypotheses):
### Custom Metrics
```python
def calculate_groundedness(response, context):
def calculate_groundedness(response: str, context: str, **kwargs) -> float:
"""Check if response is grounded in provided context."""
# Use NLI model to check entailment
from transformers import pipeline
nli = pipeline("text-classification", model="microsoft/deberta-large-mnli")
nli = pipeline(
"text-classification",
model="microsoft/deberta-large-mnli"
)
result = nli(f"{context} [SEP] {response}")[0]
# Return confidence that response is entailed by context
return result['score'] if result['label'] == 'ENTAILMENT' else 0.0
def calculate_toxicity(text):
def calculate_toxicity(text: str, **kwargs) -> float:
"""Measure toxicity in generated text."""
from detoxify import Detoxify
results = Detoxify('original').predict(text)
return max(results.values()) # Return highest toxicity score
def calculate_factuality(claim, knowledge_base):
"""Verify factual claims against knowledge base."""
# Implementation depends on your knowledge base
# Could use retrieval + NLI, or fact-checking API
pass
def calculate_factuality(claim: str, sources: list[str], **kwargs) -> float:
    """Score how strongly any source entails the claim.

    Runs an NLI pipeline over each (source, claim) pair and returns the
    best entailment confidence, or 0.0 when no source entails the claim.
    """
    from transformers import pipeline

    nli = pipeline("text-classification", model="facebook/bart-large-mnli")
    # Keep the confidence of pairs whose top NLI label is entailment.
    entailment_scores = [
        result["score"]
        for source in sources
        if (result := nli(f"{source}</s></s>{claim}")[0])["label"] == "entailment"
    ]
    return max(entailment_scores, default=0.0)
```
## LLM-as-Judge Patterns
### Single Output Evaluation
```python
def llm_judge_quality(response, question):
"""Use GPT-5 to judge response quality."""
prompt = f"""Rate the following response on a scale of 1-10 for:
1. Accuracy (factually correct)
2. Helpfulness (answers the question)
3. Clarity (well-written and understandable)
from anthropic import Anthropic
from pydantic import BaseModel, Field
import json
class QualityRating(BaseModel):
    """Structured judge verdict for a single model response.

    All three scores are constrained to the 1-10 range by pydantic.
    """

    accuracy: int = Field(ge=1, le=10, description="Factual correctness")
    helpfulness: int = Field(ge=1, le=10, description="Answers the question")
    clarity: int = Field(ge=1, le=10, description="Well-written and understandable")
    reasoning: str = Field(description="Brief explanation")
async def llm_judge_quality(
response: str,
question: str,
context: str = None
) -> QualityRating:
"""Use Claude to judge response quality."""
client = Anthropic()
system = """You are an expert evaluator of AI responses.
Rate responses on accuracy, helpfulness, and clarity (1-10 scale).
Provide brief reasoning for your ratings."""
prompt = f"""Rate the following response:
Question: {question}
{f'Context: {context}' if context else ''}
Response: {response}
Provide ratings in JSON format:
@@ -201,23 +267,37 @@ Provide ratings in JSON format:
"helpfulness": <1-10>,
"clarity": <1-10>,
"reasoning": "<brief explanation>"
}}
"""
}}"""
result = openai.ChatCompletion.create(
model="gpt-5",
messages=[{"role": "user", "content": prompt}],
temperature=0
message = client.messages.create(
model="claude-sonnet-4-5",
max_tokens=500,
system=system,
messages=[{"role": "user", "content": prompt}]
)
return json.loads(result.choices[0].message.content)
return QualityRating(**json.loads(message.content[0].text))
```
### Pairwise Comparison
```python
def compare_responses(question, response_a, response_b):
from pydantic import BaseModel, Field
from typing import Literal
class ComparisonResult(BaseModel):
    """Judge verdict for a pairwise A/B response comparison."""

    winner: Literal["A", "B", "tie"]
    reasoning: str
    confidence: int = Field(ge=1, le=10)  # judge's self-reported confidence
async def compare_responses(
question: str,
response_a: str,
response_b: str
) -> ComparisonResult:
"""Compare two responses using LLM judge."""
prompt = f"""Compare these two responses to the question and determine which is better.
client = Anthropic()
prompt = f"""Compare these two responses and determine which is better.
Question: {question}
@@ -225,38 +305,84 @@ Response A: {response_a}
Response B: {response_b}
Which response is better and why? Consider accuracy, helpfulness, and clarity.
Consider accuracy, helpfulness, and clarity.
Answer with JSON:
{{
"winner": "A" or "B" or "tie",
"reasoning": "<explanation>",
"confidence": <1-10>
}}
"""
}}"""
result = openai.ChatCompletion.create(
model="gpt-5",
messages=[{"role": "user", "content": prompt}],
temperature=0
message = client.messages.create(
model="claude-sonnet-4-5",
max_tokens=500,
messages=[{"role": "user", "content": prompt}]
)
return json.loads(result.choices[0].message.content)
return ComparisonResult(**json.loads(message.content[0].text))
```
### Reference-Based Evaluation
```python
class ReferenceEvaluation(BaseModel):
    """Scores comparing a response against a gold-standard reference.

    All three scores are constrained to the 0-1 range by pydantic.
    """

    semantic_similarity: float = Field(ge=0, le=1)  # meaning overlap with reference
    factual_accuracy: float = Field(ge=0, le=1)     # correctness of stated facts
    completeness: float = Field(ge=0, le=1)         # coverage of key points
    issues: list[str]                               # specific errors listed by the judge
async def evaluate_against_reference(
    response: str,
    reference: str,
    question: str
) -> ReferenceEvaluation:
    """Evaluate response against gold standard reference.

    Asks Claude to score semantic similarity, factual accuracy, and
    completeness (each 0-1) and to list concrete issues, then parses
    the JSON reply into a ReferenceEvaluation.

    Raises a JSON/validation error if the model reply is not the
    expected JSON shape (no retry or repair is attempted here).
    """
    client = Anthropic()
    # Doubled braces {{ }} render literal JSON braces inside the f-string.
    prompt = f"""Compare the response to the reference answer.
Question: {question}
Reference Answer: {reference}
Response to Evaluate: {response}
Evaluate:
1. Semantic similarity (0-1): How similar is the meaning?
2. Factual accuracy (0-1): Are all facts correct?
3. Completeness (0-1): Does it cover all key points?
4. List any specific issues or errors.
Respond in JSON:
{{
"semantic_similarity": <0-1>,
"factual_accuracy": <0-1>,
"completeness": <0-1>,
"issues": ["issue1", "issue2"]
}}"""
    message = client.messages.create(
        model="claude-sonnet-4-5",
        max_tokens=500,
        messages=[{"role": "user", "content": prompt}]
    )
    # message.content is a list of content blocks; the first holds the text.
    return ReferenceEvaluation(**json.loads(message.content[0].text))
```
## Human Evaluation Frameworks
### Annotation Guidelines
```python
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class AnnotationTask:
"""Structure for human annotation task."""
response: str
question: str
context: Optional[str] = None
def __init__(self, response, question, context=None):
self.response = response
self.question = question
self.context = context
def get_annotation_form(self):
def get_annotation_form(self) -> dict:
return {
"question": self.question,
"context": self.context,
@@ -289,22 +415,29 @@ class AnnotationTask:
```python
from sklearn.metrics import cohen_kappa_score
def calculate_agreement(rater1_scores, rater2_scores):
def calculate_agreement(
rater1_scores: list[int],
rater2_scores: list[int]
) -> dict:
"""Calculate inter-rater agreement."""
kappa = cohen_kappa_score(rater1_scores, rater2_scores)
interpretation = {
kappa < 0: "Poor",
kappa < 0.2: "Slight",
kappa < 0.4: "Fair",
kappa < 0.6: "Moderate",
kappa < 0.8: "Substantial",
kappa <= 1.0: "Almost Perfect"
}
if kappa < 0:
interpretation = "Poor"
elif kappa < 0.2:
interpretation = "Slight"
elif kappa < 0.4:
interpretation = "Fair"
elif kappa < 0.6:
interpretation = "Moderate"
elif kappa < 0.8:
interpretation = "Substantial"
else:
interpretation = "Almost Perfect"
return {
"kappa": kappa,
"interpretation": interpretation[True]
"interpretation": interpretation
}
```
@@ -314,23 +447,26 @@ def calculate_agreement(rater1_scores, rater2_scores):
```python
from scipy import stats
import numpy as np
from dataclasses import dataclass, field
@dataclass
class ABTest:
def __init__(self, variant_a_name="A", variant_b_name="B"):
self.variant_a = {"name": variant_a_name, "scores": []}
self.variant_b = {"name": variant_b_name, "scores": []}
variant_a_name: str = "A"
variant_b_name: str = "B"
variant_a_scores: list[float] = field(default_factory=list)
variant_b_scores: list[float] = field(default_factory=list)
def add_result(self, variant, score):
def add_result(self, variant: str, score: float):
"""Add evaluation result for a variant."""
if variant == "A":
self.variant_a["scores"].append(score)
self.variant_a_scores.append(score)
else:
self.variant_b["scores"].append(score)
self.variant_b_scores.append(score)
def analyze(self, alpha=0.05):
def analyze(self, alpha: float = 0.05) -> dict:
"""Perform statistical analysis."""
a_scores = self.variant_a["scores"]
b_scores = self.variant_b["scores"]
a_scores = np.array(self.variant_a_scores)
b_scores = np.array(self.variant_b_scores)
# T-test
t_stat, p_value = stats.ttest_ind(a_scores, b_scores)
@@ -347,12 +483,12 @@ class ABTest:
"p_value": p_value,
"statistically_significant": p_value < alpha,
"cohens_d": cohens_d,
"effect_size": self.interpret_cohens_d(cohens_d),
"winner": "B" if np.mean(b_scores) > np.mean(a_scores) else "A"
"effect_size": self._interpret_cohens_d(cohens_d),
"winner": self.variant_b_name if np.mean(b_scores) > np.mean(a_scores) else self.variant_a_name
}
@staticmethod
def interpret_cohens_d(d):
def _interpret_cohens_d(d: float) -> str:
"""Interpret Cohen's d effect size."""
abs_d = abs(d)
if abs_d < 0.2:
@@ -369,12 +505,22 @@ class ABTest:
### Regression Detection
```python
from dataclasses import dataclass
@dataclass
class RegressionResult:
    """One metric whose score dropped beyond the regression threshold."""

    metric: str        # metric name
    baseline: float    # score from the baseline run
    current: float     # score from the new run
    change: float      # relative change: (current - baseline) / baseline
    is_regression: bool
class RegressionDetector:
def __init__(self, baseline_results, threshold=0.05):
def __init__(self, baseline_results: dict, threshold: float = 0.05):
self.baseline = baseline_results
self.threshold = threshold
def check_for_regression(self, new_results):
def check_for_regression(self, new_results: dict) -> dict:
"""Detect if new results show regression."""
regressions = []
@@ -389,39 +535,97 @@ class RegressionDetector:
relative_change = (new_score - baseline_score) / baseline_score
# Flag if significant decrease
if relative_change < -self.threshold:
regressions.append({
"metric": metric,
"baseline": baseline_score,
"current": new_score,
"change": relative_change
})
is_regression = relative_change < -self.threshold
if is_regression:
regressions.append(RegressionResult(
metric=metric,
baseline=baseline_score,
current=new_score,
change=relative_change,
is_regression=True
))
return {
"has_regression": len(regressions) > 0,
"regressions": regressions
"regressions": regressions,
"summary": f"{len(regressions)} metric(s) regressed"
}
```
## LangSmith Evaluation Integration
```python
from langsmith import Client
from langsmith.evaluation import evaluate, LangChainStringEvaluator
# Initialize LangSmith client
client = Client()
# Create dataset
dataset = client.create_dataset("qa_test_cases")
client.create_examples(
inputs=[{"question": q} for q in questions],
outputs=[{"answer": a} for a in expected_answers],
dataset_id=dataset.id
)
# Define evaluators
evaluators = [
LangChainStringEvaluator("qa"), # QA correctness
LangChainStringEvaluator("context_qa"), # Context-grounded QA
LangChainStringEvaluator("cot_qa"), # Chain-of-thought QA
]
# Run evaluation
async def target_function(inputs: dict) -> dict:
result = await your_chain.ainvoke(inputs)
return {"answer": result}
experiment_results = await evaluate(
target_function,
data=dataset.name,
evaluators=evaluators,
experiment_prefix="v1.0.0",
metadata={"model": "claude-sonnet-4-5", "version": "1.0.0"}
)
print(f"Mean score: {experiment_results.aggregate_metrics['qa']['mean']}")
```
## Benchmarking
### Running Benchmarks
```python
from dataclasses import dataclass
import numpy as np
@dataclass
class BenchmarkResult:
    """Aggregate statistics for one metric across a benchmark run."""

    metric: str   # metric name
    mean: float   # mean score over all examples
    std: float    # standard deviation of scores
    min: float    # lowest per-example score
    max: float    # highest per-example score
class BenchmarkRunner:
def __init__(self, benchmark_dataset):
def __init__(self, benchmark_dataset: list[dict]):
self.dataset = benchmark_dataset
def run_benchmark(self, model, metrics):
async def run_benchmark(
self,
model,
metrics: list[Metric]
) -> dict[str, BenchmarkResult]:
"""Run model on benchmark and calculate metrics."""
results = {metric.name: [] for metric in metrics}
for example in self.dataset:
# Generate prediction
prediction = model.predict(example["input"])
prediction = await model.predict(example["input"])
# Calculate each metric
for metric in metrics:
score = metric.calculate(
score = metric.fn(
prediction=prediction,
reference=example["reference"],
context=example.get("context")
@@ -430,26 +634,24 @@ class BenchmarkRunner:
# Aggregate results
return {
metric: {
"mean": np.mean(scores),
"std": np.std(scores),
"min": min(scores),
"max": max(scores)
}
metric: BenchmarkResult(
metric=metric,
mean=np.mean(scores),
std=np.std(scores),
min=min(scores),
max=max(scores)
)
for metric, scores in results.items()
}
```
## Resources
- **references/metrics.md**: Comprehensive metric guide
- **references/human-evaluation.md**: Annotation best practices
- **references/benchmarking.md**: Standard benchmarks
- **references/a-b-testing.md**: Statistical testing guide
- **references/regression-testing.md**: CI/CD integration
- **assets/evaluation-framework.py**: Complete evaluation harness
- **assets/benchmark-dataset.jsonl**: Example datasets
- **scripts/evaluate-model.py**: Automated evaluation runner
- [LangSmith Evaluation Guide](https://docs.smith.langchain.com/evaluation)
- [RAGAS Framework](https://docs.ragas.io/)
- [DeepEval Library](https://docs.deepeval.com/)
- [Arize Phoenix](https://docs.arize.com/phoenix/)
- [HELM Benchmark](https://crfm.stanford.edu/helm/)
## Best Practices
@@ -469,3 +671,5 @@ class BenchmarkRunner:
- **Data Contamination**: Testing on training data
- **Ignoring Variance**: Not accounting for statistical uncertainty
- **Metric Mismatch**: Using metrics not aligned with business goals
- **Position Bias**: Judges favoring the first or second answer in pairwise evals — randomize A/B order
- **Overfitting Prompts**: Optimizing for test set instead of real use