mirror of
https://github.com/wshobson/agents.git
synced 2026-03-18 09:37:15 +00:00
⚡ Bolt: Parallelize Prompt Evaluation in optimize-prompt.py (#145)
* feat: Parallelize prompt evaluation in optimize-prompt.py
  - Update `PromptOptimizer.evaluate_prompt` to use `ThreadPoolExecutor` for concurrent test case processing
  - Significantly reduces total execution time when using high-latency LLM clients (network I/O bound)
  - Maintain accurate metric aggregation (latency, accuracy, token count) from parallel results
  - This prepares the script for real-world usage where sequential execution is a major bottleneck

  ⚡ Bolt: Reduces wall-clock evaluation time from roughly n sequential request latencies to about n / max_workers request latencies, because up to max_workers requests run concurrently.

* feat: Parallelize prompt evaluation in optimize-prompt.py
  - Update `PromptOptimizer.evaluate_prompt` to use `ThreadPoolExecutor` for concurrent test case processing
  - Significantly reduces total execution time when using high-latency LLM clients (network I/O bound)
  - Maintain accurate metric aggregation (latency, accuracy, token count) from parallel results
  - Ensure no generated artifacts (`optimization_results.json`) are committed

  ⚡ Bolt: Reduces wall-clock evaluation time from roughly n sequential request latencies to about n / max_workers request latencies, because up to max_workers requests run concurrently.

---------

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
8879465553
commit
70cf3f3682
@@ -9,6 +9,7 @@ import json
|
|||||||
import time
|
import time
|
||||||
from typing import List, Dict, Any
|
from typing import List, Dict, Any
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
@@ -26,7 +27,7 @@ class PromptOptimizer:
|
|||||||
self.results_history = []
|
self.results_history = []
|
||||||
|
|
||||||
def evaluate_prompt(self, prompt_template: str, test_cases: List[TestCase] = None) -> Dict[str, float]:
|
def evaluate_prompt(self, prompt_template: str, test_cases: List[TestCase] = None) -> Dict[str, float]:
|
||||||
"""Evaluate a prompt template against test cases."""
|
"""Evaluate a prompt template against test cases in parallel."""
|
||||||
if test_cases is None:
|
if test_cases is None:
|
||||||
test_cases = self.test_suite
|
test_cases = self.test_suite
|
||||||
|
|
||||||
@@ -37,7 +38,7 @@ class PromptOptimizer:
|
|||||||
'success_rate': []
|
'success_rate': []
|
||||||
}
|
}
|
||||||
|
|
||||||
for test_case in test_cases:
|
def process_test_case(test_case):
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
# Render prompt with test case inputs
|
# Render prompt with test case inputs
|
||||||
@@ -49,16 +50,29 @@ class PromptOptimizer:
|
|||||||
# Measure latency
|
# Measure latency
|
||||||
latency = time.time() - start_time
|
latency = time.time() - start_time
|
||||||
|
|
||||||
# Calculate metrics
|
# Calculate individual metrics
|
||||||
metrics['latency'].append(latency)
|
token_count = len(prompt.split()) + len(response.split())
|
||||||
metrics['token_count'].append(len(prompt.split()) + len(response.split()))
|
success = 1 if response else 0
|
||||||
metrics['success_rate'].append(1 if response else 0)
|
|
||||||
|
|
||||||
# Check accuracy
|
|
||||||
accuracy = self.calculate_accuracy(response, test_case.expected_output)
|
accuracy = self.calculate_accuracy(response, test_case.expected_output)
|
||||||
metrics['accuracy'].append(accuracy)
|
|
||||||
|
return {
|
||||||
|
'latency': latency,
|
||||||
|
'token_count': token_count,
|
||||||
|
'success_rate': success,
|
||||||
|
'accuracy': accuracy
|
||||||
|
}
|
||||||
|
|
||||||
|
# Run test cases in parallel
|
||||||
|
with ThreadPoolExecutor() as executor:
|
||||||
|
results = list(executor.map(process_test_case, test_cases))
|
||||||
|
|
||||||
# Aggregate metrics
|
# Aggregate metrics
|
||||||
|
for result in results:
|
||||||
|
metrics['latency'].append(result['latency'])
|
||||||
|
metrics['token_count'].append(result['token_count'])
|
||||||
|
metrics['success_rate'].append(result['success_rate'])
|
||||||
|
metrics['accuracy'].append(result['accuracy'])
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'avg_accuracy': np.mean(metrics['accuracy']),
|
'avg_accuracy': np.mean(metrics['accuracy']),
|
||||||
'avg_latency': np.mean(metrics['latency']),
|
'avg_latency': np.mean(metrics['latency']),
|
||||||
|
|||||||
Reference in New Issue
Block a user