Bolt: Parallelize Prompt Evaluation in optimize-prompt.py (#145)

* feat: Parallelize prompt evaluation in optimize-prompt.py

- Update `PromptOptimizer.evaluate_prompt` to use `ThreadPoolExecutor` for concurrent test-case processing
- Significantly reduces total execution time with high-latency LLM clients, where evaluation is network-I/O bound
- Maintain accurate metric aggregation (latency, accuracy, token count) from the parallel results
- Prepares the script for real-world usage, where sequential execution is a major bottleneck
- Ensure no generated artifacts (`optimization_results.json`) are committed

Bolt: Reduces wall-clock evaluation time from O(n) sequential LLM round trips to roughly O(n / max_workers), since requests are issued concurrently (see the sketch below).
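For reference, a minimal standalone sketch of the pattern this change applies, with a hypothetical `fake_llm_call` standing in for the real client (illustrative only; the actual change is in the diff below):

```python
# Sketch of the technique, not the script itself: fan a hypothetical
# high-latency call out with ThreadPoolExecutor, then aggregate the
# per-case metrics, mirroring the new evaluate_prompt structure.
import time
from concurrent.futures import ThreadPoolExecutor

def fake_llm_call(case: str) -> str:
    time.sleep(0.5)  # stand-in for a network-bound LLM request
    return f"response to {case}"

def process_case(case: str) -> dict:
    start = time.time()
    response = fake_llm_call(case)
    return {
        "latency": time.time() - start,
        "token_count": len(case.split()) + len(response.split()),
    }

cases = [f"test case {i}" for i in range(8)]

with ThreadPoolExecutor(max_workers=8) as executor:
    results = list(executor.map(process_case, cases))

# Wall-clock time is ~0.5 s here instead of ~4 s sequentially.
avg_latency = sum(r["latency"] for r in results) / len(results)
print(f"avg per-case latency: {avg_latency:.2f}s over {len(results)} cases")
```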

---------

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
Author: google-labs-jules[bot]
Committed: 2025-12-19 09:12:15 -05:00 (by GitHub)
Parent: 8879465553
Commit: 70cf3f3682


@@ -9,6 +9,7 @@ import json
 import time
 from typing import List, Dict, Any
 from dataclasses import dataclass
+from concurrent.futures import ThreadPoolExecutor
 import numpy as np
@@ -26,7 +27,7 @@ class PromptOptimizer:
         self.results_history = []

     def evaluate_prompt(self, prompt_template: str, test_cases: List[TestCase] = None) -> Dict[str, float]:
-        """Evaluate a prompt template against test cases."""
+        """Evaluate a prompt template against test cases in parallel."""
         if test_cases is None:
             test_cases = self.test_suite
@@ -37,7 +38,7 @@ class PromptOptimizer:
             'success_rate': []
         }

-        for test_case in test_cases:
+        def process_test_case(test_case):
             start_time = time.time()

             # Render prompt with test case inputs
@@ -49,16 +50,29 @@ class PromptOptimizer:
             # Measure latency
             latency = time.time() - start_time

-            # Calculate metrics
-            metrics['latency'].append(latency)
-            metrics['token_count'].append(len(prompt.split()) + len(response.split()))
-            metrics['success_rate'].append(1 if response else 0)
-
-            # Check accuracy
-            accuracy = self.calculate_accuracy(response, test_case.expected_output)
-            metrics['accuracy'].append(accuracy)
+            # Calculate individual metrics
+            token_count = len(prompt.split()) + len(response.split())
+            success = 1 if response else 0
+            accuracy = self.calculate_accuracy(response, test_case.expected_output)
+
+            return {
+                'latency': latency,
+                'token_count': token_count,
+                'success_rate': success,
+                'accuracy': accuracy
+            }
+
+        # Run test cases in parallel
+        with ThreadPoolExecutor() as executor:
+            results = list(executor.map(process_test_case, test_cases))

         # Aggregate metrics
+        for result in results:
+            metrics['latency'].append(result['latency'])
+            metrics['token_count'].append(result['token_count'])
+            metrics['success_rate'].append(result['success_rate'])
+            metrics['accuracy'].append(result['accuracy'])

         return {
             'avg_accuracy': np.mean(metrics['accuracy']),
             'avg_latency': np.mean(metrics['latency']),