💡 Comprehensive Examples & Use Cases¶
🎯 Example Categories¶
🚀 Getting Started — basic patterns & quick setup | ⚡ Performance — async & high-throughput | 🔧 Customization — extensions & custom logic | 🏢 Enterprise — production deployments |
🚀 Getting Started Examples¶
📝 Basic Model Evaluation¶
**Quick 5-minute evaluation setup** **📁 Complete File**: [`examples/basic_usage.py`](../../examples/basic_usage.py)
"""
Basic model evaluation example
Perfect for testing single models quickly
"""
from llm_evaluation_framework import (
    ModelRegistry,
    ModelInferenceEngine,
    TestDatasetGenerator,
)
def basic_evaluation_example():
    """Run a minimal single-model evaluation and print summary metrics.

    Registers one model, generates a small test suite, evaluates, and
    prints accuracy/cost/latency/success-rate.

    Returns:
        dict: Full evaluation results, including 'aggregate_metrics'.
    """
    # 1. Initialize components
    registry = ModelRegistry()
    generator = TestDatasetGenerator()
    engine = ModelInferenceEngine(registry)

    # 2. Register your model
    registry.register_model("gpt-3.5-turbo", {
        "provider": "openai",
        "api_cost_input": 0.0015,   # Cost per 1K input tokens
        "api_cost_output": 0.002,   # Cost per 1K output tokens
        "capabilities": ["reasoning", "creativity", "coding"],
        "parameters": {
            "temperature": 0.7,
            "max_tokens": 1000,
            "top_p": 0.9
        }
    })

    # 3. Generate test cases
    use_case = {
        "domain": "customer_service",
        "required_capabilities": ["reasoning", "empathy"],
        "difficulty": "medium"
    }
    test_cases = generator.generate_test_cases(use_case, count=20)
    print(f"Generated {len(test_cases)} test cases")

    # 4. Run evaluation
    results = engine.evaluate_model("gpt-3.5-turbo", test_cases, use_case)

    # 5. Display results
    metrics = results['aggregate_metrics']
    print(f"🎯 Accuracy: {metrics['accuracy']:.1%}")
    print(f"💰 Total Cost: ${metrics['total_cost']:.4f}")
    print(f"⏱️ Average Time: {metrics['average_response_time']:.2f}s")
    print(f"✅ Success Rate: {metrics['success_rate']:.1%}")
    return results


if __name__ == "__main__":
    results = basic_evaluation_example()
📊 Model Comparison¶
**Compare multiple models side-by-side** **📁 Complete File**: [`examples/model_comparison.py`](../../examples/model_comparison.py)
"""
Multi-model comparison example
Compare different LLMs on the same test cases
"""
def model_comparison_example():
    """Evaluate several models on one shared test suite.

    Prints a side-by-side comparison table and highlights the best model
    by accuracy and by cost.

    Returns:
        dict: model name -> aggregate metrics for that model.
    """
    registry = ModelRegistry()
    generator = TestDatasetGenerator()
    engine = ModelInferenceEngine(registry)

    # Per-model configuration: provider, token pricing, capabilities.
    models_config = {
        "gpt-3.5-turbo": {
            "provider": "openai",
            "api_cost_input": 0.0015,
            "api_cost_output": 0.002,
            "capabilities": ["reasoning", "creativity"]
        },
        "claude-3-sonnet": {
            "provider": "anthropic",
            "api_cost_input": 0.003,
            "api_cost_output": 0.015,
            "capabilities": ["reasoning", "analysis", "writing"]
        },
        "gpt-4": {
            "provider": "openai",
            "api_cost_input": 0.03,
            "api_cost_output": 0.06,
            "capabilities": ["reasoning", "creativity", "coding"]
        }
    }

    # Register all models
    for model_name, config in models_config.items():
        registry.register_model(model_name, config)

    # One shared test suite so the comparison is apples-to-apples.
    use_case = {
        "domain": "technical_writing",
        "required_capabilities": ["reasoning", "analysis"],
        "difficulty": "hard"
    }
    test_cases = generator.generate_test_cases(use_case, count=30)

    # Evaluate each model on the identical test cases.
    comparison_results = {}
    for model_name in models_config:
        print(f"\n🤖 Evaluating {model_name}...")
        results = engine.evaluate_model(model_name, test_cases, use_case)
        comparison_results[model_name] = results['aggregate_metrics']
        print(f"  Accuracy: {results['aggregate_metrics']['accuracy']:.1%}")
        print(f"  Cost: ${results['aggregate_metrics']['total_cost']:.4f}")

    # Comparison table
    print("\n📊 COMPARISON RESULTS")
    print("=" * 60)
    print(f"{'Model':<20} {'Accuracy':<12} {'Cost':<10} {'Time':<8}")
    print("-" * 60)
    for model, metrics in comparison_results.items():
        print(f"{model:<20} {metrics['accuracy']:<12.1%} "
              f"${metrics['total_cost']:<9.4f} {metrics['average_response_time']:<8.2f}s")

    # Find best model by each criterion
    best_accuracy = max(comparison_results.items(), key=lambda x: x[1]['accuracy'])
    best_cost = min(comparison_results.items(), key=lambda x: x[1]['total_cost'])
    print(f"\n🏆 Best Accuracy: {best_accuracy[0]} ({best_accuracy[1]['accuracy']:.1%})")
    print(f"💰 Most Cost-Effective: {best_cost[0]} (${best_cost[1]['total_cost']:.4f})")
    return comparison_results


if __name__ == "__main__":
    comparison_results = model_comparison_example()
⚡ High-Performance Examples¶
🚀 Async Concurrent Evaluation¶
**Maximum throughput with async processing** **📁 Complete File**: [`examples/advanced_async_usage.py`](../../examples/advanced_async_usage.py)
"""
High-performance async evaluation example
Demonstrates concurrent processing for maximum throughput
"""
import asyncio
import time

from llm_evaluation_framework.engines.async_inference_engine import AsyncInferenceEngine
async def high_performance_evaluation():
    """
    Execute multiple evaluations concurrently.

    Fans out one task per (model, test-suite) pair via asyncio.gather and
    reports aggregate throughput and cost. Ideal for batch processing and
    high-throughput scenarios.

    Returns:
        list: one result dict (or Exception) per (model, suite) task.
    """
    # Setup
    registry = ModelRegistry()
    generator = TestDatasetGenerator()
    async_engine = AsyncInferenceEngine(registry)

    # Register models for concurrent testing
    models = ["gpt-3.5-turbo", "claude-3-sonnet", "gpt-4"]
    for model in models:
        registry.register_model(model, {
            "provider": "openai" if "gpt" in model else "anthropic",
            "capabilities": ["reasoning", "creativity"]
        })

    # Three independent suites exercising different capabilities.
    test_suites = {
        "reasoning": generator.generate_test_cases(
            {"domain": "logic", "required_capabilities": ["reasoning"]},
            count=50
        ),
        "creativity": generator.generate_test_cases(
            {"domain": "creative", "required_capabilities": ["creativity"]},
            count=30
        ),
        "coding": generator.generate_test_cases(
            {"domain": "programming", "required_capabilities": ["coding"]},
            count=40
        )
    }

    # Create one concurrent evaluation task per (model, suite) pair.
    tasks = []
    start_time = time.time()
    for model in models:
        for suite_name, test_cases in test_suites.items():
            task = async_engine.evaluate_model_async(
                model_name=model,
                test_cases=test_cases,
                use_case={"domain": suite_name},
                max_concurrent=15,  # Concurrent requests per model
                timeout=45.0
            )
            tasks.append((model, suite_name, task))

    print(f"🚀 Starting {len(tasks)} concurrent evaluations...")
    # return_exceptions=True keeps one failure from cancelling the rest.
    results = await asyncio.gather(
        *[task for _, _, task in tasks],
        return_exceptions=True
    )
    total_time = time.time() - start_time

    # Tally successes and failures.
    successful_evaluations = 0
    total_tests = 0
    total_cost = 0.0
    for (model, suite, _), result in zip(tasks, results):
        if isinstance(result, Exception):
            print(f"❌ {model} ({suite}): {result}")
        else:
            successful_evaluations += 1
            metrics = result['aggregate_metrics']
            total_tests += metrics['test_count']
            total_cost += metrics['total_cost']
            print(f"✅ {model} ({suite}): "
                  f"{metrics['accuracy']:.1%} accuracy, "
                  f"${metrics['total_cost']:.4f} cost")

    # Performance summary
    throughput = total_tests / total_time if total_time > 0 else 0
    print(f"\n📊 PERFORMANCE SUMMARY")
    print(f"Total Time: {total_time:.2f}s")
    print(f"Total Tests: {total_tests}")
    print(f"Throughput: {throughput:.1f} tests/second")
    print(f"Total Cost: ${total_cost:.4f}")
    print(f"Success Rate: {successful_evaluations}/{len(tasks)} evaluations")
    return results
async def batch_processing_example():
    """
    Process a large dataset in fixed-size batches.

    Returns:
        list: per-batch evaluation result dicts.
    """
    # NOTE(fix): the original referenced module-level `registry` and
    # `generator` that were never defined in this snippet (NameError);
    # create the components locally and register the model under test.
    registry = ModelRegistry()
    generator = TestDatasetGenerator()
    async_engine = AsyncInferenceEngine(registry)
    registry.register_model("gpt-3.5-turbo", {
        "provider": "openai",
        "capabilities": ["reasoning"]
    })

    # Large dataset simulation
    large_dataset = generator.generate_test_cases(
        {"domain": "general"},
        count=1000  # Large number of test cases
    )

    # Process in optimized batches.
    batch_size = 50
    # Ceiling division fixes the off-by-one batch count the original
    # printed when the dataset divided evenly (1000/50 showed ".../21").
    num_batches = (len(large_dataset) + batch_size - 1) // batch_size
    batch_results = []
    for i in range(0, len(large_dataset), batch_size):
        batch = large_dataset[i:i + batch_size]
        print(f"Processing batch {i // batch_size + 1}/{num_batches}")
        batch_result = await async_engine.evaluate_model_async(
            model_name="gpt-3.5-turbo",
            test_cases=batch,
            max_concurrent=20
        )
        batch_results.append(batch_result)

    # Aggregate: simple mean of per-batch accuracies (batches are
    # equal-sized here, so this equals the overall mean).
    total_accuracy = sum(r['aggregate_metrics']['accuracy'] for r in batch_results)
    average_accuracy = total_accuracy / len(batch_results)
    print(f"🎯 Final Average Accuracy: {average_accuracy:.1%}")
    return batch_results
if __name__ == "__main__":
    # Run high-performance evaluation
    results = asyncio.run(high_performance_evaluation())
    # Run batch processing example
    batch_results = asyncio.run(batch_processing_example())
📡 Real-time Streaming Evaluation¶
**Process evaluation results as they complete**
"""
Real-time streaming evaluation with progress monitoring
Perfect for long-running evaluations with live updates
"""
import asyncio
import time  # fix: used below (timestamps) but missing from the original snippet
from typing import Any, AsyncGenerator, Dict
async def streaming_evaluation_example():
    """
    Stream evaluation results in real-time as they complete.

    Runs 100 test cases with bounded concurrency and prints each result
    as soon as it finishes, plus running statistics every 10 completions.
    """
    # NOTE(fix): `registry` and `generator` were undefined in the original
    # snippet (NameError); create the components locally.
    registry = ModelRegistry()
    generator = TestDatasetGenerator()

    async def evaluation_stream() -> AsyncGenerator[Dict[str, Any], None]:
        """Yield one result dict per test case, in completion order."""
        async_engine = AsyncInferenceEngine(registry)
        test_cases = generator.generate_test_cases({"domain": "general"}, count=100)
        semaphore = asyncio.Semaphore(10)  # Limit concurrent requests

        async def evaluate_single_test(test_case: Dict, test_id: int):
            async with semaphore:
                try:
                    result = await async_engine.evaluate_single_test(
                        model_name="gpt-3.5-turbo",
                        test_case=test_case,
                        timeout=30.0
                    )
                    return {
                        'test_id': test_id,
                        'status': 'completed',
                        'result': result,
                        'timestamp': time.time()
                    }
                except Exception as e:
                    return {
                        'test_id': test_id,
                        'status': 'failed',
                        'error': str(e),
                        'timestamp': time.time()
                    }

        # Create tasks for all test cases
        tasks = [
            evaluate_single_test(test_case, i)
            for i, test_case in enumerate(test_cases)
        ]
        # as_completed yields awaitables in finish order, enabling streaming.
        for completed_task in asyncio.as_completed(tasks):
            yield await completed_task

    # Consume the stream
    completed_count = 0
    failed_count = 0
    total_accuracy = 0.0

    print("🚀 Starting streaming evaluation...")
    print("Results will appear in real-time as they complete:\n")

    async for result in evaluation_stream():
        if result['status'] == 'completed':
            completed_count += 1
            accuracy = result['result']['score']
            total_accuracy += accuracy
            print(f"✅ Test {result['test_id']:3d}: {accuracy:.1%} accuracy")

            # Show running statistics every 10 completions
            if completed_count % 10 == 0:
                avg_accuracy = total_accuracy / completed_count
                total_processed = completed_count + failed_count
                print(f"📊 Progress: {total_processed}/100 | "
                      f"Avg Accuracy: {avg_accuracy:.1%} | "
                      f"Success Rate: {completed_count / total_processed:.1%}")
                print("-" * 50)
        else:
            failed_count += 1
            print(f"❌ Test {result['test_id']:3d}: {result['error']}")

    # Final summary (guard division in case nothing ran at all).
    final_accuracy = total_accuracy / completed_count if completed_count > 0 else 0
    total_tests = completed_count + failed_count
    print(f"\n🎯 FINAL RESULTS")
    print(f"Total Tests: {total_tests}")
    print(f"Completed: {completed_count}")
    print(f"Failed: {failed_count}")
    print(f"Final Accuracy: {final_accuracy:.1%}")
    if total_tests:
        print(f"Success Rate: {completed_count / total_tests:.1%}")
if __name__ == "__main__":
    asyncio.run(streaming_evaluation_example())
🔧 Customization Examples¶
🎯 Custom Scoring Strategies¶
**Build domain-specific evaluation metrics** **📁 Complete File**: [`examples/custom_scoring_and_persistence.py`](../../examples/custom_scoring_and_persistence.py)
"""
Custom scoring strategies for specialized evaluation needs
Examples include domain-specific metrics and multi-dimensional scoring
"""
import json
import re
from typing import Any, Dict, List

from llm_evaluation_framework.evaluation.scoring_strategies import ScoringStrategy
class BusinessWritingScorer(ScoringStrategy):
    """
    Specialized scorer for business communication evaluation.

    Evaluates clarity, professionalism, tone, and effectiveness for each
    prediction and combines them into a weighted overall score.
    """

    def __init__(self, weights: Dict[str, float] = None):
        # Component weights; keys must match the four components below.
        self.weights = weights or {
            'clarity': 0.3,
            'professionalism': 0.25,
            'tone': 0.2,
            'effectiveness': 0.25
        }

    def calculate_score(self, predictions: List[str], references: List[str]) -> Dict[str, Any]:
        """Calculate comprehensive business writing scores.

        Returns:
            dict with 'overall_score' (weighted), per-component averages
            in 'component_scores', and per-sample lists in 'detailed_scores'.
        """
        component_scores = {
            'clarity': [],
            'professionalism': [],
            'tone': [],
            'effectiveness': []
        }
        for pred, ref in zip(predictions, references):
            component_scores['clarity'].append(self._evaluate_clarity(pred))
            component_scores['professionalism'].append(self._evaluate_professionalism(pred))
            component_scores['tone'].append(self._evaluate_tone(pred, ref))
            component_scores['effectiveness'].append(self._evaluate_effectiveness(pred, ref))

        # Guard against empty input (previously a ZeroDivisionError).
        avg_scores = {
            component: (sum(scores) / len(scores)) if scores else 0.0
            for component, scores in component_scores.items()
        }
        overall_score = sum(
            avg_scores[component] * self.weights[component]
            for component in avg_scores
        )
        return {
            'overall_score': overall_score,
            'component_scores': avg_scores,
            'detailed_scores': component_scores
        }

    def _evaluate_clarity(self, text: str) -> float:
        """Evaluate text clarity via sentence length and discourse markers."""
        sentences = text.split('.')
        avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
        # Optimal sentence length is 15-20 words.
        if 15 <= avg_sentence_length <= 20:
            length_score = 1.0
        else:
            length_score = max(0, 1 - abs(avg_sentence_length - 17.5) / 17.5)
        clarity_indicators = [
            'therefore', 'however', 'furthermore', 'in conclusion',
            'for example', 'specifically', 'in other words'
        ]
        indicator_count = sum(1 for indicator in clarity_indicators
                              if indicator in text.lower())
        indicator_score = min(1.0, indicator_count / 3)
        return (length_score + indicator_score) / 2

    def _evaluate_professionalism(self, text: str) -> float:
        """Evaluate professional tone and language."""
        # Case-insensitive slang checks.
        slang_patterns = [
            r'\b(yeah|yep|nah|gonna|wanna|gotta)\b',
            r'\b(awesome|cool|amazing|totally)\b'
        ]
        violations = sum(len(re.findall(pattern, text, re.IGNORECASE))
                         for pattern in slang_patterns)
        # Fix: these two must be case-sensitive — the original applied
        # re.IGNORECASE to [A-Z]{3,}, which made EVERY 3+-letter word count
        # as an all-caps violation.
        violations += len(re.findall(r'[!]{2,}', text))      # Multiple exclamation marks
        violations += len(re.findall(r'[A-Z]{3,}', text))    # ALL CAPS words
        professional_indicators = [
            'please', 'thank you', 'we appreciate', 'we recommend',
            'kindly', 'respectfully', 'sincerely'
        ]
        professional_count = sum(1 for indicator in professional_indicators
                                 if indicator in text.lower())
        violation_penalty = max(0, 1 - violations * 0.2)
        professional_bonus = min(0.5, professional_count * 0.1)
        return min(1.0, violation_penalty + professional_bonus)

    def _evaluate_tone(self, text: str, reference: str) -> float:
        """Heuristic tone score: ratio of courteous to curt wording.

        Fix: calculate_score called this method but the original never
        defined it, raising AttributeError at runtime.
        """
        positive = ['appreciate', 'delighted', 'pleased', 'glad', 'welcome', 'happy']
        negative = ['unfortunately', 'cannot', 'refuse', 'complaint', 'problem']
        lowered = text.lower()
        pos = sum(1 for word in positive if word in lowered)
        neg = sum(1 for word in negative if word in lowered)
        if pos + neg == 0:
            return 0.5  # neutral when no tone markers are present
        return pos / (pos + neg)

    def _evaluate_effectiveness(self, text: str, reference: str) -> float:
        """Heuristic effectiveness score: word overlap with the reference.

        Fix: calculate_score called this method but the original never
        defined it, raising AttributeError at runtime.
        """
        ref_words = set(reference.lower().split())
        if not ref_words:
            return 1.0
        pred_words = set(text.lower().split())
        return len(pred_words & ref_words) / len(ref_words)
class TechnicalAccuracyScorer(ScoringStrategy):
    """
    Scorer for technical content accuracy.

    Validates technical terminology, facts from a small knowledge base,
    and implementation details against reference answers.
    """

    def __init__(self, domain_knowledge: Dict[str, Any]):
        self.domain_knowledge = domain_knowledge
        self.technical_terms = domain_knowledge.get('technical_terms', [])
        self.facts_db = domain_knowledge.get('facts', {})

    def calculate_score(self, predictions: List[str], references: List[str]) -> float:
        """Return the mean technical-accuracy score over all (pred, ref) pairs."""
        if not predictions:
            return 0.0  # guard: previously ZeroDivisionError on empty input
        total_score = 0.0
        for pred, ref in zip(predictions, references):
            terminology_score = self._check_terminology(pred, ref)
            factual_score = self._check_facts(pred, ref)
            implementation_score = self._check_implementation(pred, ref)
            # Weighted combination: terminology and facts dominate.
            total_score += (
                0.4 * terminology_score +
                0.4 * factual_score +
                0.2 * implementation_score
            )
        return total_score / len(predictions)

    def _check_terminology(self, prediction: str, reference: str) -> float:
        """Fraction of the reference's technical terms also present in the prediction."""
        pred_terms = set(term.lower() for term in self.technical_terms
                         if term.lower() in prediction.lower())
        ref_terms = set(term.lower() for term in self.technical_terms
                        if term.lower() in reference.lower())
        if not ref_terms:
            return 1.0
        return len(pred_terms & ref_terms) / len(ref_terms)

    def _check_facts(self, prediction: str, reference: str) -> float:
        """Penalize 0.2 per knowledge-base fact that is mentioned but misstated."""
        score = 1.0
        for fact_key, fact_value in self.facts_db.items():
            if fact_key.lower() in prediction.lower():
                if str(fact_value).lower() not in prediction.lower():
                    score -= 0.2  # Penalty for incorrect fact
        return max(0.0, score)

    def _check_implementation(self, prediction: str, reference: str) -> float:
        """Word-overlap heuristic for implementation-detail accuracy.

        Fix: calculate_score called this method but the original never
        defined it, raising AttributeError at runtime.
        """
        ref_words = set(reference.lower().split())
        if not ref_words:
            return 1.0
        pred_words = set(prediction.lower().split())
        return len(pred_words & ref_words) / len(ref_words)
# Usage example
def custom_scoring_example():
    """Demonstrate both custom scoring strategies on sample data."""
    # Business writing evaluation
    business_scorer = BusinessWritingScorer()
    business_predictions = [
        "Dear valued customer, we sincerely appreciate your inquiry regarding our premium services. We would be delighted to provide you with comprehensive information.",
        "hey there! ur question is AWESOME!!! we totally love helping customers like u :)"
    ]
    business_references = [
        "Professional response with clear information and courteous tone.",
        "Professional response with clear information and courteous tone."
    ]
    business_scores = business_scorer.calculate_score(business_predictions, business_references)
    print("📝 Business Writing Scores:")
    print(f"Overall Score: {business_scores['overall_score']:.2f}")
    for component, score in business_scores['component_scores'].items():
        print(f"  {component.title()}: {score:.2f}")

    # Technical accuracy evaluation
    domain_knowledge = {
        'technical_terms': ['API', 'REST', 'JSON', 'authentication', 'HTTP'],
        'facts': {
            'HTTP status 200': 'success',
            'HTTP status 404': 'not found',
            'REST': 'architectural style'
        }
    }
    technical_scorer = TechnicalAccuracyScorer(domain_knowledge)
    technical_predictions = [
        "The REST API returns HTTP 200 status for successful requests and uses JSON format for data exchange.",
        "The REST API returns HTTP 200 status for failed requests and uses XML format."
    ]
    technical_references = [
        "REST APIs typically return HTTP 200 for success and use JSON format.",
        "REST APIs typically return HTTP 200 for success and use JSON format."
    ]
    technical_score = technical_scorer.calculate_score(technical_predictions, technical_references)
    print(f"\n🔧 Technical Accuracy Score: {technical_score:.2f}")


if __name__ == "__main__":
    custom_scoring_example()
🗄️ Custom Persistence Backends¶
**Integrate with your existing data infrastructure**
"""
Custom persistence backend examples
Demonstrates integration with various storage systems
"""
import json
import sqlite3
from typing import Any, Dict, List

from llm_evaluation_framework.persistence.base_store import BaseStore
class CustomDatabaseStore(BaseStore):
    """
    Custom SQLite persistence for evaluation results.

    Stores the full result payload as JSON alongside a few denormalized
    columns (accuracy, cost, test count) so results can be queried and
    aggregated with plain SQL.
    """

    def __init__(self, database_path: str):
        self.database_path = database_path
        self._initialize_database()

    def _initialize_database(self):
        """Create the evaluations table and its indexes if they don't exist."""
        with sqlite3.connect(self.database_path) as conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS evaluations (
                    id TEXT PRIMARY KEY,
                    model_name TEXT NOT NULL,
                    timestamp TEXT NOT NULL,
                    accuracy REAL,
                    total_cost REAL,
                    test_count INTEGER,
                    metadata TEXT,
                    full_results TEXT
                )
            ''')
            conn.execute('''
                CREATE INDEX IF NOT EXISTS idx_model_name ON evaluations(model_name)
            ''')
            conn.execute('''
                CREATE INDEX IF NOT EXISTS idx_timestamp ON evaluations(timestamp)
            ''')

    def save(self, key: str, data: Dict[str, Any]) -> bool:
        """Save evaluation results; returns True on success, False on failure."""
        try:
            with sqlite3.connect(self.database_path) as conn:
                metrics = data.get('aggregate_metrics', {})
                # Parameterized INSERT — never build SQL from strings.
                conn.execute('''
                    INSERT OR REPLACE INTO evaluations
                    (id, model_name, timestamp, accuracy, total_cost, test_count, metadata, full_results)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    key,
                    data.get('model_name', ''),
                    data.get('timestamp', ''),
                    metrics.get('accuracy', 0.0),
                    metrics.get('total_cost', 0.0),
                    metrics.get('test_count', 0),
                    json.dumps(data.get('metadata', {})),
                    json.dumps(data)
                ))
            return True
        except Exception as e:
            # Best-effort store: report and signal failure rather than raise.
            print(f"Save failed: {e}")
            return False

    def load(self, key: str) -> Dict[str, Any]:
        """Load the full evaluation payload for `key`.

        Raises:
            KeyError: if no row exists for `key`.
        """
        with sqlite3.connect(self.database_path) as conn:
            cursor = conn.execute('SELECT full_results FROM evaluations WHERE id = ?', (key,))
            row = cursor.fetchone()
            if row is None:
                raise KeyError(f"Key not found: {key}")
            return json.loads(row[0])

    def query_by_model(self, model_name: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Return the most recent evaluations for a model (newest first)."""
        with sqlite3.connect(self.database_path) as conn:
            cursor = conn.execute('''
                SELECT id, accuracy, total_cost, timestamp
                FROM evaluations
                WHERE model_name = ?
                ORDER BY timestamp DESC
                LIMIT ?
            ''', (model_name, limit))
            return [
                {
                    'id': row[0],
                    'accuracy': row[1],
                    'total_cost': row[2],
                    'timestamp': row[3]
                }
                for row in cursor.fetchall()
            ]

    def get_model_statistics(self, model_name: str) -> Dict[str, Any]:
        """Return aggregate statistics for a model (zeros if no rows exist)."""
        with sqlite3.connect(self.database_path) as conn:
            cursor = conn.execute('''
                SELECT
                    COUNT(*) as evaluation_count,
                    AVG(accuracy) as avg_accuracy,
                    MAX(accuracy) as max_accuracy,
                    MIN(accuracy) as min_accuracy,
                    SUM(total_cost) as total_cost_sum,
                    SUM(test_count) as total_tests
                FROM evaluations
                WHERE model_name = ?
            ''', (model_name,))
            row = cursor.fetchone()
            # `or 0` defaults cover the no-rows case where SQL aggregates are NULL.
            return {
                'evaluation_count': row[0],
                'average_accuracy': row[1] or 0.0,
                'max_accuracy': row[2] or 0.0,
                'min_accuracy': row[3] or 0.0,
                'total_cost': row[4] or 0.0,
                'total_tests': row[5] or 0
            }
def custom_persistence_example():
    """Demonstrate custom persistence backend usage."""
    # Fix: `time` was used below but never imported in this snippet.
    import time

    # Initialize custom database store
    db_store = CustomDatabaseStore('evaluation_results.db')

    # Setup evaluation framework with custom store.
    # NOTE(review): PersistenceManager and JsonStore are assumed to be
    # imported from the framework in the complete example file — confirm.
    persistence_manager = PersistenceManager({
        'database': db_store,
        'json': JsonStore('./backup_results/')
    })

    # Run evaluation and save to custom backend.
    # Fix: `generator` was undefined in the original snippet (NameError).
    registry = ModelRegistry()
    generator = TestDatasetGenerator()
    engine = ModelInferenceEngine(registry, persistence_manager)

    # Register model and run evaluation
    registry.register_model("gpt-3.5-turbo", {
        "provider": "openai",
        "capabilities": ["reasoning"]
    })
    test_cases = generator.generate_test_cases({"domain": "general"}, count=10)
    results = engine.evaluate_model("gpt-3.5-turbo", test_cases)

    # Save to custom database
    evaluation_id = f"eval_{int(time.time())}"
    db_store.save(evaluation_id, results)

    # Query using custom methods
    model_stats = db_store.get_model_statistics("gpt-3.5-turbo")
    print(f"📊 Model Statistics: {model_stats}")
    recent_evaluations = db_store.query_by_model("gpt-3.5-turbo", limit=5)
    print(f"📋 Recent Evaluations: {len(recent_evaluations)} found")


if __name__ == "__main__":
    custom_persistence_example()
🏢 Enterprise Examples¶
🔄 CI/CD Integration¶
**Integrate LLM evaluation into your development workflow** **📁 Complete Files**: - [`examples/ci_cd_integration.py`](../../examples/ci_cd_integration.py) - [`.github/workflows/llm_evaluation.yml`](../../.github/workflows/llm_evaluation.yml)
"""
CI/CD Integration Example
Automated LLM evaluation in continuous integration pipelines
"""
import json
import os
import sys
from typing import Any, Dict, List
class CIPipelineEvaluator:
    """
    Evaluation system designed for CI/CD pipelines.

    Includes result reporting, quality-threshold gating (accuracy, cost,
    latency), and artifact (JSON + markdown report) generation.
    """

    def __init__(self, config_path: str = "ci_evaluation_config.json"):
        self.config = self._load_config(config_path)
        self.registry = ModelRegistry()
        self.engine = ModelInferenceEngine(self.registry)
        self.test_results = []

    def _load_config(self, config_path: str) -> Dict[str, Any]:
        """Load CI configuration, falling back to defaults if the file is missing."""
        try:
            with open(config_path, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            return self._default_config()

    def _default_config(self) -> Dict[str, Any]:
        """Default CI configuration."""
        return {
            "models_to_test": ["gpt-3.5-turbo"],
            "test_suite": "regression",
            "accuracy_threshold": 0.85,
            "cost_threshold": 1.00,
            "timeout_threshold": 30.0,
            "generate_report": True,
            "fail_on_threshold": True
        }

    def run_regression_tests(self) -> Dict[str, Any]:
        """Run regression tests for all configured models.

        Exits the process with status 1 when `fail_on_threshold` is set
        and any model violates its thresholds.
        """
        print("🚀 Starting CI/CD LLM Evaluation Pipeline")
        print(f"Testing models: {self.config['models_to_test']}")

        # Load regression test suite
        test_cases = self._load_regression_tests()
        overall_results = {
            'pipeline_status': 'success',
            'model_results': {},
            'summary': {}
        }

        for model_name in self.config['models_to_test']:
            print(f"\n🤖 Testing model: {model_name}")
            try:
                # Register model from environment
                self._register_model_from_env(model_name)
                # Run evaluation
                results = self.engine.evaluate_model(model_name, test_cases)
                # Check thresholds
                threshold_results = self._check_thresholds(results, model_name)
                overall_results['model_results'][model_name] = {
                    'evaluation_results': results,
                    'threshold_checks': threshold_results,
                    'status': 'passed' if threshold_results['all_passed'] else 'failed'
                }
                if not threshold_results['all_passed']:
                    overall_results['pipeline_status'] = 'failed'
            except Exception as e:
                print(f"❌ Model {model_name} evaluation failed: {e}")
                overall_results['model_results'][model_name] = {
                    'status': 'error',
                    'error': str(e)
                }
                overall_results['pipeline_status'] = 'failed'

        # Generate summary
        overall_results['summary'] = self._generate_summary(overall_results)

        # Generate reports
        if self.config['generate_report']:
            self._generate_ci_report(overall_results)

        # Exit with appropriate code so CI marks the job as failed.
        if self.config['fail_on_threshold'] and overall_results['pipeline_status'] == 'failed':
            print("❌ Pipeline failed due to threshold violations")
            sys.exit(1)

        print("✅ Pipeline completed successfully")
        return overall_results

    def _generate_summary(self, overall_results: Dict[str, Any]) -> Dict[str, Any]:
        """Summarize per-model pass/fail/error counts.

        Fix: run_regression_tests called this method but the original
        never defined it, raising AttributeError at runtime.
        """
        statuses = [m.get('status') for m in overall_results['model_results'].values()]
        return {
            'models_tested': len(statuses),
            'passed': statuses.count('passed'),
            'failed': statuses.count('failed'),
            'errors': statuses.count('error'),
        }

    def _load_regression_tests(self) -> List[Dict]:
        """Load regression test cases from disk, or generate a fallback suite."""
        test_file = f"tests/regression_{self.config['test_suite']}.json"
        try:
            with open(test_file, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            # Generate minimal test suite if not found
            generator = TestDatasetGenerator()
            return generator.generate_test_cases(
                {"domain": "regression", "required_capabilities": ["reasoning"]},
                count=20
            )

    def _register_model_from_env(self, model_name: str):
        """Register a model using an API key from the environment.

        Raises:
            ValueError: if the expected environment variable is not set.
        """
        # e.g. "gpt-3.5-turbo" -> "GPT_3.5_TURBO_API_KEY" — keep the
        # original convention (only dashes are replaced).
        api_key = os.getenv(f"{model_name.upper().replace('-', '_')}_API_KEY")
        if not api_key:
            raise ValueError(f"API key not found for {model_name}")
        config = {
            "provider": "openai" if "gpt" in model_name else "anthropic",
            "api_key": api_key,
            "capabilities": ["reasoning", "creativity"]
        }
        self.registry.register_model(model_name, config)

    def _check_thresholds(self, results: Dict, model_name: str) -> Dict[str, Any]:
        """Check accuracy/cost/latency metrics against configured thresholds."""
        metrics = results['aggregate_metrics']
        threshold_checks = {}
        # Accuracy must meet or exceed the floor.
        threshold_checks['accuracy'] = {
            'value': metrics['accuracy'],
            'threshold': self.config['accuracy_threshold'],
            'passed': metrics['accuracy'] >= self.config['accuracy_threshold']
        }
        # Cost must stay at or under the ceiling.
        threshold_checks['cost'] = {
            'value': metrics['total_cost'],
            'threshold': self.config['cost_threshold'],
            'passed': metrics['total_cost'] <= self.config['cost_threshold']
        }
        # Average latency must stay at or under the ceiling.
        threshold_checks['response_time'] = {
            'value': metrics['average_response_time'],
            'threshold': self.config['timeout_threshold'],
            'passed': metrics['average_response_time'] <= self.config['timeout_threshold']
        }
        threshold_checks['all_passed'] = all(
            check['passed'] for check in threshold_checks.values()
        )
        # Print threshold results
        for metric, check in threshold_checks.items():
            if metric == 'all_passed':
                continue
            status = "✅" if check['passed'] else "❌"
            print(f"  {status} {metric}: {check['value']:.4f} "
                  f"(threshold: {check['threshold']:.4f})")
        return threshold_checks

    def _generate_ci_report(self, results: Dict[str, Any]):
        """Write JSON and markdown evaluation reports to ./reports/."""
        os.makedirs('reports', exist_ok=True)
        # JSON artifact (default=str handles non-serializable values).
        with open('reports/llm_evaluation_report.json', 'w') as f:
            json.dump(results, f, indent=2, default=str)
        # Markdown artifact
        markdown_report = self._create_markdown_report(results)
        with open('reports/llm_evaluation_report.md', 'w') as f:
            f.write(markdown_report)
        print("📄 Reports generated in ./reports/")

    def _create_markdown_report(self, results: Dict[str, Any]) -> str:
        """Render the results dict as a markdown report string."""
        report = "# LLM Evaluation Report\n\n"
        report += f"**Pipeline Status**: {results['pipeline_status'].upper()}\n\n"
        report += "## Model Results\n\n"
        for model_name, model_results in results['model_results'].items():
            report += f"### {model_name}\n\n"
            if model_results['status'] == 'error':
                report += f"❌ **Error**: {model_results['error']}\n\n"
                continue
            eval_results = model_results['evaluation_results']
            metrics = eval_results['aggregate_metrics']
            report += f"- **Status**: {model_results['status'].upper()}\n"
            report += f"- **Accuracy**: {metrics['accuracy']:.1%}\n"
            report += f"- **Total Cost**: ${metrics['total_cost']:.4f}\n"
            report += f"- **Average Response Time**: {metrics['average_response_time']:.2f}s\n"
            report += f"- **Success Rate**: {metrics['success_rate']:.1%}\n\n"
            # Threshold checks
            threshold_checks = model_results['threshold_checks']
            report += "#### Threshold Checks\n\n"
            for metric, check in threshold_checks.items():
                if metric == 'all_passed':
                    continue
                status = "✅ Pass" if check['passed'] else "❌ Fail"
                report += f"- **{metric.title()}**: {status} "
                report += f"({check['value']:.4f} vs {check['threshold']:.4f})\n"
            report += "\n"
        return report
# CI/CD Usage Script
def main():
    """Main CI/CD evaluation script."""
    # Initialize evaluator and run the regression suite.
    evaluator = CIPipelineEvaluator()
    results = evaluator.run_regression_tests()

    # Additional CI/CD specific actions
    if os.getenv('CI') == 'true':
        print("🔍 CI environment detected")
        # Set GitHub Actions outputs
        if os.getenv('GITHUB_ACTIONS') == 'true':
            with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
                f.write(f"evaluation_status={results['pipeline_status']}\n")
                f.write("report_path=reports/llm_evaluation_report.md\n")


if __name__ == "__main__":
    main()
🎯 Complete Workflow Examples¶
🔗 End-to-End Evaluation Pipeline¶
**Production-ready evaluation pipeline with all features** **📁 Complete File**: [`examples/full_pipeline_demo.py`](../../examples/full_pipeline_demo.py)
"""
Complete end-to-end evaluation pipeline
Demonstrates full framework capabilities in a production workflow
"""
def complete_evaluation_pipeline():
"""
Comprehensive evaluation pipeline demonstrating all framework features
"""
print("๐ Starting Complete LLM Evaluation Pipeline")
print("=" * 60)
# 1๏ธโฃ SETUP AND CONFIGURATION
print("\n๐ Step 1: Setup and Configuration")
# Initialize all components
registry = ModelRegistry()
generator = TestDatasetGenerator()
# Setup custom persistence with multiple backends
persistence_manager = PersistenceManager({
'json': JsonStore('./results/json/'),
'database': CustomDatabaseStore('./results/evaluations.db')
})
# Initialize engines
sync_engine = ModelInferenceEngine(registry, persistence_manager)
async_engine = AsyncInferenceEngine(registry, persistence_manager)
# Setup custom scoring
custom_scorer = ComprehensiveScorer({
'accuracy': 0.4,
'fluency': 0.2,
'relevance': 0.2,
'safety': 0.2
})
print("โ
Components initialized")
# 2๏ธโฃ MODEL REGISTRATION
print("\n๐ค Step 2: Model Registration")
models_config = {
"gpt-3.5-turbo": {
"provider": "openai",
"api_cost_input": 0.0015,
"api_cost_output": 0.002,
"capabilities": ["reasoning", "creativity", "coding"],
"parameters": {"temperature": 0.7, "max_tokens": 1000}
},
"claude-3-sonnet": {
"provider": "anthropic",
"api_cost_input": 0.003,
"api_cost_output": 0.015,
"capabilities": ["reasoning", "analysis", "writing"],
"parameters": {"temperature": 0.7, "max_tokens": 1000}
},
"gpt-4": {
"provider": "openai",
"api_cost_input": 0.03,
"api_cost_output": 0.06,
"capabilities": ["reasoning", "creativity", "coding", "analysis"],
"parameters": {"temperature": 0.5, "max_tokens": 1500}
}
}
for model_name, config in models_config.items():
registry.register_model(model_name, config)
print(f" โ
Registered {model_name}")
# 3๏ธโฃ TEST CASE GENERATION
print("\n๐งช Step 3: Test Case Generation")
# Generate multiple test suites for different capabilities
test_suites = {}
use_cases = [
{"domain": "customer_service", "required_capabilities": ["reasoning", "empathy"], "difficulty": "medium"},
{"domain": "technical_writing", "required_capabilities": ["analysis", "clarity"], "difficulty": "hard"},
{"domain": "creative_writing", "required_capabilities": ["creativity", "fluency"], "difficulty": "medium"},
{"domain": "code_review", "required_capabilities": ["coding", "analysis"], "difficulty": "hard"}
]
for use_case in use_cases:
suite_name = use_case["domain"]
test_cases = generator.generate_test_cases(use_case, count=25)
test_suites[suite_name] = {
'use_case': use_case,
'test_cases': test_cases
}
print(f" โ
Generated {len(test_cases)} tests for {suite_name}")
# 4๏ธโฃ EVALUATION EXECUTION
print("\nโก Step 4: Evaluation Execution")
all_results = {}
for model_name in models_config.keys():
print(f"\n๐ Evaluating {model_name}...")
model_results = {}
for suite_name, suite_data in test_suites.items():
print(f" ๐งช Running {suite_name} tests...")
try:
# Run evaluation with custom scoring
results = sync_engine.evaluate_model(
model_name=model_name,
test_cases=suite_data['test_cases'],
use_case=suite_data['use_case'],
scoring_strategy=custom_scorer
)
model_results[suite_name] = results
# Display immediate results
metrics = results['aggregate_metrics']
print(f" โ
Accuracy: {metrics['accuracy']:.1%}, "
f"Cost: ${metrics['total_cost']:.4f}, "
f"Time: {metrics['average_response_time']:.2f}s")
except Exception as e:
print(f" โ Failed: {e}")
model_results[suite_name] = {'error': str(e)}
all_results[model_name] = model_results
# 5๏ธโฃ ADVANCED ANALYTICS
print("\n๐ Step 5: Advanced Analytics")
# Cross-model comparison
comparison_analysis = analyze_cross_model_performance(all_results)
# Cost-benefit analysis
cost_analysis = analyze_cost_efficiency(all_results)
# Capability analysis
capability_analysis = analyze_capability_performance(all_results, test_suites)
print("โ
Analytics completed")
# 6๏ธโฃ REPORT GENERATION
print("\n๐ Step 6: Report Generation")
# Generate comprehensive report
report = generate_comprehensive_report({
'evaluation_results': all_results,
'comparison_analysis': comparison_analysis,
'cost_analysis': cost_analysis,
'capability_analysis': capability_analysis,
'test_suites': test_suites
})
# Save reports in multiple formats
save_report(report, 'comprehensive_evaluation_report')
print("โ
Reports generated")
# 7๏ธโฃ RECOMMENDATIONS
print("\n๐ก Step 7: AI-Powered Recommendations")
# Generate recommendations using auto-suggestion engine
suggestion_engine = AutoSuggestionEngine(registry, all_results)
recommendations = suggestion_engine.generate_recommendations()
print("๐ RECOMMENDATIONS:")
for recommendation in recommendations:
print(f" โข {recommendation}")
print("\n๐ Pipeline completed successfully!")
print("=" * 60)
return {
'results': all_results,
'analytics': {
'comparison': comparison_analysis,
'cost': cost_analysis,
'capability': capability_analysis
},
'recommendations': recommendations,
'report': report
}
def analyze_cross_model_performance(results: Dict) -> Dict:
    """Summarize per-model performance across all test suites.

    Args:
        results: Mapping of model name -> suite name -> evaluation results.
            Each suite entry is either a dict containing 'aggregate_metrics'
            or an error record (e.g. {'error': '...'}).

    Returns:
        Mapping of model name -> summary dict with 'suites_evaluated',
        'suites_failed', 'mean_accuracy', and 'mean_response_time'
        (the means are None when no suite produced metrics).
    """
    analysis = {}
    for model_name, suites in results.items():
        accuracies = []
        response_times = []
        failures = 0
        for suite_results in suites.values():
            metrics = (suite_results.get('aggregate_metrics')
                       if isinstance(suite_results, dict) else None)
            if metrics is None:
                # Suite errored out (or produced no metrics) — count it.
                failures += 1
                continue
            accuracies.append(metrics.get('accuracy', 0.0))
            response_times.append(metrics.get('average_response_time', 0.0))
        count = len(accuracies)
        analysis[model_name] = {
            'suites_evaluated': count,
            'suites_failed': failures,
            'mean_accuracy': sum(accuracies) / count if count else None,
            'mean_response_time': sum(response_times) / count if count else None,
        }
    return analysis
def analyze_cost_efficiency(results: Dict) -> Dict:
    """Analyze cost efficiency across models and test suites.

    Args:
        results: Mapping of model name -> suite name -> evaluation results
            (each either a dict with 'aggregate_metrics' or an error record).

    Returns:
        Mapping of model name -> dict with 'total_cost', 'mean_accuracy',
        and 'accuracy_per_dollar' (None when total cost is zero).
    """
    analysis = {}
    for model_name, suites in results.items():
        total_cost = 0.0
        accuracies = []
        for suite_results in suites.values():
            if not isinstance(suite_results, dict) or 'aggregate_metrics' not in suite_results:
                continue  # skip suites that failed to evaluate
            metrics = suite_results['aggregate_metrics']
            total_cost += metrics.get('total_cost', 0.0)
            accuracies.append(metrics.get('accuracy', 0.0))
        mean_accuracy = sum(accuracies) / len(accuracies) if accuracies else 0.0
        analysis[model_name] = {
            'total_cost': total_cost,
            'mean_accuracy': mean_accuracy,
            # Guard against division by zero for free/failed evaluations.
            'accuracy_per_dollar': (mean_accuracy / total_cost) if total_cost else None,
        }
    return analysis
def analyze_capability_performance(results: Dict, test_suites: Dict) -> Dict:
    """Analyze performance by capability.

    Attributes each suite's accuracy to every capability the suite's use
    case requires, then averages per capability and model.

    Args:
        results: Mapping of model name -> suite name -> evaluation results.
        test_suites: Mapping of suite name -> {'use_case': {...},
            'test_cases': [...]} as built by the pipeline.

    Returns:
        Mapping of capability -> model name -> mean accuracy over the
        suites requiring that capability.
    """
    # capability -> model -> list of suite accuracies
    scores = {}
    for suite_name, suite_data in test_suites.items():
        capabilities = suite_data.get('use_case', {}).get('required_capabilities', [])
        for model_name, suites in results.items():
            suite_results = suites.get(suite_name)
            if not isinstance(suite_results, dict) or 'aggregate_metrics' not in suite_results:
                continue  # failed or missing suite: contributes nothing
            accuracy = suite_results['aggregate_metrics'].get('accuracy', 0.0)
            for capability in capabilities:
                scores.setdefault(capability, {}).setdefault(model_name, []).append(accuracy)
    return {
        capability: {
            model: sum(values) / len(values)
            for model, values in per_model.items()
        }
        for capability, per_model in scores.items()
    }
def generate_comprehensive_report(data: Dict) -> Dict:
    """Generate a comprehensive evaluation report structure.

    Args:
        data: Pipeline artifacts with keys 'evaluation_results',
            'comparison_analysis', 'cost_analysis', 'capability_analysis',
            and 'test_suites'.

    Returns:
        A serializable report dict: a 'summary' section (model and suite
        names) plus the analytics sections and raw evaluation results.
    """
    evaluation_results = data.get('evaluation_results', {})
    return {
        'summary': {
            'models_evaluated': sorted(evaluation_results),
            'test_suites': sorted(data.get('test_suites', {})),
        },
        'comparison_analysis': data.get('comparison_analysis'),
        'cost_analysis': data.get('cost_analysis'),
        'capability_analysis': data.get('capability_analysis'),
        'evaluation_results': evaluation_results,
    }
def save_report(report: Dict, filename: str):
    """Save *report* under reports/ in both JSON and Markdown formats.

    Args:
        report: Serializable report dict (non-JSON values fall back to str()).
        filename: Base name (without extension) for the output files.

    Side effects:
        Creates the reports/ directory if needed and writes
        reports/<filename>.json and reports/<filename>.md.
    """
    # Ensure the output directory exists before writing.
    os.makedirs('reports', exist_ok=True)
    # Save as JSON; default=str keeps non-serializable values from raising.
    with open(f'reports/{filename}.json', 'w') as f:
        json.dump(report, f, indent=2, default=str)
    # Save as Markdown
    markdown_content = create_markdown_report(report)
    with open(f'reports/{filename}.md', 'w') as f:
        f.write(markdown_content)
    print(f"๐ Reports saved: {filename}.json, {filename}.md")
if __name__ == "__main__":
pipeline_results = complete_evaluation_pipeline()
๐ Quick Navigation to Examples¶
### ๐ **Example Files Index** | Category | Example | File | Description | |----------|---------|------|-------------| | **๐ Basic** | Simple Evaluation | [`basic_usage.py`](../../examples/basic_usage.py) | Quick model evaluation setup | | **๐ Basic** | Model Comparison | [`model_comparison.py`](../../examples/model_comparison.py) | Compare multiple models side-by-side | | **โก Performance** | Async Processing | [`advanced_async_usage.py`](../../examples/advanced_async_usage.py) | High-throughput concurrent evaluation | | **โก Performance** | Streaming Results | [`streaming_evaluation.py`](../../examples/streaming_evaluation.py) | Real-time result processing | | **๐ง Custom** | Custom Scoring | [`custom_scoring_and_persistence.py`](../../examples/custom_scoring_and_persistence.py) | Domain-specific evaluation metrics | | **๐ง Custom** | Custom Persistence | [`custom_persistence.py`](../../examples/custom_persistence.py) | Database and storage integration | | **๐ข Enterprise** | CI/CD Integration | [`ci_cd_integration.py`](../../examples/ci_cd_integration.py) | Automated pipeline evaluation | | **๐ข Enterprise** | Complete Workflow | [`full_pipeline_demo.py`](../../examples/full_pipeline_demo.py) | End-to-end production pipeline | | **๐ Analytics** | Data Analysis | [`dataset_generation_and_analysis.py`](../../examples/dataset_generation_and_analysis.py) | Test data generation and analysis | | **๐ก๏ธ Production** | Error Handling | [`error_handling_and_logging.py`](../../examples/error_handling_and_logging.py) | Robust error handling patterns | | **๐ฅ๏ธ CLI** | Command Line | [`cli_usage.py`](../../examples/cli_usage.py) | CLI workflow examples | | **๐ก AI** | Model Recommendations | [`model_recommendation.py`](../../examples/model_recommendation.py) | AI-powered model suggestions |
## ๐ Ready to Build Amazing Applications? **Choose your path and start building with confidence!** [Getting Started](getting-started.md) [Documentation Home](../index.md) [Advanced Usage](advanced-usage.md) [Contributing](../contributing.md) --- *Transform your ideas into production-ready LLM evaluation systems! ๐ก*