API Reference¶
API Overview¶
The LLM Evaluation Framework provides a comprehensive Python API with full type hints, detailed documentation, and enterprise-grade reliability. All components are designed for ease of use, extensibility, and production deployment.
๐๏ธ Module Structure¶
๐ฆ Import Structure¶
# Main framework imports
from llm_evaluation_framework import (
ModelRegistry,
ModelInferenceEngine,
TestDatasetGenerator
)
# Specialized imports
from llm_evaluation_framework.evaluation.scoring_strategies import (
AccuracyScoringStrategy,
F1ScoringStrategy,
ScoringContext
)
from llm_evaluation_framework.persistence import (
JSONStore,
DBStore,
PersistenceManager
)
from llm_evaluation_framework.utils import (
setup_logging,
retry_with_backoff,
validate_model_config
)
๐๏ธ ModelRegistry¶
Primary interface for managing model configurations and metadata.
๐ Class Definition¶
class ModelRegistry:
"""
Centralized registry for model configurations, capabilities, and metadata.
Provides methods to register, retrieve, and manage LLM model configurations
with support for multiple providers and custom parameters.
"""
๐ง Methods¶
register_model(name: str, config: Dict[str, Any]) -> None¶
Register a new model with configuration.
Parameters: - name (str): Unique model identifier - config (Dict[str, Any]): Model configuration dictionary
Configuration Schema:
{
"provider": str, # e.g., "openai", "anthropic", "azure"
"api_cost_input": float, # Cost per 1K input tokens
"api_cost_output": float, # Cost per 1K output tokens
"capabilities": List[str], # Supported capabilities
"parameters": Dict, # Model-specific parameters
"metadata": Dict # Additional metadata (optional)
}
Example:
registry = ModelRegistry()
registry.register_model("gpt-3.5-turbo", {
"provider": "openai",
"api_cost_input": 0.0015,
"api_cost_output": 0.002,
"capabilities": ["reasoning", "creativity", "coding"],
"parameters": {
"temperature": 0.7,
"max_tokens": 150,
"top_p": 1.0
},
"metadata": {
"version": "0125",
"context_window": 4096
}
})
get_model(name: str) -> Dict[str, Any]¶
Retrieve model configuration by name.
Parameters: - name (str): Model name to retrieve
Returns: - Dict[str, Any]: Model configuration dictionary
Raises: - KeyError: If model not found
Example:
try:
config = registry.get_model("gpt-3.5-turbo")
print(f"Provider: {config['provider']}")
print(f"Capabilities: {config['capabilities']}")
except KeyError:
print("Model not found")
list_models() -> List[str]¶
Get list of all registered model names.
Returns: - List[str]: List of registered model names
Example:
get_models_by_capability(capability: str) -> List[str]¶
Find models that support a specific capability.
Parameters: - capability (str): Capability to search for
Returns: - List[str]: List of model names supporting the capability
Example:
coding_models = registry.get_models_by_capability("coding")
print(f"Models with coding capability: {coding_models}")
get_available_capabilities() -> List[str]¶
Get all unique capabilities across registered models.
Returns: - List[str]: List of all available capabilities
Example:
๐ก Usage Examples¶
๐ Advanced ModelRegistry Examples
from llm_evaluation_framework import ModelRegistry
# Initialize registry
registry = ModelRegistry()
# Register multiple models
models_config = {
"gpt-3.5-turbo": {
"provider": "openai",
"api_cost_input": 0.0015,
"api_cost_output": 0.002,
"capabilities": ["reasoning", "creativity", "coding"],
"parameters": {"temperature": 0.7, "max_tokens": 150}
},
"gpt-4": {
"provider": "openai",
"api_cost_input": 0.03,
"api_cost_output": 0.06,
"capabilities": ["reasoning", "creativity", "coding", "factual"],
"parameters": {"temperature": 0.5, "max_tokens": 200}
},
"claude-3": {
"provider": "anthropic",
"api_cost_input": 0.015,
"api_cost_output": 0.075,
"capabilities": ["reasoning", "creativity", "instruction"],
"parameters": {"temperature": 0.6, "max_tokens": 180}
}
}
# Register all models
for name, config in models_config.items():
registry.register_model(name, config)
print(f"✅ Registered {name}")
# Query capabilities
print("\n📊 Capability Analysis:")
for capability in registry.get_available_capabilities():
models = registry.get_models_by_capability(capability)
print(f"  {capability}: {len(models)} models - {models}")
# Cost comparison
print("\n💰 Cost Comparison:")
for model_name in registry.list_models():
config = registry.get_model(model_name)
cost_input = config['api_cost_input']
cost_output = config['api_cost_output']
print(f"  {model_name}: ${cost_input:.4f}/${cost_output:.4f} per 1K tokens")
โ๏ธ ModelInferenceEngine¶
Core engine for executing model evaluations and managing inference workflows.
๐ Class Definition¶
class ModelInferenceEngine:
"""
Primary engine for executing LLM inferences and evaluations.
Handles model execution, cost tracking, performance metrics,
and result aggregation with support for batch processing.
"""
def __init__(self, model_registry: ModelRegistry):
"""
Initialize inference engine with model registry.
Args:
model_registry: ModelRegistry instance for model management
"""
๐ง Methods¶
evaluate_model(model_name: str, test_cases: List[Dict], use_case_requirements: Dict = None) -> Dict[str, Any]¶
Execute comprehensive model evaluation.
Parameters: - model_name (str): Name of registered model to evaluate - test_cases (List[Dict]): List of test case dictionaries - use_case_requirements (Dict, optional): Evaluation requirements
Returns: - Dict[str, Any]: Comprehensive evaluation results
Test Case Schema:
{
"prompt": str, # Input prompt
"expected_output": str, # Expected response
"evaluation_criteria": str, # Evaluation criteria
"metadata": Dict # Additional metadata
}
Results Schema:
{
"model_name": str,
"timestamp": str,
"aggregate_metrics": {
"accuracy": float, # Overall accuracy (0.0-1.0)
"total_cost": float, # Total evaluation cost ($)
"total_time": float, # Total time (seconds)
"average_response_time": float, # Avg time per test (seconds)
"test_count": int # Number of tests executed
},
"test_results": [
{
"test_id": int,
"prompt": str,
"expected": str,
"actual": str,
"score": float, # Individual test score (0.0-1.0)
"cost": float, # Test cost ($)
"response_time": float, # Test execution time (seconds)
"metadata": Dict
}
],
"model_config": Dict, # Model configuration used
"evaluation_config": Dict # Evaluation parameters
}
Example:
from llm_evaluation_framework import ModelInferenceEngine, ModelRegistry
# Setup
registry = ModelRegistry()
registry.register_model("gpt-3.5-turbo", {...})
engine = ModelInferenceEngine(registry)
# Test cases
test_cases = [
{
"prompt": "What is 2 + 2?",
"expected_output": "4",
"evaluation_criteria": "Exact numerical answer",
"metadata": {"category": "math", "difficulty": "easy"}
},
{
"prompt": "Explain photosynthesis briefly",
"expected_output": "Process where plants convert light to energy",
"evaluation_criteria": "Accurate scientific explanation",
"metadata": {"category": "science", "difficulty": "medium"}
}
]
# Run evaluation
results = engine.evaluate_model("gpt-3.5-turbo", test_cases)
# Access results
print(f"Accuracy: {results['aggregate_metrics']['accuracy']:.1%}")
print(f"Cost: ${results['aggregate_metrics']['total_cost']:.4f}")
print(f"Time: {results['aggregate_metrics']['total_time']:.2f}s")
# Individual test results
for test_result in results['test_results']:
print(f"Test {test_result['test_id']}: {test_result['score']:.1%}")
evaluate_batch(evaluations: List[Dict]) -> List[Dict]¶
Execute multiple evaluations in batch mode.
Parameters: - evaluations (List[Dict]): List of evaluation configurations
Evaluation Config Schema:
{
"model_name": str,
"test_cases": List[Dict],
"use_case_requirements": Dict,
"evaluation_id": str # Optional unique identifier
}
Returns: - List[Dict]: List of evaluation results
Example:
# Batch evaluation setup
evaluations = [
{
"model_name": "gpt-3.5-turbo",
"test_cases": reasoning_tests,
"use_case_requirements": {"capability": "reasoning"},
"evaluation_id": "reasoning_eval_001"
},
{
"model_name": "gpt-4",
"test_cases": reasoning_tests,
"use_case_requirements": {"capability": "reasoning"},
"evaluation_id": "reasoning_eval_002"
}
]
# Execute batch
batch_results = engine.evaluate_batch(evaluations)
# Process results
for result in batch_results:
model = result['model_name']
accuracy = result['aggregate_metrics']['accuracy']
print(f"{model}: {accuracy:.1%}")
compare_models(model_names: List[str], test_cases: List[Dict]) -> Dict[str, Any]¶
Compare multiple models on the same test cases.
Parameters: - model_names (List[str]): List of model names to compare - test_cases (List[Dict]): Common test cases for all models
Returns: - Dict[str, Any]: Comparative analysis results
Example:
# Model comparison
comparison = engine.compare_models(
model_names=["gpt-3.5-turbo", "gpt-4", "claude-3"],
test_cases=standard_test_suite
)
# Results include:
# - Individual model results
# - Comparative metrics
# - Cost analysis
# - Performance rankings
print("๐ Model Comparison Results:")
for model, metrics in comparison['model_metrics'].items():
print(f" {model}: {metrics['accuracy']:.1%} - ${metrics['cost']:.4f}")
๐ก Usage Examples¶
๐ Advanced ModelInferenceEngine Examples
from llm_evaluation_framework import ModelInferenceEngine, ModelRegistry
from llm_evaluation_framework.persistence import JSONStore
import time
# Setup
registry = ModelRegistry()
engine = ModelInferenceEngine(registry)
# Register models for comparison
models_to_test = {
"gpt-3.5-turbo": {"provider": "openai", "api_cost_input": 0.0015, "api_cost_output": 0.002},
"gpt-4": {"provider": "openai", "api_cost_input": 0.03, "api_cost_output": 0.06}
}
for name, config in models_to_test.items():
registry.register_model(name, config)
# Create comprehensive test suite
test_categories = {
"reasoning": [
{"prompt": "If A>B and B>C, what's the relationship between A and C?",
"expected_output": "A > C",
"evaluation_criteria": "Logical reasoning"},
{"prompt": "What comes next: 2, 4, 8, 16, ?",
"expected_output": "32",
"evaluation_criteria": "Pattern recognition"}
],
"creativity": [
{"prompt": "Write a haiku about coding",
"expected_output": "Short creative poem with 5-7-5 structure",
"evaluation_criteria": "Creative expression, proper haiku format"},
{"prompt": "Create a metaphor for debugging",
"expected_output": "Creative comparison explaining debugging",
"evaluation_criteria": "Metaphorical thinking, accuracy"}
]
}
# Comprehensive evaluation workflow
results_summary = {}
for category, tests in test_categories.items():
print(f"\n๐งช Testing {category.upper()} capability...")
category_results = {}
for model_name in models_to_test.keys():
print(f" ๐ค Evaluating {model_name}...")
# Run evaluation
result = engine.evaluate_model(
model_name=model_name,
test_cases=tests,
use_case_requirements={"capability": category}
)
category_results[model_name] = result
# Log key metrics
metrics = result['aggregate_metrics']
print(f"      ✅ Accuracy: {metrics['accuracy']:.1%}")
print(f"      💰 Cost: ${metrics['total_cost']:.4f}")
print(f"      ⏱️ Time: {metrics['total_time']:.2f}s")
results_summary[category] = category_results
# Generate comparison report
print("\n๐ COMPREHENSIVE COMPARISON REPORT")
print("=" * 50)
for category, category_results in results_summary.items():
print(f"\n{category.upper()} Results:")
# Sort by accuracy
sorted_results = sorted(
category_results.items(),
key=lambda x: x[1]['aggregate_metrics']['accuracy'],
reverse=True
)
for rank, (model, result) in enumerate(sorted_results, 1):
metrics = result['aggregate_metrics']
print(f" #{rank} {model}:")
print(f" ๐ Accuracy: {metrics['accuracy']:.1%}")
print(f" ๐ต Cost: ${metrics['total_cost']:.4f}")
print(f" ๐ Speed: {metrics['average_response_time']:.2f}s/test")
# Save detailed results
store = JSONStore("comprehensive_evaluation_results.json")
store.save("full_comparison", results_summary)
print(f"\n๐พ Detailed results saved to comprehensive_evaluation_results.json")
# Cost-effectiveness analysis
print(f"\n๐ก COST-EFFECTIVENESS ANALYSIS")
for category, category_results in results_summary.items():
print(f"\n{category.title()} - Best Value:")
best_value = min(
category_results.items(),
key=lambda x: x[1]['aggregate_metrics']['total_cost'] / max(x[1]['aggregate_metrics']['accuracy'], 0.01)
)
model_name, result = best_value
metrics = result['aggregate_metrics']
value_score = metrics['accuracy'] / metrics['total_cost'] if metrics['total_cost'] > 0 else float('inf')
print(f" ๐ {model_name}: {value_score:.0f} accuracy points per $")
print(f" {metrics['accuracy']:.1%} accuracy at ${metrics['total_cost']:.4f}")
๐งช TestDatasetGenerator¶
Generates synthetic test datasets for model evaluation across different capabilities.
๐ Class Definition¶
class TestDatasetGenerator:
"""
Generator for synthetic test datasets with capability-based scenarios.
Creates realistic test cases for evaluating LLM performance across
different domains and capabilities with customizable parameters.
"""
๐ง Methods¶
generate_test_cases(use_case_requirements: Dict[str, Any], count: int = 5) -> List[Dict[str, Any]]¶
Generate test cases based on requirements.
Parameters: - use_case_requirements (Dict[str, Any]): Generation requirements - count (int): Number of test cases to generate (default: 5)
Requirements Schema:
{
"domain": str, # e.g., "general", "medical", "legal"
"required_capabilities": List[str], # Capabilities to test
"difficulty_level": str, # "easy", "medium", "hard"
"language": str, # Language code (default: "en")
"custom_context": str # Additional context (optional)
}
Returns: - List[Dict[str, Any]]: Generated test cases
Example:
from llm_evaluation_framework import TestDatasetGenerator
generator = TestDatasetGenerator()
# Basic generation
requirements = {
"domain": "general",
"required_capabilities": ["reasoning"],
"difficulty_level": "medium"
}
test_cases = generator.generate_test_cases(requirements, count=10)
for i, test_case in enumerate(test_cases, 1):
print(f"\nTest Case {i}:")
print(f" Prompt: {test_case['prompt']}")
print(f" Expected: {test_case['expected_output']}")
print(f" Criteria: {test_case['evaluation_criteria']}")
generate_reasoning_tests(count: int = 5, difficulty: str = "medium") -> List[Dict]¶
Generate logic and reasoning test cases.
Parameters: - count (int): Number of tests to generate - difficulty (str): Difficulty level
Example:
reasoning_tests = generator.generate_reasoning_tests(count=8, difficulty="hard")
# Example output:
# [
# {
# "prompt": "If all roses are flowers and some flowers are red, can we conclude that some roses are red?",
# "expected_output": "No, we cannot definitively conclude that some roses are red...",
# "evaluation_criteria": "Logical reasoning, syllogistic logic understanding"
# }
# ]
generate_creativity_tests(count: int = 5, theme: str = "general") -> List[Dict]¶
Generate creative writing and ideation tests.
Parameters: - count (int): Number of tests to generate - theme (str): Creative theme or domain
Example:
creativity_tests = generator.generate_creativity_tests(count=5, theme="technology")
# Example output:
# [
# {
# "prompt": "Write a short story about an AI that discovers emotions",
# "expected_output": "Creative narrative with emotional development...",
# "evaluation_criteria": "Creativity, narrative structure, emotional depth"
# }
# ]
generate_coding_tests(count: int = 5, language: str = "python") -> List[Dict]¶
Generate programming and code-related tests.
Parameters: - count (int): Number of tests to generate - language (str): Programming language focus
Example:
coding_tests = generator.generate_coding_tests(count=6, language="python")
# Example output:
# [
# {
# "prompt": "Write a Python function to find the longest palindromic substring",
# "expected_output": "def longest_palindrome(s): ...",
# "evaluation_criteria": "Correct algorithm, efficient implementation, edge case handling"
# }
# ]
๐ฏ Available Capabilities¶
| Capability | Description | Example Tests |
|---|---|---|
| reasoning | Logic, problem-solving, critical thinking | Syllogisms, math problems, puzzles |
| creativity | Creative writing, ideation, artistic expression | Stories, poems, creative solutions |
| coding | Programming, algorithms, code analysis | Function writing, debugging, optimization |
| factual | Knowledge recall, fact verification | Historical facts, scientific data |
| instruction | Following complex multi-step instructions | Procedures, recipes, assembly guides |
๐ก Usage Examples¶
๐ Advanced TestDatasetGenerator Examples
from llm_evaluation_framework import TestDatasetGenerator
from llm_evaluation_framework.persistence import JSONStore
generator = TestDatasetGenerator()
# Multi-capability dataset generation
capabilities = ["reasoning", "creativity", "coding", "factual", "instruction"]
difficulty_levels = ["easy", "medium", "hard"]
comprehensive_dataset = {}
for capability in capabilities:
print(f"๐งช Generating {capability} tests...")
capability_tests = {}
for difficulty in difficulty_levels:
# Generate tests for each difficulty level
requirements = {
"domain": "general",
"required_capabilities": [capability],
"difficulty_level": difficulty
}
tests = generator.generate_test_cases(requirements, count=10)
capability_tests[difficulty] = tests
print(f"  ✅ {difficulty}: {len(tests)} tests")
comprehensive_dataset[capability] = capability_tests
# Domain-specific dataset generation
domains = ["medical", "legal", "technical", "creative", "academic"]
domain_datasets = {}
for domain in domains:
print(f"๐ฅ Generating {domain} domain tests...")
domain_requirements = {
"domain": domain,
"required_capabilities": ["reasoning", "factual"],
"difficulty_level": "medium",
"custom_context": f"Focus on {domain}-specific scenarios and terminology"
}
domain_tests = generator.generate_test_cases(domain_requirements, count=15)
domain_datasets[domain] = domain_tests
print(f"  ✅ Generated {len(domain_tests)} {domain} tests")
# Specialized test generation
print("๐ฏ Generating specialized test suites...")
# Coding challenge suite
coding_suite = {
"algorithms": generator.generate_coding_tests(count=10, language="python"),
"data_structures": generator.generate_test_cases({
"domain": "computer_science",
"required_capabilities": ["coding", "reasoning"],
"difficulty_level": "hard",
"custom_context": "Focus on data structure implementation and analysis"
}, count=8),
"debugging": generator.generate_test_cases({
"domain": "software_development",
"required_capabilities": ["coding"],
"difficulty_level": "medium",
"custom_context": "Present buggy code that needs to be fixed"
}, count=6)
}
# Creative writing suite
creative_suite = {
"storytelling": generator.generate_creativity_tests(count=8, theme="fiction"),
"poetry": generator.generate_test_cases({
"domain": "literature",
"required_capabilities": ["creativity"],
"difficulty_level": "medium",
"custom_context": "Focus on different poetic forms and styles"
}, count=6),
"ideation": generator.generate_test_cases({
"domain": "business",
"required_capabilities": ["creativity", "reasoning"],
"difficulty_level": "medium",
"custom_context": "Generate innovative business solutions and ideas"
}, count=10)
}
# Advanced reasoning suite
reasoning_suite = {
"logical_puzzles": generator.generate_reasoning_tests(count=12, difficulty="hard"),
"mathematical_reasoning": generator.generate_test_cases({
"domain": "mathematics",
"required_capabilities": ["reasoning"],
"difficulty_level": "hard",
"custom_context": "Complex mathematical proofs and problem-solving"
}, count=8),
"causal_reasoning": generator.generate_test_cases({
"domain": "philosophy",
"required_capabilities": ["reasoning"],
"difficulty_level": "medium",
"custom_context": "Cause-and-effect relationships and logical inference"
}, count=10)
}
# Save all generated datasets
store = JSONStore("test_datasets.json")
datasets_to_save = {
"comprehensive": comprehensive_dataset,
"domain_specific": domain_datasets,
"coding_suite": coding_suite,
"creative_suite": creative_suite,
"reasoning_suite": reasoning_suite
}
for dataset_name, dataset in datasets_to_save.items():
store.save(dataset_name, dataset)
print(f"๐พ Saved {dataset_name} dataset")
# Generate summary statistics
total_tests = 0
for dataset in datasets_to_save.values():
total_tests += sum(len(v) if isinstance(v, list) else
sum(len(vv) if isinstance(vv, list) else
sum(len(vvv) for vvv in vv.values()) if isinstance(vv, dict) else 0
for vv in v.values())
for v in dataset.values())
print(f"\n๐ Dataset Generation Complete!")
print(f"๐ Total Tests Generated: {total_tests}")
print(f"๐ Datasets: {list(datasets_to_save.keys())}")
print(f"๐พ Saved to: test_datasets.json")
๐ Evaluation & Scoring¶
ScoringContext¶
Strategy pattern implementation for different scoring algorithms.
class ScoringContext:
"""
Context class for scoring strategies using strategy pattern.
Allows switching between different scoring algorithms at runtime
while maintaining a consistent interface.
"""
def __init__(self, strategy: ScoringStrategy):
"""Initialize with a scoring strategy."""
self.strategy = strategy
def evaluate(self, predictions: List[str], references: List[str]) -> float:
"""
Evaluate predictions against references using the current strategy.
Args:
predictions: List of model predictions
references: List of reference/expected outputs
Returns:
float: Score between 0.0 and 1.0
"""
return self.strategy.calculate_score(predictions, references)
Available Scoring Strategies¶
Exact match accuracy scoring.
from llm_evaluation_framework.evaluation.scoring_strategies import (
AccuracyScoringStrategy,
ScoringContext
)
# Initialize accuracy scoring
accuracy_strategy = AccuracyScoringStrategy()
context = ScoringContext(accuracy_strategy)
# Score predictions
predictions = ["Paris", "London", "Berlin"]
references = ["Paris", "London", "Vienna"]
accuracy_score = context.evaluate(predictions, references)
print(f"Accuracy: {accuracy_score:.1%}") # Output: 66.7%
F1 score for token-level evaluation.
from llm_evaluation_framework.evaluation.scoring_strategies import (
F1ScoringStrategy,
ScoringContext
)
# Initialize F1 scoring
f1_strategy = F1ScoringStrategy()
context = ScoringContext(f1_strategy)
# Score predictions (handles partial matches)
predictions = ["The cat sits on the mat", "Machine learning is AI"]
references = ["A cat sits on a mat", "ML is artificial intelligence"]
f1_score = context.evaluate(predictions, references)
print(f"F1 Score: {f1_score:.1%}")
Create your own scoring algorithm.
from llm_evaluation_framework.evaluation.scoring_strategies import ScoringContext
class CustomSimilarityStrategy:
"""Custom scoring using semantic similarity."""
def calculate_score(self, predictions, references):
# Your custom scoring logic
scores = []
for pred, ref in zip(predictions, references):
# Example: simple word overlap
pred_words = set(pred.lower().split())
ref_words = set(ref.lower().split())
if len(ref_words) == 0:
score = 1.0 if len(pred_words) == 0 else 0.0
else:
overlap = len(pred_words.intersection(ref_words))
score = overlap / len(ref_words)
scores.append(score)
return sum(scores) / len(scores) if scores else 0.0
# Use custom strategy
custom_strategy = CustomSimilarityStrategy()
context = ScoringContext(custom_strategy)
score = context.evaluate(predictions, references)
print(f"Custom Score: {score:.1%}")
๐พ Persistence Layer¶
The framework provides multiple storage backends for evaluation results.
JSONStore¶
File-based JSON storage with backup and metadata.
from llm_evaluation_framework.persistence import JSONStore
# Initialize store
store = JSONStore("evaluation_results.json")
# Save evaluation result
store.save_evaluation_result(evaluation_result)
# Save custom data
store.save("experiment_001", {
"model": "gpt-3.5-turbo",
"accuracy": 0.85,
"cost": 0.0234,
"timestamp": "2024-01-15T10:30:00Z"
})
# Load data
data = store.load("experiment_001")
all_keys = store.list_keys()
# Backup and restore
store.backup("backup_20240115.json")
store.restore("backup_20240115.json")
DBStore¶
SQLite database storage with advanced querying.
from llm_evaluation_framework.persistence import DBStore
# Initialize database store
db_store = DBStore("evaluations.db")
# Save evaluation result
db_store.save_evaluation_result(evaluation_result)
# Advanced querying
recent_results = db_store.query(
limit=10,
model_name="gpt-3.5-turbo",
min_accuracy=0.8,
start_date="2024-01-01"
)
# Get statistics
stats = db_store.get_statistics()
print(f"Total evaluations: {stats['total_evaluations']}")
print(f"Average accuracy: {stats['average_accuracy']:.1%}")
# Export data
db_store.export_to_csv("evaluation_export.csv")
PersistenceManager¶
Unified interface for multiple storage backends.
from llm_evaluation_framework.persistence import PersistenceManager
# Initialize with multiple backends
manager = PersistenceManager({
"json": JSONStore("results.json"),
"database": DBStore("results.db")
})
# Save to all backends
manager.save_evaluation_result(result, backends=["json", "database"])
# Load from specific backend
data = manager.load("experiment_001", backend="database")
# Query across backends
all_results = manager.query_all_backends(model_name="gpt-3.5-turbo")
๐ ๏ธ Utilities¶
Error Handling¶
from llm_evaluation_framework.utils.error_handler import (
retry_with_backoff,
validate_model_config,
EvaluationError,
ConfigurationError
)
# Retry decorator
@retry_with_backoff(max_retries=3, base_delay=1.0)
def api_call():
# Your API call that might fail
pass
# Configuration validation
try:
validate_model_config(config)
except ConfigurationError as e:
print(f"Invalid config: {e}")
# Custom exception handling
try:
result = engine.evaluate_model("invalid-model", test_cases)
except EvaluationError as e:
print(f"Evaluation failed: {e}")
Logging¶
from llm_evaluation_framework.utils.logger import setup_logging
# Setup structured logging
logger = setup_logging(
level="INFO",
log_file="evaluation.log",
enable_file=True,
log_dir="./logs"
)
# Use logger
logger.info("Starting evaluation")
logger.error("Model not found", extra={"model_name": "invalid-model"})
Auto Suggestions¶
from llm_evaluation_framework import AutoSuggestionEngine
# Initialize suggestion engine
suggester = AutoSuggestionEngine()
# Get model recommendations
recommendations = suggester.suggest_models(
capability="coding",
max_cost=0.01,
min_accuracy=0.8
)
# Get optimization suggestions
optimizations = suggester.suggest_optimizations(evaluation_result)
๐ API Chaining & Workflows¶
Complete Evaluation Workflow¶
from llm_evaluation_framework import (
ModelRegistry,
ModelInferenceEngine,
TestDatasetGenerator
)
from llm_evaluation_framework.persistence import JSONStore
from llm_evaluation_framework.evaluation.scoring_strategies import (
AccuracyScoringStrategy,
ScoringContext
)
# Complete workflow example
def complete_evaluation_workflow():
# 1. Setup components
registry = ModelRegistry()
generator = TestDatasetGenerator()
engine = ModelInferenceEngine(registry)
store = JSONStore("workflow_results.json")
# 2. Register models
registry.register_model("gpt-3.5-turbo", {
"provider": "openai",
"api_cost_input": 0.0015,
"api_cost_output": 0.002,
"capabilities": ["reasoning", "creativity", "coding"]
})
# 3. Generate test data
test_cases = generator.generate_test_cases({
"domain": "general",
"required_capabilities": ["reasoning"],
"difficulty_level": "medium"
}, count=20)
# 4. Run evaluation
results = engine.evaluate_model("gpt-3.5-turbo", test_cases)
# 5. Custom scoring
scoring_context = ScoringContext(AccuracyScoringStrategy())
predictions = [r['actual'] for r in results['test_results']]
references = [r['expected'] for r in results['test_results']]
custom_score = scoring_context.evaluate(predictions, references)
# 6. Save results
final_results = {
**results,
"custom_accuracy": custom_score
}
store.save_evaluation_result(final_results)
return final_results
# Execute workflow
results = complete_evaluation_workflow()
print(f"Workflow completed! Accuracy: {results['aggregate_metrics']['accuracy']:.1%}")