💡 Comprehensive Examples & Use Cases¶
🎯 Example Categories¶
🚀 Getting Started — basic patterns & quick setup | ⚡ Performance — async & high-throughput | 🔧 Customization — extensions & custom logic | 🏢 Enterprise — production deployments |
🚀 Getting Started Examples¶
📝 Basic Model Evaluation¶
**Quick 5-minute evaluation setup** **📁 Complete File**: [`examples/basic_usage.py`](../../examples/basic_usage.py)
"""
Basic model evaluation example
Perfect for testing single models quickly
"""
from llm_evaluation_framework import (
    ModelRegistry,
    ModelInferenceEngine,
    TestDatasetGenerator,
)
def basic_evaluation_example():
    """Run a minimal single-model evaluation and print summary metrics.

    Registers one model, generates a small test suite, evaluates, and
    prints accuracy/cost/latency/success-rate.

    Returns:
        dict: Full evaluation results, including 'aggregate_metrics'.
    """
    # 1. Initialize components
    registry = ModelRegistry()
    generator = TestDatasetGenerator()
    engine = ModelInferenceEngine(registry)

    # 2. Register your model
    registry.register_model("gpt-3.5-turbo", {
        "provider": "openai",
        "api_cost_input": 0.0015,   # Cost per 1K input tokens
        "api_cost_output": 0.002,   # Cost per 1K output tokens
        "capabilities": ["reasoning", "creativity", "coding"],
        "parameters": {
            "temperature": 0.7,
            "max_tokens": 1000,
            "top_p": 0.9
        }
    })

    # 3. Generate test cases
    use_case = {
        "domain": "customer_service",
        "required_capabilities": ["reasoning", "empathy"],
        "difficulty": "medium"
    }
    test_cases = generator.generate_test_cases(use_case, count=20)
    print(f"Generated {len(test_cases)} test cases")

    # 4. Run evaluation
    results = engine.evaluate_model("gpt-3.5-turbo", test_cases, use_case)

    # 5. Display results
    metrics = results['aggregate_metrics']
    print(f"🎯 Accuracy: {metrics['accuracy']:.1%}")
    print(f"💰 Total Cost: ${metrics['total_cost']:.4f}")
    print(f"⏱️ Average Time: {metrics['average_response_time']:.2f}s")
    print(f"✅ Success Rate: {metrics['success_rate']:.1%}")
    return results


if __name__ == "__main__":
    results = basic_evaluation_example()
📊 Model Comparison¶
**Compare multiple models side-by-side** **📁 Complete File**: [`examples/model_comparison.py`](../../examples/model_comparison.py)
"""
Multi-model comparison example
Compare different LLMs on the same test cases
"""
def model_comparison_example():
    """Evaluate several models on one shared test suite.

    Prints a side-by-side comparison table and highlights the best model
    by accuracy and by cost.

    Returns:
        dict: model name -> aggregate metrics for that model.
    """
    registry = ModelRegistry()
    generator = TestDatasetGenerator()
    engine = ModelInferenceEngine(registry)

    # Per-model configuration: provider, token pricing, capabilities.
    models_config = {
        "gpt-3.5-turbo": {
            "provider": "openai",
            "api_cost_input": 0.0015,
            "api_cost_output": 0.002,
            "capabilities": ["reasoning", "creativity"]
        },
        "claude-3-sonnet": {
            "provider": "anthropic",
            "api_cost_input": 0.003,
            "api_cost_output": 0.015,
            "capabilities": ["reasoning", "analysis", "writing"]
        },
        "gpt-4": {
            "provider": "openai",
            "api_cost_input": 0.03,
            "api_cost_output": 0.06,
            "capabilities": ["reasoning", "creativity", "coding"]
        }
    }

    # Register all models
    for model_name, config in models_config.items():
        registry.register_model(model_name, config)

    # One shared test suite so the comparison is apples-to-apples.
    use_case = {
        "domain": "technical_writing",
        "required_capabilities": ["reasoning", "analysis"],
        "difficulty": "hard"
    }
    test_cases = generator.generate_test_cases(use_case, count=30)

    # Evaluate each model on the identical test cases.
    comparison_results = {}
    for model_name in models_config:
        print(f"\n🤖 Evaluating {model_name}...")
        results = engine.evaluate_model(model_name, test_cases, use_case)
        comparison_results[model_name] = results['aggregate_metrics']
        print(f"  Accuracy: {results['aggregate_metrics']['accuracy']:.1%}")
        print(f"  Cost: ${results['aggregate_metrics']['total_cost']:.4f}")

    # Comparison table
    print("\n📊 COMPARISON RESULTS")
    print("=" * 60)
    print(f"{'Model':<20} {'Accuracy':<12} {'Cost':<10} {'Time':<8}")
    print("-" * 60)
    for model, metrics in comparison_results.items():
        print(f"{model:<20} {metrics['accuracy']:<12.1%} "
              f"${metrics['total_cost']:<9.4f} {metrics['average_response_time']:<8.2f}s")

    # Find best model by each criterion
    best_accuracy = max(comparison_results.items(), key=lambda x: x[1]['accuracy'])
    best_cost = min(comparison_results.items(), key=lambda x: x[1]['total_cost'])
    print(f"\n🏆 Best Accuracy: {best_accuracy[0]} ({best_accuracy[1]['accuracy']:.1%})")
    print(f"💰 Most Cost-Effective: {best_cost[0]} (${best_cost[1]['total_cost']:.4f})")
    return comparison_results


if __name__ == "__main__":
    comparison_results = model_comparison_example()
⚡ High-Performance Examples¶
🚀 Async Concurrent Evaluation¶
**Maximum throughput with async processing** **📁 Complete File**: [`examples/advanced_async_usage.py`](../../examples/advanced_async_usage.py)
"""
High-performance async evaluation example
Demonstrates concurrent processing for maximum throughput
"""
import asyncio
import time

from llm_evaluation_framework.engines.async_inference_engine import AsyncInferenceEngine
async def high_performance_evaluation():
    """
    Execute multiple evaluations concurrently.

    Fans out one task per (model, test-suite) pair via asyncio.gather and
    reports aggregate throughput and cost. Ideal for batch processing and
    high-throughput scenarios.

    Returns:
        list: one result dict (or Exception) per (model, suite) task.
    """
    # Setup
    registry = ModelRegistry()
    generator = TestDatasetGenerator()
    async_engine = AsyncInferenceEngine(registry)

    # Register models for concurrent testing
    models = ["gpt-3.5-turbo", "claude-3-sonnet", "gpt-4"]
    for model in models:
        registry.register_model(model, {
            "provider": "openai" if "gpt" in model else "anthropic",
            "capabilities": ["reasoning", "creativity"]
        })

    # Three independent suites exercising different capabilities.
    test_suites = {
        "reasoning": generator.generate_test_cases(
            {"domain": "logic", "required_capabilities": ["reasoning"]},
            count=50
        ),
        "creativity": generator.generate_test_cases(
            {"domain": "creative", "required_capabilities": ["creativity"]},
            count=30
        ),
        "coding": generator.generate_test_cases(
            {"domain": "programming", "required_capabilities": ["coding"]},
            count=40
        )
    }

    # Create one concurrent evaluation task per (model, suite) pair.
    tasks = []
    start_time = time.time()
    for model in models:
        for suite_name, test_cases in test_suites.items():
            task = async_engine.evaluate_model_async(
                model_name=model,
                test_cases=test_cases,
                use_case={"domain": suite_name},
                max_concurrent=15,  # Concurrent requests per model
                timeout=45.0
            )
            tasks.append((model, suite_name, task))

    print(f"🚀 Starting {len(tasks)} concurrent evaluations...")
    # return_exceptions=True keeps one failure from cancelling the rest.
    results = await asyncio.gather(
        *[task for _, _, task in tasks],
        return_exceptions=True
    )
    total_time = time.time() - start_time

    # Tally successes and failures.
    successful_evaluations = 0
    total_tests = 0
    total_cost = 0.0
    for (model, suite, _), result in zip(tasks, results):
        if isinstance(result, Exception):
            print(f"❌ {model} ({suite}): {result}")
        else:
            successful_evaluations += 1
            metrics = result['aggregate_metrics']
            total_tests += metrics['test_count']
            total_cost += metrics['total_cost']
            print(f"✅ {model} ({suite}): "
                  f"{metrics['accuracy']:.1%} accuracy, "
                  f"${metrics['total_cost']:.4f} cost")

    # Performance summary
    throughput = total_tests / total_time if total_time > 0 else 0
    print(f"\n📊 PERFORMANCE SUMMARY")
    print(f"Total Time: {total_time:.2f}s")
    print(f"Total Tests: {total_tests}")
    print(f"Throughput: {throughput:.1f} tests/second")
    print(f"Total Cost: ${total_cost:.4f}")
    print(f"Success Rate: {successful_evaluations}/{len(tasks)} evaluations")
    return results
async def batch_processing_example():
    """
    Process a large dataset in fixed-size batches.

    Returns:
        list: per-batch evaluation result dicts.
    """
    # NOTE(fix): the original referenced module-level `registry` and
    # `generator` that were never defined in this snippet (NameError);
    # create the components locally and register the model under test.
    registry = ModelRegistry()
    generator = TestDatasetGenerator()
    async_engine = AsyncInferenceEngine(registry)
    registry.register_model("gpt-3.5-turbo", {
        "provider": "openai",
        "capabilities": ["reasoning"]
    })

    # Large dataset simulation
    large_dataset = generator.generate_test_cases(
        {"domain": "general"},
        count=1000  # Large number of test cases
    )

    # Process in optimized batches.
    batch_size = 50
    # Ceiling division fixes the off-by-one batch count the original
    # printed when the dataset divided evenly (1000/50 showed ".../21").
    num_batches = (len(large_dataset) + batch_size - 1) // batch_size
    batch_results = []
    for i in range(0, len(large_dataset), batch_size):
        batch = large_dataset[i:i + batch_size]
        print(f"Processing batch {i // batch_size + 1}/{num_batches}")
        batch_result = await async_engine.evaluate_model_async(
            model_name="gpt-3.5-turbo",
            test_cases=batch,
            max_concurrent=20
        )
        batch_results.append(batch_result)

    # Aggregate: simple mean of per-batch accuracies (batches are
    # equal-sized here, so this equals the overall mean).
    total_accuracy = sum(r['aggregate_metrics']['accuracy'] for r in batch_results)
    average_accuracy = total_accuracy / len(batch_results)
    print(f"🎯 Final Average Accuracy: {average_accuracy:.1%}")
    return batch_results
if __name__ == "__main__":
    # Run high-performance evaluation
    results = asyncio.run(high_performance_evaluation())
    # Run batch processing example
    batch_results = asyncio.run(batch_processing_example())
📡 Real-time Streaming Evaluation¶
**Process evaluation results as they complete**
"""
Real-time streaming evaluation with progress monitoring
Perfect for long-running evaluations with live updates
"""
import asyncio
import time  # fix: used below (timestamps) but missing from the original snippet
from typing import Any, AsyncGenerator, Dict
async def streaming_evaluation_example():
    """
    Stream evaluation results in real-time as they complete.

    Runs 100 test cases with bounded concurrency and prints each result
    as soon as it finishes, plus running statistics every 10 completions.
    """
    # NOTE(fix): `registry` and `generator` were undefined in the original
    # snippet (NameError); create the components locally.
    registry = ModelRegistry()
    generator = TestDatasetGenerator()

    async def evaluation_stream() -> AsyncGenerator[Dict[str, Any], None]:
        """Yield one result dict per test case, in completion order."""
        async_engine = AsyncInferenceEngine(registry)
        test_cases = generator.generate_test_cases({"domain": "general"}, count=100)
        semaphore = asyncio.Semaphore(10)  # Limit concurrent requests

        async def evaluate_single_test(test_case: Dict, test_id: int):
            async with semaphore:
                try:
                    result = await async_engine.evaluate_single_test(
                        model_name="gpt-3.5-turbo",
                        test_case=test_case,
                        timeout=30.0
                    )
                    return {
                        'test_id': test_id,
                        'status': 'completed',
                        'result': result,
                        'timestamp': time.time()
                    }
                except Exception as e:
                    return {
                        'test_id': test_id,
                        'status': 'failed',
                        'error': str(e),
                        'timestamp': time.time()
                    }

        # Create tasks for all test cases
        tasks = [
            evaluate_single_test(test_case, i)
            for i, test_case in enumerate(test_cases)
        ]
        # as_completed yields awaitables in finish order, enabling streaming.
        for completed_task in asyncio.as_completed(tasks):
            yield await completed_task

    # Consume the stream
    completed_count = 0
    failed_count = 0
    total_accuracy = 0.0

    print("🚀 Starting streaming evaluation...")
    print("Results will appear in real-time as they complete:\n")

    async for result in evaluation_stream():
        if result['status'] == 'completed':
            completed_count += 1
            accuracy = result['result']['score']
            total_accuracy += accuracy
            print(f"✅ Test {result['test_id']:3d}: {accuracy:.1%} accuracy")

            # Show running statistics every 10 completions
            if completed_count % 10 == 0:
                avg_accuracy = total_accuracy / completed_count
                total_processed = completed_count + failed_count
                print(f"📊 Progress: {total_processed}/100 | "
                      f"Avg Accuracy: {avg_accuracy:.1%} | "
                      f"Success Rate: {completed_count / total_processed:.1%}")
                print("-" * 50)
        else:
            failed_count += 1
            print(f"❌ Test {result['test_id']:3d}: {result['error']}")

    # Final summary (guard division in case nothing ran at all).
    final_accuracy = total_accuracy / completed_count if completed_count > 0 else 0
    total_tests = completed_count + failed_count
    print(f"\n🎯 FINAL RESULTS")
    print(f"Total Tests: {total_tests}")
    print(f"Completed: {completed_count}")
    print(f"Failed: {failed_count}")
    print(f"Final Accuracy: {final_accuracy:.1%}")
    if total_tests:
        print(f"Success Rate: {completed_count / total_tests:.1%}")
if __name__ == "__main__":
    asyncio.run(streaming_evaluation_example())
🔧 Customization Examples¶
🎯 Custom Scoring Strategies¶
**Build domain-specific evaluation metrics** **📁 Complete File**: [`examples/custom_scoring_and_persistence.py`](../../examples/custom_scoring_and_persistence.py)
"""
Custom scoring strategies for specialized evaluation needs
Examples include domain-specific metrics and multi-dimensional scoring
"""
import json
import re
from typing import Any, Dict, List

from llm_evaluation_framework.evaluation.scoring_strategies import ScoringStrategy
class BusinessWritingScorer(ScoringStrategy):
    """
    Specialized scorer for business communication evaluation.

    Evaluates clarity, professionalism, tone, and effectiveness for each
    prediction and combines them into a weighted overall score.
    """

    def __init__(self, weights: Dict[str, float] = None):
        # Component weights; keys must match the four components below.
        self.weights = weights or {
            'clarity': 0.3,
            'professionalism': 0.25,
            'tone': 0.2,
            'effectiveness': 0.25
        }

    def calculate_score(self, predictions: List[str], references: List[str]) -> Dict[str, Any]:
        """Calculate comprehensive business writing scores.

        Returns:
            dict with 'overall_score' (weighted), per-component averages
            in 'component_scores', and per-sample lists in 'detailed_scores'.
        """
        component_scores = {
            'clarity': [],
            'professionalism': [],
            'tone': [],
            'effectiveness': []
        }
        for pred, ref in zip(predictions, references):
            component_scores['clarity'].append(self._evaluate_clarity(pred))
            component_scores['professionalism'].append(self._evaluate_professionalism(pred))
            component_scores['tone'].append(self._evaluate_tone(pred, ref))
            component_scores['effectiveness'].append(self._evaluate_effectiveness(pred, ref))

        # Guard against empty input (previously a ZeroDivisionError).
        avg_scores = {
            component: (sum(scores) / len(scores)) if scores else 0.0
            for component, scores in component_scores.items()
        }
        overall_score = sum(
            avg_scores[component] * self.weights[component]
            for component in avg_scores
        )
        return {
            'overall_score': overall_score,
            'component_scores': avg_scores,
            'detailed_scores': component_scores
        }

    def _evaluate_clarity(self, text: str) -> float:
        """Evaluate text clarity via sentence length and discourse markers."""
        sentences = text.split('.')
        avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
        # Optimal sentence length is 15-20 words.
        if 15 <= avg_sentence_length <= 20:
            length_score = 1.0
        else:
            length_score = max(0, 1 - abs(avg_sentence_length - 17.5) / 17.5)
        clarity_indicators = [
            'therefore', 'however', 'furthermore', 'in conclusion',
            'for example', 'specifically', 'in other words'
        ]
        indicator_count = sum(1 for indicator in clarity_indicators
                              if indicator in text.lower())
        indicator_score = min(1.0, indicator_count / 3)
        return (length_score + indicator_score) / 2

    def _evaluate_professionalism(self, text: str) -> float:
        """Evaluate professional tone and language."""
        # Case-insensitive slang checks.
        slang_patterns = [
            r'\b(yeah|yep|nah|gonna|wanna|gotta)\b',
            r'\b(awesome|cool|amazing|totally)\b'
        ]
        violations = sum(len(re.findall(pattern, text, re.IGNORECASE))
                         for pattern in slang_patterns)
        # Fix: these two must be case-sensitive — the original applied
        # re.IGNORECASE to [A-Z]{3,}, which made EVERY 3+-letter word count
        # as an all-caps violation.
        violations += len(re.findall(r'[!]{2,}', text))      # Multiple exclamation marks
        violations += len(re.findall(r'[A-Z]{3,}', text))    # ALL CAPS words
        professional_indicators = [
            'please', 'thank you', 'we appreciate', 'we recommend',
            'kindly', 'respectfully', 'sincerely'
        ]
        professional_count = sum(1 for indicator in professional_indicators
                                 if indicator in text.lower())
        violation_penalty = max(0, 1 - violations * 0.2)
        professional_bonus = min(0.5, professional_count * 0.1)
        return min(1.0, violation_penalty + professional_bonus)

    def _evaluate_tone(self, text: str, reference: str) -> float:
        """Heuristic tone score: ratio of courteous to curt wording.

        Fix: calculate_score called this method but the original never
        defined it, raising AttributeError at runtime.
        """
        positive = ['appreciate', 'delighted', 'pleased', 'glad', 'welcome', 'happy']
        negative = ['unfortunately', 'cannot', 'refuse', 'complaint', 'problem']
        lowered = text.lower()
        pos = sum(1 for word in positive if word in lowered)
        neg = sum(1 for word in negative if word in lowered)
        if pos + neg == 0:
            return 0.5  # neutral when no tone markers are present
        return pos / (pos + neg)

    def _evaluate_effectiveness(self, text: str, reference: str) -> float:
        """Heuristic effectiveness score: word overlap with the reference.

        Fix: calculate_score called this method but the original never
        defined it, raising AttributeError at runtime.
        """
        ref_words = set(reference.lower().split())
        if not ref_words:
            return 1.0
        pred_words = set(text.lower().split())
        return len(pred_words & ref_words) / len(ref_words)
class TechnicalAccuracyScorer(ScoringStrategy):
    """
    Scorer for technical content accuracy.

    Validates technical terminology, facts from a small knowledge base,
    and implementation details against reference answers.
    """

    def __init__(self, domain_knowledge: Dict[str, Any]):
        self.domain_knowledge = domain_knowledge
        self.technical_terms = domain_knowledge.get('technical_terms', [])
        self.facts_db = domain_knowledge.get('facts', {})

    def calculate_score(self, predictions: List[str], references: List[str]) -> float:
        """Return the mean technical-accuracy score over all (pred, ref) pairs."""
        if not predictions:
            return 0.0  # guard: previously ZeroDivisionError on empty input
        total_score = 0.0
        for pred, ref in zip(predictions, references):
            terminology_score = self._check_terminology(pred, ref)
            factual_score = self._check_facts(pred, ref)
            implementation_score = self._check_implementation(pred, ref)
            # Weighted combination: terminology and facts dominate.
            total_score += (
                0.4 * terminology_score +
                0.4 * factual_score +
                0.2 * implementation_score
            )
        return total_score / len(predictions)

    def _check_terminology(self, prediction: str, reference: str) -> float:
        """Fraction of the reference's technical terms also present in the prediction."""
        pred_terms = set(term.lower() for term in self.technical_terms
                         if term.lower() in prediction.lower())
        ref_terms = set(term.lower() for term in self.technical_terms
                        if term.lower() in reference.lower())
        if not ref_terms:
            return 1.0
        return len(pred_terms & ref_terms) / len(ref_terms)

    def _check_facts(self, prediction: str, reference: str) -> float:
        """Penalize 0.2 per knowledge-base fact that is mentioned but misstated."""
        score = 1.0
        for fact_key, fact_value in self.facts_db.items():
            if fact_key.lower() in prediction.lower():
                if str(fact_value).lower() not in prediction.lower():
                    score -= 0.2  # Penalty for incorrect fact
        return max(0.0, score)

    def _check_implementation(self, prediction: str, reference: str) -> float:
        """Word-overlap heuristic for implementation-detail accuracy.

        Fix: calculate_score called this method but the original never
        defined it, raising AttributeError at runtime.
        """
        ref_words = set(reference.lower().split())
        if not ref_words:
            return 1.0
        pred_words = set(prediction.lower().split())
        return len(pred_words & ref_words) / len(ref_words)
# Usage example
def custom_scoring_example():
    """Demonstrate both custom scoring strategies on sample data."""
    # Business writing evaluation
    business_scorer = BusinessWritingScorer()
    business_predictions = [
        "Dear valued customer, we sincerely appreciate your inquiry regarding our premium services. We would be delighted to provide you with comprehensive information.",
        "hey there! ur question is AWESOME!!! we totally love helping customers like u :)"
    ]
    business_references = [
        "Professional response with clear information and courteous tone.",
        "Professional response with clear information and courteous tone."
    ]
    business_scores = business_scorer.calculate_score(business_predictions, business_references)
    print("📝 Business Writing Scores:")
    print(f"Overall Score: {business_scores['overall_score']:.2f}")
    for component, score in business_scores['component_scores'].items():
        print(f"  {component.title()}: {score:.2f}")

    # Technical accuracy evaluation
    domain_knowledge = {
        'technical_terms': ['API', 'REST', 'JSON', 'authentication', 'HTTP'],
        'facts': {
            'HTTP status 200': 'success',
            'HTTP status 404': 'not found',
            'REST': 'architectural style'
        }
    }
    technical_scorer = TechnicalAccuracyScorer(domain_knowledge)
    technical_predictions = [
        "The REST API returns HTTP 200 status for successful requests and uses JSON format for data exchange.",
        "The REST API returns HTTP 200 status for failed requests and uses XML format."
    ]
    technical_references = [
        "REST APIs typically return HTTP 200 for success and use JSON format.",
        "REST APIs typically return HTTP 200 for success and use JSON format."
    ]
    technical_score = technical_scorer.calculate_score(technical_predictions, technical_references)
    print(f"\n🔧 Technical Accuracy Score: {technical_score:.2f}")


if __name__ == "__main__":
    custom_scoring_example()
🗄️ Custom Persistence Backends¶
**Integrate with your existing data infrastructure**
"""
Custom persistence backend examples
Demonstrates integration with various storage systems
"""
import json
import sqlite3
from typing import Any, Dict, List

from llm_evaluation_framework.persistence.base_store import BaseStore
class CustomDatabaseStore(BaseStore):
    """
    Custom SQLite persistence for evaluation results.

    Stores the full result payload as JSON alongside a few denormalized
    columns (accuracy, cost, test count) so results can be queried and
    aggregated with plain SQL.
    """

    def __init__(self, database_path: str):
        self.database_path = database_path
        self._initialize_database()

    def _initialize_database(self):
        """Create the evaluations table and its indexes if they don't exist."""
        with sqlite3.connect(self.database_path) as conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS evaluations (
                    id TEXT PRIMARY KEY,
                    model_name TEXT NOT NULL,
                    timestamp TEXT NOT NULL,
                    accuracy REAL,
                    total_cost REAL,
                    test_count INTEGER,
                    metadata TEXT,
                    full_results TEXT
                )
            ''')
            conn.execute('''
                CREATE INDEX IF NOT EXISTS idx_model_name ON evaluations(model_name)
            ''')
            conn.execute('''
                CREATE INDEX IF NOT EXISTS idx_timestamp ON evaluations(timestamp)
            ''')

    def save(self, key: str, data: Dict[str, Any]) -> bool:
        """Save evaluation results; returns True on success, False on failure."""
        try:
            with sqlite3.connect(self.database_path) as conn:
                metrics = data.get('aggregate_metrics', {})
                # Parameterized INSERT — never build SQL from strings.
                conn.execute('''
                    INSERT OR REPLACE INTO evaluations
                    (id, model_name, timestamp, accuracy, total_cost, test_count, metadata, full_results)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    key,
                    data.get('model_name', ''),
                    data.get('timestamp', ''),
                    metrics.get('accuracy', 0.0),
                    metrics.get('total_cost', 0.0),
                    metrics.get('test_count', 0),
                    json.dumps(data.get('metadata', {})),
                    json.dumps(data)
                ))
            return True
        except Exception as e:
            # Best-effort store: report and signal failure rather than raise.
            print(f"Save failed: {e}")
            return False

    def load(self, key: str) -> Dict[str, Any]:
        """Load the full evaluation payload for `key`.

        Raises:
            KeyError: if no row exists for `key`.
        """
        with sqlite3.connect(self.database_path) as conn:
            cursor = conn.execute('SELECT full_results FROM evaluations WHERE id = ?', (key,))
            row = cursor.fetchone()
            if row is None:
                raise KeyError(f"Key not found: {key}")
            return json.loads(row[0])

    def query_by_model(self, model_name: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Return the most recent evaluations for a model (newest first)."""
        with sqlite3.connect(self.database_path) as conn:
            cursor = conn.execute('''
                SELECT id, accuracy, total_cost, timestamp
                FROM evaluations
                WHERE model_name = ?
                ORDER BY timestamp DESC
                LIMIT ?
            ''', (model_name, limit))
            return [
                {
                    'id': row[0],
                    'accuracy': row[1],
                    'total_cost': row[2],
                    'timestamp': row[3]
                }
                for row in cursor.fetchall()
            ]

    def get_model_statistics(self, model_name: str) -> Dict[str, Any]:
        """Return aggregate statistics for a model (zeros if no rows exist)."""
        with sqlite3.connect(self.database_path) as conn:
            cursor = conn.execute('''
                SELECT
                    COUNT(*) as evaluation_count,
                    AVG(accuracy) as avg_accuracy,
                    MAX(accuracy) as max_accuracy,
                    MIN(accuracy) as min_accuracy,
                    SUM(total_cost) as total_cost_sum,
                    SUM(test_count) as total_tests
                FROM evaluations
                WHERE model_name = ?
            ''', (model_name,))
            row = cursor.fetchone()
            # `or 0` defaults cover the no-rows case where SQL aggregates are NULL.
            return {
                'evaluation_count': row[0],
                'average_accuracy': row[1] or 0.0,
                'max_accuracy': row[2] or 0.0,
                'min_accuracy': row[3] or 0.0,
                'total_cost': row[4] or 0.0,
                'total_tests': row[5] or 0
            }
def custom_persistence_example():
    """Demonstrate custom persistence backend usage."""
    # Fix: `time` was used below but never imported in this snippet.
    import time

    # Initialize custom database store
    db_store = CustomDatabaseStore('evaluation_results.db')

    # Setup evaluation framework with custom store.
    # NOTE(review): PersistenceManager and JsonStore are assumed to be
    # imported from the framework in the complete example file — confirm.
    persistence_manager = PersistenceManager({
        'database': db_store,
        'json': JsonStore('./backup_results/')
    })

    # Run evaluation and save to custom backend.
    # Fix: `generator` was undefined in the original snippet (NameError).
    registry = ModelRegistry()
    generator = TestDatasetGenerator()
    engine = ModelInferenceEngine(registry, persistence_manager)

    # Register model and run evaluation
    registry.register_model("gpt-3.5-turbo", {
        "provider": "openai",
        "capabilities": ["reasoning"]
    })
    test_cases = generator.generate_test_cases({"domain": "general"}, count=10)
    results = engine.evaluate_model("gpt-3.5-turbo", test_cases)

    # Save to custom database
    evaluation_id = f"eval_{int(time.time())}"
    db_store.save(evaluation_id, results)

    # Query using custom methods
    model_stats = db_store.get_model_statistics("gpt-3.5-turbo")
    print(f"📊 Model Statistics: {model_stats}")
    recent_evaluations = db_store.query_by_model("gpt-3.5-turbo", limit=5)
    print(f"📋 Recent Evaluations: {len(recent_evaluations)} found")


if __name__ == "__main__":
    custom_persistence_example()
🏢 Enterprise Examples¶
🔄 CI/CD Integration¶
**Integrate LLM evaluation into your development workflow** **📁 Complete Files**: - [`examples/ci_cd_integration.py`](../../examples/ci_cd_integration.py) - [`.github/workflows/llm_evaluation.yml`](../../.github/workflows/llm_evaluation.yml)
"""
CI/CD Integration Example
Automated LLM evaluation in continuous integration pipelines
"""
import json
import os
import sys
from typing import Any, Dict, List
class CIPipelineEvaluator:
    """
    Evaluation system designed for CI/CD pipelines.

    Includes result reporting, quality-threshold gating (accuracy, cost,
    latency), and artifact (JSON + markdown report) generation.
    """

    def __init__(self, config_path: str = "ci_evaluation_config.json"):
        self.config = self._load_config(config_path)
        self.registry = ModelRegistry()
        self.engine = ModelInferenceEngine(self.registry)
        self.test_results = []

    def _load_config(self, config_path: str) -> Dict[str, Any]:
        """Load CI configuration, falling back to defaults if the file is missing."""
        try:
            with open(config_path, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            return self._default_config()

    def _default_config(self) -> Dict[str, Any]:
        """Default CI configuration."""
        return {
            "models_to_test": ["gpt-3.5-turbo"],
            "test_suite": "regression",
            "accuracy_threshold": 0.85,
            "cost_threshold": 1.00,
            "timeout_threshold": 30.0,
            "generate_report": True,
            "fail_on_threshold": True
        }

    def run_regression_tests(self) -> Dict[str, Any]:
        """Run regression tests for all configured models.

        Exits the process with status 1 when `fail_on_threshold` is set
        and any model violates its thresholds.
        """
        print("🚀 Starting CI/CD LLM Evaluation Pipeline")
        print(f"Testing models: {self.config['models_to_test']}")

        # Load regression test suite
        test_cases = self._load_regression_tests()
        overall_results = {
            'pipeline_status': 'success',
            'model_results': {},
            'summary': {}
        }

        for model_name in self.config['models_to_test']:
            print(f"\n🤖 Testing model: {model_name}")
            try:
                # Register model from environment
                self._register_model_from_env(model_name)
                # Run evaluation
                results = self.engine.evaluate_model(model_name, test_cases)
                # Check thresholds
                threshold_results = self._check_thresholds(results, model_name)
                overall_results['model_results'][model_name] = {
                    'evaluation_results': results,
                    'threshold_checks': threshold_results,
                    'status': 'passed' if threshold_results['all_passed'] else 'failed'
                }
                if not threshold_results['all_passed']:
                    overall_results['pipeline_status'] = 'failed'
            except Exception as e:
                print(f"❌ Model {model_name} evaluation failed: {e}")
                overall_results['model_results'][model_name] = {
                    'status': 'error',
                    'error': str(e)
                }
                overall_results['pipeline_status'] = 'failed'

        # Generate summary
        overall_results['summary'] = self._generate_summary(overall_results)

        # Generate reports
        if self.config['generate_report']:
            self._generate_ci_report(overall_results)

        # Exit with appropriate code so CI marks the job as failed.
        if self.config['fail_on_threshold'] and overall_results['pipeline_status'] == 'failed':
            print("❌ Pipeline failed due to threshold violations")
            sys.exit(1)

        print("✅ Pipeline completed successfully")
        return overall_results

    def _generate_summary(self, overall_results: Dict[str, Any]) -> Dict[str, Any]:
        """Summarize per-model pass/fail/error counts.

        Fix: run_regression_tests called this method but the original
        never defined it, raising AttributeError at runtime.
        """
        statuses = [m.get('status') for m in overall_results['model_results'].values()]
        return {
            'models_tested': len(statuses),
            'passed': statuses.count('passed'),
            'failed': statuses.count('failed'),
            'errors': statuses.count('error'),
        }

    def _load_regression_tests(self) -> List[Dict]:
        """Load regression test cases from disk, or generate a fallback suite."""
        test_file = f"tests/regression_{self.config['test_suite']}.json"
        try:
            with open(test_file, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            # Generate minimal test suite if not found
            generator = TestDatasetGenerator()
            return generator.generate_test_cases(
                {"domain": "regression", "required_capabilities": ["reasoning"]},
                count=20
            )

    def _register_model_from_env(self, model_name: str):
        """Register a model using an API key from the environment.

        Raises:
            ValueError: if the expected environment variable is not set.
        """
        # e.g. "gpt-3.5-turbo" -> "GPT_3.5_TURBO_API_KEY" — keep the
        # original convention (only dashes are replaced).
        api_key = os.getenv(f"{model_name.upper().replace('-', '_')}_API_KEY")
        if not api_key:
            raise ValueError(f"API key not found for {model_name}")
        config = {
            "provider": "openai" if "gpt" in model_name else "anthropic",
            "api_key": api_key,
            "capabilities": ["reasoning", "creativity"]
        }
        self.registry.register_model(model_name, config)

    def _check_thresholds(self, results: Dict, model_name: str) -> Dict[str, Any]:
        """Check accuracy/cost/latency metrics against configured thresholds."""
        metrics = results['aggregate_metrics']
        threshold_checks = {}
        # Accuracy must meet or exceed the floor.
        threshold_checks['accuracy'] = {
            'value': metrics['accuracy'],
            'threshold': self.config['accuracy_threshold'],
            'passed': metrics['accuracy'] >= self.config['accuracy_threshold']
        }
        # Cost must stay at or under the ceiling.
        threshold_checks['cost'] = {
            'value': metrics['total_cost'],
            'threshold': self.config['cost_threshold'],
            'passed': metrics['total_cost'] <= self.config['cost_threshold']
        }
        # Average latency must stay at or under the ceiling.
        threshold_checks['response_time'] = {
            'value': metrics['average_response_time'],
            'threshold': self.config['timeout_threshold'],
            'passed': metrics['average_response_time'] <= self.config['timeout_threshold']
        }
        threshold_checks['all_passed'] = all(
            check['passed'] for check in threshold_checks.values()
        )
        # Print threshold results
        for metric, check in threshold_checks.items():
            if metric == 'all_passed':
                continue
            status = "✅" if check['passed'] else "❌"
            print(f"  {status} {metric}: {check['value']:.4f} "
                  f"(threshold: {check['threshold']:.4f})")
        return threshold_checks

    def _generate_ci_report(self, results: Dict[str, Any]):
        """Write JSON and markdown evaluation reports to ./reports/."""
        os.makedirs('reports', exist_ok=True)
        # JSON artifact (default=str handles non-serializable values).
        with open('reports/llm_evaluation_report.json', 'w') as f:
            json.dump(results, f, indent=2, default=str)
        # Markdown artifact
        markdown_report = self._create_markdown_report(results)
        with open('reports/llm_evaluation_report.md', 'w') as f:
            f.write(markdown_report)
        print("📄 Reports generated in ./reports/")

    def _create_markdown_report(self, results: Dict[str, Any]) -> str:
        """Render the results dict as a markdown report string."""
        report = "# LLM Evaluation Report\n\n"
        report += f"**Pipeline Status**: {results['pipeline_status'].upper()}\n\n"
        report += "## Model Results\n\n"
        for model_name, model_results in results['model_results'].items():
            report += f"### {model_name}\n\n"
            if model_results['status'] == 'error':
                report += f"❌ **Error**: {model_results['error']}\n\n"
                continue
            eval_results = model_results['evaluation_results']
            metrics = eval_results['aggregate_metrics']
            report += f"- **Status**: {model_results['status'].upper()}\n"
            report += f"- **Accuracy**: {metrics['accuracy']:.1%}\n"
            report += f"- **Total Cost**: ${metrics['total_cost']:.4f}\n"
            report += f"- **Average Response Time**: {metrics['average_response_time']:.2f}s\n"
            report += f"- **Success Rate**: {metrics['success_rate']:.1%}\n\n"
            # Threshold checks
            threshold_checks = model_results['threshold_checks']
            report += "#### Threshold Checks\n\n"
            for metric, check in threshold_checks.items():
                if metric == 'all_passed':
                    continue
                status = "✅ Pass" if check['passed'] else "❌ Fail"
                report += f"- **{metric.title()}**: {status} "
                report += f"({check['value']:.4f} vs {check['threshold']:.4f})\n"
            report += "\n"
        return report
# CI/CD Usage Script
def main():
    """Main CI/CD evaluation script."""
    # Initialize evaluator and run the regression suite.
    evaluator = CIPipelineEvaluator()
    results = evaluator.run_regression_tests()

    # Additional CI/CD specific actions
    if os.getenv('CI') == 'true':
        print("🔍 CI environment detected")
        # Set GitHub Actions outputs
        if os.getenv('GITHUB_ACTIONS') == 'true':
            with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
                f.write(f"evaluation_status={results['pipeline_status']}\n")
                f.write("report_path=reports/llm_evaluation_report.md\n")


if __name__ == "__main__":
    main()
🎯 Complete Workflow Examples¶
🔗 End-to-End Evaluation Pipeline¶
**Production-ready evaluation pipeline with all features** **📁 Complete File**: [`examples/full_pipeline_demo.py`](../../examples/full_pipeline_demo.py)
"""
Complete end-to-end evaluation pipeline
Demonstrates full framework capabilities in a production workflow
"""
def complete_evaluation_pipeline():
"""
Comprehensive evaluation pipeline demonstrating all framework features
"""
print("๐ Starting Complete LLM Evaluation Pipeline")
print("=" * 60)
# 1๏ธโฃ SETUP AND CONFIGURATION
print("\n๐ Step 1: Setup and Configuration")
# Initialize all components
registry = ModelRegistry()
generator = TestDatasetGenerator()
# Setup custom persistence with multiple backends
persistence_manager = PersistenceManager({
'json': JsonStore('./results/json/'),
'database': CustomDatabaseStore('./results/evaluations.db')
})
# Initialize engines
sync_engine = ModelInferenceEngine(registry, persistence_manager)
async_engine = AsyncInferenceEngine(registry, persistence_manager)
# Setup custom scoring
custom_scorer = ComprehensiveScorer({
'accuracy': 0.4,
'fluency': 0.2,
'relevance': 0.2,
'safety': 0.2
})
print("โ
Components initialized")
# 2๏ธโฃ MODEL REGISTRATION
print("\n๐ค Step 2: Model Registration")
models_config = {
"gpt-3.5-turbo": {
"provider": "openai",
"api_cost_input": 0.0015,
"api_cost_output": 0.002,
"capabilities": ["reasoning", "creativity", "coding"],
"parameters": {"temperature": 0.7, "max_tokens": 1000}
},
"claude-3-sonnet": {
"provider": "anthropic",
"api_cost_input": 0.003,
"api_cost_output": 0.015,
"capabilities": ["reasoning", "analysis", "writing"],
"parameters": {"temperature": 0.7, "max_tokens": 1000}
},
"gpt-4": {
"provider": "openai",
"api_cost_input": 0.03,
"api_cost_output": 0.06,
"capabilities": ["reasoning", "creativity", "coding", "analysis"],
"parameters": {"temperature": 0.5, "max_tokens": 1500}
}
}
for model_name, config in models_config.items():
registry.register_model(model_name, config)
print(f" โ
Registered {model_name}")
# 3๏ธโฃ TEST CASE GENERATION
print("\n๐งช Step 3: Test Case Generation")
# Generate multiple test suites for different capabilities
test_suites = {}
use_cases = [
{"domain": "customer_service", "required_capabilities": ["reasoning", "empathy"], "difficulty": "medium"},
{"domain": "technical_writing", "required_capabilities": ["analysis", "clarity"], "difficulty": "hard"},
{"domain": "creative_writing", "required_capabilities": ["creativity", "fluency"], "difficulty": "medium"},
{"domain": "code_review", "required_capabilities": ["coding", "analysis"], "difficulty": "hard"}
]
for use_case in use_cases:
suite_name = use_case["domain"]
test_cases = generator.generate_test_cases(use_case, count=25)
test_suites[suite_name] = {
'use_case': use_case,
'test_cases': test_cases
}
print(f" โ
Generated {len(test_cases)} tests for {suite_name}")
# 4๏ธโฃ EVALUATION EXECUTION
print("\nโก Step 4: Evaluation Execution")
all_results = {}
for model_name in models_config.keys():
print(f"\n๐ Evaluating {model_name}...")
model_results = {}
for suite_name, suite_data in test_suites.items():
print(f" ๐งช Running {suite_name} tests...")
try:
# Run evaluation with custom scoring
results = sync_engine.evaluate_model(
model_name=model_name,
test_cases=suite_data['test_cases'],
use_case=suite_data['use_case'],
scoring_strategy=custom_scorer
)
model_results[suite_name] = results
# Display immediate results
metrics = results['aggregate_metrics']
print(f" โ
Accuracy: {metrics['accuracy']:.1%}, "
f"Cost: ${metrics['total_cost']:.4f}, "
f"Time: {metrics['average_response_time']:.2f}s")
except Exception as e:
print(f" โ Failed: {e}")
model_results[suite_name] = {'error': str(e)}
all_results[model_name] = model_results
# 5๏ธโฃ ADVANCED ANALYTICS
print("\n๐ Step 5: Advanced Analytics")
# Cross-model comparison
comparison_analysis = analyze_cross_model_performance(all_results)
# Cost-benefit analysis
cost_analysis = analyze_cost_efficiency(all_results)
# Capability analysis
capability_analysis = analyze_capability_performance(all_results, test_suites)
print("โ
Analytics completed")
# 6๏ธโฃ REPORT GENERATION
print("\n๐ Step 6: Report Generation")
# Generate comprehensive report
report = generate_comprehensive_report({
'evaluation_results': all_results,
'comparison_analysis': comparison_analysis,
'cost_analysis': cost_analysis,
'capability_analysis': capability_analysis,
'test_suites': test_suites
})
# Save reports in multiple formats
save_report(report, 'comprehensive_evaluation_report')
print("โ
Reports generated")
# 7๏ธโฃ RECOMMENDATIONS
print("\n๐ก Step 7: AI-Powered Recommendations")
# Generate recommendations using auto-suggestion engine
suggestion_engine = AutoSuggestionEngine(registry, all_results)
recommendations = suggestion_engine.generate_recommendations()
print("๐ RECOMMENDATIONS:")
for recommendation in recommendations:
print(f" โข {recommendation}")
print("\n๐ Pipeline completed successfully!")
print("=" * 60)
return {
'results': all_results,
'analytics': {
'comparison': comparison_analysis,
'cost': cost_analysis,
'capability': capability_analysis
},
'recommendations': recommendations,
'report': report
}
def analyze_cross_model_performance(results: Dict) -> Dict:
    """Summarize per-model performance across all test suites.

    Args:
        results: Mapping of model name -> suite name -> evaluation results.
            Each suite entry is either a dict containing 'aggregate_metrics'
            or an error record (e.g. {'error': '...'}).

    Returns:
        Mapping of model name -> summary dict with 'suites_evaluated',
        'suites_failed', 'mean_accuracy', and 'mean_response_time'
        (the means are None when no suite produced metrics).
    """
    analysis = {}
    for model_name, suites in results.items():
        accuracies = []
        response_times = []
        failures = 0
        for suite_results in suites.values():
            metrics = (suite_results.get('aggregate_metrics')
                       if isinstance(suite_results, dict) else None)
            if metrics is None:
                # Suite errored out (or produced no metrics) — count it.
                failures += 1
                continue
            accuracies.append(metrics.get('accuracy', 0.0))
            response_times.append(metrics.get('average_response_time', 0.0))
        count = len(accuracies)
        analysis[model_name] = {
            'suites_evaluated': count,
            'suites_failed': failures,
            'mean_accuracy': sum(accuracies) / count if count else None,
            'mean_response_time': sum(response_times) / count if count else None,
        }
    return analysis
def analyze_cost_efficiency(results: Dict) -> Dict:
    """Analyze cost efficiency across models and test suites.

    Args:
        results: Mapping of model name -> suite name -> evaluation results
            (each either a dict with 'aggregate_metrics' or an error record).

    Returns:
        Mapping of model name -> dict with 'total_cost', 'mean_accuracy',
        and 'accuracy_per_dollar' (None when total cost is zero).
    """
    analysis = {}
    for model_name, suites in results.items():
        total_cost = 0.0
        accuracies = []
        for suite_results in suites.values():
            if not isinstance(suite_results, dict) or 'aggregate_metrics' not in suite_results:
                continue  # skip suites that failed to evaluate
            metrics = suite_results['aggregate_metrics']
            total_cost += metrics.get('total_cost', 0.0)
            accuracies.append(metrics.get('accuracy', 0.0))
        mean_accuracy = sum(accuracies) / len(accuracies) if accuracies else 0.0
        analysis[model_name] = {
            'total_cost': total_cost,
            'mean_accuracy': mean_accuracy,
            # Guard against division by zero for free/failed evaluations.
            'accuracy_per_dollar': (mean_accuracy / total_cost) if total_cost else None,
        }
    return analysis
def analyze_capability_performance(results: Dict, test_suites: Dict) -> Dict:
    """Analyze performance by capability.

    Attributes each suite's accuracy to every capability the suite's use
    case requires, then averages per capability and model.

    Args:
        results: Mapping of model name -> suite name -> evaluation results.
        test_suites: Mapping of suite name -> {'use_case': {...},
            'test_cases': [...]} as built by the pipeline.

    Returns:
        Mapping of capability -> model name -> mean accuracy over the
        suites requiring that capability.
    """
    # capability -> model -> list of suite accuracies
    scores = {}
    for suite_name, suite_data in test_suites.items():
        capabilities = suite_data.get('use_case', {}).get('required_capabilities', [])
        for model_name, suites in results.items():
            suite_results = suites.get(suite_name)
            if not isinstance(suite_results, dict) or 'aggregate_metrics' not in suite_results:
                continue  # failed or missing suite: contributes nothing
            accuracy = suite_results['aggregate_metrics'].get('accuracy', 0.0)
            for capability in capabilities:
                scores.setdefault(capability, {}).setdefault(model_name, []).append(accuracy)
    return {
        capability: {
            model: sum(values) / len(values)
            for model, values in per_model.items()
        }
        for capability, per_model in scores.items()
    }
def generate_comprehensive_report(data: Dict) -> Dict:
    """Generate a comprehensive evaluation report structure.

    Args:
        data: Pipeline artifacts with keys 'evaluation_results',
            'comparison_analysis', 'cost_analysis', 'capability_analysis',
            and 'test_suites'.

    Returns:
        A serializable report dict: a 'summary' section (model and suite
        names) plus the analytics sections and raw evaluation results.
    """
    evaluation_results = data.get('evaluation_results', {})
    return {
        'summary': {
            'models_evaluated': sorted(evaluation_results),
            'test_suites': sorted(data.get('test_suites', {})),
        },
        'comparison_analysis': data.get('comparison_analysis'),
        'cost_analysis': data.get('cost_analysis'),
        'capability_analysis': data.get('capability_analysis'),
        'evaluation_results': evaluation_results,
    }
def save_report(report: Dict, filename: str):
    """Save *report* under reports/ in both JSON and Markdown formats.

    Args:
        report: Serializable report dict (non-JSON values fall back to str()).
        filename: Base name (without extension) for the output files.

    Side effects:
        Creates the reports/ directory if needed and writes
        reports/<filename>.json and reports/<filename>.md.
    """
    # Ensure the output directory exists before writing.
    os.makedirs('reports', exist_ok=True)
    # Save as JSON; default=str keeps non-serializable values from raising.
    with open(f'reports/{filename}.json', 'w') as f:
        json.dump(report, f, indent=2, default=str)
    # Save as Markdown
    markdown_content = create_markdown_report(report)
    with open(f'reports/{filename}.md', 'w') as f:
        f.write(markdown_content)
    print(f"๐ Reports saved: {filename}.json, {filename}.md")
if __name__ == "__main__":
pipeline_results = complete_evaluation_pipeline()
๐ Quick Navigation to Examples¶
### ๐ **Example Files Index** | Category | Example | File | Description | |----------|---------|------|-------------| | **๐ Basic** | Simple Evaluation | [`basic_usage.py`](../../examples/basic_usage.py) | Quick model evaluation setup | | **๐ Basic** | Model Comparison | [`model_comparison.py`](../../examples/model_comparison.py) | Compare multiple models side-by-side | | **โก Performance** | Async Processing | [`advanced_async_usage.py`](../../examples/advanced_async_usage.py) | High-throughput concurrent evaluation | | **โก Performance** | Streaming Results | [`streaming_evaluation.py`](../../examples/streaming_evaluation.py) | Real-time result processing | | **๐ง Custom** | Custom Scoring | [`custom_scoring_and_persistence.py`](../../examples/custom_scoring_and_persistence.py) | Domain-specific evaluation metrics | | **๐ง Custom** | Custom Persistence | [`custom_persistence.py`](../../examples/custom_persistence.py) | Database and storage integration | | **๐ข Enterprise** | CI/CD Integration | [`ci_cd_integration.py`](../../examples/ci_cd_integration.py) | Automated pipeline evaluation | | **๐ข Enterprise** | Complete Workflow | [`full_pipeline_demo.py`](../../examples/full_pipeline_demo.py) | End-to-end production pipeline | | **๐ Analytics** | Data Analysis | [`dataset_generation_and_analysis.py`](../../examples/dataset_generation_and_analysis.py) | Test data generation and analysis | | **๐ก๏ธ Production** | Error Handling | [`error_handling_and_logging.py`](../../examples/error_handling_and_logging.py) | Robust error handling patterns | | **๐ฅ๏ธ CLI** | Command Line | [`cli_usage.py`](../../examples/cli_usage.py) | CLI workflow examples | | **๐ก AI** | Model Recommendations | [`model_recommendation.py`](../../examples/model_recommendation.py) | AI-powered model suggestions |
## ๐ Ready to Build Amazing Applications? **Choose your path and start building with confidence!** [Getting Started](getting-started.md) [Documentation Home](../index.md) [Advanced Usage](advanced-usage.md) [Contributing](../contributing.md) --- *Transform your ideas into production-ready LLM evaluation systems! ๐ก*