feat: implement evolution engine for self-improving strategies

Complete Pillar 4 implementation with comprehensive testing and analysis.

Components:
- EvolutionOptimizer: Analyzes losing decisions from DecisionLogger, identifies failure patterns (time, market, action), and uses Gemini to generate improved strategies with auto-deployment capability
- ABTester: A/B testing framework with statistical significance testing (two-sample t-test), performance comparison, and deployment criteria (>=60% win rate, >=20 trades minimum)
- PerformanceTracker: Tracks strategy win rates, monitors improvement trends over time, generates comprehensive dashboards with daily/weekly metrics and trend analysis

Key Features:
- Uses DecisionLogger.get_losing_decisions() for failure identification
- Pattern analysis: market distribution, action types, time-of-day patterns
- Gemini integration for AI-powered strategy generation
- Statistical validation using scipy.stats.ttest_ind
- Sharpe ratio calculation for risk-adjusted returns
- Auto-deploy strategies meeting 60% win rate threshold
- Performance dashboard with JSON export capability

Testing:
- 24 comprehensive tests covering all evolution components
- 90% coverage of evolution module (304 lines, 31 missed)
- Integration tests for full evolution pipeline
- All 105 project tests passing with 72% overall coverage

Dependencies:
- Added scipy>=1.11,<2 for statistical analysis

Closes #19

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-04 16:34:10 +09:00
parent 2f9efdad64
commit ae7195c829
6 changed files with 1350 additions and 26 deletions
--- a/src/evolution/ab_test.py
+++ b/src/evolution/ab_test.py
@@ -0,0 +1,220 @@
+"""A/B Testing framework for strategy comparison.
+
+Computes performance metrics from the recorded trades of two strategies
+and uses statistical significance testing to determine a winner.
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import Any
+
+import scipy.stats as stats
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class StrategyPerformance:
+    """Aggregate trading statistics for one named strategy.
+
+    Attributes:
+        strategy_name: Label identifying the strategy.
+        total_trades: Number of trades the metrics were computed from.
+        wins: Count of trades with a positive PnL.
+        losses: Count of trades with a negative PnL.
+        total_pnl: Sum of all trade PnLs.
+        avg_pnl: Mean PnL per trade.
+        win_rate: Share of winning trades as a percentage (0-100).
+        sharpe_ratio: Mean PnL divided by its sample standard deviation,
+            or None when it could not be computed.
+    """
+
+    # Identity of the strategy these numbers belong to.
+    strategy_name: str
+
+    # Trade counts.
+    total_trades: int
+    wins: int
+    losses: int
+
+    # Profit-and-loss aggregates.
+    total_pnl: float
+    avg_pnl: float
+
+    # Quality metrics.
+    win_rate: float
+    sharpe_ratio: float | None = None
+
+
+@dataclass
+class ABTestResult:
+    """Outcome of comparing two strategies head to head.
+
+    Attributes:
+        strategy_a: Name of the first strategy.
+        strategy_b: Name of the second strategy.
+        winner: Name of the better strategy, or None when no winner
+            was declared.
+        p_value: P-value of the two-sample t-test on the PnL series.
+        confidence_level: Confidence expressed as a percentage.
+        is_significant: Whether the difference passed the significance test.
+        performance_a: Metrics computed for the first strategy.
+        performance_b: Metrics computed for the second strategy.
+    """
+
+    # The two contenders, by name.
+    strategy_a: str
+    strategy_b: str
+
+    # Verdict and the statistics backing it.
+    winner: str | None
+    p_value: float
+    confidence_level: float
+    is_significant: bool
+
+    # Per-strategy metrics the verdict was derived from.
+    performance_a: StrategyPerformance
+    performance_b: StrategyPerformance
+
+
+class ABTester:
+    """A/B testing framework for comparing trading strategies.
+
+    Trades are plain dicts; only their ``"pnl"`` entry is read, and a
+    missing key counts as a PnL of 0.0.
+    """
+
+    def __init__(self, significance_level: float = 0.05) -> None:
+        """Initialize A/B tester.
+
+        Args:
+            significance_level: P-value threshold for statistical significance (default 0.05)
+        """
+        self._significance_level = significance_level
+
+    @staticmethod
+    def _extract_pnls(trades: list[dict[str, Any]]) -> list[float]:
+        """Return the PnL of each trade, treating a missing "pnl" key as 0.0."""
+        return [t.get("pnl", 0.0) for t in trades]
+
+    def calculate_performance(
+        self, trades: list[dict[str, Any]], strategy_name: str
+    ) -> StrategyPerformance:
+        """Calculate performance metrics for a strategy.
+
+        Args:
+            trades: List of trade records with pnl values
+            strategy_name: Name of the strategy
+
+        Returns:
+            StrategyPerformance with PnL figures and win rate (a percentage)
+            rounded to 2 decimals. The Sharpe ratio (mean PnL divided by the
+            sample standard deviation, rounded to 4 decimals) is None when
+            there are fewer than two trades or the PnLs have zero variance.
+        """
+        if not trades:
+            return StrategyPerformance(
+                strategy_name=strategy_name,
+                total_trades=0,
+                wins=0,
+                losses=0,
+                total_pnl=0.0,
+                avg_pnl=0.0,
+                win_rate=0.0,
+                sharpe_ratio=None,
+            )
+
+        pnls = self._extract_pnls(trades)
+        total_trades = len(pnls)
+        wins = sum(1 for p in pnls if p > 0)
+        losses = sum(1 for p in pnls if p < 0)
+        total_pnl = sum(pnls)
+        # total_trades is non-zero here: the empty case returned above.
+        avg_pnl = total_pnl / total_trades
+        win_rate = wins / total_trades * 100
+
+        # Sharpe ratio (risk-adjusted return) needs a sample standard
+        # deviation, hence at least two trades and a non-zero spread.
+        sharpe_ratio = None
+        if total_trades > 1:
+            variance = sum((p - avg_pnl) ** 2 for p in pnls) / (total_trades - 1)
+            std_return = variance**0.5
+            if std_return > 0:
+                # Rounded here so a genuine ratio of 0.0 (break-even mean)
+                # is kept instead of being mistaken for "not available".
+                sharpe_ratio = round(avg_pnl / std_return, 4)
+
+        return StrategyPerformance(
+            strategy_name=strategy_name,
+            total_trades=total_trades,
+            wins=wins,
+            losses=losses,
+            total_pnl=round(total_pnl, 2),
+            avg_pnl=round(avg_pnl, 2),
+            win_rate=round(win_rate, 2),
+            sharpe_ratio=sharpe_ratio,
+        )
+
+    def compare_strategies(
+        self,
+        trades_a: list[dict[str, Any]],
+        trades_b: list[dict[str, Any]],
+        strategy_a_name: str = "Strategy A",
+        strategy_b_name: str = "Strategy B",
+    ) -> ABTestResult:
+        """Compare two strategies using statistical testing.
+
+        Uses a two-sample t-test (unequal variances) on the per-trade PnLs
+        to determine if the performance difference is significant.
+
+        Args:
+            trades_a: List of trades from strategy A
+            trades_b: List of trades from strategy B
+            strategy_a_name: Name of strategy A
+            strategy_b_name: Name of strategy B
+
+        Returns:
+            ABTestResult with comparison details. When either side has fewer
+            than two trades, or the test yields no usable p-value, the result
+            reports p_value=1.0, confidence_level=0.0 and no winner.
+        """
+        perf_a = self.calculate_performance(trades_a, strategy_a_name)
+        perf_b = self.calculate_performance(trades_b, strategy_b_name)
+
+        # Extract PnL series for statistical testing
+        pnls_a = self._extract_pnls(trades_a)
+        pnls_b = self._extract_pnls(trades_b)
+
+        # Defaults describe "no evidence of a difference".
+        p_value = 1.0
+        is_significant = False
+
+        if len(pnls_a) > 1 and len(pnls_b) > 1:
+            _, raw_p = stats.ttest_ind(pnls_a, pnls_b, equal_var=False)
+            # Convert the NumPy scalar to a plain float so the result holds
+            # only built-in types.
+            raw_p = float(raw_p)
+            # The range check also rejects NaN (every comparison with NaN is
+            # False), which the test can produce for zero-variance samples.
+            if 0.0 <= raw_p <= 1.0:
+                p_value = raw_p
+                is_significant = p_value < self._significance_level
+
+        confidence_level = (1 - p_value) * 100
+
+        # Determine winner from the unrounded mean PnL; the rounded avg_pnl
+        # in the performance objects can hide a small but significant gap.
+        winner = None
+        if is_significant:
+            mean_a = sum(pnls_a) / len(pnls_a)
+            mean_b = sum(pnls_b) / len(pnls_b)
+            if mean_a > mean_b:
+                winner = strategy_a_name
+            elif mean_b > mean_a:
+                winner = strategy_b_name
+
+        return ABTestResult(
+            strategy_a=strategy_a_name,
+            strategy_b=strategy_b_name,
+            winner=winner,
+            p_value=round(p_value, 4),
+            confidence_level=round(confidence_level, 2),
+            is_significant=is_significant,
+            performance_a=perf_a,
+            performance_b=perf_b,
+        )
+
+    def should_deploy(
+        self,
+        result: ABTestResult,
+        min_win_rate: float = 60.0,
+        min_trades: int = 20,
+    ) -> bool:
+        """Determine if a winning strategy should be deployed.
+
+        The winner must have at least ``min_trades`` trades, a win rate of
+        at least ``min_win_rate`` percent, and a positive average PnL.
+
+        Args:
+            result: A/B test result
+            min_win_rate: Minimum win rate percentage for deployment (default 60%)
+            min_trades: Minimum number of trades required (default 20)
+
+        Returns:
+            True if the winning strategy meets deployment criteria; False
+            when the result is not significant or has no winner.
+        """
+        if not result.is_significant or result.winner is None:
+            return False
+
+        # The winner is matched by name against strategy A; anything else
+        # is taken to be strategy B.
+        winning_perf = (
+            result.performance_a
+            if result.winner == result.strategy_a
+            else result.performance_b
+        )
+
+        # Check deployment criteria
+        has_enough_trades = winning_perf.total_trades >= min_trades
+        has_good_win_rate = winning_perf.win_rate >= min_win_rate
+        is_profitable = winning_perf.avg_pnl > 0
+
+        meets_criteria = has_enough_trades and has_good_win_rate and is_profitable
+
+        if meets_criteria:
+            logger.info(
+                "Strategy '%s' meets deployment criteria: "
+                "win_rate=%.2f%%, trades=%d, avg_pnl=%.2f",
+                result.winner,
+                winning_perf.win_rate,
+                winning_perf.total_trades,
+                winning_perf.avg_pnl,
+            )
+        else:
+            # A winner is guaranteed here by the guard at the top.
+            logger.info(
+                "Strategy '%s' does NOT meet deployment criteria: "
+                "win_rate=%.2f%% (min %.2f%%), trades=%d (min %d), avg_pnl=%.2f",
+                result.winner,
+                winning_perf.win_rate,
+                min_win_rate,
+                winning_perf.total_trades,
+                min_trades,
+                winning_perf.avg_pnl,
+            )
+
+        return meets_criteria