Some checks failed
CI / test (pull_request) Has been cancelled
Complete Pillar 4 implementation with comprehensive testing and analysis. Components: - EvolutionOptimizer: Analyzes losing decisions from DecisionLogger, identifies failure patterns (time, market, action), and uses Gemini to generate improved strategies with auto-deployment capability - ABTester: A/B testing framework with statistical significance testing (two-sample t-test), performance comparison, and deployment criteria (>60% win rate, >20 trades minimum) - PerformanceTracker: Tracks strategy win rates, monitors improvement trends over time, generates comprehensive dashboards with daily/weekly metrics and trend analysis Key Features: - Uses DecisionLogger.get_losing_decisions() for failure identification - Pattern analysis: market distribution, action types, time-of-day patterns - Gemini integration for AI-powered strategy generation - Statistical validation using scipy.stats.ttest_ind - Sharpe ratio calculation for risk-adjusted returns - Auto-deploy strategies meeting 60% win rate threshold - Performance dashboard with JSON export capability Testing: - 24 comprehensive tests covering all evolution components - 90% coverage of evolution module (304 lines, 31 missed) - Integration tests for full evolution pipeline - All 105 project tests passing with 72% overall coverage Dependencies: - Added scipy>=1.11,<2 for statistical analysis Closes #19 Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
221 lines
7.0 KiB
Python
221 lines
7.0 KiB
Python
"""A/B Testing framework for strategy comparison.

Runs multiple strategies in parallel, tracks their performance,
and uses statistical significance testing to determine winners.
"""
|
|
|
|
from __future__ import annotations

import logging
import math
from dataclasses import dataclass
from typing import Any

import scipy.stats as stats
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class StrategyPerformance:
    """Aggregated trading results for one named strategy.

    The field order below is also the positional constructor order, so it
    must not be rearranged.
    """

    # Identifier of the strategy these numbers belong to.
    strategy_name: str
    # Number of trade records the metrics were computed from.
    total_trades: int
    # Count of trades whose PnL is strictly positive.
    wins: int
    # Count of trades whose PnL is strictly negative (break-even trades are
    # neither a win nor a loss).
    losses: int
    # Sum of PnL across all trades.
    total_pnl: float
    # Mean PnL per trade.
    avg_pnl: float
    # Winning trades as a percentage of all trades (0-100 scale, not 0-1).
    win_rate: float
    # Mean PnL divided by the sample standard deviation of PnL; None when it
    # cannot be computed.
    sharpe_ratio: float | None = None
|
|
|
|
|
|
@dataclass
class ABTestResult:
    """Outcome of comparing two strategies against each other.

    The field order below is also the positional constructor order, so it
    must not be rearranged.
    """

    # Display name of the first strategy in the comparison.
    strategy_a: str
    # Display name of the second strategy in the comparison.
    strategy_b: str
    # Name of the better strategy, or None when no winner was declared.
    winner: str | None
    # P-value of the statistical test on the two PnL samples.
    p_value: float
    # Confidence expressed as a percentage (0-100 scale).
    confidence_level: float
    # Whether the p-value fell below the tester's significance threshold.
    is_significant: bool
    # Metrics computed for strategy A.
    performance_a: StrategyPerformance
    # Metrics computed for strategy B.
    performance_b: StrategyPerformance
|
|
|
|
|
|
class ABTester:
    """A/B testing framework for comparing trading strategies.

    Summarises a list of trade records into a ``StrategyPerformance``,
    compares two strategies with a two-sample t-test on their per-trade PnL,
    and checks whether the winner satisfies the deployment criteria.
    """

    def __init__(self, significance_level: float = 0.05) -> None:
        """Initialize A/B tester.

        Args:
            significance_level: P-value threshold for statistical significance
                (default 0.05). A comparison is significant when its p-value
                is strictly below this threshold.
        """
        self._significance_level = significance_level

    def calculate_performance(
        self, trades: list[dict[str, Any]], strategy_name: str
    ) -> StrategyPerformance:
        """Calculate performance metrics for a strategy.

        Args:
            trades: List of trade records; each record's ``"pnl"`` value is
                used, and a record without that key counts as a PnL of 0.
            strategy_name: Name of the strategy.

        Returns:
            StrategyPerformance with trade counts, total and average PnL
            (rounded to 2 decimals), win rate as a percentage (rounded to
            2 decimals) and Sharpe ratio (rounded to 4 decimals). The Sharpe
            ratio is None when there are fewer than two trades or when the
            PnL values have zero spread.
        """
        if not trades:
            return StrategyPerformance(
                strategy_name=strategy_name,
                total_trades=0,
                wins=0,
                losses=0,
                total_pnl=0.0,
                avg_pnl=0.0,
                win_rate=0.0,
                sharpe_ratio=None,
            )

        # `trades` is non-empty from here on, so the divisions below are safe.
        pnls = [t.get("pnl", 0.0) for t in trades]
        total_trades = len(pnls)
        wins = sum(1 for p in pnls if p > 0)
        losses = sum(1 for p in pnls if p < 0)
        total_pnl = sum(pnls)
        avg_pnl = total_pnl / total_trades
        win_rate = wins / total_trades * 100

        # Sharpe ratio (risk-adjusted return): mean PnL over the sample
        # standard deviation (n - 1 denominator), which needs >= 2 trades.
        sharpe_ratio = None
        if total_trades > 1:
            variance = sum((p - avg_pnl) ** 2 for p in pnls) / (total_trades - 1)
            std_return = variance ** 0.5
            if std_return > 0:
                sharpe_ratio = avg_pnl / std_return

        return StrategyPerformance(
            strategy_name=strategy_name,
            total_trades=total_trades,
            wins=wins,
            losses=losses,
            total_pnl=round(total_pnl, 2),
            avg_pnl=round(avg_pnl, 2),
            win_rate=round(win_rate, 2),
            # Compare against None explicitly: a Sharpe ratio of exactly 0.0
            # is a valid result and must not be reported as "unavailable".
            sharpe_ratio=round(sharpe_ratio, 4) if sharpe_ratio is not None else None,
        )

    def compare_strategies(
        self,
        trades_a: list[dict[str, Any]],
        trades_b: list[dict[str, Any]],
        strategy_a_name: str = "Strategy A",
        strategy_b_name: str = "Strategy B",
    ) -> ABTestResult:
        """Compare two strategies using statistical testing.

        Uses a two-sample t-test that does not assume equal variances
        (``equal_var=False``) to determine if the difference in per-trade PnL
        is significant.

        Args:
            trades_a: List of trades from strategy A.
            trades_b: List of trades from strategy B.
            strategy_a_name: Name of strategy A.
            strategy_b_name: Name of strategy B.

        Returns:
            ABTestResult with comparison details. When either strategy has
            fewer than two trades, or the test cannot produce a p-value, the
            result is reported as not significant with ``p_value=1.0`` and
            ``confidence_level=0.0``. ``winner`` is only set when the result
            is significant.
        """
        perf_a = self.calculate_performance(trades_a, strategy_a_name)
        perf_b = self.calculate_performance(trades_b, strategy_b_name)

        # Extract PnL samples for statistical testing.
        pnls_a = [t.get("pnl", 0.0) for t in trades_a]
        pnls_b = [t.get("pnl", 0.0) for t in trades_b]

        # Defaults describe "no evidence of a difference"; they are only
        # overwritten when the test yields a usable p-value.
        p_value = 1.0
        is_significant = False
        confidence_level = 0.0

        if len(pnls_a) > 1 and len(pnls_b) > 1:
            _, raw_p_value = stats.ttest_ind(pnls_a, pnls_b, equal_var=False)
            # Convert the NumPy scalar to a built-in float so the result
            # (and the comparison below) uses plain Python types.
            raw_p_value = float(raw_p_value)
            # The test statistic is undefined (NaN) when it degenerates to
            # 0/0, e.g. both samples are the same constant value. Keep the
            # defaults in that case instead of propagating NaN.
            if not math.isnan(raw_p_value):
                p_value = raw_p_value
                is_significant = p_value < self._significance_level
                confidence_level = (1 - p_value) * 100

        # Determine winner based on average PnL. Unrounded means are used so
        # that a significant difference between small PnL values is not lost
        # to the 2-decimal rounding applied to the reported metrics.
        winner = None
        if is_significant:
            mean_a = sum(pnls_a) / len(pnls_a)
            mean_b = sum(pnls_b) / len(pnls_b)
            if mean_a > mean_b:
                winner = strategy_a_name
            elif mean_b > mean_a:
                winner = strategy_b_name

        return ABTestResult(
            strategy_a=strategy_a_name,
            strategy_b=strategy_b_name,
            winner=winner,
            p_value=round(p_value, 4),
            confidence_level=round(confidence_level, 2),
            is_significant=is_significant,
            performance_a=perf_a,
            performance_b=perf_b,
        )

    def should_deploy(
        self,
        result: ABTestResult,
        min_win_rate: float = 60.0,
        min_trades: int = 20,
    ) -> bool:
        """Determine if a winning strategy should be deployed.

        Args:
            result: A/B test result.
            min_win_rate: Minimum win rate percentage for deployment
                (default 60%).
            min_trades: Minimum number of trades required (default 20).

        Returns:
            True if the result is significant, has a winner, and the winner
            has at least ``min_trades`` trades, a win rate of at least
            ``min_win_rate`` and a positive average PnL. The decision for a
            winner is logged at INFO level either way.
        """
        if not result.is_significant or result.winner is None:
            return False

        # Pick the metrics that belong to the winning strategy.
        if result.winner == result.strategy_a:
            winning_perf = result.performance_a
        else:
            winning_perf = result.performance_b

        # Check deployment criteria.
        has_enough_trades = winning_perf.total_trades >= min_trades
        has_good_win_rate = winning_perf.win_rate >= min_win_rate
        is_profitable = winning_perf.avg_pnl > 0

        meets_criteria = has_enough_trades and has_good_win_rate and is_profitable

        if meets_criteria:
            logger.info(
                "Strategy '%s' meets deployment criteria: "
                "win_rate=%.2f%%, trades=%d, avg_pnl=%.2f",
                result.winner,
                winning_perf.win_rate,
                winning_perf.total_trades,
                winning_perf.avg_pnl,
            )
        else:
            # The winner is known to be set here (see the early return), so
            # its metrics are logged directly.
            logger.info(
                "Strategy '%s' does NOT meet deployment criteria: "
                "win_rate=%.2f%% (min %.2f%%), trades=%d (min %d), avg_pnl=%.2f",
                result.winner,
                winning_perf.win_rate,
                min_win_rate,
                winning_perf.total_trades,
                min_trades,
                winning_perf.avg_pnl,
            )

        return meets_criteria
|