From c27decb6b1a31fcee01bc440129a5fdd54b45923 Mon Sep 17 00:00:00 2001 From: agentson Date: Mon, 2 Mar 2026 02:10:08 +0900 Subject: [PATCH] backtest: reflect cost/execution effects in fold scoring (#368) --- src/analysis/backtest_cost_guard.py | 11 +++ src/analysis/backtest_pipeline.py | 104 ++++++++++++++++++-- tests/test_backtest_cost_guard.py | 32 ++++++ tests/test_backtest_pipeline_integration.py | 45 +++++++++ 4 files changed, 186 insertions(+), 6 deletions(-) diff --git a/src/analysis/backtest_cost_guard.py b/src/analysis/backtest_cost_guard.py index 97e1cd3..ae0d729 100644 --- a/src/analysis/backtest_cost_guard.py +++ b/src/analysis/backtest_cost_guard.py @@ -11,6 +11,7 @@ class BacktestCostModel: commission_bps: float | None = None slippage_bps_by_session: dict[str, float] | None = None failure_rate_by_session: dict[str, float] | None = None + partial_fill_rate_by_session: dict[str, float] | None = None unfavorable_fill_required: bool = True @@ -31,6 +32,7 @@ def validate_backtest_cost_model( slippage = model.slippage_bps_by_session or {} failure = model.failure_rate_by_session or {} + partial_fill = model.partial_fill_rate_by_session or {} missing_slippage = [s for s in required_sessions if s not in slippage] if missing_slippage: @@ -43,6 +45,12 @@ def validate_backtest_cost_model( raise ValueError( f"missing failure_rate_by_session for sessions: {', '.join(missing_failure)}" ) + missing_partial_fill = [s for s in required_sessions if s not in partial_fill] + if missing_partial_fill: + raise ValueError( + "missing partial_fill_rate_by_session for sessions: " + f"{', '.join(missing_partial_fill)}" + ) for sess, bps in slippage.items(): if not math.isfinite(bps) or bps < 0: @@ -50,3 +58,6 @@ def validate_backtest_cost_model( for sess, rate in failure.items(): if not math.isfinite(rate) or rate < 0 or rate > 1: raise ValueError(f"failure rate must be within [0,1] for session={sess}") + for sess, rate in partial_fill.items(): + if not math.isfinite(rate) or rate < 0 or rate > 1: + raise ValueError(f"partial fill rate must be within [0,1] for session={sess}") diff --git a/src/analysis/backtest_pipeline.py b/src/analysis/backtest_pipeline.py index 985e0e0..ce27f1f 100644 --- a/src/analysis/backtest_pipeline.py +++ b/src/analysis/backtest_pipeline.py @@ -13,6 +13,11 @@ from statistics import mean from typing import Literal, cast from src.analysis.backtest_cost_guard import BacktestCostModel, validate_backtest_cost_model +from src.analysis.backtest_execution_model import ( + BacktestExecutionModel, + ExecutionAssumptions, + ExecutionRequest, +) from src.analysis.triple_barrier import TripleBarrierSpec, label_with_triple_barrier from src.analysis.walk_forward_split import WalkForwardFold, generate_walk_forward_splits @@ -40,6 +45,7 @@ class WalkForwardConfig: class BaselineScore: name: Literal["B0", "B1", "M1"] accuracy: float + cost_adjusted_accuracy: float @dataclass(frozen=True) @@ -115,6 +121,8 @@ def run_v2_backtest_pipeline( ).label ordered_labels = [labels_by_bar_index[idx] for idx in normalized_entries] + ordered_sessions = [bars[idx].session_id for idx in normalized_entries] + ordered_prices = [bars[idx].close for idx in normalized_entries] folds = generate_walk_forward_splits( n_samples=len(normalized_entries), train_size=walk_forward.train_size, @@ -129,8 +137,13 @@ def run_v2_backtest_pipeline( for fold_idx, fold in enumerate(folds): train_labels = [ordered_labels[i] for i in fold.train_indices] test_labels = [ordered_labels[i] for i in fold.test_indices] + test_sessions = [ordered_sessions[i] for i in fold.test_indices] + test_prices = [ordered_prices[i] for i in fold.test_indices] if not test_labels: continue + execution_model = _build_execution_model(cost_model=cost_model, fold_seed=fold_idx) + b0_pred = _baseline_b0_pred(train_labels) + m1_pred = _m1_pred(train_labels) fold_results.append( BacktestFoldResult( fold_index=fold_idx, @@ -139,11 +152,41 @@ def run_v2_backtest_pipeline( train_label_distribution=_label_dist(train_labels), test_label_distribution=_label_dist(test_labels), baseline_scores=[ - BaselineScore(name="B0", accuracy=_baseline_b0(train_labels, test_labels)), - BaselineScore(name="B1", accuracy=_score_constant(1, test_labels)), + BaselineScore( + name="B0", + accuracy=_score_constant(b0_pred, test_labels), + cost_adjusted_accuracy=_score_with_execution( + prediction=b0_pred, + actual=test_labels, + sessions=test_sessions, + reference_prices=test_prices, + execution_model=execution_model, + commission_bps=float(cost_model.commission_bps or 0.0), + ), + ), + BaselineScore( + name="B1", + accuracy=_score_constant(1, test_labels), + cost_adjusted_accuracy=_score_with_execution( + prediction=1, + actual=test_labels, + sessions=test_sessions, + reference_prices=test_prices, + execution_model=execution_model, + commission_bps=float(cost_model.commission_bps or 0.0), + ), + ), BaselineScore( name="M1", - accuracy=_score_constant(_m1_pred(train_labels), test_labels), + accuracy=_score_constant(m1_pred, test_labels), + cost_adjusted_accuracy=_score_with_execution( + prediction=m1_pred, + actual=test_labels, + sessions=test_sessions, + reference_prices=test_prices, + execution_model=execution_model, + commission_bps=float(cost_model.commission_bps or 0.0), + ), ), ], ) @@ -176,12 +219,15 @@ def _score_constant(pred: int, actual: Sequence[int]) -> float: def _baseline_b0(train_labels: Sequence[int], test_labels: Sequence[int]) -> float: + return _score_constant(_baseline_b0_pred(train_labels), test_labels) + + +def _baseline_b0_pred(train_labels: Sequence[int]) -> int: if not train_labels: - return _score_constant(0, test_labels) + return 0 # Majority-class baseline from training fold. choices = (-1, 0, 1) - pred = max(choices, key=lambda c: train_labels.count(c)) - return _score_constant(pred, test_labels) + return max(choices, key=lambda c: train_labels.count(c)) def _m1_pred(train_labels: Sequence[int]) -> int: @@ -190,6 +236,52 @@ def _m1_pred(train_labels: Sequence[int]) -> int: return train_labels[-1] +def _build_execution_model(*, cost_model: BacktestCostModel, fold_seed: int) -> BacktestExecutionModel: + return BacktestExecutionModel( + ExecutionAssumptions( + slippage_bps_by_session=dict(cost_model.slippage_bps_by_session or {}), + failure_rate_by_session=dict(cost_model.failure_rate_by_session or {}), + partial_fill_rate_by_session=dict(cost_model.partial_fill_rate_by_session or {}), + seed=fold_seed, + ) + ) + + +def _score_with_execution( + *, + prediction: int, + actual: Sequence[int], + sessions: Sequence[str], + reference_prices: Sequence[float], + execution_model: BacktestExecutionModel, + commission_bps: float, +) -> float: + if not actual: + return 0.0 + contributions: list[float] = [] + for label, session_id, reference_price in zip(actual, sessions, reference_prices, strict=True): + if prediction == 0: + contributions.append(1.0 if label == 0 else 0.0) + continue + side = "BUY" if prediction > 0 else "SELL" + execution = execution_model.simulate( + ExecutionRequest( + side=side, + session_id=session_id, + qty=100, + reference_price=reference_price, + ) + ) + if execution.status == "REJECTED": + contributions.append(0.0) + continue + fill_ratio = execution.filled_qty / 100.0 + cost_penalty = min(0.99, (commission_bps + execution.slippage_bps) / 10000.0) + correctness = 1.0 if prediction == label else 0.0 + contributions.append(correctness * fill_ratio * (1.0 - cost_penalty)) + return mean(contributions) + + def _build_run_id(*, n_entries: int, n_folds: int, sessions: Sequence[str]) -> str: sess_key = "_".join(sessions) return f"v2p-e{n_entries}-f{n_folds}-s{sess_key}" diff --git a/tests/test_backtest_cost_guard.py b/tests/test_backtest_cost_guard.py index 6c73a30..bc315f6 100644 --- a/tests/test_backtest_cost_guard.py +++ b/tests/test_backtest_cost_guard.py @@ -10,6 +10,7 @@ def test_valid_backtest_cost_model_passes() -> None: commission_bps=5.0, slippage_bps_by_session={"KRX_REG": 10.0, "US_PRE": 50.0}, failure_rate_by_session={"KRX_REG": 0.01, "US_PRE": 0.08}, + partial_fill_rate_by_session={"KRX_REG": 0.1, "US_PRE": 0.2}, unfavorable_fill_required=True, ) validate_backtest_cost_model(model=model, required_sessions=["KRX_REG", "US_PRE"]) @@ -20,6 +21,7 @@ def test_missing_required_slippage_session_raises() -> None: commission_bps=5.0, slippage_bps_by_session={"KRX_REG": 10.0}, failure_rate_by_session={"KRX_REG": 0.01, "US_PRE": 0.08}, + partial_fill_rate_by_session={"KRX_REG": 0.1, "US_PRE": 0.2}, unfavorable_fill_required=True, ) with pytest.raises(ValueError, match="missing slippage_bps_by_session.*US_PRE"): @@ -31,6 +33,7 @@ def test_missing_required_failure_rate_session_raises() -> None: commission_bps=5.0, slippage_bps_by_session={"KRX_REG": 10.0, "US_PRE": 50.0}, failure_rate_by_session={"KRX_REG": 0.01}, + partial_fill_rate_by_session={"KRX_REG": 0.1, "US_PRE": 0.2}, unfavorable_fill_required=True, ) with pytest.raises(ValueError, match="missing failure_rate_by_session.*US_PRE"): @@ -42,6 +45,7 @@ def test_invalid_failure_rate_range_raises() -> None: commission_bps=5.0, slippage_bps_by_session={"KRX_REG": 10.0}, failure_rate_by_session={"KRX_REG": 1.2}, + partial_fill_rate_by_session={"KRX_REG": 0.2}, unfavorable_fill_required=True, ) with pytest.raises(ValueError, match="failure rate must be within"): @@ -53,6 +57,7 @@ def test_unfavorable_fill_requirement_cannot_be_disabled() -> None: commission_bps=5.0, slippage_bps_by_session={"KRX_REG": 10.0}, failure_rate_by_session={"KRX_REG": 0.02}, + partial_fill_rate_by_session={"KRX_REG": 0.2}, unfavorable_fill_required=False, ) with pytest.raises(ValueError, match="unfavorable_fill_required must be True"): @@ -65,6 +70,7 @@ def test_non_finite_commission_rejected(bad_commission: float) -> None: commission_bps=bad_commission, slippage_bps_by_session={"KRX_REG": 10.0}, failure_rate_by_session={"KRX_REG": 0.02}, + partial_fill_rate_by_session={"KRX_REG": 0.2}, unfavorable_fill_required=True, ) with pytest.raises(ValueError, match="commission_bps"): @@ -77,7 +83,33 @@ def test_non_finite_slippage_rejected(bad_slippage: float) -> None: commission_bps=5.0, slippage_bps_by_session={"KRX_REG": bad_slippage}, failure_rate_by_session={"KRX_REG": 0.02}, + partial_fill_rate_by_session={"KRX_REG": 0.2}, unfavorable_fill_required=True, ) with pytest.raises(ValueError, match="slippage bps"): validate_backtest_cost_model(model=model, required_sessions=["KRX_REG"]) + + +def test_missing_required_partial_fill_session_raises() -> None: + model = BacktestCostModel( + commission_bps=5.0, + slippage_bps_by_session={"KRX_REG": 10.0, "US_PRE": 50.0}, + failure_rate_by_session={"KRX_REG": 0.01, "US_PRE": 0.08}, + partial_fill_rate_by_session={"KRX_REG": 0.1}, + unfavorable_fill_required=True, + ) + with pytest.raises(ValueError, match="missing partial_fill_rate_by_session.*US_PRE"): + validate_backtest_cost_model(model=model, required_sessions=["KRX_REG", "US_PRE"]) + + +@pytest.mark.parametrize("bad_partial_fill", [float("nan"), float("inf"), float("-inf"), -0.1, 1.1]) +def test_invalid_partial_fill_rate_rejected(bad_partial_fill: float) -> None: + model = BacktestCostModel( + commission_bps=5.0, + slippage_bps_by_session={"KRX_REG": 10.0}, + failure_rate_by_session={"KRX_REG": 0.02}, + partial_fill_rate_by_session={"KRX_REG": bad_partial_fill}, + unfavorable_fill_required=True, + ) + with pytest.raises(ValueError, match="partial fill rate must be within"): + validate_backtest_cost_model(model=model, required_sessions=["KRX_REG"]) diff --git a/tests/test_backtest_pipeline_integration.py b/tests/test_backtest_pipeline_integration.py index c0ad496..d63a540 100644 --- a/tests/test_backtest_pipeline_integration.py +++ b/tests/test_backtest_pipeline_integration.py @@ -35,6 +35,7 @@ def _cost_model() -> BacktestCostModel: commission_bps=3.0, slippage_bps_by_session={"KRX_REG": 10.0, "US_PRE": 50.0}, failure_rate_by_session={"KRX_REG": 0.01, "US_PRE": 0.08}, + partial_fill_rate_by_session={"KRX_REG": 0.05, "US_PRE": 0.2}, unfavorable_fill_required=True, ) @@ -71,6 +72,7 @@ def test_pipeline_happy_path_returns_fold_and_artifact_contract() -> None: assert names == {"B0", "B1", "M1"} for score in fold.baseline_scores: assert 0.0 <= score.accuracy <= 1.0 + assert 0.0 <= score.cost_adjusted_accuracy <= 1.0 def test_pipeline_cost_guard_fail_fast() -> None: @@ -78,6 +80,7 @@ def test_pipeline_cost_guard_fail_fast() -> None: commission_bps=3.0, slippage_bps_by_session={"KRX_REG": 10.0}, failure_rate_by_session={"KRX_REG": 0.01}, + partial_fill_rate_by_session={"KRX_REG": 0.05}, unfavorable_fill_required=True, ) try: @@ -166,3 +169,45 @@ def test_pipeline_rejects_minutes_spec_when_timestamp_missing() -> None: assert "BacktestBar.timestamp is required" in str(exc) else: raise AssertionError("expected timestamp validation error") + + +def test_pipeline_fold_scores_reflect_cost_and_execution_effects() -> None: + cfg = dict( + bars=_bars(), + entry_indices=[0, 1, 2, 3, 4, 5, 6, 7], + side=1, + triple_barrier_spec=TripleBarrierSpec( + take_profit_pct=0.02, + stop_loss_pct=0.01, + max_holding_minutes=3, + ), + walk_forward=WalkForwardConfig( + train_size=4, + test_size=2, + step_size=2, + purge_size=1, + embargo_size=1, + min_train_size=3, + ), + ) + optimistic = BacktestCostModel( + commission_bps=0.0, + slippage_bps_by_session={"KRX_REG": 0.0, "US_PRE": 0.0}, + failure_rate_by_session={"KRX_REG": 0.0, "US_PRE": 0.0}, + partial_fill_rate_by_session={"KRX_REG": 0.0, "US_PRE": 0.0}, + unfavorable_fill_required=True, + ) + conservative = BacktestCostModel( + commission_bps=10.0, + slippage_bps_by_session={"KRX_REG": 30.0, "US_PRE": 80.0}, + failure_rate_by_session={"KRX_REG": 0.2, "US_PRE": 0.4}, + partial_fill_rate_by_session={"KRX_REG": 0.5, "US_PRE": 0.7}, + unfavorable_fill_required=True, + ) + optimistic_out = run_v2_backtest_pipeline(cost_model=optimistic, **cfg) + conservative_out = run_v2_backtest_pipeline(cost_model=conservative, **cfg) + + assert optimistic_out.folds and conservative_out.folds + optimistic_score = optimistic_out.folds[0].baseline_scores[1].cost_adjusted_accuracy + conservative_score = conservative_out.folds[0].baseline_scores[1].cost_adjusted_accuracy + assert conservative_score < optimistic_score