backtest: reflect cost/execution effects in fold scoring (#368)
Some checks failed
Gitea CI / test (push) Failing after 6s
Gitea CI / test (pull_request) Failing after 5s

This commit is contained in:
agentson
2026-03-02 02:10:08 +09:00
parent 8ac7436953
commit c27decb6b1
4 changed files with 186 additions and 6 deletions

View File

@@ -11,6 +11,7 @@ class BacktestCostModel:
commission_bps: float | None = None commission_bps: float | None = None
slippage_bps_by_session: dict[str, float] | None = None slippage_bps_by_session: dict[str, float] | None = None
failure_rate_by_session: dict[str, float] | None = None failure_rate_by_session: dict[str, float] | None = None
partial_fill_rate_by_session: dict[str, float] | None = None
unfavorable_fill_required: bool = True unfavorable_fill_required: bool = True
@@ -31,6 +32,7 @@ def validate_backtest_cost_model(
slippage = model.slippage_bps_by_session or {} slippage = model.slippage_bps_by_session or {}
failure = model.failure_rate_by_session or {} failure = model.failure_rate_by_session or {}
partial_fill = model.partial_fill_rate_by_session or {}
missing_slippage = [s for s in required_sessions if s not in slippage] missing_slippage = [s for s in required_sessions if s not in slippage]
if missing_slippage: if missing_slippage:
@@ -43,6 +45,12 @@ def validate_backtest_cost_model(
raise ValueError( raise ValueError(
f"missing failure_rate_by_session for sessions: {', '.join(missing_failure)}" f"missing failure_rate_by_session for sessions: {', '.join(missing_failure)}"
) )
missing_partial_fill = [s for s in required_sessions if s not in partial_fill]
if missing_partial_fill:
raise ValueError(
"missing partial_fill_rate_by_session for sessions: "
f"{', '.join(missing_partial_fill)}"
)
for sess, bps in slippage.items(): for sess, bps in slippage.items():
if not math.isfinite(bps) or bps < 0: if not math.isfinite(bps) or bps < 0:
@@ -50,3 +58,6 @@ def validate_backtest_cost_model(
for sess, rate in failure.items(): for sess, rate in failure.items():
if not math.isfinite(rate) or rate < 0 or rate > 1: if not math.isfinite(rate) or rate < 0 or rate > 1:
raise ValueError(f"failure rate must be within [0,1] for session={sess}") raise ValueError(f"failure rate must be within [0,1] for session={sess}")
for sess, rate in partial_fill.items():
if not math.isfinite(rate) or rate < 0 or rate > 1:
raise ValueError(f"partial fill rate must be within [0,1] for session={sess}")

View File

@@ -13,6 +13,11 @@ from statistics import mean
from typing import Literal, cast from typing import Literal, cast
from src.analysis.backtest_cost_guard import BacktestCostModel, validate_backtest_cost_model from src.analysis.backtest_cost_guard import BacktestCostModel, validate_backtest_cost_model
from src.analysis.backtest_execution_model import (
BacktestExecutionModel,
ExecutionAssumptions,
ExecutionRequest,
)
from src.analysis.triple_barrier import TripleBarrierSpec, label_with_triple_barrier from src.analysis.triple_barrier import TripleBarrierSpec, label_with_triple_barrier
from src.analysis.walk_forward_split import WalkForwardFold, generate_walk_forward_splits from src.analysis.walk_forward_split import WalkForwardFold, generate_walk_forward_splits
@@ -40,6 +45,7 @@ class WalkForwardConfig:
class BaselineScore: class BaselineScore:
name: Literal["B0", "B1", "M1"] name: Literal["B0", "B1", "M1"]
accuracy: float accuracy: float
cost_adjusted_accuracy: float
@dataclass(frozen=True) @dataclass(frozen=True)
@@ -115,6 +121,8 @@ def run_v2_backtest_pipeline(
).label ).label
ordered_labels = [labels_by_bar_index[idx] for idx in normalized_entries] ordered_labels = [labels_by_bar_index[idx] for idx in normalized_entries]
ordered_sessions = [bars[idx].session_id for idx in normalized_entries]
ordered_prices = [bars[idx].close for idx in normalized_entries]
folds = generate_walk_forward_splits( folds = generate_walk_forward_splits(
n_samples=len(normalized_entries), n_samples=len(normalized_entries),
train_size=walk_forward.train_size, train_size=walk_forward.train_size,
@@ -129,8 +137,13 @@ def run_v2_backtest_pipeline(
for fold_idx, fold in enumerate(folds): for fold_idx, fold in enumerate(folds):
train_labels = [ordered_labels[i] for i in fold.train_indices] train_labels = [ordered_labels[i] for i in fold.train_indices]
test_labels = [ordered_labels[i] for i in fold.test_indices] test_labels = [ordered_labels[i] for i in fold.test_indices]
test_sessions = [ordered_sessions[i] for i in fold.test_indices]
test_prices = [ordered_prices[i] for i in fold.test_indices]
if not test_labels: if not test_labels:
continue continue
execution_model = _build_execution_model(cost_model=cost_model, fold_seed=fold_idx)
b0_pred = _baseline_b0_pred(train_labels)
m1_pred = _m1_pred(train_labels)
fold_results.append( fold_results.append(
BacktestFoldResult( BacktestFoldResult(
fold_index=fold_idx, fold_index=fold_idx,
@@ -139,11 +152,41 @@ def run_v2_backtest_pipeline(
train_label_distribution=_label_dist(train_labels), train_label_distribution=_label_dist(train_labels),
test_label_distribution=_label_dist(test_labels), test_label_distribution=_label_dist(test_labels),
baseline_scores=[ baseline_scores=[
BaselineScore(name="B0", accuracy=_baseline_b0(train_labels, test_labels)), BaselineScore(
BaselineScore(name="B1", accuracy=_score_constant(1, test_labels)), name="B0",
accuracy=_score_constant(b0_pred, test_labels),
cost_adjusted_accuracy=_score_with_execution(
prediction=b0_pred,
actual=test_labels,
sessions=test_sessions,
reference_prices=test_prices,
execution_model=execution_model,
commission_bps=float(cost_model.commission_bps or 0.0),
),
),
BaselineScore(
name="B1",
accuracy=_score_constant(1, test_labels),
cost_adjusted_accuracy=_score_with_execution(
prediction=1,
actual=test_labels,
sessions=test_sessions,
reference_prices=test_prices,
execution_model=execution_model,
commission_bps=float(cost_model.commission_bps or 0.0),
),
),
BaselineScore( BaselineScore(
name="M1", name="M1",
accuracy=_score_constant(_m1_pred(train_labels), test_labels), accuracy=_score_constant(m1_pred, test_labels),
cost_adjusted_accuracy=_score_with_execution(
prediction=m1_pred,
actual=test_labels,
sessions=test_sessions,
reference_prices=test_prices,
execution_model=execution_model,
commission_bps=float(cost_model.commission_bps or 0.0),
),
), ),
], ],
) )
@@ -176,12 +219,15 @@ def _score_constant(pred: int, actual: Sequence[int]) -> float:
def _baseline_b0(train_labels: Sequence[int], test_labels: Sequence[int]) -> float:
    """Score the B0 (majority-class) baseline against the test fold.

    The constant prediction is derived from ``train_labels`` by
    ``_baseline_b0_pred`` and scored on ``test_labels`` by ``_score_constant``.
    """
    return _score_constant(_baseline_b0_pred(train_labels), test_labels)
def _baseline_b0_pred(train_labels: Sequence[int]) -> int:
if not train_labels: if not train_labels:
return _score_constant(0, test_labels) return 0
# Majority-class baseline from training fold. # Majority-class baseline from training fold.
choices = (-1, 0, 1) choices = (-1, 0, 1)
pred = max(choices, key=lambda c: train_labels.count(c)) return max(choices, key=lambda c: train_labels.count(c))
return _score_constant(pred, test_labels)
def _m1_pred(train_labels: Sequence[int]) -> int: def _m1_pred(train_labels: Sequence[int]) -> int:
@@ -190,6 +236,52 @@ def _m1_pred(train_labels: Sequence[int]) -> int:
return train_labels[-1] return train_labels[-1]
def _build_execution_model(
    *,
    cost_model: BacktestCostModel,
    fold_seed: int,
) -> BacktestExecutionModel:
    """Create the execution simulator used to score one fold.

    Args:
        cost_model: Source of the per-session slippage, failure-rate and
            partial-fill-rate mappings. A mapping that is ``None`` (or empty)
            is treated as an empty dict.
        fold_seed: Value forwarded as ``seed`` to ``ExecutionAssumptions``.

    Returns:
        A ``BacktestExecutionModel`` built from shallow copies of the three
        mappings, so the model does not share dict objects with ``cost_model``.
    """
    slippage_by_session = cost_model.slippage_bps_by_session or {}
    failure_by_session = cost_model.failure_rate_by_session or {}
    partial_fill_by_session = cost_model.partial_fill_rate_by_session or {}

    assumptions = ExecutionAssumptions(
        slippage_bps_by_session=dict(slippage_by_session),
        failure_rate_by_session=dict(failure_by_session),
        partial_fill_rate_by_session=dict(partial_fill_by_session),
        seed=fold_seed,
    )
    return BacktestExecutionModel(assumptions)
def _score_with_execution(
*,
prediction: int,
actual: Sequence[int],
sessions: Sequence[str],
reference_prices: Sequence[float],
execution_model: BacktestExecutionModel,
commission_bps: float,
) -> float:
if not actual:
return 0.0
contributions: list[float] = []
for label, session_id, reference_price in zip(actual, sessions, reference_prices, strict=True):
if prediction == 0:
contributions.append(1.0 if label == 0 else 0.0)
continue
side = "BUY" if prediction > 0 else "SELL"
execution = execution_model.simulate(
ExecutionRequest(
side=side,
session_id=session_id,
qty=100,
reference_price=reference_price,
)
)
if execution.status == "REJECTED":
contributions.append(0.0)
continue
fill_ratio = execution.filled_qty / 100.0
cost_penalty = min(0.99, (commission_bps + execution.slippage_bps) / 10000.0)
correctness = 1.0 if prediction == label else 0.0
contributions.append(correctness * fill_ratio * (1.0 - cost_penalty))
return mean(contributions)
def _build_run_id(*, n_entries: int, n_folds: int, sessions: Sequence[str]) -> str: def _build_run_id(*, n_entries: int, n_folds: int, sessions: Sequence[str]) -> str:
sess_key = "_".join(sessions) sess_key = "_".join(sessions)
return f"v2p-e{n_entries}-f{n_folds}-s{sess_key}" return f"v2p-e{n_entries}-f{n_folds}-s{sess_key}"

View File

@@ -10,6 +10,7 @@ def test_valid_backtest_cost_model_passes() -> None:
commission_bps=5.0, commission_bps=5.0,
slippage_bps_by_session={"KRX_REG": 10.0, "US_PRE": 50.0}, slippage_bps_by_session={"KRX_REG": 10.0, "US_PRE": 50.0},
failure_rate_by_session={"KRX_REG": 0.01, "US_PRE": 0.08}, failure_rate_by_session={"KRX_REG": 0.01, "US_PRE": 0.08},
partial_fill_rate_by_session={"KRX_REG": 0.1, "US_PRE": 0.2},
unfavorable_fill_required=True, unfavorable_fill_required=True,
) )
validate_backtest_cost_model(model=model, required_sessions=["KRX_REG", "US_PRE"]) validate_backtest_cost_model(model=model, required_sessions=["KRX_REG", "US_PRE"])
@@ -20,6 +21,7 @@ def test_missing_required_slippage_session_raises() -> None:
commission_bps=5.0, commission_bps=5.0,
slippage_bps_by_session={"KRX_REG": 10.0}, slippage_bps_by_session={"KRX_REG": 10.0},
failure_rate_by_session={"KRX_REG": 0.01, "US_PRE": 0.08}, failure_rate_by_session={"KRX_REG": 0.01, "US_PRE": 0.08},
partial_fill_rate_by_session={"KRX_REG": 0.1, "US_PRE": 0.2},
unfavorable_fill_required=True, unfavorable_fill_required=True,
) )
with pytest.raises(ValueError, match="missing slippage_bps_by_session.*US_PRE"): with pytest.raises(ValueError, match="missing slippage_bps_by_session.*US_PRE"):
@@ -31,6 +33,7 @@ def test_missing_required_failure_rate_session_raises() -> None:
commission_bps=5.0, commission_bps=5.0,
slippage_bps_by_session={"KRX_REG": 10.0, "US_PRE": 50.0}, slippage_bps_by_session={"KRX_REG": 10.0, "US_PRE": 50.0},
failure_rate_by_session={"KRX_REG": 0.01}, failure_rate_by_session={"KRX_REG": 0.01},
partial_fill_rate_by_session={"KRX_REG": 0.1, "US_PRE": 0.2},
unfavorable_fill_required=True, unfavorable_fill_required=True,
) )
with pytest.raises(ValueError, match="missing failure_rate_by_session.*US_PRE"): with pytest.raises(ValueError, match="missing failure_rate_by_session.*US_PRE"):
@@ -42,6 +45,7 @@ def test_invalid_failure_rate_range_raises() -> None:
commission_bps=5.0, commission_bps=5.0,
slippage_bps_by_session={"KRX_REG": 10.0}, slippage_bps_by_session={"KRX_REG": 10.0},
failure_rate_by_session={"KRX_REG": 1.2}, failure_rate_by_session={"KRX_REG": 1.2},
partial_fill_rate_by_session={"KRX_REG": 0.2},
unfavorable_fill_required=True, unfavorable_fill_required=True,
) )
with pytest.raises(ValueError, match="failure rate must be within"): with pytest.raises(ValueError, match="failure rate must be within"):
@@ -53,6 +57,7 @@ def test_unfavorable_fill_requirement_cannot_be_disabled() -> None:
commission_bps=5.0, commission_bps=5.0,
slippage_bps_by_session={"KRX_REG": 10.0}, slippage_bps_by_session={"KRX_REG": 10.0},
failure_rate_by_session={"KRX_REG": 0.02}, failure_rate_by_session={"KRX_REG": 0.02},
partial_fill_rate_by_session={"KRX_REG": 0.2},
unfavorable_fill_required=False, unfavorable_fill_required=False,
) )
with pytest.raises(ValueError, match="unfavorable_fill_required must be True"): with pytest.raises(ValueError, match="unfavorable_fill_required must be True"):
@@ -65,6 +70,7 @@ def test_non_finite_commission_rejected(bad_commission: float) -> None:
commission_bps=bad_commission, commission_bps=bad_commission,
slippage_bps_by_session={"KRX_REG": 10.0}, slippage_bps_by_session={"KRX_REG": 10.0},
failure_rate_by_session={"KRX_REG": 0.02}, failure_rate_by_session={"KRX_REG": 0.02},
partial_fill_rate_by_session={"KRX_REG": 0.2},
unfavorable_fill_required=True, unfavorable_fill_required=True,
) )
with pytest.raises(ValueError, match="commission_bps"): with pytest.raises(ValueError, match="commission_bps"):
@@ -77,7 +83,33 @@ def test_non_finite_slippage_rejected(bad_slippage: float) -> None:
commission_bps=5.0, commission_bps=5.0,
slippage_bps_by_session={"KRX_REG": bad_slippage}, slippage_bps_by_session={"KRX_REG": bad_slippage},
failure_rate_by_session={"KRX_REG": 0.02}, failure_rate_by_session={"KRX_REG": 0.02},
partial_fill_rate_by_session={"KRX_REG": 0.2},
unfavorable_fill_required=True, unfavorable_fill_required=True,
) )
with pytest.raises(ValueError, match="slippage bps"): with pytest.raises(ValueError, match="slippage bps"):
validate_backtest_cost_model(model=model, required_sessions=["KRX_REG"]) validate_backtest_cost_model(model=model, required_sessions=["KRX_REG"])
def test_missing_required_partial_fill_session_raises() -> None:
    """Validation must fail when a required session has no partial-fill rate."""
    required = ["KRX_REG", "US_PRE"]
    # Only the partial-fill mapping lacks US_PRE; slippage and failure cover both sessions.
    incomplete_model = BacktestCostModel(
        commission_bps=5.0,
        slippage_bps_by_session={"KRX_REG": 10.0, "US_PRE": 50.0},
        failure_rate_by_session={"KRX_REG": 0.01, "US_PRE": 0.08},
        partial_fill_rate_by_session={"KRX_REG": 0.1},
        unfavorable_fill_required=True,
    )
    expected_error = "missing partial_fill_rate_by_session.*US_PRE"
    with pytest.raises(ValueError, match=expected_error):
        validate_backtest_cost_model(model=incomplete_model, required_sessions=required)
@pytest.mark.parametrize(
    "bad_partial_fill",
    [float("nan"), float("inf"), float("-inf"), -0.1, 1.1],
)
def test_invalid_partial_fill_rate_rejected(bad_partial_fill: float) -> None:
    """Non-finite or out-of-[0, 1] partial-fill rates must be rejected."""
    session = "KRX_REG"
    # Everything except the partial-fill rate is valid, isolating the check under test.
    invalid_model = BacktestCostModel(
        commission_bps=5.0,
        slippage_bps_by_session={session: 10.0},
        failure_rate_by_session={session: 0.02},
        partial_fill_rate_by_session={session: bad_partial_fill},
        unfavorable_fill_required=True,
    )
    expected_error = "partial fill rate must be within"
    with pytest.raises(ValueError, match=expected_error):
        validate_backtest_cost_model(model=invalid_model, required_sessions=[session])

View File

@@ -35,6 +35,7 @@ def _cost_model() -> BacktestCostModel:
commission_bps=3.0, commission_bps=3.0,
slippage_bps_by_session={"KRX_REG": 10.0, "US_PRE": 50.0}, slippage_bps_by_session={"KRX_REG": 10.0, "US_PRE": 50.0},
failure_rate_by_session={"KRX_REG": 0.01, "US_PRE": 0.08}, failure_rate_by_session={"KRX_REG": 0.01, "US_PRE": 0.08},
partial_fill_rate_by_session={"KRX_REG": 0.05, "US_PRE": 0.2},
unfavorable_fill_required=True, unfavorable_fill_required=True,
) )
@@ -71,6 +72,7 @@ def test_pipeline_happy_path_returns_fold_and_artifact_contract() -> None:
assert names == {"B0", "B1", "M1"} assert names == {"B0", "B1", "M1"}
for score in fold.baseline_scores: for score in fold.baseline_scores:
assert 0.0 <= score.accuracy <= 1.0 assert 0.0 <= score.accuracy <= 1.0
assert 0.0 <= score.cost_adjusted_accuracy <= 1.0
def test_pipeline_cost_guard_fail_fast() -> None: def test_pipeline_cost_guard_fail_fast() -> None:
@@ -78,6 +80,7 @@ def test_pipeline_cost_guard_fail_fast() -> None:
commission_bps=3.0, commission_bps=3.0,
slippage_bps_by_session={"KRX_REG": 10.0}, slippage_bps_by_session={"KRX_REG": 10.0},
failure_rate_by_session={"KRX_REG": 0.01}, failure_rate_by_session={"KRX_REG": 0.01},
partial_fill_rate_by_session={"KRX_REG": 0.05},
unfavorable_fill_required=True, unfavorable_fill_required=True,
) )
try: try:
@@ -166,3 +169,45 @@ def test_pipeline_rejects_minutes_spec_when_timestamp_missing() -> None:
assert "BacktestBar.timestamp is required" in str(exc) assert "BacktestBar.timestamp is required" in str(exc)
else: else:
raise AssertionError("expected timestamp validation error") raise AssertionError("expected timestamp validation error")
def test_pipeline_fold_scores_reflect_cost_and_execution_effects() -> None:
    """Harsher cost/execution assumptions must lower the cost-adjusted score.

    The pipeline is run twice on identical inputs, once with a frictionless
    cost model and once with a conservative one, and the B1 baseline's
    ``cost_adjusted_accuracy`` on the first fold is compared.
    """
    cfg = {
        "bars": _bars(),
        "entry_indices": [0, 1, 2, 3, 4, 5, 6, 7],
        "side": 1,
        "triple_barrier_spec": TripleBarrierSpec(
            take_profit_pct=0.02,
            stop_loss_pct=0.01,
            max_holding_minutes=3,
        ),
        "walk_forward": WalkForwardConfig(
            train_size=4,
            test_size=2,
            step_size=2,
            purge_size=1,
            embargo_size=1,
            min_train_size=3,
        ),
    }
    # No commission, slippage, failures or partial fills.
    optimistic = BacktestCostModel(
        commission_bps=0.0,
        slippage_bps_by_session={"KRX_REG": 0.0, "US_PRE": 0.0},
        failure_rate_by_session={"KRX_REG": 0.0, "US_PRE": 0.0},
        partial_fill_rate_by_session={"KRX_REG": 0.0, "US_PRE": 0.0},
        unfavorable_fill_required=True,
    )
    # Non-zero commission and slippage plus high failure / partial-fill rates.
    conservative = BacktestCostModel(
        commission_bps=10.0,
        slippage_bps_by_session={"KRX_REG": 30.0, "US_PRE": 80.0},
        failure_rate_by_session={"KRX_REG": 0.2, "US_PRE": 0.4},
        partial_fill_rate_by_session={"KRX_REG": 0.5, "US_PRE": 0.7},
        unfavorable_fill_required=True,
    )
    optimistic_out = run_v2_backtest_pipeline(cost_model=optimistic, **cfg)
    conservative_out = run_v2_backtest_pipeline(cost_model=conservative, **cfg)

    assert optimistic_out.folds and conservative_out.folds

    # Select the baseline by name rather than list position, so reordering
    # baseline_scores cannot silently make this test compare another baseline.
    optimistic_by_name = {s.name: s for s in optimistic_out.folds[0].baseline_scores}
    conservative_by_name = {s.name: s for s in conservative_out.folds[0].baseline_scores}
    optimistic_score = optimistic_by_name["B1"].cost_adjusted_accuracy
    conservative_score = conservative_by_name["B1"].cost_adjusted_accuracy
    assert conservative_score < optimistic_score