risk: define and implement kill-switch refresh retry policy (#377) #389

Merged
jihoson merged 2 commits from feature/issue-377-kill-switch-refresh-retry into feature/v3-session-policy-stream 2026-03-02 09:47:56 +09:00
6 changed files with 123 additions and 7 deletions
Showing only changes of commit ba2370e40e - Show all commits

View File

@@ -1,6 +1,6 @@
<!--
Doc-ID: DOC-REQ-001
Version: 1.0.10
Version: 1.0.11
Status: active
Owner: strategy
Updated: 2026-03-02
@@ -19,7 +19,7 @@ Updated: 2026-03-02
- `REQ-V2-005`: 라벨링은 Triple Barrier(Upper/Lower/Time) 방식이어야 한다.
- `REQ-V2-006`: 검증은 Walk-forward + Purge/Embargo를 강제한다.
- `REQ-V2-007`: 백테스트는 비용/슬리피지/체결실패를 반영하지 않으면 채택 불가다.
- `REQ-V2-008`: Kill Switch는 신규주문차단 -> 미체결취소 -> 재조회 -> 리스크축소 -> 스냅샷 순서다.
- `REQ-V2-008`: Kill Switch는 신규주문차단 -> 미체결취소 -> 재조회(실패 시 최대 3회, 1s/2s backoff 재시도, 성공 시 즉시 중단) -> 리스크축소 -> 스냅샷 순서다.
## v3 핵심 요구사항

View File

@@ -36,7 +36,7 @@ Updated: 2026-03-02
| REQ-V2-005 | Triple Barrier 라벨링 | `src/analysis/triple_barrier.py` | ✅ 완료 |
| REQ-V2-006 | Walk-Forward + Purge/Embargo 검증 | `src/analysis/walk_forward_split.py` | ✅ 완료 |
| REQ-V2-007 | 비용/슬리피지/체결실패 모델 필수 | `src/analysis/backtest_cost_guard.py`, `src/analysis/backtest_pipeline.py` | ✅ 완료 |
| REQ-V2-008 | Kill Switch 실행 순서 (Block→Cancel→Refresh→Reduce→Snapshot) | `src/core/kill_switch.py` | ⚠️ 부분 (`#377`) |
| REQ-V2-008 | Kill Switch 실행 순서 (Block→Cancel→Refresh(retry)→Reduce→Snapshot) | `src/core/kill_switch.py` | ✅ 완료 |
### 1.3 v3 구현 상태: 부분 완료 (2026-03-02 기준)

View File

@@ -3,13 +3,14 @@
Order is fixed:
1) block new orders
2) cancel pending orders
3) refresh order state
3) refresh order state (retry up to 3 attempts with exponential backoff)
4) reduce risk
5) snapshot and notify
"""
from __future__ import annotations
import asyncio
import inspect
from collections.abc import Awaitable, Callable
from dataclasses import dataclass, field
@@ -34,16 +35,55 @@ class KillSwitchOrchestrator:
report: KillSwitchReport,
name: str,
fn: StepCallable | None,
) -> None:
) -> bool:
report.steps.append(name)
if fn is None:
return
return True
try:
result = fn()
if inspect.isawaitable(result):
await result
if result is False:
raise RuntimeError("step returned False")
return True
except Exception as exc: # pragma: no cover - intentionally resilient
report.errors.append(f"{name}: {exc}")
return False
async def _run_refresh_with_retry(
self,
report: KillSwitchReport,
fn: StepCallable | None,
*,
max_attempts: int,
base_delay_sec: float,
) -> None:
report.steps.append("refresh_order_state")
if fn is None:
return
attempts = max(1, max_attempts)
delay = max(0.0, base_delay_sec)
last_exc: Exception | None = None
for attempt in range(1, attempts + 1):
try:
result = fn()
if inspect.isawaitable(result):
await result
if result is False:
raise RuntimeError("step returned False")
return
except Exception as exc: # pragma: no cover - intentionally resilient
last_exc = exc
if attempt >= attempts:
break
if delay > 0:
await asyncio.sleep(delay * (2 ** (attempt - 1)))
if last_exc is not None:
report.errors.append(
"refresh_order_state: failed after "
f"{attempts} attempts ({last_exc})"
)
async def trigger(
self,
@@ -54,6 +94,8 @@ class KillSwitchOrchestrator:
reduce_risk: StepCallable | None = None,
snapshot_state: StepCallable | None = None,
notify: StepCallable | None = None,
refresh_retry_attempts: int = 3,
refresh_retry_base_delay_sec: float = 1.0,
) -> KillSwitchReport:
report = KillSwitchReport(reason=reason)
@@ -61,7 +103,12 @@ class KillSwitchOrchestrator:
report.steps.append("block_new_orders")
await self._run_step(report, "cancel_pending_orders", cancel_pending_orders)
await self._run_step(report, "refresh_order_state", refresh_order_state)
await self._run_refresh_with_retry(
report,
refresh_order_state,
max_attempts=refresh_retry_attempts,
base_delay_sec=refresh_retry_base_delay_sec,
)
await self._run_step(report, "reduce_risk", reduce_risk)
await self._run_step(report, "snapshot_state", snapshot_state)
await self._run_step(report, "notify", notify)

View File

@@ -1384,6 +1384,7 @@ async def _refresh_order_state_for_kill_switch(
overseas_broker: OverseasBroker,
markets: list[MarketInfo],
) -> None:
failures: list[str] = []
seen_overseas: set[str] = set()
for market in markets:
try:
@@ -1399,6 +1400,9 @@ async def _refresh_order_state_for_kill_switch(
market.exchange_code,
exc,
)
failures.append(f"{market.code}/{market.exchange_code}: {exc}")
if failures:
raise RuntimeError("; ".join(failures[:3]))
def _reduce_risk_for_kill_switch() -> None:

View File

@@ -53,3 +53,52 @@ async def test_kill_switch_collects_step_errors() -> None:
report = await ks.trigger(reason="test", cancel_pending_orders=_boom)
assert any(err.startswith("cancel_pending_orders:") for err in report.errors)
@pytest.mark.asyncio
async def test_kill_switch_refresh_retries_then_succeeds() -> None:
ks = KillSwitchOrchestrator()
refresh_calls = {"count": 0}
def _flaky_refresh() -> None:
refresh_calls["count"] += 1
if refresh_calls["count"] < 3:
raise RuntimeError("temporary refresh failure")
report = await ks.trigger(
reason="test",
refresh_order_state=_flaky_refresh,
refresh_retry_attempts=3,
refresh_retry_base_delay_sec=0.0,
)
assert refresh_calls["count"] == 3
assert report.errors == []
@pytest.mark.asyncio
async def test_kill_switch_refresh_retry_exhausted_records_error_and_continues() -> None:
ks = KillSwitchOrchestrator()
calls: list[str] = []
def _refresh_fail() -> None:
raise RuntimeError("persistent refresh failure")
def _reduce() -> None:
calls.append("reduce")
def _snapshot() -> None:
calls.append("snapshot")
report = await ks.trigger(
reason="test",
refresh_order_state=_refresh_fail,
reduce_risk=_reduce,
snapshot_state=_snapshot,
refresh_retry_attempts=2,
refresh_retry_base_delay_sec=0.0,
)
assert any(
err.startswith("refresh_order_state: failed after 2 attempts")
for err in report.errors
)
assert calls == ["reduce", "snapshot"]

View File

@@ -121,3 +121,19 @@
- next_ticket: #369
- process_gate_checked: process_ticket=#306,#308 merged_to_feature_branch=yes
- risks_or_notes: v2 사양 기준으로 model_exit_signal을 직접 청산 트리거가 아닌 보조 트리거로 정합화하고 테스트/문서를 동기화한다.
### 2026-03-02 | session=codex-v3-stream-next-ticket-377
- branch: feature/v3-session-policy-stream
- docs_checked: docs/workflow.md, docs/commands.md, docs/agent-constraints.md
- open_issues_reviewed: #377, #370, #371, #375, #376, #381
- next_ticket: #377
- process_gate_checked: process_ticket=#306,#308 merged_to_feature_branch=yes
- risks_or_notes: kill switch refresh 재시도 정책(횟수/간격/중단조건)을 코드/테스트/요구사항 원장/감사 문서에 동시 반영한다.
### 2026-03-02 | session=codex-issue377-start
- branch: feature/issue-377-kill-switch-refresh-retry
- docs_checked: docs/workflow.md, docs/commands.md, docs/agent-constraints.md
- open_issues_reviewed: #377
- next_ticket: #377
- process_gate_checked: process_ticket=#306,#308 merged_to_feature_branch=yes
- risks_or_notes: refresh 단계를 최대 3회(초기+재시도2), 실패 시 지수 백오프로 재시도하고 성공 시 즉시 중단, 소진 시 오류를 기록한 뒤 다음 단계를 계속 수행한다.