2026-03-04 00:47:20 +09:00
7 changed files with 154 additions and 8 deletions
--- a/docs/ouroboros/01_requirements_registry.md
+++ b/docs/ouroboros/01_requirements_registry.md
@@ -1,6 +1,6 @@
 <!--
 Doc-ID: DOC-REQ-001
-Version: 1.0.10
+Version: 1.0.11
 Status: active
 Owner: strategy
 Updated: 2026-03-02
@@ -19,7 +19,7 @@ Updated: 2026-03-02
 - `REQ-V2-005`: 라벨링은 Triple Barrier(Upper/Lower/Time) 방식이어야 한다.
 - `REQ-V2-006`: 검증은 Walk-forward + Purge/Embargo를 강제한다.
 - `REQ-V2-007`: 백테스트는 비용/슬리피지/체결실패를 반영하지 않으면 채택 불가다.
- `REQ-V2-008`: Kill Switch는 신규주문차단 -> 미체결취소 -> 재조회 -> 리스크축소 -> 스냅샷 순서다.
+- `REQ-V2-008`: Kill Switch는 신규주문차단 -> 미체결취소 -> 재조회(실패 시 최대 3회, 1s/2s backoff 재시도, 성공 시 즉시 중단) -> 리스크축소 -> 스냅샷 순서다.
 ## v3 핵심 요구사항
--- a/docs/ouroboros/80_implementation_audit.md
+++ b/docs/ouroboros/80_implementation_audit.md
@@ -36,7 +36,7 @@ Updated: 2026-03-02
 | REQ-V2-005 | Triple Barrier 라벨링 | `src/analysis/triple_barrier.py` | ✅ 완료 |
 | REQ-V2-006 | Walk-Forward + Purge/Embargo 검증 | `src/analysis/walk_forward_split.py` | ✅ 완료 |
 | REQ-V2-007 | 비용/슬리피지/체결실패 모델 필수 | `src/analysis/backtest_cost_guard.py`, `src/analysis/backtest_pipeline.py` | ✅ 완료 |
-| REQ-V2-008 | Kill Switch 실행 순서 (Block→Cancel→Refresh→Reduce→Snapshot) | `src/core/kill_switch.py` | ⚠️ 부분 (`#377`) |
+| REQ-V2-008 | Kill Switch 실행 순서 (Block→Cancel→Refresh(retry)→Reduce→Snapshot) | `src/core/kill_switch.py` | ✅ 완료 |
 ### 1.3 v3 구현 상태: 부분 완료 (2026-03-02 기준)
--- a/src/core/kill_switch.py
+++ b/src/core/kill_switch.py
@@ -3,13 +3,14 @@
 Order is fixed:
 1) block new orders
 2) cancel pending orders
-3) refresh order state
+3) refresh order state (up to 3 attempts in total, with exponential backoff between attempts)
 4) reduce risk
 5) snapshot and notify
 """
 from __future__ import annotations
 import asyncio
 import inspect
 from collections.abc import Awaitable, Callable
 from dataclasses import dataclass, field
@@ -34,16 +35,55 @@ class KillSwitchOrchestrator:
        report: KillSwitchReport,
        name: str,
        fn: StepCallable | None,
-    ) -> None:
+    ) -> bool:
        report.steps.append(name)
        if fn is None:
-            return
+            return True
        try:
            result = fn()
            if inspect.isawaitable(result):
                await result
            if result is False:
                raise RuntimeError("step returned False")
            return True
        except Exception as exc:  # pragma: no cover - intentionally resilient
            report.errors.append(f"{name}: {exc}")
            return False
    async def _run_refresh_with_retry(
        self,
        report: KillSwitchReport,
        fn: StepCallable | None,
        *,
        max_attempts: int,
        base_delay_sec: float,
    ) -> None:
        report.steps.append("refresh_order_state")
        if fn is None:
            return
        attempts = max(1, max_attempts)
        delay = max(0.0, base_delay_sec)
        last_exc: Exception | None = None
        for attempt in range(1, attempts + 1):
            try:
                result = fn()
                if inspect.isawaitable(result):
                    await result
                if result is False:
                    raise RuntimeError("step returned False")
                return
            except Exception as exc:
                last_exc = exc
                if attempt >= attempts:
                    break
                if delay > 0:
                    await asyncio.sleep(delay * (2 ** (attempt - 1)))
        if last_exc is not None:
            report.errors.append(
                "refresh_order_state: failed after "
                f"{attempts} attempts ({last_exc})"
            )
    async def trigger(
        self,
@@ -54,6 +94,8 @@ class KillSwitchOrchestrator:
        reduce_risk: StepCallable | None = None,
        snapshot_state: StepCallable | None = None,
        notify: StepCallable | None = None,
        refresh_retry_attempts: int = 3,
        refresh_retry_base_delay_sec: float = 1.0,
    ) -> KillSwitchReport:
        report = KillSwitchReport(reason=reason)
@@ -61,7 +103,12 @@ class KillSwitchOrchestrator:
        report.steps.append("block_new_orders")
        await self._run_step(report, "cancel_pending_orders", cancel_pending_orders)
-        await self._run_step(report, "refresh_order_state", refresh_order_state)
+        await self._run_refresh_with_retry(
            report,
            refresh_order_state,
            max_attempts=refresh_retry_attempts,
            base_delay_sec=refresh_retry_base_delay_sec,
        )
        await self._run_step(report, "reduce_risk", reduce_risk)
        await self._run_step(report, "snapshot_state", snapshot_state)
        await self._run_step(report, "notify", notify)
--- a/src/main.py
+++ b/src/main.py
@@ -1375,7 +1375,10 @@ async def _cancel_pending_orders_for_kill_switch(
                )
    if failures:
-        raise RuntimeError("; ".join(failures[:3]))
+        summary = "; ".join(failures[:3])
        if len(failures) > 3:
            summary = f"{summary} (+{len(failures) - 3} more)"
        raise RuntimeError(summary)
 async def _refresh_order_state_for_kill_switch(
@@ -1384,6 +1387,7 @@ async def _refresh_order_state_for_kill_switch(
    overseas_broker: OverseasBroker,
    markets: list[MarketInfo],
 ) -> None:
    failures: list[str] = []
    seen_overseas: set[str] = set()
    for market in markets:
        try:
@@ -1399,6 +1403,12 @@ async def _refresh_order_state_for_kill_switch(
                market.exchange_code,
                exc,
            )
            failures.append(f"{market.code}/{market.exchange_code}: {exc}")
    if failures:
        summary = "; ".join(failures[:3])
        if len(failures) > 3:
            summary = f"{summary} (+{len(failures) - 3} more)"
        raise RuntimeError(summary)
 def _reduce_risk_for_kill_switch() -> None:
--- a/tests/test_kill_switch.py
+++ b/tests/test_kill_switch.py
@@ -53,3 +53,52 @@ async def test_kill_switch_collects_step_errors() -> None:
    report = await ks.trigger(reason="test", cancel_pending_orders=_boom)
    assert any(err.startswith("cancel_pending_orders:") for err in report.errors)
@pytest.mark.asyncio
async def test_kill_switch_refresh_retries_then_succeeds() -> None:
    """A refresh that fails twice and then succeeds leaves no error in the report."""
    invocations: list[int] = []

    def _flaky_refresh() -> None:
        # Record the call first, then fail until the third invocation.
        invocations.append(len(invocations) + 1)
        if len(invocations) < 3:
            raise RuntimeError("temporary refresh failure")

    orchestrator = KillSwitchOrchestrator()
    report = await orchestrator.trigger(
        reason="test",
        refresh_order_state=_flaky_refresh,
        refresh_retry_attempts=3,
        refresh_retry_base_delay_sec=0.0,  # no sleeping between attempts
    )

    assert len(invocations) == 3
    assert report.errors == []
@pytest.mark.asyncio
async def test_kill_switch_refresh_retry_exhausted_records_error_and_continues() -> None:
    """Exhausted refresh retries are reported and the later steps still run."""
    executed: list[str] = []

    def _recorder(label: str):
        # Build a step callable that only notes that it was invoked.
        def _step() -> None:
            executed.append(label)

        return _step

    def _refresh_fail() -> None:
        raise RuntimeError("persistent refresh failure")

    orchestrator = KillSwitchOrchestrator()
    report = await orchestrator.trigger(
        reason="test",
        refresh_order_state=_refresh_fail,
        reduce_risk=_recorder("reduce"),
        snapshot_state=_recorder("snapshot"),
        refresh_retry_attempts=2,
        refresh_retry_base_delay_sec=0.0,  # no sleeping between attempts
    )

    expected_prefix = "refresh_order_state: failed after 2 attempts"
    matching = [err for err in report.errors if err.startswith(expected_prefix)]
    assert matching
    assert executed == ["reduce", "snapshot"]
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -7154,3 +7154,27 @@ async def test_trigger_emergency_kill_switch_records_cancel_failure() -> None:
        )
    assert any(err.startswith("cancel_pending_orders:") for err in report.errors)
@pytest.mark.asyncio
async def test_refresh_order_state_failure_summary_includes_more_count() -> None:
    """With four failing markets the raised summary ends in a "(+1 more)" suffix."""

    def _fake_market(code: str, exchange: str) -> MagicMock:
        # Only the attributes set here are configured explicitly on the mock.
        fake = MagicMock()
        fake.code = code
        fake.exchange_code = exchange
        fake.is_domestic = code == "KR"
        return fake

    market_specs = [("KR", "KRX"), ("US_PRE", "NASD"), ("US_DAY", "NYSE"), ("JP", "TKSE")]
    markets = [_fake_market(code, exchange) for code, exchange in market_specs]

    # Both brokers fail on every balance query.
    broker = MagicMock()
    broker.get_balance = AsyncMock(side_effect=RuntimeError("domestic down"))
    overseas_broker = MagicMock()
    overseas_broker.get_overseas_balance = AsyncMock(side_effect=RuntimeError("overseas down"))

    with pytest.raises(RuntimeError, match=r"\(\+1 more\)$") as exc_info:
        await main_module._refresh_order_state_for_kill_switch(
            broker=broker,
            overseas_broker=overseas_broker,
            markets=markets,
        )

    message = str(exc_info.value)
    assert "KR/KRX" in message
--- a/workflow/session-handover.md
+++ b/workflow/session-handover.md
@@ -121,3 +121,19 @@
 - next_ticket: #369
 - process_gate_checked: process_ticket=#306,#308 merged_to_feature_branch=yes
 - risks_or_notes: v2 사양 기준으로 model_exit_signal을 직접 청산 트리거가 아닌 보조 트리거로 정합화하고 테스트/문서를 동기화한다.
 ### 2026-03-02 | session=codex-v3-stream-next-ticket-377
 - branch: feature/v3-session-policy-stream
 - docs_checked: docs/workflow.md, docs/commands.md, docs/agent-constraints.md
 - open_issues_reviewed: #377, #370, #371, #375, #376, #381
 - next_ticket: #377
 - process_gate_checked: process_ticket=#306,#308 merged_to_feature_branch=yes
 - risks_or_notes: kill switch refresh 재시도 정책(횟수/간격/중단조건)을 코드/테스트/요구사항 원장/감사 문서에 동시 반영한다.
 ### 2026-03-02 | session=codex-issue377-start
 - branch: feature/issue-377-kill-switch-refresh-retry
 - docs_checked: docs/workflow.md, docs/commands.md, docs/agent-constraints.md
 - open_issues_reviewed: #377
 - next_ticket: #377
 - process_gate_checked: process_ticket=#306,#308 merged_to_feature_branch=yes
 - risks_or_notes: refresh 단계를 최대 3회(초기+재시도2), 실패 시 지수 백오프로 재시도하고 성공 시 즉시 중단, 소진 시 오류를 기록한 뒤 다음 단계를 계속 수행한다.