Merge pull request 'fix: runtime anomaly handling for overnight startup and monitor (#396 #397)' (#404) from feature/issue-396-397-runtime-anomaly-fixes into main

Reviewed-on: #404 Reviewed-by: jihoson <kiparang7th@gmail.com>
2026-03-04 02:46:38 +09:00
parent d2f3fe9108 bcbbf80d16
commit c217e8cd72
3 changed files with 332 additions and 23 deletions
--- a/scripts/run_overnight.sh
+++ b/scripts/run_overnight.sh
@@ -8,8 +8,32 @@ CHECK_INTERVAL="${CHECK_INTERVAL:-30}"
 TMUX_AUTO="${TMUX_AUTO:-true}"
 TMUX_ATTACH="${TMUX_ATTACH:-true}"
 TMUX_SESSION_PREFIX="${TMUX_SESSION_PREFIX:-ouroboros_overnight}"
 STARTUP_GRACE_SEC="${STARTUP_GRACE_SEC:-3}"
 dashboard_port="${DASHBOARD_PORT:-8080}"
 APP_CMD_BIN="${APP_CMD_BIN:-}"
 APP_CMD_ARGS="${APP_CMD_ARGS:-}"
 RUNS_DASHBOARD="false"
-if [ -z "${APP_CMD:-}" ]; then
+# Custom override contract:
 # 1) Preferred: APP_CMD_BIN + APP_CMD_ARGS
 #    - APP_CMD_BIN is treated as a single executable token.
 #    - APP_CMD_ARGS uses shell-style word splitting; quote/escape inside this
 #      variable is NOT preserved as a nested shell parse.
 # 2) Legacy fallback: APP_CMD (raw shell command string)
 #    - This path remains for backward compatibility.
 #    - When APP_CMD includes --dashboard, caller should include explicit
 #      DASHBOARD_PORT assignment in APP_CMD if non-default port is required.
 if [ -n "$APP_CMD_BIN" ]; then
    USE_DEFAULT_APP_CMD="false"
    USE_SAFE_CUSTOM_APP_CMD="true"
    APP_CMD="${APP_CMD_BIN} ${APP_CMD_ARGS}"
    if [[ " $APP_CMD_ARGS " == *" --dashboard "* ]]; then
        RUNS_DASHBOARD="true"
    fi
 elif [ -z "${APP_CMD:-}" ]; then
    USE_DEFAULT_APP_CMD="true"
    USE_SAFE_CUSTOM_APP_CMD="false"
    if [ -x ".venv/bin/python" ]; then
        PYTHON_BIN=".venv/bin/python"
    elif command -v python3 >/dev/null 2>&1; then
@@ -21,9 +45,14 @@ if [ -z "${APP_CMD:-}" ]; then
        exit 1
    fi
-    dashboard_port="${DASHBOARD_PORT:-8080}"
+    APP_CMD="$PYTHON_BIN -m src.main --mode=live --dashboard"
-
+    RUNS_DASHBOARD="true"
-    APP_CMD="DASHBOARD_PORT=$dashboard_port $PYTHON_BIN -m src.main --mode=live --dashboard"
+else
    USE_DEFAULT_APP_CMD="false"
    USE_SAFE_CUSTOM_APP_CMD="false"
    if [[ "$APP_CMD" == *"--dashboard"* ]]; then
        RUNS_DASHBOARD="true"
    fi
 fi
 mkdir -p "$LOG_DIR"
@@ -34,6 +63,24 @@ WATCHDOG_LOG="$LOG_DIR/watchdog_${timestamp}.log"
 PID_FILE="$LOG_DIR/app.pid"
 WATCHDOG_PID_FILE="$LOG_DIR/watchdog.pid"
 is_port_in_use() {
    local port="$1"
    if command -v ss >/dev/null 2>&1; then
        ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]"
        return $?
    fi
    if command -v lsof >/dev/null 2>&1; then
        lsof -nP -iTCP:"$port" -sTCP:LISTEN >/dev/null 2>&1
        return $?
    fi
    if command -v netstat >/dev/null 2>&1; then
        netstat -ltn 2>/dev/null | grep -Eq "[:.]${port}[[:space:]]"
        return $?
    fi
    # No supported socket inspection command found.
    return 1
 }
 if [ -f "$PID_FILE" ]; then
    old_pid="$(cat "$PID_FILE" || true)"
    if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then
@@ -43,7 +90,29 @@ if [ -f "$PID_FILE" ]; then
 fi
 echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] starting: $APP_CMD" | tee -a "$RUN_LOG"
-nohup bash -lc "$APP_CMD" >>"$RUN_LOG" 2>&1 &
+if [ "$RUNS_DASHBOARD" = "true" ] && is_port_in_use "$dashboard_port"; then
    echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: dashboard port ${dashboard_port} already in use" | tee -a "$RUN_LOG"
    exit 1
 fi
 if [ "$USE_DEFAULT_APP_CMD" = "true" ]; then
    # Default path avoids shell word-splitting on executable paths.
    nohup env DASHBOARD_PORT="$dashboard_port" "$PYTHON_BIN" -m src.main --mode=live --dashboard >>"$RUN_LOG" 2>&1 &
 elif [ "$USE_SAFE_CUSTOM_APP_CMD" = "true" ]; then
    # Safer custom path: executable path is handled as a single token.
    if [ -n "$APP_CMD_ARGS" ]; then
        # shellcheck disable=SC2206
        app_args=( $APP_CMD_ARGS )
        nohup env DASHBOARD_PORT="$dashboard_port" "$APP_CMD_BIN" "${app_args[@]}" >>"$RUN_LOG" 2>&1 &
    else
        nohup env DASHBOARD_PORT="$dashboard_port" "$APP_CMD_BIN" >>"$RUN_LOG" 2>&1 &
    fi
 else
    # Custom APP_CMD is treated as a shell command string.
    # If executable paths include spaces, they must be quoted inside APP_CMD.
    # Legacy compatibility path: caller owns quoting and env var injection.
    nohup bash -lc "exec env $APP_CMD" >>"$RUN_LOG" 2>&1 &
 fi
 app_pid=$!
 echo "$app_pid" > "$PID_FILE"
@@ -54,6 +123,20 @@ nohup env PID_FILE="$PID_FILE" LOG_FILE="$WATCHDOG_LOG" CHECK_INTERVAL="$CHECK_I
 watchdog_pid=$!
 echo "$watchdog_pid" > "$WATCHDOG_PID_FILE"
 sleep "$STARTUP_GRACE_SEC"
 if ! kill -0 "$app_pid" 2>/dev/null; then
    echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: app process exited early (pid=$app_pid)" | tee -a "$RUN_LOG"
    [ -n "${watchdog_pid:-}" ] && kill "$watchdog_pid" 2>/dev/null || true
    tail -n 20 "$RUN_LOG" || true
    exit 1
 fi
 if ! kill -0 "$watchdog_pid" 2>/dev/null; then
    echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: watchdog exited early (pid=$watchdog_pid)" | tee -a "$WATCHDOG_LOG"
    kill "$app_pid" 2>/dev/null || true
    tail -n 20 "$WATCHDOG_LOG" || true
    exit 1
 fi
 cat <<EOF
 시작 완료
 - app pid: $app_pid
--- a/scripts/runtime_verify_monitor.sh
+++ b/scripts/runtime_verify_monitor.sh
@@ -7,12 +7,15 @@ ROOT_DIR="${ROOT_DIR:-/home/agentson/repos/The-Ouroboros}"
 LOG_DIR="${LOG_DIR:-$ROOT_DIR/data/overnight}"
 INTERVAL_SEC="${INTERVAL_SEC:-60}"
 MAX_HOURS="${MAX_HOURS:-24}"
 MAX_LOOPS="${MAX_LOOPS:-0}"
 POLICY_TZ="${POLICY_TZ:-Asia/Seoul}"
 DASHBOARD_PORT="${DASHBOARD_PORT:-8080}"
 cd "$ROOT_DIR"
 OUT_LOG="$LOG_DIR/runtime_verify_$(date +%Y%m%d_%H%M%S).log"
 END_TS=$(( $(date +%s) + MAX_HOURS*3600 ))
 loops=0
 log() {
  printf '%s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$1" | tee -a "$OUT_LOG" >/dev/null
@@ -31,6 +34,11 @@ check_signal() {
  return 1
 }
 find_live_pids() {
  # Detect live-mode process even when run_overnight pid files are absent.
  pgrep -af "[s]rc.main --mode=live" 2>/dev/null | awk '{print $1}' | tr '\n' ',' | sed 's/,$//'
 }
 check_forbidden() {
  local name="$1"
  local pattern="$2"
@@ -44,42 +52,94 @@ check_forbidden() {
  return 0
 }
 is_port_listening() {
  local port="$1"
  if command -v ss >/dev/null 2>&1; then
    ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]"
    return $?
  fi
  if command -v lsof >/dev/null 2>&1; then
    lsof -nP -iTCP:"$port" -sTCP:LISTEN >/dev/null 2>&1
    return $?
  fi
  if command -v netstat >/dev/null 2>&1; then
    netstat -ltn 2>/dev/null | grep -Eq "[:.]${port}[[:space:]]"
    return $?
  fi
  return 1
 }
 log "[INFO] runtime verify monitor started interval=${INTERVAL_SEC}s max_hours=${MAX_HOURS} policy_tz=${POLICY_TZ}"
 while true; do
  loops=$((loops + 1))
  now=$(date +%s)
  if [ "$now" -ge "$END_TS" ]; then
    log "[INFO] monitor completed (time window reached)"
    exit 0
  fi
  if [ "$MAX_LOOPS" -gt 0 ] && [ "$loops" -gt "$MAX_LOOPS" ]; then
    log "[INFO] monitor completed (max loops reached)"
    exit 0
  fi
  latest_run="$(ls -t "$LOG_DIR"/run_*.log 2>/dev/null | head -n1 || true)"
  if [ -z "$latest_run" ]; then
    log "[ANOMALY] no run log found"
    sleep "$INTERVAL_SEC"
    continue
  fi
  # Basic liveness hints.
  app_pid="$(cat "$LOG_DIR/app.pid" 2>/dev/null || true)"
  wd_pid="$(cat "$LOG_DIR/watchdog.pid" 2>/dev/null || true)"
  live_pids="$(find_live_pids)"
  app_alive=0
  wd_alive=0
  port_alive=0
  [ -n "$app_pid" ] && kill -0 "$app_pid" 2>/dev/null && app_alive=1
  [ -n "$wd_pid" ] && kill -0 "$wd_pid" 2>/dev/null && wd_alive=1
-  ss -ltnp 2>/dev/null | rg -q ':8080' && port_alive=1
+  if [ "$app_alive" -eq 0 ] && [ -n "$live_pids" ]; then
-  log "[HEARTBEAT] run_log=$latest_run app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive"
+    app_alive=1
  fi
  is_port_listening "$DASHBOARD_PORT" && port_alive=1
  log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port=${DASHBOARD_PORT} alive=$port_alive live_pids=${live_pids:-none}"
  defer_log_checks=0
  if [ -z "$latest_run" ] && [ "$app_alive" -eq 1 ]; then
    defer_log_checks=1
    log "[INFO] run log not yet available; defer log-based coverage checks"
  fi
  if [ -z "$latest_run" ] && [ "$defer_log_checks" -eq 0 ]; then
    log "[ANOMALY] no run log found"
  fi
  # Coverage matrix rows (session paths and policy gate evidence).
  not_observed=0
  if [ "$app_alive" -eq 1 ]; then
    log "[COVERAGE] LIVE_MODE=PASS source=process_liveness"
  else
    if [ -n "$latest_run" ]; then
      check_signal "LIVE_MODE" "Mode: live" "$latest_run" || not_observed=$((not_observed+1))
    else
      log "[COVERAGE] LIVE_MODE=NOT_OBSERVED reason=no_run_log_no_live_pid"
      not_observed=$((not_observed+1))
    fi
  fi
  if [ "$defer_log_checks" -eq 1 ]; then
    for deferred in KR_LOOP NXT_PATH US_PRE_PATH US_DAY_PATH US_AFTER_PATH ORDER_POLICY_SESSION; do
      log "[COVERAGE] ${deferred}=DEFERRED reason=no_run_log_process_alive"
    done
  elif [ -n "$latest_run" ]; then
    check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1))
    check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1))
    check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1))
    check_signal "US_DAY_PATH" "US_DAY|session=US_DAY|Processing market: .*NASDAQ|Processing market: .*NYSE|Processing market: .*AMEX" "$latest_run" || not_observed=$((not_observed+1))
    check_signal "US_AFTER_PATH" "US_AFTER|session=US_AFTER" "$latest_run" || not_observed=$((not_observed+1))
    check_signal "ORDER_POLICY_SESSION" "Order policy rejected .*\\[session=" "$latest_run" || not_observed=$((not_observed+1))
  else
    for missing in KR_LOOP NXT_PATH US_PRE_PATH US_DAY_PATH US_AFTER_PATH ORDER_POLICY_SESSION; do
      log "[COVERAGE] ${missing}=NOT_OBSERVED reason=no_run_log"
      not_observed=$((not_observed+1))
    done
  fi
  if [ "$not_observed" -gt 0 ]; then
    log "[ANOMALY] coverage_not_observed=$not_observed (treat as FAIL)"
@@ -95,11 +155,17 @@ while true; do
    is_weekend=1
  fi
-  if [ "$is_weekend" -eq 1 ]; then
+  if [ "$defer_log_checks" -eq 1 ]; then
    log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=no_run_log_process_alive"
  elif [ "$is_weekend" -eq 1 ]; then
    # Weekend policy: KR regular session loop must never appear.
    if [ -n "$latest_run" ]; then
      check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \
        "Market session active: KR|session=KRX_REG|Processing market: Korea Exchange" \
        "$latest_run" || forbidden_hits=$((forbidden_hits+1))
    else
      log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=no_run_log"
    fi
  else
    log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=weekday"
  fi
--- a/tests/test_runtime_overnight_scripts.py
+++ b/tests/test_runtime_overnight_scripts.py
@@ -0,0 +1,160 @@
 from __future__ import annotations
 import os
 import signal
 import socket
 import subprocess
 from pathlib import Path
 import pytest
 REPO_ROOT = Path(__file__).resolve().parent.parent
 RUN_OVERNIGHT = REPO_ROOT / "scripts" / "run_overnight.sh"
 RUNTIME_MONITOR = REPO_ROOT / "scripts" / "runtime_verify_monitor.sh"
 def _latest_runtime_log(log_dir: Path) -> str:
    logs = sorted(log_dir.glob("runtime_verify_*.log"))
    assert logs, "runtime monitor did not produce log output"
    return logs[-1].read_text(encoding="utf-8")
 def test_runtime_verify_monitor_detects_live_process_without_pid_files(tmp_path: Path) -> None:
    log_dir = tmp_path / "overnight"
    log_dir.mkdir(parents=True, exist_ok=True)
    fake_live = subprocess.Popen(
        ["bash", "-lc", 'exec -a "src.main --mode=live" sleep 10'],
        cwd=REPO_ROOT,
    )
    try:
        env = os.environ.copy()
        env.update(
            {
                "ROOT_DIR": str(REPO_ROOT),
                "LOG_DIR": str(log_dir),
                "INTERVAL_SEC": "1",
                "MAX_HOURS": "1",
                "MAX_LOOPS": "1",
                "POLICY_TZ": "UTC",
            }
        )
        completed = subprocess.run(
            ["bash", str(RUNTIME_MONITOR)],
            cwd=REPO_ROOT,
            env=env,
            capture_output=True,
            text=True,
            check=False,
        )
        assert completed.returncode == 0, completed.stderr
        log_text = _latest_runtime_log(log_dir)
        assert "app_alive=1" in log_text
        assert "[COVERAGE] LIVE_MODE=PASS source=process_liveness" in log_text
        assert "[ANOMALY]" not in log_text
    finally:
        fake_live.terminate()
        fake_live.wait(timeout=5)
 def test_run_overnight_fails_fast_when_dashboard_port_in_use(tmp_path: Path) -> None:
    log_dir = tmp_path / "overnight"
    log_dir.mkdir(parents=True, exist_ok=True)
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind(("127.0.0.1", 0))
    sock.listen(1)
    port = sock.getsockname()[1]
    try:
        env = os.environ.copy()
        env.update(
            {
                "LOG_DIR": str(log_dir),
                "TMUX_AUTO": "false",
                "DASHBOARD_PORT": str(port),
            }
        )
        completed = subprocess.run(
            ["bash", str(RUN_OVERNIGHT)],
            cwd=REPO_ROOT,
            env=env,
            capture_output=True,
            text=True,
            check=False,
        )
        assert completed.returncode != 0
        output = f"{completed.stdout}\n{completed.stderr}"
        assert "already in use" in output
    finally:
        sock.close()
 def test_run_overnight_writes_live_pid_and_watchdog_pid(tmp_path: Path) -> None:
    log_dir = tmp_path / "overnight"
    log_dir.mkdir(parents=True, exist_ok=True)
    env = os.environ.copy()
    env.update(
        {
            "LOG_DIR": str(log_dir),
            "TMUX_AUTO": "false",
            "STARTUP_GRACE_SEC": "1",
            "CHECK_INTERVAL": "2",
            "APP_CMD_BIN": "sleep",
            "APP_CMD_ARGS": "10",
        }
    )
    completed = subprocess.run(
        ["bash", str(RUN_OVERNIGHT)],
        cwd=REPO_ROOT,
        env=env,
        capture_output=True,
        text=True,
        check=False,
    )
    assert completed.returncode == 0, f"{completed.stdout}\n{completed.stderr}"
    app_pid = int((log_dir / "app.pid").read_text(encoding="utf-8").strip())
    watchdog_pid = int((log_dir / "watchdog.pid").read_text(encoding="utf-8").strip())
    os.kill(app_pid, 0)
    os.kill(watchdog_pid, 0)
    for pid in (watchdog_pid, app_pid):
        try:
            os.kill(pid, signal.SIGTERM)
        except ProcessLookupError:
            pass
 def test_run_overnight_fails_when_process_exits_before_grace_period(tmp_path: Path) -> None:
    log_dir = tmp_path / "overnight"
    log_dir.mkdir(parents=True, exist_ok=True)
    env = os.environ.copy()
    env.update(
        {
            "LOG_DIR": str(log_dir),
            "TMUX_AUTO": "false",
            "STARTUP_GRACE_SEC": "1",
            "APP_CMD_BIN": "false",
        }
    )
    completed = subprocess.run(
        ["bash", str(RUN_OVERNIGHT)],
        cwd=REPO_ROOT,
        env=env,
        capture_output=True,
        text=True,
        check=False,
    )
    assert completed.returncode != 0
    output = f"{completed.stdout}\n{completed.stderr}"
    assert "startup failed:" in output
    watchdog_pid_file = log_dir / "watchdog.pid"
    if watchdog_pid_file.exists():
        watchdog_pid = int(watchdog_pid_file.read_text(encoding="utf-8").strip())
        with pytest.raises(ProcessLookupError):
            os.kill(watchdog_pid, 0)