diff --git a/scripts/run_overnight.sh b/scripts/run_overnight.sh index 4db704b..87576c1 100755 --- a/scripts/run_overnight.sh +++ b/scripts/run_overnight.sh @@ -8,6 +8,8 @@ CHECK_INTERVAL="${CHECK_INTERVAL:-30}" TMUX_AUTO="${TMUX_AUTO:-true}" TMUX_ATTACH="${TMUX_ATTACH:-true}" TMUX_SESSION_PREFIX="${TMUX_SESSION_PREFIX:-ouroboros_overnight}" +STARTUP_GRACE_SEC="${STARTUP_GRACE_SEC:-3}" +dashboard_port="${DASHBOARD_PORT:-8080}" if [ -z "${APP_CMD:-}" ]; then if [ -x ".venv/bin/python" ]; then @@ -21,8 +23,6 @@ if [ -z "${APP_CMD:-}" ]; then exit 1 fi - dashboard_port="${DASHBOARD_PORT:-8080}" - APP_CMD="DASHBOARD_PORT=$dashboard_port $PYTHON_BIN -m src.main --mode=live --dashboard" fi @@ -34,6 +34,11 @@ WATCHDOG_LOG="$LOG_DIR/watchdog_${timestamp}.log" PID_FILE="$LOG_DIR/app.pid" WATCHDOG_PID_FILE="$LOG_DIR/watchdog.pid" +is_port_in_use() { + local port="$1" + ss -ltn 2>/dev/null | rg -q ":${port}\\b" +} + if [ -f "$PID_FILE" ]; then old_pid="$(cat "$PID_FILE" || true)" if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then @@ -43,7 +48,12 @@ if [ -f "$PID_FILE" ]; then fi echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] starting: $APP_CMD" | tee -a "$RUN_LOG" -nohup bash -lc "$APP_CMD" >>"$RUN_LOG" 2>&1 & +if [[ "$APP_CMD" == *"--dashboard"* ]] && is_port_in_use "$dashboard_port"; then + echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: dashboard port ${dashboard_port} already in use" | tee -a "$RUN_LOG" + exit 1 +fi + +nohup bash -lc "exec $APP_CMD" >>"$RUN_LOG" 2>&1 & app_pid=$! echo "$app_pid" > "$PID_FILE" @@ -54,6 +64,18 @@ nohup env PID_FILE="$PID_FILE" LOG_FILE="$WATCHDOG_LOG" CHECK_INTERVAL="$CHECK_I watchdog_pid=$! echo "$watchdog_pid" > "$WATCHDOG_PID_FILE" +sleep "$STARTUP_GRACE_SEC" +if ! kill -0 "$app_pid" 2>/dev/null; then + echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: app process exited early (pid=$app_pid)" | tee -a "$RUN_LOG" + tail -n 20 "$RUN_LOG" || true + exit 1 +fi +if ! kill -0 "$watchdog_pid" 2>/dev/null; then + echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: watchdog exited early (pid=$watchdog_pid)" | tee -a "$WATCHDOG_LOG" + tail -n 20 "$WATCHDOG_LOG" || true + exit 1 +fi + cat </dev/null @@ -31,6 +33,11 @@ check_signal() { return 1 } +find_live_pids() { + # Detect live-mode process even when run_overnight pid files are absent. + pgrep -af "[s]rc.main --mode=live" 2>/dev/null | awk '{print $1}' | tr '\n' ',' | sed 's/,$//' +} + check_forbidden() { local name="$1" local pattern="$2" @@ -47,39 +54,63 @@ check_forbidden() { log "[INFO] runtime verify monitor started interval=${INTERVAL_SEC}s max_hours=${MAX_HOURS} policy_tz=${POLICY_TZ}" while true; do + loops=$((loops + 1)) now=$(date +%s) if [ "$now" -ge "$END_TS" ]; then log "[INFO] monitor completed (time window reached)" exit 0 fi + if [ "$MAX_LOOPS" -gt 0 ] && [ "$loops" -gt "$MAX_LOOPS" ]; then + log "[INFO] monitor completed (max loops reached)" + exit 0 + fi latest_run="$(ls -t "$LOG_DIR"/run_*.log 2>/dev/null | head -n1 || true)" - if [ -z "$latest_run" ]; then - log "[ANOMALY] no run log found" - sleep "$INTERVAL_SEC" - continue - fi # Basic liveness hints. app_pid="$(cat "$LOG_DIR/app.pid" 2>/dev/null || true)" wd_pid="$(cat "$LOG_DIR/watchdog.pid" 2>/dev/null || true)" + live_pids="$(find_live_pids)" app_alive=0 wd_alive=0 port_alive=0 [ -n "$app_pid" ] && kill -0 "$app_pid" 2>/dev/null && app_alive=1 [ -n "$wd_pid" ] && kill -0 "$wd_pid" 2>/dev/null && wd_alive=1 + if [ "$app_alive" -eq 0 ] && [ -n "$live_pids" ]; then + app_alive=1 + fi ss -ltnp 2>/dev/null | rg -q ':8080' && port_alive=1 - log "[HEARTBEAT] run_log=$latest_run app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive" + log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive live_pids=${live_pids:-none}" + + if [ -z "$latest_run" ]; then + log "[ANOMALY] no run log found" + fi # Coverage matrix rows (session paths and policy gate evidence). not_observed=0 - check_signal "LIVE_MODE" "Mode: live" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "US_DAY_PATH" "US_DAY|session=US_DAY|Processing market: .*NASDAQ|Processing market: .*NYSE|Processing market: .*AMEX" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "US_AFTER_PATH" "US_AFTER|session=US_AFTER" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "ORDER_POLICY_SESSION" "Order policy rejected .*\\[session=" "$latest_run" || not_observed=$((not_observed+1)) + if [ "$app_alive" -eq 1 ]; then + log "[COVERAGE] LIVE_MODE=PASS source=process_liveness" + else + if [ -n "$latest_run" ]; then + check_signal "LIVE_MODE" "Mode: live" "$latest_run" || not_observed=$((not_observed+1)) + else + log "[COVERAGE] LIVE_MODE=NOT_OBSERVED reason=no_run_log_no_live_pid" + not_observed=$((not_observed+1)) + fi + fi + if [ -n "$latest_run" ]; then + check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1)) + check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1)) + check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1)) + check_signal "US_DAY_PATH" "US_DAY|session=US_DAY|Processing market: .*NASDAQ|Processing market: .*NYSE|Processing market: .*AMEX" "$latest_run" || not_observed=$((not_observed+1)) + check_signal "US_AFTER_PATH" "US_AFTER|session=US_AFTER" "$latest_run" || not_observed=$((not_observed+1)) + check_signal "ORDER_POLICY_SESSION" "Order policy rejected .*\\[session=" "$latest_run" || not_observed=$((not_observed+1)) + else + for missing in KR_LOOP NXT_PATH US_PRE_PATH US_DAY_PATH US_AFTER_PATH ORDER_POLICY_SESSION; do + log "[COVERAGE] ${missing}=NOT_OBSERVED reason=no_run_log" + not_observed=$((not_observed+1)) + done + fi if [ "$not_observed" -gt 0 ]; then log "[ANOMALY] coverage_not_observed=$not_observed (treat as FAIL)" @@ -97,9 +128,13 @@ while true; do if [ "$is_weekend" -eq 1 ]; then # Weekend policy: KR regular session loop must never appear. - check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \ - "Market session active: KR|session=KRX_REG|Processing market: Korea Exchange" \ - "$latest_run" || forbidden_hits=$((forbidden_hits+1)) + if [ -n "$latest_run" ]; then + check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \ + "Market session active: KR|session=KRX_REG|Processing market: Korea Exchange" \ + "$latest_run" || forbidden_hits=$((forbidden_hits+1)) + else + log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=no_run_log" + fi else log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=weekday" fi diff --git a/tests/test_runtime_overnight_scripts.py b/tests/test_runtime_overnight_scripts.py new file mode 100644 index 0000000..81b3195 --- /dev/null +++ b/tests/test_runtime_overnight_scripts.py @@ -0,0 +1,124 @@ +from __future__ import annotations + +import os +import signal +import socket +import subprocess +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +RUN_OVERNIGHT = REPO_ROOT / "scripts" / "run_overnight.sh" +RUNTIME_MONITOR = REPO_ROOT / "scripts" / "runtime_verify_monitor.sh" + + +def _latest_runtime_log(log_dir: Path) -> str: + logs = sorted(log_dir.glob("runtime_verify_*.log")) + assert logs, "runtime monitor did not produce log output" + return logs[-1].read_text(encoding="utf-8") + + +def test_runtime_verify_monitor_detects_live_process_without_pid_files(tmp_path: Path) -> None: + log_dir = tmp_path / "overnight" + log_dir.mkdir(parents=True, exist_ok=True) + + fake_live = subprocess.Popen( + ["bash", "-lc", 'exec -a "src.main --mode=live" sleep 10'], + cwd=REPO_ROOT, + ) + try: + env = os.environ.copy() + env.update( + { + "ROOT_DIR": str(REPO_ROOT), + "LOG_DIR": str(log_dir), + "INTERVAL_SEC": "1", + "MAX_HOURS": "1", + "MAX_LOOPS": "1", + "POLICY_TZ": "UTC", + } + ) + completed = subprocess.run( + ["bash", str(RUNTIME_MONITOR)], + cwd=REPO_ROOT, + env=env, + capture_output=True, + text=True, + check=False, + ) + assert completed.returncode == 0, completed.stderr + + log_text = _latest_runtime_log(log_dir) + assert "app_alive=1" in log_text + assert "[COVERAGE] LIVE_MODE=PASS source=process_liveness" in log_text + finally: + fake_live.terminate() + fake_live.wait(timeout=5) + + +def test_run_overnight_fails_fast_when_dashboard_port_in_use(tmp_path: Path) -> None: + log_dir = tmp_path / "overnight" + log_dir.mkdir(parents=True, exist_ok=True) + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.bind(("127.0.0.1", 0)) + sock.listen(1) + port = sock.getsockname()[1] + try: + env = os.environ.copy() + env.update( + { + "LOG_DIR": str(log_dir), + "TMUX_AUTO": "false", + "DASHBOARD_PORT": str(port), + } + ) + completed = subprocess.run( + ["bash", str(RUN_OVERNIGHT)], + cwd=REPO_ROOT, + env=env, + capture_output=True, + text=True, + check=False, + ) + assert completed.returncode != 0 + output = f"{completed.stdout}\n{completed.stderr}" + assert "already in use" in output + finally: + sock.close() + + +def test_run_overnight_writes_live_pid_and_watchdog_pid(tmp_path: Path) -> None: + log_dir = tmp_path / "overnight" + log_dir.mkdir(parents=True, exist_ok=True) + + env = os.environ.copy() + env.update( + { + "LOG_DIR": str(log_dir), + "TMUX_AUTO": "false", + "STARTUP_GRACE_SEC": "1", + "CHECK_INTERVAL": "2", + "APP_CMD": "sleep 10", + } + ) + completed = subprocess.run( + ["bash", str(RUN_OVERNIGHT)], + cwd=REPO_ROOT, + env=env, + capture_output=True, + text=True, + check=False, + ) + assert completed.returncode == 0, f"{completed.stdout}\n{completed.stderr}" + + app_pid = int((log_dir / "app.pid").read_text(encoding="utf-8").strip()) + watchdog_pid = int((log_dir / "watchdog.pid").read_text(encoding="utf-8").strip()) + + os.kill(app_pid, 0) + os.kill(watchdog_pid, 0) + + for pid in (watchdog_pid, app_pid): + try: + os.kill(pid, signal.SIGTERM) + except ProcessLookupError: + pass