diff --git a/scripts/run_overnight.sh b/scripts/run_overnight.sh index 4db704b..af88e23 100755 --- a/scripts/run_overnight.sh +++ b/scripts/run_overnight.sh @@ -8,8 +8,32 @@ CHECK_INTERVAL="${CHECK_INTERVAL:-30}" TMUX_AUTO="${TMUX_AUTO:-true}" TMUX_ATTACH="${TMUX_ATTACH:-true}" TMUX_SESSION_PREFIX="${TMUX_SESSION_PREFIX:-ouroboros_overnight}" +STARTUP_GRACE_SEC="${STARTUP_GRACE_SEC:-3}" +dashboard_port="${DASHBOARD_PORT:-8080}" +APP_CMD_BIN="${APP_CMD_BIN:-}" +APP_CMD_ARGS="${APP_CMD_ARGS:-}" +RUNS_DASHBOARD="false" -if [ -z "${APP_CMD:-}" ]; then +# Custom override contract: +# 1) Preferred: APP_CMD_BIN + APP_CMD_ARGS +# - APP_CMD_BIN is treated as a single executable token. +# - APP_CMD_ARGS uses shell-style word splitting; quote/escape inside this +# variable is NOT preserved as a nested shell parse. +# 2) Legacy fallback: APP_CMD (raw shell command string) +# - This path remains for backward compatibility. +# - When APP_CMD includes --dashboard, caller should include explicit +# DASHBOARD_PORT assignment in APP_CMD if non-default port is required. + +if [ -n "$APP_CMD_BIN" ]; then + USE_DEFAULT_APP_CMD="false" + USE_SAFE_CUSTOM_APP_CMD="true" + APP_CMD="${APP_CMD_BIN} ${APP_CMD_ARGS}" + if [[ " $APP_CMD_ARGS " == *" --dashboard "* ]]; then + RUNS_DASHBOARD="true" + fi +elif [ -z "${APP_CMD:-}" ]; then + USE_DEFAULT_APP_CMD="true" + USE_SAFE_CUSTOM_APP_CMD="false" if [ -x ".venv/bin/python" ]; then PYTHON_BIN=".venv/bin/python" elif command -v python3 >/dev/null 2>&1; then @@ -21,9 +45,14 @@ if [ -z "${APP_CMD:-}" ]; then exit 1 fi - dashboard_port="${DASHBOARD_PORT:-8080}" - - APP_CMD="DASHBOARD_PORT=$dashboard_port $PYTHON_BIN -m src.main --mode=live --dashboard" + APP_CMD="$PYTHON_BIN -m src.main --mode=live --dashboard" + RUNS_DASHBOARD="true" +else + USE_DEFAULT_APP_CMD="false" + USE_SAFE_CUSTOM_APP_CMD="false" + if [[ "$APP_CMD" == *"--dashboard"* ]]; then + RUNS_DASHBOARD="true" + fi fi mkdir -p "$LOG_DIR" @@ -34,6 +63,24 @@ WATCHDOG_LOG="$LOG_DIR/watchdog_${timestamp}.log" PID_FILE="$LOG_DIR/app.pid" WATCHDOG_PID_FILE="$LOG_DIR/watchdog.pid" +is_port_in_use() { + local port="$1" + if command -v ss >/dev/null 2>&1; then + ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]" + return $? + fi + if command -v lsof >/dev/null 2>&1; then + lsof -nP -iTCP:"$port" -sTCP:LISTEN >/dev/null 2>&1 + return $? + fi + if command -v netstat >/dev/null 2>&1; then + netstat -ltn 2>/dev/null | grep -Eq "[:.]${port}[[:space:]]" + return $? + fi + # No supported socket inspection command found. + return 1 +} + if [ -f "$PID_FILE" ]; then old_pid="$(cat "$PID_FILE" || true)" if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then @@ -43,7 +90,29 @@ if [ -f "$PID_FILE" ]; then fi echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] starting: $APP_CMD" | tee -a "$RUN_LOG" -nohup bash -lc "$APP_CMD" >>"$RUN_LOG" 2>&1 & +if [ "$RUNS_DASHBOARD" = "true" ] && is_port_in_use "$dashboard_port"; then + echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: dashboard port ${dashboard_port} already in use" | tee -a "$RUN_LOG" + exit 1 +fi + +if [ "$USE_DEFAULT_APP_CMD" = "true" ]; then + # Default path avoids shell word-splitting on executable paths. + nohup env DASHBOARD_PORT="$dashboard_port" "$PYTHON_BIN" -m src.main --mode=live --dashboard >>"$RUN_LOG" 2>&1 & +elif [ "$USE_SAFE_CUSTOM_APP_CMD" = "true" ]; then + # Safer custom path: executable path is handled as a single token. + if [ -n "$APP_CMD_ARGS" ]; then + # shellcheck disable=SC2206 + app_args=( $APP_CMD_ARGS ) + nohup env DASHBOARD_PORT="$dashboard_port" "$APP_CMD_BIN" "${app_args[@]}" >>"$RUN_LOG" 2>&1 & + else + nohup env DASHBOARD_PORT="$dashboard_port" "$APP_CMD_BIN" >>"$RUN_LOG" 2>&1 & + fi +else + # Custom APP_CMD is treated as a shell command string. + # If executable paths include spaces, they must be quoted inside APP_CMD. + # Legacy compatibility path: caller owns quoting and env var injection. + nohup bash -lc "exec env $APP_CMD" >>"$RUN_LOG" 2>&1 & +fi app_pid=$! echo "$app_pid" > "$PID_FILE" @@ -54,6 +123,20 @@ nohup env PID_FILE="$PID_FILE" LOG_FILE="$WATCHDOG_LOG" CHECK_INTERVAL="$CHECK_I watchdog_pid=$! echo "$watchdog_pid" > "$WATCHDOG_PID_FILE" +sleep "$STARTUP_GRACE_SEC" +if ! kill -0 "$app_pid" 2>/dev/null; then + echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: app process exited early (pid=$app_pid)" | tee -a "$RUN_LOG" + [ -n "${watchdog_pid:-}" ] && kill "$watchdog_pid" 2>/dev/null || true + tail -n 20 "$RUN_LOG" || true + exit 1 +fi +if ! kill -0 "$watchdog_pid" 2>/dev/null; then + echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: watchdog exited early (pid=$watchdog_pid)" | tee -a "$WATCHDOG_LOG" + kill "$app_pid" 2>/dev/null || true + tail -n 20 "$WATCHDOG_LOG" || true + exit 1 +fi + cat </dev/null @@ -31,6 +34,11 @@ check_signal() { return 1 } +find_live_pids() { + # Detect live-mode process even when run_overnight pid files are absent. + pgrep -af "[s]rc.main --mode=live" 2>/dev/null | awk '{print $1}' | tr '\n' ',' | sed 's/,$//' +} + check_forbidden() { local name="$1" local pattern="$2" @@ -44,42 +52,94 @@ check_forbidden() { return 0 } +is_port_listening() { + local port="$1" + + if command -v ss >/dev/null 2>&1; then + ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]" + return $? + fi + if command -v lsof >/dev/null 2>&1; then + lsof -nP -iTCP:"$port" -sTCP:LISTEN >/dev/null 2>&1 + return $? + fi + if command -v netstat >/dev/null 2>&1; then + netstat -ltn 2>/dev/null | grep -Eq "[:.]${port}[[:space:]]" + return $? + fi + return 1 +} + log "[INFO] runtime verify monitor started interval=${INTERVAL_SEC}s max_hours=${MAX_HOURS} policy_tz=${POLICY_TZ}" while true; do + loops=$((loops + 1)) now=$(date +%s) if [ "$now" -ge "$END_TS" ]; then log "[INFO] monitor completed (time window reached)" exit 0 fi + if [ "$MAX_LOOPS" -gt 0 ] && [ "$loops" -gt "$MAX_LOOPS" ]; then + log "[INFO] monitor completed (max loops reached)" + exit 0 + fi latest_run="$(ls -t "$LOG_DIR"/run_*.log 2>/dev/null | head -n1 || true)" - if [ -z "$latest_run" ]; then - log "[ANOMALY] no run log found" - sleep "$INTERVAL_SEC" - continue - fi # Basic liveness hints. app_pid="$(cat "$LOG_DIR/app.pid" 2>/dev/null || true)" wd_pid="$(cat "$LOG_DIR/watchdog.pid" 2>/dev/null || true)" + live_pids="$(find_live_pids)" app_alive=0 wd_alive=0 port_alive=0 [ -n "$app_pid" ] && kill -0 "$app_pid" 2>/dev/null && app_alive=1 [ -n "$wd_pid" ] && kill -0 "$wd_pid" 2>/dev/null && wd_alive=1 - ss -ltnp 2>/dev/null | rg -q ':8080' && port_alive=1 - log "[HEARTBEAT] run_log=$latest_run app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive" + if [ "$app_alive" -eq 0 ] && [ -n "$live_pids" ]; then + app_alive=1 + fi + is_port_listening "$DASHBOARD_PORT" && port_alive=1 + log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port=${DASHBOARD_PORT} alive=$port_alive live_pids=${live_pids:-none}" + + defer_log_checks=0 + if [ -z "$latest_run" ] && [ "$app_alive" -eq 1 ]; then + defer_log_checks=1 + log "[INFO] run log not yet available; defer log-based coverage checks" + fi + + if [ -z "$latest_run" ] && [ "$defer_log_checks" -eq 0 ]; then + log "[ANOMALY] no run log found" + fi # Coverage matrix rows (session paths and policy gate evidence). not_observed=0 - check_signal "LIVE_MODE" "Mode: live" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "US_DAY_PATH" "US_DAY|session=US_DAY|Processing market: .*NASDAQ|Processing market: .*NYSE|Processing market: .*AMEX" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "US_AFTER_PATH" "US_AFTER|session=US_AFTER" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "ORDER_POLICY_SESSION" "Order policy rejected .*\\[session=" "$latest_run" || not_observed=$((not_observed+1)) + if [ "$app_alive" -eq 1 ]; then + log "[COVERAGE] LIVE_MODE=PASS source=process_liveness" + else + if [ -n "$latest_run" ]; then + check_signal "LIVE_MODE" "Mode: live" "$latest_run" || not_observed=$((not_observed+1)) + else + log "[COVERAGE] LIVE_MODE=NOT_OBSERVED reason=no_run_log_no_live_pid" + not_observed=$((not_observed+1)) + fi + fi + if [ "$defer_log_checks" -eq 1 ]; then + for deferred in KR_LOOP NXT_PATH US_PRE_PATH US_DAY_PATH US_AFTER_PATH ORDER_POLICY_SESSION; do + log "[COVERAGE] ${deferred}=DEFERRED reason=no_run_log_process_alive" + done + elif [ -n "$latest_run" ]; then + check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1)) + check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1)) + check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1)) + check_signal "US_DAY_PATH" "US_DAY|session=US_DAY|Processing market: .*NASDAQ|Processing market: .*NYSE|Processing market: .*AMEX" "$latest_run" || not_observed=$((not_observed+1)) + check_signal "US_AFTER_PATH" "US_AFTER|session=US_AFTER" "$latest_run" || not_observed=$((not_observed+1)) + check_signal "ORDER_POLICY_SESSION" "Order policy rejected .*\\[session=" "$latest_run" || not_observed=$((not_observed+1)) + else + for missing in KR_LOOP NXT_PATH US_PRE_PATH US_DAY_PATH US_AFTER_PATH ORDER_POLICY_SESSION; do + log "[COVERAGE] ${missing}=NOT_OBSERVED reason=no_run_log" + not_observed=$((not_observed+1)) + done + fi if [ "$not_observed" -gt 0 ]; then log "[ANOMALY] coverage_not_observed=$not_observed (treat as FAIL)" @@ -95,11 +155,17 @@ while true; do is_weekend=1 fi - if [ "$is_weekend" -eq 1 ]; then + if [ "$defer_log_checks" -eq 1 ]; then + log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=no_run_log_process_alive" + elif [ "$is_weekend" -eq 1 ]; then # Weekend policy: KR regular session loop must never appear. - check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \ - "Market session active: KR|session=KRX_REG|Processing market: Korea Exchange" \ - "$latest_run" || forbidden_hits=$((forbidden_hits+1)) + if [ -n "$latest_run" ]; then + check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \ + "Market session active: KR|session=KRX_REG|Processing market: Korea Exchange" \ + "$latest_run" || forbidden_hits=$((forbidden_hits+1)) + else + log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=no_run_log" + fi else log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=weekday" fi diff --git a/tests/test_runtime_overnight_scripts.py b/tests/test_runtime_overnight_scripts.py new file mode 100644 index 0000000..a5b6182 --- /dev/null +++ b/tests/test_runtime_overnight_scripts.py @@ -0,0 +1,160 @@ +from __future__ import annotations + +import os +import signal +import socket +import subprocess +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parent.parent +RUN_OVERNIGHT = REPO_ROOT / "scripts" / "run_overnight.sh" +RUNTIME_MONITOR = REPO_ROOT / "scripts" / "runtime_verify_monitor.sh" + + +def _latest_runtime_log(log_dir: Path) -> str: + logs = sorted(log_dir.glob("runtime_verify_*.log")) + assert logs, "runtime monitor did not produce log output" + return logs[-1].read_text(encoding="utf-8") + + +def test_runtime_verify_monitor_detects_live_process_without_pid_files(tmp_path: Path) -> None: + log_dir = tmp_path / "overnight" + log_dir.mkdir(parents=True, exist_ok=True) + + fake_live = subprocess.Popen( + ["bash", "-lc", 'exec -a "src.main --mode=live" sleep 10'], + cwd=REPO_ROOT, + ) + try: + env = os.environ.copy() + env.update( + { + "ROOT_DIR": str(REPO_ROOT), + "LOG_DIR": str(log_dir), + "INTERVAL_SEC": "1", + "MAX_HOURS": "1", + "MAX_LOOPS": "1", + "POLICY_TZ": "UTC", + } + ) + completed = subprocess.run( + ["bash", str(RUNTIME_MONITOR)], + cwd=REPO_ROOT, + env=env, + capture_output=True, + text=True, + check=False, + ) + assert completed.returncode == 0, completed.stderr + + log_text = _latest_runtime_log(log_dir) + assert "app_alive=1" in log_text + assert "[COVERAGE] LIVE_MODE=PASS source=process_liveness" in log_text + assert "[ANOMALY]" not in log_text + finally: + fake_live.terminate() + fake_live.wait(timeout=5) + + +def test_run_overnight_fails_fast_when_dashboard_port_in_use(tmp_path: Path) -> None: + log_dir = tmp_path / "overnight" + log_dir.mkdir(parents=True, exist_ok=True) + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.bind(("127.0.0.1", 0)) + sock.listen(1) + port = sock.getsockname()[1] + try: + env = os.environ.copy() + env.update( + { + "LOG_DIR": str(log_dir), + "TMUX_AUTO": "false", + "DASHBOARD_PORT": str(port), + } + ) + completed = subprocess.run( + ["bash", str(RUN_OVERNIGHT)], + cwd=REPO_ROOT, + env=env, + capture_output=True, + text=True, + check=False, + ) + assert completed.returncode != 0 + output = f"{completed.stdout}\n{completed.stderr}" + assert "already in use" in output + finally: + sock.close() + + +def test_run_overnight_writes_live_pid_and_watchdog_pid(tmp_path: Path) -> None: + log_dir = tmp_path / "overnight" + log_dir.mkdir(parents=True, exist_ok=True) + + env = os.environ.copy() + env.update( + { + "LOG_DIR": str(log_dir), + "TMUX_AUTO": "false", + "STARTUP_GRACE_SEC": "1", + "CHECK_INTERVAL": "2", + "APP_CMD_BIN": "sleep", + "APP_CMD_ARGS": "10", + } + ) + completed = subprocess.run( + ["bash", str(RUN_OVERNIGHT)], + cwd=REPO_ROOT, + env=env, + capture_output=True, + text=True, + check=False, + ) + assert completed.returncode == 0, f"{completed.stdout}\n{completed.stderr}" + + app_pid = int((log_dir / "app.pid").read_text(encoding="utf-8").strip()) + watchdog_pid = int((log_dir / "watchdog.pid").read_text(encoding="utf-8").strip()) + + os.kill(app_pid, 0) + os.kill(watchdog_pid, 0) + + for pid in (watchdog_pid, app_pid): + try: + os.kill(pid, signal.SIGTERM) + except ProcessLookupError: + pass + + +def test_run_overnight_fails_when_process_exits_before_grace_period(tmp_path: Path) -> None: + log_dir = tmp_path / "overnight" + log_dir.mkdir(parents=True, exist_ok=True) + + env = os.environ.copy() + env.update( + { + "LOG_DIR": str(log_dir), + "TMUX_AUTO": "false", + "STARTUP_GRACE_SEC": "1", + "APP_CMD_BIN": "false", + } + ) + completed = subprocess.run( + ["bash", str(RUN_OVERNIGHT)], + cwd=REPO_ROOT, + env=env, + capture_output=True, + text=True, + check=False, + ) + assert completed.returncode != 0 + output = f"{completed.stdout}\n{completed.stderr}" + assert "startup failed:" in output + + watchdog_pid_file = log_dir / "watchdog.pid" + if watchdog_pid_file.exists(): + watchdog_pid = int(watchdog_pid_file.read_text(encoding="utf-8").strip()) + with pytest.raises(ProcessLookupError): + os.kill(watchdog_pid, 0)