From 528e17a29c08d9d999d0359e04d7f60bd5f69840 Mon Sep 17 00:00:00 2001 From: agentson Date: Wed, 4 Mar 2026 02:04:13 +0900 Subject: [PATCH 1/6] fix: stabilize overnight startup and monitor live fallback (#396 #397) --- scripts/run_overnight.sh | 28 +++++- scripts/runtime_verify_monitor.sh | 67 ++++++++++--- tests/test_runtime_overnight_scripts.py | 124 ++++++++++++++++++++++++ 3 files changed, 200 insertions(+), 19 deletions(-) create mode 100644 tests/test_runtime_overnight_scripts.py diff --git a/scripts/run_overnight.sh b/scripts/run_overnight.sh index 4db704b..87576c1 100755 --- a/scripts/run_overnight.sh +++ b/scripts/run_overnight.sh @@ -8,6 +8,8 @@ CHECK_INTERVAL="${CHECK_INTERVAL:-30}" TMUX_AUTO="${TMUX_AUTO:-true}" TMUX_ATTACH="${TMUX_ATTACH:-true}" TMUX_SESSION_PREFIX="${TMUX_SESSION_PREFIX:-ouroboros_overnight}" +STARTUP_GRACE_SEC="${STARTUP_GRACE_SEC:-3}" +dashboard_port="${DASHBOARD_PORT:-8080}" if [ -z "${APP_CMD:-}" ]; then if [ -x ".venv/bin/python" ]; then @@ -21,8 +23,6 @@ if [ -z "${APP_CMD:-}" ]; then exit 1 fi - dashboard_port="${DASHBOARD_PORT:-8080}" - APP_CMD="DASHBOARD_PORT=$dashboard_port $PYTHON_BIN -m src.main --mode=live --dashboard" fi @@ -34,6 +34,11 @@ WATCHDOG_LOG="$LOG_DIR/watchdog_${timestamp}.log" PID_FILE="$LOG_DIR/app.pid" WATCHDOG_PID_FILE="$LOG_DIR/watchdog.pid" +is_port_in_use() { + local port="$1" + ss -ltn 2>/dev/null | rg -q ":${port}\\b" +} + if [ -f "$PID_FILE" ]; then old_pid="$(cat "$PID_FILE" || true)" if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then @@ -43,7 +48,12 @@ if [ -f "$PID_FILE" ]; then fi echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] starting: $APP_CMD" | tee -a "$RUN_LOG" -nohup bash -lc "$APP_CMD" >>"$RUN_LOG" 2>&1 & +if [[ "$APP_CMD" == *"--dashboard"* ]] && is_port_in_use "$dashboard_port"; then + echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: dashboard port ${dashboard_port} already in use" | tee -a "$RUN_LOG" + exit 1 +fi + +nohup bash -lc "exec $APP_CMD" >>"$RUN_LOG" 2>&1 & 
app_pid=$! echo "$app_pid" > "$PID_FILE" @@ -54,6 +64,18 @@ nohup env PID_FILE="$PID_FILE" LOG_FILE="$WATCHDOG_LOG" CHECK_INTERVAL="$CHECK_I watchdog_pid=$! echo "$watchdog_pid" > "$WATCHDOG_PID_FILE" +sleep "$STARTUP_GRACE_SEC" +if ! kill -0 "$app_pid" 2>/dev/null; then + echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: app process exited early (pid=$app_pid)" | tee -a "$RUN_LOG" + tail -n 20 "$RUN_LOG" || true + exit 1 +fi +if ! kill -0 "$watchdog_pid" 2>/dev/null; then + echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: watchdog exited early (pid=$watchdog_pid)" | tee -a "$WATCHDOG_LOG" + tail -n 20 "$WATCHDOG_LOG" || true + exit 1 +fi + cat </dev/null @@ -31,6 +33,11 @@ check_signal() { return 1 } +find_live_pids() { + # Detect live-mode process even when run_overnight pid files are absent. + pgrep -af "[s]rc.main --mode=live" 2>/dev/null | awk '{print $1}' | tr '\n' ',' | sed 's/,$//' +} + check_forbidden() { local name="$1" local pattern="$2" @@ -47,39 +54,63 @@ check_forbidden() { log "[INFO] runtime verify monitor started interval=${INTERVAL_SEC}s max_hours=${MAX_HOURS} policy_tz=${POLICY_TZ}" while true; do + loops=$((loops + 1)) now=$(date +%s) if [ "$now" -ge "$END_TS" ]; then log "[INFO] monitor completed (time window reached)" exit 0 fi + if [ "$MAX_LOOPS" -gt 0 ] && [ "$loops" -gt "$MAX_LOOPS" ]; then + log "[INFO] monitor completed (max loops reached)" + exit 0 + fi latest_run="$(ls -t "$LOG_DIR"/run_*.log 2>/dev/null | head -n1 || true)" - if [ -z "$latest_run" ]; then - log "[ANOMALY] no run log found" - sleep "$INTERVAL_SEC" - continue - fi # Basic liveness hints. 
app_pid="$(cat "$LOG_DIR/app.pid" 2>/dev/null || true)" wd_pid="$(cat "$LOG_DIR/watchdog.pid" 2>/dev/null || true)" + live_pids="$(find_live_pids)" app_alive=0 wd_alive=0 port_alive=0 [ -n "$app_pid" ] && kill -0 "$app_pid" 2>/dev/null && app_alive=1 [ -n "$wd_pid" ] && kill -0 "$wd_pid" 2>/dev/null && wd_alive=1 + if [ "$app_alive" -eq 0 ] && [ -n "$live_pids" ]; then + app_alive=1 + fi ss -ltnp 2>/dev/null | rg -q ':8080' && port_alive=1 - log "[HEARTBEAT] run_log=$latest_run app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive" + log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive live_pids=${live_pids:-none}" + + if [ -z "$latest_run" ]; then + log "[ANOMALY] no run log found" + fi # Coverage matrix rows (session paths and policy gate evidence). not_observed=0 - check_signal "LIVE_MODE" "Mode: live" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "US_DAY_PATH" "US_DAY|session=US_DAY|Processing market: .*NASDAQ|Processing market: .*NYSE|Processing market: .*AMEX" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "US_AFTER_PATH" "US_AFTER|session=US_AFTER" "$latest_run" || not_observed=$((not_observed+1)) - check_signal "ORDER_POLICY_SESSION" "Order policy rejected .*\\[session=" "$latest_run" || not_observed=$((not_observed+1)) + if [ "$app_alive" -eq 1 ]; then + log "[COVERAGE] LIVE_MODE=PASS source=process_liveness" + else + if [ -n "$latest_run" ]; then + check_signal "LIVE_MODE" "Mode: live" "$latest_run" || not_observed=$((not_observed+1)) + else + log "[COVERAGE] LIVE_MODE=NOT_OBSERVED reason=no_run_log_no_live_pid" + 
not_observed=$((not_observed+1)) + fi + fi + if [ -n "$latest_run" ]; then + check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1)) + check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1)) + check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1)) + check_signal "US_DAY_PATH" "US_DAY|session=US_DAY|Processing market: .*NASDAQ|Processing market: .*NYSE|Processing market: .*AMEX" "$latest_run" || not_observed=$((not_observed+1)) + check_signal "US_AFTER_PATH" "US_AFTER|session=US_AFTER" "$latest_run" || not_observed=$((not_observed+1)) + check_signal "ORDER_POLICY_SESSION" "Order policy rejected .*\\[session=" "$latest_run" || not_observed=$((not_observed+1)) + else + for missing in KR_LOOP NXT_PATH US_PRE_PATH US_DAY_PATH US_AFTER_PATH ORDER_POLICY_SESSION; do + log "[COVERAGE] ${missing}=NOT_OBSERVED reason=no_run_log" + not_observed=$((not_observed+1)) + done + fi if [ "$not_observed" -gt 0 ]; then log "[ANOMALY] coverage_not_observed=$not_observed (treat as FAIL)" @@ -97,9 +128,13 @@ while true; do if [ "$is_weekend" -eq 1 ]; then # Weekend policy: KR regular session loop must never appear. 
- check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \ - "Market session active: KR|session=KRX_REG|Processing market: Korea Exchange" \ - "$latest_run" || forbidden_hits=$((forbidden_hits+1)) + if [ -n "$latest_run" ]; then + check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \ + "Market session active: KR|session=KRX_REG|Processing market: Korea Exchange" \ + "$latest_run" || forbidden_hits=$((forbidden_hits+1)) + else + log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=no_run_log" + fi else log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=weekday" fi diff --git a/tests/test_runtime_overnight_scripts.py b/tests/test_runtime_overnight_scripts.py new file mode 100644 index 0000000..81b3195 --- /dev/null +++ b/tests/test_runtime_overnight_scripts.py @@ -0,0 +1,124 @@ +from __future__ import annotations + +import os +import signal +import socket +import subprocess +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +RUN_OVERNIGHT = REPO_ROOT / "scripts" / "run_overnight.sh" +RUNTIME_MONITOR = REPO_ROOT / "scripts" / "runtime_verify_monitor.sh" + + +def _latest_runtime_log(log_dir: Path) -> str: + logs = sorted(log_dir.glob("runtime_verify_*.log")) + assert logs, "runtime monitor did not produce log output" + return logs[-1].read_text(encoding="utf-8") + + +def test_runtime_verify_monitor_detects_live_process_without_pid_files(tmp_path: Path) -> None: + log_dir = tmp_path / "overnight" + log_dir.mkdir(parents=True, exist_ok=True) + + fake_live = subprocess.Popen( + ["bash", "-lc", 'exec -a "src.main --mode=live" sleep 10'], + cwd=REPO_ROOT, + ) + try: + env = os.environ.copy() + env.update( + { + "ROOT_DIR": str(REPO_ROOT), + "LOG_DIR": str(log_dir), + "INTERVAL_SEC": "1", + "MAX_HOURS": "1", + "MAX_LOOPS": "1", + "POLICY_TZ": "UTC", + } + ) + completed = subprocess.run( + ["bash", str(RUNTIME_MONITOR)], + cwd=REPO_ROOT, + env=env, + capture_output=True, + text=True, + check=False, + ) + assert completed.returncode == 0, completed.stderr + + 
log_text = _latest_runtime_log(log_dir) + assert "app_alive=1" in log_text + assert "[COVERAGE] LIVE_MODE=PASS source=process_liveness" in log_text + finally: + fake_live.terminate() + fake_live.wait(timeout=5) + + +def test_run_overnight_fails_fast_when_dashboard_port_in_use(tmp_path: Path) -> None: + log_dir = tmp_path / "overnight" + log_dir.mkdir(parents=True, exist_ok=True) + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.bind(("127.0.0.1", 0)) + sock.listen(1) + port = sock.getsockname()[1] + try: + env = os.environ.copy() + env.update( + { + "LOG_DIR": str(log_dir), + "TMUX_AUTO": "false", + "DASHBOARD_PORT": str(port), + } + ) + completed = subprocess.run( + ["bash", str(RUN_OVERNIGHT)], + cwd=REPO_ROOT, + env=env, + capture_output=True, + text=True, + check=False, + ) + assert completed.returncode != 0 + output = f"{completed.stdout}\n{completed.stderr}" + assert "already in use" in output + finally: + sock.close() + + +def test_run_overnight_writes_live_pid_and_watchdog_pid(tmp_path: Path) -> None: + log_dir = tmp_path / "overnight" + log_dir.mkdir(parents=True, exist_ok=True) + + env = os.environ.copy() + env.update( + { + "LOG_DIR": str(log_dir), + "TMUX_AUTO": "false", + "STARTUP_GRACE_SEC": "1", + "CHECK_INTERVAL": "2", + "APP_CMD": "sleep 10", + } + ) + completed = subprocess.run( + ["bash", str(RUN_OVERNIGHT)], + cwd=REPO_ROOT, + env=env, + capture_output=True, + text=True, + check=False, + ) + assert completed.returncode == 0, f"{completed.stdout}\n{completed.stderr}" + + app_pid = int((log_dir / "app.pid").read_text(encoding="utf-8").strip()) + watchdog_pid = int((log_dir / "watchdog.pid").read_text(encoding="utf-8").strip()) + + os.kill(app_pid, 0) + os.kill(watchdog_pid, 0) + + for pid in (watchdog_pid, app_pid): + try: + os.kill(pid, signal.SIGTERM) + except ProcessLookupError: + pass From 370ee8cc85083b93f9281947256e33b556a9253c Mon Sep 17 00:00:00 2001 From: agentson Date: Wed, 4 Mar 2026 02:07:52 +0900 Subject: [PATCH 2/6] 
fix: make overnight startup portable in CI environments --- scripts/run_overnight.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/run_overnight.sh b/scripts/run_overnight.sh index 87576c1..b5d6902 100755 --- a/scripts/run_overnight.sh +++ b/scripts/run_overnight.sh @@ -36,7 +36,7 @@ WATCHDOG_PID_FILE="$LOG_DIR/watchdog.pid" is_port_in_use() { local port="$1" - ss -ltn 2>/dev/null | rg -q ":${port}\\b" + ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]" } if [ -f "$PID_FILE" ]; then @@ -53,7 +53,8 @@ if [[ "$APP_CMD" == *"--dashboard"* ]] && is_port_in_use "$dashboard_port"; then exit 1 fi -nohup bash -lc "exec $APP_CMD" >>"$RUN_LOG" 2>&1 & +# `env` keeps inline VAR=value prefixes in APP_CMD working with `exec`. +nohup bash -lc "exec env $APP_CMD" >>"$RUN_LOG" 2>&1 & app_pid=$! echo "$app_pid" > "$PID_FILE" From 3cde8779fa40d59b0cf40b1d2fa6c08a2d33c0a0 Mon Sep 17 00:00:00 2001 From: agentson Date: Wed, 4 Mar 2026 02:23:43 +0900 Subject: [PATCH 3/6] fix: address PR #404 review feedback --- scripts/run_overnight.sh | 18 +++++++++++++++-- scripts/runtime_verify_monitor.sh | 23 ++++++++++++++++----- tests/test_runtime_overnight_scripts.py | 27 +++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 7 deletions(-) diff --git a/scripts/run_overnight.sh b/scripts/run_overnight.sh index b5d6902..e71a7b0 100755 --- a/scripts/run_overnight.sh +++ b/scripts/run_overnight.sh @@ -36,7 +36,20 @@ WATCHDOG_PID_FILE="$LOG_DIR/watchdog.pid" is_port_in_use() { local port="$1" - ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]" + if command -v ss >/dev/null 2>&1; then + ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]" + return $? + fi + if command -v lsof >/dev/null 2>&1; then + lsof -nP -iTCP:"$port" -sTCP:LISTEN >/dev/null 2>&1 + return $? + fi + if command -v netstat >/dev/null 2>&1; then + netstat -ltn 2>/dev/null | grep -Eq "[:.]${port}[[:space:]]" + return $? + fi + # No supported socket inspection command found. 
+ return 1 } if [ -f "$PID_FILE" ]; then @@ -53,7 +66,8 @@ if [[ "$APP_CMD" == *"--dashboard"* ]] && is_port_in_use "$dashboard_port"; then exit 1 fi -# `env` keeps inline VAR=value prefixes in APP_CMD working with `exec`. +# `APP_CMD` is treated as a shell command string. +# If executable paths include spaces, they must be quoted inside APP_CMD. nohup bash -lc "exec env $APP_CMD" >>"$RUN_LOG" 2>&1 & app_pid=$! echo "$app_pid" > "$PID_FILE" diff --git a/scripts/runtime_verify_monitor.sh b/scripts/runtime_verify_monitor.sh index e5a78a1..5d7aad2 100755 --- a/scripts/runtime_verify_monitor.sh +++ b/scripts/runtime_verify_monitor.sh @@ -9,6 +9,7 @@ INTERVAL_SEC="${INTERVAL_SEC:-60}" MAX_HOURS="${MAX_HOURS:-24}" MAX_LOOPS="${MAX_LOOPS:-0}" POLICY_TZ="${POLICY_TZ:-Asia/Seoul}" +DASHBOARD_PORT="${DASHBOARD_PORT:-8080}" cd "$ROOT_DIR" @@ -79,10 +80,16 @@ while true; do if [ "$app_alive" -eq 0 ] && [ -n "$live_pids" ]; then app_alive=1 fi - ss -ltnp 2>/dev/null | rg -q ':8080' && port_alive=1 - log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive live_pids=${live_pids:-none}" + ss -ltnp 2>/dev/null | rg -q ":${DASHBOARD_PORT}\\b" && port_alive=1 + log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port=${DASHBOARD_PORT} alive=$port_alive live_pids=${live_pids:-none}" - if [ -z "$latest_run" ]; then + defer_log_checks=0 + if [ -z "$latest_run" ] && [ "$app_alive" -eq 1 ]; then + defer_log_checks=1 + log "[INFO] run log not yet available; defer log-based coverage checks" + fi + + if [ -z "$latest_run" ] && [ "$defer_log_checks" -eq 0 ]; then log "[ANOMALY] no run log found" fi @@ -98,7 +105,11 @@ while true; do not_observed=$((not_observed+1)) fi fi - if [ -n "$latest_run" ]; then + if [ "$defer_log_checks" -eq 1 ]; then + for deferred in KR_LOOP NXT_PATH US_PRE_PATH US_DAY_PATH US_AFTER_PATH ORDER_POLICY_SESSION; do + log "[COVERAGE] ${deferred}=DEFERRED 
reason=no_run_log_process_alive" + done + elif [ -n "$latest_run" ]; then check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1)) check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1)) check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1)) @@ -126,7 +137,9 @@ while true; do is_weekend=1 fi - if [ "$is_weekend" -eq 1 ]; then + if [ "$defer_log_checks" -eq 1 ]; then + log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=no_run_log_process_alive" + elif [ "$is_weekend" -eq 1 ]; then # Weekend policy: KR regular session loop must never appear. if [ -n "$latest_run" ]; then check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \ diff --git a/tests/test_runtime_overnight_scripts.py b/tests/test_runtime_overnight_scripts.py index 81b3195..f0ec038 100644 --- a/tests/test_runtime_overnight_scripts.py +++ b/tests/test_runtime_overnight_scripts.py @@ -50,6 +50,7 @@ def test_runtime_verify_monitor_detects_live_process_without_pid_files(tmp_path: log_text = _latest_runtime_log(log_dir) assert "app_alive=1" in log_text assert "[COVERAGE] LIVE_MODE=PASS source=process_liveness" in log_text + assert "[ANOMALY]" not in log_text finally: fake_live.terminate() fake_live.wait(timeout=5) @@ -122,3 +123,29 @@ def test_run_overnight_writes_live_pid_and_watchdog_pid(tmp_path: Path) -> None: os.kill(pid, signal.SIGTERM) except ProcessLookupError: pass + + +def test_run_overnight_fails_when_process_exits_before_grace_period(tmp_path: Path) -> None: + log_dir = tmp_path / "overnight" + log_dir.mkdir(parents=True, exist_ok=True) + + env = os.environ.copy() + env.update( + { + "LOG_DIR": str(log_dir), + "TMUX_AUTO": "false", + "STARTUP_GRACE_SEC": "1", + "APP_CMD": "false", + } + ) + completed = subprocess.run( + ["bash", str(RUN_OVERNIGHT)], + cwd=REPO_ROOT, + env=env, + capture_output=True, + text=True, + check=False, + ) + assert completed.returncode 
!= 0 + output = f"{completed.stdout}\n{completed.stderr}" + assert "startup failed:" in output From c412412f7be4b687187ee945666407c9490e27b6 Mon Sep 17 00:00:00 2001 From: agentson Date: Wed, 4 Mar 2026 02:29:54 +0900 Subject: [PATCH 4/6] fix: address second-round review findings on PR #404 --- scripts/run_overnight.sh | 18 ++++++++++++++---- scripts/runtime_verify_monitor.sh | 20 +++++++++++++++++++- tests/test_runtime_overnight_scripts.py | 8 ++++++++ 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/scripts/run_overnight.sh b/scripts/run_overnight.sh index e71a7b0..09458a7 100755 --- a/scripts/run_overnight.sh +++ b/scripts/run_overnight.sh @@ -12,6 +12,7 @@ STARTUP_GRACE_SEC="${STARTUP_GRACE_SEC:-3}" dashboard_port="${DASHBOARD_PORT:-8080}" if [ -z "${APP_CMD:-}" ]; then + USE_DEFAULT_APP_CMD="true" if [ -x ".venv/bin/python" ]; then PYTHON_BIN=".venv/bin/python" elif command -v python3 >/dev/null 2>&1; then @@ -23,7 +24,9 @@ if [ -z "${APP_CMD:-}" ]; then exit 1 fi - APP_CMD="DASHBOARD_PORT=$dashboard_port $PYTHON_BIN -m src.main --mode=live --dashboard" + APP_CMD="$PYTHON_BIN -m src.main --mode=live --dashboard" +else + USE_DEFAULT_APP_CMD="false" fi mkdir -p "$LOG_DIR" @@ -66,9 +69,14 @@ if [[ "$APP_CMD" == *"--dashboard"* ]] && is_port_in_use "$dashboard_port"; then exit 1 fi -# `APP_CMD` is treated as a shell command string. -# If executable paths include spaces, they must be quoted inside APP_CMD. -nohup bash -lc "exec env $APP_CMD" >>"$RUN_LOG" 2>&1 & +if [ "$USE_DEFAULT_APP_CMD" = "true" ]; then + # Default path avoids shell word-splitting on executable paths. + nohup env DASHBOARD_PORT="$dashboard_port" "$PYTHON_BIN" -m src.main --mode=live --dashboard >>"$RUN_LOG" 2>&1 & +else + # Custom APP_CMD is treated as a shell command string. + # If executable paths include spaces, they must be quoted inside APP_CMD. + nohup bash -lc "exec env $APP_CMD" >>"$RUN_LOG" 2>&1 & +fi app_pid=$! 
echo "$app_pid" > "$PID_FILE" @@ -82,11 +90,13 @@ echo "$watchdog_pid" > "$WATCHDOG_PID_FILE" sleep "$STARTUP_GRACE_SEC" if ! kill -0 "$app_pid" 2>/dev/null; then echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: app process exited early (pid=$app_pid)" | tee -a "$RUN_LOG" + [ -n "${watchdog_pid:-}" ] && kill "$watchdog_pid" 2>/dev/null || true tail -n 20 "$RUN_LOG" || true exit 1 fi if ! kill -0 "$watchdog_pid" 2>/dev/null; then echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: watchdog exited early (pid=$watchdog_pid)" | tee -a "$WATCHDOG_LOG" + kill "$app_pid" 2>/dev/null || true tail -n 20 "$WATCHDOG_LOG" || true exit 1 fi diff --git a/scripts/runtime_verify_monitor.sh b/scripts/runtime_verify_monitor.sh index 5d7aad2..5ad92eb 100755 --- a/scripts/runtime_verify_monitor.sh +++ b/scripts/runtime_verify_monitor.sh @@ -52,6 +52,24 @@ check_forbidden() { return 0 } +is_port_listening() { + local port="$1" + + if command -v ss >/dev/null 2>&1; then + ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]" + return $? + fi + if command -v lsof >/dev/null 2>&1; then + lsof -nP -iTCP:"$port" -sTCP:LISTEN >/dev/null 2>&1 + return $? + fi + if command -v netstat >/dev/null 2>&1; then + netstat -ltn 2>/dev/null | grep -Eq "[:.]${port}[[:space:]]" + return $? 
+ fi + return 1 +} + log "[INFO] runtime verify monitor started interval=${INTERVAL_SEC}s max_hours=${MAX_HOURS} policy_tz=${POLICY_TZ}" while true; do @@ -80,7 +98,7 @@ while true; do if [ "$app_alive" -eq 0 ] && [ -n "$live_pids" ]; then app_alive=1 fi - ss -ltnp 2>/dev/null | rg -q ":${DASHBOARD_PORT}\\b" && port_alive=1 + is_port_listening "$DASHBOARD_PORT" && port_alive=1 log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port=${DASHBOARD_PORT} alive=$port_alive live_pids=${live_pids:-none}" defer_log_checks=0 diff --git a/tests/test_runtime_overnight_scripts.py b/tests/test_runtime_overnight_scripts.py index f0ec038..3cad774 100644 --- a/tests/test_runtime_overnight_scripts.py +++ b/tests/test_runtime_overnight_scripts.py @@ -6,6 +6,8 @@ import socket import subprocess from pathlib import Path +import pytest + REPO_ROOT = Path(__file__).resolve().parent.parent RUN_OVERNIGHT = REPO_ROOT / "scripts" / "run_overnight.sh" RUNTIME_MONITOR = REPO_ROOT / "scripts" / "runtime_verify_monitor.sh" @@ -149,3 +151,9 @@ def test_run_overnight_fails_when_process_exits_before_grace_period(tmp_path: Pa assert completed.returncode != 0 output = f"{completed.stdout}\n{completed.stderr}" assert "startup failed:" in output + + watchdog_pid_file = log_dir / "watchdog.pid" + if watchdog_pid_file.exists(): + watchdog_pid = int(watchdog_pid_file.read_text(encoding="utf-8").strip()) + with pytest.raises(ProcessLookupError): + os.kill(watchdog_pid, 0) From dc0775cbc6cebaa1907d6936d7acea7f468f41cd Mon Sep 17 00:00:00 2001 From: agentson Date: Wed, 4 Mar 2026 02:36:24 +0900 Subject: [PATCH 5/6] fix: add safer custom command path for run_overnight --- scripts/run_overnight.sh | 29 +++++++++++++++++++++++-- tests/test_runtime_overnight_scripts.py | 5 +++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/scripts/run_overnight.sh b/scripts/run_overnight.sh index 09458a7..9007806 100755 --- a/scripts/run_overnight.sh +++ 
b/scripts/run_overnight.sh @@ -10,9 +10,20 @@ TMUX_ATTACH="${TMUX_ATTACH:-true}" TMUX_SESSION_PREFIX="${TMUX_SESSION_PREFIX:-ouroboros_overnight}" STARTUP_GRACE_SEC="${STARTUP_GRACE_SEC:-3}" dashboard_port="${DASHBOARD_PORT:-8080}" +APP_CMD_BIN="${APP_CMD_BIN:-}" +APP_CMD_ARGS="${APP_CMD_ARGS:-}" +RUNS_DASHBOARD="false" -if [ -z "${APP_CMD:-}" ]; then +if [ -n "$APP_CMD_BIN" ]; then + USE_DEFAULT_APP_CMD="false" + USE_SAFE_CUSTOM_APP_CMD="true" + APP_CMD="${APP_CMD_BIN} ${APP_CMD_ARGS}" + if [[ " $APP_CMD_ARGS " == *" --dashboard "* ]]; then + RUNS_DASHBOARD="true" + fi +elif [ -z "${APP_CMD:-}" ]; then USE_DEFAULT_APP_CMD="true" + USE_SAFE_CUSTOM_APP_CMD="false" if [ -x ".venv/bin/python" ]; then PYTHON_BIN=".venv/bin/python" elif command -v python3 >/dev/null 2>&1; then @@ -25,8 +36,13 @@ if [ -z "${APP_CMD:-}" ]; then fi APP_CMD="$PYTHON_BIN -m src.main --mode=live --dashboard" + RUNS_DASHBOARD="true" else USE_DEFAULT_APP_CMD="false" + USE_SAFE_CUSTOM_APP_CMD="false" + if [[ "$APP_CMD" == *"--dashboard"* ]]; then + RUNS_DASHBOARD="true" + fi fi mkdir -p "$LOG_DIR" @@ -64,7 +80,7 @@ if [ -f "$PID_FILE" ]; then fi echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] starting: $APP_CMD" | tee -a "$RUN_LOG" -if [[ "$APP_CMD" == *"--dashboard"* ]] && is_port_in_use "$dashboard_port"; then +if [ "$RUNS_DASHBOARD" = "true" ] && is_port_in_use "$dashboard_port"; then echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: dashboard port ${dashboard_port} already in use" | tee -a "$RUN_LOG" exit 1 fi @@ -72,6 +88,15 @@ fi if [ "$USE_DEFAULT_APP_CMD" = "true" ]; then # Default path avoids shell word-splitting on executable paths. nohup env DASHBOARD_PORT="$dashboard_port" "$PYTHON_BIN" -m src.main --mode=live --dashboard >>"$RUN_LOG" 2>&1 & +elif [ "$USE_SAFE_CUSTOM_APP_CMD" = "true" ]; then + # Safer custom path: executable path is handled as a single token. 
+ if [ -n "$APP_CMD_ARGS" ]; then + # shellcheck disable=SC2206 + app_args=( $APP_CMD_ARGS ) + nohup env DASHBOARD_PORT="$dashboard_port" "$APP_CMD_BIN" "${app_args[@]}" >>"$RUN_LOG" 2>&1 & + else + nohup env DASHBOARD_PORT="$dashboard_port" "$APP_CMD_BIN" >>"$RUN_LOG" 2>&1 & + fi else # Custom APP_CMD is treated as a shell command string. # If executable paths include spaces, they must be quoted inside APP_CMD. diff --git a/tests/test_runtime_overnight_scripts.py b/tests/test_runtime_overnight_scripts.py index 3cad774..a5b6182 100644 --- a/tests/test_runtime_overnight_scripts.py +++ b/tests/test_runtime_overnight_scripts.py @@ -101,7 +101,8 @@ def test_run_overnight_writes_live_pid_and_watchdog_pid(tmp_path: Path) -> None: "TMUX_AUTO": "false", "STARTUP_GRACE_SEC": "1", "CHECK_INTERVAL": "2", - "APP_CMD": "sleep 10", + "APP_CMD_BIN": "sleep", + "APP_CMD_ARGS": "10", } ) completed = subprocess.run( @@ -137,7 +138,7 @@ def test_run_overnight_fails_when_process_exits_before_grace_period(tmp_path: Pa "LOG_DIR": str(log_dir), "TMUX_AUTO": "false", "STARTUP_GRACE_SEC": "1", - "APP_CMD": "false", + "APP_CMD_BIN": "false", } ) completed = subprocess.run( From bcbbf80d1619d3deb2a3e926464ea8852cfabfd3 Mon Sep 17 00:00:00 2001 From: agentson Date: Wed, 4 Mar 2026 02:43:32 +0900 Subject: [PATCH 6/6] docs: clarify APP_CMD legacy and APP_CMD_ARGS contract --- scripts/run_overnight.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/run_overnight.sh b/scripts/run_overnight.sh index 9007806..af88e23 100755 --- a/scripts/run_overnight.sh +++ b/scripts/run_overnight.sh @@ -14,6 +14,16 @@ APP_CMD_BIN="${APP_CMD_BIN:-}" APP_CMD_ARGS="${APP_CMD_ARGS:-}" RUNS_DASHBOARD="false" +# Custom override contract: +# 1) Preferred: APP_CMD_BIN + APP_CMD_ARGS +# - APP_CMD_BIN is treated as a single executable token. +# - APP_CMD_ARGS is split into words by unquoted shell expansion; quotes and
+# escapes inside this variable are NOT re-parsed and reach the program as-is. +# 2) Legacy fallback: APP_CMD (raw shell command string) +# - This path remains for backward compatibility. +# - When APP_CMD includes --dashboard, the caller should put an explicit +# DASHBOARD_PORT assignment in APP_CMD if a non-default port is required. + if [ -n "$APP_CMD_BIN" ]; then USE_DEFAULT_APP_CMD="false" USE_SAFE_CUSTOM_APP_CMD="true" @@ -100,6 +110,7 @@ elif [ "$USE_SAFE_CUSTOM_APP_CMD" = "true" ]; then else # Custom APP_CMD is treated as a shell command string. # If executable paths include spaces, they must be quoted inside APP_CMD. + # Legacy compatibility path: caller owns quoting and env var injection. nohup bash -lc "exec env $APP_CMD" >>"$RUN_LOG" 2>&1 & fi app_pid=$!