From 528e17a29c08d9d999d0359e04d7f60bd5f69840 Mon Sep 17 00:00:00 2001
From: agentson <agentson@localhost>
Date: Wed, 4 Mar 2026 02:04:13 +0900
Subject: [PATCH 1/6] fix: stabilize overnight startup and monitor live
 fallback (#396 #397)

---
 scripts/run_overnight.sh                |  28 +++++-
 scripts/runtime_verify_monitor.sh       |  67 ++++++++++---
 tests/test_runtime_overnight_scripts.py | 124 ++++++++++++++++++++++++
 3 files changed, 200 insertions(+), 19 deletions(-)
 create mode 100644 tests/test_runtime_overnight_scripts.py

diff --git a/scripts/run_overnight.sh b/scripts/run_overnight.sh
index 4db704b..87576c1 100755
--- a/scripts/run_overnight.sh
+++ b/scripts/run_overnight.sh
@@ -8,6 +8,8 @@ CHECK_INTERVAL="${CHECK_INTERVAL:-30}"
 TMUX_AUTO="${TMUX_AUTO:-true}"
 TMUX_ATTACH="${TMUX_ATTACH:-true}"
 TMUX_SESSION_PREFIX="${TMUX_SESSION_PREFIX:-ouroboros_overnight}"
+STARTUP_GRACE_SEC="${STARTUP_GRACE_SEC:-3}"
+dashboard_port="${DASHBOARD_PORT:-8080}"
 
 if [ -z "${APP_CMD:-}" ]; then
     if [ -x ".venv/bin/python" ]; then
@@ -21,8 +23,6 @@ if [ -z "${APP_CMD:-}" ]; then
         exit 1
     fi
 
-    dashboard_port="${DASHBOARD_PORT:-8080}"
-
     APP_CMD="DASHBOARD_PORT=$dashboard_port $PYTHON_BIN -m src.main --mode=live --dashboard"
 fi
 
@@ -34,6 +34,11 @@ WATCHDOG_LOG="$LOG_DIR/watchdog_${timestamp}.log"
 PID_FILE="$LOG_DIR/app.pid"
 WATCHDOG_PID_FILE="$LOG_DIR/watchdog.pid"
 
+is_port_in_use() {
+    local port="$1"
+    ss -ltn 2>/dev/null | rg -q ":${port}\\b"
+}
+
 if [ -f "$PID_FILE" ]; then
     old_pid="$(cat "$PID_FILE" || true)"
     if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then
@@ -43,7 +48,12 @@ if [ -f "$PID_FILE" ]; then
 fi
 
 echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] starting: $APP_CMD" | tee -a "$RUN_LOG"
-nohup bash -lc "$APP_CMD" >>"$RUN_LOG" 2>&1 &
+if [[ "$APP_CMD" == *"--dashboard"* ]] && is_port_in_use "$dashboard_port"; then
+    echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: dashboard port ${dashboard_port} already in use" | tee -a "$RUN_LOG"
+    exit 1
+fi
+
+nohup bash -lc "exec $APP_CMD" >>"$RUN_LOG" 2>&1 &
 app_pid=$!
 echo "$app_pid" > "$PID_FILE"
 
@@ -54,6 +64,18 @@ nohup env PID_FILE="$PID_FILE" LOG_FILE="$WATCHDOG_LOG" CHECK_INTERVAL="$CHECK_I
 watchdog_pid=$!
 echo "$watchdog_pid" > "$WATCHDOG_PID_FILE"
 
+sleep "$STARTUP_GRACE_SEC"
+if ! kill -0 "$app_pid" 2>/dev/null; then
+    echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: app process exited early (pid=$app_pid)" | tee -a "$RUN_LOG"
+    tail -n 20 "$RUN_LOG" || true
+    exit 1
+fi
+if ! kill -0 "$watchdog_pid" 2>/dev/null; then
+    echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: watchdog exited early (pid=$watchdog_pid)" | tee -a "$WATCHDOG_LOG"
+    tail -n 20 "$WATCHDOG_LOG" || true
+    exit 1
+fi
+
 cat <<EOF
 시작 완료
 - app pid: $app_pid
diff --git a/scripts/runtime_verify_monitor.sh b/scripts/runtime_verify_monitor.sh
index c2fbd58..e5a78a1 100755
--- a/scripts/runtime_verify_monitor.sh
+++ b/scripts/runtime_verify_monitor.sh
@@ -7,12 +7,14 @@ ROOT_DIR="${ROOT_DIR:-/home/agentson/repos/The-Ouroboros}"
 LOG_DIR="${LOG_DIR:-$ROOT_DIR/data/overnight}"
 INTERVAL_SEC="${INTERVAL_SEC:-60}"
 MAX_HOURS="${MAX_HOURS:-24}"
+MAX_LOOPS="${MAX_LOOPS:-0}"
 POLICY_TZ="${POLICY_TZ:-Asia/Seoul}"
 
 cd "$ROOT_DIR"
 
 OUT_LOG="$LOG_DIR/runtime_verify_$(date +%Y%m%d_%H%M%S).log"
 END_TS=$(( $(date +%s) + MAX_HOURS*3600 ))
+loops=0
 
 log() {
   printf '%s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$1" | tee -a "$OUT_LOG" >/dev/null
@@ -31,6 +33,11 @@ check_signal() {
   return 1
 }
 
+find_live_pids() {
+  # Detect live-mode process even when run_overnight pid files are absent.
+  pgrep -af "[s]rc.main --mode=live" 2>/dev/null | awk '{print $1}' | tr '\n' ',' | sed 's/,$//'
+}
+
 check_forbidden() {
   local name="$1"
   local pattern="$2"
@@ -47,39 +54,63 @@ check_forbidden() {
 log "[INFO] runtime verify monitor started interval=${INTERVAL_SEC}s max_hours=${MAX_HOURS} policy_tz=${POLICY_TZ}"
 
 while true; do
+  loops=$((loops + 1))
   now=$(date +%s)
   if [ "$now" -ge "$END_TS" ]; then
     log "[INFO] monitor completed (time window reached)"
     exit 0
   fi
+  if [ "$MAX_LOOPS" -gt 0 ] && [ "$loops" -gt "$MAX_LOOPS" ]; then
+    log "[INFO] monitor completed (max loops reached)"
+    exit 0
+  fi
 
   latest_run="$(ls -t "$LOG_DIR"/run_*.log 2>/dev/null | head -n1 || true)"
-  if [ -z "$latest_run" ]; then
-    log "[ANOMALY] no run log found"
-    sleep "$INTERVAL_SEC"
-    continue
-  fi
 
   # Basic liveness hints.
   app_pid="$(cat "$LOG_DIR/app.pid" 2>/dev/null || true)"
   wd_pid="$(cat "$LOG_DIR/watchdog.pid" 2>/dev/null || true)"
+  live_pids="$(find_live_pids)"
   app_alive=0
   wd_alive=0
   port_alive=0
   [ -n "$app_pid" ] && kill -0 "$app_pid" 2>/dev/null && app_alive=1
   [ -n "$wd_pid" ] && kill -0 "$wd_pid" 2>/dev/null && wd_alive=1
+  if [ "$app_alive" -eq 0 ] && [ -n "$live_pids" ]; then
+    app_alive=1
+  fi
   ss -ltnp 2>/dev/null | rg -q ':8080' && port_alive=1
-  log "[HEARTBEAT] run_log=$latest_run app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive"
+  log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive live_pids=${live_pids:-none}"
+
+  if [ -z "$latest_run" ]; then
+    log "[ANOMALY] no run log found"
+  fi
 
   # Coverage matrix rows (session paths and policy gate evidence).
   not_observed=0
-  check_signal "LIVE_MODE" "Mode: live" "$latest_run" || not_observed=$((not_observed+1))
-  check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1))
-  check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1))
-  check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1))
-  check_signal "US_DAY_PATH" "US_DAY|session=US_DAY|Processing market: .*NASDAQ|Processing market: .*NYSE|Processing market: .*AMEX" "$latest_run" || not_observed=$((not_observed+1))
-  check_signal "US_AFTER_PATH" "US_AFTER|session=US_AFTER" "$latest_run" || not_observed=$((not_observed+1))
-  check_signal "ORDER_POLICY_SESSION" "Order policy rejected .*\\[session=" "$latest_run" || not_observed=$((not_observed+1))
+  if [ "$app_alive" -eq 1 ]; then
+    log "[COVERAGE] LIVE_MODE=PASS source=process_liveness"
+  else
+    if [ -n "$latest_run" ]; then
+      check_signal "LIVE_MODE" "Mode: live" "$latest_run" || not_observed=$((not_observed+1))
+    else
+      log "[COVERAGE] LIVE_MODE=NOT_OBSERVED reason=no_run_log_no_live_pid"
+      not_observed=$((not_observed+1))
+    fi
+  fi
+  if [ -n "$latest_run" ]; then
+    check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1))
+    check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1))
+    check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1))
+    check_signal "US_DAY_PATH" "US_DAY|session=US_DAY|Processing market: .*NASDAQ|Processing market: .*NYSE|Processing market: .*AMEX" "$latest_run" || not_observed=$((not_observed+1))
+    check_signal "US_AFTER_PATH" "US_AFTER|session=US_AFTER" "$latest_run" || not_observed=$((not_observed+1))
+    check_signal "ORDER_POLICY_SESSION" "Order policy rejected .*\\[session=" "$latest_run" || not_observed=$((not_observed+1))
+  else
+    for missing in KR_LOOP NXT_PATH US_PRE_PATH US_DAY_PATH US_AFTER_PATH ORDER_POLICY_SESSION; do
+      log "[COVERAGE] ${missing}=NOT_OBSERVED reason=no_run_log"
+      not_observed=$((not_observed+1))
+    done
+  fi
 
   if [ "$not_observed" -gt 0 ]; then
     log "[ANOMALY] coverage_not_observed=$not_observed (treat as FAIL)"
@@ -97,9 +128,13 @@ while true; do
 
   if [ "$is_weekend" -eq 1 ]; then
     # Weekend policy: KR regular session loop must never appear.
-    check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \
-      "Market session active: KR|session=KRX_REG|Processing market: Korea Exchange" \
-      "$latest_run" || forbidden_hits=$((forbidden_hits+1))
+    if [ -n "$latest_run" ]; then
+      check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \
+        "Market session active: KR|session=KRX_REG|Processing market: Korea Exchange" \
+        "$latest_run" || forbidden_hits=$((forbidden_hits+1))
+    else
+      log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=no_run_log"
+    fi
   else
     log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=weekday"
   fi
diff --git a/tests/test_runtime_overnight_scripts.py b/tests/test_runtime_overnight_scripts.py
new file mode 100644
index 0000000..81b3195
--- /dev/null
+++ b/tests/test_runtime_overnight_scripts.py
@@ -0,0 +1,124 @@
+from __future__ import annotations
+
+import os
+import signal
+import socket
+import subprocess
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+RUN_OVERNIGHT = REPO_ROOT / "scripts" / "run_overnight.sh"
+RUNTIME_MONITOR = REPO_ROOT / "scripts" / "runtime_verify_monitor.sh"
+
+
+def _latest_runtime_log(log_dir: Path) -> str:
+    logs = sorted(log_dir.glob("runtime_verify_*.log"))
+    assert logs, "runtime monitor did not produce log output"
+    return logs[-1].read_text(encoding="utf-8")
+
+
+def test_runtime_verify_monitor_detects_live_process_without_pid_files(tmp_path: Path) -> None:
+    log_dir = tmp_path / "overnight"
+    log_dir.mkdir(parents=True, exist_ok=True)
+
+    fake_live = subprocess.Popen(
+        ["bash", "-lc", 'exec -a "src.main --mode=live" sleep 10'],
+        cwd=REPO_ROOT,
+    )
+    try:
+        env = os.environ.copy()
+        env.update(
+            {
+                "ROOT_DIR": str(REPO_ROOT),
+                "LOG_DIR": str(log_dir),
+                "INTERVAL_SEC": "1",
+                "MAX_HOURS": "1",
+                "MAX_LOOPS": "1",
+                "POLICY_TZ": "UTC",
+            }
+        )
+        completed = subprocess.run(
+            ["bash", str(RUNTIME_MONITOR)],
+            cwd=REPO_ROOT,
+            env=env,
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        assert completed.returncode == 0, completed.stderr
+
+        log_text = _latest_runtime_log(log_dir)
+        assert "app_alive=1" in log_text
+        assert "[COVERAGE] LIVE_MODE=PASS source=process_liveness" in log_text
+    finally:
+        fake_live.terminate()
+        fake_live.wait(timeout=5)
+
+
+def test_run_overnight_fails_fast_when_dashboard_port_in_use(tmp_path: Path) -> None:
+    log_dir = tmp_path / "overnight"
+    log_dir.mkdir(parents=True, exist_ok=True)
+
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    sock.bind(("127.0.0.1", 0))
+    sock.listen(1)
+    port = sock.getsockname()[1]
+    try:
+        env = os.environ.copy()
+        env.update(
+            {
+                "LOG_DIR": str(log_dir),
+                "TMUX_AUTO": "false",
+                "DASHBOARD_PORT": str(port),
+            }
+        )
+        completed = subprocess.run(
+            ["bash", str(RUN_OVERNIGHT)],
+            cwd=REPO_ROOT,
+            env=env,
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        assert completed.returncode != 0
+        output = f"{completed.stdout}\n{completed.stderr}"
+        assert "already in use" in output
+    finally:
+        sock.close()
+
+
+def test_run_overnight_writes_live_pid_and_watchdog_pid(tmp_path: Path) -> None:
+    log_dir = tmp_path / "overnight"
+    log_dir.mkdir(parents=True, exist_ok=True)
+
+    env = os.environ.copy()
+    env.update(
+        {
+            "LOG_DIR": str(log_dir),
+            "TMUX_AUTO": "false",
+            "STARTUP_GRACE_SEC": "1",
+            "CHECK_INTERVAL": "2",
+            "APP_CMD": "sleep 10",
+        }
+    )
+    completed = subprocess.run(
+        ["bash", str(RUN_OVERNIGHT)],
+        cwd=REPO_ROOT,
+        env=env,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    assert completed.returncode == 0, f"{completed.stdout}\n{completed.stderr}"
+
+    app_pid = int((log_dir / "app.pid").read_text(encoding="utf-8").strip())
+    watchdog_pid = int((log_dir / "watchdog.pid").read_text(encoding="utf-8").strip())
+
+    os.kill(app_pid, 0)
+    os.kill(watchdog_pid, 0)
+
+    for pid in (watchdog_pid, app_pid):
+        try:
+            os.kill(pid, signal.SIGTERM)
+        except ProcessLookupError:
+            pass

From 370ee8cc85083b93f9281947256e33b556a9253c Mon Sep 17 00:00:00 2001
From: agentson <agentson@localhost>
Date: Wed, 4 Mar 2026 02:07:52 +0900
Subject: [PATCH 2/6] fix: make overnight startup portable in CI environments

---
 scripts/run_overnight.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/run_overnight.sh b/scripts/run_overnight.sh
index 87576c1..b5d6902 100755
--- a/scripts/run_overnight.sh
+++ b/scripts/run_overnight.sh
@@ -36,7 +36,7 @@ WATCHDOG_PID_FILE="$LOG_DIR/watchdog.pid"
 
 is_port_in_use() {
     local port="$1"
-    ss -ltn 2>/dev/null | rg -q ":${port}\\b"
+    ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]"
 }
 
 if [ -f "$PID_FILE" ]; then
@@ -53,7 +53,8 @@ if [[ "$APP_CMD" == *"--dashboard"* ]] && is_port_in_use "$dashboard_port"; then
     exit 1
 fi
 
-nohup bash -lc "exec $APP_CMD" >>"$RUN_LOG" 2>&1 &
+# `env` keeps inline VAR=value prefixes in APP_CMD working with `exec`.
+nohup bash -lc "exec env $APP_CMD" >>"$RUN_LOG" 2>&1 &
 app_pid=$!
 echo "$app_pid" > "$PID_FILE"
 

From 3cde8779fa40d59b0cf40b1d2fa6c08a2d33c0a0 Mon Sep 17 00:00:00 2001
From: agentson <agentson@localhost>
Date: Wed, 4 Mar 2026 02:23:43 +0900
Subject: [PATCH 3/6] fix: address PR #404 review feedback

---
 scripts/run_overnight.sh                | 18 +++++++++++++++--
 scripts/runtime_verify_monitor.sh       | 23 ++++++++++++++++-----
 tests/test_runtime_overnight_scripts.py | 27 +++++++++++++++++++++++++
 3 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/scripts/run_overnight.sh b/scripts/run_overnight.sh
index b5d6902..e71a7b0 100755
--- a/scripts/run_overnight.sh
+++ b/scripts/run_overnight.sh
@@ -36,7 +36,20 @@ WATCHDOG_PID_FILE="$LOG_DIR/watchdog.pid"
 
 is_port_in_use() {
     local port="$1"
-    ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]"
+    if command -v ss >/dev/null 2>&1; then
+        ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]"
+        return $?
+    fi
+    if command -v lsof >/dev/null 2>&1; then
+        lsof -nP -iTCP:"$port" -sTCP:LISTEN >/dev/null 2>&1
+        return $?
+    fi
+    if command -v netstat >/dev/null 2>&1; then
+        netstat -ltn 2>/dev/null | grep -Eq "[:.]${port}[[:space:]]"
+        return $?
+    fi
+    # No supported socket inspection command found.
+    return 1
 }
 
 if [ -f "$PID_FILE" ]; then
@@ -53,7 +66,8 @@ if [[ "$APP_CMD" == *"--dashboard"* ]] && is_port_in_use "$dashboard_port"; then
     exit 1
 fi
 
-# `env` keeps inline VAR=value prefixes in APP_CMD working with `exec`.
+# `APP_CMD` is treated as a shell command string.
+# If executable paths include spaces, they must be quoted inside APP_CMD.
 nohup bash -lc "exec env $APP_CMD" >>"$RUN_LOG" 2>&1 &
 app_pid=$!
 echo "$app_pid" > "$PID_FILE"
diff --git a/scripts/runtime_verify_monitor.sh b/scripts/runtime_verify_monitor.sh
index e5a78a1..5d7aad2 100755
--- a/scripts/runtime_verify_monitor.sh
+++ b/scripts/runtime_verify_monitor.sh
@@ -9,6 +9,7 @@ INTERVAL_SEC="${INTERVAL_SEC:-60}"
 MAX_HOURS="${MAX_HOURS:-24}"
 MAX_LOOPS="${MAX_LOOPS:-0}"
 POLICY_TZ="${POLICY_TZ:-Asia/Seoul}"
+DASHBOARD_PORT="${DASHBOARD_PORT:-8080}"
 
 cd "$ROOT_DIR"
 
@@ -79,10 +80,16 @@ while true; do
   if [ "$app_alive" -eq 0 ] && [ -n "$live_pids" ]; then
     app_alive=1
   fi
-  ss -ltnp 2>/dev/null | rg -q ':8080' && port_alive=1
-  log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive live_pids=${live_pids:-none}"
+  ss -ltnp 2>/dev/null | rg -q ":${DASHBOARD_PORT}\\b" && port_alive=1
+  log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port=${DASHBOARD_PORT} alive=$port_alive live_pids=${live_pids:-none}"
 
-  if [ -z "$latest_run" ]; then
+  defer_log_checks=0
+  if [ -z "$latest_run" ] && [ "$app_alive" -eq 1 ]; then
+    defer_log_checks=1
+    log "[INFO] run log not yet available; defer log-based coverage checks"
+  fi
+
+  if [ -z "$latest_run" ] && [ "$defer_log_checks" -eq 0 ]; then
     log "[ANOMALY] no run log found"
   fi
 
@@ -98,7 +105,11 @@ while true; do
       not_observed=$((not_observed+1))
     fi
   fi
-  if [ -n "$latest_run" ]; then
+  if [ "$defer_log_checks" -eq 1 ]; then
+    for deferred in KR_LOOP NXT_PATH US_PRE_PATH US_DAY_PATH US_AFTER_PATH ORDER_POLICY_SESSION; do
+      log "[COVERAGE] ${deferred}=DEFERRED reason=no_run_log_process_alive"
+    done
+  elif [ -n "$latest_run" ]; then
     check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1))
     check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1))
     check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1))
@@ -126,7 +137,9 @@ while true; do
     is_weekend=1
   fi
 
-  if [ "$is_weekend" -eq 1 ]; then
+  if [ "$defer_log_checks" -eq 1 ]; then
+    log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=no_run_log_process_alive"
+  elif [ "$is_weekend" -eq 1 ]; then
     # Weekend policy: KR regular session loop must never appear.
     if [ -n "$latest_run" ]; then
       check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \
diff --git a/tests/test_runtime_overnight_scripts.py b/tests/test_runtime_overnight_scripts.py
index 81b3195..f0ec038 100644
--- a/tests/test_runtime_overnight_scripts.py
+++ b/tests/test_runtime_overnight_scripts.py
@@ -50,6 +50,7 @@ def test_runtime_verify_monitor_detects_live_process_without_pid_files(tmp_path:
         log_text = _latest_runtime_log(log_dir)
         assert "app_alive=1" in log_text
         assert "[COVERAGE] LIVE_MODE=PASS source=process_liveness" in log_text
+        assert "[ANOMALY]" not in log_text
     finally:
         fake_live.terminate()
         fake_live.wait(timeout=5)
@@ -122,3 +123,29 @@ def test_run_overnight_writes_live_pid_and_watchdog_pid(tmp_path: Path) -> None:
             os.kill(pid, signal.SIGTERM)
         except ProcessLookupError:
             pass
+
+
+def test_run_overnight_fails_when_process_exits_before_grace_period(tmp_path: Path) -> None:
+    log_dir = tmp_path / "overnight"
+    log_dir.mkdir(parents=True, exist_ok=True)
+
+    env = os.environ.copy()
+    env.update(
+        {
+            "LOG_DIR": str(log_dir),
+            "TMUX_AUTO": "false",
+            "STARTUP_GRACE_SEC": "1",
+            "APP_CMD": "false",
+        }
+    )
+    completed = subprocess.run(
+        ["bash", str(RUN_OVERNIGHT)],
+        cwd=REPO_ROOT,
+        env=env,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    assert completed.returncode != 0
+    output = f"{completed.stdout}\n{completed.stderr}"
+    assert "startup failed:" in output

From c412412f7be4b687187ee945666407c9490e27b6 Mon Sep 17 00:00:00 2001
From: agentson <agentson@localhost>
Date: Wed, 4 Mar 2026 02:29:54 +0900
Subject: [PATCH 4/6] fix: address second-round review findings on PR #404

---
 scripts/run_overnight.sh                | 18 ++++++++++++++----
 scripts/runtime_verify_monitor.sh       | 20 +++++++++++++++++++-
 tests/test_runtime_overnight_scripts.py |  8 ++++++++
 3 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/scripts/run_overnight.sh b/scripts/run_overnight.sh
index e71a7b0..09458a7 100755
--- a/scripts/run_overnight.sh
+++ b/scripts/run_overnight.sh
@@ -12,6 +12,7 @@ STARTUP_GRACE_SEC="${STARTUP_GRACE_SEC:-3}"
 dashboard_port="${DASHBOARD_PORT:-8080}"
 
 if [ -z "${APP_CMD:-}" ]; then
+    USE_DEFAULT_APP_CMD="true"
     if [ -x ".venv/bin/python" ]; then
         PYTHON_BIN=".venv/bin/python"
     elif command -v python3 >/dev/null 2>&1; then
@@ -23,7 +24,9 @@ if [ -z "${APP_CMD:-}" ]; then
         exit 1
     fi
 
-    APP_CMD="DASHBOARD_PORT=$dashboard_port $PYTHON_BIN -m src.main --mode=live --dashboard"
+    APP_CMD="$PYTHON_BIN -m src.main --mode=live --dashboard"
+else
+    USE_DEFAULT_APP_CMD="false"
 fi
 
 mkdir -p "$LOG_DIR"
@@ -66,9 +69,14 @@ if [[ "$APP_CMD" == *"--dashboard"* ]] && is_port_in_use "$dashboard_port"; then
     exit 1
 fi
 
-# `APP_CMD` is treated as a shell command string.
-# If executable paths include spaces, they must be quoted inside APP_CMD.
-nohup bash -lc "exec env $APP_CMD" >>"$RUN_LOG" 2>&1 &
+if [ "$USE_DEFAULT_APP_CMD" = "true" ]; then
+    # Default path avoids shell word-splitting on executable paths.
+    nohup env DASHBOARD_PORT="$dashboard_port" "$PYTHON_BIN" -m src.main --mode=live --dashboard >>"$RUN_LOG" 2>&1 &
+else
+    # Custom APP_CMD is treated as a shell command string.
+    # If executable paths include spaces, they must be quoted inside APP_CMD.
+    nohup bash -lc "exec env $APP_CMD" >>"$RUN_LOG" 2>&1 &
+fi
 app_pid=$!
 echo "$app_pid" > "$PID_FILE"
 
@@ -82,11 +90,13 @@ echo "$watchdog_pid" > "$WATCHDOG_PID_FILE"
 sleep "$STARTUP_GRACE_SEC"
 if ! kill -0 "$app_pid" 2>/dev/null; then
     echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: app process exited early (pid=$app_pid)" | tee -a "$RUN_LOG"
+    [ -n "${watchdog_pid:-}" ] && kill "$watchdog_pid" 2>/dev/null || true
     tail -n 20 "$RUN_LOG" || true
     exit 1
 fi
 if ! kill -0 "$watchdog_pid" 2>/dev/null; then
     echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: watchdog exited early (pid=$watchdog_pid)" | tee -a "$WATCHDOG_LOG"
+    kill "$app_pid" 2>/dev/null || true
     tail -n 20 "$WATCHDOG_LOG" || true
     exit 1
 fi
diff --git a/scripts/runtime_verify_monitor.sh b/scripts/runtime_verify_monitor.sh
index 5d7aad2..5ad92eb 100755
--- a/scripts/runtime_verify_monitor.sh
+++ b/scripts/runtime_verify_monitor.sh
@@ -52,6 +52,24 @@ check_forbidden() {
   return 0
 }
 
+is_port_listening() {
+  local port="$1"
+
+  if command -v ss >/dev/null 2>&1; then
+    ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]"
+    return $?
+  fi
+  if command -v lsof >/dev/null 2>&1; then
+    lsof -nP -iTCP:"$port" -sTCP:LISTEN >/dev/null 2>&1
+    return $?
+  fi
+  if command -v netstat >/dev/null 2>&1; then
+    netstat -ltn 2>/dev/null | grep -Eq "[:.]${port}[[:space:]]"
+    return $?
+  fi
+  return 1
+}
+
 log "[INFO] runtime verify monitor started interval=${INTERVAL_SEC}s max_hours=${MAX_HOURS} policy_tz=${POLICY_TZ}"
 
 while true; do
@@ -80,7 +98,7 @@ while true; do
   if [ "$app_alive" -eq 0 ] && [ -n "$live_pids" ]; then
     app_alive=1
   fi
-  ss -ltnp 2>/dev/null | rg -q ":${DASHBOARD_PORT}\\b" && port_alive=1
+  is_port_listening "$DASHBOARD_PORT" && port_alive=1
   log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port=${DASHBOARD_PORT} alive=$port_alive live_pids=${live_pids:-none}"
 
   defer_log_checks=0
diff --git a/tests/test_runtime_overnight_scripts.py b/tests/test_runtime_overnight_scripts.py
index f0ec038..3cad774 100644
--- a/tests/test_runtime_overnight_scripts.py
+++ b/tests/test_runtime_overnight_scripts.py
@@ -6,6 +6,8 @@ import socket
 import subprocess
 from pathlib import Path
 
+import pytest
+
 REPO_ROOT = Path(__file__).resolve().parent.parent
 RUN_OVERNIGHT = REPO_ROOT / "scripts" / "run_overnight.sh"
 RUNTIME_MONITOR = REPO_ROOT / "scripts" / "runtime_verify_monitor.sh"
@@ -149,3 +151,9 @@ def test_run_overnight_fails_when_process_exits_before_grace_period(tmp_path: Pa
     assert completed.returncode != 0
     output = f"{completed.stdout}\n{completed.stderr}"
     assert "startup failed:" in output
+
+    watchdog_pid_file = log_dir / "watchdog.pid"
+    if watchdog_pid_file.exists():
+        watchdog_pid = int(watchdog_pid_file.read_text(encoding="utf-8").strip())
+        with pytest.raises(ProcessLookupError):
+            os.kill(watchdog_pid, 0)

From dc0775cbc6cebaa1907d6936d7acea7f468f41cd Mon Sep 17 00:00:00 2001
From: agentson <agentson@localhost>
Date: Wed, 4 Mar 2026 02:36:24 +0900
Subject: [PATCH 5/6] fix: add safer custom command path for run_overnight

---
 scripts/run_overnight.sh                | 29 +++++++++++++++++++++++--
 tests/test_runtime_overnight_scripts.py |  5 +++--
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/scripts/run_overnight.sh b/scripts/run_overnight.sh
index 09458a7..9007806 100755
--- a/scripts/run_overnight.sh
+++ b/scripts/run_overnight.sh
@@ -10,9 +10,20 @@ TMUX_ATTACH="${TMUX_ATTACH:-true}"
 TMUX_SESSION_PREFIX="${TMUX_SESSION_PREFIX:-ouroboros_overnight}"
 STARTUP_GRACE_SEC="${STARTUP_GRACE_SEC:-3}"
 dashboard_port="${DASHBOARD_PORT:-8080}"
+APP_CMD_BIN="${APP_CMD_BIN:-}"
+APP_CMD_ARGS="${APP_CMD_ARGS:-}"
+RUNS_DASHBOARD="false"
 
-if [ -z "${APP_CMD:-}" ]; then
+if [ -n "$APP_CMD_BIN" ]; then
+    USE_DEFAULT_APP_CMD="false"
+    USE_SAFE_CUSTOM_APP_CMD="true"
+    APP_CMD="${APP_CMD_BIN} ${APP_CMD_ARGS}"
+    if [[ " $APP_CMD_ARGS " == *" --dashboard "* ]]; then
+        RUNS_DASHBOARD="true"
+    fi
+elif [ -z "${APP_CMD:-}" ]; then
     USE_DEFAULT_APP_CMD="true"
+    USE_SAFE_CUSTOM_APP_CMD="false"
     if [ -x ".venv/bin/python" ]; then
         PYTHON_BIN=".venv/bin/python"
     elif command -v python3 >/dev/null 2>&1; then
@@ -25,8 +36,13 @@ if [ -z "${APP_CMD:-}" ]; then
     fi
 
     APP_CMD="$PYTHON_BIN -m src.main --mode=live --dashboard"
+    RUNS_DASHBOARD="true"
 else
     USE_DEFAULT_APP_CMD="false"
+    USE_SAFE_CUSTOM_APP_CMD="false"
+    if [[ "$APP_CMD" == *"--dashboard"* ]]; then
+        RUNS_DASHBOARD="true"
+    fi
 fi
 
 mkdir -p "$LOG_DIR"
@@ -64,7 +80,7 @@ if [ -f "$PID_FILE" ]; then
 fi
 
 echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] starting: $APP_CMD" | tee -a "$RUN_LOG"
-if [[ "$APP_CMD" == *"--dashboard"* ]] && is_port_in_use "$dashboard_port"; then
+if [ "$RUNS_DASHBOARD" = "true" ] && is_port_in_use "$dashboard_port"; then
     echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: dashboard port ${dashboard_port} already in use" | tee -a "$RUN_LOG"
     exit 1
 fi
@@ -72,6 +88,15 @@ fi
 if [ "$USE_DEFAULT_APP_CMD" = "true" ]; then
     # Default path avoids shell word-splitting on executable paths.
     nohup env DASHBOARD_PORT="$dashboard_port" "$PYTHON_BIN" -m src.main --mode=live --dashboard >>"$RUN_LOG" 2>&1 &
+elif [ "$USE_SAFE_CUSTOM_APP_CMD" = "true" ]; then
+    # Safer custom path: executable path is handled as a single token.
+    if [ -n "$APP_CMD_ARGS" ]; then
+        # shellcheck disable=SC2206
+        app_args=( $APP_CMD_ARGS )
+        nohup env DASHBOARD_PORT="$dashboard_port" "$APP_CMD_BIN" "${app_args[@]}" >>"$RUN_LOG" 2>&1 &
+    else
+        nohup env DASHBOARD_PORT="$dashboard_port" "$APP_CMD_BIN" >>"$RUN_LOG" 2>&1 &
+    fi
 else
     # Custom APP_CMD is treated as a shell command string.
     # If executable paths include spaces, they must be quoted inside APP_CMD.
diff --git a/tests/test_runtime_overnight_scripts.py b/tests/test_runtime_overnight_scripts.py
index 3cad774..a5b6182 100644
--- a/tests/test_runtime_overnight_scripts.py
+++ b/tests/test_runtime_overnight_scripts.py
@@ -101,7 +101,8 @@ def test_run_overnight_writes_live_pid_and_watchdog_pid(tmp_path: Path) -> None:
             "TMUX_AUTO": "false",
             "STARTUP_GRACE_SEC": "1",
             "CHECK_INTERVAL": "2",
-            "APP_CMD": "sleep 10",
+            "APP_CMD_BIN": "sleep",
+            "APP_CMD_ARGS": "10",
         }
     )
     completed = subprocess.run(
@@ -137,7 +138,7 @@ def test_run_overnight_fails_when_process_exits_before_grace_period(tmp_path: Pa
             "LOG_DIR": str(log_dir),
             "TMUX_AUTO": "false",
             "STARTUP_GRACE_SEC": "1",
-            "APP_CMD": "false",
+            "APP_CMD_BIN": "false",
         }
     )
     completed = subprocess.run(

From bcbbf80d1619d3deb2a3e926464ea8852cfabfd3 Mon Sep 17 00:00:00 2001
From: agentson <agentson@localhost>
Date: Wed, 4 Mar 2026 02:43:32 +0900
Subject: [PATCH 6/6] docs: clarify APP_CMD legacy and APP_CMD_ARGS contract

---
 scripts/run_overnight.sh | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/scripts/run_overnight.sh b/scripts/run_overnight.sh
index 9007806..af88e23 100755
--- a/scripts/run_overnight.sh
+++ b/scripts/run_overnight.sh
@@ -14,6 +14,16 @@ APP_CMD_BIN="${APP_CMD_BIN:-}"
 APP_CMD_ARGS="${APP_CMD_ARGS:-}"
 RUNS_DASHBOARD="false"
 
+# Custom override contract:
+# 1) Preferred: APP_CMD_BIN + APP_CMD_ARGS
+#    - APP_CMD_BIN is treated as a single executable token.
+#    - APP_CMD_ARGS uses shell-style word splitting; quote/escape inside this
+#      variable is NOT preserved as a nested shell parse.
+# 2) Legacy fallback: APP_CMD (raw shell command string)
+#    - This path remains for backward compatibility.
+#    - When APP_CMD includes --dashboard, caller should include explicit
+#      DASHBOARD_PORT assignment in APP_CMD if non-default port is required.
+
 if [ -n "$APP_CMD_BIN" ]; then
     USE_DEFAULT_APP_CMD="false"
     USE_SAFE_CUSTOM_APP_CMD="true"
@@ -100,6 +110,7 @@ elif [ "$USE_SAFE_CUSTOM_APP_CMD" = "true" ]; then
 else
     # Custom APP_CMD is treated as a shell command string.
     # If executable paths include spaces, they must be quoted inside APP_CMD.
+    # Legacy compatibility path: caller owns quoting and env var injection.
     nohup bash -lc "exec env $APP_CMD" >>"$RUN_LOG" 2>&1 &
 fi
 app_pid=$!