Merge pull request 'fix: runtime anomaly handling for overnight startup and monitor (#396 #397)' (#404) from feature/issue-396-397-runtime-anomaly-fixes into main
All checks were successful
Gitea CI / test (push) Successful in 36s
All checks were successful
Gitea CI / test (push) Successful in 36s
Reviewed-on: #404 Reviewed-by: jihoson <kiparang7th@gmail.com>
This commit was merged in pull request #404.
This commit is contained in:
@@ -8,8 +8,32 @@ CHECK_INTERVAL="${CHECK_INTERVAL:-30}"
|
|||||||
TMUX_AUTO="${TMUX_AUTO:-true}"
|
TMUX_AUTO="${TMUX_AUTO:-true}"
|
||||||
TMUX_ATTACH="${TMUX_ATTACH:-true}"
|
TMUX_ATTACH="${TMUX_ATTACH:-true}"
|
||||||
TMUX_SESSION_PREFIX="${TMUX_SESSION_PREFIX:-ouroboros_overnight}"
|
TMUX_SESSION_PREFIX="${TMUX_SESSION_PREFIX:-ouroboros_overnight}"
|
||||||
|
STARTUP_GRACE_SEC="${STARTUP_GRACE_SEC:-3}"
|
||||||
|
dashboard_port="${DASHBOARD_PORT:-8080}"
|
||||||
|
APP_CMD_BIN="${APP_CMD_BIN:-}"
|
||||||
|
APP_CMD_ARGS="${APP_CMD_ARGS:-}"
|
||||||
|
RUNS_DASHBOARD="false"
|
||||||
|
|
||||||
if [ -z "${APP_CMD:-}" ]; then
|
# Custom override contract:
|
||||||
|
# 1) Preferred: APP_CMD_BIN + APP_CMD_ARGS
|
||||||
|
# - APP_CMD_BIN is treated as a single executable token.
|
||||||
|
# - APP_CMD_ARGS uses shell-style word splitting; quote/escape inside this
|
||||||
|
# variable is NOT preserved as a nested shell parse.
|
||||||
|
# 2) Legacy fallback: APP_CMD (raw shell command string)
|
||||||
|
# - This path remains for backward compatibility.
|
||||||
|
# - When APP_CMD includes --dashboard, caller should include explicit
|
||||||
|
# DASHBOARD_PORT assignment in APP_CMD if non-default port is required.
|
||||||
|
|
||||||
|
if [ -n "$APP_CMD_BIN" ]; then
|
||||||
|
USE_DEFAULT_APP_CMD="false"
|
||||||
|
USE_SAFE_CUSTOM_APP_CMD="true"
|
||||||
|
APP_CMD="${APP_CMD_BIN} ${APP_CMD_ARGS}"
|
||||||
|
if [[ " $APP_CMD_ARGS " == *" --dashboard "* ]]; then
|
||||||
|
RUNS_DASHBOARD="true"
|
||||||
|
fi
|
||||||
|
elif [ -z "${APP_CMD:-}" ]; then
|
||||||
|
USE_DEFAULT_APP_CMD="true"
|
||||||
|
USE_SAFE_CUSTOM_APP_CMD="false"
|
||||||
if [ -x ".venv/bin/python" ]; then
|
if [ -x ".venv/bin/python" ]; then
|
||||||
PYTHON_BIN=".venv/bin/python"
|
PYTHON_BIN=".venv/bin/python"
|
||||||
elif command -v python3 >/dev/null 2>&1; then
|
elif command -v python3 >/dev/null 2>&1; then
|
||||||
@@ -21,9 +45,14 @@ if [ -z "${APP_CMD:-}" ]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
dashboard_port="${DASHBOARD_PORT:-8080}"
|
APP_CMD="$PYTHON_BIN -m src.main --mode=live --dashboard"
|
||||||
|
RUNS_DASHBOARD="true"
|
||||||
APP_CMD="DASHBOARD_PORT=$dashboard_port $PYTHON_BIN -m src.main --mode=live --dashboard"
|
else
|
||||||
|
USE_DEFAULT_APP_CMD="false"
|
||||||
|
USE_SAFE_CUSTOM_APP_CMD="false"
|
||||||
|
if [[ "$APP_CMD" == *"--dashboard"* ]]; then
|
||||||
|
RUNS_DASHBOARD="true"
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
mkdir -p "$LOG_DIR"
|
mkdir -p "$LOG_DIR"
|
||||||
@@ -34,6 +63,24 @@ WATCHDOG_LOG="$LOG_DIR/watchdog_${timestamp}.log"
|
|||||||
PID_FILE="$LOG_DIR/app.pid"
|
PID_FILE="$LOG_DIR/app.pid"
|
||||||
WATCHDOG_PID_FILE="$LOG_DIR/watchdog.pid"
|
WATCHDOG_PID_FILE="$LOG_DIR/watchdog.pid"
|
||||||
|
|
||||||
|
is_port_in_use() {
|
||||||
|
local port="$1"
|
||||||
|
if command -v ss >/dev/null 2>&1; then
|
||||||
|
ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]"
|
||||||
|
return $?
|
||||||
|
fi
|
||||||
|
if command -v lsof >/dev/null 2>&1; then
|
||||||
|
lsof -nP -iTCP:"$port" -sTCP:LISTEN >/dev/null 2>&1
|
||||||
|
return $?
|
||||||
|
fi
|
||||||
|
if command -v netstat >/dev/null 2>&1; then
|
||||||
|
netstat -ltn 2>/dev/null | grep -Eq "[:.]${port}[[:space:]]"
|
||||||
|
return $?
|
||||||
|
fi
|
||||||
|
# No supported socket inspection command found.
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
if [ -f "$PID_FILE" ]; then
|
if [ -f "$PID_FILE" ]; then
|
||||||
old_pid="$(cat "$PID_FILE" || true)"
|
old_pid="$(cat "$PID_FILE" || true)"
|
||||||
if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then
|
if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then
|
||||||
@@ -43,7 +90,29 @@ if [ -f "$PID_FILE" ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] starting: $APP_CMD" | tee -a "$RUN_LOG"
|
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] starting: $APP_CMD" | tee -a "$RUN_LOG"
|
||||||
nohup bash -lc "$APP_CMD" >>"$RUN_LOG" 2>&1 &
|
if [ "$RUNS_DASHBOARD" = "true" ] && is_port_in_use "$dashboard_port"; then
|
||||||
|
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: dashboard port ${dashboard_port} already in use" | tee -a "$RUN_LOG"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "$USE_DEFAULT_APP_CMD" = "true" ]; then
|
||||||
|
# Default path avoids shell word-splitting on executable paths.
|
||||||
|
nohup env DASHBOARD_PORT="$dashboard_port" "$PYTHON_BIN" -m src.main --mode=live --dashboard >>"$RUN_LOG" 2>&1 &
|
||||||
|
elif [ "$USE_SAFE_CUSTOM_APP_CMD" = "true" ]; then
|
||||||
|
# Safer custom path: executable path is handled as a single token.
|
||||||
|
if [ -n "$APP_CMD_ARGS" ]; then
|
||||||
|
# shellcheck disable=SC2206
|
||||||
|
app_args=( $APP_CMD_ARGS )
|
||||||
|
nohup env DASHBOARD_PORT="$dashboard_port" "$APP_CMD_BIN" "${app_args[@]}" >>"$RUN_LOG" 2>&1 &
|
||||||
|
else
|
||||||
|
nohup env DASHBOARD_PORT="$dashboard_port" "$APP_CMD_BIN" >>"$RUN_LOG" 2>&1 &
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
# Custom APP_CMD is treated as a shell command string.
|
||||||
|
# If executable paths include spaces, they must be quoted inside APP_CMD.
|
||||||
|
# Legacy compatibility path: caller owns quoting and env var injection.
|
||||||
|
nohup bash -lc "exec env $APP_CMD" >>"$RUN_LOG" 2>&1 &
|
||||||
|
fi
|
||||||
app_pid=$!
|
app_pid=$!
|
||||||
echo "$app_pid" > "$PID_FILE"
|
echo "$app_pid" > "$PID_FILE"
|
||||||
|
|
||||||
@@ -54,6 +123,20 @@ nohup env PID_FILE="$PID_FILE" LOG_FILE="$WATCHDOG_LOG" CHECK_INTERVAL="$CHECK_I
|
|||||||
watchdog_pid=$!
|
watchdog_pid=$!
|
||||||
echo "$watchdog_pid" > "$WATCHDOG_PID_FILE"
|
echo "$watchdog_pid" > "$WATCHDOG_PID_FILE"
|
||||||
|
|
||||||
|
sleep "$STARTUP_GRACE_SEC"
|
||||||
|
if ! kill -0 "$app_pid" 2>/dev/null; then
|
||||||
|
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: app process exited early (pid=$app_pid)" | tee -a "$RUN_LOG"
|
||||||
|
[ -n "${watchdog_pid:-}" ] && kill "$watchdog_pid" 2>/dev/null || true
|
||||||
|
tail -n 20 "$RUN_LOG" || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if ! kill -0 "$watchdog_pid" 2>/dev/null; then
|
||||||
|
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: watchdog exited early (pid=$watchdog_pid)" | tee -a "$WATCHDOG_LOG"
|
||||||
|
kill "$app_pid" 2>/dev/null || true
|
||||||
|
tail -n 20 "$WATCHDOG_LOG" || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
cat <<EOF
|
cat <<EOF
|
||||||
시작 완료
|
시작 완료
|
||||||
- app pid: $app_pid
|
- app pid: $app_pid
|
||||||
|
|||||||
@@ -7,12 +7,15 @@ ROOT_DIR="${ROOT_DIR:-/home/agentson/repos/The-Ouroboros}"
|
|||||||
LOG_DIR="${LOG_DIR:-$ROOT_DIR/data/overnight}"
|
LOG_DIR="${LOG_DIR:-$ROOT_DIR/data/overnight}"
|
||||||
INTERVAL_SEC="${INTERVAL_SEC:-60}"
|
INTERVAL_SEC="${INTERVAL_SEC:-60}"
|
||||||
MAX_HOURS="${MAX_HOURS:-24}"
|
MAX_HOURS="${MAX_HOURS:-24}"
|
||||||
|
MAX_LOOPS="${MAX_LOOPS:-0}"
|
||||||
POLICY_TZ="${POLICY_TZ:-Asia/Seoul}"
|
POLICY_TZ="${POLICY_TZ:-Asia/Seoul}"
|
||||||
|
DASHBOARD_PORT="${DASHBOARD_PORT:-8080}"
|
||||||
|
|
||||||
cd "$ROOT_DIR"
|
cd "$ROOT_DIR"
|
||||||
|
|
||||||
OUT_LOG="$LOG_DIR/runtime_verify_$(date +%Y%m%d_%H%M%S).log"
|
OUT_LOG="$LOG_DIR/runtime_verify_$(date +%Y%m%d_%H%M%S).log"
|
||||||
END_TS=$(( $(date +%s) + MAX_HOURS*3600 ))
|
END_TS=$(( $(date +%s) + MAX_HOURS*3600 ))
|
||||||
|
loops=0
|
||||||
|
|
||||||
log() {
|
log() {
|
||||||
printf '%s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$1" | tee -a "$OUT_LOG" >/dev/null
|
printf '%s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$1" | tee -a "$OUT_LOG" >/dev/null
|
||||||
@@ -31,6 +34,11 @@ check_signal() {
|
|||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
find_live_pids() {
|
||||||
|
# Detect live-mode process even when run_overnight pid files are absent.
|
||||||
|
pgrep -af "[s]rc.main --mode=live" 2>/dev/null | awk '{print $1}' | tr '\n' ',' | sed 's/,$//'
|
||||||
|
}
|
||||||
|
|
||||||
check_forbidden() {
|
check_forbidden() {
|
||||||
local name="$1"
|
local name="$1"
|
||||||
local pattern="$2"
|
local pattern="$2"
|
||||||
@@ -44,42 +52,94 @@ check_forbidden() {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
is_port_listening() {
|
||||||
|
local port="$1"
|
||||||
|
|
||||||
|
if command -v ss >/dev/null 2>&1; then
|
||||||
|
ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]"
|
||||||
|
return $?
|
||||||
|
fi
|
||||||
|
if command -v lsof >/dev/null 2>&1; then
|
||||||
|
lsof -nP -iTCP:"$port" -sTCP:LISTEN >/dev/null 2>&1
|
||||||
|
return $?
|
||||||
|
fi
|
||||||
|
if command -v netstat >/dev/null 2>&1; then
|
||||||
|
netstat -ltn 2>/dev/null | grep -Eq "[:.]${port}[[:space:]]"
|
||||||
|
return $?
|
||||||
|
fi
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
log "[INFO] runtime verify monitor started interval=${INTERVAL_SEC}s max_hours=${MAX_HOURS} policy_tz=${POLICY_TZ}"
|
log "[INFO] runtime verify monitor started interval=${INTERVAL_SEC}s max_hours=${MAX_HOURS} policy_tz=${POLICY_TZ}"
|
||||||
|
|
||||||
while true; do
|
while true; do
|
||||||
|
loops=$((loops + 1))
|
||||||
now=$(date +%s)
|
now=$(date +%s)
|
||||||
if [ "$now" -ge "$END_TS" ]; then
|
if [ "$now" -ge "$END_TS" ]; then
|
||||||
log "[INFO] monitor completed (time window reached)"
|
log "[INFO] monitor completed (time window reached)"
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
if [ "$MAX_LOOPS" -gt 0 ] && [ "$loops" -gt "$MAX_LOOPS" ]; then
|
||||||
|
log "[INFO] monitor completed (max loops reached)"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
latest_run="$(ls -t "$LOG_DIR"/run_*.log 2>/dev/null | head -n1 || true)"
|
latest_run="$(ls -t "$LOG_DIR"/run_*.log 2>/dev/null | head -n1 || true)"
|
||||||
if [ -z "$latest_run" ]; then
|
|
||||||
log "[ANOMALY] no run log found"
|
|
||||||
sleep "$INTERVAL_SEC"
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Basic liveness hints.
|
# Basic liveness hints.
|
||||||
app_pid="$(cat "$LOG_DIR/app.pid" 2>/dev/null || true)"
|
app_pid="$(cat "$LOG_DIR/app.pid" 2>/dev/null || true)"
|
||||||
wd_pid="$(cat "$LOG_DIR/watchdog.pid" 2>/dev/null || true)"
|
wd_pid="$(cat "$LOG_DIR/watchdog.pid" 2>/dev/null || true)"
|
||||||
|
live_pids="$(find_live_pids)"
|
||||||
app_alive=0
|
app_alive=0
|
||||||
wd_alive=0
|
wd_alive=0
|
||||||
port_alive=0
|
port_alive=0
|
||||||
[ -n "$app_pid" ] && kill -0 "$app_pid" 2>/dev/null && app_alive=1
|
[ -n "$app_pid" ] && kill -0 "$app_pid" 2>/dev/null && app_alive=1
|
||||||
[ -n "$wd_pid" ] && kill -0 "$wd_pid" 2>/dev/null && wd_alive=1
|
[ -n "$wd_pid" ] && kill -0 "$wd_pid" 2>/dev/null && wd_alive=1
|
||||||
ss -ltnp 2>/dev/null | rg -q ':8080' && port_alive=1
|
if [ "$app_alive" -eq 0 ] && [ -n "$live_pids" ]; then
|
||||||
log "[HEARTBEAT] run_log=$latest_run app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive"
|
app_alive=1
|
||||||
|
fi
|
||||||
|
is_port_listening "$DASHBOARD_PORT" && port_alive=1
|
||||||
|
log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port=${DASHBOARD_PORT} alive=$port_alive live_pids=${live_pids:-none}"
|
||||||
|
|
||||||
|
defer_log_checks=0
|
||||||
|
if [ -z "$latest_run" ] && [ "$app_alive" -eq 1 ]; then
|
||||||
|
defer_log_checks=1
|
||||||
|
log "[INFO] run log not yet available; defer log-based coverage checks"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "$latest_run" ] && [ "$defer_log_checks" -eq 0 ]; then
|
||||||
|
log "[ANOMALY] no run log found"
|
||||||
|
fi
|
||||||
|
|
||||||
# Coverage matrix rows (session paths and policy gate evidence).
|
# Coverage matrix rows (session paths and policy gate evidence).
|
||||||
not_observed=0
|
not_observed=0
|
||||||
|
if [ "$app_alive" -eq 1 ]; then
|
||||||
|
log "[COVERAGE] LIVE_MODE=PASS source=process_liveness"
|
||||||
|
else
|
||||||
|
if [ -n "$latest_run" ]; then
|
||||||
check_signal "LIVE_MODE" "Mode: live" "$latest_run" || not_observed=$((not_observed+1))
|
check_signal "LIVE_MODE" "Mode: live" "$latest_run" || not_observed=$((not_observed+1))
|
||||||
|
else
|
||||||
|
log "[COVERAGE] LIVE_MODE=NOT_OBSERVED reason=no_run_log_no_live_pid"
|
||||||
|
not_observed=$((not_observed+1))
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
if [ "$defer_log_checks" -eq 1 ]; then
|
||||||
|
for deferred in KR_LOOP NXT_PATH US_PRE_PATH US_DAY_PATH US_AFTER_PATH ORDER_POLICY_SESSION; do
|
||||||
|
log "[COVERAGE] ${deferred}=DEFERRED reason=no_run_log_process_alive"
|
||||||
|
done
|
||||||
|
elif [ -n "$latest_run" ]; then
|
||||||
check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1))
|
check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1))
|
||||||
check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1))
|
check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1))
|
||||||
check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1))
|
check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1))
|
||||||
check_signal "US_DAY_PATH" "US_DAY|session=US_DAY|Processing market: .*NASDAQ|Processing market: .*NYSE|Processing market: .*AMEX" "$latest_run" || not_observed=$((not_observed+1))
|
check_signal "US_DAY_PATH" "US_DAY|session=US_DAY|Processing market: .*NASDAQ|Processing market: .*NYSE|Processing market: .*AMEX" "$latest_run" || not_observed=$((not_observed+1))
|
||||||
check_signal "US_AFTER_PATH" "US_AFTER|session=US_AFTER" "$latest_run" || not_observed=$((not_observed+1))
|
check_signal "US_AFTER_PATH" "US_AFTER|session=US_AFTER" "$latest_run" || not_observed=$((not_observed+1))
|
||||||
check_signal "ORDER_POLICY_SESSION" "Order policy rejected .*\\[session=" "$latest_run" || not_observed=$((not_observed+1))
|
check_signal "ORDER_POLICY_SESSION" "Order policy rejected .*\\[session=" "$latest_run" || not_observed=$((not_observed+1))
|
||||||
|
else
|
||||||
|
for missing in KR_LOOP NXT_PATH US_PRE_PATH US_DAY_PATH US_AFTER_PATH ORDER_POLICY_SESSION; do
|
||||||
|
log "[COVERAGE] ${missing}=NOT_OBSERVED reason=no_run_log"
|
||||||
|
not_observed=$((not_observed+1))
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
if [ "$not_observed" -gt 0 ]; then
|
if [ "$not_observed" -gt 0 ]; then
|
||||||
log "[ANOMALY] coverage_not_observed=$not_observed (treat as FAIL)"
|
log "[ANOMALY] coverage_not_observed=$not_observed (treat as FAIL)"
|
||||||
@@ -95,11 +155,17 @@ while true; do
|
|||||||
is_weekend=1
|
is_weekend=1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$is_weekend" -eq 1 ]; then
|
if [ "$defer_log_checks" -eq 1 ]; then
|
||||||
|
log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=no_run_log_process_alive"
|
||||||
|
elif [ "$is_weekend" -eq 1 ]; then
|
||||||
# Weekend policy: KR regular session loop must never appear.
|
# Weekend policy: KR regular session loop must never appear.
|
||||||
|
if [ -n "$latest_run" ]; then
|
||||||
check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \
|
check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \
|
||||||
"Market session active: KR|session=KRX_REG|Processing market: Korea Exchange" \
|
"Market session active: KR|session=KRX_REG|Processing market: Korea Exchange" \
|
||||||
"$latest_run" || forbidden_hits=$((forbidden_hits+1))
|
"$latest_run" || forbidden_hits=$((forbidden_hits+1))
|
||||||
|
else
|
||||||
|
log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=no_run_log"
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=weekday"
|
log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=weekday"
|
||||||
fi
|
fi
|
||||||
|
|||||||
160
tests/test_runtime_overnight_scripts.py
Normal file
160
tests/test_runtime_overnight_scripts.py
Normal file
@@ -0,0 +1,160 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import signal
|
||||||
|
import socket
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||||
|
RUN_OVERNIGHT = REPO_ROOT / "scripts" / "run_overnight.sh"
|
||||||
|
RUNTIME_MONITOR = REPO_ROOT / "scripts" / "runtime_verify_monitor.sh"
|
||||||
|
|
||||||
|
|
||||||
|
def _latest_runtime_log(log_dir: Path) -> str:
|
||||||
|
logs = sorted(log_dir.glob("runtime_verify_*.log"))
|
||||||
|
assert logs, "runtime monitor did not produce log output"
|
||||||
|
return logs[-1].read_text(encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_verify_monitor_detects_live_process_without_pid_files(tmp_path: Path) -> None:
|
||||||
|
log_dir = tmp_path / "overnight"
|
||||||
|
log_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
fake_live = subprocess.Popen(
|
||||||
|
["bash", "-lc", 'exec -a "src.main --mode=live" sleep 10'],
|
||||||
|
cwd=REPO_ROOT,
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
env = os.environ.copy()
|
||||||
|
env.update(
|
||||||
|
{
|
||||||
|
"ROOT_DIR": str(REPO_ROOT),
|
||||||
|
"LOG_DIR": str(log_dir),
|
||||||
|
"INTERVAL_SEC": "1",
|
||||||
|
"MAX_HOURS": "1",
|
||||||
|
"MAX_LOOPS": "1",
|
||||||
|
"POLICY_TZ": "UTC",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
completed = subprocess.run(
|
||||||
|
["bash", str(RUNTIME_MONITOR)],
|
||||||
|
cwd=REPO_ROOT,
|
||||||
|
env=env,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
assert completed.returncode == 0, completed.stderr
|
||||||
|
|
||||||
|
log_text = _latest_runtime_log(log_dir)
|
||||||
|
assert "app_alive=1" in log_text
|
||||||
|
assert "[COVERAGE] LIVE_MODE=PASS source=process_liveness" in log_text
|
||||||
|
assert "[ANOMALY]" not in log_text
|
||||||
|
finally:
|
||||||
|
fake_live.terminate()
|
||||||
|
fake_live.wait(timeout=5)
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_overnight_fails_fast_when_dashboard_port_in_use(tmp_path: Path) -> None:
|
||||||
|
log_dir = tmp_path / "overnight"
|
||||||
|
log_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||||
|
sock.bind(("127.0.0.1", 0))
|
||||||
|
sock.listen(1)
|
||||||
|
port = sock.getsockname()[1]
|
||||||
|
try:
|
||||||
|
env = os.environ.copy()
|
||||||
|
env.update(
|
||||||
|
{
|
||||||
|
"LOG_DIR": str(log_dir),
|
||||||
|
"TMUX_AUTO": "false",
|
||||||
|
"DASHBOARD_PORT": str(port),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
completed = subprocess.run(
|
||||||
|
["bash", str(RUN_OVERNIGHT)],
|
||||||
|
cwd=REPO_ROOT,
|
||||||
|
env=env,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
assert completed.returncode != 0
|
||||||
|
output = f"{completed.stdout}\n{completed.stderr}"
|
||||||
|
assert "already in use" in output
|
||||||
|
finally:
|
||||||
|
sock.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_overnight_writes_live_pid_and_watchdog_pid(tmp_path: Path) -> None:
|
||||||
|
log_dir = tmp_path / "overnight"
|
||||||
|
log_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
env = os.environ.copy()
|
||||||
|
env.update(
|
||||||
|
{
|
||||||
|
"LOG_DIR": str(log_dir),
|
||||||
|
"TMUX_AUTO": "false",
|
||||||
|
"STARTUP_GRACE_SEC": "1",
|
||||||
|
"CHECK_INTERVAL": "2",
|
||||||
|
"APP_CMD_BIN": "sleep",
|
||||||
|
"APP_CMD_ARGS": "10",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
completed = subprocess.run(
|
||||||
|
["bash", str(RUN_OVERNIGHT)],
|
||||||
|
cwd=REPO_ROOT,
|
||||||
|
env=env,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
assert completed.returncode == 0, f"{completed.stdout}\n{completed.stderr}"
|
||||||
|
|
||||||
|
app_pid = int((log_dir / "app.pid").read_text(encoding="utf-8").strip())
|
||||||
|
watchdog_pid = int((log_dir / "watchdog.pid").read_text(encoding="utf-8").strip())
|
||||||
|
|
||||||
|
os.kill(app_pid, 0)
|
||||||
|
os.kill(watchdog_pid, 0)
|
||||||
|
|
||||||
|
for pid in (watchdog_pid, app_pid):
|
||||||
|
try:
|
||||||
|
os.kill(pid, signal.SIGTERM)
|
||||||
|
except ProcessLookupError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_overnight_fails_when_process_exits_before_grace_period(tmp_path: Path) -> None:
|
||||||
|
log_dir = tmp_path / "overnight"
|
||||||
|
log_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
env = os.environ.copy()
|
||||||
|
env.update(
|
||||||
|
{
|
||||||
|
"LOG_DIR": str(log_dir),
|
||||||
|
"TMUX_AUTO": "false",
|
||||||
|
"STARTUP_GRACE_SEC": "1",
|
||||||
|
"APP_CMD_BIN": "false",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
completed = subprocess.run(
|
||||||
|
["bash", str(RUN_OVERNIGHT)],
|
||||||
|
cwd=REPO_ROOT,
|
||||||
|
env=env,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
assert completed.returncode != 0
|
||||||
|
output = f"{completed.stdout}\n{completed.stderr}"
|
||||||
|
assert "startup failed:" in output
|
||||||
|
|
||||||
|
watchdog_pid_file = log_dir / "watchdog.pid"
|
||||||
|
if watchdog_pid_file.exists():
|
||||||
|
watchdog_pid = int(watchdog_pid_file.read_text(encoding="utf-8").strip())
|
||||||
|
with pytest.raises(ProcessLookupError):
|
||||||
|
os.kill(watchdog_pid, 0)
|
||||||
Reference in New Issue
Block a user