fix: runtime anomaly handling for overnight startup and monitor (#396 #397) #404

Merged
jihoson merged 6 commits from feature/issue-396-397-runtime-anomaly-fixes into main 2026-03-04 02:46:38 +09:00
3 changed files with 332 additions and 23 deletions

View File

@@ -8,8 +8,32 @@ CHECK_INTERVAL="${CHECK_INTERVAL:-30}"
TMUX_AUTO="${TMUX_AUTO:-true}"
TMUX_ATTACH="${TMUX_ATTACH:-true}"
TMUX_SESSION_PREFIX="${TMUX_SESSION_PREFIX:-ouroboros_overnight}"
STARTUP_GRACE_SEC="${STARTUP_GRACE_SEC:-3}"
dashboard_port="${DASHBOARD_PORT:-8080}"
APP_CMD_BIN="${APP_CMD_BIN:-}"
APP_CMD_ARGS="${APP_CMD_ARGS:-}"
RUNS_DASHBOARD="false"
if [ -z "${APP_CMD:-}" ]; then
# Custom override contract:
# 1) Preferred: APP_CMD_BIN + APP_CMD_ARGS
# - APP_CMD_BIN is treated as a single executable token.
# - APP_CMD_ARGS uses shell-style word splitting; quote/escape inside this
# variable is NOT preserved as a nested shell parse.
# 2) Legacy fallback: APP_CMD (raw shell command string)
# - This path remains for backward compatibility.
# - When APP_CMD includes --dashboard, caller should include explicit
# DASHBOARD_PORT assignment in APP_CMD if non-default port is required.
if [ -n "$APP_CMD_BIN" ]; then
USE_DEFAULT_APP_CMD="false"
USE_SAFE_CUSTOM_APP_CMD="true"
APP_CMD="${APP_CMD_BIN} ${APP_CMD_ARGS}"
if [[ " $APP_CMD_ARGS " == *" --dashboard "* ]]; then
RUNS_DASHBOARD="true"
fi
elif [ -z "${APP_CMD:-}" ]; then
USE_DEFAULT_APP_CMD="true"
USE_SAFE_CUSTOM_APP_CMD="false"
if [ -x ".venv/bin/python" ]; then
PYTHON_BIN=".venv/bin/python"
elif command -v python3 >/dev/null 2>&1; then
@@ -21,9 +45,14 @@ if [ -z "${APP_CMD:-}" ]; then
exit 1
fi
dashboard_port="${DASHBOARD_PORT:-8080}"
APP_CMD="DASHBOARD_PORT=$dashboard_port $PYTHON_BIN -m src.main --mode=live --dashboard"
APP_CMD="$PYTHON_BIN -m src.main --mode=live --dashboard"
RUNS_DASHBOARD="true"
else
USE_DEFAULT_APP_CMD="false"
USE_SAFE_CUSTOM_APP_CMD="false"
if [[ "$APP_CMD" == *"--dashboard"* ]]; then
RUNS_DASHBOARD="true"
fi
fi
mkdir -p "$LOG_DIR"
@@ -34,6 +63,24 @@ WATCHDOG_LOG="$LOG_DIR/watchdog_${timestamp}.log"
PID_FILE="$LOG_DIR/app.pid"
WATCHDOG_PID_FILE="$LOG_DIR/watchdog.pid"
is_port_in_use() {
local port="$1"
if command -v ss >/dev/null 2>&1; then
ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]"
return $?
fi
if command -v lsof >/dev/null 2>&1; then
lsof -nP -iTCP:"$port" -sTCP:LISTEN >/dev/null 2>&1
return $?
fi
if command -v netstat >/dev/null 2>&1; then
netstat -ltn 2>/dev/null | grep -Eq "[:.]${port}[[:space:]]"
return $?
fi
# No supported socket inspection command found.
return 1
}
if [ -f "$PID_FILE" ]; then
old_pid="$(cat "$PID_FILE" || true)"
if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then
@@ -43,7 +90,29 @@ if [ -f "$PID_FILE" ]; then
fi
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] starting: $APP_CMD" | tee -a "$RUN_LOG"
nohup bash -lc "$APP_CMD" >>"$RUN_LOG" 2>&1 &
if [ "$RUNS_DASHBOARD" = "true" ] && is_port_in_use "$dashboard_port"; then
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: dashboard port ${dashboard_port} already in use" | tee -a "$RUN_LOG"
exit 1
fi
if [ "$USE_DEFAULT_APP_CMD" = "true" ]; then
# Default path avoids shell word-splitting on executable paths.
nohup env DASHBOARD_PORT="$dashboard_port" "$PYTHON_BIN" -m src.main --mode=live --dashboard >>"$RUN_LOG" 2>&1 &
elif [ "$USE_SAFE_CUSTOM_APP_CMD" = "true" ]; then
# Safer custom path: executable path is handled as a single token.
if [ -n "$APP_CMD_ARGS" ]; then
# shellcheck disable=SC2206
app_args=( $APP_CMD_ARGS )
nohup env DASHBOARD_PORT="$dashboard_port" "$APP_CMD_BIN" "${app_args[@]}" >>"$RUN_LOG" 2>&1 &
else
nohup env DASHBOARD_PORT="$dashboard_port" "$APP_CMD_BIN" >>"$RUN_LOG" 2>&1 &
fi
else
# Custom APP_CMD is treated as a shell command string.
# If executable paths include spaces, they must be quoted inside APP_CMD.
# Legacy compatibility path: caller owns quoting and env var injection.
nohup bash -lc "exec env $APP_CMD" >>"$RUN_LOG" 2>&1 &
fi
app_pid=$!
echo "$app_pid" > "$PID_FILE"
@@ -54,6 +123,20 @@ nohup env PID_FILE="$PID_FILE" LOG_FILE="$WATCHDOG_LOG" CHECK_INTERVAL="$CHECK_I
watchdog_pid=$!
echo "$watchdog_pid" > "$WATCHDOG_PID_FILE"
sleep "$STARTUP_GRACE_SEC"
if ! kill -0 "$app_pid" 2>/dev/null; then
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: app process exited early (pid=$app_pid)" | tee -a "$RUN_LOG"
[ -n "${watchdog_pid:-}" ] && kill "$watchdog_pid" 2>/dev/null || true
tail -n 20 "$RUN_LOG" || true
exit 1
fi
if ! kill -0 "$watchdog_pid" 2>/dev/null; then
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: watchdog exited early (pid=$watchdog_pid)" | tee -a "$WATCHDOG_LOG"
kill "$app_pid" 2>/dev/null || true
tail -n 20 "$WATCHDOG_LOG" || true
exit 1
fi
cat <<EOF
시작 완료
- app pid: $app_pid

View File

@@ -7,12 +7,15 @@ ROOT_DIR="${ROOT_DIR:-/home/agentson/repos/The-Ouroboros}"
LOG_DIR="${LOG_DIR:-$ROOT_DIR/data/overnight}"
INTERVAL_SEC="${INTERVAL_SEC:-60}"
MAX_HOURS="${MAX_HOURS:-24}"
MAX_LOOPS="${MAX_LOOPS:-0}"
POLICY_TZ="${POLICY_TZ:-Asia/Seoul}"
DASHBOARD_PORT="${DASHBOARD_PORT:-8080}"
cd "$ROOT_DIR"
OUT_LOG="$LOG_DIR/runtime_verify_$(date +%Y%m%d_%H%M%S).log"
END_TS=$(( $(date +%s) + MAX_HOURS*3600 ))
loops=0
log() {
printf '%s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$1" | tee -a "$OUT_LOG" >/dev/null
@@ -31,6 +34,11 @@ check_signal() {
return 1
}
find_live_pids() {
# Detect live-mode process even when run_overnight pid files are absent.
pgrep -af "[s]rc.main --mode=live" 2>/dev/null | awk '{print $1}' | tr '\n' ',' | sed 's/,$//'
}
check_forbidden() {
local name="$1"
local pattern="$2"
@@ -44,42 +52,94 @@ check_forbidden() {
return 0
}
is_port_listening() {
local port="$1"
if command -v ss >/dev/null 2>&1; then
ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]"
return $?
fi
if command -v lsof >/dev/null 2>&1; then
lsof -nP -iTCP:"$port" -sTCP:LISTEN >/dev/null 2>&1
return $?
fi
if command -v netstat >/dev/null 2>&1; then
netstat -ltn 2>/dev/null | grep -Eq "[:.]${port}[[:space:]]"
return $?
fi
return 1
}
log "[INFO] runtime verify monitor started interval=${INTERVAL_SEC}s max_hours=${MAX_HOURS} policy_tz=${POLICY_TZ}"
while true; do
loops=$((loops + 1))
now=$(date +%s)
if [ "$now" -ge "$END_TS" ]; then
log "[INFO] monitor completed (time window reached)"
exit 0
fi
if [ "$MAX_LOOPS" -gt 0 ] && [ "$loops" -gt "$MAX_LOOPS" ]; then
log "[INFO] monitor completed (max loops reached)"
exit 0
fi
latest_run="$(ls -t "$LOG_DIR"/run_*.log 2>/dev/null | head -n1 || true)"
if [ -z "$latest_run" ]; then
log "[ANOMALY] no run log found"
sleep "$INTERVAL_SEC"
continue
fi
# Basic liveness hints.
app_pid="$(cat "$LOG_DIR/app.pid" 2>/dev/null || true)"
wd_pid="$(cat "$LOG_DIR/watchdog.pid" 2>/dev/null || true)"
live_pids="$(find_live_pids)"
app_alive=0
wd_alive=0
port_alive=0
[ -n "$app_pid" ] && kill -0 "$app_pid" 2>/dev/null && app_alive=1
[ -n "$wd_pid" ] && kill -0 "$wd_pid" 2>/dev/null && wd_alive=1
ss -ltnp 2>/dev/null | rg -q ':8080' && port_alive=1
log "[HEARTBEAT] run_log=$latest_run app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive"
if [ "$app_alive" -eq 0 ] && [ -n "$live_pids" ]; then
app_alive=1
fi
is_port_listening "$DASHBOARD_PORT" && port_alive=1
log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port=${DASHBOARD_PORT} alive=$port_alive live_pids=${live_pids:-none}"
defer_log_checks=0
if [ -z "$latest_run" ] && [ "$app_alive" -eq 1 ]; then
defer_log_checks=1
log "[INFO] run log not yet available; defer log-based coverage checks"
fi
if [ -z "$latest_run" ] && [ "$defer_log_checks" -eq 0 ]; then
log "[ANOMALY] no run log found"
fi
# Coverage matrix rows (session paths and policy gate evidence).
not_observed=0
if [ "$app_alive" -eq 1 ]; then
log "[COVERAGE] LIVE_MODE=PASS source=process_liveness"
else
if [ -n "$latest_run" ]; then
check_signal "LIVE_MODE" "Mode: live" "$latest_run" || not_observed=$((not_observed+1))
else
log "[COVERAGE] LIVE_MODE=NOT_OBSERVED reason=no_run_log_no_live_pid"
not_observed=$((not_observed+1))
fi
fi
if [ "$defer_log_checks" -eq 1 ]; then
for deferred in KR_LOOP NXT_PATH US_PRE_PATH US_DAY_PATH US_AFTER_PATH ORDER_POLICY_SESSION; do
log "[COVERAGE] ${deferred}=DEFERRED reason=no_run_log_process_alive"
done
elif [ -n "$latest_run" ]; then
check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1))
check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1))
check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1))
check_signal "US_DAY_PATH" "US_DAY|session=US_DAY|Processing market: .*NASDAQ|Processing market: .*NYSE|Processing market: .*AMEX" "$latest_run" || not_observed=$((not_observed+1))
check_signal "US_AFTER_PATH" "US_AFTER|session=US_AFTER" "$latest_run" || not_observed=$((not_observed+1))
check_signal "ORDER_POLICY_SESSION" "Order policy rejected .*\\[session=" "$latest_run" || not_observed=$((not_observed+1))
else
for missing in KR_LOOP NXT_PATH US_PRE_PATH US_DAY_PATH US_AFTER_PATH ORDER_POLICY_SESSION; do
log "[COVERAGE] ${missing}=NOT_OBSERVED reason=no_run_log"
not_observed=$((not_observed+1))
done
fi
if [ "$not_observed" -gt 0 ]; then
log "[ANOMALY] coverage_not_observed=$not_observed (treat as FAIL)"
@@ -95,11 +155,17 @@ while true; do
is_weekend=1
fi
if [ "$is_weekend" -eq 1 ]; then
if [ "$defer_log_checks" -eq 1 ]; then
log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=no_run_log_process_alive"
elif [ "$is_weekend" -eq 1 ]; then
# Weekend policy: KR regular session loop must never appear.
if [ -n "$latest_run" ]; then
check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \
"Market session active: KR|session=KRX_REG|Processing market: Korea Exchange" \
"$latest_run" || forbidden_hits=$((forbidden_hits+1))
else
log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=no_run_log"
fi
else
log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=weekday"
fi

View File

@@ -0,0 +1,160 @@
from __future__ import annotations
import os
import signal
import socket
import subprocess
from pathlib import Path
import pytest
REPO_ROOT = Path(__file__).resolve().parent.parent
RUN_OVERNIGHT = REPO_ROOT / "scripts" / "run_overnight.sh"
RUNTIME_MONITOR = REPO_ROOT / "scripts" / "runtime_verify_monitor.sh"
def _latest_runtime_log(log_dir: Path) -> str:
logs = sorted(log_dir.glob("runtime_verify_*.log"))
assert logs, "runtime monitor did not produce log output"
return logs[-1].read_text(encoding="utf-8")
def test_runtime_verify_monitor_detects_live_process_without_pid_files(tmp_path: Path) -> None:
log_dir = tmp_path / "overnight"
log_dir.mkdir(parents=True, exist_ok=True)
fake_live = subprocess.Popen(
["bash", "-lc", 'exec -a "src.main --mode=live" sleep 10'],
cwd=REPO_ROOT,
)
try:
env = os.environ.copy()
env.update(
{
"ROOT_DIR": str(REPO_ROOT),
"LOG_DIR": str(log_dir),
"INTERVAL_SEC": "1",
"MAX_HOURS": "1",
"MAX_LOOPS": "1",
"POLICY_TZ": "UTC",
}
)
completed = subprocess.run(
["bash", str(RUNTIME_MONITOR)],
cwd=REPO_ROOT,
env=env,
capture_output=True,
text=True,
check=False,
)
assert completed.returncode == 0, completed.stderr
log_text = _latest_runtime_log(log_dir)
assert "app_alive=1" in log_text
assert "[COVERAGE] LIVE_MODE=PASS source=process_liveness" in log_text
assert "[ANOMALY]" not in log_text
finally:
fake_live.terminate()
fake_live.wait(timeout=5)
def test_run_overnight_fails_fast_when_dashboard_port_in_use(tmp_path: Path) -> None:
log_dir = tmp_path / "overnight"
log_dir.mkdir(parents=True, exist_ok=True)
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.bind(("127.0.0.1", 0))
sock.listen(1)
port = sock.getsockname()[1]
try:
env = os.environ.copy()
env.update(
{
"LOG_DIR": str(log_dir),
"TMUX_AUTO": "false",
"DASHBOARD_PORT": str(port),
}
)
completed = subprocess.run(
["bash", str(RUN_OVERNIGHT)],
cwd=REPO_ROOT,
env=env,
capture_output=True,
text=True,
check=False,
)
assert completed.returncode != 0
output = f"{completed.stdout}\n{completed.stderr}"
assert "already in use" in output
finally:
sock.close()
def test_run_overnight_writes_live_pid_and_watchdog_pid(tmp_path: Path) -> None:
log_dir = tmp_path / "overnight"
log_dir.mkdir(parents=True, exist_ok=True)
env = os.environ.copy()
env.update(
{
"LOG_DIR": str(log_dir),
"TMUX_AUTO": "false",
"STARTUP_GRACE_SEC": "1",
"CHECK_INTERVAL": "2",
"APP_CMD_BIN": "sleep",
"APP_CMD_ARGS": "10",
}
)
completed = subprocess.run(
["bash", str(RUN_OVERNIGHT)],
cwd=REPO_ROOT,
env=env,
capture_output=True,
text=True,
check=False,
)
assert completed.returncode == 0, f"{completed.stdout}\n{completed.stderr}"
app_pid = int((log_dir / "app.pid").read_text(encoding="utf-8").strip())
watchdog_pid = int((log_dir / "watchdog.pid").read_text(encoding="utf-8").strip())
os.kill(app_pid, 0)
os.kill(watchdog_pid, 0)
for pid in (watchdog_pid, app_pid):
try:
os.kill(pid, signal.SIGTERM)
except ProcessLookupError:
pass
def test_run_overnight_fails_when_process_exits_before_grace_period(tmp_path: Path) -> None:
log_dir = tmp_path / "overnight"
log_dir.mkdir(parents=True, exist_ok=True)
env = os.environ.copy()
env.update(
{
"LOG_DIR": str(log_dir),
"TMUX_AUTO": "false",
"STARTUP_GRACE_SEC": "1",
"APP_CMD_BIN": "false",
}
)
completed = subprocess.run(
["bash", str(RUN_OVERNIGHT)],
cwd=REPO_ROOT,
env=env,
capture_output=True,
text=True,
check=False,
)
assert completed.returncode != 0
output = f"{completed.stdout}\n{completed.stderr}"
assert "startup failed:" in output
watchdog_pid_file = log_dir / "watchdog.pid"
if watchdog_pid_file.exists():
watchdog_pid = int(watchdog_pid_file.read_text(encoding="utf-8").strip())
with pytest.raises(ProcessLookupError):
os.kill(watchdog_pid, 0)