@@ -8,6 +8,8 @@ CHECK_INTERVAL="${CHECK_INTERVAL:-30}"
|
||||
TMUX_AUTO="${TMUX_AUTO:-true}"
|
||||
TMUX_ATTACH="${TMUX_ATTACH:-true}"
|
||||
TMUX_SESSION_PREFIX="${TMUX_SESSION_PREFIX:-ouroboros_overnight}"
|
||||
STARTUP_GRACE_SEC="${STARTUP_GRACE_SEC:-3}"
|
||||
dashboard_port="${DASHBOARD_PORT:-8080}"
|
||||
|
||||
if [ -z "${APP_CMD:-}" ]; then
|
||||
if [ -x ".venv/bin/python" ]; then
|
||||
@@ -21,8 +23,6 @@ if [ -z "${APP_CMD:-}" ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
dashboard_port="${DASHBOARD_PORT:-8080}"
|
||||
|
||||
APP_CMD="DASHBOARD_PORT=$dashboard_port $PYTHON_BIN -m src.main --mode=live --dashboard"
|
||||
fi
|
||||
|
||||
@@ -34,6 +34,11 @@ WATCHDOG_LOG="$LOG_DIR/watchdog_${timestamp}.log"
|
||||
PID_FILE="$LOG_DIR/app.pid"
|
||||
WATCHDOG_PID_FILE="$LOG_DIR/watchdog.pid"
|
||||
|
||||
is_port_in_use() {
|
||||
local port="$1"
|
||||
ss -ltn 2>/dev/null | rg -q ":${port}\\b"
|
||||
}
|
||||
|
||||
if [ -f "$PID_FILE" ]; then
|
||||
old_pid="$(cat "$PID_FILE" || true)"
|
||||
if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then
|
||||
@@ -43,7 +48,12 @@ if [ -f "$PID_FILE" ]; then
|
||||
fi
|
||||
|
||||
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] starting: $APP_CMD" | tee -a "$RUN_LOG"
|
||||
nohup bash -lc "$APP_CMD" >>"$RUN_LOG" 2>&1 &
|
||||
if [[ "$APP_CMD" == *"--dashboard"* ]] && is_port_in_use "$dashboard_port"; then
|
||||
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: dashboard port ${dashboard_port} already in use" | tee -a "$RUN_LOG"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
nohup bash -lc "exec $APP_CMD" >>"$RUN_LOG" 2>&1 &
|
||||
app_pid=$!
|
||||
echo "$app_pid" > "$PID_FILE"
|
||||
|
||||
@@ -54,6 +64,18 @@ nohup env PID_FILE="$PID_FILE" LOG_FILE="$WATCHDOG_LOG" CHECK_INTERVAL="$CHECK_I
|
||||
watchdog_pid=$!
|
||||
echo "$watchdog_pid" > "$WATCHDOG_PID_FILE"
|
||||
|
||||
sleep "$STARTUP_GRACE_SEC"
|
||||
if ! kill -0 "$app_pid" 2>/dev/null; then
|
||||
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: app process exited early (pid=$app_pid)" | tee -a "$RUN_LOG"
|
||||
tail -n 20 "$RUN_LOG" || true
|
||||
exit 1
|
||||
fi
|
||||
if ! kill -0 "$watchdog_pid" 2>/dev/null; then
|
||||
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: watchdog exited early (pid=$watchdog_pid)" | tee -a "$WATCHDOG_LOG"
|
||||
tail -n 20 "$WATCHDOG_LOG" || true
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cat <<EOF
|
||||
시작 완료
|
||||
- app pid: $app_pid
|
||||
|
||||
@@ -7,12 +7,14 @@ ROOT_DIR="${ROOT_DIR:-/home/agentson/repos/The-Ouroboros}"
|
||||
LOG_DIR="${LOG_DIR:-$ROOT_DIR/data/overnight}"
|
||||
INTERVAL_SEC="${INTERVAL_SEC:-60}"
|
||||
MAX_HOURS="${MAX_HOURS:-24}"
|
||||
MAX_LOOPS="${MAX_LOOPS:-0}"
|
||||
POLICY_TZ="${POLICY_TZ:-Asia/Seoul}"
|
||||
|
||||
cd "$ROOT_DIR"
|
||||
|
||||
OUT_LOG="$LOG_DIR/runtime_verify_$(date +%Y%m%d_%H%M%S).log"
|
||||
END_TS=$(( $(date +%s) + MAX_HOURS*3600 ))
|
||||
loops=0
|
||||
|
||||
log() {
|
||||
printf '%s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$1" | tee -a "$OUT_LOG" >/dev/null
|
||||
@@ -31,6 +33,11 @@ check_signal() {
|
||||
return 1
|
||||
}
|
||||
|
||||
find_live_pids() {
|
||||
# Detect live-mode process even when run_overnight pid files are absent.
|
||||
pgrep -af "[s]rc.main --mode=live" 2>/dev/null | awk '{print $1}' | tr '\n' ',' | sed 's/,$//'
|
||||
}
|
||||
|
||||
check_forbidden() {
|
||||
local name="$1"
|
||||
local pattern="$2"
|
||||
@@ -47,39 +54,63 @@ check_forbidden() {
|
||||
log "[INFO] runtime verify monitor started interval=${INTERVAL_SEC}s max_hours=${MAX_HOURS} policy_tz=${POLICY_TZ}"
|
||||
|
||||
while true; do
|
||||
loops=$((loops + 1))
|
||||
now=$(date +%s)
|
||||
if [ "$now" -ge "$END_TS" ]; then
|
||||
log "[INFO] monitor completed (time window reached)"
|
||||
exit 0
|
||||
fi
|
||||
if [ "$MAX_LOOPS" -gt 0 ] && [ "$loops" -gt "$MAX_LOOPS" ]; then
|
||||
log "[INFO] monitor completed (max loops reached)"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
latest_run="$(ls -t "$LOG_DIR"/run_*.log 2>/dev/null | head -n1 || true)"
|
||||
if [ -z "$latest_run" ]; then
|
||||
log "[ANOMALY] no run log found"
|
||||
sleep "$INTERVAL_SEC"
|
||||
continue
|
||||
fi
|
||||
|
||||
# Basic liveness hints.
|
||||
app_pid="$(cat "$LOG_DIR/app.pid" 2>/dev/null || true)"
|
||||
wd_pid="$(cat "$LOG_DIR/watchdog.pid" 2>/dev/null || true)"
|
||||
live_pids="$(find_live_pids)"
|
||||
app_alive=0
|
||||
wd_alive=0
|
||||
port_alive=0
|
||||
[ -n "$app_pid" ] && kill -0 "$app_pid" 2>/dev/null && app_alive=1
|
||||
[ -n "$wd_pid" ] && kill -0 "$wd_pid" 2>/dev/null && wd_alive=1
|
||||
if [ "$app_alive" -eq 0 ] && [ -n "$live_pids" ]; then
|
||||
app_alive=1
|
||||
fi
|
||||
ss -ltnp 2>/dev/null | rg -q ':8080' && port_alive=1
|
||||
log "[HEARTBEAT] run_log=$latest_run app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive"
|
||||
log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive live_pids=${live_pids:-none}"
|
||||
|
||||
if [ -z "$latest_run" ]; then
|
||||
log "[ANOMALY] no run log found"
|
||||
fi
|
||||
|
||||
# Coverage matrix rows (session paths and policy gate evidence).
|
||||
not_observed=0
|
||||
if [ "$app_alive" -eq 1 ]; then
|
||||
log "[COVERAGE] LIVE_MODE=PASS source=process_liveness"
|
||||
else
|
||||
if [ -n "$latest_run" ]; then
|
||||
check_signal "LIVE_MODE" "Mode: live" "$latest_run" || not_observed=$((not_observed+1))
|
||||
else
|
||||
log "[COVERAGE] LIVE_MODE=NOT_OBSERVED reason=no_run_log_no_live_pid"
|
||||
not_observed=$((not_observed+1))
|
||||
fi
|
||||
fi
|
||||
if [ -n "$latest_run" ]; then
|
||||
check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1))
|
||||
check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1))
|
||||
check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1))
|
||||
check_signal "US_DAY_PATH" "US_DAY|session=US_DAY|Processing market: .*NASDAQ|Processing market: .*NYSE|Processing market: .*AMEX" "$latest_run" || not_observed=$((not_observed+1))
|
||||
check_signal "US_AFTER_PATH" "US_AFTER|session=US_AFTER" "$latest_run" || not_observed=$((not_observed+1))
|
||||
check_signal "ORDER_POLICY_SESSION" "Order policy rejected .*\\[session=" "$latest_run" || not_observed=$((not_observed+1))
|
||||
else
|
||||
for missing in KR_LOOP NXT_PATH US_PRE_PATH US_DAY_PATH US_AFTER_PATH ORDER_POLICY_SESSION; do
|
||||
log "[COVERAGE] ${missing}=NOT_OBSERVED reason=no_run_log"
|
||||
not_observed=$((not_observed+1))
|
||||
done
|
||||
fi
|
||||
|
||||
if [ "$not_observed" -gt 0 ]; then
|
||||
log "[ANOMALY] coverage_not_observed=$not_observed (treat as FAIL)"
|
||||
@@ -97,9 +128,13 @@ while true; do
|
||||
|
||||
if [ "$is_weekend" -eq 1 ]; then
|
||||
# Weekend policy: KR regular session loop must never appear.
|
||||
if [ -n "$latest_run" ]; then
|
||||
check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \
|
||||
"Market session active: KR|session=KRX_REG|Processing market: Korea Exchange" \
|
||||
"$latest_run" || forbidden_hits=$((forbidden_hits+1))
|
||||
else
|
||||
log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=no_run_log"
|
||||
fi
|
||||
else
|
||||
log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=weekday"
|
||||
fi
|
||||
|
||||
124
tests/test_runtime_overnight_scripts.py
Normal file
124
tests/test_runtime_overnight_scripts.py
Normal file
@@ -0,0 +1,124 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import signal
|
||||
import socket
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||
RUN_OVERNIGHT = REPO_ROOT / "scripts" / "run_overnight.sh"
|
||||
RUNTIME_MONITOR = REPO_ROOT / "scripts" / "runtime_verify_monitor.sh"
|
||||
|
||||
|
||||
def _latest_runtime_log(log_dir: Path) -> str:
|
||||
logs = sorted(log_dir.glob("runtime_verify_*.log"))
|
||||
assert logs, "runtime monitor did not produce log output"
|
||||
return logs[-1].read_text(encoding="utf-8")
|
||||
|
||||
|
||||
def test_runtime_verify_monitor_detects_live_process_without_pid_files(tmp_path: Path) -> None:
|
||||
log_dir = tmp_path / "overnight"
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
fake_live = subprocess.Popen(
|
||||
["bash", "-lc", 'exec -a "src.main --mode=live" sleep 10'],
|
||||
cwd=REPO_ROOT,
|
||||
)
|
||||
try:
|
||||
env = os.environ.copy()
|
||||
env.update(
|
||||
{
|
||||
"ROOT_DIR": str(REPO_ROOT),
|
||||
"LOG_DIR": str(log_dir),
|
||||
"INTERVAL_SEC": "1",
|
||||
"MAX_HOURS": "1",
|
||||
"MAX_LOOPS": "1",
|
||||
"POLICY_TZ": "UTC",
|
||||
}
|
||||
)
|
||||
completed = subprocess.run(
|
||||
["bash", str(RUNTIME_MONITOR)],
|
||||
cwd=REPO_ROOT,
|
||||
env=env,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=False,
|
||||
)
|
||||
assert completed.returncode == 0, completed.stderr
|
||||
|
||||
log_text = _latest_runtime_log(log_dir)
|
||||
assert "app_alive=1" in log_text
|
||||
assert "[COVERAGE] LIVE_MODE=PASS source=process_liveness" in log_text
|
||||
finally:
|
||||
fake_live.terminate()
|
||||
fake_live.wait(timeout=5)
|
||||
|
||||
|
||||
def test_run_overnight_fails_fast_when_dashboard_port_in_use(tmp_path: Path) -> None:
|
||||
log_dir = tmp_path / "overnight"
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
sock.bind(("127.0.0.1", 0))
|
||||
sock.listen(1)
|
||||
port = sock.getsockname()[1]
|
||||
try:
|
||||
env = os.environ.copy()
|
||||
env.update(
|
||||
{
|
||||
"LOG_DIR": str(log_dir),
|
||||
"TMUX_AUTO": "false",
|
||||
"DASHBOARD_PORT": str(port),
|
||||
}
|
||||
)
|
||||
completed = subprocess.run(
|
||||
["bash", str(RUN_OVERNIGHT)],
|
||||
cwd=REPO_ROOT,
|
||||
env=env,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=False,
|
||||
)
|
||||
assert completed.returncode != 0
|
||||
output = f"{completed.stdout}\n{completed.stderr}"
|
||||
assert "already in use" in output
|
||||
finally:
|
||||
sock.close()
|
||||
|
||||
|
||||
def test_run_overnight_writes_live_pid_and_watchdog_pid(tmp_path: Path) -> None:
|
||||
log_dir = tmp_path / "overnight"
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
env = os.environ.copy()
|
||||
env.update(
|
||||
{
|
||||
"LOG_DIR": str(log_dir),
|
||||
"TMUX_AUTO": "false",
|
||||
"STARTUP_GRACE_SEC": "1",
|
||||
"CHECK_INTERVAL": "2",
|
||||
"APP_CMD": "sleep 10",
|
||||
}
|
||||
)
|
||||
completed = subprocess.run(
|
||||
["bash", str(RUN_OVERNIGHT)],
|
||||
cwd=REPO_ROOT,
|
||||
env=env,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=False,
|
||||
)
|
||||
assert completed.returncode == 0, f"{completed.stdout}\n{completed.stderr}"
|
||||
|
||||
app_pid = int((log_dir / "app.pid").read_text(encoding="utf-8").strip())
|
||||
watchdog_pid = int((log_dir / "watchdog.pid").read_text(encoding="utf-8").strip())
|
||||
|
||||
os.kill(app_pid, 0)
|
||||
os.kill(watchdog_pid, 0)
|
||||
|
||||
for pid in (watchdog_pid, app_pid):
|
||||
try:
|
||||
os.kill(pid, signal.SIGTERM)
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
Reference in New Issue
Block a user