fix: stabilize overnight startup and monitor live fallback (#396 #397)
Some checks failed
Gitea CI / test (push) Failing after 37s
Gitea CI / test (pull_request) Failing after 38s

This commit is contained in:
agentson
2026-03-04 02:04:13 +09:00
parent d2f3fe9108
commit 528e17a29c
3 changed files with 200 additions and 19 deletions

View File

@@ -8,6 +8,8 @@ CHECK_INTERVAL="${CHECK_INTERVAL:-30}"
TMUX_AUTO="${TMUX_AUTO:-true}"
TMUX_ATTACH="${TMUX_ATTACH:-true}"
TMUX_SESSION_PREFIX="${TMUX_SESSION_PREFIX:-ouroboros_overnight}"
STARTUP_GRACE_SEC="${STARTUP_GRACE_SEC:-3}"
dashboard_port="${DASHBOARD_PORT:-8080}"
if [ -z "${APP_CMD:-}" ]; then
if [ -x ".venv/bin/python" ]; then
@@ -21,8 +23,6 @@ if [ -z "${APP_CMD:-}" ]; then
exit 1
fi
dashboard_port="${DASHBOARD_PORT:-8080}"
APP_CMD="DASHBOARD_PORT=$dashboard_port $PYTHON_BIN -m src.main --mode=live --dashboard"
fi
@@ -34,6 +34,11 @@ WATCHDOG_LOG="$LOG_DIR/watchdog_${timestamp}.log"
PID_FILE="$LOG_DIR/app.pid"
WATCHDOG_PID_FILE="$LOG_DIR/watchdog.pid"
is_port_in_use() {
local port="$1"
ss -ltn 2>/dev/null | rg -q ":${port}\\b"
}
if [ -f "$PID_FILE" ]; then
old_pid="$(cat "$PID_FILE" || true)"
if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then
@@ -43,7 +48,12 @@ if [ -f "$PID_FILE" ]; then
fi
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] starting: $APP_CMD" | tee -a "$RUN_LOG"
nohup bash -lc "$APP_CMD" >>"$RUN_LOG" 2>&1 &
if [[ "$APP_CMD" == *"--dashboard"* ]] && is_port_in_use "$dashboard_port"; then
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: dashboard port ${dashboard_port} already in use" | tee -a "$RUN_LOG"
exit 1
fi
nohup bash -lc "exec $APP_CMD" >>"$RUN_LOG" 2>&1 &
app_pid=$!
echo "$app_pid" > "$PID_FILE"
@@ -54,6 +64,18 @@ nohup env PID_FILE="$PID_FILE" LOG_FILE="$WATCHDOG_LOG" CHECK_INTERVAL="$CHECK_I
watchdog_pid=$!
echo "$watchdog_pid" > "$WATCHDOG_PID_FILE"
sleep "$STARTUP_GRACE_SEC"
if ! kill -0 "$app_pid" 2>/dev/null; then
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: app process exited early (pid=$app_pid)" | tee -a "$RUN_LOG"
tail -n 20 "$RUN_LOG" || true
exit 1
fi
if ! kill -0 "$watchdog_pid" 2>/dev/null; then
echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] startup failed: watchdog exited early (pid=$watchdog_pid)" | tee -a "$WATCHDOG_LOG"
tail -n 20 "$WATCHDOG_LOG" || true
exit 1
fi
cat <<EOF
시작 완료
- app pid: $app_pid

View File

@@ -7,12 +7,14 @@ ROOT_DIR="${ROOT_DIR:-/home/agentson/repos/The-Ouroboros}"
LOG_DIR="${LOG_DIR:-$ROOT_DIR/data/overnight}"
INTERVAL_SEC="${INTERVAL_SEC:-60}"
MAX_HOURS="${MAX_HOURS:-24}"
MAX_LOOPS="${MAX_LOOPS:-0}"
POLICY_TZ="${POLICY_TZ:-Asia/Seoul}"
cd "$ROOT_DIR"
OUT_LOG="$LOG_DIR/runtime_verify_$(date +%Y%m%d_%H%M%S).log"
END_TS=$(( $(date +%s) + MAX_HOURS*3600 ))
loops=0
log() {
printf '%s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$1" | tee -a "$OUT_LOG" >/dev/null
@@ -31,6 +33,11 @@ check_signal() {
return 1
}
find_live_pids() {
# Detect live-mode process even when run_overnight pid files are absent.
pgrep -af "[s]rc.main --mode=live" 2>/dev/null | awk '{print $1}' | tr '\n' ',' | sed 's/,$//'
}
check_forbidden() {
local name="$1"
local pattern="$2"
@@ -47,39 +54,63 @@ check_forbidden() {
log "[INFO] runtime verify monitor started interval=${INTERVAL_SEC}s max_hours=${MAX_HOURS} policy_tz=${POLICY_TZ}"
while true; do
loops=$((loops + 1))
now=$(date +%s)
if [ "$now" -ge "$END_TS" ]; then
log "[INFO] monitor completed (time window reached)"
exit 0
fi
if [ "$MAX_LOOPS" -gt 0 ] && [ "$loops" -gt "$MAX_LOOPS" ]; then
log "[INFO] monitor completed (max loops reached)"
exit 0
fi
latest_run="$(ls -t "$LOG_DIR"/run_*.log 2>/dev/null | head -n1 || true)"
if [ -z "$latest_run" ]; then
log "[ANOMALY] no run log found"
sleep "$INTERVAL_SEC"
continue
fi
# Basic liveness hints.
app_pid="$(cat "$LOG_DIR/app.pid" 2>/dev/null || true)"
wd_pid="$(cat "$LOG_DIR/watchdog.pid" 2>/dev/null || true)"
live_pids="$(find_live_pids)"
app_alive=0
wd_alive=0
port_alive=0
[ -n "$app_pid" ] && kill -0 "$app_pid" 2>/dev/null && app_alive=1
[ -n "$wd_pid" ] && kill -0 "$wd_pid" 2>/dev/null && wd_alive=1
if [ "$app_alive" -eq 0 ] && [ -n "$live_pids" ]; then
app_alive=1
fi
ss -ltnp 2>/dev/null | rg -q ':8080' && port_alive=1
log "[HEARTBEAT] run_log=$latest_run app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive"
log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive live_pids=${live_pids:-none}"
if [ -z "$latest_run" ]; then
log "[ANOMALY] no run log found"
fi
# Coverage matrix rows (session paths and policy gate evidence).
not_observed=0
check_signal "LIVE_MODE" "Mode: live" "$latest_run" || not_observed=$((not_observed+1))
check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1))
check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1))
check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1))
check_signal "US_DAY_PATH" "US_DAY|session=US_DAY|Processing market: .*NASDAQ|Processing market: .*NYSE|Processing market: .*AMEX" "$latest_run" || not_observed=$((not_observed+1))
check_signal "US_AFTER_PATH" "US_AFTER|session=US_AFTER" "$latest_run" || not_observed=$((not_observed+1))
check_signal "ORDER_POLICY_SESSION" "Order policy rejected .*\\[session=" "$latest_run" || not_observed=$((not_observed+1))
if [ "$app_alive" -eq 1 ]; then
log "[COVERAGE] LIVE_MODE=PASS source=process_liveness"
else
if [ -n "$latest_run" ]; then
check_signal "LIVE_MODE" "Mode: live" "$latest_run" || not_observed=$((not_observed+1))
else
log "[COVERAGE] LIVE_MODE=NOT_OBSERVED reason=no_run_log_no_live_pid"
not_observed=$((not_observed+1))
fi
fi
if [ -n "$latest_run" ]; then
check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1))
check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1))
check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1))
check_signal "US_DAY_PATH" "US_DAY|session=US_DAY|Processing market: .*NASDAQ|Processing market: .*NYSE|Processing market: .*AMEX" "$latest_run" || not_observed=$((not_observed+1))
check_signal "US_AFTER_PATH" "US_AFTER|session=US_AFTER" "$latest_run" || not_observed=$((not_observed+1))
check_signal "ORDER_POLICY_SESSION" "Order policy rejected .*\\[session=" "$latest_run" || not_observed=$((not_observed+1))
else
for missing in KR_LOOP NXT_PATH US_PRE_PATH US_DAY_PATH US_AFTER_PATH ORDER_POLICY_SESSION; do
log "[COVERAGE] ${missing}=NOT_OBSERVED reason=no_run_log"
not_observed=$((not_observed+1))
done
fi
if [ "$not_observed" -gt 0 ]; then
log "[ANOMALY] coverage_not_observed=$not_observed (treat as FAIL)"
@@ -97,9 +128,13 @@ while true; do
if [ "$is_weekend" -eq 1 ]; then
# Weekend policy: KR regular session loop must never appear.
check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \
"Market session active: KR|session=KRX_REG|Processing market: Korea Exchange" \
"$latest_run" || forbidden_hits=$((forbidden_hits+1))
if [ -n "$latest_run" ]; then
check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \
"Market session active: KR|session=KRX_REG|Processing market: Korea Exchange" \
"$latest_run" || forbidden_hits=$((forbidden_hits+1))
else
log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=no_run_log"
fi
else
log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=weekday"
fi