fix: runtime anomaly handling for overnight startup and monitor (#396 #397) #404

Merged
jihoson merged 6 commits from feature/issue-396-397-runtime-anomaly-fixes into main 2026-03-04 02:46:38 +09:00
3 changed files with 61 additions and 7 deletions
Showing only changes of commit 3cde8779fa - Show all commits

View File

@@ -36,7 +36,20 @@ WATCHDOG_PID_FILE="$LOG_DIR/watchdog.pid"
is_port_in_use() {
local port="$1"
ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]"
if command -v ss >/dev/null 2>&1; then
ss -ltn 2>/dev/null | grep -Eq ":${port}[[:space:]]"
return $?
fi
if command -v lsof >/dev/null 2>&1; then
lsof -nP -iTCP:"$port" -sTCP:LISTEN >/dev/null 2>&1
return $?
fi
if command -v netstat >/dev/null 2>&1; then
netstat -ltn 2>/dev/null | grep -Eq "[:.]${port}[[:space:]]"
return $?
fi
# No supported socket inspection command found.
return 1
}
if [ -f "$PID_FILE" ]; then
@@ -53,7 +66,8 @@ if [[ "$APP_CMD" == *"--dashboard"* ]] && is_port_in_use "$dashboard_port"; then
exit 1
fi
# `env` keeps inline VAR=value prefixes in APP_CMD working with `exec`.
# `APP_CMD` is treated as a shell command string.
# If executable paths include spaces, they must be quoted inside APP_CMD.
nohup bash -lc "exec env $APP_CMD" >>"$RUN_LOG" 2>&1 &
app_pid=$!
echo "$app_pid" > "$PID_FILE"

View File

@@ -9,6 +9,7 @@ INTERVAL_SEC="${INTERVAL_SEC:-60}"
MAX_HOURS="${MAX_HOURS:-24}"
MAX_LOOPS="${MAX_LOOPS:-0}"
POLICY_TZ="${POLICY_TZ:-Asia/Seoul}"
DASHBOARD_PORT="${DASHBOARD_PORT:-8080}"
cd "$ROOT_DIR"
@@ -79,10 +80,16 @@ while true; do
if [ "$app_alive" -eq 0 ] && [ -n "$live_pids" ]; then
app_alive=1
fi
ss -ltnp 2>/dev/null | rg -q ':8080' && port_alive=1
log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port8080=$port_alive live_pids=${live_pids:-none}"
ss -ltnp 2>/dev/null | rg -q ":${DASHBOARD_PORT}\\b" && port_alive=1
log "[HEARTBEAT] run_log=${latest_run:-none} app_alive=$app_alive watchdog_alive=$wd_alive port=${DASHBOARD_PORT} alive=$port_alive live_pids=${live_pids:-none}"
if [ -z "$latest_run" ]; then
defer_log_checks=0
if [ -z "$latest_run" ] && [ "$app_alive" -eq 1 ]; then
defer_log_checks=1
log "[INFO] run log not yet available; defer log-based coverage checks"
fi
if [ -z "$latest_run" ] && [ "$defer_log_checks" -eq 0 ]; then
log "[ANOMALY] no run log found"
fi
@@ -98,7 +105,11 @@ while true; do
not_observed=$((not_observed+1))
fi
fi
if [ -n "$latest_run" ]; then
if [ "$defer_log_checks" -eq 1 ]; then
for deferred in KR_LOOP NXT_PATH US_PRE_PATH US_DAY_PATH US_AFTER_PATH ORDER_POLICY_SESSION; do
log "[COVERAGE] ${deferred}=DEFERRED reason=no_run_log_process_alive"
done
elif [ -n "$latest_run" ]; then
check_signal "KR_LOOP" "Processing market: Korea Exchange" "$latest_run" || not_observed=$((not_observed+1))
check_signal "NXT_PATH" "NXT_PRE|NXT_AFTER|session=NXT_" "$latest_run" || not_observed=$((not_observed+1))
check_signal "US_PRE_PATH" "US_PRE|session=US_PRE" "$latest_run" || not_observed=$((not_observed+1))
@@ -126,7 +137,9 @@ while true; do
is_weekend=1
fi
if [ "$is_weekend" -eq 1 ]; then
if [ "$defer_log_checks" -eq 1 ]; then
log "[FORBIDDEN] WEEKEND_KR_SESSION_ACTIVE=SKIP reason=no_run_log_process_alive"
elif [ "$is_weekend" -eq 1 ]; then
# Weekend policy: KR regular session loop must never appear.
if [ -n "$latest_run" ]; then
check_forbidden "WEEKEND_KR_SESSION_ACTIVE" \

View File

@@ -50,6 +50,7 @@ def test_runtime_verify_monitor_detects_live_process_without_pid_files(tmp_path:
log_text = _latest_runtime_log(log_dir)
assert "app_alive=1" in log_text
assert "[COVERAGE] LIVE_MODE=PASS source=process_liveness" in log_text
assert "[ANOMALY]" not in log_text
finally:
fake_live.terminate()
fake_live.wait(timeout=5)
@@ -122,3 +123,29 @@ def test_run_overnight_writes_live_pid_and_watchdog_pid(tmp_path: Path) -> None:
os.kill(pid, signal.SIGTERM)
except ProcessLookupError:
pass
def test_run_overnight_fails_when_process_exits_before_grace_period(tmp_path: Path) -> None:
log_dir = tmp_path / "overnight"
log_dir.mkdir(parents=True, exist_ok=True)
env = os.environ.copy()
env.update(
{
"LOG_DIR": str(log_dir),
"TMUX_AUTO": "false",
"STARTUP_GRACE_SEC": "1",
"APP_CMD": "false",
}
)
completed = subprocess.run(
["bash", str(RUN_OVERNIGHT)],
cwd=REPO_ROOT,
env=env,
capture_output=True,
text=True,
check=False,
)
assert completed.returncode != 0
output = f"{completed.stdout}\n{completed.stderr}"
assert "startup failed:" in output