## #413 — runtime_verify_monitor.sh pipefail fix
- find_live_pids() now captures pgrep output via local variable with || true so pipefail never triggers on no-match (pgrep exit 1)
- Regression test added: monitor survives MAX_LOOPS=1 without crash

## #412 — startup crash logging improvement
- Add asyncio.CancelledError catch in sync_positions_from_broker call so BaseException-level cancellations are logged before propagating
- Provides evidence in run log if CancelledError causes future startup aborts

## #414 — PR governance preflight
- validate_pr_body.py: add REQ-ID/TASK-ID/TEST-ID pattern checks (--no-governance flag to skip)
- docs/workflow.md: new "PR Governance Preflight (Mandatory)" section
- docs/commands.md: "PR Body Governance Preflight" section before tea pulls create
- Tests: 4 new governance traceability tests in test_validate_pr_body.py

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
204 lines
6.3 KiB
Python
204 lines
6.3 KiB
Python
from __future__ import annotations
|
|
|
|
import os
|
|
import signal
|
|
import socket
|
|
import subprocess
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
# Directory two levels above this test module (the repository checkout root).
REPO_ROOT = Path(__file__).resolve().parent.parent
# Shell entry points that the tests in this module execute via bash.
RUN_OVERNIGHT = REPO_ROOT.joinpath("scripts", "run_overnight.sh")
RUNTIME_MONITOR = REPO_ROOT.joinpath("scripts", "runtime_verify_monitor.sh")
|
|
|
|
|
|
def _latest_runtime_log(log_dir: Path) -> str:
    """Return the contents of the last ``runtime_verify_*.log`` in *log_dir*.

    "Last" means the greatest path in sort order among the matching files.
    An ``AssertionError`` is raised when no matching log file exists.
    """
    candidates = list(log_dir.glob("runtime_verify_*.log"))
    assert candidates, "runtime monitor did not produce log output"
    newest = max(candidates)
    return newest.read_text(encoding="utf-8")
|
|
|
|
|
|
def test_runtime_verify_monitor_detects_live_process_without_pid_files(tmp_path: Path) -> None:
    """Run one monitor loop next to a decoy "live" process and check the log.

    The log directory starts empty (no pid files). The monitor must exit 0
    and its log must report ``app_alive=1``, a process-liveness coverage
    PASS line, and no ``[ANOMALY]`` entries.
    """
    log_dir = tmp_path / "overnight"
    log_dir.mkdir(parents=True, exist_ok=True)

    # `exec -a` replaces the shell with `sleep 10` while setting its argv[0]
    # to the quoted name; presumably this is what the monitor's process
    # lookup matches on (verify against the monitor script).
    decoy = subprocess.Popen(
        ["bash", "-lc", 'exec -a "src.main --mode=live" sleep 10'],
        cwd=REPO_ROOT,
    )
    try:
        monitor_env = {
            **os.environ,
            "ROOT_DIR": str(REPO_ROOT),
            "LOG_DIR": str(log_dir),
            "INTERVAL_SEC": "1",
            "MAX_HOURS": "1",
            "MAX_LOOPS": "1",
            "POLICY_TZ": "UTC",
        }
        result = subprocess.run(
            ["bash", str(RUNTIME_MONITOR)],
            cwd=REPO_ROOT,
            env=monitor_env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=False,
        )
        assert result.returncode == 0, result.stderr

        log_text = _latest_runtime_log(log_dir)
        assert "app_alive=1" in log_text
        assert "[COVERAGE] LIVE_MODE=PASS source=process_liveness" in log_text
        assert "[ANOMALY]" not in log_text
    finally:
        # Always reap the decoy, whether or not the assertions passed.
        decoy.terminate()
        decoy.wait(timeout=5)
|
|
|
|
|
|
def test_run_overnight_fails_fast_when_dashboard_port_in_use(tmp_path: Path) -> None:
    """run_overnight.sh must exit non-zero when DASHBOARD_PORT is occupied.

    The test occupies an ephemeral loopback port with a listening socket,
    passes that port to the script via ``DASHBOARD_PORT``, and expects a
    non-zero exit code together with an "already in use" message on stdout
    or stderr.
    """
    log_dir = tmp_path / "overnight"
    log_dir.mkdir(parents=True, exist_ok=True)

    # The context manager closes the socket on every exit path, including a
    # failure in bind()/listen() before the script is ever launched.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("127.0.0.1", 0))  # port 0: let the OS pick a free port
        sock.listen(1)
        port = sock.getsockname()[1]

        env = os.environ.copy()
        env.update(
            {
                "LOG_DIR": str(log_dir),
                "TMUX_AUTO": "false",
                "DASHBOARD_PORT": str(port),
            }
        )
        completed = subprocess.run(
            ["bash", str(RUN_OVERNIGHT)],
            cwd=REPO_ROOT,
            env=env,
            capture_output=True,
            text=True,
            check=False,
        )
        assert completed.returncode != 0
        output = f"{completed.stdout}\n{completed.stderr}"
        assert "already in use" in output
|
|
|
|
|
|
def test_run_overnight_writes_live_pid_and_watchdog_pid(tmp_path: Path) -> None:
    """run_overnight.sh must write pid files that point at running processes.

    The script is launched with ``sleep 10`` as the app command. It must
    exit 0 and leave ``app.pid`` and ``watchdog.pid`` in the log directory,
    each holding the pid of a process that currently exists.

    Cleanup runs in ``finally`` so the spawned processes are terminated even
    when an assertion, a pid-file read, or a liveness probe fails.
    """
    log_dir = tmp_path / "overnight"
    log_dir.mkdir(parents=True, exist_ok=True)
    app_pid_file = log_dir / "app.pid"
    watchdog_pid_file = log_dir / "watchdog.pid"

    env = os.environ.copy()
    env.update(
        {
            "LOG_DIR": str(log_dir),
            "TMUX_AUTO": "false",
            "STARTUP_GRACE_SEC": "1",
            "CHECK_INTERVAL": "2",
            "APP_CMD_BIN": "sleep",
            "APP_CMD_ARGS": "10",
        }
    )
    try:
        completed = subprocess.run(
            ["bash", str(RUN_OVERNIGHT)],
            cwd=REPO_ROOT,
            env=env,
            capture_output=True,
            text=True,
            check=False,
        )
        assert completed.returncode == 0, f"{completed.stdout}\n{completed.stderr}"

        app_pid = int(app_pid_file.read_text(encoding="utf-8").strip())
        watchdog_pid = int(watchdog_pid_file.read_text(encoding="utf-8").strip())

        # Signal 0 delivers nothing; it only raises ProcessLookupError when
        # the target pid does not exist.
        os.kill(app_pid, 0)
        os.kill(watchdog_pid, 0)
    finally:
        # Watchdog first, then the app (same order as before; presumably so
        # the watchdog cannot react to the app going away).
        for pid_file in (watchdog_pid_file, app_pid_file):
            try:
                pid = int(pid_file.read_text(encoding="utf-8").strip())
            except (OSError, ValueError):
                # Pid file missing or unparsable: nothing we can clean up.
                continue
            if pid <= 0:
                # Never signal pid 0 or negative pids (process groups).
                continue
            try:
                os.kill(pid, signal.SIGTERM)
            except ProcessLookupError:
                pass
|
|
|
|
|
|
def test_run_overnight_fails_when_process_exits_before_grace_period(tmp_path: Path) -> None:
    """run_overnight.sh must report a startup failure for an app that exits at once.

    The app command is ``false``. The script must exit non-zero and print
    "startup failed:" on stdout or stderr. If a ``watchdog.pid`` file was
    written, the pid in it must no longer refer to an existing process.
    """
    log_dir = tmp_path / "overnight"
    log_dir.mkdir(parents=True, exist_ok=True)

    launch_env = {
        **os.environ,
        "LOG_DIR": str(log_dir),
        "TMUX_AUTO": "false",
        "STARTUP_GRACE_SEC": "1",
        "APP_CMD_BIN": "false",
    }
    result = subprocess.run(
        ["bash", str(RUN_OVERNIGHT)],
        cwd=REPO_ROOT,
        env=launch_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        check=False,
    )
    assert result.returncode != 0
    combined = f"{result.stdout}\n{result.stderr}"
    assert "startup failed:" in combined

    pid_file = log_dir / "watchdog.pid"
    if not pid_file.exists():
        # No watchdog pid was recorded, so there is nothing further to check.
        return

    recorded_pid = int(pid_file.read_text(encoding="utf-8").strip())
    # Probing with signal 0 must fail: the recorded watchdog must be gone.
    with pytest.raises(ProcessLookupError):
        os.kill(recorded_pid, 0)
|
|
|
|
|
|
def test_runtime_verify_monitor_survives_when_no_live_pid(tmp_path: Path) -> None:
    """Regression test for #413: a pgrep miss must not abort the monitor loop.

    Under ``set -euo pipefail`` a pgrep that matches nothing exits 1, which
    would abort the whole script through the pipefail mechanism. The fix
    captures pgrep output in a local variable with ``|| true`` so pipefail
    is never triggered.

    This test checks that the script (1) exits 0 after ``MAX_LOOPS=1`` and
    (2) writes a HEARTBEAT entry. Whether live_pids ends up as 'none'
    depends on what happens to be running on the machine; in either case
    the script must not crash.
    """
    log_dir = tmp_path / "overnight"
    log_dir.mkdir(parents=True, exist_ok=True)

    monitor_env = {
        **os.environ,
        "ROOT_DIR": str(REPO_ROOT),
        "LOG_DIR": str(log_dir),
        "INTERVAL_SEC": "1",
        "MAX_HOURS": "1",
        "MAX_LOOPS": "1",
        "POLICY_TZ": "UTC",
    }
    result = subprocess.run(
        ["bash", str(RUNTIME_MONITOR)],
        cwd=REPO_ROOT,
        env=monitor_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        check=False,
    )
    assert result.returncode == 0, (
        f"monitor exited non-zero (pipefail regression?): {result.stderr}"
    )

    log_text = _latest_runtime_log(log_dir)
    assert "[INFO] runtime verify monitor started" in log_text
    assert "[HEARTBEAT]" in log_text, "monitor did not complete a heartbeat cycle"
    # No assertion on live_pids: both 'none' (no match) and a real pid are
    # acceptable. What matters is that the loop finished without a pipefail
    # abort.
|