diff --git a/skills/openclaw-native/heartbeat-governor/SKILL.md b/skills/openclaw-native/heartbeat-governor/SKILL.md new file mode 100644 index 0000000..5d5dc52 --- /dev/null +++ b/skills/openclaw-native/heartbeat-governor/SKILL.md @@ -0,0 +1,105 @@ +--- +name: heartbeat-governor +version: "1.0" +category: openclaw-native +description: Enforces per-skill execution budgets for scheduled cron skills — pauses runaway skills that exceed their token or wall-clock budget before they drain your monthly API allowance. +stateful: true +cron: "0 * * * *" +--- + +# Heartbeat Governor + +## What it does + +Cron skills run autonomously. A skill with a bug — an infinite retry, an unexpectedly large context, a model call inside a loop — can silently consume hundreds of dollars before you notice. + +Heartbeat Governor tracks cumulative execution cost and wall-clock time per scheduled skill on a rolling 30-day basis. When a skill exceeds its budget, the governor pauses it and sends an alert. The skill won't fire again until you explicitly review and resume it. + +It runs every hour to catch budget overruns within one cron cycle. 
+ +## When to invoke + +- Automatically, every hour (cron) +- Manually after noticing an unexpected API bill spike +- When a cron skill has been running unusually long + +## Budget types + +| Budget type | Default | Configurable | +|---|---|---| +| `max_usd_monthly` | $5.00 | Yes, per skill | +| `max_usd_per_run` | $0.50 | Yes, per skill | +| `max_wall_minutes` | 30 | Yes, per skill | +| `max_runs_daily` | 48 | Yes, per skill | + +## Actions on budget breach + +| Breach type | Action | +|---|---| +| `monthly_usd` exceeded | Pause skill, log breach, alert | +| `per_run_usd` exceeded | Abort current run, log breach | +| `wall_clock` exceeded | Abort current run, log breach | +| `daily_runs` exceeded | Skip remaining runs today, log | + +## How to use + +`SKILL` is the name of the cron skill the command applies to. + +```bash +python3 governor.py --status # Show all skills and budget utilisation +python3 governor.py --record SKILL --usd 0.12 --minutes 4 # Record a run +python3 governor.py --pause SKILL # Manually pause a skill +python3 governor.py --resume SKILL # Resume a paused skill after review +python3 governor.py --set-budget SKILL --monthly 10.00 # Override budget +python3 governor.py --check # Run hourly check (called by cron) +python3 governor.py --report # Full monthly spend report +python3 governor.py --status --format json # JSON output (also valid with --check and --report) +``` + +## Cron wakeup behaviour + +Every hour the governor runs `--check`: + +1. Load all skill ledgers from state +2. For each skill with `paused: false`: + - If 30-day rolling spend reaches or exceeds `max_usd_monthly` → `paused: true`, log + - If runs today exceed `max_runs_daily` → skip, log +3. Print summary of paused skills and budget utilisation +4. Save updated state + +## Procedure + +**Step 1 — Set sensible budgets** + +After installing any new cron skill, set its monthly budget: + +```bash +python3 governor.py --set-budget daily-review --monthly 2.00 +python3 governor.py --set-budget morning-briefing --monthly 3.00 +``` + +Defaults are conservative ($5/month) but explicit is better. 
+ +**Step 2 — Monitor utilisation** + +```bash +python3 governor.py --status +``` + +Review the utilisation column. Any skill at or above 80% of its monthly budget warrants investigation. + +**Step 3 — Respond to pause alerts** + +When the governor pauses a skill, investigate why it's over budget: +- Was there a one-time expensive run (large context)? +- Is there a bug causing repeated expensive calls? +- Does the budget simply need to be raised? + +Resume after investigating: +```bash +python3 governor.py --resume SKILL +``` + +Note that a resumed skill whose 30-day spend is still at or over its monthly limit will be paused again by the next hourly check, so raise its budget with `--set-budget` first if it should keep running. + +## State + +Per-skill ledgers and pause flags are stored in `~/.openclaw/skill-state/heartbeat-governor/state.yaml` (under `$OPENCLAW_HOME` instead of `~/.openclaw` when that variable is set). + +Fields: `skill_ledgers` map (each ledger carries its own `paused` flag), `breach_log`, `monthly_summary`. diff --git a/skills/openclaw-native/heartbeat-governor/STATE_SCHEMA.yaml b/skills/openclaw-native/heartbeat-governor/STATE_SCHEMA.yaml new file mode 100644 index 0000000..535e4cf --- /dev/null +++ b/skills/openclaw-native/heartbeat-governor/STATE_SCHEMA.yaml @@ -0,0 +1,41 @@ +version: "1.0" +description: Per-skill execution budgets, spend ledgers, pause flags, and breach log. 
+fields: + skill_ledgers: + type: object + description: Map of skill_name -> budget + rolling spend ledger + items: + budget: + type: object + properties: + max_usd_monthly: { type: float, default: 5.0 } + max_usd_per_run: { type: float, default: 0.5 } + max_wall_minutes: { type: integer, default: 30 } + max_runs_daily: { type: integer, default: 48 } + paused: { type: boolean, default: false } + pause_reason: { type: string } + paused_at: { type: datetime } + runs: + type: list + description: Rolling 30-day run log + items: + ran_at: { type: datetime } + usd_spent: { type: float } + wall_minutes: { type: float } + breach_log: + type: list + description: All budget breach events + items: + skill_name: { type: string } + breach_type: { type: enum, values: [monthly_usd, per_run_usd, wall_clock, daily_runs] } + value: { type: float } + limit: { type: float } + breached_at: { type: datetime } + resolved: { type: boolean } + monthly_summary: + type: object + description: Aggregated spend by skill for current calendar month + items: + skill_name: { type: string } + total_usd: { type: float } + total_runs: { type: integer } diff --git a/skills/openclaw-native/heartbeat-governor/example-state.yaml b/skills/openclaw-native/heartbeat-governor/example-state.yaml new file mode 100644 index 0000000..6a93eb7 --- /dev/null +++ b/skills/openclaw-native/heartbeat-governor/example-state.yaml @@ -0,0 +1,64 @@ +# Example runtime state for heartbeat-governor +skill_ledgers: + morning-briefing: + budget: + max_usd_monthly: 4.00 + max_usd_per_run: 0.30 + max_wall_minutes: 15 + max_runs_daily: 1 + paused: false + pause_reason: null + paused_at: null + runs: + - ran_at: "2026-03-15T07:00:05.000000" + usd_spent: 0.18 + wall_minutes: 6.2 + - ran_at: "2026-03-14T07:00:03.000000" + usd_spent: 0.21 + wall_minutes: 7.1 + long-running-task-management: + budget: + max_usd_monthly: 5.00 + max_usd_per_run: 0.50 + max_wall_minutes: 30 + max_runs_daily: 96 + paused: true + pause_reason: "30-day spend 
$5.12 reached monthly limit $5.00" + paused_at: "2026-03-15T08:00:00.000000" + runs: [] # run entries omitted for brevity; they total $5.12 + cron-hygiene: + budget: + max_usd_monthly: 1.00 + max_usd_per_run: 0.10 + max_wall_minutes: 10 + max_runs_daily: 2 + paused: false + pause_reason: null + paused_at: null + runs: + - ran_at: "2026-03-10T09:00:07.000000" + usd_spent: 0.07 + wall_minutes: 2.1 +breach_log: + - skill_name: long-running-task-management + breach_type: monthly_usd + value: 5.12 + limit: 5.00 + breached_at: "2026-03-15T08:00:00.000000" + resolved: false +monthly_summary: {} +# ── Walkthrough ────────────────────────────────────────────────────────────── +# Hourly cron runs: python3 governor.py --check +# +# Heartbeat Governor — 2026-03-15 08:00 +# ────────────────────────────────────────────────────────────── +# ⏸ Paused: long-running-task-management +# +# python3 governor.py --status +# Skill Spend Budget % Status +# cron-hygiene $0.07 $1.00 7% ✓ +# long-running-task-management $5.12 $5.00 102% ⏸ PAUSED +# morning-briefing $0.39 $4.00 9% ✓ +# +# python3 governor.py --resume long-running-task-management +# ✓ Resumed 'long-running-task-management'. Will fire on next scheduled run. diff --git a/skills/openclaw-native/heartbeat-governor/governor.py b/skills/openclaw-native/heartbeat-governor/governor.py new file mode 100755 index 0000000..39664a6 --- /dev/null +++ b/skills/openclaw-native/heartbeat-governor/governor.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python3 +""" +Heartbeat Governor for openclaw-superpowers. + +Enforces per-skill execution budgets for cron skills. +Pauses runaway skills before they drain your monthly API allowance. 
"""
Heartbeat Governor for openclaw-superpowers.

Enforces per-skill execution budgets for cron skills.
Pauses runaway skills before they drain your monthly API allowance.

Usage:
    python3 governor.py --check                      # Hourly cron check
    python3 governor.py --status                     # Current utilisation
    python3 governor.py --record SKILL --usd 0.12 --minutes 4
    python3 governor.py --pause SKILL                # Manual pause
    python3 governor.py --resume SKILL               # Resume after review
    python3 governor.py --set-budget SKILL --monthly 10.00 [--per-run 1.00]
    python3 governor.py --report                     # Monthly spend report
    python3 governor.py --status --format json       # JSON (--check/--status/--report)
"""

import argparse
import json
import os
import sys
import tempfile
from datetime import date, datetime, timedelta
from pathlib import Path
from typing import Optional

try:
    import yaml
    HAS_YAML = True
except ImportError:
    HAS_YAML = False

OPENCLAW_DIR = Path(os.environ.get("OPENCLAW_HOME", Path.home() / ".openclaw"))
STATE_FILE = OPENCLAW_DIR / "skill-state" / "heartbeat-governor" / "state.yaml"

DEFAULT_BUDGET = {
    "max_usd_monthly": 5.0,
    "max_usd_per_run": 0.50,
    "max_wall_minutes": 30,
    "max_runs_daily": 48,
}
ROLLING_DAYS = 30
MAX_BREACH_LOG = 200


class GovernorError(Exception):
    """A user-facing failure; `main` reports it on stderr and exits with 1."""


class StateError(GovernorError):
    """The state file could not be read or written."""


# ── State helpers ─────────────────────────────────────────────────────────────

def _empty_state() -> dict:
    """Return a fresh state mapping with every top-level key present."""
    return {"skill_ledgers": {}, "breach_log": [], "monthly_summary": {}}


def load_state() -> dict:
    """Load the governor state from STATE_FILE.

    Returns an empty state when the file does not exist or is empty.

    Raises:
        StateError: if the file exists but cannot be parsed (PyYAML missing,
            I/O error, invalid YAML, or a non-mapping document). Failing here
            is deliberate: returning an empty state would let the next save
            overwrite the real ledger with nothing.
    """
    if not STATE_FILE.exists():
        return _empty_state()
    if not HAS_YAML:
        raise StateError(
            f"PyYAML is required to read {STATE_FILE} (pip install pyyaml)")
    try:
        loaded = yaml.safe_load(STATE_FILE.read_text(encoding="utf-8"))
    except (OSError, ValueError, yaml.YAMLError) as exc:
        raise StateError(f"cannot read state file {STATE_FILE}: {exc}") from exc
    if loaded is None:
        return _empty_state()
    if not isinstance(loaded, dict):
        raise StateError(
            f"state file {STATE_FILE} does not contain a mapping at top level")
    return loaded


def save_state(state: dict) -> None:
    """Persist *state* to STATE_FILE atomically.

    The document is written to a temporary file in the same directory and
    then moved over STATE_FILE, so an interrupted write cannot truncate the
    existing ledger.

    Raises:
        StateError: if PyYAML is not installed; silently skipping the write
            would let callers report success for changes that were lost.
    """
    if not HAS_YAML:
        raise StateError(
            "PyYAML is required to persist state (pip install pyyaml)")
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    fd, tmp_name = tempfile.mkstemp(
        dir=str(STATE_FILE.parent), prefix=STATE_FILE.name + ".", suffix=".tmp")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            yaml.dump(state, f, default_flow_style=False, allow_unicode=True)
        os.replace(tmp_name, STATE_FILE)
    except BaseException:
        # Do not leave a stray temp file behind on any failure.
        try:
            os.unlink(tmp_name)
        except OSError:
            pass
        raise


# ── Ledger helpers ────────────────────────────────────────────────────────────

def get_ledger(state: dict, skill_name: str) -> dict:
    """Return the ledger for *skill_name*, creating or repairing it as needed.

    Tolerates hand-edited state in which `skill_ledgers`, the ledger itself,
    its `budget` or its `runs` are null or missing.
    """
    ledgers = state.get("skill_ledgers")
    if not isinstance(ledgers, dict):
        ledgers = state["skill_ledgers"] = {}
    ledger = ledgers.get(skill_name)
    if not isinstance(ledger, dict):
        ledger = ledgers[skill_name] = {}
    if not isinstance(ledger.get("budget"), dict):
        ledger["budget"] = dict(DEFAULT_BUDGET)
    ledger.setdefault("paused", False)
    ledger.setdefault("pause_reason", None)
    ledger.setdefault("paused_at", None)
    if not isinstance(ledger.get("runs"), list):
        ledger["runs"] = []
    return ledger


def prune_old_runs(runs: list) -> list:
    """Return the runs whose `ran_at` lies within the last ROLLING_DAYS days.

    Entries that are not mappings or whose timestamp cannot be parsed are
    dropped.
    """
    cutoff = datetime.now() - timedelta(days=ROLLING_DAYS)
    return [r for r in runs
            if isinstance(r, dict) and _parse_dt(r.get("ran_at", "")) >= cutoff]


def _parse_dt(s) -> datetime:
    """Convert a stored timestamp to a naive local `datetime`.

    Accepts ISO-8601 strings as well as `datetime`/`date` objects (PyYAML
    yields those for unquoted timestamps). Timezone-aware values are
    converted to local time and made naive so they can be compared with
    `datetime.now()`. Returns `datetime.min` for anything unparseable.
    """
    if isinstance(s, datetime):
        parsed = s
    elif isinstance(s, date):
        parsed = datetime(s.year, s.month, s.day)
    else:
        try:
            parsed = datetime.fromisoformat(s)
        except (TypeError, ValueError):
            return datetime.min
    if parsed.tzinfo is not None:
        parsed = parsed.astimezone().replace(tzinfo=None)
    return parsed


def rolling_usd(runs: list) -> float:
    """Sum `usd_spent` over *runs*; a missing or null amount counts as 0."""
    return sum(r.get("usd_spent") or 0 for r in runs)


def runs_today(runs: list) -> int:
    """Count the runs whose `ran_at` falls on today's local date."""
    today = datetime.now().date()
    return sum(1 for r in runs if _parse_dt(r.get("ran_at", "")).date() == today)


def add_breach(state: dict, skill_name: str, breach_type: str,
               value: float, limit: float) -> None:
    """Append an unresolved breach entry, keeping the newest MAX_BREACH_LOG."""
    breach_log = state.get("breach_log")
    if not isinstance(breach_log, list):
        breach_log = []
    breach_log.append({
        "skill_name": skill_name,
        "breach_type": breach_type,
        "value": round(value, 4),
        "limit": round(limit, 4),
        "breached_at": datetime.now().isoformat(),
        "resolved": False,
    })
    state["breach_log"] = breach_log[-MAX_BREACH_LOG:]


def _breach_logged_today(state: dict, skill_name: str, breach_type: str) -> bool:
    """Tell whether a breach of this type was already logged for the skill today."""
    today = datetime.now().date()
    return any(
        isinstance(b, dict)
        and b.get("skill_name") == skill_name
        and b.get("breach_type") == breach_type
        and _parse_dt(b.get("breached_at", "")).date() == today
        for b in state.get("breach_log") or []
    )


def pause_skill(state: dict, skill_name: str, reason: str, *,
                quiet: bool = False) -> None:
    """Mark *skill_name* as paused and record why and when.

    With `quiet=True` nothing is printed, so callers that emit
    machine-readable output keep stdout clean.
    """
    ledger = get_ledger(state, skill_name)
    ledger["paused"] = True
    ledger["pause_reason"] = reason
    ledger["paused_at"] = datetime.now().isoformat()
    if not quiet:
        print(f"  ⏸  PAUSED: {skill_name} — {reason}")


def evaluate_budgets(state: dict, *, quiet: bool = False) -> tuple:
    """Check every unpaused skill against its monthly and daily budgets.

    Mutates *state* (prunes old runs, pauses skills, appends breaches) but
    performs no I/O other than the optional pause message.

    Returns:
        `(paused_now, alerts)`: the names of skills paused by this call, and
        a list of alert dicts with keys `skill`, `breach`, `value`, `limit`.
    """
    ledgers = state.get("skill_ledgers") or {}
    paused_now = []
    alerts = []

    for skill_name, ledger in ledgers.items():
        if not isinstance(ledger, dict) or ledger.get("paused"):
            continue

        budget = ledger.get("budget") or DEFAULT_BUDGET
        ledger["runs"] = prune_old_runs(ledger.get("runs") or [])

        # Monthly budget: pause as soon as the allowance is used up, because
        # any further run would overspend.
        monthly_limit = budget.get("max_usd_monthly", DEFAULT_BUDGET["max_usd_monthly"])
        total = rolling_usd(ledger["runs"])
        if total >= monthly_limit:
            reason = f"30-day spend ${total:.2f} reached monthly limit ${monthly_limit:.2f}"
            pause_skill(state, skill_name, reason, quiet=quiet)
            add_breach(state, skill_name, "monthly_usd", total, monthly_limit)
            paused_now.append(skill_name)
            alerts.append({"skill": skill_name, "breach": "monthly_usd",
                           "value": total, "limit": monthly_limit})
            continue

        # Daily runs: running exactly the allowed number of times is not a
        # breach, only exceeding it is.
        daily_limit = budget.get("max_runs_daily", DEFAULT_BUDGET["max_runs_daily"])
        today_runs = runs_today(ledger["runs"])
        if today_runs > daily_limit:
            alerts.append({"skill": skill_name, "breach": "daily_runs",
                           "value": today_runs, "limit": daily_limit})
            # The check runs hourly; log the breach once per day, not 24 times.
            if not _breach_logged_today(state, skill_name, "daily_runs"):
                add_breach(state, skill_name, "daily_runs", today_runs, daily_limit)

    return paused_now, alerts


# ── Commands ──────────────────────────────────────────────────────────────────

def cmd_record(state: dict, skill_name: str, usd: float, minutes: float) -> None:
    """Append one run to the skill's ledger and log any per-run breaches."""
    ledger = get_ledger(state, skill_name)
    ledger["runs"] = prune_old_runs(ledger.get("runs") or [])

    now = datetime.now().isoformat()
    run = {"ran_at": now, "usd_spent": usd, "wall_minutes": minutes}

    # Per-run checks
    budget = ledger.get("budget") or DEFAULT_BUDGET
    per_run_limit = budget.get("max_usd_per_run", DEFAULT_BUDGET["max_usd_per_run"])
    wall_limit = budget.get("max_wall_minutes", DEFAULT_BUDGET["max_wall_minutes"])

    if usd > per_run_limit:
        add_breach(state, skill_name, "per_run_usd", usd, per_run_limit)
        print(f"⚠ {skill_name}: per-run spend ${usd:.2f} exceeds limit ${per_run_limit:.2f}")

    if minutes > wall_limit:
        add_breach(state, skill_name, "wall_clock", minutes, wall_limit)
        print(f"⚠ {skill_name}: wall-clock {minutes:.1f}m exceeds limit {wall_limit}m")

    ledger["runs"].append(run)
    save_state(state)
    print(f"✓ Recorded run for '{skill_name}': ${usd:.4f} in {minutes:.1f}m")


def cmd_check(state: dict, fmt: str) -> None:
    """Hourly cron check — evaluate all skill budgets.

    In JSON mode the only thing written to stdout is the JSON document.
    """
    paused_now, alerts = evaluate_budgets(state, quiet=(fmt == "json"))

    # Persist before reporting so the summary never claims a pause that was
    # not actually saved.
    save_state(state)

    now = datetime.now().isoformat()
    if fmt == "json":
        print(json.dumps({
            "checked_at": now,
            "paused_this_run": paused_now,
            "alerts": alerts,
        }, indent=2))
        return

    print(f"\nHeartbeat Governor — {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    print("─" * 48)
    for name in paused_now:
        print(f"  ⏸  Paused: {name}")
    if not paused_now and not alerts:
        print("  ✓ All skills within budget.")
    for a in alerts:
        if a["breach"] == "daily_runs":
            print(f"  ⚠  {a['skill']}: {int(a['value'])} runs today "
                  f"(limit {int(a['limit'])})")
    print()


def cmd_status(state: dict, fmt: str) -> None:
    """Print 30-day spend versus monthly budget for every skill (read-only)."""
    ledgers = state.get("skill_ledgers") or {}
    rows = []
    for skill_name, ledger in sorted(ledgers.items()):
        if not isinstance(ledger, dict):
            continue
        budget = ledger.get("budget") or DEFAULT_BUDGET
        ledger["runs"] = prune_old_runs(ledger.get("runs") or [])
        total = rolling_usd(ledger["runs"])
        monthly_limit = budget.get("max_usd_monthly", DEFAULT_BUDGET["max_usd_monthly"])
        # Truncated, not rounded: 9.75% is reported as 9%.
        pct = int(100 * total / monthly_limit) if monthly_limit else 0
        rows.append({
            "skill": skill_name,
            "paused": ledger.get("paused", False),
            "monthly_usd": round(total, 4),
            "monthly_limit": monthly_limit,
            "pct": pct,
        })

    if fmt == "json":
        print(json.dumps(rows, indent=2))
        return

    print("\nHeartbeat Governor — Skill Budget Status")
    print("─" * 55)
    print(f"  {'Skill':30s}  {'Spend':>7s}  {'Budget':>7s}  {'%':>4s}  Status")
    for r in rows:
        status = "⏸ PAUSED" if r["paused"] else ("⚠" if r["pct"] >= 80 else "✓")
        print(f"  {r['skill']:30s}  ${r['monthly_usd']:>6.2f}  "
              f"${r['monthly_limit']:>6.2f}  {r['pct']:>3d}%  {status}")
    print()


def cmd_pause(state: dict, skill_name: str) -> None:
    """Manually pause a skill and persist the change."""
    pause_skill(state, skill_name, "Manual pause")
    save_state(state)


def cmd_resume(state: dict, skill_name: str) -> None:
    """Clear a skill's pause flag after review.

    Raises:
        GovernorError: if no ledger exists for *skill_name*, so a mistyped
            name does not create an empty phantom ledger.
    """
    ledgers = state.get("skill_ledgers") or {}
    if skill_name not in ledgers:
        raise GovernorError(f"unknown skill '{skill_name}' — nothing to resume")

    ledger = get_ledger(state, skill_name)
    ledger["paused"] = False
    ledger["pause_reason"] = None
    ledger["paused_at"] = None

    # NOTE(review): treats an explicit resume as resolving the monthly-budget
    # breaches that caused the pause — confirm this is what `resolved` means.
    for breach in state.get("breach_log") or []:
        if (isinstance(breach, dict)
                and breach.get("skill_name") == skill_name
                and breach.get("breach_type") == "monthly_usd"):
            breach["resolved"] = True

    save_state(state)
    print(f"✓ Resumed '{skill_name}'. Will fire on next scheduled run.")

    # The hourly check re-evaluates the rolling spend, so warn when the
    # resume will not stick.
    budget = ledger.get("budget") or DEFAULT_BUDGET
    monthly_limit = budget.get("max_usd_monthly", DEFAULT_BUDGET["max_usd_monthly"])
    spent = rolling_usd(prune_old_runs(ledger.get("runs") or []))
    if spent >= monthly_limit:
        print(f"⚠ '{skill_name}' is still at ${spent:.2f} of ${monthly_limit:.2f}; "
              f"the next --check will pause it again unless the budget is raised.")


def cmd_set_budget(state: dict, skill_name: str,
                   monthly: Optional[float] = None,
                   per_run: Optional[float] = None,
                   wall_minutes: Optional[int] = None,
                   daily_runs: Optional[int] = None) -> None:
    """Override one or more budget limits for a skill; `None` leaves a limit as is.

    Raises:
        GovernorError: if every limit is `None`, i.e. there is nothing to change.
    """
    updates = {
        "max_usd_monthly": monthly,
        "max_usd_per_run": per_run,
        "max_wall_minutes": wall_minutes,
        "max_runs_daily": daily_runs,
    }
    if all(value is None for value in updates.values()):
        raise GovernorError(
            "nothing to update: pass at least one of --monthly, --per-run, "
            "--wall-minutes, --daily-runs")

    # get_ledger guarantees `budget` is a dict, even if the file had it as null.
    budget = get_ledger(state, skill_name)["budget"]
    for key, value in updates.items():
        if value is not None:
            budget[key] = value
    save_state(state)
    print(f"✓ Budget updated for '{skill_name}': {budget}")


def cmd_report(state: dict, fmt: str) -> None:
    """Print per-skill run count and spend for the current calendar month."""
    ledgers = state.get("skill_ledgers") or {}
    # Zero the microseconds too; otherwise runs in the first second of the
    # month can compare as earlier than month_start and be dropped.
    month_start = datetime.now().replace(day=1, hour=0, minute=0, second=0,
                                         microsecond=0)
    rows = []
    for skill_name, ledger in sorted(ledgers.items()):
        if not isinstance(ledger, dict):
            continue
        runs = [r for r in (ledger.get("runs") or [])
                if isinstance(r, dict)
                and _parse_dt(r.get("ran_at", "")) >= month_start]
        total = rolling_usd(runs)
        rows.append({"skill": skill_name, "runs": len(runs),
                     "total_usd": round(total, 4)})

    grand_total = sum(r["total_usd"] for r in rows)

    if fmt == "json":
        print(json.dumps({"rows": rows, "grand_total_usd": round(grand_total, 4)},
                         indent=2))
        return

    print(f"\nMonthly Spend Report — {datetime.now().strftime('%B %Y')}")
    print("─" * 48)
    for r in rows:
        print(f"  {r['skill']:35s}  {r['runs']:3d} runs  ${r['total_usd']:.4f}")
    print(f"  {'TOTAL':35s}           ${grand_total:.4f}")
    print()


# ── Main ──────────────────────────────────────────────────────────────────────

def _non_negative(cast):
    """Build an argparse `type=` callable that rejects negative or NaN values."""
    def parse(text: str):
        try:
            value = cast(text)
        except ValueError:
            raise argparse.ArgumentTypeError(
                f"invalid {cast.__name__} value: {text!r}")
        if value != value or value < 0:  # value != value catches NaN
            raise argparse.ArgumentTypeError(f"must be >= 0, got {text!r}")
        return value
    return parse


def main(argv=None) -> None:
    """Parse the command line and dispatch to the matching command.

    Args:
        argv: argument list to parse; defaults to `sys.argv[1:]`.

    Exits with status 1 and a message on stderr when a command raises
    GovernorError (unreadable state, unknown skill, nothing to update).
    """
    parser = argparse.ArgumentParser(description="Heartbeat Governor")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--check", action="store_true",
                       help="Hourly budget check (cron entry point)")
    group.add_argument("--status", action="store_true")
    group.add_argument("--record", metavar="SKILL")
    group.add_argument("--pause", metavar="SKILL")
    group.add_argument("--resume", metavar="SKILL")
    group.add_argument("--set-budget", metavar="SKILL")
    group.add_argument("--report", action="store_true")
    parser.add_argument("--usd", type=_non_negative(float), default=0.0)
    parser.add_argument("--minutes", type=_non_negative(float), default=0.0)
    parser.add_argument("--monthly", type=_non_negative(float))
    parser.add_argument("--per-run", type=_non_negative(float))
    parser.add_argument("--wall-minutes", type=_non_negative(int))
    parser.add_argument("--daily-runs", type=_non_negative(int))
    parser.add_argument("--format", choices=["text", "json"], default="text")
    args = parser.parse_args(argv)

    # An empty SKILL would otherwise match no branch below and do nothing.
    skill_args = (args.record, args.pause, args.resume, args.set_budget)
    if any(name is not None and not name.strip() for name in skill_args):
        parser.error("SKILL must not be empty")

    try:
        state = load_state()

        if args.check:
            cmd_check(state, args.format)
        elif args.status:
            cmd_status(state, args.format)
        elif args.record is not None:
            cmd_record(state, args.record, args.usd, args.minutes)
        elif args.pause is not None:
            cmd_pause(state, args.pause)
        elif args.resume is not None:
            cmd_resume(state, args.resume)
        elif args.set_budget is not None:
            cmd_set_budget(state, args.set_budget, args.monthly, args.per_run,
                           args.wall_minutes, args.daily_runs)
        elif args.report:
            cmd_report(state, args.format)
    except GovernorError as exc:
        print(f"governor: {exc}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()