feat: relay daemon skeleton — queue, dispatch, conversation, ntfy (#1)
First-PR scope from #1. Single-process Python daemon that relays between Claude Code instances and chat-Claude (Anthropic API). Components: * relay.config — .env + config.yaml loader. Auto-generates ntfy topic on first run and persists it back to .env. * relay.state — atomic file I/O via tempfile + rename, advisory flock at state/.lock to enforce single-instance. * relay.conversation — append-only history with summarization. Triggers a summarize call when total chars exceed HISTORY_CHAR_CAP (default 400k); replaces history with the summary plus the most recent 10 turns. * relay.anthropic_client — SDK wrapper. Marks the system prompt cacheable (5-min ephemeral cache); concatenates text blocks; estimates per-call cost from the Anthropic price table with cache-write/read accounted for. * relay.queue — JSON envelope intake; oldest-by-mtime; malformed envelopes moved to queue/.rejected/. * relay.dispatch — one-input-at-a-time per session (dispatch/<session_id>/input.txt). Won't overwrite a pending dispatch; queues internally and waits for CC to delete. * relay.ntfy — best-effort POST to https://ntfy.sh/<topic>; failures logged but never block the main loop. * relay.daemon — main loop. Polls jc_input.txt (priority) then queue/. Detects [NEEDS-JC] in the first 200 chars of any response and pauses dispatch until JC writes jc_input.txt. JC override supports @session-N: prefix for direct dispatch without an API call. * relay.__main__ — CLI: relay run / relay status / relay topic. Tests: 57 unit tests pass (config, state, conversation, queue, dispatch, anthropic_client, ntfy, full daemon loop with a fake client). One real-API smoke test marked real_api, opt-in via pytest -m real_api; skips cleanly on credit-balance errors. Out of scope for this PR (deferred to follow-ups): Flask status endpoint, multi-session config in production, exponential backoff, systemd unit, cost-tracking aggregation. Closes #1. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
0
relay/__init__.py
Normal file
0
relay/__init__.py
Normal file
75
relay/__main__.py
Normal file
75
relay/__main__.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""CLI entry point: ``relay <command>``."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
|
||||
from relay.config import load_settings
|
||||
from relay.daemon import Daemon
|
||||
from relay.logs import configure as configure_logs
|
||||
from relay.ntfy import topic_url
|
||||
from relay.state import read_json
|
||||
|
||||
|
||||
def _cmd_run(args: argparse.Namespace) -> int:
|
||||
settings = load_settings()
|
||||
configure_logs(settings.logs_dir, level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
daemon = Daemon(settings)
|
||||
|
||||
def _stop(signum: int, _frame: object) -> None:
|
||||
logging.getLogger(__name__).info("received signal %s; shutting down", signum)
|
||||
daemon.stop()
|
||||
|
||||
signal.signal(signal.SIGINT, _stop)
|
||||
signal.signal(signal.SIGTERM, _stop)
|
||||
|
||||
try:
|
||||
daemon.run()
|
||||
except RuntimeError as exc:
|
||||
# Most commonly the lock file: another daemon is running.
|
||||
print(f"error: {exc}", file=sys.stderr)
|
||||
return 1
|
||||
return 0
|
||||
|
||||
|
||||
def _cmd_status(_args: argparse.Namespace) -> int:
|
||||
settings = load_settings()
|
||||
status_path = settings.state_dir / "status.json"
|
||||
payload = read_json(status_path, default=None)
|
||||
if payload is None:
|
||||
print("no status file yet — has the daemon run?", file=sys.stderr)
|
||||
return 1
|
||||
print(json.dumps(payload, indent=2, sort_keys=False))
|
||||
return 0
|
||||
|
||||
|
||||
def _cmd_topic(_args: argparse.Namespace) -> int:
|
||||
settings = load_settings()
|
||||
print(topic_url(settings.ntfy_topic))
|
||||
return 0
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
|
||||
parser = argparse.ArgumentParser(prog="relay", description="risv3-relay daemon")
|
||||
sub = parser.add_subparsers(dest="cmd", required=True)
|
||||
|
||||
run = sub.add_parser("run", help="Run the daemon in the foreground")
|
||||
run.add_argument("-v", "--verbose", action="store_true")
|
||||
run.set_defaults(func=_cmd_run)
|
||||
|
||||
status = sub.add_parser("status", help="Print the current daemon status")
|
||||
status.set_defaults(func=_cmd_status)
|
||||
|
||||
topic = sub.add_parser("topic", help="Print the ntfy subscription URL")
|
||||
topic.set_defaults(func=_cmd_topic)
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
return args.func(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
119
relay/anthropic_client.py
Normal file
119
relay/anthropic_client.py
Normal file
@@ -0,0 +1,119 @@
|
||||
"""Anthropic API wrapper.
|
||||
|
||||
Wraps the SDK call so the daemon can send a turn and get back the
|
||||
assistant text + token usage + estimated cost. Implements prompt
|
||||
caching on the system prompt: subsequent calls within the 5-minute TTL
|
||||
get a cache hit on the (large, repeated) system prompt and pay the
|
||||
cheaper cache-hit rate. Per-turn user/assistant content is never marked
|
||||
cacheable because it changes every call.
|
||||
|
||||
Cost estimation uses a model→price table; the table is the source of
|
||||
truth and is easy to update when pricing changes.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
from anthropic import Anthropic
|
||||
from anthropic.types import Message
|
||||
|
||||
# Pricing per 1M tokens (USD), pulled from Anthropic's published schedule.
|
||||
# Cache-hit input is billed at the cache-read rate (~10% of standard input).
|
||||
# Cache-write is ~25% more than standard input. These numbers are
|
||||
# approximate and used only for log-line cost estimation; the source of
|
||||
# truth for billing is Anthropic's invoice.
|
||||
_PRICES_PER_MILLION: dict[str, dict[str, float]] = {
|
||||
"claude-opus-4-7": {"input": 15.0, "output": 75.0, "cache_write": 18.75, "cache_read": 1.50},
|
||||
"claude-opus-4-7-1m": {"input": 15.0, "output": 75.0, "cache_write": 18.75, "cache_read": 1.50},
|
||||
"claude-sonnet-4-6": {"input": 3.0, "output": 15.0, "cache_write": 3.75, "cache_read": 0.30},
|
||||
"claude-haiku-4-5-20251001": {
|
||||
"input": 0.80,
|
||||
"output": 4.0,
|
||||
"cache_write": 1.00,
|
||||
"cache_read": 0.08,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _price_for(model: str) -> dict[str, float]:
|
||||
if model in _PRICES_PER_MILLION:
|
||||
return _PRICES_PER_MILLION[model]
|
||||
# Fallback: charge as Opus 4.7 (worst-case) so estimates don't
|
||||
# under-report for unknown models. Logged once at startup.
|
||||
return _PRICES_PER_MILLION["claude-opus-4-7"]
|
||||
|
||||
|
||||
@dataclass
|
||||
class TurnResult:
|
||||
text: str
|
||||
input_tokens: int
|
||||
output_tokens: int
|
||||
cache_creation_input_tokens: int
|
||||
cache_read_input_tokens: int
|
||||
estimated_cost_usd: float
|
||||
raw: Message
|
||||
|
||||
|
||||
@dataclass
|
||||
class AnthropicClient:
|
||||
api_key: str
|
||||
model: str
|
||||
max_output_tokens: int = 4096
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
self._sdk = Anthropic(api_key=self.api_key)
|
||||
|
||||
def send(
|
||||
self,
|
||||
*,
|
||||
system_prompt: str,
|
||||
messages: list[dict[str, str]],
|
||||
) -> TurnResult:
|
||||
"""Send a single API turn. The system prompt is marked cacheable.
|
||||
|
||||
``messages`` is the full conversation history shaped for the
|
||||
Messages API. The most recent message is the user turn we're
|
||||
responding to. The daemon is responsible for appending the
|
||||
assistant text it gets back into history before the next call.
|
||||
"""
|
||||
|
||||
response = self._sdk.messages.create(
|
||||
model=self.model,
|
||||
max_tokens=self.max_output_tokens,
|
||||
system=[
|
||||
{
|
||||
"type": "text",
|
||||
"text": system_prompt,
|
||||
"cache_control": {"type": "ephemeral"},
|
||||
}
|
||||
],
|
||||
messages=messages,
|
||||
)
|
||||
|
||||
text = "".join(
|
||||
block.text for block in response.content if getattr(block, "type", None) == "text"
|
||||
)
|
||||
usage = response.usage
|
||||
in_tokens = int(getattr(usage, "input_tokens", 0) or 0)
|
||||
out_tokens = int(getattr(usage, "output_tokens", 0) or 0)
|
||||
cache_create = int(getattr(usage, "cache_creation_input_tokens", 0) or 0)
|
||||
cache_read = int(getattr(usage, "cache_read_input_tokens", 0) or 0)
|
||||
|
||||
prices = _price_for(self.model)
|
||||
cost = (
|
||||
in_tokens * prices["input"]
|
||||
+ out_tokens * prices["output"]
|
||||
+ cache_create * prices["cache_write"]
|
||||
+ cache_read * prices["cache_read"]
|
||||
) / 1_000_000.0
|
||||
|
||||
return TurnResult(
|
||||
text=text,
|
||||
input_tokens=in_tokens,
|
||||
output_tokens=out_tokens,
|
||||
cache_creation_input_tokens=cache_create,
|
||||
cache_read_input_tokens=cache_read,
|
||||
estimated_cost_usd=cost,
|
||||
raw=response,
|
||||
)
|
||||
163
relay/config.py
Normal file
163
relay/config.py
Normal file
@@ -0,0 +1,163 @@
|
||||
"""Settings + config.yaml loader.
|
||||
|
||||
The relay reads two config sources:
|
||||
|
||||
1. ``.env`` for secrets (API key) and per-host overrides (status port,
|
||||
history cap). The ntfy topic is auto-generated on first run and
|
||||
written back to ``.env`` so it persists across restarts.
|
||||
|
||||
2. ``config.yaml`` for project-level settings: the registered CC
|
||||
sessions, the system prompt, the summarization prompt. First-PR
|
||||
default config is generated if the file doesn't exist, with one
|
||||
``session-1`` registered.
|
||||
|
||||
Settings are read once at startup; the daemon does not hot-reload them.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import secrets
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
from dotenv import dotenv_values, set_key
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||
ENV_FILE = REPO_ROOT / ".env"
|
||||
CONFIG_FILE = REPO_ROOT / "config.yaml"
|
||||
|
||||
DEFAULT_SYSTEM_PROMPT = (
|
||||
"You are the chat-side counterpart to Claude Code instances working on the "
|
||||
"risv3 project. CC sends you its progress; you respond as a project lead "
|
||||
"would: check decisions, ask clarifying questions, approve or correct. "
|
||||
"When a CC turn raises a question only JC (the human owner) can answer, "
|
||||
"begin your response with the literal token [NEEDS-JC] so the relay daemon "
|
||||
"pauses and notifies JC. Otherwise reply normally and the relay forwards "
|
||||
"your reply to the originating CC session."
|
||||
)
|
||||
|
||||
DEFAULT_SUMMARIZATION_PROMPT = (
|
||||
"Summarize the conversation so far. Preserve project context, decisions "
|
||||
"made, open work items, and any outstanding [NEEDS-JC] questions. The "
|
||||
"summary will replace earlier turns in the conversation history; the most "
|
||||
"recent turns will be retained verbatim. Be specific where specificity "
|
||||
"matters (file paths, issue numbers, decisions); be brief on routine "
|
||||
"back-and-forth."
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class SessionConfig:
|
||||
"""One registered CC session."""
|
||||
|
||||
session_id: str
|
||||
working_dir: str | None = None
|
||||
description: str | None = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Settings:
|
||||
api_key: str
|
||||
model: str
|
||||
ntfy_topic: str
|
||||
status_port: int
|
||||
history_char_cap: int
|
||||
repo_root: Path
|
||||
queue_dir: Path
|
||||
dispatch_dir: Path
|
||||
state_dir: Path
|
||||
logs_dir: Path
|
||||
system_prompt: str
|
||||
summarization_prompt: str
|
||||
sessions: tuple[SessionConfig, ...] = field(default_factory=tuple)
|
||||
|
||||
|
||||
def _ensure_runtime_dirs(repo_root: Path) -> None:
|
||||
for sub in ("queue", "dispatch", "state", "logs"):
|
||||
(repo_root / sub).mkdir(exist_ok=True)
|
||||
|
||||
|
||||
def _generate_ntfy_topic() -> str:
|
||||
"""Cryptographically random 16-character topic. Functionally a password."""
|
||||
return secrets.token_urlsafe(12)
|
||||
|
||||
|
||||
def _load_or_init_ntfy_topic(env_values: dict[str, str | None]) -> str:
|
||||
raw = (env_values.get("NTFY_TOPIC") or "").strip()
|
||||
# Defensive: dotenv treats everything after = as the value, so an
|
||||
# inline `#` comment ends up as the topic. Treat any value that
|
||||
# starts with `#` (or contains whitespace, which a real topic never
|
||||
# does) as empty.
|
||||
if raw and not raw.startswith("#") and " " not in raw:
|
||||
return raw
|
||||
topic = _generate_ntfy_topic()
|
||||
set_key(str(ENV_FILE), "NTFY_TOPIC", topic, quote_mode="never")
|
||||
return topic
|
||||
|
||||
|
||||
def _load_yaml_config() -> dict:
|
||||
if not CONFIG_FILE.exists():
|
||||
default = {
|
||||
"system_prompt": DEFAULT_SYSTEM_PROMPT,
|
||||
"summarization_prompt": DEFAULT_SUMMARIZATION_PROMPT,
|
||||
"sessions": [
|
||||
{
|
||||
"session_id": "session-1",
|
||||
"working_dir": None,
|
||||
"description": "Default CC session",
|
||||
}
|
||||
],
|
||||
}
|
||||
CONFIG_FILE.write_text(yaml.safe_dump(default, sort_keys=False))
|
||||
return default
|
||||
with CONFIG_FILE.open() as f:
|
||||
return yaml.safe_load(f) or {}
|
||||
|
||||
|
||||
def load_settings() -> Settings:
|
||||
"""Read .env + config.yaml. Mutates .env on first run to record the ntfy topic."""
|
||||
|
||||
env_values = dotenv_values(ENV_FILE)
|
||||
api_key = (
|
||||
env_values.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_API_KEY") or ""
|
||||
).strip()
|
||||
if not api_key:
|
||||
raise RuntimeError(f"ANTHROPIC_API_KEY missing from {ENV_FILE}")
|
||||
|
||||
model = (env_values.get("ANTHROPIC_MODEL") or "claude-opus-4-7").strip()
|
||||
status_port = int((env_values.get("STATUS_PORT") or "8765").strip())
|
||||
history_char_cap = int((env_values.get("HISTORY_CHAR_CAP") or "400000").strip())
|
||||
ntfy_topic = _load_or_init_ntfy_topic(env_values)
|
||||
|
||||
yaml_cfg = _load_yaml_config()
|
||||
system_prompt = str(yaml_cfg.get("system_prompt") or DEFAULT_SYSTEM_PROMPT)
|
||||
summarization_prompt = str(yaml_cfg.get("summarization_prompt") or DEFAULT_SUMMARIZATION_PROMPT)
|
||||
sessions_cfg = yaml_cfg.get("sessions") or []
|
||||
sessions = tuple(
|
||||
SessionConfig(
|
||||
session_id=str(s["session_id"]),
|
||||
working_dir=s.get("working_dir"),
|
||||
description=s.get("description"),
|
||||
)
|
||||
for s in sessions_cfg
|
||||
)
|
||||
|
||||
_ensure_runtime_dirs(REPO_ROOT)
|
||||
|
||||
return Settings(
|
||||
api_key=api_key,
|
||||
model=model,
|
||||
ntfy_topic=ntfy_topic,
|
||||
status_port=status_port,
|
||||
history_char_cap=history_char_cap,
|
||||
repo_root=REPO_ROOT,
|
||||
queue_dir=REPO_ROOT / "queue",
|
||||
dispatch_dir=REPO_ROOT / "dispatch",
|
||||
state_dir=REPO_ROOT / "state",
|
||||
logs_dir=REPO_ROOT / "logs",
|
||||
system_prompt=system_prompt,
|
||||
summarization_prompt=summarization_prompt,
|
||||
sessions=sessions,
|
||||
)
|
||||
99
relay/conversation.py
Normal file
99
relay/conversation.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""Conversation history with summarization.
|
||||
|
||||
History shape: list of turns ``{role, content, ts, session_id?}`` where
|
||||
``role`` is ``"user"`` or ``"assistant"``. The user role represents
|
||||
either CC output or JC override; the assistant role represents the
|
||||
chat-Claude response.
|
||||
|
||||
Summarization fires when the total content character count exceeds
|
||||
``HISTORY_CHAR_CAP``. The summarization prompt is sent as a normal user
|
||||
turn, the API's response replaces all earlier turns, and the most
|
||||
recent ``RECENT_TURNS_KEPT`` turns are appended verbatim. The summary
|
||||
turn is marked with ``meta="summary"`` so the daemon can recognize it
|
||||
when paginating.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from relay.state import read_json, write_json_atomic
|
||||
|
||||
RECENT_TURNS_KEPT = 10
|
||||
|
||||
|
||||
@dataclass
|
||||
class Turn:
|
||||
role: str # 'user' | 'assistant'
|
||||
content: str
|
||||
ts: str = field(default_factory=lambda: datetime.utcnow().isoformat(timespec="seconds") + "Z")
|
||||
session_id: str | None = None
|
||||
meta: str | None = None # 'summary' for replaced summary turns
|
||||
|
||||
def to_api_message(self) -> dict[str, str]:
|
||||
"""Anthropic Messages API shape — only role + content needed."""
|
||||
return {"role": self.role, "content": self.content}
|
||||
|
||||
|
||||
class Conversation:
|
||||
"""In-memory + on-disk conversation history."""
|
||||
|
||||
def __init__(self, history_path: Path):
|
||||
self._path = history_path
|
||||
raw = read_json(self._path, default=[])
|
||||
if not isinstance(raw, list):
|
||||
raise ValueError(f"Expected list at {self._path}, got {type(raw).__name__}")
|
||||
self._turns: list[Turn] = [Turn(**dict(t)) for t in raw]
|
||||
|
||||
@property
|
||||
def turns(self) -> list[Turn]:
|
||||
return list(self._turns)
|
||||
|
||||
def append(
|
||||
self, role: str, content: str, *, session_id: str | None = None, meta: str | None = None
|
||||
) -> Turn:
|
||||
turn = Turn(role=role, content=content, session_id=session_id, meta=meta)
|
||||
self._turns.append(turn)
|
||||
self._persist()
|
||||
return turn
|
||||
|
||||
def replace_with_summary(self, summary_text: str) -> None:
|
||||
"""Replace all but the last RECENT_TURNS_KEPT turns with one summary turn."""
|
||||
recent = self._turns[-RECENT_TURNS_KEPT:] if len(self._turns) > RECENT_TURNS_KEPT else []
|
||||
summary_turn = Turn(role="assistant", content=summary_text, meta="summary")
|
||||
self._turns = [summary_turn, *recent]
|
||||
self._persist()
|
||||
|
||||
def total_chars(self) -> int:
|
||||
return sum(len(t.content) for t in self._turns)
|
||||
|
||||
def needs_summarization(self, cap: int) -> bool:
|
||||
return self.total_chars() > cap
|
||||
|
||||
def to_api_messages(self) -> list[dict[str, str]]:
|
||||
return [t.to_api_message() for t in self._turns]
|
||||
|
||||
def _persist(self) -> None:
|
||||
write_json_atomic(self._path, [asdict(t) for t in self._turns])
|
||||
|
||||
# Test/debug helper
|
||||
def reset(self) -> None:
|
||||
self._turns = []
|
||||
self._persist()
|
||||
|
||||
|
||||
def render_for_log(turn: Turn, max_chars: int = 200) -> dict[str, Any]:
|
||||
"""Compact representation for log lines — full content elided."""
|
||||
content = turn.content
|
||||
if len(content) > max_chars:
|
||||
content = content[:max_chars] + f"... <{len(turn.content) - max_chars} more chars>"
|
||||
return {
|
||||
"role": turn.role,
|
||||
"ts": turn.ts,
|
||||
"session_id": turn.session_id,
|
||||
"meta": turn.meta,
|
||||
"content": content,
|
||||
}
|
||||
330
relay/daemon.py
Normal file
330
relay/daemon.py
Normal file
@@ -0,0 +1,330 @@
|
||||
"""Main relay loop.
|
||||
|
||||
One process, one thread, polling-based. The loop:
|
||||
|
||||
1. Drain ``state/jc_input.txt`` if present (highest priority).
|
||||
2. Drain the ``queue/`` directory oldest-first.
|
||||
3. Heartbeat: check for stuck-queue alerts.
|
||||
4. Sleep briefly, repeat.
|
||||
|
||||
Each turn (queue entry or jc_input) goes through ``handle_turn`` which:
|
||||
|
||||
1. Appends the user-side content to history.
|
||||
2. Summarizes if history exceeds the cap.
|
||||
3. Sends to the Anthropic API.
|
||||
4. Appends the assistant response to history.
|
||||
5. Routes the response: if it begins (within the first 200 chars) with
|
||||
``[NEEDS-JC]``, set status to ``needs_jc`` and ntfy JC; otherwise
|
||||
dispatch to the originating session.
|
||||
|
||||
The status flag is in-memory only (single process); it controls whether
|
||||
new queue entries are processed while the daemon is paused waiting for
|
||||
JC input. ``state/status.json`` mirrors it on disk for the future
|
||||
status endpoint.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from relay.anthropic_client import AnthropicClient, TurnResult
|
||||
from relay.config import Settings
|
||||
from relay.conversation import Conversation
|
||||
from relay.dispatch import DispatchManager
|
||||
from relay.ntfy import notify, topic_url
|
||||
from relay.queue import QueueEntry, ack, stuck_age_seconds, take_oldest
|
||||
from relay.state import InstanceLock, write_json_atomic
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
NEEDS_JC_TOKEN = "[NEEDS-JC]"
|
||||
NEEDS_JC_SCAN_CHARS = 200
|
||||
JC_INPUT_FILE = "jc_input.txt"
|
||||
STATUS_FILE = "status.json"
|
||||
STUCK_QUEUE_THRESHOLD_SEC = 600 # 10 min per spec
|
||||
STUCK_QUEUE_REPEAT_SEC = 600 # don't re-notify more often than this
|
||||
LOOP_SLEEP_SEC = 1.0
|
||||
DISPATCH_PREFIX = re.compile(r"^@(?P<session>[A-Za-z0-9_-]+):\s*", re.MULTILINE)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DaemonStatus:
|
||||
started_at: str = field(
|
||||
default_factory=lambda: datetime.now(timezone.utc).isoformat(timespec="seconds")
|
||||
)
|
||||
state: str = "running" # running | needs_jc | error
|
||||
last_needs_jc_at: str | None = None
|
||||
last_needs_jc_text: str | None = None
|
||||
last_dispatch_at: str | None = None
|
||||
last_dispatch_session: str | None = None
|
||||
queue_depth: int = 0
|
||||
history_chars: int = 0
|
||||
history_turns: int = 0
|
||||
total_input_tokens: int = 0
|
||||
total_output_tokens: int = 0
|
||||
total_cost_usd: float = 0.0
|
||||
last_stuck_alert_ts: float = 0.0
|
||||
|
||||
def as_dict(self) -> dict:
|
||||
d = self.__dict__.copy()
|
||||
d.pop("last_stuck_alert_ts", None)
|
||||
return d
|
||||
|
||||
|
||||
class Daemon:
|
||||
def __init__(self, settings: Settings):
|
||||
self.settings = settings
|
||||
self.lock = InstanceLock(settings.state_dir / ".lock")
|
||||
self.conversation = Conversation(settings.state_dir / "conversation.json")
|
||||
self.dispatch = DispatchManager(settings.dispatch_dir)
|
||||
self.client = AnthropicClient(api_key=settings.api_key, model=settings.model)
|
||||
self.status = DaemonStatus()
|
||||
self._stop = False
|
||||
|
||||
# ---- public API used by __main__ ----
|
||||
|
||||
def run(self) -> None:
|
||||
self.lock.acquire()
|
||||
try:
|
||||
self._announce_startup()
|
||||
while not self._stop:
|
||||
try:
|
||||
self._tick()
|
||||
except Exception:
|
||||
logger.exception("uncaught error in daemon loop; continuing")
|
||||
self._notify_error(
|
||||
"Daemon loop error",
|
||||
"An uncaught exception was logged. Check logs/relay.log.",
|
||||
)
|
||||
self._persist_status()
|
||||
time.sleep(LOOP_SLEEP_SEC)
|
||||
finally:
|
||||
self.lock.release()
|
||||
|
||||
def stop(self) -> None:
|
||||
self._stop = True
|
||||
|
||||
# ---- internals ----
|
||||
|
||||
def _announce_startup(self) -> None:
|
||||
url = topic_url(self.settings.ntfy_topic)
|
||||
logger.info("=" * 72)
|
||||
logger.info("relay daemon starting")
|
||||
logger.info("model: %s", self.settings.model)
|
||||
logger.info("status state: %s", self.settings.state_dir)
|
||||
logger.info("ntfy topic: %s", url)
|
||||
logger.info("Subscribe on phone/laptop to receive needs_jc + error alerts.")
|
||||
logger.info("registered sessions: %s", [s.session_id for s in self.settings.sessions])
|
||||
logger.info("history cap: %d chars", self.settings.history_char_cap)
|
||||
logger.info("=" * 72)
|
||||
notify(
|
||||
self.settings.ntfy_topic,
|
||||
title="relay daemon online",
|
||||
message=f"model={self.settings.model}, sessions={len(self.settings.sessions)}",
|
||||
tags=["robot"],
|
||||
)
|
||||
|
||||
def _tick(self) -> None:
|
||||
# 1) Try to flush any queued dispatches that were waiting on CC consumption.
|
||||
self.dispatch.flush_all()
|
||||
|
||||
# 2) JC override always takes priority.
|
||||
if self._handle_jc_input():
|
||||
return
|
||||
|
||||
# 3) If paused for needs_jc, do nothing further on the queue.
|
||||
if self.status.state == "needs_jc":
|
||||
return
|
||||
|
||||
# 4) Drain queue (one entry per tick — keeps logs and dispatch ordering predictable).
|
||||
entry = take_oldest(self.settings.queue_dir)
|
||||
if entry is not None:
|
||||
self._handle_queue_entry(entry)
|
||||
|
||||
# 5) Heartbeat: stuck-queue check.
|
||||
self._check_stuck_queue()
|
||||
|
||||
def _handle_jc_input(self) -> bool:
|
||||
path = self.settings.state_dir / JC_INPUT_FILE
|
||||
if not path.exists():
|
||||
return False
|
||||
|
||||
try:
|
||||
content = path.read_text(encoding="utf-8")
|
||||
except OSError as exc:
|
||||
logger.error("could not read %s: %s", path, exc)
|
||||
return False
|
||||
|
||||
path.unlink() # consume immediately so a slow API call doesn't double-process
|
||||
if not content.strip():
|
||||
logger.info("jc_input.txt was empty; ignoring")
|
||||
return False
|
||||
|
||||
# Prefix routing: "@session-id: ..." dispatches directly without an API call.
|
||||
match = DISPATCH_PREFIX.match(content)
|
||||
if match:
|
||||
session_id = match.group("session")
|
||||
payload = content[match.end() :]
|
||||
logger.info("JC override: direct dispatch to %s (%d chars)", session_id, len(payload))
|
||||
self.dispatch.queue_or_write(session_id, payload)
|
||||
self.status.last_dispatch_at = datetime.now(timezone.utc).isoformat(timespec="seconds")
|
||||
self.status.last_dispatch_session = session_id
|
||||
# JC override clears any needs_jc pause.
|
||||
self._clear_needs_jc()
|
||||
return True
|
||||
|
||||
# No prefix → treat as next chat-side turn (JC speaking from chat).
|
||||
logger.info("JC override: chat-side turn (%d chars)", len(content))
|
||||
self._clear_needs_jc()
|
||||
self._send_chat_turn(user_role_content=content, originating_session=None, source="jc")
|
||||
return True
|
||||
|
||||
def _handle_queue_entry(self, entry: QueueEntry) -> None:
|
||||
logger.info("queue entry from %s, %d chars", entry.session_id, len(entry.content))
|
||||
try:
|
||||
self._send_chat_turn(
|
||||
user_role_content=entry.content,
|
||||
originating_session=entry.session_id,
|
||||
source="queue",
|
||||
)
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"error processing queue entry %s; leaving in queue for retry", entry.path
|
||||
)
|
||||
return
|
||||
ack(entry)
|
||||
|
||||
def _send_chat_turn(
|
||||
self, *, user_role_content: str, originating_session: str | None, source: str
|
||||
) -> None:
|
||||
# Append the user-side turn to history before the API call so a crash
|
||||
# mid-call doesn't lose the prompt.
|
||||
self.conversation.append("user", user_role_content, session_id=originating_session)
|
||||
|
||||
# Summarize if we've outgrown the cap.
|
||||
if self.conversation.needs_summarization(self.settings.history_char_cap):
|
||||
self._summarize()
|
||||
|
||||
# API call.
|
||||
result = self.client.send(
|
||||
system_prompt=self.settings.system_prompt,
|
||||
messages=self.conversation.to_api_messages(),
|
||||
)
|
||||
self._record_usage(result)
|
||||
logger.info(
|
||||
"[%s] api turn ok: in=%d out=%d cache_w=%d cache_r=%d cost=$%.4f",
|
||||
source,
|
||||
result.input_tokens,
|
||||
result.output_tokens,
|
||||
result.cache_creation_input_tokens,
|
||||
result.cache_read_input_tokens,
|
||||
result.estimated_cost_usd,
|
||||
)
|
||||
|
||||
# Append assistant response.
|
||||
self.conversation.append("assistant", result.text)
|
||||
|
||||
# Route: NEEDS-JC pause vs dispatch.
|
||||
if self._contains_needs_jc(result.text):
|
||||
self._enter_needs_jc(result.text)
|
||||
return
|
||||
|
||||
target_session = originating_session or self._fallback_session_id()
|
||||
if not target_session:
|
||||
logger.warning(
|
||||
"no originating session and no fallback session in config; chat reply dropped"
|
||||
)
|
||||
return
|
||||
self.dispatch.queue_or_write(target_session, result.text)
|
||||
self.status.last_dispatch_at = datetime.now(timezone.utc).isoformat(timespec="seconds")
|
||||
self.status.last_dispatch_session = target_session
|
||||
|
||||
def _summarize(self) -> None:
|
||||
before = self.conversation.total_chars()
|
||||
# Send the summarization as a fresh user turn appended to current history.
|
||||
# The API responds with the summary; we then collapse history into
|
||||
# [summary, last 10 turns].
|
||||
self.conversation.append(
|
||||
"user", self.settings.summarization_prompt, meta="summarize_request"
|
||||
)
|
||||
result = self.client.send(
|
||||
system_prompt=self.settings.system_prompt,
|
||||
messages=self.conversation.to_api_messages(),
|
||||
)
|
||||
self._record_usage(result)
|
||||
# Replace history with summary + most-recent. This drops the
|
||||
# summarize_request turn we just appended (it's only there to
|
||||
# produce the summary; not useful in the rolling history).
|
||||
self.conversation.replace_with_summary(result.text)
|
||||
after = self.conversation.total_chars()
|
||||
logger.info(
|
||||
"summarization: %d chars -> %d chars (cost $%.4f)",
|
||||
before,
|
||||
after,
|
||||
result.estimated_cost_usd,
|
||||
)
|
||||
|
||||
def _contains_needs_jc(self, text: str) -> bool:
|
||||
return NEEDS_JC_TOKEN in text[:NEEDS_JC_SCAN_CHARS]
|
||||
|
||||
def _enter_needs_jc(self, response_text: str) -> None:
|
||||
self.status.state = "needs_jc"
|
||||
self.status.last_needs_jc_at = datetime.now(timezone.utc).isoformat(timespec="seconds")
|
||||
self.status.last_needs_jc_text = response_text[:1000]
|
||||
logger.warning("[NEEDS-JC] flagged; daemon paused awaiting state/jc_input.txt")
|
||||
notify(
|
||||
self.settings.ntfy_topic,
|
||||
title="[NEEDS-JC] relay paused",
|
||||
message=response_text[:400],
|
||||
priority="high",
|
||||
tags=["warning"],
|
||||
)
|
||||
|
||||
def _clear_needs_jc(self) -> None:
|
||||
if self.status.state != "running":
|
||||
logger.info("clearing needs_jc state (was %s)", self.status.state)
|
||||
self.status.state = "running"
|
||||
|
||||
def _fallback_session_id(self) -> str | None:
|
||||
if self.settings.sessions:
|
||||
return self.settings.sessions[0].session_id
|
||||
return None
|
||||
|
||||
def _record_usage(self, result: TurnResult) -> None:
|
||||
self.status.total_input_tokens += result.input_tokens
|
||||
self.status.total_output_tokens += result.output_tokens
|
||||
self.status.total_cost_usd += result.estimated_cost_usd
|
||||
|
||||
def _persist_status(self) -> None:
|
||||
self.status.queue_depth = (
|
||||
len(list((self.settings.queue_dir).iterdir()))
|
||||
if self.settings.queue_dir.exists()
|
||||
else 0
|
||||
)
|
||||
self.status.history_chars = self.conversation.total_chars()
|
||||
self.status.history_turns = len(self.conversation.turns)
|
||||
write_json_atomic(self.settings.state_dir / STATUS_FILE, self.status.as_dict())
|
||||
|
||||
def _check_stuck_queue(self) -> None:
|
||||
age = stuck_age_seconds(self.settings.queue_dir)
|
||||
if age <= STUCK_QUEUE_THRESHOLD_SEC:
|
||||
return
|
||||
now = time.time()
|
||||
if now - self.status.last_stuck_alert_ts < STUCK_QUEUE_REPEAT_SEC:
|
||||
return
|
||||
self.status.last_stuck_alert_ts = now
|
||||
logger.warning("queue stuck: oldest entry is %.0fs old", age)
|
||||
notify(
|
||||
self.settings.ntfy_topic,
|
||||
title="relay queue stuck",
|
||||
message=f"oldest entry is {int(age)}s old; daemon may be paused or the API failing.",
|
||||
priority="high",
|
||||
tags=["warning"],
|
||||
)
|
||||
|
||||
def _notify_error(self, title: str, message: str) -> None:
|
||||
notify(self.settings.ntfy_topic, title=title, message=message, priority="high", tags=["x"])
|
||||
71
relay/dispatch.py
Normal file
71
relay/dispatch.py
Normal file
@@ -0,0 +1,71 @@
|
||||
"""Dispatch writer.
|
||||
|
||||
Each registered CC session has a directory ``dispatch/<session_id>/``.
|
||||
The daemon delivers a chat-side response by writing it to
|
||||
``dispatch/<session_id>/input.txt``. CC's polling loop reads, acts on
|
||||
the content, and **deletes** the file as the acknowledgement.
|
||||
|
||||
The "only write when prior is consumed" rule means the daemon must not
|
||||
overwrite a pending dispatch — otherwise CC could miss a turn. If the
|
||||
daemon has new content for a session that hasn't yet consumed the
|
||||
previous one, it queues internally and waits.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from collections import defaultdict, deque
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
from relay.state import write_atomic
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DispatchManager:
|
||||
dispatch_dir: Path
|
||||
_pending: dict[str, deque[str]] = field(default_factory=lambda: defaultdict(deque))
|
||||
|
||||
def session_dir(self, session_id: str) -> Path:
|
||||
return self.dispatch_dir / session_id
|
||||
|
||||
def input_path(self, session_id: str) -> Path:
|
||||
return self.session_dir(session_id) / "input.txt"
|
||||
|
||||
def has_pending(self, session_id: str) -> bool:
|
||||
return bool(self._pending[session_id])
|
||||
|
||||
def session_input_present(self, session_id: str) -> bool:
|
||||
"""True iff the session's input.txt exists (CC hasn't consumed yet)."""
|
||||
return self.input_path(session_id).exists()
|
||||
|
||||
def queue_or_write(self, session_id: str, content: str) -> bool:
|
||||
"""Try to deliver content. If session's input.txt is still present,
|
||||
queue internally and return False; otherwise write and return True.
|
||||
"""
|
||||
self._pending[session_id].append(content)
|
||||
return self._flush_session(session_id)
|
||||
|
||||
def flush_all(self) -> int:
|
||||
"""Try to flush queued dispatches for every session. Returns count delivered."""
|
||||
delivered = 0
|
||||
for session_id in list(self._pending.keys()):
|
||||
while self._pending[session_id]:
|
||||
if not self._flush_session(session_id):
|
||||
break
|
||||
delivered += 1
|
||||
return delivered
|
||||
|
||||
def _flush_session(self, session_id: str) -> bool:
|
||||
if not self._pending[session_id]:
|
||||
return False
|
||||
if self.session_input_present(session_id):
|
||||
return False
|
||||
next_content = self._pending[session_id].popleft()
|
||||
path = self.input_path(session_id)
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
write_atomic(path, next_content)
|
||||
logger.info("dispatched %d chars to %s -> %s", len(next_content), session_id, path)
|
||||
return True
|
||||
46
relay/logs.py
Normal file
46
relay/logs.py
Normal file
@@ -0,0 +1,46 @@
|
||||
"""Logging setup.
|
||||
|
||||
Root logger writes to stdout (so tmux/systemd captures it) and to
|
||||
``logs/relay-YYYY-MM-DD.log`` with daily rotation. Format includes
|
||||
timestamp, level, logger name, and message.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from logging.handlers import TimedRotatingFileHandler
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def configure(logs_dir: Path, level: int = logging.INFO) -> None:
|
||||
logs_dir.mkdir(parents=True, exist_ok=True)
|
||||
fmt = logging.Formatter(
|
||||
fmt="%(asctime)s %(levelname)-7s %(name)-22s %(message)s",
|
||||
datefmt="%Y-%m-%dT%H:%M:%S",
|
||||
)
|
||||
|
||||
root = logging.getLogger()
|
||||
root.setLevel(level)
|
||||
|
||||
# Remove pre-existing handlers (idempotent across reload during tests)
|
||||
for handler in list(root.handlers):
|
||||
root.removeHandler(handler)
|
||||
|
||||
stdout = logging.StreamHandler(stream=sys.stdout)
|
||||
stdout.setFormatter(fmt)
|
||||
root.addHandler(stdout)
|
||||
|
||||
file_handler = TimedRotatingFileHandler(
|
||||
filename=str(logs_dir / "relay.log"),
|
||||
when="midnight",
|
||||
backupCount=14,
|
||||
encoding="utf-8",
|
||||
utc=True,
|
||||
)
|
||||
file_handler.setFormatter(fmt)
|
||||
root.addHandler(file_handler)
|
||||
|
||||
# Quiet libraries that are too chatty at INFO
|
||||
for noisy in ("urllib3", "httpx", "httpcore", "anthropic"):
|
||||
logging.getLogger(noisy).setLevel(logging.WARNING)
|
||||
59
relay/ntfy.py
Normal file
59
relay/ntfy.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""ntfy.sh notifications.
|
||||
|
||||
Topic is loaded from settings (auto-generated on first run). The topic
|
||||
is functionally a password — anyone subscribed to the topic URL
|
||||
receives notifications. We use cryptographically-random topics
|
||||
(secrets.token_urlsafe(12)) to make brute-force discovery impractical.
|
||||
|
||||
Notifications are best-effort: a failure to deliver (network down,
|
||||
ntfy.sh outage) is logged but does NOT block the daemon's main loop.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
import requests
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
NTFY_BASE = "https://ntfy.sh"
|
||||
|
||||
|
||||
def topic_url(topic: str) -> str:
|
||||
return f"{NTFY_BASE}/{topic}"
|
||||
|
||||
|
||||
def notify(
|
||||
topic: str,
|
||||
title: str,
|
||||
message: str,
|
||||
*,
|
||||
priority: str = "default",
|
||||
tags: list[str] | None = None,
|
||||
) -> bool:
|
||||
"""Post one notification. Returns True on HTTP 200, False otherwise.
|
||||
|
||||
Never raises on transport errors — this path runs from the daemon's
|
||||
main loop and a failed notification should not stop work.
|
||||
"""
|
||||
if not topic:
|
||||
logger.warning("ntfy topic is empty; skipping notification: %s", title)
|
||||
return False
|
||||
headers = {
|
||||
"Title": title,
|
||||
"Priority": priority,
|
||||
}
|
||||
if tags:
|
||||
headers["Tags"] = ",".join(tags)
|
||||
try:
|
||||
resp = requests.post(
|
||||
topic_url(topic), data=message.encode("utf-8"), headers=headers, timeout=10
|
||||
)
|
||||
except requests.RequestException as exc:
|
||||
logger.warning("ntfy delivery failed for topic <redacted>: %s", exc)
|
||||
return False
|
||||
if resp.status_code != 200:
|
||||
logger.warning("ntfy non-200 (%s): %s", resp.status_code, resp.text[:200])
|
||||
return False
|
||||
return True
|
||||
129
relay/queue.py
Normal file
129
relay/queue.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""Queue intake.
|
||||
|
||||
CC sessions drop JSON envelopes into ``queue/``. The envelope shape:
|
||||
|
||||
{
|
||||
"session_id": "session-1",
|
||||
"timestamp": "2026-05-02T12:34:56Z",
|
||||
"content": "..."
|
||||
}
|
||||
|
||||
The daemon picks the oldest entry by mtime, validates the envelope,
|
||||
returns it, and deletes the file once processing succeeds. A
|
||||
malformed envelope is moved to ``queue/.rejected/`` so the daemon
|
||||
doesn't retry it forever.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import shutil
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class QueueError(RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class QueueEntry:
|
||||
path: Path
|
||||
session_id: str
|
||||
timestamp: str
|
||||
content: str
|
||||
mtime: float
|
||||
|
||||
|
||||
def _validate_envelope(payload: object, *, path: Path) -> tuple[str, str, str]:
|
||||
if not isinstance(payload, dict):
|
||||
raise QueueError(f"{path}: envelope must be a JSON object")
|
||||
for required in ("session_id", "timestamp", "content"):
|
||||
if required not in payload:
|
||||
raise QueueError(f"{path}: missing required key '{required}'")
|
||||
session_id = str(payload["session_id"]).strip()
|
||||
timestamp = str(payload["timestamp"]).strip()
|
||||
content = str(payload["content"])
|
||||
if not session_id:
|
||||
raise QueueError(f"{path}: session_id is empty")
|
||||
if not timestamp:
|
||||
raise QueueError(f"{path}: timestamp is empty")
|
||||
if not content.strip():
|
||||
raise QueueError(f"{path}: content is empty")
|
||||
return session_id, timestamp, content
|
||||
|
||||
|
||||
def _reject(path: Path, reason: str) -> None:
|
||||
rejected_dir = path.parent / ".rejected"
|
||||
rejected_dir.mkdir(exist_ok=True)
|
||||
target = rejected_dir / path.name
|
||||
shutil.move(str(path), str(target))
|
||||
logger.warning("rejected queue entry %s -> %s: %s", path, target, reason)
|
||||
|
||||
|
||||
def list_pending(queue_dir: Path) -> list[Path]:
|
||||
"""Return queue entries oldest-first, excluding the .rejected dir."""
|
||||
if not queue_dir.exists():
|
||||
return []
|
||||
entries = [
|
||||
p
|
||||
for p in queue_dir.iterdir()
|
||||
if p.is_file() and p.suffix == ".json" and not p.name.startswith(".")
|
||||
]
|
||||
entries.sort(key=lambda p: p.stat().st_mtime)
|
||||
return entries
|
||||
|
||||
|
||||
def take_oldest(queue_dir: Path) -> QueueEntry | None:
|
||||
"""Read and validate the oldest queue entry. Returns None if queue empty.
|
||||
|
||||
Rejects malformed envelopes by moving them to .rejected/. The caller
|
||||
is responsible for calling ``ack(entry)`` once processing succeeds
|
||||
to delete the file; until then it remains in the queue.
|
||||
"""
|
||||
pending = list_pending(queue_dir)
|
||||
if not pending:
|
||||
return None
|
||||
path = pending[0]
|
||||
try:
|
||||
raw = path.read_text(encoding="utf-8")
|
||||
payload = json.loads(raw)
|
||||
session_id, timestamp, content = _validate_envelope(payload, path=path)
|
||||
except (OSError, json.JSONDecodeError) as exc:
|
||||
_reject(path, f"unreadable / not JSON: {exc}")
|
||||
return None
|
||||
except QueueError as exc:
|
||||
_reject(path, str(exc))
|
||||
return None
|
||||
|
||||
return QueueEntry(
|
||||
path=path,
|
||||
session_id=session_id,
|
||||
timestamp=timestamp,
|
||||
content=content,
|
||||
mtime=path.stat().st_mtime,
|
||||
)
|
||||
|
||||
|
||||
def ack(entry: QueueEntry) -> None:
|
||||
"""Delete the queue file. Called by the daemon after the turn is dispatched."""
|
||||
try:
|
||||
entry.path.unlink()
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
|
||||
|
||||
def stuck_age_seconds(queue_dir: Path) -> float:
|
||||
"""How long the oldest pending entry has been waiting. 0 if queue empty.
|
||||
|
||||
Used by the daemon's heartbeat to fire a ntfy alert when the queue
|
||||
is stuck (e.g., persistent API failure).
|
||||
"""
|
||||
pending = list_pending(queue_dir)
|
||||
if not pending:
|
||||
return 0.0
|
||||
return time.time() - pending[0].stat().st_mtime
|
||||
100
relay/state.py
Normal file
100
relay/state.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""Atomic state-file I/O and instance lock.
|
||||
|
||||
The conversation history lives at ``state/conversation.json``. Mutated
|
||||
in-memory by the daemon and written via temp+rename to avoid partial
|
||||
writes if the process is killed mid-write. A ``state/.lock`` advisory
|
||||
file stops two daemons from running against the same directory.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import errno
|
||||
import fcntl
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
|
||||
class StateError(RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class InstanceLock:
|
||||
"""Holds a flock on state/.lock for the daemon's lifetime.
|
||||
|
||||
Released automatically on process exit (kernel closes the fd) or by
|
||||
calling ``release()``. ``acquire()`` raises StateError if another
|
||||
daemon already holds the lock.
|
||||
"""
|
||||
|
||||
lock_path: Path
|
||||
_fd: int | None = None
|
||||
|
||||
def acquire(self) -> None:
|
||||
self._fd = os.open(self.lock_path, os.O_RDWR | os.O_CREAT, 0o600)
|
||||
try:
|
||||
fcntl.flock(self._fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
|
||||
except OSError as exc:
|
||||
os.close(self._fd)
|
||||
self._fd = None
|
||||
if exc.errno in {errno.EAGAIN, errno.EACCES}:
|
||||
raise StateError(
|
||||
f"Another daemon is holding {self.lock_path}; refusing to start"
|
||||
) from exc
|
||||
raise
|
||||
os.write(self._fd, str(os.getpid()).encode())
|
||||
|
||||
def release(self) -> None:
|
||||
if self._fd is None:
|
||||
return
|
||||
try:
|
||||
fcntl.flock(self._fd, fcntl.LOCK_UN)
|
||||
finally:
|
||||
os.close(self._fd)
|
||||
self._fd = None
|
||||
|
||||
|
||||
def write_atomic(path: Path, data: str) -> None:
|
||||
"""Atomically write text to ``path`` via temp file + rename.
|
||||
|
||||
Crash-safe: a partial write leaves the temp file but does not
|
||||
overwrite the target. fsync the data file (not the directory) so the
|
||||
rename atomicity gives us durability up to the OS-level rename
|
||||
barrier.
|
||||
"""
|
||||
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with NamedTemporaryFile(
|
||||
mode="w",
|
||||
encoding="utf-8",
|
||||
dir=str(path.parent),
|
||||
prefix=f".{path.name}.",
|
||||
suffix=".tmp",
|
||||
delete=False,
|
||||
) as tmp:
|
||||
tmp.write(data)
|
||||
tmp.flush()
|
||||
os.fsync(tmp.fileno())
|
||||
tmp_path = Path(tmp.name)
|
||||
os.replace(tmp_path, path)
|
||||
|
||||
|
||||
def write_json_atomic(path: Path, value: object) -> None:
|
||||
write_atomic(path, json.dumps(value, indent=2, ensure_ascii=False, sort_keys=False))
|
||||
|
||||
|
||||
def read_json(path: Path, default: object) -> object:
|
||||
"""Read JSON or return default when the file is missing or empty."""
|
||||
|
||||
if not path.exists():
|
||||
return default
|
||||
raw = path.read_text(encoding="utf-8").strip()
|
||||
if not raw:
|
||||
return default
|
||||
try:
|
||||
return json.loads(raw)
|
||||
except json.JSONDecodeError as exc:
|
||||
raise StateError(f"Corrupt JSON at {path}: {exc}") from exc
|
||||
Reference in New Issue
Block a user