First-PR scope from #1. Single-process Python daemon that relays between
Claude Code instances and chat-Claude (Anthropic API).

Components:

* relay.config — .env + config.yaml loader. Auto-generates the ntfy topic
  on first run and persists it back to .env.
* relay.state — atomic file I/O via tempfile + rename, advisory flock at
  state/.lock to enforce single-instance.
* relay.conversation — append-only history with summarization. Triggers a
  summarize call when total chars exceed HISTORY_CHAR_CAP (default 400k);
  replaces history with the summary plus the most recent 10 turns.
* relay.anthropic_client — SDK wrapper. Marks the system prompt cacheable
  (5-min ephemeral cache); concatenates text blocks; estimates per-call
  cost from the Anthropic price table with cache-write/read accounted for.
* relay.queue — JSON envelope intake; oldest-by-mtime; malformed envelopes
  are moved to queue/.rejected/.
* relay.dispatch — one-input-at-a-time per session
  (dispatch/<session_id>/input.txt). Won't overwrite a pending dispatch;
  queues internally and waits for CC to delete.
* relay.ntfy — best-effort POST to https://ntfy.sh/<topic>; failures are
  logged but never block the main loop.
* relay.daemon — main loop. Polls jc_input.txt (priority) then queue/.
  Detects [NEEDS-JC] in the first 200 chars of any response and pauses
  dispatch until JC writes jc_input.txt. JC override supports the
  @session-N: prefix for direct dispatch without an API call.
* relay.__main__ — CLI: relay run / relay status / relay topic.

Tests: 57 unit tests pass (config, state, conversation, queue, dispatch,
anthropic_client, ntfy, full daemon loop with a fake client). One real-API
smoke test marked real_api, opt-in via pytest -m real_api; skips cleanly
on credit-balance errors.

Out of scope for this PR (deferred to follow-ups): Flask status endpoint,
multi-session config in production, exponential backoff, systemd unit,
cost-tracking aggregation.

Closes #1.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
120 lines
4.0 KiB
Python
120 lines
4.0 KiB
Python
"""Anthropic API wrapper.

Wraps the SDK call so the daemon can send a turn and get back the
assistant text + token usage + estimated cost. Implements prompt
caching on the system prompt: subsequent calls within the 5-minute TTL
get a cache hit on the (large, repeated) system prompt and pay the
cheaper cache-hit rate. Per-turn user/assistant content is never marked
cacheable because it changes every call.

Cost estimation uses a model→price table; that table is the only place
the estimate's rates are defined, so it is easy to update when pricing
changes. The result is an estimate for logging, not a billing record.
"""
|
|
|
|
from __future__ import annotations

from dataclasses import dataclass, field

from anthropic import Anthropic
from anthropic.types import Message
|
|
|
|
# USD per one million tokens, keyed by model id and then by billing
# category ("input", "output", "cache_write", "cache_read"). Figures were
# taken from Anthropic's published schedule.
#
# In every row below, "cache_read" is 10% of "input" and "cache_write" is
# 25% above "input".
#
# These numbers are approximate and feed only the per-call cost estimate
# written to the log; the source of truth for billing is Anthropic's
# invoice.
# NOTE(review): the rates were not re-verified during review — confirm
# each row against the current published schedule when touching this table.
_PRICES_PER_MILLION: dict[str, dict[str, float]] = {
    "claude-opus-4-7": {
        "input": 15.0,
        "output": 75.0,
        "cache_write": 18.75,
        "cache_read": 1.50,
    },
    "claude-opus-4-7-1m": {
        "input": 15.0,
        "output": 75.0,
        "cache_write": 18.75,
        "cache_read": 1.50,
    },
    "claude-sonnet-4-6": {
        "input": 3.0,
        "output": 15.0,
        "cache_write": 3.75,
        "cache_read": 0.30,
    },
    "claude-haiku-4-5-20251001": {
        "input": 0.80,
        "output": 4.0,
        "cache_write": 1.00,
        "cache_read": 0.08,
    },
}
|
|
|
|
|
|
def _price_for(model: str) -> dict[str, float]:
    """Return the per-million-token price row for ``model``.

    Args:
        model: Model identifier; matched exactly against the keys of
            ``_PRICES_PER_MILLION``.

    Returns:
        The table's own row for ``model`` (not a copy). A model that is not
        in the table gets the ``"claude-opus-4-7"`` row, chosen as the
        worst case so estimates for unknown models do not under-report.

    NOTE(review): the previous comment said the fallback is "logged once at
    startup". Nothing is logged in this function, so that warning
    presumably lives in the caller — confirm.
    """
    row = _PRICES_PER_MILLION.get(model)
    if row is None:
        # Unknown model: price it as Opus 4.7 rather than guessing low.
        row = _PRICES_PER_MILLION["claude-opus-4-7"]
    return row
|
|
|
|
|
|
@dataclass
class TurnResult:
    """Everything ``AnthropicClient.send`` reports about one API turn."""

    # Concatenation of the response's text blocks.
    text: str
    # Token counts copied from the response's usage object; a count that is
    # absent or None there is stored as 0.
    input_tokens: int
    output_tokens: int
    cache_creation_input_tokens: int
    cache_read_input_tokens: int
    # Estimate computed from the four counts above and the price table;
    # intended for logging, not billing.
    estimated_cost_usd: float
    # The unmodified SDK response, kept for callers that need more detail.
    raw: Message
|
|
|
|
|
|
@dataclass
class AnthropicClient:
    """Thin wrapper around the Anthropic SDK's Messages API.

    Construct it with an API key and a model name; ``send`` then performs
    one request and returns the reply text together with token usage and
    an estimated USD cost.

    Attributes:
        api_key: Secret used to build the SDK client. It is excluded from
            the dataclass-generated ``repr`` so that logging the client, or
            a traceback that renders it, cannot leak the key.
        model: Model identifier sent with every request and used to look
            up prices for the cost estimate.
        max_output_tokens: Value sent as ``max_tokens`` on every request.
    """

    api_key: str = field(repr=False)
    model: str
    max_output_tokens: int = 4096

    def __post_init__(self) -> None:
        """Build the SDK client once so every ``send`` call reuses it."""
        self._sdk = Anthropic(api_key=self.api_key)

    def send(
        self,
        *,
        system_prompt: str,
        messages: list[dict[str, str]],
    ) -> TurnResult:
        """Send a single API turn. The system prompt is marked cacheable.

        ``messages`` is the full conversation history shaped for the
        Messages API. The most recent message is the user turn we're
        responding to. The daemon is responsible for appending the
        assistant text it gets back into history before the next call.

        Args:
            system_prompt: Text sent as the only system block, tagged with
                an ephemeral ``cache_control`` marker.
            messages: Conversation history, passed to the SDK unchanged.

        Returns:
            A ``TurnResult`` holding the concatenated text blocks, the four
            token counts from the response's usage, the estimated cost in
            USD, and the raw SDK response.

        Raises:
            Whatever the SDK raises for a failed request; nothing is
            caught here.
        """
        response = self._sdk.messages.create(
            model=self.model,
            max_tokens=self.max_output_tokens,
            system=[
                {
                    "type": "text",
                    "text": system_prompt,
                    "cache_control": {"type": "ephemeral"},
                }
            ],
            messages=messages,
        )

        # Only text blocks contribute to the reply; any other block type
        # is skipped.
        text = "".join(
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        )

        usage = response.usage

        def count(name: str) -> int:
            # A usage field that is missing or None counts as zero.
            return int(getattr(usage, name, 0) or 0)

        in_tokens = count("input_tokens")
        out_tokens = count("output_tokens")
        cache_create = count("cache_creation_input_tokens")
        cache_read = count("cache_read_input_tokens")

        # Prices are quoted per million tokens, hence the final division.
        prices = _price_for(self.model)
        cost = (
            in_tokens * prices["input"]
            + out_tokens * prices["output"]
            + cache_create * prices["cache_write"]
            + cache_read * prices["cache_read"]
        ) / 1_000_000.0

        return TurnResult(
            text=text,
            input_tokens=in_tokens,
            output_tokens=out_tokens,
            cache_creation_input_tokens=cache_create,
            cache_read_input_tokens=cache_read,
            estimated_cost_usd=cost,
            raw=response,
        )
|