This repository was archived on 2026-05-02. You can view its files and clone it, but you cannot open issues or pull requests, or push commits.
Files
risv3-relay/relay/anthropic_client.py
ac 540b4f5b01 feat: relay daemon skeleton — queue, dispatch, conversation, ntfy (#1)
First-PR scope from #1. Single-process Python daemon that relays
between Claude Code instances and chat-Claude (Anthropic API).

Components:

* relay.config — .env + config.yaml loader. Auto-generates ntfy
  topic on first run and persists it back to .env.
* relay.state — atomic file I/O via tempfile + rename, advisory
  flock at state/.lock to enforce single-instance.
* relay.conversation — append-only history with summarization.
  Triggers a summarize call when total chars exceed
  HISTORY_CHAR_CAP (default 400k); replaces history with the
  summary plus the most recent 10 turns.
* relay.anthropic_client — SDK wrapper. Marks the system prompt
  cacheable (5-min ephemeral cache); concatenates text blocks;
  estimates per-call cost from the Anthropic price table with
  cache-write/read accounted for.
* relay.queue — JSON envelope intake; oldest-by-mtime;
  malformed envelopes moved to queue/.rejected/.
* relay.dispatch — one-input-at-a-time per session
  (dispatch/<session_id>/input.txt). Won't overwrite a pending
  dispatch; queues internally and waits for CC to delete.
* relay.ntfy — best-effort POST to https://ntfy.sh/<topic>;
  failures logged but never block the main loop.
* relay.daemon — main loop. Polls jc_input.txt (priority) then
  queue/. Detects [NEEDS-JC] in the first 200 chars of any
  response and pauses dispatch until JC writes jc_input.txt.
  JC override supports @session-N: prefix for direct dispatch
  without an API call.
* relay.__main__ — CLI: relay run / relay status / relay topic.

Tests: 57 unit tests pass (config, state, conversation, queue,
dispatch, anthropic_client, ntfy, full daemon loop with a fake
client). One real-API smoke test marked real_api, opt-in via
pytest -m real_api; skips cleanly on credit-balance errors.

Out of scope for this PR (deferred to follow-ups): Flask status
endpoint, multi-session config in production, exponential
backoff, systemd unit, cost-tracking aggregation.

Closes #1.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 15:24:47 +00:00

120 lines
4.0 KiB
Python

"""Anthropic API wrapper.
Wraps the SDK call so the daemon can send a turn and get back the
assistant text + token usage + estimated cost. Implements prompt
caching on the system prompt: subsequent calls within the 5-minute TTL
get a cache hit on the (large, repeated) system prompt and pay the
cheaper cache-hit rate. Per-turn user/assistant content is never marked
cacheable because it changes every call.
Cost estimation uses a model→price table; the table is the single
source of truth for these estimates (not for actual billing) and is
easy to update when pricing changes.
"""
from __future__ import annotations

from dataclasses import dataclass, field

from anthropic import Anthropic
from anthropic.types import Message
# USD per one million tokens, keyed by model id. Every row carries four
# rates: "input" (uncached input), "output", "cache_write" (tokens written
# into the prompt cache, ~25% above the input rate) and "cache_read"
# (tokens served from the cache, ~10% of the input rate).
#
# The figures are approximate and feed only the per-call cost estimate
# shown in log lines; Anthropic's invoice remains the authority on what
# was actually billed.
#
# NOTE(review): the rates were not re-verified during this pass. The haiku
# row in particular may predate the current published schedule -- confirm
# before relying on it.
_PRICES_PER_MILLION: dict[str, dict[str, float]] = {
    "claude-opus-4-7": {
        "input": 15.0,
        "output": 75.0,
        "cache_write": 18.75,
        "cache_read": 1.50,
    },
    "claude-opus-4-7-1m": {
        "input": 15.0,
        "output": 75.0,
        "cache_write": 18.75,
        "cache_read": 1.50,
    },
    "claude-sonnet-4-6": {
        "input": 3.0,
        "output": 15.0,
        "cache_write": 3.75,
        "cache_read": 0.30,
    },
    "claude-haiku-4-5-20251001": {
        "input": 0.80,
        "output": 4.0,
        "cache_write": 1.00,
        "cache_read": 0.08,
    },
}
def _price_for(model: str) -> dict[str, float]:
    """Return the per-million-token price row for ``model``.

    Model ids missing from ``_PRICES_PER_MILLION`` are priced using the
    ``claude-opus-4-7`` row.
    """
    try:
        return _PRICES_PER_MILLION[model]
    except KeyError:
        # Unknown model: fall back to the Opus 4.7 row, the most expensive
        # one in the table, so the estimate errs high instead of
        # under-reporting. The original note says this situation is logged
        # once at startup; no logging happens inside this function.
        return _PRICES_PER_MILLION["claude-opus-4-7"]
@dataclass
class TurnResult:
    """Outcome of one API turn: reply text, token usage, and estimated cost."""

    # Assistant reply: the response's text blocks concatenated in order.
    text: str
    # Token counts copied from the response's usage object (0 when absent).
    input_tokens: int
    output_tokens: int
    cache_creation_input_tokens: int
    cache_read_input_tokens: int
    # Approximate USD cost of this call, derived from the module price table.
    estimated_cost_usd: float
    # The unmodified SDK response, kept for callers that need more detail.
    raw: Message
@dataclass
class AnthropicClient:
    """Thin wrapper around the Anthropic SDK's Messages API.

    Sends one turn at a time and returns the reply text together with
    token usage and an estimated cost (see ``send``).

    Attributes:
        api_key: Anthropic API key handed to the SDK client. Left out of
            the generated ``repr`` so it cannot leak into logs or
            tracebacks that print the client.
        model: Model id passed to the API and used for the price lookup.
        max_output_tokens: Value sent as ``max_tokens`` on every call.
    """

    # repr=False: the default dataclass repr would print the secret verbatim.
    api_key: str = field(repr=False)
    model: str
    max_output_tokens: int = 4096

    def __post_init__(self) -> None:
        # One SDK client per wrapper instance, created at construction time.
        self._sdk = Anthropic(api_key=self.api_key)

    def send(
        self,
        *,
        system_prompt: str,
        messages: list[dict[str, str]],
    ) -> TurnResult:
        """Send a single API turn. The system prompt is marked cacheable.

        ``messages`` is the full conversation history shaped for the
        Messages API. The most recent message is the user turn we're
        responding to. The daemon is responsible for appending the
        assistant text it gets back into history before the next call.

        Args:
            system_prompt: Sent as one text block tagged with ephemeral
                ``cache_control``.
            messages: Conversation history, passed to the SDK unchanged.

        Returns:
            A ``TurnResult`` with the concatenated text blocks, the usage
            counters, the estimated cost in USD, and the raw response.

        Raises:
            Nothing is caught here; any exception from the SDK call
            propagates to the caller.
        """
        response = self._sdk.messages.create(
            model=self.model,
            max_tokens=self.max_output_tokens,
            system=[
                {
                    "type": "text",
                    "text": system_prompt,
                    "cache_control": {"type": "ephemeral"},
                }
            ],
            messages=messages,
        )

        # Keep only text blocks; any other block type is skipped.
        text = "".join(
            block.text for block in response.content if getattr(block, "type", None) == "text"
        )

        # getattr's default covers a missing attribute; ``or 0`` covers one
        # that is present but None.
        usage = response.usage
        in_tokens = int(getattr(usage, "input_tokens", 0) or 0)
        out_tokens = int(getattr(usage, "output_tokens", 0) or 0)
        cache_create = int(getattr(usage, "cache_creation_input_tokens", 0) or 0)
        cache_read = int(getattr(usage, "cache_read_input_tokens", 0) or 0)

        # Prices are quoted per million tokens, hence the final division.
        prices = _price_for(self.model)
        cost = (
            in_tokens * prices["input"]
            + out_tokens * prices["output"]
            + cache_create * prices["cache_write"]
            + cache_read * prices["cache_read"]
        ) / 1_000_000.0

        return TurnResult(
            text=text,
            input_tokens=in_tokens,
            output_tokens=out_tokens,
            cache_creation_input_tokens=cache_create,
            cache_read_input_tokens=cache_read,
            estimated_cost_usd=cost,
            raw=response,
        )