# relay/anthropic_client.py

"""Anthropic API wrapper.

Wraps the SDK call so the daemon can send a turn and get back the
assistant text + token usage + estimated cost. Implements prompt
caching on the system prompt: subsequent calls within the 5-minute TTL
get a cache hit on the (large, repeated) system prompt and pay the
cheaper cache-hit rate. Per-turn user/assistant content is never marked
cacheable because it changes every call.

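Cost estimation uses a model→price table; the table is the single place
estimate prices are defined and is easy to update when pricing changes.
The resulting figure is an approximation for log lines, not a billing record.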
"""

from __future__ import annotations

from dataclasses import dataclass, field

from anthropic import Anthropic
from anthropic.types import Message

# USD per one million tokens, keyed first by model id and then by token
# class. Values were taken from Anthropic's published price schedule:
#   * "cache_read"  -- cache-hit input, roughly 10% of the "input" rate
#   * "cache_write" -- cache-creation input, roughly 25% above "input"
# The figures are approximations that feed log-line cost estimates only;
# Anthropic's invoice remains the source of truth for billing.
_PRICES_PER_MILLION: dict[str, dict[str, float]] = {
    "claude-opus-4-7": {
        "input": 15.0,
        "output": 75.0,
        "cache_write": 18.75,
        "cache_read": 1.50,
    },
    "claude-opus-4-7-1m": {
        "input": 15.0,
        "output": 75.0,
        "cache_write": 18.75,
        "cache_read": 1.50,
    },
    "claude-sonnet-4-6": {
        "input": 3.0,
        "output": 15.0,
        "cache_write": 3.75,
        "cache_read": 0.30,
    },
    "claude-haiku-4-5-20251001": {
        "input": 0.80,
        "output": 4.0,
        "cache_write": 1.00,
        "cache_read": 0.08,
    },
}


def _price_for(model: str) -> dict[str, float]:
    """Return the per-million-token price row used to estimate *model*'s cost.

    A model id that is not a key of ``_PRICES_PER_MILLION`` is priced with
    the ``"claude-opus-4-7"`` row instead of raising.
    """
    try:
        return _PRICES_PER_MILLION[model]
    except KeyError:
        # Unknown model: use the Opus 4.7 row as the worst case so the
        # estimate does not under-report.
        # NOTE(review): the earlier comment here said this fallback is
        # "logged once at startup"; nothing in this function logs -- confirm
        # that the log line exists elsewhere.
        return _PRICES_PER_MILLION["claude-opus-4-7"]


@dataclass
class TurnResult:
    """Outcome of one ``AnthropicClient.send`` call: reply text, usage, cost."""

    # Concatenation of every "text"-type content block in the response.
    text: str
    # Token counters copied from the response's ``usage`` object; a missing
    # or falsy counter is recorded as 0.
    input_tokens: int
    output_tokens: int
    cache_creation_input_tokens: int
    cache_read_input_tokens: int
    # Approximate USD cost derived from the four counters above and the
    # module's per-million-token price table.
    estimated_cost_usd: float
    # The SDK response object exactly as returned by the API call.
    raw: Message


@dataclass
class AnthropicClient:
    """Thin wrapper around the Anthropic SDK for sending one turn at a time.

    Construct it with an API key and a model id; ``send`` then performs a
    single Messages API call and returns a ``TurnResult`` carrying the reply
    text, the token usage and an estimated cost.
    """

    # Kept out of the generated ``__repr__`` so the secret cannot leak into
    # log lines or tracebacks that format the client object. It is still a
    # required, first-positional constructor argument and still takes part
    # in equality comparison.
    api_key: str = field(repr=False)
    # Model id passed to the API and used to look up the price row.
    model: str
    # Upper bound sent as ``max_tokens`` on every call.
    max_output_tokens: int = 4096

    def __post_init__(self) -> None:
        """Create the underlying SDK client from the configured API key."""
        self._sdk = Anthropic(api_key=self.api_key)

    def send(
        self,
        *,
        system_prompt: str,
        messages: list[dict[str, str]],
    ) -> TurnResult:
        """Send a single API turn. The system prompt is marked cacheable.

        ``messages`` is the full conversation history shaped for the
        Messages API. The most recent message is the user turn we're
        responding to. The daemon is responsible for appending the
        assistant text it gets back into history before the next call.

        Args:
            system_prompt: Text sent as the only system block, tagged with
                ephemeral ``cache_control``.
            messages: Conversation history, passed through unchanged.

        Returns:
            A ``TurnResult`` with the concatenated text blocks, the four
            usage counters, the estimated USD cost and the raw response.

        Raises:
            Whatever the SDK call raises; nothing is caught here.
        """

        response = self._sdk.messages.create(
            model=self.model,
            max_tokens=self.max_output_tokens,
            # Only the system block carries cache_control; the per-turn
            # messages are sent as-is.
            system=[
                {
                    "type": "text",
                    "text": system_prompt,
                    "cache_control": {"type": "ephemeral"},
                }
            ],
            messages=messages,
        )

        # Keep only text blocks; any other block type is ignored.
        text = "".join(
            block.text for block in response.content if getattr(block, "type", None) == "text"
        )

        # Each counter may be absent or None on the usage object, so fall
        # back to 0 before converting.
        usage = response.usage
        in_tokens = int(getattr(usage, "input_tokens", 0) or 0)
        out_tokens = int(getattr(usage, "output_tokens", 0) or 0)
        cache_create = int(getattr(usage, "cache_creation_input_tokens", 0) or 0)
        cache_read = int(getattr(usage, "cache_read_input_tokens", 0) or 0)

        # Prices are per one million tokens, hence the final division.
        prices = _price_for(self.model)
        cost = (
            in_tokens * prices["input"]
            + out_tokens * prices["output"]
            + cache_create * prices["cache_write"]
            + cache_read * prices["cache_read"]
        ) / 1_000_000.0

        return TurnResult(
            text=text,
            input_tokens=in_tokens,
            output_tokens=out_tokens,
            cache_creation_input_tokens=cache_create,
            cache_read_input_tokens=cache_read,
            estimated_cost_usd=cost,
            raw=response,
        )
feat: relay daemon skeleton — queue, dispatch, conversation, ntfy (#1) First-PR scope from #1. Single-process Python daemon that relays between Claude Code instances and chat-Claude (Anthropic API). Components: * relay.config — .env + config.yaml loader. Auto-generates ntfy topic on first run and persists it back to .env. * relay.state — atomic file I/O via tempfile + rename, advisory flock at state/.lock to enforce single-instance. * relay.conversation — append-only history with summarization. Triggers a summarize call when total chars exceed HISTORY_CHAR_CAP (default 400k); replaces history with the summary plus the most recent 10 turns. * relay.anthropic_client — SDK wrapper. Marks the system prompt cacheable (5-min ephemeral cache); concatenates text blocks; estimates per-call cost from the Anthropic price table with cache-write/read accounted for. * relay.queue — JSON envelope intake; oldest-by-mtime; malformed envelopes moved to queue/.rejected/. * relay.dispatch — one-input-at-a-time per session (dispatch/<session_id>/input.txt). Won't overwrite a pending dispatch; queues internally and waits for CC to delete. * relay.ntfy — best-effort POST to https://ntfy.sh/<topic>; failures logged but never block the main loop. * relay.daemon — main loop. Polls jc_input.txt (priority) then queue/. Detects [NEEDS-JC] in the first 200 chars of any response and pauses dispatch until JC writes jc_input.txt. JC override supports @session-N: prefix for direct dispatch without an API call. * relay.__main__ — CLI: relay run / relay status / relay topic. Tests: 57 unit tests pass (config, state, conversation, queue, dispatch, anthropic_client, ntfy, full daemon loop with a fake client). One real-API smoke test marked real_api, opt-in via pytest -m real_api; skips cleanly on credit-balance errors. Out of scope for this PR (deferred to follow-ups): Flask status endpoint, multi-session config in production, exponential backoff, systemd unit, cost-tracking aggregation. Closes #1. 
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-05-02 15:24:47 +00:00			`"""Anthropic API wrapper.`

			`Wraps the SDK call so the daemon can send a turn and get back the`
			`assistant text + token usage + estimated cost. Implements prompt`
			`caching on the system prompt: subsequent calls within the 5-minute TTL`
			`get a cache hit on the (large, repeated) system prompt and pay the`
			`cheaper cache-hit rate. Per-turn user/assistant content is never marked`
			`cacheable because it changes every call.`

			`Cost estimation uses a model→price table; the table is the source of`
			`truth and is easy to update when pricing changes.`
			`"""`

			`from __future__ import annotations`

			`from dataclasses import dataclass`

			`from anthropic import Anthropic`
			`from anthropic.types import Message`

			`# Pricing per 1M tokens (USD), pulled from Anthropic's published schedule.`
			`# Cache-hit input is billed at the cache-read rate (~10% of standard input).`
			`# Cache-write is ~25% more than standard input. These numbers are`
			`# approximate and used only for log-line cost estimation; the source of`
			`# truth for billing is Anthropic's invoice.`
			`_PRICES_PER_MILLION: dict[str, dict[str, float]] = {`
			`"claude-opus-4-7": {"input": 15.0, "output": 75.0, "cache_write": 18.75, "cache_read": 1.50},`
			`"claude-opus-4-7-1m": {"input": 15.0, "output": 75.0, "cache_write": 18.75, "cache_read": 1.50},`
			`"claude-sonnet-4-6": {"input": 3.0, "output": 15.0, "cache_write": 3.75, "cache_read": 0.30},`
			`"claude-haiku-4-5-20251001": {`
			`"input": 0.80,`
			`"output": 4.0,`
			`"cache_write": 1.00,`
			`"cache_read": 0.08,`
			`},`
			`}`


			`def _price_for(model: str) -> dict[str, float]:`
			`if model in _PRICES_PER_MILLION:`
			`return _PRICES_PER_MILLION[model]`
			`# Fallback: charge as Opus 4.7 (worst-case) so estimates don't`
			`# under-report for unknown models. Logged once at startup.`
			`return _PRICES_PER_MILLION["claude-opus-4-7"]`


			`@dataclass`
			`class TurnResult:`
			`text: str`
			`input_tokens: int`
			`output_tokens: int`
			`cache_creation_input_tokens: int`
			`cache_read_input_tokens: int`
			`estimated_cost_usd: float`
			`raw: Message`


			`@dataclass`
			`class AnthropicClient:`
			`api_key: str`
			`model: str`
			`max_output_tokens: int = 4096`

			`def __post_init__(self) -> None:`
			`self._sdk = Anthropic(api_key=self.api_key)`

			`def send(`
			`self,`
			`*,`
			`system_prompt: str,`
			`messages: list[dict[str, str]],`
			`) -> TurnResult:`
			`"""Send a single API turn. The system prompt is marked cacheable.`

			``messages`` is the full conversation history shaped for the
			`Messages API. The most recent message is the user turn we're`
			`responding to. The daemon is responsible for appending the`
			`assistant text it gets back into history before the next call.`
			`"""`

			`response = self._sdk.messages.create(`
			`model=self.model,`
			`max_tokens=self.max_output_tokens,`
			`system=[`
			`{`
			`"type": "text",`
			`"text": system_prompt,`
			`"cache_control": {"type": "ephemeral"},`
			`}`
			`],`
			`messages=messages,`
			`)`

			`text = "".join(`
			`block.text for block in response.content if getattr(block, "type", None) == "text"`
			`)`
			`usage = response.usage`
			`in_tokens = int(getattr(usage, "input_tokens", 0) or 0)`
			`out_tokens = int(getattr(usage, "output_tokens", 0) or 0)`
			`cache_create = int(getattr(usage, "cache_creation_input_tokens", 0) or 0)`
			`cache_read = int(getattr(usage, "cache_read_input_tokens", 0) or 0)`

			`prices = _price_for(self.model)`
			`cost = (`
			`in_tokens * prices["input"]`
			`+ out_tokens * prices["output"]`
			`+ cache_create * prices["cache_write"]`
			`+ cache_read * prices["cache_read"]`
			`) / 1_000_000.0`

			`return TurnResult(`
			`text=text,`
			`input_tokens=in_tokens,`
			`output_tokens=out_tokens,`
			`cache_creation_input_tokens=cache_create,`
			`cache_read_input_tokens=cache_read,`
			`estimated_cost_usd=cost,`
			`raw=response,`
			`)`