diff --git a/.gitignore b/.gitignore index 36b13f1..65398b3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ -# ---> Python -# Byte-compiled / optimized / DLL files +# Python bytecode __pycache__/ *.py[cod] *$py.class @@ -8,169 +7,74 @@ __pycache__/ *.so # Distribution / packaging -.Python build/ -develop-eggs/ dist/ downloads/ eggs/ .eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ *.egg-info/ -.installed.cfg +pip-wheel-metadata/ *.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec # Installer logs pip-log.txt pip-delete-this-directory.txt -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ +# Unit test / coverage +.pytest_cache/ +.cache .coverage .coverage.* -.cache -nosetests.xml +htmlcov/ coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ +nose2-junit.xml -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# UV -# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 
-# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -#uv.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/latest/usage/project/#working-with-version-control -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ +# Virtual environments +.venv/ venv/ ENV/ +env/ env.bak/ venv.bak/ -# Spyder project settings -.spyderproject -.spyproject +# Environment files +.env +.env.* +credentials.json -# Rope project settings -.ropeproject +# IDEs and editors +.vscode/ +.idea/ +*.iml -# mkdocs documentation -/site - -# mypy +# Type checkers .mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker .pyre/ -# pytype static type analyzer -.pytype/ +# Jupyter +.ipynb_checkpoints -# Cython debug symbols -cython_debug/ +# Logs +*.log -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-#.idea/ +# System files +.DS_Store +Thumbs.db -# Ruff stuff: -.ruff_cache/ +# Node +node_modules/ -# PyPI configuration file -.pypirc +# Docker +docker-compose.override.yml +# ESP-IDF / CMake / embedded build artifacts +sdkconfig +sdkconfig.* +build/ +components/**/build/ +components/**/.build/ +partitions.csv + +# Misc +*.sqlite3 +*.db diff --git a/README.md b/README.md index 598f6ac..bdc4f36 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,604 @@ -# stentor +# Stentor — Voice Gateway Specification + +> *\"Stentor, whose voice was as powerful as fifty voices of other men.\"* +> — Homer, *Iliad*, Book V + +**Stentor** is a voice gateway that connects physical audio hardware (ESP32-S3-AUDIO-Board) to AI agents (FastAgent) via speech services (Speaches), enabling real-time voice conversations with AI. + +--- + +## Architecture Overview + +``` +┌──────────────────────────────────┐ +│ ESP32-S3-AUDIO-Board │ +│ (stentor-ear) │ +│ │ +│ Mic Array ──▶ Wake Word ──▶ WS │ +│ Speaker ◀── PCM playback ◀── │ +└──────────────┬───────────────────┘ + │ WebSocket (audio) + ▼ +┌──────────────────────────────────┐ +│ Stentor Gateway │ +│ (Python service) │ +│ │ +│ WS Server ──▶ STT ──▶ Agent │ +│ WS Server ◀── TTS ◀── Agent │ +└──────┬──────────┬──────────┬─────┘ + │ │ │ + ▼ ▼ ▼ + Speaches FastAgent Speaches + STT (HTTP) TTS +``` + +--- + +## Components + +### 1. Stentor Gateway (`stentor-gateway/`) + +**Language:** Python 3.12+ +**Framework:** FastAPI + WebSockets +**Deployment:** Agathos container (Caliban or dedicated) + +The gateway is the central orchestrator. 
It: +- Accepts WebSocket connections from ESP32 clients +- Streams audio to Speaches STT for transcription +- Sends transcripts to FastAgent for AI response +- Sends AI response text to Speaches TTS for synthesis +- Streams synthesized audio back to the ESP32 client + +#### Configuration + +All configuration via environment variables (12-factor): + +| Variable | Description | Default | +|----------|-------------|---------| +| `STENTOR_HOST` | Gateway bind address | `0.0.0.0` | +| `STENTOR_PORT` | Gateway bind port | `8600` | +| `STENTOR_STT_URL` | Speaches STT endpoint (OpenAI-compatible) | `http://perseus.incus:8000` | +| `STENTOR_TTS_URL` | Speaches TTS endpoint (OpenAI-compatible) | `http://pan.incus:8000` | +| `STENTOR_AGENT_URL` | FastAgent HTTP endpoint | `http://localhost:8001` | +| `STENTOR_STT_MODEL` | Whisper model for STT | `Systran/faster-whisper-small` | +| `STENTOR_TTS_MODEL` | TTS model name | `kokoro` | +| `STENTOR_TTS_VOICE` | TTS voice ID | `af_heart` | +| `STENTOR_AUDIO_SAMPLE_RATE` | Audio sample rate in Hz | `16000` | +| `STENTOR_AUDIO_CHANNELS` | Audio channel count | `1` | +| `STENTOR_AUDIO_SAMPLE_WIDTH` | Bits per sample | `16` | +| `STENTOR_LOG_LEVEL` | Logging level | `INFO` | + +#### WebSocket Protocol + +Endpoint: `ws://{STENTOR_HOST}:{STENTOR_PORT}/ws/audio` + +##### Client → Gateway Messages + +```json +// Session start (sent once on connect) +{ + \"type\": \"session.start\", + \"client_id\": \"esp32-kitchen\", + \"audio_config\": { + \"sample_rate\": 16000, + \"channels\": 1, + \"sample_width\": 16, + \"encoding\": \"pcm_s16le\" + } +} + +// Audio data (sent continuously while listening) +{ + \"type\": \"audio.chunk\", + \"data\": \"\" +} + +// Client signals end of speech +{ + \"type\": \"audio.end\" +} + +// Client requests session termination +{ + \"type\": \"session.end\" +} +``` + +##### Gateway → Client Messages + +```json +// Acknowledge session +{ + \"type\": \"session.ready\", + \"session_id\": \"uuid\" +} + +// Gateway is 
processing (ESP32 can show \"thinking\" LED) +{ + \"type\": \"status\", + \"state\": \"listening\" | \"transcribing\" | \"thinking\" | \"speaking\" +} + +// Transcript of what user said (for debugging/display) +{ + \"type\": \"transcript\", + \"text\": \"What is the weather like today?\" +} + +// AI response text (for debugging/display) +{ + \"type\": \"response.text\", + \"text\": \"I don\\'t have weather tools yet, but...\" +} + +// Audio response (streamed in chunks) +{ + \"type\": \"audio.chunk\", + \"data\": \"\" +} + +// Audio response complete +{ + \"type\": \"audio.end\" +} + +// Error +{ + \"type\": \"error\", + \"message\": \"STT service unavailable\", + \"code\": \"stt_error\" +} +``` + +#### Processing Pipeline + +``` +1. ESP32 detects wake word locally (ESP-SR) +2. ESP32 opens WebSocket, sends session.start +3. Gateway sends session.ready +4. ESP32 streams audio.chunk messages +5. Gateway buffers audio (with Voice Activity Detection) +6. ESP32 sends audio.end (or VAD detects silence) +7. Gateway sends status: \"transcribing\" +8. Gateway calls Speaches STT: + POST {STENTOR_STT_URL}/v1/audio/transcriptions + - model: {STENTOR_STT_MODEL} + - file: collected audio as WAV + - response_format: json +9. Gateway sends transcript message to client +10. Gateway sends status: \"thinking\" +11. Gateway calls FastAgent: + POST {STENTOR_AGENT_URL}/message + - content: transcript text +12. Gateway sends response.text to client +13. Gateway sends status: \"speaking\" +14. Gateway calls Speaches TTS: + POST {STENTOR_TTS_URL}/v1/audio/speech + - model: {STENTOR_TTS_MODEL} + - voice: {STENTOR_TTS_VOICE} + - input: agent response text + - response_format: pcm + - speed: 1.0 +15. Gateway streams audio.chunk messages to client +16. Gateway sends audio.end +17. 
Gateway returns to listening state (step 4) + OR ESP32 sends session.end to disconnect +``` + +#### Health & Observability + +- `GET /health` — Returns 200 if gateway is running +- `GET /health/ready` — Returns 200 if STT, TTS, and Agent endpoints are reachable +- `GET /metrics` — Prometheus-compatible metrics + - `stentor_sessions_active` — Current active WebSocket sessions + - `stentor_transcriptions_total` — Total STT calls + - `stentor_tts_requests_total` — Total TTS calls + - `stentor_agent_requests_total` — Total agent calls + - `stentor_pipeline_duration_seconds` — Histogram of full pipeline latency + - `stentor_stt_duration_seconds` — Histogram of STT latency + - `stentor_tts_duration_seconds` — Histogram of TTS latency + - `stentor_agent_duration_seconds` — Histogram of agent response latency + +#### Dependencies (Python) + +```toml +[project] +name = \"stentor-gateway\" +version = \"0.1.0\" +requires-python = \">=3.12\" +dependencies = [ + \"fastapi>=0.115\", + \"uvicorn[standard]>=0.34\", + \"websockets>=14.0\", + \"httpx>=0.28\", + \"pydantic>=2.10\", + \"pydantic-settings>=2.7\", +] + +[project.optional-dependencies] +dev = [ + \"pytest>=8.0\", + \"pytest-asyncio>=0.25\", + \"ruff>=0.9\", +] +``` + +#### Project Structure + +``` +stentor-gateway/ +├── pyproject.toml +├── src/ +│ └── stentor/ +│ ├── __init__.py +│ ├── main.py # FastAPI app, WebSocket endpoint +│ ├── config.py # Pydantic Settings configuration +│ ├── models.py # WebSocket message models +│ ├── pipeline.py # Orchestration: STT → Agent → TTS +│ ├── stt_client.py # Speaches STT client (OpenAI-compatible) +│ ├── tts_client.py # Speaches TTS client (OpenAI-compatible) +│ ├── agent_client.py # FastAgent HTTP client +│ ├── audio.py # Audio utilities (PCM↔WAV, resampling) +│ └── health.py # Health check endpoints +├── tests/ +│ ├── test_pipeline.py +│ ├── test_stt_client.py +│ ├── test_tts_client.py +│ └── test_websocket.py +└── Dockerfile +``` + +--- + +### 2. 
Stentor Ear (`stentor-ear/`) + +**Platform:** ESP32-S3-AUDIO-Board (Waveshare) +**Framework:** ESP-IDF 5.x + ESP-ADF +**Language:** C + +The ESP32 firmware handles local audio capture, wake word detection, and communication with the Stentor Gateway. + +#### Hardware Capabilities (Waveshare ESP32-S3-AUDIO-Board) + +| Component | Chip | Purpose | +|-----------|------|---------| +| Microphone array | 2× MEMS via ES7210 ADC | Audio capture, noise reduction | +| Speaker output | ES8311 DAC + Class-D amp | Audio playback | +| Display | 1.85\" LCD (ST7789V2) | Status display (optional) | +| LEDs | WS2812B RGB ring (×2) | Visual status feedback | +| Storage | TF card slot | Local audio cache (optional) | +| Camera | OV2640 (2MP) | Not used for Stentor | +| RTC | PCF85063 | Timestamping (optional) | +| Connectivity | WiFi 802.11 b/g/n + BLE 5 | Network communication | +| Wake word | ESP-SR | Local \"Hi ESP\" detection | + +Reference: Waveshare wiki documents ESP-SR wake word \"hi esp\" with command recognition support [2]. 
+ +#### Firmware State Machine + +``` + ┌──────────┐ + ┌─────────│ IDLE │◀──────────┐ + │ │ (LEDs: dim blue) │ + │ └────┬─────┘ │ + │ │ Wake word │ + │ │ detected │ + │ ▼ │ + │ ┌──────────┐ │ + │ │ CONNECT │ │ + │ │ (LEDs: pulse white) │ + │ └────┬─────┘ │ + │ │ WS connected │ + │ ▼ │ + │ ┌──────────┐ │ + Timeout│ │ LISTEN │ │ audio.end + / Error│ │ (LEDs: green) │ received + │ └────┬─────┘ │ + │ │ Silence / │ + │ │ button press │ + │ ▼ │ + │ ┌──────────┐ │ + │ │ WAITING │ │ + │ │ (LEDs: pulse yellow)│ + │ └────┬─────┘ │ + │ │ audio.chunk │ + │ │ from gateway │ + │ ▼ │ + │ ┌──────────┐ │ + │ │ PLAYING ├──────────┘ + │ │ (LEDs: pulse cyan) │ + │ └──────────┘ │ + │ │ + └────────────────────────────────┘ +``` + +#### Audio Configuration + +| Parameter | Value | Notes | +|-----------|-------|-------| +| Sample rate | 16000 Hz | Standard for speech recognition | +| Bit depth | 16-bit | Signed, little-endian (PCM_S16LE) | +| Channels | 1 (mono) | Downmixed from dual-mic array | +| Chunk size | 512 samples (32ms) | Balance between latency and overhead | +| Buffer | 4096 samples (256ms) | Ring buffer for network jitter | + +#### WiFi & Network + +- WiFi credentials via `menuconfig` or BLE provisioning +- mDNS discovery for gateway: `_stentor._tcp.local` +- Fallback to configured static gateway URL +- Auto-reconnect with exponential backoff + +#### LED Feedback (WS2812B Ring) + +| State | Pattern | Color | +|-------|---------|-------| +| Idle / standby | Dim steady | Blue | +| Wake word detected / connecting | Pulse | White | +| Listening (recording) | Steady | Green | +| Transcribing / thinking | Rotating pulse | Yellow | +| Speaking (playback) | Pulse with audio | Cyan | +| Error | Flash | Red | + +#### Firmware Project Structure + +``` +stentor-ear/ +├── CMakeLists.txt +├── sdkconfig.defaults +├── main/ +│ ├── CMakeLists.txt +│ ├── main.c # App entry, state machine +│ ├── audio_capture.c/.h # I2S mic input via ES7210 +│ ├── audio_playback.c/.h # I2S speaker output via 
ES8311 +│ ├── wake_word.c/.h # ESP-SR integration +│ ├── ws_client.c/.h # WebSocket client to gateway +│ ├── led_status.c/.h # WS2812B LED patterns +│ ├── wifi_manager.c/.h # WiFi connection management +│ └── config.h # Compile-time configuration +├── components/ +│ └── (ESP-ADF / ESP-SR as managed components) +└── partitions.csv +``` + +--- + +### 3. FastAgent Integration + +The Stentor Gateway communicates with FastAgent via its HTTP transport mode. + +#### Starting FastAgent for Stentor + +```bash +# Expose Harper (or any agent) as an HTTP server +uv run fast-agent --transport http --port 8001 +``` + +#### Agent Request/Response + +The gateway sends a simple message and receives the agent\'s response. The exact API shape depends on FastAgent\'s HTTP transport implementation — expected to follow MCP-over-HTTP conventions: + +``` +POST /message +Content-Type: application/json + +{ + \"content\": \"What can you tell me about the Agathos infrastructure?\" +} + +Response: +{ + \"content\": \"Agathos is a set of Incus containers on a 10.10.0.0/24 network...\" +} +``` + +> **Note for Cline:** The FastAgent HTTP transport API may need investigation. Check the fast-agent source for the actual endpoint shape. The gateway\'s `agent_client.py` should be the thinnest possible adapter so it\'s easy to swap. + +--- + +### 4. Speaches Integration + +Speaches provides OpenAI API-compatible STT and TTS endpoints. 
+ +#### STT (Speech-to-Text) + +``` +POST {STENTOR_STT_URL}/v1/audio/transcriptions +Content-Type: multipart/form-data + +- file: audio.wav (PCM wrapped in WAV header) +- model: \"Systran/faster-whisper-small\" +- language: \"en\" (optional, auto-detect if omitted) +- response_format: \"json\" + +Response: +{ + \"text\": \"What is the weather like today?\" +} +``` + +#### TTS (Text-to-Speech) + +``` +POST {STENTOR_TTS_URL}/v1/audio/speech +Content-Type: application/json + +{ + \"model\": \"kokoro\", + \"voice\": \"af_heart\", + \"input\": \"I don\\'t have weather tools yet, but I can help with other things.\", + \"response_format\": \"pcm\", + \"speed\": 1.0 +} + +Response: Binary PCM audio stream (24kHz, mono, 16-bit) +``` + +> **Note:** Speaches TTS may output at 24kHz. The gateway\'s `audio.py` module should handle resampling to 16kHz if the ESP32 is configured for 16kHz playback. Alternatively, configure the ESP32 for 24kHz playback. + +--- + +## Build & Run + +### Gateway + +```bash +cd stentor-gateway +pip install -e . +stentor +``` + +Or with more control: + +```bash +python -m stentor +# or: uvicorn stentor.main:app --host 0.0.0.0 --port 25573 +``` + +See [docs/stentor.md](docs/stentor.md) for full usage documentation. 
+ +### ESP32 Firmware + +```bash +cd stentor-ear +idf.py set-target esp32s3 +idf.py menuconfig # Set WiFi creds, gateway URL +idf.py build flash monitor +``` + +--- + +## Phase 2 (Future) + +These are explicitly **out of scope** for v0.1 but documented for planning: + +- [ ] **Streaming STT** — Use Speaches SSE streaming for real-time partial transcripts +- [ ] **Streaming TTS** — Stream TTS audio chunks as they\'re generated (reduce time-to-first-audio) +- [ ] **Barge-in / Interruption** — Allow user to interrupt AI playback with new speech +- [ ] **Multi-turn context** — Maintain conversation history across turns within a session +- [ ] **PJSUA2 SIP bridge** — Enable calling Stentor from a phone via SIP +- [ ] **Multiple agents** — Route to different FastAgent instances based on wake word or command +- [ ] **Local fallback** — ESP32 handles simple commands locally when gateway is unreachable +- [ ] **OTA firmware updates** — Update stentor-ear firmware over WiFi +- [ ] **LCD UI** — Show transcript and response text on the 1.85\" display + +--- + +## Design Decisions & Rationale + +| Decision | Rationale | +|----------|-----------| +| WebSocket over SIP/RTP | No SIP infrastructure needed; ESP-IDF has native WS support; Speaches speaks HTTP not SIP | +| Base64 audio in JSON | Simplicity for v0.1; binary WS frames are a Phase 2 optimization | +| Wake word on ESP32 | Zero-latency wake detection; no audio streaming when idle; privacy-preserving | +| Gateway as separate service | Decouples hardware from AI; can swap agents, add SIP later, scale independently | +| PCM_S16LE format | Universal, no codec overhead, both ESP32 and Speaches handle it natively | +| Pydantic Settings for config | Type-safe, env-var-driven, zero config files to manage | + +--- + +## Notes for Cline + +1. **Start with the gateway.** The ESP32 firmware is more complex and needs the gateway to test against. +2. 
**Use `httpx.AsyncClient`** for all HTTP calls — it supports streaming responses which you\'ll need for TTS. +3. **The WebSocket protocol is intentionally simple.** JSON messages with base64 audio. Optimize to binary frames later. +4. **Test with a Python WebSocket client first** before touching ESP32 hardware. Write a `test_client.py` that sends a WAV file over the WebSocket and plays back the response. +5. **FastAgent HTTP transport** needs investigation — check the fast-agent repo for actual endpoint signatures. +6. **Audio resampling** — Speaches TTS may output 24kHz; ESP32 may expect 16kHz. Handle this in `audio.py`. +7. **Don\'t overthink error handling in v0.1** — log errors, send error messages to the client, keep the pipeline simple. +SPECEOF +echo 'SPEC.md written'"}} + + +SPEC.md written + + +Now let me push it to the repo: + + +{"name": "execute_command", "arguments": {"command": "cd /tmp/stentor && git init && git remote add origin https://gitea.service.consul/robert/stentor.git && git fetch origin && git checkout main && cp SPEC.md . && git add SPEC.md && git commit -m 'Add Stentor voice gateway specification' && git push origin main"}} + + +hint: Using 'master' as the name for the initial branch. This name is subject to change. To +hint: configure the initial branch name to use in all of your new repositories, which will +hint: suppress this warning, call: +hint: +hint: git config --global init.defaultBranch +hint: +hint: Names commonly chosen instead of 'master' are 'main', 'trunk' and 'development'. The +hint: just-created branch can be renamed via this command: +hint: +hint: git branch -m +From https://gitea.service.consul/robert/stentor + * [new branch] main -> origin/main +branch 'main' set up to track 'origin/main'. 
+Already on 'main' +[main 2e4e0b5] Add Stentor voice gateway specification + 1 file changed, 362 insertions(+) + create mode 100644 SPEC.md +To https://gitea.service.consul/robert/stentor.git + 7e5e7b2..2e4e0b5 main -> main + + +Now let me also create a basic README and scaffold the directory structure: + + +{"name": "execute_command", "arguments": {"command": "cd /tmp/stentor && cat > README.md << 'EOF' +# Στέντωρ (Stentor) + +> *\"Stentor, whose voice was as powerful as fifty voices of other men.\"* +> — Homer, *Iliad*, Book V + +Voice gateway for AI agents. Bridges physical audio hardware to AI agents via speech services. + +## Architecture + +``` +ESP32-S3-AUDIO-Board ◀──WebSocket──▶ Stentor Gateway ◀──HTTP──▶ Speaches (STT/TTS) + (stentor-ear) (Python) FastAgent (AI) +``` + +**Stentor Gateway** orchestrates the voice pipeline: +1. Receives audio from ESP32 via WebSocket +2. Transcribes speech via Speaches STT (OpenAI-compatible API) +3. Sends transcript to FastAgent for AI response +4. Synthesizes speech via Speaches TTS +5. Streams audio back to ESP32 for playback + +**Stentor Ear** is the ESP32-S3 firmware: +- Local wake word detection (\"Hi ESP\" via ESP-SR) +- Audio capture from dual-mic array +- Audio playback via onboard speaker +- LED ring feedback for conversation state + +## Quick Start + +### Gateway + +```bash +cd stentor-gateway +uv sync +uv run uvicorn stentor.main:app --host 0.0.0.0 --port 8600 +``` + +### ESP32 Firmware + +```bash +cd stentor-ear +idf.py set-target esp32s3 +idf.py menuconfig # Set WiFi creds, gateway URL +idf.py build flash monitor +``` + +## Documentation + +- [Full Specification](SPEC.md) + +## Status + +🚧 **Pre-alpha** — Under active development. 
+ +## License + +MIT diff --git a/docs/api-reference.md b/docs/api-reference.md new file mode 100644 index 0000000..a90e04a --- /dev/null +++ b/docs/api-reference.md @@ -0,0 +1,315 @@ +# Stentor Gateway API Reference + +> Version 0.1.0 + +## Endpoints + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/` | GET | Dashboard (Bootstrap UI) | +| `/api/v1/realtime` | WebSocket | Real-time audio conversation | +| `/api/v1/info` | GET | Gateway information and configuration | +| `/api/live/` | GET | Liveness probe (Kubernetes) | +| `/api/ready/` | GET | Readiness probe (Kubernetes) | +| `/api/metrics` | GET | Prometheus-compatible metrics | +| `/api/docs` | GET | Interactive API documentation (Swagger UI) | +| `/api/openapi.json` | GET | OpenAPI schema | + +--- + +## WebSocket: `/api/v1/realtime` + +Real-time voice conversation endpoint. Protocol inspired by the OpenAI Realtime API. + +### Connection + +``` +ws://{host}:{port}/api/v1/realtime +``` + +### Client Events + +#### `session.start` + +Initiates a new conversation session. Must be sent first. + +```json +{ + "type": "session.start", + "client_id": "esp32-kitchen", + "audio_config": { + "sample_rate": 16000, + "channels": 1, + "sample_width": 16, + "encoding": "pcm_s16le" + } +} +``` + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `type` | string | ✔ | Must be `"session.start"` | +| `client_id` | string | | Client identifier for tracking | +| `audio_config` | object | | Audio format configuration | + +#### `input_audio_buffer.append` + +Sends a chunk of audio data. Stream continuously while user is speaking. 
+ +```json +{ + "type": "input_audio_buffer.append", + "audio": "" +} +``` + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `type` | string | ✔ | Must be `"input_audio_buffer.append"` | +| `audio` | string | ✔ | Base64-encoded PCM S16LE audio | + +#### `input_audio_buffer.commit` + +Signals end of speech. Triggers the STT → Agent → TTS pipeline. + +```json +{ + "type": "input_audio_buffer.commit" +} +``` + +#### `session.close` + +Requests session termination. The WebSocket connection will close. + +```json +{ + "type": "session.close" +} +``` + +### Server Events + +#### `session.created` + +Acknowledges session creation. + +```json +{ + "type": "session.created", + "session_id": "550e8400-e29b-41d4-a716-446655440000" +} +``` + +#### `status` + +Processing status update. Use for LED feedback on ESP32. + +```json +{ + "type": "status", + "state": "listening" +} +``` + +| State | Description | Suggested LED | +|-------|-------------|--------------| +| `listening` | Ready for audio input | Green | +| `transcribing` | Running STT | Yellow | +| `thinking` | Waiting for agent response | Yellow | +| `speaking` | Playing TTS audio | Cyan | + +#### `transcript.done` + +Transcript of what the user said. + +```json +{ + "type": "transcript.done", + "text": "What is the weather like today?" +} +``` + +#### `response.text.done` + +AI agent's response text. + +```json +{ + "type": "response.text.done", + "text": "I don't have weather tools yet, but I can help with other things." +} +``` + +#### `response.audio.delta` + +Streamed audio response chunk. + +```json +{ + "type": "response.audio.delta", + "delta": "" +} +``` + +#### `response.audio.done` + +Audio response streaming complete. + +```json +{ + "type": "response.audio.done" +} +``` + +#### `response.done` + +Full response cycle complete. Gateway returns to listening state. + +```json +{ + "type": "response.done" +} +``` + +#### `error` + +Error event. 
+ +```json +{ + "type": "error", + "message": "STT service unavailable", + "code": "stt_error" +} +``` + +| Code | Description | +|------|-------------| +| `invalid_json` | Client sent malformed JSON | +| `validation_error` | Message failed schema validation | +| `no_session` | Action requires an active session | +| `empty_buffer` | Audio buffer was empty on commit | +| `empty_transcript` | STT returned no speech | +| `empty_response` | Agent returned empty response | +| `pipeline_error` | Internal pipeline failure | +| `unknown_event` | Unrecognized event type | +| `internal_error` | Unexpected server error | + +--- + +## REST: `/api/v1/info` + +Returns gateway information and current configuration. + +**Response:** + +```json +{ + "name": "stentor-gateway", + "version": "0.1.0", + "endpoints": { + "realtime": "/api/v1/realtime", + "live": "/api/live/", + "ready": "/api/ready/", + "metrics": "/api/metrics" + }, + "config": { + "stt_url": "http://perseus.incus:8000", + "tts_url": "http://pan.incus:8000", + "agent_url": "http://localhost:8001", + "stt_model": "Systran/faster-whisper-small", + "tts_model": "kokoro", + "tts_voice": "af_heart", + "audio_sample_rate": 16000, + "audio_channels": 1, + "audio_sample_width": 16 + } +} +``` + +--- + +## REST: `/api/live/` + +Kubernetes liveness probe. + +**Response (200):** + +```json +{ + "status": "ok" +} +``` + +--- + +## REST: `/api/ready/` + +Kubernetes readiness probe. Checks connectivity to STT, TTS, and Agent services. + +**Response (200 — all services reachable):** + +```json +{ + "status": "ready", + "checks": { + "stt": true, + "tts": true, + "agent": true + } +} +``` + +**Response (503 — one or more services unavailable):** + +```json +{ + "status": "not_ready", + "checks": { + "stt": true, + "tts": false, + "agent": true + } +} +``` + +--- + +## REST: `/api/metrics` + +Prometheus-compatible metrics in text exposition format. 
+ +**Metrics exported:** + +| Metric | Type | Description | +|--------|------|-------------| +| `stentor_sessions_active` | Gauge | Current active WebSocket sessions | +| `stentor_transcriptions_total` | Counter | Total STT transcription calls | +| `stentor_tts_requests_total` | Counter | Total TTS synthesis calls | +| `stentor_agent_requests_total` | Counter | Total agent message calls | +| `stentor_pipeline_duration_seconds` | Histogram | Full pipeline latency | +| `stentor_stt_duration_seconds` | Histogram | STT transcription latency | +| `stentor_tts_duration_seconds` | Histogram | TTS synthesis latency | +| `stentor_agent_duration_seconds` | Histogram | Agent response latency | + +--- + +## Configuration + +All configuration via environment variables (12-factor): + +| Variable | Description | Default | +|----------|-------------|---------| +| `STENTOR_HOST` | Gateway bind address | `0.0.0.0` | +| `STENTOR_PORT` | Gateway bind port | `8600` | +| `STENTOR_STT_URL` | Speaches STT endpoint | `http://perseus.incus:8000` | +| `STENTOR_TTS_URL` | Speaches TTS endpoint | `http://pan.incus:8000` | +| `STENTOR_AGENT_URL` | FastAgent HTTP endpoint | `http://localhost:8001` | +| `STENTOR_STT_MODEL` | Whisper model for STT | `Systran/faster-whisper-small` | +| `STENTOR_TTS_MODEL` | TTS model name | `kokoro` | +| `STENTOR_TTS_VOICE` | TTS voice ID | `af_heart` | +| `STENTOR_AUDIO_SAMPLE_RATE` | Audio sample rate in Hz | `16000` | +| `STENTOR_AUDIO_CHANNELS` | Audio channel count | `1` | +| `STENTOR_AUDIO_SAMPLE_WIDTH` | Bits per sample | `16` | +| `STENTOR_LOG_LEVEL` | Logging level | `INFO` | diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..d2d9a70 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,222 @@ +# Stentor Architecture + +> Version 0.2.0 — Daedalus-integrated architecture + +## Overview + +Stentor is a voice interface that connects physical audio hardware to AI agents via speech services. 
The system consists of two main components: + +1. **stentor-ear** — ESP32-S3 firmware handling microphone input, speaker output, wake word detection, and VAD +2. **Daedalus voice module** — Python code integrated into the Daedalus FastAPI backend, handling the STT → Agent → TTS pipeline + +The Python gateway that was previously a standalone service (`stentor-gateway/`) has been merged into the Daedalus backend as `daedalus/backend/daedalus/voice/`. See [daedalus/docs/stentor_integration.md](../../daedalus/docs/stentor_integration.md) for the full integration specification. + +## System Architecture + +```mermaid +graph TB + subgraph "ESP32-S3-AUDIO-Board" + MIC["Mic Array
ES7210 ADC"] + WW["Wake Word
ESP-SR"] + VAD["VAD
On-Device"] + SPK["Speaker
ES8311 DAC"] + LED["LED Ring
WS2812B"] + NVS["NVS
Device UUID"] + MIC --> WW + MIC --> VAD + end + + subgraph "Daedalus Backend (puck.incus)" + REG["Device Registry
/api/v1/voice/devices"] + WS["WebSocket Server
/api/v1/voice/realtime"] + PIPE["Voice Pipeline
STT → MCP → TTS"] + DB["PostgreSQL
Conversations & Messages"] + MCP["MCP Connection Manager
Pallas Agents"] + end + + subgraph "Speech Services" + STT["Speaches STT
Whisper (perseus)"] + TTS["Speaches TTS
Kokoro (perseus)"] + end + + subgraph "AI Agents" + PALLAS["Pallas MCP Servers
Research · Infra · Orchestrator"] + end + + NVS -->|"POST /register"| REG + WW -->|"WebSocket
JSON + base64 audio"| WS + VAD -->|"commit on silence"| WS + WS --> PIPE + PIPE -->|"POST /v1/audio/transcriptions"| STT + PIPE -->|"MCP call_tool"| MCP + MCP -->|"MCP Streamable HTTP"| PALLAS + PIPE -->|"POST /v1/audio/speech"| TTS + STT -->|"transcript text"| PIPE + PALLAS -->|"response text"| MCP + MCP --> PIPE + TTS -->|"PCM audio stream"| PIPE + PIPE --> DB + PIPE --> WS + WS -->|"audio + status"| SPK + WS -->|"status events"| LED +``` + +## Device Registration & Lifecycle + +```mermaid +sequenceDiagram + participant ESP as ESP32 + participant DAE as Daedalus + participant UI as Daedalus Web UI + + Note over ESP: First boot — generate UUID, store in NVS + ESP->>DAE: POST /api/v1/voice/devices/register {device_id, firmware} + DAE->>ESP: {status: "registered"} + + Note over UI: User sees new device in Settings → Voice Devices + UI->>DAE: PUT /api/v1/voice/devices/{id} {name, workspace, agent} + + Note over ESP: Wake word detected + ESP->>DAE: WS /api/v1/voice/realtime?device_id=uuid + ESP->>DAE: session.start + DAE->>ESP: session.created {session_id, conversation_id} +``` + +## Voice Pipeline + +```mermaid +sequenceDiagram + participant ESP as ESP32 + participant GW as Daedalus Voice + participant STT as Speaches STT + participant MCP as MCP Manager + participant PALLAS as Pallas Agent + participant TTS as Speaches TTS + participant DB as PostgreSQL + + Note over ESP: VAD: speech detected + loop Audio streaming + ESP->>GW: input_audio_buffer.append (base64 PCM) + end + Note over ESP: VAD: silence detected + ESP->>GW: input_audio_buffer.commit + + GW->>ESP: status: transcribing + GW->>STT: POST /v1/audio/transcriptions (WAV) + STT->>GW: {"text": "..."} + GW->>ESP: transcript.done + GW->>DB: Save Message(role="user", content=transcript) + + GW->>ESP: status: thinking + GW->>MCP: call_tool(workspace, agent, tool, {message}) + MCP->>PALLAS: MCP Streamable HTTP + PALLAS->>MCP: CallToolResult + MCP->>GW: response text + GW->>ESP: response.text.done + GW->>DB: Save 
Message(role="assistant", content=response) + + GW->>ESP: status: speaking + GW->>TTS: POST /v1/audio/speech + TTS->>GW: PCM audio stream + + loop Audio chunks + GW->>ESP: response.audio.delta (base64 PCM) + end + + GW->>ESP: response.audio.done + GW->>ESP: response.done + GW->>ESP: status: listening + + Note over GW: Timeout timer starts (120s default) + + alt Timeout — no speech + GW->>ESP: session.end {reason: "timeout"} + else Agent ends conversation + GW->>ESP: session.end {reason: "agent"} + else User speaks again + Note over ESP: VAD triggers next turn (same conversation) + end +``` + +## Component Communication + +| Source | Destination | Protocol | Format | +|--------|------------|----------|--------| +| ESP32 | Daedalus | WebSocket | JSON + base64 PCM | +| ESP32 | Daedalus | HTTP POST | JSON (device registration) | +| Daedalus | Speaches STT | HTTP POST | multipart/form-data (WAV) | +| Daedalus | Pallas Agents | MCP Streamable HTTP | MCP call_tool | +| Daedalus | Speaches TTS | HTTP POST | JSON request, binary PCM response | +| Daedalus | PostgreSQL | SQL | Conversations + Messages | + +## Network Topology + +```mermaid +graph LR + ESP["ESP32
WiFi"] + DAE["Daedalus
puck.incus:8000"] + STT["Speaches STT
perseus.helu.ca:22070"] + TTS["Speaches TTS
perseus.helu.ca:22070"] + PALLAS["Pallas Agents
puck.incus:23031-33"] + DB["PostgreSQL
portia.incus:5432"] + + ESP <-->|"WS :22181
(via Nginx)"| DAE + DAE -->|"HTTP"| STT + DAE -->|"HTTP"| TTS + DAE -->|"MCP"| PALLAS + DAE -->|"SQL"| DB +``` + +## Audio Flow + +```mermaid +graph LR + MIC["Microphone
16kHz/16-bit/mono"] -->|"PCM S16LE"| B64["Base64 Encode"] + B64 -->|"WebSocket JSON"| GW["Daedalus Voice
Audio Buffer"] + GW -->|"WAV header wrap"| STT["Speaches STT"] + + TTS["Speaches TTS"] -->|"PCM 24kHz"| RESAMPLE["Resample
24kHz → 16kHz"] + RESAMPLE -->|"PCM 16kHz"| B64OUT["Base64 Encode"] + B64OUT -->|"WebSocket JSON"| SPK["Speaker
16kHz/16-bit/mono"] +``` + +## Key Design Decisions + +| Decision | Why | +|----------|-----| +| Gateway merged into Daedalus | Shares MCP connections, DB, auth, metrics, frontend — no duplicate infrastructure | +| Agent calls via MCP (not POST /message) | Same Pallas path as text chat; unified connection management and health checks | +| Device self-registration with UUID in NVS | Plug-and-play; user configures workspace assignment in web UI | +| VAD on ESP32, not server-side | Reduces bandwidth; ESP32-SR provides reliable on-device VAD | +| JSON + base64 over WebSocket | Simple for v1; binary frames planned for future | +| One conversation per WebSocket session | Multi-turn within a session; natural mapping to voice interaction | +| Timeout + LLM-initiated end | Two natural ways to close: silence timeout or agent recognizes goodbye | +| No audio storage | Only transcripts persisted; audio processed in-memory and discarded | + +## Repository Structure + +``` +stentor/ # This repository +├── docs/ +│ ├── stentor.md # Usage guide (updated) +│ └── architecture.md # This file +├── stentor-ear/ # ESP32 firmware +│ ├── main/ +│ ├── components/ +│ └── ... +├── stentor-gateway/ # Legacy — gateway code migrated to Daedalus +│ └── ... 
+└── README.md + +daedalus/ # Separate repository +├── backend/daedalus/voice/ # Voice module (migrated from stentor-gateway) +│ ├── audio.py +│ ├── models.py +│ ├── pipeline.py +│ ├── stt_client.py +│ └── tts_client.py +├── backend/daedalus/api/v1/ +│ └── voice.py # Voice REST + WebSocket endpoints +└── docs/ + └── stentor_integration.md # Full integration specification +``` diff --git a/docs/stentor.md b/docs/stentor.md new file mode 100644 index 0000000..ae14752 --- /dev/null +++ b/docs/stentor.md @@ -0,0 +1,315 @@ +# Stentor — Usage Guide + +> *"Stentor, whose voice was as powerful as fifty voices of other men."* +> — Homer, *Iliad*, Book V + +Stentor is a voice interface that connects physical audio hardware (ESP32-S3-AUDIO-Board) to AI agents via speech services. The voice gateway runs as part of the **Daedalus** web application backend — there is no separate Stentor server process. + +--- + +## Table of Contents + +- [How It Works](#how-it-works) +- [Components](#components) +- [ESP32 Device Setup](#esp32-device-setup) +- [Daedalus Configuration](#daedalus-configuration) +- [Device Registration Flow](#device-registration-flow) +- [Voice Conversation Flow](#voice-conversation-flow) +- [WebSocket Protocol](#websocket-protocol) +- [API Endpoints](#api-endpoints) +- [Observability](#observability) +- [Troubleshooting](#troubleshooting) +- [Architecture Overview](#architecture-overview) + +--- + +## How It Works + +1. An ESP32-S3-AUDIO-Board generates a UUID on first boot and registers itself with Daedalus +2. A user assigns the device to a workspace and Pallas agent via the Daedalus web UI +3. When the ESP32 detects a wake word, it opens a WebSocket to Daedalus and starts a voice session +4. On-device VAD (Voice Activity Detection) detects speech and silence +5. Audio streams to Daedalus, which runs: **Speaches STT** → **Pallas Agent (MCP)** → **Speaches TTS** +6. The response audio streams back to the ESP32 speaker +7. 
Transcripts are saved as conversations in PostgreSQL — visible in the Daedalus web UI alongside text conversations + +--- + +## Components + +| Component | Location | Purpose | +|-----------|----------|---------| +| **stentor-ear** | `stentor/stentor-ear/` | ESP32-S3 firmware — microphone, speaker, wake word, VAD | +| **Daedalus voice module** | `daedalus/backend/daedalus/voice/` | Voice pipeline — STT, MCP agent calls, TTS | +| **Daedalus voice API** | `daedalus/backend/daedalus/api/v1/voice.py` | WebSocket + REST endpoints for devices and sessions | +| **Daedalus web UI** | `daedalus/frontend/` | Device management, conversation history | + +The Python gateway code that was previously in `stentor/stentor-gateway/` has been merged into Daedalus. That directory is retained for reference but is no longer deployed as a standalone service. + +--- + +## ESP32 Device Setup + +The ESP32-S3-AUDIO-Board firmware needs one configuration value: + +| Setting | Description | Example | +|---------|-------------|---------| +| Daedalus URL | Base URL of the Daedalus instance | `http://puck.incus:22181` | + +On first boot, the device: +1. Generates a UUID v4 and stores it in NVS (non-volatile storage) +2. Registers with Daedalus via `POST /api/v1/voice/devices/register` +3. 
The UUID persists across reboots — the device keeps its identity + +--- + +## Daedalus Configuration + +Voice settings are configured via environment variables with the `DAEDALUS_` prefix: + +| Variable | Description | Default | +|----------|-------------|---------| +| `DAEDALUS_VOICE_STT_URL` | Speaches STT endpoint | `http://perseus.helu.ca:22070` | +| `DAEDALUS_VOICE_TTS_URL` | Speaches TTS endpoint | `http://perseus.helu.ca:22070` | +| `DAEDALUS_VOICE_STT_MODEL` | Whisper model for STT | `Systran/faster-whisper-small` | +| `DAEDALUS_VOICE_TTS_MODEL` | TTS model name | `kokoro` | +| `DAEDALUS_VOICE_TTS_VOICE` | TTS voice ID | `af_heart` | +| `DAEDALUS_VOICE_AUDIO_SAMPLE_RATE` | Sample rate in Hz | `16000` | +| `DAEDALUS_VOICE_AUDIO_CHANNELS` | Audio channels | `1` | +| `DAEDALUS_VOICE_AUDIO_SAMPLE_WIDTH` | Bits per sample | `16` | +| `DAEDALUS_VOICE_CONVERSATION_TIMEOUT` | Seconds of silence before auto-end | `120` | + +--- + +## Device Registration Flow + +``` +ESP32 Daedalus + │ │ + │ [First boot — UUID generated] │ + ├─ POST /api/v1/voice/devices/register ▶│ + │ {device_id, firmware_version} │ + │◀─ {status: "registered"} ─────────┤ + │ │ + │ [Device appears in Daedalus │ + │ Settings → Voice Devices] │ + │ │ + │ [User assigns workspace + agent │ + │ via web UI] │ + │ │ + │ [Subsequent boots — same UUID] │ + ├─ POST /api/v1/voice/devices/register ▶│ + │ {device_id, firmware_version} │ + │◀─ {status: "already_registered"} ──┤ + │ │ +``` + +After registration, the device appears in the Daedalus settings page. The user assigns it: +- A **name** (e.g. "Kitchen Speaker") +- A **description** (optional) +- A **workspace** (which workspace voice conversations go to) +- An **agent** (which Pallas agent to target) + +Until assigned, the device cannot process voice. 
+ +--- + +## Voice Conversation Flow + +A voice conversation is a multi-turn session driven by on-device VAD: + +``` +ESP32 Daedalus + │ │ + ├─ [Wake word detected] │ + ├─ WS /api/v1/voice/realtime ──────▶│ + ├─ session.start ───────────────────▶│ → Create Conversation in DB + │◀──── session.created ─────────────┤ {session_id, conversation_id} + │◀──── status: listening ────────────┤ + │ │ + │ [VAD: user speaks] │ + ├─ input_audio_buffer.append ×N ────▶│ + │ [VAD: silence detected] │ + ├─ input_audio_buffer.commit ───────▶│ + │◀──── status: transcribing ────────┤ → STT + │◀──── transcript.done ─────────────┤ → Save user message + │◀──── status: thinking ────────────┤ → MCP call to Pallas + │◀──── response.text.done ──────────┤ → Save assistant message + │◀──── status: speaking ────────────┤ → TTS + │◀──── response.audio.delta ×N ─────┤ + │◀──── response.audio.done ─────────┤ + │◀──── response.done ───────────────┤ + │◀──── status: listening ────────────┤ + │ │ + │ [VAD: user speaks again] │ (same conversation) + ├─ (next turn cycle) ──────────────▶│ + │ │ + │ [Conversation ends by:] │ + │ • 120s silence → timeout │ + │ • Agent says goodbye │ + │ • WebSocket disconnect │ + │◀──── session.end ─────────────────┤ +``` + +### Conversation End + +A conversation ends in three ways: + +1. **Inactivity timeout** — no speech for `VOICE_CONVERSATION_TIMEOUT` seconds (default 120) +2. **Agent-initiated** — the Pallas agent recognizes the conversation is over and signals it +3. **Client disconnect** — ESP32 sends `session.close` or WebSocket drops + +All conversations are saved in PostgreSQL and visible in the Daedalus workspace chat history. 
+ +--- + +## WebSocket Protocol + +### Connection + +``` +WS /api/v1/voice/realtime?device_id={uuid} +``` + +### Client → Gateway Messages + +| Type | Description | Fields | +|------|-------------|--------| +| `session.start` | Start a new conversation | `client_id` (optional), `audio_config` (optional) | +| `input_audio_buffer.append` | Audio chunk | `audio` (base64 PCM) | +| `input_audio_buffer.commit` | End of speech, trigger pipeline | — | +| `session.close` | End the session | — | + +### Gateway → Client Messages + +| Type | Description | Fields | +|------|-------------|--------| +| `session.created` | Session started | `session_id`, `conversation_id` | +| `status` | Processing state | `state` (`listening` / `transcribing` / `thinking` / `speaking`) | +| `transcript.done` | User's speech as text | `text` | +| `response.text.done` | Agent's text response | `text` | +| `response.audio.delta` | Audio chunk (streamed) | `delta` (base64 PCM) | +| `response.audio.done` | Audio streaming complete | — | +| `response.done` | Turn complete | — | +| `session.end` | Conversation ended | `reason` (`timeout` / `agent` / `client`) | +| `error` | Error occurred | `message`, `code` | + +### Audio Format + +All audio is **PCM signed 16-bit little-endian** (`pcm_s16le`), base64-encoded in JSON: + +- **Sample rate:** 16,000 Hz +- **Channels:** 1 (mono) +- **Bit depth:** 16-bit + +--- + +## API Endpoints + +All endpoints are served by the Daedalus FastAPI backend. 
+ +### Voice Device Management + +| Method | Route | Purpose | +|--------|-------|---------| +| `POST` | `/api/v1/voice/devices/register` | ESP32 self-registration (idempotent) | +| `GET` | `/api/v1/voice/devices` | List all registered devices | +| `GET` | `/api/v1/voice/devices/{id}` | Get device details | +| `PUT` | `/api/v1/voice/devices/{id}` | Update device (name, description, workspace, agent) | +| `DELETE` | `/api/v1/voice/devices/{id}` | Remove a device | + +### Voice Sessions + +| Method | Route | Purpose | +|--------|-------|---------| +| `WS` | `/api/v1/voice/realtime?device_id={id}` | WebSocket for audio conversations | +| `GET` | `/api/v1/voice/sessions` | List active voice sessions | + +### Voice Configuration & Health + +| Method | Route | Purpose | +|--------|-------|---------| +| `GET` | `/api/v1/voice/config` | Current voice configuration | +| `PUT` | `/api/v1/voice/config` | Update voice settings | +| `GET` | `/api/v1/voice/health` | STT + TTS reachability check | + +--- + +## Observability + +### Prometheus Metrics + +Voice metrics are exposed at Daedalus's `GET /metrics` endpoint with the `daedalus_voice_` prefix: + +| Metric | Type | Description | +|--------|------|-------------| +| `daedalus_voice_sessions_active` | gauge | Active WebSocket sessions | +| `daedalus_voice_pipeline_duration_seconds` | histogram | Full pipeline latency | +| `daedalus_voice_stt_duration_seconds` | histogram | STT latency | +| `daedalus_voice_tts_duration_seconds` | histogram | TTS latency | +| `daedalus_voice_agent_duration_seconds` | histogram | Agent (MCP) latency | +| `daedalus_voice_transcriptions_total` | counter | Total STT calls | +| `daedalus_voice_conversations_total` | counter | Conversations by end reason | +| `daedalus_voice_devices_online` | gauge | Currently connected devices | + +### Logs + +Voice events flow through the standard Daedalus logging pipeline: structlog → stdout → syslog → Alloy → Loki. 
+ +Key log events: `voice_device_registered`, `voice_session_started`, `voice_pipeline_complete`, `voice_conversation_ended`, `voice_pipeline_error`. + +--- + +## Troubleshooting + +### Device not appearing in Daedalus settings + +- Check the ESP32 can reach the Daedalus URL +- Verify the registration endpoint responds: `curl -X POST http://puck.incus:22181/api/v1/voice/devices/register -H 'Content-Type: application/json' -d '{"device_id":"test","firmware_version":"1.0"}'` + +### Device registered but voice doesn't work + +- Assign the device to a workspace and agent in **Settings → Voice Devices** +- Unassigned devices get: `{"type": "error", "code": "no_workspace"}` + +### STT returns empty transcripts + +- Check Speaches STT is running: `curl http://perseus.helu.ca:22070/v1/models` +- Check the voice health endpoint: `curl http://puck.incus:22181/api/v1/voice/health` + +### High latency + +- Check `daedalus_voice_pipeline_duration_seconds` in Prometheus/Grafana +- Breakdown by stage: STT, Agent, TTS histograms identify the bottleneck +- Agent latency depends on the Pallas agent and its downstream MCP servers + +### Audio sounds wrong (chipmunk / slow) + +- Speaches TTS outputs at 24 kHz; the pipeline resamples to 16 kHz +- Verify `DAEDALUS_VOICE_AUDIO_SAMPLE_RATE` matches the ESP32's playback rate + +--- + +## Architecture Overview + +``` +┌──────────────────┐ WebSocket ┌──────────────────────────────────────┐ +│ ESP32-S3 Board │◀══════════════════════▶ │ Daedalus Backend (FastAPI) │ +│ (stentor-ear) │ JSON + base64 audio │ puck.incus │ +│ UUID in NVS │ │ │ +│ Wake Word + VAD │ │ voice/ module: │ +└──────────────────┘ │ STT → MCP (Pallas) → TTS │ + │ Conversations → PostgreSQL │ + └──────┬──────────┬────────┬───────────┘ + │ │ │ + MCP │ HTTP │ HTTP │ + ▼ ▼ ▼ + ┌──────────┐ ┌────────┐ ┌────────┐ + │ Pallas │ │Speaches│ │Speaches│ + │ Agents │ │ STT │ │ TTS │ + └──────────┘ └────────┘ └────────┘ +``` + +For full architectural details including Mermaid diagrams, 
see [architecture.md](architecture.md). + +For the complete integration specification, see [daedalus/docs/stentor_integration.md](../../daedalus/docs/stentor_integration.md). diff --git a/stentor-gateway/Dockerfile b/stentor-gateway/Dockerfile new file mode 100644 index 0000000..79d2a53 --- /dev/null +++ b/stentor-gateway/Dockerfile @@ -0,0 +1,49 @@ +# Stentor Gateway — Multi-stage Dockerfile +# Builds a minimal production image for the voice gateway. + +# --- Build stage --- +FROM python:3.12-slim AS builder + +WORKDIR /build + +# Install uv for fast dependency resolution +COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv + +# Copy project files +COPY pyproject.toml . +COPY src/ src/ + +# Install dependencies into a virtual environment +RUN uv venv /opt/venv && \ + uv pip install --python /opt/venv/bin/python . --no-cache + +# --- Runtime stage --- +FROM python:3.12-slim AS runtime + +LABEL maintainer="robert" +LABEL description="Stentor Gateway — Voice gateway for AI agents" +LABEL version="0.1.0" + +# Copy virtual environment from builder +COPY --from=builder /opt/venv /opt/venv + +# Copy application source (for templates/static) +COPY src/ /app/src/ + +WORKDIR /app + +# Use the venv Python +ENV PATH="/opt/venv/bin:$PATH" +ENV PYTHONPATH="/app/src" +ENV PYTHONUNBUFFERED=1 + +# Default configuration +ENV STENTOR_HOST=0.0.0.0 +ENV STENTOR_PORT=8600 + +EXPOSE 8600 + +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ + CMD python -c "import httpx; r = httpx.get('http://localhost:8600/api/live/'); r.raise_for_status()" + +CMD ["uvicorn", "stentor.main:app", "--host", "0.0.0.0", "--port", "8600"] diff --git a/stentor-gateway/pyproject.toml b/stentor-gateway/pyproject.toml new file mode 100644 index 0000000..75118c8 --- /dev/null +++ b/stentor-gateway/pyproject.toml @@ -0,0 +1,45 @@ +[project] +name = "stentor-gateway" +version = "0.1.0" +description = "Voice gateway connecting ESP32 audio hardware to AI agents via speech services" 
+requires-python = ">=3.12" +license = "MIT" +dependencies = [ + "fastapi>=0.115", + "uvicorn[standard]>=0.34", + "websockets>=14.0", + "httpx>=0.28", + "pydantic>=2.10", + "pydantic-settings>=2.7", + "jinja2>=3.1", + "prometheus-client>=0.21", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.0", + "pytest-asyncio>=0.25", + "pytest-httpx>=0.35", + "ruff>=0.9", +] + +[project.scripts] +stentor = "stentor.main:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/stentor"] + +[tool.ruff] +target-version = "py312" +line-length = 100 + +[tool.ruff.lint] +select = ["E", "F", "I", "N", "W", "UP", "B", "SIM", "RUF"] + +[tool.pytest.ini_options] +asyncio_mode = "auto" +testpaths = ["tests"] diff --git a/stentor-gateway/src/stentor/__init__.py b/stentor-gateway/src/stentor/__init__.py new file mode 100644 index 0000000..8af492a --- /dev/null +++ b/stentor-gateway/src/stentor/__init__.py @@ -0,0 +1,3 @@ +"""Stentor — Voice Gateway for AI Agents.""" + +__version__ = "0.1.0" diff --git a/stentor-gateway/src/stentor/__main__.py b/stentor-gateway/src/stentor/__main__.py new file mode 100644 index 0000000..7b4fe4c --- /dev/null +++ b/stentor-gateway/src/stentor/__main__.py @@ -0,0 +1,5 @@ +"""Allow running Stentor Gateway with ``python -m stentor``.""" + +from stentor.main import main + +main() diff --git a/stentor-gateway/src/stentor/agent_client.py b/stentor-gateway/src/stentor/agent_client.py new file mode 100644 index 0000000..ce50835 --- /dev/null +++ b/stentor-gateway/src/stentor/agent_client.py @@ -0,0 +1,62 @@ +"""FastAgent HTTP client. + +Thin adapter for communicating with FastAgent's HTTP transport. +Designed to be easily swappable if the API shape changes. 
+""" + +import logging +import time + +import httpx + +from stentor.config import Settings + +logger = logging.getLogger(__name__) + + +class AgentClient: + """Async client for FastAgent HTTP transport.""" + + def __init__(self, settings: Settings, http_client: httpx.AsyncClient) -> None: + self._settings = settings + self._http = http_client + self._url = f"{settings.agent_url.rstrip('/')}/message" + + async def send_message(self, content: str) -> str: + """Send a message to the agent and return the response. + + Args: + content: The user's transcribed message. + + Returns: + The agent's response text. + + Raises: + httpx.HTTPStatusError: If the agent returns an error. + """ + start = time.monotonic() + + payload = {"content": content} + + logger.debug("Agent request to %s: %r", self._url, content[:80]) + + response = await self._http.post(self._url, json=payload, timeout=60.0) + response.raise_for_status() + + result = response.json() + text = result.get("content", "") + + elapsed = time.monotonic() - start + logger.info("Agent responded in %.2fs: %r", elapsed, text[:80]) + + return text + + async def is_available(self) -> bool: + """Check if the agent service is reachable.""" + try: + base_url = self._settings.agent_url.rstrip("/") + response = await self._http.get(base_url, timeout=5.0) + # Accept any non-5xx response as "available" + return response.status_code < 500 + except (httpx.ConnectError, httpx.TimeoutException): + return False diff --git a/stentor-gateway/src/stentor/audio.py b/stentor-gateway/src/stentor/audio.py new file mode 100644 index 0000000..4c806dc --- /dev/null +++ b/stentor-gateway/src/stentor/audio.py @@ -0,0 +1,127 @@ +"""Audio utilities for Stentor Gateway. + +Handles PCM↔WAV conversion, resampling, and base64 encoding/decoding. 
+""" + +import base64 +import io +import struct +import wave + + +def pcm_to_wav( + pcm_data: bytes, + sample_rate: int = 16000, + channels: int = 1, + sample_width: int = 2, +) -> bytes: + """Wrap raw PCM bytes in a WAV header. + + Args: + pcm_data: Raw PCM audio bytes (signed 16-bit little-endian). + sample_rate: Sample rate in Hz. + channels: Number of audio channels. + sample_width: Bytes per sample (2 for 16-bit). + + Returns: + Complete WAV file as bytes. + """ + buf = io.BytesIO() + with wave.open(buf, "wb") as wf: + wf.setnchannels(channels) + wf.setsampwidth(sample_width) + wf.setframerate(sample_rate) + wf.writeframes(pcm_data) + return buf.getvalue() + + +def wav_to_pcm(wav_data: bytes) -> tuple[bytes, int, int, int]: + """Extract raw PCM data from a WAV file. + + Args: + wav_data: Complete WAV file as bytes. + + Returns: + Tuple of (pcm_data, sample_rate, channels, sample_width). + """ + buf = io.BytesIO(wav_data) + with wave.open(buf, "rb") as wf: + pcm_data = wf.readframes(wf.getnframes()) + return pcm_data, wf.getframerate(), wf.getnchannels(), wf.getsampwidth() + + +def resample_pcm( + pcm_data: bytes, + src_rate: int, + dst_rate: int, + sample_width: int = 2, + channels: int = 1, +) -> bytes: + """Resample PCM audio using linear interpolation. + + Simple resampler with no external dependencies. Adequate for speech audio + where the primary use case is converting TTS output (24kHz) to the ESP32 + playback rate (16kHz). + + Args: + pcm_data: Raw PCM bytes (signed 16-bit little-endian). + src_rate: Source sample rate in Hz. + dst_rate: Destination sample rate in Hz. + sample_width: Bytes per sample (must be 2 for 16-bit). + channels: Number of audio channels. + + Returns: + Resampled PCM bytes. 
+ """ + if src_rate == dst_rate: + return pcm_data + + if sample_width != 2: + msg = f"Only 16-bit PCM supported for resampling, got {sample_width * 8}-bit" + raise ValueError(msg) + + fmt = " str: + """Base64-encode PCM audio data for WebSocket transmission.""" + return base64.b64encode(pcm_data).decode("ascii") + + +def decode_audio(b64_data: str) -> bytes: + """Decode base64-encoded PCM audio data from WebSocket.""" + return base64.b64decode(b64_data) diff --git a/stentor-gateway/src/stentor/config.py b/stentor-gateway/src/stentor/config.py new file mode 100644 index 0000000..ebba676 --- /dev/null +++ b/stentor-gateway/src/stentor/config.py @@ -0,0 +1,43 @@ +"""Pydantic Settings configuration for Stentor Gateway. + +All configuration is driven by environment variables (12-factor). +""" + +from pydantic_settings import BaseSettings + + +class Settings(BaseSettings): + """Stentor Gateway configuration.""" + + model_config = {"env_prefix": "STENTOR_"} + + # Server + host: str = "0.0.0.0" + port: int = 8600 + + # Speaches endpoints (OpenAI-compatible) + stt_url: str = "http://perseus.helu.ca:22070" + tts_url: str = "http://perseus.helu.ca:22070" + + # FastAgent endpoint + agent_url: str = "http://localhost:8001" + + # STT configuration + stt_model: str = "Systran/faster-whisper-small" + + # TTS configuration + tts_model: str = "kokoro" + tts_voice: str = "af_heart" + + # Audio configuration + audio_sample_rate: int = 16000 + audio_channels: int = 1 + audio_sample_width: int = 16 + + # Logging + log_level: str = "DEBUG" + + +def get_settings() -> Settings: + """Create and return a Settings instance.""" + return Settings() diff --git a/stentor-gateway/src/stentor/health.py b/stentor-gateway/src/stentor/health.py new file mode 100644 index 0000000..515a882 --- /dev/null +++ b/stentor-gateway/src/stentor/health.py @@ -0,0 +1,166 @@ +"""Health check and metrics endpoints for Stentor Gateway. 
+ +Follows Kubernetes health check conventions: +- /api/live/ — Liveness probe (is the process alive?) +- /api/ready/ — Readiness probe (are dependencies reachable?) +- /api/metrics — Prometheus-compatible metrics +""" + +import time + +from fastapi import APIRouter, Response +from prometheus_client import ( + Counter, + Gauge, + Histogram, + generate_latest, +) + +from stentor.agent_client import AgentClient +from stentor.stt_client import STTClient +from stentor.tts_client import TTSClient + +router = APIRouter(prefix="/api") + +# --------------------------------------------------------------------------- +# Prometheus metrics +# --------------------------------------------------------------------------- + +SESSIONS_ACTIVE = Gauge( + "stentor_sessions_active", + "Current active WebSocket sessions", +) + +TRANSCRIPTIONS_TOTAL = Counter( + "stentor_transcriptions_total", + "Total STT transcription calls", +) + +TTS_REQUESTS_TOTAL = Counter( + "stentor_tts_requests_total", + "Total TTS synthesis calls", +) + +AGENT_REQUESTS_TOTAL = Counter( + "stentor_agent_requests_total", + "Total agent message calls", +) + +PIPELINE_DURATION = Histogram( + "stentor_pipeline_duration_seconds", + "Full pipeline latency (STT + Agent + TTS)", + buckets=(0.5, 1.0, 2.0, 3.0, 5.0, 10.0, 20.0, 30.0), +) + +STT_DURATION = Histogram( + "stentor_stt_duration_seconds", + "STT transcription latency", + buckets=(0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0), +) + +TTS_DURATION = Histogram( + "stentor_tts_duration_seconds", + "TTS synthesis latency", + buckets=(0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0), +) + +AGENT_DURATION = Histogram( + "stentor_agent_duration_seconds", + "Agent response latency", + buckets=(0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0), +) + +# --------------------------------------------------------------------------- +# Startup time tracking +# --------------------------------------------------------------------------- + +_start_time: float = 0.0 + + +def record_start_time() -> None: + 
"""Record the application start time.""" + global _start_time + _start_time = time.monotonic() + + +def get_uptime() -> float: + """Return uptime in seconds.""" + if _start_time == 0.0: + return 0.0 + return time.monotonic() - _start_time + + +# --------------------------------------------------------------------------- +# Dependency references (set during app startup) +# --------------------------------------------------------------------------- + +_stt_client: STTClient | None = None +_tts_client: TTSClient | None = None +_agent_client: AgentClient | None = None + + +def set_clients(stt: STTClient, tts: TTSClient, agent: AgentClient) -> None: + """Register service clients for health checks.""" + global _stt_client, _tts_client, _agent_client + _stt_client = stt + _tts_client = tts + _agent_client = agent + + +# --------------------------------------------------------------------------- +# Endpoints +# --------------------------------------------------------------------------- + + +@router.get("/live/") +async def liveness() -> dict: + """Liveness probe — is the process alive and responding? + + Returns 200 if the application is running. + Used by Kubernetes to determine if the pod should be restarted. + """ + return {"status": "ok"} + + +@router.get("/ready/") +async def readiness(response: Response) -> dict: + """Readiness probe — are all dependencies reachable? + + Checks connectivity to STT, TTS, and Agent services. + Returns 200 if ready, 503 if any dependency is unavailable. + Used by load balancers to determine if pod should receive traffic. 
+ """ + checks: dict[str, bool] = {} + + if _stt_client: + checks["stt"] = await _stt_client.is_available() + else: + checks["stt"] = False + + if _tts_client: + checks["tts"] = await _tts_client.is_available() + else: + checks["tts"] = False + + if _agent_client: + checks["agent"] = await _agent_client.is_available() + else: + checks["agent"] = False + + all_ready = all(checks.values()) + if not all_ready: + response.status_code = 503 + + return { + "status": "ready" if all_ready else "not_ready", + "checks": checks, + } + + +@router.get("/metrics") +async def metrics() -> Response: + """Prometheus-compatible metrics endpoint.""" + return Response( + content=generate_latest(), + media_type="text/plain; version=0.0.4; charset=utf-8", + ) diff --git a/stentor-gateway/src/stentor/main.py b/stentor-gateway/src/stentor/main.py new file mode 100644 index 0000000..27ab381 --- /dev/null +++ b/stentor-gateway/src/stentor/main.py @@ -0,0 +1,421 @@ +"""Stentor Gateway — FastAPI application. + +Central orchestrator connecting ESP32 audio hardware to AI agents via speech services. 
+""" + +import json +import logging +import uuid +from contextlib import asynccontextmanager, suppress +from dataclasses import dataclass, field +from pathlib import Path + +import httpx +from fastapi import FastAPI, WebSocket, WebSocketDisconnect +from fastapi.responses import HTMLResponse +from fastapi.staticfiles import StaticFiles +from fastapi.templating import Jinja2Templates +from pydantic import ValidationError +from starlette.requests import Request + +from stentor import __version__ +from stentor.agent_client import AgentClient +from stentor.audio import decode_audio +from stentor.config import Settings, get_settings +from stentor.health import ( + AGENT_DURATION, + AGENT_REQUESTS_TOTAL, + PIPELINE_DURATION, + SESSIONS_ACTIVE, + STT_DURATION, + TRANSCRIPTIONS_TOTAL, + TTS_DURATION, + TTS_REQUESTS_TOTAL, + get_uptime, + record_start_time, + set_clients, +) +from stentor.health import ( + router as health_router, +) +from stentor.models import ( + ErrorEvent, + InputAudioBufferAppend, + SessionCreated, + SessionStart, + StatusUpdate, +) +from stentor.pipeline import Pipeline, PipelineState +from stentor.stt_client import STTClient +from stentor.tts_client import TTSClient + +logger = logging.getLogger(__name__) + +# Paths for templates and static files +BASE_DIR = Path(__file__).resolve().parent +TEMPLATES_DIR = BASE_DIR / "templates" +STATIC_DIR = BASE_DIR / "static" + + +# --------------------------------------------------------------------------- +# Session tracking +# --------------------------------------------------------------------------- + + +@dataclass +class Session: + """Active WebSocket session.""" + + session_id: str + client_id: str + websocket: WebSocket + audio_buffer: bytearray = field(default_factory=bytearray) + + def reset_buffer(self) -> bytes: + """Return buffered audio and reset the buffer.""" + data = bytes(self.audio_buffer) + self.audio_buffer.clear() + return data + + +# Active sessions keyed by session_id +_sessions: dict[str, 
Session] = {} + + +def get_active_sessions() -> dict[str, Session]: + """Return the active sessions dict.""" + return _sessions + + +# --------------------------------------------------------------------------- +# Application lifecycle +# --------------------------------------------------------------------------- + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Application startup and shutdown.""" + settings = get_settings() + + # Configure logging + logging.basicConfig( + level=getattr(logging, settings.log_level.upper(), logging.INFO), + format="%(asctime)s %(levelname)s [%(name)s] %(message)s", + ) + + logger.info("Stentor Gateway v%s starting", __version__) + logger.info("STT: %s (model: %s)", settings.stt_url, settings.stt_model) + logger.info( + "TTS: %s (model: %s, voice: %s)", + settings.tts_url, settings.tts_model, settings.tts_voice, + ) + logger.info("Agent: %s", settings.agent_url) + + record_start_time() + + # Create shared HTTP client + http_client = httpx.AsyncClient() + + # Create service clients + stt_client = STTClient(settings, http_client) + tts_client = TTSClient(settings, http_client) + agent_client = AgentClient(settings, http_client) + + # Register clients for health checks + set_clients(stt_client, tts_client, agent_client) + + # Create pipeline state and pipeline + pipeline_state = PipelineState() + pipeline = Pipeline(settings, stt_client, tts_client, agent_client, pipeline_state) + + # Store on app state for access in endpoints + app.state.settings = settings + app.state.http_client = http_client + app.state.stt_client = stt_client + app.state.tts_client = tts_client + app.state.agent_client = agent_client + app.state.pipeline = pipeline + app.state.pipeline_state = pipeline_state + + logger.info("Stentor Gateway ready on %s:%d", settings.host, settings.port) + + yield + + # Shutdown + logger.info("Stentor Gateway shutting down") + await http_client.aclose() + + +# 
--------------------------------------------------------------------------- +# FastAPI app +# --------------------------------------------------------------------------- + +app = FastAPI( + title="Stentor Gateway", + version=__version__, + description="Voice gateway connecting ESP32 audio hardware to AI agents via speech services.", + lifespan=lifespan, + docs_url="/api/docs", + openapi_url="/api/openapi.json", +) + +# Include health/metrics routes +app.include_router(health_router) + +# Mount static files +if STATIC_DIR.exists(): + app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static") + +# Templates +templates = Jinja2Templates(directory=str(TEMPLATES_DIR)) + + +# --------------------------------------------------------------------------- +# Dashboard +# --------------------------------------------------------------------------- + + +@app.get("/", response_class=HTMLResponse) +async def dashboard(request: Request) -> HTMLResponse: + """Serve the Bootstrap dashboard.""" + settings: Settings = request.app.state.settings + pipeline_state: PipelineState = request.app.state.pipeline_state + + return templates.TemplateResponse( + "dashboard.html", + { + "request": request, + "version": __version__, + "settings": settings, + "active_sessions": len(_sessions), + "sessions": { + sid: {"client_id": s.client_id, "buffer_size": len(s.audio_buffer)} + for sid, s in _sessions.items() + }, + "uptime": get_uptime(), + "pipeline_state": pipeline_state, + }, + ) + + +# --------------------------------------------------------------------------- +# API info endpoint +# --------------------------------------------------------------------------- + + +@app.get("/api/v1/info") +async def api_info() -> dict: + """Return gateway information.""" + settings: Settings = app.state.settings + return { + "name": "stentor-gateway", + "version": __version__, + "endpoints": { + "realtime": "/api/v1/realtime", + "live": "/api/live/", + "ready": "/api/ready/", + "metrics": 
"/api/metrics", + }, + "config": { + "stt_url": settings.stt_url, + "tts_url": settings.tts_url, + "agent_url": settings.agent_url, + "stt_model": settings.stt_model, + "tts_model": settings.tts_model, + "tts_voice": settings.tts_voice, + "audio_sample_rate": settings.audio_sample_rate, + "audio_channels": settings.audio_channels, + "audio_sample_width": settings.audio_sample_width, + }, + } + + +# --------------------------------------------------------------------------- +# WebSocket endpoint (OpenAI Realtime API-inspired) +# --------------------------------------------------------------------------- + + +@app.websocket("/api/v1/realtime") +async def realtime_websocket(websocket: WebSocket) -> None: + """WebSocket endpoint for real-time audio conversations. + + Protocol inspired by the OpenAI Realtime API: + - Client sends session.start, then streams input_audio_buffer.append events + - Client sends input_audio_buffer.commit to trigger pipeline + - Gateway responds with status updates, transcript, response text, and audio + """ + await websocket.accept() + + session: Session | None = None + pipeline: Pipeline = websocket.app.state.pipeline + + try: + while True: + raw = await websocket.receive_text() + + try: + data = json.loads(raw) + except json.JSONDecodeError: + await _send_error(websocket, "Invalid JSON", "invalid_json") + continue + + msg_type = data.get("type", "") + + # --- session.start --- + if msg_type == "session.start": + try: + msg = SessionStart.model_validate(data) + except ValidationError as e: + await _send_error(websocket, str(e), "validation_error") + continue + + session_id = str(uuid.uuid4()) + session = Session( + session_id=session_id, + client_id=msg.client_id, + websocket=websocket, + ) + _sessions[session_id] = session + SESSIONS_ACTIVE.inc() + + logger.info( + "Session started: %s (client: %s)", + session_id, + msg.client_id, + ) + + await websocket.send_text( + SessionCreated(session_id=session_id).model_dump_json() + ) + await 
websocket.send_text( + StatusUpdate(state="listening").model_dump_json() + ) + + # --- input_audio_buffer.append --- + elif msg_type == "input_audio_buffer.append": + if not session: + await _send_error(websocket, "No active session", "no_session") + continue + + try: + msg = InputAudioBufferAppend.model_validate(data) + except ValidationError as e: + await _send_error(websocket, str(e), "validation_error") + continue + + pcm_chunk = decode_audio(msg.audio) + session.audio_buffer.extend(pcm_chunk) + + # --- input_audio_buffer.commit --- + elif msg_type == "input_audio_buffer.commit": + if not session: + await _send_error(websocket, "No active session", "no_session") + continue + + audio_data = session.reset_buffer() + + if not audio_data: + await _send_error(websocket, "Empty audio buffer", "empty_buffer") + await websocket.send_text( + StatusUpdate(state="listening").model_dump_json() + ) + continue + + logger.info( + "Processing %d bytes of audio for session %s", + len(audio_data), + session.session_id, + ) + + # Run the pipeline and stream events back + async for event in pipeline.process(audio_data): + await websocket.send_text(event.model_dump_json()) + + # Record Prometheus metrics from pipeline events + if event.type == "transcript.done": + TRANSCRIPTIONS_TOTAL.inc() + elif event.type == "response.text.done": + AGENT_REQUESTS_TOTAL.inc() + elif event.type == "response.audio.done": + TTS_REQUESTS_TOTAL.inc() + + # Record pipeline duration from state + pipeline_state: PipelineState = websocket.app.state.pipeline_state + if pipeline_state.recent_metrics: + last = pipeline_state.recent_metrics[-1] + PIPELINE_DURATION.observe(last.total_duration) + STT_DURATION.observe(last.stt_duration) + AGENT_DURATION.observe(last.agent_duration) + TTS_DURATION.observe(last.tts_duration) + + # Return to listening state + await websocket.send_text( + StatusUpdate(state="listening").model_dump_json() + ) + + # --- session.close --- + elif msg_type == "session.close": + 
logger.info("Session close requested") + break + + else: + await _send_error( + websocket, f"Unknown event type: {msg_type}", "unknown_event" + ) + + except WebSocketDisconnect: + logger.info("WebSocket disconnected") + + except Exception: + logger.exception("WebSocket error") + with suppress(Exception): + await _send_error(websocket, "Internal server error", "internal_error") + + finally: + if session and session.session_id in _sessions: + del _sessions[session.session_id] + SESSIONS_ACTIVE.dec() + logger.info("Session ended: %s", session.session_id) + + +async def _send_error(websocket: WebSocket, message: str, code: str) -> None: + """Send an error event to the client.""" + event = ErrorEvent(message=message, code=code) + await websocket.send_text(event.model_dump_json()) + + +# --------------------------------------------------------------------------- +# CLI entry point +# --------------------------------------------------------------------------- + + +def main() -> None: + """Start the Stentor Gateway server. 
+ + This is the main entry point used by: + - ``python -m stentor`` + - The ``stentor`` console script (installed via pip) + - ``python main.py`` (when run directly) + """ + import uvicorn + + settings = get_settings() + + # Configure root logging early so startup messages are visible + logging.basicConfig( + level=getattr(logging, settings.log_level.upper(), logging.INFO), + format="%(asctime)s %(levelname)s [%(name)s] %(message)s", + ) + + logger.info("Starting Stentor Gateway v%s on %s:%d", __version__, settings.host, settings.port) + + uvicorn.run( + "stentor.main:app", + host=settings.host, + port=settings.port, + log_level=settings.log_level.lower(), + ) + + +if __name__ == "__main__": + main() diff --git a/stentor-gateway/src/stentor/models.py b/stentor-gateway/src/stentor/models.py new file mode 100644 index 0000000..efd26e5 --- /dev/null +++ b/stentor-gateway/src/stentor/models.py @@ -0,0 +1,129 @@ +"""WebSocket message models for Stentor Gateway. + +Inspired by the OpenAI Realtime API event naming conventions. +Messages are JSON with base64-encoded audio data. 
+""" + +from __future__ import annotations + +from typing import Annotated, Literal + +from pydantic import BaseModel, Field + +# --------------------------------------------------------------------------- +# Client → Gateway events +# --------------------------------------------------------------------------- + + +class AudioConfig(BaseModel): + """Audio configuration sent by the client on session start.""" + + sample_rate: int = 16000 + channels: int = 1 + sample_width: int = 16 + encoding: str = "pcm_s16le" + + +class SessionStart(BaseModel): + """Client requests a new session.""" + + type: Literal["session.start"] = "session.start" + client_id: str = "" + audio_config: AudioConfig = Field(default_factory=AudioConfig) + + +class InputAudioBufferAppend(BaseModel): + """Client sends a chunk of audio data.""" + + type: Literal["input_audio_buffer.append"] = "input_audio_buffer.append" + audio: str # base64-encoded PCM + + +class InputAudioBufferCommit(BaseModel): + """Client signals end of speech / commits the audio buffer.""" + + type: Literal["input_audio_buffer.commit"] = "input_audio_buffer.commit" + + +class SessionClose(BaseModel): + """Client requests session termination.""" + + type: Literal["session.close"] = "session.close" + + +ClientEvent = Annotated[ + SessionStart | InputAudioBufferAppend | InputAudioBufferCommit | SessionClose, + Field(discriminator="type"), +] + + +# --------------------------------------------------------------------------- +# Gateway → Client events +# --------------------------------------------------------------------------- + + +class SessionCreated(BaseModel): + """Acknowledge session creation.""" + + type: Literal["session.created"] = "session.created" + session_id: str + + +class StatusUpdate(BaseModel): + """Gateway processing status update.""" + + type: Literal["status"] = "status" + state: Literal["listening", "transcribing", "thinking", "speaking"] + + +class TranscriptDone(BaseModel): + """Transcript of what the user 
said.""" + + type: Literal["transcript.done"] = "transcript.done" + text: str + + +class ResponseTextDone(BaseModel): + """AI agent response text.""" + + type: Literal["response.text.done"] = "response.text.done" + text: str + + +class ResponseAudioDelta(BaseModel): + """Streamed audio response chunk.""" + + type: Literal["response.audio.delta"] = "response.audio.delta" + delta: str # base64-encoded PCM + + +class ResponseAudioDone(BaseModel): + """Audio response streaming complete.""" + + type: Literal["response.audio.done"] = "response.audio.done" + + +class ResponseDone(BaseModel): + """Full response cycle complete.""" + + type: Literal["response.done"] = "response.done" + + +class ErrorEvent(BaseModel): + """Error event.""" + + type: Literal["error"] = "error" + message: str + code: str = "unknown_error" + + +ServerEvent = ( + SessionCreated + | StatusUpdate + | TranscriptDone + | ResponseTextDone + | ResponseAudioDelta + | ResponseAudioDone + | ResponseDone + | ErrorEvent +) diff --git a/stentor-gateway/src/stentor/pipeline.py b/stentor-gateway/src/stentor/pipeline.py new file mode 100644 index 0000000..16e1aae --- /dev/null +++ b/stentor-gateway/src/stentor/pipeline.py @@ -0,0 +1,175 @@ +"""Voice pipeline orchestrator: STT → Agent → TTS. + +Ties together the three service clients into a single processing pipeline. 
+""" + +import logging +import time +from collections.abc import AsyncIterator +from dataclasses import dataclass, field + +from stentor.agent_client import AgentClient +from stentor.audio import encode_audio, pcm_to_wav, resample_pcm +from stentor.config import Settings +from stentor.models import ( + ErrorEvent, + ResponseAudioDelta, + ResponseAudioDone, + ResponseDone, + ResponseTextDone, + ServerEvent, + StatusUpdate, + TranscriptDone, +) +from stentor.stt_client import STTClient +from stentor.tts_client import TTS_OUTPUT_SAMPLE_RATE, TTSClient + +logger = logging.getLogger(__name__) + +# Audio chunk size for streaming to client (2KB ≈ 64ms at 16kHz/16bit/mono) +AUDIO_CHUNK_SIZE = 2048 + + +@dataclass +class PipelineMetrics: + """Accumulated metrics for a single pipeline run.""" + + stt_duration: float = 0.0 + agent_duration: float = 0.0 + tts_duration: float = 0.0 + total_duration: float = 0.0 + transcript: str = "" + response_text: str = "" + audio_bytes_sent: int = 0 + + +@dataclass +class PipelineState: + """Shared state for pipeline metrics collection.""" + + total_transcriptions: int = 0 + total_tts_requests: int = 0 + total_agent_requests: int = 0 + recent_metrics: list[PipelineMetrics] = field(default_factory=list) + + def record(self, metrics: PipelineMetrics) -> None: + """Record metrics from a pipeline run.""" + self.total_transcriptions += 1 + self.total_agent_requests += 1 + self.total_tts_requests += 1 + self.recent_metrics.append(metrics) + # Keep only last 100 runs + if len(self.recent_metrics) > 100: + self.recent_metrics = self.recent_metrics[-100:] + + +class Pipeline: + """Orchestrates the STT → Agent → TTS voice pipeline.""" + + def __init__( + self, + settings: Settings, + stt: STTClient, + tts: TTSClient, + agent: AgentClient, + state: PipelineState, + ) -> None: + self._settings = settings + self._stt = stt + self._tts = tts + self._agent = agent + self._state = state + + async def process(self, audio_buffer: bytes) -> 
AsyncIterator[ServerEvent]: + """Run the full voice pipeline on buffered audio. + + Yields server events as the pipeline progresses through each stage: + status updates, transcript, response text, audio chunks, and completion. + + Args: + audio_buffer: Raw PCM audio bytes from the client. + + Yields: + ServerEvent instances to send back to the client. + """ + pipeline_start = time.monotonic() + metrics = PipelineMetrics() + + try: + # --- Stage 1: STT --- + yield StatusUpdate(state="transcribing") + + wav_data = pcm_to_wav( + audio_buffer, + sample_rate=self._settings.audio_sample_rate, + channels=self._settings.audio_channels, + ) + + stt_start = time.monotonic() + transcript = await self._stt.transcribe(wav_data) + metrics.stt_duration = time.monotonic() - stt_start + metrics.transcript = transcript + + if not transcript: + logger.warning("STT returned empty transcript") + yield ErrorEvent(message="No speech detected", code="empty_transcript") + return + + yield TranscriptDone(text=transcript) + + # --- Stage 2: Agent --- + yield StatusUpdate(state="thinking") + + agent_start = time.monotonic() + response_text = await self._agent.send_message(transcript) + metrics.agent_duration = time.monotonic() - agent_start + metrics.response_text = response_text + + if not response_text: + logger.warning("Agent returned empty response") + yield ErrorEvent(message="Agent returned empty response", code="empty_response") + return + + yield ResponseTextDone(text=response_text) + + # --- Stage 3: TTS --- + yield StatusUpdate(state="speaking") + + tts_start = time.monotonic() + tts_audio = await self._tts.synthesize(response_text) + metrics.tts_duration = time.monotonic() - tts_start + + # Resample from TTS output rate to client playback rate if needed + target_rate = self._settings.audio_sample_rate + if target_rate != TTS_OUTPUT_SAMPLE_RATE: + tts_audio = resample_pcm( + tts_audio, + src_rate=TTS_OUTPUT_SAMPLE_RATE, + dst_rate=target_rate, + ) + + # Stream audio in chunks + 
offset = 0 + while offset < len(tts_audio): + chunk = tts_audio[offset : offset + AUDIO_CHUNK_SIZE] + yield ResponseAudioDelta(delta=encode_audio(chunk)) + metrics.audio_bytes_sent += len(chunk) + offset += AUDIO_CHUNK_SIZE + + yield ResponseAudioDone() + yield ResponseDone() + + except Exception: + logger.exception("Pipeline error") + yield ErrorEvent(message="Internal pipeline error", code="pipeline_error") + + finally: + metrics.total_duration = time.monotonic() - pipeline_start + self._state.record(metrics) + logger.info( + "Pipeline complete: stt=%.2fs agent=%.2fs tts=%.2fs total=%.2fs", + metrics.stt_duration, + metrics.agent_duration, + metrics.tts_duration, + metrics.total_duration, + ) diff --git a/stentor-gateway/src/stentor/static/.gitkeep b/stentor-gateway/src/stentor/static/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/stentor-gateway/src/stentor/stt_client.py b/stentor-gateway/src/stentor/stt_client.py new file mode 100644 index 0000000..93948ed --- /dev/null +++ b/stentor-gateway/src/stentor/stt_client.py @@ -0,0 +1,68 @@ +"""Speaches STT client (OpenAI-compatible). + +Posts audio as multipart/form-data to the /v1/audio/transcriptions endpoint. +""" + +import logging +import time + +import httpx + +from stentor.config import Settings + +logger = logging.getLogger(__name__) + + +class STTClient: + """Async client for Speaches Speech-to-Text service.""" + + def __init__(self, settings: Settings, http_client: httpx.AsyncClient) -> None: + self._settings = settings + self._http = http_client + self._url = f"{settings.stt_url.rstrip('/')}/v1/audio/transcriptions" + + async def transcribe(self, wav_data: bytes, language: str | None = None) -> str: + """Send audio to Speaches STT and return the transcript. + + Args: + wav_data: Complete WAV file bytes. + language: Optional language code (e.g., "en"). Auto-detect if omitted. + + Returns: + Transcribed text. + + Raises: + httpx.HTTPStatusError: If the STT service returns an error. 
+ """ + start = time.monotonic() + + files = {"file": ("audio.wav", wav_data, "audio/wav")} + data: dict[str, str] = { + "model": self._settings.stt_model, + "response_format": "json", + } + if language: + data["language"] = language + + logger.debug("STT request to %s (model=%s)", self._url, self._settings.stt_model) + + response = await self._http.post(self._url, files=files, data=data, timeout=30.0) + response.raise_for_status() + + result = response.json() + text = result.get("text", "").strip() + + elapsed = time.monotonic() - start + logger.info("STT completed in %.2fs: %r", elapsed, text[:80]) + + return text + + async def is_available(self) -> bool: + """Check if the STT service is reachable.""" + try: + # Hit the base URL to check availability + base_url = self._settings.stt_url.rstrip("/") + response = await self._http.get(f"{base_url}/v1/models", timeout=5.0) + return response.status_code == 200 + except (httpx.ConnectError, httpx.TimeoutException): + return False diff --git a/stentor-gateway/src/stentor/templates/dashboard.html b/stentor-gateway/src/stentor/templates/dashboard.html new file mode 100644 index 0000000..c354ff6 --- /dev/null +++ b/stentor-gateway/src/stentor/templates/dashboard.html @@ -0,0 +1,361 @@ + + + + + + Stentor Gateway — Dashboard + + + + + + + + +
+ +
+
+

+ “Stentor, whose voice was as powerful as fifty voices of other men.” + — Homer, Iliad, Book V +

+
+
+ + +
+
+
Service Connectivity
+
+
+
+
+
+ +
Speaches STT
+
+

+ Speech-to-Text +

+ {{ settings.stt_url }} +
+ Model: + {{ settings.stt_model }} +
+
+ Checking... +
+
+
+
+
+
+
+
+ +
Speaches TTS
+
+

+ Text-to-Speech +

+ {{ settings.tts_url }} +
+ Model: + {{ settings.tts_model }} + Voice: + {{ settings.tts_voice }} +
+
+ Checking... +
+
+
+
+
+
+
+
+ +
FastAgent
+
+

+ AI Agent +

+ {{ settings.agent_url }} +
+ Checking... +
+
+
+
+
+ + +
+
+
Metrics
+
+
+
+
+
{{ active_sessions }}
+
Active Sessions
+
+
+
+
+
+
+
{{ pipeline_state.total_transcriptions }}
+
Transcriptions
+
+
+
+
+
+
+
{{ pipeline_state.total_tts_requests }}
+
TTS Requests
+
+
+
+
+
+
+
+
Uptime
+
+
+
+
+ + +
+
+
Connected Clients
+
+
+ {% if sessions %} +
+ + + + + + + + + + {% for sid, info in sessions.items() %} + + + + + + {% endfor %} + +
Session IDClient IDBuffer
{{ sid[:8] }}...{{ info.client_id or "—" }}{{ info.buffer_size }} bytes
+
+ {% else %} +

+ No clients connected. +

+ {% endif %} +
+
+
+ + +
+
Audio Configuration
+
+
+ + + + + + + + + + + + + + + + + + + + + + + +
Sample Rate{{ settings.audio_sample_rate }} Hz
Channels{{ settings.audio_channels }} (mono)
Sample Width{{ settings.audio_sample_width }}-bit
EncodingPCM S16LE
WebSocket/api/v1/realtime
+
+
+
+
+ + +
+
+
Recent Pipeline Runs
+
+
+ {% if pipeline_state.recent_metrics %} +
+ + + + + + + + + + + + + {% for m in pipeline_state.recent_metrics[-10:]|reverse %} + + + + + + + + + {% endfor %} + +
TranscriptResponseSTTAgentTTSTotal
{{ m.transcript[:50] }}{% if m.transcript|length > 50 %}...{% endif %}{{ m.response_text[:50] }}{% if m.response_text|length > 50 %}...{% endif %}{{ "%.2f"|format(m.stt_duration) }}s{{ "%.2f"|format(m.agent_duration) }}s{{ "%.2f"|format(m.tts_duration) }}s{{ "%.2f"|format(m.total_duration) }}s
+
+ {% else %} +

+ No pipeline runs yet. +

+ {% endif %} +
+
+
+
+ + +
+
+

+ Stentor Gateway v{{ version }} · + API Docs · + Metrics · + Readiness +

+
+
+
+ + + + + diff --git a/stentor-gateway/src/stentor/tts_client.py b/stentor-gateway/src/stentor/tts_client.py new file mode 100644 index 0000000..5b628f2 --- /dev/null +++ b/stentor-gateway/src/stentor/tts_client.py @@ -0,0 +1,102 @@ +"""Speaches TTS client (OpenAI-compatible). + +Posts JSON to the /v1/audio/speech endpoint and streams back PCM audio. +""" + +import logging +import time +from collections.abc import AsyncIterator + +import httpx + +from stentor.config import Settings + +logger = logging.getLogger(__name__) + +# Speaches TTS typically outputs at 24kHz +TTS_OUTPUT_SAMPLE_RATE = 24000 + +# Chunk size for streaming TTS audio (4KB ≈ 85ms at 24kHz/16bit/mono) +TTS_STREAM_CHUNK_SIZE = 4096 + + +class TTSClient: + """Async client for Speaches Text-to-Speech service.""" + + def __init__(self, settings: Settings, http_client: httpx.AsyncClient) -> None: + self._settings = settings + self._http = http_client + self._url = f"{settings.tts_url.rstrip('/')}/v1/audio/speech" + + async def synthesize(self, text: str) -> bytes: + """Synthesize speech from text, returning complete PCM audio. + + Args: + text: Text to synthesize. + + Returns: + Raw PCM audio bytes (24kHz, mono, 16-bit signed LE). + + Raises: + httpx.HTTPStatusError: If the TTS service returns an error. 
+ """ + start = time.monotonic() + + payload = { + "model": self._settings.tts_model, + "voice": self._settings.tts_voice, + "input": text, + "response_format": "pcm", + "speed": 1.0, + } + + logger.debug( + "TTS request to %s (model=%s, voice=%s): %r", + self._url, + self._settings.tts_model, + self._settings.tts_voice, + text[:80], + ) + + response = await self._http.post(self._url, json=payload, timeout=60.0) + response.raise_for_status() + + pcm_data = response.content + + elapsed = time.monotonic() - start + logger.info("TTS completed in %.2fs (%d bytes)", elapsed, len(pcm_data)) + + return pcm_data + + async def synthesize_stream(self, text: str) -> AsyncIterator[bytes]: + """Synthesize speech and yield PCM audio chunks as they arrive. + + Args: + text: Text to synthesize. + + Yields: + Chunks of raw PCM audio bytes. + """ + payload = { + "model": self._settings.tts_model, + "voice": self._settings.tts_voice, + "input": text, + "response_format": "pcm", + "speed": 1.0, + } + + logger.debug("TTS streaming request: %r", text[:80]) + + async with self._http.stream("POST", self._url, json=payload, timeout=60.0) as response: + response.raise_for_status() + async for chunk in response.aiter_bytes(chunk_size=TTS_STREAM_CHUNK_SIZE): + yield chunk + + async def is_available(self) -> bool: + """Check if the TTS service is reachable.""" + try: + base_url = self._settings.tts_url.rstrip("/") + response = await self._http.get(f"{base_url}/v1/models", timeout=5.0) + return response.status_code == 200 + except (httpx.ConnectError, httpx.TimeoutException): + return False diff --git a/stentor-gateway/test_client.py b/stentor-gateway/test_client.py new file mode 100644 index 0000000..ce699aa --- /dev/null +++ b/stentor-gateway/test_client.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 +"""Stentor Gateway test client. + +Sends a WAV file over WebSocket to the Stentor Gateway and plays back +or saves the audio response. Useful for testing without ESP32 hardware. 
+ +Usage: + # Send a WAV file and save the response + python test_client.py --input recording.wav --output response.pcm + + # Send a WAV file to a custom gateway URL + python test_client.py --input recording.wav --gateway ws://10.10.0.5:8600 + + # Generate silent audio for testing connectivity + python test_client.py --test-silence +""" + +import argparse +import asyncio +import base64 +import json +import struct +import sys +import wave +from pathlib import Path + + +async def run_client( + gateway_url: str, + audio_data: bytes, + client_id: str = "test-client", + output_file: str | None = None, +) -> None: + """Connect to the gateway, send audio, and receive the response. + + Args: + gateway_url: WebSocket URL of the Stentor Gateway. + audio_data: Raw PCM audio bytes to send. + client_id: Client identifier. + output_file: Optional path to save response PCM audio. + """ + try: + import websockets + except ImportError: + print("Error: 'websockets' package required. Install with: pip install websockets") + sys.exit(1) + + print(f"Connecting to {gateway_url}...") + + async with websockets.connect(gateway_url) as ws: + # 1. Start session + await ws.send(json.dumps({ + "type": "session.start", + "client_id": client_id, + "audio_config": { + "sample_rate": 16000, + "channels": 1, + "sample_width": 16, + "encoding": "pcm_s16le", + }, + })) + + # Wait for session.created + msg = json.loads(await ws.recv()) + assert msg["type"] == "session.created", f"Expected session.created, got {msg}" + session_id = msg["session_id"] + print(f"Session created: {session_id}") + + # Wait for listening status + msg = json.loads(await ws.recv()) + print(f"Status: {msg.get('state', msg)}") + + # 2. 
Stream audio in chunks (32ms chunks at 16kHz = 1024 bytes) + chunk_size = 1024 + total_chunks = 0 + for offset in range(0, len(audio_data), chunk_size): + chunk = audio_data[offset : offset + chunk_size] + b64_chunk = base64.b64encode(chunk).decode("ascii") + await ws.send(json.dumps({ + "type": "input_audio_buffer.append", + "audio": b64_chunk, + })) + total_chunks += 1 + + print(f"Sent {total_chunks} audio chunks ({len(audio_data)} bytes)") + + # 3. Commit the audio buffer + await ws.send(json.dumps({"type": "input_audio_buffer.commit"})) + print("Audio committed, waiting for response...") + + # 4. Receive response events + response_audio = bytearray() + done = False + + while not done: + raw = await ws.recv() + msg = json.loads(raw) + msg_type = msg.get("type", "") + + if msg_type == "status": + print(f" Status: {msg['state']}") + + elif msg_type == "transcript.done": + print(f" Transcript: {msg['text']}") + + elif msg_type == "response.text.done": + print(f" Response: {msg['text']}") + + elif msg_type == "response.audio.delta": + chunk = base64.b64decode(msg["delta"]) + response_audio.extend(chunk) + print(f" Audio chunk: {len(chunk)} bytes", end="\r") + + elif msg_type == "response.audio.done": + print(f"\n Audio complete: {len(response_audio)} bytes total") + + elif msg_type == "response.done": + print(" Response complete!") + done = True + + elif msg_type == "error": + print(f" ERROR [{msg.get('code', '?')}]: {msg['message']}") + done = True + + else: + print(f" Unknown event: {msg_type}") + + # 5. 
Save response audio + if response_audio: + if output_file: + out_path = Path(output_file) + if out_path.suffix == ".wav": + # Write as WAV + with wave.open(str(out_path), "wb") as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(16000) + wf.writeframes(bytes(response_audio)) + else: + # Write raw PCM + out_path.write_bytes(bytes(response_audio)) + print(f"Response audio saved to {output_file}") + else: + print("(Use --output to save response audio)") + + # 6. Close session + await ws.send(json.dumps({"type": "session.close"})) + print("Session closed.") + + +def load_wav_as_pcm(wav_path: str) -> bytes: + """Load a WAV file and return raw PCM data.""" + with wave.open(wav_path, "rb") as wf: + print(f"Input: {wav_path}") + print(f" Channels: {wf.getnchannels()}") + print(f" Sample rate: {wf.getframerate()} Hz") + print(f" Sample width: {wf.getsampwidth() * 8}-bit") + print(f" Frames: {wf.getnframes()}") + print(f" Duration: {wf.getnframes() / wf.getframerate():.2f}s") + + if wf.getframerate() != 16000: + print(f" WARNING: Expected 16kHz, got {wf.getframerate()} Hz") + if wf.getnchannels() != 1: + print(f" WARNING: Expected mono, got {wf.getnchannels()} channels") + if wf.getsampwidth() != 2: + print(f" WARNING: Expected 16-bit, got {wf.getsampwidth() * 8}-bit") + + return wf.readframes(wf.getnframes()) + + +def generate_silence(duration_ms: int = 2000) -> bytes: + """Generate silent PCM audio for testing.""" + num_samples = int(16000 * duration_ms / 1000) + print(f"Generated {duration_ms}ms of silence ({num_samples} samples)") + return struct.pack(f"<{num_samples}h", *([0] * num_samples)) + + +def main(): + parser = argparse.ArgumentParser( + description="Stentor Gateway test client", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "--gateway", + default="ws://localhost:8600/api/v1/realtime", + help="Gateway WebSocket URL (default: ws://localhost:8600/api/v1/realtime)", + ) + parser.add_argument( + 
"--input", "-i", + help="Path to input WAV file (16kHz, mono, 16-bit)", + ) + parser.add_argument( + "--output", "-o", + help="Path to save response audio (.wav or .pcm)", + ) + parser.add_argument( + "--client-id", + default="test-client", + help="Client identifier (default: test-client)", + ) + parser.add_argument( + "--test-silence", + action="store_true", + help="Send 2 seconds of silence (for connectivity testing)", + ) + + args = parser.parse_args() + + if args.test_silence: + audio_data = generate_silence() + elif args.input: + audio_data = load_wav_as_pcm(args.input) + else: + parser.error("Specify --input WAV_FILE or --test-silence") + + asyncio.run(run_client( + gateway_url=args.gateway, + audio_data=audio_data, + client_id=args.client_id, + output_file=args.output, + )) + + +if __name__ == "__main__": + main() diff --git a/stentor-gateway/tests/__init__.py b/stentor-gateway/tests/__init__.py new file mode 100644 index 0000000..dfb46d5 --- /dev/null +++ b/stentor-gateway/tests/__init__.py @@ -0,0 +1 @@ +"""Stentor Gateway tests.""" diff --git a/stentor-gateway/tests/conftest.py b/stentor-gateway/tests/conftest.py new file mode 100644 index 0000000..5cb407e --- /dev/null +++ b/stentor-gateway/tests/conftest.py @@ -0,0 +1,34 @@ +"""Shared test fixtures for Stentor Gateway tests.""" + +import pytest + +from stentor.config import Settings + + +@pytest.fixture +def settings() -> Settings: + """Test settings with localhost endpoints.""" + return Settings( + stt_url="http://localhost:9001", + tts_url="http://localhost:9002", + agent_url="http://localhost:9003", + log_level="DEBUG", + ) + + +@pytest.fixture +def sample_pcm() -> bytes: + """Generate 1 second of silent PCM audio (16kHz, mono, 16-bit).""" + import struct + + num_samples = 16000 # 1 second at 16kHz + return struct.pack(f"<{num_samples}h", *([0] * num_samples)) + + +@pytest.fixture +def sample_pcm_short() -> bytes: + """Generate 100ms of silent PCM audio.""" + import struct + + num_samples = 1600 # 
class TestPipeline:
    """Tests for the Pipeline orchestrator with all three services mocked."""

    @pytest.fixture
    def mock_stt(self):
        """Mock STT client that always transcribes the same question."""
        stt = AsyncMock(spec=STTClient)
        stt.transcribe.return_value = "What is the weather?"
        return stt

    @pytest.fixture
    def mock_tts(self):
        """Mock TTS client."""
        tts = AsyncMock(spec=TTSClient)
        # Return 100 samples of silence as PCM (at 24kHz, will be resampled)
        tts.synthesize.return_value = struct.pack("<100h", *([0] * 100))
        return tts

    @pytest.fixture
    def mock_agent(self):
        """Mock agent client with a canned text reply."""
        agent = AsyncMock(spec=AgentClient)
        agent.send_message.return_value = "I don't have weather tools yet."
        return agent

    @pytest.fixture
    def pipeline_state(self):
        """Fresh PipelineState, exposed as a fixture so tests can assert on
        metrics without reaching into the pipeline's private attributes."""
        return PipelineState()

    @pytest.fixture
    def pipeline(self, settings, mock_stt, mock_tts, mock_agent, pipeline_state):
        """Create a pipeline wired to the mock clients and shared state."""
        return Pipeline(settings, mock_stt, mock_tts, mock_agent, pipeline_state)

    @staticmethod
    async def _collect(pipeline, pcm):
        """Drain ``pipeline.process(pcm)`` into a list of events."""
        return [event async for event in pipeline.process(pcm)]

    async def test_full_pipeline(self, pipeline, sample_pcm, mock_stt, mock_tts, mock_agent):
        """Test the complete pipeline produces expected event sequence."""
        events = await self._collect(pipeline, sample_pcm)

        # Verify event sequence
        event_types = [e.type for e in events]

        assert "status" in event_types  # transcribing status
        assert "transcript.done" in event_types
        assert "response.text.done" in event_types
        assert "response.audio.delta" in event_types or "response.audio.done" in event_types
        assert "response.done" in event_types

        # Verify services were called
        mock_stt.transcribe.assert_called_once()
        mock_agent.send_message.assert_called_once_with("What is the weather?")
        mock_tts.synthesize.assert_called_once_with("I don't have weather tools yet.")

    async def test_pipeline_empty_transcript(self, settings, mock_tts, mock_agent):
        """Test pipeline handles empty transcript gracefully."""
        mock_stt = AsyncMock(spec=STTClient)
        mock_stt.transcribe.return_value = ""

        pipeline = Pipeline(settings, mock_stt, mock_tts, mock_agent, PipelineState())
        events = await self._collect(pipeline, struct.pack("<100h", *([0] * 100)))

        assert "error" in [e.type for e in events]

        # Agent and TTS should NOT have been called
        mock_agent.send_message.assert_not_called()
        mock_tts.synthesize.assert_not_called()

    async def test_pipeline_empty_agent_response(self, settings, mock_stt, mock_tts):
        """Test pipeline handles empty agent response."""
        mock_agent = AsyncMock(spec=AgentClient)
        mock_agent.send_message.return_value = ""

        pipeline = Pipeline(settings, mock_stt, mock_tts, mock_agent, PipelineState())
        events = await self._collect(pipeline, struct.pack("<100h", *([0] * 100)))

        assert "error" in [e.type for e in events]
        mock_tts.synthesize.assert_not_called()

    async def test_pipeline_metrics_recorded(self, pipeline, pipeline_state, sample_pcm):
        """Test that pipeline metrics are recorded on the shared state."""
        assert pipeline_state.total_transcriptions == 0

        await self._collect(pipeline, sample_pcm)

        assert pipeline_state.total_transcriptions == 1
        assert pipeline_state.total_agent_requests == 1
        assert pipeline_state.total_tts_requests == 1
        assert len(pipeline_state.recent_metrics) == 1

        last = pipeline_state.recent_metrics[-1]
        assert last.total_duration > 0
        assert last.transcript == "What is the weather?"
        assert last.response_text == "I don't have weather tools yet."
class TestSTTClient:
    """Tests for STTClient against a mocked Speaches HTTP API.

    Each test opens its own ``httpx.AsyncClient`` in an ``async with``
    block so the connection pool is always closed.  (A shared client
    fixture was removed: it created an ``AsyncClient`` that was never
    closed and never used by any test.)
    """

    async def test_transcribe_success(self, settings, sample_pcm, httpx_mock):
        """Test successful transcription."""
        httpx_mock.add_response(
            url=f"{settings.stt_url}/v1/audio/transcriptions",
            method="POST",
            json={"text": "Hello world"},
        )

        async with httpx.AsyncClient() as http_client:
            client = STTClient(settings, http_client)
            wav_data = pcm_to_wav(sample_pcm)
            result = await client.transcribe(wav_data)

        assert result == "Hello world"

    async def test_transcribe_with_language(self, settings, sample_pcm, httpx_mock):
        """Test transcription with explicit language."""
        httpx_mock.add_response(
            url=f"{settings.stt_url}/v1/audio/transcriptions",
            method="POST",
            json={"text": "Bonjour le monde"},
        )

        async with httpx.AsyncClient() as http_client:
            client = STTClient(settings, http_client)
            wav_data = pcm_to_wav(sample_pcm)
            result = await client.transcribe(wav_data, language="fr")

        assert result == "Bonjour le monde"

    async def test_transcribe_empty_result(self, settings, sample_pcm, httpx_mock):
        """Whitespace-only transcripts are normalized to an empty string."""
        httpx_mock.add_response(
            url=f"{settings.stt_url}/v1/audio/transcriptions",
            method="POST",
            json={"text": " "},
        )

        async with httpx.AsyncClient() as http_client:
            client = STTClient(settings, http_client)
            wav_data = pcm_to_wav(sample_pcm)
            result = await client.transcribe(wav_data)

        assert result == ""

    async def test_is_available_success(self, settings, httpx_mock):
        """Test availability check when service is up."""
        httpx_mock.add_response(
            url=f"{settings.stt_url}/v1/models",
            method="GET",
            json={"models": []},
        )

        async with httpx.AsyncClient() as http_client:
            client = STTClient(settings, http_client)
            available = await client.is_available()

        assert available is True

    async def test_is_available_failure(self, settings, httpx_mock):
        """Test availability check when service is down."""
        httpx_mock.add_exception(
            httpx.ConnectError("Connection refused"),
            url=f"{settings.stt_url}/v1/models",
        )

        async with httpx.AsyncClient() as http_client:
            client = STTClient(settings, http_client)
            available = await client.is_available()

        assert available is False
class TestTTSClient:
    """Tests for TTSClient."""

    async def test_synthesize_success(self, settings, httpx_mock):
        """Test successful TTS synthesis."""
        # 100 silent 16-bit samples stand in for real speech audio.
        silence = struct.pack("<100h", *([0] * 100))

        httpx_mock.add_response(
            url=f"{settings.tts_url}/v1/audio/speech",
            method="POST",
            content=silence,
        )

        async with httpx.AsyncClient() as http_client:
            tts = TTSClient(settings, http_client)
            audio = await tts.synthesize("Hello world")

        assert audio == silence

    async def test_synthesize_uses_correct_params(self, settings, httpx_mock):
        """Test that TTS requests include correct parameters."""
        httpx_mock.add_response(
            url=f"{settings.tts_url}/v1/audio/speech",
            method="POST",
            content=b"\x00\x00",
        )

        async with httpx.AsyncClient() as http_client:
            tts = TTSClient(settings, http_client)
            await tts.synthesize("Test text")

        sent = httpx_mock.get_request()
        assert sent is not None

        import json
        payload = json.loads(sent.content)
        assert payload["model"] == settings.tts_model
        assert payload["voice"] == settings.tts_voice
        assert payload["input"] == "Test text"
        assert payload["response_format"] == "pcm"
        assert payload["speed"] == 1.0

    async def test_is_available_success(self, settings, httpx_mock):
        """Test availability check when service is up."""
        httpx_mock.add_response(
            url=f"{settings.tts_url}/v1/models",
            method="GET",
            json={"models": []},
        )

        async with httpx.AsyncClient() as http_client:
            tts = TTSClient(settings, http_client)
            available = await tts.is_available()

        assert available is True

    async def test_is_available_failure(self, settings, httpx_mock):
        """Test availability check when service is down."""
        httpx_mock.add_exception(
            httpx.ConnectError("Connection refused"),
            url=f"{settings.tts_url}/v1/models",
        )

        async with httpx.AsyncClient() as http_client:
            tts = TTSClient(settings, http_client)
            available = await tts.is_available()

        assert available is False
class TestMessageModels:
    """Tests for WebSocket message serialization."""

    def test_session_start_defaults(self):
        """Test SessionStart with default values."""
        msg = SessionStart()
        data = msg.model_dump()
        assert data["type"] == "session.start"
        assert data["client_id"] == ""
        assert data["audio_config"]["sample_rate"] == 16000

    def test_session_start_custom(self):
        """Test SessionStart with custom values."""
        msg = SessionStart(
            client_id="esp32-kitchen",
            audio_config=AudioConfig(sample_rate=24000),
        )
        assert msg.client_id == "esp32-kitchen"
        assert msg.audio_config.sample_rate == 24000

    def test_input_audio_buffer_append(self):
        """Test audio append message."""
        audio_b64 = encode_audio(b"\x00\x00\x01\x00")
        msg = InputAudioBufferAppend(audio=audio_b64)
        data = msg.model_dump()
        assert data["type"] == "input_audio_buffer.append"
        assert data["audio"] == audio_b64

    def test_session_created(self):
        """Test session created response."""
        msg = SessionCreated(session_id="test-uuid")
        data = json.loads(msg.model_dump_json())
        assert data["type"] == "session.created"
        assert data["session_id"] == "test-uuid"

    def test_status_update(self):
        """Test status update message for every pipeline state."""
        for state in ("listening", "transcribing", "thinking", "speaking"):
            msg = StatusUpdate(state=state)
            assert msg.state == state
            # Also check the wire format, consistent with the other tests.
            data = json.loads(msg.model_dump_json())
            assert data["type"] == "status"
            assert data["state"] == state

    def test_transcript_done(self):
        """Test transcript done message."""
        msg = TranscriptDone(text="Hello world")
        data = json.loads(msg.model_dump_json())
        assert data["type"] == "transcript.done"
        assert data["text"] == "Hello world"

    def test_response_text_done(self):
        """Test response text done message."""
        msg = ResponseTextDone(text="I can help with that.")
        data = json.loads(msg.model_dump_json())
        assert data["type"] == "response.text.done"
        assert data["text"] == "I can help with that."

    def test_response_audio_delta(self):
        """Test audio delta message."""
        msg = ResponseAudioDelta(delta="AAAA")
        data = json.loads(msg.model_dump_json())
        assert data["type"] == "response.audio.delta"
        assert data["delta"] == "AAAA"

    def test_response_audio_done(self):
        """Test audio done message."""
        msg = ResponseAudioDone()
        assert msg.type == "response.audio.done"

    def test_response_done(self):
        """Test response done message."""
        msg = ResponseDone()
        assert msg.type == "response.done"

    def test_error_event(self):
        """Test error event message."""
        msg = ErrorEvent(message="Something went wrong", code="test_error")
        data = json.loads(msg.model_dump_json())
        assert data["type"] == "error"
        assert data["message"] == "Something went wrong"
        assert data["code"] == "test_error"

    def test_error_event_default_code(self):
        """Test error event with default code."""
        msg = ErrorEvent(message="Oops")
        assert msg.code == "unknown_error"
class TestWebSocketEndpoint:
    """Tests for the /api/v1/realtime WebSocket endpoint."""

    @pytest.fixture
    def client(self):
        """Create a test client with lifespan to populate app.state."""
        from stentor.main import app
        with TestClient(app) as c:
            yield c

    def test_health_live(self, client):
        """Test liveness endpoint."""
        response = client.get("/api/live/")
        assert response.status_code == 200
        assert response.json()["status"] == "ok"

    def test_api_info(self, client):
        """Test API info endpoint."""
        response = client.get("/api/v1/info")
        assert response.status_code == 200
        data = response.json()
        assert data["name"] == "stentor-gateway"
        assert data["version"] == "0.1.0"
        assert "realtime" in data["endpoints"]

    def test_websocket_session_lifecycle(self, client):
        """Test basic WebSocket session start and close."""
        with client.websocket_connect("/api/v1/realtime") as ws:
            # Send session.start
            ws.send_json({
                "type": "session.start",
                "client_id": "test-client",
            })

            # Receive session.created
            msg = ws.receive_json()
            assert msg["type"] == "session.created"
            # The gateway must mint a non-empty session id, not merely
            # include the key.
            assert msg.get("session_id")

            # Receive initial status: listening
            msg = ws.receive_json()
            assert msg["type"] == "status"
            assert msg["state"] == "listening"

            # Send session.close
            ws.send_json({"type": "session.close"})

    def test_websocket_no_session_error(self, client):
        """Test sending audio without starting a session."""
        with client.websocket_connect("/api/v1/realtime") as ws:
            ws.send_json({
                "type": "input_audio_buffer.append",
                "audio": "AAAA",
            })

            msg = ws.receive_json()
            assert msg["type"] == "error"
            assert msg["code"] == "no_session"

    def test_websocket_invalid_json(self, client):
        """Test sending invalid JSON."""
        with client.websocket_connect("/api/v1/realtime") as ws:
            ws.send_text("not valid json{{{")

            msg = ws.receive_json()
            assert msg["type"] == "error"
            assert msg["code"] == "invalid_json"

    def test_websocket_unknown_event(self, client):
        """Test sending unknown event type."""
        with client.websocket_connect("/api/v1/realtime") as ws:
            ws.send_json({"type": "scooby.dooby.doo"})

            msg = ws.receive_json()
            assert msg["type"] == "error"
            assert msg["code"] == "unknown_event"