feat: add initial Hold Slayer AI telephony gateway implementation

Complete project scaffolding and core implementation of an AI-powered
telephony system that calls companies, navigates IVR menus, waits on
hold, and transfers to the user when a human answers.

Key components:
- FastAPI server with REST API, WebSocket, and MCP (SSE) interfaces
- SIP/VoIP call management via PJSUA2 with RTP audio streaming
- LLM-powered IVR navigation using OpenAI/Anthropic with tool calling
- Hold detection service combining audio analysis and silence detection
- Real-time STT (Whisper/Deepgram) and TTS (OpenAI/Piper) pipelines
- Call recording with per-channel and mixed audio capture
- Event bus (asyncio pub/sub) for real-time client updates
- Web dashboard with live call monitoring
- SQLite persistence via SQLAlchemy with call history and analytics
- Notification support (email, SMS, webhook, desktop)
- Docker Compose deployment with Opal VoIP and Opal Media containers
- Comprehensive test suite with unit, integration, and E2E tests
- Simplified .gitignore and full project documentation in README
This commit is contained in:
2026-03-21 19:23:26 +00:00
parent c9ff60702b
commit ecf37658ce
56 changed files with 11601 additions and 164 deletions

161
services/transcription.py Normal file
View File

@@ -0,0 +1,161 @@
"""
Transcription Service — Speaches STT integration.
Sends audio to your Speaches instances for real-time speech-to-text.
Used by the Hold Slayer to understand IVR prompts and detect menu options.
"""
import io
import logging
from typing import Optional
import httpx
from config import SpeachesSettings
logger = logging.getLogger(__name__)
class TranscriptionService:
    """
    Client for Speaches STT service.

    Speaches exposes an OpenAI-compatible API:
        POST /v1/audio/transcriptions

    A single ``httpx.AsyncClient`` is created lazily and reused across
    requests; call :meth:`close` during shutdown to release it.
    """

    def __init__(self, settings: SpeachesSettings):
        """
        Args:
            settings: Speaches connection settings (base URL and model name).
        """
        self.settings = settings
        # Lazily created and re-created on demand; see _get_client().
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or create the shared HTTP client (rebuilt if it was closed)."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                base_url=self.settings.url,
                # Generous read timeout — transcription can be slow — but
                # fail fast (5s) when the service is unreachable.
                timeout=httpx.Timeout(30.0, connect=5.0),
            )
        return self._client

    async def transcribe(
        self,
        audio_data: bytes,
        language: str = "en",
        prompt: Optional[str] = None,
    ) -> str:
        """
        Transcribe audio data to text.

        Args:
            audio_data: Raw PCM audio (16-bit signed, 16kHz, mono)
            language: Language code (default: "en")
            prompt: Optional context hint for better accuracy
                    (e.g., "IVR menu options, phone banking")

        Returns:
            Transcribed text, or "" on any transport/API failure —
            errors are logged, never raised (best-effort contract).
        """
        client = await self._get_client()
        # Convert raw PCM to WAV format for the API
        wav_data = self._pcm_to_wav(audio_data)
        try:
            response = await client.post(
                "/v1/audio/transcriptions",
                files={"file": ("audio.wav", wav_data, "audio/wav")},
                data={
                    "model": self.settings.model,
                    "language": language,
                    "response_format": "text",
                    # Only include "prompt" in the form when one was given.
                    **({"prompt": prompt} if prompt else {}),
                },
            )
            response.raise_for_status()
            text = response.text.strip()
            # Lazy %-args: the message is only formatted when DEBUG is enabled.
            logger.debug("Transcription: '%s'", text)
            return text
        except httpx.HTTPStatusError as e:
            logger.error(
                "Speaches API error: %s %s", e.response.status_code, e.response.text
            )
            return ""
        except httpx.ConnectError:
            logger.error("Cannot connect to Speaches at %s", self.settings.url)
            return ""
        except Exception as e:
            # Broad catch keeps transcription best-effort: callers treat ""
            # as "nothing understood" rather than a fatal error.
            logger.error("Transcription failed: %s", e)
            return ""

    async def transcribe_stream(
        self,
        audio_data: bytes,
        language: str = "en",
    ):
        """
        Stream transcription — for real-time results.

        Uses Speaches streaming endpoint if available,
        falls back to chunked transcription.

        Yields:
            str: Partial transcription chunks
        """
        # For now, do chunked transcription
        # TODO: Implement WebSocket streaming when Speaches supports it
        chunk_size = 16000 * 2 * 3  # 3 seconds of 16kHz 16-bit mono
        for offset in range(0, len(audio_data), chunk_size):
            # range() guarantees each slice is non-empty, so no length check.
            text = await self.transcribe(
                audio_data[offset:offset + chunk_size], language
            )
            if text:
                yield text

    async def close(self) -> None:
        """Close the HTTP client, if one was created."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
            self._client = None

    @staticmethod
    def _pcm_to_wav(pcm_data: bytes, sample_rate: int = 16000, channels: int = 1, sample_width: int = 2) -> bytes:
        """
        Convert raw PCM data to WAV format.

        Args:
            pcm_data: Raw PCM audio bytes
            sample_rate: Sample rate in Hz (default: 16000)
            channels: Number of channels (default: 1 = mono)
            sample_width: Bytes per sample (default: 2 = 16-bit)

        Returns:
            WAV file as bytes (standard 44-byte PCM header + data)
        """
        # Use the stdlib `wave` writer instead of hand-packing the RIFF
        # header with struct: it emits the identical 44-byte PCM header
        # with no magic offsets to maintain.
        import wave

        buf = io.BytesIO()
        with wave.open(buf, "wb") as wav_file:
            wav_file.setnchannels(channels)
            wav_file.setsampwidth(sample_width)
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(pcm_data)
        return buf.getvalue()