feat: add initial Hold Slayer AI telephony gateway implementation

Complete project scaffolding and core implementation of an AI-powered
telephony system that calls companies, navigates IVR menus, waits on
hold, and transfers to the user when a human answers.

Key components:
- FastAPI server with REST API, WebSocket, and MCP (SSE) interfaces
- SIP/VoIP call management via PJSUA2 with RTP audio streaming
- LLM-powered IVR navigation using OpenAI/Anthropic with tool calling
- Hold detection service combining audio analysis and silence detection
- Real-time STT (Whisper/Deepgram) and TTS (OpenAI/Piper) pipelines
- Call recording with per-channel and mixed audio capture
- Event bus (asyncio pub/sub) for real-time client updates
- Web dashboard with live call monitoring
- SQLite persistence via SQLAlchemy with call history and analytics
- Notification support (email, SMS, webhook, desktop)
- Docker Compose deployment with Opal VoIP and Opal Media containers
- Comprehensive test suite with unit, integration, and E2E tests
- Simplified .gitignore and full project documentation in README
This commit is contained in:
2026-03-21 19:23:26 +00:00
parent c9ff60702b
commit ecf37658ce
56 changed files with 11601 additions and 164 deletions

161
services/transcription.py Normal file
View File

@@ -0,0 +1,161 @@
"""
Transcription Service — Speaches STT integration.
Sends audio to your Speaches instances for real-time speech-to-text.
Used by the Hold Slayer to understand IVR prompts and detect menu options.
"""
import io
import logging
from typing import Optional
import httpx
from config import SpeachesSettings
logger = logging.getLogger(__name__)
class TranscriptionService:
    """
    Client for Speaches STT service.

    Speaches exposes an OpenAI-compatible API:
        POST /v1/audio/transcriptions

    A single ``httpx.AsyncClient`` is created lazily and reused across
    requests; call :meth:`close` during shutdown to release it.
    """

    def __init__(self, settings: SpeachesSettings):
        """
        Args:
            settings: Speaches connection settings (base URL and model name).
        """
        self.settings = settings
        # Lazily created and re-created on demand; see _get_client().
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or create the shared HTTP client (rebuilt if it was closed)."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                base_url=self.settings.url,
                # Generous read timeout — transcription can be slow — but
                # fail fast (5s) when the service is unreachable.
                timeout=httpx.Timeout(30.0, connect=5.0),
            )
        return self._client

    async def transcribe(
        self,
        audio_data: bytes,
        language: str = "en",
        prompt: Optional[str] = None,
    ) -> str:
        """
        Transcribe audio data to text.

        Args:
            audio_data: Raw PCM audio (16-bit signed, 16kHz, mono)
            language: Language code (default: "en")
            prompt: Optional context hint for better accuracy
                    (e.g., "IVR menu options, phone banking")

        Returns:
            Transcribed text, or "" on any transport/API failure —
            errors are logged, never raised (best-effort contract).
        """
        client = await self._get_client()
        # Convert raw PCM to WAV format for the API
        wav_data = self._pcm_to_wav(audio_data)
        try:
            response = await client.post(
                "/v1/audio/transcriptions",
                files={"file": ("audio.wav", wav_data, "audio/wav")},
                data={
                    "model": self.settings.model,
                    "language": language,
                    "response_format": "text",
                    # Only include "prompt" in the form when one was given.
                    **({"prompt": prompt} if prompt else {}),
                },
            )
            response.raise_for_status()
            text = response.text.strip()
            # Lazy %-args: the message is only formatted when DEBUG is enabled.
            logger.debug("Transcription: '%s'", text)
            return text
        except httpx.HTTPStatusError as e:
            logger.error(
                "Speaches API error: %s %s", e.response.status_code, e.response.text
            )
            return ""
        except httpx.ConnectError:
            logger.error("Cannot connect to Speaches at %s", self.settings.url)
            return ""
        except Exception as e:
            # Broad catch keeps transcription best-effort: callers treat ""
            # as "nothing understood" rather than a fatal error.
            logger.error("Transcription failed: %s", e)
            return ""

    async def transcribe_stream(
        self,
        audio_data: bytes,
        language: str = "en",
    ):
        """
        Stream transcription — for real-time results.

        Uses Speaches streaming endpoint if available,
        falls back to chunked transcription.

        Yields:
            str: Partial transcription chunks
        """
        # For now, do chunked transcription
        # TODO: Implement WebSocket streaming when Speaches supports it
        chunk_size = 16000 * 2 * 3  # 3 seconds of 16kHz 16-bit mono
        for offset in range(0, len(audio_data), chunk_size):
            # range() guarantees each slice is non-empty, so no length check.
            text = await self.transcribe(
                audio_data[offset:offset + chunk_size], language
            )
            if text:
                yield text

    async def close(self) -> None:
        """Close the HTTP client, if one was created."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
            self._client = None

    @staticmethod
    def _pcm_to_wav(pcm_data: bytes, sample_rate: int = 16000, channels: int = 1, sample_width: int = 2) -> bytes:
        """
        Convert raw PCM data to WAV format.

        Args:
            pcm_data: Raw PCM audio bytes
            sample_rate: Sample rate in Hz (default: 16000)
            channels: Number of channels (default: 1 = mono)
            sample_width: Bytes per sample (default: 2 = 16-bit)

        Returns:
            WAV file as bytes (standard 44-byte PCM header + data)
        """
        # Use the stdlib `wave` writer instead of hand-packing the RIFF
        # header with struct: it emits the identical 44-byte PCM header
        # with no magic offsets to maintain.
        import wave

        buf = io.BytesIO()
        with wave.open(buf, "wb") as wav_file:
            wav_file.setnchannels(channels)
            wav_file.setsampwidth(sample_width)
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(pcm_data)
        return buf.getvalue()