feat: add initial Hold Slayer AI telephony gateway implementation
Complete project scaffolding and core implementation of an AI-powered telephony system that calls companies, navigates IVR menus, waits on hold, and transfers to the user when a human answers. Key components: - FastAPI server with REST API, WebSocket, and MCP (SSE) interfaces - SIP/VoIP call management via PJSUA2 with RTP audio streaming - LLM-powered IVR navigation using OpenAI/Anthropic with tool calling - Hold detection service combining audio analysis and silence detection - Real-time STT (Whisper/Deepgram) and TTS (OpenAI/Piper) pipelines - Call recording with per-channel and mixed audio capture - Event bus (asyncio pub/sub) for real-time client updates - Web dashboard with live call monitoring - SQLite persistence via SQLAlchemy with call history and analytics - Notification support (email, SMS, webhook, desktop) - Docker Compose deployment with Opal VoIP and Opal Media containers - Comprehensive test suite with unit, integration, and E2E tests - Simplified .gitignore and full project documentation in README
This commit is contained in:
161
services/transcription.py
Normal file
161
services/transcription.py
Normal file
@@ -0,0 +1,161 @@
|
||||
"""
|
||||
Transcription Service — Speaches STT integration.
|
||||
|
||||
Sends audio to your Speaches instances for real-time speech-to-text.
|
||||
Used by the Hold Slayer to understand IVR prompts and detect menu options.
|
||||
"""
|
||||
|
||||
import io
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from config import SpeachesSettings
|
||||
|
||||
# Module-level logger named after this module, per the standard logging convention.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TranscriptionService:
    """
    Client for Speaches STT service.

    Speaches exposes an OpenAI-compatible API:
        POST /v1/audio/transcriptions

    Raw PCM audio is wrapped in a WAV container before upload. All
    transport/API failures are logged and reported as an empty string,
    so callers can poll continuously without per-call error handling.
    """

    def __init__(self, settings: SpeachesSettings):
        """
        Args:
            settings: Speaches connection settings (base URL and model name).
        """
        self.settings = settings
        # Lazily created on first request; recreated after close().
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or create the HTTP client.

        A fresh client is built on first use or after close(), so the
        service remains usable after a shutdown/restart cycle.
        """
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                base_url=self.settings.url,
                # 30s total request budget, but fail fast (5s) on connect.
                timeout=httpx.Timeout(30.0, connect=5.0),
            )
        return self._client

    async def transcribe(
        self,
        audio_data: bytes,
        language: str = "en",
        prompt: Optional[str] = None,
    ) -> str:
        """
        Transcribe audio data to text.

        Args:
            audio_data: Raw PCM audio (16-bit signed, 16kHz, mono)
            language: Language code (default: "en")
            prompt: Optional context hint for better accuracy
                (e.g., "IVR menu options, phone banking")

        Returns:
            Transcribed text, or "" on any failure. Errors are logged,
            never raised — transcription is best-effort by design so a
            gap must not crash the in-progress call.
        """
        client = await self._get_client()

        # The API accepts file uploads, not raw PCM — wrap in a WAV container.
        wav_data = self._pcm_to_wav(audio_data)

        try:
            response = await client.post(
                "/v1/audio/transcriptions",
                files={"file": ("audio.wav", wav_data, "audio/wav")},
                data={
                    "model": self.settings.model,
                    "language": language,
                    "response_format": "text",
                    # Only include "prompt" when a hint was actually given.
                    **({"prompt": prompt} if prompt else {}),
                },
            )
            response.raise_for_status()
            # With response_format=text the body is the plain transcript.
            text = response.text.strip()
            # Lazy %-style args: no string formatting unless DEBUG is enabled.
            logger.debug("Transcription: '%s'", text)
            return text

        except httpx.HTTPStatusError as e:
            logger.error(
                "Speaches API error: %s %s", e.response.status_code, e.response.text
            )
            return ""
        except httpx.ConnectError:
            logger.error("Cannot connect to Speaches at %s", self.settings.url)
            return ""
        except Exception as e:
            # Catch-all keeps transcription best-effort (see docstring).
            logger.error("Transcription failed: %s", e)
            return ""

    async def transcribe_stream(
        self,
        audio_data: bytes,
        language: str = "en",
    ):
        """
        Stream transcription — for real-time results.

        Uses Speaches streaming endpoint if available,
        falls back to chunked transcription.

        Yields:
            str: Partial transcription chunks (empty results are skipped)
        """
        # For now, do chunked transcription.
        # TODO: Implement WebSocket streaming when Speaches supports it.
        chunk_size = 16000 * 2 * 3  # 3 seconds of 16kHz 16-bit mono

        for offset in range(0, len(audio_data), chunk_size):
            chunk = audio_data[offset:offset + chunk_size]
            if chunk:
                text = await self.transcribe(chunk, language)
                if text:
                    yield text

    async def close(self) -> None:
        """Close the HTTP client (idempotent; safe if never opened)."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
        self._client = None

    @staticmethod
    def _pcm_to_wav(
        pcm_data: bytes,
        sample_rate: int = 16000,
        channels: int = 1,
        sample_width: int = 2,
    ) -> bytes:
        """
        Convert raw PCM data to WAV format.

        Args:
            pcm_data: Raw PCM audio bytes
            sample_rate: Sample rate in Hz (default: 16000)
            channels: Number of channels (default: 1 = mono)
            sample_width: Bytes per sample (default: 2 = 16-bit)

        Returns:
            WAV file as bytes (canonical 44-byte PCM header + data)
        """
        # Use the stdlib wave module rather than hand-packing the RIFF
        # header with struct; for uncompressed PCM it emits the same
        # canonical 44-byte header and cannot get the sizes wrong.
        import wave

        buf = io.BytesIO()
        with wave.open(buf, "wb") as wav_file:
            wav_file.setnchannels(channels)
            wav_file.setsampwidth(sample_width)
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(pcm_data)
        return buf.getvalue()
|
||||
Reference in New Issue
Block a user