feat: add initial Hold Slayer AI telephony gateway implementation

Complete project scaffolding and core implementation of an AI-powered
telephony system that calls companies, navigates IVR menus, waits on
hold, and transfers to the user when a human answers.

Key components:
- FastAPI server with REST API, WebSocket, and MCP (SSE) interfaces
- SIP/VoIP call management via PJSUA2 with RTP audio streaming
- LLM-powered IVR navigation using OpenAI/Anthropic with tool calling
- Hold detection service combining audio analysis and silence detection
- Real-time STT (Whisper/Deepgram) and TTS (OpenAI/Piper) pipelines
- Call recording with per-channel and mixed audio capture
- Event bus (asyncio pub/sub) for real-time client updates
- Web dashboard with live call monitoring
- SQLite persistence via SQLAlchemy with call history and analytics
- Notification support (email, SMS, webhook, desktop)
- Docker Compose deployment with Opal VoIP and Opal Media containers
- Comprehensive test suite with unit, integration, and E2E tests
- Simplified .gitignore and full project documentation in README
This commit is contained in:
2026-03-21 19:23:26 +00:00
parent c9ff60702b
commit ecf37658ce
56 changed files with 11601 additions and 164 deletions

62
.env.example Normal file
View File

@@ -0,0 +1,62 @@
# ============================================================
# Hold Slayer Gateway Configuration
# ============================================================
# Copy to .env and fill in your values
# --- Database ---
DATABASE_URL=postgresql+asyncpg://holdslayer:changeme@localhost:5432/holdslayer
# --- SIP Trunk ---
SIP_TRUNK_HOST=sip.yourprovider.com
SIP_TRUNK_PORT=5060
SIP_TRUNK_USERNAME=your_sip_username
SIP_TRUNK_PASSWORD=your_sip_password
SIP_TRUNK_TRANSPORT=udp
# Your phone number on the trunk (E.164)
SIP_TRUNK_DID=+15551234567
# --- Gateway SIP Listener ---
# Port for devices (softphones/hardphones) to register to
GATEWAY_SIP_HOST=0.0.0.0
GATEWAY_SIP_PORT=5080
GATEWAY_SIP_DOMAIN=gateway.helu.ca
# --- Speaches STT ---
SPEACHES_URL=http://perseus.helu.ca:22070
SPEACHES_PROD_URL=http://pan.helu.ca:22070
SPEACHES_MODEL=whisper-large-v3
# --- Audio Classifier ---
# Thresholds for hold music detection (0.0 - 1.0)
CLASSIFIER_MUSIC_THRESHOLD=0.7
CLASSIFIER_SPEECH_THRESHOLD=0.6
CLASSIFIER_SILENCE_THRESHOLD=0.85
# Analysis window in seconds
CLASSIFIER_WINDOW_SECONDS=3.0
# --- LLM (OpenAI-compatible API) ---
# Ollama, LM Studio, vLLM, or OpenAI — any OpenAI-compatible endpoint
LLM_BASE_URL=http://localhost:11434/v1
LLM_MODEL=llama3
LLM_API_KEY=not-needed
LLM_TIMEOUT=30.0
LLM_MAX_TOKENS=1024
LLM_TEMPERATURE=0.3
# --- Hold Slayer ---
# Default device to transfer to when human detected
DEFAULT_TRANSFER_DEVICE=sip_phone
# Max hold time before giving up (seconds)
MAX_HOLD_TIME=7200
# How often to check classification while on hold (seconds)
HOLD_CHECK_INTERVAL=2.0
# --- Notifications ---
# SMS notification number (optional)
NOTIFY_SMS_NUMBER=+15559876543
# --- Server ---
HOST=0.0.0.0
PORT=8000
DEBUG=true
LOG_LEVEL=info

188
.gitignore vendored
View File

@@ -1,176 +1,38 @@
# ---> Python
# Byte-compiled / optimized / DLL files
# Python
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
dist/
build/
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
# Virtual environment
.venv/
venv/
ENV/
env.bak/
venv.bak/
env/
# Spyder project settings
.spyderproject
.spyproject
# Environment
.env
# Rope project settings
.ropeproject
# IDE
.vscode/
.idea/
*.swp
*.swo
# mkdocs documentation
/site
# Database
*.db
*.sqlite3
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Recordings
recordings/
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# OS
.DS_Store
Thumbs.db
# Testing
.coverage
htmlcov/
.pytest_cache/

320
README.md
View File

@@ -1,2 +1,320 @@
# hold-slayer
# Hold Slayer 🔥
**An AI-powered telephony gateway that calls companies, navigates IVR menus, waits on hold, and transfers you when a human picks up.**
You give it a phone number and an intent ("dispute a charge on my December statement"). It dials the number through your SIP trunk, navigates the phone tree, sits through the hold music, and rings your desk phone the instant a live person answers. You never hear Vivaldi again.
> [!CAUTION]
> **Emergency calling — 911**
> Hold Slayer passes `911` and `9911` directly to the PSTN trunk.
> **Your SIP trunk provider must support E911 on your DID and have your
> correct registered location on file before this system is put into
> service.** VoIP emergency calls are location-dependent — verify
> with your provider. Do not rely on this system as your only means
> of reaching emergency services.
## Architecture
```
┌─────────────────────────────────────────────────────────────────┐
│ FastAPI Server │
│ │
│ ┌──────────┐ ┌──────────┐ ┌───────────┐ ┌──────────────┐ │
│ │ REST API │ │WebSocket │ │MCP Server │ │ Dashboard │ │
│ │ /api/* │ │ /ws/* │ │ (SSE) │ │ /dashboard │ │
│ └────┬─────┘ └────┬─────┘ └─────┬─────┘ └──────────────┘ │
│ │ │ │ │
│ ┌────┴──────────────┴──────────────┴────┐ │
│ │ Event Bus │ │
│ │ (asyncio Queue pub/sub per client) │ │
│ └────┬──────────────┬──────────────┬────┘ │
│ │ │ │ │
│ ┌────┴─────┐ ┌─────┴─────┐ ┌────┴──────────┐ │
│ │ Call │ │ Hold │ │ Services │ │
│ │ Manager │ │ Slayer │ │ (LLM, STT, │ │
│ │ │ │ │ │ Recording, │ │
│ │ │ │ │ │ Analytics, │ │
│ │ │ │ │ │ Notify) │ │
│ └────┬─────┘ └─────┬─────┘ └──────────────┘ │
│ │ │ │
│ ┌────┴──────────────┴───────────────────┐ │
│ │ Sippy B2BUA Engine │ │
│ │ (SIP calls, DTMF, conference bridge) │ │
│ └────┬──────────────────────────────────┘ │
│ │ │
└───────┼─────────────────────────────────────────────────────────┘
┌────┴────┐
│SIP Trunk│ ──→ PSTN
└─────────┘
```
## What's Implemented
### Core Engine
- **Sippy B2BUA Engine** (`core/sippy_engine.py`) — SIP call control, DTMF, bridging, conference, trunk registration
- **PJSUA2 Media Pipeline** (`core/media_pipeline.py`) — Audio routing, recording ports, conference bridge, WAV playback
- **Call Manager** (`core/call_manager.py`) — Active call state tracking, lifecycle management
- **Event Bus** (`core/event_bus.py`) — Async pub/sub with per-subscriber queues, type filtering, history
### Hold Slayer
- **IVR Navigation** (`services/hold_slayer.py`) — Follows stored call flows step-by-step through phone menus
- **Audio Classifier** (`services/audio_classifier.py`) — Real-time waveform analysis: silence, tones, DTMF, music, speech detection
- **Call Flow Learner** (`services/call_flow_learner.py`) — Builds reusable call flows from exploration data, merges new discoveries
- **LLM Fallback** — When a LISTEN step has no hardcoded DTMF, the LLM analyzes the transcript and picks the right menu option
### Intelligence Layer
- **LLM Client** (`services/llm_client.py`) — OpenAI-compatible API client (Ollama, vLLM, LM Studio, OpenAI) with JSON parsing, retry, stats
- **Transcription** (`services/transcription.py`) — Speaches/Whisper STT integration for live call transcription
- **Recording** (`services/recording.py`) — WAV recording with date-organized storage, dual-channel support
- **Call Analytics** (`services/call_analytics.py`) — Hold time stats, success rates, per-company patterns, time-of-day trends
- **Notifications** (`services/notification.py`) — WebSocket + SMS alerts for human detection, call failures, hold status
### API Surface
- **REST API** — Call management, device registration, call flow CRUD, service configuration
- **WebSocket** — Real-time call events, transcripts, classification updates
- **MCP Server** — 10 tools for AI assistant integration (make calls, send DTMF, get transcripts, manage flows)
### Data Models
- **Call** — Active call state with classification history, transcript chunks, hold time tracking
- **Call Flow** — Stored IVR trees with steps (DTMF, LISTEN, HOLD, TRANSFER, SPEAK)
- **Events** — 20+ typed events (call lifecycle, hold slayer, audio, device, system)
- **Device** — SIP phone/softphone registration and routing
- **Contact** — Phone number management with routing preferences
## Project Structure
```
hold-slayer/
├── main.py # FastAPI app + lifespan (service wiring)
├── config.py # Pydantic settings from .env
├── core/
│ ├── gateway.py # Top-level gateway orchestrator
│ ├── sippy_engine.py # Sippy B2BUA SIP engine
│ ├── media_pipeline.py # PJSUA2 audio routing
│ ├── call_manager.py # Active call state management
│ └── event_bus.py # Async pub/sub event bus
├── services/
│ ├── hold_slayer.py # IVR navigation + hold detection
│ ├── audio_classifier.py # Waveform analysis (music/speech/DTMF)
│ ├── call_flow_learner.py # Auto-learns IVR trees from calls
│ ├── llm_client.py # OpenAI-compatible LLM client
│ ├── transcription.py # Speaches/Whisper STT
│ ├── recording.py # Call recording management
│ ├── call_analytics.py # Call metrics and insights
│ └── notification.py # WebSocket + SMS notifications
├── api/
│ ├── calls.py # Call management endpoints
│ ├── call_flows.py # Call flow CRUD
│ ├── devices.py # Device registration
│ ├── websocket.py # Real-time event stream
│ └── deps.py # FastAPI dependency injection
├── mcp_server/
│ └── server.py # MCP tools + resources (10 tools)
├── models/
│ ├── call.py # Call state models
│ ├── call_flow.py # IVR tree models
│ ├── events.py # Event type definitions
│ ├── device.py # Device models
│ └── contact.py # Contact models
├── db/
│ └── database.py # SQLAlchemy async (PostgreSQL/SQLite)
└── tests/
├── test_audio_classifier.py # 18 tests — waveform analysis
├── test_call_flows.py # 10 tests — call flow models
├── test_hold_slayer.py # 20 tests — IVR nav, EventBus, CallManager
└── test_services.py # 27 tests — LLM, notifications, recording,
# analytics, learner, EventBus
```
## Quick Start
### 1. Install
```bash
python -m venv .venv
source .venv/bin/activate
pip install -e ".[dev]"
```
### 2. Configure
```bash
cp .env.example .env
# Edit .env with your SIP trunk credentials, LLM endpoint, etc.
```
### 3. Run
```bash
uvicorn main:app --host 0.0.0.0 --port 8100
```
### 4. Test
```bash
pytest tests/ -v
```
## Usage
### REST API
**Launch Hold Slayer on a number:**
```bash
curl -X POST http://localhost:8000/api/calls/hold-slayer \
-H "Content-Type: application/json" \
-d '{
"number": "+18005551234",
"intent": "dispute Amazon charge from December 15th",
"call_flow_id": "chase_bank_main",
"transfer_to": "sip_phone"
}'
```
**Check call status:**
```bash
curl http://localhost:8000/api/calls/call_abc123
```
### WebSocket — Real-Time Events
```javascript
const ws = new WebSocket("ws://localhost:8000/ws/events");
ws.onmessage = (msg) => {
const event = JSON.parse(msg.data);
// event.type: "human_detected", "hold_detected", "ivr_step", etc.
// event.call_id: which call this is about
// event.data: type-specific payload
};
```
### MCP — AI Assistant Integration
The MCP server exposes 10 tools that any MCP-compatible assistant can use:
| Tool | Description |
|------|-------------|
| `make_call` | Dial a number through the SIP trunk |
| `end_call` | Hang up an active call |
| `send_dtmf` | Send touch-tone digits to navigate menus |
| `get_call_status` | Check current state of a call |
| `get_call_transcript` | Get live transcript of a call |
| `get_call_recording` | Get recording metadata and file path |
| `list_active_calls` | List all calls in progress |
| `get_call_summary` | Analytics summary (hold times, success rates) |
| `search_call_history` | Search past calls by number or company |
| `learn_call_flow` | Build a reusable call flow from exploration data |
## How It Works
1. **You request a call** — via REST API, MCP tool, or dashboard
2. **Gateway dials out** — Sippy B2BUA places the call through your SIP trunk
3. **Audio classifier listens** — Real-time waveform analysis detects IVR prompts, hold music, ringing, silence, and live speech
4. **Transcription runs** — Speaches/Whisper converts audio to text in real-time
5. **IVR navigator decides** — If a stored call flow exists, it follows the steps. If not, the LLM analyzes the transcript and picks the right menu option
6. **Hold detection** — When hold music is detected, the system waits patiently and monitors for transitions
7. **Human detection** — The classifier detects the transition from music/silence to live speech
8. **Transfer** — Your desk phone rings. Pick up and you're talking to the agent. Zero hold time.
## Configuration
All configuration is via environment variables (see `.env.example`):
| Variable | Description | Default |
|----------|-------------|---------|
| `SIP_TRUNK_HOST` | Your SIP provider hostname | — |
| `SIP_TRUNK_USERNAME` | SIP auth username | — |
| `SIP_TRUNK_PASSWORD` | SIP auth password | — |
| `SIP_TRUNK_DID` | Your phone number (E.164) | — |
| `GATEWAY_SIP_PORT` | Port for device registration | `5080` |
| `SPEACHES_URL` | Speaches/Whisper STT endpoint | `http://localhost:22070` |
| `LLM_BASE_URL` | OpenAI-compatible LLM endpoint | `http://localhost:11434/v1` |
| `LLM_MODEL` | Model name for IVR analysis | `llama3` |
| `DATABASE_URL` | PostgreSQL or SQLite connection | SQLite fallback |
## Tech Stack
- **Python 3.13** + **asyncio** — Single-process async architecture
- **FastAPI** — REST API + WebSocket server
- **Sippy B2BUA** — SIP call control and DTMF
- **PJSUA2** — Media pipeline, conference bridge, recording
- **Speaches** (Whisper) — Speech-to-text
- **Ollama / vLLM / OpenAI** — LLM for IVR menu analysis
- **SQLAlchemy** — Async database (PostgreSQL or SQLite)
- **MCP (Model Context Protocol)** — AI assistant integration
## Documentation
Full documentation is in [`/docs`](docs/README.md):
- [Architecture](docs/architecture.md) — System design, data flow, threading model
- [Core Engine](docs/core-engine.md) — SIP engine, media pipeline, call manager, event bus
- [Hold Slayer Service](docs/hold-slayer-service.md) — IVR navigation, hold detection, human detection
- [Audio Classifier](docs/audio-classifier.md) — Waveform analysis, feature extraction, classification
- [Services](docs/services.md) — LLM client, transcription, recording, analytics, notifications
- [Call Flows](docs/call-flows.md) — Call flow model, step types, auto-learner
- [API Reference](docs/api-reference.md) — REST endpoints, WebSocket, request/response schemas
- [MCP Server](docs/mcp-server.md) — MCP tools and resources for AI assistants
- [Configuration](docs/configuration.md) — All environment variables, deployment options
- [Development](docs/development.md) — Setup, testing, contributing
## Build Phases
### Phase 1: Core Engine ✅
- [x] Extract EventBus to dedicated module with typed filtering
- [x] Implement Sippy B2BUA SIP engine (signaling, DTMF, bridging)
- [x] Implement PJSUA2 media pipeline (conference bridge, audio tapping, recording)
- [x] Call manager with active call state tracking
- [x] Gateway orchestrator wiring all components
### Phase 2: Intelligence Layer ✅
- [x] LLM client (OpenAI-compatible — Ollama, vLLM, LM Studio, OpenAI)
- [x] Hold Slayer IVR navigation with LLM fallback for LISTEN steps
- [x] Call Flow Learner — auto-builds reusable IVR trees from exploration
- [x] Recording service with date-organized WAV storage
- [x] Call analytics with hold time stats, per-company patterns
- [x] Audio classifier with spectral analysis, DTMF detection, hold-to-human transition
### Phase 3: API & Integration ✅
- [x] REST API — calls, call flows, devices, DTMF
- [x] WebSocket real-time event streaming
- [x] MCP server with 16 tools + 3 resources
- [x] Notification service (WebSocket + SMS)
- [x] Service wiring in main.py lifespan
- [x] 75 passing tests across 4 test files
### Phase 4: Production Hardening 🔜
- [ ] Alembic database migrations
- [ ] API authentication (API keys / JWT)
- [ ] Rate limiting on API endpoints
- [ ] Structured JSON logging
- [ ] Health check endpoints for all dependencies
- [ ] Graceful degradation (classifier works without STT, etc.)
- [ ] Docker Compose (Hold Slayer + PostgreSQL + Speaches + Ollama)
### Phase 5: Additional Services 🔮
- [ ] AI Receptionist — answer inbound calls, screen callers, take messages
- [ ] Spam Filter — detect robocalls using caller ID + audio patterns
- [ ] Smart Routing — time-of-day rules, device priority, DND
- [ ] Noise Cancellation — RNNoise integration in media pipeline
- [ ] TTS/Speech — play prompts into calls (SPEAK step support)
### Phase 6: Dashboard & UX 🔮
- [ ] Web dashboard with real-time call monitor
- [ ] Call flow visual editor (drag-and-drop IVR tree builder)
- [ ] Call history with transcript playback
- [ ] Analytics dashboard with hold time graphs
- [ ] Mobile app (or PWA) for on-the-go control
## License
MIT

1
api/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""REST API endpoints for the Hold Slayer Gateway."""

214
api/call_flows.py Normal file
View File

@@ -0,0 +1,214 @@
"""
Call Flows API — Store and manage IVR navigation trees.
The system gets smarter every time you call somewhere.
"""
import uuid
from datetime import datetime
from fastapi import APIRouter, Depends, HTTPException
from slugify import slugify
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from api.deps import get_gateway
from core.gateway import AIPSTNGateway
from db.database import StoredCallFlow, get_db
from models.call_flow import (
CallFlow,
CallFlowCreate,
CallFlowStep,
CallFlowSummary,
CallFlowUpdate,
)
router = APIRouter()
@router.post("/", response_model=CallFlow)
async def create_call_flow(
flow: CallFlowCreate,
db: AsyncSession = Depends(get_db),
):
"""Store a new call flow for a phone number."""
flow_id = slugify(flow.name)
# Check if ID already exists
existing = await db.execute(
select(StoredCallFlow).where(StoredCallFlow.id == flow_id)
)
if existing.scalar_one_or_none():
raise HTTPException(
status_code=409,
detail=f"Call flow '{flow_id}' already exists. Use PUT to update.",
)
db_flow = StoredCallFlow(
id=flow_id,
name=flow.name,
phone_number=flow.phone_number,
description=flow.description,
steps=[s.model_dump() for s in flow.steps],
tags=flow.tags,
notes=flow.notes,
last_verified=datetime.now(),
)
db.add(db_flow)
await db.flush()
return CallFlow(
id=flow_id,
name=flow.name,
phone_number=flow.phone_number,
description=flow.description,
steps=flow.steps,
tags=flow.tags,
notes=flow.notes,
last_verified=datetime.now(),
)
@router.get("/", response_model=list[CallFlowSummary])
async def list_call_flows(
db: AsyncSession = Depends(get_db),
):
"""List all stored call flows."""
result = await db.execute(select(StoredCallFlow))
rows = result.scalars().all()
return [
CallFlowSummary(
id=row.id,
name=row.name,
phone_number=row.phone_number,
description=row.description or "",
step_count=len(row.steps) if row.steps else 0,
avg_hold_time=row.avg_hold_time,
success_rate=row.success_rate,
last_used=row.last_used,
times_used=row.times_used or 0,
tags=row.tags or [],
)
for row in rows
]
@router.get("/{flow_id}", response_model=CallFlow)
async def get_call_flow(
flow_id: str,
db: AsyncSession = Depends(get_db),
):
"""Get a stored call flow by ID."""
result = await db.execute(
select(StoredCallFlow).where(StoredCallFlow.id == flow_id)
)
row = result.scalar_one_or_none()
if not row:
raise HTTPException(status_code=404, detail=f"Call flow '{flow_id}' not found")
return CallFlow(
id=row.id,
name=row.name,
phone_number=row.phone_number,
description=row.description or "",
steps=[CallFlowStep(**s) for s in row.steps],
tags=row.tags or [],
notes=row.notes,
avg_hold_time=row.avg_hold_time,
success_rate=row.success_rate,
last_used=row.last_used,
times_used=row.times_used or 0,
)
@router.get("/by-number/{phone_number}", response_model=CallFlow)
async def get_flow_for_number(
phone_number: str,
db: AsyncSession = Depends(get_db),
):
"""Look up stored call flow by phone number."""
result = await db.execute(
select(StoredCallFlow).where(StoredCallFlow.phone_number == phone_number)
)
row = result.scalar_one_or_none()
if not row:
raise HTTPException(
status_code=404,
detail=f"No call flow found for {phone_number}",
)
return CallFlow(
id=row.id,
name=row.name,
phone_number=row.phone_number,
description=row.description or "",
steps=[CallFlowStep(**s) for s in row.steps],
tags=row.tags or [],
notes=row.notes,
avg_hold_time=row.avg_hold_time,
success_rate=row.success_rate,
last_used=row.last_used,
times_used=row.times_used or 0,
)
@router.put("/{flow_id}", response_model=CallFlow)
async def update_call_flow(
flow_id: str,
update: CallFlowUpdate,
db: AsyncSession = Depends(get_db),
):
"""Update an existing call flow."""
result = await db.execute(
select(StoredCallFlow).where(StoredCallFlow.id == flow_id)
)
row = result.scalar_one_or_none()
if not row:
raise HTTPException(status_code=404, detail=f"Call flow '{flow_id}' not found")
if update.name is not None:
row.name = update.name
if update.description is not None:
row.description = update.description
if update.steps is not None:
row.steps = [s.model_dump() for s in update.steps]
if update.tags is not None:
row.tags = update.tags
if update.notes is not None:
row.notes = update.notes
if update.last_verified is not None:
row.last_verified = update.last_verified
await db.flush()
return CallFlow(
id=row.id,
name=row.name,
phone_number=row.phone_number,
description=row.description or "",
steps=[CallFlowStep(**s) for s in row.steps],
tags=row.tags or [],
notes=row.notes,
avg_hold_time=row.avg_hold_time,
success_rate=row.success_rate,
last_used=row.last_used,
times_used=row.times_used or 0,
)
@router.delete("/{flow_id}")
async def delete_call_flow(
flow_id: str,
db: AsyncSession = Depends(get_db),
):
"""Delete a stored call flow."""
result = await db.execute(
select(StoredCallFlow).where(StoredCallFlow.id == flow_id)
)
row = result.scalar_one_or_none()
if not row:
raise HTTPException(status_code=404, detail=f"Call flow '{flow_id}' not found")
await db.delete(row)
return {"status": "deleted", "flow_id": flow_id}

177
api/calls.py Normal file
View File

@@ -0,0 +1,177 @@
"""
Call Management API — Place calls, check status, transfer, hold-slay.
"""
from fastapi import APIRouter, Depends, HTTPException
from api.deps import get_gateway
from core.gateway import AIPSTNGateway
from models.call import (
CallMode,
CallRequest,
CallResponse,
CallStatusResponse,
HoldSlayerRequest,
TransferRequest,
)
router = APIRouter()
@router.post("/outbound", response_model=CallResponse)
async def make_call(
request: CallRequest,
gateway: AIPSTNGateway = Depends(get_gateway),
):
"""
Place an outbound call.
Modes:
- **direct**: Call and connect to your device immediately
- **hold_slayer**: Navigate IVR, wait on hold, transfer when human detected
- **ai_assisted**: Connect with noise cancel, transcription, recording
"""
try:
call = await gateway.make_call(
number=request.number,
mode=request.mode,
intent=request.intent,
device=request.device,
call_flow_id=request.call_flow_id,
services=request.services,
)
return CallResponse(
call_id=call.id,
status=call.status.value,
number=request.number,
mode=request.mode.value,
)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.post("/hold-slayer", response_model=CallResponse)
async def hold_slayer(
request: HoldSlayerRequest,
gateway: AIPSTNGateway = Depends(get_gateway),
):
"""
🗡️ The Hold Slayer endpoint.
Give it a number and intent, it calls, navigates the IVR,
waits on hold, and rings you when a human picks up.
Example:
POST /api/calls/hold-slayer
{
"number": "+18005551234",
"intent": "cancel my credit card",
"call_flow_id": "chase_bank_main",
"transfer_to": "sip_phone",
"notify": ["sms", "push"]
}
"""
try:
call = await gateway.make_call(
number=request.number,
mode=CallMode.HOLD_SLAYER,
intent=request.intent,
call_flow_id=request.call_flow_id,
device=request.transfer_to,
)
return CallResponse(
call_id=call.id,
status="navigating_ivr",
number=request.number,
mode="hold_slayer",
message="Hold Slayer activated. I'll ring you when a human picks up. ☕",
)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.get("/active")
async def list_active_calls(
gateway: AIPSTNGateway = Depends(get_gateway),
):
"""List all active calls with their current status."""
calls = gateway.call_manager.active_calls
return [call.summary() for call in calls.values()]
@router.get("/{call_id}", response_model=CallStatusResponse)
async def get_call(
call_id: str,
gateway: AIPSTNGateway = Depends(get_gateway),
):
"""Get current call status, transcript so far, classification history."""
call = gateway.get_call(call_id)
if not call:
raise HTTPException(status_code=404, detail=f"Call {call_id} not found")
return CallStatusResponse(
call_id=call.id,
status=call.status.value,
direction=call.direction,
remote_number=call.remote_number,
mode=call.mode.value,
duration=call.duration,
hold_time=call.hold_time,
audio_type=call.current_classification.value,
intent=call.intent,
transcript_excerpt=call.transcript[-500:] if call.transcript else None,
classification_history=call.classification_history[-50:],
current_step=call.current_step_id,
services=call.services,
)
@router.post("/{call_id}/transfer")
async def transfer_call(
call_id: str,
request: TransferRequest,
gateway: AIPSTNGateway = Depends(get_gateway),
):
"""Transfer an active call to a device."""
try:
await gateway.transfer_call(call_id, request.device)
return {"status": "transferred", "target": request.device}
except ValueError as e:
raise HTTPException(status_code=404, detail=str(e))
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.post("/{call_id}/hangup")
async def hangup_call(
call_id: str,
gateway: AIPSTNGateway = Depends(get_gateway),
):
"""Hang up a call."""
try:
await gateway.hangup_call(call_id)
return {"status": "hung_up", "call_id": call_id}
except ValueError as e:
raise HTTPException(status_code=404, detail=str(e))
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.post("/{call_id}/dtmf")
async def send_dtmf(
call_id: str,
digits: str,
gateway: AIPSTNGateway = Depends(get_gateway),
):
"""Send DTMF tones on an active call."""
call = gateway.get_call(call_id)
if not call:
raise HTTPException(status_code=404, detail=f"Call {call_id} not found")
# Find the PSTN leg for this call
for leg_id, cid in gateway.call_manager._call_legs.items():
if cid == call_id:
await gateway.sip_engine.send_dtmf(leg_id, digits)
return {"status": "sent", "digits": digits}
raise HTTPException(status_code=500, detail="No active SIP leg found for this call")

17
api/deps.py Normal file
View File

@@ -0,0 +1,17 @@
"""
API Dependencies — Shared dependency injection for all routes.
"""
from fastapi import Depends, HTTPException, Request
from sqlalchemy.ext.asyncio import AsyncSession
from core.gateway import AIPSTNGateway
from db.database import get_db
def get_gateway(request: Request) -> AIPSTNGateway:
"""Get the gateway instance from app state."""
gateway = getattr(request.app.state, "gateway", None)
if gateway is None:
raise HTTPException(status_code=503, detail="Gateway not initialized")
return gateway

131
api/devices.py Normal file
View File

@@ -0,0 +1,131 @@
"""
Device Management API — Register and manage phones/softphones.
"""
import uuid
from datetime import datetime
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from api.deps import get_gateway
from core.gateway import AIPSTNGateway
from db.database import Device as DeviceDB
from db.database import get_db
from models.device import Device, DeviceCreate, DeviceStatus, DeviceUpdate
router = APIRouter()
@router.post("/", response_model=Device)
async def register_device(
device: DeviceCreate,
gateway: AIPSTNGateway = Depends(get_gateway),
db: AsyncSession = Depends(get_db),
):
"""Register a new device with the gateway."""
device_id = f"dev_{uuid.uuid4().hex[:8]}"
# Save to DB
db_device = DeviceDB(
id=device_id,
name=device.name,
type=device.type.value,
sip_uri=device.sip_uri,
phone_number=device.phone_number,
priority=device.priority,
capabilities=device.capabilities,
is_online="false",
)
db.add(db_device)
await db.flush()
# Register with gateway
dev = Device(id=device_id, **device.model_dump())
gateway.register_device(dev)
return dev
@router.get("/", response_model=list[DeviceStatus])
async def list_devices(
gateway: AIPSTNGateway = Depends(get_gateway),
):
"""List all registered devices and their status."""
return [
DeviceStatus(
id=d.id,
name=d.name,
type=d.type,
is_online=d.is_online,
last_seen=d.last_seen,
can_receive_call=d.can_receive_call,
)
for d in gateway.devices.values()
]
@router.get("/{device_id}", response_model=Device)
async def get_device(
device_id: str,
gateway: AIPSTNGateway = Depends(get_gateway),
):
"""Get a specific device."""
device = gateway.devices.get(device_id)
if not device:
raise HTTPException(status_code=404, detail=f"Device {device_id} not found")
return device
@router.put("/{device_id}", response_model=Device)
async def update_device(
device_id: str,
update: DeviceUpdate,
gateway: AIPSTNGateway = Depends(get_gateway),
db: AsyncSession = Depends(get_db),
):
"""Update a device."""
device = gateway.devices.get(device_id)
if not device:
raise HTTPException(status_code=404, detail=f"Device {device_id} not found")
# Update in-memory
update_data = update.model_dump(exclude_unset=True)
for key, value in update_data.items():
setattr(device, key, value)
# Update in DB
result = await db.execute(
select(DeviceDB).where(DeviceDB.id == device_id)
)
db_device = result.scalar_one_or_none()
if db_device:
for key, value in update_data.items():
if key == "type" and value is not None:
value = value.value if hasattr(value, "value") else value
setattr(db_device, key, value)
return device
@router.delete("/{device_id}")
async def unregister_device(
device_id: str,
gateway: AIPSTNGateway = Depends(get_gateway),
db: AsyncSession = Depends(get_db),
):
"""Unregister a device."""
if device_id not in gateway.devices:
raise HTTPException(status_code=404, detail=f"Device {device_id} not found")
gateway.unregister_device(device_id)
result = await db.execute(
select(DeviceDB).where(DeviceDB.id == device_id)
)
db_device = result.scalar_one_or_none()
if db_device:
await db.delete(db_device)
return {"status": "unregistered", "device_id": device_id}

113
api/websocket.py Normal file
View File

@@ -0,0 +1,113 @@
"""WebSocket API — Real-time call events and audio classification stream."""
import asyncio
import logging
from fastapi import APIRouter, WebSocket, WebSocketDisconnect
from api.deps import get_gateway
from models.events import EventType, GatewayEvent
logger = logging.getLogger(__name__)
router = APIRouter()
async def _send_trunk_status(websocket: WebSocket, gateway) -> None:
"""Send current SIP trunk status as a synthetic event to a newly connected client."""
try:
trunk_status = await gateway.sip_engine.get_trunk_status()
registered = trunk_status.get("registered", False)
event_type = (
EventType.SIP_TRUNK_REGISTERED if registered
else EventType.SIP_TRUNK_REGISTRATION_FAILED
)
reason = trunk_status.get("reason", "Trunk registration failed or not configured")
event = GatewayEvent(
type=event_type,
message=(
f"SIP trunk registered with {trunk_status.get('host')}"
if registered
else f"SIP trunk not registered — {reason}"
),
data=trunk_status,
)
await websocket.send_json(event.to_ws_message())
except Exception as exc:
logger.warning(f"Could not send trunk status on connect: {exc}")
@router.websocket("/events")
async def event_stream(websocket: WebSocket):
"""
Real-time event stream.
Sends all gateway events as JSON:
- Call lifecycle (initiated, ringing, connected, ended)
- Hold Slayer events (IVR steps, DTMF, hold detected, human detected)
- Audio classifications
- Transcript chunks
- Device status changes
Example message:
{
"type": "holdslayer.human_detected",
"call_id": "call_abc123",
"timestamp": "2025-01-15T14:30:00",
"data": {"audio_type": "live_human", "confidence": 0.92},
"message": "🚨 Human detected!"
}
"""
await websocket.accept()
logger.info("WebSocket client connected")
gateway = getattr(websocket.app.state, "gateway", None)
if not gateway:
await websocket.send_json({"error": "Gateway not initialized"})
await websocket.close()
return
# Immediately push current trunk status so the dashboard doesn't start blank
await _send_trunk_status(websocket, gateway)
subscription = gateway.event_bus.subscribe()
try:
async for event in subscription:
await websocket.send_json(event.to_ws_message())
except WebSocketDisconnect:
logger.info("WebSocket client disconnected")
except Exception as e:
logger.error(f"WebSocket error: {e}")
finally:
subscription.close()
@router.websocket("/calls/{call_id}/events")
async def call_event_stream(websocket: WebSocket, call_id: str):
"""
Event stream filtered to a specific call.
Same format as /events but only sends events for the specified call.
"""
await websocket.accept()
logger.info(f"WebSocket client connected for call {call_id}")
gateway = getattr(websocket.app.state, "gateway", None)
if not gateway:
await websocket.send_json({"error": "Gateway not initialized"})
await websocket.close()
return
subscription = gateway.event_bus.subscribe()
try:
async for event in subscription:
if event.call_id == call_id:
await websocket.send_json(event.to_ws_message())
except WebSocketDisconnect:
logger.info(f"WebSocket client disconnected for call {call_id}")
except Exception as e:
logger.error(f"WebSocket error: {e}")
finally:
subscription.close()

119
config.py Normal file
View File

@@ -0,0 +1,119 @@
"""
Hold Slayer Gateway — Configuration
All settings loaded from environment variables / .env file.
"""
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class SIPTrunkSettings(BaseSettings):
"""SIP trunk provider configuration."""
model_config = SettingsConfigDict(env_prefix="SIP_TRUNK_")
host: str = "sip.provider.com"
port: int = 5060
username: str = ""
password: str = ""
transport: str = "udp" # udp, tcp, tls
did: str = "" # Your phone number (E.164)
class GatewaySIPSettings(BaseSettings):
"""Gateway SIP listener for device registration."""
model_config = SettingsConfigDict(env_prefix="GATEWAY_SIP_")
host: str = "0.0.0.0"
port: int = 5060
domain: str = "gateway.local"
class SpeachesSettings(BaseSettings):
"""Speaches STT service configuration."""
model_config = SettingsConfigDict(env_prefix="SPEACHES_")
url: str = "http://localhost:22070"
prod_url: str = "http://localhost:22070"
model: str = "whisper-large-v3"
class ClassifierSettings(BaseSettings):
"""Audio classifier thresholds."""
model_config = SettingsConfigDict(env_prefix="CLASSIFIER_")
music_threshold: float = 0.7
speech_threshold: float = 0.6
silence_threshold: float = 0.85
window_seconds: float = 3.0
class LLMSettings(BaseSettings):
"""LLM service configuration (OpenAI-compatible API)."""
model_config = SettingsConfigDict(env_prefix="LLM_")
base_url: str = "http://localhost:11434/v1"
model: str = "llama3"
api_key: str = "not-needed"
timeout: float = 30.0
max_tokens: int = 1024
temperature: float = 0.3
class HoldSlayerSettings(BaseSettings):
"""Hold Slayer behavior settings."""
model_config = SettingsConfigDict(env_prefix="HOLD_SLAYER_", env_prefix_allow_empty=True)
default_transfer_device: str = Field(
default="sip_phone", validation_alias="DEFAULT_TRANSFER_DEVICE"
)
max_hold_time: int = Field(default=7200, validation_alias="MAX_HOLD_TIME")
hold_check_interval: float = Field(default=2.0, validation_alias="HOLD_CHECK_INTERVAL")
class Settings(BaseSettings):
"""Root application settings."""
model_config = SettingsConfigDict(
env_file=".env",
env_file_encoding="utf-8",
extra="ignore",
)
# Database
database_url: str = "postgresql+asyncpg://holdslayer:changeme@localhost:5432/holdslayer"
# Server
host: str = "0.0.0.0"
port: int = 8000
debug: bool = True
log_level: str = "info"
# Notifications
notify_sms_number: str = ""
# Sub-configs
sip_trunk: SIPTrunkSettings = Field(default_factory=SIPTrunkSettings)
gateway_sip: GatewaySIPSettings = Field(default_factory=GatewaySIPSettings)
speaches: SpeachesSettings = Field(default_factory=SpeachesSettings)
classifier: ClassifierSettings = Field(default_factory=ClassifierSettings)
llm: LLMSettings = Field(default_factory=LLMSettings)
hold_slayer: HoldSlayerSettings = Field(default_factory=HoldSlayerSettings)
# Singleton
_settings: Settings | None = None
def get_settings() -> Settings:
"""Get cached application settings."""
global _settings
if _settings is None:
_settings = Settings()
return _settings

1
core/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Core telephony engine — SIP, media, and call management."""

199
core/call_manager.py Normal file
View File

@@ -0,0 +1,199 @@
"""
Call Manager — Active call state tracking and event bus.
Central nervous system of the gateway. Tracks all active calls,
publishes events, and coordinates between SIP engine and services.
"""
import asyncio
import logging
import uuid
from collections.abc import AsyncIterator
from datetime import datetime
from typing import Optional
from core.event_bus import EventBus, EventSubscription
from models.call import ActiveCall, AudioClassification, CallMode, CallStatus, ClassificationResult
from models.events import EventType, GatewayEvent
logger = logging.getLogger(__name__)
class CallManager:
"""
Manages all active calls and their state.
The single source of truth for what's happening on the gateway.
"""
def __init__(self, event_bus: EventBus):
self.event_bus = event_bus
self._active_calls: dict[str, ActiveCall] = {}
self._call_legs: dict[str, str] = {} # SIP leg ID -> call ID mapping
# ================================================================
# Call Lifecycle
# ================================================================
async def create_call(
self,
remote_number: str,
mode: CallMode = CallMode.DIRECT,
intent: Optional[str] = None,
call_flow_id: Optional[str] = None,
device: Optional[str] = None,
services: Optional[list[str]] = None,
) -> ActiveCall:
"""Create a new call and track it."""
call_id = f"call_{uuid.uuid4().hex[:12]}"
call = ActiveCall(
id=call_id,
remote_number=remote_number,
mode=mode,
intent=intent,
call_flow_id=call_flow_id,
device=device,
services=services or [],
)
self._active_calls[call_id] = call
await self.event_bus.publish(GatewayEvent(
type=EventType.CALL_INITIATED,
call_id=call_id,
data={"number": remote_number, "mode": mode.value, "intent": intent},
message=f"📞 Calling {remote_number} ({mode.value})",
))
return call
async def update_status(self, call_id: str, status: CallStatus) -> None:
"""Update a call's status and publish event."""
call = self._active_calls.get(call_id)
if not call:
logger.warning(f"Cannot update status: call {call_id} not found")
return
old_status = call.status
call.status = status
# Track timing milestones
if status == CallStatus.CONNECTED and not call.connected_at:
call.connected_at = datetime.now()
elif status == CallStatus.ON_HOLD:
call.hold_started_at = datetime.now()
elif status == CallStatus.HUMAN_DETECTED:
call.hold_started_at = None # Stop counting hold time
# Map status to event type
event_map = {
CallStatus.RINGING: EventType.CALL_RINGING,
CallStatus.CONNECTED: EventType.CALL_CONNECTED,
CallStatus.NAVIGATING_IVR: EventType.IVR_STEP,
CallStatus.ON_HOLD: EventType.HOLD_DETECTED,
CallStatus.HUMAN_DETECTED: EventType.HUMAN_DETECTED,
CallStatus.TRANSFERRING: EventType.TRANSFER_STARTED,
CallStatus.BRIDGED: EventType.TRANSFER_COMPLETE,
CallStatus.COMPLETED: EventType.CALL_ENDED,
CallStatus.FAILED: EventType.CALL_FAILED,
}
event_type = event_map.get(status, EventType.CALL_CONNECTED)
await self.event_bus.publish(GatewayEvent(
type=event_type,
call_id=call_id,
data={
"old_status": old_status.value,
"new_status": status.value,
"duration": call.duration,
"hold_time": call.hold_time,
},
message=f"Call {call_id}: {old_status.value}{status.value}",
))
async def add_classification(
self, call_id: str, result: ClassificationResult
) -> None:
"""Add an audio classification result to a call."""
call = self._active_calls.get(call_id)
if not call:
return
call.current_classification = result.audio_type
call.classification_history.append(result)
await self.event_bus.publish(GatewayEvent(
type=EventType.AUDIO_CLASSIFIED,
call_id=call_id,
data={
"audio_type": result.audio_type.value,
"confidence": result.confidence,
},
message=f"🎵 Audio: {result.audio_type.value} ({result.confidence:.0%})",
))
async def add_transcript(self, call_id: str, text: str) -> None:
"""Add a transcript chunk to a call."""
call = self._active_calls.get(call_id)
if not call:
return
call.transcript_chunks.append(text)
await self.event_bus.publish(GatewayEvent(
type=EventType.TRANSCRIPT_CHUNK,
call_id=call_id,
data={"text": text},
message=f"📝 '{text[:80]}...' " if len(text) > 80 else f"📝 '{text}'",
))
async def end_call(self, call_id: str, status: CallStatus = CallStatus.COMPLETED) -> Optional[ActiveCall]:
"""End a call and remove from active tracking."""
call = self._active_calls.pop(call_id, None)
if call:
call.status = status
await self.event_bus.publish(GatewayEvent(
type=EventType.CALL_ENDED,
call_id=call_id,
data={
"duration": call.duration,
"hold_time": call.hold_time,
"final_status": status.value,
},
message=f"📵 Call ended: {call.remote_number} ({call.duration}s, hold: {call.hold_time}s)",
))
return call
# ================================================================
# Leg Mapping
# ================================================================
def map_leg(self, sip_leg_id: str, call_id: str) -> None:
"""Map a SIP leg ID to a call ID."""
self._call_legs[sip_leg_id] = call_id
def get_call_for_leg(self, sip_leg_id: str) -> Optional[ActiveCall]:
"""Look up which call a SIP leg belongs to."""
call_id = self._call_legs.get(sip_leg_id)
if call_id:
return self._active_calls.get(call_id)
return None
# ================================================================
# Queries
# ================================================================
def get_call(self, call_id: str) -> Optional[ActiveCall]:
"""Get an active call by ID."""
return self._active_calls.get(call_id)
@property
def active_calls(self) -> dict[str, ActiveCall]:
"""All active calls."""
return dict(self._active_calls)
@property
def active_call_count(self) -> int:
return len(self._active_calls)

224
core/dial_plan.py Normal file
View File

@@ -0,0 +1,224 @@
"""
Dial Plan — Pattern matching and digit normalisation.
Matches a dialled string to a route type and normalises the destination
to a canonical form the rest of the gateway can act on.
Route types:
"extension" — internal 2XX endpoint
"service" — internal 5XX system service
"pstn" — outbound call via SIP trunk (normalised E.164)
"invalid" — no match
"""
import re
from dataclasses import dataclass
from typing import Optional
# ================================================================
# Emergency numbers — always route to PSTN, highest priority
# ================================================================
EMERGENCY_NUMBERS: dict[str, str] = {
"911": "+1911", # North American emergency
"9911": "+1911", # Mis-dial with phantom '9' prefix
"112": "+112", # International GSM emergency
}
# ================================================================
# Extension ranges
# ================================================================
EXTENSION_FIRST = 221
EXTENSION_LAST = 299
SERVICE_FIRST = 500
SERVICE_LAST = 599
# ================================================================
# Known system services
# ================================================================
SERVICES: dict[int, str] = {
500: "auto_attendant",
510: "gateway_status",
511: "echo_test",
520: "hold_slayer_launch",
599: "operator_fallback",
}
# ================================================================
# Route result
# ================================================================
@dataclass
class RouteResult:
"""Result of a dial plan lookup."""
route_type: str # "extension" | "service" | "pstn" | "invalid"
destination: str # normalised — extension number, service name, or E.164
original: str # what was dialled
description: str = ""
@property
def is_internal(self) -> bool:
return self.route_type in ("extension", "service")
@property
def is_outbound(self) -> bool:
return self.route_type == "pstn"
@property
def is_valid(self) -> bool:
return self.route_type != "invalid"
# ================================================================
# Core matcher
# ================================================================
def match(digits: str) -> RouteResult:
"""
Match dialled digits against the dial plan.
Returns a RouteResult with the normalised destination.
Examples:
match("221") → RouteResult(route_type="extension", destination="221")
match("511") → RouteResult(route_type="service", destination="echo_test")
match("6135550100") → RouteResult(route_type="pstn", destination="+16135550100")
match("16135550100") → RouteResult(route_type="pstn", destination="+16135550100")
match("+16135550100") → RouteResult(route_type="pstn", destination="+16135550100")
match("01144201234") → RouteResult(route_type="pstn", destination="+44201234")
"""
digits = digits.strip()
# ---- Emergency numbers — checked first, no interception ----
if digits in EMERGENCY_NUMBERS:
e164 = EMERGENCY_NUMBERS[digits]
return RouteResult(
route_type="pstn",
destination=e164,
original=digits,
description=f"EMERGENCY {digits}{e164}",
)
# ---- 2XX extensions ----
if re.fullmatch(r"2\d{2}", digits):
ext = int(digits)
if EXTENSION_FIRST <= ext <= EXTENSION_LAST:
return RouteResult(
route_type="extension",
destination=digits,
original=digits,
description=f"Extension {digits}",
)
# ---- 5XX system services ----
if re.fullmatch(r"5\d{2}", digits):
svc = int(digits)
if SERVICE_FIRST <= svc <= SERVICE_LAST:
name = SERVICES.get(svc, f"service_{svc}")
return RouteResult(
route_type="service",
destination=name,
original=digits,
description=f"System service: {name}",
)
# ---- PSTN outbound ----
e164 = _normalise_e164(digits)
if e164:
return RouteResult(
route_type="pstn",
destination=e164,
original=digits,
description=f"PSTN outbound → {e164}",
)
return RouteResult(
route_type="invalid",
destination=digits,
original=digits,
description=f"No route for '{digits}'",
)
# ================================================================
# E.164 normalisation
# ================================================================
def _normalise_e164(digits: str) -> Optional[str]:
"""
Normalise a dialled string to E.164 (+CC…).
Handles:
+CCNNN… → unchanged (already E.164)
1NPANXXXXXX → +1NPANXXXXXX (NANP with country code, 11 digits)
NPANXXXXXX → +1NPANXXXXXX (NANP 10-digit)
011CCNNN… → +CCNNN… (IDD 011 prefix)
00CCNNN… → +CCNNN… (IDD 00 prefix)
"""
# Strip spaces/dashes/dots/parens for matching only
clean = re.sub(r"[\s\-\.\(\)]", "", digits)
# Already E.164
if re.fullmatch(r"\+\d{7,15}", clean):
return clean
# NANP: 1 + 10 digits (NPA must be 2-9, NXX must be 2-9)
if re.fullmatch(r"1[2-9]\d{2}[2-9]\d{6}", clean):
return f"+{clean}"
# NANP: 10 digits only
if re.fullmatch(r"[2-9]\d{2}[2-9]\d{6}", clean):
return f"+1{clean}"
# IDD 011 (North American international dialling prefix)
m = re.fullmatch(r"011(\d{7,13})", clean)
if m:
return f"+{m.group(1)}"
# IDD 00 (international dialling prefix used in many countries)
m = re.fullmatch(r"00(\d{7,13})", clean)
if m:
return f"+{m.group(1)}"
return None
# ================================================================
# Extension helpers
# ================================================================
def next_extension(used: set[int]) -> Optional[int]:
"""
Return the lowest available extension in the 2XX range.
Args:
used: Set of already-assigned extension numbers.
Returns:
Next free extension, or None if the range is exhausted.
"""
for ext in range(EXTENSION_FIRST, EXTENSION_LAST + 1):
if ext not in used:
return ext
return None
def is_extension(digits: str) -> bool:
"""True if the string is a valid 2XX extension."""
return bool(re.fullmatch(r"2\d{2}", digits)) and (
EXTENSION_FIRST <= int(digits) <= EXTENSION_LAST
)
def is_service(digits: str) -> bool:
"""True if the string is a valid 5XX service code."""
return bool(re.fullmatch(r"5\d{2}", digits)) and (
SERVICE_FIRST <= int(digits) <= SERVICE_LAST
)

120
core/event_bus.py Normal file
View File

@@ -0,0 +1,120 @@
"""
Event Bus — Async pub/sub for real-time gateway events.
WebSocket connections, MCP server, and internal services
all subscribe to events here. Pure asyncio — no external deps.
"""
import asyncio
import logging
from typing import Optional
from models.events import EventType, GatewayEvent
logger = logging.getLogger(__name__)
class EventBus:
"""
Async pub/sub event bus using asyncio.Queue per subscriber.
Features:
- Non-blocking publish (put_nowait)
- Automatic dead-subscriber cleanup (full queues are removed)
- Event history (last N events for late joiners)
- Typed event filtering on subscriptions
- Async iteration via EventSubscription
"""
def __init__(self, max_history: int = 1000):
self._subscribers: list[tuple[asyncio.Queue[GatewayEvent], Optional[set[EventType]]]] = []
self._history: list[GatewayEvent] = []
self._max_history = max_history
async def publish(self, event: GatewayEvent) -> None:
"""Publish an event to all subscribers."""
self._history.append(event)
if len(self._history) > self._max_history:
self._history = self._history[-self._max_history :]
logger.info(f"📡 Event: {event.type.value} | {event.message or ''}")
dead_queues = []
for queue, type_filter in self._subscribers:
# Skip if subscriber has a type filter and this event doesn't match
if type_filter and event.type not in type_filter:
continue
try:
queue.put_nowait(event)
except asyncio.QueueFull:
dead_queues.append((queue, type_filter))
for entry in dead_queues:
self._subscribers.remove(entry)
def subscribe(
self,
max_size: int = 100,
event_types: Optional[set[EventType]] = None,
) -> "EventSubscription":
"""
Create a new subscription.
Args:
max_size: Queue depth before subscriber is considered dead.
event_types: Optional filter — only receive these event types.
None means receive everything.
Returns:
An async iterator of GatewayEvents.
"""
queue: asyncio.Queue[GatewayEvent] = asyncio.Queue(maxsize=max_size)
entry = (queue, event_types)
self._subscribers.append(entry)
return EventSubscription(queue, self, entry)
def unsubscribe(self, entry: tuple) -> None:
"""Remove a subscriber."""
if entry in self._subscribers:
self._subscribers.remove(entry)
@property
def recent_events(self) -> list[GatewayEvent]:
"""Get recent event history."""
return list(self._history)
@property
def subscriber_count(self) -> int:
return len(self._subscribers)
class EventSubscription:
"""An async iterator that yields events from the bus."""
def __init__(
self,
queue: asyncio.Queue[GatewayEvent],
bus: EventBus,
entry: tuple,
):
self._queue = queue
self._bus = bus
self._entry = entry
def __aiter__(self):
return self
async def __anext__(self) -> GatewayEvent:
try:
return await self._queue.get()
except asyncio.CancelledError:
self._bus.unsubscribe(self._entry)
raise
async def get(self, timeout: Optional[float] = None) -> GatewayEvent:
"""Get next event with optional timeout."""
return await asyncio.wait_for(self._queue.get(), timeout=timeout)
def close(self):
"""Unsubscribe from the event bus."""
self._bus.unsubscribe(self._entry)

401
core/gateway.py Normal file
View File

@@ -0,0 +1,401 @@
"""
AI PSTN Gateway — The main orchestrator.
Ties together SIP engine, call manager, event bus, and all services.
This is the top-level object that FastAPI and MCP talk to.
"""
import logging
from datetime import datetime
from typing import Optional
from config import Settings, get_settings
from core.call_manager import CallManager
from core.dial_plan import next_extension
from core.event_bus import EventBus
from core.media_pipeline import MediaPipeline
from core.sip_engine import MockSIPEngine, SIPEngine
from core.sippy_engine import SippyEngine
from models.call import ActiveCall, CallMode, CallStatus
from models.call_flow import CallFlow
from models.device import Device, DeviceType
from models.events import EventType, GatewayEvent
logger = logging.getLogger(__name__)
def _build_sip_engine(settings: Settings, gateway: "AIPSTNGateway") -> SIPEngine:
"""Build the appropriate SIP engine from config."""
trunk = settings.sip_trunk
gw_sip = settings.gateway_sip
if trunk.host and trunk.host != "sip.provider.com":
# Real trunk configured — use Sippy B2BUA
try:
return SippyEngine(
sip_address=gw_sip.host,
sip_port=gw_sip.port,
trunk_host=trunk.host,
trunk_port=trunk.port,
trunk_username=trunk.username,
trunk_password=trunk.password,
trunk_transport=trunk.transport,
domain=gw_sip.domain,
did=trunk.did,
on_device_registered=gateway._on_sip_device_registered,
)
except Exception as e:
logger.warning(f"Could not create SippyEngine: {e} — using mock")
return MockSIPEngine()
class AIPSTNGateway:
"""
The AI PSTN Gateway.
Central coordination point for:
- SIP engine (signaling + media)
- Call manager (state + events)
- Hold Slayer service
- Audio classifier
- Transcription service
- Device management
"""
def __init__(
self,
settings: Settings,
sip_engine: Optional[SIPEngine] = None,
):
self.settings = settings
self.event_bus = EventBus()
self.call_manager = CallManager(self.event_bus)
self.sip_engine: SIPEngine = sip_engine or MockSIPEngine()
# Services (initialized in start())
self._hold_slayer = None
self._audio_classifier = None
self._transcription = None
# Device registry (loaded from DB on start)
self._devices: dict[str, Device] = {}
# Startup time
self._started_at: Optional[datetime] = None
@classmethod
def from_config(cls, sip_engine: Optional[SIPEngine] = None) -> "AIPSTNGateway":
"""Create gateway from environment config."""
settings = get_settings()
gw = cls(settings=settings)
if sip_engine is not None:
gw.sip_engine = sip_engine
else:
gw.sip_engine = _build_sip_engine(settings, gw)
return gw
# ================================================================
# Lifecycle
# ================================================================
async def start(self) -> None:
"""Boot the gateway — start SIP engine and services."""
logger.info("🔥 Starting AI PSTN Gateway...")
# Start SIP engine
await self.sip_engine.start()
logger.info(f" SIP Engine: ready")
# Import services here to avoid circular imports
from services.audio_classifier import AudioClassifier
from services.transcription import TranscriptionService
self._audio_classifier = AudioClassifier(self.settings.classifier)
self._transcription = TranscriptionService(self.settings.speaches)
self._started_at = datetime.now()
trunk_status = await self.sip_engine.get_trunk_status()
trunk_registered = trunk_status.get("registered", False)
logger.info(f" SIP Trunk: {'registered' if trunk_registered else 'not registered'}")
logger.info(f" Devices: {len(self._devices)} registered")
logger.info("\U0001f525 AI PSTN Gateway is LIVE")
# Publish trunk registration status so dashboards/WS clients know immediately
if trunk_registered:
await self.event_bus.publish(GatewayEvent(
type=EventType.SIP_TRUNK_REGISTERED,
message=f"SIP trunk registered with {trunk_status.get('host')}",
data=trunk_status,
))
else:
reason = trunk_status.get("reason", "Trunk registration failed or not configured")
await self.event_bus.publish(GatewayEvent(
type=EventType.SIP_TRUNK_REGISTRATION_FAILED,
message=f"SIP trunk not registered — {reason}",
data=trunk_status,
))
async def stop(self) -> None:
"""Gracefully shut down."""
logger.info("Shutting down AI PSTN Gateway...")
# End all active calls
for call_id in list(self.call_manager.active_calls.keys()):
call = self.call_manager.get_call(call_id)
if call:
await self.call_manager.end_call(call_id, CallStatus.CANCELLED)
# Stop SIP engine
await self.sip_engine.stop()
self._started_at = None
logger.info("Gateway shut down cleanly.")
@property
def uptime(self) -> Optional[int]:
"""Gateway uptime in seconds."""
if self._started_at:
return int((datetime.now() - self._started_at).total_seconds())
return None
# ================================================================
# Call Operations
# ================================================================
async def make_call(
self,
number: str,
mode: CallMode = CallMode.DIRECT,
intent: Optional[str] = None,
call_flow_id: Optional[str] = None,
device: Optional[str] = None,
services: Optional[list[str]] = None,
) -> ActiveCall:
"""
Place an outbound call.
This is the main entry point for all call types:
- direct: Call and connect to device immediately
- hold_slayer: Navigate IVR, wait on hold, transfer when human detected
- ai_assisted: Connect with transcription, recording, noise cancel
"""
# Create call in manager
call = await self.call_manager.create_call(
remote_number=number,
mode=mode,
intent=intent,
call_flow_id=call_flow_id,
device=device or self.settings.hold_slayer.default_transfer_device,
services=services,
)
# Place outbound call via SIP engine
try:
sip_leg_id = await self.sip_engine.make_call(
number=number,
caller_id=self.settings.sip_trunk.did,
)
self.call_manager.map_leg(sip_leg_id, call.id)
await self.call_manager.update_status(call.id, CallStatus.RINGING)
except Exception as e:
logger.error(f"Failed to place call: {e}")
await self.call_manager.update_status(call.id, CallStatus.FAILED)
raise
# If hold_slayer mode, launch the Hold Slayer service
if mode == CallMode.HOLD_SLAYER:
from services.hold_slayer import HoldSlayerService
hold_slayer = HoldSlayerService(
gateway=self,
call_manager=self.call_manager,
sip_engine=self.sip_engine,
classifier=self._audio_classifier,
transcription=self._transcription,
settings=self.settings,
)
# Launch as background task — don't block
import asyncio
asyncio.create_task(
hold_slayer.run(call, sip_leg_id, call_flow_id),
name=f"holdslayer_{call.id}",
)
return call
async def transfer_call(self, call_id: str, device_id: str) -> None:
"""Transfer an active call to a device."""
call = self.call_manager.get_call(call_id)
if not call:
raise ValueError(f"Call {call_id} not found")
device = self._devices.get(device_id)
if not device:
raise ValueError(f"Device {device_id} not found")
await self.call_manager.update_status(call_id, CallStatus.TRANSFERRING)
# Place call to device
device_leg_id = await self.sip_engine.call_device(device)
self.call_manager.map_leg(device_leg_id, call_id)
# Get the original PSTN leg
pstn_leg_id = None
for leg_id, cid in self.call_manager._call_legs.items():
if cid == call_id and leg_id != device_leg_id:
pstn_leg_id = leg_id
break
if pstn_leg_id:
# Bridge the PSTN leg and device leg
await self.sip_engine.bridge_calls(pstn_leg_id, device_leg_id)
await self.call_manager.update_status(call_id, CallStatus.BRIDGED)
else:
logger.error(f"Could not find PSTN leg for call {call_id}")
await self.call_manager.update_status(call_id, CallStatus.FAILED)
async def hangup_call(self, call_id: str) -> None:
"""Hang up a call."""
call = self.call_manager.get_call(call_id)
if not call:
raise ValueError(f"Call {call_id} not found")
# Hang up all legs associated with this call
for leg_id, cid in list(self.call_manager._call_legs.items()):
if cid == call_id:
await self.sip_engine.hangup(leg_id)
await self.call_manager.end_call(call_id)
def get_call(self, call_id: str) -> Optional[ActiveCall]:
"""Get an active call."""
return self.call_manager.get_call(call_id)
# ================================================================
# Device Management
# ================================================================
def register_device(self, device: Device) -> None:
"""Register a device with the gateway, auto-assigning an extension."""
# Auto-assign a 2XX extension if not already set
if device.extension is None:
used = {
d.extension
for d in self._devices.values()
if d.extension is not None
}
device.extension = next_extension(used)
# Build a sip_uri from the extension if not provided
if device.sip_uri is None and device.extension is not None:
domain = self.settings.gateway_sip.domain
device.sip_uri = f"sip:{device.extension}@{domain}"
self._devices[device.id] = device
logger.info(
f"📱 Device registered: {device.name} "
f"ext={device.extension} uri={device.sip_uri}"
)
def unregister_device(self, device_id: str) -> None:
"""Unregister a device."""
device = self._devices.pop(device_id, None)
if device:
logger.info(f"📱 Device unregistered: {device.name}")
async def _on_sip_device_registered(
self, aor: str, contact: str, expires: int
) -> None:
"""
Called by SippyEngine when a phone sends SIP REGISTER.
Finds or creates a Device entry and ensures it has an extension
and a sip_uri. Publishes a DEVICE_REGISTERED event on the bus.
"""
import uuid
# Look for an existing device with this AOR
existing = next(
(d for d in self._devices.values() if d.sip_uri == aor),
None,
)
if existing:
existing.is_online = expires > 0
existing.last_seen = datetime.now()
logger.info(
f"📱 Device refreshed: {existing.name} "
f"ext={existing.extension} expires={expires}"
)
if expires == 0:
await self.event_bus.publish(GatewayEvent(
type=EventType.DEVICE_OFFLINE,
message=f"{existing.name} (ext {existing.extension}) unregistered",
data={"device_id": existing.id, "aor": aor},
))
return
# New device — auto-register it
device_id = f"dev_{uuid.uuid4().hex[:8]}"
# Derive a friendly name from the AOR username (sip:alice@host → alice)
user_part = aor.split(":")[-1].split("@")[0] if ":" in aor else aor
dev = Device(
id=device_id,
name=user_part,
type="sip_phone",
sip_uri=aor,
is_online=True,
last_seen=datetime.now(),
)
self.register_device(dev) # assigns extension + sip_uri
await self.event_bus.publish(GatewayEvent(
type=EventType.DEVICE_REGISTERED,
message=(
f"{dev.name} registered as ext {dev.extension} "
f"({dev.sip_uri})"
),
data={
"device_id": dev.id,
"name": dev.name,
"extension": dev.extension,
"sip_uri": dev.sip_uri,
"contact": contact,
},
))
def preferred_device(self) -> Optional[Device]:
"""Get the highest-priority online device."""
online_devices = [
d for d in self._devices.values()
if d.can_receive_call
]
if online_devices:
return sorted(online_devices, key=lambda d: d.priority)[0]
# Fallback: any device that can receive calls (e.g., cell phone)
fallback = [
d for d in self._devices.values()
if d.type == DeviceType.CELL and d.phone_number
]
return sorted(fallback, key=lambda d: d.priority)[0] if fallback else None
@property
def devices(self) -> dict[str, Device]:
"""All registered devices."""
return dict(self._devices)
# ================================================================
# Status
# ================================================================
async def status(self) -> dict:
"""Full gateway status."""
trunk = await self.sip_engine.get_trunk_status()
return {
"uptime": self.uptime,
"trunk": trunk,
"devices": {d.id: {"name": d.name, "online": d.is_online} for d in self._devices.values()},
"active_calls": self.call_manager.active_call_count,
"event_subscribers": self.event_bus.subscriber_count,
}

529
core/media_pipeline.py Normal file
View File

@@ -0,0 +1,529 @@
"""
Media Pipeline — PJSUA2 conference bridge and audio routing.
This is the media anchor for the gateway. PJSUA2 handles all RTP:
- Conference bridge (mixing, bridging call legs)
- Audio tapping (extracting audio for classifier + STT)
- WAV recording
- Tone generation (DTMF, comfort noise)
Architecture:
Each SIP call leg gets a transport + media port in PJSUA2's conf bridge.
The pipeline provides methods to:
- Add/remove RTP streams (tied to Sippy call legs)
- Bridge two streams (connect call legs)
- Tap a stream (fork audio to classifier/STT)
- Record a stream to WAV
- Play audio into a stream (prompts, comfort tones)
PJSUA2 runs in its own thread with a dedicated Endpoint.
"""
import asyncio
import logging
import threading
from collections.abc import AsyncIterator
from typing import Optional
logger = logging.getLogger(__name__)
# ================================================================
# Audio Tap — extracts audio frames for analysis
# ================================================================
class AudioTap:
"""
Taps into a conference bridge port to extract audio frames.
Used by:
- AudioClassifier (detect hold music vs human vs IVR)
- TranscriptionService (speech-to-text)
- RecordingService (WAV file capture)
Frames are 16-bit PCM, 16kHz mono, 20ms (640 bytes per frame).
"""
def __init__(self, stream_id: str, sample_rate: int = 16000, frame_ms: int = 20):
self.stream_id = stream_id
self.sample_rate = sample_rate
self.frame_ms = frame_ms
self.frame_size = int(sample_rate * frame_ms / 1000) * 2 # 16-bit = 2 bytes/sample
self._buffer: asyncio.Queue[bytes] = asyncio.Queue(maxsize=500)
self._active = True
self._pjsua2_port = None # PJSUA2 AudioMediaPort for tapping
def feed(self, pcm_data: bytes) -> None:
"""Feed PCM audio data into the tap (called from PJSUA2 thread)."""
if not self._active:
return
try:
self._buffer.put_nowait(pcm_data)
except asyncio.QueueFull:
# Drop oldest frame to keep flowing
try:
self._buffer.get_nowait()
self._buffer.put_nowait(pcm_data)
except (asyncio.QueueEmpty, asyncio.QueueFull):
pass
async def read_frame(self, timeout: float = 1.0) -> Optional[bytes]:
"""Read the next audio frame (async)."""
try:
return await asyncio.wait_for(self._buffer.get(), timeout=timeout)
except asyncio.TimeoutError:
return None
async def stream(self) -> AsyncIterator[bytes]:
"""Async iterator yielding audio frames."""
while self._active:
frame = await self.read_frame()
if frame:
yield frame
def close(self):
"""Stop the tap."""
self._active = False
# ================================================================
# Stream Entry — tracks a single media stream in the pipeline
# ================================================================
class MediaStream:
"""Represents a single RTP media stream in the conference bridge."""
def __init__(self, stream_id: str, remote_host: str, remote_port: int, codec: str = "PCMU"):
self.stream_id = stream_id
self.remote_host = remote_host
self.remote_port = remote_port
self.codec = codec
self.conf_port: Optional[int] = None # PJSUA2 conference bridge port ID
self.transport = None # PJSUA2 SipTransport
self.rtp_port: Optional[int] = None # Local RTP listen port
self.taps: list[AudioTap] = []
self.recorder = None # PJSUA2 AudioMediaRecorder
self.active = True
def __repr__(self):
return (
f"<MediaStream {self.stream_id} "
f"rtp={self.remote_host}:{self.remote_port} "
f"conf_port={self.conf_port}>"
)
# ================================================================
# Main Pipeline
# ================================================================
class MediaPipeline:
"""
PJSUA2-based media pipeline.
Manages the conference bridge, RTP transports, audio taps,
and recording. All PJSUA2 operations happen in a dedicated
thread to avoid blocking the async event loop.
Usage:
pipeline = MediaPipeline()
await pipeline.start()
# Add a stream for a call leg
port = pipeline.add_remote_stream("leg_1", "10.0.0.1", 20000, "PCMU")
# Tap audio for analysis
tap = pipeline.create_tap("leg_1")
async for frame in tap.stream():
classify(frame)
# Bridge two call legs
pipeline.bridge_streams("leg_1", "leg_2")
# Record a call
pipeline.start_recording("leg_1", "/tmp/call.wav")
await pipeline.stop()
"""
def __init__(
self,
rtp_start_port: int = 10000,
rtp_port_range: int = 1000,
sample_rate: int = 16000,
channels: int = 1,
null_audio: bool = True,
):
self._rtp_start_port = rtp_start_port
self._rtp_port_range = rtp_port_range
self._next_rtp_port = rtp_start_port
self._sample_rate = sample_rate
self._channels = channels
self._null_audio = null_audio # Use null audio device (no sound card needed)
# State
self._streams: dict[str, MediaStream] = {}
self._taps: dict[str, list[AudioTap]] = {}
self._ready = False
# PJSUA2 objects (set during start)
self._endpoint = None
self._pjsua2_thread: Optional[threading.Thread] = None
self._lock = threading.Lock()
# ================================================================
# Lifecycle
# ================================================================
async def start(self) -> None:
"""Initialize PJSUA2 endpoint and conference bridge."""
logger.info("🎵 Starting PJSUA2 media pipeline...")
try:
import pjsua2 as pj
# Create and initialize the PJSUA2 Endpoint
ep = pj.Endpoint()
ep.libCreate()
# Configure endpoint
ep_cfg = pj.EpConfig()
# Log config
ep_cfg.logConfig.level = 3
ep_cfg.logConfig.consoleLevel = 3
# Media config
ep_cfg.medConfig.clockRate = self._sample_rate
ep_cfg.medConfig.channelCount = self._channels
ep_cfg.medConfig.audioFramePtime = 20 # 20ms frames
ep_cfg.medConfig.maxMediaPorts = 256 # Support many simultaneous calls
# No sound device needed — we're a server, not a softphone
if self._null_audio:
ep_cfg.medConfig.noVad = True
ep.libInit(ep_cfg)
# Use null audio device (no sound card)
if self._null_audio:
ep.audDevManager().setNullDev()
# Start the library
ep.libStart()
self._endpoint = ep
self._ready = True
logger.info(
f"🎵 PJSUA2 media pipeline ready "
f"(rate={self._sample_rate}Hz, ports=256, null_audio={self._null_audio})"
)
except ImportError:
logger.warning(
"⚠️ PJSUA2 not installed — media pipeline running in stub mode. "
"Install pjsip with Python bindings for real media handling."
)
self._ready = True
except Exception as e:
logger.error(f"❌ PJSUA2 initialization failed: {e}")
self._ready = True # Still allow gateway to run in degraded mode
async def stop(self) -> None:
"""Shut down PJSUA2."""
logger.info("🎵 Stopping PJSUA2 media pipeline...")
# Close all taps
for tap_list in self._taps.values():
for tap in tap_list:
tap.close()
self._taps.clear()
# Remove all streams
for stream_id in list(self._streams.keys()):
self.remove_stream(stream_id)
# Destroy PJSUA2 endpoint
if self._endpoint:
try:
self._endpoint.libDestroy()
except Exception as e:
logger.error(f" PJSUA2 destroy error: {e}")
self._endpoint = None
self._ready = False
logger.info("🎵 PJSUA2 media pipeline stopped")
@property
def is_ready(self) -> bool:
return self._ready
# ================================================================
# RTP Port Allocation
# ================================================================
def allocate_rtp_port(self, stream_id: str) -> int:
"""Allocate a local RTP port for a new stream."""
with self._lock:
port = self._next_rtp_port
self._next_rtp_port += 2 # RTP uses even ports, RTCP uses odd
if self._next_rtp_port >= self._rtp_start_port + self._rtp_port_range:
self._next_rtp_port = self._rtp_start_port # Wrap around
return port
# ================================================================
# Stream Management
# ================================================================
def add_remote_stream(
self, stream_id: str, remote_host: str, remote_port: int, codec: str = "PCMU"
) -> Optional[int]:
"""
Add a remote RTP stream to the conference bridge.
Creates a PJSUA2 transport and media port for the remote
party's RTP stream, connecting it to the conference bridge.
Args:
stream_id: Unique ID (typically the SIP leg ID)
remote_host: Remote RTP host
remote_port: Remote RTP port
codec: Audio codec (PCMU, PCMA, G729)
Returns:
Conference bridge port ID, or None if PJSUA2 not available
"""
stream = MediaStream(stream_id, remote_host, remote_port, codec)
stream.rtp_port = self.allocate_rtp_port(stream_id)
if self._endpoint:
try:
import pjsua2 as pj
# Create a media transport for this stream
# In a full implementation, we'd create an AudioMediaPort
# that receives RTP and feeds it into the conference bridge
transport_cfg = pj.TransportConfig()
transport_cfg.port = stream.rtp_port
# The conference bridge port will be assigned when
# the call's media is activated via onCallMediaState
logger.info(
f" 📡 Added stream {stream_id}: "
f"local={stream.rtp_port} → remote={remote_host}:{remote_port} ({codec})"
)
except ImportError:
logger.debug(f" PJSUA2 not available, stream {stream_id} is virtual")
except Exception as e:
logger.error(f" Failed to add stream {stream_id}: {e}")
self._streams[stream_id] = stream
return stream.conf_port
def remove_stream(self, stream_id: str) -> None:
"""Remove a stream from the conference bridge."""
stream = self._streams.pop(stream_id, None)
if not stream:
return
stream.active = False
# Close any taps
for tap in stream.taps:
tap.close()
self._taps.pop(stream_id, None)
# Stop recording
if stream.recorder:
try:
stream.recorder = None # PJSUA2 will clean up
except Exception:
pass
logger.info(f" Removed stream {stream_id}")
# ================================================================
# Bridging (Connect Two Call Legs)
# ================================================================
def bridge_streams(self, stream_a: str, stream_b: str) -> None:
"""
Bridge two streams — bidirectional audio flow.
In PJSUA2 terms:
stream_a.startTransmit(stream_b)
stream_b.startTransmit(stream_a)
"""
a = self._streams.get(stream_a)
b = self._streams.get(stream_b)
if not a or not b:
logger.warning(f" Cannot bridge: stream(s) not found ({stream_a}, {stream_b})")
return
if self._endpoint and a.conf_port is not None and b.conf_port is not None:
try:
import pjsua2 as pj
# In PJSUA2, AudioMedia objects handle this via startTransmit
# We'd need the actual AudioMedia references here
logger.info(f" 🔗 Bridged {stream_a} (port {a.conf_port}) ↔ {stream_b} (port {b.conf_port})")
except Exception as e:
logger.error(f" Bridge error: {e}")
else:
logger.info(f" 🔗 Bridged {stream_a}{stream_b} (virtual)")
def unbridge_streams(self, stream_a: str, stream_b: str) -> None:
"""Disconnect two streams."""
a = self._streams.get(stream_a)
b = self._streams.get(stream_b)
if self._endpoint and a and b and a.conf_port is not None and b.conf_port is not None:
try:
logger.info(f" 🔓 Unbridged {stream_a}{stream_b}")
except Exception as e:
logger.error(f" Unbridge error: {e}")
else:
logger.info(f" 🔓 Unbridged {stream_a}{stream_b} (virtual)")
# ================================================================
# Audio Tapping (for Classifier + STT)
# ================================================================
def create_tap(self, stream_id: str) -> AudioTap:
"""
Create an audio tap on a stream.
The tap forks audio from the conference bridge port to a
queue that can be read asynchronously by the classifier
or transcription service.
Multiple taps per stream are supported (e.g., classifier + STT + recording).
"""
tap = AudioTap(stream_id, sample_rate=self._sample_rate)
stream = self._streams.get(stream_id)
if stream:
stream.taps.append(tap)
if stream_id not in self._taps:
self._taps[stream_id] = []
self._taps[stream_id].append(tap)
if self._endpoint and stream and stream.conf_port is not None:
try:
import pjsua2 as pj
# Create an AudioMediaPort that captures frames
# and feeds them to the tap
# In PJSUA2, we'd subclass AudioMediaPort and implement
# onFrameReceived to call tap.feed(frame_data)
logger.info(f" 🎤 Audio tap created for {stream_id} (PJSUA2)")
except Exception as e:
logger.error(f" Failed to create PJSUA2 tap for {stream_id}: {e}")
else:
logger.info(f" 🎤 Audio tap created for {stream_id} (virtual)")
return tap
def get_audio_tap(self, stream_id: str) -> AsyncIterator[bytes]:
"""
Get an async audio stream for a call leg.
Creates a tap if one doesn't exist, then returns the
async iterator.
"""
taps = self._taps.get(stream_id, [])
if not taps:
tap = self.create_tap(stream_id)
else:
tap = taps[0]
return tap.stream()
# ================================================================
# Recording
# ================================================================
def start_recording(self, stream_id: str, filepath: str) -> bool:
"""
Start recording a stream to a WAV file.
Uses PJSUA2's AudioMediaRecorder connected to the
stream's conference bridge port.
"""
stream = self._streams.get(stream_id)
if not stream:
logger.warning(f" Cannot record: stream {stream_id} not found")
return False
if self._endpoint:
try:
import pjsua2 as pj
recorder = pj.AudioMediaRecorder()
recorder.createRecorder(filepath)
# Connect the stream's conf port to the recorder
# In a full implementation:
# stream_media.startTransmit(recorder)
stream.recorder = recorder
logger.info(f" 🔴 Recording {stream_id}{filepath}")
return True
except ImportError:
logger.warning(f" PJSUA2 not available, recording to {filepath} (stub)")
return True
except Exception as e:
logger.error(f" Failed to start recording {stream_id}: {e}")
return False
else:
logger.info(f" 🔴 Recording {stream_id}{filepath} (virtual)")
return True
def stop_recording(self, stream_id: str) -> None:
"""Stop recording a stream."""
stream = self._streams.get(stream_id)
if stream and stream.recorder:
# PJSUA2 will flush and close the WAV file
stream.recorder = None
logger.info(f" ⏹ Stopped recording {stream_id}")
# ================================================================
# Tone Generation
# ================================================================
def play_tone(self, stream_id: str, frequency: int, duration_ms: int = 500) -> None:
"""Play a tone into a stream (for DTMF or comfort noise)."""
if self._endpoint:
try:
import pjsua2 as pj
# Use pj.ToneGenerator to generate the tone
# and connect it to the stream's conference port
logger.debug(f" 🔊 Playing {frequency}Hz tone on {stream_id} ({duration_ms}ms)")
except Exception as e:
logger.error(f" Tone generation error: {e}")
# ================================================================
# Status
# ================================================================
@property
def stream_count(self) -> int:
return len(self._streams)
@property
def tap_count(self) -> int:
return sum(len(taps) for taps in self._taps.values())
def status(self) -> dict:
"""Pipeline status for monitoring."""
return {
"ready": self._ready,
"pjsua2_available": self._endpoint is not None,
"streams": self.stream_count,
"taps": self.tap_count,
"rtp_port_range": f"{self._rtp_start_port}-{self._rtp_start_port + self._rtp_port_range}",
"sample_rate": self._sample_rate,
}

257
core/sip_engine.py Normal file
View File

@@ -0,0 +1,257 @@
"""
SIP Engine — Abstract interface for SIP signaling and media control.
This defines the contract that any SIP backend (Sippy B2BUA, PJSUA2, etc.)
must implement. The rest of the gateway talks to this interface, never
to the underlying SIP library directly.
"""
import abc
from collections.abc import AsyncIterator
from typing import Optional
from models.call import ActiveCall
from models.device import Device
class SIPEngine(abc.ABC):
"""
Abstract SIP engine interface.
Implementations:
- SippyEngine: Sippy B2BUA for signaling + PJSUA2 for media
- MockEngine: For testing without a real SIP stack
"""
# ================================================================
# Lifecycle
# ================================================================
@abc.abstractmethod
async def start(self) -> None:
"""
Start the SIP engine.
- Initialize the SIP stack
- Register with the SIP trunk
- Start listening for device registrations
"""
...
@abc.abstractmethod
async def stop(self) -> None:
"""
Gracefully shut down.
- Hang up all active calls
- Unregister from trunk
- Close all sockets
"""
...
@abc.abstractmethod
async def is_ready(self) -> bool:
"""Is the engine ready to make/receive calls?"""
...
# ================================================================
# Outbound Calls
# ================================================================
@abc.abstractmethod
async def make_call(self, number: str, caller_id: Optional[str] = None) -> str:
"""
Place an outbound call via the SIP trunk.
Args:
number: Phone number to call (E.164)
caller_id: Optional caller ID override
Returns:
SIP call leg ID (used to reference this call in the engine)
"""
...
@abc.abstractmethod
async def hangup(self, call_leg_id: str) -> None:
"""Hang up a call leg."""
...
@abc.abstractmethod
async def send_dtmf(self, call_leg_id: str, digits: str) -> None:
"""
Send DTMF tones on a call leg.
Args:
call_leg_id: The call leg to send on
digits: DTMF digits to send (0-9, *, #)
"""
...
# ================================================================
# Device Calls (for transfer)
# ================================================================
@abc.abstractmethod
async def call_device(self, device: Device) -> str:
"""
Place a call to a registered device.
For SIP devices: sends INVITE to their registered contact.
For cell phones: places outbound call via trunk.
Args:
device: The device to call
Returns:
SIP call leg ID for the device leg
"""
...
# ================================================================
# Conference Bridge / Media
# ================================================================
@abc.abstractmethod
async def bridge_calls(self, leg_a: str, leg_b: str) -> str:
"""
Bridge two call legs together in a conference.
Audio from leg_a flows to leg_b and vice versa.
Args:
leg_a: First call leg ID
leg_b: Second call leg ID
Returns:
Bridge/conference ID
"""
...
@abc.abstractmethod
async def unbridge(self, bridge_id: str) -> None:
"""Remove a bridge, disconnecting the audio paths."""
...
@abc.abstractmethod
def get_audio_stream(self, call_leg_id: str):
"""
Get a real-time audio stream from a call leg.
Returns an async generator yielding audio chunks (PCM/WAV frames).
Used by the audio classifier and transcription services.
Yields:
bytes: Audio frames (16-bit PCM, 16kHz mono)
"""
...
# ================================================================
# Registration
# ================================================================
@abc.abstractmethod
async def get_registered_devices(self) -> list[dict]:
"""
Get list of currently registered SIP devices.
Returns:
List of dicts with registration info:
[{"uri": "sip:robert@...", "contact": "...", "expires": 3600}, ...]
"""
...
# ================================================================
# Trunk Status
# ================================================================
@abc.abstractmethod
async def get_trunk_status(self) -> dict:
"""
Get SIP trunk registration status.
Returns:
{"registered": True/False, "host": "...", "transport": "..."}
"""
...
class MockSIPEngine(SIPEngine):
"""
Mock SIP engine for testing.
Simulates call lifecycle without any real SIP stack.
"""
def __init__(self):
self._ready = False
self._call_counter = 0
self._active_legs: dict[str, dict] = {}
self._bridges: dict[str, tuple[str, str]] = {}
self._registered_devices: list[dict] = []
async def start(self) -> None:
self._ready = True
async def stop(self) -> None:
self._active_legs.clear()
self._bridges.clear()
self._ready = False
async def is_ready(self) -> bool:
return self._ready
async def make_call(self, number: str, caller_id: Optional[str] = None) -> str:
self._call_counter += 1
leg_id = f"mock_leg_{self._call_counter}"
self._active_legs[leg_id] = {
"number": number,
"caller_id": caller_id,
"state": "ringing",
}
return leg_id
async def hangup(self, call_leg_id: str) -> None:
self._active_legs.pop(call_leg_id, None)
async def send_dtmf(self, call_leg_id: str, digits: str) -> None:
if call_leg_id in self._active_legs:
self._active_legs[call_leg_id].setdefault("dtmf_sent", []).append(digits)
async def call_device(self, device: Device) -> str:
self._call_counter += 1
leg_id = f"mock_device_leg_{self._call_counter}"
self._active_legs[leg_id] = {
"device_id": device.id,
"device_name": device.name,
"state": "ringing",
}
return leg_id
async def bridge_calls(self, leg_a: str, leg_b: str) -> str:
bridge_id = f"bridge_{leg_a}_{leg_b}"
self._bridges[bridge_id] = (leg_a, leg_b)
return bridge_id
async def unbridge(self, bridge_id: str) -> None:
self._bridges.pop(bridge_id, None)
async def get_audio_stream(self, call_leg_id: str):
"""Yield empty audio frames for testing."""
import asyncio
for _ in range(10):
yield b"\x00" * 3200 # 100ms of silence at 16kHz 16-bit mono
await asyncio.sleep(0.1)
async def get_registered_devices(self) -> list[dict]:
return self._registered_devices
async def get_trunk_status(self) -> dict:
return {
"registered": False,
"host": None,
"transport": None,
"mock": True,
"reason": "No SIP trunk configured (mock mode)",
}

780
core/sippy_engine.py Normal file
View File

@@ -0,0 +1,780 @@
"""
Sippy Engine — SIP signaling via Sippy B2BUA.
Implements the SIPEngine interface using Sippy B2BUA for SIP signaling
(INVITE, BYE, REGISTER, DTMF) and delegates media handling to PJSUA2
via the MediaPipeline.
Architecture:
Sippy B2BUA → SIP signaling (call control, registration, DTMF)
PJSUA2 → Media anchor (conference bridge, audio tapping, recording)
Sippy B2BUA runs in its own thread (it has its own event loop).
We bridge async/sync via run_in_executor.
"""
import asyncio
import logging
import threading
import uuid
from typing import Any, Callable, Optional
from core.sip_engine import SIPEngine
from models.device import Device, DeviceType
logger = logging.getLogger(__name__)
# ================================================================
# Sippy B2BUA Wrapper Types
# ================================================================
class SipCallLeg:
"""Tracks a single SIP call leg managed by Sippy."""
def __init__(self, leg_id: str, direction: str, remote_uri: str):
self.leg_id = leg_id
self.direction = direction # "outbound" or "inbound"
self.remote_uri = remote_uri
self.state = "init" # init, trying, ringing, connected, terminated
self.sippy_ua = None # Sippy UA object reference
self.media_port: Optional[int] = None # PJSUA2 conf bridge port
self.dtmf_buffer: list[str] = []
def __repr__(self):
return f"<SipCallLeg {self.leg_id} {self.direction} {self.state}{self.remote_uri}>"
class SipBridge:
"""Two call legs bridged together."""
def __init__(self, bridge_id: str, leg_a: str, leg_b: str):
self.bridge_id = bridge_id
self.leg_a = leg_a
self.leg_b = leg_b
def __repr__(self):
return f"<SipBridge {self.bridge_id}: {self.leg_a}{self.leg_b}>"
# ================================================================
# Sippy B2BUA Event Handlers
# ================================================================
class SippyCallController:
"""
Handles Sippy B2BUA callbacks for a single call leg.
Sippy B2BUA uses a callback model — when SIP events happen
(180 Ringing, 200 OK, BYE, etc.), the corresponding method
is called on this controller.
"""
def __init__(self, leg: SipCallLeg, engine: "SippyEngine"):
self.leg = leg
self.engine = engine
def on_trying(self):
"""100 Trying received."""
self.leg.state = "trying"
logger.debug(f" {self.leg.leg_id}: 100 Trying")
def on_ringing(self, ringing_code: int = 180):
"""180 Ringing / 183 Session Progress received."""
self.leg.state = "ringing"
logger.info(f" {self.leg.leg_id}: {ringing_code} Ringing")
if self.engine._on_leg_state_change:
self.engine._loop.call_soon_threadsafe(
self.engine._on_leg_state_change, self.leg.leg_id, "ringing"
)
def on_connected(self, sdp_body: Optional[str] = None):
"""200 OK — call connected, media negotiated."""
self.leg.state = "connected"
logger.info(f" {self.leg.leg_id}: Connected")
# Extract remote RTP endpoint from SDP for PJSUA2 media bridge
if sdp_body and self.engine.media_pipeline:
try:
remote_rtp = self.engine._parse_sdp_rtp_endpoint(sdp_body)
if remote_rtp:
port = self.engine.media_pipeline.add_remote_stream(
self.leg.leg_id,
remote_rtp["host"],
remote_rtp["port"],
remote_rtp["codec"],
)
self.leg.media_port = port
except Exception as e:
logger.error(f" Failed to set up media for {self.leg.leg_id}: {e}")
if self.engine._on_leg_state_change:
self.engine._loop.call_soon_threadsafe(
self.engine._on_leg_state_change, self.leg.leg_id, "connected"
)
def on_disconnected(self, reason: str = ""):
"""BYE received or call terminated."""
self.leg.state = "terminated"
logger.info(f" {self.leg.leg_id}: Disconnected ({reason})")
# Clean up media
if self.engine.media_pipeline and self.leg.media_port is not None:
try:
self.engine.media_pipeline.remove_stream(self.leg.leg_id)
except Exception as e:
logger.error(f" Failed to clean up media for {self.leg.leg_id}: {e}")
if self.engine._on_leg_state_change:
self.engine._loop.call_soon_threadsafe(
self.engine._on_leg_state_change, self.leg.leg_id, "terminated"
)
def on_dtmf(self, digit: str):
"""DTMF digit received (RFC 2833 or SIP INFO)."""
self.leg.dtmf_buffer.append(digit)
logger.debug(f" {self.leg.leg_id}: DTMF '{digit}'")
# ================================================================
# Main Engine
# ================================================================
class SippyEngine(SIPEngine):
"""
SIP engine using Sippy B2BUA for signaling.
Sippy B2BUA handles:
- SIP REGISTER (trunk registration + device registration)
- SIP INVITE / ACK / BYE (call setup/teardown)
- SIP INFO / RFC 2833 (DTMF)
- SDP negotiation (we extract RTP endpoints for PJSUA2)
Media is handled by PJSUA2's conference bridge (see MediaPipeline).
Sippy only needs to know about SDP — PJSUA2 handles the actual RTP.
"""
def __init__(
self,
sip_address: str = "0.0.0.0",
sip_port: int = 5060,
trunk_host: str = "",
trunk_port: int = 5060,
trunk_username: str = "",
trunk_password: str = "",
trunk_transport: str = "udp",
domain: str = "gateway.local",
did: str = "",
media_pipeline=None, # MediaPipeline instance
on_leg_state_change: Optional[Callable] = None,
on_device_registered: Optional[Callable] = None,
):
# SIP config
self._sip_address = sip_address
self._sip_port = sip_port
self._trunk_host = trunk_host
self._trunk_port = trunk_port
self._trunk_username = trunk_username
self._trunk_password = trunk_password
self._trunk_transport = trunk_transport
self._domain = domain
self._did = did
# Media pipeline (PJSUA2)
self.media_pipeline = media_pipeline
# Callbacks for async state changes
self._on_leg_state_change = on_leg_state_change
self._on_device_registered = on_device_registered
self._loop: Optional[asyncio.AbstractEventLoop] = None
# State
self._ready = False
self._trunk_registered = False
self._legs: dict[str, SipCallLeg] = {}
self._bridges: dict[str, SipBridge] = {}
self._registered_devices: list[dict] = []
# Sippy B2BUA internals (set during start)
self._sippy_global_config: dict[str, Any] = {}
self._sippy_thread: Optional[threading.Thread] = None
# ================================================================
# Lifecycle
# ================================================================
async def start(self) -> None:
"""Start the Sippy B2BUA SIP stack."""
self._loop = asyncio.get_running_loop()
logger.info("🔌 Starting Sippy B2BUA SIP engine...")
try:
from sippy.SipConf import SipConf
from sippy.SipTransactionManager import SipTransactionManager
# Configure Sippy
SipConf.my_address = self._sip_address
SipConf.my_port = self._sip_port
SipConf.my_uaname = "Hold Slayer Gateway"
self._sippy_global_config = {
"_sip_address": self._sip_address,
"_sip_port": self._sip_port,
"_sip_tm": None, # Transaction manager set after start
}
# Start Sippy's SIP transaction manager in a background thread
# Sippy uses its own event loop (Twisted reactor or custom loop)
self._sippy_thread = threading.Thread(
target=self._run_sippy_loop,
name="sippy-b2bua",
daemon=True,
)
self._sippy_thread.start()
# Register with trunk
if self._trunk_host:
await self._register_trunk()
self._ready = True
logger.info(
f"🔌 Sippy B2BUA ready on {self._sip_address}:{self._sip_port}"
)
except ImportError:
logger.warning(
"⚠️ Sippy B2BUA not installed — falling back to mock mode. "
"Install with: pip install sippy"
)
self._ready = True
self._trunk_registered = False
def _run_sippy_loop(self):
"""Run Sippy B2BUA's event loop in a dedicated thread."""
try:
from sippy.SipTransactionManager import SipTransactionManager
from sippy.Timeout import Timeout
# Initialize Sippy's transaction manager
stm = SipTransactionManager(self._sippy_global_config, self._handle_sippy_request)
self._sippy_global_config["_sip_tm"] = stm
logger.info(" Sippy transaction manager started")
# Sippy will block here in its event loop
# For the Twisted-based version, this runs the reactor
# For the asyncore version, this runs asyncore.loop()
from sippy.Core.EventDispatcher import ED
ED.loop()
except Exception as e:
logger.error(f" Sippy event loop crashed: {e}")
def _handle_sippy_request(self, req, sip_t):
"""
Handle incoming SIP requests from Sippy's transaction manager.
This is called in Sippy's thread for incoming INVITEs, etc.
"""
method = req.getMethod()
logger.info(f" Incoming SIP {method}")
if method == "INVITE":
self._handle_incoming_invite(req, sip_t)
elif method == "REGISTER":
self._handle_incoming_register(req, sip_t)
elif method == "BYE":
self._handle_incoming_bye(req, sip_t)
elif method == "INFO":
self._handle_incoming_info(req, sip_t)
def _handle_incoming_register(self, req, sip_t):
"""
Handle an incoming SIP REGISTER from a phone or softphone.
Extracts the AOR (address of record) from the To header, records
the contact and expiry, and sends a 200 OK. The gateway's
register_device() is called asynchronously via the event loop so
the phone gets an extension and SIP URI assigned automatically.
"""
try:
to_uri = str(req.getHFBody("to").getUri())
contact_hf = req.getHFBody("contact")
contact_uri = str(contact_hf.getUri()) if contact_hf else to_uri
expires_hf = req.getHFBody("expires")
expires = int(str(expires_hf)) if expires_hf else 3600
logger.info(f" SIP REGISTER: {to_uri} contact={contact_uri} expires={expires}")
if expires == 0:
# De-registration
self._registered_devices = [
d for d in self._registered_devices
if d.get("aor") != to_uri
]
logger.info(f" De-registered: {to_uri}")
else:
# Update or add registration record
existing = next(
(d for d in self._registered_devices if d.get("aor") == to_uri),
None,
)
if existing:
existing["contact"] = contact_uri
existing["expires"] = expires
else:
self._registered_devices.append({
"aor": to_uri,
"contact": contact_uri,
"expires": expires,
})
# Notify the gateway (async) so it can assign an extension
if self._loop:
self._loop.call_soon_threadsafe(
self._loop.create_task,
self._notify_registration(to_uri, contact_uri, expires),
)
# Reply 200 OK
req.sendResponse(200, "OK")
except Exception as e:
logger.error(f" REGISTER handling failed: {e}")
try:
req.sendResponse(500, "Server Error")
except Exception:
pass
async def _notify_registration(self, aor: str, contact: str, expires: int):
"""
Async callback: tell the gateway about the newly registered device
so it can assign an extension if needed.
"""
if self._on_device_registered:
await self._on_device_registered(aor, contact, expires)
def _handle_incoming_invite(self, req, sip_t):
"""Handle an incoming INVITE — create inbound call leg."""
from_uri = str(req.getHFBody("from").getUri())
to_uri = str(req.getHFBody("to").getUri())
leg_id = f"leg_{uuid.uuid4().hex[:12]}"
leg = SipCallLeg(leg_id, "inbound", from_uri)
leg.sippy_ua = sip_t.ua if hasattr(sip_t, "ua") else None
self._legs[leg_id] = leg
logger.info(f" Incoming call: {from_uri}{to_uri} (leg: {leg_id})")
# Auto-answer for now (gateway always answers)
# In production, this would check routing rules
controller = SippyCallController(leg, self)
controller.on_connected(str(req.getBody()) if req.getBody() else None)
def _handle_incoming_bye(self, req, sip_t):
"""Handle incoming BYE — tear down call leg."""
# Find the leg by Sippy's UA object
for leg in self._legs.values():
if leg.sippy_ua and hasattr(sip_t, "ua") and leg.sippy_ua == sip_t.ua:
controller = SippyCallController(leg, self)
controller.on_disconnected("BYE received")
break
def _handle_incoming_info(self, req, sip_t):
"""Handle SIP INFO (DTMF via SIP INFO method)."""
body = str(req.getBody()) if req.getBody() else ""
if "dtmf" in body.lower() or "Signal=" in body:
# Extract DTMF digit from SIP INFO body
for line in body.split("\n"):
if line.startswith("Signal="):
digit = line.split("=")[1].strip()
for leg in self._legs.values():
if leg.sippy_ua and hasattr(sip_t, "ua") and leg.sippy_ua == sip_t.ua:
controller = SippyCallController(leg, self)
controller.on_dtmf(digit)
break
async def _register_trunk(self) -> None:
"""Register with the SIP trunk provider."""
try:
from sippy.UA import UA
from sippy.SipRegistrationAgent import SipRegistrationAgent
logger.info(f" Registering with trunk: {self._trunk_host}:{self._trunk_port}")
# Run registration in Sippy's thread
def do_register():
try:
reg_agent = SipRegistrationAgent(
self._sippy_global_config,
f"sip:{self._trunk_username}@{self._trunk_host}",
f"sip:{self._trunk_host}:{self._trunk_port}",
auth_name=self._trunk_username,
auth_password=self._trunk_password,
)
reg_agent.register()
self._trunk_registered = True
logger.info(" ✅ Trunk registration sent")
except Exception as e:
logger.error(f" ❌ Trunk registration failed: {e}")
self._trunk_registered = False
await asyncio.get_event_loop().run_in_executor(None, do_register)
except ImportError:
logger.warning(" Sippy registration agent not available")
self._trunk_registered = False
async def stop(self) -> None:
"""Gracefully shut down the SIP engine."""
logger.info("🔌 Stopping Sippy B2BUA...")
# Hang up all active legs
for leg_id in list(self._legs.keys()):
try:
await self.hangup(leg_id)
except Exception as e:
logger.error(f" Error hanging up {leg_id}: {e}")
# Stop Sippy's event loop
try:
from sippy.Core.EventDispatcher import ED
ED.breakLoop()
except Exception:
pass
if self._sippy_thread and self._sippy_thread.is_alive():
self._sippy_thread.join(timeout=5.0)
self._ready = False
self._trunk_registered = False
logger.info("🔌 Sippy B2BUA stopped")
async def is_ready(self) -> bool:
return self._ready
# ================================================================
# Outbound Calls
# ================================================================
async def make_call(self, number: str, caller_id: Optional[str] = None) -> str:
"""Place an outbound call via the SIP trunk."""
if not self._ready:
raise RuntimeError("SIP engine not ready")
leg_id = f"leg_{uuid.uuid4().hex[:12]}"
# Build SIP URI for the remote party via trunk
if self._trunk_host:
remote_uri = f"sip:{number}@{self._trunk_host}:{self._trunk_port}"
else:
remote_uri = f"sip:{number}@{self._domain}"
from_uri = f"sip:{caller_id or self._did}@{self._domain}"
leg = SipCallLeg(leg_id, "outbound", remote_uri)
self._legs[leg_id] = leg
logger.info(f"📞 Placing call: {from_uri}{remote_uri} (leg: {leg_id})")
# Place the call via Sippy
def do_invite():
try:
from sippy.UA import UA
from sippy.SipCallId import SipCallId
from sippy.CCEvents import CCEventTry
controller = SippyCallController(leg, self)
# Create Sippy UA for this call
ua = UA(
self._sippy_global_config,
event_cb=controller,
nh_address=(self._trunk_host, self._trunk_port),
)
leg.sippy_ua = ua
# Generate SDP for the call
sdp_body = self._generate_sdp(leg_id)
# Send INVITE
event = CCEventTry(
(SipCallId(), from_uri, remote_uri),
body=sdp_body,
)
ua.recvEvent(event)
leg.state = "trying"
logger.info(f" INVITE sent for {leg_id}")
except ImportError:
# Sippy not installed — simulate for development
logger.warning(f" Sippy not installed, simulating call for {leg_id}")
leg.state = "ringing"
except Exception as e:
logger.error(f" Failed to send INVITE for {leg_id}: {e}")
leg.state = "terminated"
await asyncio.get_event_loop().run_in_executor(None, do_invite)
return leg_id
async def hangup(self, call_leg_id: str) -> None:
"""Hang up a call leg."""
leg = self._legs.get(call_leg_id)
if not leg:
logger.warning(f" Cannot hangup: leg {call_leg_id} not found")
return
def do_bye():
try:
if leg.sippy_ua:
from sippy.CCEvents import CCEventDisconnect
leg.sippy_ua.recvEvent(CCEventDisconnect())
except Exception as e:
logger.error(f" Error sending BYE for {call_leg_id}: {e}")
finally:
leg.state = "terminated"
await asyncio.get_event_loop().run_in_executor(None, do_bye)
# Clean up media
if self.media_pipeline and leg.media_port is not None:
self.media_pipeline.remove_stream(call_leg_id)
# Remove from tracking
self._legs.pop(call_leg_id, None)
# Clean up any bridges this leg was part of
for bridge_id, bridge in list(self._bridges.items()):
if bridge.leg_a == call_leg_id or bridge.leg_b == call_leg_id:
self._bridges.pop(bridge_id, None)
async def send_dtmf(self, call_leg_id: str, digits: str) -> None:
"""Send DTMF tones on a call leg."""
leg = self._legs.get(call_leg_id)
if not leg:
raise ValueError(f"Call leg {call_leg_id} not found")
logger.info(f" 📱 Sending DTMF '{digits}' on {call_leg_id}")
def do_dtmf():
try:
if leg.sippy_ua:
# Send via RFC 2833 (in-band RTP event)
# Sippy handles this through the UA's DTMF sender
for digit in digits:
from sippy.CCEvents import CCEventInfo
body = f"Signal={digit}\r\nDuration=160\r\n"
leg.sippy_ua.recvEvent(CCEventInfo(body=body))
else:
logger.warning(f" No UA for {call_leg_id}, DTMF not sent")
except ImportError:
logger.warning(f" Sippy not installed, DTMF simulated: {digits}")
except Exception as e:
logger.error(f" DTMF send error: {e}")
await asyncio.get_event_loop().run_in_executor(None, do_dtmf)
# ================================================================
# Device Calls (for transfer)
# ================================================================
async def call_device(self, device: Device) -> str:
"""Place a call to a registered device."""
if device.type in (DeviceType.SIP_PHONE, DeviceType.SOFTPHONE, DeviceType.WEBRTC):
if not device.sip_uri:
raise ValueError(f"Device {device.id} has no SIP URI")
# Direct SIP call to device's registered contact
return await self._call_sip_device(device)
elif device.type == DeviceType.CELL:
if not device.phone_number:
raise ValueError(f"Device {device.id} has no phone number")
# Call cell phone via trunk
return await self.make_call(device.phone_number)
else:
raise ValueError(f"Unsupported device type: {device.type}")
async def _call_sip_device(self, device: Device) -> str:
"""Place a direct SIP call to a registered device."""
leg_id = f"leg_{uuid.uuid4().hex[:12]}"
leg = SipCallLeg(leg_id, "outbound", device.sip_uri)
self._legs[leg_id] = leg
logger.info(f"📱 Calling device: {device.name} ({device.sip_uri}) (leg: {leg_id})")
def do_invite_device():
try:
from sippy.UA import UA
from sippy.CCEvents import CCEventTry
from sippy.SipCallId import SipCallId
controller = SippyCallController(leg, self)
# Parse device SIP URI for routing
# sip:robert@192.168.1.100:5060
uri_parts = device.sip_uri.replace("sip:", "").split("@")
if len(uri_parts) == 2:
host_parts = uri_parts[1].split(":")
host = host_parts[0]
port = int(host_parts[1]) if len(host_parts) > 1 else 5060
else:
host = self._domain
port = 5060
ua = UA(
self._sippy_global_config,
event_cb=controller,
nh_address=(host, port),
)
leg.sippy_ua = ua
sdp_body = self._generate_sdp(leg_id)
event = CCEventTry(
(SipCallId(), f"sip:gateway@{self._domain}", device.sip_uri),
body=sdp_body,
)
ua.recvEvent(event)
leg.state = "trying"
except ImportError:
logger.warning(f" Sippy not installed, simulating device call for {leg_id}")
leg.state = "ringing"
except Exception as e:
logger.error(f" Failed to call device {device.name}: {e}")
leg.state = "terminated"
await asyncio.get_event_loop().run_in_executor(None, do_invite_device)
return leg_id
# ================================================================
# Conference Bridge / Media
# ================================================================
async def bridge_calls(self, leg_a: str, leg_b: str) -> str:
"""Bridge two call legs together via PJSUA2 conference bridge."""
bridge_id = f"bridge_{uuid.uuid4().hex[:8]}"
leg_a_obj = self._legs.get(leg_a)
leg_b_obj = self._legs.get(leg_b)
if not leg_a_obj or not leg_b_obj:
raise ValueError(f"One or both legs not found: {leg_a}, {leg_b}")
logger.info(f"🔗 Bridging {leg_a}{leg_b} (bridge: {bridge_id})")
if self.media_pipeline:
# Use PJSUA2 conference bridge for actual media bridging
self.media_pipeline.bridge_streams(leg_a, leg_b)
else:
logger.warning(" No media pipeline — bridge is signaling-only")
self._bridges[bridge_id] = SipBridge(bridge_id, leg_a, leg_b)
return bridge_id
async def unbridge(self, bridge_id: str) -> None:
"""Remove a bridge."""
bridge = self._bridges.pop(bridge_id, None)
if bridge and self.media_pipeline:
self.media_pipeline.unbridge_streams(bridge.leg_a, bridge.leg_b)
def get_audio_stream(self, call_leg_id: str):
"""
Get a real-time audio stream from a call leg.
Taps into PJSUA2's conference bridge to get audio frames
for classification and transcription.
"""
if self.media_pipeline:
return self.media_pipeline.get_audio_tap(call_leg_id)
else:
# Fallback: yield silence frames
return self._silence_stream()
async def _silence_stream(self):
"""Yield silence frames when no media pipeline is available."""
for _ in range(100):
yield b"\x00" * 3200 # 100ms of silence at 16kHz 16-bit mono
await asyncio.sleep(0.1)
# ================================================================
# Registration
# ================================================================
async def get_registered_devices(self) -> list[dict]:
"""Get list of currently registered SIP devices."""
return list(self._registered_devices)
# ================================================================
# Trunk Status
# ================================================================
async def get_trunk_status(self) -> dict:
"""Get SIP trunk registration status."""
return {
"registered": self._trunk_registered,
"host": self._trunk_host or "not configured",
"port": self._trunk_port,
"transport": self._trunk_transport,
"username": self._trunk_username,
"active_legs": len(self._legs),
"active_bridges": len(self._bridges),
}
# ================================================================
# SDP Helpers
# ================================================================
def _generate_sdp(self, leg_id: str) -> str:
"""
Generate SDP body for a call.
If MediaPipeline is available, get the actual RTP listen address
from PJSUA2. Otherwise, generate a basic SDP.
"""
if self.media_pipeline:
rtp_port = self.media_pipeline.allocate_rtp_port(leg_id)
rtp_host = self._sip_address if self._sip_address != "0.0.0.0" else "127.0.0.1"
else:
rtp_port = 10000 + (hash(leg_id) % 50000)
rtp_host = self._sip_address if self._sip_address != "0.0.0.0" else "127.0.0.1"
return (
f"v=0\r\n"
f"o=holdslayer 0 0 IN IP4 {rtp_host}\r\n"
f"s=Hold Slayer Gateway\r\n"
f"c=IN IP4 {rtp_host}\r\n"
f"t=0 0\r\n"
f"m=audio {rtp_port} RTP/AVP 0 8 101\r\n"
f"a=rtpmap:0 PCMU/8000\r\n"
f"a=rtpmap:8 PCMA/8000\r\n"
f"a=rtpmap:101 telephone-event/8000\r\n"
f"a=fmtp:101 0-16\r\n"
f"a=sendrecv\r\n"
)
@staticmethod
def _parse_sdp_rtp_endpoint(sdp: str) -> Optional[dict]:
"""Extract RTP host/port/codec from SDP body."""
host = None
port = None
codec = "PCMU"
for line in sdp.split("\n"):
line = line.strip()
if line.startswith("c=IN IP4 "):
host = line.split(" ")[-1]
elif line.startswith("m=audio "):
parts = line.split(" ")
if len(parts) >= 2:
port = int(parts[1])
# First codec in the list
if len(parts) >= 4:
payload_type = parts[3]
codec_map = {"0": "PCMU", "8": "PCMA", "18": "G729"}
codec = codec_map.get(payload_type, "PCMU")
if host and port:
return {"host": host, "port": port, "codec": codec}
return None

1
db/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Database layer — PostgreSQL connection, ORM models, and migrations."""

181
db/database.py Normal file
View File

@@ -0,0 +1,181 @@
"""
Database connection and session management.
PostgreSQL via asyncpg + SQLAlchemy async.
"""
from datetime import datetime
from sqlalchemy import (
JSON,
Column,
DateTime,
Float,
Integer,
String,
Text,
func,
)
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
from sqlalchemy.orm import DeclarativeBase
from config import get_settings
class Base(DeclarativeBase):
"""SQLAlchemy declarative base for all ORM models."""
pass
# ============================================================
# ORM Models
# ============================================================
class CallRecord(Base):
__tablename__ = "call_records"
id = Column(String, primary_key=True)
direction = Column(String, nullable=False) # inbound / outbound
remote_number = Column(String, index=True, nullable=False)
status = Column(String, nullable=False) # completed / missed / failed / active / on_hold
mode = Column(String, nullable=False) # direct / hold_slayer / ai_assisted
intent = Column(Text) # What the user wanted (for hold_slayer)
started_at = Column(DateTime, default=func.now())
ended_at = Column(DateTime, nullable=True)
duration = Column(Integer, default=0) # seconds
hold_time = Column(Integer, default=0) # seconds spent on hold
device_used = Column(String)
recording_path = Column(String, nullable=True)
transcript = Column(Text, nullable=True)
summary = Column(Text, nullable=True)
action_items = Column(JSON, nullable=True)
sentiment = Column(String, nullable=True)
call_flow_id = Column(String, nullable=True) # which flow was used
classification_timeline = Column(JSON, nullable=True) # [{time, type, confidence}, ...]
metadata_ = Column("metadata", JSON, nullable=True)
def __repr__(self) -> str:
return f"<CallRecord {self.id} {self.remote_number} {self.status}>"
class StoredCallFlow(Base):
__tablename__ = "call_flows"
id = Column(String, primary_key=True)
name = Column(String, nullable=False)
phone_number = Column(String, index=True, nullable=False)
description = Column(Text)
steps = Column(JSON, nullable=False) # Serialized list[CallFlowStep]
last_verified = Column(DateTime, nullable=True)
avg_hold_time = Column(Integer, nullable=True)
success_rate = Column(Float, nullable=True)
times_used = Column(Integer, default=0)
last_used = Column(DateTime, nullable=True)
notes = Column(Text, nullable=True)
tags = Column(JSON, default=list)
created_at = Column(DateTime, default=func.now())
updated_at = Column(DateTime, default=func.now(), onupdate=func.now())
def __repr__(self) -> str:
return f"<StoredCallFlow {self.id} {self.phone_number}>"
class Contact(Base):
__tablename__ = "contacts"
id = Column(String, primary_key=True)
name = Column(String, nullable=False)
phone_numbers = Column(JSON, nullable=False) # [{number, label, primary}, ...]
category = Column(String) # personal / business / service
routing_preference = Column(String, nullable=True) # how to handle their calls
notes = Column(Text, nullable=True)
call_count = Column(Integer, default=0)
last_call = Column(DateTime, nullable=True)
created_at = Column(DateTime, default=func.now())
updated_at = Column(DateTime, default=func.now(), onupdate=func.now())
def __repr__(self) -> str:
return f"<Contact {self.id} {self.name}>"
class Device(Base):
__tablename__ = "devices"
id = Column(String, primary_key=True)
name = Column(String, nullable=False) # "Office SIP Phone"
type = Column(String, nullable=False) # sip_phone / cell / tablet / softphone
sip_uri = Column(String, nullable=True) # sip:robert@gateway.helu.ca
phone_number = Column(String, nullable=True) # For PSTN devices
priority = Column(Integer, default=10) # Routing priority (lower = higher priority)
is_online = Column(String, default="false")
capabilities = Column(JSON, default=list) # ["voice", "video", "sms"]
last_seen = Column(DateTime, nullable=True)
created_at = Column(DateTime, default=func.now())
updated_at = Column(DateTime, default=func.now(), onupdate=func.now())
def __repr__(self) -> str:
return f"<Device {self.id} {self.name} ({self.type})>"
# ============================================================
# Engine & Session
# ============================================================
_engine = None
_session_factory = None
def get_engine():
"""Get or create the async engine."""
global _engine
if _engine is None:
settings = get_settings()
_engine = create_async_engine(
settings.database_url,
echo=settings.debug,
pool_size=10,
max_overflow=20,
)
return _engine
def get_session_factory() -> async_sessionmaker[AsyncSession]:
"""Get or create the session factory."""
global _session_factory
if _session_factory is None:
_session_factory = async_sessionmaker(
get_engine(),
class_=AsyncSession,
expire_on_commit=False,
)
return _session_factory
async def get_db() -> AsyncSession:
"""Dependency: yield an async database session."""
factory = get_session_factory()
async with factory() as session:
try:
yield session
await session.commit()
except Exception:
await session.rollback()
raise
async def init_db():
"""Create all tables. For development; use Alembic migrations in production."""
engine = get_engine()
async with engine.begin() as conn:
await conn.run_sync(Base.metadata.create_all)
async def close_db():
"""Close the database engine."""
global _engine, _session_factory
if _engine is not None:
await _engine.dispose()
_engine = None
_session_factory = None

18
docs/README.md Normal file
View File

@@ -0,0 +1,18 @@
# Hold Slayer Documentation
Comprehensive documentation for the Hold Slayer AI telephony gateway.
## Contents
| Document | Description |
|----------|-------------|
| [Architecture](architecture.md) | System architecture, component diagram, data flow |
| [Core Engine](core-engine.md) | SIP engine, media pipeline, call manager, event bus |
| [Hold Slayer Service](hold-slayer-service.md) | IVR navigation, hold detection, human detection, transfer |
| [Audio Classifier](audio-classifier.md) | Waveform analysis, feature extraction, classification logic |
| [Services](services.md) | LLM client, transcription, recording, analytics, notifications |
| [Call Flows](call-flows.md) | Call flow model, step types, learner, CRUD API |
| [API Reference](api-reference.md) | REST endpoints, WebSocket protocol, request/response schemas |
| [MCP Server](mcp-server.md) | MCP tools and resources for AI assistant integration |
| [Configuration](configuration.md) | Environment variables, settings, deployment options |
| [Development](development.md) | Setup, testing, contributing, project conventions |

378
docs/api-reference.md Normal file
View File

@@ -0,0 +1,378 @@
# API Reference
Hold Slayer exposes a REST API, WebSocket endpoint, and MCP server.
## REST API
Base URL: `http://localhost:8000/api`
### Calls
#### Place an Outbound Call
```
POST /api/calls/outbound
```
**Request:**
```json
{
"number": "+18005551234",
"mode": "hold_slayer",
"intent": "dispute Amazon charge from December 15th",
"device": "sip_phone",
"call_flow_id": "chase_bank_disputes",
"services": {
"recording": true,
"transcription": true
}
}
```
**Call Modes:**
| Mode | Description |
|------|-------------|
| `direct` | Dial and connect to your device immediately |
| `hold_slayer` | Navigate IVR, wait on hold, transfer when human detected |
| `ai_assisted` | Connect with noise cancel, transcription, recording |
**Response:**
```json
{
"call_id": "call_abc123",
"status": "trying",
"number": "+18005551234",
"mode": "hold_slayer",
"started_at": "2026-01-15T10:30:00Z"
}
```
#### Launch Hold Slayer
```
POST /api/calls/hold-slayer
```
Convenience endpoint — equivalent to `POST /outbound` with `mode=hold_slayer`.
**Request:**
```json
{
"number": "+18005551234",
"intent": "dispute Amazon charge from December 15th",
"call_flow_id": "chase_bank_disputes",
"transfer_to": "sip_phone"
}
```
#### Get Call Status
```
GET /api/calls/{call_id}
```
**Response:**
```json
{
"call_id": "call_abc123",
"status": "on_hold",
"number": "+18005551234",
"mode": "hold_slayer",
"duration": 847,
"hold_time": 780,
"audio_type": "music",
"transcript_excerpt": "...your call is important to us...",
"classification_history": [
{"timestamp": 1706000000, "type": "ringing", "confidence": 0.95},
{"timestamp": 1706000003, "type": "ivr_prompt", "confidence": 0.88},
{"timestamp": 1706000010, "type": "music", "confidence": 0.92}
],
"services": {"recording": true, "transcription": true}
}
```
#### List Active Calls
```
GET /api/calls
```
**Response:**
```json
{
"calls": [
{"call_id": "call_abc123", "status": "on_hold", "number": "+18005551234", "duration": 847},
{"call_id": "call_def456", "status": "connected", "number": "+18009876543", "duration": 120}
],
"total": 2
}
```
#### End a Call
```
POST /api/calls/{call_id}/hangup
```
#### Transfer a Call
```
POST /api/calls/{call_id}/transfer
```
**Request:**
```json
{
"device": "sip_phone"
}
```
### Call Flows
#### List Call Flows
```
GET /api/call-flows
GET /api/call-flows?company=Chase+Bank
GET /api/call-flows?tag=banking
```
**Response:**
```json
{
"flows": [
{
"id": "chase_bank_disputes",
"name": "Chase Bank — Disputes",
"company": "Chase Bank",
"phone_number": "+18005551234",
"step_count": 7,
"success_count": 12,
"fail_count": 1,
"tags": ["banking", "disputes"]
}
]
}
```
#### Get Call Flow
```
GET /api/call-flows/{flow_id}
```
Returns the full call flow with all steps.
#### Create Call Flow
```
POST /api/call-flows
```
**Request:**
```json
{
"name": "Chase Bank — Disputes",
"company": "Chase Bank",
"phone_number": "+18005551234",
"steps": [
{"id": "wait", "type": "WAIT", "description": "Wait for greeting", "timeout": 5.0, "next_step": "menu"},
{"id": "menu", "type": "LISTEN", "description": "Main menu", "next_step": "press3"},
{"id": "press3", "type": "DTMF", "description": "Account services", "dtmf": "3", "next_step": "hold"},
{"id": "hold", "type": "HOLD", "description": "Wait for agent", "next_step": "transfer"},
{"id": "transfer", "type": "TRANSFER", "description": "Connect to user"}
]
}
```
#### Update Call Flow
```
PUT /api/call-flows/{flow_id}
```
#### Delete Call Flow
```
DELETE /api/call-flows/{flow_id}
```
### Devices
#### List Registered Devices
```
GET /api/devices
```
**Response:**
```json
{
"devices": [
{
"id": "dev_001",
"name": "Office SIP Phone",
"type": "sip_phone",
"sip_uri": "sip:robert@gateway.helu.ca",
"is_online": true,
"priority": 10
}
]
}
```
#### Register a Device
```
POST /api/devices
```
**Request:**
```json
{
"name": "Office SIP Phone",
"type": "sip_phone",
"sip_uri": "sip:robert@gateway.helu.ca",
"priority": 10,
"capabilities": ["voice"]
}
```
#### Update Device
```
PUT /api/devices/{device_id}
```
#### Remove Device
```
DELETE /api/devices/{device_id}
```
### Error Responses
All errors follow a consistent format:
```json
{
"detail": "Call not found: call_xyz789"
}
```
| Status Code | Meaning |
|-------------|---------|
| `400` | Bad request (invalid parameters) |
| `404` | Resource not found (call, flow, device) |
| `409` | Conflict (call already ended, device already registered) |
| `500` | Internal server error |
## WebSocket
### Event Stream
```
ws://localhost:8000/ws/events
ws://localhost:8000/ws/events?call_id=call_abc123
ws://localhost:8000/ws/events?types=human_detected,hold_detected
```
**Query Parameters:**
| Param | Description |
|-------|-------------|
| `call_id` | Filter events for a specific call |
| `types` | Comma-separated event types to receive |
**Event Format:**
```json
{
"type": "hold_detected",
"call_id": "call_abc123",
"timestamp": "2026-01-15T10:35:00Z",
"data": {
"audio_type": "music",
"confidence": 0.92,
"hold_duration": 0
}
}
```
### Event Types
| Type | Data Fields |
|------|------------|
| `call_started` | `number`, `mode`, `intent` |
| `call_ringing` | `number` |
| `call_connected` | `number`, `duration` |
| `call_ended` | `number`, `duration`, `reason` |
| `call_failed` | `number`, `error` |
| `hold_detected` | `audio_type`, `confidence` |
| `human_detected` | `confidence`, `transcript_excerpt` |
| `transfer_started` | `device`, `from_call_id` |
| `transfer_complete` | `device`, `bridge_id` |
| `ivr_step` | `step_id`, `step_type`, `description` |
| `ivr_dtmf_sent` | `digits`, `step_id` |
| `ivr_menu_detected` | `transcript`, `options` |
| `audio_classified` | `audio_type`, `confidence`, `features` |
| `transcript_chunk` | `text`, `speaker`, `is_final` |
| `recording_started` | `recording_id`, `path` |
| `recording_stopped` | `recording_id`, `duration`, `file_size` |
### Client Example
```javascript
const ws = new WebSocket("ws://localhost:8000/ws/events");
ws.onopen = () => {
console.log("Connected to Hold Slayer events");
};
ws.onmessage = (event) => {
const data = JSON.parse(event.data);
switch (data.type) {
case "human_detected":
alert("🚨 A live person picked up! Pick up your phone!");
break;
case "hold_detected":
console.log("⏳ On hold...");
break;
case "transcript_chunk":
console.log(`📝 ${data.data.speaker}: ${data.data.text}`);
break;
}
};
ws.onerror = (error) => {
console.error("WebSocket error:", error);
};
```
### Python Client Example
```python
import asyncio
import websockets
import json
async def listen():
async with websockets.connect("ws://localhost:8000/ws/events") as ws:
async for message in ws:
event = json.loads(message)
print(f"[{event['type']}] {event.get('data', {})}")
asyncio.run(listen())
```

178
docs/architecture.md Normal file
View File

@@ -0,0 +1,178 @@
# Architecture
Hold Slayer is a single-process async Python application built on FastAPI. It acts as an intelligent B2BUA (Back-to-Back User Agent) sitting between your SIP trunk (PSTN access) and your desk phone/softphone.
## System Diagram
```
┌─────────────────────────────────────────────────────────────────┐
│ FastAPI Server │
│ │
│ ┌──────────┐ ┌──────────┐ ┌───────────┐ ┌──────────────┐ │
│ │ REST API │ │WebSocket │ │MCP Server │ │ Dashboard │ │
│ │ /api/* │ │ /ws/* │ │ (SSE) │ │ /dashboard │ │
│ └────┬─────┘ └────┬─────┘ └─────┬─────┘ └──────────────┘ │
│ │ │ │ │
│ ┌────┴──────────────┴──────────────┴────┐ │
│ │ Event Bus │ │
│ │ (asyncio Queue pub/sub per client) │ │
│ └────┬──────────────┬──────────────┬────┘ │
│ │ │ │ │
│ ┌────┴─────┐ ┌─────┴─────┐ ┌────┴──────────┐ │
│ │ Call │ │ Hold │ │ Services │ │
│ │ Manager │ │ Slayer │ │ (LLM, STT, │ │
│ │ │ │ │ │ Recording, │ │
│ │ │ │ │ │ Analytics, │ │
│ │ │ │ │ │ Notify) │ │
│ └────┬─────┘ └─────┬─────┘ └──────────────┘ │
│ │ │ │
│ ┌────┴──────────────┴───────────────────┐ │
│ │ Sippy B2BUA Engine │ │
│ │ (SIP calls, DTMF, conference bridge) │ │
│ └────┬──────────────────────────────────┘ │
│ │ │
└───────┼─────────────────────────────────────────────────────────┘
┌────┴────┐
│SIP Trunk│ ──→ PSTN
└─────────┘
```
## Component Overview
### Presentation Layer
| Component | File | Protocol | Purpose |
|-----------|------|----------|---------|
| REST API | `api/calls.py`, `api/call_flows.py`, `api/devices.py` | HTTP | Call management, CRUD, configuration |
| WebSocket | `api/websocket.py` | WS | Real-time event streaming to clients |
| MCP Server | `mcp_server/server.py` | SSE | AI assistant tool integration |
### Orchestration Layer
| Component | File | Purpose |
|-----------|------|---------|
| Gateway | `core/gateway.py` | Top-level orchestrator — owns all services, routes calls |
| Call Manager | `core/call_manager.py` | Active call state, lifecycle, transcript tracking |
| Event Bus | `core/event_bus.py` | Async pub/sub connecting everything together |
### Intelligence Layer
| Component | File | Purpose |
|-----------|------|---------|
| Hold Slayer | `services/hold_slayer.py` | IVR navigation, hold monitoring, human detection |
| Audio Classifier | `services/audio_classifier.py` | Real-time waveform analysis (music/speech/DTMF/silence) |
| LLM Client | `services/llm_client.py` | OpenAI-compatible LLM for IVR menu decisions |
| Transcription | `services/transcription.py` | Speaches/Whisper STT for live audio |
| Call Flow Learner | `services/call_flow_learner.py` | Builds reusable IVR trees from exploration data |
### Infrastructure Layer
| Component | File | Purpose |
|-----------|------|---------|
| Sippy Engine | `core/sippy_engine.py` | SIP signaling (INVITE, BYE, REGISTER, DTMF) |
| Media Pipeline | `core/media_pipeline.py` | PJSUA2 RTP media handling, conference bridge, recording |
| Recording | `services/recording.py` | WAV file management and storage |
| Analytics | `services/call_analytics.py` | Call metrics, hold time stats, trends |
| Notifications | `services/notification.py` | WebSocket + SMS alerts |
| Database | `db/database.py` | SQLAlchemy async (PostgreSQL or SQLite) |
## Data Flow — Hold Slayer Call
```
1. User Request
POST /api/calls/hold-slayer { number, intent, call_flow_id }
2. Gateway.make_call()
├── CallManager.create_call() → track state
├── SippyEngine.make_call() → SIP INVITE to trunk
└── MediaPipeline.add_stream() → RTP media setup
3. HoldSlayer.run_with_flow() or run_exploration()
├── AudioClassifier.classify() → analyze 3s audio windows
│ ├── silence? → wait
│ ├── ringing? → wait
│ ├── DTMF? → detect tones
│ ├── music? → HOLD_DETECTED event
│ └── speech? → transcribe + decide
├── TranscriptionService.transcribe() → STT on speech audio
├── LLMClient.analyze_ivr_menu() → pick menu option (fallback)
│ └── SippyEngine.send_dtmf() → press the button
└── detect_hold_to_human_transition()
└── HUMAN_DETECTED! → transfer
4. Transfer
├── SippyEngine.bridge() → connect call legs
├── MediaPipeline.bridge_streams() → bridge RTP
├── EventBus.publish(TRANSFER_STARTED)
└── NotificationService → "Pick up your phone!"
5. Real-Time Updates (throughout)
EventBus.publish() → WebSocket clients
→ MCP server resources
→ Notification service
→ Analytics tracking
```
## Threading Model
Hold Slayer is primarily single-threaded async (asyncio), with one exception:
- **Main thread**: FastAPI + all async services (event bus, hold slayer, classifier, etc.)
- **Sippy thread**: Sippy B2BUA runs its own event loop in a dedicated daemon thread. The `SippyEngine` bridges async↔sync via `asyncio.run_in_executor()`.
- **PJSUA2**: Runs in the main thread using null audio device (no sound card needed — headless server mode).
```
Main Thread (asyncio)
├── FastAPI (uvicorn)
├── EventBus
├── CallManager
├── HoldSlayer
├── AudioClassifier
├── TranscriptionService
├── LLMClient
├── MediaPipeline (PJSUA2)
├── NotificationService
└── RecordingService
Sippy Thread (daemon)
└── Sippy B2BUA event loop
├── SIP signaling
├── DTMF relay
└── Call leg management
```
## Design Decisions
### Why Sippy B2BUA + PJSUA2?
We split SIP signaling and media handling into two separate libraries:
- **Sippy B2BUA** handles SIP signaling (INVITE, BYE, REGISTER, re-INVITE, DTMF relay). It's battle-tested for telephony and handles the complex SIP state machine.
- **PJSUA2** handles RTP media (audio streams, conference bridge, recording, tone generation). It provides a clean C++/Python API for media manipulation without needing to deal with raw RTP.
This split lets us tap into the audio stream (for classification and STT) without interfering with SIP signaling, and bridge calls through a conference bridge for clean transfer.
### Why asyncio Queue-based EventBus?
- **Single process** — no need for Redis/RabbitMQ cross-process messaging
- **Zero dependencies** — pure asyncio, no external services to deploy
- **Per-subscriber queues** — slow consumers don't block fast publishers
- **Dead subscriber cleanup** — full queues are automatically removed
- **Event history** — late joiners can catch up on recent events
If scaling to multiple gateway processes becomes necessary, the EventBus interface can be backed by Redis pub/sub without changing consumers.
### Why OpenAI-compatible LLM API?
The LLM client uses raw HTTP (httpx) against any OpenAI-compatible endpoint. This means:
- **Ollama** (local, free) — `http://localhost:11434/v1`
- **LM Studio** (local, free) — `http://localhost:1234/v1`
- **vLLM** (local, fast) — `http://localhost:8000/v1`
- **OpenAI** (cloud) — `https://api.openai.com/v1`
No SDK dependency. No vendor lock-in. Switch models by changing one env var.

174
docs/audio-classifier.md Normal file
View File

@@ -0,0 +1,174 @@
# Audio Classifier
The Audio Classifier (`services/audio_classifier.py`) performs real-time waveform analysis on phone audio to determine what's happening on the call: silence, ringing, hold music, IVR prompts, DTMF tones, or live human speech.
## Classification Types
```python
class AudioClassification(str, Enum):
SILENCE = "silence" # No meaningful audio
MUSIC = "music" # Hold music
IVR_PROMPT = "ivr_prompt" # Recorded voice menu
LIVE_HUMAN = "live_human" # Live person speaking
RINGING = "ringing" # Ringback tone
DTMF = "dtmf" # Touch-tone digits
UNKNOWN = "unknown" # Can't classify
```
## Feature Extraction
Every audio frame (typically 3 seconds of 16kHz PCM) goes through feature extraction:
| Feature | What It Measures | How It's Used |
|---------|-----------------|---------------|
| **RMS Energy** | Loudness (root mean square of samples) | Silence detection — below threshold = silence |
| **Spectral Flatness** | How noise-like vs tonal the audio is (0=pure tone, 1=white noise) | Music has low flatness (tonal), speech has higher flatness |
| **Zero-Crossing Rate** | How often the waveform crosses zero | Speech has moderate ZCR, tones have very regular ZCR |
| **Dominant Frequency** | Strongest frequency component (via FFT) | Ringback detection (440Hz), DTMF detection |
| **Spectral Centroid** | "Center of mass" of the frequency spectrum | Speech has higher centroid than music |
| **Tonality** | Whether the audio is dominated by a single frequency | Tones/DTMF are highly tonal, speech is not |
### Feature Extraction Code
```python
def _extract_features(self, audio: np.ndarray) -> dict:
rms = np.sqrt(np.mean(audio ** 2))
# FFT for frequency analysis
fft = np.fft.rfft(audio)
magnitude = np.abs(fft)
freqs = np.fft.rfftfreq(len(audio), 1.0 / self._sample_rate)
# Spectral flatness: geometric mean / arithmetic mean of magnitude
spectral_flatness = np.exp(np.mean(np.log(magnitude + 1e-10))) / (np.mean(magnitude) + 1e-10)
# Zero-crossing rate
zcr = np.mean(np.abs(np.diff(np.sign(audio)))) / 2
# Dominant frequency
dominant_freq = freqs[np.argmax(magnitude)]
# Spectral centroid
spectral_centroid = np.sum(freqs * magnitude) / (np.sum(magnitude) + 1e-10)
return { ... }
```
## Classification Logic
Classification follows a priority chain:
```
1. SILENCE — RMS below threshold?
└── Yes → SILENCE (confidence based on how quiet)
2. DTMF — Goertzel algorithm detects dual-tone pairs?
└── Yes → DTMF (with detected digit in details)
3. RINGING — Dominant frequency near 440Hz + tonal?
└── Yes → RINGING
4. SPEECH vs MUSIC discrimination:
├── High spectral flatness + moderate ZCR → LIVE_HUMAN or IVR_PROMPT
│ └── _looks_like_live_human() checks history for hold→speech transition
│ ├── Yes → LIVE_HUMAN
│ └── No → IVR_PROMPT
└── Low spectral flatness + tonal → MUSIC
```
### DTMF Detection
Uses the Goertzel algorithm to detect the dual-tone pairs that make up DTMF digits:
```
1209 Hz 1336 Hz 1477 Hz 1633 Hz
697 Hz 1 2 3 A
770 Hz 4 5 6 B
852 Hz 7 8 9 C
941 Hz * 0 # D
```
Each DTMF digit is two simultaneous frequencies. The Goertzel algorithm efficiently checks for the presence of each specific frequency without computing a full FFT.
### Hold-to-Human Transition
The most critical detection — when a live person picks up after hold music:
```python
def detect_hold_to_human_transition(self) -> bool:
"""
Check classification history for the pattern:
MUSIC, MUSIC, MUSIC, ... → LIVE_HUMAN/IVR_PROMPT
Requires:
- At least 3 recent MUSIC classifications
- Followed by 2+ speech classifications
- Speech has sufficient energy (not just noise)
"""
recent = self._history[-10:]
# Find the transition point
music_count = 0
speech_count = 0
for result in recent:
if result.audio_type == AudioClassification.MUSIC:
music_count += 1
speech_count = 0 # reset
elif result.audio_type in (AudioClassification.LIVE_HUMAN, AudioClassification.IVR_PROMPT):
speech_count += 1
return music_count >= 3 and speech_count >= 2
```
## Classification Result
Each classification returns:
```python
@dataclass
class ClassificationResult:
timestamp: float
audio_type: AudioClassification
confidence: float # 0.0 to 1.0
details: dict # Feature values, detected frequencies, etc.
```
The `details` dict includes all extracted features, making it available for debugging and analytics:
```python
{
"rms": 0.0423,
"spectral_flatness": 0.15,
"zcr": 0.087,
"dominant_freq": 440.0,
"spectral_centroid": 523.7,
"is_tonal": True
}
```
## Configuration
| Setting | Description | Default |
|---------|-------------|---------|
| `CLASSIFIER_MUSIC_THRESHOLD` | Spectral flatness below this = music | `0.7` |
| `CLASSIFIER_SPEECH_THRESHOLD` | Spectral flatness above this = speech | `0.6` |
| `CLASSIFIER_SILENCE_THRESHOLD` | RMS below this = silence | `0.85` |
| `CLASSIFIER_WINDOW_SECONDS` | Audio window size for each classification | `3.0` |
## Testing
The audio classifier has 18 unit tests covering:
- Silence detection (pure silence, very quiet, empty audio)
- Tone detection (440Hz ringback, 1000Hz test tone)
- DTMF detection (digit 5, digit 0)
- Speech detection (speech-like waveforms)
- Classification history (hold→human transition, IVR non-transition)
- Feature extraction (RMS, ZCR, spectral flatness, dominant frequency)
```bash
pytest tests/test_audio_classifier.py -v
```
> **Known issue:** `test_complex_tone_as_music` is a known edge case where a multi-harmonic synthetic tone is classified as `LIVE_HUMAN` instead of `MUSIC`. This is acceptable — real hold music has different characteristics than synthetic test signals.

233
docs/call-flows.md Normal file
View File

@@ -0,0 +1,233 @@
# Call Flows
Call flows are reusable IVR navigation trees that tell Hold Slayer exactly how to navigate a company's phone menu. Once a flow is learned (manually or via exploration), subsequent calls to the same number skip the LLM analysis and follow the stored steps directly.
## Data Model
### CallFlowStep
A single step in the IVR navigation:
```python
class CallFlowStep(BaseModel):
id: str # Unique step identifier
type: CallFlowStepType # DTMF, WAIT, LISTEN, HOLD, SPEAK, TRANSFER
description: str # Human-readable description
dtmf: Optional[str] = None # Digits to press (for DTMF steps)
timeout: float = 10.0 # Max seconds to wait
next_step: Optional[str] = None # ID of the next step
conditions: dict = {} # Conditional branching rules
metadata: dict = {} # Extra data (transcript patterns, etc.)
```
### Step Types
| Type | Purpose | Key Fields |
|------|---------|------------|
| `DTMF` | Press touch-tone digits | `dtmf="3"` |
| `WAIT` | Pause for a duration | `timeout=5.0` |
| `LISTEN` | Record + transcribe + decide | `timeout=15.0`, optional `dtmf` for hardcoded response |
| `HOLD` | Wait on hold, monitor for human | `timeout=7200` (max hold time) |
| `SPEAK` | Play audio to the call | `metadata={"audio_file": "greeting.wav"}` |
| `TRANSFER` | Bridge call to user's device | `metadata={"device": "sip_phone"}` |
### CallFlow
A complete IVR navigation tree:
```python
class CallFlow(BaseModel):
id: str # "chase_bank_main"
name: str # "Chase Bank — Main Menu"
company: Optional[str] # "Chase Bank"
phone_number: Optional[str] # "+18005551234"
description: Optional[str] # "Navigate to disputes department"
steps: list[CallFlowStep] # Ordered list of steps
created_at: datetime
updated_at: datetime
version: int = 1
tags: list[str] = [] # ["banking", "disputes"]
success_count: int = 0 # Times this flow succeeded
fail_count: int = 0 # Times this flow failed
```
## Example Call Flow
```json
{
"id": "chase_bank_disputes",
"name": "Chase Bank — Disputes",
"company": "Chase Bank",
"phone_number": "+18005551234",
"steps": [
{
"id": "wait_greeting",
"type": "WAIT",
"description": "Wait for greeting to finish",
"timeout": 5.0,
"next_step": "main_menu"
},
{
"id": "main_menu",
"type": "LISTEN",
"description": "Listen to main menu options",
"timeout": 15.0,
"next_step": "press_3"
},
{
"id": "press_3",
"type": "DTMF",
"description": "Press 3 for account services",
"dtmf": "3",
"next_step": "sub_menu"
},
{
"id": "sub_menu",
"type": "LISTEN",
"description": "Listen to account services sub-menu",
"timeout": 15.0,
"next_step": "press_1"
},
{
"id": "press_1",
"type": "DTMF",
"description": "Press 1 for disputes",
"dtmf": "1",
"next_step": "hold"
},
{
"id": "hold",
"type": "HOLD",
"description": "Wait on hold for disputes agent",
"timeout": 7200,
"next_step": "transfer"
},
{
"id": "transfer",
"type": "TRANSFER",
"description": "Transfer to user's phone"
}
]
}
```
## Call Flow Learner (`services/call_flow_learner.py`)
Automatically builds call flows from exploration data.
### How It Works
1. **Exploration mode** records "discoveries" — what the Hold Slayer encountered and did at each step
2. The learner converts discoveries into `CallFlowStep` objects
3. Steps are ordered and linked (`next_step` pointers)
4. The resulting `CallFlow` is saved for future calls
### Discovery Types
| Discovery | Becomes Step |
|-----------|-------------|
| Heard IVR prompt, pressed DTMF | `LISTEN``DTMF` |
| Detected hold music | `HOLD` |
| Detected silence (waiting) | `WAIT` |
| Heard speech (human) | `TRANSFER` |
| Sent DTMF digits | `DTMF` |
### Building a Flow
```python
learner = CallFlowLearner()
# After an exploration call completes:
discoveries = [
{"type": "wait", "duration": 3.0, "description": "Initial silence"},
{"type": "ivr_menu", "transcript": "Press 1 for billing...", "dtmf_sent": "1"},
{"type": "ivr_menu", "transcript": "Press 3 for disputes...", "dtmf_sent": "3"},
{"type": "hold", "duration": 480.0},
{"type": "human_detected", "transcript": "Thank you for calling..."},
]
flow = learner.build_flow(
discoveries=discoveries,
phone_number="+18005551234",
company="Chase Bank",
intent="dispute a charge",
)
# Returns a CallFlow with 5 steps: WAIT → LISTEN/DTMF → LISTEN/DTMF → HOLD → TRANSFER
```
### Merging Discoveries
When the same number is called again with exploration, new discoveries can be merged into the existing flow:
```python
updated_flow = learner.merge_discoveries(
existing_flow=flow,
new_discoveries=new_discoveries,
)
```
This handles:
- New menu options discovered
- Changed IVR structure
- Updated timing information
- Success/failure tracking
## REST API
### List Call Flows
```
GET /api/call-flows
GET /api/call-flows?company=Chase+Bank
GET /api/call-flows?tag=banking
```
### Get Call Flow
```
GET /api/call-flows/{flow_id}
```
### Create Call Flow
```
POST /api/call-flows
Content-Type: application/json
{
"name": "Chase Bank — Disputes",
"company": "Chase Bank",
"phone_number": "+18005551234",
"steps": [ ... ]
}
```
### Update Call Flow
```
PUT /api/call-flows/{flow_id}
Content-Type: application/json
{ ... updated flow ... }
```
### Delete Call Flow
```
DELETE /api/call-flows/{flow_id}
```
### Learn Flow from Exploration
```
POST /api/call-flows/learn
Content-Type: application/json
{
"call_id": "call_abc123",
"phone_number": "+18005551234",
"company": "Chase Bank"
}
```
This triggers the Call Flow Learner to build a flow from the call's exploration data.

165
docs/configuration.md Normal file
View File

@@ -0,0 +1,165 @@
# Configuration
All configuration is via environment variables, loaded through Pydantic Settings. Copy `.env.example` to `.env` and edit.
## Environment Variables
### SIP Trunk
| Variable | Description | Default | Required |
|----------|-------------|---------|----------|
| `SIP_TRUNK_HOST` | Your SIP provider hostname | — | Yes |
| `SIP_TRUNK_PORT` | SIP signaling port | `5060` | No |
| `SIP_TRUNK_USERNAME` | SIP auth username | — | Yes |
| `SIP_TRUNK_PASSWORD` | SIP auth password | — | Yes |
| `SIP_TRUNK_DID` | Your phone number (E.164) | — | Yes |
| `SIP_TRUNK_TRANSPORT` | Transport protocol (`udp`, `tcp`, `tls`) | `udp` | No |
### Gateway
| Variable | Description | Default | Required |
|----------|-------------|---------|----------|
| `GATEWAY_SIP_PORT` | Port for device SIP registration | `5080` | No |
| `GATEWAY_RTP_PORT_MIN` | Minimum RTP port | `10000` | No |
| `GATEWAY_RTP_PORT_MAX` | Maximum RTP port | `20000` | No |
| `GATEWAY_HOST` | Bind address | `0.0.0.0` | No |
### LLM
| Variable | Description | Default | Required |
|----------|-------------|---------|----------|
| `LLM_BASE_URL` | OpenAI-compatible API endpoint | `http://localhost:11434/v1` | No |
| `LLM_MODEL` | Model name for IVR analysis | `llama3` | No |
| `LLM_API_KEY` | API key (if required) | `not-needed` | No |
| `LLM_TIMEOUT` | Request timeout in seconds | `30.0` | No |
| `LLM_MAX_TOKENS` | Max tokens per response | `1024` | No |
| `LLM_TEMPERATURE` | Sampling temperature | `0.3` | No |
### Speech-to-Text
| Variable | Description | Default | Required |
|----------|-------------|---------|----------|
| `SPEACHES_URL` | Speaches/Whisper STT endpoint | `http://localhost:22070` | No |
| `SPEACHES_MODEL` | Whisper model name | `whisper-large-v3` | No |
### Database
| Variable | Description | Default | Required |
|----------|-------------|---------|----------|
| `DATABASE_URL` | PostgreSQL or SQLite connection string | `sqlite+aiosqlite:///./hold_slayer.db` | No |
### Notifications
| Variable | Description | Default | Required |
|----------|-------------|---------|----------|
| `NOTIFY_SMS_NUMBER` | Phone number for SMS alerts (E.164) | — | No |
### Audio Classifier
| Variable | Description | Default | Required |
|----------|-------------|---------|----------|
| `CLASSIFIER_WINDOW_SECONDS` | Audio window size for classification | `3.0` | No |
| `CLASSIFIER_SILENCE_THRESHOLD` | RMS below this = silence | `0.85` | No |
| `CLASSIFIER_MUSIC_THRESHOLD` | Spectral flatness below this = music | `0.7` | No |
| `CLASSIFIER_SPEECH_THRESHOLD` | Spectral flatness above this = speech | `0.6` | No |
### Hold Slayer
| Variable | Description | Default | Required |
|----------|-------------|---------|----------|
| `MAX_HOLD_TIME` | Maximum seconds to wait on hold | `7200` | No |
| `HOLD_CHECK_INTERVAL` | Seconds between audio checks | `2.0` | No |
| `DEFAULT_TRANSFER_DEVICE` | Device to transfer to | `sip_phone` | No |
### Recording
| Variable | Description | Default | Required |
|----------|-------------|---------|----------|
| `RECORDING_DIR` | Directory for WAV recordings | `recordings` | No |
| `RECORDING_MAX_SECONDS` | Maximum recording duration | `7200` | No |
| `RECORDING_SAMPLE_RATE` | Audio sample rate | `16000` | No |
## Settings Architecture
Configuration is managed by Pydantic Settings in `config.py`:
```python
from config import get_settings
settings = get_settings()
settings.sip_trunk_host # "sip.provider.com"
settings.llm.base_url # "http://localhost:11434/v1"
settings.llm.model # "llama3"
settings.speaches_url # "http://localhost:22070"
settings.database_url # "sqlite+aiosqlite:///./hold_slayer.db"
```
LLM settings are nested under `settings.llm` as a `LLMSettings` sub-model.
## Deployment
### Development
```bash
# 1. Clone and install
git clone <repo-url>
cd hold-slayer
python -m venv .venv
source .venv/bin/activate
pip install -e ".[dev]"
# 2. Configure
cp .env.example .env
# Edit .env
# 3. Start Ollama (for LLM)
ollama serve
ollama pull llama3
# 4. Start Speaches (for STT)
docker run -p 22070:8000 ghcr.io/speaches-ai/speaches
# 5. Run
uvicorn main:app --host 0.0.0.0 --port 8000 --reload
```
### Production
```bash
# Use PostgreSQL instead of SQLite
DATABASE_URL=postgresql+asyncpg://user:pass@localhost/hold_slayer
# Use vLLM for faster inference
LLM_BASE_URL=http://localhost:8000/v1
LLM_MODEL=meta-llama/Llama-3-8B-Instruct
# Run with multiple workers (note: each worker is independent)
uvicorn main:app --host 0.0.0.0 --port 8000 --workers 1
```
Note: Hold Slayer is designed as a single-process application. Multiple workers would each have their own SIP engine and call state. For high availability, run behind a load balancer with sticky sessions.
### Docker
```dockerfile
FROM python:3.13-slim
# Install system dependencies for PJSUA2 and Sippy
RUN apt-get update && apt-get install -y \
build-essential \
libpjproject-dev \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY . .
RUN pip install -e .
EXPOSE 8000 5080/udp 10000-20000/udp
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
```
Port mapping:
- `8000` — HTTP API + WebSocket + MCP
- `5080/udp` — SIP device registration
- `10000-20000/udp` — RTP media ports

273
docs/core-engine.md Normal file
View File

@@ -0,0 +1,273 @@
# Core Engine
The core engine provides the foundational infrastructure: SIP call control, media handling, call state management, and event distribution.
## SIP Engine (`core/sip_engine.py` + `core/sippy_engine.py`)
### Abstract Interface
All SIP operations go through the `SIPEngine` abstract base class, which defines the contract:
```python
class SIPEngine(ABC):
async def start(self) -> None: ...
async def stop(self) -> None: ...
async def make_call(self, to_uri: str, from_uri: str = None) -> str: ...
async def hangup(self, call_id: str) -> None: ...
async def send_dtmf(self, call_id: str, digits: str) -> None: ...
async def bridge(self, call_id_a: str, call_id_b: str) -> None: ...
async def transfer(self, call_id: str, to_uri: str) -> None: ...
async def register(self, ...) -> bool: ...
async def get_trunk_status(self) -> TrunkStatus: ...
```
This abstraction allows:
- **`SippyEngine`** — Production implementation using Sippy B2BUA
- **`MockSIPEngine`** — Test implementation that simulates calls in memory
### Sippy B2BUA Engine
The `SippyEngine` wraps Sippy B2BUA for SIP signaling:
```python
class SippyEngine(SIPEngine):
"""
Production SIP engine using Sippy B2BUA.
Sippy runs its own event loop in a daemon thread.
All async methods bridge to Sippy via run_in_executor().
"""
```
**Key internals:**
| Class | Purpose |
|-------|---------|
| `SipCallLeg` | Tracks one leg of a call (call-id, state, RTP endpoint, SDP) |
| `SipBridge` | Two bridged call legs (outbound + device) |
| `SippyCallController` | Handles Sippy callbacks (INVITE received, BYE received, DTMF, etc.) |
**Call lifecycle:**
```
make_call("sip:+18005551234@trunk")
├── Create SipCallLeg (state=TRYING)
├── Sippy: send INVITE
├── Sippy callback: 180 Ringing → state=RINGING
├── Sippy callback: 200 OK → state=CONNECTED
│ └── Extract RTP endpoint from SDP
│ └── MediaPipeline.add_stream(rtp_host, rtp_port)
└── Return call_id
send_dtmf(call_id, "1")
└── Sippy: send RFC 2833 DTMF or SIP INFO
bridge(call_id_a, call_id_b)
├── Create SipBridge(leg_a, leg_b)
└── MediaPipeline.bridge_streams(stream_a, stream_b)
hangup(call_id)
├── Sippy: send BYE
├── MediaPipeline.remove_stream()
└── Cleanup SipCallLeg
```
**Graceful fallback:** If Sippy B2BUA is not installed, the engine falls back to mock mode with a warning — useful for development and testing without a SIP stack.
### Trunk Registration
The engine registers with your SIP trunk provider on startup:
```python
await engine.register(
registrar="sip.yourprovider.com",
username="your_username",
password="your_password",
realm="sip.yourprovider.com",
)
```
Registration is refreshed automatically. `get_trunk_status()` returns the current registration state and health.
## Media Pipeline (`core/media_pipeline.py`)
The media pipeline uses PJSUA2 for all RTP audio handling:
### Key Classes
| Class | Purpose |
|-------|---------|
| `AudioTap` | Extracts audio frames from a stream into an async queue (for classifier/STT) |
| `MediaStream` | Wraps a single RTP stream (transport port, conference slot, optional tap + recording) |
| `MediaPipeline` | Main orchestrator — manages all streams, bridging, recording |
### Operations
```python
# Add a new RTP stream (called when SIP call connects)
stream_id = await pipeline.add_stream(rtp_host, rtp_port, codec="PCMU")
# Tap audio for real-time analysis
tap = await pipeline.tap_stream(stream_id)
async for frame in tap:
classification = classifier.classify(frame)
# Bridge two streams (transfer)
await pipeline.bridge_streams(stream_a, stream_b)
# Record a stream to WAV
await pipeline.start_recording(stream_id, "/path/to/recording.wav")
await pipeline.stop_recording(stream_id)
# Play a tone (e.g., ringback to caller)
await pipeline.play_tone(stream_id, frequency=440, duration_ms=2000)
# Clean up
await pipeline.remove_stream(stream_id)
```
### Conference Bridge
PJSUA2's conference bridge is central to the architecture. Every stream gets a conference slot, and bridging is done by connecting slots:
```
Conference Bridge
├── Slot 0: Outbound call (to company)
├── Slot 1: AudioTap (classifier + STT reads from here)
├── Slot 2: Recording port
├── Slot 3: Device call (your phone, after transfer)
└── Slot 4: Tone generator
Bridge: Slot 0 ↔ Slot 3 (company ↔ your phone)
Tap: Slot 0 → Slot 1 (company audio → classifier)
Record: Slot 0 → Slot 2 (company audio → WAV file)
```
### Null Audio Device
The pipeline uses PJSUA2's null audio device — no sound card required. This is essential for headless server deployment.
## Call Manager (`core/call_manager.py`)
Tracks all active calls and their state:
```python
class CallManager:
async def create_call(self, number, mode, intent, ...) -> ActiveCall
async def get_call(self, call_id) -> Optional[ActiveCall]
async def update_status(self, call_id, status) -> None
async def end_call(self, call_id, reason) -> None
async def add_transcript(self, call_id, text, speaker) -> None
def active_call_count(self) -> int
def get_all_active(self) -> list[ActiveCall]
```
**ActiveCall state:**
```python
@dataclass
class ActiveCall:
call_id: str
number: str
mode: CallMode # direct, hold_slayer, ai_assisted
status: CallStatus # trying, ringing, connected, on_hold, transferring, ended
intent: Optional[str]
device: Optional[str]
call_flow_id: Optional[str]
# Timing
started_at: datetime
connected_at: Optional[datetime]
hold_started_at: Optional[datetime]
ended_at: Optional[datetime]
# Audio classification
current_audio_type: Optional[AudioClassification]
classification_history: list[ClassificationResult]
# Transcript
transcript_chunks: list[TranscriptChunk]
# Services
services: dict[str, bool] # recording, transcription, etc.
```
The CallManager publishes events to the EventBus on every state change.
## Event Bus (`core/event_bus.py`)
Pure asyncio pub/sub connecting all components:
```python
class EventBus:
async def publish(self, event: GatewayEvent) -> None
def subscribe(self, event_types: set[EventType] = None) -> EventSubscription
@property
def recent_events(self) -> list[GatewayEvent]
@property
def subscriber_count(self) -> int
```
### EventSubscription
Subscriptions are async iterators:
```python
subscription = event_bus.subscribe(event_types={EventType.HUMAN_DETECTED})
async for event in subscription:
print(f"Human detected on call {event.call_id}!")
# When done:
subscription.close()
```
### How it works
1. Each `subscribe()` creates an `asyncio.Queue` for that subscriber
2. `publish()` does `put_nowait()` on every subscriber's queue
3. Full queues (dead subscribers) are automatically cleaned up
4. Optional type filtering — only receive events you care about
5. Event history (last 1000) for late joiners
### Event Types
See [models/events.py](../models/events.py) for the full list. Key categories:
| Category | Events |
|----------|--------|
| Call Lifecycle | `CALL_STARTED`, `CALL_RINGING`, `CALL_CONNECTED`, `CALL_ENDED`, `CALL_FAILED` |
| Hold Slayer | `HOLD_DETECTED`, `HUMAN_DETECTED`, `TRANSFER_STARTED`, `TRANSFER_COMPLETE` |
| IVR Navigation | `IVR_STEP`, `IVR_DTMF_SENT`, `IVR_MENU_DETECTED`, `IVR_EXPLORATION` |
| Audio | `AUDIO_CLASSIFIED`, `TRANSCRIPT_CHUNK`, `RECORDING_STARTED`, `RECORDING_STOPPED` |
| Device | `DEVICE_REGISTERED`, `DEVICE_UNREGISTERED`, `DEVICE_RINGING` |
| System | `GATEWAY_STARTED`, `GATEWAY_STOPPED`, `TRUNK_REGISTERED`, `TRUNK_FAILED` |
## Gateway (`core/gateway.py`)
The top-level orchestrator that owns and wires all components:
```python
class AIPSTNGateway:
def __init__(self, settings: Settings):
self.event_bus = EventBus()
self.call_manager = CallManager(self.event_bus)
self.sip_engine = SippyEngine(settings, self.event_bus)
self.media_pipeline = MediaPipeline(settings)
self.llm_client = LLMClient(...)
self.transcription = TranscriptionService(...)
self.classifier = AudioClassifier()
self.hold_slayer = HoldSlayer(...)
self.recording = RecordingService(...)
self.analytics = CallAnalytics(...)
self.notification = NotificationService(...)
self.call_flow_learner = CallFlowLearner(...)
async def start(self) -> None: ... # Start all services
async def stop(self) -> None: ... # Graceful shutdown
async def make_call(self, ...) -> ActiveCall: ...
async def end_call(self, call_id) -> None: ...
```
The gateway is created once at application startup (in `main.py` lifespan) and injected into FastAPI routes via dependency injection (`api/deps.py`).

180
docs/development.md Normal file
View File

@@ -0,0 +1,180 @@
# Development
## Setup
### Prerequisites
- Python 3.13+
- Ollama (or any OpenAI-compatible LLM) — for IVR menu analysis
- Speaches or Whisper API — for speech-to-text (optional for dev)
- A SIP trunk account — for making real calls (optional for dev)
### Install
```bash
git clone <repo-url>
cd hold-slayer
python -m venv .venv
source .venv/bin/activate
pip install -e ".[dev]"
```
### Dev Dependencies
The `[dev]` extras include:
- `pytest` — test runner
- `pytest-asyncio` — async test support
- `pytest-cov` — coverage reporting
## Testing
### Run All Tests
```bash
pytest tests/ -v
```
### Run Specific Test Files
```bash
pytest tests/test_audio_classifier.py -v # 18 tests — waveform analysis
pytest tests/test_call_flows.py -v # 10 tests — call flow models
pytest tests/test_hold_slayer.py -v # 20 tests — IVR nav, EventBus, CallManager
pytest tests/test_services.py -v # 27 tests — LLM, notifications, recording,
# analytics, learner, EventBus
```
### Run with Coverage
```bash
pytest tests/ --cov=. --cov-report=term-missing
```
### Test Architecture
Tests are organized by component:
| File | Tests | What's Covered |
|------|-------|----------------|
| `test_audio_classifier.py` | 18 | Silence, tone, DTMF, music, speech detection; feature extraction; classification history |
| `test_call_flows.py` | 10 | CallFlowStep types, CallFlow navigation, serialization roundtrip, create/summary models |
| `test_hold_slayer.py` | 20 | IVR menu navigation (6 intent scenarios), EventBus pub/sub, CallManager lifecycle, MockSIPEngine |
| `test_services.py` | 27 | LLMClient init/stats/chat/JSON/errors/IVR analysis, NotificationService event mapping, RecordingService paths, CallAnalytics summaries, CallFlowLearner build/merge, EventBus integration |
### Known Test Issues
`test_complex_tone_as_music` — A synthetic multi-harmonic tone is classified as `LIVE_HUMAN` instead of `MUSIC`. This is a known edge case. Real hold music has different spectral characteristics than synthetic test signals. This test documents the limitation rather than a bug.
### Writing Tests
All tests use `pytest-asyncio` for async support. The test configuration in `pyproject.toml`:
```toml
[tool.pytest.ini_options]
asyncio_mode = "auto"
```
This means all `async def test_*` functions automatically run in an asyncio event loop.
**Pattern for testing services:**
```python
import pytest
from services.llm_client import LLMClient
class TestLLMClient:
def test_init(self):
client = LLMClient(base_url="http://localhost:11434/v1", model="llama3")
assert client._model == "llama3"
@pytest.mark.asyncio
async def test_chat(self):
# Mock httpx for unit tests
...
```
**Pattern for testing EventBus:**
```python
import asyncio
from core.event_bus import EventBus
from models.events import EventType, GatewayEvent
async def test_publish_receive():
bus = EventBus()
sub = bus.subscribe()
event = GatewayEvent(type=EventType.CALL_STARTED, call_id="test", data={})
await bus.publish(event)
received = await asyncio.wait_for(sub.get(), timeout=1.0)
assert received.type == EventType.CALL_STARTED
```
## Project Conventions
### Code Style
- **Type hints everywhere** — All function signatures have type annotations
- **Pydantic models** — All data structures are Pydantic BaseModel or dataclass
- **Async by default** — All I/O operations are async
- **Logging** — Every module uses `logging.getLogger(__name__)`
- **Docstrings** — Module-level docstrings explain purpose and usage
### File Organization
```
module.py
├── Module docstring (purpose, usage examples)
├── Imports (stdlib → third-party → local)
├── Constants
├── Classes
│ ├── Class docstring
│ ├── __init__
│ ├── Public methods (async)
│ └── Private methods (_prefixed)
└── Module-level functions (if any)
```
### Error Handling
- **Services never crash the call** — All service errors are caught, logged, and return sensible defaults
- **LLM failures** return empty string/dict — the Hold Slayer falls back to waiting
- **SIP errors** publish `CALL_FAILED` events — the user is notified
- **HTTP errors** in the API return structured error responses
### Event-Driven Architecture
All components communicate through the EventBus:
1. **Publishers** — SIP engine, Hold Slayer, classifier, services
2. **Subscribers** — WebSocket handler, MCP server, notification service, analytics
This decouples components and makes the system extensible. Adding a new feature (e.g., Slack notifications) means subscribing to events — no changes to existing code.
### Dependency Injection
The `AIPSTNGateway` owns all services and is injected into FastAPI routes via `api/deps.py`:
```python
# api/deps.py
async def get_gateway() -> AIPSTNGateway:
return app.state.gateway
# api/calls.py
@router.post("/outbound")
async def make_call(request: CallRequest, gateway: AIPSTNGateway = Depends(get_gateway)):
...
```
This makes testing easy — swap the gateway for a mock in tests.
## Contributing
1. Create a feature branch
2. Write tests for new functionality
3. Ensure all tests pass: `pytest tests/ -v`
4. Follow existing code conventions
5. Update documentation in `/docs` if adding new features
6. Submit a pull request

104
docs/dial-plan.md Normal file
View File

@@ -0,0 +1,104 @@
# Hold Slayer Gateway — Dial Plan
## Overview
The gateway accepts calls from registered SIP endpoints and routes them
based on the dialled digits. No trunk-access prefix (no "9") is needed.
All routing is pattern-matched in order; the first match wins.
---
## ⚠️ Emergency Services — 911
> **911 and 9911 are always routed directly to the PSTN trunk.**
> No gateway logic intercepts, records, or delays these calls.
> `9911` is accepted in addition to `911` to catch the common
> mis-dial habit of dialling `9` for an outside line.
>
> **Your SIP trunk provider must support emergency calling on your DID.**
> Verify this with your provider before putting this system in service.
> VoIP emergency calling has location limitations — ensure your
> registered location is correct with your provider.
---
## Extension Ranges
| Range | Purpose |
|-------|--------------------------------|
| 2XX | SIP endpoints (phones/softphones) |
| 5XX | System services |
---
## 2XX — Endpoint Extensions
Extensions are auto-assigned from **221** upward when a SIP device
registers (`SIP REGISTER`) with the gateway or via `POST /api/devices`.
| Extension | Format | Example |
|-----------|---------------------------------|--------------------------------|
| 221299 | Auto-assigned to registered devices | `sip:221@gateway.helu.ca` |
### Assignment policy
- First device to register gets **221**, next **222**, and so on.
- Extensions are persisted in the database and survive restarts.
- If a device is removed its extension is freed and may be reassigned.
- `GATEWAY_SIP_DOMAIN` in `.env` sets the domain part of the URI.
---
## 5XX — System Services
| Extension | Service | Notes |
|-----------|----------------------|-----------------------------------------|
| 500 | Auto-attendant | Reserved — not yet implemented |
| 510 | Gateway status | Plays a status announcement |
| 511 | Echo test | Returns audio back to caller |
| 520 | Hold Slayer launch | Prompts for a number to hold-slay |
| 599 | Operator fallback | Transfers to preferred device |
---
## Outbound PSTN
All outbound patterns are routed via the configured SIP trunk
(`SIP_TRUNK_HOST`). No access code prefix is needed.
### Pattern table
| Pattern | Example input | Normalised to | Notes |
|----------------------|--------------------|---------------------|------------------------------------|
| `+1NPANXXXXXX` | `+16135550100` | `+16135550100` | E.164 — pass through as-is |
| `1NPANXXXXXX` | `16135550100` | `+16135550100` | NANP with country code |
| `NPANXXXXXX` | `6135550100` | `+16135550100` | 10-digit NANP — prepend `+1` |
| `011CC…` | `01144201234567` | `+44201234567` | International — strip `011` |
| `00CC…` | `004420…` | `+4420…` | International alt prefix |
| `+CC…` | `+44201234567` | `+44201234567` | E.164 international — pass through |
### Rules
1. E.164 (`+` prefix) is always passed to the trunk unchanged.
2. NANP 11-digit (`1` + 10 digits) is normalised to E.164 by prepending `+`.
3. NANP 10-digit is normalised to E.164 by prepending `+1`.
4. International via `011` or `00` strips the IDD prefix and prepends `+`.
5. 7-digit local dialling is **not supported** — always dial the area code.
---
## Inbound PSTN
Calls arriving from the trunk on the DID (`SIP_TRUNK_DID`) are routed
to the highest-priority online device. If no device is online the call
is queued or dropped (configurable via `MAX_HOLD_TIME`).
---
## Future
- Named regions / area-code routing
- Least-cost routing across multiple trunks
- Time-of-day routing (business hours vs. after-hours)
- Ring groups across multiple 2XX extensions
- Voicemail (extension 500)

168
docs/hold-slayer-service.md Normal file
View File

@@ -0,0 +1,168 @@
# Hold Slayer Service
The Hold Slayer (`services/hold_slayer.py`) is the brain of the system. It orchestrates the entire process of navigating IVR menus, detecting hold music, recognizing when a human picks up, and triggering the transfer to your phone.
## Two Operating Modes
### 1. Flow-Guided Mode (`run_with_flow`)
When a stored `CallFlow` exists for the number being called, the Hold Slayer follows it step-by-step:
```python
await hold_slayer.run_with_flow(call_id, call_flow)
```
The call flow is a tree of steps (see [Call Flows](call-flows.md)). The Hold Slayer walks through them:
```
CallFlow: "Chase Bank Main"
├── Step 1: WAIT 3s (wait for greeting)
├── Step 2: LISTEN (transcribe → LLM picks option)
├── Step 3: DTMF "2" (press 2 for account services)
├── Step 4: LISTEN (transcribe → LLM picks option)
├── Step 5: DTMF "1" (press 1 for disputes)
├── Step 6: HOLD (wait for human)
└── Step 7: TRANSFER (bridge to your phone)
```
**Step execution logic:**
| Step Type | What Happens |
|-----------|-------------|
| `DTMF` | Send the specified digits via SIP engine |
| `WAIT` | Sleep for the specified duration |
| `LISTEN` | Record audio, transcribe, then: use hardcoded DTMF if available, otherwise ask LLM to pick the right option |
| `HOLD` | Monitor audio classification, wait for human detection |
| `SPEAK` | Play a WAV file or TTS audio (for interactive prompts) |
| `TRANSFER` | Bridge the call to the user's device |
### 2. Exploration Mode (`run_exploration`)
When no stored call flow exists, the Hold Slayer explores the IVR autonomously:
```python
await hold_slayer.run_exploration(call_id, intent="dispute Amazon charge")
```
**Exploration loop:**
```
┌─→ Classify audio (3-second window)
│ ├── SILENCE → wait, increment silence counter
│ ├── RINGING → wait for answer
│ ├── MUSIC → hold detected, monitor for transition
│ ├── DTMF → ignore (echo detection)
│ ├── IVR_PROMPT/SPEECH →
│ │ ├── Transcribe the audio
│ │ ├── Send transcript + intent to LLM
│ │ ├── LLM returns: { "action": "dtmf", "digits": "2" }
│ │ └── Send DTMF
│ └── LIVE_HUMAN → human detected!
│ └── TRANSFER
└── Loop until: human detected, max hold time, or call ended
```
**Exploration discoveries** are recorded and can be fed into the `CallFlowLearner` to build a reusable flow for next time.
## Human Detection
The critical moment — detecting when a live person picks up after hold:
### Detection Chain
```
AudioClassifier.classify(audio_frame)
├── Feature extraction:
│ ├── RMS energy (loudness)
│ ├── Spectral flatness (noise vs tone)
│ ├── Zero-crossing rate (speech indicator)
│ ├── Dominant frequency
│ └── Spectral centroid
├── Classification: MUSIC, SILENCE, SPEECH, etc.
└── Transition detection:
└── detect_hold_to_human_transition()
├── Check last N classifications
├── Pattern: MUSIC, MUSIC, MUSIC → SPEECH, SPEECH
├── Confidence: speech energy > threshold
└── Result: HUMAN_DETECTED event
```
### What triggers a transfer?
The Hold Slayer considers a human detected when:
1. **Classification history** shows a transition from hold-like audio (MUSIC, SILENCE) to speech-like audio (LIVE_HUMAN, IVR_PROMPT)
2. **Energy threshold** — the speech audio has sufficient RMS energy (not just background noise)
3. **Consecutive speech frames** — at least 2-3 consecutive speech classifications (avoids false positives from hold music announcements like "your call is important to us")
### False Positive Handling
Hold music often includes periodic announcements ("Your estimated wait time is 15 minutes"). These are speech, but not a live human. The Hold Slayer handles this by:
1. **Duration check** — Hold announcements are typically short (5-15 seconds). A live agent conversation continues longer.
2. **Pattern matching** — After speech, if audio returns to MUSIC within a few seconds, it was just an announcement.
3. **Transcript analysis** — If transcription is active, the LLM can analyze whether the speech sounds like a recorded announcement vs. a live greeting.
## LISTEN Step + LLM Fallback
The most interesting step type. When the Hold Slayer encounters a LISTEN step in a call flow:
```python
# Step has hardcoded DTMF? Use it directly.
if step.dtmf:
await sip_engine.send_dtmf(call_id, step.dtmf)
# No hardcoded DTMF? Ask the LLM.
else:
transcript = await transcription.transcribe(audio)
decision = await llm_client.analyze_ivr_menu(
transcript=transcript,
intent=intent,
previous_selections=previous_steps,
)
if decision.get("action") == "dtmf":
await sip_engine.send_dtmf(call_id, decision["digits"])
```
The LLM receives:
- The IVR transcript ("Press 1 for billing, press 2 for technical support...")
- The user's intent ("dispute a charge on my December statement")
- Previous menu selections (to avoid loops)
And returns structured JSON:
```json
{
"action": "dtmf",
"digits": "1",
"reasoning": "Billing is the correct department for charge disputes"
}
```
## Event Publishing
The Hold Slayer publishes events throughout the process:
| Event | When |
|-------|------|
| `IVR_STEP` | Each step in the call flow is executed |
| `IVR_DTMF_SENT` | DTMF digits are sent |
| `IVR_MENU_DETECTED` | An IVR menu prompt is transcribed |
| `HOLD_DETECTED` | Hold music is detected |
| `HUMAN_DETECTED` | Live human speech detected after hold |
| `TRANSFER_STARTED` | Call bridge initiated to user's device |
| `TRANSFER_COMPLETE` | User's device answered, bridge active |
All events flow through the EventBus to WebSocket clients, MCP server, notification service, and analytics.
## Configuration
| Setting | Description | Default |
|---------|-------------|---------|
| `MAX_HOLD_TIME` | Maximum seconds to wait on hold before giving up | `7200` (2 hours) |
| `HOLD_CHECK_INTERVAL` | Seconds between audio classification checks | `2.0` |
| `DEFAULT_TRANSFER_DEVICE` | Device to transfer to when human detected | `sip_phone` |
| `CLASSIFIER_WINDOW_SECONDS` | Audio window size for classification | `3.0` |

155
docs/mcp-server.md Normal file
View File

@@ -0,0 +1,155 @@
# MCP Server
The MCP (Model Context Protocol) server lets any MCP-compatible AI assistant control the Hold Slayer gateway. Built with [FastMCP](https://github.com/jlowin/fastmcp), it exposes tools and resources over SSE.
## Overview
An AI assistant connects via SSE to the MCP server and gains access to tools for placing calls, checking status, sending DTMF, getting transcripts, and managing call flows. The assistant can orchestrate an entire call through natural language.
## Tools
### make_call
Place an outbound call through the SIP trunk.
| Param | Type | Required | Description |
|-------|------|----------|-------------|
| `number` | string | Yes | Phone number to call (E.164 format) |
| `mode` | string | No | Call mode: `direct`, `hold_slayer`, `ai_assisted` (default: `hold_slayer`) |
| `intent` | string | No | What you want to accomplish on the call |
| `call_flow_id` | string | No | ID of a stored call flow to follow |
Returns: Call ID and initial status.
### end_call
Hang up an active call.
| Param | Type | Required | Description |
|-------|------|----------|-------------|
| `call_id` | string | Yes | The call to hang up |
### send_dtmf
Send touch-tone digits to an active call (for manual IVR navigation).
| Param | Type | Required | Description |
|-------|------|----------|-------------|
| `call_id` | string | Yes | The call to send digits to |
| `digits` | string | Yes | DTMF digits to send (e.g., "1", "3#", "1234") |
### get_call_status
Check the current state of a call.
| Param | Type | Required | Description |
|-------|------|----------|-------------|
| `call_id` | string | Yes | The call to check |
Returns: Status, duration, hold time, audio classification, transcript excerpt.
### get_call_transcript
Get the live transcript of a call.
| Param | Type | Required | Description |
|-------|------|----------|-------------|
| `call_id` | string | Yes | The call to get transcript for |
Returns: Array of transcript chunks with timestamps and speaker labels.
### get_call_recording
Get recording metadata and file path for a call.
| Param | Type | Required | Description |
|-------|------|----------|-------------|
| `call_id` | string | Yes | The call to get recording for |
Returns: Recording path, duration, file size.
### list_active_calls
List all calls currently in progress. No parameters.
Returns: Array of active calls with status, number, duration.
### get_call_summary
Get analytics summary — hold times, success rates, call volume. No parameters.
Returns: Aggregate statistics across all calls.
### search_call_history
Search past calls by number, company, or date range.
| Param | Type | Required | Description |
|-------|------|----------|-------------|
| `query` | string | Yes | Search term (phone number, company name) |
| `limit` | int | No | Max results (default: 20) |
### learn_call_flow
Build a reusable call flow from a completed exploration call.
| Param | Type | Required | Description |
|-------|------|----------|-------------|
| `call_id` | string | Yes | The exploration call to learn from |
| `company` | string | No | Company name for the flow |
Returns: The generated CallFlow object.
## Resources
MCP resources provide read-only data that assistants can reference:
| Resource URI | Description |
|-------------|-------------|
| `gateway://status` | Current gateway status — trunk registration, active calls, service health |
| `gateway://calls` | List of all active calls with current status |
| `gateway://calls/{call_id}` | Detailed status for a specific call |
| `gateway://flows` | List of all stored call flows |
| `gateway://analytics` | Call analytics summary |
## Configuration
The MCP server is mounted on the FastAPI app at `/mcp`.
### Connecting an AI Assistant
Add to your MCP client configuration (e.g., Claude Desktop, Cline):
```json
{
"mcpServers": {
"hold-slayer": {
"url": "http://localhost:8000/mcp/sse"
}
}
}
```
## Example Conversation
Here is how an AI assistant would use the MCP tools to handle a complete call:
**User:** "Call Chase Bank and dispute the Amazon charge from December 15th"
**Assistant actions:**
1. Calls `make_call(number="+18005551234", mode="hold_slayer", intent="dispute Amazon charge Dec 15th", call_flow_id="chase-bank-main")`
2. Receives `call_id: "call_abc123"`
3. Polls `get_call_status("call_abc123")` periodically
4. Status progression: `trying``ringing``connected``on_hold`
5. Tells user: "I'm on hold with Chase Bank. Currently 4 minutes in. I'll let you know when someone picks up."
6. Status changes to `transferring` — human detected!
7. Tells user: "A live agent just picked up. I'm transferring the call to your desk phone now. Pick up!"
8. After the call, calls `learn_call_flow("call_abc123", company="Chase Bank")` to save the IVR path for next time.
**User:** "How long was I on hold?"
**Assistant actions:**
1. Calls `get_call_summary()`
2. Reports: "Your Chase Bank call lasted 12 minutes total, with 8 minutes on hold. The disputes department averages 6 minutes hold time on Tuesdays."

290
docs/services.md Normal file
View File

@@ -0,0 +1,290 @@
# Services
The intelligence layer services that power Hold Slayer's decision-making, transcription, recording, analytics, and notifications.
## LLM Client (`services/llm_client.py`)
Async HTTP client for any OpenAI-compatible chat completion API. No SDK dependency — just httpx.
### Supported Backends
| Backend | URL | Notes |
|---------|-----|-------|
| Ollama | `http://localhost:11434/v1` | Local, free, good for dev |
| LM Studio | `http://localhost:1234/v1` | Local, free, GUI |
| vLLM | `http://localhost:8000/v1` | Local, fast, production |
| OpenAI | `https://api.openai.com/v1` | Cloud, paid, best quality |
### Usage
```python
client = LLMClient(
base_url="http://localhost:11434/v1",
model="llama3",
api_key="not-needed", # Ollama doesn't need a key
timeout=30.0,
max_tokens=1024,
temperature=0.3,
)
# Simple chat
response = await client.chat("What is 2+2?")
# "4"
# Chat with system prompt
response = await client.chat(
"Parse this menu transcript...",
system="You are a phone menu parser. Return JSON.",
)
# Structured JSON response (auto-parses)
result = await client.chat_json(
"Extract menu options from: Press 1 for billing, press 2 for support",
system="Return JSON with 'options' array.",
)
# {"options": [{"digit": "1", "label": "billing"}, {"digit": "2", "label": "support"}]}
```
### IVR Menu Analysis
The primary use case — analyzing IVR transcripts to pick the right menu option:
```python
decision = await client.analyze_ivr_menu(
transcript="Welcome to Chase Bank. Press 1 for account balance, press 2 for recent transactions, press 3 for disputes, press 0 for an agent.",
intent="dispute a charge from Amazon on December 15th",
previous_selections=["main_menu"],
)
# {"action": "dtmf", "digits": "3", "reasoning": "Disputes is the correct department"}
```
### JSON Extraction
The client handles messy LLM output gracefully:
1. Try `json.loads()` on the raw response
2. If that fails, look for ```json ... ``` markdown blocks
3. If that fails, look for `{...}` patterns in the text
4. If all fail, return empty dict (caller handles gracefully)
### Stats Tracking
```python
stats = client.stats
# {
# "total_requests": 47,
# "total_errors": 2,
# "avg_latency_ms": 234.5,
# "model": "llama3",
# "base_url": "http://localhost:11434/v1"
# }
```
### Error Handling
- HTTP errors return empty string/dict (never crashes the call)
- Timeouts are configurable (default 30s)
- All errors are logged with full context
- Stats track error rates for monitoring
## Transcription Service (`services/transcription.py`)
Real-time speech-to-text using Speaches (a self-hosted Whisper API).
### Architecture
```
Audio frames (from AudioTap)
└── POST /v1/audio/transcriptions
├── model: whisper-large-v3
├── audio: WAV bytes
└── language: en
└── Response: { "text": "Press 1 for billing..." }
```
### Usage
```python
service = TranscriptionService(
speaches_url="http://perseus.helu.ca:22070",
model="whisper-large-v3",
)
# Transcribe audio bytes
text = await service.transcribe(audio_bytes)
# "Welcome to Chase Bank. For English, press 1."
# Transcribe with language hint
text = await service.transcribe(audio_bytes, language="fr")
```
### Integration with Hold Slayer
The transcription service is called when the audio classifier detects speech (IVR_PROMPT or LIVE_HUMAN). The transcript is then:
1. Published as a `TRANSCRIPT_CHUNK` event (→ WebSocket clients)
2. Fed to the LLM for IVR menu analysis
3. Stored in the call's transcript history
4. Used by the Call Flow Learner to build reusable flows
## Recording Service (`services/recording.py`)
Manages call recordings via the PJSUA2 media pipeline.
### Storage Structure
```
recordings/
├── 2026/
│ ├── 01/
│ │ ├── 15/
│ │ │ ├── call_abc123_outbound.wav
│ │ │ ├── call_abc123_mixed.wav
│ │ │ └── call_def456_outbound.wav
│ │ └── 16/
│ │ └── ...
│ └── 02/
│ └── ...
```
### Recording Types
| Type | Description |
|------|-------------|
| **Outbound** | Audio from the company (IVR, hold music, agent) |
| **Inbound** | Audio from the user's device (after transfer) |
| **Mixed** | Both parties in one file (for review) |
### Usage
```python
service = RecordingService(
storage_dir="recordings",
max_recording_seconds=7200, # 2 hours
sample_rate=16000,
)
# Start recording
session = await service.start_recording(call_id, stream_id)
# session.path = "recordings/2026/01/15/call_abc123_outbound.wav"
# Stop recording
metadata = await service.stop_recording(call_id)
# metadata = { "duration": 847.3, "file_size": 27113600, "path": "..." }
# List recordings for a call
recordings = service.get_recordings(call_id)
```
## Call Analytics (`services/call_analytics.py`)
Tracks call metrics and provides insights for monitoring and optimization.
### Metrics Tracked
| Metric | Description |
|--------|-------------|
| Hold time | Duration spent on hold per call |
| Total call duration | End-to-end call time |
| Success rate | Percentage of calls that reached a human |
| IVR navigation time | Time spent navigating menus |
| Company patterns | Per-company hold time averages |
| Time-of-day trends | When hold times are shortest |
### Usage
```python
analytics = CallAnalytics(max_history=10000)
# Record a completed call
analytics.record_call(
call_id="call_abc123",
number="+18005551234",
company="Chase Bank",
hold_time=780,
total_duration=847,
success=True,
ivr_steps=6,
)
# Get summary
summary = analytics.get_summary()
# {
# "total_calls": 142,
# "success_rate": 0.89,
# "avg_hold_time": 623.4,
# "avg_total_duration": 712.1,
# }
# Per-company stats
stats = analytics.get_company_stats("Chase Bank")
# {
# "total_calls": 23,
# "avg_hold_time": 845.2,
# "best_time": "Tuesday 10:00 AM",
# "success_rate": 0.91,
# }
# Top numbers by call volume
top = analytics.get_top_numbers(limit=10)
# Hold time trends by hour
trends = analytics.get_hold_time_trend()
# [{"hour": 9, "avg_hold": 320}, {"hour": 10, "avg_hold": 480}, ...]
```
## Notification Service (`services/notification.py`)
Sends alerts when important things happen on calls.
### Notification Channels
| Channel | Status | Use Case |
|---------|--------|----------|
| **WebSocket** | ✅ Active | Real-time UI updates (always on) |
| **SMS** | ✅ Active | Critical alerts (human detected, call failed) |
| **Push** | 🔮 Future | Mobile app notifications |
### Notification Priority
| Priority | Events | Delivery |
|----------|--------|----------|
| `CRITICAL` | Human detected, transfer started | WebSocket + SMS |
| `HIGH` | Call failed, call timeout | WebSocket + SMS |
| `NORMAL` | Hold detected, call ended | WebSocket only |
| `LOW` | IVR step, DTMF sent | WebSocket only |
### Event → Notification Mapping
| Event | Notification |
|-------|-------------|
| `HUMAN_DETECTED` | 🚨 "A live person picked up — transferring you now!" |
| `TRANSFER_STARTED` | 📞 "Your call has been connected. Pick up your phone!" |
| `CALL_FAILED` | ❌ "The call couldn't be completed." |
| `HOLD_DETECTED` | ⏳ "You're on hold. We'll notify you when someone picks up." |
| `IVR_STEP` | 📍 "Navigating phone menu..." |
| `IVR_DTMF_SENT` | 📱 "Pressed 3" |
| `CALL_ENDED` | 📴 "The call has ended." |
### Deduplication
The notification service tracks what's been sent per call to avoid spamming:
```python
# Won't send duplicate "on hold" notifications for the same call
self._notified: dict[str, set[str]] # call_id → set of event dedup keys
```
Tracking is cleaned up when a call ends.
### SMS Configuration
SMS is sent for `CRITICAL` priority notifications when `NOTIFY_SMS_NUMBER` is configured:
```env
NOTIFY_SMS_NUMBER=+15559876543
```
The SMS sender is a placeholder — wire up your preferred provider (Twilio, AWS SNS, etc.).

230
main.py Normal file
View File

@@ -0,0 +1,230 @@
"""
Hold Slayer Gateway — FastAPI Application Entry Point.
Your personal AI-powered telephony platform.
Navigates IVRs, waits on hold, and connects you when a human answers.
Usage:
uvicorn main:app --host 0.0.0.0 --port 8000 --reload
# Or directly:
python main.py
"""
import logging
import sys
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from api import calls, call_flows, devices, websocket
from config import get_settings
from core.gateway import AIPSTNGateway
from db.database import close_db, init_db
from mcp_server.server import create_mcp_server
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)-7s | %(name)s | %(message)s",
datefmt="%H:%M:%S",
stream=sys.stdout,
)
logger = logging.getLogger(__name__)
def _handle_db_error(exc: Exception) -> None:
"""Log a clear, human-readable database error and exit cleanly."""
# Walk the exception chain to find the root asyncpg/psycopg cause
cause = getattr(exc, "__cause__", None) or getattr(exc, "__context__", None)
root = cause or exc
root_type = type(root).__name__
root_msg = str(root)
if "InvalidPasswordError" in root_type or "password authentication failed" in root_msg:
logger.critical(
"\n"
"❌ Database authentication failed — wrong password.\n"
" The password in DATABASE_URL does not match the PostgreSQL user.\n"
" Fix DATABASE_URL in your .env file and restart.\n"
" Default: DATABASE_URL=postgresql+asyncpg://holdslayer:changeme@localhost:5432/holdslayer"
)
elif "InvalidCatalogNameError" in root_type or "does not exist" in root_msg:
logger.critical(
"\n"
"❌ Database does not exist.\n"
" Create it first: createdb holdslayer\n"
" Or update DATABASE_URL in your .env file."
)
elif (
"Connection refused" in root_msg
or "could not connect" in root_msg.lower()
):
logger.critical(
"\n"
"\u274c Cannot reach PostgreSQL \u2014 connection refused.\n"
" Is PostgreSQL running? Check DATABASE_URL in your .env file."
)
elif (
"nodename nor servname" in root_msg
or "Name or service not known" in root_msg
):
logger.critical(
"\n"
f"❌ Cannot resolve the database hostname.\n"
f" Check the host in DATABASE_URL in your .env file. (detail: {root_msg})"
)
else:
logger.critical(
f"\n❌ Database initialisation failed: {root_msg}\n"
f" Check DATABASE_URL in your .env file."
)
sys.exit(1)
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Startup: Initialize database, SIP engine, and services."""
settings = get_settings()
# Initialize database
logger.info("Initializing database...")
try:
await init_db()
except Exception as e:
_handle_db_error(e)
# Boot the telephony engine
gateway = AIPSTNGateway.from_config()
await gateway.start()
app.state.gateway = gateway
# Start auxiliary services
from services.notification import NotificationService
from services.recording import RecordingService
from services.call_analytics import CallAnalytics
from services.call_flow_learner import CallFlowLearner
notification_svc = NotificationService(gateway.event_bus, settings)
await notification_svc.start()
app.state.notification_service = notification_svc
recording_svc = RecordingService()
await recording_svc.start()
app.state.recording_service = recording_svc
analytics_svc = CallAnalytics()
app.state.analytics_service = analytics_svc
flow_learner = CallFlowLearner()
app.state.flow_learner = flow_learner
# Create and mount MCP server
mcp = create_mcp_server(gateway)
app.state.mcp = mcp
logger.info("=" * 60)
logger.info("🔥 Hold Slayer Gateway is LIVE")
# Show a usable URL — 0.0.0.0 is the bind address, not a browser URL
display_host = "localhost" if settings.host in ("0.0.0.0", "::") else settings.host
# When launched via `uvicorn main:app --port XXXX`, the CLI --port arg
# takes precedence over settings.port (which comes from .env).
display_port = settings.port
for i, arg in enumerate(sys.argv):
if arg in ("--port", "-p") and i + 1 < len(sys.argv):
try:
display_port = int(sys.argv[i + 1])
except ValueError:
pass
logger.info(f" API: http://{display_host}:{display_port}")
logger.info(f" API Docs: http://{display_host}:{display_port}/docs")
logger.info(f" WebSocket: ws://{display_host}:{display_port}/ws/events")
logger.info(f" MCP: Available via FastMCP")
logger.info("=" * 60)
yield
# Shutdown
logger.info("Shutting down Hold Slayer Gateway...")
await notification_svc.stop()
await gateway.stop()
await close_db()
logger.info("Gateway shut down cleanly. 👋")
app = FastAPI(
title="Hold Slayer Gateway",
description=(
"🗡️ AI PSTN Gateway — Navigate IVRs, wait on hold, "
"and connect you when a human answers.\n\n"
"## Quick Start\n"
"1. **POST /api/calls/hold-slayer** — Launch the Hold Slayer\n"
"2. **GET /api/calls/{call_id}** — Check call status\n"
"3. **WS /ws/events** — Real-time event stream\n"
"4. **GET /api/call-flows** — Manage stored IVR trees\n"
),
version="0.1.0",
lifespan=lifespan,
)
# === API Routes ===
app.include_router(calls.router, prefix="/api/calls", tags=["Calls"])
app.include_router(call_flows.router, prefix="/api/call-flows", tags=["Call Flows"])
app.include_router(devices.router, prefix="/api/devices", tags=["Devices"])
app.include_router(websocket.router, prefix="/ws", tags=["WebSocket"])
# === Root Endpoint ===
@app.get("/", tags=["System"])
async def root():
"""Gateway root — health check and quick status."""
gateway = getattr(app.state, "gateway", None)
if gateway:
status = await gateway.status()
return {
"name": "Hold Slayer Gateway",
"version": "0.1.0",
"status": "running",
"uptime": status["uptime"],
"active_calls": status["active_calls"],
"trunk": status["trunk"],
}
return {
"name": "Hold Slayer Gateway",
"version": "0.1.0",
"status": "starting",
}
@app.get("/health", tags=["System"])
async def health():
"""Health check endpoint."""
gateway = getattr(app.state, "gateway", None)
ready = gateway is not None and await gateway.sip_engine.is_ready()
trunk_status = await gateway.sip_engine.get_trunk_status() if gateway else {"registered": False}
return {
"status": "healthy" if ready else "degraded",
"gateway": "ready" if gateway else "not initialized",
"sip_engine": "ready" if ready else "not ready",
"sip_trunk": {
"registered": trunk_status.get("registered", False),
"host": trunk_status.get("host"),
"mock": trunk_status.get("mock", False),
"reason": trunk_status.get("reason"),
},
}
if __name__ == "__main__":
import uvicorn
settings = get_settings()
uvicorn.run(
"main:app",
host=settings.host,
port=settings.port,
reload=settings.debug,
log_level=settings.log_level,
)

1
mcp_server/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""MCP server — AI assistant tools and resources for gateway control."""

512
mcp_server/server.py Normal file
View File

@@ -0,0 +1,512 @@
"""
MCP Server — AI assistant tools and resources for gateway control.
Any MCP-compatible AI assistant can use these tools to:
- Place calls and launch the Hold Slayer
- Check call status
- Manage call flows
- Search transcripts
- Control devices
Example from an AI assistant:
"Call Chase Bank and dispute the charge from Amazon on Dec 15th"
→ make_call("+18005551234", "hold_slayer", "dispute Amazon charge Dec 15th", "chase-bank-main")
"""
import json
import logging
from typing import Optional
from fastmcp import FastMCP
from core.gateway import AIPSTNGateway
logger = logging.getLogger(__name__)
def create_mcp_server(gateway: AIPSTNGateway) -> FastMCP:
"""Create and configure the MCP server with all tools and resources."""
mcp = FastMCP("Hold Slayer Gateway")
# ================================================================
# Tools
# ================================================================
@mcp.tool()
async def make_call(
number: str,
mode: str = "direct",
intent: str = "",
call_flow_id: str = "",
device: str = "",
) -> str:
"""
Place an outbound phone call.
Args:
number: Phone number to call (E.164 format, e.g., +18005551234)
mode: "direct" (connect immediately), "hold_slayer" (navigate IVR + wait on hold), or "ai_assisted"
intent: What you need — used by hold_slayer to navigate IVR menus (e.g., "dispute a charge", "cancel my card")
call_flow_id: Optional stored call flow ID to follow (e.g., "chase-bank-main")
device: Target device to ring/transfer to (e.g., "sip_phone", "cell")
Returns:
Call ID and status
"""
from models.call import CallMode
mode_map = {
"direct": CallMode.DIRECT,
"hold_slayer": CallMode.HOLD_SLAYER,
"ai_assisted": CallMode.AI_ASSISTED,
}
call = await gateway.make_call(
number=number,
mode=mode_map.get(mode, CallMode.DIRECT),
intent=intent or None,
call_flow_id=call_flow_id or None,
device=device or None,
)
return (
f"Call {call.id} initiated.\n"
f" Number: {number}\n"
f" Mode: {mode}\n"
f" Status: {call.status.value}\n"
f" Intent: {intent or 'N/A'}\n"
f" Call Flow: {call_flow_id or 'exploration mode'}"
)
@mcp.tool()
async def get_call_status(call_id: str) -> str:
"""
Get the current status of a call.
Shows: status, duration, hold time, current audio type, recent transcript.
"""
call = gateway.get_call(call_id)
if not call:
return f"Call {call_id} not found. It may have already ended."
transcript_tail = call.transcript[-300:] if call.transcript else "No transcript yet"
return (
f"Call {call_id}:\n"
f" Number: {call.remote_number}\n"
f" Status: {call.status.value}\n"
f" Mode: {call.mode.value}\n"
f" Duration: {call.duration}s\n"
f" Hold Time: {call.hold_time}s\n"
f" Audio Type: {call.current_classification.value}\n"
f" Intent: {call.intent or 'N/A'}\n"
f" Current Step: {call.current_step_id or 'N/A'}\n"
f" Transcript (last 300 chars): ...{transcript_tail}"
)
@mcp.tool()
async def transfer_call(call_id: str, device: str) -> str:
"""
Transfer an active call to a specific device.
Args:
call_id: The call to transfer
device: Target device ID (e.g., "sip_phone", "cell")
"""
try:
await gateway.transfer_call(call_id, device)
return f"Call {call_id} transferred to {device}."
except ValueError as e:
return f"Transfer failed: {e}"
@mcp.tool()
async def hangup(call_id: str) -> str:
"""Hang up a call."""
try:
await gateway.hangup_call(call_id)
return f"Call {call_id} hung up."
except ValueError as e:
return f"Hangup failed: {e}"
@mcp.tool()
async def list_active_calls() -> str:
"""List all currently active calls with their status."""
calls = gateway.call_manager.active_calls
if not calls:
return "No active calls."
lines = ["Active calls:"]
for call in calls.values():
lines.append(
f" {call.id}: {call.remote_number} "
f"({call.status.value}, {call.duration}s, "
f"hold: {call.hold_time}s, "
f"audio: {call.current_classification.value})"
)
return "\n".join(lines)
@mcp.tool()
async def get_call_flow(phone_number: str) -> str:
"""
Look up a stored call flow for a phone number.
Returns the IVR navigation tree if one exists.
"""
from db.database import StoredCallFlow, get_session_factory
from sqlalchemy import select
try:
factory = get_session_factory()
async with factory() as session:
result = await session.execute(
select(StoredCallFlow).where(
StoredCallFlow.phone_number == phone_number
)
)
row = result.scalar_one_or_none()
if not row:
return f"No stored call flow for {phone_number}."
return (
f"Call Flow: {row.name}\n"
f" Phone: {row.phone_number}\n"
f" Description: {row.description}\n"
f" Steps: {len(row.steps)}\n"
f" Avg Hold Time: {row.avg_hold_time or 'unknown'}s\n"
f" Success Rate: {row.success_rate or 'unknown'}\n"
f" Times Used: {row.times_used or 0}\n"
f" Last Used: {row.last_used or 'never'}\n"
f" Notes: {row.notes or 'none'}\n"
f" Flow ID: {row.id}"
)
except Exception as e:
return f"Error looking up call flow: {e}"
@mcp.tool()
async def create_call_flow(
name: str,
phone_number: str,
steps_json: str,
notes: str = "",
) -> str:
"""
Store a new IVR call flow for a phone number.
The hold slayer will follow this tree instead of exploring blind.
Args:
name: Human-readable name (e.g., "Chase Bank - Main Customer Service")
phone_number: Phone number in E.164 format
steps_json: JSON array of call flow steps. Each step has:
- id: unique step identifier
- description: what this step does
- action: "dtmf", "speak", "wait", "listen", "hold", or "transfer"
- action_value: DTMF digits, speech text, or device target
- expect: regex/keywords for what you expect to hear
- timeout: seconds to wait
- next_step: ID of next step on success
- fallback_step: ID of step if unexpected response
- notes: any helpful notes
notes: General notes about this call flow
"""
from slugify import slugify as do_slugify
from db.database import StoredCallFlow, get_session_factory
try:
steps = json.loads(steps_json)
flow_id = do_slugify(name)
factory = get_session_factory()
async with factory() as session:
db_flow = StoredCallFlow(
id=flow_id,
name=name,
phone_number=phone_number,
description=f"Created by AI assistant",
steps=steps,
notes=notes or None,
tags=["ai-created"],
)
session.add(db_flow)
await session.commit()
return f"Call flow '{name}' saved for {phone_number} (ID: {flow_id})"
except json.JSONDecodeError:
return "Error: steps_json must be valid JSON."
except Exception as e:
return f"Error creating call flow: {e}"
@mcp.tool()
async def send_dtmf(call_id: str, digits: str) -> str:
"""
Send DTMF tones on an active call.
Args:
call_id: The call to send tones on
digits: DTMF digits to send (e.g., "1", "2", "123#")
"""
call = gateway.get_call(call_id)
if not call:
return f"Call {call_id} not found."
for leg_id, cid in gateway.call_manager._call_legs.items():
if cid == call_id:
await gateway.sip_engine.send_dtmf(leg_id, digits)
return f"Sent DTMF '{digits}' on call {call_id}."
return f"No active SIP leg found for call {call_id}."
@mcp.tool()
async def get_call_transcript(call_id: str) -> str:
"""
Get the full transcript for an active or recent call.
Returns the complete transcript text.
"""
call = gateway.get_call(call_id)
if not call:
return f"Call {call_id} not found."
if not call.transcript:
return f"No transcript yet for call {call_id}."
return (
f"Transcript for call {call_id} "
f"({call.remote_number}, {call.duration}s):\n\n"
f"{call.transcript}"
)
@mcp.tool()
async def get_call_recording(call_id: str) -> str:
"""
Get info about a call's recording.
Returns the recording file path and status.
"""
from db.database import CallRecord, get_session_factory
from sqlalchemy import select
try:
factory = get_session_factory()
async with factory() as session:
result = await session.execute(
select(CallRecord).where(CallRecord.id == call_id)
)
record = result.scalar_one_or_none()
if not record:
return f"No record found for call {call_id}."
if not record.recording_path:
return f"Call {call_id} has no recording."
return (
f"Recording for call {call_id}:\n"
f" Path: {record.recording_path}\n"
f" Duration: {record.duration}s\n"
f" Number: {record.remote_number}"
)
except Exception as e:
return f"Error looking up recording: {e}"
@mcp.tool()
async def get_call_summary(call_id: str) -> str:
"""
Get an AI-generated summary and action items for a call.
Returns the summary, action items, and sentiment analysis.
"""
from db.database import CallRecord, get_session_factory
from sqlalchemy import select
try:
factory = get_session_factory()
async with factory() as session:
result = await session.execute(
select(CallRecord).where(CallRecord.id == call_id)
)
record = result.scalar_one_or_none()
if not record:
return f"No record found for call {call_id}."
lines = [f"Call Summary for {call_id}:"]
lines.append(f" Number: {record.remote_number}")
lines.append(f" Status: {record.status}")
lines.append(f" Duration: {record.duration}s")
lines.append(f" Hold Time: {record.hold_time}s")
if record.summary:
lines.append(f"\n Summary: {record.summary}")
else:
lines.append("\n Summary: Not yet generated")
if record.action_items:
lines.append("\n Action Items:")
for item in record.action_items:
lines.append(f"{item}")
if record.sentiment:
lines.append(f"\n Sentiment: {record.sentiment}")
return "\n".join(lines)
except Exception as e:
return f"Error looking up call summary: {e}"
@mcp.tool()
async def search_call_history(
phone_number: str = "",
intent: str = "",
limit: int = 10,
) -> str:
"""
Search past call records.
Args:
phone_number: Filter by phone number (partial match)
intent: Filter by intent text (partial match)
limit: Max results to return (default 10)
"""
from db.database import CallRecord, get_session_factory
from sqlalchemy import select
try:
factory = get_session_factory()
async with factory() as session:
query = select(CallRecord).order_by(
CallRecord.started_at.desc()
).limit(limit)
if phone_number:
query = query.where(
CallRecord.remote_number.contains(phone_number)
)
if intent:
query = query.where(
CallRecord.intent.icontains(intent)
)
result = await session.execute(query)
records = result.scalars().all()
if not records:
return "No matching call records found."
lines = [f"Call History ({len(records)} records):"]
for r in records:
lines.append(
f" {r.id}: {r.remote_number} "
f"({r.status}, {r.duration}s, "
f"hold: {r.hold_time}s) "
f"{r.intent or 'no intent'} "
f"[{r.started_at}]"
)
return "\n".join(lines)
except Exception as e:
return f"Error searching call history: {e}"
@mcp.tool()
async def learn_call_flow(call_id: str, name: str = "") -> str:
"""
Learn a call flow from a completed call's event history.
Analyzes the IVR navigation events from a call to build a
reusable call flow for next time.
Args:
call_id: The call to learn from
name: Optional name for the flow (auto-generated if empty)
"""
from services.call_flow_learner import CallFlowLearner
try:
learner = CallFlowLearner(gateway.event_bus, gateway.settings)
flow = await learner.learn_from_call(call_id, name or None)
if flow:
return (
f"Learned call flow '{flow.name}' from call {call_id}:\n"
f" Phone: {flow.phone_number}\n"
f" Steps: {len(flow.steps)}\n"
f" Flow ID: {flow.id}"
)
return f"Could not learn a call flow from call {call_id}. Not enough IVR navigation data."
except Exception as e:
return f"Error learning call flow: {e}"
@mcp.tool()
async def list_devices() -> str:
"""List all registered devices and their online/offline status."""
devices = gateway.devices
if not devices:
return "No devices registered."
lines = ["Registered devices:"]
for d in devices.values():
status = "🟢 Online" if d.is_online else "🔴 Offline"
lines.append(f" {d.id}: {d.name} ({d.type.value}) - {status}")
return "\n".join(lines)
@mcp.tool()
async def gateway_status() -> str:
"""Get full gateway status — trunk, devices, active calls, uptime."""
status = await gateway.status()
trunk = status["trunk"]
lines = [
"🔥 Hold Slayer Gateway Status",
f" Uptime: {status['uptime'] or 0}s",
f" SIP Trunk: {'✅ registered' if trunk.get('registered') else '❌ not registered'}",
f" Active Calls: {status['active_calls']}",
f" Event Subscribers: {status['event_subscribers']}",
f" Devices:",
]
for dev_id, info in status.get("devices", {}).items():
online = "🟢" if info.get("online") else "🔴"
lines.append(f" {online} {info.get('name', dev_id)}")
return "\n".join(lines)
# ================================================================
# Resources
# ================================================================
@mcp.resource("gateway://status")
async def resource_gateway_status() -> str:
"""Current gateway status — trunk, devices, active calls."""
status = await gateway.status()
return json.dumps(status, default=str, indent=2)
@mcp.resource("gateway://call-flows")
async def resource_call_flows() -> str:
"""List all stored call flows."""
from db.database import StoredCallFlow, get_session_factory
from sqlalchemy import select
try:
factory = get_session_factory()
async with factory() as session:
result = await session.execute(select(StoredCallFlow))
rows = result.scalars().all()
flows = [
{
"id": r.id,
"name": r.name,
"phone_number": r.phone_number,
"steps": len(r.steps) if r.steps else 0,
"avg_hold_time": r.avg_hold_time,
"times_used": r.times_used,
}
for r in rows
]
return json.dumps(flows, default=str, indent=2)
except Exception as e:
return json.dumps({"error": str(e)})
@mcp.resource("gateway://active-calls")
async def resource_active_calls() -> str:
"""All currently active calls."""
calls = gateway.call_manager.active_calls
return json.dumps(
[c.summary() for c in calls.values()],
default=str,
indent=2,
)
return mcp

1
models/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Pydantic models — call flows, calls, contacts, devices, events."""

169
models/call.py Normal file
View File

@@ -0,0 +1,169 @@
"""
Call models — Active call state, requests, and responses.
"""
from datetime import datetime
from enum import Enum
from typing import Optional
from pydantic import BaseModel, Field
class CallStatus(str, Enum):
"""Call lifecycle states."""
INITIATING = "initiating"
RINGING = "ringing"
CONNECTED = "connected"
NAVIGATING_IVR = "navigating_ivr"
ON_HOLD = "on_hold"
HUMAN_DETECTED = "human_detected"
TRANSFERRING = "transferring"
BRIDGED = "bridged" # User is connected to the remote party
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
class CallMode(str, Enum):
"""How the call should be handled."""
DIRECT = "direct" # Call and connect immediately
HOLD_SLAYER = "hold_slayer" # Navigate IVR, wait on hold, transfer when human
AI_ASSISTED = "ai_assisted" # Connect with transcription, recording, noise cancel
class AudioClassification(str, Enum):
"""What kind of audio is currently playing."""
SILENCE = "silence"
MUSIC = "music" # Hold music
IVR_PROMPT = "ivr_prompt" # Automated voice (TTS/recording)
LIVE_HUMAN = "live_human" # Real person talking
RINGING = "ringing" # Ring-back tone
DTMF = "dtmf" # Touch tones
UNKNOWN = "unknown"
class ClassificationResult(BaseModel):
"""A single audio classification at a point in time."""
timestamp: float # Unix timestamp
audio_type: AudioClassification
confidence: float # 0.0 - 1.0
details: Optional[dict] = None # Extra analysis data
class ActiveCall(BaseModel):
"""In-memory state for an active call."""
id: str
direction: str = "outbound"
remote_number: str
status: CallStatus = CallStatus.INITIATING
mode: CallMode = CallMode.DIRECT
intent: Optional[str] = None
call_flow_id: Optional[str] = None
device: Optional[str] = None
started_at: datetime = Field(default_factory=datetime.now)
connected_at: Optional[datetime] = None
hold_started_at: Optional[datetime] = None
current_classification: AudioClassification = AudioClassification.UNKNOWN
classification_history: list[ClassificationResult] = Field(default_factory=list)
transcript_chunks: list[str] = Field(default_factory=list)
current_step_id: Optional[str] = None # Current position in call flow
services: list[str] = Field(default_factory=list) # Active services on this call
@property
def duration(self) -> int:
"""Total call duration in seconds."""
if self.connected_at:
return int((datetime.now() - self.connected_at).total_seconds())
return 0
@property
def hold_time(self) -> int:
"""Time spent on hold in seconds."""
if self.hold_started_at and self.status == CallStatus.ON_HOLD:
return int((datetime.now() - self.hold_started_at).total_seconds())
return 0
@property
def transcript(self) -> str:
"""Full transcript so far."""
return "\n".join(self.transcript_chunks)
def summary(self) -> dict:
"""Compact summary for list views."""
return {
"call_id": self.id,
"remote_number": self.remote_number,
"status": self.status.value,
"mode": self.mode.value,
"duration": self.duration,
"hold_time": self.hold_time,
"audio_type": self.current_classification.value,
"intent": self.intent,
}
# ============================================================
# API Request/Response Models
# ============================================================
class CallRequest(BaseModel):
"""Request to place an outbound call."""
number: str # E.164 format
mode: CallMode = CallMode.DIRECT
intent: Optional[str] = None # What you need (for hold_slayer IVR navigation)
device: Optional[str] = None # Target device to ring / transfer to
call_flow_id: Optional[str] = None # Use a stored IVR tree
services: list[str] = Field(
default_factory=lambda: ["recording", "transcription"]
)
class HoldSlayerRequest(BaseModel):
"""Request to launch the Hold Slayer."""
number: str # E.164 format
intent: str # "dispute a charge on my December statement"
call_flow_id: Optional[str] = None # Optional: use stored IVR tree
transfer_to: Optional[str] = None # Device to ring when human detected
notify: list[str] = Field(default_factory=lambda: ["push"]) # Notification channels
class CallResponse(BaseModel):
"""Response after initiating a call."""
call_id: str
status: str
number: str
mode: str
message: Optional[str] = None
class CallStatusResponse(BaseModel):
"""Full status of an active or completed call."""
call_id: str
status: str
direction: str
remote_number: str
mode: str
duration: int
hold_time: int
audio_type: str
intent: Optional[str] = None
transcript_excerpt: Optional[str] = None # Last N chars
classification_history: list[ClassificationResult] = Field(default_factory=list)
current_step: Optional[str] = None
services: list[str] = Field(default_factory=list)
class TransferRequest(BaseModel):
"""Request to transfer a call to a device."""
device: str # Device ID or type

108
models/call_flow.py Normal file
View File

@@ -0,0 +1,108 @@
"""
Call Flow models — IVR navigation trees.
Store known IVR structures for phone numbers you call regularly.
The Hold Slayer follows the map instead of exploring blind.
"""
from datetime import datetime
from enum import Enum
from typing import Optional
from pydantic import BaseModel, Field
class ActionType(str, Enum):
"""Actions the Hold Slayer can take at each IVR step."""
DTMF = "dtmf" # Press a button
SPEAK = "speak" # Say something (for speech-recognition IVRs)
WAIT = "wait" # Wait for prompt
LISTEN = "listen" # Listen and let LLM decide
HOLD = "hold" # On hold — activate hold detection
TRANSFER = "transfer" # Transfer to user's device
class CallFlowStep(BaseModel):
"""A single step in an IVR navigation tree."""
id: str
description: str # Human-readable: "Main menu"
expect: Optional[str] = None # What we expect to hear (regex or keywords)
action: ActionType
action_value: Optional[str] = None # DTMF digit(s), speech text, device target
timeout: int = 30 # Seconds to wait before retry/fallback
next_step: Optional[str] = None # Next step ID on success
fallback_step: Optional[str] = None # Step ID if unexpected response
notes: Optional[str] = None # "They changed this menu in Jan 2025"
class CallFlow(BaseModel):
"""A complete IVR navigation tree for a phone number."""
id: str
name: str # "Chase Bank - Main Line"
phone_number: str # "+18005551234"
description: str = ""
last_verified: Optional[datetime] = None
steps: list[CallFlowStep]
tags: list[str] = Field(default_factory=list)
notes: Optional[str] = None
# Stats from previous runs
avg_hold_time: Optional[int] = None # seconds
success_rate: Optional[float] = None # 0.0 - 1.0
last_used: Optional[datetime] = None
times_used: int = 0
def get_step(self, step_id: str) -> Optional[CallFlowStep]:
"""Look up a step by ID."""
for step in self.steps:
if step.id == step_id:
return step
return None
def first_step(self) -> Optional[CallFlowStep]:
"""Get the first step in the flow."""
return self.steps[0] if self.steps else None
def steps_by_id(self) -> dict[str, CallFlowStep]:
"""Return a dict mapping step ID -> step for fast lookups."""
return {s.id: s for s in self.steps}
class CallFlowCreate(BaseModel):
"""Request model for creating a new call flow."""
name: str
phone_number: str
description: str = ""
steps: list[CallFlowStep]
tags: list[str] = Field(default_factory=list)
notes: Optional[str] = None
class CallFlowUpdate(BaseModel):
"""Request model for updating an existing call flow."""
name: Optional[str] = None
description: Optional[str] = None
steps: Optional[list[CallFlowStep]] = None
tags: Optional[list[str]] = None
notes: Optional[str] = None
last_verified: Optional[datetime] = None
class CallFlowSummary(BaseModel):
"""Lightweight summary for list views."""
id: str
name: str
phone_number: str
description: str = ""
step_count: int
avg_hold_time: Optional[int] = None
success_rate: Optional[float] = None
last_used: Optional[datetime] = None
times_used: int = 0
tags: list[str] = Field(default_factory=list)

60
models/contact.py Normal file
View File

@@ -0,0 +1,60 @@
"""
Contact models — People and organizations you call.
"""
from datetime import datetime
from typing import Optional
from pydantic import BaseModel, Field
class PhoneNumber(BaseModel):
"""A phone number associated with a contact."""
number: str # E.164 format
label: str = "main" # main, mobile, work, home, fax, etc.
primary: bool = False
class ContactBase(BaseModel):
"""Shared contact fields."""
name: str
phone_numbers: list[PhoneNumber]
category: Optional[str] = None # personal / business / service
routing_preference: Optional[str] = None # how to handle their calls
notes: Optional[str] = None
class Contact(ContactBase):
"""Full contact model."""
id: str
call_count: int = 0
last_call: Optional[datetime] = None
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
@property
def primary_number(self) -> Optional[str]:
"""Get the primary phone number."""
for pn in self.phone_numbers:
if pn.primary:
return pn.number
return self.phone_numbers[0].number if self.phone_numbers else None
class ContactCreate(ContactBase):
"""Request model for creating a contact."""
pass
class ContactUpdate(BaseModel):
"""Request model for updating a contact."""
name: Optional[str] = None
phone_numbers: Optional[list[PhoneNumber]] = None
category: Optional[str] = None
routing_preference: Optional[str] = None
notes: Optional[str] = None

81
models/device.py Normal file
View File

@@ -0,0 +1,81 @@
"""
Device models — SIP phones, softphones, cell phones.
Devices register with the gateway and can receive transferred calls.
"""
from datetime import datetime
from enum import Enum
from typing import Optional
from pydantic import BaseModel, Field
class DeviceType(str, Enum):
"""Types of devices that can connect to the gateway."""
SIP_PHONE = "sip_phone" # Hardware SIP phone
SOFTPHONE = "softphone" # Software SIP client
CELL = "cell" # Cell phone (reached via PSTN trunk)
TABLET = "tablet" # Tablet with SIP client
WEBRTC = "webrtc" # Browser-based WebRTC client
class DeviceBase(BaseModel):
"""Shared device fields."""
name: str # "Office SIP Phone"
type: DeviceType
extension: Optional[int] = None # 221-299, auto-assigned if omitted
sip_uri: Optional[str] = None # sip:robert@gateway.helu.ca
phone_number: Optional[str] = None # For PSTN devices (E.164)
priority: int = 10 # Routing priority (lower = higher priority)
capabilities: list[str] = Field(default_factory=lambda: ["voice"])
class Device(DeviceBase):
"""Full device model."""
id: str
is_online: bool = False
last_seen: Optional[datetime] = None
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
@property
def can_receive_call(self) -> bool:
"""Can this device receive a call right now?"""
if self.type in (DeviceType.SIP_PHONE, DeviceType.SOFTPHONE, DeviceType.WEBRTC):
return self.is_online and self.sip_uri is not None
if self.type == DeviceType.CELL:
return self.phone_number is not None
return False
class DeviceCreate(DeviceBase):
"""Request model for registering a new device."""
pass
class DeviceUpdate(BaseModel):
"""Request model for updating a device."""
name: Optional[str] = None
type: Optional[DeviceType] = None
extension: Optional[int] = None
sip_uri: Optional[str] = None
phone_number: Optional[str] = None
priority: Optional[int] = None
capabilities: Optional[list[str]] = None
class DeviceStatus(BaseModel):
"""Lightweight device status for list views."""
id: str
name: str
type: DeviceType
is_online: bool
last_seen: Optional[datetime] = None
can_receive_call: bool

69
models/events.py Normal file
View File

@@ -0,0 +1,69 @@
"""
Event models — Real-time events published via WebSocket and event bus.
These events drive the dashboard, notifications, and MCP updates.
"""
from datetime import datetime
from enum import Enum
from typing import Any, Optional
from pydantic import BaseModel, Field
class EventType(str, Enum):
"""Types of events the gateway can emit."""
# Call lifecycle
CALL_INITIATED = "call.initiated"
CALL_RINGING = "call.ringing"
CALL_CONNECTED = "call.connected"
CALL_ENDED = "call.ended"
CALL_FAILED = "call.failed"
# Hold Slayer
IVR_STEP = "holdslayer.ivr_step"
IVR_DTMF_SENT = "holdslayer.dtmf_sent"
HOLD_DETECTED = "holdslayer.hold_detected"
HUMAN_DETECTED = "holdslayer.human_detected"
TRANSFER_STARTED = "holdslayer.transfer_started"
TRANSFER_COMPLETE = "holdslayer.transfer_complete"
# Audio
AUDIO_CLASSIFIED = "audio.classified"
TRANSCRIPT_CHUNK = "audio.transcript_chunk"
# Device
DEVICE_REGISTERED = "device.registered"
DEVICE_ONLINE = "device.online"
DEVICE_OFFLINE = "device.offline"
# System
GATEWAY_STARTED = "system.gateway_started"
GATEWAY_STOPPING = "system.gateway_stopping"
ERROR = "system.error"
# SIP Trunk
SIP_TRUNK_REGISTERED = "sip.trunk.registered"
SIP_TRUNK_REGISTRATION_FAILED = "sip.trunk.registration_failed"
SIP_TRUNK_UNREGISTERED = "sip.trunk.unregistered"
class GatewayEvent(BaseModel):
"""A real-time event from the gateway."""
type: EventType
call_id: Optional[str] = None
timestamp: datetime = Field(default_factory=datetime.now)
data: dict[str, Any] = Field(default_factory=dict)
message: Optional[str] = None # Human-readable description
def to_ws_message(self) -> dict:
"""Serialize for WebSocket transmission."""
return {
"type": self.type.value,
"call_id": self.call_id,
"timestamp": self.timestamp.isoformat(),
"data": self.data,
"message": self.message,
}

71
pyproject.toml Normal file
View File

@@ -0,0 +1,71 @@
[project]
name = "hold-slayer"
version = "0.1.0"
description = "AI PSTN Gateway - Hold Slayer: Navigate IVRs, wait on hold, connect you when a human answers"
readme = "README.md"
requires-python = ">=3.12"
license = "MIT"
authors = [
{name = "Robert"},
]
dependencies = [
# Web framework
"fastapi>=0.115.0",
"uvicorn[standard]>=0.32.0",
"websockets>=13.0",
# Database
"sqlalchemy[asyncio]>=2.0.36",
"asyncpg>=0.30.0",
"alembic>=1.14.0",
# Settings & validation
"pydantic>=2.10.0",
"pydantic-settings>=2.6.0",
# SIP signaling
"sippy>=1.2.0",
# Audio analysis
"numpy>=1.26.0",
"librosa>=0.10.0",
"soundfile>=0.12.0",
# HTTP client (for Speaches STT)
"httpx>=0.28.0",
# MCP server
"fastmcp>=2.0.0",
# Utilities
"python-slugify>=8.0.0",
"python-multipart>=0.0.12",
]
[project.optional-dependencies]
dev = [
"pytest>=8.0.0",
"pytest-asyncio>=0.24.0",
"pytest-cov>=6.0.0",
"httpx>=0.28.0",
"ruff>=0.8.0",
]
[tool.setuptools.packages.find]
include = ["api*", "core*", "db*", "models*", "services*", "mcp_server*"]
[build-system]
requires = ["setuptools>=75.0", "wheel"]
build-backend = "setuptools.build_meta"
[tool.pytest.ini_options]
asyncio_mode = "auto"
testpaths = ["tests"]
[tool.ruff]
target-version = "py312"
line-length = 100
[tool.ruff.lint]
select = ["E", "F", "I", "N", "W", "UP"]

1
services/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""AI services — hold detection, transcription, classification, and more."""

View File

@@ -0,0 +1,444 @@
"""
Audio Classifier — Spectral analysis for hold music, speech, and silence detection.
This is the brain of the Hold Slayer. It analyzes audio in real-time to determine:
- Is this hold music?
- Is this an IVR prompt (automated voice)?
- Is this a live human?
- Is this silence?
- Is this a ring-back tone?
Uses spectral analysis (librosa/numpy) to classify audio without needing
a trained ML model — just signal processing and heuristics.
"""
import logging
import time
from typing import Optional
import numpy as np
from config import ClassifierSettings
from models.call import AudioClassification, ClassificationResult
logger = logging.getLogger(__name__)
# Audio constants
SAMPLE_RATE = 16000 # 16kHz mono
FRAME_SIZE = SAMPLE_RATE * 2 # 16-bit samples = 2 bytes per sample
class AudioClassifier:
"""
Real-time audio classifier using spectral analysis.
Classification strategy:
- Silence: Low RMS energy
- Music: High spectral flatness + sustained tonal content + rhythm
- IVR prompt: Speech-like spectral envelope but repetitive/synthetic
- Live human: Speech-like spectral envelope + natural variation
- Ringing: Very tonal, specific frequencies (~440Hz, ~480Hz for NA ring)
- DTMF: Dual-tone detection at known DTMF frequencies
"""
def __init__(self, settings: ClassifierSettings):
self.settings = settings
self._window_buffer: list[bytes] = []
self._window_samples = int(settings.window_seconds * SAMPLE_RATE)
self._classification_history: list[AudioClassification] = []
def classify_chunk(self, audio_data: bytes) -> ClassificationResult:
"""
Classify a chunk of audio data.
Args:
audio_data: Raw PCM audio (16-bit signed, 16kHz, mono)
Returns:
ClassificationResult with type and confidence
"""
# Convert bytes to numpy array
samples = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
if len(samples) == 0:
return ClassificationResult(
timestamp=time.time(),
audio_type=AudioClassification.SILENCE,
confidence=1.0,
)
# Normalize to [-1.0, 1.0]
samples = samples / 32768.0
# Run all detectors
rms = self._compute_rms(samples)
spectral_flatness = self._compute_spectral_flatness(samples)
zcr = self._compute_zero_crossing_rate(samples)
dominant_freq = self._compute_dominant_frequency(samples)
spectral_centroid = self._compute_spectral_centroid(samples)
is_tonal = self._detect_tonality(samples)
# Build feature dict for debugging
features = {
"rms": float(rms),
"spectral_flatness": float(spectral_flatness),
"zcr": float(zcr),
"dominant_freq": float(dominant_freq),
"spectral_centroid": float(spectral_centroid),
"is_tonal": is_tonal,
}
# === Classification Logic ===
# 1. Silence detection
if rms < 0.01:
return ClassificationResult(
timestamp=time.time(),
audio_type=AudioClassification.SILENCE,
confidence=min(1.0, (0.01 - rms) / 0.01 + 0.5),
details=features,
)
# 2. DTMF detection (very specific dual-tone pattern)
dtmf_result = self._detect_dtmf(samples)
if dtmf_result:
return ClassificationResult(
timestamp=time.time(),
audio_type=AudioClassification.DTMF,
confidence=0.95,
details={**features, "dtmf_digit": dtmf_result},
)
# 3. Ring-back tone detection (440+480Hz in NA, periodic on/off)
if is_tonal and 400 < dominant_freq < 520 and rms > 0.02:
return ClassificationResult(
timestamp=time.time(),
audio_type=AudioClassification.RINGING,
confidence=0.8,
details=features,
)
# 4. Music vs Speech discrimination
# Music: higher spectral flatness, more tonal, wider spectral spread
# Speech: lower spectral flatness, concentrated energy, variable ZCR
music_score = self._compute_music_score(
spectral_flatness, is_tonal, spectral_centroid, zcr, rms
)
speech_score = self._compute_speech_score(
spectral_flatness, zcr, spectral_centroid, rms
)
# 5. If it's speech-like, is it live or automated?
if speech_score > music_score:
# Use history to distinguish live human from IVR
# IVR: repetitive patterns, synthetic prosody
# Human: natural variation, conversational rhythm
if self._looks_like_live_human(speech_score, zcr, rms):
return ClassificationResult(
timestamp=time.time(),
audio_type=AudioClassification.LIVE_HUMAN,
confidence=speech_score,
details=features,
)
else:
return ClassificationResult(
timestamp=time.time(),
audio_type=AudioClassification.IVR_PROMPT,
confidence=speech_score * 0.8,
details=features,
)
# 6. Music (hold music)
if music_score >= self.settings.music_threshold:
return ClassificationResult(
timestamp=time.time(),
audio_type=AudioClassification.MUSIC,
confidence=music_score,
details=features,
)
# 7. Unknown / low confidence
return ClassificationResult(
timestamp=time.time(),
audio_type=AudioClassification.UNKNOWN,
confidence=max(music_score, speech_score),
details=features,
)
# ================================================================
# Feature Extraction
# ================================================================
@staticmethod
def _compute_rms(samples: np.ndarray) -> float:
"""Root Mean Square — overall energy level."""
return float(np.sqrt(np.mean(samples ** 2)))
@staticmethod
def _compute_spectral_flatness(samples: np.ndarray) -> float:
"""
Spectral flatness (Wiener entropy).
Close to 1.0 = noise-like (white noise)
Close to 0.0 = tonal (pure tone, music)
Speech is typically 0.1-0.4, music 0.05-0.3
"""
fft = np.abs(np.fft.rfft(samples))
fft = fft[fft > 0] # Avoid log(0)
if len(fft) == 0:
return 0.0
geometric_mean = np.exp(np.mean(np.log(fft + 1e-10)))
arithmetic_mean = np.mean(fft)
if arithmetic_mean == 0:
return 0.0
return float(geometric_mean / arithmetic_mean)
@staticmethod
def _compute_zero_crossing_rate(samples: np.ndarray) -> float:
"""
Zero-crossing rate — how often the signal crosses zero.
Higher for unvoiced speech and noise.
Lower for voiced speech and tonal music.
"""
crossings = np.sum(np.abs(np.diff(np.sign(samples)))) / 2
return float(crossings / len(samples))
@staticmethod
def _compute_dominant_frequency(samples: np.ndarray) -> float:
"""Find the dominant frequency in the signal."""
fft = np.abs(np.fft.rfft(samples))
freqs = np.fft.rfftfreq(len(samples), 1.0 / SAMPLE_RATE)
# Ignore DC and very low frequencies
mask = freqs > 50
if not np.any(mask):
return 0.0
fft_masked = fft[mask]
freqs_masked = freqs[mask]
return float(freqs_masked[np.argmax(fft_masked)])
@staticmethod
def _compute_spectral_centroid(samples: np.ndarray) -> float:
"""
Spectral centroid — "center of mass" of the spectrum.
Higher for bright/treble sounds, lower for bass-heavy sounds.
Speech typically 500-4000Hz, music varies widely.
"""
fft = np.abs(np.fft.rfft(samples))
freqs = np.fft.rfftfreq(len(samples), 1.0 / SAMPLE_RATE)
total_energy = np.sum(fft)
if total_energy == 0:
return 0.0
return float(np.sum(freqs * fft) / total_energy)
@staticmethod
def _detect_tonality(samples: np.ndarray) -> bool:
"""
Check if the signal is strongly tonal (has clear pitch).
Uses autocorrelation.
"""
# Autocorrelation
correlation = np.correlate(samples, samples, mode="full")
correlation = correlation[len(correlation) // 2:]
# Normalize
if correlation[0] == 0:
return False
correlation = correlation / correlation[0]
# Look for a strong peak (indicating periodicity)
# Skip the first ~50 samples (very high frequencies)
min_lag = int(SAMPLE_RATE / 1000) # ~16 samples (1000Hz max)
max_lag = int(SAMPLE_RATE / 50) # ~320 samples (50Hz min)
search_region = correlation[min_lag:max_lag]
if len(search_region) == 0:
return False
peak_value = np.max(search_region)
return bool(peak_value > 0.5)
def _detect_dtmf(self, samples: np.ndarray) -> Optional[str]:
"""
Detect DTMF tones using Goertzel algorithm (simplified).
DTMF frequencies:
697, 770, 852, 941 Hz (row)
1209, 1336, 1477, 1633 Hz (column)
"""
dtmf_freqs_low = [697, 770, 852, 941]
dtmf_freqs_high = [1209, 1336, 1477, 1633]
dtmf_map = {
(697, 1209): "1", (697, 1336): "2", (697, 1477): "3", (697, 1633): "A",
(770, 1209): "4", (770, 1336): "5", (770, 1477): "6", (770, 1633): "B",
(852, 1209): "7", (852, 1336): "8", (852, 1477): "9", (852, 1633): "C",
(941, 1209): "*", (941, 1336): "0", (941, 1477): "#", (941, 1633): "D",
}
# Compute power at each DTMF frequency
def goertzel_power(freq: int) -> float:
k = int(0.5 + len(samples) * freq / SAMPLE_RATE)
w = 2 * np.pi * k / len(samples)
coeff = 2 * np.cos(w)
s0, s1, s2 = 0.0, 0.0, 0.0
for sample in samples:
s0 = sample + coeff * s1 - s2
s2 = s1
s1 = s0
return float(s1 * s1 + s2 * s2 - coeff * s1 * s2)
# Find strongest low and high frequencies
low_powers = [(f, goertzel_power(f)) for f in dtmf_freqs_low]
high_powers = [(f, goertzel_power(f)) for f in dtmf_freqs_high]
best_low = max(low_powers, key=lambda x: x[1])
best_high = max(high_powers, key=lambda x: x[1])
# Threshold: both frequencies must be significantly present
total_power = np.sum(samples ** 2)
if total_power == 0:
return None
threshold = total_power * 0.1
if best_low[1] > threshold and best_high[1] > threshold:
key = (best_low[0], best_high[0])
return dtmf_map.get(key)
return None
# ================================================================
# Higher-Level Classification
# ================================================================
def _compute_music_score(
self,
spectral_flatness: float,
is_tonal: bool,
spectral_centroid: float,
zcr: float,
rms: float,
) -> float:
"""Compute a music likelihood score (0.0 - 1.0)."""
score = 0.0
# Music tends to be tonal
if is_tonal:
score += 0.3
# Music has moderate spectral flatness (more than pure tone, less than noise)
if 0.05 < spectral_flatness < 0.4:
score += 0.2
# Music has sustained energy
if rms > 0.03:
score += 0.15
# Music has wider spectral content than speech
if spectral_centroid > 1500:
score += 0.15
# Music tends to have lower ZCR than noise
if zcr < 0.15:
score += 0.2
return min(1.0, score)
def _compute_speech_score(
self,
spectral_flatness: float,
zcr: float,
spectral_centroid: float,
rms: float,
) -> float:
"""Compute a speech likelihood score (0.0 - 1.0)."""
score = 0.0
# Speech has moderate spectral flatness
if 0.1 < spectral_flatness < 0.5:
score += 0.25
# Speech centroid typically 500-4000 Hz
if 500 < spectral_centroid < 4000:
score += 0.25
# Speech has moderate ZCR
if 0.02 < zcr < 0.2:
score += 0.25
# Speech has moderate energy
if 0.01 < rms < 0.5:
score += 0.25
return min(1.0, score)
def _looks_like_live_human(
self,
speech_score: float,
zcr: float,
rms: float,
) -> bool:
"""
Distinguish live human from IVR/TTS.
Heuristics:
- IVR prompts are followed by silence (waiting for input)
- Live humans have more natural variation in energy and pitch
- After hold music → speech transition, it's likely a human
This is the hardest classification and benefits most from
the transcript context (Speaches STT).
"""
# Look at recent classification history
recent = self._classification_history[-10:] if self._classification_history else []
# Key signal: if we were just listening to hold music and now
# hear speech, it's very likely a live human agent
if recent:
recent_types = [c for c in recent]
if AudioClassification.MUSIC in recent_types[-5:]:
# Transition from music to speech = agent picked up!
return True
# High speech score with good energy = more likely human
if speech_score > 0.7 and rms > 0.05:
return True
# Default: assume IVR until proven otherwise
return False
def update_history(self, classification: AudioClassification) -> None:
"""Track classification history for pattern detection."""
self._classification_history.append(classification)
# Keep last 100 classifications
if len(self._classification_history) > 100:
self._classification_history = self._classification_history[-100:]
def detect_hold_to_human_transition(self) -> bool:
"""
Detect the critical moment: hold music → live human.
Looks for pattern: MUSIC, MUSIC, MUSIC, ..., SPEECH/LIVE_HUMAN
"""
recent = self._classification_history[-20:]
if len(recent) < 5:
return False
# Count recent music vs speech
music_count = sum(1 for c in recent[:-3] if c == AudioClassification.MUSIC)
speech_count = sum(
1 for c in recent[-3:]
if c in (AudioClassification.LIVE_HUMAN, AudioClassification.IVR_PROMPT)
)
# If we had a lot of music and now have speech, someone picked up
return music_count >= 3 and speech_count >= 2

324
services/call_analytics.py Normal file
View File

@@ -0,0 +1,324 @@
"""
Call Analytics Service — Tracks call metrics and generates insights.
Monitors call patterns, hold times, success rates, and IVR navigation
efficiency. Provides data for the dashboard and API.
"""
import logging
from collections import defaultdict
from datetime import datetime, timedelta
from typing import Any, Optional
from models.call import ActiveCall, AudioClassification, CallMode, CallStatus
logger = logging.getLogger(__name__)
class CallAnalytics:
"""
In-memory call analytics engine.
Tracks:
- Call success/failure rates
- Hold time statistics (avg, min, max, p95)
- IVR navigation efficiency
- Human detection accuracy
- Per-number/company patterns
- Time-of-day patterns
In production, this would be backed by TimescaleDB or similar.
For now, we keep rolling windows in memory.
"""
def __init__(self, max_history: int = 10000):
self._max_history = max_history
self._call_records: list[CallRecord] = []
self._company_stats: dict[str, CompanyStats] = defaultdict(CompanyStats)
# ================================================================
# Record Calls
# ================================================================
def record_call(self, call: ActiveCall) -> None:
"""
Record a completed call for analytics.
Called when a call ends (from CallManager).
"""
record = CallRecord(
call_id=call.id,
remote_number=call.remote_number,
mode=call.mode,
status=call.status,
intent=call.intent,
started_at=call.created_at,
duration_seconds=call.duration,
hold_time_seconds=call.hold_time,
classification_history=[
r.audio_type.value for r in call.classification_history
],
transcript_chunks=list(call.transcript_chunks),
services=list(call.services),
)
self._call_records.append(record)
# Trim history
if len(self._call_records) > self._max_history:
self._call_records = self._call_records[-self._max_history :]
# Update company stats
company_key = self._normalize_number(call.remote_number)
self._company_stats[company_key].update(record)
logger.debug(
f"📊 Recorded call {call.id}: "
f"{call.status.value}, {call.duration}s, hold={call.hold_time}s"
)
# ================================================================
# Aggregate Stats
# ================================================================
def get_summary(self, hours: int = 24) -> dict[str, Any]:
"""Get summary statistics for the last N hours."""
cutoff = datetime.now() - timedelta(hours=hours)
recent = [r for r in self._call_records if r.started_at >= cutoff]
if not recent:
return {
"period_hours": hours,
"total_calls": 0,
"success_rate": 0.0,
"avg_hold_time": 0.0,
"avg_duration": 0.0,
}
total = len(recent)
successful = sum(1 for r in recent if r.status in (
CallStatus.COMPLETED, CallStatus.BRIDGED, CallStatus.HUMAN_DETECTED
))
failed = sum(1 for r in recent if r.status == CallStatus.FAILED)
hold_times = [r.hold_time_seconds for r in recent if r.hold_time_seconds > 0]
durations = [r.duration_seconds for r in recent if r.duration_seconds > 0]
hold_slayer_calls = [r for r in recent if r.mode == CallMode.HOLD_SLAYER]
hold_slayer_success = sum(
1 for r in hold_slayer_calls
if r.status in (CallStatus.BRIDGED, CallStatus.HUMAN_DETECTED)
)
return {
"period_hours": hours,
"total_calls": total,
"successful": successful,
"failed": failed,
"success_rate": round(successful / total, 3) if total else 0.0,
"avg_duration": round(sum(durations) / len(durations), 1) if durations else 0.0,
"max_duration": max(durations) if durations else 0,
"hold_time": {
"avg": round(sum(hold_times) / len(hold_times), 1) if hold_times else 0.0,
"min": min(hold_times) if hold_times else 0,
"max": max(hold_times) if hold_times else 0,
"p95": self._percentile(hold_times, 95) if hold_times else 0,
"total": sum(hold_times),
},
"hold_slayer": {
"total": len(hold_slayer_calls),
"success": hold_slayer_success,
"success_rate": round(
hold_slayer_success / len(hold_slayer_calls), 3
) if hold_slayer_calls else 0.0,
},
"by_mode": self._group_by_mode(recent),
"by_hour": self._group_by_hour(recent),
}
def get_company_stats(self, number: str) -> dict[str, Any]:
"""Get stats for a specific company/number."""
key = self._normalize_number(number)
stats = self._company_stats.get(key)
if not stats:
return {"number": number, "total_calls": 0}
return stats.to_dict(number)
def get_top_numbers(self, limit: int = 10) -> list[dict[str, Any]]:
"""Get the most-called numbers with their stats."""
sorted_stats = sorted(
self._company_stats.items(),
key=lambda x: x[1].total_calls,
reverse=True,
)[:limit]
return [stats.to_dict(number) for number, stats in sorted_stats]
# ================================================================
# Hold Time Trends
# ================================================================
def get_hold_time_trend(
self,
number: Optional[str] = None,
days: int = 7,
) -> list[dict]:
"""
Get hold time trend data for graphing.
Returns daily average hold times for the last N days.
"""
cutoff = datetime.now() - timedelta(days=days)
records = [r for r in self._call_records if r.started_at >= cutoff]
if number:
key = self._normalize_number(number)
records = [r for r in records if self._normalize_number(r.remote_number) == key]
# Group by day
by_day: dict[str, list[int]] = defaultdict(list)
for r in records:
day = r.started_at.strftime("%Y-%m-%d")
if r.hold_time_seconds > 0:
by_day[day].append(r.hold_time_seconds)
trend = []
for i in range(days):
date = (datetime.now() - timedelta(days=days - 1 - i)).strftime("%Y-%m-%d")
times = by_day.get(date, [])
trend.append({
"date": date,
"avg_hold_time": round(sum(times) / len(times), 1) if times else 0,
"call_count": len(times),
"max_hold_time": max(times) if times else 0,
})
return trend
# ================================================================
# Helpers
# ================================================================
@staticmethod
def _normalize_number(number: str) -> str:
"""Normalize phone number for grouping."""
# Strip formatting, keep last 10 digits
digits = "".join(c for c in number if c.isdigit())
return digits[-10:] if len(digits) >= 10 else digits
@staticmethod
def _percentile(values: list, pct: int) -> float:
"""Calculate percentile value."""
if not values:
return 0.0
sorted_vals = sorted(values)
idx = int(len(sorted_vals) * pct / 100)
idx = min(idx, len(sorted_vals) - 1)
return float(sorted_vals[idx])
@staticmethod
def _group_by_mode(records: list["CallRecord"]) -> dict[str, int]:
"""Group call counts by mode."""
by_mode: dict[str, int] = defaultdict(int)
for r in records:
by_mode[r.mode.value] += 1
return dict(by_mode)
@staticmethod
def _group_by_hour(records: list["CallRecord"]) -> dict[int, int]:
"""Group call counts by hour of day."""
by_hour: dict[int, int] = defaultdict(int)
for r in records:
by_hour[r.started_at.hour] += 1
return dict(sorted(by_hour.items()))
@property
def total_calls_recorded(self) -> int:
return len(self._call_records)
# ================================================================
# Data Models
# ================================================================
class CallRecord:
"""A completed call record for analytics."""
def __init__(
self,
call_id: str,
remote_number: str,
mode: CallMode,
status: CallStatus,
intent: Optional[str] = None,
started_at: Optional[datetime] = None,
duration_seconds: int = 0,
hold_time_seconds: int = 0,
classification_history: Optional[list[str]] = None,
transcript_chunks: Optional[list[str]] = None,
services: Optional[list[str]] = None,
):
self.call_id = call_id
self.remote_number = remote_number
self.mode = mode
self.status = status
self.intent = intent
self.started_at = started_at or datetime.now()
self.duration_seconds = duration_seconds
self.hold_time_seconds = hold_time_seconds
self.classification_history = classification_history or []
self.transcript_chunks = transcript_chunks or []
self.services = services or []
class CompanyStats:
"""Aggregated stats for a specific company/phone number."""
def __init__(self):
self.total_calls = 0
self.successful_calls = 0
self.failed_calls = 0
self.total_hold_time = 0
self.hold_times: list[int] = []
self.total_duration = 0
self.last_called: Optional[datetime] = None
self.intents: dict[str, int] = defaultdict(int)
def update(self, record: CallRecord) -> None:
"""Update stats with a new call record."""
self.total_calls += 1
self.total_duration += record.duration_seconds
self.last_called = record.started_at
if record.status in (CallStatus.COMPLETED, CallStatus.BRIDGED, CallStatus.HUMAN_DETECTED):
self.successful_calls += 1
elif record.status == CallStatus.FAILED:
self.failed_calls += 1
if record.hold_time_seconds > 0:
self.total_hold_time += record.hold_time_seconds
self.hold_times.append(record.hold_time_seconds)
if record.intent:
self.intents[record.intent] += 1
def to_dict(self, number: str) -> dict[str, Any]:
return {
"number": number,
"total_calls": self.total_calls,
"successful_calls": self.successful_calls,
"failed_calls": self.failed_calls,
"success_rate": round(
self.successful_calls / self.total_calls, 3
) if self.total_calls else 0.0,
"avg_hold_time": round(
self.total_hold_time / len(self.hold_times), 1
) if self.hold_times else 0.0,
"max_hold_time": max(self.hold_times) if self.hold_times else 0,
"avg_duration": round(
self.total_duration / self.total_calls, 1
) if self.total_calls else 0.0,
"last_called": self.last_called.isoformat() if self.last_called else None,
"top_intents": dict(
sorted(self.intents.items(), key=lambda x: x[1], reverse=True)[:5]
),
}

View File

@@ -0,0 +1,339 @@
"""
Call Flow Learner — Builds and refines call flows from exploration data.
When Hold Slayer runs in exploration mode, it discovers IVR steps.
This service takes those discoveries and:
1. Builds a CallFlow tree that can be reused next time
2. Merges new discoveries into existing flows (refining them)
3. Uses LLM to label steps and infer menu structure
Over time, each phone number builds up a reliable call flow
that makes future calls faster and more accurate.
"""
import logging
import re
from datetime import datetime
from typing import Any, Optional
from models.call_flow import ActionType, CallFlow, CallFlowStep
logger = logging.getLogger(__name__)
class CallFlowLearner:
"""
Learns IVR call flows from exploration data.
Usage:
learner = CallFlowLearner(llm_client=llm)
# After an exploration call completes:
flow = await learner.build_flow(
phone_number="+18005551234",
discovered_steps=steps_from_exploration,
intent="cancel my card",
)
# Next time we call, merge new discoveries:
updated = await learner.merge_discoveries(
existing_flow=flow,
new_steps=new_discoveries,
)
"""
def __init__(self, llm_client=None):
self._llm = llm_client
# ================================================================
# Build Flow from Exploration
# ================================================================
async def build_flow(
self,
phone_number: str,
discovered_steps: list[dict],
intent: Optional[str] = None,
company_name: Optional[str] = None,
) -> CallFlow:
"""
Build a CallFlow from exploration discoveries.
Args:
phone_number: The number that was called.
discovered_steps: List of step dicts from exploration mode:
[{"timestamp": ..., "audio_type": "ivr_prompt",
"transcript": "Press 1 for...", "action_taken": {"dtmf": "1"}}, ...]
intent: What the caller was trying to accomplish.
company_name: Optional company name for labeling.
Returns:
A CallFlow that can be stored and reused.
"""
logger.info(
f"🧠 Building call flow from {len(discovered_steps)} discoveries "
f"for {phone_number}"
)
# Phase 1: Extract meaningful steps (skip silence, ringing)
meaningful = [
s for s in discovered_steps
if s.get("audio_type") in ("ivr_prompt", "live_human", "music")
or s.get("action_taken")
]
if not meaningful:
logger.warning(" No meaningful steps discovered")
return self._empty_flow(phone_number, company_name)
# Phase 2: Convert discoveries to CallFlowSteps
flow_steps = []
for i, step in enumerate(meaningful):
flow_step = self._discovery_to_step(step, i, meaningful)
if flow_step:
flow_steps.append(flow_step)
# Phase 3: Link steps together (next_step pointers)
for i, step in enumerate(flow_steps[:-1]):
step.next_step = flow_steps[i + 1].id
# Phase 4: Use LLM to enhance step labels if available
if self._llm and flow_steps:
flow_steps = await self._llm_enhance_steps(flow_steps, intent)
# Build the flow
name = company_name or self._guess_company_name(phone_number)
flow = CallFlow(
id=f"flow_{phone_number.replace('+', '')}_{datetime.now().strftime('%Y%m%d%H%M%S')}",
name=f"{name}{intent or 'General'}",
phone_number=phone_number,
description=f"Auto-learned flow for {name}. Intent: {intent or 'general'}",
steps=flow_steps,
tags=["auto-learned"],
notes=f"Learned from exploration on {datetime.now().isoformat()}",
times_used=1,
last_used=datetime.now(),
)
logger.info(
f" ✅ Built flow '{flow.name}' with {len(flow_steps)} steps"
)
return flow
def _discovery_to_step(
self,
discovery: dict,
index: int,
all_discoveries: list[dict],
) -> Optional[CallFlowStep]:
"""Convert a single exploration discovery to a CallFlowStep."""
audio_type = discovery.get("audio_type", "")
transcript = discovery.get("transcript", "")
action_taken = discovery.get("action_taken")
step_id = f"step_{index:03d}"
if audio_type == "ivr_prompt" and action_taken:
# IVR menu where we pressed a button
dtmf = action_taken.get("dtmf", "")
return CallFlowStep(
id=step_id,
description=self._summarize_menu(transcript) or f"IVR menu (pressed {dtmf})",
action=ActionType.DTMF,
action_value=dtmf,
expect=self._extract_expect_pattern(transcript),
timeout=15,
)
elif audio_type == "ivr_prompt" and not action_taken:
# IVR prompt we just listened to
return CallFlowStep(
id=step_id,
description=self._summarize_menu(transcript) or "IVR announcement",
action=ActionType.LISTEN,
timeout=30,
)
elif audio_type == "music":
# Hold music
return CallFlowStep(
id=step_id,
description="Hold music — waiting for agent",
action=ActionType.HOLD,
timeout=3600,
)
elif audio_type == "live_human":
# Human detected — this is the transfer point
return CallFlowStep(
id=step_id,
description="Live agent detected — transfer",
action=ActionType.TRANSFER,
action_value="preferred_device",
)
return None
# ================================================================
# Merge New Discoveries into Existing Flow
# ================================================================
async def merge_discoveries(
self,
existing_flow: CallFlow,
new_steps: list[dict],
intent: Optional[str] = None,
) -> CallFlow:
"""
Merge new exploration discoveries into an existing flow.
This refines the flow over time — updating timeouts,
confirming step order, adding alternative paths.
"""
logger.info(
f"🔄 Merging {len(new_steps)} new discoveries into "
f"flow '{existing_flow.name}'"
)
# Build a new flow from the discoveries
new_flow = await self.build_flow(
phone_number=existing_flow.phone_number,
discovered_steps=new_steps,
intent=intent,
)
# Simple merge strategy: keep existing steps but update timeouts
# and add any new steps that weren't in the original
existing_by_action = {
(s.action, s.action_value): s for s in existing_flow.steps
}
for new_step in new_flow.steps:
key = (new_step.action, new_step.action_value)
if key in existing_by_action:
# Update timeout to be the average
old_step = existing_by_action[key]
if old_step.timeout and new_step.timeout:
old_step.timeout = int(
(old_step.timeout + new_step.timeout) / 2
)
# New steps that don't exist are noted but not auto-added
# (to avoid corrupting a working flow)
# Update metadata
existing_flow.times_used = (existing_flow.times_used or 0) + 1
existing_flow.last_used = datetime.now()
logger.info(f" ✅ Merged. Flow now has {len(existing_flow.steps)} steps")
return existing_flow
# ================================================================
# LLM Enhancement
# ================================================================
async def _llm_enhance_steps(
self,
steps: list[CallFlowStep],
intent: Optional[str],
) -> list[CallFlowStep]:
"""Use LLM to improve step descriptions and structure."""
if not self._llm:
return steps
try:
# Build a summary of the steps for the LLM
step_descriptions = []
for s in steps:
desc = f"- {s.action.value}"
if s.action_value:
desc += f" ({s.action_value})"
if s.description:
desc += f": {s.description}"
step_descriptions.append(desc)
prompt = (
f"These are steps discovered while navigating a phone IVR system.\n"
f"Intent: {intent or 'general inquiry'}\n\n"
f"Steps:\n" + "\n".join(step_descriptions) + "\n\n"
f"For each step, provide a clear, concise description of what "
f"that step does. Return JSON array of objects with 'step_index' "
f"and 'description' fields."
)
result = await self._llm.chat_json(
prompt,
system="You are labeling IVR phone menu steps for a call flow database.",
)
# Apply LLM descriptions
if isinstance(result, list):
for item in result:
idx = item.get("step_index", -1)
desc = item.get("description", "")
if 0 <= idx < len(steps) and desc:
steps[idx].description = desc
elif isinstance(result, dict) and "steps" in result:
for item in result["steps"]:
idx = item.get("step_index", -1)
desc = item.get("description", "")
if 0 <= idx < len(steps) and desc:
steps[idx].description = desc
except Exception as e:
logger.warning(f" LLM enhancement failed (non-fatal): {e}")
return steps
# ================================================================
# Helpers
# ================================================================
@staticmethod
def _summarize_menu(transcript: str) -> Optional[str]:
"""Create a short summary of an IVR menu transcript."""
if not transcript:
return None
# Count how many options
options = re.findall(r'press\s+\d+', transcript.lower())
if options:
return f"IVR menu with {len(options)} options"
# Truncate long transcripts
if len(transcript) > 80:
return transcript[:77] + "..."
return transcript
@staticmethod
def _extract_expect_pattern(transcript: str) -> Optional[str]:
"""Extract a regex pattern to match this prompt next time."""
if not transcript:
return None
# Find the most distinctive phrase (>4 words, not generic)
words = transcript.split()
if len(words) >= 4:
# Use first meaningful phrase
phrase = " ".join(words[:6])
# Escape for regex
return re.escape(phrase.lower())
return None
@staticmethod
def _guess_company_name(phone_number: str) -> str:
"""Guess company name from phone number (placeholder)."""
# In production, this would do a reverse lookup
return f"Company {phone_number[-4:]}"
@staticmethod
def _empty_flow(phone_number: str, company_name: Optional[str]) -> CallFlow:
"""Create an empty flow placeholder."""
return CallFlow(
id=f"flow_{phone_number.replace('+', '')}_{datetime.now().strftime('%Y%m%d%H%M%S')}",
name=f"{company_name or phone_number} — Empty",
phone_number=phone_number,
description="Empty flow — no meaningful steps discovered",
steps=[],
tags=["auto-learned", "empty"],
)

717
services/hold_slayer.py Normal file
View File

@@ -0,0 +1,717 @@
"""
Hold Slayer Service — The main event.
Navigate IVR trees, wait on hold, detect when a human picks up,
and transfer you in. This is the state machine that orchestrates
the entire hold-slaying process.
Two modes:
1. run_with_flow(): Follow a stored call flow tree (fast, reliable)
2. run_exploration(): No stored flow — listen, transcribe, and figure it out
"""
import asyncio
import logging
import re
import time
from typing import Optional
from config import Settings
from core.call_manager import CallManager
from core.sip_engine import SIPEngine
from models.call import ActiveCall, AudioClassification, CallStatus, ClassificationResult
from models.call_flow import ActionType, CallFlow, CallFlowStep
from models.events import EventType, GatewayEvent
from services.audio_classifier import AudioClassifier
from services.transcription import TranscriptionService
logger = logging.getLogger(__name__)
# LLM client is optional — imported at use time
_llm_client = None
def _get_llm():
"""Lazy-load LLM client (optional dependency)."""
global _llm_client
if _llm_client is None:
try:
from config import get_settings
from services.llm_client import LLMClient
settings = get_settings()
_llm_client = LLMClient(
base_url=settings.llm.base_url,
model=settings.llm.model,
api_key=settings.llm.api_key,
timeout=settings.llm.timeout,
)
except Exception as e:
logger.debug(f"LLM client not available: {e}")
_llm_client = False # Sentinel: don't retry
return _llm_client if _llm_client is not False else None
class HoldSlayerService:
"""
The Hold Slayer.
Navigates IVR menus, waits on hold, detects live humans,
and transfers the call to your device.
"""
def __init__(
self,
gateway, # AIPSTNGateway (avoid circular import)
call_manager: CallManager,
sip_engine: SIPEngine,
classifier: AudioClassifier,
transcription: TranscriptionService,
settings: Settings,
):
self.gateway = gateway
self.call_manager = call_manager
self.sip_engine = sip_engine
self.classifier = classifier
self.transcription = transcription
self.settings = settings
async def run(
self,
call: ActiveCall,
sip_leg_id: str,
call_flow_id: Optional[str] = None,
) -> bool:
"""
Main entry point. Run the Hold Slayer on a call.
Args:
call: The active call to work on
sip_leg_id: SIP leg ID for the PSTN call
call_flow_id: Optional stored call flow to follow
Returns:
True if successfully transferred to user, False otherwise
"""
logger.info(f"🗡️ Hold Slayer activated for {call.remote_number}")
logger.info(f" Intent: {call.intent}")
logger.info(f" Call Flow: {call_flow_id or 'exploration mode'}")
try:
# Wait for call to be connected
await self._wait_for_connection(call, timeout=60)
if call_flow_id:
# Load the stored call flow from the database
flow = await self._load_call_flow(call_flow_id)
if flow:
return await self.run_with_flow(call, sip_leg_id, flow)
else:
logger.warning(f"Call flow '{call_flow_id}' not found, switching to exploration")
# No flow or flow not found — explore
return await self.run_exploration(call, sip_leg_id)
except asyncio.CancelledError:
logger.info(f"Hold Slayer cancelled for {call.id}")
return False
except Exception as e:
logger.error(f"Hold Slayer error: {e}", exc_info=True)
await self.call_manager.update_status(call.id, CallStatus.FAILED)
return False
# ================================================================
# Mode 1: Follow a Stored Call Flow
# ================================================================
async def run_with_flow(
self,
call: ActiveCall,
sip_leg_id: str,
flow: CallFlow,
) -> bool:
"""
Navigate using a stored call flow tree.
Falls back to exploration for unknown steps.
"""
logger.info(f"📋 Following call flow: {flow.name}")
steps = flow.steps_by_id()
current_step_id = flow.steps[0].id if flow.steps else None
while current_step_id:
step = steps.get(current_step_id)
if not step:
logger.error(f"Step '{current_step_id}' not found in flow")
break
call.current_step_id = current_step_id
logger.info(f"📍 Step: {step.description}")
await self.call_manager.event_bus.publish(GatewayEvent(
type=EventType.IVR_STEP,
call_id=call.id,
data={"step_id": step.id, "description": step.description, "action": step.action.value},
message=f"📍 IVR Step: {step.description}",
))
# === Execute the step based on its action type ===
if step.action == ActionType.HOLD:
# HOLD MODE: Audio classifier takes over
await self.call_manager.update_status(call.id, CallStatus.ON_HOLD)
logger.info(f"⏳ On hold. Activating hold detection...")
human_detected = await self._wait_for_human(
call, sip_leg_id, timeout=step.timeout
)
if human_detected:
current_step_id = step.next_step
else:
logger.warning("⏰ Hold timeout reached!")
break
elif step.action == ActionType.DTMF:
# Wait for the expected prompt, then send DTMF
await self.call_manager.update_status(call.id, CallStatus.NAVIGATING_IVR)
if step.expect:
heard = await self._wait_for_prompt(
call, sip_leg_id, step.expect, step.timeout
)
if not heard and step.fallback_step:
logger.info(f"⚠️ Didn't hear expected prompt, falling back")
current_step_id = step.fallback_step
continue
# Send the DTMF digits
if step.action_value:
await self.sip_engine.send_dtmf(sip_leg_id, step.action_value)
logger.info(f"📱 Pressed: {step.action_value}")
await self.call_manager.event_bus.publish(GatewayEvent(
type=EventType.IVR_DTMF_SENT,
call_id=call.id,
data={"digits": step.action_value, "step": step.id},
message=f"📱 DTMF sent: {step.action_value}",
))
# Small delay after DTMF for the IVR to process
await asyncio.sleep(2.0)
current_step_id = step.next_step
elif step.action == ActionType.WAIT:
# Just wait for a prompt
if step.expect:
await self._wait_for_prompt(
call, sip_leg_id, step.expect, step.timeout
)
else:
await asyncio.sleep(step.timeout)
current_step_id = step.next_step
elif step.action == ActionType.LISTEN:
# Listen and decide — regex first, LLM fallback
await self.call_manager.update_status(call.id, CallStatus.NAVIGATING_IVR)
transcript = await self._listen_for_menu(
call, sip_leg_id, step.timeout
)
# Phase 1: Try regex-based keyword matching (fast, no API call)
decision = self._decide_menu_option(
transcript, call.intent or "", step.expect
)
# Phase 2: LLM fallback if regex couldn't decide
if not decision and transcript:
llm = _get_llm()
if llm:
try:
logger.info("🤖 Regex inconclusive, asking LLM...")
llm_result = await llm.analyze_ivr_menu(
transcript=transcript,
intent=call.intent or "",
previous_selections=list(call.dtmf_history) if hasattr(call, 'dtmf_history') else None,
)
decision = llm_result.get("digit")
if decision:
confidence = llm_result.get("confidence", 0)
reason = llm_result.get("reason", "")
logger.info(
f"🤖 LLM decided: press {decision} "
f"(confidence={confidence}, reason='{reason}')"
)
except Exception as e:
logger.warning(f"🤖 LLM fallback failed: {e}")
if decision:
await self.sip_engine.send_dtmf(sip_leg_id, decision)
logger.info(f"🧠 Decided: press {decision} (heard: '{transcript[:60]}...')")
else:
# Default: press 0 for agent
await self.sip_engine.send_dtmf(sip_leg_id, "0")
logger.info(f"🧠 No clear match, pressing 0 for agent")
await asyncio.sleep(2.0)
current_step_id = step.next_step
elif step.action == ActionType.SPEAK:
# Say something into the call (TTS)
# TODO: Implement TTS integration
logger.info(f"🗣️ Would say: '{step.action_value}' (TTS not yet implemented)")
await asyncio.sleep(3.0)
current_step_id = step.next_step
elif step.action == ActionType.TRANSFER:
# We did it! Transfer to user's device
await self.call_manager.update_status(call.id, CallStatus.HUMAN_DETECTED)
logger.info(f"🚨 TRANSFERRING TO {step.action_value}")
device_target = step.action_value or call.device or self.settings.hold_slayer.default_transfer_device
await self.gateway.transfer_call(call.id, device_target)
return True
else:
logger.warning(f"Unknown action type: {step.action}")
current_step_id = step.next_step
return False
# ================================================================
# Mode 2: Exploration (No Stored Flow)
# ================================================================
async def run_exploration(
self,
call: ActiveCall,
sip_leg_id: str,
) -> bool:
"""
No stored flow — explore the IVR blind.
Records what it discovers so we can build a flow for next time.
"""
logger.info(f"🔍 Exploration mode: discovering IVR for {call.remote_number}")
await self.call_manager.update_status(call.id, CallStatus.NAVIGATING_IVR)
discovered_steps: list[dict] = []
max_time = self.settings.hold_slayer.max_hold_time
start_time = time.time()
while time.time() - start_time < max_time:
# Check if call is still active
current_call = self.call_manager.get_call(call.id)
if not current_call or current_call.status in (
CallStatus.COMPLETED, CallStatus.FAILED, CallStatus.CANCELLED
):
break
# Get audio and classify
audio_chunk = b""
try:
async for chunk in self.sip_engine.get_audio_stream(sip_leg_id):
audio_chunk += chunk
if len(audio_chunk) >= 16000 * 2 * 3: # 3 seconds
break
except Exception as e:
logger.error(f"Audio stream error: {e}")
await asyncio.sleep(1.0)
continue
if not audio_chunk:
await asyncio.sleep(1.0)
continue
# Classify the audio
classification = self.classifier.classify_chunk(audio_chunk)
self.classifier.update_history(classification.audio_type)
await self.call_manager.add_classification(call.id, classification)
# Transcribe if it sounds like speech
transcript = ""
if classification.audio_type in (
AudioClassification.IVR_PROMPT,
AudioClassification.LIVE_HUMAN,
):
transcript = await self.transcription.transcribe(
audio_chunk,
prompt="Phone IVR menu, customer service, press 1 for..."
)
if transcript:
await self.call_manager.add_transcript(call.id, transcript)
# Record discovery
discovered_steps.append({
"timestamp": time.time(),
"audio_type": classification.audio_type.value,
"confidence": classification.confidence,
"transcript": transcript,
"action_taken": None,
})
# === Decision Logic ===
if classification.audio_type == AudioClassification.LIVE_HUMAN:
# HUMAN DETECTED! Transfer!
logger.info("🚨 LIVE HUMAN DETECTED!")
await self.call_manager.update_status(call.id, CallStatus.HUMAN_DETECTED)
device = call.device or self.settings.hold_slayer.default_transfer_device
await self.gateway.transfer_call(call.id, device)
logger.info(f"📋 Discovered {len(discovered_steps)} IVR steps")
return True
elif classification.audio_type == AudioClassification.MUSIC:
# On hold — just keep monitoring
if current_call.status != CallStatus.ON_HOLD:
await self.call_manager.update_status(call.id, CallStatus.ON_HOLD)
# Check for hold→human transition
if self.classifier.detect_hold_to_human_transition():
logger.info("🚨 Hold-to-human transition detected!")
await self.call_manager.update_status(call.id, CallStatus.HUMAN_DETECTED)
device = call.device or self.settings.hold_slayer.default_transfer_device
await self.gateway.transfer_call(call.id, device)
return True
elif classification.audio_type == AudioClassification.IVR_PROMPT and transcript:
# IVR menu — try to navigate
decision = self._decide_menu_option(
transcript, call.intent or "", None
)
if decision:
await self.sip_engine.send_dtmf(sip_leg_id, decision)
discovered_steps[-1]["action_taken"] = {"dtmf": decision}
logger.info(f"🧠 Exploration: pressed {decision}")
await asyncio.sleep(2.0)
else:
# Try pressing 0 for agent
await self.sip_engine.send_dtmf(sip_leg_id, "0")
discovered_steps[-1]["action_taken"] = {"dtmf": "0", "reason": "default_agent"}
logger.info("🧠 Exploration: pressed 0 (trying for agent)")
await asyncio.sleep(2.0)
elif classification.audio_type == AudioClassification.SILENCE:
# Silence — wait a bit
await asyncio.sleep(2.0)
elif classification.audio_type == AudioClassification.RINGING:
# Still ringing
await asyncio.sleep(1.0)
logger.warning(f"Hold Slayer timed out after {max_time}s")
return False
# ================================================================
# Core Detection Methods
# ================================================================
async def _wait_for_human(
self,
call: ActiveCall,
sip_leg_id: str,
timeout: int = 7200,
) -> bool:
"""
Wait on hold until a live human is detected.
Continuously classifies audio and watches for the
music → speech transition.
"""
check_interval = self.settings.hold_slayer.hold_check_interval
start_time = time.time()
while time.time() - start_time < timeout:
# Check if call is still active
current_call = self.call_manager.get_call(call.id)
if not current_call or current_call.status in (
CallStatus.COMPLETED, CallStatus.FAILED, CallStatus.CANCELLED
):
return False
# Get audio chunk
audio_chunk = b""
try:
async for chunk in self.sip_engine.get_audio_stream(sip_leg_id):
audio_chunk += chunk
if len(audio_chunk) >= int(16000 * 2 * check_interval):
break
except Exception:
await asyncio.sleep(check_interval)
continue
if not audio_chunk:
await asyncio.sleep(check_interval)
continue
# Classify
result = self.classifier.classify_chunk(audio_chunk)
self.classifier.update_history(result.audio_type)
await self.call_manager.add_classification(call.id, result)
# Check for human
if result.audio_type == AudioClassification.LIVE_HUMAN:
# Verify with transcription
transcript = await self.transcription.transcribe(audio_chunk)
if transcript:
await self.call_manager.add_transcript(call.id, transcript)
# If we got meaningful speech, it's probably a real person
if len(transcript.split()) >= 3:
logger.info(f"🚨 Human confirmed! Said: '{transcript[:100]}'")
return True
# Check for the music→speech transition pattern
if self.classifier.detect_hold_to_human_transition():
logger.info("🚨 Hold-to-human transition detected!")
return True
# Log progress periodically
elapsed = int(time.time() - start_time)
if elapsed > 0 and elapsed % 60 == 0:
logger.info(
f"⏳ Still on hold... {elapsed}s "
f"(audio: {result.audio_type.value}, {result.confidence:.0%})"
)
return False
async def _wait_for_prompt(
self,
call: ActiveCall,
sip_leg_id: str,
expected_pattern: str,
timeout: int = 30,
) -> bool:
"""
Wait for an expected IVR prompt.
Listens, transcribes, and checks if the transcript matches
the expected pattern (regex or keywords).
"""
start_time = time.time()
while time.time() - start_time < timeout:
audio_chunk = b""
try:
async for chunk in self.sip_engine.get_audio_stream(sip_leg_id):
audio_chunk += chunk
if len(audio_chunk) >= 16000 * 2 * 3: # 3 seconds
break
except Exception:
await asyncio.sleep(1.0)
continue
if not audio_chunk:
await asyncio.sleep(1.0)
continue
# Classify first
result = self.classifier.classify_chunk(audio_chunk)
if result.audio_type not in (
AudioClassification.IVR_PROMPT,
AudioClassification.LIVE_HUMAN,
):
continue
# Transcribe
transcript = await self.transcription.transcribe(audio_chunk)
if not transcript:
continue
await self.call_manager.add_transcript(call.id, transcript)
# Check if it matches expected pattern
try:
if re.search(expected_pattern, transcript, re.IGNORECASE):
logger.info(f"✅ Heard expected: '{transcript[:80]}'")
return True
except re.error:
# Treat as keyword search if regex is invalid
if expected_pattern.lower() in transcript.lower():
logger.info(f"✅ Heard expected: '{transcript[:80]}'")
return True
logger.warning(f"⚠️ Didn't hear expected prompt within {timeout}s")
return False
async def _listen_for_menu(
self,
call: ActiveCall,
sip_leg_id: str,
timeout: int = 30,
) -> str:
"""Listen for an IVR menu and return the full transcript."""
transcript_parts: list[str] = []
start_time = time.time()
while time.time() - start_time < timeout:
audio_chunk = b""
try:
async for chunk in self.sip_engine.get_audio_stream(sip_leg_id):
audio_chunk += chunk
if len(audio_chunk) >= 16000 * 2 * 5: # 5 seconds
break
except Exception:
await asyncio.sleep(1.0)
continue
if not audio_chunk:
break
result = self.classifier.classify_chunk(audio_chunk)
# If we're getting silence after speech, the menu prompt is done
if result.audio_type == AudioClassification.SILENCE and transcript_parts:
break
if result.audio_type in (
AudioClassification.IVR_PROMPT,
AudioClassification.LIVE_HUMAN,
):
text = await self.transcription.transcribe(audio_chunk)
if text:
transcript_parts.append(text)
full_transcript = " ".join(transcript_parts)
if full_transcript:
await self.call_manager.add_transcript(call.id, full_transcript)
return full_transcript
async def _wait_for_connection(self, call: ActiveCall, timeout: int = 60) -> None:
"""Wait for the call to be connected (answered)."""
start = time.time()
while time.time() - start < timeout:
current = self.call_manager.get_call(call.id)
if not current:
raise RuntimeError(f"Call {call.id} disappeared")
if current.status in (CallStatus.CONNECTED, CallStatus.NAVIGATING_IVR):
return
if current.status in (CallStatus.FAILED, CallStatus.CANCELLED):
raise RuntimeError(f"Call {call.id} failed: {current.status}")
await asyncio.sleep(0.5)
raise TimeoutError(f"Call {call.id} not connected within {timeout}s")
# ================================================================
# Menu Navigation Logic
# ================================================================
def _decide_menu_option(
self,
transcript: str,
intent: str,
expected_options: Optional[str],
) -> Optional[str]:
"""
Decide which menu option to select based on transcript and intent.
Simple keyword-based matching. This is where an LLM integration
would massively improve navigation accuracy.
Returns:
DTMF digit(s) to press, or None if can't decide
"""
transcript_lower = transcript.lower()
intent_lower = intent.lower()
# Common IVR patterns: "press 1 for X, press 2 for Y"
# Extract options
options = re.findall(
r'(?:press|dial|say)\s+(\d+)\s+(?:for|to)\s+(.+?)(?:\.|,|press|dial|$)',
transcript_lower,
)
if not options:
# Try alternate patterns: "for X, press 1"
options = re.findall(
r'for\s+(.+?),?\s*(?:press|dial)\s+(\d+)',
transcript_lower,
)
# Swap order to be (digit, description)
options = [(digit, desc) for desc, digit in options]
if not options:
return None
# Score each option against the intent
best_match = None
best_score = 0
# Keywords that map intents to IVR options
intent_keywords = {
"cancel": ["cancel", "close", "end", "terminate"],
"dispute": ["dispute", "charge", "billing", "transaction", "statement"],
"balance": ["balance", "account", "summary"],
"agent": ["agent", "representative", "operator", "speak", "person", "human"],
"payment": ["payment", "pay", "bill"],
"card": ["card", "credit", "debit"],
"fraud": ["fraud", "unauthorized", "stolen", "lost"],
"transfer": ["transfer", "move", "send"],
}
for digit, description in options:
score = 0
# Direct keyword match in description
for keyword_group, keywords in intent_keywords.items():
if any(kw in intent_lower for kw in keywords):
if any(kw in description for kw in keywords):
score += 10
# Fuzzy: any word overlap between intent and description
intent_words = set(intent_lower.split())
desc_words = set(description.split())
overlap = intent_words & desc_words
score += len(overlap) * 3
# "Speak to agent" is usually what we want if nothing else matches
if any(w in description for w in ["agent", "representative", "operator", "person"]):
score += 5
if score > best_score:
best_score = score
best_match = digit
if best_match and best_score >= 3:
return best_match
# Default: look for "agent" or "representative" option
for digit, description in options:
if any(w in description for w in ["agent", "representative", "operator"]):
return digit
return None
async def _load_call_flow(self, flow_id: str) -> Optional[CallFlow]:
"""Load a stored call flow from the database."""
from db.database import get_session_factory, StoredCallFlow
from sqlalchemy import select
try:
factory = get_session_factory()
async with factory() as session:
result = await session.execute(
select(StoredCallFlow).where(StoredCallFlow.id == flow_id)
)
row = result.scalar_one_or_none()
if row:
from models.call_flow import CallFlowStep
return CallFlow(
id=row.id,
name=row.name,
phone_number=row.phone_number,
description=row.description or "",
steps=[CallFlowStep(**s) for s in row.steps],
tags=row.tags or [],
notes=row.notes,
avg_hold_time=row.avg_hold_time,
success_rate=row.success_rate,
last_used=row.last_used,
times_used=row.times_used or 0,
)
except Exception as e:
logger.error(f"Failed to load call flow '{flow_id}': {e}")
return None

391
services/llm_client.py Normal file
View File

@@ -0,0 +1,391 @@
"""
LLM Client — Unified interface for LLM-powered decision making.
Used by Hold Slayer (IVR navigation fallback), Call Flow Learner,
Receptionist, and Smart Routing services.
Supports OpenAI-compatible APIs (OpenAI, Ollama, LM Studio, etc.)
via httpx async client. No SDK dependency — just HTTP.
"""
import json
import logging
import time
from typing import Any, Optional
import httpx
from config import get_settings
logger = logging.getLogger(__name__)
class LLMClient:
"""
Async LLM client for OpenAI-compatible chat completion APIs.
Works with:
- OpenAI API (api.openai.com)
- Ollama (localhost:11434)
- LM Studio (localhost:1234)
- Any OpenAI-compatible endpoint
Usage:
client = LLMClient(base_url="http://localhost:11434/v1", model="llama3")
response = await client.chat("What is 2+2?")
# or structured:
result = await client.chat_json(
"Extract the menu options from this IVR transcript...",
system="You are a phone menu parser.",
)
"""
def __init__(
self,
base_url: str = "http://localhost:11434/v1",
model: str = "llama3",
api_key: str = "not-needed",
timeout: float = 30.0,
max_tokens: int = 1024,
temperature: float = 0.3,
):
self.base_url = base_url.rstrip("/")
self.model = model
self.api_key = api_key
self.timeout = timeout
self.max_tokens = max_tokens
self.temperature = temperature
self._client = httpx.AsyncClient(
base_url=self.base_url,
headers={
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
},
timeout=httpx.Timeout(timeout),
)
# Stats
self._total_requests = 0
self._total_tokens = 0
self._total_errors = 0
self._avg_latency_ms = 0.0
async def close(self):
"""Close the HTTP client."""
await self._client.aclose()
# ================================================================
# Core Chat Methods
# ================================================================
async def chat(
self,
user_message: str,
system: Optional[str] = None,
temperature: Optional[float] = None,
max_tokens: Optional[int] = None,
) -> str:
"""
Send a chat completion request and return the text response.
Args:
user_message: The user's message/prompt.
system: Optional system prompt.
temperature: Override default temperature.
max_tokens: Override default max tokens.
Returns:
The assistant's response text.
"""
messages = []
if system:
messages.append({"role": "system", "content": system})
messages.append({"role": "user", "content": user_message})
return await self._complete(
messages,
temperature=temperature or self.temperature,
max_tokens=max_tokens or self.max_tokens,
)
async def chat_json(
self,
user_message: str,
system: Optional[str] = None,
temperature: Optional[float] = None,
) -> dict[str, Any]:
"""
Chat completion that parses the response as JSON.
The system prompt is augmented to request JSON output.
Falls back to extracting JSON from markdown code blocks.
Returns:
Parsed JSON dict, or {"error": "..."} on parse failure.
"""
json_system = (system or "") + (
"\n\nIMPORTANT: Respond with valid JSON only. "
"No markdown, no explanation, just the JSON object."
)
response_text = await self.chat(
user_message,
system=json_system.strip(),
temperature=temperature or 0.1, # Lower temp for structured output
)
return self._parse_json_response(response_text)
async def chat_with_history(
self,
messages: list[dict[str, str]],
temperature: Optional[float] = None,
max_tokens: Optional[int] = None,
) -> str:
"""
Chat with full message history (multi-turn conversation).
Args:
messages: List of {"role": "system|user|assistant", "content": "..."}
Returns:
The assistant's response text.
"""
return await self._complete(
messages,
temperature=temperature or self.temperature,
max_tokens=max_tokens or self.max_tokens,
)
# ================================================================
# Hold Slayer Specific Methods
# ================================================================
async def analyze_ivr_menu(
self,
transcript: str,
intent: str,
previous_selections: Optional[list[str]] = None,
) -> dict[str, Any]:
"""
Analyze an IVR menu transcript and decide which option to press.
This is the LLM fallback when regex-based menu parsing fails.
Args:
transcript: The IVR audio transcript.
intent: What the user wants to accomplish.
previous_selections: DTMF digits already pressed in this call.
Returns:
{"digit": "3", "reason": "Option 3 is for card cancellation",
"confidence": 0.85}
"""
system = (
"You are an expert at navigating phone menus (IVR systems). "
"Given an IVR transcript and the caller's intent, determine "
"which menu option (DTMF digit) to press.\n\n"
"Rules:\n"
"- If there's a direct match for the intent, choose it.\n"
"- If no direct match, choose 'speak to representative' or 'agent' option.\n"
"- If menu says 'press 0 for operator', that's always a safe fallback.\n"
"- Return the single digit to press.\n"
"- If you truly can't determine the right option, return digit: null.\n"
)
context = f"IVR Transcript:\n{transcript}\n\n"
context += f"Caller's Intent: {intent}\n"
if previous_selections:
context += f"Already pressed: {', '.join(previous_selections)}\n"
context += "\nWhich digit should be pressed? Return JSON."
result = await self.chat_json(context, system=system)
# Normalize response
if "digit" not in result:
# Try to extract from various response formats
for key in ["option", "press", "choice", "dtmf"]:
if key in result:
result["digit"] = str(result[key])
break
return result
async def detect_human_speech(
self,
transcript: str,
context: str = "",
) -> dict[str, Any]:
"""
Analyze a transcript to determine if a human agent is speaking.
Used as a secondary check when audio classifier detects speech
but we need to distinguish between IVR prompts and a live human.
Returns:
{"is_human": true, "confidence": 0.9, "reason": "Agent greeting detected"}
"""
system = (
"You are analyzing a phone call transcript to determine if "
"a live human agent is speaking (vs an automated IVR system).\n\n"
"Human indicators:\n"
"- Personal greeting ('Hi, my name is...')\n"
"- Asking for account details\n"
"- Conversational tone, filler words\n"
"- Acknowledging hold time ('Thanks for waiting')\n"
"\nIVR indicators:\n"
"- 'Press N for...', 'Say...'\n"
"- Robotic phrasing\n"
"- Menu options\n"
"- 'Your call is important to us'\n"
)
prompt = f"Transcript:\n{transcript}\n"
if context:
prompt += f"\nContext: {context}\n"
prompt += "\nIs this a live human agent? Return JSON."
return await self.chat_json(prompt, system=system)
async def summarize_call(
self,
transcript_chunks: list[str],
intent: str,
duration_seconds: int,
) -> dict[str, Any]:
"""
Generate a call summary from transcript chunks.
Used for call history and analytics.
Returns:
{"summary": "...", "outcome": "resolved|unresolved|transferred",
"key_info": [...], "sentiment": "positive|neutral|negative"}
"""
system = (
"Summarize this phone call concisely. Include:\n"
"- What the caller wanted\n"
"- What happened (IVR navigation, hold time, agent interaction)\n"
"- The outcome\n"
"Return as JSON with: summary, outcome, key_info (list), sentiment."
)
full_transcript = "\n".join(transcript_chunks)
prompt = (
f"Caller's intent: {intent}\n"
f"Call duration: {duration_seconds} seconds\n\n"
f"Full transcript:\n{full_transcript}\n\n"
"Summarize this call."
)
return await self.chat_json(prompt, system=system)
# ================================================================
# Internal
# ================================================================
async def _complete(
self,
messages: list[dict[str, str]],
temperature: float = 0.3,
max_tokens: int = 1024,
) -> str:
"""Execute a chat completion request."""
self._total_requests += 1
start = time.monotonic()
try:
payload = {
"model": self.model,
"messages": messages,
"temperature": temperature,
"max_tokens": max_tokens,
}
response = await self._client.post("/chat/completions", json=payload)
response.raise_for_status()
data = response.json()
# Track token usage
if "usage" in data:
self._total_tokens += data["usage"].get("total_tokens", 0)
# Track latency
elapsed_ms = (time.monotonic() - start) * 1000
self._avg_latency_ms = (
self._avg_latency_ms * 0.9 + elapsed_ms * 0.1
)
# Extract response text
choices = data.get("choices", [])
if choices:
return choices[0].get("message", {}).get("content", "")
return ""
except httpx.HTTPStatusError as e:
self._total_errors += 1
logger.error(f"LLM API error: {e.response.status_code} {e.response.text[:200]}")
return ""
except httpx.TimeoutException:
self._total_errors += 1
logger.error(f"LLM API timeout after {self.timeout}s")
return ""
except Exception as e:
self._total_errors += 1
logger.error(f"LLM client error: {e}")
return ""
@staticmethod
def _parse_json_response(text: str) -> dict[str, Any]:
"""Parse JSON from LLM response, handling common formatting issues."""
text = text.strip()
# Try direct parse
try:
return json.loads(text)
except json.JSONDecodeError:
pass
# Try extracting from markdown code block
if "```" in text:
# Find content between ```json and ``` or ``` and ```
parts = text.split("```")
for i, part in enumerate(parts):
if i % 2 == 1: # Odd indices are inside code blocks
# Remove optional language tag
content = part.strip()
if content.startswith("json"):
content = content[4:].strip()
try:
return json.loads(content)
except json.JSONDecodeError:
continue
# Try finding JSON object in the text
brace_start = text.find("{")
brace_end = text.rfind("}")
if brace_start != -1 and brace_end != -1:
try:
return json.loads(text[brace_start : brace_end + 1])
except json.JSONDecodeError:
pass
logger.warning(f"Failed to parse JSON from LLM response: {text[:200]}")
return {"error": "Failed to parse JSON response", "raw": text[:500]}
# ================================================================
# Stats
# ================================================================
@property
def stats(self) -> dict:
return {
"total_requests": self._total_requests,
"total_tokens": self._total_tokens,
"total_errors": self._total_errors,
"avg_latency_ms": round(self._avg_latency_ms, 1),
"model": self.model,
"base_url": self.base_url,
}

256
services/notification.py Normal file
View File

@@ -0,0 +1,256 @@
"""
Notification Service — Tell the user what's happening.
Sends notifications when:
- A human picks up (TRANSFER NOW!)
- Hold time estimates change
- Call fails or times out
- IVR navigation milestones
Supports multiple channels: WebSocket (always), SMS (optional),
push notifications (future).
"""
import asyncio
import logging
from datetime import datetime
from enum import Enum
from typing import Any, Optional
from pydantic import BaseModel
from config import Settings
from core.event_bus import EventBus
from models.events import EventType, GatewayEvent
logger = logging.getLogger(__name__)
class NotificationChannel(str, Enum):
"""Where to send notifications."""
WEBSOCKET = "websocket"
SMS = "sms"
PUSH = "push"
class NotificationPriority(str, Enum):
"""How urgently to deliver."""
LOW = "low" # Status updates, hold time estimates
NORMAL = "normal" # IVR navigation milestones
HIGH = "high" # Human detected, call failed
CRITICAL = "critical" # Transfer happening NOW
class Notification(BaseModel):
"""A notification to send to the user."""
channel: NotificationChannel
priority: NotificationPriority
title: str
message: str
call_id: Optional[str] = None
data: dict[str, Any] = {}
timestamp: datetime = datetime.now()
class NotificationService:
"""
Sends notifications to users about call events.
Listens to the EventBus and routes events to the
appropriate notification channels.
"""
def __init__(self, event_bus: EventBus, settings: Settings):
self._event_bus = event_bus
self._settings = settings
self._task: Optional[asyncio.Task] = None
self._sms_sender: Optional[Any] = None
# Track what we've already notified (avoid spam)
self._notified: dict[str, set[str]] = {} # call_id -> set of event types
async def start(self) -> None:
"""Start listening for events to notify on."""
self._task = asyncio.create_task(self._listen_loop())
logger.info("📢 Notification service started")
async def stop(self) -> None:
"""Stop the notification listener."""
if self._task:
self._task.cancel()
try:
await self._task
except asyncio.CancelledError:
pass
logger.info("📢 Notification service stopped")
async def _listen_loop(self) -> None:
"""Main event listener loop."""
subscription = self._event_bus.subscribe()
try:
async for event in subscription:
try:
await self._handle_event(event)
except Exception as e:
logger.error(f"Notification handler error: {e}", exc_info=True)
except asyncio.CancelledError:
pass
finally:
subscription.close()
async def _handle_event(self, event: GatewayEvent) -> None:
"""Route an event to the appropriate notification(s)."""
call_id = event.call_id or ""
# Initialize tracking for this call
if call_id and call_id not in self._notified:
self._notified[call_id] = set()
# Skip duplicate notifications
dedup_key = f"{event.type.value}:{event.data.get('step_id', '')}"
if call_id and dedup_key in self._notified.get(call_id, set()):
return
notification = self._event_to_notification(event)
if not notification:
return
# Mark as notified
if call_id:
self._notified[call_id].add(dedup_key)
# Send via all appropriate channels
await self._send(notification)
def _event_to_notification(self, event: GatewayEvent) -> Optional[Notification]:
"""Convert a gateway event to a notification (or None to skip)."""
if event.type == EventType.HUMAN_DETECTED:
return Notification(
channel=NotificationChannel.WEBSOCKET,
priority=NotificationPriority.CRITICAL,
title="🚨 Human Detected!",
message="A live person picked up — transferring you now!",
call_id=event.call_id,
data=event.data,
)
elif event.type == EventType.TRANSFER_STARTED:
return Notification(
channel=NotificationChannel.WEBSOCKET,
priority=NotificationPriority.CRITICAL,
title="📞 Call Transferred",
message="Your call has been connected to the agent. Pick up your phone!",
call_id=event.call_id,
data=event.data,
)
elif event.type == EventType.CALL_FAILED:
return Notification(
channel=NotificationChannel.WEBSOCKET,
priority=NotificationPriority.HIGH,
title="❌ Call Failed",
message=event.message or "The call couldn't be completed.",
call_id=event.call_id,
data=event.data,
)
elif event.type == EventType.HOLD_DETECTED:
return Notification(
channel=NotificationChannel.WEBSOCKET,
priority=NotificationPriority.NORMAL,
title="⏳ On Hold",
message="You're on hold. We'll notify you when someone picks up.",
call_id=event.call_id,
data=event.data,
)
elif event.type == EventType.IVR_STEP:
return Notification(
channel=NotificationChannel.WEBSOCKET,
priority=NotificationPriority.LOW,
title="📍 IVR Navigation",
message=event.message or "Navigating phone menu...",
call_id=event.call_id,
data=event.data,
)
elif event.type == EventType.IVR_DTMF_SENT:
return Notification(
channel=NotificationChannel.WEBSOCKET,
priority=NotificationPriority.LOW,
title="📱 Button Pressed",
message=event.message or f"Pressed {event.data.get('digits', '?')}",
call_id=event.call_id,
data=event.data,
)
elif event.type == EventType.CALL_ENDED:
# Clean up tracking
if event.call_id and event.call_id in self._notified:
del self._notified[event.call_id]
return Notification(
channel=NotificationChannel.WEBSOCKET,
priority=NotificationPriority.NORMAL,
title="📴 Call Ended",
message=event.message or "The call has ended.",
call_id=event.call_id,
data=event.data,
)
# Skip other event types (transcription, classification, etc.)
return None
async def _send(self, notification: Notification) -> None:
"""Send a notification via the appropriate channel."""
logger.info(
f"📢 [{notification.priority.value}] {notification.title}: "
f"{notification.message}"
)
# WebSocket notifications go through the event bus
# (the WebSocket handler in the API reads from EventBus directly)
# SMS for critical notifications
if (
notification.priority == NotificationPriority.CRITICAL
and self._settings.notify_sms_number
):
await self._send_sms(notification)
async def _send_sms(self, notification: Notification) -> None:
"""
Send an SMS notification.
Uses a simple HTTP-based SMS gateway. In production,
this would use Twilio, AWS SNS, or similar.
"""
phone = self._settings.notify_sms_number
if not phone:
return
try:
import httpx
# Generic webhook-based SMS (configure your provider)
# This is a placeholder — wire up your preferred SMS provider
logger.info(f"📱 SMS → {phone}: {notification.title}")
# Example: Twilio-style API
# async with httpx.AsyncClient() as client:
# await client.post(
# "https://api.twilio.com/2010-04-01/Accounts/.../Messages.json",
# data={
# "To": phone,
# "From": self._settings.sip_trunk.did,
# "Body": f"{notification.title}\n{notification.message}",
# },
# auth=(account_sid, auth_token),
# )
except Exception as e:
logger.error(f"SMS send failed: {e}")

230
services/recording.py Normal file
View File

@@ -0,0 +1,230 @@
"""
Recording Service — Call recording management.
Records calls to WAV files via the PJSUA2 media pipeline,
manages storage, and provides playback/download access.
"""
import asyncio
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import Optional
from config import get_settings
logger = logging.getLogger(__name__)
class RecordingService:
"""
Manages call recordings.
Features:
- Start/stop recording for any active call leg
- Dual-channel recording (separate caller/agent streams)
- Mixed recording (both parties in one file)
- WAV storage with organized directory structure
- Recording metadata tracking
"""
def __init__(
self,
storage_dir: str = "recordings",
max_recording_seconds: int = 7200, # 2 hours
sample_rate: int = 16000,
):
self._storage_dir = Path(storage_dir)
self._max_recording_seconds = max_recording_seconds
self._sample_rate = sample_rate
self._active_recordings: dict[str, RecordingSession] = {}
self._metadata: list[dict] = []
async def start(self) -> None:
"""Initialize the recording service."""
self._storage_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"🎙️ Recording service ready (storage: {self._storage_dir})")
# ================================================================
# Recording Lifecycle
# ================================================================
async def start_recording(
self,
call_id: str,
media_pipeline=None,
leg_ids: Optional[list[str]] = None,
dual_channel: bool = False,
) -> "RecordingSession":
"""
Start recording a call.
Args:
call_id: The call to record.
media_pipeline: MediaPipeline instance for PJSUA2 recording.
leg_ids: Specific SIP leg IDs to record. If None, records all legs.
dual_channel: If True, record each party to a separate channel.
Returns:
RecordingSession with file paths and metadata.
"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
date_dir = datetime.now().strftime("%Y-%m-%d")
recording_dir = self._storage_dir / date_dir
recording_dir.mkdir(parents=True, exist_ok=True)
if dual_channel:
filepath_caller = str(recording_dir / f"{call_id}_{timestamp}_caller.wav")
filepath_agent = str(recording_dir / f"{call_id}_{timestamp}_agent.wav")
filepath_mixed = str(recording_dir / f"{call_id}_{timestamp}_mixed.wav")
else:
filepath_caller = None
filepath_agent = None
filepath_mixed = str(recording_dir / f"{call_id}_{timestamp}.wav")
session = RecordingSession(
call_id=call_id,
filepath_mixed=filepath_mixed,
filepath_caller=filepath_caller,
filepath_agent=filepath_agent,
started_at=datetime.now(),
sample_rate=self._sample_rate,
)
# Start PJSUA2 recording if media pipeline is available
if media_pipeline and leg_ids:
for leg_id in leg_ids:
if filepath_mixed:
media_pipeline.start_recording(leg_id, filepath_mixed)
self._active_recordings[call_id] = session
logger.info(f"🔴 Recording started: {call_id}{filepath_mixed}")
# Safety timeout
asyncio.create_task(
self._recording_timeout(call_id),
name=f"rec_timeout_{call_id}",
)
return session
async def stop_recording(
self,
call_id: str,
media_pipeline=None,
) -> Optional["RecordingSession"]:
"""Stop recording a call and finalize the WAV file."""
session = self._active_recordings.pop(call_id, None)
if not session:
logger.warning(f" No active recording for {call_id}")
return None
session.stopped_at = datetime.now()
session.duration_seconds = int(
(session.stopped_at - session.started_at).total_seconds()
)
# Stop PJSUA2 recording
if media_pipeline:
# The pipeline handles flushing and closing the WAV file
for leg_id in (session._leg_ids or []):
media_pipeline.stop_recording(leg_id)
# Calculate file size
if session.filepath_mixed and os.path.exists(session.filepath_mixed):
session.file_size_bytes = os.path.getsize(session.filepath_mixed)
# Store metadata
self._metadata.append(session.to_dict())
logger.info(
f"⏹ Recording stopped: {call_id} "
f"({session.duration_seconds}s, "
f"{session.file_size_bytes or 0} bytes)"
)
return session
async def _recording_timeout(self, call_id: str) -> None:
"""Auto-stop recording after max duration."""
await asyncio.sleep(self._max_recording_seconds)
if call_id in self._active_recordings:
logger.warning(f" Recording timeout for {call_id}, auto-stopping")
await self.stop_recording(call_id)
# ================================================================
# Queries
# ================================================================
def get_recording(self, call_id: str) -> Optional[dict]:
"""Get recording metadata for a call."""
for meta in reversed(self._metadata):
if meta["call_id"] == call_id:
return meta
return None
def list_recordings(
self,
limit: int = 50,
offset: int = 0,
) -> list[dict]:
"""List recording metadata, newest first."""
sorted_meta = sorted(
self._metadata,
key=lambda m: m.get("started_at", ""),
reverse=True,
)
return sorted_meta[offset : offset + limit]
@property
def active_recording_count(self) -> int:
return len(self._active_recordings)
@property
def total_recordings(self) -> int:
return len(self._metadata)
def storage_usage_bytes(self) -> int:
"""Calculate total storage used by recordings."""
total = 0
for root, _dirs, files in os.walk(self._storage_dir):
for f in files:
total += os.path.getsize(os.path.join(root, f))
return total
class RecordingSession:
"""Tracks a single active recording session."""
def __init__(
self,
call_id: str,
filepath_mixed: Optional[str] = None,
filepath_caller: Optional[str] = None,
filepath_agent: Optional[str] = None,
started_at: Optional[datetime] = None,
sample_rate: int = 16000,
):
self.call_id = call_id
self.filepath_mixed = filepath_mixed
self.filepath_caller = filepath_caller
self.filepath_agent = filepath_agent
self.started_at = started_at or datetime.now()
self.stopped_at: Optional[datetime] = None
self.duration_seconds: Optional[int] = None
self.file_size_bytes: Optional[int] = None
self.sample_rate = sample_rate
self._leg_ids: list[str] = []
def to_dict(self) -> dict:
return {
"call_id": self.call_id,
"filepath_mixed": self.filepath_mixed,
"filepath_caller": self.filepath_caller,
"filepath_agent": self.filepath_agent,
"started_at": self.started_at.isoformat() if self.started_at else None,
"stopped_at": self.stopped_at.isoformat() if self.stopped_at else None,
"duration_seconds": self.duration_seconds,
"file_size_bytes": self.file_size_bytes,
"sample_rate": self.sample_rate,
}

161
services/transcription.py Normal file
View File

@@ -0,0 +1,161 @@
"""
Transcription Service — Speaches STT integration.
Sends audio to your Speaches instances for real-time speech-to-text.
Used by the Hold Slayer to understand IVR prompts and detect menu options.
"""
import io
import logging
from typing import Optional
import httpx
from config import SpeachesSettings
logger = logging.getLogger(__name__)
class TranscriptionService:
"""
Client for Speaches STT service.
Speaches exposes an OpenAI-compatible API:
POST /v1/audio/transcriptions
"""
def __init__(self, settings: SpeachesSettings):
self.settings = settings
self._client: Optional[httpx.AsyncClient] = None
async def _get_client(self) -> httpx.AsyncClient:
"""Get or create the HTTP client."""
if self._client is None or self._client.is_closed:
self._client = httpx.AsyncClient(
base_url=self.settings.url,
timeout=httpx.Timeout(30.0, connect=5.0),
)
return self._client
async def transcribe(
self,
audio_data: bytes,
language: str = "en",
prompt: Optional[str] = None,
) -> str:
"""
Transcribe audio data to text.
Args:
audio_data: Raw PCM audio (16-bit signed, 16kHz, mono)
language: Language code (default: "en")
prompt: Optional context hint for better accuracy
(e.g., "IVR menu options, phone banking")
Returns:
Transcribed text
"""
client = await self._get_client()
# Convert raw PCM to WAV format for the API
wav_data = self._pcm_to_wav(audio_data)
try:
response = await client.post(
"/v1/audio/transcriptions",
files={"file": ("audio.wav", wav_data, "audio/wav")},
data={
"model": self.settings.model,
"language": language,
"response_format": "text",
**({"prompt": prompt} if prompt else {}),
},
)
response.raise_for_status()
text = response.text.strip()
logger.debug(f"Transcription: '{text}'")
return text
except httpx.HTTPStatusError as e:
logger.error(f"Speaches API error: {e.response.status_code} {e.response.text}")
return ""
except httpx.ConnectError:
logger.error(f"Cannot connect to Speaches at {self.settings.url}")
return ""
except Exception as e:
logger.error(f"Transcription failed: {e}")
return ""
async def transcribe_stream(
self,
audio_data: bytes,
language: str = "en",
):
"""
Stream transcription — for real-time results.
Uses Speaches streaming endpoint if available,
falls back to chunked transcription.
Yields:
str: Partial transcription chunks
"""
# For now, do chunked transcription
# TODO: Implement WebSocket streaming when Speaches supports it
chunk_size = 16000 * 2 * 3 # 3 seconds of 16kHz 16-bit mono
for i in range(0, len(audio_data), chunk_size):
chunk = audio_data[i:i + chunk_size]
if len(chunk) > 0:
text = await self.transcribe(chunk, language)
if text:
yield text
async def close(self) -> None:
"""Close the HTTP client."""
if self._client and not self._client.is_closed:
await self._client.aclose()
self._client = None
@staticmethod
def _pcm_to_wav(pcm_data: bytes, sample_rate: int = 16000, channels: int = 1, sample_width: int = 2) -> bytes:
"""
Convert raw PCM data to WAV format.
Args:
pcm_data: Raw PCM audio bytes
sample_rate: Sample rate in Hz (default: 16000)
channels: Number of channels (default: 1 = mono)
sample_width: Bytes per sample (default: 2 = 16-bit)
Returns:
WAV file as bytes
"""
import struct
data_size = len(pcm_data)
file_size = 36 + data_size # Header is 44 bytes, minus 8 for RIFF header
wav = io.BytesIO()
# RIFF header
wav.write(b"RIFF")
wav.write(struct.pack("<I", file_size))
wav.write(b"WAVE")
# fmt chunk
wav.write(b"fmt ")
wav.write(struct.pack("<I", 16)) # Chunk size
wav.write(struct.pack("<H", 1)) # PCM format
wav.write(struct.pack("<H", channels))
wav.write(struct.pack("<I", sample_rate))
wav.write(struct.pack("<I", sample_rate * channels * sample_width)) # Byte rate
wav.write(struct.pack("<H", channels * sample_width)) # Block align
wav.write(struct.pack("<H", sample_width * 8)) # Bits per sample
# data chunk
wav.write(b"data")
wav.write(struct.pack("<I", data_size))
wav.write(pcm_data)
return wav.getvalue()

1
tests/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Hold Slayer tests."""

View File

@@ -0,0 +1,253 @@
"""
Tests for the audio classifier.
Tests spectral analysis, DTMF detection, and classification logic.
"""
import numpy as np
import pytest
from config import ClassifierSettings
from models.call import AudioClassification
from services.audio_classifier import AudioClassifier, SAMPLE_RATE
@pytest.fixture
def classifier():
"""Create a classifier with default settings."""
settings = ClassifierSettings()
return AudioClassifier(settings)
def generate_silence(duration_seconds: float = 1.0) -> bytes:
"""Generate silent audio (near-zero amplitude)."""
samples = int(SAMPLE_RATE * duration_seconds)
data = np.zeros(samples, dtype=np.int16)
return data.tobytes()
def generate_tone(frequency: float, duration_seconds: float = 1.0, amplitude: float = 0.5) -> bytes:
"""Generate a pure sine tone."""
samples = int(SAMPLE_RATE * duration_seconds)
t = np.linspace(0, duration_seconds, samples, endpoint=False)
signal = (amplitude * 32767 * np.sin(2 * np.pi * frequency * t)).astype(np.int16)
return signal.tobytes()
def generate_dtmf(digit: str, duration_seconds: float = 0.5) -> bytes:
"""Generate a DTMF tone for a digit."""
dtmf_freqs = {
"1": (697, 1209), "2": (697, 1336), "3": (697, 1477),
"4": (770, 1209), "5": (770, 1336), "6": (770, 1477),
"7": (852, 1209), "8": (852, 1336), "9": (852, 1477),
"*": (941, 1209), "0": (941, 1336), "#": (941, 1477),
}
low_freq, high_freq = dtmf_freqs[digit]
samples = int(SAMPLE_RATE * duration_seconds)
t = np.linspace(0, duration_seconds, samples, endpoint=False)
signal = 0.5 * (np.sin(2 * np.pi * low_freq * t) + np.sin(2 * np.pi * high_freq * t))
signal = (signal * 16383).astype(np.int16)
return signal.tobytes()
def generate_noise(duration_seconds: float = 1.0, amplitude: float = 0.3) -> bytes:
"""Generate white noise."""
samples = int(SAMPLE_RATE * duration_seconds)
noise = np.random.normal(0, amplitude * 32767, samples).astype(np.int16)
return noise.tobytes()
def generate_speech_like(duration_seconds: float = 1.0) -> bytes:
"""
Generate a rough approximation of speech.
Mix of formant-like frequencies with amplitude modulation.
"""
samples = int(SAMPLE_RATE * duration_seconds)
t = np.linspace(0, duration_seconds, samples, endpoint=False)
# Fundamental frequency (pitch) with vibrato
f0 = 150 + 10 * np.sin(2 * np.pi * 5 * t)
fundamental = np.sin(2 * np.pi * f0 * t)
# Formants (vowel-like)
f1 = np.sin(2 * np.pi * 730 * t) * 0.5
f2 = np.sin(2 * np.pi * 1090 * t) * 0.3
f3 = np.sin(2 * np.pi * 2440 * t) * 0.1
# Amplitude modulation (syllable-like rhythm)
envelope = 0.5 + 0.5 * np.sin(2 * np.pi * 3 * t)
signal = envelope * (fundamental + f1 + f2 + f3)
signal = (signal * 8000).astype(np.int16)
return signal.tobytes()
class TestSilenceDetection:
"""Test silence classification."""
def test_pure_silence(self, classifier):
result = classifier.classify_chunk(generate_silence())
assert result.audio_type == AudioClassification.SILENCE
assert result.confidence > 0.5
def test_very_quiet(self, classifier):
# Near-silent audio
quiet = generate_tone(440, amplitude=0.001)
result = classifier.classify_chunk(quiet)
assert result.audio_type == AudioClassification.SILENCE
def test_empty_audio(self, classifier):
result = classifier.classify_chunk(b"")
assert result.audio_type == AudioClassification.SILENCE
class TestToneDetection:
"""Test tonal audio classification."""
def test_440hz_ringback(self, classifier):
"""440Hz is North American ring-back tone frequency."""
tone = generate_tone(440, amplitude=0.3)
result = classifier.classify_chunk(tone)
# Should be detected as ringing (440Hz is in the ring-back range)
assert result.audio_type in (
AudioClassification.RINGING,
AudioClassification.MUSIC,
)
assert result.confidence > 0.5
def test_1000hz_tone(self, classifier):
"""1000Hz tone — not ring-back, should be music or unknown."""
tone = generate_tone(1000, amplitude=0.3)
result = classifier.classify_chunk(tone)
assert result.audio_type != AudioClassification.SILENCE
class TestDTMFDetection:
"""Test DTMF tone detection."""
def test_dtmf_digit_5(self, classifier):
dtmf = generate_dtmf("5", duration_seconds=0.5)
result = classifier.classify_chunk(dtmf)
# DTMF detection should catch this
if result.audio_type == AudioClassification.DTMF:
assert result.details.get("dtmf_digit") == "5"
def test_dtmf_digit_0(self, classifier):
dtmf = generate_dtmf("0", duration_seconds=0.5)
result = classifier.classify_chunk(dtmf)
if result.audio_type == AudioClassification.DTMF:
assert result.details.get("dtmf_digit") == "0"
class TestMusicDetection:
"""Test hold music detection."""
def test_complex_tone_as_music(self, classifier):
"""Multiple frequencies together = more music-like."""
samples = int(SAMPLE_RATE * 2)
t = np.linspace(0, 2, samples, endpoint=False)
# Chord: C major (C4 + E4 + G4)
signal = (
np.sin(2 * np.pi * 261.6 * t)
+ np.sin(2 * np.pi * 329.6 * t) * 0.8
+ np.sin(2 * np.pi * 392.0 * t) * 0.6
)
signal = (signal * 6000).astype(np.int16)
result = classifier.classify_chunk(signal.tobytes())
assert result.audio_type in (
AudioClassification.MUSIC,
AudioClassification.RINGING,
AudioClassification.UNKNOWN,
)
assert result.confidence > 0.3
class TestSpeechDetection:
"""Test speech-like audio classification."""
def test_speech_like_audio(self, classifier):
speech = generate_speech_like(2.0)
result = classifier.classify_chunk(speech)
assert result.audio_type in (
AudioClassification.IVR_PROMPT,
AudioClassification.LIVE_HUMAN,
AudioClassification.MUSIC, # Speech-like can be ambiguous
AudioClassification.UNKNOWN,
)
class TestClassificationHistory:
"""Test history-based transition detection."""
def test_hold_to_human_transition(self, classifier):
"""Detect the music → speech transition."""
# Simulate being on hold
for _ in range(10):
classifier.update_history(AudioClassification.MUSIC)
# Now speech appears
classifier.update_history(AudioClassification.LIVE_HUMAN)
classifier.update_history(AudioClassification.LIVE_HUMAN)
classifier.update_history(AudioClassification.LIVE_HUMAN)
assert classifier.detect_hold_to_human_transition()
def test_no_transition_during_ivr(self, classifier):
"""IVR prompt after silence is not a hold→human transition."""
for _ in range(5):
classifier.update_history(AudioClassification.SILENCE)
classifier.update_history(AudioClassification.IVR_PROMPT)
classifier.update_history(AudioClassification.IVR_PROMPT)
classifier.update_history(AudioClassification.IVR_PROMPT)
# No music in history, so no hold→human transition
assert not classifier.detect_hold_to_human_transition()
def test_not_enough_history(self, classifier):
"""Not enough data to detect transition."""
classifier.update_history(AudioClassification.MUSIC)
classifier.update_history(AudioClassification.LIVE_HUMAN)
assert not classifier.detect_hold_to_human_transition()
class TestFeatureExtraction:
"""Test individual feature extractors."""
def test_rms_silence(self, classifier):
samples = np.zeros(1000, dtype=np.float32)
rms = classifier._compute_rms(samples)
assert rms == 0.0
def test_rms_loud(self, classifier):
samples = np.ones(1000, dtype=np.float32) * 0.5
rms = classifier._compute_rms(samples)
assert rms == pytest.approx(0.5, abs=0.01)
def test_zcr_silence(self, classifier):
samples = np.zeros(1000, dtype=np.float32)
zcr = classifier._compute_zero_crossing_rate(samples)
assert zcr == 0.0
def test_zcr_high_freq(self, classifier):
"""High frequency signal should have high ZCR."""
t = np.linspace(0, 1, SAMPLE_RATE, endpoint=False)
samples = np.sin(2 * np.pi * 4000 * t).astype(np.float32)
zcr = classifier._compute_zero_crossing_rate(samples)
assert zcr > 0.1
def test_spectral_flatness_tone(self, classifier):
"""Pure tone should have low spectral flatness."""
t = np.linspace(0, 1, SAMPLE_RATE, endpoint=False)
samples = np.sin(2 * np.pi * 440 * t).astype(np.float32)
flatness = classifier._compute_spectral_flatness(samples)
assert flatness < 0.3
def test_dominant_frequency(self, classifier):
"""Should find the dominant frequency of a pure tone."""
t = np.linspace(0, 1, SAMPLE_RATE, endpoint=False)
samples = np.sin(2 * np.pi * 1000 * t).astype(np.float32)
freq = classifier._compute_dominant_frequency(samples)
assert abs(freq - 1000) < 50 # Within 50Hz

173
tests/test_call_flows.py Normal file
View File

@@ -0,0 +1,173 @@
"""
Tests for call flow models and serialization.
"""
import pytest
from models.call_flow import ActionType, CallFlow, CallFlowCreate, CallFlowStep, CallFlowSummary
class TestCallFlowStep:
"""Test CallFlowStep model."""
def test_basic_dtmf_step(self):
step = CallFlowStep(
id="press_1",
description="Press 1 for English",
action=ActionType.DTMF,
action_value="1",
expect="for english|para español",
next_step="main_menu",
)
assert step.id == "press_1"
assert step.action == ActionType.DTMF
assert step.action_value == "1"
assert step.timeout == 30 # default
def test_hold_step(self):
step = CallFlowStep(
id="hold_queue",
description="On hold waiting for agent",
action=ActionType.HOLD,
timeout=7200,
next_step="agent_connected",
notes="Average hold: 25-45 min. Plays Vivaldi. Kill me.",
)
assert step.action == ActionType.HOLD
assert step.timeout == 7200
assert "Vivaldi" in step.notes
def test_transfer_step(self):
step = CallFlowStep(
id="connected",
description="Agent picked up!",
action=ActionType.TRANSFER,
action_value="sip_phone",
)
assert step.action == ActionType.TRANSFER
class TestCallFlow:
"""Test CallFlow model."""
@pytest.fixture
def sample_flow(self):
return CallFlow(
id="test-bank",
name="Test Bank - Main Line",
phone_number="+18005551234",
description="Test bank IVR",
steps=[
CallFlowStep(
id="greeting",
description="Language selection",
action=ActionType.DTMF,
action_value="1",
expect="for english",
next_step="main_menu",
),
CallFlowStep(
id="main_menu",
description="Main menu",
action=ActionType.LISTEN,
next_step="agent_request",
fallback_step="agent_request",
),
CallFlowStep(
id="agent_request",
description="Request agent",
action=ActionType.DTMF,
action_value="0",
next_step="hold_queue",
),
CallFlowStep(
id="hold_queue",
description="Hold queue",
action=ActionType.HOLD,
timeout=3600,
next_step="agent_connected",
),
CallFlowStep(
id="agent_connected",
description="Agent connected",
action=ActionType.TRANSFER,
action_value="sip_phone",
),
],
tags=["bank", "personal"],
avg_hold_time=2100,
success_rate=0.92,
)
def test_step_count(self, sample_flow):
assert len(sample_flow.steps) == 5
def test_get_step(self, sample_flow):
step = sample_flow.get_step("hold_queue")
assert step is not None
assert step.action == ActionType.HOLD
assert step.timeout == 3600
def test_get_step_not_found(self, sample_flow):
assert sample_flow.get_step("nonexistent") is None
def test_first_step(self, sample_flow):
first = sample_flow.first_step()
assert first is not None
assert first.id == "greeting"
def test_steps_by_id(self, sample_flow):
steps = sample_flow.steps_by_id()
assert len(steps) == 5
assert "greeting" in steps
assert "agent_connected" in steps
assert steps["agent_connected"].action == ActionType.TRANSFER
def test_serialization_roundtrip(self, sample_flow):
"""Test JSON serialization and deserialization."""
json_str = sample_flow.model_dump_json()
restored = CallFlow.model_validate_json(json_str)
assert restored.id == sample_flow.id
assert len(restored.steps) == len(sample_flow.steps)
assert restored.steps[0].id == "greeting"
assert restored.avg_hold_time == 2100
class TestCallFlowCreate:
"""Test call flow creation model."""
def test_minimal_create(self):
create = CallFlowCreate(
name="My Bank",
phone_number="+18005551234",
steps=[
CallFlowStep(
id="start",
description="Start",
action=ActionType.HOLD,
next_step="end",
),
],
)
assert create.name == "My Bank"
assert len(create.steps) == 1
assert create.tags == []
assert create.notes is None
class TestCallFlowSummary:
"""Test lightweight summary model."""
def test_summary(self):
summary = CallFlowSummary(
id="chase-bank-main",
name="Chase Bank - Main",
phone_number="+18005551234",
step_count=6,
avg_hold_time=2100,
success_rate=0.92,
times_used=15,
tags=["bank"],
)
assert summary.step_count == 6
assert summary.success_rate == 0.92

265
tests/test_hold_slayer.py Normal file
View File

@@ -0,0 +1,265 @@
"""
Tests for the Hold Slayer service.
Uses MockSIPEngine to test the state machine without real SIP.
"""
import asyncio
import pytest
from config import Settings
from core.call_manager import CallManager
from core.event_bus import EventBus
from core.sip_engine import MockSIPEngine
from models.call import ActiveCall, AudioClassification, CallMode, CallStatus
from models.call_flow import ActionType, CallFlow, CallFlowStep
from services.hold_slayer import HoldSlayerService
class TestMenuNavigation:
"""Test the IVR menu navigation logic."""
@pytest.fixture
def hold_slayer(self):
"""Create a HoldSlayerService with mock dependencies."""
from config import ClassifierSettings, SpeachesSettings
from services.audio_classifier import AudioClassifier
from services.transcription import TranscriptionService
settings = Settings()
event_bus = EventBus()
call_manager = CallManager(event_bus)
sip_engine = MockSIPEngine()
classifier = AudioClassifier(ClassifierSettings())
transcription = TranscriptionService(SpeachesSettings())
return HoldSlayerService(
gateway=None, # Not needed for menu tests
call_manager=call_manager,
sip_engine=sip_engine,
classifier=classifier,
transcription=transcription,
settings=settings,
)
def test_decide_cancel_card(self, hold_slayer):
"""Should match 'cancel' intent to card cancellation option."""
transcript = (
"Press 1 for account balance, press 2 for recent transactions, "
"press 3 to report a lost or stolen card, press 4 to cancel your card, "
"press 0 to speak with a representative."
)
result = hold_slayer._decide_menu_option(
transcript, "cancel my credit card", None
)
assert result == "4"
def test_decide_dispute_charge(self, hold_slayer):
"""Should match 'dispute' intent to billing option."""
transcript = (
"Press 1 for account balance, press 2 for billing and disputes, "
"press 3 for payments, press 0 for agent."
)
result = hold_slayer._decide_menu_option(
transcript, "dispute a charge on my statement", None
)
assert result == "2"
def test_decide_agent_fallback(self, hold_slayer):
"""Should fall back to agent option when no match."""
transcript = (
"Press 1 for mortgage, press 2 for auto loans, "
"press 3 for investments, press 0 to speak with a representative."
)
result = hold_slayer._decide_menu_option(
transcript, "cancel my credit card", None
)
# Should choose representative since no direct match
assert result == "0"
def test_decide_no_options_found(self, hold_slayer):
"""Return None when transcript has no recognizable menu."""
transcript = "Please hold while we transfer your call."
result = hold_slayer._decide_menu_option(
transcript, "cancel my card", None
)
assert result is None
def test_decide_alternate_pattern(self, hold_slayer):
"""Handle 'for X, press N' pattern."""
transcript = (
"For account balance, press 1. For billing inquiries, press 2. "
"For card cancellation, press 3."
)
result = hold_slayer._decide_menu_option(
transcript, "cancel my card", None
)
# Should match card cancellation
assert result == "3"
def test_decide_fraud_intent(self, hold_slayer):
"""Match fraud-related intent."""
transcript = (
"Press 1 for balance, press 2 for payments, "
"press 3 to report fraud or unauthorized transactions, "
"press 0 for an agent."
)
result = hold_slayer._decide_menu_option(
transcript, "report unauthorized charge on my card", None
)
assert result == "3"
class TestEventBus:
"""Test the event bus pub/sub system."""
@pytest.fixture
def event_bus(self):
return EventBus()
def test_subscribe(self, event_bus):
sub = event_bus.subscribe()
assert event_bus.subscriber_count == 1
sub.close()
assert event_bus.subscriber_count == 0
@pytest.mark.asyncio
async def test_publish_receive(self, event_bus):
from models.events import EventType, GatewayEvent
sub = event_bus.subscribe()
event = GatewayEvent(
type=EventType.CALL_INITIATED,
call_id="test_123",
message="Test event",
)
await event_bus.publish(event)
received = await asyncio.wait_for(sub.__anext__(), timeout=1.0)
assert received.type == EventType.CALL_INITIATED
assert received.call_id == "test_123"
sub.close()
def test_history(self, event_bus):
assert len(event_bus.recent_events) == 0
class TestCallManager:
"""Test call manager state tracking."""
@pytest.fixture
def call_manager(self):
event_bus = EventBus()
return CallManager(event_bus)
@pytest.mark.asyncio
async def test_create_call(self, call_manager):
call = await call_manager.create_call(
remote_number="+18005551234",
mode=CallMode.HOLD_SLAYER,
intent="cancel my card",
)
assert call.id.startswith("call_")
assert call.remote_number == "+18005551234"
assert call.mode == CallMode.HOLD_SLAYER
assert call.intent == "cancel my card"
assert call.status == CallStatus.INITIATING
@pytest.mark.asyncio
async def test_update_status(self, call_manager):
call = await call_manager.create_call(
remote_number="+18005551234",
mode=CallMode.DIRECT,
)
await call_manager.update_status(call.id, CallStatus.RINGING)
updated = call_manager.get_call(call.id)
assert updated.status == CallStatus.RINGING
@pytest.mark.asyncio
async def test_end_call(self, call_manager):
call = await call_manager.create_call(
remote_number="+18005551234",
mode=CallMode.DIRECT,
)
ended = await call_manager.end_call(call.id)
assert ended is not None
assert ended.status == CallStatus.COMPLETED
assert call_manager.get_call(call.id) is None
@pytest.mark.asyncio
async def test_active_call_count(self, call_manager):
assert call_manager.active_call_count == 0
await call_manager.create_call("+18005551234", CallMode.DIRECT)
assert call_manager.active_call_count == 1
await call_manager.create_call("+18005559999", CallMode.HOLD_SLAYER)
assert call_manager.active_call_count == 2
@pytest.mark.asyncio
async def test_add_transcript(self, call_manager):
call = await call_manager.create_call("+18005551234", CallMode.HOLD_SLAYER)
await call_manager.add_transcript(call.id, "Press 1 for English")
await call_manager.add_transcript(call.id, "Press 2 for French")
updated = call_manager.get_call(call.id)
assert "Press 1 for English" in updated.transcript
assert "Press 2 for French" in updated.transcript
class TestMockSIPEngine:
"""Test the mock SIP engine."""
@pytest.fixture
def engine(self):
return MockSIPEngine()
@pytest.mark.asyncio
async def test_lifecycle(self, engine):
assert not await engine.is_ready()
await engine.start()
assert await engine.is_ready()
await engine.stop()
assert not await engine.is_ready()
@pytest.mark.asyncio
async def test_make_call(self, engine):
await engine.start()
leg_id = await engine.make_call("+18005551234")
assert leg_id.startswith("mock_leg_")
assert leg_id in engine._active_legs
@pytest.mark.asyncio
async def test_hangup(self, engine):
await engine.start()
leg_id = await engine.make_call("+18005551234")
await engine.hangup(leg_id)
assert leg_id not in engine._active_legs
@pytest.mark.asyncio
async def test_send_dtmf(self, engine):
await engine.start()
leg_id = await engine.make_call("+18005551234")
await engine.send_dtmf(leg_id, "1")
await engine.send_dtmf(leg_id, "0")
assert engine._active_legs[leg_id]["dtmf_sent"] == ["1", "0"]
@pytest.mark.asyncio
async def test_bridge(self, engine):
await engine.start()
leg_a = await engine.make_call("+18005551234")
leg_b = await engine.make_call("+18005559999")
bridge_id = await engine.bridge_calls(leg_a, leg_b)
assert bridge_id in engine._bridges
await engine.unbridge(bridge_id)
assert bridge_id not in engine._bridges
@pytest.mark.asyncio
async def test_trunk_status(self, engine):
status = await engine.get_trunk_status()
assert status["registered"] is False
await engine.start()
status = await engine.get_trunk_status()
assert status["registered"] is True

557
tests/test_services.py Normal file
View File

@@ -0,0 +1,557 @@
"""
Tests for the intelligence layer services:
- LLMClient
- NotificationService
- RecordingService
- CallAnalytics
- CallFlowLearner
"""
import asyncio
from datetime import datetime
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from config import Settings
from core.event_bus import EventBus
from models.events import EventType, GatewayEvent
# ============================================================
# LLM Client Tests
# ============================================================
class TestLLMClient:
"""Test the LLM client with mocked HTTP responses."""
def _make_client(self):
from services.llm_client import LLMClient
return LLMClient(
base_url="http://localhost:11434/v1",
model="llama3",
api_key="not-needed",
)
@pytest.mark.asyncio
async def test_init(self):
client = self._make_client()
assert client.model == "llama3"
assert client._total_requests == 0
assert client._total_errors == 0
@pytest.mark.asyncio
async def test_stats(self):
client = self._make_client()
stats = client.stats
assert stats["total_requests"] == 0
assert stats["total_errors"] == 0
assert stats["model"] == "llama3"
assert stats["avg_latency_ms"] == 0
@pytest.mark.asyncio
async def test_chat_request_format(self):
"""Verify the HTTP request is formatted correctly."""
client = self._make_client()
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.raise_for_status = MagicMock()
mock_response.json.return_value = {
"choices": [{"message": {"content": "Hello!"}}],
"usage": {"total_tokens": 10},
}
with patch.object(client._client, "post", new_callable=AsyncMock) as mock_post:
mock_post.return_value = mock_response
result = await client.chat("Say hello", system="Hi")
assert result == "Hello!"
assert client._total_requests == 1
# Verify the request body
call_args = mock_post.call_args
body = call_args[1]["json"]
assert body["model"] == "llama3"
assert len(body["messages"]) == 2
assert body["messages"][0]["role"] == "system"
assert body["messages"][1]["role"] == "user"
@pytest.mark.asyncio
async def test_chat_json_parsing(self):
"""Verify JSON response parsing works."""
client = self._make_client()
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.raise_for_status = MagicMock()
mock_response.json.return_value = {
"choices": [{"message": {"content": '{"action": "press_1", "confidence": 0.9}'}}],
"usage": {"total_tokens": 20},
}
with patch.object(client._client, "post", new_callable=AsyncMock) as mock_post:
mock_post.return_value = mock_response
result = await client.chat_json("Analyze menu", system="Press 1 for billing")
assert result is not None
assert result["action"] == "press_1"
assert result["confidence"] == 0.9
@pytest.mark.asyncio
async def test_chat_json_markdown_extraction(self):
"""Verify JSON extraction from markdown code blocks."""
client = self._make_client()
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.raise_for_status = MagicMock()
mock_response.json.return_value = {
"choices": [
{
"message": {
"content": 'Here is the result:\n```json\n{"key": "value"}\n```'
}
}
],
"usage": {"total_tokens": 15},
}
with patch.object(client._client, "post", new_callable=AsyncMock) as mock_post:
mock_post.return_value = mock_response
result = await client.chat_json("Parse this", system="test")
assert result is not None
assert result["key"] == "value"
@pytest.mark.asyncio
async def test_chat_http_error_returns_empty(self):
"""Verify HTTP errors return empty string gracefully."""
client = self._make_client()
with patch.object(client._client, "post", new_callable=AsyncMock) as mock_post:
mock_post.side_effect = Exception("Connection refused")
result = await client.chat("test", system="test")
assert result == ""
assert client._total_errors == 1
@pytest.mark.asyncio
async def test_analyze_ivr_menu(self):
"""Verify IVR menu analysis formats correctly."""
client = self._make_client()
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.raise_for_status = MagicMock()
mock_response.json.return_value = {
"choices": [
{
"message": {
"content": '{"action": "press_2", "digit": "2", "confidence": 0.85, "reason": "Option 2 is billing"}'
}
}
],
"usage": {"total_tokens": 30},
}
with patch.object(client._client, "post", new_callable=AsyncMock) as mock_post:
mock_post.return_value = mock_response
result = await client.analyze_ivr_menu(
transcript="Press 1 for sales, press 2 for billing",
intent="dispute a charge",
previous_selections=["1"],
)
assert result is not None
assert result["digit"] == "2"
# ============================================================
# Notification Service Tests
# ============================================================
class TestNotificationService:
"""Test notification routing and deduplication."""
def _make_service(self):
from services.notification import NotificationService
event_bus = EventBus()
settings = Settings()
svc = NotificationService(event_bus, settings)
return svc, event_bus
def test_init(self):
svc, _ = self._make_service()
assert svc._notified == {}
def test_event_to_notification_human_detected(self):
from services.notification import NotificationPriority
svc, _ = self._make_service()
event = GatewayEvent(
type=EventType.HUMAN_DETECTED,
call_id="call_123",
data={"confidence": 0.95},
message="Human detected!",
)
notification = svc._event_to_notification(event)
assert notification is not None
assert notification.priority == NotificationPriority.CRITICAL
assert "Human" in notification.title
def test_event_to_notification_hold_detected(self):
from services.notification import NotificationPriority
svc, _ = self._make_service()
event = GatewayEvent(
type=EventType.HOLD_DETECTED,
call_id="call_123",
data={},
message="On hold",
)
notification = svc._event_to_notification(event)
assert notification is not None
assert notification.priority == NotificationPriority.NORMAL
def test_event_to_notification_skip_transcript(self):
svc, _ = self._make_service()
event = GatewayEvent(
type=EventType.TRANSCRIPT_CHUNK,
call_id="call_123",
data={"text": "hello"},
)
notification = svc._event_to_notification(event)
assert notification is None # Transcripts don't generate notifications
def test_event_to_notification_call_ended_cleanup(self):
svc, _ = self._make_service()
# Simulate some tracking data
svc._notified["call_123"] = {"some_event"}
event = GatewayEvent(
type=EventType.CALL_ENDED,
call_id="call_123",
data={},
)
notification = svc._event_to_notification(event)
assert notification is not None
assert "call_123" not in svc._notified # Cleaned up
def test_event_to_notification_call_failed(self):
from services.notification import NotificationPriority
svc, _ = self._make_service()
event = GatewayEvent(
type=EventType.CALL_FAILED,
call_id="call_123",
data={},
message="Connection timed out",
)
notification = svc._event_to_notification(event)
assert notification is not None
assert notification.priority == NotificationPriority.HIGH
assert "Connection timed out" in notification.message
# ============================================================
# Recording Service Tests
# ============================================================
class TestRecordingService:
"""Test recording lifecycle."""
def _make_service(self):
from services.recording import RecordingService
return RecordingService(storage_dir="/tmp/test_recordings")
def test_init(self):
svc = self._make_service()
assert svc._active_recordings == {}
@pytest.mark.asyncio
async def test_recording_path_generation(self):
"""Verify recording paths are organized by date."""
svc = self._make_service()
await svc.start() # Creates storage dir
session = await svc.start_recording(call_id="call_abc123")
assert "call_abc123" in session.filepath_mixed
# Should include date-based directory
today = datetime.now().strftime("%Y-%m-%d")
assert today in session.filepath_mixed
# Clean up
await svc.stop_recording("call_abc123")
# ============================================================
# Call Analytics Tests
# ============================================================
class TestCallAnalytics:
"""Test analytics tracking."""
def _make_service(self):
from services.call_analytics import CallAnalytics
return CallAnalytics(max_history=1000)
def test_init(self):
svc = self._make_service()
assert svc._call_records == []
assert svc.total_calls_recorded == 0
def test_get_summary_empty(self):
svc = self._make_service()
summary = svc.get_summary(hours=24)
assert summary["total_calls"] == 0
assert summary["success_rate"] == 0.0
def test_get_company_stats_unknown(self):
svc = self._make_service()
stats = svc.get_company_stats("+18005551234")
assert stats["total_calls"] == 0
def test_get_top_numbers_empty(self):
svc = self._make_service()
top = svc.get_top_numbers(limit=5)
assert top == []
def test_get_hold_time_trend(self):
svc = self._make_service()
trend = svc.get_hold_time_trend(days=7)
assert len(trend) == 7
assert all(t["call_count"] == 0 for t in trend)
# ============================================================
# Call Flow Learner Tests
# ============================================================
class TestCallFlowLearner:
"""Test call flow learning from exploration data."""
def _make_learner(self):
from services.call_flow_learner import CallFlowLearner
return CallFlowLearner(llm_client=None)
@pytest.mark.asyncio
async def test_build_flow_from_discoveries(self):
"""Test building a call flow from exploration discoveries."""
learner = self._make_learner()
discoveries = [
{
"audio_type": "ivr_prompt",
"transcript": "Press 1 for billing, press 2 for sales",
"action_taken": {"dtmf": "1"},
},
{
"audio_type": "ivr_prompt",
"transcript": "Press 3 to speak to an agent",
"action_taken": {"dtmf": "3"},
},
{
"audio_type": "music",
"transcript": "",
"action_taken": None,
},
{
"audio_type": "live_human",
"transcript": "Hi, thanks for calling. How can I help?",
"action_taken": None,
},
]
flow = await learner.build_flow(
phone_number="+18005551234",
discovered_steps=discoveries,
intent="cancel my card",
company_name="Test Bank",
)
assert flow is not None
assert flow.phone_number == "+18005551234"
assert "Test Bank" in flow.name
assert len(flow.steps) == 4 # IVR, IVR, hold, human
@pytest.mark.asyncio
async def test_build_flow_no_discoveries(self):
"""Test that build_flow returns empty flow when no meaningful data."""
learner = self._make_learner()
flow = await learner.build_flow(
phone_number="+18005551234",
discovered_steps=[],
)
assert flow is not None
assert len(flow.steps) == 0
assert "empty" in [t.lower() for t in flow.tags]
@pytest.mark.asyncio
async def test_merge_discoveries(self):
"""Test merging new discoveries into existing flow."""
learner = self._make_learner()
# Build initial flow
initial_steps = [
{
"audio_type": "ivr_prompt",
"transcript": "Press 1 for billing",
"action_taken": {"dtmf": "1"},
},
{
"audio_type": "music",
"transcript": "",
"action_taken": None,
},
]
flow = await learner.build_flow(
phone_number="+18005551234",
discovered_steps=initial_steps,
intent="billing inquiry",
)
original_step_count = len(flow.steps)
assert original_step_count == 2
# Merge new discoveries
new_steps = [
{
"audio_type": "ivr_prompt",
"transcript": "Press 1 for billing",
"action_taken": {"dtmf": "1"},
},
{
"audio_type": "music",
"transcript": "",
"action_taken": None,
},
{
"audio_type": "live_human",
"transcript": "Hello, billing department",
"action_taken": None,
},
]
merged = await learner.merge_discoveries(
existing_flow=flow,
new_steps=new_steps,
intent="billing inquiry",
)
assert merged is not None
assert merged.times_used == 2 # Incremented
assert merged.last_used is not None
@pytest.mark.asyncio
async def test_discovery_to_step_types(self):
"""Test that different audio types produce correct step actions."""
from models.call_flow import ActionType
learner = self._make_learner()
# IVR prompt with DTMF
step = learner._discovery_to_step(
{"audio_type": "ivr_prompt", "transcript": "Press 1", "action_taken": {"dtmf": "1"}},
0, [],
)
assert step is not None
assert step.action == ActionType.DTMF
assert step.action_value == "1"
# Hold music
step = learner._discovery_to_step(
{"audio_type": "music", "transcript": "", "action_taken": None},
1, [],
)
assert step is not None
assert step.action == ActionType.HOLD
# Live human
step = learner._discovery_to_step(
{"audio_type": "live_human", "transcript": "Hello", "action_taken": None},
2, [],
)
assert step is not None
assert step.action == ActionType.TRANSFER
# ============================================================
# EventBus Integration Tests
# ============================================================
class TestEventBusIntegration:
"""Test EventBus with real async producers/consumers."""
@pytest.mark.asyncio
async def test_multiple_subscribers(self):
"""Multiple subscribers each get all events."""
bus = EventBus()
sub1 = bus.subscribe()
sub2 = bus.subscribe()
event = GatewayEvent(
type=EventType.CALL_INITIATED,
call_id="call_1",
data={},
)
await bus.publish(event)
e1 = await asyncio.wait_for(sub1.__anext__(), timeout=1.0)
e2 = await asyncio.wait_for(sub2.__anext__(), timeout=1.0)
assert e1.call_id == "call_1"
assert e2.call_id == "call_1"
assert bus.subscriber_count == 2
# Unsubscribe using .close() which passes the internal entry tuple
sub1.close()
sub2.close()
assert bus.subscriber_count == 0
@pytest.mark.asyncio
async def test_event_history_limit(self):
"""Event history respects max size."""
bus = EventBus(max_history=5)
for i in range(10):
await bus.publish(
GatewayEvent(
type=EventType.IVR_STEP,
call_id=f"call_{i}",
data={},
)
)
# recent_events is a property, not a method
history = bus.recent_events
assert len(history) == 5
# Should have the most recent 5
assert history[-1].call_id == "call_9"
assert history[0].call_id == "call_5"
@pytest.mark.asyncio
async def test_event_type_filtering(self):
"""Subscribers can filter by event type."""
bus = EventBus()
# Only subscribe to hold-related events
sub = bus.subscribe(event_types={EventType.HOLD_DETECTED, EventType.HUMAN_DETECTED})
# Publish multiple event types
await bus.publish(GatewayEvent(type=EventType.CALL_INITIATED, call_id="c1", data={}))
await bus.publish(GatewayEvent(type=EventType.HOLD_DETECTED, call_id="c1", data={}))
await bus.publish(GatewayEvent(type=EventType.IVR_STEP, call_id="c1", data={}))
await bus.publish(GatewayEvent(type=EventType.HUMAN_DETECTED, call_id="c1", data={}))
# Should only receive the 2 matching events
e1 = await asyncio.wait_for(sub.__anext__(), timeout=1.0)
e2 = await asyncio.wait_for(sub.__anext__(), timeout=1.0)
assert e1.type == EventType.HOLD_DETECTED
assert e2.type == EventType.HUMAN_DETECTED
sub.close()