# Complete project scaffolding and core implementation of an AI-powered
# telephony system that calls companies, navigates IVR menus, waits on hold,
# and transfers to the user when a human answers. Key components:
# - FastAPI server with REST API, WebSocket, and MCP (SSE) interfaces
# - SIP/VoIP call management via PJSUA2 with RTP audio streaming
# - LLM-powered IVR navigation using OpenAI/Anthropic with tool calling
# - Hold detection service combining audio analysis and silence detection
# - Real-time STT (Whisper/Deepgram) and TTS (OpenAI/Piper) pipelines
# - Call recording with per-channel and mixed audio capture
# - Event bus (asyncio pub/sub) for real-time client updates
# - Web dashboard with live call monitoring
# - SQLite persistence via SQLAlchemy with call history and analytics
# - Notification support (email, SMS, webhook, desktop)
# - Docker Compose deployment with Opal VoIP and Opal Media containers
# - Comprehensive test suite with unit, integration, and E2E tests
# - Simplified .gitignore and full project documentation in README
"""
|
|
Tests for the audio classifier.
|
|
|
|
Tests spectral analysis, DTMF detection, and classification logic.
|
|
"""
|
|
|
|
import numpy as np
import pytest

from config import ClassifierSettings
from models.call import AudioClassification
from services.audio_classifier import AudioClassifier, SAMPLE_RATE

|
@pytest.fixture
|
|
def classifier():
|
|
"""Create a classifier with default settings."""
|
|
settings = ClassifierSettings()
|
|
return AudioClassifier(settings)
|
|
|
|
|
|
def generate_silence(duration_seconds: float = 1.0) -> bytes:
|
|
"""Generate silent audio (near-zero amplitude)."""
|
|
samples = int(SAMPLE_RATE * duration_seconds)
|
|
data = np.zeros(samples, dtype=np.int16)
|
|
return data.tobytes()
|
|
|
|
|
|
def generate_tone(frequency: float, duration_seconds: float = 1.0, amplitude: float = 0.5) -> bytes:
|
|
"""Generate a pure sine tone."""
|
|
samples = int(SAMPLE_RATE * duration_seconds)
|
|
t = np.linspace(0, duration_seconds, samples, endpoint=False)
|
|
signal = (amplitude * 32767 * np.sin(2 * np.pi * frequency * t)).astype(np.int16)
|
|
return signal.tobytes()
|
|
|
|
|
|
def generate_dtmf(digit: str, duration_seconds: float = 0.5) -> bytes:
|
|
"""Generate a DTMF tone for a digit."""
|
|
dtmf_freqs = {
|
|
"1": (697, 1209), "2": (697, 1336), "3": (697, 1477),
|
|
"4": (770, 1209), "5": (770, 1336), "6": (770, 1477),
|
|
"7": (852, 1209), "8": (852, 1336), "9": (852, 1477),
|
|
"*": (941, 1209), "0": (941, 1336), "#": (941, 1477),
|
|
}
|
|
low_freq, high_freq = dtmf_freqs[digit]
|
|
samples = int(SAMPLE_RATE * duration_seconds)
|
|
t = np.linspace(0, duration_seconds, samples, endpoint=False)
|
|
signal = 0.5 * (np.sin(2 * np.pi * low_freq * t) + np.sin(2 * np.pi * high_freq * t))
|
|
signal = (signal * 16383).astype(np.int16)
|
|
return signal.tobytes()
|
|
|
|
|
|
def generate_noise(duration_seconds: float = 1.0, amplitude: float = 0.3) -> bytes:
|
|
"""Generate white noise."""
|
|
samples = int(SAMPLE_RATE * duration_seconds)
|
|
noise = np.random.normal(0, amplitude * 32767, samples).astype(np.int16)
|
|
return noise.tobytes()
|
|
|
|
|
|
def generate_speech_like(duration_seconds: float = 1.0) -> bytes:
|
|
"""
|
|
Generate a rough approximation of speech.
|
|
Mix of formant-like frequencies with amplitude modulation.
|
|
"""
|
|
samples = int(SAMPLE_RATE * duration_seconds)
|
|
t = np.linspace(0, duration_seconds, samples, endpoint=False)
|
|
|
|
# Fundamental frequency (pitch) with vibrato
|
|
f0 = 150 + 10 * np.sin(2 * np.pi * 5 * t)
|
|
fundamental = np.sin(2 * np.pi * f0 * t)
|
|
|
|
# Formants (vowel-like)
|
|
f1 = np.sin(2 * np.pi * 730 * t) * 0.5
|
|
f2 = np.sin(2 * np.pi * 1090 * t) * 0.3
|
|
f3 = np.sin(2 * np.pi * 2440 * t) * 0.1
|
|
|
|
# Amplitude modulation (syllable-like rhythm)
|
|
envelope = 0.5 + 0.5 * np.sin(2 * np.pi * 3 * t)
|
|
|
|
signal = envelope * (fundamental + f1 + f2 + f3)
|
|
signal = (signal * 8000).astype(np.int16)
|
|
return signal.tobytes()
|
|
|
|
|
|
class TestSilenceDetection:
|
|
"""Test silence classification."""
|
|
|
|
def test_pure_silence(self, classifier):
|
|
result = classifier.classify_chunk(generate_silence())
|
|
assert result.audio_type == AudioClassification.SILENCE
|
|
assert result.confidence > 0.5
|
|
|
|
def test_very_quiet(self, classifier):
|
|
# Near-silent audio
|
|
quiet = generate_tone(440, amplitude=0.001)
|
|
result = classifier.classify_chunk(quiet)
|
|
assert result.audio_type == AudioClassification.SILENCE
|
|
|
|
def test_empty_audio(self, classifier):
|
|
result = classifier.classify_chunk(b"")
|
|
assert result.audio_type == AudioClassification.SILENCE
|
|
|
|
|
|
class TestToneDetection:
|
|
"""Test tonal audio classification."""
|
|
|
|
def test_440hz_ringback(self, classifier):
|
|
"""440Hz is North American ring-back tone frequency."""
|
|
tone = generate_tone(440, amplitude=0.3)
|
|
result = classifier.classify_chunk(tone)
|
|
# Should be detected as ringing (440Hz is in the ring-back range)
|
|
assert result.audio_type in (
|
|
AudioClassification.RINGING,
|
|
AudioClassification.MUSIC,
|
|
)
|
|
assert result.confidence > 0.5
|
|
|
|
def test_1000hz_tone(self, classifier):
|
|
"""1000Hz tone — not ring-back, should be music or unknown."""
|
|
tone = generate_tone(1000, amplitude=0.3)
|
|
result = classifier.classify_chunk(tone)
|
|
assert result.audio_type != AudioClassification.SILENCE
|
|
|
|
|
|
class TestDTMFDetection:
|
|
"""Test DTMF tone detection."""
|
|
|
|
def test_dtmf_digit_5(self, classifier):
|
|
dtmf = generate_dtmf("5", duration_seconds=0.5)
|
|
result = classifier.classify_chunk(dtmf)
|
|
# DTMF detection should catch this
|
|
if result.audio_type == AudioClassification.DTMF:
|
|
assert result.details.get("dtmf_digit") == "5"
|
|
|
|
def test_dtmf_digit_0(self, classifier):
|
|
dtmf = generate_dtmf("0", duration_seconds=0.5)
|
|
result = classifier.classify_chunk(dtmf)
|
|
if result.audio_type == AudioClassification.DTMF:
|
|
assert result.details.get("dtmf_digit") == "0"
|
|
|
|
|
|
class TestMusicDetection:
|
|
"""Test hold music detection."""
|
|
|
|
def test_complex_tone_as_music(self, classifier):
|
|
"""Multiple frequencies together = more music-like."""
|
|
samples = int(SAMPLE_RATE * 2)
|
|
t = np.linspace(0, 2, samples, endpoint=False)
|
|
|
|
# Chord: C major (C4 + E4 + G4)
|
|
signal = (
|
|
np.sin(2 * np.pi * 261.6 * t)
|
|
+ np.sin(2 * np.pi * 329.6 * t) * 0.8
|
|
+ np.sin(2 * np.pi * 392.0 * t) * 0.6
|
|
)
|
|
signal = (signal * 6000).astype(np.int16)
|
|
|
|
result = classifier.classify_chunk(signal.tobytes())
|
|
assert result.audio_type in (
|
|
AudioClassification.MUSIC,
|
|
AudioClassification.RINGING,
|
|
AudioClassification.UNKNOWN,
|
|
)
|
|
assert result.confidence > 0.3
|
|
|
|
|
|
class TestSpeechDetection:
|
|
"""Test speech-like audio classification."""
|
|
|
|
def test_speech_like_audio(self, classifier):
|
|
speech = generate_speech_like(2.0)
|
|
result = classifier.classify_chunk(speech)
|
|
assert result.audio_type in (
|
|
AudioClassification.IVR_PROMPT,
|
|
AudioClassification.LIVE_HUMAN,
|
|
AudioClassification.MUSIC, # Speech-like can be ambiguous
|
|
AudioClassification.UNKNOWN,
|
|
)
|
|
|
|
|
|
class TestClassificationHistory:
|
|
"""Test history-based transition detection."""
|
|
|
|
def test_hold_to_human_transition(self, classifier):
|
|
"""Detect the music → speech transition."""
|
|
# Simulate being on hold
|
|
for _ in range(10):
|
|
classifier.update_history(AudioClassification.MUSIC)
|
|
|
|
# Now speech appears
|
|
classifier.update_history(AudioClassification.LIVE_HUMAN)
|
|
classifier.update_history(AudioClassification.LIVE_HUMAN)
|
|
classifier.update_history(AudioClassification.LIVE_HUMAN)
|
|
|
|
assert classifier.detect_hold_to_human_transition()
|
|
|
|
def test_no_transition_during_ivr(self, classifier):
|
|
"""IVR prompt after silence is not a hold→human transition."""
|
|
for _ in range(5):
|
|
classifier.update_history(AudioClassification.SILENCE)
|
|
|
|
classifier.update_history(AudioClassification.IVR_PROMPT)
|
|
classifier.update_history(AudioClassification.IVR_PROMPT)
|
|
classifier.update_history(AudioClassification.IVR_PROMPT)
|
|
|
|
# No music in history, so no hold→human transition
|
|
assert not classifier.detect_hold_to_human_transition()
|
|
|
|
def test_not_enough_history(self, classifier):
|
|
"""Not enough data to detect transition."""
|
|
classifier.update_history(AudioClassification.MUSIC)
|
|
classifier.update_history(AudioClassification.LIVE_HUMAN)
|
|
assert not classifier.detect_hold_to_human_transition()
|
|
|
|
|
|
class TestFeatureExtraction:
|
|
"""Test individual feature extractors."""
|
|
|
|
def test_rms_silence(self, classifier):
|
|
samples = np.zeros(1000, dtype=np.float32)
|
|
rms = classifier._compute_rms(samples)
|
|
assert rms == 0.0
|
|
|
|
def test_rms_loud(self, classifier):
|
|
samples = np.ones(1000, dtype=np.float32) * 0.5
|
|
rms = classifier._compute_rms(samples)
|
|
assert rms == pytest.approx(0.5, abs=0.01)
|
|
|
|
def test_zcr_silence(self, classifier):
|
|
samples = np.zeros(1000, dtype=np.float32)
|
|
zcr = classifier._compute_zero_crossing_rate(samples)
|
|
assert zcr == 0.0
|
|
|
|
def test_zcr_high_freq(self, classifier):
|
|
"""High frequency signal should have high ZCR."""
|
|
t = np.linspace(0, 1, SAMPLE_RATE, endpoint=False)
|
|
samples = np.sin(2 * np.pi * 4000 * t).astype(np.float32)
|
|
zcr = classifier._compute_zero_crossing_rate(samples)
|
|
assert zcr > 0.1
|
|
|
|
def test_spectral_flatness_tone(self, classifier):
|
|
"""Pure tone should have low spectral flatness."""
|
|
t = np.linspace(0, 1, SAMPLE_RATE, endpoint=False)
|
|
samples = np.sin(2 * np.pi * 440 * t).astype(np.float32)
|
|
flatness = classifier._compute_spectral_flatness(samples)
|
|
assert flatness < 0.3
|
|
|
|
def test_dominant_frequency(self, classifier):
|
|
"""Should find the dominant frequency of a pure tone."""
|
|
t = np.linspace(0, 1, SAMPLE_RATE, endpoint=False)
|
|
samples = np.sin(2 * np.pi * 1000 * t).astype(np.float32)
|
|
freq = classifier._compute_dominant_frequency(samples)
|
|
assert abs(freq - 1000) < 50 # Within 50Hz
|