hold-slayer/tests/test_audio_classifier.py
Robert Helewka ecf37658ce feat: add initial Hold Slayer AI telephony gateway implementation
Complete project scaffolding and core implementation of an AI-powered
telephony system that calls companies, navigates IVR menus, waits on
hold, and transfers to the user when a human answers.

Key components:
- FastAPI server with REST API, WebSocket, and MCP (SSE) interfaces
- SIP/VoIP call management via PJSUA2 with RTP audio streaming
- LLM-powered IVR navigation using OpenAI/Anthropic with tool calling
- Hold detection service combining audio analysis and silence detection
- Real-time STT (Whisper/Deepgram) and TTS (OpenAI/Piper) pipelines
- Call recording with per-channel and mixed audio capture
- Event bus (asyncio pub/sub) for real-time client updates
- Web dashboard with live call monitoring
- SQLite persistence via SQLAlchemy with call history and analytics
- Notification support (email, SMS, webhook, desktop)
- Docker Compose deployment with Opal VoIP and Opal Media containers
- Comprehensive test suite with unit, integration, and E2E tests
- Simplified .gitignore and full project documentation in README
2026-03-21 19:23:26 +00:00

254 lines
9.2 KiB
Python

"""
Tests for the audio classifier.
Tests spectral analysis, DTMF detection, and classification logic.
"""
import numpy as np
import pytest
from config import ClassifierSettings
from models.call import AudioClassification
from services.audio_classifier import AudioClassifier, SAMPLE_RATE


@pytest.fixture
def classifier():
    """Create a classifier with default settings."""
    settings = ClassifierSettings()
    return AudioClassifier(settings)


def generate_silence(duration_seconds: float = 1.0) -> bytes:
    """Generate silent audio (near-zero amplitude)."""
    samples = int(SAMPLE_RATE * duration_seconds)
    data = np.zeros(samples, dtype=np.int16)
    return data.tobytes()


def generate_tone(frequency: float, duration_seconds: float = 1.0, amplitude: float = 0.5) -> bytes:
    """Generate a pure sine tone."""
    samples = int(SAMPLE_RATE * duration_seconds)
    t = np.linspace(0, duration_seconds, samples, endpoint=False)
    signal = (amplitude * 32767 * np.sin(2 * np.pi * frequency * t)).astype(np.int16)
    return signal.tobytes()


def generate_dtmf(digit: str, duration_seconds: float = 0.5) -> bytes:
    """Generate a DTMF tone for a digit."""
    dtmf_freqs = {
        "1": (697, 1209), "2": (697, 1336), "3": (697, 1477),
        "4": (770, 1209), "5": (770, 1336), "6": (770, 1477),
        "7": (852, 1209), "8": (852, 1336), "9": (852, 1477),
        "*": (941, 1209), "0": (941, 1336), "#": (941, 1477),
    }
    low_freq, high_freq = dtmf_freqs[digit]
    samples = int(SAMPLE_RATE * duration_seconds)
    t = np.linspace(0, duration_seconds, samples, endpoint=False)
    signal = 0.5 * (np.sin(2 * np.pi * low_freq * t) + np.sin(2 * np.pi * high_freq * t))
    signal = (signal * 16383).astype(np.int16)
    return signal.tobytes()
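

# Minimal reference sketch, not part of the classifier under test: it is an
# assumption that AudioClassifier detects DTMF with something like the Goertzel
# algorithm, which measures signal power at each of the eight DTMF frequencies
# and looks for exactly one strong low-group and one strong high-group tone.
def _reference_goertzel_power(samples: np.ndarray, freq: float, sample_rate: int = SAMPLE_RATE) -> float:
    """Power of `samples` at `freq` via the Goertzel recurrence (illustrative only)."""
    n = len(samples)
    k = int(0.5 + n * freq / sample_rate)
    omega = 2.0 * np.pi * k / n
    coeff = 2.0 * np.cos(omega)
    s_prev = s_prev2 = 0.0
    for x in samples:
        s = x + coeff * s_prev - s_prev2
        s_prev2, s_prev = s_prev, s
    return s_prev2 ** 2 + s_prev ** 2 - coeff * s_prev * s_prev2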


def generate_noise(duration_seconds: float = 1.0, amplitude: float = 0.3) -> bytes:
    """Generate white noise."""
    samples = int(SAMPLE_RATE * duration_seconds)
    noise = np.random.normal(0, amplitude * 32767, samples).astype(np.int16)
    return noise.tobytes()


def generate_speech_like(duration_seconds: float = 1.0) -> bytes:
    """
    Generate a rough approximation of speech.
    Mix of formant-like frequencies with amplitude modulation.
    """
    samples = int(SAMPLE_RATE * duration_seconds)
    t = np.linspace(0, duration_seconds, samples, endpoint=False)
    # Fundamental frequency (pitch) with vibrato
    f0 = 150 + 10 * np.sin(2 * np.pi * 5 * t)
    fundamental = np.sin(2 * np.pi * f0 * t)
    # Formants (vowel-like)
    f1 = np.sin(2 * np.pi * 730 * t) * 0.5
    f2 = np.sin(2 * np.pi * 1090 * t) * 0.3
    f3 = np.sin(2 * np.pi * 2440 * t) * 0.1
    # Amplitude modulation (syllable-like rhythm)
    envelope = 0.5 + 0.5 * np.sin(2 * np.pi * 3 * t)
    signal = envelope * (fundamental + f1 + f2 + f3)
    signal = (signal * 8000).astype(np.int16)
    return signal.tobytes()


class TestSilenceDetection:
    """Test silence classification."""

    def test_pure_silence(self, classifier):
        result = classifier.classify_chunk(generate_silence())
        assert result.audio_type == AudioClassification.SILENCE
        assert result.confidence > 0.5

    def test_very_quiet(self, classifier):
        # Near-silent audio
        quiet = generate_tone(440, amplitude=0.001)
        result = classifier.classify_chunk(quiet)
        assert result.audio_type == AudioClassification.SILENCE

    def test_empty_audio(self, classifier):
        result = classifier.classify_chunk(b"")
        assert result.audio_type == AudioClassification.SILENCE


class TestToneDetection:
    """Test tonal audio classification."""

    def test_440hz_ringback(self, classifier):
        """440 Hz is one of the two North American ring-back tone frequencies (440 + 480 Hz)."""
        tone = generate_tone(440, amplitude=0.3)
        result = classifier.classify_chunk(tone)
        # Should be detected as ringing (440 Hz falls in the ring-back range)
        assert result.audio_type in (
            AudioClassification.RINGING,
            AudioClassification.MUSIC,
        )
        assert result.confidence > 0.5

    def test_1000hz_tone(self, classifier):
        """A 1000 Hz tone is not ring-back; expect music or unknown."""
        tone = generate_tone(1000, amplitude=0.3)
        result = classifier.classify_chunk(tone)
        assert result.audio_type != AudioClassification.SILENCE


class TestDTMFDetection:
    """Test DTMF tone detection."""

    def test_dtmf_digit_5(self, classifier):
        dtmf = generate_dtmf("5", duration_seconds=0.5)
        result = classifier.classify_chunk(dtmf)
        # If the chunk is classified as DTMF, the decoded digit must match.
        if result.audio_type == AudioClassification.DTMF:
            assert result.details.get("dtmf_digit") == "5"

    def test_dtmf_digit_0(self, classifier):
        dtmf = generate_dtmf("0", duration_seconds=0.5)
        result = classifier.classify_chunk(dtmf)
        if result.audio_type == AudioClassification.DTMF:
            assert result.details.get("dtmf_digit") == "0"


class TestMusicDetection:
    """Test hold music detection."""

    def test_complex_tone_as_music(self, classifier):
        """Multiple frequencies together = more music-like."""
        samples = int(SAMPLE_RATE * 2)
        t = np.linspace(0, 2, samples, endpoint=False)
        # Chord: C major (C4 + E4 + G4)
        signal = (
            np.sin(2 * np.pi * 261.6 * t)
            + np.sin(2 * np.pi * 329.6 * t) * 0.8
            + np.sin(2 * np.pi * 392.0 * t) * 0.6
        )
        signal = (signal * 6000).astype(np.int16)
        result = classifier.classify_chunk(signal.tobytes())
        assert result.audio_type in (
            AudioClassification.MUSIC,
            AudioClassification.RINGING,
            AudioClassification.UNKNOWN,
        )
        assert result.confidence > 0.3


class TestSpeechDetection:
    """Test speech-like audio classification."""

    def test_speech_like_audio(self, classifier):
        speech = generate_speech_like(2.0)
        result = classifier.classify_chunk(speech)
        assert result.audio_type in (
            AudioClassification.IVR_PROMPT,
            AudioClassification.LIVE_HUMAN,
            AudioClassification.MUSIC,  # Speech-like can be ambiguous
            AudioClassification.UNKNOWN,
        )


class TestClassificationHistory:
    """Test history-based transition detection."""

    def test_hold_to_human_transition(self, classifier):
        """Detect the music → speech transition."""
        # Simulate being on hold
        for _ in range(10):
            classifier.update_history(AudioClassification.MUSIC)
        # Now speech appears
        classifier.update_history(AudioClassification.LIVE_HUMAN)
        classifier.update_history(AudioClassification.LIVE_HUMAN)
        classifier.update_history(AudioClassification.LIVE_HUMAN)
        assert classifier.detect_hold_to_human_transition()

    def test_no_transition_during_ivr(self, classifier):
        """IVR prompt after silence is not a hold→human transition."""
        for _ in range(5):
            classifier.update_history(AudioClassification.SILENCE)
        classifier.update_history(AudioClassification.IVR_PROMPT)
        classifier.update_history(AudioClassification.IVR_PROMPT)
        classifier.update_history(AudioClassification.IVR_PROMPT)
        # No music in history, so no hold→human transition
        assert not classifier.detect_hold_to_human_transition()

    def test_not_enough_history(self, classifier):
        """Not enough data to detect transition."""
        classifier.update_history(AudioClassification.MUSIC)
        classifier.update_history(AudioClassification.LIVE_HUMAN)
        assert not classifier.detect_hold_to_human_transition()
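

# Illustrative sketch only (an assumption, not the real implementation): a
# history-based detector like the one these tests exercise typically keeps a
# bounded window of recent labels and fires when a run of MUSIC is followed by
# several consecutive LIVE_HUMAN classifications.
def _reference_hold_to_human(history: list, min_music: int = 3, min_speech: int = 2) -> bool:
    """Return True if `history` ends in speech that was preceded by hold music."""
    if len(history) < min_music + min_speech:
        return False
    recent = history[-min_speech:]
    earlier = history[:-min_speech]
    speech_run = all(label == AudioClassification.LIVE_HUMAN for label in recent)
    had_music = sum(1 for label in earlier if label == AudioClassification.MUSIC) >= min_music
    return speech_run and had_music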


class TestFeatureExtraction:
    """Test individual feature extractors."""

    def test_rms_silence(self, classifier):
        samples = np.zeros(1000, dtype=np.float32)
        rms = classifier._compute_rms(samples)
        assert rms == 0.0

    def test_rms_loud(self, classifier):
        samples = np.ones(1000, dtype=np.float32) * 0.5
        rms = classifier._compute_rms(samples)
        assert rms == pytest.approx(0.5, abs=0.01)

    def test_zcr_silence(self, classifier):
        samples = np.zeros(1000, dtype=np.float32)
        zcr = classifier._compute_zero_crossing_rate(samples)
        assert zcr == 0.0

    def test_zcr_high_freq(self, classifier):
        """High frequency signal should have high ZCR."""
        t = np.linspace(0, 1, SAMPLE_RATE, endpoint=False)
        samples = np.sin(2 * np.pi * 4000 * t).astype(np.float32)
        zcr = classifier._compute_zero_crossing_rate(samples)
        assert zcr > 0.1

    def test_spectral_flatness_tone(self, classifier):
        """Pure tone should have low spectral flatness."""
        t = np.linspace(0, 1, SAMPLE_RATE, endpoint=False)
        samples = np.sin(2 * np.pi * 440 * t).astype(np.float32)
        flatness = classifier._compute_spectral_flatness(samples)
        assert flatness < 0.3

    def test_dominant_frequency(self, classifier):
        """Should find the dominant frequency of a pure tone."""
        t = np.linspace(0, 1, SAMPLE_RATE, endpoint=False)
        samples = np.sin(2 * np.pi * 1000 * t).astype(np.float32)
        freq = classifier._compute_dominant_frequency(samples)
        assert abs(freq - 1000) < 50  # Within 50 Hz
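

# Reference definitions only, as a reading aid for the expectations above; these
# are the standard formulas and an assumption about AudioClassifier's internals,
# not the implementation under test.
def _reference_rms(samples: np.ndarray) -> float:
    """Root-mean-square amplitude."""
    return float(np.sqrt(np.mean(np.square(samples)))) if len(samples) else 0.0


def _reference_zero_crossing_rate(samples: np.ndarray) -> float:
    """Fraction of adjacent sample pairs whose sign differs."""
    if len(samples) < 2:
        return 0.0
    return float(np.mean(np.signbit(samples[:-1]) != np.signbit(samples[1:])))


def _reference_spectral_flatness(samples: np.ndarray) -> float:
    """Geometric mean over arithmetic mean of the power spectrum (0 = tonal, 1 = noise-like)."""
    power = np.abs(np.fft.rfft(samples)) ** 2 + 1e-12
    return float(np.exp(np.mean(np.log(power))) / np.mean(power))


def _reference_dominant_frequency(samples: np.ndarray, sample_rate: int = SAMPLE_RATE) -> float:
    """Frequency of the largest-magnitude bin in the real FFT."""
    spectrum = np.abs(np.fft.rfft(samples))
    freqs = np.fft.rfftfreq(len(samples), d=1.0 / sample_rate)
    return float(freqs[np.argmax(spectrum)])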