Files
mnemosyne/mnemosyne/library/tests/test_pipeline.py
Robert Helewka 634845fee0 feat: add Phase 3 hybrid search with Synesis reranking
Implement hybrid search pipeline combining vector, fulltext, and graph
search across Neo4j, with cross-attention reranking via Synesis
(Qwen3-VL-Reranker-2B) `/v1/rerank` endpoint.

- Add SearchService with vector, fulltext, and graph search strategies
- Add SynesisRerankerClient for multimodal reranking via HTTP API
- Add search API endpoint (POST /search/) with filtering by library,
  collection, and library_type
- Add SearchRequest/Response serializers and image search results
- Add "nonfiction" to library_type choices
- Consolidate reranker stack from two models to single Synesis service
- Handle image analysis_status as "skipped" when analysis is unavailable
- Add comprehensive tests for search pipeline and reranker client
2026-03-29 18:09:50 +00:00

329 lines
13 KiB
Python

"""
Tests for the embedding pipeline orchestrator.
Pipeline tests mock external dependencies (Neo4j, S3, LLM APIs).
"""
from unittest.mock import MagicMock, patch
from django.test import TestCase
from library.services.pipeline import (
CHUNK_S3_KEY,
IMAGE_S3_KEY,
ORIGINAL_S3_KEY,
EmbeddingPipeline,
)
class S3KeyPatternTests(TestCase):
    """Verify that each S3 key template renders the expected object path."""

    def test_original_key_format(self):
        """ORIGINAL_S3_KEY is filled from an item uid and a file extension."""
        fields = {"item_uid": "abc123", "ext": "pdf"}
        rendered = ORIGINAL_S3_KEY.format(**fields)
        self.assertEqual(rendered, "items/abc123/original.pdf")

    def test_chunk_key_format(self):
        """CHUNK_S3_KEY is filled from an item uid and a chunk index."""
        fields = {"item_uid": "abc123", "index": 5}
        rendered = CHUNK_S3_KEY.format(**fields)
        self.assertEqual(rendered, "chunks/abc123/chunk_5.txt")

    def test_image_key_format(self):
        """IMAGE_S3_KEY is filled from an item uid, image index and extension."""
        fields = {"item_uid": "abc123", "index": 2, "ext": "png"}
        rendered = IMAGE_S3_KEY.format(**fields)
        self.assertEqual(rendered, "images/abc123/2.png")
class EmbeddingPipelineInitTests(TestCase):
    """Verify how EmbeddingPipeline stores the optional ``user`` argument."""

    def test_init_without_user(self):
        """Constructing with no arguments leaves ``user`` as None."""
        self.assertIsNone(EmbeddingPipeline().user)

    def test_init_with_user(self):
        """A user passed by keyword is exposed on the ``user`` attribute."""
        fake_user = MagicMock()
        self.assertEqual(EmbeddingPipeline(user=fake_user).user, fake_user)
class PipelineItemNotFoundTests(TestCase):
    """Verify that a failed item lookup surfaces as ValueError."""

    def test_process_nonexistent_item_raises(self):
        """process_item raises ValueError mentioning 'Item not found'."""
        with patch("library.services.pipeline.Item") as item_cls:
            # Any failure from the node lookup stands in for a missing item.
            item_cls.nodes.get.side_effect = Exception("Not found")
            with self.assertRaises(ValueError) as raised:
                EmbeddingPipeline().process_item("nonexistent-uid")
        self.assertIn("Item not found", str(raised.exception))

    def test_reprocess_nonexistent_item_raises(self):
        """reprocess_item raises ValueError when the lookup fails."""
        with patch("library.services.pipeline.Item") as item_cls:
            item_cls.nodes.get.side_effect = Exception("Not found")
            self.assertRaises(
                ValueError,
                EmbeddingPipeline().reprocess_item,
                "nonexistent-uid",
            )
class PipelineNoEmbeddingModelTests(TestCase):
    """Verify the pipeline's reaction to a missing system embedding model."""

    @patch("library.services.pipeline.LLMModel")
    @patch("library.services.pipeline.default_storage")
    @patch("library.services.pipeline.DocumentParser")
    def test_no_embedding_model_raises(self, mock_parser, mock_storage, mock_llm):
        """process_item raises ValueError when no embedding model is configured."""
        # Stand-in item with no existing chunks or images.
        fake_item = MagicMock(
            uid="test-uid",
            title="Test",
            file_type="txt",
            s3_key="items/test-uid/original.txt",
            embedding_status="pending",
        )
        fake_item.chunks.all.return_value = []
        fake_item.images.all.return_value = []

        # The condition under test: the model lookup yields nothing.
        mock_llm.get_system_embedding_model.return_value = None

        # Make ``default_storage.open(...)`` usable as a context manager
        # whose file object yields fixed bytes.
        opened = mock_storage.open.return_value
        opened.__enter__.return_value.read.return_value = b"test content"
        opened.__exit__.return_value = False

        with patch("library.services.pipeline.Item") as item_cls:
            item_cls.nodes.get.return_value = fake_item
            with self.assertRaises(ValueError) as raised:
                EmbeddingPipeline().process_item("test-uid")

        self.assertIn("No system embedding model", str(raised.exception))
class PipelineVisionPromptTests(TestCase):
    """Exercise EmbeddingPipeline._get_vision_prompt in isolation."""

    @staticmethod
    def _library_of_type(library_type):
        """Build a stand-in library object carrying the given ``library_type``."""
        library = MagicMock()
        library.library_type = library_type
        return library

    def test_returns_empty_for_no_library(self):
        """Passing None as the library yields an empty prompt."""
        self.assertEqual(EmbeddingPipeline()._get_vision_prompt(None), "")

    @patch("library.content_types.get_library_type_config")
    def test_returns_vision_prompt_from_config(self, mock_config):
        """The ``vision_prompt`` entry of the type config is returned."""
        mock_config.return_value = {
            "vision_prompt": "Analyze this technical diagram.",
        }
        library = self._library_of_type("technical")

        prompt = EmbeddingPipeline()._get_vision_prompt(library)

        self.assertEqual(prompt, "Analyze this technical diagram.")
        # The config must be looked up by the library's own type.
        mock_config.assert_called_once_with("technical")

    @patch("library.content_types.get_library_type_config")
    def test_returns_empty_when_no_vision_prompt_key(self, mock_config):
        """A config without a ``vision_prompt`` key yields an empty prompt."""
        mock_config.return_value = {"embedding_instruction": "something"}
        library = self._library_of_type("fiction")

        prompt = EmbeddingPipeline()._get_vision_prompt(library)

        self.assertEqual(prompt, "")

    @patch("library.content_types.get_library_type_config")
    def test_returns_empty_on_exception(self, mock_config):
        """A ValueError from the config lookup yields an empty prompt."""
        mock_config.side_effect = ValueError("Unknown type")
        library = self._library_of_type("bogus")

        prompt = EmbeddingPipeline()._get_vision_prompt(library)

        self.assertEqual(prompt, "")
class PipelineVisionStageTests(TestCase):
    """Tests for Stage 5.5 — vision analysis integration in _run_pipeline.

    Every test drives ``EmbeddingPipeline._run_pipeline`` with its
    collaborators (parser, chunker, embedding client, storage, model
    lookups) replaced by mocks, and the pipeline's own storage helpers
    patched on the instance, so only the vision stage's decisions are
    observed.
    """

    def _make_mock_item(self):
        """Create a common mock Item for pipeline tests."""
        item = MagicMock()
        item.uid = "test-uid"
        item.title = "Test Doc"
        item.file_type = "pdf"
        item.s3_key = "items/test-uid/original.pdf"
        item.embedding_status = "pending"
        item.content_hash = ""
        item.chunks = MagicMock()
        item.chunks.all.return_value = []
        item.images = MagicMock()
        item.images.all.return_value = []
        return item

    @staticmethod
    def _make_mock_embedding_model(name):
        """Create a mock embedding model whose ``name`` attribute is *name*.

        ``name`` is assigned after construction on purpose: passing
        ``name=`` to the ``MagicMock`` constructor only labels the mock
        for its repr and does not create a ``.name`` attribute.
        """
        model = MagicMock()
        model.name = name
        model.vector_dimensions = None
        model.supports_multimodal = False
        return model

    @patch("library.services.pipeline.ConceptExtractor")
    @patch("library.services.pipeline.EmbeddingClient")
    @patch("library.services.pipeline.ContentTypeChunker")
    @patch("library.services.pipeline.DocumentParser")
    @patch("library.services.pipeline.LLMModel")
    @patch("library.services.pipeline.default_storage")
    def test_no_vision_model_marks_images_skipped(
        self, mock_storage, mock_llm, mock_parser_cls,
        mock_chunker_cls, mock_embed_cls, mock_concept_cls,
    ):
        """When no vision model is configured, images get analysis_status='skipped'."""
        # Setup embedding model; vision and chat models are absent.
        mock_llm.get_system_embedding_model.return_value = (
            self._make_mock_embedding_model("test-embed")
        )
        mock_llm.get_system_vision_model.return_value = None
        mock_llm.get_system_chat_model.return_value = None

        # Setup parser — returns one image and no text blocks.
        mock_parse_result = MagicMock()
        mock_parse_result.images = [
            MagicMock(
                source_index=0,
                ext="png",
                data=b"img",
                width=100,
                height=100,
                source_page=0,
            )
        ]
        mock_parse_result.text_blocks = []
        mock_parser = MagicMock()
        mock_parser.parse_bytes.return_value = mock_parse_result
        mock_parser_cls.return_value = mock_parser

        # Setup chunker — empty chunks.
        mock_chunk_result = MagicMock()
        mock_chunk_result.chunks = []
        mock_chunk_result.chunk_page_map = {}
        mock_chunker = MagicMock()
        mock_chunker.chunk.return_value = mock_chunk_result
        mock_chunker_cls.return_value = mock_chunker

        # Setup S3 — ``default_storage.open`` works as a context manager.
        mock_file = MagicMock()
        mock_file.read.return_value = b"file data"
        mock_storage.open.return_value.__enter__ = MagicMock(return_value=mock_file)
        mock_storage.open.return_value.__exit__ = MagicMock(return_value=False)

        item = self._make_mock_item()
        pipeline = EmbeddingPipeline()

        # Mock _store_images to return a mock image node.
        img_node = MagicMock()
        img_node.s3_key = "images/test-uid/0.png"

        with patch.object(pipeline, "_get_item_library", return_value=None), \
                patch.object(pipeline, "_read_item_from_s3", return_value=b"data"), \
                patch.object(pipeline, "_store_chunks", return_value=[]), \
                patch.object(pipeline, "_store_images", return_value=[img_node]), \
                patch.object(pipeline, "_associate_images_with_chunks"):
            result = pipeline._run_pipeline(item, None)

        # Image should be marked as skipped and persisted.
        self.assertEqual(img_node.analysis_status, "skipped")
        img_node.save.assert_called()
        self.assertEqual(result["images_analyzed"], 0)

    @patch("library.services.pipeline.VisionAnalyzer")
    @patch("library.services.pipeline.ConceptExtractor")
    @patch("library.services.pipeline.EmbeddingClient")
    @patch("library.services.pipeline.ContentTypeChunker")
    @patch("library.services.pipeline.DocumentParser")
    @patch("library.services.pipeline.LLMModel")
    @patch("library.services.pipeline.default_storage")
    def test_vision_model_triggers_analysis(
        self, mock_storage, mock_llm, mock_parser_cls,
        mock_chunker_cls, mock_embed_cls, mock_concept_cls, mock_vision_cls,
    ):
        """When vision model is configured and images exist, analysis runs."""
        # Setup models — embedding and vision present, chat absent.
        mock_vision_model = MagicMock()
        mock_llm.get_system_embedding_model.return_value = (
            self._make_mock_embedding_model("test-embed")
        )
        mock_llm.get_system_vision_model.return_value = mock_vision_model
        mock_llm.get_system_chat_model.return_value = None

        # Setup parser — the stored image nodes come from the patched
        # _store_images below, so the parse result itself carries none.
        mock_parse_result = MagicMock()
        mock_parse_result.images = []
        mock_parse_result.text_blocks = []
        mock_parser = MagicMock()
        mock_parser.parse_bytes.return_value = mock_parse_result
        mock_parser_cls.return_value = mock_parser

        # Setup chunker — empty chunks.
        mock_chunk_result = MagicMock()
        mock_chunk_result.chunks = []
        mock_chunk_result.chunk_page_map = {}
        mock_chunker = MagicMock()
        mock_chunker.chunk.return_value = mock_chunk_result
        mock_chunker_cls.return_value = mock_chunker

        # Setup vision analyzer — reports three analyzed images.
        mock_analyzer = MagicMock()
        mock_analyzer.analyze_images.return_value = 3
        mock_vision_cls.return_value = mock_analyzer

        item = self._make_mock_item()
        img_nodes = [MagicMock(), MagicMock(), MagicMock()]
        pipeline = EmbeddingPipeline()

        with patch.object(pipeline, "_get_item_library", return_value=None), \
                patch.object(pipeline, "_read_item_from_s3", return_value=b"data"), \
                patch.object(pipeline, "_store_chunks", return_value=[]), \
                patch.object(pipeline, "_store_images", return_value=img_nodes), \
                patch.object(pipeline, "_associate_images_with_chunks"), \
                patch.object(pipeline, "_get_vision_prompt", return_value="Analyze"):
            result = pipeline._run_pipeline(item, None)

        self.assertEqual(result["images_analyzed"], 3)
        mock_vision_cls.assert_called_once_with(mock_vision_model, user=None)
        mock_analyzer.analyze_images.assert_called_once()

    @patch("library.services.pipeline.LLMModel")
    def test_no_images_skips_vision_entirely(self, mock_llm):
        """When there are no images, vision stage is a no-op regardless of model."""
        mock_vision_model = MagicMock()
        mock_llm.get_system_vision_model.return_value = mock_vision_model
        # Built via the helper so that ``.name`` is the string "embed";
        # MagicMock(name="embed") would leave ``.name`` as a child mock.
        mock_llm.get_system_embedding_model.return_value = (
            self._make_mock_embedding_model("embed")
        )
        mock_llm.get_system_chat_model.return_value = None

        item = self._make_mock_item()
        pipeline = EmbeddingPipeline()

        mock_chunk_result = MagicMock()
        mock_chunk_result.chunks = []
        mock_chunk_result.chunk_page_map = {}

        with patch.object(pipeline, "_get_item_library", return_value=None), \
                patch.object(pipeline, "_read_item_from_s3", return_value=b"data"), \
                patch.object(pipeline, "_store_chunks", return_value=[]), \
                patch.object(pipeline, "_store_images", return_value=[]), \
                patch.object(pipeline, "_associate_images_with_chunks"), \
                patch("library.services.pipeline.DocumentParser") as mock_parser_cls, \
                patch("library.services.pipeline.ContentTypeChunker") as mock_chunker_cls, \
                patch("library.services.pipeline.EmbeddingClient"), \
                patch("library.services.pipeline.VisionAnalyzer") as mock_vision_cls:
            mock_parser = MagicMock()
            mock_parser.parse_bytes.return_value = MagicMock(images=[], text_blocks=[])
            mock_parser_cls.return_value = mock_parser

            mock_chunker = MagicMock()
            mock_chunker.chunk.return_value = mock_chunk_result
            mock_chunker_cls.return_value = mock_chunker

            result = pipeline._run_pipeline(item, None)

        # VisionAnalyzer should never be instantiated.
        mock_vision_cls.assert_not_called()
        self.assertEqual(result["images_analyzed"], 0)