Implement hybrid search pipeline combining vector, fulltext, and graph search across Neo4j, with cross-attention reranking via Synesis (Qwen3-VL-Reranker-2B) `/v1/rerank` endpoint. - Add SearchService with vector, fulltext, and graph search strategies - Add SynesisRerankerClient for multimodal reranking via HTTP API - Add search API endpoint (POST /search/) with filtering by library, collection, and library_type - Add SearchRequest/Response serializers and image search results - Add "nonfiction" to library_type choices - Consolidate reranker stack from two models to single Synesis service - Handle image analysis_status as "skipped" when analysis is unavailable - Add comprehensive tests for search pipeline and reranker client
329 lines
13 KiB
Python
329 lines
13 KiB
Python
"""
Tests for the embedding pipeline orchestrator.

Pipeline tests mock external dependencies (Neo4j, S3, LLM APIs).
"""
|
|
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
from django.test import TestCase
|
|
|
|
from library.services.pipeline import (
|
|
CHUNK_S3_KEY,
|
|
IMAGE_S3_KEY,
|
|
ORIGINAL_S3_KEY,
|
|
EmbeddingPipeline,
|
|
)
|
|
|
|
|
|
class S3KeyPatternTests(TestCase):
    """Verify that each S3 key template renders the expected object path."""

    def test_original_key_format(self):
        """ORIGINAL_S3_KEY is filled from ``item_uid`` and ``ext``."""
        self.assertEqual(
            ORIGINAL_S3_KEY.format(item_uid="abc123", ext="pdf"),
            "items/abc123/original.pdf",
        )

    def test_chunk_key_format(self):
        """CHUNK_S3_KEY is filled from ``item_uid`` and ``index``."""
        self.assertEqual(
            CHUNK_S3_KEY.format(item_uid="abc123", index=5),
            "chunks/abc123/chunk_5.txt",
        )

    def test_image_key_format(self):
        """IMAGE_S3_KEY is filled from ``item_uid``, ``index`` and ``ext``."""
        self.assertEqual(
            IMAGE_S3_KEY.format(item_uid="abc123", index=2, ext="png"),
            "images/abc123/2.png",
        )
|
|
|
|
|
|
class EmbeddingPipelineInitTests(TestCase):
    """Check how ``EmbeddingPipeline`` exposes its optional ``user`` argument."""

    def test_init_without_user(self):
        """Constructing with no arguments leaves ``user`` as ``None``."""
        self.assertIsNone(EmbeddingPipeline().user)

    def test_init_with_user(self):
        """A user passed by keyword is available on the ``user`` attribute."""
        requesting_user = MagicMock()

        built = EmbeddingPipeline(user=requesting_user)

        self.assertEqual(built.user, requesting_user)
|
|
|
|
|
|
class PipelineItemNotFoundTests(TestCase):
    """Cover the error raised when the requested item cannot be loaded."""

    @patch("library.services.pipeline.Item")
    def test_process_nonexistent_item_raises(self, mock_item_cls):
        """``process_item`` raises ``ValueError`` mentioning "Item not found"."""
        # Make the node lookup fail, as it would for an unknown uid.
        node_lookup = mock_item_cls.nodes.get
        node_lookup.side_effect = Exception("Not found")

        with self.assertRaises(ValueError) as raised:
            EmbeddingPipeline().process_item("nonexistent-uid")

        self.assertIn("Item not found", str(raised.exception))

    @patch("library.services.pipeline.Item")
    def test_reprocess_nonexistent_item_raises(self, mock_item_cls):
        """``reprocess_item`` raises ``ValueError`` when the lookup fails."""
        node_lookup = mock_item_cls.nodes.get
        node_lookup.side_effect = Exception("Not found")

        with self.assertRaises(ValueError):
            EmbeddingPipeline().reprocess_item("nonexistent-uid")
|
|
|
|
|
|
class PipelineNoEmbeddingModelTests(TestCase):
    """Cover the failure path when no system embedding model is available."""

    @patch("library.services.pipeline.LLMModel")
    @patch("library.services.pipeline.default_storage")
    @patch("library.services.pipeline.DocumentParser")
    def test_no_embedding_model_raises(self, mock_parser, mock_storage, mock_llm):
        """``process_item`` raises ``ValueError`` when the model lookup returns None."""
        mock_llm.get_system_embedding_model.return_value = None

        # Stand-in item: a plain-text document with no stored chunks or images.
        pending_item = MagicMock(
            uid="test-uid",
            title="Test",
            file_type="txt",
            s3_key="items/test-uid/original.txt",
            embedding_status="pending",
        )
        pending_item.configure_mock(
            **{
                "chunks.all.return_value": [],
                "images.all.return_value": [],
            }
        )

        # Storage is used as a context manager; its handle yields fixed bytes.
        s3_handle = MagicMock(read=MagicMock(return_value=b"test content"))
        opened = mock_storage.open.return_value
        opened.__enter__ = MagicMock(return_value=s3_handle)
        opened.__exit__ = MagicMock(return_value=False)

        with patch("library.services.pipeline.Item") as mock_item_cls:
            mock_item_cls.nodes.get.return_value = pending_item

            with self.assertRaises(ValueError) as raised:
                EmbeddingPipeline().process_item("test-uid")

        self.assertIn("No system embedding model", str(raised.exception))
|
|
|
|
|
|
class PipelineVisionPromptTests(TestCase):
    """Exercise ``EmbeddingPipeline._get_vision_prompt`` for each config outcome."""

    def test_returns_empty_for_no_library(self):
        """Passing ``None`` as the library yields an empty prompt."""
        self.assertEqual(EmbeddingPipeline()._get_vision_prompt(None), "")

    @patch("library.content_types.get_library_type_config")
    def test_returns_vision_prompt_from_config(self, mock_config):
        """The ``vision_prompt`` entry of the library-type config is returned."""
        mock_config.return_value = {
            "vision_prompt": "Analyze this technical diagram.",
        }
        technical_library = MagicMock(library_type="technical")

        prompt = EmbeddingPipeline()._get_vision_prompt(technical_library)

        self.assertEqual(prompt, "Analyze this technical diagram.")
        # The config must be looked up by the library's own type.
        mock_config.assert_called_once_with("technical")

    @patch("library.content_types.get_library_type_config")
    def test_returns_empty_when_no_vision_prompt_key(self, mock_config):
        """A config without a ``vision_prompt`` key yields an empty prompt."""
        mock_config.return_value = {"embedding_instruction": "something"}
        fiction_library = MagicMock(library_type="fiction")

        prompt = EmbeddingPipeline()._get_vision_prompt(fiction_library)

        self.assertEqual(prompt, "")

    @patch("library.content_types.get_library_type_config")
    def test_returns_empty_on_exception(self, mock_config):
        """A ``ValueError`` from the config lookup yields an empty prompt."""
        mock_config.side_effect = ValueError("Unknown type")
        unknown_library = MagicMock(library_type="bogus")

        prompt = EmbeddingPipeline()._get_vision_prompt(unknown_library)

        self.assertEqual(prompt, "")
|
|
|
|
|
|
class PipelineVisionStageTests(TestCase):
    """Tests for Stage 5.5 — vision analysis integration in _run_pipeline.

    Each test patches the pipeline's collaborators (parser, chunker,
    embedding client, storage, model lookup) and the pipeline's own
    storage helpers, then checks the ``images_analyzed`` entry of the
    result returned by ``_run_pipeline``.
    """

    def _make_mock_item(self):
        """Create a common mock Item for pipeline tests."""
        item = MagicMock()
        item.uid = "test-uid"
        item.title = "Test Doc"
        item.file_type = "pdf"
        item.s3_key = "items/test-uid/original.pdf"
        item.embedding_status = "pending"
        item.content_hash = ""
        item.chunks = MagicMock()
        item.chunks.all.return_value = []
        item.images = MagicMock()
        item.images.all.return_value = []
        return item

    def _make_embedding_model(self, name="test-embed"):
        """Create a mock system embedding model whose ``name`` is a real string.

        ``name`` is assigned after construction on purpose: passing
        ``name=...`` to ``MagicMock(...)`` only labels the mock for its
        repr and leaves ``model.name`` as an auto-created child mock.
        """
        model = MagicMock()
        model.name = name
        model.vector_dimensions = None
        model.supports_multimodal = False
        return model

    @patch("library.services.pipeline.ConceptExtractor")
    @patch("library.services.pipeline.EmbeddingClient")
    @patch("library.services.pipeline.ContentTypeChunker")
    @patch("library.services.pipeline.DocumentParser")
    @patch("library.services.pipeline.LLMModel")
    @patch("library.services.pipeline.default_storage")
    def test_no_vision_model_marks_images_skipped(
        self, mock_storage, mock_llm, mock_parser_cls,
        mock_chunker_cls, mock_embed_cls, mock_concept_cls,
    ):
        """When no vision model is configured, images get analysis_status='skipped'."""
        # Setup models: embedding model present, vision and chat models absent
        mock_llm.get_system_embedding_model.return_value = (
            self._make_embedding_model("test-embed")
        )
        mock_llm.get_system_vision_model.return_value = None
        mock_llm.get_system_chat_model.return_value = None

        # Setup parser — returns one image and no text blocks
        mock_parse_result = MagicMock()
        mock_parse_result.images = [
            MagicMock(
                source_index=0,
                ext="png",
                data=b"img",
                width=100,
                height=100,
                source_page=0,
            )
        ]
        mock_parse_result.text_blocks = []
        mock_parser = MagicMock()
        mock_parser.parse_bytes.return_value = mock_parse_result
        mock_parser_cls.return_value = mock_parser

        # Setup chunker — empty chunks
        mock_chunk_result = MagicMock()
        mock_chunk_result.chunks = []
        mock_chunk_result.chunk_page_map = {}
        mock_chunker = MagicMock()
        mock_chunker.chunk.return_value = mock_chunk_result
        mock_chunker_cls.return_value = mock_chunker

        # Setup S3 — storage handle usable as a context manager
        mock_file = MagicMock()
        mock_file.read.return_value = b"file data"
        mock_storage.open.return_value.__enter__ = MagicMock(return_value=mock_file)
        mock_storage.open.return_value.__exit__ = MagicMock(return_value=False)

        item = self._make_mock_item()
        pipeline = EmbeddingPipeline()

        # Mock _store_images to return a mock image node
        img_node = MagicMock()
        img_node.s3_key = "images/test-uid/0.png"
        with patch.object(pipeline, "_get_item_library", return_value=None), \
                patch.object(pipeline, "_read_item_from_s3", return_value=b"data"), \
                patch.object(pipeline, "_store_chunks", return_value=[]), \
                patch.object(pipeline, "_store_images", return_value=[img_node]), \
                patch.object(pipeline, "_associate_images_with_chunks"):

            result = pipeline._run_pipeline(item, None)

        # Image should be marked as skipped
        self.assertEqual(img_node.analysis_status, "skipped")
        img_node.save.assert_called()
        self.assertEqual(result["images_analyzed"], 0)

    @patch("library.services.pipeline.VisionAnalyzer")
    @patch("library.services.pipeline.ConceptExtractor")
    @patch("library.services.pipeline.EmbeddingClient")
    @patch("library.services.pipeline.ContentTypeChunker")
    @patch("library.services.pipeline.DocumentParser")
    @patch("library.services.pipeline.LLMModel")
    @patch("library.services.pipeline.default_storage")
    def test_vision_model_triggers_analysis(
        self, mock_storage, mock_llm, mock_parser_cls,
        mock_chunker_cls, mock_embed_cls, mock_concept_cls, mock_vision_cls,
    ):
        """When vision model is configured and images exist, analysis runs."""
        # Setup models: embedding and vision models present, chat model absent
        mock_vision_model = MagicMock()
        mock_llm.get_system_embedding_model.return_value = (
            self._make_embedding_model("test-embed")
        )
        mock_llm.get_system_vision_model.return_value = mock_vision_model
        mock_llm.get_system_chat_model.return_value = None

        # Setup parser
        mock_parse_result = MagicMock()
        mock_parse_result.images = []
        mock_parse_result.text_blocks = []
        mock_parser = MagicMock()
        mock_parser.parse_bytes.return_value = mock_parse_result
        mock_parser_cls.return_value = mock_parser

        # Setup chunker
        mock_chunk_result = MagicMock()
        mock_chunk_result.chunks = []
        mock_chunk_result.chunk_page_map = {}
        mock_chunker = MagicMock()
        mock_chunker.chunk.return_value = mock_chunk_result
        mock_chunker_cls.return_value = mock_chunker

        # Setup vision analyzer — reports three analyzed images
        mock_analyzer = MagicMock()
        mock_analyzer.analyze_images.return_value = 3
        mock_vision_cls.return_value = mock_analyzer

        item = self._make_mock_item()
        # The stored image nodes come from the patched _store_images below.
        img_nodes = [MagicMock(), MagicMock(), MagicMock()]
        pipeline = EmbeddingPipeline()

        with patch.object(pipeline, "_get_item_library", return_value=None), \
                patch.object(pipeline, "_read_item_from_s3", return_value=b"data"), \
                patch.object(pipeline, "_store_chunks", return_value=[]), \
                patch.object(pipeline, "_store_images", return_value=img_nodes), \
                patch.object(pipeline, "_associate_images_with_chunks"), \
                patch.object(pipeline, "_get_vision_prompt", return_value="Analyze"):

            result = pipeline._run_pipeline(item, None)

        self.assertEqual(result["images_analyzed"], 3)
        mock_vision_cls.assert_called_once_with(mock_vision_model, user=None)
        mock_analyzer.analyze_images.assert_called_once()

    @patch("library.services.pipeline.LLMModel")
    def test_no_images_skips_vision_entirely(self, mock_llm):
        """When there are no images, vision stage is a no-op regardless of model."""
        mock_vision_model = MagicMock()
        mock_llm.get_system_vision_model.return_value = mock_vision_model
        # Built via the helper so that ``.name`` is the string "embed";
        # ``MagicMock(name="embed")`` would only set the mock's repr label.
        mock_llm.get_system_embedding_model.return_value = (
            self._make_embedding_model("embed")
        )
        mock_llm.get_system_chat_model.return_value = None

        item = self._make_mock_item()
        pipeline = EmbeddingPipeline()

        mock_chunk_result = MagicMock()
        mock_chunk_result.chunks = []
        mock_chunk_result.chunk_page_map = {}

        with patch.object(pipeline, "_get_item_library", return_value=None), \
                patch.object(pipeline, "_read_item_from_s3", return_value=b"data"), \
                patch.object(pipeline, "_store_chunks", return_value=[]), \
                patch.object(pipeline, "_store_images", return_value=[]), \
                patch.object(pipeline, "_associate_images_with_chunks"), \
                patch("library.services.pipeline.DocumentParser") as mock_parser_cls, \
                patch("library.services.pipeline.ContentTypeChunker") as mock_chunker_cls, \
                patch("library.services.pipeline.EmbeddingClient"), \
                patch("library.services.pipeline.VisionAnalyzer") as mock_vision_cls:

            mock_parser = MagicMock()
            mock_parser.parse_bytes.return_value = MagicMock(images=[], text_blocks=[])
            mock_parser_cls.return_value = mock_parser
            mock_chunker = MagicMock()
            mock_chunker.chunk.return_value = mock_chunk_result
            mock_chunker_cls.return_value = mock_chunker

            result = pipeline._run_pipeline(item, None)

        # VisionAnalyzer should never be instantiated
        mock_vision_cls.assert_not_called()
        self.assertEqual(result["images_analyzed"], 0)
|