diff --git a/docs/PHASE_2B_VISION_PIPELINE.md b/docs/PHASE_2B_VISION_PIPELINE.md new file mode 100644 index 0000000..ad05218 --- /dev/null +++ b/docs/PHASE_2B_VISION_PIPELINE.md @@ -0,0 +1,244 @@ +# Phase 2B: Vision Analysis Pipeline + +## Objective + +Add vision-based image understanding to the embedding pipeline: when documents are processed and images extracted, use a vision-capable LLM to classify, describe, extract text from, and identify concepts within each image — connecting images into the knowledge graph as first-class participants alongside text. + +## Heritage + +Extends Phase 2's image extraction (PyMuPDF) and multimodal embedding (Qwen3-VL) with structured understanding. Previously, images were stored and optionally embedded into vector space, but had no description, no classification beyond a hardcoded default, and no concept graph integration. Phase 2B makes images *understood*. + +## Architecture Overview + +``` +Image Extracted (Phase 2) + → Stored in S3 + Image node in Neo4j + → Vision Analysis (NEW - Phase 2B) + → Classify image type (diagram, photo, chart, etc.) + → Generate natural language description + → Extract visible text (OCR) + → Identify concepts depicted + → Create DEPICTS relationships to Concept nodes + → Connect Item to image-derived concepts via REFERENCES + → Multimodal Embedding (Phase 2, now enhanced) +``` + +## How It Fits the Graph + +Vision analysis enriches images so they participate in the knowledge graph the same way text chunks do: + +``` +Item ─[HAS_IMAGE]──→ Image + │ + ├── description: "Wiring diagram showing 3-phase motor connection" + ├── image_type: "diagram" (auto-classified) + ├── ocr_text: "L1 L2 L3 ..." 
+ ├── vision_model_name: "Qwen3-VL-72B" + │ + ├──[DEPICTS]──→ Concept("3-phase motor") + ├──[DEPICTS]──→ Concept("wiring diagram") + └──[DEPICTS]──→ Concept("electrical connection") + │ + └──[RELATED_TO]──→ Concept("motor control") + ↑ + Chunk text also ──[MENTIONS]──┘ +``` + +Three relationship types connect content to concepts: +- `Chunk ─[MENTIONS]─→ Concept` — text discusses this concept +- `Item ─[REFERENCES]─→ Concept` — item is about this concept +- `Image ─[DEPICTS]─→ Concept` — image visually shows this concept + +Concepts extracted from images merge with the **same Concept nodes** extracted from text via deduplication by name. This means graph traversal discovers cross-modal connections automatically. + +## Deliverables + +### 1. System Vision Model (`llm_manager/models.py`) + +New `is_system_vision_model` boolean field on `LLMModel`, following the same pattern as the existing system embedding, chat, and reranker models. + +```python +is_system_vision_model = models.BooleanField( + default=False, + help_text="Mark as the system-wide vision model for image analysis." +) + +@classmethod +def get_system_vision_model(cls): + return cls.objects.filter( + is_system_vision_model=True, + is_active=True, + model_type__in=["vision", "chat"], # Vision-capable chat models work + ).first() +``` + +Added `"vision_analysis"` to `LLMUsage.purpose` choices for cost tracking. + +### 2. Image Model Enhancements (`library/models.py`) + +New fields on the `Image` node: + +| Field | Type | Purpose | +|-------|------|---------| +| `ocr_text` | StringProperty | Visible text extracted by vision model | +| `vision_model_name` | StringProperty | Which model analyzed this image | +| `analysis_status` | StringProperty | pending / completed / failed / skipped | + +Expanded `image_type` choices: cover, diagram, chart, table, screenshot, illustration, map, portrait, artwork, still, photo. + +New relationship: `Image ─[DEPICTS]─→ Concept` + +### 3. 
Content-Type Vision Prompts (`library/content_types.py`) + +Each library type now includes a `vision_prompt` that shapes what the vision model looks for: + +| Library Type | Vision Focus | +|---|---| +| **Fiction** | Illustrations, cover art, characters, scenes, artistic style | +| **Non-Fiction** | Photographs, maps, charts, people, places, historical context | +| **Technical** | Diagrams, schematics, charts, tables, labels, processes | +| **Music** | Album covers, band photos, liner notes, era/aesthetic | +| **Film** | Stills, posters, storyboards, cinematographic elements | +| **Art** | Medium, style, subject, composition, artistic period | +| **Journal** | Photos, sketches, documents, dates, context clues | + +### 4. Vision Analysis Service (`library/services/vision.py`) + +New service: `VisionAnalyzer` — analyzes images via the system vision model. + +#### API Call Format + +Uses OpenAI-compatible multimodal chat format: + +```python +{ + "model": "qwen3-vl-72b", + "messages": [ + {"role": "system", "content": ""}, + {"role": "user", "content": [ + {"type": "text", "text": ""}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}} + ]} + ], + "temperature": 0.1, + "max_tokens": 800 +} +``` + +#### Response Structure + +The vision model returns structured JSON: + +```json +{ + "image_type": "diagram", + "description": "A wiring diagram showing a 3-phase motor connection with L1, L2, L3 inputs", + "ocr_text": "L1 L2 L3 GND PE 400V", + "concepts": [ + {"name": "3-phase motor", "type": "topic"}, + {"name": "wiring diagram", "type": "technique"} + ] +} +``` + +#### Processing Flow + +For each image: +1. Read image bytes from S3 +2. Base64-encode and send to vision model with content-type-aware prompt +3. Parse structured JSON response +4. Validate and normalize (image_type must be valid, concepts capped at 20) +5. Update Image node (description, ocr_text, image_type, vision_model_name, analysis_status) +6. 
Create/connect Concept nodes via DEPICTS relationship +7. Also connect Item → Concept via REFERENCES (weight 0.8) +8. Log usage to LLMUsage + +### 5. Pipeline Integration (`library/services/pipeline.py`) + +Vision analysis is Stage 5.5 in the pipeline: + +``` +Stage 5: Store images in S3 + Neo4j (existing) +Stage 5.5: Vision analysis (NEW) +Stage 6: Embed images multimodally (existing) +Stage 7: Concept extraction from text (existing) +``` + +Behavior: +- If system vision model configured → analyze all images +- If no vision model → mark images as `analysis_status="skipped"`, continue pipeline +- Vision analysis failures are per-image (don't fail the whole pipeline) + +### 6. Non-Fiction Library Type + +New `"nonfiction"` library type added alongside the existing six types: + +| Setting | Value | +|---------|-------| +| Strategy | `section_aware` | +| Chunk Size | 768 | +| Chunk Overlap | 96 | +| Boundaries | chapter, section, paragraph | +| Focus | Factual claims, historical events, people, places, arguments, evidence | + +### 7. Prometheus Metrics (`library/metrics.py`) + +| Metric | Type | Labels | Purpose | +|--------|------|--------|---------| +| `mnemosyne_vision_analyses_total` | Counter | status | Images analyzed | +| `mnemosyne_vision_analysis_duration_seconds` | Histogram | — | Per-image analysis latency | +| `mnemosyne_vision_concepts_extracted_total` | Counter | concept_type | Concepts from images | + +### 8. Dashboard & UI Updates + +- Embedding dashboard shows system vision model status alongside embedding, chat, and reranker models +- Item detail page shows enriched image cards with: + - Auto-classified image type badge + - Vision-generated description + - Collapsible OCR text section + - Analysis status indicator + - Vision model name reference + +## File Structure + +``` +mnemosyne/library/ +├── services/ +│ ├── vision.py # NEW — VisionAnalyzer service +│ ├── pipeline.py # Modified — Stage 5.5 integration +│ └── ... 
+├── models.py # Modified — Image fields, DEPICTS rel, nonfiction type +├── content_types.py # Modified — vision_prompt for all 7 types +├── metrics.py # Modified — vision analysis metrics +├── views.py # Modified — vision model in dashboard context +└── templates/library/ + ├── item_detail.html # Modified — enriched image display + └── embedding_dashboard.html # Modified — vision model row + +mnemosyne/llm_manager/ +├── models.py # Modified — is_system_vision_model, get_system_vision_model() +└── migrations/ + └── 0003_add_vision_model_and_usage.py # NEW +``` + +## Performance Considerations + +- Each image = one vision model inference (~2-5 seconds on local GPU) +- A document with 20 images = ~40-100 seconds of extra processing +- Runs in Celery async tasks — does not block web requests +- Uses the same GPU infrastructure already serving embedding and reranking +- Zero API cost when running locally +- Per-image failure isolation — one bad image doesn't fail the pipeline + +## Success Criteria + +- [ ] System vision model configurable via Django admin (same pattern as other system models) +- [ ] Images auto-classified with correct image_type (not hardcoded "diagram") +- [ ] Vision-generated descriptions visible on item detail page +- [ ] OCR text extracted from images with visible text +- [ ] Concepts extracted from images connected to Concept nodes via DEPICTS +- [ ] Shared concepts bridge text chunks and images in the graph +- [ ] Pipeline gracefully skips vision analysis when no vision model configured +- [ ] Non-fiction library type available for history, biography, essays, etc. 
+- [ ] Prometheus metrics track vision analysis throughput and latency +- [ ] Dashboard shows vision model status diff --git a/mnemosyne/library/content_types.py b/mnemosyne/library/content_types.py index 3c58c7e..6be7e74 100644 --- a/mnemosyne/library/content_types.py +++ b/mnemosyne/library/content_types.py @@ -2,7 +2,7 @@ Content-type system configuration for Mnemosyne library types. Each library type has a default configuration that governs chunking, -embedding, re-ranking, and LLM context injection. +embedding, re-ranking, LLM context injection, and vision analysis prompts. """ # Default configurations per library type. @@ -30,6 +30,43 @@ LIBRARY_TYPE_DEFAULTS = { "characters, settings, and events are fictional. Cite specific passages " "when answering." ), + "vision_prompt": ( + "Analyze this image from a work of fiction. Identify:\n" + "1) Image type (illustration, cover, map, portrait, scene).\n" + "2) What it depicts — characters, scenes, settings.\n" + "3) Any visible text, titles, or captions.\n" + "4) The artistic style and mood." + ), + }, + "nonfiction": { + "chunking_config": { + "strategy": "section_aware", + "chunk_size": 768, + "chunk_overlap": 96, + "respect_boundaries": ["chapter", "section", "paragraph"], + }, + "embedding_instruction": ( + "Represent this passage from a non-fiction work for retrieval. " + "Focus on factual claims, historical events, people, places, " + "arguments, and supporting evidence." + ), + "reranker_instruction": ( + "Re-rank passages from non-fiction based on factual relevance. " + "Prioritize historical events, arguments, evidence, and key figures." + ), + "llm_context_prompt": ( + "The following excerpts are from non-fiction (history, biography, essays, " + "science writing, philosophy, etc.). Treat claims as the author's perspective " + "— note when interpretations may vary. Cite specific passages and distinguish " + "fact from analysis." + ), + "vision_prompt": ( + "Analyze this image from a non-fiction work. 
Identify:\n" + "1) Image type (photograph, map, chart, illustration, portrait, document).\n" + "2) What it depicts — people, places, events, data.\n" + "3) Any visible text, labels, captions, or dates.\n" + "4) Historical or factual context." + ), }, "technical": { "chunking_config": { @@ -52,6 +89,13 @@ LIBRARY_TYPE_DEFAULTS = { "reference material). Provide precise, actionable answers. Include code " "examples and exact configurations when available. Cite source sections." ), + "vision_prompt": ( + "Analyze this image from technical documentation. Identify:\n" + "1) Image type (diagram, chart, table, screenshot, schematic, flowchart).\n" + "2) What it depicts — components, processes, architecture, data.\n" + "3) Any visible text, labels, values, or annotations.\n" + "4) The technical concept or procedure being illustrated." + ), }, "music": { "chunking_config": { @@ -73,6 +117,13 @@ LIBRARY_TYPE_DEFAULTS = { "Consider the artistic and cultural context. Reference specific " "songs, albums, and artists when answering." ), + "vision_prompt": ( + "Analyze this image from a music collection. Identify:\n" + "1) Image type (album cover, liner notes, band photo, concert photo, logo).\n" + "2) What it depicts — artwork style, imagery, people.\n" + "3) Any visible text, band names, album titles, or track listings.\n" + "4) The era, aesthetic, and genre associations." + ), }, "film": { "chunking_config": { @@ -94,6 +145,13 @@ LIBRARY_TYPE_DEFAULTS = { "reviews). Consider the cinematic context — visual storytelling, " "direction, and performance. Cite specific scenes and films." ), + "vision_prompt": ( + "Analyze this image from a film or film-related content. Identify:\n" + "1) Image type (movie still, poster, storyboard, behind-the-scenes, screenshot).\n" + "2) What it depicts — scene, characters, setting, action.\n" + "3) Any visible text, titles, or credits.\n" + "4) Cinematographic elements — composition, lighting, mood." 
+ ), }, "art": { "chunking_config": { @@ -115,6 +173,13 @@ LIBRARY_TYPE_DEFAULTS = { "Consider visual elements, artistic technique, historical context, " "and the artist's intent. Reference specific works and movements." ), + "vision_prompt": ( + "Describe this artwork in detail. Identify:\n" + "1) Image type (painting, sculpture, photograph, print, drawing, mixed media).\n" + "2) Subject matter — what the artwork depicts.\n" + "3) Style, medium, and technique.\n" + "4) Composition, color palette, mood, and artistic period or movement." + ), }, "journal": { "chunking_config": { @@ -137,6 +202,13 @@ LIBRARY_TYPE_DEFAULTS = { "This is private, reflective content. Respect the personal nature — " "answer with sensitivity. Note dates and temporal context when relevant." ), + "vision_prompt": ( + "Analyze this image from a personal journal. Identify:\n" + "1) Image type (photograph, sketch, document, receipt, ticket, screenshot).\n" + "2) What it depicts — people, places, events, objects.\n" + "3) Any visible text, dates, or handwriting.\n" + "4) Context clues about when and where this was taken or created." + ), }, } @@ -146,11 +218,12 @@ def get_library_type_config(library_type): Get the default configuration for a library type. 
Args: - library_type: One of 'fiction', 'technical', 'music', 'film', 'art', 'journal' + library_type: One of 'fiction', 'nonfiction', 'technical', 'music', + 'film', 'art', 'journal' Returns: dict with keys: chunking_config, embedding_instruction, - reranker_instruction, llm_context_prompt + reranker_instruction, llm_context_prompt, vision_prompt Raises: ValueError: If library_type is not recognized diff --git a/mnemosyne/library/metrics.py b/mnemosyne/library/metrics.py index fe14c9d..eb5e94c 100644 --- a/mnemosyne/library/metrics.py +++ b/mnemosyne/library/metrics.py @@ -88,6 +88,24 @@ CONCEPTS_EXTRACTED_TOTAL = Counter( ["concept_type"], ) +# --- Vision Analysis (Phase 2B) --- + +VISION_ANALYSES_TOTAL = Counter( + "mnemosyne_vision_analyses_total", + "Total images analyzed by vision model", + ["status"], +) +VISION_ANALYSIS_DURATION = Histogram( + "mnemosyne_vision_analysis_duration_seconds", + "Time to analyze a single image with the vision model", + buckets=[0.5, 1, 2, 5, 10, 20, 30, 60], +) +VISION_CONCEPTS_EXTRACTED_TOTAL = Counter( + "mnemosyne_vision_concepts_extracted_total", + "Concepts extracted from images by vision analysis", + ["concept_type"], +) + # --- System State --- EMBEDDING_QUEUE_SIZE = Gauge( diff --git a/mnemosyne/library/models.py b/mnemosyne/library/models.py index 5f41fa3..f1ce492 100644 --- a/mnemosyne/library/models.py +++ b/mnemosyne/library/models.py @@ -60,6 +60,7 @@ class Library(StructuredNode): required=True, choices={ "fiction": "Fiction", + "nonfiction": "Non-Fiction", "technical": "Technical", "music": "Music", "film": "Film", @@ -219,6 +220,12 @@ class Image(StructuredNode): choices={ "cover": "Cover", "diagram": "Diagram", + "chart": "Chart", + "table": "Table", + "screenshot": "Screenshot", + "illustration": "Illustration", + "map": "Map", + "portrait": "Portrait", "artwork": "Artwork", "still": "Still", "photo": "Photo", @@ -227,10 +234,24 @@ class Image(StructuredNode): description = StringProperty(default="") metadata = 
JSONProperty(default={}) + # Vision analysis fields (Phase 2B) + ocr_text = StringProperty(default="") # Visible text extracted by vision model + vision_model_name = StringProperty(default="") # Which vision model analyzed this + analysis_status = StringProperty( + default="pending", + choices={ + "pending": "Pending", + "completed": "Completed", + "failed": "Failed", + "skipped": "Skipped", + }, + ) + created_at = DateTimeProperty(default_now=True) # Relationships embeddings = RelationshipTo("ImageEmbedding", "HAS_EMBEDDING") + concepts = RelationshipTo("Concept", "DEPICTS") def __str__(self): return f"Image {self.image_type} ({self.uid})" diff --git a/mnemosyne/library/services/pipeline.py b/mnemosyne/library/services/pipeline.py index 9182385..4020508 100644 --- a/mnemosyne/library/services/pipeline.py +++ b/mnemosyne/library/services/pipeline.py @@ -154,6 +154,7 @@ class EmbeddingPipeline: "chunks_created": 0, "chunks_embedded": 0, "images_stored": 0, + "images_analyzed": 0, "images_embedded": 0, "concepts_extracted": 0, "model_name": "", @@ -238,7 +239,29 @@ class EmbeddingPipeline: ) if progress_callback: - progress_callback(80, "Embedding images") + progress_callback(75, "Analyzing images with vision model") + + # --- Stage 5.5: Vision analysis (Phase 2B) --- + vision_model = LLMModel.get_system_vision_model() + if vision_model and image_nodes: + from .vision import VisionAnalyzer + + vision_prompt = self._get_vision_prompt(library) + analyzer = VisionAnalyzer(vision_model, user=self.user) + images_analyzed = analyzer.analyze_images( + image_nodes, + vision_prompt=vision_prompt, + item=item, + ) + result["images_analyzed"] = images_analyzed + elif image_nodes: + # No vision model — mark images as skipped + for img_node in image_nodes: + img_node.analysis_status = "skipped" + img_node.save() + + if progress_callback: + progress_callback(85, "Embedding images") # --- Stage 6: Embed images (multimodal) --- if image_nodes and embedding_model.supports_multimodal: 
def _get_vision_prompt(self, library) -> str:
    """
    Get the content-type-aware vision prompt for a library.

    The lookup is deliberately best-effort: a missing or unknown library
    type must never abort the embedding pipeline, so any failure yields an
    empty prompt. Unlike a silent swallow, the failure is logged so that a
    misconfigured library type is visible to operators.

    :param library: Library node, or None.
    :returns: Vision prompt string, or empty string when there is no
        library, no prompt configured, or the lookup fails.
    """
    if not library:
        return ""

    try:
        from library.content_types import get_library_type_config

        config = get_library_type_config(library.library_type)
        # ``or ""`` keeps the declared ``str`` return type even if the
        # config carries an explicit None for the prompt.
        return config.get("vision_prompt") or ""
    except Exception as exc:
        # Function-scope import keeps this method self-contained.
        import logging

        logging.getLogger(__name__).warning(
            "Could not resolve vision prompt for library: %s", exc
        )
        return ""
class VisionAnalyzer:
    """
    Analyzes images using a vision-capable LLM to extract structured metadata.

    For each image, produces:
    - image_type classification
    - natural language description
    - OCR text extraction
    - concept entities for knowledge graph integration

    Requires a system vision model configured in LLM Manager.
    """

    # Extensions whose MIME subtype is not simply the extension itself.
    _MIME_SUBTYPES = {
        "jpg": "jpeg",
        "jpe": "jpeg",
        "tif": "tiff",
        "svg": "svg+xml",
    }

    def __init__(self, vision_model, user=None):
        """
        :param vision_model: LLMModel instance for the vision model.
        :param user: Optional Django user for usage tracking.
        """
        self.vision_model = vision_model
        self.api = vision_model.api
        self.user = user
        self.base_url = self.api.base_url.rstrip("/")
        self.model_name = self.vision_model.name
        # Fall back to 120 seconds when the API has no timeout configured.
        self.timeout = self.api.timeout_seconds or 120

        logger.info(
            "VisionAnalyzer initialized model=%s api=%s",
            self.model_name,
            self.api.name,
        )

    def analyze_images(
        self,
        image_nodes: list,
        vision_prompt: str = "",
        item=None,
    ) -> int:
        """
        Analyze a list of Image nodes and update them with vision results.

        Failures are isolated per image: an image that cannot be read,
        analyzed or persisted is marked ``failed`` and the loop continues.

        :param image_nodes: List of Image node instances (with s3_key populated).
        :param vision_prompt: Content-type-aware prompt from the Library config.
        :param item: Optional Item node to connect image concepts via REFERENCES.
        :returns: Number of images successfully analyzed.
        """
        from django.core.files.storage import default_storage

        analyzed_count = 0

        for img_node in image_nodes:
            try:
                # Context manager guarantees the storage handle is released.
                with default_storage.open(img_node.s3_key, "rb") as handle:
                    img_data = handle.read()

                # Only look at the final path segment so a dot in a
                # directory name is never mistaken for an extension.
                filename = img_node.s3_key.rsplit("/", 1)[-1]
                ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else "png"

                result = self._analyze_single_image(img_data, ext, vision_prompt)

                if result:
                    self._apply_result(img_node, result, item)
                    analyzed_count += 1
                    VISION_ANALYSES_TOTAL.labels(status="success").inc()
                else:
                    self._mark_failed(img_node)

            except Exception as exc:
                logger.warning(
                    "Vision analysis failed for image s3_key=%s: %s",
                    img_node.s3_key,
                    exc,
                )
                self._mark_failed(img_node)

        if analyzed_count:
            logger.info("Vision analysis completed: %d/%d images", analyzed_count, len(image_nodes))

        return analyzed_count

    def _mark_failed(self, img_node):
        """
        Record a failed analysis on the metric and on the Image node.

        A persistence error here is logged instead of raised so that one
        unsaveable node cannot abort the remaining images in the batch.

        :param img_node: Image node whose analysis failed.
        """
        VISION_ANALYSES_TOTAL.labels(status="failed").inc()
        try:
            img_node.analysis_status = "failed"
            img_node.save()
        except Exception as exc:
            logger.warning(
                "Could not mark image s3_key=%s as failed: %s",
                getattr(img_node, "s3_key", "?"),
                exc,
            )

    @classmethod
    def _mime_type_for(cls, image_ext: str) -> str:
        """
        Map a file extension to an ``image/*`` MIME type.

        :param image_ext: Extension with or without a leading dot, any case.
        :returns: MIME type string; unknown extensions map to ``image/<ext>``
            and an empty extension is treated as PNG.
        """
        ext = (image_ext or "").strip().lstrip(".").lower() or "png"
        return f"image/{cls._MIME_SUBTYPES.get(ext, ext)}"

    @staticmethod
    def _clean_text(value, limit: int) -> str:
        """
        Coerce a model-supplied value to trimmed text capped at ``limit`` chars.

        :param value: Raw value from the parsed response (may be None or a list).
        :param limit: Maximum number of characters to keep.
        :returns: Normalized string; None becomes an empty string rather
            than the literal ``"None"``.
        """
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            # Some models return OCR output as a list of lines.
            value = " ".join(str(part) for part in value if part is not None)
        return str(value).strip()[:limit]

    def _analyze_single_image(
        self,
        image_data: bytes,
        image_ext: str,
        vision_prompt: str,
    ) -> Optional[dict]:
        """
        Send a single image to the vision model for analysis.

        :param image_data: Raw image bytes.
        :param image_ext: Image format extension (png, jpg, etc.).
        :param vision_prompt: Content-type-aware analysis prompt.
        :returns: Parsed result dict, or None on failure.
        """
        b64 = base64.b64encode(image_data).decode("utf-8")
        mime_type = self._mime_type_for(image_ext)

        # Build the user prompt with content-type context
        user_prompt = vision_prompt if vision_prompt else "Analyze this image."

        start_time = time.monotonic()
        try:
            response_text = self._call_vision_model(b64, mime_type, user_prompt)
        except Exception as exc:
            VISION_ANALYSIS_DURATION.observe(time.monotonic() - start_time)
            logger.warning("Vision model call failed: %s", exc)
            return None

        # Latency is observed exactly once per call, covering only the
        # model round-trip.
        elapsed = time.monotonic() - start_time
        VISION_ANALYSIS_DURATION.observe(elapsed)

        result = self._parse_vision_response(response_text)

        logger.debug(
            "Vision analysis completed in %.2fs image_type=%s description_len=%d",
            elapsed,
            result.get("image_type", "unknown") if result else "failed",
            len(result.get("description", "")) if result else 0,
        )
        return result

    def _call_vision_model(
        self,
        b64_image: str,
        mime_type: str,
        user_prompt: str,
    ) -> str:
        """
        Make a chat completion request with an image to the vision model.

        Uses OpenAI-compatible multimodal chat format.

        :param b64_image: Base64-encoded image data.
        :param mime_type: MIME type of the image.
        :param user_prompt: Text prompt to accompany the image.
        :returns: Response text from the model.
        :raises requests.HTTPError: If the API answers with a 4xx/5xx status.
        :raises ValueError: If the response has neither ``choices`` nor ``output``.
        """
        url = f"{self.base_url}/chat/completions"
        headers = {"Content-Type": "application/json"}
        if self.api.api_key:
            headers["Authorization"] = f"Bearer {self.api.api_key}"

        body = {
            "model": self.model_name,
            "messages": [
                {"role": "system", "content": VISION_SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": user_prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{b64_image}",
                            },
                        },
                    ],
                },
            ],
            "temperature": 0.1,
            "max_tokens": 800,
        }

        resp = requests.post(url, json=body, headers=headers, timeout=self.timeout)

        if resp.status_code != 200:
            logger.error(
                "Vision model request failed status=%d body=%s",
                resp.status_code,
                resp.text[:500],
            )
            resp.raise_for_status()

        data = resp.json()

        # Parse response — OpenAI-compatible format
        if "choices" in data:
            return data["choices"][0]["message"]["content"]
        if "output" in data:
            # Bedrock Converse format
            return data["output"]["message"]["content"][0]["text"]

        raise ValueError(f"Unexpected vision response format: {list(data.keys())}")

    def _parse_vision_response(self, response_text: str) -> Optional[dict]:
        """
        Parse the vision model's response into structured data.

        Tries, in order: the response with any Markdown code fence removed,
        then the outermost ``{...}`` span found anywhere in the raw response.

        :param response_text: Raw response text (expected JSON).
        :returns: Parsed dict with image_type, description, ocr_text, concepts,
            or None when no JSON object can be recovered.
        """
        raw = (response_text or "").strip()

        # Handle markdown code blocks
        text = raw
        if text.startswith("```"):
            lines = text.split("\n")[1:]  # drop the opening fence (``` or ```json)
            # Drop the closing fence only if it is really there; a response
            # truncated before the closing fence keeps its last JSON line.
            if lines and lines[-1].strip().startswith("```"):
                lines = lines[:-1]
            text = "\n".join(lines).strip()

        candidates = [text]
        match = re.search(r"\{.*\}", raw, re.DOTALL)
        if match:
            candidates.append(match.group())

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return self._validate_result(parsed)

        logger.debug("Could not parse vision response: %s", raw[:200])
        return None

    def _validate_result(self, result: dict) -> dict:
        """
        Validate and normalize the parsed vision result.

        Tolerates ``null`` or wrongly typed fields so a single malformed
        field does not discard an otherwise usable analysis.

        :param result: Raw parsed dict from the model.
        :returns: Normalized result dict with keys image_type, description,
            ocr_text and concepts (deduplicated by name, at most 20).
        """
        # Normalize image_type
        raw_type = result.get("image_type")
        image_type = raw_type.lower().strip() if isinstance(raw_type, str) else ""
        if image_type not in VALID_IMAGE_TYPES:
            image_type = "photo"  # Safe default

        # Normalize concepts
        raw_concepts = result.get("concepts")
        if not isinstance(raw_concepts, (list, tuple)):
            raw_concepts = []

        valid_concepts = []
        seen_names = set()
        for entry in raw_concepts:
            if not isinstance(entry, dict):
                continue
            raw_name = entry.get("name")
            if not isinstance(raw_name, str):
                continue
            name = raw_name.strip().lower()
            if len(name) < 2 or name in seen_names:
                continue
            seen_names.add(name)

            raw_concept_type = entry.get("type")
            if isinstance(raw_concept_type, str) and raw_concept_type.strip():
                concept_type = raw_concept_type.strip().lower()
            else:
                concept_type = "topic"

            valid_concepts.append({"name": name, "type": concept_type})

        return {
            "image_type": image_type,
            "description": self._clean_text(result.get("description"), 2000),
            "ocr_text": self._clean_text(result.get("ocr_text"), 5000),
            "concepts": valid_concepts[:20],  # Cap at 20 concepts per image
        }

    def _apply_result(self, img_node, result: dict, item=None):
        """
        Apply vision analysis results to an Image node and create graph relationships.

        :param img_node: Image node to update.
        :param result: Validated result dict from vision analysis.
        :param item: Optional Item node for REFERENCES relationships.
        """
        # Update Image node fields
        img_node.image_type = result["image_type"]
        img_node.description = result["description"]
        img_node.ocr_text = result["ocr_text"]
        img_node.vision_model_name = self.model_name
        img_node.analysis_status = "completed"
        img_node.save()

        # Create concept relationships
        for concept_data in result["concepts"]:
            name = concept_data["name"]
            concept_type = concept_data["type"]

            concept_node = self._get_or_create_concept(name, concept_type)
            if not concept_node:
                continue

            # Image DEPICTS Concept. Linking stays best-effort, but failures
            # are logged instead of silently discarded.
            try:
                img_node.concepts.connect(concept_node)
            except Exception as exc:
                logger.warning(
                    "Could not link image uid=%s to concept '%s': %s",
                    img_node.uid,
                    name,
                    exc,
                )

            # Also connect Item REFERENCES Concept (if item provided)
            if item:
                try:
                    item.concepts.connect(concept_node, {"weight": 0.8})
                except Exception as exc:
                    logger.warning(
                        "Could not link item to concept '%s': %s", name, exc
                    )

            VISION_CONCEPTS_EXTRACTED_TOTAL.labels(concept_type=concept_type).inc()

        logger.debug(
            "Applied vision result to image uid=%s type=%s concepts=%d",
            img_node.uid,
            result["image_type"],
            len(result["concepts"]),
        )

        # Log usage
        self._log_usage()

    def _get_or_create_concept(self, name: str, concept_type: str):
        """
        Get or create a Concept node by name (shared with text concept extraction).

        :param name: Concept name (lowercase).
        :param concept_type: Concept type (person, place, topic, etc.).
        :returns: Concept node, or None on failure.
        """
        from library.models import Concept

        try:
            # Single query instead of a truthiness check plus an index lookup.
            existing = Concept.nodes.first_or_none(name=name)
            if existing is not None:
                return existing

            concept = Concept(name=name, concept_type=concept_type)
            concept.save()
            return concept
        except Exception as exc:
            logger.debug("Failed to get/create concept '%s': %s", name, exc)
            return None

    def _log_usage(self):
        """Log vision analysis usage to LLMUsage model."""
        try:
            from llm_manager.models import LLMUsage

            LLMUsage.objects.create(
                model=self.vision_model,
                user=self.user,
                input_tokens=500,  # Approximate — vision tokens are hard to estimate
                output_tokens=200,
                cached_tokens=0,
                total_cost=0,
                purpose="vision_analysis",
            )
        except Exception as exc:
            logger.warning("Failed to log vision usage: %s", exc)

Images ({{ images|length }})

-
+
{% for img in images %} - - {% if img.s3_key %} -
- {{ img.description|default:'Image' }} -
- {% endif %} -
- {{ img.image_type|default:"image" }} - {% if img.description %} -

{{ img.description|truncatewords:10 }}

+
+
+ {% if img.s3_key %} + + {{ img.description|default:'Image' }} + {% endif %} -

- 🔍 Click to view -

+
+
+ {{ img.image_type|default:"image" }} + {% if img.analysis_status == "completed" %} + analyzed + {% elif img.analysis_status == "failed" %} + analysis failed + {% elif img.analysis_status == "skipped" %} + no vision model + {% endif %} + {% if img.vision_model_name %} + {{ img.vision_model_name }} + {% endif %} +
+ {% if img.description %} +

{{ img.description }}

+ {% endif %} + {% if img.ocr_text %} +
+ +
+ Extracted Text (OCR) +
+
+

{{ img.ocr_text }}

+
+
+ {% endif %} + {% if not img.description and not img.ocr_text %} + + 🔍 Click to view full image + + {% endif %} +
- +
{% endfor %}
class Migration(migrations.Migration):
    # Schema migration for Phase 2B: adds the system vision model flag (and
    # its index) to LLMModel and extends the LLMUsage.purpose choices.

    # Must run after the migration that added the Bedrock API type.
    dependencies = [
        ("llm_manager", "0002_add_bedrock_api_type"),
    ]

    operations = [
        # New boolean flag on LLMModel; defaults to False for existing rows.
        migrations.AddField(
            model_name="llmmodel",
            name="is_system_vision_model",
            field=models.BooleanField(
                default=False,
                help_text=(
                    "Mark this as the system-wide vision model for image analysis. "
                    "Only ONE vision model should have this set to True."
                ),
            ),
        ),
        # Composite index over the flag and model_type.
        # NOTE(review): the index name is spelled out here while the model's
        # Meta.indexes entry is unnamed — confirm it matches the name Django
        # generates, otherwise makemigrations will report a pending change.
        migrations.AddIndex(
            model_name="llmmodel",
            index=models.Index(
                fields=["is_system_vision_model", "model_type"],
                name="llm_manager__is_syst_b2f4e7_idx",
            ),
        ),
        # Re-declares LLMUsage.purpose with the added "vision_analysis" choice.
        migrations.AlterField(
            model_name="llmusage",
            name="purpose",
            field=models.CharField(
                choices=[
                    ("responder", "RAG Responder"),
                    ("reviewer", "RAG Reviewer"),
                    ("embeddings", "Document Embeddings"),
                    ("search", "Vector Search"),
                    ("reranking", "Re-ranking"),
                    ("multimodal_embed", "Multimodal Embedding"),
                    ("vision_analysis", "Vision Analysis"),
                    ("other", "Other"),
                ],
                db_index=True,
                default="other",
                max_length=50,
            ),
        ),
    ]
@classmethod
def get_system_vision_model(cls):
    """
    Look up the model flagged as the system-wide vision model.

    Only active models whose type is "vision" or "chat" are considered;
    the first match is returned.
    """
    criteria = {
        "is_system_vision_model": True,
        "is_active": True,
        "model_type__in": ["vision", "chat"],
    }
    candidates = cls.objects.filter(**criteria)
    return candidates.first()