Add vision analysis capabilities to the embedding pipeline

- Introduced a new vision analysis service to classify, describe, and extract text from images.
- Enhanced the Image model with fields for OCR text, vision model name, and analysis status.
- Added a new "nonfiction" library type with specific chunking and embedding configurations.
- Updated content types to include vision prompts for various library types.
- Integrated vision analysis into the embedding pipeline, allowing for image analysis during document processing.
- Implemented metrics to track vision analysis performance and usage.
- Updated UI components to display vision analysis results and statuses in item details and the embedding dashboard.
- Added migration for new vision model fields and usage tracking.
2026-03-22 15:14:34 +00:00
parent 6585beed20
commit 90db904959
11 changed files with 914 additions and 19 deletions
--- a/mnemosyne/llm_manager/migrations/0003_add_vision_model_and_usage.py
+++ b/mnemosyne/llm_manager/migrations/0003_add_vision_model_and_usage.py
@@ -0,0 +1,52 @@
+"""
+Add is_system_vision_model to LLMModel and vision_analysis purpose to LLMUsage.
+"""
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):  # schema changes for vision analysis: flag + index on LLMModel, new LLMUsage purpose choice
+
+    dependencies = [
+        ("llm_manager", "0002_add_bedrock_api_type"),  # must run after the previous llm_manager migration
+    ]
+
+    operations = [
+        migrations.AddField(  # 1) add the boolean flag to LLMModel; existing rows get False
+            model_name="llmmodel",
+            name="is_system_vision_model",
+            field=models.BooleanField(
+                default=False,
+                help_text=(
+                    "Mark this as the system-wide vision model for image analysis. "
+                    "Only ONE vision model should have this set to True."
+                ),
+            ),
+        ),
+        migrations.AddIndex(  # 2) composite index on (is_system_vision_model, model_type)
+            model_name="llmmodel",
+            index=models.Index(
+                fields=["is_system_vision_model", "model_type"],
+                name="llm_manager__is_syst_b2f4e7_idx",  # NOTE(review): name looks hand-written; confirm it matches the name Django derives for the model's unnamed Index, or makemigrations may emit a follow-up migration
+            ),
+        ),
+        migrations.AlterField(  # 3) redeclare LLMUsage.purpose so its choices include "vision_analysis"
+            model_name="llmusage",
+            name="purpose",
+            field=models.CharField(
+                choices=[
+                    ("responder", "RAG Responder"),
+                    ("reviewer", "RAG Reviewer"),
+                    ("embeddings", "Document Embeddings"),
+                    ("search", "Vector Search"),
+                    ("reranking", "Re-ranking"),
+                    ("multimodal_embed", "Multimodal Embedding"),
+                    ("vision_analysis", "Vision Analysis"),  # the choice added by this migration
+                    ("other", "Other"),
+                ],
+                db_index=True,
+                default="other",
+                max_length=50,
+            ),
+        ),
+    ]
--- a/mnemosyne/llm_manager/models.py
+++ b/mnemosyne/llm_manager/models.py
@@ -179,6 +179,13 @@ class LLMModel(models.Model):
            "Only ONE reranker model should have this set to True."
        ),
    )
+    is_system_vision_model = models.BooleanField(
+        default=False,
+        help_text=(
+            "Mark this as the system-wide vision model for image analysis. "
+            "Only ONE vision model should have this set to True."
+        ),
+    )

    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)
@@ -191,6 +198,7 @@ class LLMModel(models.Model):
            models.Index(fields=["is_system_embedding_model", "model_type"]),
            models.Index(fields=["is_system_chat_model", "model_type"]),
            models.Index(fields=["is_system_reranker_model", "model_type"]),
+            models.Index(fields=["is_system_vision_model", "model_type"]),
        ]

    def __str__(self):
@@ -223,6 +231,15 @@ class LLMModel(models.Model):
            model_type="reranker",
        ).first()

+    @classmethod
+    def get_system_vision_model(cls):
+        """Return the first active model flagged is_system_vision_model whose model_type is vision or chat, or None if there is none."""
+        return cls.objects.filter(
+            is_system_vision_model=True,
+            is_active=True,
+            model_type__in=["vision", "chat"],
+        ).first()
+

 class LLMUsage(models.Model):
    """
@@ -259,6 +276,7 @@ class LLMUsage(models.Model):
            ("search", "Vector Search"),
            ("reranking", "Re-ranking"),
            ("multimodal_embed", "Multimodal Embedding"),
+            ("vision_analysis", "Vision Analysis"),
            ("other", "Other"),
        ],
        default="other",