# mnemosyne/mnemosyne/library/models.py

"""
Models for the Mnemosyne content library.

Most content (libraries, collections, items, chunks, concepts, images)
lives in Neo4j as a knowledge graph via neomodel StructuredNode. These do
NOT participate in Django's ORM or migrations.

The IngestJob model at the bottom of this file is the exception: it tracks
the lifecycle of asynchronous ingestion requests (file → embedding pipeline)
in PostgreSQL via Django's ORM.
"""

from django.db import models
from neomodel import (
    ArrayProperty,
    DateTimeProperty,
    FloatProperty,
    IntegerProperty,
    JSONProperty,
    RelationshipTo,
    StringProperty,
    StructuredNode,
    StructuredRel,
    UniqueIdProperty,
)


# --- Relationship models ---


class ReferencesRel(StructuredRel):
    """
    Edge payload carried by Item -> Concept REFERENCES relationships.

    Used as the ``model`` of the ``Item.concepts`` relationship.
    """

    # Free-text context stored on the edge; empty string when not supplied.
    context = StringProperty(default="")
    # Numeric weight stored on the edge; 1.0 when not supplied.
    weight = FloatProperty(default=1.0)


class RelatedToRel(StructuredRel):
    """
    Edge payload carried by Item -> Item RELATED_TO relationships.

    Used as the ``model`` of the ``Item.related_items`` relationship.
    """

    # Numeric weight stored on the edge; 1.0 when not supplied.
    weight = FloatProperty(default=1.0)
    # Free-form label for the kind of relation; empty string when not supplied.
    relationship_type = StringProperty(default="")


class NearbyImageRel(StructuredRel):
    """
    Edge payload carried by Chunk -> Image HAS_NEARBY_IMAGE relationships.

    Used as the ``model`` of the ``Chunk.nearby_images`` relationship.
    """

    # How the image sits relative to the chunk. Expected values (not enforced
    # through ``choices``): same_page, inline, same_slide, same_chapter.
    proximity = StringProperty(default="same_page")


# --- Node models ---


class Library(StructuredNode):
    """
    Top-level container representing a content library.

    Each library has a type (fiction, nonfiction, technical, music, film,
    art, journal, business, finance) that is meant to drive chunking
    strategy, embedding instructions, and LLM prompts.

    A library may be either *global* (workspace_id is null) or
    *workspace-scoped* (workspace_id set to a Daedalus workspace UUID).
    The scoping itself is not enforced in this class; it is expected to be
    applied by the search queries that live outside this module.
    """

    uid = UniqueIdProperty()
    name = StringProperty(unique_index=True, required=True)
    library_type = StringProperty(
        required=True,
        choices={
            "fiction": "Fiction",
            "nonfiction": "Non-Fiction",
            "technical": "Technical",
            "music": "Music",
            "film": "Film",
            "art": "Art",
            "journal": "Journal",
            "business": "Business",
            "finance": "Finance",
        },
    )
    description = StringProperty(default="")

    # Daedalus workspace UUID this library is scoped to. Null for global
    # libraries. Unique-indexed so a workspace cannot have two libraries.
    workspace_id = StringProperty(unique_index=True, required=False)

    # For workspace-scoped libraries: the Mnemosyne username that owns
    # the workspace. Mutations via the workspaces API are restricted to
    # this user. Null for global libraries.
    owner_username = StringProperty(required=False, index=True)

    # Content-type configuration.
    # The default is the ``dict`` callable rather than a ``{}`` literal so
    # every new node gets its own empty dict; a literal would be one shared
    # object that in-place edits on one node would leak into all others.
    chunking_config = JSONProperty(default=dict)
    embedding_instruction = StringProperty(default="")
    reranker_instruction = StringProperty(default="")
    llm_context_prompt = StringProperty(default="")

    # Stamped with the current time when the node is first created.
    created_at = DateTimeProperty(default_now=True)

    # Relationships
    collections = RelationshipTo("Collection", "CONTAINS")

    def __str__(self):
        """Return the library name followed by its type in parentheses."""
        return f"{self.name} ({self.library_type})"


class Collection(StructuredNode):
    """
    A grouping of items within a library.

    Examples: a book series, an album discography, a project folder.
    """

    uid = UniqueIdProperty()
    name = StringProperty(required=True)
    description = StringProperty(default="")
    # Callable default: each new node receives a fresh dict. A ``{}`` literal
    # would be a single object shared (and mutated) across all new nodes.
    metadata = JSONProperty(default=dict)

    # Stamped with the current time when the node is first created.
    created_at = DateTimeProperty(default_now=True)

    # Relationships
    items = RelationshipTo("Item", "CONTAINS")
    library = RelationshipTo("Library", "BELONGS_TO")

    def __str__(self):
        """Return the collection name."""
        return self.name


class Item(StructuredNode):
    """
    An individual piece of content: a document, song, image set, journal entry, etc.

    Items reference their original file in S3 (via s3_key) and are chunked
    for embedding and retrieval; the resulting Chunk and Image nodes hang
    off the ``chunks`` and ``images`` relationships.
    """

    uid = UniqueIdProperty()
    title = StringProperty(required=True)
    item_type = StringProperty(default="")
    s3_key = StringProperty(default="")
    content_hash = StringProperty(index=True)
    file_type = StringProperty(default="")
    file_size = IntegerProperty(default=0)
    # Callable default: each new node receives a fresh dict. A ``{}`` literal
    # would be a single object shared (and mutated) across all new nodes.
    metadata = JSONProperty(default=dict)

    created_at = DateTimeProperty(default_now=True)
    # NOTE(review): default_now only supplies a creation-time default; nothing
    # in this class refreshes updated_at on later saves, so writers must set
    # it themselves — confirm that callers do.
    updated_at = DateTimeProperty(default_now=True)

    # Embedding pipeline fields (Phase 2)
    embedding_status = StringProperty(
        default="pending",
        choices={
            "pending": "Pending",
            "processing": "Processing",
            "completed": "Completed",
            "failed": "Failed",
        },
    )
    embedding_model_name = StringProperty(default="")
    chunk_count = IntegerProperty(default=0)
    image_count = IntegerProperty(default=0)
    error_message = StringProperty(default="")

    # Relationships
    chunks = RelationshipTo("Chunk", "HAS_CHUNK")
    images = RelationshipTo("Image", "HAS_IMAGE")
    concepts = RelationshipTo("Concept", "REFERENCES", model=ReferencesRel)
    related_items = RelationshipTo("Item", "RELATED_TO", model=RelatedToRel)

    def __str__(self):
        """Return the item title."""
        return self.title


class Chunk(StructuredNode):
    """
    One slice of text cut from an Item for embedding and retrieval.

    The full chunk text is referenced through ``chunk_s3_key``; only a short
    preview (intended to be the first 500 characters) is kept on the node
    for Neo4j full-text indexing.
    """

    uid = UniqueIdProperty()

    # Position and storage location of the chunk; both are mandatory.
    chunk_index = IntegerProperty(required=True)
    chunk_s3_key = StringProperty(required=True)

    chunk_size = IntegerProperty(default=0)
    # Leading excerpt of the chunk text, used by the full-text index.
    text_preview = StringProperty(default="")
    # Embedding vector as a list of floats (author's note: 4096 dimensions).
    embedding = ArrayProperty(FloatProperty())

    # Stamped with the current time when the node is first created.
    created_at = DateTimeProperty(default_now=True)

    # Outgoing edges
    nearby_images = RelationshipTo("Image", "HAS_NEARBY_IMAGE", model=NearbyImageRel)
    mentions = RelationshipTo("Concept", "MENTIONS")

    def __str__(self):
        """Return ``Chunk <index> (<uid>)``."""
        return "Chunk {} ({})".format(self.chunk_index, self.uid)


class Concept(StructuredNode):
    """
    A named entity or topic pulled out of library content.

    Items, chunks and images point at Concept nodes, so shared concepts
    are what tie otherwise separate content together in the graph.
    """

    uid = UniqueIdProperty()
    # Concept names are mandatory and unique across the graph.
    name = StringProperty(unique_index=True, required=True)

    # Embedding vector as a list of floats (author's note: 4096 dimensions).
    embedding = ArrayProperty(FloatProperty())

    # NOTE(review): the default "" is not one of the listed choices. neomodel
    # validates choices when a value is written, so saving a Concept without
    # an explicit concept_type may be rejected — confirm, then either add a
    # blank choice or drop the default.
    concept_type = StringProperty(
        choices={
            "person": "Person",
            "place": "Place",
            "topic": "Topic",
            "technique": "Technique",
            "theme": "Theme",
        },
        default="",
    )

    # Outgoing edges
    related_concepts = RelationshipTo("Concept", "RELATED_TO")

    def __str__(self):
        """Return the concept name."""
        return self.name


class Image(StructuredNode):
    """
    An image associated with an Item (cover art, diagram, photo, etc.).

    The image file is referenced in S3 through ``s3_key``; embedding vectors
    live on separate ImageEmbedding nodes reached via ``embeddings``.
    """

    uid = UniqueIdProperty()
    s3_key = StringProperty(required=True)
    # NOTE(review): the default "" is not one of the listed choices. neomodel
    # validates choices when a value is written, so saving an Image without
    # an explicit image_type may be rejected — confirm, then either add a
    # blank choice or drop the default.
    image_type = StringProperty(
        default="",
        choices={
            "cover": "Cover",
            "diagram": "Diagram",
            "chart": "Chart",
            "table": "Table",
            "screenshot": "Screenshot",
            "illustration": "Illustration",
            "map": "Map",
            "portrait": "Portrait",
            "artwork": "Artwork",
            "still": "Still",
            "photo": "Photo",
        },
    )
    description = StringProperty(default="")
    # Callable default: each new node receives a fresh dict. A ``{}`` literal
    # would be a single object shared (and mutated) across all new nodes.
    metadata = JSONProperty(default=dict)

    # Vision analysis fields (Phase 2B)
    ocr_text = StringProperty(default="")  # Visible text extracted by vision model
    vision_model_name = StringProperty(default="")  # Which vision model analyzed this
    analysis_status = StringProperty(
        default="pending",
        choices={
            "pending": "Pending",
            "completed": "Completed",
            "failed": "Failed",
            "skipped": "Skipped",
        },
    )

    # Stamped with the current time when the node is first created.
    created_at = DateTimeProperty(default_now=True)

    # Relationships
    embeddings = RelationshipTo("ImageEmbedding", "HAS_EMBEDDING")
    concepts = RelationshipTo("Concept", "DEPICTS")

    def __str__(self):
        """Return ``Image <image_type> (<uid>)``."""
        return f"Image {self.image_type} ({self.uid})"


class ImageEmbedding(StructuredNode):
    """
    Holds one embedding vector for an Image node.

    Per the original author's note the vector is produced by Qwen3-VL so
    that text and images share a single vector space; nothing in this class
    enforces the producing model or the vector length.
    """

    uid = UniqueIdProperty()
    # Stamped with the current time when the node is first created.
    created_at = DateTimeProperty(default_now=True)
    # Multimodal vector as a list of floats (author's note: 4096 dimensions).
    embedding = ArrayProperty(FloatProperty())

    def __str__(self):
        """Return ``ImageEmbedding (<uid>)``."""
        return "ImageEmbedding ({})".format(self.uid)


# --- Django ORM models (PostgreSQL) ---


class IngestJob(models.Model):
    """
    Relational record of one asynchronous ingestion + embedding job.

    Per the module design, a row is created when an external client
    (e.g. Daedalus) posts a file through the REST ingest API, and the Celery
    worker updates it as the job passes through its fetch / chunk / embed /
    graph stages.

    Idempotency contract: a (library, source_ref, content_hash) triple
    identifies a piece of content. Re-posting the same triple should return
    the existing job; the same source_ref with a new content_hash supersedes
    the prior Item.

    NOTE(review): no unique constraint for that triple is declared in
    ``Meta`` below, so the contract has to be upheld by the calling code —
    confirm that it is.
    """

    STATUS_CHOICES = [
        ("pending", "Pending"),
        ("processing", "Processing"),
        ("completed", "Completed"),
        ("failed", "Failed"),
    ]

    # Identity and cross-references (graph node uids, Celery task id).
    id = models.CharField(primary_key=True, max_length=64)
    item_uid = models.CharField(max_length=64, blank=True, db_index=True)
    library_uid = models.CharField(max_length=64, db_index=True)
    celery_task_id = models.CharField(max_length=255, blank=True)

    # Lifecycle state.
    status = models.CharField(
        choices=STATUS_CHOICES,
        default="pending",
        max_length=20,
        db_index=True,
    )
    progress = models.CharField(max_length=50, default="queued")
    error = models.TextField(null=True, blank=True)
    retry_count = models.PositiveIntegerField(default=0)

    # Result counters and the embedding model recorded for the job.
    chunks_created = models.PositiveIntegerField(default=0)
    concepts_extracted = models.PositiveIntegerField(default=0)
    embedding_model = models.CharField(max_length=100, blank=True)

    # Content hash of the file (sha256). Drives idempotency: same
    # source_ref + same hash is a no-op; same source_ref + different hash
    # supersedes.
    content_hash = models.CharField(max_length=64, blank=True, db_index=True)

    # Origin of the file. For Daedalus: source="daedalus",
    # source_ref="<workspace_id>/<file_id>".
    source = models.CharField(max_length=100, default="")
    source_ref = models.CharField(max_length=200, db_index=True, blank=True)
    s3_key = models.CharField(max_length=500)

    # Optional metadata carried forward to the Item node.
    title = models.CharField(max_length=500, blank=True)
    file_type = models.CharField(max_length=100, blank=True)
    file_size = models.PositiveBigIntegerField(default=0)
    collection_uid = models.CharField(max_length=64, blank=True)

    # Timestamps: created_at is set automatically on insert; the other two
    # stay null until something writes them.
    created_at = models.DateTimeField(auto_now_add=True)
    started_at = models.DateTimeField(blank=True, null=True)
    completed_at = models.DateTimeField(blank=True, null=True)

    class Meta:
        # Newest jobs first by default.
        ordering = ["-created_at"]
        indexes = [
            models.Index(fields=["status", "-created_at"]),
            models.Index(fields=["source", "source_ref"]),
        ]

    def __str__(self):
        """Return ``IngestJob <id> [<status>]``."""
        return "IngestJob {} [{}]".format(self.id, self.status)