370 lines
12 KiB
Python
370 lines
12 KiB
Python
"""
|
|
Models for the Mnemosyne content library.
|
|
|
|
Most content (libraries, collections, items, chunks, concepts, images)
|
|
lives in Neo4j as a knowledge graph via neomodel StructuredNode. These do
|
|
NOT participate in Django's ORM or migrations.
|
|
|
|
The IngestJob model at the bottom of this file is the exception: it tracks
|
|
the lifecycle of asynchronous ingestion requests (file → embedding pipeline)
|
|
in PostgreSQL via Django's ORM.
|
|
"""
|
|
|
|
from django.db import models
|
|
from neomodel import (
|
|
ArrayProperty,
|
|
DateTimeProperty,
|
|
FloatProperty,
|
|
IntegerProperty,
|
|
JSONProperty,
|
|
RelationshipTo,
|
|
StringProperty,
|
|
StructuredNode,
|
|
StructuredRel,
|
|
UniqueIdProperty,
|
|
)
|
|
|
|
|
|
# --- Relationship models ---
|
|
|
|
|
|
class ReferencesRel(StructuredRel):
    """Edge payload attached to Item -> Concept ``REFERENCES`` relationships."""

    # Numeric weight of the edge; 1.0 when the caller supplies none.
    weight = FloatProperty(default=1.0)

    # Free-text context stored on the edge; empty string when not supplied.
    context = StringProperty(default="")
|
|
|
|
|
|
class RelatedToRel(StructuredRel):
    """Edge payload attached to Item -> Item ``RELATED_TO`` relationships."""

    # Label describing the kind of relation; empty string when not supplied.
    relationship_type = StringProperty(default="")

    # Numeric weight of the edge; 1.0 when the caller supplies none.
    weight = FloatProperty(default=1.0)
|
|
|
|
|
|
class NearbyImageRel(StructuredRel):
    """Edge payload attached to Chunk -> Image ``HAS_NEARBY_IMAGE`` relationships."""

    # How the image relates positionally to the chunk. Values used so far:
    # same_page, inline, same_slide, same_chapter. The property declares no
    # ``choices``, so the set is documented here rather than enforced.
    proximity = StringProperty(default="same_page")
|
|
|
|
|
|
# --- Node models ---
|
|
|
|
|
|
class Library(StructuredNode):
    """
    Top-level container representing a content library.

    Each library has a type (fiction, nonfiction, technical, music, film,
    art, journal, business, finance) that drives chunking strategy,
    embedding instructions, and LLM prompts.

    A library may be either *global* (workspace_id is null — searchable
    across the whole instance) or *workspace-scoped* (workspace_id set —
    visible only to agents inside that Daedalus workspace). Scoping is
    enforced structurally by every search query.
    """

    uid = UniqueIdProperty()
    name = StringProperty(unique_index=True, required=True)
    library_type = StringProperty(
        required=True,
        choices={
            "fiction": "Fiction",
            "nonfiction": "Non-Fiction",
            "technical": "Technical",
            "music": "Music",
            "film": "Film",
            "art": "Art",
            "journal": "Journal",
            "business": "Business",
            "finance": "Finance",
        },
    )
    description = StringProperty(default="")

    # Daedalus workspace UUID this library is scoped to. Null for global
    # libraries. Unique-indexed so a workspace cannot have two libraries.
    workspace_id = StringProperty(unique_index=True, required=False)

    # For workspace-scoped libraries: the Mnemosyne username that owns
    # the workspace. Mutations via the workspaces API are restricted to
    # this user. Null for global libraries.
    owner_username = StringProperty(required=False, index=True)

    # Content-type configuration.
    # The default is the callable ``dict`` rather than a ``{}`` literal: a
    # literal would be one shared object handed to every node that falls back
    # to the default, so mutating one node's config in place would leak into
    # the others.
    chunking_config = JSONProperty(default=dict)
    embedding_instruction = StringProperty(default="")
    reranker_instruction = StringProperty(default="")
    llm_context_prompt = StringProperty(default="")

    created_at = DateTimeProperty(default_now=True)

    # Relationships
    collections = RelationshipTo("Collection", "CONTAINS")

    def __str__(self):
        """Return the library name followed by its type in parentheses."""
        return f"{self.name} ({self.library_type})"
|
|
|
|
|
|
class Collection(StructuredNode):
    """
    A grouping of items within a library.

    Examples: a book series, an album discography, a project folder.
    """

    uid = UniqueIdProperty()
    name = StringProperty(required=True)
    description = StringProperty(default="")

    # Callable default so every node gets its own empty dict; a ``{}`` literal
    # would be a single object shared by all nodes that use the default.
    metadata = JSONProperty(default=dict)

    created_at = DateTimeProperty(default_now=True)

    # Relationships
    items = RelationshipTo("Item", "CONTAINS")
    library = RelationshipTo("Library", "BELONGS_TO")

    def __str__(self):
        """Return the collection name."""
        return self.name
|
|
|
|
|
|
class Item(StructuredNode):
    """
    An individual piece of content: a document, song, image set, journal entry, etc.

    Items store their original file in S3 (via s3_key) and are chunked
    for embedding and retrieval.
    """

    uid = UniqueIdProperty()
    title = StringProperty(required=True)
    item_type = StringProperty(default="")
    s3_key = StringProperty(default="")
    content_hash = StringProperty(index=True)
    file_type = StringProperty(default="")
    file_size = IntegerProperty(default=0)

    # Callable default so every node gets its own empty dict; a ``{}`` literal
    # would be a single object shared by all nodes that use the default.
    metadata = JSONProperty(default=dict)

    created_at = DateTimeProperty(default_now=True)
    # NOTE(review): ``default_now`` only supplies a value when none is set, and
    # nothing in this class refreshes ``updated_at`` on later saves. Presumably
    # callers assign it explicitly when they modify an item — confirm.
    updated_at = DateTimeProperty(default_now=True)

    # Embedding pipeline fields (Phase 2)
    embedding_status = StringProperty(
        default="pending",
        choices={
            "pending": "Pending",
            "processing": "Processing",
            "completed": "Completed",
            "failed": "Failed",
        },
    )
    embedding_model_name = StringProperty(default="")
    chunk_count = IntegerProperty(default=0)
    image_count = IntegerProperty(default=0)
    error_message = StringProperty(default="")

    # Relationships
    chunks = RelationshipTo("Chunk", "HAS_CHUNK")
    images = RelationshipTo("Image", "HAS_IMAGE")
    concepts = RelationshipTo("Concept", "REFERENCES", model=ReferencesRel)
    related_items = RelationshipTo("Item", "RELATED_TO", model=RelatedToRel)

    def __str__(self):
        """Return the item title."""
        return self.title
|
|
|
|
|
|
class Chunk(StructuredNode):
    """
    One text segment cut from an Item, used for embedding and retrieval.

    The full chunk text lives in S3 (see ``chunk_s3_key``); ``text_preview``
    keeps the first 500 characters on the node for Neo4j full-text indexing.
    """

    uid = UniqueIdProperty()

    # Index of this chunk (required).
    chunk_index = IntegerProperty(required=True)

    # S3 key of the stored chunk text (required).
    chunk_s3_key = StringProperty(required=True)
    chunk_size = IntegerProperty(default=0)

    # First 500 chars for full-text index
    text_preview = StringProperty(default="")

    # 4096d vector
    embedding = ArrayProperty(FloatProperty())

    created_at = DateTimeProperty(default_now=True)

    # Relationships
    mentions = RelationshipTo("Concept", "MENTIONS")
    nearby_images = RelationshipTo(
        "Image",
        "HAS_NEARBY_IMAGE",
        model=NearbyImageRel,
    )

    def __str__(self):
        """Return ``Chunk <index> (<uid>)``."""
        return f"Chunk {self.chunk_index} ({self.uid})"
|
|
|
|
|
|
class Concept(StructuredNode):
    """
    A named entity or topic extracted from content.

    Concepts form the backbone of the knowledge graph, linking items
    and chunks through shared references.
    """

    uid = UniqueIdProperty()
    name = StringProperty(unique_index=True, required=True)
    concept_type = StringProperty(
        default="",
        choices={
            # The empty-string default must itself be a listed choice:
            # neomodel validates string values against ``choices``, so
            # without this entry a Concept created with no explicit type
            # is rejected as an invalid choice.
            "": "Unspecified",
            "person": "Person",
            "place": "Place",
            "topic": "Topic",
            "technique": "Technique",
            "theme": "Theme",
        },
    )
    embedding = ArrayProperty(FloatProperty())  # 4096d vector

    # Relationships
    related_concepts = RelationshipTo("Concept", "RELATED_TO")

    def __str__(self):
        """Return the concept name."""
        return self.name
|
|
|
|
|
|
class Image(StructuredNode):
    """
    An image associated with an Item (cover art, diagram, photo, etc.).

    The image file is stored in S3; embeddings enable multimodal search.
    """

    uid = UniqueIdProperty()
    s3_key = StringProperty(required=True)
    image_type = StringProperty(
        default="",
        choices={
            # The empty-string default must itself be a listed choice:
            # neomodel validates string values against ``choices``, so
            # without this entry an Image created with no explicit type
            # is rejected as an invalid choice.
            "": "Unspecified",
            "cover": "Cover",
            "diagram": "Diagram",
            "chart": "Chart",
            "table": "Table",
            "screenshot": "Screenshot",
            "illustration": "Illustration",
            "map": "Map",
            "portrait": "Portrait",
            "artwork": "Artwork",
            "still": "Still",
            "photo": "Photo",
        },
    )
    description = StringProperty(default="")

    # Callable default so every node gets its own empty dict; a ``{}`` literal
    # would be a single object shared by all nodes that use the default.
    metadata = JSONProperty(default=dict)

    # Vision analysis fields (Phase 2B)
    ocr_text = StringProperty(default="")  # Visible text extracted by vision model
    vision_model_name = StringProperty(default="")  # Which vision model analyzed this
    analysis_status = StringProperty(
        default="pending",
        choices={
            "pending": "Pending",
            "completed": "Completed",
            "failed": "Failed",
            "skipped": "Skipped",
        },
    )

    created_at = DateTimeProperty(default_now=True)

    # Relationships
    embeddings = RelationshipTo("ImageEmbedding", "HAS_EMBEDDING")
    concepts = RelationshipTo("Concept", "DEPICTS")

    def __str__(self):
        """Return ``Image <image_type> (<uid>)``."""
        return f"Image {self.image_type} ({self.uid})"
|
|
|
|
|
|
class ImageEmbedding(StructuredNode):
    """
    Multimodal embedding vector belonging to an Image node.

    Produced by Qwen3-VL so that text and images share one vector space.
    """

    uid = UniqueIdProperty()

    # 4096d multimodal vector
    embedding = ArrayProperty(FloatProperty())

    created_at = DateTimeProperty(default_now=True)

    def __str__(self):
        """Return ``ImageEmbedding (<uid>)``."""
        return f"ImageEmbedding ({self.uid})"
|
|
|
|
|
|
# --- Django ORM models (PostgreSQL) ---
|
|
|
|
|
|
class IngestJob(models.Model):
    """
    Lifecycle record for one asynchronous ingestion + embedding job.

    A row is created when an external client (e.g. Daedalus) posts a file via
    the REST ingest API. The Celery worker reads and updates the row as the
    job moves through its fetch / chunk / embed / graph stages.

    Idempotency: a (library, source_ref, content_hash) triple uniquely
    identifies a piece of content. A second POST with the same triple
    returns the existing job; a POST with the same source_ref but a new
    content_hash supersedes the prior Item.

    NOTE(review): ``Meta`` declares indexes only, with no unique constraint
    on that triple, so the rule is presumably enforced by the ingest API —
    confirm.
    """

    STATUS_CHOICES = [
        ("pending", "Pending"),
        ("processing", "Processing"),
        ("completed", "Completed"),
        ("failed", "Failed"),
    ]

    # --- Identity and cross-references ---
    id = models.CharField(max_length=64, primary_key=True)
    item_uid = models.CharField(max_length=64, db_index=True, blank=True)
    library_uid = models.CharField(max_length=64, db_index=True)
    celery_task_id = models.CharField(max_length=255, blank=True)

    # --- Job state ---
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default="pending",
        db_index=True,
    )
    progress = models.CharField(max_length=50, default="queued")
    error = models.TextField(blank=True, null=True)
    retry_count = models.PositiveIntegerField(default=0)

    # --- Result counters ---
    chunks_created = models.PositiveIntegerField(default=0)
    concepts_extracted = models.PositiveIntegerField(default=0)
    embedding_model = models.CharField(max_length=100, blank=True)

    # sha256 of the file content. Drives idempotency: re-ingesting the same
    # source_ref with the same hash is a no-op, while the same source_ref
    # with a different hash supersedes the earlier content.
    content_hash = models.CharField(max_length=64, db_index=True, blank=True)

    # Origin of the file. For Daedalus: source="daedalus",
    # source_ref="<workspace_id>/<file_id>".
    source = models.CharField(max_length=100, default="")
    source_ref = models.CharField(max_length=200, blank=True, db_index=True)
    s3_key = models.CharField(max_length=500)

    # Optional metadata carried forward to the Item node.
    title = models.CharField(max_length=500, blank=True)
    file_type = models.CharField(max_length=100, blank=True)
    file_size = models.PositiveBigIntegerField(default=0)
    collection_uid = models.CharField(max_length=64, blank=True)

    # --- Timestamps ---
    created_at = models.DateTimeField(auto_now_add=True)
    started_at = models.DateTimeField(null=True, blank=True)
    completed_at = models.DateTimeField(null=True, blank=True)

    class Meta:
        # Newest jobs first by default.
        ordering = ["-created_at"]
        indexes = [
            models.Index(fields=["status", "-created_at"]),
            models.Index(fields=["source", "source_ref"]),
        ]

    def __str__(self):
        """Return ``IngestJob <id> [<status>]``."""
        return f"IngestJob {self.id} [{self.status}]"
|