From 33658fbc8d6ef2be776221bc8af665224e573712 Mon Sep 17 00:00:00 2001 From: Robert Helewka Date: Wed, 29 Apr 2026 06:26:17 -0400 Subject: [PATCH] feat(library): add business + finance types, workspace_id, IngestJob MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two new content-type-aware library types — `business` for proposals/marketing/strategy (used by the work-team agents) and `finance` for statements/tax/market commentary (used by Garth). Each ships with chunking config, embedding/reranker instructions, an LLM-context prompt, and a vision prompt; the `finance` LLM-context prompt additionally forbids inferring, rounding, or fabricating financial figures. Adds a unique-indexed `workspace_id` property to `Library` so a node can be scoped to a Daedalus workspace. Null means a global library; non-null means workspace-scoped. Search Cypher (added in a later commit) enforces the boundary. Adds an `IngestJob` Django ORM model — separate from neomodel — that tracks asynchronous ingestion lifecycle (Daedalus → S3 → Celery → embedding pipeline) and carries the fields used for idempotency on (library_uid, source_ref, content_hash); this commit indexes those fields but adds no database-level uniqueness constraint on the triple. Migration 0001_initial creates the table. Co-Authored-By: Claude Opus 4.7 --- mnemosyne/library/content_types.py | 65 ++++++++++- mnemosyne/library/migrations/0001_initial.py | 45 ++++++++ mnemosyne/library/models.py | 104 +++++++++++++++++- mnemosyne/library/tests/test_content_types.py | 22 +++- 4 files changed, 228 insertions(+), 8 deletions(-) create mode 100644 mnemosyne/library/migrations/0001_initial.py diff --git a/mnemosyne/library/content_types.py b/mnemosyne/library/content_types.py index 6be7e74..215df02 100644 --- a/mnemosyne/library/content_types.py +++ b/mnemosyne/library/content_types.py @@ -210,6 +210,69 @@ LIBRARY_TYPE_DEFAULTS = { "4) Context clues about when and where this was taken or created." 
), }, + "business": { + "chunking_config": { + "strategy": "section_aware", + "chunk_size": 640, + "chunk_overlap": 96, + "respect_boundaries": ["section", "subsection", "list", "table"], + }, + "embedding_instruction": ( + "Represent this passage from a business document for retrieval. " + "Focus on value propositions, positioning, pricing, scope of work, " + "client outcomes, and commercial commitments." + ), + "reranker_instruction": ( + "Re-rank passages from business documents based on commercial relevance. " + "Prioritize value framing, deliverables, client outcomes, and specific " + "pricing or scope language." + ), + "llm_context_prompt": ( + "The following excerpts are from business documents (proposals, marketing, " + "sales, strategy). Interpret in commercial context. Distinguish positioning " + "claims from committed deliverables. Preserve numbers, scope language, and " + "client names exactly as written." + ), + "vision_prompt": ( + "Analyze this image from a business document. Identify:\n" + "1) Image type (logo, chart, diagram, screenshot, photograph, table).\n" + "2) What it depicts — brand marks, data, organizational structure, products.\n" + "3) Any visible text — company names, figures, captions, headings.\n" + "4) The commercial purpose — positioning, pricing, capability demonstration." + ), + }, + "finance": { + "chunking_config": { + "strategy": "section_aware", + "chunk_size": 512, + "chunk_overlap": 64, + "respect_boundaries": ["section", "table", "row", "paragraph"], + }, + "embedding_instruction": ( + "Represent this passage from a financial document for retrieval. " + "Focus on accounts, instruments, dates, amounts, balances, and " + "analytical commentary." + ), + "reranker_instruction": ( + "Re-rank passages from financial documents based on relevance to the query. " + "Prioritize the matching account, instrument, time period, and figures." 
+ ), + "llm_context_prompt": ( + "The following excerpts are from financial documents (statements, tax " + "documents, market commentary, planning). Distinguish factual figures " + "(statements, transactions, balances) from opinion (forecasts, commentary). " + "Quote numbers, dates, and account identifiers exactly as they appear. " + "Do not infer, round, or fabricate financial figures. If a figure is not " + "present in the excerpts, say so explicitly." + ), + "vision_prompt": ( + "Analyze this image from a financial document. Identify:\n" + "1) Image type (chart, table, statement scan, dashboard screenshot, receipt).\n" + "2) What it depicts — account, instrument, time period, data series.\n" + "3) Any visible text — figures, dates, account identifiers, labels.\n" + "4) Whether the data is factual (statement) or analytical (forecast/commentary)." + ), + }, } @@ -219,7 +282,7 @@ def get_library_type_config(library_type): Args: library_type: One of 'fiction', 'nonfiction', 'technical', 'music', - 'film', 'art', 'journal' + 'film', 'art', 'journal', 'business', 'finance' Returns: dict with keys: chunking_config, embedding_instruction, diff --git a/mnemosyne/library/migrations/0001_initial.py b/mnemosyne/library/migrations/0001_initial.py new file mode 100644 index 0000000..abec455 --- /dev/null +++ b/mnemosyne/library/migrations/0001_initial.py @@ -0,0 +1,45 @@ +# Generated by Django 5.2.13 on 2026-04-28 12:36 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='IngestJob', + fields=[ + ('id', models.CharField(max_length=64, primary_key=True, serialize=False)), + ('item_uid', models.CharField(blank=True, db_index=True, max_length=64)), + ('library_uid', models.CharField(db_index=True, max_length=64)), + ('celery_task_id', models.CharField(blank=True, max_length=255)), + ('status', models.CharField(choices=[('pending', 'Pending'), 
('processing', 'Processing'), ('completed', 'Completed'), ('failed', 'Failed')], db_index=True, default='pending', max_length=20)), + ('progress', models.CharField(default='queued', max_length=50)), + ('error', models.TextField(blank=True, null=True)), + ('retry_count', models.PositiveIntegerField(default=0)), + ('chunks_created', models.PositiveIntegerField(default=0)), + ('concepts_extracted', models.PositiveIntegerField(default=0)), + ('embedding_model', models.CharField(blank=True, max_length=100)), + ('content_hash', models.CharField(blank=True, db_index=True, max_length=64)), + ('source', models.CharField(default='', max_length=50)), + ('source_ref', models.CharField(blank=True, db_index=True, max_length=200)), + ('s3_key', models.CharField(max_length=500)), + ('title', models.CharField(blank=True, max_length=500)), + ('file_type', models.CharField(blank=True, max_length=50)), + ('file_size', models.PositiveBigIntegerField(default=0)), + ('collection_uid', models.CharField(blank=True, max_length=64)), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('started_at', models.DateTimeField(blank=True, null=True)), + ('completed_at', models.DateTimeField(blank=True, null=True)), + ], + options={ + 'ordering': ['-created_at'], + 'indexes': [models.Index(fields=['status', '-created_at'], name='library_ing_status_9c95b2_idx'), models.Index(fields=['source', 'source_ref'], name='library_ing_source_a48684_idx')], + }, + ), + ] diff --git a/mnemosyne/library/models.py b/mnemosyne/library/models.py index f1ce492..171d7e6 100644 --- a/mnemosyne/library/models.py +++ b/mnemosyne/library/models.py @@ -1,11 +1,16 @@ """ -Neo4j graph models for the Mnemosyne content library. +Models for the Mnemosyne content library. -All content data (libraries, collections, items, chunks, concepts, images) -lives in Neo4j as a knowledge graph. These models use neomodel's StructuredNode -OGM — they do NOT participate in Django's ORM or migrations. 
+Most content (libraries, collections, items, chunks, concepts, images) +lives in Neo4j as a knowledge graph via neomodel StructuredNode. These do +NOT participate in Django's ORM or migrations. + +The IngestJob model at the bottom of this file is the exception: it tracks +the lifecycle of asynchronous ingestion requests (file → embedding pipeline) +in PostgreSQL via Django's ORM. """ +from django.db import models from neomodel import ( ArrayProperty, DateTimeProperty, @@ -50,8 +55,14 @@ class Library(StructuredNode): """ Top-level container representing a content library. - Each library has a type (fiction, technical, music, film, art, journal) - that drives chunking strategy, embedding instructions, and LLM prompts. + Each library has a type (fiction, nonfiction, technical, music, film, + art, journal, business, finance) that drives chunking strategy, + embedding instructions, and LLM prompts. + + A library may be either *global* (workspace_id is null — searchable + across the whole instance) or *workspace-scoped* (workspace_id set — + visible only to agents inside that Daedalus workspace). Scoping is + enforced structurally by every search query. """ uid = UniqueIdProperty() @@ -66,10 +77,16 @@ class Library(StructuredNode): "film": "Film", "art": "Art", "journal": "Journal", + "business": "Business", + "finance": "Finance", }, ) description = StringProperty(default="") + # Daedalus workspace UUID this library is scoped to. Null for global + # libraries. Unique-indexed so a workspace cannot have two libraries. 
+ workspace_id = StringProperty(unique_index=True, required=False) + # Content-type configuration chunking_config = JSONProperty(default={}) embedding_instruction = StringProperty(default="") @@ -270,3 +287,78 @@ class ImageEmbedding(StructuredNode): def __str__(self): return f"ImageEmbedding ({self.uid})" + + +# --- Django ORM models (PostgreSQL) --- + + +class IngestJob(models.Model): + """ + Tracks the lifecycle of an asynchronous ingestion + embedding job. + + Created when an external client (e.g. Daedalus) posts a file via the + REST ingest API. The Celery worker reads and updates this row as the + job moves through fetch / chunk / embed / graph stages. + + Idempotency: a (library, source_ref, content_hash) triple uniquely + identifies a piece of content. A second POST with the same triple + returns the existing job; a POST with the same source_ref but a new + content_hash supersedes the prior Item. + """ + + STATUS_CHOICES = [ + ("pending", "Pending"), + ("processing", "Processing"), + ("completed", "Completed"), + ("failed", "Failed"), + ] + + id = models.CharField(max_length=64, primary_key=True) + item_uid = models.CharField(max_length=64, db_index=True, blank=True) + library_uid = models.CharField(max_length=64, db_index=True) + celery_task_id = models.CharField(max_length=255, blank=True) + + status = models.CharField( + max_length=20, + choices=STATUS_CHOICES, + default="pending", + db_index=True, + ) + progress = models.CharField(max_length=50, default="queued") + error = models.TextField(blank=True, null=True) + retry_count = models.PositiveIntegerField(default=0) + + chunks_created = models.PositiveIntegerField(default=0) + concepts_extracted = models.PositiveIntegerField(default=0) + embedding_model = models.CharField(max_length=100, blank=True) + + # The file's content hash (sha256). Used for idempotency: a second + # ingest with the same source_ref + same hash is a no-op; a second + # ingest with the same source_ref + different hash supersedes. 
+ content_hash = models.CharField(max_length=64, db_index=True, blank=True) + + # Where the file came from. For Daedalus: source="daedalus", + # source_ref="/". + source = models.CharField(max_length=50, default="") + source_ref = models.CharField(max_length=200, blank=True, db_index=True) + s3_key = models.CharField(max_length=500) + + # Optional metadata carried forward to the Item node. + title = models.CharField(max_length=500, blank=True) + file_type = models.CharField(max_length=50, blank=True) + file_size = models.PositiveBigIntegerField(default=0) + collection_uid = models.CharField(max_length=64, blank=True) + + created_at = models.DateTimeField(auto_now_add=True) + started_at = models.DateTimeField(null=True, blank=True) + completed_at = models.DateTimeField(null=True, blank=True) + + class Meta: + ordering = ["-created_at"] + indexes = [ + models.Index(fields=["status", "-created_at"]), + models.Index(fields=["source", "source_ref"]), + ] + + def __str__(self): + return f"IngestJob {self.id} [{self.status}]" diff --git a/mnemosyne/library/tests/test_content_types.py b/mnemosyne/library/tests/test_content_types.py index 2f00605..8725c55 100644 --- a/mnemosyne/library/tests/test_content_types.py +++ b/mnemosyne/library/tests/test_content_types.py @@ -13,7 +13,17 @@ from library.content_types import LIBRARY_TYPE_DEFAULTS, get_library_type_config class LibraryTypeDefaultsTests(TestCase): """Tests for the LIBRARY_TYPE_DEFAULTS registry.""" - EXPECTED_TYPES = {"fiction", "nonfiction", "technical", "music", "film", "art", "journal"} + EXPECTED_TYPES = { + "fiction", + "nonfiction", + "technical", + "music", + "film", + "art", + "journal", + "business", + "finance", + } def test_all_expected_types_present(self): for lib_type in self.EXPECTED_TYPES: @@ -105,6 +115,16 @@ class VisionPromptTests(TestCase): prompt = config["vision_prompt"].lower() self.assertIn("historical", prompt) + def test_business_vision_prompt_mentions_logo_or_chart(self): + config = 
get_library_type_config("business") + prompt = config["vision_prompt"].lower() + self.assertTrue("logo" in prompt or "chart" in prompt) + + def test_finance_llm_context_forbids_fabrication(self): + config = get_library_type_config("finance") + prompt = config["llm_context_prompt"].lower() + self.assertIn("fabricate", prompt) + class GetLibraryTypeConfigTests(TestCase): """Tests for the get_library_type_config helper."""