feat(library): add business + finance types, workspace_id, IngestJob
Adds two new content-type-aware library types — `business` for proposals/marketing/strategy (used by the work-team agents) and `finance` for statements/tax/market commentary (used by Garth). Each ships with chunking config, embedding/reranker instructions, an LLM-context prompt, and a vision prompt; the finance LLM-context prompt additionally forbids fabricating financial figures.

Adds a unique-indexed `workspace_id` property to `Library` so a node can be scoped to a Daedalus workspace. Null means a global library; non-null means workspace-scoped. Search Cypher (added in a later commit) enforces the boundary.

Adds an `IngestJob` Django ORM model — separate from neomodel — that tracks asynchronous ingestion lifecycle (Daedalus → S3 → Celery → embedding pipeline) with idempotency on (library, source_ref, hash). Migration 0001_initial creates the table.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -210,6 +210,69 @@ LIBRARY_TYPE_DEFAULTS = {
|
|||||||
"4) Context clues about when and where this was taken or created."
|
"4) Context clues about when and where this was taken or created."
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
|
"business": {
|
||||||
|
"chunking_config": {
|
||||||
|
"strategy": "section_aware",
|
||||||
|
"chunk_size": 640,
|
||||||
|
"chunk_overlap": 96,
|
||||||
|
"respect_boundaries": ["section", "subsection", "list", "table"],
|
||||||
|
},
|
||||||
|
"embedding_instruction": (
|
||||||
|
"Represent this passage from a business document for retrieval. "
|
||||||
|
"Focus on value propositions, positioning, pricing, scope of work, "
|
||||||
|
"client outcomes, and commercial commitments."
|
||||||
|
),
|
||||||
|
"reranker_instruction": (
|
||||||
|
"Re-rank passages from business documents based on commercial relevance. "
|
||||||
|
"Prioritize value framing, deliverables, client outcomes, and specific "
|
||||||
|
"pricing or scope language."
|
||||||
|
),
|
||||||
|
"llm_context_prompt": (
|
||||||
|
"The following excerpts are from business documents (proposals, marketing, "
|
||||||
|
"sales, strategy). Interpret in commercial context. Distinguish positioning "
|
||||||
|
"claims from committed deliverables. Preserve numbers, scope language, and "
|
||||||
|
"client names exactly as written."
|
||||||
|
),
|
||||||
|
"vision_prompt": (
|
||||||
|
"Analyze this image from a business document. Identify:\n"
|
||||||
|
"1) Image type (logo, chart, diagram, screenshot, photograph, table).\n"
|
||||||
|
"2) What it depicts — brand marks, data, organizational structure, products.\n"
|
||||||
|
"3) Any visible text — company names, figures, captions, headings.\n"
|
||||||
|
"4) The commercial purpose — positioning, pricing, capability demonstration."
|
||||||
|
),
|
||||||
|
},
|
||||||
|
"finance": {
|
||||||
|
"chunking_config": {
|
||||||
|
"strategy": "section_aware",
|
||||||
|
"chunk_size": 512,
|
||||||
|
"chunk_overlap": 64,
|
||||||
|
"respect_boundaries": ["section", "table", "row", "paragraph"],
|
||||||
|
},
|
||||||
|
"embedding_instruction": (
|
||||||
|
"Represent this passage from a financial document for retrieval. "
|
||||||
|
"Focus on accounts, instruments, dates, amounts, balances, and "
|
||||||
|
"analytical commentary."
|
||||||
|
),
|
||||||
|
"reranker_instruction": (
|
||||||
|
"Re-rank passages from financial documents based on relevance to the query. "
|
||||||
|
"Prioritize the matching account, instrument, time period, and figures."
|
||||||
|
),
|
||||||
|
"llm_context_prompt": (
|
||||||
|
"The following excerpts are from financial documents (statements, tax "
|
||||||
|
"documents, market commentary, planning). Distinguish factual figures "
|
||||||
|
"(statements, transactions, balances) from opinion (forecasts, commentary). "
|
||||||
|
"Quote numbers, dates, and account identifiers exactly as they appear. "
|
||||||
|
"Do not infer, round, or fabricate financial figures. If a figure is not "
|
||||||
|
"present in the excerpts, say so explicitly."
|
||||||
|
),
|
||||||
|
"vision_prompt": (
|
||||||
|
"Analyze this image from a financial document. Identify:\n"
|
||||||
|
"1) Image type (chart, table, statement scan, dashboard screenshot, receipt).\n"
|
||||||
|
"2) What it depicts — account, instrument, time period, data series.\n"
|
||||||
|
"3) Any visible text — figures, dates, account identifiers, labels.\n"
|
||||||
|
"4) Whether the data is factual (statement) or analytical (forecast/commentary)."
|
||||||
|
),
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -219,7 +282,7 @@ def get_library_type_config(library_type):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
library_type: One of 'fiction', 'nonfiction', 'technical', 'music',
|
library_type: One of 'fiction', 'nonfiction', 'technical', 'music',
|
||||||
'film', 'art', 'journal'
|
'film', 'art', 'journal', 'business', 'finance'
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict with keys: chunking_config, embedding_instruction,
|
dict with keys: chunking_config, embedding_instruction,
|
||||||
|
|||||||
45
mnemosyne/library/migrations/0001_initial.py
Normal file
45
mnemosyne/library/migrations/0001_initial.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
# Generated by Django 5.2.13 on 2026-04-28 12:36
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Initial schema migration: creates the ``IngestJob`` table.

    Generated by Django. Field definitions, ordering and index names are
    reproduced exactly as generated; only the layout has been reflowed so
    each field and index can be read (and diffed) on its own lines.
    """

    initial = True

    # Every column is a plain scalar field (no ForeignKey), so this
    # migration depends on nothing.
    dependencies = []

    operations = [
        migrations.CreateModel(
            name="IngestJob",
            fields=[
                # Identity and cross-references (all stored as strings).
                ("id", models.CharField(max_length=64, primary_key=True, serialize=False)),
                ("item_uid", models.CharField(blank=True, db_index=True, max_length=64)),
                ("library_uid", models.CharField(db_index=True, max_length=64)),
                ("celery_task_id", models.CharField(blank=True, max_length=255)),
                # Lifecycle state.
                (
                    "status",
                    models.CharField(
                        choices=[
                            ("pending", "Pending"),
                            ("processing", "Processing"),
                            ("completed", "Completed"),
                            ("failed", "Failed"),
                        ],
                        db_index=True,
                        default="pending",
                        max_length=20,
                    ),
                ),
                ("progress", models.CharField(default="queued", max_length=50)),
                ("error", models.TextField(blank=True, null=True)),
                ("retry_count", models.PositiveIntegerField(default=0)),
                # Result counters.
                ("chunks_created", models.PositiveIntegerField(default=0)),
                ("concepts_extracted", models.PositiveIntegerField(default=0)),
                ("embedding_model", models.CharField(blank=True, max_length=100)),
                # Provenance of the ingested file.
                ("content_hash", models.CharField(blank=True, db_index=True, max_length=64)),
                ("source", models.CharField(default="", max_length=50)),
                ("source_ref", models.CharField(blank=True, db_index=True, max_length=200)),
                ("s3_key", models.CharField(max_length=500)),
                # Optional file metadata.
                ("title", models.CharField(blank=True, max_length=500)),
                ("file_type", models.CharField(blank=True, max_length=50)),
                ("file_size", models.PositiveBigIntegerField(default=0)),
                ("collection_uid", models.CharField(blank=True, max_length=64)),
                # Timestamps.
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("started_at", models.DateTimeField(blank=True, null=True)),
                ("completed_at", models.DateTimeField(blank=True, null=True)),
            ],
            options={
                "ordering": ["-created_at"],
                "indexes": [
                    models.Index(
                        fields=["status", "-created_at"],
                        name="library_ing_status_9c95b2_idx",
                    ),
                    models.Index(
                        fields=["source", "source_ref"],
                        name="library_ing_source_a48684_idx",
                    ),
                ],
            },
        ),
    ]
|
||||||
@@ -1,11 +1,16 @@
|
|||||||
"""
|
"""
|
||||||
Neo4j graph models for the Mnemosyne content library.
|
Models for the Mnemosyne content library.
|
||||||
|
|
||||||
All content data (libraries, collections, items, chunks, concepts, images)
|
Most content (libraries, collections, items, chunks, concepts, images)
|
||||||
lives in Neo4j as a knowledge graph. These models use neomodel's StructuredNode
|
lives in Neo4j as a knowledge graph via neomodel StructuredNode. These do
|
||||||
OGM — they do NOT participate in Django's ORM or migrations.
|
NOT participate in Django's ORM or migrations.
|
||||||
|
|
||||||
|
The IngestJob model at the bottom of this file is the exception: it tracks
|
||||||
|
the lifecycle of asynchronous ingestion requests (file → embedding pipeline)
|
||||||
|
in PostgreSQL via Django's ORM.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from django.db import models
|
||||||
from neomodel import (
|
from neomodel import (
|
||||||
ArrayProperty,
|
ArrayProperty,
|
||||||
DateTimeProperty,
|
DateTimeProperty,
|
||||||
@@ -50,8 +55,14 @@ class Library(StructuredNode):
|
|||||||
"""
|
"""
|
||||||
Top-level container representing a content library.
|
Top-level container representing a content library.
|
||||||
|
|
||||||
Each library has a type (fiction, technical, music, film, art, journal)
|
Each library has a type (fiction, nonfiction, technical, music, film,
|
||||||
that drives chunking strategy, embedding instructions, and LLM prompts.
|
art, journal, business, finance) that drives chunking strategy,
|
||||||
|
embedding instructions, and LLM prompts.
|
||||||
|
|
||||||
|
A library may be either *global* (workspace_id is null — searchable
|
||||||
|
across the whole instance) or *workspace-scoped* (workspace_id set —
|
||||||
|
visible only to agents inside that Daedalus workspace). Scoping is
|
||||||
|
enforced by the search Cypher queries, not by this model; the model
only stores the workspace id.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
uid = UniqueIdProperty()
|
uid = UniqueIdProperty()
|
||||||
@@ -66,10 +77,16 @@ class Library(StructuredNode):
|
|||||||
"film": "Film",
|
"film": "Film",
|
||||||
"art": "Art",
|
"art": "Art",
|
||||||
"journal": "Journal",
|
"journal": "Journal",
|
||||||
|
"business": "Business",
|
||||||
|
"finance": "Finance",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
description = StringProperty(default="")
|
description = StringProperty(default="")
|
||||||
|
|
||||||
|
# Daedalus workspace UUID this library is scoped to. Null for global
|
||||||
|
# libraries. Unique-indexed so a workspace cannot have two libraries.
|
||||||
|
workspace_id = StringProperty(unique_index=True, required=False)
|
||||||
|
|
||||||
# Content-type configuration
|
# Content-type configuration
|
||||||
chunking_config = JSONProperty(default={})
|
chunking_config = JSONProperty(default={})
|
||||||
embedding_instruction = StringProperty(default="")
|
embedding_instruction = StringProperty(default="")
|
||||||
@@ -270,3 +287,78 @@ class ImageEmbedding(StructuredNode):
|
|||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return f"ImageEmbedding ({self.uid})"
|
return f"ImageEmbedding ({self.uid})"
|
||||||
|
|
||||||
|
|
||||||
|
# --- Django ORM models (PostgreSQL) ---
|
||||||
|
|
||||||
|
|
||||||
|
class IngestJob(models.Model):
    """
    Tracks the lifecycle of an asynchronous ingestion + embedding job.

    Created when an external client (e.g. Daedalus) posts a file via the
    REST ingest API. The Celery worker reads and updates this row as the
    job moves through fetch / chunk / embed / graph stages.

    Idempotency: a (library_uid, source_ref, content_hash) triple uniquely
    identifies a piece of content. A second POST with the same triple
    returns the existing job; a POST with the same source_ref but a new
    content_hash supersedes the prior Item.

    NOTE(review): this model declares no unique constraint over that
    triple — only per-column indexes and the composite indexes in ``Meta``
    — so the idempotency rule must be enforced by whatever code creates
    jobs. Confirm that it is.
    """

    # Status values as named constants so callers can write
    # ``IngestJob.STATUS_FAILED`` instead of repeating bare strings.
    # The stored values are unchanged.
    STATUS_PENDING = "pending"
    STATUS_PROCESSING = "processing"
    STATUS_COMPLETED = "completed"
    STATUS_FAILED = "failed"

    STATUS_CHOICES = [
        (STATUS_PENDING, "Pending"),
        (STATUS_PROCESSING, "Processing"),
        (STATUS_COMPLETED, "Completed"),
        (STATUS_FAILED, "Failed"),
    ]

    # String primary key; the value must be supplied when the row is
    # created (there is no default or auto-increment).
    id = models.CharField(max_length=64, primary_key=True)
    # presumably the uid of the Item node produced by this job; may be
    # blank — verify against the worker.
    item_uid = models.CharField(max_length=64, db_index=True, blank=True)
    library_uid = models.CharField(max_length=64, db_index=True)
    celery_task_id = models.CharField(max_length=255, blank=True)

    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default=STATUS_PENDING,
        db_index=True,
    )
    progress = models.CharField(max_length=50, default="queued")
    # NOTE(review): null=True on a text field means "no error" can be
    # stored as either NULL or "". Left as-is to match migration 0001.
    error = models.TextField(blank=True, null=True)
    retry_count = models.PositiveIntegerField(default=0)

    chunks_created = models.PositiveIntegerField(default=0)
    concepts_extracted = models.PositiveIntegerField(default=0)
    embedding_model = models.CharField(max_length=100, blank=True)

    # The file's content hash (sha256). Used for idempotency: a second
    # ingest with the same source_ref + same hash is a no-op; a second
    # ingest with the same source_ref + different hash supersedes.
    content_hash = models.CharField(max_length=64, db_index=True, blank=True)

    # Where the file came from. For Daedalus: source="daedalus",
    # source_ref="<workspace_id>/<file_id>".
    source = models.CharField(max_length=50, default="")
    source_ref = models.CharField(max_length=200, blank=True, db_index=True)
    s3_key = models.CharField(max_length=500)

    # Optional metadata carried forward to the Item node.
    title = models.CharField(max_length=500, blank=True)
    file_type = models.CharField(max_length=50, blank=True)
    file_size = models.PositiveBigIntegerField(default=0)
    collection_uid = models.CharField(max_length=64, blank=True)

    created_at = models.DateTimeField(auto_now_add=True)
    started_at = models.DateTimeField(null=True, blank=True)
    completed_at = models.DateTimeField(null=True, blank=True)

    class Meta:
        # Newest jobs first by default.
        ordering = ["-created_at"]
        indexes = [
            models.Index(fields=["status", "-created_at"]),
            models.Index(fields=["source", "source_ref"]),
        ]

    def __str__(self):
        """Return a short label with the job id and its current status."""
        return f"IngestJob {self.id} [{self.status}]"
|
||||||
|
|||||||
@@ -13,7 +13,17 @@ from library.content_types import LIBRARY_TYPE_DEFAULTS, get_library_type_config
|
|||||||
class LibraryTypeDefaultsTests(TestCase):
|
class LibraryTypeDefaultsTests(TestCase):
|
||||||
"""Tests for the LIBRARY_TYPE_DEFAULTS registry."""
|
"""Tests for the LIBRARY_TYPE_DEFAULTS registry."""
|
||||||
|
|
||||||
EXPECTED_TYPES = {"fiction", "nonfiction", "technical", "music", "film", "art", "journal"}
|
EXPECTED_TYPES = {
|
||||||
|
"fiction",
|
||||||
|
"nonfiction",
|
||||||
|
"technical",
|
||||||
|
"music",
|
||||||
|
"film",
|
||||||
|
"art",
|
||||||
|
"journal",
|
||||||
|
"business",
|
||||||
|
"finance",
|
||||||
|
}
|
||||||
|
|
||||||
def test_all_expected_types_present(self):
|
def test_all_expected_types_present(self):
|
||||||
for lib_type in self.EXPECTED_TYPES:
|
for lib_type in self.EXPECTED_TYPES:
|
||||||
@@ -105,6 +115,16 @@ class VisionPromptTests(TestCase):
|
|||||||
prompt = config["vision_prompt"].lower()
|
prompt = config["vision_prompt"].lower()
|
||||||
self.assertIn("historical", prompt)
|
self.assertIn("historical", prompt)
|
||||||
|
|
||||||
|
def test_business_vision_prompt_mentions_logo_or_chart(self):
|
||||||
|
config = get_library_type_config("business")
|
||||||
|
prompt = config["vision_prompt"].lower()
|
||||||
|
self.assertTrue("logo" in prompt or "chart" in prompt)
|
||||||
|
|
||||||
|
def test_finance_llm_context_forbids_fabrication(self):
|
||||||
|
config = get_library_type_config("finance")
|
||||||
|
prompt = config["llm_context_prompt"].lower()
|
||||||
|
self.assertIn("fabricate", prompt)
|
||||||
|
|
||||||
|
|
||||||
class GetLibraryTypeConfigTests(TestCase):
|
class GetLibraryTypeConfigTests(TestCase):
|
||||||
"""Tests for the get_library_type_config helper."""
|
"""Tests for the get_library_type_config helper."""
|
||||||
|
|||||||
Reference in New Issue
Block a user