feat(library): add business + finance types, workspace_id, IngestJob

Adds two new content-type-aware library types — `business` for
proposals/marketing/strategy (used by the work-team agents) and `finance`
for statements/tax/market commentary (used by Garth). Each ships with
chunking config, embedding/reranker instructions, an LLM-context prompt,
and a vision prompt; the `finance` LLM-context prompt additionally
forbids inferring, rounding, or fabricating financial figures.

Adds a unique-indexed `workspace_id` property to `Library` so a node
can be scoped to a Daedalus workspace. Null means a global library;
non-null means workspace-scoped. Search Cypher (added in a later
commit) enforces the boundary.

Adds an `IngestJob` Django ORM model — separate from neomodel — that
tracks asynchronous ingestion lifecycle (Daedalus → S3 → Celery →
embedding pipeline) with idempotency keyed on
(library_uid, source_ref, content_hash). Migration 0001_initial creates
the table with lookup indexes; it adds no unique constraint on that triple.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-29 06:26:17 -04:00
parent 81426327bf
commit 33658fbc8d
4 changed files with 228 additions and 8 deletions

View File

@@ -210,6 +210,69 @@ LIBRARY_TYPE_DEFAULTS = {
"4) Context clues about when and where this was taken or created." "4) Context clues about when and where this was taken or created."
), ),
}, },
"business": {
"chunking_config": {
"strategy": "section_aware",
"chunk_size": 640,
"chunk_overlap": 96,
"respect_boundaries": ["section", "subsection", "list", "table"],
},
"embedding_instruction": (
"Represent this passage from a business document for retrieval. "
"Focus on value propositions, positioning, pricing, scope of work, "
"client outcomes, and commercial commitments."
),
"reranker_instruction": (
"Re-rank passages from business documents based on commercial relevance. "
"Prioritize value framing, deliverables, client outcomes, and specific "
"pricing or scope language."
),
"llm_context_prompt": (
"The following excerpts are from business documents (proposals, marketing, "
"sales, strategy). Interpret in commercial context. Distinguish positioning "
"claims from committed deliverables. Preserve numbers, scope language, and "
"client names exactly as written."
),
"vision_prompt": (
"Analyze this image from a business document. Identify:\n"
"1) Image type (logo, chart, diagram, screenshot, photograph, table).\n"
"2) What it depicts — brand marks, data, organizational structure, products.\n"
"3) Any visible text — company names, figures, captions, headings.\n"
"4) The commercial purpose — positioning, pricing, capability demonstration."
),
},
"finance": {
"chunking_config": {
"strategy": "section_aware",
"chunk_size": 512,
"chunk_overlap": 64,
"respect_boundaries": ["section", "table", "row", "paragraph"],
},
"embedding_instruction": (
"Represent this passage from a financial document for retrieval. "
"Focus on accounts, instruments, dates, amounts, balances, and "
"analytical commentary."
),
"reranker_instruction": (
"Re-rank passages from financial documents based on relevance to the query. "
"Prioritize the matching account, instrument, time period, and figures."
),
"llm_context_prompt": (
"The following excerpts are from financial documents (statements, tax "
"documents, market commentary, planning). Distinguish factual figures "
"(statements, transactions, balances) from opinion (forecasts, commentary). "
"Quote numbers, dates, and account identifiers exactly as they appear. "
"Do not infer, round, or fabricate financial figures. If a figure is not "
"present in the excerpts, say so explicitly."
),
"vision_prompt": (
"Analyze this image from a financial document. Identify:\n"
"1) Image type (chart, table, statement scan, dashboard screenshot, receipt).\n"
"2) What it depicts — account, instrument, time period, data series.\n"
"3) Any visible text — figures, dates, account identifiers, labels.\n"
"4) Whether the data is factual (statement) or analytical (forecast/commentary)."
),
},
} }
@@ -219,7 +282,7 @@ def get_library_type_config(library_type):
Args: Args:
library_type: One of 'fiction', 'nonfiction', 'technical', 'music', library_type: One of 'fiction', 'nonfiction', 'technical', 'music',
'film', 'art', 'journal' 'film', 'art', 'journal', 'business', 'finance'
Returns: Returns:
dict with keys: chunking_config, embedding_instruction, dict with keys: chunking_config, embedding_instruction,

View File

@@ -0,0 +1,45 @@
# Generated by Django 5.2.13 on 2026-04-28 12:36
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial migration: create the ``IngestJob`` model and its two composite indexes."""

    initial = True

    dependencies = []

    operations = [
        migrations.CreateModel(
            name='IngestJob',
            fields=[
                # Identity and cross-references.
                ('id', models.CharField(max_length=64, primary_key=True, serialize=False)),
                ('item_uid', models.CharField(blank=True, db_index=True, max_length=64)),
                ('library_uid', models.CharField(db_index=True, max_length=64)),
                ('celery_task_id', models.CharField(blank=True, max_length=255)),
                # Lifecycle state.
                (
                    'status',
                    models.CharField(
                        choices=[
                            ('pending', 'Pending'),
                            ('processing', 'Processing'),
                            ('completed', 'Completed'),
                            ('failed', 'Failed'),
                        ],
                        db_index=True,
                        default='pending',
                        max_length=20,
                    ),
                ),
                ('progress', models.CharField(default='queued', max_length=50)),
                ('error', models.TextField(blank=True, null=True)),
                ('retry_count', models.PositiveIntegerField(default=0)),
                # Pipeline results.
                ('chunks_created', models.PositiveIntegerField(default=0)),
                ('concepts_extracted', models.PositiveIntegerField(default=0)),
                ('embedding_model', models.CharField(blank=True, max_length=100)),
                # Content identity and origin.
                ('content_hash', models.CharField(blank=True, db_index=True, max_length=64)),
                ('source', models.CharField(default='', max_length=50)),
                ('source_ref', models.CharField(blank=True, db_index=True, max_length=200)),
                ('s3_key', models.CharField(max_length=500)),
                # Optional file metadata.
                ('title', models.CharField(blank=True, max_length=500)),
                ('file_type', models.CharField(blank=True, max_length=50)),
                ('file_size', models.PositiveBigIntegerField(default=0)),
                ('collection_uid', models.CharField(blank=True, max_length=64)),
                # Timestamps.
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('started_at', models.DateTimeField(blank=True, null=True)),
                ('completed_at', models.DateTimeField(blank=True, null=True)),
            ],
            options={
                'ordering': ['-created_at'],
                'indexes': [
                    models.Index(
                        fields=['status', '-created_at'],
                        name='library_ing_status_9c95b2_idx',
                    ),
                    models.Index(
                        fields=['source', 'source_ref'],
                        name='library_ing_source_a48684_idx',
                    ),
                ],
            },
        ),
    ]

View File

@@ -1,11 +1,16 @@
""" """
Neo4j graph models for the Mnemosyne content library. Models for the Mnemosyne content library.
All content data (libraries, collections, items, chunks, concepts, images) Most content (libraries, collections, items, chunks, concepts, images)
lives in Neo4j as a knowledge graph. These models use neomodel's StructuredNode lives in Neo4j as a knowledge graph via neomodel StructuredNode. These do
OGM — they do NOT participate in Django's ORM or migrations. NOT participate in Django's ORM or migrations.
The IngestJob model at the bottom of this file is the exception: it tracks
the lifecycle of asynchronous ingestion requests (file → embedding pipeline)
in PostgreSQL via Django's ORM.
""" """
from django.db import models
from neomodel import ( from neomodel import (
ArrayProperty, ArrayProperty,
DateTimeProperty, DateTimeProperty,
@@ -50,8 +55,14 @@ class Library(StructuredNode):
""" """
Top-level container representing a content library. Top-level container representing a content library.
Each library has a type (fiction, technical, music, film, art, journal) Each library has a type (fiction, nonfiction, technical, music, film,
that drives chunking strategy, embedding instructions, and LLM prompts. art, journal, business, finance) that drives chunking strategy,
embedding instructions, and LLM prompts.
A library may be either *global* (workspace_id is null — searchable
across the whole instance) or *workspace-scoped* (workspace_id set —
visible only to agents inside that Daedalus workspace). Scoping is
enforced structurally by every search query.
""" """
uid = UniqueIdProperty() uid = UniqueIdProperty()
@@ -66,10 +77,16 @@ class Library(StructuredNode):
"film": "Film", "film": "Film",
"art": "Art", "art": "Art",
"journal": "Journal", "journal": "Journal",
"business": "Business",
"finance": "Finance",
}, },
) )
description = StringProperty(default="") description = StringProperty(default="")
# Daedalus workspace UUID this library is scoped to. Null for global
# libraries. Unique-indexed so a workspace cannot have two libraries.
workspace_id = StringProperty(unique_index=True, required=False)
# Content-type configuration # Content-type configuration
chunking_config = JSONProperty(default={}) chunking_config = JSONProperty(default={})
embedding_instruction = StringProperty(default="") embedding_instruction = StringProperty(default="")
@@ -270,3 +287,78 @@ class ImageEmbedding(StructuredNode):
def __str__(self): def __str__(self):
return f"ImageEmbedding ({self.uid})" return f"ImageEmbedding ({self.uid})"
# --- Django ORM models (PostgreSQL) ---
class IngestJob(models.Model):
    """
    One row per asynchronous ingestion + embedding request.

    A row is created when an external client (e.g. Daedalus) posts a file
    through the REST ingest API. The Celery worker then reads and updates
    the row while the job advances through its fetch / chunk / embed /
    graph stages.

    Idempotency contract: the (library, source_ref, content_hash) triple
    identifies a piece of content. Re-posting the same triple returns the
    existing job; re-posting the same source_ref with a different
    content_hash supersedes the earlier Item.

    NOTE: this model declares plain lookup indexes only. There is no
    database-level unique constraint on that triple, so the contract above
    is not enforced by the table itself.
    """

    STATUS_CHOICES = [
        ("pending", "Pending"),
        ("processing", "Processing"),
        ("completed", "Completed"),
        ("failed", "Failed"),
    ]

    # --- Identity and cross-references ---
    # String primary key; no default is declared, so the creator supplies it.
    id = models.CharField(primary_key=True, max_length=64)
    item_uid = models.CharField(blank=True, db_index=True, max_length=64)
    library_uid = models.CharField(db_index=True, max_length=64)
    celery_task_id = models.CharField(blank=True, max_length=255)

    # --- Lifecycle state ---
    status = models.CharField(
        choices=STATUS_CHOICES,
        default="pending",
        db_index=True,
        max_length=20,
    )
    progress = models.CharField(default="queued", max_length=50)
    error = models.TextField(null=True, blank=True)
    retry_count = models.PositiveIntegerField(default=0)

    # --- Pipeline results ---
    chunks_created = models.PositiveIntegerField(default=0)
    concepts_extracted = models.PositiveIntegerField(default=0)
    embedding_model = models.CharField(blank=True, max_length=100)

    # --- Content identity ---
    # sha256 of the file. Drives idempotency: same source_ref + same hash
    # is a no-op; same source_ref + different hash supersedes.
    content_hash = models.CharField(blank=True, db_index=True, max_length=64)

    # --- Origin ---
    # For Daedalus: source="daedalus",
    # source_ref="<workspace_id>/<file_id>".
    source = models.CharField(default="", max_length=50)
    source_ref = models.CharField(db_index=True, blank=True, max_length=200)
    s3_key = models.CharField(max_length=500)

    # --- Optional metadata carried forward to the Item node ---
    title = models.CharField(blank=True, max_length=500)
    file_type = models.CharField(blank=True, max_length=50)
    file_size = models.PositiveBigIntegerField(default=0)
    collection_uid = models.CharField(blank=True, max_length=64)

    # --- Timestamps ---
    created_at = models.DateTimeField(auto_now_add=True)
    started_at = models.DateTimeField(blank=True, null=True)
    completed_at = models.DateTimeField(blank=True, null=True)

    class Meta:
        # Newest jobs first by default.
        ordering = ["-created_at"]
        indexes = [
            models.Index(fields=["status", "-created_at"]),
            models.Index(fields=["source", "source_ref"]),
        ]

    def __str__(self):
        """Return a short label containing the job id and its current status."""
        return f"IngestJob {self.id} [{self.status}]"

View File

@@ -13,7 +13,17 @@ from library.content_types import LIBRARY_TYPE_DEFAULTS, get_library_type_config
class LibraryTypeDefaultsTests(TestCase): class LibraryTypeDefaultsTests(TestCase):
"""Tests for the LIBRARY_TYPE_DEFAULTS registry.""" """Tests for the LIBRARY_TYPE_DEFAULTS registry."""
EXPECTED_TYPES = {"fiction", "nonfiction", "technical", "music", "film", "art", "journal"} EXPECTED_TYPES = {
"fiction",
"nonfiction",
"technical",
"music",
"film",
"art",
"journal",
"business",
"finance",
}
def test_all_expected_types_present(self): def test_all_expected_types_present(self):
for lib_type in self.EXPECTED_TYPES: for lib_type in self.EXPECTED_TYPES:
@@ -105,6 +115,16 @@ class VisionPromptTests(TestCase):
prompt = config["vision_prompt"].lower() prompt = config["vision_prompt"].lower()
self.assertIn("historical", prompt) self.assertIn("historical", prompt)
def test_business_vision_prompt_mentions_logo_or_chart(self):
    """The business vision prompt must mention at least one of 'logo' or 'chart'."""
    vision_prompt = get_library_type_config("business")["vision_prompt"].lower()
    self.assertTrue(any(term in vision_prompt for term in ("logo", "chart")))
def test_finance_llm_context_forbids_fabrication(self):
    """The finance LLM-context prompt must contain the word 'fabricate'."""
    context_prompt = get_library_type_config("finance")["llm_context_prompt"].lower()
    self.assertIn("fabricate", context_prompt)
class GetLibraryTypeConfigTests(TestCase): class GetLibraryTypeConfigTests(TestCase):
"""Tests for the get_library_type_config helper.""" """Tests for the get_library_type_config helper."""