feat(ingest): add Daedalus cross-bucket S3 fetch + ingest_from_daedalus task
Adds DAEDALUS_S3_* settings (read-only credentials for the Daedalus bucket)
and a small `daedalus_s3.py` helper that fetches a file from Daedalus's
bucket and writes it into Mnemosyne's bucket via default_storage.
Adds the Celery task `library.tasks.ingest_from_daedalus`. Given an
IngestJob row, it:
1. Resolves the target Library (by library_uid).
2. Supersedes a prior Item with the same source_ref but different
content_hash by deleting the old Item + chunks first.
3. Fetches from Daedalus S3, copies into items/{item_uid}/original.{ext}.
4. Creates the Item node, links it to a default Collection.
5. Runs the existing EmbeddingPipeline.process_item.
6. Marks the job completed with chunks/concepts counts.
Failures retry up to 3× with exponential backoff; final failure marks
the job failed with the exception text. Routed to the embedding queue
so single-worker setups must consume it.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -40,6 +40,23 @@ AWS_S3_REGION_NAME=us-east-1
|
||||
# Set to True to use local FileSystemStorage instead of S3 (dev/test)
|
||||
USE_LOCAL_STORAGE=True
|
||||
|
||||
# --- Daedalus S3 (cross-bucket reads for ingest) ---
|
||||
# Mnemosyne ingests files from the Daedalus S3 bucket. These vars
|
||||
# configure read access; the file is copied into AWS_STORAGE_BUCKET_NAME
|
||||
# (Mnemosyne's own bucket) by the ingest Celery task.
|
||||
DAEDALUS_S3_ENDPOINT_URL=
|
||||
DAEDALUS_S3_ACCESS_KEY_ID=
|
||||
DAEDALUS_S3_SECRET_ACCESS_KEY=
|
||||
DAEDALUS_S3_BUCKET_NAME=daedalus
|
||||
DAEDALUS_S3_REGION_NAME=us-east-1
|
||||
DAEDALUS_S3_USE_SSL=False
|
||||
DAEDALUS_S3_VERIFY=True
|
||||
|
||||
# --- MCP Server ---
|
||||
# Set to False for internal-only deployments where the MCP transport is
|
||||
# already on a trusted network (10.10.0.0/24).
|
||||
MCP_REQUIRE_AUTH=True
|
||||
|
||||
# --- Email (smtp4dev on Oberon) ---
|
||||
EMAIL_HOST=oberon.incus
|
||||
EMAIL_PORT=22025
|
||||
|
||||
70
mnemosyne/library/services/daedalus_s3.py
Normal file
70
mnemosyne/library/services/daedalus_s3.py
Normal file
@@ -0,0 +1,70 @@
|
||||
"""
|
||||
Cross-bucket S3 helper for ingesting files from the Daedalus S3 bucket
|
||||
into Mnemosyne's own bucket.
|
||||
|
||||
Daedalus uploads files to its own bucket (configured per Daedalus deployment)
|
||||
and posts an ingest request to Mnemosyne with the s3_key. This module fetches
|
||||
that file using read-only Daedalus credentials and writes it to Mnemosyne's
|
||||
bucket via the standard `default_storage` backend so the rest of the pipeline
|
||||
(parsing, chunking, embedding) works unchanged.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
import boto3
|
||||
from django.conf import settings
|
||||
from django.core.files.base import ContentFile
|
||||
from django.core.files.storage import default_storage
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _daedalus_s3_client():
|
||||
"""Build a boto3 S3 client pointed at the Daedalus bucket."""
|
||||
return boto3.client(
|
||||
"s3",
|
||||
endpoint_url=settings.DAEDALUS_S3_ENDPOINT_URL or None,
|
||||
aws_access_key_id=settings.DAEDALUS_S3_ACCESS_KEY_ID,
|
||||
aws_secret_access_key=settings.DAEDALUS_S3_SECRET_ACCESS_KEY,
|
||||
region_name=settings.DAEDALUS_S3_REGION_NAME,
|
||||
use_ssl=settings.DAEDALUS_S3_USE_SSL,
|
||||
verify=settings.DAEDALUS_S3_VERIFY,
|
||||
)
|
||||
|
||||
|
||||
def fetch_from_daedalus(daedalus_s3_key: str) -> bytes:
|
||||
"""
|
||||
Read a file from the Daedalus S3 bucket.
|
||||
|
||||
:param daedalus_s3_key: Object key in the Daedalus bucket.
|
||||
:returns: File bytes.
|
||||
:raises: botocore exceptions on failure (caller decides retry).
|
||||
"""
|
||||
client = _daedalus_s3_client()
|
||||
bucket = settings.DAEDALUS_S3_BUCKET_NAME
|
||||
logger.debug(
|
||||
"Fetching from Daedalus S3 bucket=%s key=%s", bucket, daedalus_s3_key
|
||||
)
|
||||
response = client.get_object(Bucket=bucket, Key=daedalus_s3_key)
|
||||
data = response["Body"].read()
|
||||
logger.info(
|
||||
"Fetched from Daedalus S3 bucket=%s key=%s size=%d",
|
||||
bucket, daedalus_s3_key, len(data),
|
||||
)
|
||||
return data
|
||||
|
||||
|
||||
def copy_into_mnemosyne(data: bytes, mnemosyne_s3_key: str) -> str:
|
||||
"""
|
||||
Write bytes into Mnemosyne's S3 bucket via the default_storage backend.
|
||||
|
||||
:param data: File bytes (already in memory).
|
||||
:param mnemosyne_s3_key: Target object key in Mnemosyne's bucket.
|
||||
:returns: The actual key written (may differ from requested if
|
||||
`file_overwrite=False` and the key existed).
|
||||
"""
|
||||
saved_key = default_storage.save(mnemosyne_s3_key, ContentFile(data))
|
||||
logger.info(
|
||||
"Wrote to Mnemosyne S3 key=%s size=%d", saved_key, len(data),
|
||||
)
|
||||
return saved_key
|
||||
@@ -10,6 +10,7 @@ import logging
|
||||
|
||||
from celery import shared_task
|
||||
from django.core.cache import cache
|
||||
from neomodel import db
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -280,3 +281,212 @@ def _resolve_user(user_id: int = None):
|
||||
return User.objects.get(pk=user_id)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Ingest task (Daedalus integration)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@shared_task(
|
||||
name="library.tasks.ingest_from_daedalus",
|
||||
bind=True,
|
||||
queue="embedding",
|
||||
max_retries=3,
|
||||
default_retry_delay=60,
|
||||
acks_late=True,
|
||||
)
|
||||
def ingest_from_daedalus(self, job_id: str):
|
||||
"""
|
||||
Process a single IngestJob: fetch from Daedalus S3 → create Item →
|
||||
run embedding pipeline → mark complete.
|
||||
|
||||
Idempotent on (library_uid, source_ref, content_hash) — handled in the
|
||||
REST view that creates the IngestJob, so by the time this task runs the
|
||||
job either represents new content or a content_hash-changed re-ingest.
|
||||
|
||||
For a content_hash-changed re-ingest, the prior Item with the same
|
||||
source_ref is deleted before the new one is processed (ensures no
|
||||
stale chunks linger).
|
||||
"""
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from library.models import IngestJob, Item, Library
|
||||
from library.services.daedalus_s3 import (
|
||||
copy_into_mnemosyne,
|
||||
fetch_from_daedalus,
|
||||
)
|
||||
from library.services.pipeline import EmbeddingPipeline
|
||||
|
||||
logger.info(
|
||||
"Task ingest_from_daedalus starting job_id=%s task_id=%s",
|
||||
job_id, self.request.id,
|
||||
)
|
||||
|
||||
try:
|
||||
job = IngestJob.objects.get(pk=job_id)
|
||||
except IngestJob.DoesNotExist:
|
||||
logger.error("IngestJob not found job_id=%s", job_id)
|
||||
return {"success": False, "error": "job_not_found"}
|
||||
|
||||
job.status = "processing"
|
||||
job.progress = "fetching"
|
||||
job.started_at = datetime.now(timezone.utc)
|
||||
job.celery_task_id = self.request.id
|
||||
job.save(update_fields=["status", "progress", "started_at", "celery_task_id"])
|
||||
|
||||
try:
|
||||
# --- 1. Resolve target Library ---
|
||||
try:
|
||||
lib = Library.nodes.get(uid=job.library_uid)
|
||||
except Library.DoesNotExist:
|
||||
raise RuntimeError(f"Library not found: {job.library_uid}")
|
||||
|
||||
# --- 2. Supersede prior Item with same source_ref but different hash ---
|
||||
prior_item_uid = None
|
||||
if job.source_ref:
|
||||
rows, _ = db.cypher_query(
|
||||
"""
|
||||
MATCH (l:Library {uid: $library_uid})-[:CONTAINS]->(:Collection)
|
||||
-[:CONTAINS]->(i:Item)
|
||||
WHERE i.metadata IS NOT NULL
|
||||
AND i.metadata CONTAINS $source_ref_marker
|
||||
RETURN i.uid LIMIT 1
|
||||
""",
|
||||
{
|
||||
"library_uid": lib.uid,
|
||||
"source_ref_marker": f'"source_ref": "{job.source_ref}"',
|
||||
},
|
||||
)
|
||||
if rows:
|
||||
prior_item_uid = rows[0][0]
|
||||
logger.info(
|
||||
"Superseding prior Item job_id=%s prior_item_uid=%s",
|
||||
job_id, prior_item_uid,
|
||||
)
|
||||
_delete_item_and_chunks(prior_item_uid)
|
||||
|
||||
# --- 3. Fetch from Daedalus, copy into Mnemosyne bucket ---
|
||||
job.progress = "copying"
|
||||
job.save(update_fields=["progress"])
|
||||
|
||||
data = fetch_from_daedalus(job.s3_key)
|
||||
|
||||
# --- 4. Create Item node ---
|
||||
ext = (job.file_type or "bin").lstrip(".").lower() or "bin"
|
||||
item = Item(
|
||||
title=job.title,
|
||||
file_type=ext,
|
||||
file_size=len(data),
|
||||
content_hash=job.content_hash,
|
||||
embedding_status="pending",
|
||||
metadata={
|
||||
"source": job.source,
|
||||
"source_ref": job.source_ref,
|
||||
},
|
||||
)
|
||||
item.save()
|
||||
|
||||
mnemosyne_s3_key = f"items/{item.uid}/original.{ext}"
|
||||
copy_into_mnemosyne(data, mnemosyne_s3_key)
|
||||
item.s3_key = mnemosyne_s3_key
|
||||
item.save()
|
||||
|
||||
# --- 5. Connect to library/collection ---
|
||||
col = _resolve_or_create_default_collection(lib, job.collection_uid)
|
||||
col.items.connect(item)
|
||||
|
||||
job.item_uid = item.uid
|
||||
job.save(update_fields=["item_uid"])
|
||||
|
||||
# --- 6. Run the embedding pipeline ---
|
||||
job.progress = "embedding"
|
||||
job.save(update_fields=["progress"])
|
||||
|
||||
def progress_cb(percent, message):
|
||||
_update_progress(self, percent, message)
|
||||
|
||||
pipeline = EmbeddingPipeline(user=None)
|
||||
result = pipeline.process_item(item.uid, progress_callback=progress_cb)
|
||||
|
||||
# --- 7. Mark complete ---
|
||||
job.status = "completed"
|
||||
job.progress = "done"
|
||||
job.chunks_created = result.get("chunks_created", 0)
|
||||
job.concepts_extracted = result.get("concepts_extracted", 0)
|
||||
job.embedding_model = result.get("embedding_model", "")
|
||||
job.completed_at = datetime.now(timezone.utc)
|
||||
job.save()
|
||||
|
||||
logger.info(
|
||||
"Task ingest_from_daedalus completed job_id=%s item_uid=%s "
|
||||
"chunks=%d concepts=%d",
|
||||
job_id, item.uid, job.chunks_created, job.concepts_extracted,
|
||||
)
|
||||
return {
|
||||
"success": True,
|
||||
"job_id": job_id,
|
||||
"item_uid": item.uid,
|
||||
**result,
|
||||
}
|
||||
|
||||
except Exception as exc:
|
||||
logger.error(
|
||||
"Task ingest_from_daedalus failed job_id=%s: %s",
|
||||
job_id, exc, exc_info=True,
|
||||
)
|
||||
if self.request.retries < self.max_retries:
|
||||
job.retry_count = self.request.retries + 1
|
||||
job.save(update_fields=["retry_count"])
|
||||
raise self.retry(exc=exc)
|
||||
|
||||
job.status = "failed"
|
||||
job.error = str(exc)
|
||||
job.completed_at = datetime.now(timezone.utc)
|
||||
job.save(update_fields=["status", "error", "completed_at"])
|
||||
return {"success": False, "job_id": job_id, "error": str(exc)}
|
||||
|
||||
|
||||
def _delete_item_and_chunks(item_uid: str):
|
||||
"""Delete an Item, its chunks, and its images. Concept GC is workspace-delete only."""
|
||||
db.cypher_query(
|
||||
"""
|
||||
MATCH (i:Item {uid: $uid})
|
||||
OPTIONAL MATCH (i)-[:HAS_CHUNK]->(c:Chunk)
|
||||
OPTIONAL MATCH (i)-[:HAS_IMAGE]->(img:Image)
|
||||
OPTIONAL MATCH (img)-[:HAS_EMBEDDING]->(emb:ImageEmbedding)
|
||||
DETACH DELETE c, img, emb, i
|
||||
""",
|
||||
{"uid": item_uid},
|
||||
)
|
||||
|
||||
|
||||
def _resolve_or_create_default_collection(lib, collection_uid: str = ""):
|
||||
"""
|
||||
Find or create the default Collection for a Library.
|
||||
|
||||
Daedalus integration creates one Collection per Library, named "default".
|
||||
Explicit collection_uid is honored if provided.
|
||||
"""
|
||||
from library.models import Collection
|
||||
|
||||
if collection_uid:
|
||||
try:
|
||||
return Collection.nodes.get(uid=collection_uid)
|
||||
except Collection.DoesNotExist:
|
||||
pass
|
||||
|
||||
# Look for an existing "default" collection in this library
|
||||
rows, _ = db.cypher_query(
|
||||
"MATCH (l:Library {uid: $library_uid})-[:CONTAINS]->(c:Collection {name: 'default'}) "
|
||||
"RETURN c.uid LIMIT 1",
|
||||
{"library_uid": lib.uid},
|
||||
)
|
||||
if rows:
|
||||
return Collection.nodes.get(uid=rows[0][0])
|
||||
|
||||
col = Collection(name="default", description="Default collection")
|
||||
col.save()
|
||||
lib.collections.connect(col)
|
||||
col.library.connect(lib)
|
||||
return col
|
||||
|
||||
@@ -181,6 +181,18 @@ else:
|
||||
},
|
||||
}
|
||||
|
||||
# --- Daedalus S3 (cross-bucket reads for ingest) ---
|
||||
# Mnemosyne ingests files written to Daedalus's S3 bucket. These vars
|
||||
# configure read access; the file is copied into AWS_STORAGE_BUCKET_NAME
|
||||
# (Mnemosyne's own bucket) by the Celery ingest task before processing.
|
||||
DAEDALUS_S3_ENDPOINT_URL = env("DAEDALUS_S3_ENDPOINT_URL", default="")
|
||||
DAEDALUS_S3_ACCESS_KEY_ID = env("DAEDALUS_S3_ACCESS_KEY_ID", default="")
|
||||
DAEDALUS_S3_SECRET_ACCESS_KEY = env("DAEDALUS_S3_SECRET_ACCESS_KEY", default="")
|
||||
DAEDALUS_S3_BUCKET_NAME = env("DAEDALUS_S3_BUCKET_NAME", default="daedalus")
|
||||
DAEDALUS_S3_REGION_NAME = env("DAEDALUS_S3_REGION_NAME", default="us-east-1")
|
||||
DAEDALUS_S3_USE_SSL = env.bool("DAEDALUS_S3_USE_SSL", default=False)
|
||||
DAEDALUS_S3_VERIFY = env.bool("DAEDALUS_S3_VERIFY", default=True)
|
||||
|
||||
# --- Celery / RabbitMQ ---
|
||||
CELERY_BROKER_URL = env(
|
||||
"CELERY_BROKER_URL",
|
||||
@@ -196,6 +208,7 @@ CELERY_TASK_ACKS_LATE = True
|
||||
CELERY_WORKER_PREFETCH_MULTIPLIER = 1
|
||||
CELERY_TASK_ROUTES = {
|
||||
"library.tasks.embed_*": {"queue": "embedding"},
|
||||
"library.tasks.ingest_*": {"queue": "embedding"},
|
||||
"library.tasks.batch_*": {"queue": "batch"},
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user