feat(ingest): add Daedalus cross-bucket S3 fetch + ingest_from_daedalus task

Adds DAEDALUS_S3_* settings (read-only credentials for the Daedalus bucket) and a small `daedalus_s3.py` helper that fetches a file from Daedalus's bucket and writes it into Mnemosyne's bucket via default_storage. Adds the Celery task `library.tasks.ingest_from_daedalus`. Given an IngestJob row, it: 1. Resolves the target Library (by library_uid). 2. Supersedes a prior Item with the same source_ref but different content_hash by deleting the old Item + chunks first. 3. Fetches from Daedalus S3, copies into items/{item_uid}/original.{ext}. 4. Creates the Item node, links it to a default Collection. 5. Runs the existing EmbeddingPipeline.process_item. 6. Marks the job completed with chunks/concepts counts. Failures retry up to 3× with exponential backoff; final failure marks the job failed with the exception text. Routed to the embedding queue so single-worker setups must consume it. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-29 06:26:48 -04:00
parent 33658fbc8d
commit c485a8560c
4 changed files with 310 additions and 0 deletions
--- a/mnemosyne/.env
+++ b/mnemosyne/.env
@@ -40,6 +40,23 @@ AWS_S3_REGION_NAME=us-east-1
 # Set to True to use local FileSystemStorage instead of S3 (dev/test)
 USE_LOCAL_STORAGE=True
 # --- Daedalus S3 (cross-bucket reads for ingest) ---
 # Mnemosyne ingests files from the Daedalus S3 bucket. These vars
 # configure read access; the file is copied into AWS_STORAGE_BUCKET_NAME
 # (Mnemosyne's own bucket) by the ingest Celery task.
 DAEDALUS_S3_ENDPOINT_URL=
 DAEDALUS_S3_ACCESS_KEY_ID=
 DAEDALUS_S3_SECRET_ACCESS_KEY=
 DAEDALUS_S3_BUCKET_NAME=daedalus
 DAEDALUS_S3_REGION_NAME=us-east-1
 DAEDALUS_S3_USE_SSL=False
 DAEDALUS_S3_VERIFY=True
 # --- MCP Server ---
 # Set to False for internal-only deployments where the MCP transport is
 # already on a trusted network (10.10.0.0/24).
 MCP_REQUIRE_AUTH=True
 # --- Email (smtp4dev on Oberon) ---
 EMAIL_HOST=oberon.incus
 EMAIL_PORT=22025
--- a/mnemosyne/library/services/daedalus_s3.py
+++ b/mnemosyne/library/services/daedalus_s3.py
@@ -0,0 +1,70 @@
 """
 Cross-bucket S3 helper for ingesting files from the Daedalus S3 bucket
 into Mnemosyne's own bucket.
 Daedalus uploads files to its own bucket (configured per Daedalus deployment)
 and posts an ingest request to Mnemosyne with the s3_key. This module fetches
 that file using read-only Daedalus credentials and writes it to Mnemosyne's
 bucket via the standard `default_storage` backend so the rest of the pipeline
 (parsing, chunking, embedding) works unchanged.
 """
 import logging
 import boto3
 from django.conf import settings
 from django.core.files.base import ContentFile
 from django.core.files.storage import default_storage
 logger = logging.getLogger(__name__)
 def _daedalus_s3_client():
    """Build a boto3 S3 client pointed at the Daedalus bucket."""
    return boto3.client(
        "s3",
        endpoint_url=settings.DAEDALUS_S3_ENDPOINT_URL or None,
        aws_access_key_id=settings.DAEDALUS_S3_ACCESS_KEY_ID,
        aws_secret_access_key=settings.DAEDALUS_S3_SECRET_ACCESS_KEY,
        region_name=settings.DAEDALUS_S3_REGION_NAME,
        use_ssl=settings.DAEDALUS_S3_USE_SSL,
        verify=settings.DAEDALUS_S3_VERIFY,
    )
 def fetch_from_daedalus(daedalus_s3_key: str) -> bytes:
    """
    Read a file from the Daedalus S3 bucket.
    :param daedalus_s3_key: Object key in the Daedalus bucket.
    :returns: File bytes.
    :raises: botocore exceptions on failure (caller decides retry).
    """
    client = _daedalus_s3_client()
    bucket = settings.DAEDALUS_S3_BUCKET_NAME
    logger.debug(
        "Fetching from Daedalus S3 bucket=%s key=%s", bucket, daedalus_s3_key
    )
    response = client.get_object(Bucket=bucket, Key=daedalus_s3_key)
    data = response["Body"].read()
    logger.info(
        "Fetched from Daedalus S3 bucket=%s key=%s size=%d",
        bucket, daedalus_s3_key, len(data),
    )
    return data
 def copy_into_mnemosyne(data: bytes, mnemosyne_s3_key: str) -> str:
    """
    Write bytes into Mnemosyne's S3 bucket via the default_storage backend.
    :param data: File bytes (already in memory).
    :param mnemosyne_s3_key: Target object key in Mnemosyne's bucket.
    :returns: The actual key written (may differ from requested if
              `file_overwrite=False` and the key existed).
    """
    saved_key = default_storage.save(mnemosyne_s3_key, ContentFile(data))
    logger.info(
        "Wrote to Mnemosyne S3 key=%s size=%d", saved_key, len(data),
    )
    return saved_key
--- a/mnemosyne/library/tasks.py
+++ b/mnemosyne/library/tasks.py
@@ -10,6 +10,7 @@ import logging
 from celery import shared_task
 from django.core.cache import cache
 from neomodel import db
 logger = logging.getLogger(__name__)
@@ -280,3 +281,212 @@ def _resolve_user(user_id: int = None):
        return User.objects.get(pk=user_id)
    except Exception:
        return None
 # ---------------------------------------------------------------------------
 # Ingest task (Daedalus integration)
 # ---------------------------------------------------------------------------
@shared_task(
    name="library.tasks.ingest_from_daedalus",
    bind=True,
    queue="embedding",
    max_retries=3,
    default_retry_delay=60,
    acks_late=True,
 )
 def ingest_from_daedalus(self, job_id: str):
    """
    Process a single IngestJob: fetch from Daedalus S3 → create Item →
    run embedding pipeline → mark complete.
    Idempotent on (library_uid, source_ref, content_hash) — handled in the
    REST view that creates the IngestJob, so by the time this task runs the
    job either represents new content or a content_hash-changed re-ingest.
    For a content_hash-changed re-ingest, the prior Item with the same
    source_ref is deleted before the new one is processed (ensures no
    stale chunks linger).
    """
    from datetime import datetime, timezone
    from library.models import IngestJob, Item, Library
    from library.services.daedalus_s3 import (
        copy_into_mnemosyne,
        fetch_from_daedalus,
    )
    from library.services.pipeline import EmbeddingPipeline
    logger.info(
        "Task ingest_from_daedalus starting job_id=%s task_id=%s",
        job_id, self.request.id,
    )
    try:
        job = IngestJob.objects.get(pk=job_id)
    except IngestJob.DoesNotExist:
        logger.error("IngestJob not found job_id=%s", job_id)
        return {"success": False, "error": "job_not_found"}
    job.status = "processing"
    job.progress = "fetching"
    job.started_at = datetime.now(timezone.utc)
    job.celery_task_id = self.request.id
    job.save(update_fields=["status", "progress", "started_at", "celery_task_id"])
    try:
        # --- 1. Resolve target Library ---
        try:
            lib = Library.nodes.get(uid=job.library_uid)
        except Library.DoesNotExist:
            raise RuntimeError(f"Library not found: {job.library_uid}")
        # --- 2. Supersede prior Item with same source_ref but different hash ---
        prior_item_uid = None
        if job.source_ref:
            rows, _ = db.cypher_query(
                """
                MATCH (l:Library {uid: $library_uid})-[:CONTAINS]->(:Collection)
                -[:CONTAINS]->(i:Item)
                WHERE i.metadata IS NOT NULL
                  AND i.metadata CONTAINS $source_ref_marker
                RETURN i.uid LIMIT 1
                """,
                {
                    "library_uid": lib.uid,
                    "source_ref_marker": f'"source_ref": "{job.source_ref}"',
                },
            )
            if rows:
                prior_item_uid = rows[0][0]
                logger.info(
                    "Superseding prior Item job_id=%s prior_item_uid=%s",
                    job_id, prior_item_uid,
                )
                _delete_item_and_chunks(prior_item_uid)
        # --- 3. Fetch from Daedalus, copy into Mnemosyne bucket ---
        job.progress = "copying"
        job.save(update_fields=["progress"])
        data = fetch_from_daedalus(job.s3_key)
        # --- 4. Create Item node ---
        ext = (job.file_type or "bin").lstrip(".").lower() or "bin"
        item = Item(
            title=job.title,
            file_type=ext,
            file_size=len(data),
            content_hash=job.content_hash,
            embedding_status="pending",
            metadata={
                "source": job.source,
                "source_ref": job.source_ref,
            },
        )
        item.save()
        mnemosyne_s3_key = f"items/{item.uid}/original.{ext}"
        copy_into_mnemosyne(data, mnemosyne_s3_key)
        item.s3_key = mnemosyne_s3_key
        item.save()
        # --- 5. Connect to library/collection ---
        col = _resolve_or_create_default_collection(lib, job.collection_uid)
        col.items.connect(item)
        job.item_uid = item.uid
        job.save(update_fields=["item_uid"])
        # --- 6. Run the embedding pipeline ---
        job.progress = "embedding"
        job.save(update_fields=["progress"])
        def progress_cb(percent, message):
            _update_progress(self, percent, message)
        pipeline = EmbeddingPipeline(user=None)
        result = pipeline.process_item(item.uid, progress_callback=progress_cb)
        # --- 7. Mark complete ---
        job.status = "completed"
        job.progress = "done"
        job.chunks_created = result.get("chunks_created", 0)
        job.concepts_extracted = result.get("concepts_extracted", 0)
        job.embedding_model = result.get("embedding_model", "")
        job.completed_at = datetime.now(timezone.utc)
        job.save()
        logger.info(
            "Task ingest_from_daedalus completed job_id=%s item_uid=%s "
            "chunks=%d concepts=%d",
            job_id, item.uid, job.chunks_created, job.concepts_extracted,
        )
        return {
            "success": True,
            "job_id": job_id,
            "item_uid": item.uid,
            **result,
        }
    except Exception as exc:
        logger.error(
            "Task ingest_from_daedalus failed job_id=%s: %s",
            job_id, exc, exc_info=True,
        )
        if self.request.retries < self.max_retries:
            job.retry_count = self.request.retries + 1
            job.save(update_fields=["retry_count"])
            raise self.retry(exc=exc)
        job.status = "failed"
        job.error = str(exc)
        job.completed_at = datetime.now(timezone.utc)
        job.save(update_fields=["status", "error", "completed_at"])
        return {"success": False, "job_id": job_id, "error": str(exc)}
 def _delete_item_and_chunks(item_uid: str):
    """Delete an Item, its chunks, and its images. Concept GC is workspace-delete only."""
    db.cypher_query(
        """
        MATCH (i:Item {uid: $uid})
        OPTIONAL MATCH (i)-[:HAS_CHUNK]->(c:Chunk)
        OPTIONAL MATCH (i)-[:HAS_IMAGE]->(img:Image)
        OPTIONAL MATCH (img)-[:HAS_EMBEDDING]->(emb:ImageEmbedding)
        DETACH DELETE c, img, emb, i
        """,
        {"uid": item_uid},
    )
 def _resolve_or_create_default_collection(lib, collection_uid: str = ""):
    """
    Find or create the default Collection for a Library.
    Daedalus integration creates one Collection per Library, named "default".
    Explicit collection_uid is honored if provided.
    """
    from library.models import Collection
    if collection_uid:
        try:
            return Collection.nodes.get(uid=collection_uid)
        except Collection.DoesNotExist:
            pass
    # Look for an existing "default" collection in this library
    rows, _ = db.cypher_query(
        "MATCH (l:Library {uid: $library_uid})-[:CONTAINS]->(c:Collection {name: 'default'}) "
        "RETURN c.uid LIMIT 1",
        {"library_uid": lib.uid},
    )
    if rows:
        return Collection.nodes.get(uid=rows[0][0])
    col = Collection(name="default", description="Default collection")
    col.save()
    lib.collections.connect(col)
    col.library.connect(lib)
    return col
--- a/mnemosyne/mnemosyne/settings.py
+++ b/mnemosyne/mnemosyne/settings.py
@@ -181,6 +181,18 @@ else:
        },
    }
 # --- Daedalus S3 (cross-bucket reads for ingest) ---
 # Mnemosyne ingests files written to Daedalus's S3 bucket. These vars
 # configure read access; the file is copied into AWS_STORAGE_BUCKET_NAME
 # (Mnemosyne's own bucket) by the Celery ingest task before processing.
 DAEDALUS_S3_ENDPOINT_URL = env("DAEDALUS_S3_ENDPOINT_URL", default="")
 DAEDALUS_S3_ACCESS_KEY_ID = env("DAEDALUS_S3_ACCESS_KEY_ID", default="")
 DAEDALUS_S3_SECRET_ACCESS_KEY = env("DAEDALUS_S3_SECRET_ACCESS_KEY", default="")
 DAEDALUS_S3_BUCKET_NAME = env("DAEDALUS_S3_BUCKET_NAME", default="daedalus")
 DAEDALUS_S3_REGION_NAME = env("DAEDALUS_S3_REGION_NAME", default="us-east-1")
 DAEDALUS_S3_USE_SSL = env.bool("DAEDALUS_S3_USE_SSL", default=False)
 DAEDALUS_S3_VERIFY = env.bool("DAEDALUS_S3_VERIFY", default=True)
 # --- Celery / RabbitMQ ---
 CELERY_BROKER_URL = env(
    "CELERY_BROKER_URL",
@@ -196,6 +208,7 @@ CELERY_TASK_ACKS_LATE = True
 CELERY_WORKER_PREFETCH_MULTIPLIER = 1
 CELERY_TASK_ROUTES = {
    "library.tasks.embed_*": {"queue": "embedding"},
    "library.tasks.ingest_*": {"queue": "embedding"},
    "library.tasks.batch_*": {"queue": "batch"},
 }