feat(library): allow admin delete of Daedalus-managed library via shared cascade

Admin/HTML library delete previously hard-blocked workspace-scoped (Daedalus-managed) libraries, leaving no way to clear an orphaned Library node — e.g. one left behind when a Daedalus workspace delete failed to propagate. A recreate of that workspace then collides on the global Library.name unique constraint and 500s, freezing ingest.

Allow the delete behind the existing confirm warning (low risk: source content lives in Daedalus and is recreated + re-embedded on next sync), and route both the API and HTML delete paths through one shared cascade.

- Add library/services/library_delete.delete_library_cascade(lib), keyed on Library uid so it covers global and workspace-scoped libraries. It removes Chunks, Images/ImageEmbeddings, Items, Collections, the Library, then GCs orphan-only Concepts (verbatim from the API view, re-keyed workspace_id->uid).
- workspace_detail_or_delete (API) now calls the shared helper.
- library_delete (HTML) no longer blocks workspace_id libraries; it calls the cascade instead of a bare lib.delete() (which leaked child nodes — also a latent bug for global libraries with content).
- Confirm-delete template shows a caution banner for Daedalus-managed libraries.

No migration: Mnemosyne library data is in Neo4j (neomodel); no schema change.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
feat(metrics): add scrape-time system model health collector
2026-06-17 19:37:58 -04:00 · 2026-06-17 09:06:11 -04:00
9 changed files with 415 additions and 77 deletions
--- a/mnemosyne/library/api/workspaces.py
+++ b/mnemosyne/library/api/workspaces.py
@@ -23,6 +23,7 @@ from rest_framework.permissions import IsAuthenticated
 from rest_framework.response import Response
 from library.content_types import get_library_type_config
 from library.services.library_delete import delete_library_cascade
 from .serializers import WorkspaceCreateSerializer, WorkspaceStatusSerializer
@@ -165,74 +166,15 @@ def workspace_detail_or_delete(request, workspace_id):
    if lib is None:
        return Response(status=status.HTTP_204_NO_CONTENT)
-    library_uid = lib.uid
+    # Delete the Library and everything reachable + unique to it, plus
-    library_name = lib.name
+    # orphan-Concept GC. Shared with the admin/HTML delete path.
-
+    result = delete_library_cascade(lib)
    # Step 1-4: delete chunks, items, collections, then the library itself.
    # We collect Item s3_keys first so the caller can clean up S3
    # asynchronously (a future enhancement — for now, the keys are logged).
    s3_rows, _ = db.cypher_query(
        "MATCH (l:Library {workspace_id: $wsid})-[:CONTAINS]->(:Collection)"
        "-[:CONTAINS]->(i:Item) RETURN i.uid, i.s3_key",
        {"wsid": workspace_id},
    )
    item_s3_keys = [(r[0], r[1]) for r in s3_rows if r[1]]
    db.cypher_query(
        """
        MATCH (l:Library {workspace_id: $wsid})-[:CONTAINS]->(:Collection)
        -[:CONTAINS]->(i:Item)-[:HAS_CHUNK]->(c:Chunk)
        DETACH DELETE c
        """,
        {"wsid": workspace_id},
    )
    db.cypher_query(
        """
        MATCH (l:Library {workspace_id: $wsid})-[:CONTAINS]->(:Collection)
        -[:CONTAINS]->(i:Item)-[:HAS_IMAGE]->(img:Image)
        OPTIONAL MATCH (img)-[:HAS_EMBEDDING]->(emb:ImageEmbedding)
        DETACH DELETE img, emb
        """,
        {"wsid": workspace_id},
    )
    db.cypher_query(
        """
        MATCH (l:Library {workspace_id: $wsid})-[:CONTAINS]->(:Collection)
        -[:CONTAINS]->(i:Item)
        DETACH DELETE i
        """,
        {"wsid": workspace_id},
    )
    db.cypher_query(
        """
        MATCH (l:Library {workspace_id: $wsid})-[:CONTAINS]->(col:Collection)
        DETACH DELETE col
        """,
        {"wsid": workspace_id},
    )
    db.cypher_query(
        "MATCH (l:Library {workspace_id: $wsid}) DETACH DELETE l",
        {"wsid": workspace_id},
    )
    # Step 5: orphan Concept garbage collection.
    orphan_result, _ = db.cypher_query(
        """
        MATCH (con:Concept)
        WHERE NOT (con)<-[:REFERENCES]-() AND NOT (con)<-[:MENTIONS]-()
              AND NOT (con)<-[:DEPICTS]-()
        WITH con
        DETACH DELETE con
        RETURN count(con) AS deleted
        """
    )
    orphans_deleted = orphan_result[0][0] if orphan_result else 0
    logger.info(
        "Workspace deleted workspace_id=%s library_uid=%s name=%s "
        "items=%d orphans_deleted=%d",
-        workspace_id, library_uid, library_name,
+        workspace_id, result["library_uid"], result["name"],
-        len(item_s3_keys), orphans_deleted,
+        result["item_count"], result["orphans_deleted"],
    )
    return Response(status=status.HTTP_204_NO_CONTENT)
--- a/mnemosyne/library/apps.py
+++ b/mnemosyne/library/apps.py
@@ -88,6 +88,29 @@ def _should_skip_probe() -> bool:
    return False
 def _is_web_process() -> bool:
    """
    Return True only inside the web (gunicorn / runserver) process.
    The reachability collector must only register here: ``/metrics`` is served
    by the web process, and registering in the Celery worker would both probe
    the GPU endpoints from a process whose metrics nobody scrapes and risk
    duplicate registration.
    The launcher is classified by its program name (the basename of
    ``sys.argv[0]``, or the package directory for ``python -m pkg`` runs where
    argv[0] ends in ``__main__.py``) rather than by a substring of the whole
    path, so an install directory such as ``/srv/celery-stack/`` cannot make a
    gunicorn process look like a Celery worker. Celery, pytest and any
    management command other than ``runserver`` all yield False.
    """
    argv = sys.argv
    # sys.argv can be empty in embedded interpreters; treat that as "not web".
    argv0 = argv[0] if argv else ""
    subcommand = argv[1] if len(argv) >= 2 else ""
    program = os.path.basename(argv0)
    if program == "__main__.py":
        # ``python -m celery`` / ``python -m gunicorn``: the package directory
        # carries the launcher name.
        program = os.path.basename(os.path.dirname(argv0))
    if "celery" in program or subcommand == "celery":
        return False
    if "pytest" in program or "PYTEST_CURRENT_TEST" in os.environ:
        return False
    # gunicorn (prod) or runserver (dev).
    return "gunicorn" in program or subcommand == "runserver"
 def _run_startup_probe():
    """
    Emit ERROR/WARNING logs if the stack is misconfigured for search.
@@ -199,4 +222,7 @@ class LibraryConfig(AppConfig):
    verbose_name = "Library"
    def ready(self):
-        pass
+        if _is_web_process():
            from library.health_collector import register
            register()
--- a/mnemosyne/library/health_collector.py
+++ b/mnemosyne/library/health_collector.py
@@ -0,0 +1,99 @@
 """
 Scrape-time Prometheus collector for system-default model reachability.
 The ingest-pipeline counters in ``library/metrics.py`` live in the Celery
 worker process and only move during an active ingest, so they cannot signal
 "models down" on an idle queue. This collector runs in the **web** process
 (where ``/metrics`` is served by ``django_prometheus``) and probes the four
 system-default models at scrape time, emitting an up/down gauge that is
 present regardless of queue activity.
 Probe results are cached for a short TTL so rapid scrapes — or multiple
 gunicorn workers each scraped in turn — cannot hammer the GPU endpoints.
 """
 import logging
 import threading
 import time
 from prometheus_client.core import GaugeMetricFamily
 from library.services.model_health import probe_system_models
 logger = logging.getLogger(__name__)
 # Cache probe results so repeated scrapes don't re-probe the router. The
 # value is comfortably above a 15s scrape_interval but bounded so a recovered
 # model shows green within a minute.
 _CACHE_TTL_SECONDS = 55
 # Serialises access to ``_cache``. ``_cached_probe`` holds it for the whole
 # check-and-refresh, so concurrent scrapes in one process share one probe.
 _lock = threading.Lock()
 # Last successful probe: ``ts`` is a ``time.monotonic()`` reading and
 # ``results`` is the list returned by ``probe_system_models`` (None until the
 # first successful probe). Module-level state, so it is held per process.
 _cache: dict = {"ts": 0.0, "results": None}
 def _cached_probe() -> list[dict]:
    """Return system-model probe results, refreshing them at most once per TTL.
    The whole check-and-refresh runs under ``_lock``. A cached result younger
    than ``_CACHE_TTL_SECONDS`` is returned as-is; otherwise
    ``probe_system_models()`` is called and its result stored together with
    the monotonic time taken at entry. If the probe raises, the failure is
    logged, the cache is left untouched, and the previous result (or an empty
    list when there is none) is returned so ``/metrics`` keeps working.
    """
    entered_at = time.monotonic()
    with _lock:
        cached = _cache["results"]
        age = entered_at - _cache["ts"]
        if cached is not None and age < _CACHE_TTL_SECONDS:
            return cached
        try:
            fresh = probe_system_models()
        except Exception as exc:  # never let a probe failure break /metrics
            logger.warning("Model health probe failed during scrape: %s", exc)
            # Fall back to whatever was cached before; nothing if never probed.
            return cached or []
        _cache.update(ts=entered_at, results=fresh)
        return fresh
 class SystemModelHealthCollector:
    """prometheus_client custom collector for system-default model health.
    Exposes three gauge families built from the cached probe results:
    ``mnemosyne_system_default_model_configured`` (one sample per role),
    ``mnemosyne_system_default_model_up`` (role/model/api, configured roles
    only) and ``mnemosyne_system_default_model_probe_latency_seconds`` (per
    role, only when a latency was recorded).
    ``describe`` is implemented on purpose: a collector without it makes an
    auto-describing registry (the default ``REGISTRY``) call ``collect`` at
    registration time to learn the metric names, which here would run the
    model probe while the application is still starting up.
    """
    @staticmethod
    def _families():
        """Return fresh, sample-free ``(configured, up, latency)`` families."""
        configured = GaugeMetricFamily(
            "mnemosyne_system_default_model_configured",
            "A system-default model is configured for this role (1) or not (0)",
            labels=["role"],
        )
        up = GaugeMetricFamily(
            "mnemosyne_system_default_model_up",
            "System-default model endpoint reachable (1) or not (0)",
            labels=["role", "model", "api"],
        )
        latency = GaugeMetricFamily(
            "mnemosyne_system_default_model_probe_latency_seconds",
            "Latency of the last reachability probe for this role",
            labels=["role"],
        )
        return configured, up, latency
    def describe(self):
        """Return the metric families without samples; never probes."""
        return list(self._families())
    def collect(self):
        """Yield the three families populated from ``_cached_probe()``."""
        results = _cached_probe()
        configured, up, latency = self._families()
        for r in results:
            role = r["role"]
            configured.add_metric([role], 1 if r["configured"] else 0)
            if not r["configured"]:
                # No endpoint to report on: only the "configured" gauge applies.
                continue
            up.add_metric(
                [role, r["model_name"] or "", r["api_name"] or ""],
                1 if r["ok"] else 0,
            )
            if r["latency_ms"] is not None:
                # Probe latency is recorded in milliseconds; export seconds.
                latency.add_metric([role], r["latency_ms"] / 1000.0)
        yield configured
        yield up
        yield latency
 def register():
    """Attach a ``SystemModelHealthCollector`` to the default registry once.
    Returns without doing anything when the registry already holds an
    instance of the collector (autoreload, repeated ``ready()``); otherwise
    registers a new instance and logs the registration.
    """
    from prometheus_client import REGISTRY
    # Look for an existing instance in the registry's private collector map;
    # if that attribute is missing the lookup simply finds nothing.
    known = getattr(REGISTRY, "_collector_to_names", {})
    if any(isinstance(c, SystemModelHealthCollector) for c in list(known)):
        return
    REGISTRY.register(SystemModelHealthCollector())
    logger.info("Registered SystemModelHealthCollector on Prometheus default registry")
--- a/mnemosyne/library/services/library_delete.py
+++ b/mnemosyne/library/services/library_delete.py
@@ -0,0 +1,108 @@
 """
 Shared Library deletion cascade.
 Deletes a Library node and everything reachable AND unique to it
 (Collections, Items, Chunks, Images + ImageEmbeddings), then garbage-collects
 Concepts that are no longer referenced by any other Library.
 Keyed on the Library ``uid`` so it works for *both* global libraries
 (``workspace_id`` is null) and workspace-scoped libraries. This is the single
 source of truth used by:
  * the Daedalus integration API (``DELETE /library/api/workspaces/{id}/``), and
  * the admin/HTML delete view (``library_delete``).
 Concept-safe: orphan-only Concept GC happens at the end. Concepts still
 referenced by another library (workspace or global) are preserved.
 """
 import logging
 from neomodel import db
 logger = logging.getLogger(__name__)
 def delete_library_cascade(lib) -> dict:
    """Delete ``lib`` and all content reachable and unique to it.
    Runs a fixed sequence of Cypher statements keyed on the Library ``uid``:
    Chunks, Images + ImageEmbeddings, Items, Collections, then the Library
    node itself, followed by a sweep that removes every Concept with no
    remaining incoming REFERENCES / MENTIONS / DEPICTS relationship.
    The statements are issued one at a time; this function does not open a
    transaction itself.
    :param lib: A ``library.models.Library`` node instance (only ``uid`` and
        ``name`` are read).
    :returns: Dict with ``library_uid``, ``name``, ``item_count`` (number of
        distinct Items found under the library, with or without an
        ``s3_key``), ``item_s3_keys`` (list of ``(uid, s3_key)`` for async S3
        cleanup; Items without a key are omitted), and ``orphans_deleted``
        (Concept GC count).
    """
    library_uid = lib.uid
    library_name = lib.name
    # Read the Items before anything is deleted so the caller can clean up S3
    # asynchronously (a future enhancement — for now, the keys are
    # returned/logged).
    item_rows, _ = db.cypher_query(
        "MATCH (l:Library {uid: $uid})-[:CONTAINS]->(:Collection)"
        "-[:CONTAINS]->(i:Item) RETURN i.uid, i.s3_key",
        {"uid": library_uid},
    )
    item_s3_keys = [(row[0], row[1]) for row in item_rows if row[1]]
    # Count every Item, not just those that carry an s3_key; de-duplicate by
    # uid in case one Item is matched through more than one Collection.
    item_count = len({row[0] for row in item_rows})
    # Order matters: every pattern walks Library -> Collection -> Item, so the
    # nodes hanging off Items must go before the Items, Items before the
    # Collections, and the Library last.
    delete_steps = (
        """
        MATCH (l:Library {uid: $uid})-[:CONTAINS]->(:Collection)
        -[:CONTAINS]->(i:Item)-[:HAS_CHUNK]->(c:Chunk)
        DETACH DELETE c
        """,
        """
        MATCH (l:Library {uid: $uid})-[:CONTAINS]->(:Collection)
        -[:CONTAINS]->(i:Item)-[:HAS_IMAGE]->(img:Image)
        OPTIONAL MATCH (img)-[:HAS_EMBEDDING]->(emb:ImageEmbedding)
        DETACH DELETE img, emb
        """,
        """
        MATCH (l:Library {uid: $uid})-[:CONTAINS]->(:Collection)
        -[:CONTAINS]->(i:Item)
        DETACH DELETE i
        """,
        """
        MATCH (l:Library {uid: $uid})-[:CONTAINS]->(col:Collection)
        DETACH DELETE col
        """,
        "MATCH (l:Library {uid: $uid}) DETACH DELETE l",
    )
    for statement in delete_steps:
        db.cypher_query(statement, {"uid": library_uid})
    # Orphan Concept garbage collection: drop Concepts no longer referenced
    # by any Item (REFERENCES/MENTIONS) or Image (DEPICTS). The query is not
    # scoped to this library, so it also removes pre-existing orphans.
    orphan_result, _ = db.cypher_query(
        """
        MATCH (con:Concept)
        WHERE NOT (con)<-[:REFERENCES]-() AND NOT (con)<-[:MENTIONS]-()
              AND NOT (con)<-[:DEPICTS]-()
        WITH con
        DETACH DELETE con
        RETURN count(con) AS deleted
        """
    )
    orphans_deleted = orphan_result[0][0] if orphan_result else 0
    logger.info(
        "Library cascade-deleted library_uid=%s name=%s items=%d orphans_deleted=%d",
        library_uid, library_name, item_count, orphans_deleted,
    )
    return {
        "library_uid": library_uid,
        "name": library_name,
        "item_count": item_count,
        "item_s3_keys": item_s3_keys,
        "orphans_deleted": orphans_deleted,
    }
--- a/mnemosyne/library/services/model_health.py
+++ b/mnemosyne/library/services/model_health.py
@@ -0,0 +1,119 @@
 """
 System-default model reachability probes.
 Provides a cheap, bounded liveness check for the four system-default models
 (embedding, chat, vision, reranker) so the embedding dashboard and the
 scrape-time Prometheus collector can surface "model not responding" without
 running an ingest.
 The probe deliberately hits ``GET {base_url}/models`` as its primary check:
 on an OpenAI-compatible router (e.g. the llama-router) this answers instantly
 without loading a model, so repeated probes never burn GPU time. This mirrors
 the GPU-avoidance principle in ``mcp_server/tools/health.py``.
 """
 import logging
 import time
 from typing import Optional
 import requests
 logger = logging.getLogger(__name__)
 # api_type values whose endpoints expose an OpenAI-compatible ``/models`` list.
 # ``probe_api`` issues ``GET {base_url}/models`` for these and falls back to a
 # HEAD request on ``base_url`` for every other api_type.
 _OPENAI_COMPATIBLE = {"openai", "azure", "ollama", "llama-cpp", "vllm"}
 # (role, getter method name) pairs — order is the dashboard/metrics order.
 # ``probe_system_models`` looks each getter up by name on ``LLMModel`` with
 # ``getattr`` and calls it with no arguments.
 ROLE_GETTERS = [
    ("embedding", "get_system_embedding_model"),
    ("chat", "get_system_chat_model"),
    ("vision", "get_system_vision_model"),
    ("reranker", "get_system_reranker_model"),
 ]
 def probe_api(api, timeout: int = 5) -> tuple[bool, str]:
    """Check whether an ``LLMApi`` endpoint is responding.
    OpenAI-compatible api types are probed with ``GET {base_url}/models`` and
    count as up only on HTTP 200. Every other api type is probed with a HEAD
    on ``base_url`` and counts as up on any HTTP response, because only host
    reachability can be checked cheaply there.
    Args:
        api: ``LLMApi`` instance (provides base_url, api_key, api_type).
        timeout: Per-request timeout in seconds.
    Returns:
        ``(ok, detail)`` — ok is True if the endpoint answered acceptably;
        detail is a short human-readable status (HTTP code, error, or "ok").
        A missing or blank ``base_url`` yields ``(False, "no base_url
        configured")`` without making a request.
    """
    # Guard against a null/blank base_url so one misconfigured API reports as
    # down instead of raising and aborting the probe of the remaining roles.
    base_url = (api.base_url or "").strip().rstrip("/")
    if not base_url:
        return False, "no base_url configured"
    headers = {}
    if api.api_key:
        headers["Authorization"] = f"Bearer {api.api_key}"
    openai_like = api.api_type in _OPENAI_COMPATIBLE
    try:
        if openai_like:
            resp = requests.get(f"{base_url}/models", headers=headers, timeout=timeout)
        else:
            # bedrock / anthropic have no equivalent cheap unauthenticated
            # list; treat a reachable host as the liveness signal.
            resp = requests.head(base_url, headers=headers, timeout=timeout)
    except requests.Timeout:
        # Must precede RequestException (its base class) so both probe styles
        # report timeouts with the same wording.
        return False, f"timeout after {timeout}s"
    except requests.RequestException as exc:
        return False, type(exc).__name__
    if not openai_like:
        return True, f"reachable (HTTP {resp.status_code})"
    if resp.status_code == 200:
        return True, "ok"
    return False, f"HTTP {resp.status_code}"
 def probe_system_models(timeout: int = 5) -> list[dict]:
    """Probe every role in ``ROLE_GETTERS`` for reachability, in order.
    Args:
        timeout: Per-request timeout in seconds, forwarded to ``probe_api``.
    Returns:
        One dict per role with keys: ``role``, ``configured``, ``model_name``,
        ``api_name``, ``base_url``, ``ok``, ``detail``, ``latency_ms``.
        When a role's getter returns None the role is reported with
        ``configured`` False, ``detail`` "not configured", every other field
        None, and no probe is made.
    """
    from llm_manager.models import LLMModel
    report: list[dict] = []
    for role, getter_name in ROLE_GETTERS:
        model = getattr(LLMModel, getter_name)()
        # Start from the "not configured" shape; filled in below when a model
        # exists, which keeps the key order the same for both cases.
        entry = {
            "role": role,
            "configured": model is not None,
            "model_name": None,
            "api_name": None,
            "base_url": None,
            "ok": None,
            "detail": "not configured",
            "latency_ms": None,
        }
        if model is not None:
            api = model.api
            began = time.monotonic()
            ok, detail = probe_api(api, timeout=timeout)
            elapsed_ms = round((time.monotonic() - began) * 1000, 1)
            entry.update(
                model_name=model.name,
                api_name=api.name,
                base_url=api.base_url,
                ok=ok,
                detail=detail,
                latency_ms=elapsed_ms,
            )
        report.append(entry)
    return report
--- a/mnemosyne/library/templates/library/_model_health_badge.html
+++ b/mnemosyne/library/templates/library/_model_health_badge.html
@@ -0,0 +1,18 @@
 {% comment %}
 Reachability badge for a system-default model. Expects `h` = one entry from
 the `model_health` dict (keys: configured, ok, detail, latency_ms). Renders
 nothing when the role is absent from model_health (probe failed entirely).
 Text-only badges to match the existing dashboard palette (no emoji per house
 HTML rule).
 {% endcomment %}
 {% if h %}
    {% if not h.configured %}
        <span class="badge badge-ghost badge-sm ml-2" title="No system-default model set for this role">NOT CONFIGURED</span>
    {% elif h.ok %}
        <span class="badge badge-success badge-sm ml-2" title="{{ h.detail }}">REACHABLE</span>
        {% if h.latency_ms is not None %}<span class="text-xs opacity-50 ml-1">{{ h.latency_ms }} ms</span>{% endif %}
    {% else %}
        <span class="badge badge-error badge-sm ml-2" title="Probe detail: {{ h.detail }}">NOT RESPONDING</span>
        <span class="text-xs opacity-60 ml-1">{{ h.detail }}</span>
    {% endif %}
 {% endif %}
--- a/mnemosyne/library/templates/library/embedding_dashboard.html
+++ b/mnemosyne/library/templates/library/embedding_dashboard.html
@@ -28,6 +28,7 @@
                                {% if system_embedding_model.supports_multimodal %}
                                    <span class="badge badge-accent badge-sm ml-1">Multimodal</span>
                                {% endif %}
                                {% include "library/_model_health_badge.html" with h=model_health.embedding %}
                            {% else %}
                                <div class="flex items-center gap-2">
                                    <span class="badge badge-error">NOT CONFIGURED</span>
@@ -41,6 +42,7 @@
                        <td>
                            {% if system_chat_model %}
                                <span class="font-semibold">{{ system_chat_model.api.name }}: {{ system_chat_model.name }}</span>
                                {% include "library/_model_health_badge.html" with h=model_health.chat %}
                            {% else %}
                                <span class="text-sm opacity-60">Not configured — concept extraction disabled</span>
                            {% endif %}
@@ -51,6 +53,7 @@
                        <td>
                            {% if system_reranker_model %}
                                <span class="font-semibold">{{ system_reranker_model.api.name }}: {{ system_reranker_model.name }}</span>
                                {% include "library/_model_health_badge.html" with h=model_health.reranker %}
                            {% else %}
                                <span class="text-sm opacity-60">Not configured — Phase 3</span>
                            {% endif %}
@@ -64,6 +67,7 @@
                                {% if system_vision_model.supports_vision %}
                                    <span class="badge badge-accent badge-sm ml-1">Vision</span>
                                {% endif %}
                                {% include "library/_model_health_badge.html" with h=model_health.vision %}
                            {% else %}
                                <span class="text-sm opacity-60">Not configured — image analysis disabled</span>
                            {% endif %}
--- a/mnemosyne/library/templates/library/library_confirm_delete.html
+++ b/mnemosyne/library/templates/library/library_confirm_delete.html
@@ -12,6 +12,18 @@
    <div class="alert alert-warning mb-6">
        <span>Are you sure you want to delete <strong>{{ library.name }}</strong>? This action cannot be undone.</span>
    </div>
    {% if library.workspace_id %}
    <div class="alert alert-error mb-6">
        <span>
            <strong>This Library is managed by Daedalus</strong>
            (workspace <code>{{ library.workspace_id }}</code>).
            Deleting it here removes its embedded content from Mnemosyne, but the
            source files still live in Daedalus — it will be <strong>recreated and
            re-embedded on the next Daedalus sync</strong>. Use this to clear an
            orphaned Library that is blocking workspace re-registration.
        </span>
    </div>
    {% endif %}
    <form method="post">
        {% csrf_token %}
        <div class="flex gap-2">
--- a/mnemosyne/library/views.py
+++ b/mnemosyne/library/views.py
@@ -319,20 +319,20 @@ def library_delete(request, uid):
        messages.error(request, f"Library not found: {e}")
        return redirect("library:library-list")
-    # Daedalus owns the lifecycle of workspace-scoped libraries — they can
+    # Daedalus owns the lifecycle of workspace-scoped libraries. Deleting one
-    # only be deleted via DELETE /library/api/workspaces/{workspace_id}/.
+    # here is allowed but discouraged: the confirm page warns that Daedalus
-    # Block the human delete path so a stray click can't desync state.
+    # still holds the source content and will recreate + re-embed it on the
-    if lib.workspace_id:
+    # next sync. The risk is low (no data loss — only re-embedding cost), and
-        messages.error(
+    # this is the supported escape hatch for clearing an orphaned Library that
-            request,
+    # blocks workspace re-registration.
            f'"{lib.name}" is managed by Daedalus workspace '
            f"{lib.workspace_id}. Delete it from Daedalus, not here.",
        )
        return redirect("library:library-detail", uid=uid)
    if request.method == "POST":
        name = lib.name
-        lib.delete()
+        # Use the shared cascade so child nodes (Collections/Items/Chunks/
        # Images) and orphan Concepts are removed too — a bare lib.delete()
        # would leak them.
        from .services.library_delete import delete_library_cascade
        delete_library_cascade(lib)
        messages.success(request, f'Library "{name}" deleted.')
        return redirect("library:library-list")
    return render(request, "library/library_confirm_delete.html", {"library": lib})
@@ -729,6 +729,16 @@ def embedding_dashboard(request):
    except Exception as exc:
        logger.warning("Could not load system models: %s", exc)
    # Reachability of the system-default models (keyed by role for the
    # template). A probe failure must never 500 the dashboard.
    context["model_health"] = {}
    try:
        from library.services.model_health import probe_system_models
        context["model_health"] = {r["role"]: r for r in probe_system_models()}
    except Exception as exc:
        logger.warning("Could not probe system model health: %s", exc)
    # Get item status counts and node counts from Neo4j
    if neo4j_available():
        context["neo4j_available"] = True