From 7185d326eb69cedd352bf6187340c19af10ad74e Mon Sep 17 00:00:00 2001 From: Robert Helewka Date: Sun, 3 May 2026 19:35:27 -0400 Subject: [PATCH] feat(docker): rename web service to app, add nginx as web Reorganize Docker Compose services: the Django/gunicorn container is now `app` and nginx is `web`, better reflecting their roles. Add a dedicated gunicorn configuration and install curl in the runtime image for health checks. Update documentation to reflect: - Neo4j migration from ariel.incus to a dedicated umbriel.incus instance - Rationale for requiring a dedicated Neo4j instance (single-tenancy assumptions, label/index isolation, schema ownership) - New service naming in compose commands and log tailing examples --- .env.example | 13 +++++++ Dockerfile | 2 ++ README.md | 27 ++++++++++----- docker-compose.yaml | 35 +++++++++++-------- docker/entrypoint.sh | 1 + docker/gunicorn.conf.py | 27 +++++++++++++++ mnemosyne/.env example | 4 ++- mnemosyne/mnemosyne/urls.py | 3 ++ mnemosyne/mnemosyne/views.py | 22 ++++++++++++ nginx/mnemosyne.conf | 67 +++++++++++++++++++++++++++++------- 10 files changed, 163 insertions(+), 38 deletions(-) create mode 100644 .env.example create mode 100644 docker/gunicorn.conf.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..e29196f --- /dev/null +++ b/.env.example @@ -0,0 +1,13 @@ +# ============================================================================= +# Mnemosyne — Docker Compose environment +# ============================================================================= +# This file documents variables consumed by docker-compose.yaml itself +# (image tags, port overrides, etc.). It is NOT the application config. +# +# Application config lives in mnemosyne/.env — copy mnemosyne/.env\ example +# to mnemosyne/.env and fill in your values before running `docker compose up`. +# +# This file has no required variables for a default deployment: the compose +# file uses a fixed image tag and port. 
Add overrides here if you parameterise +# those in docker-compose.yaml (e.g. MNEMOSYNE_IMAGE, MNEMOSYNE_PORT). +# ============================================================================= diff --git a/Dockerfile b/Dockerfile index 260472b..5f7285b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -62,6 +62,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ zlib1g \ libssl3 \ ca-certificates \ + curl \ && rm -rf /var/lib/apt/lists/* ENV PYTHONDONTWRITEBYTECODE=1 \ @@ -77,6 +78,7 @@ COPY --from=builder /usr/local/bin /usr/local/bin WORKDIR /app COPY --from=builder /build/mnemosyne /app COPY docker/entrypoint.sh /usr/local/bin/entrypoint.sh +COPY docker/gunicorn.conf.py /app/docker/gunicorn.conf.py RUN chmod +x /usr/local/bin/entrypoint.sh # Non-root user for everything that runs in this image. uid:gid 1000:1000 diff --git a/README.md b/README.md index 698a662..25c4438 100644 --- a/README.md +++ b/README.md @@ -64,11 +64,13 @@ Mnemosyne runs as three cooperating processes: the Django web app (REST API + ad Hosts in the Ouranos lab: - **Postgres** — `portia.incus:5432` (Django ORM: users, IngestJob) -- **Neo4j** — `ariel.incus:25554` (knowledge graph + vectors) +- **Neo4j** — `umbriel.incus:7687` (Bolt; dedicated instance — see note below — knowledge graph + vectors; HTTP Browser on `umbriel.incus:25555`) - **RabbitMQ** — `oberon.incus:5672` (Celery broker) - **MinIO** — `nyx.helu.ca:8555` (S3-compatible; `mnemosyne-content` and `daedalus` buckets) - **Memcached** — `127.0.0.1:11211` (task progress) +> **Neo4j must be dedicated to Mnemosyne.** Don't share the instance with Spelunker or any other graph workload. Mnemosyne owns the `Library`, `Collection`, `Item`, `Chunk`, and `Concept` labels and runs its own indexes (`chunk_embedding_index`, full-text indexes per library_type) and schema migrations (`setup_neo4j_indexes`, `load_library_types`). 
The Phase-1 workspace-delete path runs label-scoped `DETACH DELETE` over those labels, and a workspace_id-scoped subgraph is the unit of isolation — both assume single-tenancy. A shared instance risks (1) label/property collisions corrupting the other tenant's graph, (2) vector-index memory contention degrading search latency for both apps, (3) management commands mutating schema another tenant depends on, and (4) backup/restore that can't be reasoned about per-app. Neo4j Community Edition is sufficient — the multi-database feature is Enterprise-only, so isolation has to come from running a separate server process. Run a dedicated instance per environment (one for staging, one for production); point each via `NEOMODEL_NEO4J_BOLT_URL` in that environment's `mnemosyne/.env`. + ### One-time setup ```bash @@ -159,14 +161,14 @@ Production runs as four containers from a single image (built and pushed by [`.g | Service | Role | Port | |---------|------|------| -| `web` | Django REST API + admin (gunicorn) | internal :8000 | +| `app` | Django REST API + admin (gunicorn) | internal :8000 | | `mcp` | FastMCP server (uvicorn) | internal :22091 | | `worker` | Celery worker — embedding/ingest/batch | — | -| `nginx` | Reverse proxy + static files | host :23090 | +| `web` | Reverse proxy + static files (nginx) | host :23090 | Plus a one-shot `static-init` service that copies `/app/staticfiles` (baked into the image at build time via `collectstatic`) into the shared volume nginx reads from. It runs to completion on every `up`, so static-file changes propagate on each deploy without manual intervention. -External services (NOT spun up by compose): Postgres on Portia, Neo4j on Ariel, RabbitMQ on Oberon, S3/MinIO on Nyx, Memcached, embedder + reranker. All reached over the internal 10.10.0.0/24 network. URLs and credentials live in `mnemosyne/.env`. 
+External services (NOT spun up by compose): Postgres on Portia, Neo4j on Umbriel (dedicated Mnemosyne instance), RabbitMQ on Oberon, S3/MinIO on Nyx, Memcached, embedder + reranker. All reached over the internal 10.10.0.0/24 network. URLs and credentials live in `mnemosyne/.env`. ### First-time bring-up @@ -175,10 +177,10 @@ External services (NOT spun up by compose): Postgres on Portia, Neo4j on Ariel, docker compose pull # DB migrations (one-shot) -docker compose run --rm web migrate +docker compose run --rm app migrate # Neo4j indexes + library_type defaults (one-shot) -docker compose run --rm web setup +docker compose run --rm app setup # Bring the stack up docker compose up -d @@ -188,7 +190,8 @@ docker compose up -d ```bash docker compose ps # service status + health -docker compose logs -f web # tail web logs +docker compose logs -f app # tail Django app logs +docker compose logs -f web # tail nginx logs docker compose logs -f worker # tail Celery worker logs docker compose restart mcp # restart just the MCP server @@ -210,8 +213,14 @@ The development `.env` has a few values that need adjusting for production: ### Health probes -- `GET http://nginx-host:23090/healthz` → proxies to `/mcp/health`, returns `{"status":"ok"}` when the MCP server is up -- `GET http://nginx-host:23090/metrics` → Prometheus scrape endpoint, internal-network-only +| Endpoint | Probes | Auth | +|----------|--------|------| +| `GET /live/` | Django process alive (always 200 if gunicorn is up) | None | +| `GET /ready/` | PostgreSQL + Memcached reachable (503 if either is down) | None | +| `GET /healthz` | MCP server `/mcp/health` — used as the HAProxy `health_path` | None | +| `GET /metrics` | Prometheus scrape | Internal networks only | + +> **Trailing slashes matter.** Always use `/live/` and `/ready/` (with the trailing slash). 
The un-slashed forms (`/live`, `/ready`) trigger Django's `APPEND_SLASH` 301 redirect — health check clients that don't follow redirects will report a failure even when the service is healthy. ## Architecture Note: Retrieval, Not Synthesis diff --git a/docker-compose.yaml b/docker-compose.yaml index 55a0714..5936bc0 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -2,20 +2,20 @@ # Mnemosyne — production deployment # ============================================================================= # Four services, all from the same image: -# web — Django REST API + admin (gunicorn, port 8000) +# app — Django REST API + admin (gunicorn, port 8000) # mcp — FastMCP server (uvicorn, port 22091) # worker — Celery worker (embedding/ingest/batch queues) -# nginx — reverse proxy, public port 23090 +# web — reverse proxy, public port 23090 (nginx) # -# External services (NOT spun up here): Postgres on Portia, Neo4j on Ariel, +# External services (NOT spun up here): Postgres on Portia, Neo4j on Umbriel, # RabbitMQ on Oberon, S3/MinIO on Nyx, Memcached on its own host, embedder # and reranker on Nyx, smtp4dev on Oberon. All reached over the internal # 10.10.0.0/24 network. 
# # Run: # docker compose up -d -# docker compose run --rm web migrate # one-shot DB migrate -# docker compose run --rm web setup # Neo4j indexes + library types +# docker compose run --rm app migrate # one-shot DB migrate +# docker compose run --rm app setup # Neo4j indexes + library types # ============================================================================= services: @@ -31,8 +31,8 @@ services: - mnemosyne-static:/shared-static restart: "no" - # ── Web app: Django REST API + admin ─────────────────────────────────────── - web: + # ── App: Django REST API + admin ────────────────────────────────────────── + app: image: git.helu.ca/r/mnemosyne:latest command: ["web"] env_file: mnemosyne/.env @@ -45,7 +45,7 @@ services: expose: - "8000" healthcheck: - test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/admin/login/').read()"] + test: ["CMD", "curl", "-f", "http://localhost:8000/live/"] interval: 30s timeout: 5s retries: 3 @@ -62,7 +62,7 @@ services: expose: - "22091" healthcheck: - test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:22091/mcp/health').read()"] + test: ["CMD", "curl", "-f", "http://localhost:22091/mcp/health"] interval: 30s timeout: 5s retries: 3 @@ -74,6 +74,9 @@ services: command: ["worker"] env_file: mnemosyne/.env restart: unless-stopped + depends_on: + app: + condition: service_healthy volumes: - mnemosyne-media:/app/media healthcheck: @@ -83,26 +86,28 @@ services: retries: 3 start_period: 60s - # ── nginx: reverse proxy, public port 23090 ──────────────────────────────── - nginx: + # ── Web: nginx reverse proxy, public port 23090 ─────────────────────────── + web: image: nginx:alpine restart: unless-stopped depends_on: - - web - - mcp + app: + condition: service_healthy + mcp: + condition: service_healthy ports: - "23090:80" volumes: - ./nginx/mnemosyne.conf:/etc/nginx/conf.d/default.conf:ro - mnemosyne-static:/var/www/static:ro healthcheck: - 
test: ["CMD", "wget", "-qO-", "http://localhost/healthz"] + test: ["CMD", "curl", "-f", "http://localhost/live/"] interval: 30s timeout: 5s retries: 3 volumes: - # Static files baked into the image at /app/staticfiles. The web service + # Static files baked into the image at /app/staticfiles. The app service # mounts this volume, populating it on first start; nginx reads from it. mnemosyne-static: # Local FileSystemStorage fallback. Production uses USE_LOCAL_STORAGE=False diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 45b9487..1a7e992 100644 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -10,6 +10,7 @@ case "$1" in web) # Django REST API + admin (gunicorn → wsgi). exec gunicorn \ + --config /app/docker/gunicorn.conf.py \ --bind 0.0.0.0:8000 \ --workers "${GUNICORN_WORKERS:-3}" \ --access-logfile - \ diff --git a/docker/gunicorn.conf.py b/docker/gunicorn.conf.py new file mode 100644 index 0000000..c91eaa8 --- /dev/null +++ b/docker/gunicorn.conf.py @@ -0,0 +1,49 @@
+"""Gunicorn configuration: keep health-probe and scrape requests out of the access log."""
+import logging
+import re
+
+# Probe endpoints, matched against the URL path (optional trailing slash / query).
+_PROBE_PATH = re.compile(
+    r"^(?:/live|/ready|/metrics|/healthz|/health[^ ]*|/ping)/?(?:\?|$)"
+)
+
+# Pulls the request target out of 'GET /path HTTP/1.1', whether that is a bare
+# request line or appears quoted inside an already formatted access-log message.
+_REQUEST_TARGET = re.compile(r'(?:^|")[A-Z]+ (\S+)')
+
+
+def _request_path(record: logging.LogRecord) -> str:
+    """Return the request path for an access-log record, or "" if none is found."""
+    atoms = getattr(record, "args", None)
+    if isinstance(atoms, dict):
+        # Gunicorn access-log atoms: "U" is the URL path, "r" the request line.
+        path = atoms.get("U")
+        if path:
+            return str(path)
+        text = atoms.get("r") or ""
+    else:
+        text = record.getMessage()
+    # _PROBE_PATH is anchored at the start of the path, so the target has to be
+    # extracted from the request line / message before it can match.
+    found = _REQUEST_TARGET.search(str(text))
+    return found.group(1) if found else ""
+
+
+class _ProbePathFilter(logging.Filter):
+    """Logging filter that drops records whose request path is a probe endpoint."""
+
+    def filter(self, record: logging.LogRecord) -> bool:
+        return not _PROBE_PATH.search(_request_path(record))
+
+
+_filter = _ProbePathFilter()
+
+
+def on_starting(server):
+    """Gunicorn server hook: attach the probe filter to the access logger."""
+    logging.getLogger("gunicorn.access").addFilter(_filter)
+
+
+def post_worker_init(worker):
+    """Gunicorn server hook: attach the filter in each worker (addFilter ignores duplicates)."""
+    logging.getLogger("gunicorn.access").addFilter(_filter)
diff --git a/mnemosyne/.env example b/mnemosyne/.env example index 44c69d5..edeac7d 100644 --- a/mnemosyne/.env example +++ b/mnemosyne/.env example @@ -18,7 +18,9 @@ DB_HOST=portia.incus DB_PORT=5432 # --- Neo4j Graph Database --- -NEOMODEL_NEO4J_BOLT_URL=bolt://neo4j:password@ariel.incus:25554 +# Dedicated Mnemosyne instance on Umbriel — do not 
share with Spelunker or any +# other graph workload. See README.md for the full rationale. +NEOMODEL_NEO4J_BOLT_URL=bolt://neo4j:password@umbriel.incus:7687 # --- Memcached --- KVDB_LOCATION=127.0.0.1:11211 diff --git a/mnemosyne/mnemosyne/urls.py b/mnemosyne/mnemosyne/urls.py index a7ad8fd..d595b63 100644 --- a/mnemosyne/mnemosyne/urls.py +++ b/mnemosyne/mnemosyne/urls.py @@ -8,6 +8,9 @@ from django.urls import include, path from . import views urlpatterns = [ + # Health checks — no auth, must return 200 (not redirect) — use trailing slash + path("live/", views.live, name="live"), + path("ready/", views.ready, name="ready"), # Landing / Dashboard path("", views.landing, name="landing"), path("dashboard/", views.dashboard, name="dashboard"), diff --git a/mnemosyne/mnemosyne/views.py b/mnemosyne/mnemosyne/views.py index 778c3b0..2769346 100644 --- a/mnemosyne/mnemosyne/views.py +++ b/mnemosyne/mnemosyne/views.py @@ -3,6 +3,9 @@ Mnemosyne project-level views — landing page and dashboard. """ from django.contrib.auth.decorators import login_required +from django.core.cache import cache +from django.db import connection +from django.http import JsonResponse from django.shortcuts import render from llm_manager.models import LLMApi, LLMModel @@ -43,3 +46,38 @@ def dashboard(request): context["library_count"] = None return render(request, "mnemosyne/dashboard.html", context)
+
+
+def live(request):
+    """Liveness probe: returns 200 whenever this view can run; checks no dependencies."""
+    return JsonResponse({"status": "ok"})
+
+
+def ready(request):
+    """Readiness probe: 200 when the database and cache respond, else 503.
+
+    The endpoint is unauthenticated, so failures are reported only as
+    "unavailable" per dependency; exception details (which can include hosts
+    and user names) go to the server log instead of the response body.
+    """
+    import logging  # function-scope import; the module import block is left untouched
+
+    log = logging.getLogger(__name__)
+    errors = {}
+    try:
+        # Issue a real query: ensure_connection() returns immediately when a
+        # connection object already exists, so it cannot detect a dead one.
+        with connection.cursor() as cursor:
+            cursor.execute("SELECT 1")
+            cursor.fetchone()
+    except Exception:
+        log.exception("Readiness probe: database check failed")
+        errors["db"] = "unavailable"
+    try:
+        cache.get("__readiness_probe__")
+    except Exception:
+        log.exception("Readiness probe: cache check failed")
+        errors["cache"] = "unavailable"
+    if errors:
+        return JsonResponse({"status": "error", "errors": errors}, status=503)
+    return JsonResponse({"status": "ok"})
diff --git a/nginx/mnemosyne.conf b/nginx/mnemosyne.conf index 51a8ce9..4c679ba 100644 --- a/nginx/mnemosyne.conf +++ b/nginx/mnemosyne.conf @@ -2,9 +2,22 @@ # and the FastMCP server. 
HAProxy on Titania terminates TLS and routes by # hostname; this nginx is plain HTTP on the internal network. +# Suppress probe paths from the access log (health checks, Prometheus scrapes). +# These fire every 15–30 s and would drown out real traffic in Loki. +map $request_uri $loggable { + default 1; + ~^/live(/|\?|$) 0; + ~^/ready(/|\?|$) 0; + ~^/metrics(/|\?|$) 0; + ~^/healthz(/|\?|$) 0; + ~^/health 0; + ~^/mcp/health(/|\?|$) 0; + ~^/ping(/|\?|$) 0; +} + # Map of upstreams to give us readable proxy_pass targets and easy retries. -upstream mnemosyne_web { - server web:8000 max_fails=3 fail_timeout=30s; +upstream mnemosyne_app { + server app:8000 max_fails=3 fail_timeout=30s; } upstream mnemosyne_mcp { @@ -15,16 +28,50 @@ server { listen 80 default_server; server_name _; + access_log /var/log/nginx/access.log combined if=$loggable; + # Reasonable limits — file uploads to the ingest endpoint can be big, # but the bulk path is S3-direct from Daedalus. 64 MB covers admin # uploads and direct REST POST /library/api/items/upload. client_max_body_size 64m; client_body_timeout 120s; + # Liveness probe — always 200 if the Django process is up. + # Use the trailing-slash form: /live/ returns 200 directly. + # /live (no slash) triggers Django's APPEND_SLASH 301 redirect, which + # will cause health check clients that don't follow redirects to fail. + location = /live/ { + proxy_pass http://mnemosyne_app; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + access_log off; + } + + # Readiness probe — 200 only when PostgreSQL + Memcached are reachable. + # Same trailing-slash rule applies. 
+ location = /ready/ { + proxy_pass http://mnemosyne_app; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + access_log off; + } + + # HAProxy liveness probe — proxies through to the MCP health endpoint. + location = /healthz { + proxy_pass http://mnemosyne_mcp/mcp/health; + access_log off; + } + # Mnemosyne's REST API — Django REST Framework views + admin. # Under /library/api/* per mnemosyne/urls.py and /admin/* per Django. location /library/ { - proxy_pass http://mnemosyne_web; + proxy_pass http://mnemosyne_app; proxy_http_version 1.1; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; @@ -34,7 +81,7 @@ server { } location /admin/ { - proxy_pass http://mnemosyne_web; + proxy_pass http://mnemosyne_app; proxy_http_version 1.1; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; @@ -59,7 +106,7 @@ server { } # Static files baked into the image at /app/staticfiles, mounted into - # this nginx via a named volume populated by the web service. + # this nginx via a named volume populated by the app service. location /static/ { alias /var/www/static/; access_log off; @@ -67,20 +114,14 @@ server { } # Prometheus scrape endpoint — internal networks only. - # Allows: localhost + RFC1918 private ranges (10/8, 172.16/12, 192.168/16). + # Allows: loopback + all RFC1918 private ranges. location /metrics { allow 127.0.0.0/8; allow 10.0.0.0/8; allow 172.16.0.0/12; allow 192.168.0.0/16; deny all; - proxy_pass http://mnemosyne_web; - access_log off; - } - - # Liveness probe — proxies through to the MCP health endpoint. - location = /healthz { - proxy_pass http://mnemosyne_mcp/mcp/health; + proxy_pass http://mnemosyne_app; access_log off; } }