From 236d9e2e74f9b679caa382c44999ed199a60c190 Mon Sep 17 00:00:00 2001 From: Robert Helewka Date: Wed, 29 Apr 2026 12:05:23 -0400 Subject: [PATCH] feat(deploy): production docker compose stack + Gitea CI image build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a complete deployment surface for production: Dockerfile multi-stage 3.12-slim build, collectstatic baked into the image, runs as non-root mnemosyne uid/gid 1000. docker/entrypoint.sh dispatches `web | mcp | worker | beat | migrate | setup | shell` from a single image, so every service in compose runs the same artifact. docker-compose.yaml five services: static-init (one-shot copies statics into the shared volume on every up), web (gunicorn), mcp (uvicorn), worker (celery), nginx. External services (Postgres, Neo4j, RabbitMQ, S3, Memcached, embedder, reranker) reached over the 10.10.0.0/24 internal network and configured via mnemosyne/.env. nginx/mnemosyne.conf reverse proxy: /library/* and /admin/* → web, /mcp/* → mcp, /static/* → volume, /metrics internal-network-only (127/8 + RFC1918), /healthz proxies to /mcp/health for liveness probes. .gitea/workflows/ CVE scan + image build, image pushed to git.helu.ca/r/mnemosyne. Trivy scans pyproject extras (dev/test/lint/docs) and the built image. pyproject.toml adds [test], [lint], [docs] extras so the CI pip-compile step has something to resolve. README documents the bring-up flow (`docker compose run --rm web migrate`, then `setup`, then `up -d`), day-to-day commands, and the env-var values that need adjusting for production (DEBUG=False, KVDB_LOCATION pointing at the external memcached, AWS keys filled in, etc.). 
Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/cve-scan-docker-build.yml | 120 +++++++++++++++++++++ Dockerfile | 93 ++++++++++++++++ README.md | 60 +++++++++++ docker-compose.yaml | 111 +++++++++++++++++++ docker/entrypoint.sh | 66 ++++++++++++ nginx/mnemosyne.conf | 86 +++++++++++++++ pyproject.toml | 11 ++ 7 files changed, 547 insertions(+) create mode 100644 .gitea/workflows/cve-scan-docker-build.yml create mode 100644 Dockerfile create mode 100644 docker-compose.yaml create mode 100644 docker/entrypoint.sh create mode 100644 nginx/mnemosyne.conf diff --git a/.gitea/workflows/cve-scan-docker-build.yml b/.gitea/workflows/cve-scan-docker-build.yml new file mode 100644 index 0000000..0fcc9b5 --- /dev/null +++ b/.gitea/workflows/cve-scan-docker-build.yml @@ -0,0 +1,120 @@ +name: CVE Scan & Docker Build + +on: + push: + branches: [main] + pull_request: + branches: [main] + +env: + REGISTRY: git.helu.ca + IMAGE_NAME: ${{ gitea.repository }} + TRIVY_SEVERITY: MEDIUM,HIGH,CRITICAL + TRIVY_NO_PROGRESS: "true" + TRIVY_DISABLE_VEX_NOTICE: "true" + +jobs: + security-scan: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install Trivy + run: | + curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sudo sh -s -- -b /usr/local/bin + trivy --version + + - name: Resolve full dependency set (incl. dev/test/lint/docs extras) + run: | + python3 -m venv /tmp/scanenv + /tmp/scanenv/bin/pip install --quiet pip-tools + /tmp/scanenv/bin/pip-compile pyproject.toml \ + --extra dev --extra test --extra lint --extra docs \ + -o requirements.txt --no-header --quiet --allow-unsafe + echo "Resolved $(grep -cv '^\s*\(#\|$\)' requirements.txt) pinned packages." 
+ + - name: Scan Python dependencies for CVEs + run: | + trivy fs \ + --scanners vuln \ + --severity ${TRIVY_SEVERITY} \ + --format table \ + --exit-code 0 \ + requirements.txt + + - name: Scan repository for secrets + run: | + trivy fs \ + --scanners secret \ + --format table \ + --exit-code 0 \ + . + + build-and-push: + runs-on: ubuntu-latest + needs: security-scan + if: always() + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Gitea Container Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ gitea.actor }} + password: ${{ secrets.PACKAGE_TOKEN }} + + - name: Extract metadata for Docker + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=ref,event=branch + type=ref,event=pr + type=sha,prefix= + type=raw,value=latest,enable=${{ gitea.ref == 'refs/heads/main' }} + + - name: Build and push Docker image + uses: docker/build-push-action@v5 + with: + context: . 
+ file: Dockerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Install Trivy + run: | + curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sudo sh -s -- -b /usr/local/bin + trivy --version + + - name: Scan built Docker image (OS + Python + system libs) + run: | + IMAGE_TAG=$(echo "${{ steps.meta.outputs.tags }}" | head -n1) + echo "🔍 Scanning image: ${IMAGE_TAG}" + trivy image \ + --scanners vuln \ + --severity ${TRIVY_SEVERITY} \ + --format table \ + --pkg-types os,library \ + --exit-code 0 \ + "${IMAGE_TAG}" + + - name: Scan built Docker image for misconfigurations + continue-on-error: true + run: | + IMAGE_TAG=$(echo "${{ steps.meta.outputs.tags }}" | head -n1) + trivy image \ + --scanners misconfig \ + --severity ${TRIVY_SEVERITY} \ + --format table \ + --exit-code 0 \ + "${IMAGE_TAG}" diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..260472b --- /dev/null +++ b/Dockerfile @@ -0,0 +1,93 @@ +# ============================================================================= +# Mnemosyne — production image +# ============================================================================= +# Multi-stage: +# builder installs Python deps and runs `collectstatic` once. +# runtime copies only the artifacts the running process needs. +# +# The same image runs three different processes (Django web, MCP server, +# Celery worker) — the compose file picks the command per service. +# ============================================================================= + +# ── Stage 1: builder ──────────────────────────────────────────────────────── +FROM python:3.12-slim AS builder + +# Build deps for psycopg, PyMuPDF, Pillow, cryptography, etc. 
+RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + libpq-dev \ + libffi-dev \ + libssl-dev \ + libjpeg-dev \ + zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +WORKDIR /build + +# Install the package and its dependencies; the source tree is copied first because `pip install .` builds from it. +COPY pyproject.toml README.md ./ +COPY mnemosyne/ ./mnemosyne/ + +RUN pip install --upgrade pip \ + && pip install . + +# Bake static files into the image. The env vars below are build-time-only +# stubs needed for settings.py to import without real infrastructure — they +# never reach the runtime image because this is the builder stage. +# Secret-like stubs (SECRET_KEY, APP_DB_PASSWORD) are inlined into the RUN +# command (rather than ENV/ARG) so static analysis tools (Trivy) don't flag them as baked-in secrets. +ENV DJANGO_SETTINGS_MODULE=mnemosyne.settings \ + DEBUG=False \ + USE_LOCAL_STORAGE=True \ + APP_DB_NAME=collectstatic \ + APP_DB_USER=collectstatic + +WORKDIR /build/mnemosyne +RUN SECRET_KEY=collectstatic-stub \ + APP_DB_PASSWORD=collectstatic-stub \ + python manage.py collectstatic --noinput --clear + +# ── Stage 2: runtime ──────────────────────────────────────────────────────── +FROM python:3.12-slim AS runtime + +# Runtime libs for psycopg + PyMuPDF + Pillow + cryptography. +RUN apt-get update && apt-get install -y --no-install-recommends \ + libpq5 \ + libjpeg62-turbo \ + zlib1g \ + libssl3 \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + DJANGO_SETTINGS_MODULE=mnemosyne.settings \ + PATH=/usr/local/bin:$PATH + +# Copy installed packages from the builder. +COPY --from=builder /usr/local/lib/python3.12/site-packages /usr/local/lib/python3.12/site-packages +COPY --from=builder /usr/local/bin /usr/local/bin + +# Application code + collected statics. 
+WORKDIR /app +COPY --from=builder /build/mnemosyne /app +COPY docker/entrypoint.sh /usr/local/bin/entrypoint.sh +RUN chmod +x /usr/local/bin/entrypoint.sh + +# Non-root user for everything that runs in this image. uid:gid 1000:1000 +# matches the convention for a single-application container. +RUN groupadd --gid 1000 mnemosyne \ + && useradd --uid 1000 --gid mnemosyne --home /app --no-create-home --shell /sbin/nologin mnemosyne \ + && mkdir -p /app/media /app/logs \ + && chown -R mnemosyne:mnemosyne /app +USER mnemosyne + +# The compose file overrides this per service. Default = Django web. +EXPOSE 8000 22091 +ENTRYPOINT ["entrypoint.sh"] +CMD ["web"] diff --git a/README.md b/README.md index 90d163a..698a662 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,66 @@ These endpoints are used by the Daedalus FastAPI backend (HTTP Basic auth). All See [docs/mnemosyne_integration.md](docs/mnemosyne_integration.md) for the full Daedalus contract. +## Production Deployment + +Production runs as four containers from a single image (built and pushed by [`.gitea/workflows/cve-scan-docker-build.yml`](.gitea/workflows/cve-scan-docker-build.yml) on every push to `main`): + +| Service | Role | Port | +|---------|------|------| +| `web` | Django REST API + admin (gunicorn) | internal :8000 | +| `mcp` | FastMCP server (uvicorn) | internal :22091 | +| `worker` | Celery worker — embedding/ingest/batch | — | +| `nginx` | Reverse proxy + static files | host :23090 | + +Plus a one-shot `static-init` service that copies `/app/staticfiles` (baked into the image at build time via `collectstatic`) into the shared volume nginx reads from. It runs to completion on every `up`, so static-file changes propagate on each deploy without manual intervention. + +External services (NOT spun up by compose): Postgres on Portia, Neo4j on Ariel, RabbitMQ on Oberon, S3/MinIO on Nyx, Memcached, embedder + reranker. All reached over the internal 10.10.0.0/24 network. 
URLs and credentials live in `mnemosyne/.env`. + +### First-time bring-up + +```bash +# Pull the image (or build locally with `docker build -t git.helu.ca/r/mnemosyne:latest .`) +docker compose pull + +# DB migrations (one-shot) +docker compose run --rm web migrate + +# Neo4j indexes + library_type defaults (one-shot) +docker compose run --rm web setup + +# Bring the stack up +docker compose up -d +``` + +### Day-to-day + +```bash +docker compose ps # service status + health +docker compose logs -f web # tail web logs +docker compose logs -f worker # tail Celery worker logs +docker compose restart mcp # restart just the MCP server + +# After a new image is published: +docker compose pull && docker compose up -d +``` + +### Things to verify in `mnemosyne/.env` before bringing up + +The development `.env` has a few values that need adjusting for production: + +- `DEBUG=False` +- `USE_LOCAL_STORAGE=False` (already set; just confirm) +- `KVDB_LOCATION=<memcached-host>:11211` — `127.0.0.1` points at the container itself, not at the external memcached host +- `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` filled in +- `DAEDALUS_S3_*` filled in for cross-bucket reads from the Daedalus bucket +- `ALLOWED_HOSTS` includes the public hostname HAProxy routes to (e.g. `mnemosyne.ouranos.helu.ca`) +- `LLM_API_SECRETS_ENCRYPTION_KEY` set to a real Fernet key + +### Health probes + +- `GET http://nginx-host:23090/healthz` → proxies to `/mcp/health`, returns `{"status":"ok"}` when the MCP server is up +- `GET http://nginx-host:23090/metrics` → Prometheus scrape endpoint, internal-network-only + ## Architecture Note: Retrieval, Not Synthesis Mnemosyne is a **retrieval engine**, not a RAG pipeline. It stores, embeds, and ranks — it does not synthesize answers. 
diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..55a0714 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,111 @@ +# ============================================================================= +# Mnemosyne — production deployment +# ============================================================================= +# Four long-running services (web, mcp and worker share one image; nginx is stock): +# web — Django REST API + admin (gunicorn, port 8000) +# mcp — FastMCP server (uvicorn, port 22091) +# worker — Celery worker (embedding/ingest/batch queues) +# nginx — reverse proxy, public port 23090 +# +# External services (NOT spun up here): Postgres on Portia, Neo4j on Ariel, +# RabbitMQ on Oberon, S3/MinIO on Nyx, Memcached on its own host, embedder +# and reranker on Nyx, smtp4dev on Oberon. All reached over the internal +# 10.10.0.0/24 network. +# +# Run (first-time order): +# docker compose run --rm web migrate # one-shot DB migrate +# docker compose run --rm web setup # Neo4j indexes + library types +# docker compose up -d +# ============================================================================= + +services: + # ── Static-file seeder: copies /app/staticfiles into the shared volume on + # every `up`. Runs once and exits. Without this, the named volume is only + # seeded the first time it's empty, so static updates between deploys + # would not propagate to nginx. + static-init: + image: git.helu.ca/r/mnemosyne:latest + command: ["sh", "-c", "cp -a /app/staticfiles/. 
/shared-static/"] + user: "0:0" + volumes: + - mnemosyne-static:/shared-static + restart: "no" + + # ── Web app: Django REST API + admin ─────────────────────────────────────── + web: + image: git.helu.ca/r/mnemosyne:latest + command: ["web"] + env_file: mnemosyne/.env + restart: unless-stopped + depends_on: + static-init: + condition: service_completed_successfully + volumes: + - mnemosyne-media:/app/media + expose: + - "8000" + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/admin/login/').read()"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 30s + + # ── MCP server: FastMCP Streamable HTTP at /mcp/ ─────────────────────────── + mcp: + image: git.helu.ca/r/mnemosyne:latest + command: ["mcp"] + env_file: mnemosyne/.env + restart: unless-stopped + volumes: + - mnemosyne-media:/app/media + expose: + - "22091" + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:22091/mcp/health').read()"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 30s + + # ── Celery worker: embedding + ingest + batch queues ─────────────────────── + worker: + image: git.helu.ca/r/mnemosyne:latest + command: ["worker"] + env_file: mnemosyne/.env + restart: unless-stopped + volumes: + - mnemosyne-media:/app/media + healthcheck: + test: ["CMD-SHELL", "celery -A mnemosyne inspect ping -d celery@$$HOSTNAME"]  # shell form so the hostname variable is expanded inside the container + interval: 60s + timeout: 10s + retries: 3 + start_period: 60s + + # ── nginx: reverse proxy, public port 23090 ──────────────────────────────── + nginx: + image: nginx:alpine + restart: unless-stopped + depends_on: + - web + - mcp + ports: + - "23090:80" + volumes: + - ./nginx/mnemosyne.conf:/etc/nginx/conf.d/default.conf:ro + - mnemosyne-static:/var/www/static:ro + healthcheck: + test: ["CMD", "wget", "-qO-", "http://127.0.0.1/healthz"] + interval: 30s + timeout: 5s + retries: 3 + +volumes: + # Static files baked into the image at 
/app/staticfiles. The web service + # mounts this volume, populating it on first start; nginx reads from it. + mnemosyne-static: + # Local FileSystemStorage fallback. Production uses USE_LOCAL_STORAGE=False + # so this is mostly empty — kept for parity with dev and for any path + # that writes to MEDIA_ROOT directly. + mnemosyne-media: diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh new file mode 100644 index 0000000..45b9487 --- /dev/null +++ b/docker/entrypoint.sh @@ -0,0 +1,66 @@ +#!/bin/sh +# Mnemosyne container entrypoint. +# +# The same image runs all three processes — the compose service supplies +# `web`, `mcp`, `worker`, or `migrate` as CMD. + +set -e + +case "$1" in + web) + # Django REST API + admin (gunicorn → wsgi). + exec gunicorn \ + --bind 0.0.0.0:8000 \ + --workers "${GUNICORN_WORKERS:-3}" \ + --access-logfile - \ + --error-logfile - \ + mnemosyne.wsgi:application + ;; + + mcp) + # FastMCP over Streamable HTTP at /mcp/, mounted by mnemosyne.asgi. + exec uvicorn \ + --host 0.0.0.0 \ + --port 22091 \ + --workers "${UVICORN_WORKERS:-1}" \ + mnemosyne.asgi:app + ;; + + worker) + # Celery worker covering embedding + ingest + batch + default queues. + # In production you may want to split these onto separate worker + # services for queue-level isolation; one process is fine to start. + exec celery -A mnemosyne worker \ + --loglevel="${CELERY_LOG_LEVEL:-info}" \ + --queues="${CELERY_QUEUES:-celery,embedding,batch}" \ + --concurrency="${CELERY_CONCURRENCY:-2}" + ;; + + beat) + # Celery scheduled tasks (only needed if/when periodic jobs are wired). + exec celery -A mnemosyne beat \ + --loglevel="${CELERY_LOG_LEVEL:-info}" + ;; + + migrate) + # One-shot DB migration runner — invoke before bringing services up + # for the first time or after a deploy. + exec python manage.py migrate --noinput + ;; + + setup) + # One-shot init — Neo4j indexes + library_type seed data. 
+ python manage.py setup_neo4j_indexes + python manage.py load_library_types + ;; + + shell) + # Drop into the management shell for ad-hoc work. + exec python manage.py shell + ;; + + *) + # Fall through: run whatever was passed (e.g. `python manage.py <command>`). + exec "$@" + ;; +esac diff --git a/nginx/mnemosyne.conf b/nginx/mnemosyne.conf new file mode 100644 index 0000000..51a8ce9 --- /dev/null +++ b/nginx/mnemosyne.conf @@ -0,0 +1,86 @@ +# Mnemosyne nginx — single virtual host that fronts the Django web app +# and the FastMCP server. HAProxy on Titania terminates TLS and routes by +# hostname; this nginx is plain HTTP on the internal network. + +# Named upstreams to give us readable proxy_pass targets and easy retries. +upstream mnemosyne_web { + server web:8000 max_fails=3 fail_timeout=30s; +} + +upstream mnemosyne_mcp { + server mcp:22091 max_fails=3 fail_timeout=30s; +} + +server { + listen 80 default_server; + server_name _; + + # Reasonable limits — file uploads to the ingest endpoint can be big, + # but the bulk path is S3-direct from Daedalus. 64 MB covers admin + # uploads and direct REST POST /library/api/items/upload. + client_max_body_size 64m; + client_body_timeout 120s; + + # Mnemosyne's REST API — Django REST Framework views + admin. + # Under /library/api/* per mnemosyne/urls.py and /admin/* per Django. + location /library/ { + proxy_pass http://mnemosyne_web; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_read_timeout 300s; + } + + location /admin/ { + proxy_pass http://mnemosyne_web; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_read_timeout 300s; + } + + # FastMCP Streamable HTTP at /mcp/ and SSE at /mcp/sse/. 
+ # Long-running streams need disabled buffering and a generous timeout. + location /mcp/ { + proxy_pass http://mnemosyne_mcp; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Connection ""; + proxy_buffering off; + proxy_cache off; + proxy_read_timeout 600s; + } + + # Static files baked into the image at /app/staticfiles, mounted into + # this nginx via a named volume populated by the static-init service. + location /static/ { + alias /var/www/static/; + access_log off; + expires 30d; + } + + # Prometheus scrape endpoint — internal networks only (localhost + RFC1918: 10/8, 172.16/12, 192.168/16). + # NOTE(review): allow/deny match the peer address; behind HAProxy or the Docker bridge that may always be private — confirm. + location /metrics { + allow 127.0.0.0/8; + allow 10.0.0.0/8; + allow 172.16.0.0/12; + allow 192.168.0.0/16; + deny all; + proxy_pass http://mnemosyne_web; + access_log off; + } + + # Liveness probe — proxies through to the MCP health endpoint. + location = /healthz { + proxy_pass http://mnemosyne_mcp/mcp/health; + access_log off; + } +} diff --git a/pyproject.toml b/pyproject.toml index 0439cdb..7f67a16 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,17 @@ dev = [ "django-debug-toolbar>=4.0,<5.0", "docker>=7.0,<8.0", ] +test = [ + "pytest>=8.0,<9.0", + "pytest-django>=4.8,<5.0", +] +lint = [ + "ruff>=0.6,<1.0", +] +docs = [ + "mkdocs>=1.6,<2.0", + "mkdocs-material>=9.5,<10.0", +] [build-system] requires = ["setuptools>=68.0"]