From d107b492cfbc7c196ac2966129dbd4980bad6958 Mon Sep 17 00:00:00 2001 From: Robert Helewka Date: Sat, 11 Apr 2026 17:51:32 +0000 Subject: [PATCH] feat(server): implement structured logging, metrics, and dockerignore - Configure structured logging via environment variable - Add Prometheus metrics for HTTP request tracking and health - Secure /metrics endpoint with IP allowlisting - Update .dockerignore to exclude build artifacts --- .dockerignore | 13 ++ .gitea/workflows/cve-scan-docker-build.yml | 94 ++++++++ docs/red_panda_standards.md | 242 +++++++++++++++++++++ nike/config.py | 1 + nike/logging_config.py | 100 +++++++++ nike/server.py | 106 ++++++++- pyproject.toml | 1 + 7 files changed, 547 insertions(+), 10 deletions(-) create mode 100644 .gitea/workflows/cve-scan-docker-build.yml create mode 100644 docs/red_panda_standards.md create mode 100644 nike/logging_config.py diff --git a/.dockerignore b/.dockerignore index 9b468fe..ea5b545 100644 --- a/.dockerignore +++ b/.dockerignore @@ -8,3 +8,16 @@ build/ .env *.log .git/ +dashboard/src/ +dashboard/static/ +dashboard/node_modules/ +dashboard/.svelte-kit/ +dashboard/package*.json +dashboard/svelte.config.* +dashboard/vite.config.* +dashboard/tsconfig.* +docs/ +scripts/ +*.md +*.sql +*.service diff --git a/.gitea/workflows/cve-scan-docker-build.yml b/.gitea/workflows/cve-scan-docker-build.yml new file mode 100644 index 0000000..ed772ed --- /dev/null +++ b/.gitea/workflows/cve-scan-docker-build.yml @@ -0,0 +1,94 @@ +name: CVE Scan & Docker Build + +on: + push: + branches: [main] + pull_request: + branches: [main] + +env: + REGISTRY: git.helu.ca + IMAGE_NAME: ${{ gitea.repository }} + +jobs: + security-scan: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install Trivy + run: | + curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sudo sh -s -- -b /usr/local/bin + trivy --version + + - name: Install pip-tools and resolve dependencies + run: | + python3 -m venv /tmp/scanenv + /tmp/scanenv/bin/pip install --quiet pip-tools + /tmp/scanenv/bin/pip-compile pyproject.toml \ + -o requirements.txt \ + --no-header --quiet --allow-unsafe --strip-extras \ + --resolver=backtracking || { + /tmp/scanenv/bin/pip install --quiet . && \ + /tmp/scanenv/bin/pip freeze > requirements.txt + } + cat requirements.txt + + - name: Scan Python dependencies for CVEs + continue-on-error: true + run: | + trivy fs --scanners vuln --severity HIGH,CRITICAL --format table requirements.txt + + - name: Scan repository for secrets + continue-on-error: true + run: | + trivy fs --scanners secret --severity HIGH,CRITICAL --format table . + + build-and-push: + runs-on: ubuntu-latest + needs: security-scan + if: always() + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Gitea Container Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ gitea.actor }} + password: ${{ secrets.PACKAGE_TOKEN }} + + - name: Extract metadata for Docker + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=ref,event=branch + type=ref,event=pr + type=sha,prefix= + type=raw,value=latest,enable=${{ gitea.ref == 'refs/heads/main' }} + + - name: Build and push Docker image + uses: docker/build-push-action@v5 + with: + context: . + file: Dockerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Scan built Docker image + continue-on-error: true + run: | + curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sudo sh -s -- -b /usr/local/bin + IMAGE_TAG=$(echo "${{ steps.meta.outputs.tags }}" | head -n1) + echo "Scanning image: ${IMAGE_TAG}" + trivy image --severity HIGH,CRITICAL --format table "${IMAGE_TAG}" diff --git a/docs/red_panda_standards.md b/docs/red_panda_standards.md new file mode 100644 index 0000000..3bddd4a --- /dev/null +++ b/docs/red_panda_standards.md @@ -0,0 +1,242 @@ +# Red Panda Approvalβ„’ Standards + +Quality and observability standards for the Ouranos Lab. All infrastructure code, application code, and LLM-generated code deployed into this environment must meet these standards. + +--- + +## 🐾 Red Panda Approvalβ„’ + +All implementations must meet the 5 Sacred Criteria: + +1. **Fresh Environment Test** β€” Clean runs on new systems without drift. No leftover state, no manual steps. +2. **Elegant Simplicity** β€” Modular, reusable, no copy-paste sprawl. One playbook per concern. +3. **Observable & Auditable** β€” Clear task names, proper logging, check mode compatible. You can see what happened. +4. **Idempotent Patterns** β€” Run multiple times with consistent results. No side effects on re-runs. +5. **Actually Provisions & Configures** β€” Resources work, dependencies resolve, services integrate. It does the thing. + +--- + +## Vault Security + +All sensitive information is encrypted using Ansible Vault with AES256 encryption. + +**Encrypted secrets:** +- Database passwords (PostgreSQL, Neo4j) +- API keys (OpenAI, Anthropic, Mistral, Groq) +- Application secrets (Grafana, SearXNG, Arke) +- Monitoring alerts (Pushover integration) + +**Security rules:** +- AES256 encryption with `ansible-vault` +- Password file for automation β€” never pass `--vault-password-file` inline in scripts +- Vault variables use the `vault_` prefix; map to friendly names in `group_vars/all/vars.yml` +- No secrets in plain text files, ever + +--- + +## Log Level Standards + +All services in the Ouranos Lab MUST follow these log level conventions. These rules apply to application code, infrastructure services, and any LLM-generated code deployed into this environment. Log output flows through Alloy β†’ Loki β†’ Grafana, so disciplined leveling is not cosmetic β€” it directly determines alert quality, dashboard usefulness, and on-call signal-to-noise ratio. + +### Level Definitions + +| Level | When to Use | What MUST Be Included | Loki / Grafana Role | +|-------|-------------|----------------------|---------------------| +| **ERROR** | Something is broken and requires human intervention. The service cannot fulfil the current request or operation. | Exception class, message, stack trace, and relevant context (request ID, user, resource identifier). Never a bare `"something failed"`. | AlertManager rules fire on `level=~"error\|fatal\|critical"`. These trigger Pushover notifications. | +| **WARNING** | Degraded but self-recovering: retries succeeding, fallback paths taken, thresholds approaching, deprecated features invoked. | What degraded, what recovery action was taken, current metric value vs. threshold. | Grafana dashboard panels. Rate-based alerting (e.g., >N warnings/min). | +| **INFO** | Significant lifecycle and business events: service start/stop, configuration loaded, deployment markers, user authentication, job completion, schema migrations. | The event and its outcome. This level tells the *story* of what the system did. | Default production visibility. The go-to level for post-incident timelines. | +| **DEBUG** | Diagnostic detail for active troubleshooting: request/response payloads, SQL queries, internal state, variable values. | **Actionable context is mandatory.** A DEBUG line with no detail is worse than no line at all. Include variable values, object states, or decision paths. | Never enabled in production by default. Used on-demand via per-service level override. | + +### Anti-Patterns + +These are explicit violations of Ouranos logging standards: + +| ❌ Anti-Pattern | Why It's Wrong | βœ… Correct Approach | +|----------------|---------------|-------------------| +| Health checks logged at INFO (`GET /health β†’ 200 OK`) | Routine HAProxy/Prometheus probes flood syslog with thousands of identical lines per hour, burying real events. | Suppress health endpoints from access logs entirely, or demote to DEBUG. | +| DEBUG with no context (`logger.debug("error occurred")`) | Provides zero diagnostic value. If DEBUG is noisy *and* useless, nobody will ever enable it. | `logger.debug("PaymentService.process failed: order_id=%s, provider=%s, response=%r", oid, provider, resp)` | +| ERROR without exception details (`logger.error("task failed")`) | Cannot be triaged without reproduction steps. Wastes on-call time. | `logger.error("Celery task invoice_gen failed: order_id=%s", oid, exc_info=True)` | +| Logging sensitive data at any level | Passwords, tokens, API keys, and PII in Loki are a security incident. | Mask or redact: `api_key=sk-...a3f2`, `password=*****`. | +| Inconsistent level casing | Breaks LogQL filters and Grafana label selectors. | **Python / Django**: UPPERCASE (`INFO`, `WARNING`, `ERROR`, `DEBUG`). **Go / infrastructure** (HAProxy, Alloy, Gitea): lowercase (`info`, `warn`, `error`, `debug`). | +| Logging expected conditions as ERROR | A user entering a wrong password is not an error β€” it is normal business logic. | Use WARNING or INFO for expected-but-notable conditions. Reserve ERROR for things that are actually broken. | + +### Health Check Rule + +> All services exposed through HAProxy MUST suppress or demote health check endpoints (`/health`, `/healthz`, `/api/health`, `/metrics`, `/ping`) to DEBUG or below. Health check success is the *absence* of errors, not the presence of 200s. If your syslog shows a successful health probe, your log level is wrong. + +**Implementation guidance:** +- **Django / Gunicorn**: Filter health paths in the access log handler or use middleware that skips logging for probe user-agents. +- **FastAPI / Uvicorn**: Add a `logging.Filter` on the `uvicorn.access` logger that matches health paths in the access log message. Uvicorn's access log format includes the full request line in quotes (e.g., `"GET /live HTTP/1.1"`), so filter regexes must account for that. See also the structured logging notes below. +- **Docker services**: Configure the application's internal logging to exclude health routes β€” the syslog driver forwards everything it receives. +- **HAProxy**: HAProxy's own health check logs (`option httpchk`) should remain at the HAProxy level for connection debugging, but backend application responses to those probes must not surface at INFO. + +### Background Worker & Queue Monitoring + +> **The most dangerous failure is the one that produces no logs.** + +When a background worker (Celery task consumer, RabbitMQ subscriber, Gitea Runner, cron job) fails to start or crashes on startup, it generates no ongoing log output. Error-rate dashboards stay green because there is no process running to produce errors. Meanwhile, queues grow unbounded and work silently stops being processed. + +**Required practices:** + +1. **Heartbeat logging** β€” Every long-running background worker MUST emit a periodic INFO-level heartbeat (e.g., `"worker alive, processed N jobs in last 5m, queue depth: M"`). The *absence* of this heartbeat is the alertable condition. + +2. **Startup and shutdown at INFO** β€” Worker start, ready, graceful shutdown, and crash-exit are significant lifecycle events. These MUST log at INFO. + +3. **Queue depth as a metric** β€” RabbitMQ queue depths and any application-level task queues MUST be exposed as Prometheus metrics. A growing queue with zero consumer activity is an **ERROR**-level alert, not a warning. + +4. **Grafana "last seen" alerts** β€” For every background worker, configure a Grafana alert using `absent_over_time()` or equivalent staleness detection: *"Worker X has not logged a heartbeat in >10 minutes"* β†’ ERROR severity β†’ Pushover notification. + +5. **Crash-on-start is ERROR** β€” If a worker exits within seconds of starting (missing config, failed DB connection, import error), the exit MUST be captured at ERROR level by the service manager (`systemd OnFailure=`, Docker restart policy logs). Do not rely on the crashing application to log its own death β€” it may never get the chance. + +### Production Defaults + +| Service Category | Default Level | Rationale | +|-----------------|---------------|-----------| +| Django apps (Angelia, Athena, Kairos, Icarlos, Spelunker, Peitho, MCP Switchboard) | `WARNING` | Business logic β€” only degraded or broken conditions surface. Lifecycle events (start/stop/deploy) still log at INFO via Gunicorn and systemd. | +| FastAPI apps (Periplus) | `WARNING` | Same rationale as Django. Uvicorn lifecycle events (start/stop) are pinned to INFO via the `uvicorn.error` logger regardless of app log level. | +| Gunicorn access logs | Suppress 2xx/3xx health probes | Routine request logging deferred to HAProxy access logs in Loki. | +| Infrastructure agents (Alloy, Prometheus, Node Exporter) | `warn` | Stable β€” do not change without cause. | +| HAProxy (Titania) | `warning` | Connection-level logging handled by HAProxy's own log format β†’ Alloy β†’ Loki. | +| Databases (PostgreSQL, Neo4j) | `warning` | Query-level logging only enabled for active troubleshooting. | +| Docker services (Gitea, LobeChat, Nextcloud, AnythingLLM, SearXNG) | `warn` / `warning` | Per-service default. Tune individually if needed. | +| LLM Proxy (Arke) | `info` | Token usage tracking and provider routing decisions justify INFO. Review periodically for noise. | +| Observability stack (Grafana, Loki, AlertManager) | `warn` | Should be quiet unless something is wrong with observability itself. | + +### Structured Logging β€” FastAPI / Uvicorn + +FastAPI apps using uvicorn require special handling to achieve JSON-structured log output for the Alloy β†’ Loki pipeline. Uvicorn manages its own loggers aggressively, and naive approaches will fail silently. + +**Required practices:** + +1. **Override uvicorn's handlers, don't just add to root** β€” Uvicorn's `config.load()` creates its own `StreamHandler` instances on `uvicorn`, `uvicorn.error`, and `uvicorn.access`. You must remove these handlers and set `propagate = True` so log records flow to the root logger where your JSON formatter lives. + +2. **Re-apply logging config in the lifespan** β€” Configuring logging at module import time is not sufficient. Uvicorn's `config.load()` runs *after* your module is imported but *before* the ASGI lifespan starts. Call your logging configuration function again inside the FastAPI `lifespan` context manager to recapture control. + +3. **Remap uvicorn logger names** β€” Uvicorn uses `uvicorn.error` for all lifecycle messages (startup, shutdown, errors) despite the misleading name. Remap it to `uvicorn` in your JSON formatter's output for clarity in Loki queries. + +4. **Use `pydantic-settings` with `extra = "ignore"`** β€” When loading config from `.env` files that contain variables for other services (e.g., oauth2-proxy), pydantic-settings will reject unknown fields by default. Always set `extra = "ignore"` in the model config. + +### Loki & Grafana Alignment + +**Label normalization**: Alloy pipelines (syslog listeners and journal relabeling) MUST extract and forward a `level` label on every log line. Without a `level` label, the log entry is invisible to level-based dashboard filters and alert rules. + +**LogQL conventions for dashboards:** +```logql +# Production error monitoring (default dashboard view) +{job="syslog", hostname="puck"} | json | level=~"error|fatal|critical" + +# Warning-and-above for a specific service +{service_name="haproxy"} | logfmt | level=~"warn|error|fatal" + +# Debug-level troubleshooting (temporary, never permanent dashboards) +{container="angelia"} | json | level="debug" +``` + +**Alerting rules** β€” Grafana alert rules MUST key off the normalized `level` label: +- `level=~"error|fatal|critical"` β†’ Immediate Pushover notification via AlertManager +- `absent_over_time({service_name="celery_worker"}[10m])` β†’ Worker heartbeat staleness β†’ ERROR severity +- Rate-based: `rate({service_name="arke"} | json | level="error" [5m]) > 0.1` β†’ Sustained error rate + +**Retention alignment**: Loki retention policies should preserve ERROR and WARNING logs longer than DEBUG. DEBUG-level logs generated during troubleshooting sessions should have a short TTL or be explicitly cleaned up. + +--- + +## Health Check Endpoints + +All services MUST expose Kubernetes-style health endpoints at these paths: + +| Endpoint | Purpose | Auth | +|----------|---------|------| +| `GET /live` | **Liveness** β€” process is running and accepting connections | None | +| `GET /ready` | **Readiness** β€” process is running AND all dependencies (DB, cache, upstream APIs) are healthy | None | +| `GET /metrics` | Prometheus metrics | IP-restricted (no JWT) | + +- HAProxy uses `health_path: /ready/` for backend health checks β€” return HTTP 200 when ready +- Health endpoints MUST NOT require authentication +- Third-party services use their native paths (`/api/health`, `/api/healthz`, `/-/healthy`, etc.) + +### Docker Compose Healthchecks + +Use `curl -f` (install curl in images if needed). Do not use `wget --spider`. + +```yaml +healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/live"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s +``` + +--- + +## Endpoint Protection + +| Protected (require valid JWT) | Unprotected | +|-------------------------------|-------------| +| All `/api/v1/*` routes | `GET /live` | +| | `GET /ready` | +| | `GET /metrics` (IP-restricted to internal networks) | +| | `GET /api/auth/login-url` | +| | `POST /api/auth/token` | +| | `POST /api/v1/telemetry` (sendBeacon cannot set headers) | + +> **Why `/api/v1/telemetry` is unprotected**: The browser `sendBeacon` API cannot set `Authorization` headers. The telemetry endpoint must be open to receive client-side error reports and performance data, or browser errors will be silently lost. + +--- + +## Prometheus Metrics + +All services SHOULD expose `GET /metrics` in Prometheus exposition format, scraped by Prospero's Prometheus at 15s intervals. + +- **IP-restricted** to internal networks: `10.10.0.0/24`, `172.16.0.0/12`, `127.0.0.0/8` +- No JWT required β€” HAProxy and Prometheus scrapers cannot authenticate +- Useful metrics to expose: request totals and durations, error rates, active connections, queue depths, dependency health + +--- + +## Browser Telemetry + +Frontend/browser code MUST report errors and performance data back to the server. + +- Send to `POST /api/v1/telemetry` β€” unprotected endpoint +- Capture: JavaScript exceptions, promise rejections, resource load failures, performance metrics +- The server MUST log client-side exceptions at **WARNING** level (they indicate user-facing problems but are not server failures) +- Include enough context to reproduce: URL, user agent, error message, stack trace (if available) + +--- + +## Environment Variable Naming + +All environment variables for an application MUST use a consistent prefix matching the service name (e.g., `PERIPLUS_`, `ARKE_`, `ANGELIA_`). This applies to every variable in the `.env` file, including those consumed by sidecar services like oauth2-proxy. + +**Rules:** +- All vars in `.env` use the `SERVICENAME_` prefix β€” no exceptions +- `compose.yaml` maps prefixed vars to the sidecar's expected names (e.g., `OAUTH2_PROXY_CLIENT_ID: ${PERIPLUS_CASDOOR_CLIENT_ID}`) +- The application's Settings model SHOULD declare all prefixed vars, even those only consumed by sidecars, so the full configuration is documented in one place +- Every repo MUST include a `.env.example` with placeholder values for all required variables. Add `!.env.example` to `.gitignore` if a broad `.env.*` pattern would otherwise exclude it +- `.env` files with real secrets are ALWAYS gitignored β€” no exceptions + +--- + +## Docker Networking + +- Use the **default Docker bridge network** for simple deployments +- Add additional named networks only when required (e.g., isolating database traffic) or explicitly requested +- Do not define custom networks for single-service Docker Compose stacks + +--- + +## Documentation Standards + +Place documentation in the `/docs/` directory of the repository. + +### HTML Documents + +HTML documents must follow [docs/documentation_style_guide.html](documentation_style_guide.html). + +- Use Bootstrap CDN with Bootswatch theme **Flatly** +- Include a dark mode toggle button in the navbar +- Use Bootstrap Icons for icons +- Use Bootstrap CSS for styles β€” avoid custom CSS +- Use **Mermaid** for diagrams diff --git a/nike/config.py b/nike/config.py index 58b8858..ce6a208 100644 --- a/nike/config.py +++ b/nike/config.py @@ -39,3 +39,4 @@ SERVER_NAME = "Nike β€” Football Data Platform" # IPs allowed to set X-Forwarded-Proto (your HAProxy host). # '*' trusts all β€” safe when Nike's port is firewalled to HAProxy only. TRUSTED_PROXY_IPS = os.getenv('NIKE_TRUSTED_PROXY', '*') +LOG_LEVEL = os.getenv('NIKE_LOG_LEVEL', 'WARNING') diff --git a/nike/logging_config.py b/nike/logging_config.py new file mode 100644 index 0000000..f968726 --- /dev/null +++ b/nike/logging_config.py @@ -0,0 +1,100 @@ +""" +Structured logging configuration for Nike. + +Follows Red Panda Standards: +- JSON-structured output for Alloy β†’ Loki β†’ Grafana pipeline +- UPPERCASE level names (Python convention) +- Health check paths suppressed from access logs +- Production default: WARNING (lifecycle events via uvicorn at INFO) +""" + +import json +import logging +import re +from datetime import datetime, timezone + +# Paths that should never appear in access logs at INFO level. +# Health probes are the *absence* of errors, not the presence of 200s. +# Uvicorn access log format: '127.0.0.1:1234 - "GET /live HTTP/1.1" 200' +_HEALTH_PATHS = re.compile(r'"(GET|HEAD) /(live|ready|metrics)(/|\s)') + + +# Map uvicorn's internal logger names to cleaner names for Loki. +# uvicorn.error = lifecycle + errors, uvicorn.access = request logs. +_LOGGER_NAME_MAP = { + "uvicorn.error": "uvicorn", + "uvicorn.access": "uvicorn.access", +} + + +class JSONFormatter(logging.Formatter): + """Emit one JSON object per log line for structured ingestion.""" + + def format(self, record: logging.LogRecord) -> str: + log_entry = { + "timestamp": datetime.fromtimestamp(record.created, tz=timezone.utc).isoformat(), + "level": record.levelname, + "logger": _LOGGER_NAME_MAP.get(record.name, record.name), + "message": record.getMessage(), + } + if record.exc_info and record.exc_info[0] is not None: + log_entry["exception"] = self.formatException(record.exc_info) + if hasattr(record, "request_id"): + log_entry["request_id"] = record.request_id + return json.dumps(log_entry, default=str) + + +class HealthCheckFilter(logging.Filter): + """Suppress health/readiness/metrics probe lines from uvicorn access logs. + + These endpoints are polled every 30s by HAProxy/Prometheus. Logging + them at INFO floods Loki with thousands of identical lines per hour. + """ + + def filter(self, record: logging.LogRecord) -> bool: + message = record.getMessage() + if _HEALTH_PATHS.search(message): + return False + return True + + +def configure_logging(level: str = "WARNING") -> None: + """Apply structured JSON logging to the root logger and uvicorn loggers. + + Args: + level: Log level string (DEBUG, INFO, WARNING, ERROR). Per Red Panda + Standards the production default for application code is WARNING. + """ + numeric_level = getattr(logging, level.upper(), logging.WARNING) + + json_formatter = JSONFormatter() + health_filter = HealthCheckFilter() + + # Root handler β€” all nike.* loggers inherit this + root = logging.getLogger() + root.setLevel(numeric_level) + + # Remove any pre-existing handlers (avoids duplicate lines in containers) + for handler in root.handlers[:]: + root.removeHandler(handler) + + stream_handler = logging.StreamHandler() + stream_handler.setFormatter(json_formatter) + root.addHandler(stream_handler) + + # Force uvicorn loggers to propagate to root (where our JSON handler lives) + # and remove any handlers uvicorn's config.load() may have installed. + for uv_name in ("uvicorn", "uvicorn.error", "uvicorn.access"): + uv_logger = logging.getLogger(uv_name) + for h in uv_logger.handlers[:]: + uv_logger.removeHandler(h) + uv_logger.propagate = True + + # Uvicorn access logger β€” suppress health probes + uvicorn_access = logging.getLogger("uvicorn.access") + uvicorn_access.addFilter(health_filter) + + # Uvicorn error logger (startup/shutdown lifecycle) β€” always at INFO + # so "Started server process" and "Shutting down" are captured + uvicorn_error = logging.getLogger("uvicorn.error") + uvicorn_error.setLevel(logging.INFO) diff --git a/nike/server.py b/nike/server.py index 39abda9..21c2108 100644 --- a/nike/server.py +++ b/nike/server.py @@ -14,6 +14,9 @@ Data flow: """ from __future__ import annotations +import ipaddress +import logging +import re import time from collections import deque from contextlib import asynccontextmanager @@ -23,9 +26,17 @@ from typing import Any import uvicorn from fastapi import FastAPI, Request -from fastapi.responses import FileResponse, HTMLResponse, JSONResponse +from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse, Response from fastapi.staticfiles import StaticFiles from fastapi.templating import Jinja2Templates +from prometheus_client import ( + CONTENT_TYPE_LATEST, + CollectorRegistry, + Counter, + Gauge, + Histogram, + generate_latest, +) from pydantic import BaseModel from fastmcp import FastMCP @@ -34,6 +45,38 @@ from mcp.types import ToolAnnotations import nike.db as db import nike.sportsdb as api from nike import config +from nike.logging_config import configure_logging + +# ── Structured logging ──────────────────────────────────── +configure_logging(config.LOG_LEVEL) +logger = logging.getLogger("nike") + +# ── Prometheus metrics ──────────────────────────────────── +_registry = CollectorRegistry() +REQUEST_COUNT = Counter( + "nike_http_requests_total", + "Total HTTP requests", + ["method", "path", "status"], + registry=_registry, +) +REQUEST_DURATION = Histogram( + "nike_http_request_duration_seconds", + "HTTP request duration in seconds", + ["method", "path"], + registry=_registry, +) +DB_UP = Gauge( + "nike_database_up", + "1 if database connection is healthy, 0 otherwise", + registry=_registry, +) + +_METRICS_ALLOWED_NETS = [ + ipaddress.ip_network("10.10.0.0/24"), + ipaddress.ip_network("172.16.0.0/12"), + ipaddress.ip_network("127.0.0.0/8"), + ipaddress.ip_network("::1/128"), +] # ── Request log ────────────────────────────────────────── _REQUEST_LOG: deque[dict] = deque(maxlen=500) @@ -696,6 +739,8 @@ _TEMPLATES = Jinja2Templates( @asynccontextmanager async def _lifespan(app: FastAPI): """Start DB pool + MCP session manager on startup, close on shutdown.""" + # Re-apply structured logging after uvicorn's config.load() overwrites handlers. + configure_logging(config.LOG_LEVEL) db.create_pool(minconn=2, maxconn=10) async with _mcp_app.router.lifespan_context(app): yield @@ -704,6 +749,32 @@ async def _lifespan(app: FastAPI): dashboard = FastAPI(title="Nike Dashboard", lifespan=_lifespan) +# ── Request metrics middleware ──────────────────────────── +_SKIP_METRICS_PREFIXES = ("/live", "/ready", "/metrics", "/mcp") + + +@dashboard.middleware("http") +async def prometheus_metrics_middleware(request: Request, call_next): + """Record request count and duration for non-health/static paths.""" + path = request.url.path + if any(path.startswith(p) for p in _SKIP_METRICS_PREFIXES): + return await call_next(request) + + normalized = re.sub( + r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", + "{id}", + path, + ) + + start = time.perf_counter() + response = await call_next(request) + duration = time.perf_counter() - start + + REQUEST_COUNT.labels(method=request.method, path=normalized, status=response.status_code).inc() + REQUEST_DURATION.labels(method=request.method, path=normalized).observe(duration) + return response + + # ── Dashboard routes ────────────────────────────────────── _SVELTE_BUILD = Path(__file__).parent.parent / "dashboard" / "build" @@ -813,6 +884,7 @@ async def api_run(body: _RunRequest): result = fn(**body.args) return JSONResponse({"ok": True, "result": result}) except Exception as exc: + logger.error("Tool run failed: tool=%s args=%s", body.tool, body.args, exc_info=True) return JSONResponse({"ok": False, "error": str(exc)}, status_code=500) @@ -835,15 +907,29 @@ async def readiness(): @dashboard.get("/metrics") -async def metrics(): +async def metrics(request: Request): + """Prometheus exposition format metrics endpoint. + + IP-restricted to internal networks per Red Panda Standards: + 10.10.0.0/24, 172.16.0.0/12, 127.0.0.0/8. No auth required β€” + HAProxy and Prometheus scrapers cannot authenticate. + """ + client_ip = request.client.host if request.client else "0.0.0.0" + try: + addr = ipaddress.ip_address(client_ip) + except ValueError: + return PlainTextResponse("Forbidden", status_code=403) + + if not any(addr in net for net in _METRICS_ALLOWED_NETS): + return PlainTextResponse("Forbidden", status_code=403) + db_check = db.check_connection() - counts = db.get_table_counts() - uptime_s = int((datetime.now(timezone.utc) - _SERVER_START).total_seconds()) - return JSONResponse({ - "uptime_seconds": uptime_s, - "database": db_check, - "table_counts": counts, - }) + DB_UP.set(1 if db_check.get("connected") else 0) + + return Response( + content=generate_latest(_registry), + media_type=CONTENT_TYPE_LATEST, + ) # ── Mount MCP onto dashboard ────────────────────────────── @@ -862,7 +948,7 @@ def main(): host=config.SERVER_HOST, port=config.SERVER_PORT, reload=False, - log_level="info", + log_level=config.LOG_LEVEL.lower(), ws="wsproto", # Trust X-Forwarded-Proto / X-Forwarded-For from HAProxy proxy_headers=True, diff --git a/pyproject.toml b/pyproject.toml index 2b4e798..e622b3f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "requests>=2.32", "jinja2>=3.1", "wsproto>=1.2", + "prometheus-client>=0.20", ] [project.optional-dependencies]