feat(server): implement structured logging, metrics, and dockerignore
- Configure structured logging via environment variable - Add Prometheus metrics for HTTP request tracking and health - Secure /metrics endpoint with IP allowlisting - Update .dockerignore to exclude build artifacts
This commit is contained in:
@@ -8,3 +8,16 @@ build/
|
|||||||
.env
|
.env
|
||||||
*.log
|
*.log
|
||||||
.git/
|
.git/
|
||||||
|
dashboard/src/
|
||||||
|
dashboard/static/
|
||||||
|
dashboard/node_modules/
|
||||||
|
dashboard/.svelte-kit/
|
||||||
|
dashboard/package*.json
|
||||||
|
dashboard/svelte.config.*
|
||||||
|
dashboard/vite.config.*
|
||||||
|
dashboard/tsconfig.*
|
||||||
|
docs/
|
||||||
|
scripts/
|
||||||
|
*.md
|
||||||
|
*.sql
|
||||||
|
*.service
|
||||||
|
|||||||
94
.gitea/workflows/cve-scan-docker-build.yml
Normal file
94
.gitea/workflows/cve-scan-docker-build.yml
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
name: CVE Scan & Docker Build
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [main]
|
||||||
|
pull_request:
|
||||||
|
branches: [main]
|
||||||
|
|
||||||
|
env:
|
||||||
|
REGISTRY: git.helu.ca
|
||||||
|
IMAGE_NAME: ${{ gitea.repository }}
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
security-scan:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install Trivy
|
||||||
|
run: |
|
||||||
|
curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sudo sh -s -- -b /usr/local/bin
|
||||||
|
trivy --version
|
||||||
|
|
||||||
|
- name: Install pip-tools and resolve dependencies
|
||||||
|
run: |
|
||||||
|
python3 -m venv /tmp/scanenv
|
||||||
|
/tmp/scanenv/bin/pip install --quiet pip-tools
|
||||||
|
/tmp/scanenv/bin/pip-compile pyproject.toml \
|
||||||
|
-o requirements.txt \
|
||||||
|
--no-header --quiet --allow-unsafe --strip-extras \
|
||||||
|
--resolver=backtracking || {
|
||||||
|
/tmp/scanenv/bin/pip install --quiet . && \
|
||||||
|
/tmp/scanenv/bin/pip freeze > requirements.txt
|
||||||
|
}
|
||||||
|
cat requirements.txt
|
||||||
|
|
||||||
|
- name: Scan Python dependencies for CVEs
|
||||||
|
continue-on-error: true
|
||||||
|
run: |
|
||||||
|
trivy fs --scanners vuln --severity HIGH,CRITICAL --format table requirements.txt
|
||||||
|
|
||||||
|
- name: Scan repository for secrets
|
||||||
|
continue-on-error: true
|
||||||
|
run: |
|
||||||
|
trivy fs --scanners secret --severity HIGH,CRITICAL --format table .
|
||||||
|
|
||||||
|
build-and-push:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
needs: security-scan
|
||||||
|
if: always()
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
|
- name: Log in to Gitea Container Registry
|
||||||
|
uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
registry: ${{ env.REGISTRY }}
|
||||||
|
username: ${{ gitea.actor }}
|
||||||
|
password: ${{ secrets.PACKAGE_TOKEN }}
|
||||||
|
|
||||||
|
- name: Extract metadata for Docker
|
||||||
|
id: meta
|
||||||
|
uses: docker/metadata-action@v5
|
||||||
|
with:
|
||||||
|
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
|
||||||
|
tags: |
|
||||||
|
type=ref,event=branch
|
||||||
|
type=ref,event=pr
|
||||||
|
type=sha,prefix=
|
||||||
|
type=raw,value=latest,enable=${{ gitea.ref == 'refs/heads/main' }}
|
||||||
|
|
||||||
|
- name: Build and push Docker image
|
||||||
|
uses: docker/build-push-action@v5
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: Dockerfile
|
||||||
|
push: true
|
||||||
|
tags: ${{ steps.meta.outputs.tags }}
|
||||||
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
|
cache-from: type=gha
|
||||||
|
cache-to: type=gha,mode=max
|
||||||
|
|
||||||
|
- name: Scan built Docker image
|
||||||
|
continue-on-error: true
|
||||||
|
run: |
|
||||||
|
curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sudo sh -s -- -b /usr/local/bin
|
||||||
|
IMAGE_TAG=$(echo "${{ steps.meta.outputs.tags }}" | head -n1)
|
||||||
|
echo "Scanning image: ${IMAGE_TAG}"
|
||||||
|
trivy image --severity HIGH,CRITICAL --format table "${IMAGE_TAG}"
|
||||||
242
docs/red_panda_standards.md
Normal file
242
docs/red_panda_standards.md
Normal file
@@ -0,0 +1,242 @@
|
|||||||
|
# Red Panda Approval™ Standards
|
||||||
|
|
||||||
|
Quality and observability standards for the Ouranos Lab. All infrastructure code, application code, and LLM-generated code deployed into this environment must meet these standards.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🐾 Red Panda Approval™
|
||||||
|
|
||||||
|
All implementations must meet the 5 Sacred Criteria:
|
||||||
|
|
||||||
|
1. **Fresh Environment Test** — Clean runs on new systems without drift. No leftover state, no manual steps.
|
||||||
|
2. **Elegant Simplicity** — Modular, reusable, no copy-paste sprawl. One playbook per concern.
|
||||||
|
3. **Observable & Auditable** — Clear task names, proper logging, check mode compatible. You can see what happened.
|
||||||
|
4. **Idempotent Patterns** — Run multiple times with consistent results. No side effects on re-runs.
|
||||||
|
5. **Actually Provisions & Configures** — Resources work, dependencies resolve, services integrate. It does the thing.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Vault Security
|
||||||
|
|
||||||
|
All sensitive information is encrypted using Ansible Vault with AES256 encryption.
|
||||||
|
|
||||||
|
**Encrypted secrets:**
|
||||||
|
- Database passwords (PostgreSQL, Neo4j)
|
||||||
|
- API keys (OpenAI, Anthropic, Mistral, Groq)
|
||||||
|
- Application secrets (Grafana, SearXNG, Arke)
|
||||||
|
- Monitoring alerts (Pushover integration)
|
||||||
|
|
||||||
|
**Security rules:**
|
||||||
|
- AES256 encryption with `ansible-vault`
|
||||||
|
- Password file for automation — never pass `--vault-password-file` inline in scripts
|
||||||
|
- Vault variables use the `vault_` prefix; map to friendly names in `group_vars/all/vars.yml`
|
||||||
|
- No secrets in plain text files, ever
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Log Level Standards
|
||||||
|
|
||||||
|
All services in the Ouranos Lab MUST follow these log level conventions. These rules apply to application code, infrastructure services, and any LLM-generated code deployed into this environment. Log output flows through Alloy → Loki → Grafana, so disciplined leveling is not cosmetic — it directly determines alert quality, dashboard usefulness, and on-call signal-to-noise ratio.
|
||||||
|
|
||||||
|
### Level Definitions
|
||||||
|
|
||||||
|
| Level | When to Use | What MUST Be Included | Loki / Grafana Role |
|
||||||
|
|-------|-------------|----------------------|---------------------|
|
||||||
|
| **ERROR** | Something is broken and requires human intervention. The service cannot fulfil the current request or operation. | Exception class, message, stack trace, and relevant context (request ID, user, resource identifier). Never a bare `"something failed"`. | AlertManager rules fire on `level=~"error\|fatal\|critical"`. These trigger Pushover notifications. |
|
||||||
|
| **WARNING** | Degraded but self-recovering: retries succeeding, fallback paths taken, thresholds approaching, deprecated features invoked. | What degraded, what recovery action was taken, current metric value vs. threshold. | Grafana dashboard panels. Rate-based alerting (e.g., >N warnings/min). |
|
||||||
|
| **INFO** | Significant lifecycle and business events: service start/stop, configuration loaded, deployment markers, user authentication, job completion, schema migrations. | The event and its outcome. This level tells the *story* of what the system did. | Default production visibility. The go-to level for post-incident timelines. |
|
||||||
|
| **DEBUG** | Diagnostic detail for active troubleshooting: request/response payloads, SQL queries, internal state, variable values. | **Actionable context is mandatory.** A DEBUG line with no detail is worse than no line at all. Include variable values, object states, or decision paths. | Never enabled in production by default. Used on-demand via per-service level override. |
|
||||||
|
|
||||||
|
### Anti-Patterns
|
||||||
|
|
||||||
|
These are explicit violations of Ouranos logging standards:
|
||||||
|
|
||||||
|
| ❌ Anti-Pattern | Why It's Wrong | ✅ Correct Approach |
|
||||||
|
|----------------|---------------|-------------------|
|
||||||
|
| Health checks logged at INFO (`GET /health → 200 OK`) | Routine HAProxy/Prometheus probes flood syslog with thousands of identical lines per hour, burying real events. | Suppress health endpoints from access logs entirely, or demote to DEBUG. |
|
||||||
|
| DEBUG with no context (`logger.debug("error occurred")`) | Provides zero diagnostic value. If DEBUG is noisy *and* useless, nobody will ever enable it. | `logger.debug("PaymentService.process failed: order_id=%s, provider=%s, response=%r", oid, provider, resp)` |
|
||||||
|
| ERROR without exception details (`logger.error("task failed")`) | Cannot be triaged without reproduction steps. Wastes on-call time. | `logger.error("Celery task invoice_gen failed: order_id=%s", oid, exc_info=True)` |
|
||||||
|
| Logging sensitive data at any level | Passwords, tokens, API keys, and PII in Loki are a security incident. | Mask or redact: `api_key=sk-...a3f2`, `password=*****`. |
|
||||||
|
| Inconsistent level casing | Breaks LogQL filters and Grafana label selectors. | **Python / Django**: UPPERCASE (`INFO`, `WARNING`, `ERROR`, `DEBUG`). **Go / infrastructure** (HAProxy, Alloy, Gitea): lowercase (`info`, `warn`, `error`, `debug`). |
|
||||||
|
| Logging expected conditions as ERROR | A user entering a wrong password is not an error — it is normal business logic. | Use WARNING or INFO for expected-but-notable conditions. Reserve ERROR for things that are actually broken. |
|
||||||
|
|
||||||
|
### Health Check Rule
|
||||||
|
|
||||||
|
> All services exposed through HAProxy MUST suppress or demote health check endpoints (`/health`, `/healthz`, `/api/health`, `/metrics`, `/ping`) to DEBUG or below. Health check success is the *absence* of errors, not the presence of 200s. If your syslog shows a successful health probe, your log level is wrong.
|
||||||
|
|
||||||
|
**Implementation guidance:**
|
||||||
|
- **Django / Gunicorn**: Filter health paths in the access log handler or use middleware that skips logging for probe user-agents.
|
||||||
|
- **FastAPI / Uvicorn**: Add a `logging.Filter` on the `uvicorn.access` logger that matches health paths in the access log message. Uvicorn's access log format includes the full request line in quotes (e.g., `"GET /live HTTP/1.1"`), so filter regexes must account for that. See also the structured logging notes below.
|
||||||
|
- **Docker services**: Configure the application's internal logging to exclude health routes — the syslog driver forwards everything it receives.
|
||||||
|
- **HAProxy**: HAProxy's own health check logs (`option httpchk`) should remain at the HAProxy level for connection debugging, but backend application responses to those probes must not surface at INFO.
|
||||||
|
|
||||||
|
### Background Worker & Queue Monitoring
|
||||||
|
|
||||||
|
> **The most dangerous failure is the one that produces no logs.**
|
||||||
|
|
||||||
|
When a background worker (Celery task consumer, RabbitMQ subscriber, Gitea Runner, cron job) fails to start or crashes on startup, it generates no ongoing log output. Error-rate dashboards stay green because there is no process running to produce errors. Meanwhile, queues grow unbounded and work silently stops being processed.
|
||||||
|
|
||||||
|
**Required practices:**
|
||||||
|
|
||||||
|
1. **Heartbeat logging** — Every long-running background worker MUST emit a periodic INFO-level heartbeat (e.g., `"worker alive, processed N jobs in last 5m, queue depth: M"`). The *absence* of this heartbeat is the alertable condition.
|
||||||
|
|
||||||
|
2. **Startup and shutdown at INFO** — Worker start, ready, graceful shutdown, and crash-exit are significant lifecycle events. These MUST log at INFO.
|
||||||
|
|
||||||
|
3. **Queue depth as a metric** — RabbitMQ queue depths and any application-level task queues MUST be exposed as Prometheus metrics. A growing queue with zero consumer activity is an **ERROR**-level alert, not a warning.
|
||||||
|
|
||||||
|
4. **Grafana "last seen" alerts** — For every background worker, configure a Grafana alert using `absent_over_time()` or equivalent staleness detection: *"Worker X has not logged a heartbeat in >10 minutes"* → ERROR severity → Pushover notification.
|
||||||
|
|
||||||
|
5. **Crash-on-start is ERROR** — If a worker exits within seconds of starting (missing config, failed DB connection, import error), the exit MUST be captured at ERROR level by the service manager (`systemd OnFailure=`, Docker restart policy logs). Do not rely on the crashing application to log its own death — it may never get the chance.
|
||||||
|
|
||||||
|
### Production Defaults
|
||||||
|
|
||||||
|
| Service Category | Default Level | Rationale |
|
||||||
|
|-----------------|---------------|-----------|
|
||||||
|
| Django apps (Angelia, Athena, Kairos, Icarlos, Spelunker, Peitho, MCP Switchboard) | `WARNING` | Business logic — only degraded or broken conditions surface. Lifecycle events (start/stop/deploy) still log at INFO via Gunicorn and systemd. |
|
||||||
|
| FastAPI apps (Periplus) | `WARNING` | Same rationale as Django. Uvicorn lifecycle events (start/stop) are pinned to INFO via the `uvicorn.error` logger regardless of app log level. |
|
||||||
|
| Gunicorn access logs | Suppress 2xx/3xx health probes | Routine request logging deferred to HAProxy access logs in Loki. |
|
||||||
|
| Infrastructure agents (Alloy, Prometheus, Node Exporter) | `warn` | Stable — do not change without cause. |
|
||||||
|
| HAProxy (Titania) | `warning` | Connection-level logging handled by HAProxy's own log format → Alloy → Loki. |
|
||||||
|
| Databases (PostgreSQL, Neo4j) | `warning` | Query-level logging only enabled for active troubleshooting. |
|
||||||
|
| Docker services (Gitea, LobeChat, Nextcloud, AnythingLLM, SearXNG) | `warn` / `warning` | Per-service default. Tune individually if needed. |
|
||||||
|
| LLM Proxy (Arke) | `info` | Token usage tracking and provider routing decisions justify INFO. Review periodically for noise. |
|
||||||
|
| Observability stack (Grafana, Loki, AlertManager) | `warn` | Should be quiet unless something is wrong with observability itself. |
|
||||||
|
|
||||||
|
### Structured Logging — FastAPI / Uvicorn
|
||||||
|
|
||||||
|
FastAPI apps using uvicorn require special handling to achieve JSON-structured log output for the Alloy → Loki pipeline. Uvicorn manages its own loggers aggressively, and naive approaches will fail silently.
|
||||||
|
|
||||||
|
**Required practices:**
|
||||||
|
|
||||||
|
1. **Override uvicorn's handlers, don't just add to root** — Uvicorn's `config.load()` creates its own `StreamHandler` instances on `uvicorn`, `uvicorn.error`, and `uvicorn.access`. You must remove these handlers and set `propagate = True` so log records flow to the root logger where your JSON formatter lives.
|
||||||
|
|
||||||
|
2. **Re-apply logging config in the lifespan** — Configuring logging at module import time is not sufficient. Uvicorn's `config.load()` runs *after* your module is imported but *before* the ASGI lifespan starts. Call your logging configuration function again inside the FastAPI `lifespan` context manager to recapture control.
|
||||||
|
|
||||||
|
3. **Remap uvicorn logger names** — Uvicorn uses `uvicorn.error` for all lifecycle messages (startup, shutdown, errors) despite the misleading name. Remap it to `uvicorn` in your JSON formatter's output for clarity in Loki queries.
|
||||||
|
|
||||||
|
4. **Use `pydantic-settings` with `extra = "ignore"`** — When loading config from `.env` files that contain variables for other services (e.g., oauth2-proxy), pydantic-settings will reject unknown fields by default. Always set `extra = "ignore"` in the model config.
|
||||||
|
|
||||||
|
### Loki & Grafana Alignment
|
||||||
|
|
||||||
|
**Label normalization**: Alloy pipelines (syslog listeners and journal relabeling) MUST extract and forward a `level` label on every log line. Without a `level` label, the log entry is invisible to level-based dashboard filters and alert rules.
|
||||||
|
|
||||||
|
**LogQL conventions for dashboards:**
|
||||||
|
```logql
|
||||||
|
# Production error monitoring (default dashboard view)
|
||||||
|
{job="syslog", hostname="puck"} | json | level=~"error|fatal|critical"
|
||||||
|
|
||||||
|
# Warning-and-above for a specific service
|
||||||
|
{service_name="haproxy"} | logfmt | level=~"warn|error|fatal"
|
||||||
|
|
||||||
|
# Debug-level troubleshooting (temporary, never permanent dashboards)
|
||||||
|
{container="angelia"} | json | level="debug"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Alerting rules** — Grafana alert rules MUST key off the normalized `level` label:
|
||||||
|
- `level=~"error|fatal|critical"` → Immediate Pushover notification via AlertManager
|
||||||
|
- `absent_over_time({service_name="celery_worker"}[10m])` → Worker heartbeat staleness → ERROR severity
|
||||||
|
- Rate-based: `rate({service_name="arke"} | json | level="error" [5m]) > 0.1` → Sustained error rate
|
||||||
|
|
||||||
|
**Retention alignment**: Loki retention policies should preserve ERROR and WARNING logs longer than DEBUG. DEBUG-level logs generated during troubleshooting sessions should have a short TTL or be explicitly cleaned up.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Health Check Endpoints
|
||||||
|
|
||||||
|
All services MUST expose Kubernetes-style health endpoints at these paths:
|
||||||
|
|
||||||
|
| Endpoint | Purpose | Auth |
|
||||||
|
|----------|---------|------|
|
||||||
|
| `GET /live` | **Liveness** — process is running and accepting connections | None |
|
||||||
|
| `GET /ready` | **Readiness** — process is running AND all dependencies (DB, cache, upstream APIs) are healthy | None |
|
||||||
|
| `GET /metrics` | Prometheus metrics | IP-restricted (no JWT) |
|
||||||
|
|
||||||
|
- HAProxy uses `health_path: /ready/` for backend health checks — return HTTP 200 when ready
|
||||||
|
- Health endpoints MUST NOT require authentication
|
||||||
|
- Third-party services use their native paths (`/api/health`, `/api/healthz`, `/-/healthy`, etc.)
|
||||||
|
|
||||||
|
### Docker Compose Healthchecks
|
||||||
|
|
||||||
|
Use `curl -f` (install curl in images if needed). Do not use `wget --spider`.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:8000/live"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 40s
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Endpoint Protection
|
||||||
|
|
||||||
|
| Protected (require valid JWT) | Unprotected |
|
||||||
|
|-------------------------------|-------------|
|
||||||
|
| All `/api/v1/*` routes | `GET /live` |
|
||||||
|
| | `GET /ready` |
|
||||||
|
| | `GET /metrics` (IP-restricted to internal networks) |
|
||||||
|
| | `GET /api/auth/login-url` |
|
||||||
|
| | `POST /api/auth/token` |
|
||||||
|
| | `POST /api/v1/telemetry` (sendBeacon cannot set headers) |
|
||||||
|
|
||||||
|
> **Why `/api/v1/telemetry` is unprotected**: The browser `sendBeacon` API cannot set `Authorization` headers. The telemetry endpoint must be open to receive client-side error reports and performance data, or browser errors will be silently lost.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prometheus Metrics
|
||||||
|
|
||||||
|
All services SHOULD expose `GET /metrics` in Prometheus exposition format, scraped by Prospero's Prometheus at 15s intervals.
|
||||||
|
|
||||||
|
- **IP-restricted** to internal networks: `10.10.0.0/24`, `172.16.0.0/12`, `127.0.0.0/8`
|
||||||
|
- No JWT required — HAProxy and Prometheus scrapers cannot authenticate
|
||||||
|
- Useful metrics to expose: request totals and durations, error rates, active connections, queue depths, dependency health
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Browser Telemetry
|
||||||
|
|
||||||
|
Frontend/browser code MUST report errors and performance data back to the server.
|
||||||
|
|
||||||
|
- Send to `POST /api/v1/telemetry` — unprotected endpoint
|
||||||
|
- Capture: JavaScript exceptions, promise rejections, resource load failures, performance metrics
|
||||||
|
- The server MUST log client-side exceptions at **WARNING** level (they indicate user-facing problems but are not server failures)
|
||||||
|
- Include enough context to reproduce: URL, user agent, error message, stack trace (if available)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Environment Variable Naming
|
||||||
|
|
||||||
|
All environment variables for an application MUST use a consistent prefix matching the service name (e.g., `PERIPLUS_`, `ARKE_`, `ANGELIA_`). This applies to every variable in the `.env` file, including those consumed by sidecar services like oauth2-proxy.
|
||||||
|
|
||||||
|
**Rules:**
|
||||||
|
- All vars in `.env` use the `SERVICENAME_` prefix — no exceptions
|
||||||
|
- `compose.yaml` maps prefixed vars to the sidecar's expected names (e.g., `OAUTH2_PROXY_CLIENT_ID: ${PERIPLUS_CASDOOR_CLIENT_ID}`)
|
||||||
|
- The application's Settings model SHOULD declare all prefixed vars, even those only consumed by sidecars, so the full configuration is documented in one place
|
||||||
|
- Every repo MUST include a `.env.example` with placeholder values for all required variables. Add `!.env.example` to `.gitignore` if a broad `.env.*` pattern would otherwise exclude it
|
||||||
|
- `.env` files with real secrets are ALWAYS gitignored — no exceptions
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Docker Networking
|
||||||
|
|
||||||
|
- Use the **default Docker bridge network** for simple deployments
|
||||||
|
- Add additional named networks only when required (e.g., isolating database traffic) or explicitly requested
|
||||||
|
- Do not define custom networks for single-service Docker Compose stacks
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Documentation Standards
|
||||||
|
|
||||||
|
Place documentation in the `/docs/` directory of the repository.
|
||||||
|
|
||||||
|
### HTML Documents
|
||||||
|
|
||||||
|
HTML documents must follow [docs/documentation_style_guide.html](documentation_style_guide.html).
|
||||||
|
|
||||||
|
- Use Bootstrap CDN with Bootswatch theme **Flatly**
|
||||||
|
- Include a dark mode toggle button in the navbar
|
||||||
|
- Use Bootstrap Icons for icons
|
||||||
|
- Use Bootstrap CSS for styles — avoid custom CSS
|
||||||
|
- Use **Mermaid** for diagrams
|
||||||
@@ -39,3 +39,4 @@ SERVER_NAME = "Nike — Football Data Platform"
|
|||||||
# IPs allowed to set X-Forwarded-Proto (your HAProxy host).
|
# IPs allowed to set X-Forwarded-Proto (your HAProxy host).
|
||||||
# '*' trusts all — safe when Nike's port is firewalled to HAProxy only.
|
# '*' trusts all — safe when Nike's port is firewalled to HAProxy only.
|
||||||
TRUSTED_PROXY_IPS = os.getenv('NIKE_TRUSTED_PROXY', '*')
|
TRUSTED_PROXY_IPS = os.getenv('NIKE_TRUSTED_PROXY', '*')
|
||||||
|
LOG_LEVEL = os.getenv('NIKE_LOG_LEVEL', 'WARNING')
|
||||||
|
|||||||
100
nike/logging_config.py
Normal file
100
nike/logging_config.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
"""
|
||||||
|
Structured logging configuration for Nike.
|
||||||
|
|
||||||
|
Follows Red Panda Standards:
|
||||||
|
- JSON-structured output for Alloy → Loki → Grafana pipeline
|
||||||
|
- UPPERCASE level names (Python convention)
|
||||||
|
- Health check paths suppressed from access logs
|
||||||
|
- Production default: WARNING (lifecycle events via uvicorn at INFO)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
# Paths that should never appear in access logs at INFO level.
|
||||||
|
# Health probes are the *absence* of errors, not the presence of 200s.
|
||||||
|
# Uvicorn access log format: '127.0.0.1:1234 - "GET /live HTTP/1.1" 200'
|
||||||
|
_HEALTH_PATHS = re.compile(r'"(GET|HEAD) /(live|ready|metrics)(/|\s)')
|
||||||
|
|
||||||
|
|
||||||
|
# Map uvicorn's internal logger names to cleaner names for Loki.
|
||||||
|
# uvicorn.error = lifecycle + errors, uvicorn.access = request logs.
|
||||||
|
_LOGGER_NAME_MAP = {
|
||||||
|
"uvicorn.error": "uvicorn",
|
||||||
|
"uvicorn.access": "uvicorn.access",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class JSONFormatter(logging.Formatter):
|
||||||
|
"""Emit one JSON object per log line for structured ingestion."""
|
||||||
|
|
||||||
|
def format(self, record: logging.LogRecord) -> str:
|
||||||
|
log_entry = {
|
||||||
|
"timestamp": datetime.fromtimestamp(record.created, tz=timezone.utc).isoformat(),
|
||||||
|
"level": record.levelname,
|
||||||
|
"logger": _LOGGER_NAME_MAP.get(record.name, record.name),
|
||||||
|
"message": record.getMessage(),
|
||||||
|
}
|
||||||
|
if record.exc_info and record.exc_info[0] is not None:
|
||||||
|
log_entry["exception"] = self.formatException(record.exc_info)
|
||||||
|
if hasattr(record, "request_id"):
|
||||||
|
log_entry["request_id"] = record.request_id
|
||||||
|
return json.dumps(log_entry, default=str)
|
||||||
|
|
||||||
|
|
||||||
|
class HealthCheckFilter(logging.Filter):
|
||||||
|
"""Suppress health/readiness/metrics probe lines from uvicorn access logs.
|
||||||
|
|
||||||
|
These endpoints are polled every 30s by HAProxy/Prometheus. Logging
|
||||||
|
them at INFO floods Loki with thousands of identical lines per hour.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def filter(self, record: logging.LogRecord) -> bool:
|
||||||
|
message = record.getMessage()
|
||||||
|
if _HEALTH_PATHS.search(message):
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def configure_logging(level: str = "WARNING") -> None:
|
||||||
|
"""Apply structured JSON logging to the root logger and uvicorn loggers.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
level: Log level string (DEBUG, INFO, WARNING, ERROR). Per Red Panda
|
||||||
|
Standards the production default for application code is WARNING.
|
||||||
|
"""
|
||||||
|
numeric_level = getattr(logging, level.upper(), logging.WARNING)
|
||||||
|
|
||||||
|
json_formatter = JSONFormatter()
|
||||||
|
health_filter = HealthCheckFilter()
|
||||||
|
|
||||||
|
# Root handler — all nike.* loggers inherit this
|
||||||
|
root = logging.getLogger()
|
||||||
|
root.setLevel(numeric_level)
|
||||||
|
|
||||||
|
# Remove any pre-existing handlers (avoids duplicate lines in containers)
|
||||||
|
for handler in root.handlers[:]:
|
||||||
|
root.removeHandler(handler)
|
||||||
|
|
||||||
|
stream_handler = logging.StreamHandler()
|
||||||
|
stream_handler.setFormatter(json_formatter)
|
||||||
|
root.addHandler(stream_handler)
|
||||||
|
|
||||||
|
# Force uvicorn loggers to propagate to root (where our JSON handler lives)
|
||||||
|
# and remove any handlers uvicorn's config.load() may have installed.
|
||||||
|
for uv_name in ("uvicorn", "uvicorn.error", "uvicorn.access"):
|
||||||
|
uv_logger = logging.getLogger(uv_name)
|
||||||
|
for h in uv_logger.handlers[:]:
|
||||||
|
uv_logger.removeHandler(h)
|
||||||
|
uv_logger.propagate = True
|
||||||
|
|
||||||
|
# Uvicorn access logger — suppress health probes
|
||||||
|
uvicorn_access = logging.getLogger("uvicorn.access")
|
||||||
|
uvicorn_access.addFilter(health_filter)
|
||||||
|
|
||||||
|
# Uvicorn error logger (startup/shutdown lifecycle) — always at INFO
|
||||||
|
# so "Started server process" and "Shutting down" are captured
|
||||||
|
uvicorn_error = logging.getLogger("uvicorn.error")
|
||||||
|
uvicorn_error.setLevel(logging.INFO)
|
||||||
106
nike/server.py
106
nike/server.py
@@ -14,6 +14,9 @@ Data flow:
|
|||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import ipaddress
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
from collections import deque
|
from collections import deque
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
@@ -23,9 +26,17 @@ from typing import Any
|
|||||||
|
|
||||||
import uvicorn
|
import uvicorn
|
||||||
from fastapi import FastAPI, Request
|
from fastapi import FastAPI, Request
|
||||||
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
|
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse, Response
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
from fastapi.templating import Jinja2Templates
|
from fastapi.templating import Jinja2Templates
|
||||||
|
from prometheus_client import (
|
||||||
|
CONTENT_TYPE_LATEST,
|
||||||
|
CollectorRegistry,
|
||||||
|
Counter,
|
||||||
|
Gauge,
|
||||||
|
Histogram,
|
||||||
|
generate_latest,
|
||||||
|
)
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from fastmcp import FastMCP
|
from fastmcp import FastMCP
|
||||||
@@ -34,6 +45,38 @@ from mcp.types import ToolAnnotations
|
|||||||
import nike.db as db
|
import nike.db as db
|
||||||
import nike.sportsdb as api
|
import nike.sportsdb as api
|
||||||
from nike import config
|
from nike import config
|
||||||
|
from nike.logging_config import configure_logging
|
||||||
|
|
||||||
|
# ── Structured logging ────────────────────────────────────
|
||||||
|
configure_logging(config.LOG_LEVEL)
|
||||||
|
logger = logging.getLogger("nike")
|
||||||
|
|
||||||
|
# ── Prometheus metrics ────────────────────────────────────
|
||||||
|
_registry = CollectorRegistry()
|
||||||
|
REQUEST_COUNT = Counter(
|
||||||
|
"nike_http_requests_total",
|
||||||
|
"Total HTTP requests",
|
||||||
|
["method", "path", "status"],
|
||||||
|
registry=_registry,
|
||||||
|
)
|
||||||
|
REQUEST_DURATION = Histogram(
|
||||||
|
"nike_http_request_duration_seconds",
|
||||||
|
"HTTP request duration in seconds",
|
||||||
|
["method", "path"],
|
||||||
|
registry=_registry,
|
||||||
|
)
|
||||||
|
DB_UP = Gauge(
|
||||||
|
"nike_database_up",
|
||||||
|
"1 if database connection is healthy, 0 otherwise",
|
||||||
|
registry=_registry,
|
||||||
|
)
|
||||||
|
|
||||||
|
_METRICS_ALLOWED_NETS = [
|
||||||
|
ipaddress.ip_network("10.10.0.0/24"),
|
||||||
|
ipaddress.ip_network("172.16.0.0/12"),
|
||||||
|
ipaddress.ip_network("127.0.0.0/8"),
|
||||||
|
ipaddress.ip_network("::1/128"),
|
||||||
|
]
|
||||||
|
|
||||||
# ── Request log ──────────────────────────────────────────
|
# ── Request log ──────────────────────────────────────────
|
||||||
_REQUEST_LOG: deque[dict] = deque(maxlen=500)
|
_REQUEST_LOG: deque[dict] = deque(maxlen=500)
|
||||||
@@ -696,6 +739,8 @@ _TEMPLATES = Jinja2Templates(
|
|||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def _lifespan(app: FastAPI):
|
async def _lifespan(app: FastAPI):
|
||||||
"""Start DB pool + MCP session manager on startup, close on shutdown."""
|
"""Start DB pool + MCP session manager on startup, close on shutdown."""
|
||||||
|
# Re-apply structured logging after uvicorn's config.load() overwrites handlers.
|
||||||
|
configure_logging(config.LOG_LEVEL)
|
||||||
db.create_pool(minconn=2, maxconn=10)
|
db.create_pool(minconn=2, maxconn=10)
|
||||||
async with _mcp_app.router.lifespan_context(app):
|
async with _mcp_app.router.lifespan_context(app):
|
||||||
yield
|
yield
|
||||||
@@ -704,6 +749,32 @@ async def _lifespan(app: FastAPI):
|
|||||||
|
|
||||||
dashboard = FastAPI(title="Nike Dashboard", lifespan=_lifespan)
|
dashboard = FastAPI(title="Nike Dashboard", lifespan=_lifespan)
|
||||||
|
|
||||||
|
# ── Request metrics middleware ────────────────────────────
|
||||||
|
_SKIP_METRICS_PREFIXES = ("/live", "/ready", "/metrics", "/mcp")
|
||||||
|
|
||||||
|
|
||||||
|
@dashboard.middleware("http")
|
||||||
|
async def prometheus_metrics_middleware(request: Request, call_next):
|
||||||
|
"""Record request count and duration for non-health/static paths."""
|
||||||
|
path = request.url.path
|
||||||
|
if any(path.startswith(p) for p in _SKIP_METRICS_PREFIXES):
|
||||||
|
return await call_next(request)
|
||||||
|
|
||||||
|
normalized = re.sub(
|
||||||
|
r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
|
||||||
|
"{id}",
|
||||||
|
path,
|
||||||
|
)
|
||||||
|
|
||||||
|
start = time.perf_counter()
|
||||||
|
response = await call_next(request)
|
||||||
|
duration = time.perf_counter() - start
|
||||||
|
|
||||||
|
REQUEST_COUNT.labels(method=request.method, path=normalized, status=response.status_code).inc()
|
||||||
|
REQUEST_DURATION.labels(method=request.method, path=normalized).observe(duration)
|
||||||
|
return response
|
||||||
|
|
||||||
|
|
||||||
# ── Dashboard routes ──────────────────────────────────────
|
# ── Dashboard routes ──────────────────────────────────────
|
||||||
|
|
||||||
_SVELTE_BUILD = Path(__file__).parent.parent / "dashboard" / "build"
|
_SVELTE_BUILD = Path(__file__).parent.parent / "dashboard" / "build"
|
||||||
@@ -813,6 +884,7 @@ async def api_run(body: _RunRequest):
|
|||||||
result = fn(**body.args)
|
result = fn(**body.args)
|
||||||
return JSONResponse({"ok": True, "result": result})
|
return JSONResponse({"ok": True, "result": result})
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
|
logger.error("Tool run failed: tool=%s args=%s", body.tool, body.args, exc_info=True)
|
||||||
return JSONResponse({"ok": False, "error": str(exc)}, status_code=500)
|
return JSONResponse({"ok": False, "error": str(exc)}, status_code=500)
|
||||||
|
|
||||||
|
|
||||||
@@ -835,15 +907,29 @@ async def readiness():
|
|||||||
|
|
||||||
|
|
||||||
@dashboard.get("/metrics")
|
@dashboard.get("/metrics")
|
||||||
async def metrics():
|
async def metrics(request: Request):
|
||||||
|
"""Prometheus exposition format metrics endpoint.
|
||||||
|
|
||||||
|
IP-restricted to internal networks per Red Panda Standards:
|
||||||
|
10.10.0.0/24, 172.16.0.0/12, 127.0.0.0/8. No auth required —
|
||||||
|
HAProxy and Prometheus scrapers cannot authenticate.
|
||||||
|
"""
|
||||||
|
client_ip = request.client.host if request.client else "0.0.0.0"
|
||||||
|
try:
|
||||||
|
addr = ipaddress.ip_address(client_ip)
|
||||||
|
except ValueError:
|
||||||
|
return PlainTextResponse("Forbidden", status_code=403)
|
||||||
|
|
||||||
|
if not any(addr in net for net in _METRICS_ALLOWED_NETS):
|
||||||
|
return PlainTextResponse("Forbidden", status_code=403)
|
||||||
|
|
||||||
db_check = db.check_connection()
|
db_check = db.check_connection()
|
||||||
counts = db.get_table_counts()
|
DB_UP.set(1 if db_check.get("connected") else 0)
|
||||||
uptime_s = int((datetime.now(timezone.utc) - _SERVER_START).total_seconds())
|
|
||||||
return JSONResponse({
|
return Response(
|
||||||
"uptime_seconds": uptime_s,
|
content=generate_latest(_registry),
|
||||||
"database": db_check,
|
media_type=CONTENT_TYPE_LATEST,
|
||||||
"table_counts": counts,
|
)
|
||||||
})
|
|
||||||
|
|
||||||
|
|
||||||
# ── Mount MCP onto dashboard ──────────────────────────────
|
# ── Mount MCP onto dashboard ──────────────────────────────
|
||||||
@@ -862,7 +948,7 @@ def main():
|
|||||||
host=config.SERVER_HOST,
|
host=config.SERVER_HOST,
|
||||||
port=config.SERVER_PORT,
|
port=config.SERVER_PORT,
|
||||||
reload=False,
|
reload=False,
|
||||||
log_level="info",
|
log_level=config.LOG_LEVEL.lower(),
|
||||||
ws="wsproto",
|
ws="wsproto",
|
||||||
# Trust X-Forwarded-Proto / X-Forwarded-For from HAProxy
|
# Trust X-Forwarded-Proto / X-Forwarded-For from HAProxy
|
||||||
proxy_headers=True,
|
proxy_headers=True,
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ dependencies = [
|
|||||||
"requests>=2.32",
|
"requests>=2.32",
|
||||||
"jinja2>=3.1",
|
"jinja2>=3.1",
|
||||||
"wsproto>=1.2",
|
"wsproto>=1.2",
|
||||||
|
"prometheus-client>=0.20",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
|||||||
Reference in New Issue
Block a user