feat(ansible): standardize Neo4j ports and add monitoring

- Unify Neo4j HTTP/Bolt/syslog ports across ariel and umbriel hosts
- Add neo4j_metrics_port (22094) for APOC exporter sidecar
- Add umbriel to Prometheus node_exporter targets
- Add Neo4j scrape config and alerts for exporter down, tx rollback
  rate, and stalled transaction throughput (Neo4jStoreGrowthStalled)
- Replace kernos_harper MCP with andromeda (caliban.helu.ca)
- Remove angelia MCP from kottos fastagent config
- Switch neo4j group membership from keeper_user to ponos
This commit is contained in:
2026-05-22 22:19:13 -04:00
parent 698ceacb74
commit 43fae203d1
9 changed files with 458 additions and 20 deletions

View File

@@ -9,7 +9,7 @@ services:
# Alloy
alloy_log_level: "warn"
neo4j_syslog_port: 22011
neo4j_syslog_port: 51414
# Neo4j
neo4j_user: neo4j
@@ -17,6 +17,7 @@ neo4j_group: neo4j
neo4j_directory: /srv/neo4j
neo4j_auth_user: neo4j
neo4j_auth_password: "{{ vault_neo4j_auth_password }}"
neo4j_http_port: 25554
neo4j_bolt_port: 7687
neo4j_http_port: 22084
neo4j_bolt_port: 22074
neo4j_metrics_port: 22094
neo4j_apoc_unrestricted: "apoc.*"

View File

@@ -72,6 +72,7 @@ prometheus_targets:
- 'sycorax.incus:9100'
- 'prospero.incus:9100'
- 'rosalind.incus:9100'
- 'umbriel.incus:9100'
# Prometheus OAuth2-Proxy Sidecar
prometheus_proxy_port: 9091

View File

@@ -13,7 +13,7 @@ services:
# Alloy
alloy_log_level: "warn"
neo4j_syslog_port: 22012
neo4j_syslog_port: 51414
# Neo4j
neo4j_user: neo4j
@@ -21,6 +21,7 @@ neo4j_group: neo4j
neo4j_directory: /srv/neo4j
neo4j_auth_user: neo4j
neo4j_auth_password: "{{ vault_mnemosyne_neo4j_auth_password }}"
neo4j_http_port: 25555
neo4j_bolt_port: 7687
neo4j_http_port: 22084
neo4j_bolt_port: 22074
neo4j_metrics_port: 22094
neo4j_apoc_unrestricted: "apoc.*"

View File

@@ -54,17 +54,11 @@ mcp:
url: "{{ kottos_grafana_url | default('http://miranda.incus:25533/mcp') }}"
# ── Shell + file operations — Andromeda (Caliban) ────────────────────────
kernos_harper:
andromeda:
transport: http
url: "{{ kottos_kernos_harper_url | default('http://korax.helu.ca:20261/mcp') }}"
url: "{{ kottos_kernos_harper_url | default('http://caliban.helu.ca:20261/mcp') }}"
load_on_start: false
# ── Angelia messaging ───────────────────────────────────────────────────
# Auth header provided by fastagent.secrets.yaml (vault-rendered).
angelia:
transport: http
url: "{{ kottos_angelia_url | default('https://ouranos.helu.ca/mcp/') }}"
# ── GitHub MCP Server (local Docker, stdio) ──────────────────────────────
# GITHUB_PERSONAL_ACCESS_TOKEN provided by fastagent.secrets.yaml
github:

View File

@@ -24,9 +24,9 @@
group: "{{neo4j_group}}"
system: true
- name: Add group neo4j to keeper_user
- name: Add group neo4j to user ponos
ansible.builtin.user:
name: "{{keeper_user}}"
name: ponos
groups: "{{neo4j_group}}"
append: true
@@ -38,6 +38,14 @@
state: directory
mode: '750'
# Ensure <neo4j_directory>/data exists before the compose file is templated.
# The compose template in this change binds its neo4j_data volume to this
# same path, so the directory has to be present on the host.
- name: Create neo4j data directory
  ansible.builtin.file:
    state: directory
    path: "{{neo4j_directory}}/data"
    mode: '750'
    owner: "{{neo4j_user}}"
    group: "{{neo4j_group}}"
- name: Template docker-compose file
ansible.builtin.template:
src: docker-compose.yml.j2

View File

@@ -1,6 +1,7 @@
services:
neo4j:
image: neo4j:{{neo4j_image_version}}
image: neo4j:{{neo4j_version}}
pull_policy: always
container_name: neo4j
restart: unless-stopped
ports:
@@ -11,9 +12,11 @@ services:
- neo4j_logs:/logs
- neo4j_plugins:/plugins
environment:
NEO4J_AUTH: "{{neo4j_auth_user}}/{{neo4j_auth_password}}"
# APOC Plugin
NEO4J_PLUGINS: '["apoc"]'
NEO4J_AUTH: "{{neo4j_user}}/{{neo4j_password}}"
# APOC Plugin — core ("apoc") is required by apoc-extended.
# Listing only apoc-extended fails to expose apoc.version(),
# apoc.coll.*, apoc.date.* — declare both.
NEO4J_PLUGINS: '["apoc", "apoc-extended"]'
NEO4J_apoc_export_file_enabled: "true"
NEO4J_apoc_import_file_enabled: "true"
NEO4J_apoc_import_file_use__neo4j__config: "true"
@@ -25,7 +28,31 @@ services:
syslog-format: "{{syslog_format}}"
tag: "neo4j"
neo4j-exporter:
  # Metrics sidecar: connects to the neo4j service over Bolt on the compose
  # network (container port 7687) and serves metrics on container port 17687,
  # published on the host as neo4j_metrics_port.
  image: stscoundrel/neo4j-apoc-exporter:v0.1.0
  # Pin the container name, as the neo4j service does, so that
  # `docker logs neo4j-exporter` resolves instead of a compose-generated name.
  container_name: neo4j-exporter
  restart: unless-stopped
  ports:
    - "{{neo4j_metrics_port}}:17687"
  environment:
    # Mapping form with quoted values, matching the neo4j service, so that
    # templated values cannot be re-typed or truncated by the YAML parser.
    NEO4J_URI: "bolt://neo4j:7687"
    # NOTE(review): the host vars visible in this change define
    # neo4j_auth_user / neo4j_auth_password; confirm that neo4j_user and
    # neo4j_password resolve to the database credentials.
    NEO4J_USER: "{{neo4j_user}}"
    NEO4J_PASSWORD: "{{neo4j_password}}"
    # Must match the container side of the port mapping above.
    EXPORTER_PORT: "17687"
  depends_on:
    - neo4j
  logging:
    driver: syslog
    options:
      syslog-address: "tcp://127.0.0.1:{{neo4j_syslog_port}}"
      syslog-format: "{{syslog_format}}"
      tag: "neo4j-exporter"
# Volumes referenced by the services above.
volumes:
  # Data volume: the local driver bind-mounts <neo4j_directory>/data from the
  # host, so that directory must already exist when the stack starts.
  neo4j_data:
    driver: local
    driver_opts:
      o: bind
      type: none
      device: {{neo4j_directory}}/data
  # Logs and plugins carry no options of their own.
  neo4j_logs:
  neo4j_plugins:

View File

@@ -384,6 +384,48 @@ groups:
summary: "Mnemosyne Celery backlog on {{ $labels.queue }}"
description: "Celery queue '{{ $labels.queue }}' has {{ $value }} pending tasks for more than 10 minutes — check the worker logs in Loki ({service=\"mnemosyne\", component=\"worker\"})."
# ============================================================================
# Neo4j Alerts (neo4j-apoc-exporter sidecar)
# ============================================================================
# Metrics come from stscoundrel/neo4j-apoc-exporter, which connects to
# Neo4j over Bolt and surfaces apoc.monitor.* gauges plus standard JVM
# metrics. "Exporter down" therefore covers both "exporter container
# crashed" and "exporter cannot reach Bolt" — either way Neo4j is
# effectively unobservable. Hostname-only — purpose of each instance
# is implied by the host (e.g. ariel = LLM memory, umbriel = Mnemosyne).
- name: neo4j_alerts
  rules:
    # Scrape target for the neo4j job has been down for 5 minutes.
    - alert: Neo4jExporterDown
      expr: up{job="neo4j"} == 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "Neo4j exporter down on {{ $labels.instance }}"
        description: "The neo4j-apoc-exporter on {{ $labels.instance }} has been unreachable for more than 5 minutes. Either the sidecar container is down or it cannot connect to Neo4j over Bolt — check `docker ps` and `docker logs neo4j-exporter` on the host."

    # Share of opened transactions that rolled back over the last 10 minutes.
    # The denominator is deliberately not clamped: rate() is per second, so a
    # floor of 1 would understate the ratio (and mute the alert) on any
    # instance opening fewer than one transaction per second. With no traffic
    # the division yields NaN, which never satisfies "> 0.10".
    # NOTE(review): $labels.instance is the scrape target address (host:port
    # in the neo4j job's static targets) unless relabelled — confirm it
    # matches the Loki hostname label used in the description.
    - alert: Neo4jHighRollbackRate
      expr: |
        rate(neo4j_monitor_tx_rolledBackTx[10m])
          / rate(neo4j_monitor_tx_totalOpenedTx[10m]) > 0.10
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "Neo4j transaction rollback rate above 10% on {{ $labels.instance }}"
        description: "More than 10% of transactions on {{ $labels.instance }} have rolled back over the last 10 minutes — check application logs in Loki ({job=\"neo4j\", hostname=\"{{ $labels.instance }}\"})."

    # Transactions are open but none have been opened for 15 minutes.
    # The open-transaction gauge is the left operand because "and" keeps the
    # left-hand sample; this makes $value the open-transaction count that the
    # description reports, rather than the constant 0 of the rate test.
    # NOTE(review): the alert name mentions store growth although the
    # expression checks transaction throughput; the name is kept unchanged so
    # anything keyed on it keeps matching.
    - alert: Neo4jStoreGrowthStalled
      expr: |
        neo4j_monitor_tx_currentOpenedTx > 0
          and rate(neo4j_monitor_tx_totalOpenedTx[15m]) == 0
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: "Neo4j has open transactions but zero throughput on {{ $labels.instance }}"
        description: "{{ $labels.instance }} shows {{ $value }} currently-open transactions but no new transactions opened in 15 minutes — possible Bolt-side hang or stuck query."
# Red Panda Seal of Approval 🐼
# "If the metrics aren't red, go back to bed"
{% endraw %}

View File

@@ -62,4 +62,17 @@ scrape_configs:
metrics_path: '/metrics'
scrape_interval: 15s
# Neo4j — scraped through the stscoundrel/neo4j-apoc-exporter sidecar, which
# talks to the local Neo4j over Bolt and publishes apoc.monitor.*
# (tx/ids/store) and JVM metrics on the shared metrics port, 22094.
# Ariel (LLM memory via neo4j-cypher MCP) and Umbriel (Mnemosyne
# graph+vector DB) expose the same port, so only the hostname tells the two
# instances apart.
- job_name: 'neo4j'
  scrape_interval: 15s
  metrics_path: '/metrics'
  static_configs:
    - targets:
        - 'ariel.incus:22094'
        - 'umbriel.incus:22094'
# Red Panda Approved Prometheus Configuration