feat(ansible): standardize Neo4j ports and add monitoring

- Unify Neo4j HTTP/Bolt/syslog ports across ariel and umbriel hosts
- Add neo4j_metrics_port (22094) for APOC exporter sidecar
- Add umbriel to Prometheus node_exporter targets
- Add Neo4j scrape config and alerts for exporter down, tx rollback
  rate, and stalled transactions (open transactions with zero throughput)
- Replace kernos_harper MCP with andromeda (caliban.helu.ca)
- Remove angelia MCP from kottos fastagent config
- Switch neo4j group membership from keeper_user to ponos
This commit is contained in:
2026-05-22 22:19:13 -04:00
parent 698ceacb74
commit 43fae203d1
9 changed files with 458 additions and 20 deletions

View File

@@ -9,7 +9,7 @@ services:
# Alloy
alloy_log_level: "warn"
neo4j_syslog_port: 22011
neo4j_syslog_port: 51414
# Neo4j
neo4j_user: neo4j
@@ -17,6 +17,7 @@ neo4j_group: neo4j
neo4j_directory: /srv/neo4j
neo4j_auth_user: neo4j
neo4j_auth_password: "{{ vault_neo4j_auth_password }}"
neo4j_http_port: 25554
neo4j_bolt_port: 7687
neo4j_http_port: 22084
neo4j_bolt_port: 22074
neo4j_metrics_port: 22094
neo4j_apoc_unrestricted: "apoc.*"

View File

@@ -72,6 +72,7 @@ prometheus_targets:
- 'sycorax.incus:9100'
- 'prospero.incus:9100'
- 'rosalind.incus:9100'
- 'umbriel.incus:9100'
# Prometheus OAuth2-Proxy Sidecar
prometheus_proxy_port: 9091

View File

@@ -13,7 +13,7 @@ services:
# Alloy
alloy_log_level: "warn"
neo4j_syslog_port: 22012
neo4j_syslog_port: 51414
# Neo4j
neo4j_user: neo4j
@@ -21,6 +21,7 @@ neo4j_group: neo4j
neo4j_directory: /srv/neo4j
neo4j_auth_user: neo4j
neo4j_auth_password: "{{ vault_mnemosyne_neo4j_auth_password }}"
neo4j_http_port: 25555
neo4j_bolt_port: 7687
neo4j_http_port: 22084
neo4j_bolt_port: 22074
neo4j_metrics_port: 22094
neo4j_apoc_unrestricted: "apoc.*"

View File

@@ -54,17 +54,11 @@ mcp:
url: "{{ kottos_grafana_url | default('http://miranda.incus:25533/mcp') }}"
# ── Shell + file operations — Kernos (Caliban) ───────────────────────────
kernos_harper:
andromeda:
transport: http
url: "{{ kottos_kernos_harper_url | default('http://korax.helu.ca:20261/mcp') }}"
url: "{{ kottos_kernos_harper_url | default('http://caliban.helu.ca:20261/mcp') }}"
load_on_start: false
# ── Angelia messaging ───────────────────────────────────────────────────
# Auth header provided by fastagent.secrets.yaml (vault-rendered).
angelia:
transport: http
url: "{{ kottos_angelia_url | default('https://ouranos.helu.ca/mcp/') }}"
# ── GitHub MCP Server (local Docker, stdio) ──────────────────────────────
# GITHUB_PERSONAL_ACCESS_TOKEN provided by fastagent.secrets.yaml
github:

View File

@@ -24,9 +24,9 @@
group: "{{neo4j_group}}"
system: true
- name: Add group neo4j to keeper_user
- name: Add group neo4j to user ponos
ansible.builtin.user:
name: "{{keeper_user}}"
name: ponos
groups: "{{neo4j_group}}"
append: true
@@ -38,6 +38,14 @@
state: directory
mode: '750'
# Pre-create the host-side data directory; the compose template bind-mounts
# "<neo4j_directory>/data" as the neo4j_data volume.
- name: Create neo4j data directory
  ansible.builtin.file:
    state: directory
    path: "{{ neo4j_directory }}/data"
    owner: "{{ neo4j_user }}"
    group: "{{ neo4j_group }}"
    mode: "750"
- name: Template docker-compose file
ansible.builtin.template:
src: docker-compose.yml.j2

View File

@@ -1,6 +1,7 @@
services:
neo4j:
image: neo4j:{{neo4j_image_version}}
image: neo4j:{{neo4j_version}}
pull_policy: always
container_name: neo4j
restart: unless-stopped
ports:
@@ -11,9 +12,11 @@ services:
- neo4j_logs:/logs
- neo4j_plugins:/plugins
environment:
NEO4J_AUTH: "{{neo4j_auth_user}}/{{neo4j_auth_password}}"
# APOC Plugin
NEO4J_PLUGINS: '["apoc"]'
NEO4J_AUTH: "{{neo4j_user}}/{{neo4j_password}}"
# APOC Plugin — core ("apoc") is required by apoc-extended.
# Listing only apoc-extended fails to expose apoc.version(),
# apoc.coll.*, apoc.date.* — declare both.
NEO4J_PLUGINS: '["apoc", "apoc-extended"]'
NEO4J_apoc_export_file_enabled: "true"
NEO4J_apoc_import_file_enabled: "true"
NEO4J_apoc_import_file_use__neo4j__config: "true"
@@ -25,7 +28,31 @@ services:
syslog-format: "{{syslog_format}}"
tag: "neo4j"
# Metrics sidecar: connects to Neo4j over Bolt and serves Prometheus metrics
# on container port 17687, published on the host as neo4j_metrics_port.
neo4j-exporter:
  image: stscoundrel/neo4j-apoc-exporter:v0.1.0
  # Fixed container name so the runbook command in the Neo4jExporterDown
  # alert (`docker logs neo4j-exporter`) resolves; without it Compose
  # generates a project-prefixed name.
  container_name: neo4j-exporter
  restart: unless-stopped
  ports:
    - "{{ neo4j_metrics_port }}:17687"
  # Mapping form with quoted values, matching the neo4j service, so a
  # templated value containing YAML specials cannot break the rendered file.
  environment:
    # Reaches Neo4j by its compose service name on port 7687; presumably the
    # container-internal Bolt port rather than the published neo4j_bolt_port —
    # confirm against the neo4j service's port mapping.
    NEO4J_URI: "bolt://neo4j:7687"
    # NOTE(review): the host vars visible in this change define
    # neo4j_auth_user / neo4j_auth_password; confirm neo4j_password is
    # defined elsewhere and that neo4j_user is the intended database login.
    NEO4J_USER: "{{ neo4j_user }}"
    NEO4J_PASSWORD: "{{ neo4j_password }}"
    # Must match the container side of the port mapping above.
    EXPORTER_PORT: "17687"
  depends_on:
    - neo4j
  logging:
    driver: syslog
    options:
      syslog-address: "tcp://127.0.0.1:{{ neo4j_syslog_port }}"
      syslog-format: "{{ syslog_format }}"
      tag: "neo4j-exporter"
# Named volumes used by the neo4j service.
volumes:
  # Data lives at a fixed host path via a bind-backed local volume; the
  # directory must already exist on the host when the stack starts.
  neo4j_data:
    driver: local
    driver_opts:
      o: bind
      type: none
      device: "{{ neo4j_directory }}/data"
  # Logs and plugins stay on default Docker-managed volumes.
  neo4j_logs: {}
  neo4j_plugins: {}

View File

@@ -384,6 +384,48 @@ groups:
summary: "Mnemosyne Celery backlog on {{ $labels.queue }}"
description: "Celery queue '{{ $labels.queue }}' has {{ $value }} pending tasks for more than 10 minutes — check the worker logs in Loki ({service=\"mnemosyne\", component=\"worker\"})."
# ============================================================================
# Neo4j Alerts (neo4j-apoc-exporter sidecar)
# ============================================================================
# Metrics come from stscoundrel/neo4j-apoc-exporter, which connects to
# Neo4j over Bolt and surfaces apoc.monitor.* gauges plus standard JVM
# metrics. "Exporter down" therefore covers both "exporter container
# crashed" and "exporter cannot reach Bolt" — either way Neo4j is
# effectively unobservable. Hostname-only — purpose of each instance
# is implied by the host (e.g. ariel = LLM memory, umbriel = Mnemosyne).
- name: neo4j_alerts
  rules:
    # Fires when Prometheus has been unable to scrape the sidecar for 5 minutes.
    - alert: Neo4jExporterDown
      expr: up{job="neo4j"} == 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "Neo4j exporter down on {{ $labels.instance }}"
        description: "The neo4j-apoc-exporter on {{ $labels.instance }} has been unreachable for more than 5 minutes. Either the sidecar container is down or it cannot connect to Neo4j over Bolt — check `docker ps` and `docker logs neo4j-exporter` on the host."
    # Rollback ratio = rolled-back tx rate / opened tx rate over 10 minutes.
    # The 0.0001 floor only guards against division by zero and matches the
    # dashboard's "Rollback ratio" panel. A floor of 1 would understate the
    # ratio on any instance opening fewer than one transaction per second,
    # so a quiet database could roll back every transaction without alerting.
    # NOTE(review): instance is the scrape target (host:port); confirm the
    # Loki hostname label in the description uses the same value.
    - alert: Neo4jHighRollbackRate
      expr: |
        rate(neo4j_monitor_tx_rolledBackTx[10m])
        / clamp_min(rate(neo4j_monitor_tx_totalOpenedTx[10m]), 0.0001) > 0.10
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "Neo4j transaction rollback rate above 10% on {{ $labels.instance }}"
        description: "More than 10% of transactions on {{ $labels.instance }} have rolled back over the last 10 minutes — check application logs in Loki ({job=\"neo4j\", hostname=\"{{ $labels.instance }}\"})."
    # Open transactions exist but no new transaction was opened in 15 minutes.
    # PromQL `and` keeps the sample value of its left-hand operand, so the
    # open-transaction gauge goes first: $value in the description is then the
    # number of open transactions rather than the (always zero) rate.
    # NOTE(review): the alert name mentions store growth, but the expression
    # watches transaction throughput — consider renaming once it is confirmed
    # that no Alertmanager routing depends on the current name.
    - alert: Neo4jStoreGrowthStalled
      expr: |
        neo4j_monitor_tx_currentOpenedTx > 0
        and rate(neo4j_monitor_tx_totalOpenedTx[15m]) == 0
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: "Neo4j has open transactions but zero throughput on {{ $labels.instance }}"
        description: "{{ $labels.instance }} shows {{ $value }} currently-open transactions but no new transactions opened in 15 minutes — possible Bolt-side hang or stuck query."
# Red Panda Seal of Approval 🐼
# "If the metrics aren't red, go back to bed"
{% endraw %}

View File

@@ -62,4 +62,17 @@ scrape_configs:
metrics_path: '/metrics'
scrape_interval: 15s
# Neo4j — stscoundrel/neo4j-apoc-exporter sidecar connects to the local
# Neo4j over Bolt and exposes apoc.monitor.* (tx/ids/store) plus JVM
# metrics on the standard metrics port (22094). Both Ariel (LLM memory
# via neo4j-cypher MCP) and Umbriel (Mnemosyne graph+vector DB) use the
# same port — they are differentiated by hostname only.
# One target per Neo4j host, each scraped at /metrics every 15 seconds.
- job_name: "neo4j"
  scrape_interval: 15s
  metrics_path: "/metrics"
  static_configs:
    - targets:
        - "ariel.incus:22094"
        - "umbriel.incus:22094"
# Red Panda Approved Prometheus Configuration

351
dashboards/neo4j.json Normal file
View File

@@ -0,0 +1,351 @@
{
"title": "Neo4j",
"uid": "neo4j",
"tags": ["neo4j", "graph"],
"timezone": "browser",
"schemaVersion": 39,
"version": 1,
"editable": true,
"fiscalYearStartMonth": 0,
"weekStart": "",
"refresh": "30s",
"time": {"from": "now-1h", "to": "now"},
"templating": {
"list": [
{
"name": "loki",
"type": "datasource",
"query": "loki",
"current": {"selected": false, "text": "Loki", "value": "Loki"},
"hide": 0,
"label": "Loki datasource"
},
{
"name": "prom",
"type": "datasource",
"query": "prometheus",
"current": {"selected": false, "text": "Prometheus", "value": "Prometheus"},
"hide": 0,
"label": "Prometheus datasource"
},
{
"name": "instance",
"type": "query",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"query": "label_values(up{job=\"neo4j\"}, instance)",
"refresh": 1,
"includeAll": true,
"multi": true,
"current": {"selected": true, "text": "All", "value": "$__all"},
"label": "Instance"
}
]
},
"panels": [
{
"id": 1,
"type": "row",
"title": "Overview",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}
},
{
"id": 2,
"type": "stat",
"title": "Exporter up",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 1},
"targets": [
{
"refId": "A",
"expr": "up{job=\"neo4j\", instance=~\"$instance\"}",
"legendFormat": "{{instance}}"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 3,
"type": "stat",
"title": "Nodes",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 1},
"targets": [
{
"refId": "A",
"expr": "neo4j_monitor_ids_nodeIds{instance=~\"$instance\"}",
"legendFormat": "{{instance}}"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "short"}}
},
{
"id": 4,
"type": "stat",
"title": "Relationships",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 1},
"targets": [
{
"refId": "A",
"expr": "neo4j_monitor_ids_relIds{instance=~\"$instance\"}",
"legendFormat": "{{instance}}"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "short"}}
},
{
"id": 5,
"type": "stat",
"title": "Total store size",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 1},
"targets": [
{
"refId": "A",
"expr": "neo4j_monitor_store_totalStoreSize{instance=~\"$instance\"}",
"legendFormat": "{{instance}}"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "bytes"}}
},
{
"id": 10,
"type": "row",
"title": "Transactions",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}
},
{
"id": 11,
"type": "timeseries",
"title": "Transaction open rate (per second)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 6},
"targets": [
{
"refId": "A",
"expr": "rate(neo4j_monitor_tx_totalOpenedTx{instance=~\"$instance\"}[5m])",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {"defaults": {"unit": "ops"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 12,
"type": "timeseries",
"title": "Currently open transactions",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 6},
"targets": [
{
"refId": "A",
"expr": "neo4j_monitor_tx_currentOpenedTx{instance=~\"$instance\"}",
"legendFormat": "{{instance}} current"
},
{
"refId": "B",
"expr": "neo4j_monitor_tx_peakTx{instance=~\"$instance\"}",
"legendFormat": "{{instance}} peak"
}
],
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 13,
"type": "stat",
"title": "Rollback ratio (10m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 12, "x": 0, "y": 14},
"targets": [
{
"refId": "A",
"expr": "rate(neo4j_monitor_tx_rolledBackTx{instance=~\"$instance\"}[10m]) / clamp_min(rate(neo4j_monitor_tx_totalOpenedTx{instance=~\"$instance\"}[10m]), 0.0001)",
"legendFormat": "{{instance}}"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.05}, {"color": "red", "value": 0.10}]}}}
},
{
"id": 14,
"type": "stat",
"title": "Last tx ID",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 12, "x": 12, "y": 14},
"targets": [
{
"refId": "A",
"expr": "neo4j_monitor_tx_lastTxId{instance=~\"$instance\"}",
"legendFormat": "{{instance}}"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "short"}}
},
{
"id": 20,
"type": "row",
"title": "Store breakdown",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 18}
},
{
"id": 21,
"type": "timeseries",
"title": "Store size by component",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 19},
"targets": [
{
"refId": "A",
"expr": "neo4j_monitor_store_nodeStoreSize{instance=~\"$instance\"}",
"legendFormat": "{{instance}} nodes"
},
{
"refId": "B",
"expr": "neo4j_monitor_store_relStoreSize{instance=~\"$instance\"}",
"legendFormat": "{{instance}} rels"
},
{
"refId": "C",
"expr": "neo4j_monitor_store_propStoreSize{instance=~\"$instance\"}",
"legendFormat": "{{instance}} props"
},
{
"refId": "D",
"expr": "neo4j_monitor_store_stringStoreSize{instance=~\"$instance\"}",
"legendFormat": "{{instance}} strings"
},
{
"refId": "E",
"expr": "neo4j_monitor_store_arrayStoreSize{instance=~\"$instance\"}",
"legendFormat": "{{instance}} arrays"
}
],
"fieldConfig": {"defaults": {"unit": "bytes"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 22,
"type": "timeseries",
"title": "Transaction log size",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 19},
"targets": [
{
"refId": "A",
"expr": "neo4j_monitor_store_logSize{instance=~\"$instance\"}",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {"defaults": {"unit": "bytes"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 30,
"type": "row",
"title": "Exporter JVM (sidecar health)",
"collapsed": true,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 27}
},
{
"id": 31,
"type": "timeseries",
"title": "Exporter JVM heap used / max",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 28},
"targets": [
{
"refId": "A",
"expr": "jvm_memory_used_bytes{job=\"neo4j\", area=\"heap\", instance=~\"$instance\"}",
"legendFormat": "{{instance}} used"
},
{
"refId": "B",
"expr": "jvm_memory_max_bytes{job=\"neo4j\", area=\"heap\", instance=~\"$instance\"}",
"legendFormat": "{{instance}} max"
}
],
"fieldConfig": {"defaults": {"unit": "bytes"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 32,
"type": "timeseries",
"title": "Exporter GC time",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 28},
"targets": [
{
"refId": "A",
"expr": "rate(jvm_gc_collection_seconds_sum{job=\"neo4j\", instance=~\"$instance\"}[5m])",
"legendFormat": "{{instance}} {{gc}}"
}
],
"fieldConfig": {"defaults": {"unit": "s"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 40,
"type": "row",
"title": "Logs",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 36}
},
{
"id": 41,
"type": "timeseries",
"title": "Neo4j log rate by host",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 37},
"targets": [
{
"refId": "A",
"expr": "sum by (hostname) (rate({job=\"neo4j\"}[5m]))",
"legendFormat": "{{hostname}}"
}
],
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 42,
"type": "logs",
"title": "Neo4j — errors/warnings/exceptions (last 50 lines)",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 37},
"targets": [
{
"refId": "A",
"expr": "{job=\"neo4j\"} |~ \"(?i)error|warn|exception\"",
"maxLines": 50
}
],
"options": {"showLabels": true, "showTime": true, "wrapLogMessage": true}
},
{
"id": 43,
"type": "logs",
"title": "Neo4j — all logs (live tail)",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 45},
"targets": [
{
"refId": "A",
"expr": "{job=\"neo4j\"}",
"maxLines": 100
}
],
"options": {"showLabels": true, "showTime": true, "wrapLogMessage": true}
}
]
}