From 43fae203d18903fe6f7bbf736870eda29cc96f92 Mon Sep 17 00:00:00 2001 From: Robert Helewka Date: Fri, 22 May 2026 22:19:13 -0400 Subject: [PATCH] feat(ansible): standardize Neo4j ports and add monitoring - Unify Neo4j HTTP/Bolt/syslog ports across ariel and umbriel hosts - Add neo4j_metrics_port (22094) for APOC exporter sidecar - Add umbriel to Prometheus node_exporter targets - Add Neo4j scrape config and alerts for tx rollback rate and stalled store growth - Replace kernos_harper MCP with andromeda (caliban.helu.ca) - Remove angelia MCP from kottos fastagent config - Switch neo4j group membership from keeper_user to ponos --- ansible/inventory/host_vars/ariel.incus.yml | 7 +- .../inventory/host_vars/prospero.incus.yml | 1 + ansible/inventory/host_vars/umbriel.incus.yml | 7 +- ansible/kottos/fastagent.config.yaml.j2 | 10 +- ansible/neo4j/deploy.yml | 12 +- ansible/neo4j/docker-compose.yml.j2 | 35 +- ansible/pplg/alert_rules.yml.j2 | 42 +++ ansible/pplg/prometheus.yml.j2 | 13 + dashboards/neo4j.json | 351 ++++++++++++++++++ 9 files changed, 458 insertions(+), 20 deletions(-) create mode 100644 dashboards/neo4j.json diff --git a/ansible/inventory/host_vars/ariel.incus.yml b/ansible/inventory/host_vars/ariel.incus.yml index 69ddff6..573c3ab 100644 --- a/ansible/inventory/host_vars/ariel.incus.yml +++ b/ansible/inventory/host_vars/ariel.incus.yml @@ -9,7 +9,7 @@ services: # Alloy alloy_log_level: "warn" -neo4j_syslog_port: 22011 +neo4j_syslog_port: 51414 # Neo4j neo4j_user: neo4j @@ -17,6 +17,7 @@ neo4j_group: neo4j neo4j_directory: /srv/neo4j neo4j_auth_user: neo4j neo4j_auth_password: "{{ vault_neo4j_auth_password }}" -neo4j_http_port: 25554 -neo4j_bolt_port: 7687 +neo4j_http_port: 22084 +neo4j_bolt_port: 22074 +neo4j_metrics_port: 22094 neo4j_apoc_unrestricted: "apoc.*" diff --git a/ansible/inventory/host_vars/prospero.incus.yml b/ansible/inventory/host_vars/prospero.incus.yml index 5be562a..f407559 100644 --- a/ansible/inventory/host_vars/prospero.incus.yml +++ 
b/ansible/inventory/host_vars/prospero.incus.yml @@ -72,6 +72,7 @@ prometheus_targets: - 'sycorax.incus:9100' - 'prospero.incus:9100' - 'rosalind.incus:9100' + - 'umbriel.incus:9100' # Prometheus OAuth2-Proxy Sidecar prometheus_proxy_port: 9091 diff --git a/ansible/inventory/host_vars/umbriel.incus.yml b/ansible/inventory/host_vars/umbriel.incus.yml index e2dc614..ee84f69 100644 --- a/ansible/inventory/host_vars/umbriel.incus.yml +++ b/ansible/inventory/host_vars/umbriel.incus.yml @@ -13,7 +13,7 @@ services: # Alloy alloy_log_level: "warn" -neo4j_syslog_port: 22012 +neo4j_syslog_port: 51414 # Neo4j neo4j_user: neo4j @@ -21,6 +21,7 @@ neo4j_group: neo4j neo4j_directory: /srv/neo4j neo4j_auth_user: neo4j neo4j_auth_password: "{{ vault_mnemosyne_neo4j_auth_password }}" -neo4j_http_port: 25555 -neo4j_bolt_port: 7687 +neo4j_http_port: 22084 +neo4j_bolt_port: 22074 +neo4j_metrics_port: 22094 neo4j_apoc_unrestricted: "apoc.*" diff --git a/ansible/kottos/fastagent.config.yaml.j2 b/ansible/kottos/fastagent.config.yaml.j2 index 9fed8d0..d011380 100644 --- a/ansible/kottos/fastagent.config.yaml.j2 +++ b/ansible/kottos/fastagent.config.yaml.j2 @@ -54,17 +54,11 @@ mcp: url: "{{ kottos_grafana_url | default('http://miranda.incus:25533/mcp') }}" # ── Shell + file operations — Kernos (Korax) ───────────────────────────── - kernos_harper: + andromeda: transport: http - url: "{{ kottos_kernos_harper_url | default('http://korax.helu.ca:20261/mcp') }}" + url: "{{ kottos_kernos_harper_url | default('http://caliban.helu.ca:20261/mcp') }}" load_on_start: false - # ── Angelia messaging ─────────────────────────────────────────────────── - # Auth header provided by fastagent.secrets.yaml (vault-rendered). 
- angelia: - transport: http - url: "{{ kottos_angelia_url | default('https://ouranos.helu.ca/mcp/') }}" - # ── GitHub MCP Server (local Docker, stdio) ────────────────────────────── # GITHUB_PERSONAL_ACCESS_TOKEN provided by fastagent.secrets.yaml github: diff --git a/ansible/neo4j/deploy.yml b/ansible/neo4j/deploy.yml index 2ab5059..1768eee 100644 --- a/ansible/neo4j/deploy.yml +++ b/ansible/neo4j/deploy.yml @@ -24,9 +24,9 @@ group: "{{neo4j_group}}" system: true - - name: Add group neo4j to keeper_user + - name: Add group neo4j to user ponos ansible.builtin.user: - name: "{{keeper_user}}" + name: ponos groups: "{{neo4j_group}}" append: true @@ -38,6 +38,14 @@ state: directory mode: '750' + - name: Create neo4j data directory + ansible.builtin.file: + path: "{{neo4j_directory}}/data" + owner: "{{neo4j_user}}" + group: "{{neo4j_group}}" + state: directory + mode: '750' + - name: Template docker-compose file ansible.builtin.template: src: docker-compose.yml.j2 diff --git a/ansible/neo4j/docker-compose.yml.j2 b/ansible/neo4j/docker-compose.yml.j2 index 8cbc2bc..2db6f3b 100644 --- a/ansible/neo4j/docker-compose.yml.j2 +++ b/ansible/neo4j/docker-compose.yml.j2 @@ -1,6 +1,7 @@ services: neo4j: - image: neo4j:{{neo4j_image_version}} + image: neo4j:{{neo4j_version}} + pull_policy: always container_name: neo4j restart: unless-stopped ports: @@ -11,9 +12,11 @@ services: - neo4j_logs:/logs - neo4j_plugins:/plugins environment: - NEO4J_AUTH: "{{neo4j_auth_user}}/{{neo4j_auth_password}}" - # APOC Plugin - NEO4J_PLUGINS: '["apoc"]' + NEO4J_AUTH: "{{neo4j_auth_user}}/{{neo4j_auth_password}}" + # APOC Plugin — core ("apoc") is required by apoc-extended. + # Listing only apoc-extended fails to expose apoc.version(), + # apoc.coll.*, apoc.date.* — declare both.
+ NEO4J_PLUGINS: '["apoc", "apoc-extended"]' NEO4J_apoc_export_file_enabled: "true" NEO4J_apoc_import_file_enabled: "true" NEO4J_apoc_import_file_use__neo4j__config: "true" @@ -25,7 +28,31 @@ services: syslog-format: "{{syslog_format}}" tag: "neo4j" + neo4j-exporter: + image: stscoundrel/neo4j-apoc-exporter:v0.1.0 + restart: unless-stopped + ports: + - "{{neo4j_metrics_port}}:17687" + environment: + - NEO4J_URI=bolt://neo4j:7687 + - NEO4J_USER={{neo4j_auth_user}} + - NEO4J_PASSWORD={{neo4j_auth_password}} + - EXPORTER_PORT=17687 + depends_on: + - neo4j + logging: + driver: syslog + options: + syslog-address: "tcp://127.0.0.1:{{neo4j_syslog_port}}" + syslog-format: "{{syslog_format}}" + tag: "neo4j-exporter" + volumes: neo4j_data: + driver: local + driver_opts: + type: none + device: {{neo4j_directory}}/data + o: bind neo4j_logs: neo4j_plugins: \ No newline at end of file diff --git a/ansible/pplg/alert_rules.yml.j2 b/ansible/pplg/alert_rules.yml.j2 index 82716d0..fdfa65e 100644 --- a/ansible/pplg/alert_rules.yml.j2 +++ b/ansible/pplg/alert_rules.yml.j2 @@ -384,6 +384,48 @@ groups: summary: "Mnemosyne Celery backlog on {{ $labels.queue }}" description: "Celery queue '{{ $labels.queue }}' has {{ $value }} pending tasks for more than 10 minutes — check the worker logs in Loki ({service=\"mnemosyne\", component=\"worker\"})." + # ============================================================================ + # Neo4j Alerts (neo4j-apoc-exporter sidecar) + # ============================================================================ + # Metrics come from stscoundrel/neo4j-apoc-exporter, which connects to + # Neo4j over Bolt and surfaces apoc.monitor.* gauges plus standard JVM + # metrics. "Exporter down" therefore covers both "exporter container + # crashed" and "exporter cannot reach Bolt" — either way Neo4j is + # effectively unobservable. Hostname-only — purpose of each instance + # is implied by the host (e.g. ariel = LLM memory, umbriel = Mnemosyne).
+ - name: neo4j_alerts + rules: + - alert: Neo4jExporterDown + expr: up{job="neo4j"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Neo4j exporter down on {{ $labels.instance }}" + description: "The neo4j-apoc-exporter on {{ $labels.instance }} has been unreachable for more than 5 minutes. Either the sidecar container is down or it cannot connect to Neo4j over Bolt — check `docker ps` and `docker compose logs neo4j-exporter` (from the Neo4j compose directory) on the host." + + - alert: Neo4jHighRollbackRate + expr: | + rate(neo4j_monitor_tx_rolledBackTx[10m]) + / clamp_min(rate(neo4j_monitor_tx_totalOpenedTx[10m]), 0.0001) > 0.10 + for: 10m + labels: + severity: warning + annotations: + summary: "Neo4j transaction rollback rate above 10% on {{ $labels.instance }}" + description: "More than 10% of transactions on {{ $labels.instance }} have rolled back over the last 10 minutes — check application logs in Loki ({job=\"neo4j\", hostname=\"{{ $labels.instance }}\"})." + + - alert: Neo4jStoreGrowthStalled + expr: | + neo4j_monitor_tx_currentOpenedTx > 0 + and rate(neo4j_monitor_tx_totalOpenedTx[15m]) == 0 + for: 15m + labels: + severity: warning + annotations: + summary: "Neo4j has open transactions but zero throughput on {{ $labels.instance }}" + description: "{{ $labels.instance }} shows {{ $value }} currently-open transactions but no new transactions opened in 15 minutes — possible Bolt-side hang or stuck query." + # Red Panda Seal of Approval 🐼 # "If the metrics aren't red, go back to bed" {% endraw %} diff --git a/ansible/pplg/prometheus.yml.j2 b/ansible/pplg/prometheus.yml.j2 index 35bd7e1..4b9b72b 100644 --- a/ansible/pplg/prometheus.yml.j2 +++ b/ansible/pplg/prometheus.yml.j2 @@ -62,4 +62,17 @@ scrape_configs: metrics_path: '/metrics' scrape_interval: 15s + # Neo4j — stscoundrel/neo4j-apoc-exporter sidecar connects to the local + # Neo4j over Bolt and exposes apoc.monitor.* (tx/ids/store) plus JVM + # metrics on the standard metrics port (22094).
Both Ariel (LLM memory + # via neo4j-cypher MCP) and Umbriel (Mnemosyne graph+vector DB) use the + # same port — they are differentiated by hostname only. + - job_name: 'neo4j' + static_configs: + - targets: + - 'ariel.incus:22094' + - 'umbriel.incus:22094' + metrics_path: '/metrics' + scrape_interval: 15s + # Red Panda Approved Prometheus Configuration diff --git a/dashboards/neo4j.json b/dashboards/neo4j.json new file mode 100644 index 0000000..03a2ffe --- /dev/null +++ b/dashboards/neo4j.json @@ -0,0 +1,351 @@ +{ + "title": "Neo4j", + "uid": "neo4j", + "tags": ["neo4j", "graph"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "editable": true, + "fiscalYearStartMonth": 0, + "weekStart": "", + "refresh": "30s", + "time": {"from": "now-1h", "to": "now"}, + "templating": { + "list": [ + { + "name": "loki", + "type": "datasource", + "query": "loki", + "current": {"selected": false, "text": "Loki", "value": "Loki"}, + "hide": 0, + "label": "Loki datasource" + }, + { + "name": "prom", + "type": "datasource", + "query": "prometheus", + "current": {"selected": false, "text": "Prometheus", "value": "Prometheus"}, + "hide": 0, + "label": "Prometheus datasource" + }, + { + "name": "instance", + "type": "query", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "query": "label_values(up{job=\"neo4j\"}, instance)", + "refresh": 1, + "includeAll": true, + "multi": true, + "current": {"selected": true, "text": "All", "value": "$__all"}, + "label": "Instance" + } + ] + }, + "panels": [ + { + "id": 1, + "type": "row", + "title": "Overview", + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0} + }, + { + "id": 2, + "type": "stat", + "title": "Exporter up", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "gridPos": {"h": 4, "w": 6, "x": 0, "y": 1}, + "targets": [ + { + "refId": "A", + "expr": "up{job=\"neo4j\", instance=~\"$instance\"}", + "legendFormat": "{{instance}}" + } + ], + "options": {"reduceOptions": {"calcs": 
["lastNotNull"]}, "colorMode": "value", "textMode": "value_and_name"}, + "fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}} + }, + { + "id": 3, + "type": "stat", + "title": "Nodes", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "gridPos": {"h": 4, "w": 6, "x": 6, "y": 1}, + "targets": [ + { + "refId": "A", + "expr": "neo4j_monitor_ids_nodeIds{instance=~\"$instance\"}", + "legendFormat": "{{instance}}" + } + ], + "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"}, + "fieldConfig": {"defaults": {"unit": "short"}} + }, + { + "id": 4, + "type": "stat", + "title": "Relationships", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "gridPos": {"h": 4, "w": 6, "x": 12, "y": 1}, + "targets": [ + { + "refId": "A", + "expr": "neo4j_monitor_ids_relIds{instance=~\"$instance\"}", + "legendFormat": "{{instance}}" + } + ], + "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"}, + "fieldConfig": {"defaults": {"unit": "short"}} + }, + { + "id": 5, + "type": "stat", + "title": "Total store size", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "gridPos": {"h": 4, "w": 6, "x": 18, "y": 1}, + "targets": [ + { + "refId": "A", + "expr": "neo4j_monitor_store_totalStoreSize{instance=~\"$instance\"}", + "legendFormat": "{{instance}}" + } + ], + "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"}, + "fieldConfig": {"defaults": {"unit": "bytes"}} + }, + + { + "id": 10, + "type": "row", + "title": "Transactions", + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5} + }, + { + "id": 11, + "type": "timeseries", + "title": "Transaction open rate (per second)", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "gridPos": {"h": 8, "w": 
12, "x": 0, "y": 6}, + "targets": [ + { + "refId": "A", + "expr": "rate(neo4j_monitor_tx_totalOpenedTx{instance=~\"$instance\"}[5m])", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": {"defaults": {"unit": "ops"}}, + "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}} + }, + { + "id": 12, + "type": "timeseries", + "title": "Currently open transactions", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 6}, + "targets": [ + { + "refId": "A", + "expr": "neo4j_monitor_tx_currentOpenedTx{instance=~\"$instance\"}", + "legendFormat": "{{instance}} current" + }, + { + "refId": "B", + "expr": "neo4j_monitor_tx_peakTx{instance=~\"$instance\"}", + "legendFormat": "{{instance}} peak" + } + ], + "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}} + }, + { + "id": 13, + "type": "stat", + "title": "Rollback ratio (10m)", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "gridPos": {"h": 4, "w": 12, "x": 0, "y": 14}, + "targets": [ + { + "refId": "A", + "expr": "rate(neo4j_monitor_tx_rolledBackTx{instance=~\"$instance\"}[10m]) / clamp_min(rate(neo4j_monitor_tx_totalOpenedTx{instance=~\"$instance\"}[10m]), 0.0001)", + "legendFormat": "{{instance}}" + } + ], + "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"}, + "fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.05}, {"color": "red", "value": 0.10}]}}} + }, + { + "id": 14, + "type": "stat", + "title": "Last tx ID", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "gridPos": {"h": 4, "w": 12, "x": 12, "y": 14}, + "targets": [ + { + "refId": "A", + "expr": "neo4j_monitor_tx_lastTxId{instance=~\"$instance\"}", + "legendFormat": "{{instance}}" + } + ], + "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"}, + 
"fieldConfig": {"defaults": {"unit": "short"}} + }, + + { + "id": 20, + "type": "row", + "title": "Store breakdown", + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 18} + }, + { + "id": 21, + "type": "timeseries", + "title": "Store size by component", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 19}, + "targets": [ + { + "refId": "A", + "expr": "neo4j_monitor_store_nodeStoreSize{instance=~\"$instance\"}", + "legendFormat": "{{instance}} nodes" + }, + { + "refId": "B", + "expr": "neo4j_monitor_store_relStoreSize{instance=~\"$instance\"}", + "legendFormat": "{{instance}} rels" + }, + { + "refId": "C", + "expr": "neo4j_monitor_store_propStoreSize{instance=~\"$instance\"}", + "legendFormat": "{{instance}} props" + }, + { + "refId": "D", + "expr": "neo4j_monitor_store_stringStoreSize{instance=~\"$instance\"}", + "legendFormat": "{{instance}} strings" + }, + { + "refId": "E", + "expr": "neo4j_monitor_store_arrayStoreSize{instance=~\"$instance\"}", + "legendFormat": "{{instance}} arrays" + } + ], + "fieldConfig": {"defaults": {"unit": "bytes"}}, + "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}} + }, + { + "id": 22, + "type": "timeseries", + "title": "Transaction log size", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 19}, + "targets": [ + { + "refId": "A", + "expr": "neo4j_monitor_store_logSize{instance=~\"$instance\"}", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": {"defaults": {"unit": "bytes"}}, + "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}} + }, + + { + "id": 30, + "type": "row", + "title": "Exporter JVM (sidecar health)", + "collapsed": true, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 27} + }, + { + "id": 31, + "type": "timeseries", + "title": "Exporter JVM heap used / max", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + 
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 28}, + "targets": [ + { + "refId": "A", + "expr": "jvm_memory_used_bytes{job=\"neo4j\", area=\"heap\", instance=~\"$instance\"}", + "legendFormat": "{{instance}} used" + }, + { + "refId": "B", + "expr": "jvm_memory_max_bytes{job=\"neo4j\", area=\"heap\", instance=~\"$instance\"}", + "legendFormat": "{{instance}} max" + } + ], + "fieldConfig": {"defaults": {"unit": "bytes"}}, + "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}} + }, + { + "id": 32, + "type": "timeseries", + "title": "Exporter GC time", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 28}, + "targets": [ + { + "refId": "A", + "expr": "rate(jvm_gc_collection_seconds_sum{job=\"neo4j\", instance=~\"$instance\"}[5m])", + "legendFormat": "{{instance}} {{gc}}" + } + ], + "fieldConfig": {"defaults": {"unit": "s"}}, + "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}} + }, + + { + "id": 40, + "type": "row", + "title": "Logs", + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 36} + }, + { + "id": 41, + "type": "timeseries", + "title": "Neo4j log rate by host", + "datasource": {"type": "loki", "uid": "${loki}"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 37}, + "targets": [ + { + "refId": "A", + "expr": "sum by (hostname) (rate({job=\"neo4j\"}[5m]))", + "legendFormat": "{{hostname}}" + } + ], + "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}} + }, + { + "id": 42, + "type": "logs", + "title": "Neo4j — last 50 lines (errors/warnings first)", + "datasource": {"type": "loki", "uid": "${loki}"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 37}, + "targets": [ + { + "refId": "A", + "expr": "{job=\"neo4j\"} |~ \"(?i)error|warn|exception\"", + "maxLines": 50 + } + ], + "options": {"showLabels": true, "showTime": true, "wrapLogMessage": true} + }, + { + "id": 43, + "type": "logs", + "title": 
"Neo4j — all logs (live tail)", + "datasource": {"type": "loki", "uid": "${loki}"}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 45}, + "targets": [ + { + "refId": "A", + "expr": "{job=\"neo4j\"}", + "maxLines": 100 + } + ], + "options": {"showLabels": true, "showTime": true, "wrapLogMessage": true} + } + ] +}