feat(ansible): standardize Neo4j ports and add monitoring
- Unify Neo4j HTTP/Bolt/syslog ports across ariel and umbriel hosts
- Add neo4j_metrics_port (22094) for APOC exporter sidecar
- Add umbriel to Prometheus node_exporter targets
- Add Neo4j scrape config and alerts for tx rollback rate and stalled store growth
- Replace kernos_harper MCP with andromeda (caliban.helu.ca)
- Remove angelia MCP from kottos fastagent config
- Switch neo4j group membership from keeper_user to ponos
This commit is contained in:
@@ -384,6 +384,48 @@ groups:
|
||||
summary: "Mnemosyne Celery backlog on {{ $labels.queue }}"
|
||||
description: "Celery queue '{{ $labels.queue }}' has {{ $value }} pending tasks for more than 10 minutes — check the worker logs in Loki ({service=\"mnemosyne\", component=\"worker\"})."
|
||||
|
||||
# ============================================================================
|
||||
# Neo4j Alerts (neo4j-apoc-exporter sidecar)
|
||||
# ============================================================================
|
||||
# Metrics come from stscoundrel/neo4j-apoc-exporter, which connects to
|
||||
# Neo4j over Bolt and surfaces apoc.monitor.* gauges plus standard JVM
|
||||
# metrics. "Exporter down" therefore covers both "exporter container
|
||||
# crashed" and "exporter cannot reach Bolt" — either way Neo4j is
|
||||
# effectively unobservable. Hostname-only — purpose of each instance
|
||||
# is implied by the host (e.g. ariel = LLM memory, umbriel = Mnemosyne).
|
||||
- name: neo4j_alerts
  rules:
    # Fires when Prometheus has failed to scrape the exporter target for
    # 5 minutes straight (`up` is 0 for the `neo4j` scrape job).
    - alert: Neo4jExporterDown
      expr: up{job="neo4j"} == 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "Neo4j exporter down on {{ $labels.instance }}"
        description: "The neo4j-apoc-exporter on {{ $labels.instance }} has been unreachable for more than 5 minutes. Either the sidecar container is down or it cannot connect to Neo4j over Bolt — check `docker ps` and `docker logs neo4j-exporter` on the host."

    # Share of transactions rolled back over the last 10 minutes.
    # increase() yields transaction *counts* for the window, so the
    # clamp_min(..., 1) below only guards against division by zero when
    # nothing was opened. Clamping a per-second rate() to 1 would instead
    # understate the ratio on any instance doing fewer than 1 tx/s.
    # NOTE(review): the Loki hint below reuses the instance label as
    # `hostname`; if instance carries the scrape port (host:22094) the
    # selector will not match as-is — confirm against relabeling config.
    - alert: Neo4jHighRollbackRate
      expr: |
        increase(neo4j_monitor_tx_rolledBackTx[10m])
          / clamp_min(increase(neo4j_monitor_tx_totalOpenedTx[10m]), 1) > 0.10
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "Neo4j transaction rollback rate above 10% on {{ $labels.instance }}"
        description: "More than 10% of transactions on {{ $labels.instance }} have rolled back over the last 10 minutes — check application logs in Loki ({job=\"neo4j\", hostname=\"{{ $labels.instance }}\"})."

    # Transactions are currently open, yet no new transaction has been
    # opened for 15 minutes. Despite the alert name, this rule watches
    # transaction throughput, not store size; the name is kept so any
    # existing routing on alertname keeps working.
    # PromQL `and` keeps the sample value of its LEFT operand, so the
    # open-transaction gauge must come first for $value in the description
    # to be the open-transaction count (rather than the constant 0).
    - alert: Neo4jStoreGrowthStalled
      expr: |
        neo4j_monitor_tx_currentOpenedTx > 0
          and rate(neo4j_monitor_tx_totalOpenedTx[15m]) == 0
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: "Neo4j has open transactions but zero throughput on {{ $labels.instance }}"
        description: "{{ $labels.instance }} shows {{ $value }} currently-open transactions but no new transactions opened in 15 minutes — possible Bolt-side hang or stuck query."
|
||||
|
||||
# Red Panda Seal of Approval 🐼
|
||||
# "If the metrics aren't red, go back to bed"
|
||||
{% endraw %}
|
||||
|
||||
@@ -62,4 +62,17 @@ scrape_configs:
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 15s
|
||||
|
||||
# Neo4j — stscoundrel/neo4j-apoc-exporter sidecar connects to the local
|
||||
# Neo4j over Bolt and exposes apoc.monitor.* (tx/ids/store) plus JVM
|
||||
# metrics on the standard metrics port (22094). Both Ariel (LLM memory
|
||||
# via neo4j-cypher MCP) and Umbriel (Mnemosyne graph+vector DB) use the
|
||||
# same port — they are differentiated by hostname only.
|
||||
# Scrape job for the Neo4j exporter sidecars: two static targets on
# port 22094, polled every 15s at /metrics.
- job_name: "neo4j"
  scrape_interval: 15s
  metrics_path: "/metrics"
  static_configs:
    - targets: ["ariel.incus:22094", "umbriel.incus:22094"]
|
||||
|
||||
# Red Panda Approved Prometheus Configuration
|
||||
|
||||
Reference in New Issue
Block a user