feat(ansible): standardize Neo4j ports and add monitoring
- Unify Neo4j HTTP/Bolt/syslog ports across ariel and umbriel hosts
- Add neo4j_metrics_port (22094) for APOC exporter sidecar
- Add umbriel to Prometheus node_exporter targets
- Add Neo4j scrape config and alerts for tx rollback rate and stalled store growth
- Replace kernos_harper MCP with andromeda (caliban.helu.ca)
- Remove angelia MCP from kottos fastagent config
- Switch neo4j group membership from keeper_user to ponos
This commit is contained in:
@@ -9,7 +9,7 @@ services:
|
||||
|
||||
# Alloy
|
||||
alloy_log_level: "warn"
|
||||
neo4j_syslog_port: 22011
|
||||
neo4j_syslog_port: 51414
|
||||
|
||||
# Neo4j
|
||||
neo4j_user: neo4j
|
||||
@@ -17,6 +17,7 @@ neo4j_group: neo4j
|
||||
neo4j_directory: /srv/neo4j
|
||||
neo4j_auth_user: neo4j
|
||||
neo4j_auth_password: "{{ vault_neo4j_auth_password }}"
|
||||
neo4j_http_port: 25554
|
||||
neo4j_bolt_port: 7687
|
||||
neo4j_http_port: 22084
|
||||
neo4j_bolt_port: 22074
|
||||
neo4j_metrics_port: 22094
|
||||
neo4j_apoc_unrestricted: "apoc.*"
|
||||
|
||||
@@ -72,6 +72,7 @@ prometheus_targets:
|
||||
- 'sycorax.incus:9100'
|
||||
- 'prospero.incus:9100'
|
||||
- 'rosalind.incus:9100'
|
||||
- 'umbriel.incus:9100'
|
||||
|
||||
# Prometheus OAuth2-Proxy Sidecar
|
||||
prometheus_proxy_port: 9091
|
||||
|
||||
@@ -13,7 +13,7 @@ services:
|
||||
|
||||
# Alloy
|
||||
alloy_log_level: "warn"
|
||||
neo4j_syslog_port: 22012
|
||||
neo4j_syslog_port: 51414
|
||||
|
||||
# Neo4j
|
||||
neo4j_user: neo4j
|
||||
@@ -21,6 +21,7 @@ neo4j_group: neo4j
|
||||
neo4j_directory: /srv/neo4j
|
||||
neo4j_auth_user: neo4j
|
||||
neo4j_auth_password: "{{ vault_mnemosyne_neo4j_auth_password }}"
|
||||
neo4j_http_port: 25555
|
||||
neo4j_bolt_port: 7687
|
||||
neo4j_http_port: 22084
|
||||
neo4j_bolt_port: 22074
|
||||
neo4j_metrics_port: 22094
|
||||
neo4j_apoc_unrestricted: "apoc.*"
|
||||
|
||||
@@ -54,17 +54,11 @@ mcp:
|
||||
url: "{{ kottos_grafana_url | default('http://miranda.incus:25533/mcp') }}"
|
||||
|
||||
# ── Shell + file operations — Kernos (Korax) ─────────────────────────────
|
||||
kernos_harper:
|
||||
andromeda:
|
||||
transport: http
|
||||
url: "{{ kottos_kernos_harper_url | default('http://korax.helu.ca:20261/mcp') }}"
|
||||
url: "{{ kottos_kernos_harper_url | default('http://caliban.helu.ca:20261/mcp') }}"
|
||||
load_on_start: false
|
||||
|
||||
# ── Angelia messaging ───────────────────────────────────────────────────
|
||||
# Auth header provided by fastagent.secrets.yaml (vault-rendered).
|
||||
angelia:
|
||||
transport: http
|
||||
url: "{{ kottos_angelia_url | default('https://ouranos.helu.ca/mcp/') }}"
|
||||
|
||||
# ── GitHub MCP Server (local Docker, stdio) ──────────────────────────────
|
||||
# GITHUB_PERSONAL_ACCESS_TOKEN provided by fastagent.secrets.yaml
|
||||
github:
|
||||
|
||||
@@ -24,9 +24,9 @@
|
||||
group: "{{neo4j_group}}"
|
||||
system: true
|
||||
|
||||
- name: Add group neo4j to keeper_user
|
||||
- name: Add group neo4j to user ponos
|
||||
ansible.builtin.user:
|
||||
name: "{{keeper_user}}"
|
||||
name: ponos
|
||||
groups: "{{neo4j_group}}"
|
||||
append: true
|
||||
|
||||
@@ -38,6 +38,14 @@
|
||||
state: directory
|
||||
mode: '750'
|
||||
|
||||
- name: Create neo4j data directory
|
||||
ansible.builtin.file:
|
||||
path: "{{neo4j_directory}}/data"
|
||||
owner: "{{neo4j_user}}"
|
||||
group: "{{neo4j_group}}"
|
||||
state: directory
|
||||
mode: '750'
|
||||
|
||||
- name: Template docker-compose file
|
||||
ansible.builtin.template:
|
||||
src: docker-compose.yml.j2
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
services:
|
||||
neo4j:
|
||||
image: neo4j:{{neo4j_image_version}}
|
||||
image: neo4j:{{neo4j_version}}
|
||||
pull_policy: always
|
||||
container_name: neo4j
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
@@ -11,9 +12,11 @@ services:
|
||||
- neo4j_logs:/logs
|
||||
- neo4j_plugins:/plugins
|
||||
environment:
|
||||
NEO4J_AUTH: "{{neo4j_auth_user}}/{{neo4j_auth_password}}"
|
||||
# APOC Plugin
|
||||
NEO4J_PLUGINS: '["apoc"]'
|
||||
NEO4J_AUTH: "{{neo4j_user}}/{{neo4j_password}}"
|
||||
# APOC Plugin — core ("apoc") is required by apoc-extended.
|
||||
# Listing only apoc-extended fails to expose apoc.version(),
|
||||
# apoc.coll.*, apoc.date.* — declare both.
|
||||
NEO4J_PLUGINS: '["apoc", "apoc-extended"]'
|
||||
NEO4J_apoc_export_file_enabled: "true"
|
||||
NEO4J_apoc_import_file_enabled: "true"
|
||||
NEO4J_apoc_import_file_use__neo4j__config: "true"
|
||||
@@ -25,7 +28,31 @@ services:
|
||||
syslog-format: "{{syslog_format}}"
|
||||
tag: "neo4j"
|
||||
|
||||
neo4j-exporter:
|
||||
image: stscoundrel/neo4j-apoc-exporter:v0.1.0
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "{{neo4j_metrics_port}}:17687"
|
||||
environment:
|
||||
- NEO4J_URI=bolt://neo4j:7687
|
||||
- NEO4J_USER={{neo4j_user}}
|
||||
- NEO4J_PASSWORD={{neo4j_password}}
|
||||
- EXPORTER_PORT=17687
|
||||
depends_on:
|
||||
- neo4j
|
||||
logging:
|
||||
driver: syslog
|
||||
options:
|
||||
syslog-address: "tcp://127.0.0.1:{{neo4j_syslog_port}}"
|
||||
syslog-format: "{{syslog_format}}"
|
||||
tag: "neo4j-exporter"
|
||||
|
||||
volumes:
|
||||
neo4j_data:
|
||||
driver: local
|
||||
driver_opts:
|
||||
type: none
|
||||
device: {{neo4j_directory}}/data
|
||||
o: bind
|
||||
neo4j_logs:
|
||||
neo4j_plugins:
|
||||
@@ -384,6 +384,48 @@ groups:
|
||||
summary: "Mnemosyne Celery backlog on {{ $labels.queue }}"
|
||||
description: "Celery queue '{{ $labels.queue }}' has {{ $value }} pending tasks for more than 10 minutes — check the worker logs in Loki ({service=\"mnemosyne\", component=\"worker\"})."
|
||||
|
||||
# ============================================================================
|
||||
# Neo4j Alerts (neo4j-apoc-exporter sidecar)
|
||||
# ============================================================================
|
||||
# Metrics come from stscoundrel/neo4j-apoc-exporter, which connects to
|
||||
# Neo4j over Bolt and surfaces apoc.monitor.* gauges plus standard JVM
|
||||
# metrics. "Exporter down" therefore covers both "exporter container
|
||||
# crashed" and "exporter cannot reach Bolt" — either way Neo4j is
|
||||
# effectively unobservable. Instances differ by hostname only — each one's purpose
|
||||
# is implied by the host (e.g. ariel = LLM memory, umbriel = Mnemosyne).
|
||||
- name: neo4j_alerts
|
||||
rules:
|
||||
- alert: Neo4jExporterDown
|
||||
expr: up{job="neo4j"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Neo4j exporter down on {{ $labels.instance }}"
|
||||
description: "The neo4j-apoc-exporter on {{ $labels.instance }} has been unreachable for more than 5 minutes. Either the sidecar container is down or it cannot connect to Neo4j over Bolt — check `docker ps` and `docker logs neo4j-exporter` on the host."
|
||||
|
||||
- alert: Neo4jHighRollbackRate
|
||||
expr: |
|
||||
rate(neo4j_monitor_tx_rolledBackTx[10m])
|
||||
/ clamp_min(rate(neo4j_monitor_tx_totalOpenedTx[10m]), 1) > 0.10
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Neo4j transaction rollback rate above 10% on {{ $labels.instance }}"
|
||||
description: "More than 10% of transactions on {{ $labels.instance }} have rolled back over the last 10 minutes — check application logs in Loki ({job=\"neo4j\", hostname=\"{{ $labels.instance }}\"})."
|
||||
|
||||
- alert: Neo4jStoreGrowthStalled
|
||||
expr: |
|
||||
rate(neo4j_monitor_tx_totalOpenedTx[15m]) == 0
|
||||
and neo4j_monitor_tx_currentOpenedTx > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Neo4j has open transactions but zero throughput on {{ $labels.instance }}"
|
||||
description: "{{ $labels.instance }} shows {{ $value }} currently-open transactions but no new transactions opened in 15 minutes — possible Bolt-side hang or stuck query."
|
||||
|
||||
# Red Panda Seal of Approval 🐼
|
||||
# "If the metrics aren't red, go back to bed"
|
||||
{% endraw %}
|
||||
|
||||
@@ -62,4 +62,17 @@ scrape_configs:
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 15s
|
||||
|
||||
# Neo4j — stscoundrel/neo4j-apoc-exporter sidecar connects to the local
|
||||
# Neo4j over Bolt and exposes apoc.monitor.* (tx/ids/store) plus JVM
|
||||
# metrics on the standard metrics port (22094). Both Ariel (LLM memory
|
||||
# via neo4j-cypher MCP) and Umbriel (Mnemosyne graph+vector DB) use the
|
||||
# same port — they are differentiated by hostname only.
|
||||
- job_name: 'neo4j'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'ariel.incus:22094'
|
||||
- 'umbriel.incus:22094'
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 15s
|
||||
|
||||
# Red Panda Approved Prometheus Configuration
|
||||
|
||||
Reference in New Issue
Block a user