diff --git a/README.md b/README.md index 033fb3b..1d61b52 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,56 @@ -# ouranos +# Ouranos -Agathos is a comprehensive infrastructure-as-code project that provisions and manages a complete development sandbox environment. The project combines **Terraform** for infrastructure provisioning and **Ansible** for configuration management, themed around the moons of Uranus. \ No newline at end of file +**Red Panda Approved™ Infrastructure as Code** + +Ouranos is an infrastructure-as-code project that provisions and manages the **Ouranos Lab** — a development sandbox at [ouranos.helu.ca](https://ouranos.helu.ca). All infrastructure is tracked in Git for fully reproducible deployments. + +| Component | Purpose | +|-----------|---------| +| **Terraform** | Provisions 10 specialised Incus containers (LXC) with networking, security policies, and resource dependencies | +| **Ansible** | Configures Docker, databases, observability stack, and application runtimes across all containers | + +Containers are named after moons of Uranus and resolved via the `.incus` DNS domain. + +## Quick Start + +ℹ️ The Ansible virtual environment is expected at `~/env/agathos/bin/activate`. 
+
+```bash
+# Provision containers
+cd terraform
+terraform init && terraform apply
+
+# Configure services
+cd ../ansible
+source ~/env/agathos/bin/activate
+ansible-playbook site.yml
+```
+
+### Common Operations
+
+```bash
+# Start all containers
+ansible-playbook sandbox_up.yml
+
+# Stop all containers
+ansible-playbook sandbox_down.yml
+
+# Update all hosts
+ansible-playbook apt_update.yml
+
+# Deploy a specific service
+ansible-playbook <service>/deploy.yml
+```
+
+## Documentation
+
+| Document | Description |
+|----------|-------------|
+| [docs/ouranos.md](docs/ouranos.md) | Complete lab reference — hosts, services, routing, workflows |
+| [docs/terraform.md](docs/terraform.md) | Terraform practices and patterns |
+| [docs/ansible.md](docs/ansible.md) | Ansible project structure and conventions |
+| [docs/red_panda_standards.md](docs/red_panda_standards.md) | Red Panda Approval™ quality standards |
+
+## 🐾 Red Panda Approval™
+
+This project adheres to [Red Panda Approval™ standards](docs/red_panda_standards.md). 
\ No newline at end of file diff --git a/ansible/.vault_pass b/ansible/.vault_pass new file mode 100644 index 0000000..5676ef6 --- /dev/null +++ b/ansible/.vault_pass @@ -0,0 +1 @@ +redpanda_approved_vault_password \ No newline at end of file diff --git a/ansible/adduser_harper.yml b/ansible/adduser_harper.yml new file mode 100644 index 0000000..3f97e5a --- /dev/null +++ b/ansible/adduser_harper.yml @@ -0,0 +1,63 @@ +--- +# Create Harper User Account +# Creates the harper user on all ubuntu hosts and deploys SSH authorized keys +# +# Usage: +# ansible-playbook adduser_harper.yml +# +# Target specific host: +# ansible-playbook adduser_harper.yml --limit ariel.incus + +- name: Create Harper User Account + hosts: ubuntu + become: true + + vars: + harper_user: + name: harper + comment: "Harper - Autonomous Agent" + shell: /bin/bash + groups: + - sudo + + tasks: + - name: Create harper user account + ansible.builtin.user: + name: "{{ harper_user.name }}" + comment: "{{ harper_user.comment }}" + shell: "{{ harper_user.shell }}" + groups: "{{ harper_user.groups }}" + append: true + create_home: true + state: present + + - name: Ensure .ssh directory exists for harper + ansible.builtin.file: + path: "/home/{{ harper_user.name }}/.ssh" + state: directory + mode: '0700' + owner: "{{ harper_user.name }}" + group: "{{ harper_user.name }}" + + - name: Get harper keys from ssh_authorized_users + ansible.builtin.set_fact: + harper_keys: "{{ ssh_authorized_users | selectattr('name', 'equalto', 'harper') | map(attribute='keys') | first | default([]) }}" + + - name: Deploy authorized keys for harper + ansible.posix.authorized_key: + user: "{{ harper_user.name }}" + key: "{{ item }}" + state: present + exclusive: false + loop: "{{ harper_keys }}" + loop_control: + label: "{{ item | truncate(50) }}" + when: harper_keys | length > 0 + + - name: Configure passwordless sudo for harper + ansible.builtin.lineinfile: + path: /etc/sudoers.d/harper + line: "harper ALL=(ALL) NOPASSWD:ALL" + 
create: true + mode: '0440' + validate: "visudo -cf %s" diff --git a/ansible/alloy/ariel/config.alloy.j2 b/ansible/alloy/ariel/config.alloy.j2 new file mode 100644 index 0000000..16c8171 --- /dev/null +++ b/ansible/alloy/ariel/config.alloy.j2 @@ -0,0 +1,57 @@ +logging { + level = "{{alloy_log_level}}" +} + +loki.source.file "system_logs" { + targets = [ + {__path__ = "/var/log/syslog", job = "syslog"}, + {__path__ = "/var/log/auth.log", job = "auth"}, + ] + forward_to = [loki.write.default.receiver] +} + +loki.source.journal "systemd_logs" { + forward_to = [loki.write.default.receiver] + labels = { + job = "systemd", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } +} + +loki.source.syslog "neo4j_logs" { + listener { + address = "127.0.0.1:{{neo4j_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "neo4j", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +prometheus.exporter.unix "default" { + include_exporter_metrics = true + disable_collectors = ["mdadm"] +} + +prometheus.scrape "default" { + targets = prometheus.exporter.unix.default.targets + forward_to = [prometheus.remote_write.default.receiver] + job_name = "containers" +} + +prometheus.remote_write "default" { + endpoint { + url = "{{prometheus_remote_write_url}}" + } +} + +loki.write "default" { + endpoint { + url = "{{loki_url}}" + } +} \ No newline at end of file diff --git a/ansible/alloy/config.alloy.j2 b/ansible/alloy/config.alloy.j2 new file mode 100644 index 0000000..d643ad2 --- /dev/null +++ b/ansible/alloy/config.alloy.j2 @@ -0,0 +1,24 @@ +// Default Alloy Configuration +// Standard system monitoring and log collection + +logging { + level = "{{alloy_log_level}}" + format = "logfmt" +} + +// Loki log forwarding +loki.write "default" { + endpoint { + url = "{{ loki_url }}" + } +} + +// System log collection 
+loki.source.journal "systemd_logs" { + forward_to = [loki.write.default.receiver] + labels = { + job = "systemd", + hostname = "{{ inventory_hostname }}", + environment = "{{ deployment_environment }}", + } +} diff --git a/ansible/alloy/deploy.yml b/ansible/alloy/deploy.yml new file mode 100644 index 0000000..dd1126b --- /dev/null +++ b/ansible/alloy/deploy.yml @@ -0,0 +1,116 @@ +--- +- name: Deploy Alloy to All Ubuntu Hosts + hosts: ubuntu + tasks: + - name: Check if host has alloy service + ansible.builtin.set_fact: + has_alloy_service: "{{'alloy' in services}}" + + - name: Skip hosts without alloy service + ansible.builtin.meta: end_host + when: not has_alloy_service + + - name: Add Grafana repository + become: true + ansible.builtin.deb822_repository: + name: grafana + types: [deb] + uris: https://apt.grafana.com + suites: [stable] + components: [main] + signed_by: https://apt.grafana.com/gpg.key + state: present + + - name: Install Alloy + become: true + ansible.builtin.apt: + name: alloy + state: present + update_cache: true + + - name: Check for host-specific Alloy config + ansible.builtin.stat: + path: "{{playbook_dir}}/{{inventory_hostname_short}}/config.alloy.j2" + register: host_specific_config + delegate_to: localhost + connection: local + + - name: Create Alloy configuration (host-specific) + become: true + ansible.builtin.template: + src: "{{ inventory_hostname_short }}/config.alloy.j2" + dest: /etc/alloy/config.alloy + owner: alloy + group: alloy + mode: '644' + when: host_specific_config.stat.exists + notify: restart alloy + + - name: Create Alloy configuration (default) + become: true + ansible.builtin.template: + src: config.alloy.j2 + dest: /etc/alloy/config.alloy + owner: alloy + group: alloy + mode: '644' + when: not host_specific_config.stat.exists + notify: restart alloy + + - name: Check if host has docker service + ansible.builtin.set_fact: + has_docker_service: "{{'docker' in services}}" + + - name: Add alloy user to docker group for 
cAdvisor + become: true + ansible.builtin.user: + name: alloy + groups: docker + append: true + when: has_docker_service + notify: restart alloy + + - name: Check if host has gitea service + ansible.builtin.set_fact: + has_gitea_service: "{{'gitea' in services}}" + + - name: Add alloy user to gitea group for log collection + become: true + ansible.builtin.user: + name: alloy + groups: git + append: true + when: has_gitea_service + notify: restart alloy + + - name: Enable and start Alloy service + become: true + ansible.builtin.systemd: + name: alloy + enabled: true + state: started + daemon_reload: true + + - name: Flush handlers to ensure Alloy is restarted if needed + ansible.builtin.meta: flush_handlers + + - name: Verify Alloy service is running + become: true + ansible.builtin.systemd: + name: alloy + register: alloy_service_status + + - name: Confirm Alloy service is active + ansible.builtin.assert: + that: + - alloy_service_status.status.ActiveState == "active" + fail_msg: "Alloy service is not running (state: {{ alloy_service_status.status.ActiveState }})" + success_msg: "Alloy service is running" + + handlers: + - name: restart alloy + become: true + ansible.builtin.systemd: + name: alloy + state: restarted + diff --git a/ansible/alloy/miranda/config.alloy.j2 b/ansible/alloy/miranda/config.alloy.j2 new file mode 100644 index 0000000..b87d359 --- /dev/null +++ b/ansible/alloy/miranda/config.alloy.j2 @@ -0,0 +1,131 @@ +logging { + level = "{{alloy_log_level}}" +} + +loki.source.file "system_logs" { + targets = [ + {__path__ = "/var/log/syslog", job = "syslog"}, + {__path__ = "/var/log/auth.log", job = "auth"}, + ] + forward_to = [loki.write.default.receiver] +} + +loki.source.journal "systemd_logs" { + forward_to = [loki.relabel.journal_apps.receiver] + labels = { + job = "systemd", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } +} + +loki.relabel "journal_apps" { + forward_to = [loki.write.default.receiver] + + rule 
{ + source_labels = ["__journal__systemd_unit"] + regex = "mcpo\\.service" + target_label = "job" + replacement = "mcpo" + } + + rule { + source_labels = ["__journal__systemd_unit"] + regex = "mcpo\\.service" + target_label = "app" + replacement = "mcpo" + } +} + +loki.source.syslog "argos_logs" { + listener { + address = "127.0.0.1:{{argos_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "argos", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +loki.source.syslog "grafana_mcp_logs" { + listener { + address = "127.0.0.1:{{grafana_mcp_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "grafana_mcp", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +loki.source.syslog "neo4j_cypher_logs" { + listener { + address = "127.0.0.1:{{neo4j_cypher_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "neo4j-cypher", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +loki.source.syslog "neo4j_memory_logs" { + listener { + address = "127.0.0.1:{{neo4j_memory_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "neo4j-memory", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +loki.source.syslog "gitea_mcp_logs" { + listener { + address = "127.0.0.1:{{gitea_mcp_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "gitea-mcp", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +prometheus.exporter.unix 
"default" { + include_exporter_metrics = true + disable_collectors = ["mdadm"] +} + +prometheus.scrape "default" { + targets = prometheus.exporter.unix.default.targets + forward_to = [prometheus.remote_write.default.receiver] + job_name = "mcp_docker_host" +} + +prometheus.remote_write "default" { + endpoint { + url = "{{prometheus_remote_write_url}}" + } +} + +loki.write "default" { + endpoint { + url = "{{loki_url}}" + } +} diff --git a/ansible/alloy/oberon/config.alloy.j2 b/ansible/alloy/oberon/config.alloy.j2 new file mode 100644 index 0000000..9d7d557 --- /dev/null +++ b/ansible/alloy/oberon/config.alloy.j2 @@ -0,0 +1,98 @@ +logging { + level = "{{alloy_log_level}}" +} + +loki.source.file "system_logs" { + targets = [ + {__path__ = "/var/log/syslog", job = "syslog"}, + {__path__ = "/var/log/auth.log", job = "auth"}, + ] + forward_to = [loki.write.default.receiver] +} + +loki.source.journal "systemd_logs" { + forward_to = [loki.write.default.receiver] + labels = { + job = "systemd", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } +} + +loki.source.syslog "rabbitmq_logs" { + listener { + address = "127.0.0.1:{{rabbitmq_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "rabbitmq", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +loki.source.syslog "searxng_logs" { + listener { + address = "127.0.0.1:{{searxng_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "searxng", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +loki.source.syslog "smtp4dev_logs" { + listener { + address = "127.0.0.1:{{smtp4dev_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "smtp4dev", + hostname = "{{inventory_hostname}}", + 
environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +prometheus.exporter.unix "default" { + include_exporter_metrics = true + disable_collectors = ["mdadm"] +} + +prometheus.scrape "default" { + targets = prometheus.exporter.unix.default.targets + forward_to = [prometheus.remote_write.default.receiver] + job_name = "containers" +} + +prometheus.scrape "hass" { + targets = [{ + __address__ = "127.0.0.1:{{hass_port}}", + job = "hass", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + }] + forward_to = [prometheus.remote_write.default.receiver] + scrape_interval = "60s" + metrics_path = "/api/prometheus" + bearer_token = "{{hass_metrics_token}}" +} + +prometheus.remote_write "default" { + endpoint { + url = "{{prometheus_remote_write_url}}" + } +} + +loki.write "default" { + endpoint { + url = "{{loki_url}}" + } +} diff --git a/ansible/alloy/prospero/config.alloy.j2 b/ansible/alloy/prospero/config.alloy.j2 new file mode 100644 index 0000000..b0c09b4 --- /dev/null +++ b/ansible/alloy/prospero/config.alloy.j2 @@ -0,0 +1,195 @@ +// Prospero Alloy Configuration +// Red Panda Approved 🐼 +// Services: PPLG stack (Grafana, Prometheus, Loki, Alertmanager, PgAdmin, HAProxy, OAuth2-Proxy) + +logging { + level = "{{alloy_log_level}}" +} + +// ============================================================================ +// LOG COLLECTION - Loki Forwarding +// ============================================================================ + +// System log files +loki.source.file "system_logs" { + targets = [ + {__path__ = "/var/log/syslog", job = "syslog"}, + {__path__ = "/var/log/auth.log", job = "auth"}, + ] + forward_to = [loki.write.default.receiver] +} + +// PPLG HAProxy syslog receiver (HAProxy syslog → Alloy → Loki) +loki.source.syslog "pplg_haproxy" { + listener { + address = "127.0.0.1:{{pplg_haproxy_syslog_port}}" + protocol = "tcp" + labels = { + job = "pplg-haproxy", + hostname = 
"{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +// Journal relabeling - assign dedicated job labels per systemd unit +loki.relabel "journal" { + forward_to = [] + + // Expose the systemd unit as a label + rule { + source_labels = ["__journal__systemd_unit"] + target_label = "unit" + } + + // Grafana + rule { + source_labels = ["__journal__systemd_unit"] + regex = "grafana-server\\.service" + target_label = "job" + replacement = "grafana" + } + + // Prometheus + rule { + source_labels = ["__journal__systemd_unit"] + regex = "prometheus\\.service" + target_label = "job" + replacement = "prometheus" + } + + // Loki + rule { + source_labels = ["__journal__systemd_unit"] + regex = "loki\\.service" + target_label = "job" + replacement = "loki" + } + + // Alertmanager + rule { + source_labels = ["__journal__systemd_unit"] + regex = "alertmanager\\.service" + target_label = "job" + replacement = "alertmanager" + } + + // PgAdmin + rule { + source_labels = ["__journal__systemd_unit"] + regex = "pgadmin\\.service" + target_label = "job" + replacement = "pgadmin" + } + + // OAuth2-Proxy (Prometheus UI) + rule { + source_labels = ["__journal__systemd_unit"] + regex = "oauth2-proxy-prometheus\\.service" + target_label = "job" + replacement = "oauth2-proxy-prometheus" + } + + // Alloy + rule { + source_labels = ["__journal__systemd_unit"] + regex = "alloy\\.service" + target_label = "job" + replacement = "alloy" + } + + // Default job for unmatched units + rule { + source_labels = ["__journal__systemd_unit"] + regex = ".+" + target_label = "job" + replacement = "systemd" + } +} + +// Systemd journal logs with per-service job labels +loki.source.journal "systemd_logs" { + forward_to = [loki.write.default.receiver] + relabel_rules = loki.relabel.journal.rules + labels = { + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } +} + +// Loki endpoint +loki.write 
"default" { + endpoint { + url = "{{loki_url}}" + } +} + +// ============================================================================ +// METRICS COLLECTION - Prometheus Remote Write +// ============================================================================ + +// Unix/Node metrics - Incus-safe collectors only +// Disabled collectors that don't work in containers: hwmon, thermal, mdadm, powersupplyclass, nvme +prometheus.exporter.unix "default" { + include_exporter_metrics = true + disable_collectors = [ + "arp", + "bcache", + "bonding", + "btrfs", + "hwmon", + "infiniband", + "ipvs", + "mdadm", + "nfs", + "nfsd", + "nvme", + "powersupplyclass", + "rapl", + "thermal_zone", + "zfs", + ] +} + +// Process exporter - Track all processes by command name +// Provides: namedprocess_namegroup_* metrics +prometheus.exporter.process "default" { + track_children = true + track_threads = true + gather_smaps = false + recheck_on_scrape = true + + matcher { + name = "{% raw %}{{.Comm}}{% endraw %}" + cmdline = [".+"] + } +} + +// Scrape local exporters +prometheus.scrape "local_exporters" { + targets = concat( + prometheus.exporter.unix.default.targets, + prometheus.exporter.process.default.targets, + ) + forward_to = [prometheus.relabel.add_instance.receiver] + scrape_interval = "15s" + job_name = "prospero" +} + +// Add instance label for Prometheus compatibility +prometheus.relabel "add_instance" { + forward_to = [prometheus.remote_write.default.receiver] + + rule { + target_label = "instance" + replacement = "{{inventory_hostname}}" + } +} + +// Remote write to Prospero Prometheus +prometheus.remote_write "default" { + endpoint { + url = "{{prometheus_remote_write_url}}" + } +} diff --git a/ansible/alloy/puck/config.alloy.j2 b/ansible/alloy/puck/config.alloy.j2 new file mode 100644 index 0000000..2960dcd --- /dev/null +++ b/ansible/alloy/puck/config.alloy.j2 @@ -0,0 +1,196 @@ +// Puck Alloy Configuration +// Red Panda Approved 🐼 +// Services: Log collection, Process 
metrics, Docker/cAdvisor metrics + +logging { + level = "{{alloy_log_level}}" +} + +// ============================================================================ +// LOG COLLECTION - Loki Forwarding +// ============================================================================ + +loki.source.file "system_logs" { + targets = [ + {__path__ = "/var/log/syslog", job = "syslog"}, + {__path__ = "/var/log/auth.log", job = "auth"}, + ] + forward_to = [loki.write.default.receiver] +} + +loki.source.journal "systemd_logs" { + forward_to = [loki.write.default.receiver] + labels = { + job = "systemd", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } +} + +loki.source.syslog "angelia_logs" { + listener { + address = "127.0.0.1:{{angelia_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "angelia", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +loki.source.syslog "athena_logs" { + listener { + address = "127.0.0.1:{{athena_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "athena", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +loki.source.syslog "kairos_logs" { + listener { + address = "127.0.0.1:{{kairos_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "kairos", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +loki.source.syslog "sagittarius_logs" { + listener { + address = "127.0.0.1:{{sagittarius_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "sagittarius", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = 
[loki.write.default.receiver] +} + +loki.source.syslog "spelunker_logs" { + listener { + address = "127.0.0.1:{{spelunker_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "spelunker", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +loki.source.syslog "jupyterlab_logs" { + listener { + address = "127.0.0.1:{{jupyterlab_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "jupyterlab", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +loki.write "default" { + endpoint { + url = "{{loki_url}}" + } +} + +// ============================================================================ +// METRICS COLLECTION - Prometheus Remote Write +// ============================================================================ + +// Unix/Node metrics - Incus-safe collectors only +// Disabled collectors that don't work in containers: hwmon, thermal, mdadm, powersupplyclass, nvme +prometheus.exporter.unix "default" { + include_exporter_metrics = true + disable_collectors = [ + "arp", + "bcache", + "bonding", + "btrfs", + "hwmon", + "infiniband", + "ipvs", + "mdadm", + "nfs", + "nfsd", + "nvme", + "powersupplyclass", + "rapl", + "thermal_zone", + "zfs", + ] +} + +// Process exporter - Track all processes by command name +// Provides: namedprocess_namegroup_* metrics +prometheus.exporter.process "default" { + track_children = true + track_threads = true + gather_smaps = false + recheck_on_scrape = true + + matcher { + name = "{% raw %}{{.Comm}}{% endraw %}" + cmdline = [".+"] + } +} + +// cAdvisor - Docker container metrics +// Provides: container_* metrics for CPU, memory, network, disk +prometheus.exporter.cadvisor "default" { + docker_host = "unix:///var/run/docker.sock" + storage_duration = "5m" + docker_only = 
true +} + +// Scrape all local exporters +prometheus.scrape "local_exporters" { + targets = concat( + prometheus.exporter.unix.default.targets, + prometheus.exporter.process.default.targets, + prometheus.exporter.cadvisor.default.targets, + ) + forward_to = [prometheus.relabel.add_instance.receiver] + scrape_interval = "15s" + job_name = "puck" +} + +// Add instance label for Prometheus compatibility +prometheus.relabel "add_instance" { + forward_to = [prometheus.remote_write.default.receiver] + + rule { + target_label = "instance" + replacement = "{{inventory_hostname}}" + } +} + +// Remote write to Prospero Prometheus +prometheus.remote_write "default" { + endpoint { + url = "{{prometheus_remote_write_url}}" + } +} \ No newline at end of file diff --git a/ansible/alloy/rosalind/config.alloy.j2 b/ansible/alloy/rosalind/config.alloy.j2 new file mode 100644 index 0000000..2b0c49b --- /dev/null +++ b/ansible/alloy/rosalind/config.alloy.j2 @@ -0,0 +1,155 @@ +// Rosalind Alloy Configuration +// Services: Gitea, Lobechat, Nextcloud monitoring + +logging { + level = "{{alloy_log_level}}" + format = "logfmt" +} + +// ============================================================================ +// LOG COLLECTION - Loki Forwarding +// ============================================================================ + +// System log files +loki.source.file "system_logs" { + targets = [ + {__path__ = "/var/log/syslog", job = "syslog"}, + {__path__ = "/var/log/auth.log", job = "auth"}, + ] + forward_to = [loki.write.default.receiver] +} + +// Systemd journal logs (includes AnythingLLM server/collector) +loki.source.journal "systemd_logs" { + forward_to = [loki.write.default.receiver] + labels = { + job = "systemd", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } +} + +// Gitea application logs +loki.source.file "gitea_logs" { + targets = [ + {__path__ = "/var/log/gitea/gitea.log", job = "gitea"}, + ] + forward_to = 
[loki.write.default.receiver] +} + +// Apache access and error logs (Nextcloud) +loki.source.file "apache_logs" { + targets = [ + {__path__ = "/var/log/apache2/access.log", job = "apache_access"}, + {__path__ = "/var/log/apache2/error.log", job = "apache_error"}, + ] + forward_to = [loki.write.default.receiver] +} + +// Lobechat Docker syslog +loki.source.syslog "lobechat_logs" { + listener { + address = "127.0.0.1:{{lobechat_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "lobechat", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +// Loki endpoint +loki.write "default" { + endpoint { + url = "{{loki_url}}" + } +} + +// ============================================================================ +// METRICS COLLECTION - Prometheus Remote Write +// ============================================================================ + +// Unix/Node metrics - Incus-safe collectors only +prometheus.exporter.unix "default" { + include_exporter_metrics = true + disable_collectors = [ + "arp", + "bcache", + "bonding", + "btrfs", + "hwmon", + "infiniband", + "ipvs", + "mdadm", + "nfs", + "nfsd", + "nvme", + "powersupplyclass", + "rapl", + "thermal_zone", + "zfs", + ] +} + +// Process exporter - Track all processes by command name +prometheus.exporter.process "default" { + track_children = true + track_threads = true + gather_smaps = false + recheck_on_scrape = true + + matcher { + name = "{% raw %}{{.Comm}}{% endraw %}" + cmdline = [".+"] + } +} + +// cAdvisor - Docker container metrics (for Lobechat) +prometheus.exporter.cadvisor "default" { + docker_host = "unix:///var/run/docker.sock" + store_container_labels = true + docker_only = true +} + +// Prometheus scrape configurations +prometheus.scrape "unix" { + targets = prometheus.exporter.unix.default.targets + forward_to = [prometheus.remote_write.default.receiver] + scrape_interval = 
"15s" +} + +prometheus.scrape "process" { + targets = prometheus.exporter.process.default.targets + forward_to = [prometheus.remote_write.default.receiver] + scrape_interval = "15s" +} + +prometheus.scrape "cadvisor" { + targets = prometheus.exporter.cadvisor.default.targets + forward_to = [prometheus.remote_write.default.receiver] + scrape_interval = "15s" +} + +// Gitea application metrics +prometheus.scrape "gitea" { + targets = [{ + __address__ = "127.0.0.1:{{gitea_web_port}}", + job = "gitea", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + }] + forward_to = [prometheus.remote_write.default.receiver] + scrape_interval = "30s" + metrics_path = "/metrics" + bearer_token = "{{gitea_metrics_token}}" +} + +// Prometheus remote write endpoint +prometheus.remote_write "default" { + endpoint { + url = "{{prometheus_remote_write_url}}" + } +} diff --git a/ansible/alloy/titania/config.alloy.j2 b/ansible/alloy/titania/config.alloy.j2 new file mode 100644 index 0000000..9a5eb69 --- /dev/null +++ b/ansible/alloy/titania/config.alloy.j2 @@ -0,0 +1,80 @@ +logging { + level = "{{alloy_log_level}}" +} + +loki.source.file "system_logs" { + targets = [ + {__path__ = "/var/log/syslog", job = "syslog"}, + {__path__ = "/var/log/auth.log", job = "auth"}, + ] + forward_to = [loki.write.default.receiver] +} + +loki.source.journal "systemd_logs" { + forward_to = [loki.write.default.receiver] + labels = { + job = "systemd", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } +} + +loki.source.syslog "haproxy_logs" { + listener { + address = "127.0.0.1:{{haproxy_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "haproxy", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +loki.source.syslog "casdoor_logs" { + listener { + address = "127.0.0.1:{{casdoor_syslog_port}}" + protocol 
= "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "casdoor", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + +prometheus.exporter.unix "default" { + include_exporter_metrics = true + disable_collectors = ["mdadm"] +} + +prometheus.scrape "default" { + targets = prometheus.exporter.unix.default.targets + forward_to = [prometheus.remote_write.default.receiver] + job_name = "containers" +} + +prometheus.scrape "haproxy" { + targets = [ + {"__address__" = "localhost:{{haproxy_stats_port}}", "__metrics_path__" = "/metrics"}, + ] + scrape_interval = "15s" + forward_to = [prometheus.remote_write.default.receiver] + job_name = "haproxy" +} + +prometheus.remote_write "default" { + endpoint { + url = "{{prometheus_remote_write_url}}" + } +} + +loki.write "default" { + endpoint { + url = "{{loki_url}}" + } +} diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg new file mode 100644 index 0000000..f7c0a4d --- /dev/null +++ b/ansible/ansible.cfg @@ -0,0 +1,6 @@ +[defaults] +inventory = inventory +stdout_callback = ansible.builtin.default +result_format = yaml +remote_user = robert +vault_password_file = .vault_pass \ No newline at end of file diff --git a/ansible/anythingllm/.env.example b/ansible/anythingllm/.env.example new file mode 100644 index 0000000..b7568df --- /dev/null +++ b/ansible/anythingllm/.env.example @@ -0,0 +1,423 @@ +SERVER_PORT=3001 +STORAGE_DIR="/app/server/storage" +UID='1000' +GID='1000' +# SIG_KEY='passphrase' # Please generate random string at least 32 chars long. +# SIG_SALT='salt' # Please generate random string at least 32 chars long. +# JWT_SECRET="my-random-string-for-seeding" # Only needed if AUTH_TOKEN is set. Please generate random string at least 12 chars long. 
+# JWT_EXPIRY="30d" # (optional) https://docs.anythingllm.com/configuration#custom-ttl-for-sessions
+
+###########################################
+######## LLM API SELECTION ################
+###########################################
+# LLM_PROVIDER='openai'
+# OPEN_AI_KEY=
+# OPEN_MODEL_PREF='gpt-4o'
+
+# LLM_PROVIDER='gemini'
+# GEMINI_API_KEY=
+# GEMINI_LLM_MODEL_PREF='gemini-2.0-flash-lite'
+
+# LLM_PROVIDER='azure'
+# AZURE_OPENAI_ENDPOINT=
+# AZURE_OPENAI_KEY=
+# OPEN_MODEL_PREF='my-gpt35-deployment' # This is the "deployment" on Azure you want to use. Not the base model.
+# EMBEDDING_MODEL_PREF='embedder-model' # This is the "deployment" on Azure you want to use for embeddings. Not the base model. Valid base model is text-embedding-ada-002
+
+# LLM_PROVIDER='anthropic'
+# ANTHROPIC_API_KEY=sk-ant-xxxx
+# ANTHROPIC_MODEL_PREF='claude-2'
+# ANTHROPIC_CACHE_CONTROL="5m" # Enable prompt caching (5m=5min cache, 1h=1hour cache). Reduces costs and improves speed by caching system prompts.
+
+# LLM_PROVIDER='lmstudio'
+# LMSTUDIO_BASE_PATH='http://your-server:1234/v1'
+# LMSTUDIO_MODEL_PREF='Loaded from Chat UI' # this is a bug in LMStudio 0.2.17
+# LMSTUDIO_MODEL_TOKEN_LIMIT=4096
+# LMSTUDIO_AUTH_TOKEN='your-lmstudio-auth-token-here'
+
+# LLM_PROVIDER='localai'
+# LOCAL_AI_BASE_PATH='http://host.docker.internal:8080/v1'
+# LOCAL_AI_MODEL_PREF='luna-ai-llama2'
+# LOCAL_AI_MODEL_TOKEN_LIMIT=4096
+# LOCAL_AI_API_KEY="sk-123abc"
+
+# LLM_PROVIDER='ollama'
+# OLLAMA_BASE_PATH='http://host.docker.internal:11434'
+# OLLAMA_MODEL_PREF='llama2'
+# OLLAMA_MODEL_TOKEN_LIMIT=4096
+# OLLAMA_AUTH_TOKEN='your-ollama-auth-token-here (optional, only for ollama running behind auth - Bearer token)'
+# OLLAMA_RESPONSE_TIMEOUT=7200000 (optional, max timeout in milliseconds for ollama response to conclude. 
Default is 5min before aborting) + +# LLM_PROVIDER='togetherai' +# TOGETHER_AI_API_KEY='my-together-ai-key' +# TOGETHER_AI_MODEL_PREF='mistralai/Mixtral-8x7B-Instruct-v0.1' + +# LLM_PROVIDER='mistral' +# MISTRAL_API_KEY='example-mistral-ai-api-key' +# MISTRAL_MODEL_PREF='mistral-tiny' + +# LLM_PROVIDER='perplexity' +# PERPLEXITY_API_KEY='my-perplexity-key' +# PERPLEXITY_MODEL_PREF='codellama-34b-instruct' + +# LLM_PROVIDER='openrouter' +# OPENROUTER_API_KEY='my-openrouter-key' +# OPENROUTER_MODEL_PREF='openrouter/auto' + +# LLM_PROVIDER='huggingface' +# HUGGING_FACE_LLM_ENDPOINT=https://uuid-here.us-east-1.aws.endpoints.huggingface.cloud +# HUGGING_FACE_LLM_API_KEY=hf_xxxxxx +# HUGGING_FACE_LLM_TOKEN_LIMIT=8000 + +# LLM_PROVIDER='groq' +# GROQ_API_KEY=gsk_abcxyz +# GROQ_MODEL_PREF=llama3-8b-8192 + +# LLM_PROVIDER='koboldcpp' +# KOBOLD_CPP_BASE_PATH='http://127.0.0.1:5000/v1' +# KOBOLD_CPP_MODEL_PREF='koboldcpp/codellama-7b-instruct.Q4_K_S' +# KOBOLD_CPP_MODEL_TOKEN_LIMIT=4096 + +# LLM_PROVIDER='textgenwebui' +# TEXT_GEN_WEB_UI_BASE_PATH='http://127.0.0.1:5000/v1' +# TEXT_GEN_WEB_UI_TOKEN_LIMIT=4096 +# TEXT_GEN_WEB_UI_API_KEY='sk-123abc' + +# LLM_PROVIDER='generic-openai' +# GENERIC_OPEN_AI_BASE_PATH='http://proxy.url.openai.com/v1' +# GENERIC_OPEN_AI_MODEL_PREF='gpt-3.5-turbo' +# GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT=4096 +# GENERIC_OPEN_AI_API_KEY=sk-123abc +# GENERIC_OPEN_AI_CUSTOM_HEADERS="X-Custom-Auth:my-secret-key,X-Custom-Header:my-value" (useful if using a proxy that requires authentication or other headers) + +# LLM_PROVIDER='litellm' +# LITE_LLM_MODEL_PREF='gpt-3.5-turbo' +# LITE_LLM_MODEL_TOKEN_LIMIT=4096 +# LITE_LLM_BASE_PATH='http://127.0.0.1:4000' +# LITE_LLM_API_KEY='sk-123abc' + +# LLM_PROVIDER='novita' +# NOVITA_LLM_API_KEY='your-novita-api-key-here' check on https://novita.ai/settings/key-management +# NOVITA_LLM_MODEL_PREF='deepseek/deepseek-r1' + +# LLM_PROVIDER='cometapi' +# COMETAPI_LLM_API_KEY='your-cometapi-api-key-here' # Get one at 
https://api.cometapi.com/console/token +# COMETAPI_LLM_MODEL_PREF='gpt-5-mini' +# COMETAPI_LLM_TIMEOUT_MS=500 # Optional; stream idle timeout in ms (min 500ms) + +# LLM_PROVIDER='cohere' +# COHERE_API_KEY= +# COHERE_MODEL_PREF='command-r' + +# LLM_PROVIDER='bedrock' +# AWS_BEDROCK_LLM_ACCESS_KEY_ID= +# AWS_BEDROCK_LLM_ACCESS_KEY= +# AWS_BEDROCK_LLM_REGION=us-west-2 +# AWS_BEDROCK_LLM_MODEL_PREFERENCE=meta.llama3-1-8b-instruct-v1:0 +# AWS_BEDROCK_LLM_MODEL_TOKEN_LIMIT=8191 +# AWS_BEDROCK_LLM_CONNECTION_METHOD=iam +# AWS_BEDROCK_LLM_MAX_OUTPUT_TOKENS=4096 +# AWS_BEDROCK_LLM_SESSION_TOKEN= # Only required if CONNECTION_METHOD is 'sessionToken' +# or even use Short and Long Term API keys +# AWS_BEDROCK_LLM_CONNECTION_METHOD="apiKey" +# AWS_BEDROCK_LLM_API_KEY= + +# LLM_PROVIDER='fireworksai' +# FIREWORKS_AI_LLM_API_KEY='my-fireworks-ai-key' +# FIREWORKS_AI_LLM_MODEL_PREF='accounts/fireworks/models/llama-v3p1-8b-instruct' + +# LLM_PROVIDER='apipie' +# APIPIE_LLM_API_KEY='sk-123abc' +# APIPIE_LLM_MODEL_PREF='openrouter/llama-3.1-8b-instruct' + +# LLM_PROVIDER='xai' +# XAI_LLM_API_KEY='xai-your-api-key-here' +# XAI_LLM_MODEL_PREF='grok-beta' + +# LLM_PROVIDER='zai' +# ZAI_API_KEY="your-zai-api-key-here" +# ZAI_MODEL_PREF="glm-4.5" + +# LLM_PROVIDER='nvidia-nim' +# NVIDIA_NIM_LLM_BASE_PATH='http://127.0.0.1:8000' +# NVIDIA_NIM_LLM_MODEL_PREF='meta/llama-3.2-3b-instruct' + +# LLM_PROVIDER='deepseek' +# DEEPSEEK_API_KEY='your-deepseek-api-key-here' +# DEEPSEEK_MODEL_PREF='deepseek-chat' + +# LLM_PROVIDER='ppio' +# PPIO_API_KEY='your-ppio-api-key-here' +# PPIO_MODEL_PREF=deepseek/deepseek-v3/community + +# LLM_PROVIDER='moonshotai' +# MOONSHOT_AI_API_KEY='your-moonshot-api-key-here' +# MOONSHOT_AI_MODEL_PREF='moonshot-v1-32k' + +# LLM_PROVIDER='foundry' +# FOUNDRY_BASE_PATH='http://127.0.0.1:55776' +# FOUNDRY_MODEL_PREF='phi-3.5-mini' +# FOUNDRY_MODEL_TOKEN_LIMIT=4096 + +# LLM_PROVIDER='giteeai' +# GITEE_AI_API_KEY= +# GITEE_AI_MODEL_PREF= +# GITEE_AI_MODEL_TOKEN_LIMIT= + +# 
LLM_PROVIDER='docker-model-runner'
+# DOCKER_MODEL_RUNNER_BASE_PATH='http://127.0.0.1:12434'
+# DOCKER_MODEL_RUNNER_LLM_MODEL_PREF='phi-3.5-mini'
+# DOCKER_MODEL_RUNNER_LLM_MODEL_TOKEN_LIMIT=4096
+
+# LLM_PROVIDER='privatemode'
+# PRIVATEMODE_LLM_BASE_PATH='http://127.0.0.1:8080'
+# PRIVATEMODE_LLM_MODEL_PREF='gemma-3-27b'
+
+# LLM_PROVIDER='sambanova'
+# SAMBANOVA_LLM_API_KEY='xxx-xxx-xxx'
+# SAMBANOVA_LLM_MODEL_PREF='gpt-oss-120b'
+
+###########################################
+######## Embedding API SELECTION ##########
+###########################################
+# This will be the assumed default embedding selection and model
+# EMBEDDING_ENGINE='native'
+# EMBEDDING_MODEL_PREF='Xenova/all-MiniLM-L6-v2'
+
+# Only used if you are using an LLM that does not natively support embedding (openai or Azure)
+# EMBEDDING_ENGINE='openai'
+# OPEN_AI_KEY=sk-xxxx
+# EMBEDDING_MODEL_PREF='text-embedding-ada-002'
+
+# EMBEDDING_ENGINE='azure'
+# AZURE_OPENAI_ENDPOINT=
+# AZURE_OPENAI_KEY=
+# EMBEDDING_MODEL_PREF='my-embedder-model' # This is the "deployment" on Azure you want to use for embeddings. Not the base model. 
Valid base model is text-embedding-ada-002
+
+# EMBEDDING_ENGINE='localai'
+# EMBEDDING_BASE_PATH='http://localhost:8080/v1'
+# EMBEDDING_MODEL_PREF='text-embedding-ada-002'
+# EMBEDDING_MODEL_MAX_CHUNK_LENGTH=1000 # The max chunk size in chars a string to embed can be
+
+# EMBEDDING_ENGINE='ollama'
+# EMBEDDING_BASE_PATH='http://host.docker.internal:11434'
+# EMBEDDING_MODEL_PREF='nomic-embed-text:latest'
+# EMBEDDING_MODEL_MAX_CHUNK_LENGTH=8192
+
+# EMBEDDING_ENGINE='lmstudio'
+# EMBEDDING_BASE_PATH='https://host.docker.internal:1234/v1'
+# EMBEDDING_MODEL_PREF='nomic-ai/nomic-embed-text-v1.5-GGUF/nomic-embed-text-v1.5.Q4_0.gguf'
+# EMBEDDING_MODEL_MAX_CHUNK_LENGTH=8192
+
+# EMBEDDING_ENGINE='cohere'
+# COHERE_API_KEY=
+# EMBEDDING_MODEL_PREF='embed-english-v3.0'
+
+# EMBEDDING_ENGINE='voyageai'
+# VOYAGEAI_API_KEY=
+# EMBEDDING_MODEL_PREF='voyage-large-2-instruct'
+
+# EMBEDDING_ENGINE='litellm'
+# EMBEDDING_MODEL_PREF='text-embedding-ada-002'
+# EMBEDDING_MODEL_MAX_CHUNK_LENGTH=8192
+# LITE_LLM_BASE_PATH='http://127.0.0.1:4000'
+# LITE_LLM_API_KEY='sk-123abc'
+
+# EMBEDDING_ENGINE='generic-openai'
+# EMBEDDING_MODEL_PREF='text-embedding-ada-002'
+# EMBEDDING_MODEL_MAX_CHUNK_LENGTH=8192
+# EMBEDDING_BASE_PATH='http://127.0.0.1:4000'
+# GENERIC_OPEN_AI_EMBEDDING_API_KEY='sk-123abc'
+# GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS=500
+# GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS=1000
+
+# EMBEDDING_ENGINE='gemini'
+# GEMINI_EMBEDDING_API_KEY=
+# EMBEDDING_MODEL_PREF='text-embedding-004'
+
+# EMBEDDING_ENGINE='openrouter'
+# EMBEDDING_MODEL_PREF='baai/bge-m3'
+# OPENROUTER_API_KEY=''
+
+###########################################
+######## Vector Database Selection ########
+###########################################
+# Enable all below if you are using vector database: LanceDB.
+# VECTOR_DB="lancedb"
+
+# Enable all below if you are using vector database: PGVector. 
+# VECTOR_DB="pgvector" +# PGVECTOR_CONNECTION_STRING="postgresql://dbuser:dbuserpass@localhost:5432/yourdb" +# PGVECTOR_TABLE_NAME="anythingllm_vectors" # optional, but can be defined + +# Enable all below if you are using vector database: Chroma. +# VECTOR_DB="chroma" +# CHROMA_ENDPOINT='http://host.docker.internal:8000' +# CHROMA_API_HEADER="X-Api-Key" +# CHROMA_API_KEY="sk-123abc" + +# Enable all below if you are using vector database: Chroma Cloud. +# VECTOR_DB="chromacloud" +# CHROMACLOUD_API_KEY="ck-your-api-key" +# CHROMACLOUD_TENANT= +# CHROMACLOUD_DATABASE= + +# Enable all below if you are using vector database: Pinecone. +# VECTOR_DB="pinecone" +# PINECONE_API_KEY= +# PINECONE_INDEX= + +# Enable all below if you are using vector database: Weaviate. +# VECTOR_DB="weaviate" +# WEAVIATE_ENDPOINT="http://localhost:8080" +# WEAVIATE_API_KEY= + +# Enable all below if you are using vector database: Qdrant. +# VECTOR_DB="qdrant" +# QDRANT_ENDPOINT="http://localhost:6333" +# QDRANT_API_KEY= + +# Enable all below if you are using vector database: Milvus. +# VECTOR_DB="milvus" +# MILVUS_ADDRESS="http://localhost:19530" +# MILVUS_USERNAME= +# MILVUS_PASSWORD= + +# Enable all below if you are using vector database: Zilliz Cloud. +# VECTOR_DB="zilliz" +# ZILLIZ_ENDPOINT="https://sample.api.gcp-us-west1.zillizcloud.com" +# ZILLIZ_API_TOKEN=api-token-here + +# Enable all below if you are using vector database: Astra DB. +# VECTOR_DB="astra" +# ASTRA_DB_APPLICATION_TOKEN= +# ASTRA_DB_ENDPOINT= + +########################################### +######## Audio Model Selection ############ +########################################### +# (default) use built-in whisper-small model. +# WHISPER_PROVIDER="local" + +# use openai hosted whisper model. 
+# WHISPER_PROVIDER="openai"
+# OPEN_AI_KEY=sk-xxxxxxxx
+
+###########################################
+######## TTS/STT Model Selection ##########
+###########################################
+# TTS_PROVIDER="native"
+
+# TTS_PROVIDER="openai"
+# TTS_OPEN_AI_KEY=sk-example
+# TTS_OPEN_AI_VOICE_MODEL=nova
+
+# TTS_PROVIDER="generic-openai"
+# TTS_OPEN_AI_COMPATIBLE_KEY=sk-example
+# TTS_OPEN_AI_COMPATIBLE_MODEL=tts-1
+# TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL=nova
+# TTS_OPEN_AI_COMPATIBLE_ENDPOINT="https://api.openai.com/v1"
+
+# TTS_PROVIDER="elevenlabs"
+# TTS_ELEVEN_LABS_KEY=
+# TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel
+
+# CLOUD DEPLOYMENT VARIABLES ONLY
+# AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
+# DISABLE_TELEMETRY="false"
+
+###########################################
+######## PASSWORD COMPLEXITY ##############
+###########################################
+# Enforce a password schema for your organization users.
+# Documentation on how to use https://github.com/kamronbatman/joi-password-complexity
+# Default is only 8 char minimum
+# PASSWORDMINCHAR=8
+# PASSWORDMAXCHAR=250
+# PASSWORDLOWERCASE=1
+# PASSWORDUPPERCASE=1
+# PASSWORDNUMERIC=1
+# PASSWORDSYMBOL=1
+# PASSWORDREQUIREMENTS=4
+
+###########################################
+######## ENABLE HTTPS SERVER ##############
+###########################################
+# By enabling this and providing the path/filename for the key and cert,
+# the server will use HTTPS instead of HTTP. 
+#ENABLE_HTTPS="true" +#HTTPS_CERT_PATH="sslcert/cert.pem" +#HTTPS_KEY_PATH="sslcert/key.pem" + +########################################### +######## AGENT SERVICE KEYS ############### +########################################### + +#------ SEARCH ENGINES ------- +#============================= +#------ Google Search -------- https://programmablesearchengine.google.com/controlpanel/create +# AGENT_GSE_KEY= +# AGENT_GSE_CTX= + +#------ SearchApi.io ----------- https://www.searchapi.io/ +# AGENT_SEARCHAPI_API_KEY= +# AGENT_SEARCHAPI_ENGINE=google + +#------ SerpApi ----------- https://serpapi.com/ +# AGENT_SERPAPI_API_KEY= +# AGENT_SERPAPI_ENGINE=google + +#------ Serper.dev ----------- https://serper.dev/ +# AGENT_SERPER_DEV_KEY= + +#------ Bing Search ----------- https://portal.azure.com/ +# AGENT_BING_SEARCH_API_KEY= + +#------ Serply.io ----------- https://serply.io/ +# AGENT_SERPLY_API_KEY= + +#------ SearXNG ----------- https://github.com/searxng/searxng +# AGENT_SEARXNG_API_URL= + +#------ Tavily ----------- https://www.tavily.com/ +# AGENT_TAVILY_API_KEY= + +#------ Exa Search ----------- https://www.exa.ai/ +# AGENT_EXA_API_KEY= + +########################################### +######## Other Configurations ############ +########################################### + +# Disable viewing chat history from the UI and frontend APIs. +# See https://docs.anythingllm.com/configuration#disable-view-chat-history for more information. +# DISABLE_VIEW_CHAT_HISTORY=1 + +# Enable simple SSO passthrough to pre-authenticate users from a third party service. +# See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information. +# SIMPLE_SSO_ENABLED=1 +# SIMPLE_SSO_NO_LOGIN=1 +# SIMPLE_SSO_NO_LOGIN_REDIRECT=https://your-custom-login-url.com (optional) + +# Allow scraping of any IP address in collector - must be string "true" to be enabled +# See https://docs.anythingllm.com/configuration#local-ip-address-scraping for more information. 
+# COLLECTOR_ALLOW_ANY_IP="true"
+
+# Specify the target languages for when using OCR to parse images and PDFs.
+# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
+# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.
+# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol,ita,spa,fra,por,rus,nld,tur,hun,pol
+
+# Runtime flags for built-in puppeteer Chromium instance
+# This is only required on Linux machines running AnythingLLM via Docker
+# and you do not want to use the --cap-add=SYS_ADMIN docker argument
+# ANYTHINGLLM_CHROMIUM_ARGS="--no-sandbox,--disable-setuid-sandbox"
+
+# Disable Swagger API documentation endpoint.
+# Set to "true" to disable the /api/docs endpoint (recommended for production deployments).
+# DISABLE_SWAGGER_DOCS="true"
+
+# Disable MCP cooldown timer for agent calls
+# this can lead to infinite recursive calls of the same function
+# for some model/provider combinations
+# MCP_NO_COOLDOWN="true"
\ No newline at end of file
diff --git a/ansible/anythingllm/anythingllm-collector.service.j2 b/ansible/anythingllm/anythingllm-collector.service.j2
new file mode 100644
index 0000000..c25edef
--- /dev/null
+++ b/ansible/anythingllm/anythingllm-collector.service.j2
@@ -0,0 +1,29 @@
+[Unit]
+Description=AnythingLLM Document Collector
+Documentation=https://docs.anythingllm.com
+After=network.target anythingllm-server.service
+BindsTo=anythingllm-server.service
+
+[Service]
+Type=simple
+User={{ anythingllm_user }}
+Group={{ anythingllm_group }}
+WorkingDirectory={{ anythingllm_directory }}/app/collector
+EnvironmentFile={{ anythingllm_directory }}/app/server/.env
+Environment=NODE_ENV=production
+ExecStart=/usr/bin/node index.js
+Restart=on-failure
+RestartSec=10
+StandardOutput=journal
+StandardError=journal
+SyslogIdentifier=anythingllm-collector
+
+# Security hardening
+NoNewPrivileges=true
+PrivateTmp=true 
+ProtectSystem=strict +ProtectHome=true +ReadWritePaths={{ anythingllm_directory }} + +[Install] +WantedBy=multi-user.target diff --git a/ansible/anythingllm/anythingllm-server.service.j2 b/ansible/anythingllm/anythingllm-server.service.j2 new file mode 100644 index 0000000..bb08e69 --- /dev/null +++ b/ansible/anythingllm/anythingllm-server.service.j2 @@ -0,0 +1,29 @@ +[Unit] +Description=AnythingLLM Server +Documentation=https://docs.anythingllm.com +After=network.target postgresql.service +Wants=anythingllm-collector.service + +[Service] +Type=simple +User={{ anythingllm_user }} +Group={{ anythingllm_group }} +WorkingDirectory={{ anythingllm_directory }}/app/server +Environment=NODE_ENV=production +Environment=SERVER_PORT={{ anythingllm_port }} +ExecStart=/usr/bin/node index.js +Restart=on-failure +RestartSec=10 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=anythingllm-server + +# Security hardening +NoNewPrivileges=true +PrivateTmp=true +ProtectSystem=strict +ProtectHome=true +ReadWritePaths={{ anythingllm_directory }} + +[Install] +WantedBy=multi-user.target diff --git a/ansible/anythingllm/anythingllm_mcp_servers.json.j2 b/ansible/anythingllm/anythingllm_mcp_servers.json.j2 new file mode 100644 index 0000000..7eae782 --- /dev/null +++ b/ansible/anythingllm/anythingllm_mcp_servers.json.j2 @@ -0,0 +1,60 @@ +{ + "mcpServers": { + "upstash-context7": { + "command": "npx", + "args": [ + "-y", + "@upstash/context7-mcp" + ] + }, + "angelia": { + "type": "streamable", + "url": "{{angelia_mcp_url}}", + "headers": { + "Authorization": "Bearer {{angelia_mcp_auth}}" + } + }, + "argos": { + "type": "streamable", + "url": "{{argos_mcp_url}}" + }, + "caliban": { + "type": "streamable", + "url": "{{caliban_mcp_url}}" + }, + "gitea": { + "type": "streamable", + "url": "{{gitea_mcp_url}}" + }, + "github": { + "type": "streamable", + "url": "https://api.githubcopilot.com/mcp/", + "headers": { + "Authorization": "Bearer {{github_personal_access_token}}" + } + 
}, + "grafana": { + "type": "streamable", + "url": "{{grafana_mcp_url}}" + }, + "huggingface": { + "type": "streamable", + "url": "https://huggingface.co/mcp", + "headers": { + "Authorization": "Bearer {{huggingface_mcp_token}}" + } + }, + "korax": { + "type": "streamable", + "url": "{{korax_mcp_url}}" + }, + "neo4j": { + "type": "streamable", + "url": "{{neo4j_mcp_url}}" + }, + "nike": { + "type": "streamable", + "url": "{{nike_mcp_url}}" + } + } +} \ No newline at end of file diff --git a/ansible/anythingllm/deploy.yml b/ansible/anythingllm/deploy.yml new file mode 100644 index 0000000..c46407c --- /dev/null +++ b/ansible/anythingllm/deploy.yml @@ -0,0 +1,276 @@ +--- +- name: Deploy AnythingLLM (Native Node.js) + hosts: ubuntu + become: true + vars: + nodejs_version: "22" + ansible_common_remote_group: "{{ anythingllm_group }}" + allow_world_readable_tmpfiles: true + tasks: + - name: Check if host has anythingllm service + ansible.builtin.set_fact: + has_anythingllm_service: "{{'anythingllm' in services}}" + + - name: Skip hosts without anythingllm service + ansible.builtin.meta: end_host + when: not has_anythingllm_service + + - name: Install build dependencies + ansible.builtin.apt: + name: + - curl + - tar + - build-essential + - python3 + - libpq-dev + state: present + update_cache: true + + - name: Add NodeSource GPG key + ansible.builtin.apt_key: + url: https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key + state: present + + - name: Add NodeSource repository + ansible.builtin.apt_repository: + repo: "deb https://deb.nodesource.com/node_{{ nodejs_version }}.x nodistro main" + state: present + filename: nodesource + + - name: Install Node.js + ansible.builtin.apt: + name: nodejs + state: present + update_cache: true + + - name: Install Yarn globally + ansible.builtin.npm: + name: yarn + global: true + state: present + + - name: Create anythingllm group + ansible.builtin.group: + name: "{{ anythingllm_group }}" + + - name: Create anythingllm user + 
ansible.builtin.user: + name: "{{ anythingllm_user }}" + comment: "AnythingLLM service account" + group: "{{ anythingllm_group }}" + home: "{{ anythingllm_directory }}" + system: true + shell: /bin/bash + + - name: Add remote_user to anythingllm group + ansible.builtin.user: + name: "{{ remote_user }}" + groups: "{{ anythingllm_group }}" + append: true + + - name: Create anythingllm directory + ansible.builtin.file: + path: "{{ anythingllm_directory }}" + owner: "{{ anythingllm_user }}" + group: "{{ anythingllm_group }}" + state: directory + mode: '0750' + + - name: Create app directory + ansible.builtin.file: + path: "{{ anythingllm_directory }}/app" + owner: "{{ anythingllm_user }}" + group: "{{ anythingllm_group }}" + state: directory + mode: '0750' + + - name: Transfer and unarchive AnythingLLM release + ansible.builtin.unarchive: + src: "~/rel/anythingllm_{{ anythingllm_rel }}.tar" + dest: "{{ anythingllm_directory }}/app" + owner: "{{ anythingllm_user }}" + group: "{{ anythingllm_group }}" + mode: '0750' + register: app_unarchive + notify: + - Restart AnythingLLM Server + - Restart AnythingLLM Collector + + - name: Run yarn setup + become_user: "{{ anythingllm_user }}" + ansible.builtin.command: + cmd: yarn setup + chdir: "{{ anythingllm_directory }}/app" + when: app_unarchive.changed + register: yarn_setup + + - name: Create storage directory + ansible.builtin.file: + path: "{{ anythingllm_directory }}/storage" + owner: "{{ anythingllm_user }}" + group: "{{ anythingllm_group }}" + state: directory + mode: '0750' + + - name: Create plugins directory + ansible.builtin.file: + path: "{{ anythingllm_directory }}/storage/plugins" + owner: "{{ anythingllm_user }}" + group: "{{ anythingllm_group }}" + state: directory + mode: '0750' + + - name: Template MCP servers configuration + ansible.builtin.template: + src: anythingllm_mcp_servers.json.j2 + dest: "{{ anythingllm_directory }}/storage/plugins/anythingllm_mcp_servers.json" + owner: "{{ anythingllm_user }}" + 
group: "{{ anythingllm_group }}" + mode: '0600' + notify: + - Restart AnythingLLM Server + - Restart AnythingLLM Collector + + - name: Create hotdir directory + ansible.builtin.file: + path: "{{ anythingllm_directory }}/hotdir" + owner: "{{ anythingllm_user }}" + group: "{{ anythingllm_group }}" + state: directory + mode: '0750' + + - name: Create collector symlink directory + ansible.builtin.file: + path: /srv/collector + owner: "{{ anythingllm_user }}" + group: "{{ anythingllm_group }}" + state: directory + mode: '0755' + + - name: Create hotdir symlink for AnythingLLM path resolution + ansible.builtin.file: + src: "{{ anythingllm_directory }}/hotdir" + dest: /srv/collector/hotdir + owner: "{{ anythingllm_user }}" + group: "{{ anythingllm_group }}" + state: link + + - name: Remove collector's default hotdir directory + ansible.builtin.file: + path: "{{ anythingllm_directory }}/app/collector/hotdir" + state: absent + + - name: Create hotdir symlink for collector + ansible.builtin.file: + src: "{{ anythingllm_directory }}/hotdir" + dest: "{{ anythingllm_directory }}/app/collector/hotdir" + owner: "{{ anythingllm_user }}" + group: "{{ anythingllm_group }}" + state: link + + - name: Template server environment file + ansible.builtin.template: + src: env.j2 + dest: "{{ anythingllm_directory }}/app/server/.env" + owner: "{{ anythingllm_user }}" + group: "{{ anythingllm_group }}" + mode: '0600' + notify: + - Restart AnythingLLM Server + - Restart AnythingLLM Collector + + - name: Configure frontend API base + ansible.builtin.lineinfile: + path: "{{ anythingllm_directory }}/app/frontend/.env" + regexp: "^VITE_API_BASE=" + line: "VITE_API_BASE='/api'" + create: true + owner: "{{ anythingllm_user }}" + group: "{{ anythingllm_group }}" + mode: '0644' + register: frontend_env + + - name: Build frontend + become_user: "{{ anythingllm_user }}" + ansible.builtin.command: + cmd: yarn build + chdir: "{{ anythingllm_directory }}/app/frontend" + when: app_unarchive.changed or 
frontend_env.changed + register: frontend_build + + - name: Remove old server/public directory + ansible.builtin.file: + path: "{{ anythingllm_directory }}/app/server/public" + state: absent + when: frontend_build.changed + + - name: Copy frontend build to server/public + become_user: "{{ anythingllm_user }}" + ansible.builtin.copy: + src: "{{ anythingllm_directory }}/app/frontend/dist/" + dest: "{{ anythingllm_directory }}/app/server/public/" + remote_src: true + owner: "{{ anythingllm_user }}" + group: "{{ anythingllm_group }}" + when: frontend_build.changed + + - name: Generate Prisma client + become_user: "{{ anythingllm_user }}" + ansible.builtin.command: + cmd: npx prisma generate --schema=./prisma/schema.prisma + chdir: "{{ anythingllm_directory }}/app/server" + when: app_unarchive.changed or yarn_setup.changed + + - name: Run Prisma migrations + become_user: "{{ anythingllm_user }}" + ansible.builtin.command: + cmd: npx prisma migrate deploy --schema=./prisma/schema.prisma + chdir: "{{ anythingllm_directory }}/app/server" + when: app_unarchive.changed or yarn_setup.changed + + - name: Create AnythingLLM server systemd service + ansible.builtin.template: + src: anythingllm-server.service.j2 + dest: /etc/systemd/system/anythingllm-server.service + mode: '0644' + notify: + - Reload systemd + - Restart AnythingLLM Server + + - name: Create AnythingLLM collector systemd service + ansible.builtin.template: + src: anythingllm-collector.service.j2 + dest: /etc/systemd/system/anythingllm-collector.service + mode: '0644' + notify: + - Reload systemd + - Restart AnythingLLM Collector + + - name: Enable and start AnythingLLM server + ansible.builtin.systemd: + name: anythingllm-server + enabled: true + state: started + daemon_reload: true + + - name: Enable and start AnythingLLM collector + ansible.builtin.systemd: + name: anythingllm-collector + enabled: true + state: started + daemon_reload: true + + handlers: + - name: Reload systemd + ansible.builtin.systemd: + 
daemon_reload: true + + - name: Restart AnythingLLM Server + ansible.builtin.systemd: + name: anythingllm-server + state: restarted + + - name: Restart AnythingLLM Collector + ansible.builtin.systemd: + name: anythingllm-collector + state: restarted diff --git a/ansible/anythingllm/docker-compose.yml b/ansible/anythingllm/docker-compose.yml new file mode 100644 index 0000000..2d67404 --- /dev/null +++ b/ansible/anythingllm/docker-compose.yml @@ -0,0 +1,393 @@ +networks: + frontend: + driver: bridge + backend: + driver: bridge + monitoring: + driver: bridge + +volumes: + anythingllm_data: + driver: local + postgres_data: + driver: local + prometheus_data: + driver: local + loki_data: + driver: local + grafana_data: + driver: local + +services: + # ============================================ + # PostgreSQL with pgvector Extension + # ============================================ + postgres: + image: pgvector/pgvector:pg17 + container_name: anythingllm-postgres + restart: unless-stopped + environment: + POSTGRES_DB: ${POSTGRES_DB:-anythingllm} + POSTGRES_USER: ${POSTGRES_USER:-anythingllm} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?POSTGRES_PASSWORD is required} + POSTGRES_INITDB_ARGS: "-E UTF8" + volumes: + - postgres_data:/var/lib/postgresql/data + - ./scripts/init-pgvector.sql:/docker-entrypoint-initdb.d/init-pgvector.sql:ro + networks: + - backend + healthcheck: + test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-anythingllm}"] + interval: 10s + timeout: 5s + retries: 5 + deploy: + resources: + limits: + memory: 2G + reservations: + memory: 1G + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + labels: "service=postgres" + + # ============================================ + # AnythingLLM Application + # ============================================ + anythingllm: + image: mintplexlabs/anythingllm:latest + container_name: anythingllm + restart: unless-stopped + cap_add: + - SYS_ADMIN + environment: + # Server Configuration + SERVER_PORT: 
3001 + JWT_SECRET: ${JWT_SECRET:?JWT_SECRET is required} + SIG_KEY: ${SIG_KEY:?SIG_KEY is required} + SIG_SALT: ${SIG_SALT:?SIG_SALT is required} + STORAGE_DIR: /app/server/storage + + # PostgreSQL Configuration + VECTOR_DB: "pgvector" + PGVECTOR_CONNECTION_STRING: "postgresql://${POSTGRES_USER:-anythingllm}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-anythingllm}" + + # LLM Provider - Generic OpenAI (for llama-cpp) + LLM_PROVIDER: "generic-openai" + GENERIC_OPEN_AI_BASE_PATH: ${LLAMACPP_BASE_URL:?LLAMACPP_BASE_URL is required} + GENERIC_OPEN_AI_MODEL_PREF: ${LLAMACPP_MODEL:-llama-3-8b} + GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT: ${LLAMACPP_TOKEN_LIMIT:-8192} + GENERIC_OPEN_AI_API_KEY: ${LLAMACPP_API_KEY:-not-needed} + + # AWS Bedrock Configuration (optional - uncomment if using) + # LLM_PROVIDER: "bedrock" + # AWS_BEDROCK_LLM_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID} + # AWS_BEDROCK_LLM_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY} + # AWS_BEDROCK_LLM_REGION: ${AWS_REGION:-us-east-1} + # AWS_BEDROCK_LLM_MODEL_PREFERENCE: ${BEDROCK_MODEL:-anthropic.claude-3-sonnet-20240229-v1:0} + # AWS_BEDROCK_LLM_MODEL_TOKEN_LIMIT: 200000 + + # Embedding Configuration + EMBEDDING_ENGINE: ${EMBEDDING_ENGINE} + EMBEDDING_MODEL_PREF: ${EMBEDDING_MODEL_PREF} + EMBEDDING_MODEL_MAX_CHUNK_LENGTH: ${EMBEDDING_MODEL_MAX_CHUNK_LENGTH} + EMBEDDING_BASE_PATH: ${EMBEDDING_BASE_PATH} + GENERIC_OPEN_AI_EMBEDDING_API_KEY: ${GENERIC_OPEN_AI_EMBEDDING_API_KEY} + GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS: ${GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS} + GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS: ${GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS} + + # Whisper Configuration + WHISPER_PROVIDER: "local" + + # TTS Configuration + TTS_PROVIDER: "native" + + # Security + DISABLE_TELEMETRY: "true" + + # Logging (JSON format for Loki) + NODE_ENV: production + + # Optional: Enable HTTP logging + # ENABLE_HTTP_LOGGER: "true" + # ENABLE_HTTP_LOGGER_TIMESTAMPS: "true" + volumes: + - anythingllm_data:/app/server/storage + - 
anythingllm_data:/app/collector/hotdir + - anythingllm_data:/app/collector/outputs + networks: + - frontend + - backend + depends_on: + postgres: + condition: service_healthy + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3001/api/ping"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 2G + logging: + driver: "json-file" + options: + max-size: "50m" + max-file: "5" + labels: "service=anythingllm" + + # ============================================ + # HAProxy - Reverse Proxy & Load Balancer + # ============================================ + haproxy: + image: haproxy:2.9-alpine + container_name: anythingllm-haproxy + restart: unless-stopped + ports: + - "80:80" + - "443:443" + - "8404:8404" # HAProxy stats + volumes: + - ./haproxy/haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg:ro + - ./haproxy/certs:/etc/haproxy/certs:ro + - ./haproxy/errors:/etc/haproxy/errors:ro + networks: + - frontend + - monitoring + depends_on: + - anythingllm + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8404/stats"] + interval: 10s + timeout: 5s + retries: 3 + deploy: + resources: + limits: + memory: 512M + reservations: + memory: 256M + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + labels: "service=haproxy" + + # ============================================ + # Prometheus - Metrics Collection + # ============================================ + prometheus: + image: prom/prometheus:latest + container_name: anythingllm-prometheus + restart: unless-stopped + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--storage.tsdb.retention.time=30d' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + - '--web.enable-lifecycle' + volumes: + - 
./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro + - prometheus_data:/prometheus + networks: + - monitoring + - backend + ports: + - "9090:9090" + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"] + interval: 30s + timeout: 10s + retries: 3 + deploy: + resources: + limits: + memory: 2G + reservations: + memory: 1G + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + labels: "service=prometheus" + + # ============================================ + # Postgres Exporter - Database Metrics + # ============================================ + postgres-exporter: + image: prometheuscommunity/postgres-exporter:latest + container_name: anythingllm-postgres-exporter + restart: unless-stopped + environment: + DATA_SOURCE_NAME: "postgresql://${POSTGRES_USER:-anythingllm}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-anythingllm}?sslmode=disable" + networks: + - backend + - monitoring + depends_on: + postgres: + condition: service_healthy + deploy: + resources: + limits: + memory: 256M + reservations: + memory: 128M + logging: + driver: "json-file" + options: + max-size: "5m" + max-file: "2" + labels: "service=postgres-exporter" + + # ============================================ + # cAdvisor - Container Metrics + # ============================================ + cadvisor: + image: gcr.io/cadvisor/cadvisor:latest + container_name: anythingllm-cadvisor + restart: unless-stopped + privileged: true + volumes: + - /:/rootfs:ro + - /var/run:/var/run:ro + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + - /dev/disk/:/dev/disk:ro + networks: + - monitoring + ports: + - "8080:8080" + deploy: + resources: + limits: + memory: 512M + reservations: + memory: 256M + logging: + driver: "json-file" + options: + max-size: "5m" + max-file: "2" + labels: "service=cadvisor" + + # ============================================ + # 
Loki - Log Aggregation + # ============================================ + loki: + image: grafana/loki:latest + container_name: anythingllm-loki + restart: unless-stopped + command: -config.file=/etc/loki/loki-config.yml + volumes: + - ./loki/loki-config.yml:/etc/loki/loki-config.yml:ro + - loki_data:/loki + networks: + - monitoring + ports: + - "3100:3100" + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3100/ready"] + interval: 30s + timeout: 10s + retries: 3 + deploy: + resources: + limits: + memory: 2G + reservations: + memory: 1G + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + labels: "service=loki" + + # ============================================ + # Grafana Alloy - Log Collection + # ============================================ + alloy: + image: grafana/alloy:latest + container_name: anythingllm-alloy + restart: unless-stopped + command: + - run + - /etc/alloy/config.alloy + - --server.http.listen-addr=0.0.0.0:12345 + - --storage.path=/var/lib/alloy/data + volumes: + - ./alloy/config.alloy:/etc/alloy/config.alloy:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + networks: + - monitoring + ports: + - "12345:12345" + depends_on: + - loki + deploy: + resources: + limits: + memory: 512M + reservations: + memory: 256M + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + labels: "service=alloy" + + # ============================================ + # Grafana - Visualization Dashboard + # ============================================ + grafana: + image: grafana/grafana:latest + container_name: anythingllm-grafana + restart: unless-stopped + environment: + GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin} + GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD is required} + GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-simple-json-datasource + GF_SERVER_ROOT_URL: 
${GRAFANA_ROOT_URL:-http://localhost:3000} + GF_USERS_ALLOW_SIGN_UP: "false" + volumes: + - grafana_data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + networks: + - monitoring + - frontend + ports: + - "3000:3000" + depends_on: + - prometheus + - loki + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/api/health"] + interval: 30s + timeout: 10s + retries: 3 + deploy: + resources: + limits: + memory: 1G + reservations: + memory: 512M + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + labels: "service=grafana" diff --git a/ansible/anythingllm/env.j2 b/ansible/anythingllm/env.j2 new file mode 100644 index 0000000..cbb954c --- /dev/null +++ b/ansible/anythingllm/env.j2 @@ -0,0 +1,79 @@ +# AnythingLLM Server Environment Configuration +# Managed by Ansible - Red Panda Approved +# Generated for {{ inventory_hostname }} + +# ============================================ +# Server Configuration +# ============================================ +SERVER_PORT={{ anythingllm_port }} +STORAGE_DIR={{ anythingllm_directory }}/storage + +# ============================================ +# Security +# ============================================ +JWT_SECRET={{ anythingllm_jwt_secret }} +SIG_KEY={{ anythingllm_sig_key }} +SIG_SALT={{ anythingllm_sig_salt }} + +# ============================================ +# PostgreSQL + pgvector (Portia) +# ============================================ +VECTOR_DB=pgvector +PGVECTOR_CONNECTION_STRING=postgresql://{{ anythingllm_db_user }}:{{ anythingllm_db_password }}@{{ anythingllm_db_host }}:{{ anythingllm_db_port }}/{{ anythingllm_db_name }} + +# ============================================ +# LLM Provider - AWS Bedrock +# ============================================ +# LLM_PROVIDER='bedrock' +# AWS_BEDROCK_LLM_ACCESS_KEY_ID= +# AWS_BEDROCK_LLM_ACCESS_KEY= +# 
AWS_BEDROCK_LLM_REGION=us-west-2 +# AWS_BEDROCK_LLM_MODEL_PREFERENCE=meta.llama3-1-8b-instruct-v1:0 +# AWS_BEDROCK_LLM_MODEL_TOKEN_LIMIT=8191 +# AWS_BEDROCK_LLM_CONNECTION_METHOD=iam +# AWS_BEDROCK_LLM_MAX_OUTPUT_TOKENS=4096 +# AWS_BEDROCK_LLM_SESSION_TOKEN= # Only required if CONNECTION_METHOD is 'sessionToken' +# or even use Short and Long Term API keys +# AWS_BEDROCK_LLM_CONNECTION_METHOD="apiKey" +# AWS_BEDROCK_LLM_API_KEY= + +# ============================================ +# LLM Provider - Generic OpenAI (llama-cpp) +# ============================================ +LLM_PROVIDER=generic-openai +GENERIC_OPEN_AI_BASE_PATH={{ anythingllm_llm_base_url }} +GENERIC_OPEN_AI_MODEL_PREF={{ anythingllm_llm_model }} +GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT={{ anythingllm_llm_token_limit }} +GENERIC_OPEN_AI_API_KEY={{ anythingllm_llm_api_key }} + +# ============================================ +# Embedding Configuration +# ============================================ +EMBEDDING_ENGINE={{ anythingllm_embedding_engine }} +EMBEDDING_MODEL_PREF={{ anythingllm_embedding_model }} + +# ============================================ +# TTS Configuration (FastKokoro) +# ============================================ +TTS_PROVIDER={{ anythingllm_tts_provider }} +{% if anythingllm_tts_provider == 'openai' %} +TTS_OPEN_AI_KEY={{ anythingllm_tts_api_key }} +TTS_OPEN_AI_ENDPOINT={{ anythingllm_tts_endpoint }} +TTS_OPEN_AI_MODEL={{ anythingllm_tts_model }} +TTS_OPEN_AI_VOICE={{ anythingllm_tts_voice }} +{% endif %} + +# ============================================ +# Whisper Configuration +# ============================================ +WHISPER_PROVIDER=local + +# use openai hosted whisper model. 
+# WHISPER_PROVIDER="openai" +# OPEN_AI_KEY=sk-xxxxxxxx + +# ============================================ +# Telemetry & Environment +# ============================================ +DISABLE_TELEMETRY=true +NODE_ENV=production diff --git a/ansible/anythingllm/stage.yml b/ansible/anythingllm/stage.yml new file mode 100644 index 0000000..43b2509 --- /dev/null +++ b/ansible/anythingllm/stage.yml @@ -0,0 +1,29 @@ +--- +- name: Stage AnythingLLM release tarball + hosts: localhost + gather_facts: false + vars: + anythingllm_repo_dir: "{{github_dir}}/anything-llm" + archive_path: "{{rel_dir}}/anythingllm_{{anythingllm_rel}}.tar" + + tasks: + - name: Ensure release directory exists + file: + path: "{{rel_dir}}" + state: directory + mode: '755' + + - name: Fetch all remote branches and tags + ansible.builtin.command: git fetch --all + args: + chdir: "{{anythingllm_repo_dir}}" + + - name: Pull latest changes + ansible.builtin.command: git pull + args: + chdir: "{{anythingllm_repo_dir}}" + + - name: Create AnythingLLM archive for specified release + ansible.builtin.command: git archive -o "{{archive_path}}" "{{anythingllm_rel}}" + args: + chdir: "{{anythingllm_repo_dir}}" diff --git a/ansible/apt_update.yml b/ansible/apt_update.yml new file mode 100644 index 0000000..e3526b9 --- /dev/null +++ b/ansible/apt_update.yml @@ -0,0 +1,11 @@ +--- +- name: Update Ubuntu packages + hosts: ubuntu + become: true + + tasks: + - name: Update all packages to the latest version + ansible.builtin.apt: + name: "*" + state: latest + update_cache: true diff --git a/ansible/argos/deploy.yml b/ansible/argos/deploy.yml new file mode 100644 index 0000000..08ae027 --- /dev/null +++ b/ansible/argos/deploy.yml @@ -0,0 +1,76 @@ +--- +- name: Deploy Argos MCP Server + hosts: ubuntu + + handlers: + - name: restart argos + become: true + community.docker.docker_compose_v2: + project_src: "{{argos_directory}}" + state: restarted + + tasks: + - name: Check if host has argos service + ansible.builtin.set_fact: 
+ has_argos_service: "{{ 'argos' in services | default([]) }}" + + - name: Skip hosts without argos service + ansible.builtin.meta: end_host + when: not has_argos_service + + - name: Create argos group + become: true + ansible.builtin.group: + name: "{{argos_group}}" + state: present + + - name: Create argos user + become: true + ansible.builtin.user: + name: "{{argos_user}}" + group: "{{argos_group}}" + system: true + create_home: false + + - name: Add ansible user to argos group + become: true + ansible.builtin.user: + name: "{{ansible_user}}" + groups: "{{argos_group}}" + append: true + + - name: Create argos directory + become: true + ansible.builtin.file: + path: "{{argos_directory}}" + owner: "{{argos_user}}" + group: "{{argos_group}}" + state: directory + mode: '750' + + - name: Transfer and unarchive git archive + become: true + ansible.builtin.unarchive: + src: "~/rel/argos_{{argos_rel}}.tar" + dest: "{{argos_directory}}" + owner: "{{argos_user}}" + group: "{{argos_group}}" + mode: '550' + + - name: Template docker-compose.yml + become: true + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{argos_directory}}/docker-compose.yml" + owner: "{{argos_user}}" + group: "{{argos_group}}" + mode: '550' + notify: restart argos + + - name: Start argos with docker-compose + become: true + community.docker.docker_compose_v2: + project_src: "{{argos_directory}}" + state: present + pull: always + diff --git a/ansible/argos/docker-compose.yml.j2 b/ansible/argos/docker-compose.yml.j2 new file mode 100644 index 0000000..90a650e --- /dev/null +++ b/ansible/argos/docker-compose.yml.j2 @@ -0,0 +1,44 @@ +services: + argos-searxng: + build: . 
+ depends_on: + - kvdb + environment: + - ARGOS_PORT=8000 + - ARGOS_HOST=0.0.0.0 + - ARGOS_SEARXNG_INSTANCES={{argos_searxng_instances}} + - ARGOS_MEMCACHED_HOST=kvdb + - ARGOS_MEMCACHED_PORT=11211 + - ARGOS_CACHE_TTL={{argos_cache_ttl}} + - ARGOS_MAX_RESULTS_DEFAULT={{argos_max_results}} + - ARGOS_REQUEST_TIMEOUT=30.0 + - ARGOS_HEALTH_CHECK_TIMEOUT=5.0 + - ARGOS_LOG_LEVEL={{argos_log_level}} + - ARGOS_ENABLE_STARTUP_HEALTH_CHECK=true + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/live"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + logging: + driver: syslog + options: + syslog-address: "tcp://127.0.0.1:{{argos_syslog_port}}" + syslog-format: "{{syslog_format}}" + tag: "argos-searxng" + ports: + - {{argos_port}}:8000 + restart: unless-stopped + + kvdb: + image: memcached:1.6-trixie + pull_policy: always + command: memcached -m 128 -I 10m + logging: + driver: syslog + options: + syslog-address: "tcp://127.0.0.1:{{argos_syslog_port}}" + syslog-format: "{{syslog_format}}" + tag: "argos-kvdb" + restart: unless-stopped diff --git a/ansible/argos/remove.yml b/ansible/argos/remove.yml new file mode 100644 index 0000000..fc44b0a --- /dev/null +++ b/ansible/argos/remove.yml @@ -0,0 +1,32 @@ +--- +- name: Remove Argos from Dev Environment + hosts: ubuntu + become: true + + tasks: + - name: Check if host has argos service + ansible.builtin.set_fact: + has_argos_service: "{{ 'argos' in services | default([]) }}" + + - name: Skip hosts without argos service + ansible.builtin.meta: end_host + when: not has_argos_service + + - name: Stop and remove Docker containers, volumes, and images + community.docker.docker_compose_v2: + project_src: "{{argos_directory}}" + state: absent + remove_images: all + remove_volumes: true + + - name: Prune Docker images + community.docker.docker_prune: + images: true + images_filters: + dangling: false + + - name: Remove Argos directory + become: true + ansible.builtin.file: + path: "{{argos_directory}}" + 
state: absent \ No newline at end of file diff --git a/ansible/argos/stage.yml b/ansible/argos/stage.yml new file mode 100644 index 0000000..16d06d7 --- /dev/null +++ b/ansible/argos/stage.yml @@ -0,0 +1,34 @@ +--- +- name: Stage Argos release tarball + hosts: localhost + gather_facts: false + vars: + argos_repo_dir: "{{repo_dir}}/argos" + archive_path: "{{rel_dir}}/argos_{{argos_rel}}.tar" + + tasks: + - name: Ensure release directory exists + file: + path: "{{rel_dir}}" + state: directory + mode: '755' + + - name: Fetch all remote branches and tags + ansible.builtin.command: git fetch --all + args: + chdir: "{{argos_repo_dir}}" + + - name: Git pull + ansible.builtin.command: git pull + args: + chdir: "{{argos_repo_dir}}" + + - name: Checkout specified argos release branch or tag + ansible.builtin.command: git checkout "{{argos_rel}}" + args: + chdir: "{{argos_repo_dir}}" + + - name: Create argos archive for specified release + ansible.builtin.command: git archive -o "{{archive_path}}" "{{argos_rel}}" + args: + chdir: "{{argos_repo_dir}}" diff --git a/ansible/arke/.env.example b/ansible/arke/.env.example new file mode 100644 index 0000000..dcf0748 --- /dev/null +++ b/ansible/arke/.env.example @@ -0,0 +1,243 @@ +# Arke Configuration Example +# Copy this file to .env and update with your values + +# ============================================================================ +# Server Configuration +# ============================================================================ +HOST=0.0.0.0 +PORT=8000 +DEBUG=false +LOG_LEVEL=info +RELOAD=false + +# ============================================================================ +# PostgreSQL Database Configuration +# ============================================================================ + +# Database environment variables (same as above) +DB_HOST=localhost +DB_PORT=5432 +DB_NAME=arke +DB_USER=arke +DB_PASSWORD=your_secure_password + +# ============================================================================ +# 
Memcached Configuration +# ============================================================================ +MEMCACHED_HOST=localhost +MEMCACHED_PORT=11211 + +# ============================================================================ +# Multi-Backend Configuration (Environment Variable Format) +# ============================================================================ +# Ansible-friendly configuration using individual environment variables +# No JSON escaping issues, works perfectly with Ansible Vault + +# --- NTTh Backend (Token Pool) --- +# NTTh is treated specially as it manages a pool of tokens with session limits +NTTH_BACKEND_ENABLED=true +NTTH_SESSION_LIMIT=90 +NTTH_SESSION_TTL=3600 +NTTH_TOKEN_CACHE_TTL=82800 + +# NTTh Tokens (numbered, add as many as needed) +NTTH_TOKEN_1_APP_ID=your_app_id_1 +NTTH_TOKEN_1_APP_SECRET=your_secret_1 +NTTH_TOKEN_1_NAME=production-primary + +NTTH_TOKEN_2_APP_ID=your_app_id_2 +NTTH_TOKEN_2_APP_SECRET=your_secret_2 +NTTH_TOKEN_2_NAME=production-backup + +# Add more tokens as needed: +# NTTH_TOKEN_3_APP_ID=your_app_id_3 +# NTTH_TOKEN_3_APP_SECRET=your_secret_3 +# NTTH_TOKEN_3_NAME=production-tertiary + +# --- Standard Backends (OpenAI-Compatible, etc.) 
--- +# Backend 1: Nyx (llama-cpp instance) +BACKEND_1_NAME=nyx +BACKEND_1_TYPE=openai-compatible +BACKEND_1_ENABLED=true +BACKEND_1_BASE_URL=http://nyx.helu.ca:8080/v1 +BACKEND_1_API_KEY=not-needed +BACKEND_1_MODEL_PREFIX=nyx +BACKEND_1_TIMEOUT=60 + +# Backend 2: Athena (llama-cpp instance) +BACKEND_2_NAME=athena +BACKEND_2_TYPE=openai-compatible +BACKEND_2_ENABLED=true +BACKEND_2_BASE_URL=http://athena.helu.ca:8080/v1 +BACKEND_2_API_KEY=not-needed +BACKEND_2_MODEL_PREFIX=athena +BACKEND_2_TIMEOUT=60 + +# ============================================================================ +# Future Backend Examples (Reference Only - Not Active) +# ============================================================================ +# These examples show how to configure other backend types when needed + +# --- Anthropic Backend Example --- +# BACKEND_3_NAME=anthropic +# BACKEND_3_TYPE=anthropic +# BACKEND_3_ENABLED=true +# BACKEND_3_BASE_URL=https://api.anthropic.com +# BACKEND_3_API_KEY=sk-ant-api03-xxxxx +# BACKEND_3_MODEL_PREFIX=anthropic +# BACKEND_3_TIMEOUT=60 + +# --- Azure OpenAI Backend Example --- +# BACKEND_4_NAME=azure-openai +# BACKEND_4_TYPE=azure-openai +# BACKEND_4_ENABLED=true +# BACKEND_4_BASE_URL=https://your-resource.openai.azure.com +# BACKEND_4_API_KEY=your-azure-key +# BACKEND_4_MODEL_PREFIX=azure +# BACKEND_4_DEPLOYMENT_NAME=gpt-4 +# BACKEND_4_API_VERSION=2024-02-15-preview +# BACKEND_4_TIMEOUT=60 + +# --- AWS Bedrock Backend Example --- +# BACKEND_5_NAME=bedrock +# BACKEND_5_TYPE=bedrock +# BACKEND_5_ENABLED=true +# BACKEND_5_AWS_REGION=us-east-1 +# BACKEND_5_AWS_ACCESS_KEY_ID=AKIA... +# BACKEND_5_AWS_SECRET_ACCESS_KEY=secret... +# BACKEND_5_MODEL_PREFIX=bedrock +# BACKEND_5_TIMEOUT=60 + +# --- OpenAI Direct Backend Example --- +# BACKEND_6_NAME=openai +# BACKEND_6_TYPE=openai-compatible +# BACKEND_6_ENABLED=true +# BACKEND_6_BASE_URL=https://api.openai.com/v1 +# BACKEND_6_API_KEY=sk-... 
+# BACKEND_6_MODEL_PREFIX=openai +# BACKEND_6_TIMEOUT=60 + +# ============================================================================ +# Embedding Provider Configuration +# ============================================================================ +# Choose your embedding provider: 'ollama' or 'openai' +EMBEDDING_PROVIDER=ollama + +# --- Ollama Configuration (when EMBEDDING_PROVIDER=ollama) --- +OLLAMA_HOST=nyx.helu.ca +OLLAMA_PORT=11434 +EMBEDDING_MODEL=nomic-embed-text + +# --- OpenAI-Compatible Configuration (when EMBEDDING_PROVIDER=openai) --- +# Works with OpenAI API, llama-cpp, LocalAI, and other compatible services +OPENAI_EMBEDDING_BASE_URL=http://localhost:8080 +OPENAI_EMBEDDING_API_KEY= +OPENAI_EMBEDDING_MODEL=text-embedding-ada-002 + +# --- Common Embedding Configuration --- +EMBEDDING_TIMEOUT=30.0 + +# --- Batch Chunking Configuration (for llama-cpp) --- +# These settings optimize embedding requests for llama-cpp's context limits +EMBEDDING_BATCH_SIZE=512 +EMBEDDING_UBATCH_SIZE=512 +EMBEDDING_MAX_CONTEXT=8192 + +# ============================================================================ +# Memory System Configuration +# ============================================================================ +MEMORY_ENABLED=true +MAX_CONTEXT_TOKENS=8000 +SIMILARITY_THRESHOLD=0.7 +MIN_IMPORTANCE_SCORE=0.7 + +# ============================================================================ +# Message Size Limits +# ============================================================================ +# Maximum tokens allowed for incoming messages (default: 32768) +# This limit prevents excessively large requests that could overwhelm the system +MESSAGE_MAX_TOKENS=32768 + +# ============================================================================ +# Background Task Configuration (Async Embedding Generation) +# ============================================================================ +# Enable background task processing for async operations 
+BACKGROUND_TASKS_ENABLED=true + +# Number of worker threads for background tasks +BACKGROUND_TASK_WORKERS=5 + +# Maximum retry attempts for failed tasks +BACKGROUND_TASK_MAX_RETRIES=3 + +# Initial retry delay in seconds (uses exponential backoff) +BACKGROUND_TASK_RETRY_DELAY=1.0 + +# Cleanup interval for old completed/failed tasks (hours) +BACKGROUND_TASK_CLEANUP_HOURS=24 + +# --- Async Embedding Configuration --- +# Enable async embedding generation (non-blocking) +ASYNC_EMBEDDINGS_ENABLED=true + +# Number of messages to batch together for embedding generation +ASYNC_EMBEDDING_BATCH_SIZE=50 + +# Priority level for embedding tasks: LOW, NORMAL, HIGH, CRITICAL +ASYNC_EMBEDDING_PRIORITY=NORMAL + +# --- Async Deduplication Configuration --- +# Enable async document enhancement (non-blocking embedding generation for deduplicated documents) +ASYNC_DEDUPLICATION_ENABLED=true + +# Number of documents to batch together for enhancement +DEDUPLICATION_BATCH_SIZE=20 + +# Priority level for document enhancement tasks: LOW, NORMAL, HIGH, CRITICAL +DEDUPLICATION_ENHANCEMENT_PRIORITY=NORMAL + +# Enable HTML content extraction and processing +HTML_CONTENT_EXTRACTION=true + +# Minimum token count for document deduplication +MIN_TOKENS_FOR_DEDUP=500 + +# Semantic similarity threshold for duplicate detection (0.0-1.0) +DEDUPLICATION_THRESHOLD=0.95 + +# Reference expansion strategy: smart, full, summary, minimal +REFERENCE_EXPANSION_STRATEGY=smart + +# ============================================================================ +# Monitoring Configuration +# ============================================================================ +PROMETHEUS_ENABLED=true +METRICS_PORT=9090 + +# ============================================================================ +# Example Configurations for Different Setups +# ============================================================================ + +# Example 1: Using Ollama (default) +# EMBEDDING_PROVIDER=ollama +# OLLAMA_HOST=localhost +# 
OLLAMA_PORT=11434 +# EMBEDDING_MODEL=nomic-embed-text + +# Example 2: Using llama-cpp with OpenAI-compatible API +# EMBEDDING_PROVIDER=openai +# OPENAI_EMBEDDING_BASE_URL=http://localhost:8080 +# OPENAI_EMBEDDING_MODEL=text-embedding-ada-002 +# OPENAI_EMBEDDING_API_KEY= # Optional, leave empty if not required + +# Example 3: Using actual OpenAI API +# EMBEDDING_PROVIDER=openai +# OPENAI_EMBEDDING_BASE_URL=https://api.openai.com +# OPENAI_EMBEDDING_MODEL=text-embedding-3-small +# OPENAI_EMBEDDING_API_KEY=sk-your-openai-api-key + +# Example 4: Using LocalAI +# EMBEDDING_PROVIDER=openai +# OPENAI_EMBEDDING_BASE_URL=http://localhost:8080 +# OPENAI_EMBEDDING_MODEL=bert-embeddings +# OPENAI_EMBEDDING_API_KEY= # Optional diff --git a/ansible/arke/.env.j2 b/ansible/arke/.env.j2 new file mode 100644 index 0000000..8f861b8 --- /dev/null +++ b/ansible/arke/.env.j2 @@ -0,0 +1,147 @@ +# Arke Environment Configuration +# Edit these values as needed before deployment + +# ============================================================================ +# Server Configuration +# ============================================================================ +HOST=0.0.0.0 +PORT={{ arke_port }} +DEBUG=false +LOG_LEVEL=info +RELOAD={{ arke_reload | default('false') }} + +# ============================================================================ +# PostgreSQL Database Configuration +# ============================================================================ +DB_HOST={{ arke_db_host }} +DB_PORT={{ arke_db_port }} +DB_NAME={{ arke_db_name }} +DB_USER={{ arke_db_user }} +DB_PASSWORD={{ arke_db_password }} + +# ============================================================================ +# Memcached Configuration +# ============================================================================ +MEMCACHED_HOST={{ arke_memcached_host | default('localhost') }} +MEMCACHED_PORT={{ arke_memcached_port | default('11211') }} + +# 
============================================================================ +# NTTh API Configuration +# ============================================================================ +# --- NTTh Backend (Token Pool) --- +# NTTh is treated specially as it manages a pool of tokens with session limits +NTTH_BACKEND_ENABLED=true +NTTH_SESSION_LIMIT=90 +NTTH_SESSION_TTL=3600 +NTTH_TOKEN_CACHE_TTL=82800 + +# NTTh Tokens (numbered, add as many as needed) +NTTH_TOKEN_1_NAME={{ntth_token_1_app_name}} +NTTH_TOKEN_1_APP_ID={{ntth_token_1_app_id}} +NTTH_TOKEN_1_APP_SECRET={{ntth_token_1_app_secret}} + +NTTH_TOKEN_2_NAME={{ntth_token_2_app_name}} +NTTH_TOKEN_2_APP_ID={{ntth_token_2_app_id}} +NTTH_TOKEN_2_APP_SECRET={{ntth_token_2_app_secret}} + +NTTH_TOKEN_3_NAME={{ntth_token_3_app_name}} +NTTH_TOKEN_3_APP_ID={{ntth_token_3_app_id}} +NTTH_TOKEN_3_APP_SECRET={{ntth_token_3_app_secret}} + +NTTH_TOKEN_4_NAME={{ntth_token_4_app_name}} +NTTH_TOKEN_4_APP_ID={{ntth_token_4_app_id}} +NTTH_TOKEN_4_APP_SECRET={{ntth_token_4_app_secret}} + +# Session Management +SESSION_LIMIT={{ arke_session_limit | default('90') }} +SESSION_TTL={{ arke_session_ttl | default('3600') }} +TOKEN_CACHE_TTL={{ arke_token_cache_ttl | default('82800') }} + +# ============================================================================ +# Embedding Provider Configuration +# ============================================================================ +# Choose your embedding provider: 'ollama' or 'openai' +EMBEDDING_PROVIDER={{arke_embedding_provider}} + +# --- OpenAI-Compatible Configuration (when EMBEDDING_PROVIDER=openai) --- +# Works with OpenAI API, llama-cpp, LocalAI, and other compatible services +OPENAI_EMBEDDING_BASE_URL={{arke_openai_embedding_base_url}} +OPENAI_EMBEDDING_API_KEY={{arke_openai_embedding_api_key}} +OPENAI_EMBEDDING_MODEL={{arke_openai_embedding_model}} + +# --- Embedding Configuration --- +EMBEDDING_TIMEOUT={{ arke_embedding_timeout | default('30.0') }} 
+EMBEDDING_BATCH_SIZE={{arke_embedding_batch_size}} +EMBEDDING_UBATCH_SIZE={{arke_embedding_ubatch_size}} +EMBEDDING_MAX_CONTEXT={{arke_embedding_max_context}} + +# ============================================================================ +# Memory System Configuration +# ============================================================================ +MEMORY_ENABLED={{ arke_memory_enabled | default('true') }} +MAX_CONTEXT_TOKENS={{ arke_max_context_tokens | default('8000') }} +SIMILARITY_THRESHOLD={{ arke_similarity_threshold | default('0.7') }} +MIN_IMPORTANCE_SCORE={{ arke_min_importance_score | default('0.7') }} + +# ============================================================================ +# Message Size Limits +# ============================================================================ +# Maximum tokens allowed for incoming messages (default: 32768) +# This limit prevents excessively large requests that could overwhelm the system +MESSAGE_MAX_TOKENS=700000 + +# ============================================================================ +# Background Task Configuration (Async Embedding Generation) +# ============================================================================ +# Enable background task processing for async operations +BACKGROUND_TASKS_ENABLED=true + +# Number of worker threads for background tasks +BACKGROUND_TASK_WORKERS=5 + +# Maximum retry attempts for failed tasks +BACKGROUND_TASK_MAX_RETRIES=3 + +# Initial retry delay in seconds (uses exponential backoff) +BACKGROUND_TASK_RETRY_DELAY=1.0 + +# Cleanup interval for old completed/failed tasks (hours) +BACKGROUND_TASK_CLEANUP_HOURS=24 + +# --- Async Embedding Configuration --- +# Enable async embedding generation (non-blocking) +ASYNC_EMBEDDINGS_ENABLED=true + +# Number of messages to batch together for embedding generation +ASYNC_EMBEDDING_BATCH_SIZE=50 + +# Priority level for embedding tasks: LOW, NORMAL, HIGH, CRITICAL +ASYNC_EMBEDDING_PRIORITY=NORMAL + +# --- Async Deduplication 
Configuration --- +# Enable async document enhancement (non-blocking embedding generation for deduplicated documents) +ASYNC_DEDUPLICATION_ENABLED=true + +# Number of documents to batch together for enhancement +DEDUPLICATION_BATCH_SIZE=20 + +# Priority level for document enhancement tasks: LOW, NORMAL, HIGH, CRITICAL +DEDUPLICATION_ENHANCEMENT_PRIORITY=NORMAL + +# Enable HTML content extraction and processing +HTML_CONTENT_EXTRACTION=true + +# Minimum token count for document deduplication +MIN_TOKENS_FOR_DEDUP=500 + +# Semantic similarity threshold for duplicate detection (0.0-1.0) +DEDUPLICATION_THRESHOLD=0.95 + +# Reference expansion strategy: smart, full, summary, minimal +REFERENCE_EXPANSION_STRATEGY=smart + +# ============================================================================ +# Monitoring Configuration +# ============================================================================ +PROMETHEUS_ENABLED=true +METRICS_PORT={{arke_metrics_port}} diff --git a/ansible/arke/arke.service.j2 b/ansible/arke/arke.service.j2 new file mode 100644 index 0000000..56374d1 --- /dev/null +++ b/ansible/arke/arke.service.j2 @@ -0,0 +1,24 @@ +[Unit] +Description=Arke MCP Server +After=network.target +Wants=network.target + +[Service] +Type=simple +User={{arke_user}} +Group={{arke_group}} +WorkingDirectory={{arke_directory}} +EnvironmentFile={{arke_directory}}/.env +ExecStart={{arke_directory}}/.venv/bin/python {{arke_directory}}/arke.py + +Restart=always +RestartSec=10 + +# Security +NoNewPrivileges=true +PrivateTmp=true +ProtectSystem=strict +ProtectHome=true + +[Install] +WantedBy=multi-user.target diff --git a/ansible/arke/deploy.yml b/ansible/arke/deploy.yml new file mode 100644 index 0000000..c1ae81b --- /dev/null +++ b/ansible/arke/deploy.yml @@ -0,0 +1,181 @@ +--- +- name: Deploy Arke Proxy Server + hosts: arke + vars: + ansible_common_remote_group: "{{arke_group}}" + allow_world_readable_tmpfiles: true + tasks: + - name: Create Arke group + become: true + 
ansible.builtin.group: + name: "{{arke_group}}" + state: present + + - name: Create arke user + become: true + ansible.builtin.user: + name: "{{arke_user}}" + group: "{{arke_group}}" + home: "{{arke_directory}}" + shell: /bin/bash + system: true + create_home: false + + - name: Add remote_user to arke group + become: true + ansible.builtin.user: + name: "{{remote_user}}" + groups: "{{arke_group}}" + append: true + + - name: Create required directories + become: true + ansible.builtin.file: + path: "{{arke_directory}}" + owner: "{{arke_user}}" + group: "{{arke_group}}" + state: directory + mode: '750' + + - name: Ensure tar is installed for unarchive task + become: true + ansible.builtin.apt: + name: + - tar + state: present + update_cache: true + + - name: Ensure Python, Python Dev, Venv module is installed + become: true + ansible.builtin.apt: + name: [python3,python3-venv,python3-dev] + state: present + update_cache: true + + - name: Transfer and unarchive git archive + become: true + ansible.builtin.unarchive: + src: "~/rel/arke_{{arke_rel}}.tar" + dest: "{{arke_directory}}" + owner: "{{arke_user}}" + group: "{{arke_group}}" + mode: '550' + notify: restart arke + + - name: Ensure media directories are writable + become: true + ansible.builtin.file: + path: "{{arke_directory}}/media/generated_images" + owner: "{{arke_user}}" + group: "{{arke_group}}" + state: directory + mode: '750' + + - name: Create virtual environment for Arke + become: true + become_user: "{{arke_user}}" + ansible.builtin.command: + cmd: "python3 -m venv {{arke_directory}}/.venv/" + creates: "{{arke_directory}}/.venv/bin/activate" + + - name: Install wheel in virtual environment + become: true + become_user: "{{arke_user}}" + ansible.builtin.pip: + name: + - wheel + state: latest + virtualenv: "{{arke_directory}}/.venv" + + - name: Install pyproject.toml dependencies in virtualenv + become: true + become_user: "{{arke_user}}" + ansible.builtin.pip: + chdir: "{{arke_directory}}" + name: . 
+ virtualenv: "{{arke_directory}}/.venv" + virtualenv_command: python3 -m venv + + - name: Install Memcached + become: true + ansible.builtin.apt: + name: memcached + state: present + update_cache: true + + - name: Ensure Memcached is running + become: true + ansible.builtin.service: + name: memcached + state: started + enabled: true + + - name: Template Arke .env configuration + become: true + ansible.builtin.template: + src: .env.j2 + dest: "{{arke_directory}}/.env" + owner: "{{arke_user}}" + group: "{{arke_group}}" + mode: '640' + notify: restart arke + + - name: Template systemd service file + become: true + ansible.builtin.template: + src: arke.service.j2 + dest: /etc/systemd/system/arke.service + owner: root + group: root + mode: '644' + notify: restart arke + + - name: Enable and start arke service + become: true + ansible.builtin.systemd: + name: arke + enabled: true + state: started + daemon_reload: true + + - name: Ensure Arke metrics endpoint is open to Prometheus (manual step if not using ufw) + ansible.builtin.debug: + msg: | + Ensure the host's firewall allows inbound TCP on port 8000 from sao.helu.ca for Prometheus scraping. 
+ If using ufw: + sudo ufw allow from to any port 8000 proto tcp + + - name: Reminder - Update Prometheus scrape config on sao.helu.ca + ansible.builtin.debug: + msg: | + Add the following job/target to your Prometheus configuration on sao.helu.ca: + - job_name: 'arke' + static_configs: + - targets: [':{{arke_port}}'] + + - name: Validate Arke health endpoints + ansible.builtin.uri: + url: "http://localhost:{{arke_port}}/health" + status_code: 200 + return_content: true + register: health_check + retries: 5 + delay: 5 + until: health_check.status == 200 + + - name: Validate Arke /metrics endpoint + ansible.builtin.uri: + url: "http://localhost:{{arke_port}}/metrics" + status_code: 200 + return_content: false + register: metrics_check + retries: 5 + delay: 5 + until: metrics_check.status == 200 + + handlers: + - name: restart arke + become: true + ansible.builtin.systemd: + name: arke + state: restarted diff --git a/ansible/arke/remove.yml b/ansible/arke/remove.yml new file mode 100644 index 0000000..356d7e6 --- /dev/null +++ b/ansible/arke/remove.yml @@ -0,0 +1,26 @@ +--- +- name: Remove Arke Proxy Server + hosts: arke + become: true + + tasks: + - name: Stop and disable arke service + ansible.builtin.systemd: + name: arke + state: stopped + enabled: false + ignore_errors: true + + - name: Remove systemd service file + ansible.builtin.file: + path: /etc/systemd/system/arke.service + state: absent + + - name: Reload systemd daemon + ansible.builtin.systemd: + daemon_reload: true + + - name: Remove Arke directory + ansible.builtin.file: + path: "{{arke_directory}}" + state: absent diff --git a/ansible/arke/stage.yml b/ansible/arke/stage.yml new file mode 100644 index 0000000..215fc3c --- /dev/null +++ b/ansible/arke/stage.yml @@ -0,0 +1,29 @@ +--- +- name: Stage Arke release tarball + hosts: localhost + gather_facts: false + vars: + archive_path: "{{rel_dir}}/arke_{{arke_rel}}.tar" + arke_repo_dir: "{{repo_dir}}/arke" + + tasks: + - name: Ensure release directory 
exists + file: + path: "{{rel_dir}}" + state: directory + mode: '755' + + - name: Fetch all remote branches and tags + ansible.builtin.command: git fetch --all + args: + chdir: "{{arke_repo_dir}}" + + - name: Pull latest changes + ansible.builtin.command: git pull + args: + chdir: "{{arke_repo_dir}}" + + - name: Create Arke archive for specified release + ansible.builtin.command: git archive -o "{{archive_path}}" "{{arke_rel}}" + args: + chdir: "{{arke_repo_dir}}" diff --git a/ansible/auth_keys.yml b/ansible/auth_keys.yml new file mode 100644 index 0000000..cad729e --- /dev/null +++ b/ansible/auth_keys.yml @@ -0,0 +1,52 @@ +--- +# SSH Authorized Keys Management +# Deploys authorized_keys to all ubuntu hosts based on ssh_authorized_users variable +# +# Usage: +# ansible-playbook auth_keys.yml +# +# Override exclusive mode (removes unlisted keys): +# ansible-playbook auth_keys.yml -e "ssh_exclusive_mode=true" +# +# Target specific host: +# ansible-playbook auth_keys.yml --limit ariel.incus +# +# Variables defined in: inventory/group_vars/all/auth_keys.yml + +- name: Manage SSH Authorized Keys + hosts: ubuntu + become: true + + tasks: + - name: Ensure .ssh directory exists for each user + ansible.builtin.file: + path: "/home/{{ item.name }}/.ssh" + state: directory + mode: '0700' + owner: "{{ item.name }}" + group: "{{ item.name }}" + loop: "{{ ssh_authorized_users }}" + loop_control: + label: "{{ item.name }}" + + - name: Deploy authorized keys (additive mode) + ansible.posix.authorized_key: + user: "{{ item.0.name }}" + key: "{{ item.1 }}" + state: present + exclusive: false + loop: "{{ ssh_authorized_users | subelements('keys') }}" + loop_control: + label: "{{ item.0.name }}: {{ item.1 | truncate(50) }}" + when: not ssh_exclusive_mode + + - name: Deploy authorized keys (exclusive mode) + ansible.posix.authorized_key: + user: "{{ item.name }}" + key: "{{ item.keys | join('\n') }}" + state: present + exclusive: true + loop: "{{ ssh_authorized_users }}" + 
loop_control: + label: "{{ item.name }}" + when: ssh_exclusive_mode diff --git a/ansible/caliban/agent_s_env.j2 b/ansible/caliban/agent_s_env.j2 new file mode 100644 index 0000000..a617365 --- /dev/null +++ b/ansible/caliban/agent_s_env.j2 @@ -0,0 +1,32 @@ +# Agent S Environment Configuration +# Source this file to activate the Agent S environment +# Usage: source ~/.agent_s_env + +# Activate Python virtual environment +if [ -f "{{ agent_s_venv }}/bin/activate" ]; then + source "{{ agent_s_venv }}/bin/activate" + echo "✓ Agent S Python environment activated" +fi + +# Set Agent S paths +export AGENT_S_HOME="{{ agent_s_repo }}" +export PATH="{{ agent_s_venv }}/bin:$PATH" + +# Display setup +export DISPLAY=:10.0 + +# Required API Key Vars: +export HF_TOKEN=0000 +export OPENAI_API_KEY=0000 + +# Helpful aliases +alias agent_s_cd='cd {{ agent_s_repo }}' +alias agent_s_start='cd {{ agent_s_repo }} && source {{ agent_s_venv }}/bin/activate' + +echo "Agent S Environment Ready" +echo " Virtual Env: {{ agent_s_venv }}" +echo " Repository: {{ agent_s_repo }}" +echo "" +echo "Quick commands:" +echo " agent_s_cd - Change to Agent S directory" +echo " agent_s_start - Activate environment and change to repo" diff --git a/ansible/caliban/deploy.yml b/ansible/caliban/deploy.yml new file mode 100644 index 0000000..730b140 --- /dev/null +++ b/ansible/caliban/deploy.yml @@ -0,0 +1,347 @@ +--- +- name: Deploy Agent S Computer Use Agent + hosts: agent_s + become: yes + vars: + system_user: "{{ ansible_user }}" + agent_s_venv: "/home/{{ system_user }}/env/agents" + agent_s_repo: "/home/{{ system_user }}/gh/Agent-S" + chrome_deb_url: "https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb" + + tasks: + # Disable snap - doesn't work in containers with AppArmor disabled + - name: Prevent snapd from being installed + copy: + dest: /etc/apt/preferences.d/nosnap.pref + content: | + Package: snapd + Pin: release a=* + Pin-Priority: -10 + mode: '0644' + + - name: Update apt 
cache + apt: + update_cache: yes + cache_valid_time: 3600 + + # Firefox Setup, must be in place before desktop install to remove snap dependency + - name: Create APT keyrings directory + file: + path: /etc/apt/keyrings + state: directory + mode: '0755' + + - name: Download Mozilla APT signing key + get_url: + url: https://packages.mozilla.org/apt/repo-signing-key.gpg + dest: /etc/apt/keyrings/packages.mozilla.org.asc + mode: '0644' + + - name: Add Mozilla APT repository + apt_repository: + repo: "deb [signed-by=/etc/apt/keyrings/packages.mozilla.org.asc] https://packages.mozilla.org/apt mozilla main" + filename: mozilla + state: present + + - name: Set Firefox package priority to prefer Mozilla repo + copy: + dest: /etc/apt/preferences.d/mozilla + content: | + Package: * + Pin: origin packages.mozilla.org + Pin-Priority: 1000 + mode: '0644' + + - name: Update apt cache after adding Mozilla repo + apt: + update_cache: yes + + - name: Install Firefox from Mozilla repo + apt: + name: firefox + state: present + + # Desktop Environment - MATE for better AT-SPI accessibility support + - name: Install MATE desktop environment + apt: + name: + - ubuntu-mate-desktop + state: present + + # AT-SPI Accessibility Stack + - name: Install AT-SPI accessibility infrastructure + apt: + name: + - at-spi2-core + - libatk-adaptor + - libatk1.0-0 + - libatk-bridge2.0-0 + state: present + + - name: Configure AT-SPI environment for accessibility + copy: + dest: /etc/profile.d/atspi.sh + content: | + # Enable AT-SPI accessibility bridge + export GTK_MODULES=gail:atk-bridge + export NO_AT_BRIDGE=0 + export ACCESSIBILITY_ENABLED=1 + mode: '0644' + + - name: Configure GPU environment for direct rendering + copy: + dest: /etc/profile.d/gpu.sh + content: | + # Force GPU rendering via AMD render node + export DRI_PRIME=1 + export LIBVA_DRIVER_NAME=radeonsi + export MESA_LOADER_DRIVER_OVERRIDE=radeonsi + # Chrome/Chromium GPU flags + export CHROMIUM_FLAGS="--enable-gpu-rasterization 
--enable-zero-copy --use-gl=egl" + mode: '0644' + + # Sound Support + - name: Install sound support packages + apt: + name: + - git + - libpulse-dev + - autoconf + - m4 + - intltool + - build-essential + - dpkg-dev + state: present + + # Mouse, Assistive Technology, and Python + - name: Install assistive technology and Python packages + apt: + name: + - python3-tk + - python3-dev + - python3-pyatspi + - python3-gi + - gnome-screenshot + - python3-venv + - python3-pip + state: present + + # OCR + - name: Install OCR support + apt: + name: + - tesseract-ocr + state: present + + # GPU Drivers - AMD Mesa (radeonsi/RADV) + - name: Install AMD GPU drivers and utilities + apt: + name: + - mesa-utils + - mesa-utils-extra + - mesa-vulkan-drivers + - vulkan-tools + - libgl1-mesa-dri + - libglx-mesa0 + - libglu1-mesa + - libdrm2 + - libdrm-amdgpu1 + - libegl1 + - libegl-mesa0 + - libgbm1 + - vainfo + - mesa-va-drivers + state: present + + # VirtualGL for GPU-accelerated remote rendering + - name: Check if VirtualGL is installed + command: dpkg -s virtualgl + register: virtualgl_check + failed_when: false + changed_when: false + + - name: Download VirtualGL + get_url: + url: https://github.com/VirtualGL/virtualgl/releases/download/3.1.2/virtualgl_3.1.2_amd64.deb + dest: /tmp/virtualgl.deb + mode: '0644' + when: virtualgl_check.rc != 0 + + - name: Install VirtualGL + apt: + deb: /tmp/virtualgl.deb + state: present + when: virtualgl_check.rc != 0 + + # GPU Permissions - Add user to video and render groups for DRI access + - name: Add user to video group for GPU access + user: + name: "{{ system_user }}" + groups: video + append: yes + + - name: Add user to render group for GPU render node access + user: + name: "{{ system_user }}" + groups: render + append: yes + + - name: Create udev rules for GPU device permissions + copy: + dest: /etc/udev/rules.d/99-gpu-permissions.rules + content: | + # Allow video group access to DRI devices + SUBSYSTEM=="drm", KERNEL=="card*", MODE="0666" 
+ SUBSYSTEM=="drm", KERNEL=="renderD*", MODE="0666" + mode: '0644' + notify: Reload udev + + # Fix GPU permissions on container start (LXC passthrough doesn't honor udev) + - name: Create systemd service to fix GPU permissions on boot + copy: + dest: /etc/systemd/system/fix-gpu-permissions.service + content: | + [Unit] + Description=Fix GPU device permissions for LXC passthrough + After=local-fs.target + + [Service] + Type=oneshot + ExecStart=/bin/chmod 666 /dev/dri/card2 /dev/dri/renderD129 + RemainAfterExit=yes + + [Install] + WantedBy=multi-user.target + mode: '0644' + notify: Reload systemd + + - name: Enable GPU permissions fix service + systemd: + name: fix-gpu-permissions + enabled: yes + state: started + daemon_reload: yes + + # Create dl directory + - name: Create download directory + become: no + file: + path: "/home/{{ system_user }}/dl" + state: directory + mode: '0755' + + # Chrome Installation + - name: Download Google Chrome + get_url: + url: "{{ chrome_deb_url }}" + dest: /tmp/google-chrome-stable_current_amd64.deb + mode: '0644' + + - name: Install Google Chrome + apt: + deb: /tmp/google-chrome-stable_current_amd64.deb + state: present + + - name: Clean up Chrome installer + file: + path: /tmp/google-chrome-stable_current_amd64.deb + state: absent + + # Chrome GPU Configuration - Use ANGLE+Vulkan to bypass broken GLX in XRDP + - name: Create Chrome policies directory + file: + path: /etc/opt/chrome/policies/managed + state: directory + mode: '0755' + + - name: Configure Chrome GPU policy + copy: + dest: /etc/opt/chrome/policies/managed/gpu-policy.json + content: | + { + "HardwareAccelerationModeEnabled": true + } + mode: '0644' + + - name: Create Chrome Vulkan launcher + copy: + dest: /usr/share/applications/google-chrome-vulkan.desktop + content: | + [Desktop Entry] + Version=1.0 + Name=Google Chrome (Vulkan) + GenericName=Web Browser + Exec=/usr/bin/google-chrome-stable --ignore-gpu-blocklist --use-gl=angle --use-angle=vulkan 
--enable-features=Vulkan,DefaultANGLEVulkan,VulkanFromANGLE,CanvasOopRasterization --enable-gpu-rasterization --canvas-oop-rasterization %U + Terminal=false + Icon=google-chrome + Type=Application + Categories=Network;WebBrowser; + mode: '0644' + + # Python Virtual Environment Setup + - name: Create virtual environment directory + become: no + file: + path: "/home/{{ system_user }}/env" + state: directory + mode: '0755' + + - name: Create Python virtual environment with system site packages + become: no + command: python3 -m venv --system-site-packages {{ agent_s_venv }} + args: + creates: "{{ agent_s_venv }}/bin/activate" + + - name: Install Python packages in virtual environment + become: no + pip: + name: + - lxml + - pillow + - setuptools + virtualenv: "{{ agent_s_venv }}" + state: present + + # Clone Agent-S Repository + - name: Create gh directory + become: no + file: + path: "/home/{{ system_user }}/gh" + state: directory + mode: '0755' + + - name: Clone Agent-S repository + become: no + git: + repo: https://github.com/simular-ai/Agent-S.git + dest: "{{ agent_s_repo }}" + version: main + update: yes + + - name: Create environment activation script + become: no + template: + src: agent_s_env.j2 + dest: "/home/{{ system_user }}/.agent_s_env" + mode: '0644' + + - name: Create XRDP Xorg config directory + file: + path: /etc/X11/xrdp + state: directory + mode: '0755' + + - name: Deploy XRDP Xorg configuration for 1024x1024 resolution + template: + src: xorg.conf.j2 + dest: /etc/X11/xrdp/xorg.conf + mode: '0644' + + handlers: + - name: Reload systemd + systemd: + daemon_reload: yes + + - name: Reload udev + shell: udevadm control --reload-rules && udevadm trigger + become: yes \ No newline at end of file diff --git a/ansible/caliban/xorg.conf.j2 b/ansible/caliban/xorg.conf.j2 new file mode 100644 index 0000000..e4c3f81 --- /dev/null +++ b/ansible/caliban/xorg.conf.j2 @@ -0,0 +1,72 @@ +# XRDP Xorg configuration - Fixed 1024x768 resolution for Agent-S / UI-TARS 
compatibility +# Deployed by Ansible to /etc/X11/xrdp/xorg.conf + +Section "ServerLayout" + Identifier "X11 Server" + Screen "Screen (xrdpdev)" + InputDevice "xrdpMouse" "CorePointer" + InputDevice "xrdpKeyboard" "CoreKeyboard" +EndSection + +Section "ServerFlags" + # Prevent other ServerLayout sections from overriding this one + Option "DefaultServerLayout" "X11 Server" + Option "DontVTSwitch" "on" + Option "AutoAddDevices" "off" + Option "AutoAddGPU" "off" +EndSection + +Section "Module" + Load "dbe" + Load "ddc" + Load "extmod" + Load "glx" + Load "int10" + Load "record" + Load "vbe" + Load "xorgxrdp" + Load "fb" +EndSection + +Section "InputDevice" + Identifier "xrdpKeyboard" + Driver "xrdpkeyb" +EndSection + +Section "InputDevice" + Identifier "xrdpMouse" + Driver "xrdpmouse" +EndSection + +Section "Monitor" + Identifier "Monitor" + Option "DPMS" + HorizSync 30-80 + VertRefresh 60-75 + # Fixed resolution for Agent-S / UI-TARS compatibility + Modeline "1024x768" 63.50 1024 1072 1176 1328 768 771 775 798 -hsync +vsync + # Fallback resolutions required by xrdpdev driver + Modeline "800x600" 38.25 800 832 912 1024 600 603 607 624 -hsync +vsync + Modeline "640x480" 23.75 640 664 720 800 480 483 487 500 -hsync +vsync +EndSection + +Section "Device" + Identifier "Video Card (xrdpdev)" + Driver "xrdpdev" + Option "DRMDevice" "/dev/dri/renderD129" + Option "DRI3" "1" + Option "DRMAllowList" "amdgpu" +EndSection + +Section "Screen" + Identifier "Screen (xrdpdev)" + Device "Video Card (xrdpdev)" + Monitor "Monitor" + DefaultDepth 24 + SubSection "Display" + Depth 24 + # Fixed resolution - 1024x768 with fallbacks for xrdpdev driver + Modes "1024x768" "800x600" "640x480" + Virtual 1024 768 + EndSubSection +EndSection diff --git a/ansible/casdoor/app.conf.j2 b/ansible/casdoor/app.conf.j2 new file mode 100644 index 0000000..2c11177 --- /dev/null +++ b/ansible/casdoor/app.conf.j2 @@ -0,0 +1,154 @@ +# 
----------------------------------------------------------------------------- +# Casdoor Application Configuration +# ----------------------------------------------------------------------------- +# Generated by Ansible - do not edit manually +# See: https://casdoor.org/docs/basic/server-installation +# ----------------------------------------------------------------------------- + +appname = casdoor +httpport = {{ casdoor_port | default(8000) }} +runmode = {{ casdoor_runmode | default('prod') }} +copyrequestbody = true + +# ----------------------------------------------------------------------------- +# Database Configuration +# ----------------------------------------------------------------------------- +# Connects to native PostgreSQL on localhost (deployed by postgresql_ssl playbook) + +driverName = postgres +dataSourceName = user={{ casdoor_db_user }} password={{ casdoor_db_password }} host=localhost port={{ casdoor_db_port | default(5432) }} sslmode={{ casdoor_db_sslmode | default('disable') }} dbname={{ casdoor_db_name }} +dbName = {{ casdoor_db_name }} +tableNamePrefix = +showSql = {{ casdoor_showsql | default('false') }} + +# ----------------------------------------------------------------------------- +# Cache Configuration +# ----------------------------------------------------------------------------- + +redisEndpoint = {{ casdoor_redis_endpoint | default('') }} + +# ----------------------------------------------------------------------------- +# Storage Configuration +# ----------------------------------------------------------------------------- +# OCI Object Storage via S3-compatible API + +defaultStorageProvider = {{ casdoor_default_storage_provider | default('') }} +{% if casdoor_s3_endpoint is defined and casdoor_s3_endpoint %} +storageProvider = { + "owner": "admin", + "name": "oci-s3", + "createdTime": "", + "displayName": "OCI Object Storage", + "category": "Storage", + "type": "AWS S3", + "subType": "", + "method": "", + "clientId": "{{ 
casdoor_s3_access_key }}", + "clientSecret": "{{ casdoor_s3_secret_key }}", + "clientId2": "", + "clientSecret2": "", + "cert": "", + "customAuthUrl": "", + "customScope": "", + "customTokenUrl": "", + "customUserInfoUrl": "", + "customLogo": "", + "scopes": "", + "userMapping": null, + "host": "", + "port": 0, + "disableSsl": false, + "title": "", + "content": "", + "receiver": "", + "regionId": "{{ casdoor_s3_region | default('ca-toronto-1') }}", + "signName": "", + "templateCode": "", + "appId": "", + "endpoint": "https://{{ casdoor_s3_endpoint }}", + "intranetEndpoint": "", + "domain": "{{ casdoor_s3_bucket }}", + "bucket": "{{ casdoor_s3_bucket }}", + "pathPrefix": "", + "metadata": "", + "idP": "", + "issuerUrl": "", + "enableSignAuthnRequest": false, + "providerUrl": "" +} +{% endif %} + +# ----------------------------------------------------------------------------- +# Security Configuration +# ----------------------------------------------------------------------------- + +isCloudIntranet = false +authState = {{ casdoor_auth_state | default(casdoor_secret_key) }} +socks5Proxy = +verificationCodeTimeout = 10 +initScore = 0 +logPostOnly = true +isUsernameLowered = false + +# ----------------------------------------------------------------------------- +# Origin Configuration +# ----------------------------------------------------------------------------- +# Must match the external URL used to access Casdoor + +origin = {{ casdoor_origin }} +originFrontend = {{ casdoor_origin_frontend | default(casdoor_origin) }} +staticBaseUrl = "https://cdn.casbin.org" + +# ----------------------------------------------------------------------------- +# Application Settings +# ----------------------------------------------------------------------------- + +isDemoMode = false +batchSize = 100 +enableErrorMask = true +enableGzip = true + +# Session timeout in minutes +inactiveTimeoutMinutes = {{ casdoor_inactive_timeout_minutes | default(60) }} + +# 
----------------------------------------------------------------------------- +# Theme Configuration +# ----------------------------------------------------------------------------- + +themeData = {"themeType": "default", "colorPrimary": "#ffa415", "borderRadius": 6, "isCompact": false} + +# ----------------------------------------------------------------------------- +# LDAP Configuration +# ----------------------------------------------------------------------------- + +ldapServerPort = {{ casdoor_ldap_server_port | default(0) }} +ldapsCertId = {{ casdoor_ldaps_cert_id | default('') }} +ldapsServerPort = {{ casdoor_ldaps_server_port | default(0) }} + +# ----------------------------------------------------------------------------- +# RADIUS Configuration +# ----------------------------------------------------------------------------- + +radiusServerPort = {{ casdoor_radius_server_port | default(0) }} +radiusDefaultOrganization = {{ casdoor_radius_default_organization | default('built-in') }} +radiusSecret = {{ casdoor_radius_secret | default('') }} + +# ----------------------------------------------------------------------------- +# Resource Quotas +# ----------------------------------------------------------------------------- + +quota = {"organization": -1, "user": -1, "application": -1, "provider": -1} + +# ----------------------------------------------------------------------------- +# Logging Configuration +# ----------------------------------------------------------------------------- + +logConfig = {"adapter":"console"} + +# ----------------------------------------------------------------------------- +# Initialization +# ----------------------------------------------------------------------------- + +initDataNewOnly = true +initDataFile = "/conf/init_data.json" +frontendBaseDir = "../cc_0" diff --git a/ansible/casdoor/deploy.yml b/ansible/casdoor/deploy.yml new file mode 100644 index 0000000..33a69bf --- /dev/null +++ b/ansible/casdoor/deploy.yml @@ -0,0 
+1,155 @@ +--- +# ----------------------------------------------------------------------------- +# Casdoor Deployment Playbook +# ----------------------------------------------------------------------------- +# Deploys Casdoor SSO Docker container +# Host: titania.incus (Incus container) +# Endpoint: id.ouranos.helu.ca via HAProxy on Titania +# +# Prerequisites: +# - postgresql_ssl must be deployed first (provides the database) +# - Docker must be installed +# - Alloy must be configured for syslog +# +# Secrets are fetched from Ansible Vault via group_vars/all/vault.yml +# ----------------------------------------------------------------------------- + +- name: Deploy Casdoor + hosts: ubuntu + tasks: + - name: Check if host has casdoor service + ansible.builtin.set_fact: + has_casdoor_service: "{{ 'casdoor' in services | default([]) }}" + + - name: Skip hosts without casdoor service + ansible.builtin.meta: end_host + when: not has_casdoor_service + + # ------------------------------------------------------------------------- + # Create User and Group (system-assigned UID/GID) + # ------------------------------------------------------------------------- + + - name: Create casdoor group + become: true + ansible.builtin.group: + name: "{{ casdoor_group }}" + system: true + + - name: Create casdoor user + become: true + ansible.builtin.user: + name: "{{ casdoor_user }}" + comment: "Casdoor service account" + group: "{{ casdoor_group }}" + system: true + create_home: false + shell: /usr/sbin/nologin + + - name: Add ansible_user to casdoor group + become: true + ansible.builtin.user: + name: "{{ ansible_user }}" + groups: "{{ casdoor_group }}" + append: true + + # ------------------------------------------------------------------------- + # Query uid/gid for Docker container user + # ------------------------------------------------------------------------- + + - name: Get casdoor user uid + ansible.builtin.shell: | + getent passwd {{ casdoor_user }} | cut -d: -f3 + 
register: casdoor_uid_result + changed_when: false + + - name: Get casdoor group gid + ansible.builtin.shell: | + getent group {{ casdoor_group }} | cut -d: -f3 + register: casdoor_gid_result + changed_when: false + + - name: Set uid/gid facts + ansible.builtin.set_fact: + casdoor_uid: "{{ casdoor_uid_result.stdout }}" + casdoor_gid: "{{ casdoor_gid_result.stdout }}" + + # ------------------------------------------------------------------------- + # Create Directories + # ------------------------------------------------------------------------- + + - name: Create casdoor base directory + become: true + ansible.builtin.file: + path: "{{ casdoor_directory }}" + owner: "{{ casdoor_user }}" + group: "{{ casdoor_group }}" + state: directory + mode: '0750' + + - name: Create casdoor conf directory + become: true + ansible.builtin.file: + path: "{{ casdoor_directory }}/conf" + owner: "{{ casdoor_user }}" + group: "{{ casdoor_group }}" + state: directory + mode: '0750' + + # ------------------------------------------------------------------------- + # Template Configuration Files + # ------------------------------------------------------------------------- + + - name: Template docker-compose.yml + become: true + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ casdoor_directory }}/docker-compose.yml" + owner: "{{ casdoor_user }}" + group: "{{ casdoor_group }}" + mode: '0640' + notify: restart casdoor + + - name: Template app.conf + become: true + ansible.builtin.template: + src: app.conf.j2 + dest: "{{ casdoor_directory }}/conf/app.conf" + owner: "{{ casdoor_user }}" + group: "{{ casdoor_group }}" + mode: '0640' + notify: restart casdoor + + - name: Template init_data.json + become: true + ansible.builtin.template: + src: init_data.json.j2 + dest: "{{ casdoor_directory }}/conf/init_data.json" + owner: "{{ casdoor_user }}" + group: "{{ casdoor_group }}" + mode: '0640' + notify: restart casdoor + + # 
------------------------------------------------------------------------- + # Reset SSH Connection (apply group changes) + # ------------------------------------------------------------------------- + + - name: Reset SSH connection to apply group changes + ansible.builtin.meta: reset_connection + + # ------------------------------------------------------------------------- + # Start Services + # ------------------------------------------------------------------------- + + - name: Start Casdoor service + become: true + community.docker.docker_compose_v2: + project_src: "{{ casdoor_directory }}" + state: present + pull: always + + handlers: + - name: restart casdoor + become: true + community.docker.docker_compose_v2: + project_src: "{{ casdoor_directory }}" + state: restarted \ No newline at end of file diff --git a/ansible/casdoor/docker-compose.yml.j2 b/ansible/casdoor/docker-compose.yml.j2 new file mode 100644 index 0000000..f343b7f --- /dev/null +++ b/ansible/casdoor/docker-compose.yml.j2 @@ -0,0 +1,34 @@ +# ----------------------------------------------------------------------------- +# Casdoor Docker Compose +# ----------------------------------------------------------------------------- +# Casdoor SSO - connects to native PostgreSQL on localhost +# Generated by Ansible - do not edit manually +# ----------------------------------------------------------------------------- + +services: + # --------------------------------------------------------------------------- + # Casdoor - SSO Identity Provider + # --------------------------------------------------------------------------- + casdoor: + image: casbin/casdoor:latest + pull_policy: always + container_name: casdoor + network_mode: host # Access localhost PostgreSQL directly + environment: + RUNNING_IN_DOCKER: "true" + user: "{{ casdoor_uid }}:{{ casdoor_gid }}" + volumes: + - ./conf:/conf:ro + logging: + driver: syslog + options: + syslog-address: "tcp://127.0.0.1:{{ casdoor_syslog_port }}" + syslog-format: 
"{{ syslog_format | default('rfc3164') }}" + tag: "casdoor" + restart: unless-stopped + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:{{ casdoor_port }}/api/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s diff --git a/ansible/casdoor/init_data.json.j2 b/ansible/casdoor/init_data.json.j2 new file mode 100644 index 0000000..e774666 --- /dev/null +++ b/ansible/casdoor/init_data.json.j2 @@ -0,0 +1,350 @@ +{ + "organizations": [ + { + "owner": "admin", + "name": "heluca", + "displayName": "Helu.ca", + "websiteUrl": "https://helu.ca", + "favicon": "https://helu.ca/media/images/favicon.original.png", + "logo": "https://helu.ca/media/images/helu-ca_logo.original.svg", + "passwordType": "bcrypt", + "passwordSalt": "", + "passwordOptions": ["AtLeast6"], + "countryCodes": ["CA", "US"], + "defaultAvatar": "", + "defaultApplication": "angelia", + "tags": [], + "languages": ["en", "fr"], + "masterPassword": "", + "defaultPassword": "", + "initScore": 2000, + "enableSoftDeletion": false, + "isProfilePublic": true, + "useEmailAsUsername": true, + "disableSignin": false, + "accountItems": [ + {"name": "Organization", "visible": true, "viewRule": "Public", "modifyRule": "Admin"}, + {"name": "ID", "visible": true, "viewRule": "Public", "modifyRule": "Immutable"}, + {"name": "Name", "visible": true, "viewRule": "Public", "modifyRule": "Admin"}, + {"name": "Display name", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Avatar", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "User type", "visible": true, "viewRule": "Public", "modifyRule": "Admin"}, + {"name": "Password", "visible": true, "viewRule": "Self", "modifyRule": "Self"}, + {"name": "Email", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Phone", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Country code", "visible": true, "viewRule": "Public", "modifyRule": "Admin"}, + 
{"name": "Country/Region", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Location", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Address", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Affiliation", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Title", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Homepage", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Bio", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Roles", "visible": true, "viewRule": "Public", "modifyRule": "Immutable"}, + {"name": "Permissions", "visible": true, "viewRule": "Public", "modifyRule": "Immutable"}, + {"name": "Groups", "visible": true, "viewRule": "Public", "modifyRule": "Admin"}, + {"name": "3rd-party logins", "visible": true, "viewRule": "Self", "modifyRule": "Self"}, + {"name": "Properties", "visible": true, "viewRule": "Admin", "modifyRule": "Admin"}, + {"name": "Is admin", "visible": true, "viewRule": "Admin", "modifyRule": "Admin"}, + {"name": "Is forbidden", "visible": true, "viewRule": "Admin", "modifyRule": "Admin"}, + {"name": "Is deleted", "visible": true, "viewRule": "Admin", "modifyRule": "Admin"}, + {"name": "Multi-factor authentication", "visible": true, "viewRule": "Self", "modifyRule": "Self"}, + {"name": "WebAuthn credentials", "visible": true, "viewRule": "Self", "modifyRule": "Self"}, + {"name": "Managed accounts", "visible": true, "viewRule": "Self", "modifyRule": "Self"} + ] + } + ], + "applications": [ + { + "owner": "admin", + "name": "angelia", + "displayName": "Helu.ca", + "logo": "https://helu.ca/media/images/helu-ca_logo.original.svg", + "homepageUrl": "https://helu.ca", + "organization": "heluca", + "cert": "cert-heluca", + "enablePassword": true, + "enableSignUp": true, + "disableSignin": false, + "clientId": "{{ vault_angelia_oauth_client_id }}", + "clientSecret": "{{ 
vault_angelia_oauth_client_secret }}", + "providers": [], + "signinMethods": [ + {"name": "Password", "displayName": "Password", "rule": "All"}, + {"name": "Verification code", "displayName": "Verification code", "rule": "All"}, + {"name": "WebAuthn", "displayName": "WebAuthn", "rule": "None"} + ], + "signupItems": [ + {"name": "ID", "visible": false, "required": true, "prompted": false, "rule": "Random"}, + {"name": "Email", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Display name", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Password", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Confirm password", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Agreement", "visible": true, "required": true, "prompted": false, "rule": "None"} + ], + "grantTypes": [ + "authorization_code", + "password", + "client_credentials", + "token", + "id_token", + "refresh_token" + ], + "redirectUris": [ + "https://ouranos.helu.ca/callback" + ], + "tokenFormat": "JWT", + "tokenFields": [], + "expireInHours": 168, + "failedSigninLimit": 5, + "failedSigninFrozenTime": 15, + "formCss": "", + "footerHtml": "
Powered by Helu.ca
" + }, + { + "owner": "admin", + "name": "gitea", + "displayName": "Gitea", + "logo": "https://helu.ca/media/images/helu-ca_logo.original.svg", + "homepageUrl": "https://gitea.ouranos.helu.ca", + "organization": "heluca", + "cert": "cert-heluca", + "enablePassword": true, + "enableSignUp": false, + "clientId": "{{ vault_gitea_oauth_client_id }}", + "clientSecret": "{{ vault_gitea_oauth_client_secret }}", + "providers": [], + "signinMethods": [ + {"name": "Password", "displayName": "Password", "rule": "All"} + ], + "signupItems": [ + {"name": "ID", "visible": false, "required": true, "prompted": false, "rule": "Random"}, + {"name": "Email", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Display name", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Password", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Confirm password", "visible": true, "required": true, "prompted": false, "rule": "None"} + ], + "grantTypes": [ + "authorization_code", + "refresh_token" + ], + "redirectUris": [ + "https://gitea.ouranos.helu.ca/user/oauth2/casdoor/callback" + ], + "tokenFormat": "JWT", + "expireInHours": 168, + "formCss": "", + "footerHtml": "
Powered by Helu.ca
" + }, + { + "owner": "admin", + "name": "jupyterlab", + "displayName": "JupyterLab", + "logo": "https://helu.ca/media/images/helu-ca_logo.original.svg", + "homepageUrl": "https://jupyterlab.ouranos.helu.ca", + "organization": "heluca", + "cert": "cert-heluca", + "enablePassword": true, + "enableSignUp": false, + "clientId": "{{ vault_jupyterlab_oauth_client_id }}", + "clientSecret": "{{ vault_jupyterlab_oauth_client_secret }}", + "providers": [], + "signinMethods": [ + {"name": "Password", "displayName": "Password", "rule": "All"} + ], + "signupItems": [ + {"name": "ID", "visible": false, "required": true, "prompted": false, "rule": "Random"}, + {"name": "Email", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Display name", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Password", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Confirm password", "visible": true, "required": true, "prompted": false, "rule": "None"} + ], + "grantTypes": [ + "authorization_code", + "refresh_token" + ], + "redirectUris": [ + "https://jupyterlab.ouranos.helu.ca/oauth2/callback" + ], + "tokenFormat": "JWT", + "expireInHours": 168, + "formCss": "", + "footerHtml": "
Powered by Helu.ca
" + }, + { + "owner": "admin", + "name": "searxng", + "displayName": "SearXNG", + "logo": "https://helu.ca/media/images/helu-ca_logo.original.svg", + "homepageUrl": "https://searxng.ouranos.helu.ca", + "organization": "heluca", + "cert": "cert-heluca", + "enablePassword": true, + "enableSignUp": false, + "clientId": "{{ vault_searxng_oauth_client_id }}", + "clientSecret": "{{ vault_searxng_oauth_client_secret }}", + "providers": [], + "signinMethods": [ + {"name": "Password", "displayName": "Password", "rule": "All"} + ], + "signupItems": [ + {"name": "ID", "visible": false, "required": true, "prompted": false, "rule": "Random"}, + {"name": "Email", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Display name", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Password", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Confirm password", "visible": true, "required": true, "prompted": false, "rule": "None"} + ], + "grantTypes": [ + "authorization_code", + "refresh_token" + ], + "redirectUris": [ + "https://searxng.ouranos.helu.ca/oauth2/callback" + ], + "tokenFormat": "JWT", + "expireInHours": 168, + "formCss": "", + "footerHtml": "
Powered by Helu.ca
" + }, + { + "owner": "admin", + "name": "openwebui", + "displayName": "Open WebUI", + "logo": "https://helu.ca/media/images/helu-ca_logo.original.svg", + "homepageUrl": "https://openwebui.ouranos.helu.ca", + "organization": "heluca", + "cert": "cert-heluca", + "enablePassword": true, + "enableSignUp": false, + "clientId": "{{ vault_openwebui_oauth_client_id }}", + "clientSecret": "{{ vault_openwebui_oauth_client_secret }}", + "providers": [], + "signinMethods": [ + {"name": "Password", "displayName": "Password", "rule": "All"} + ], + "signupItems": [ + {"name": "ID", "visible": false, "required": true, "prompted": false, "rule": "Random"}, + {"name": "Email", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Display name", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Password", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Confirm password", "visible": true, "required": true, "prompted": false, "rule": "None"} + ], + "grantTypes": [ + "authorization_code", + "refresh_token" + ], + "redirectUris": [ + "https://openwebui.ouranos.helu.ca/oauth/oidc/callback" + ], + "tokenFormat": "JWT", + "expireInHours": 168, + "formCss": "", + "footerHtml": "
Powered by Helu.ca
" + } + ], + "users": [ + { + "owner": "heluca", + "name": "robert@helu.ca", + "type": "normal-user", + "password": "ChangeMe!", + "displayName": "Heluca", + "avatar": "", + "email": "robert@helu.ca", + "phone": "", + "countryCode": "CA", + "address": [], + "affiliation": "Helu.ca", + "tag": "owner", + "title": "Owner", + "score": 2000, + "ranking": 1, + "isAdmin": true, + "isForbidden": false, + "isDeleted": false, + "signupApplication": "angelia", + "createdIp": "", + "groups": [] + }, + { + "owner": "heluca", + "name": "r@helu.ca", + "type": "normal-user", + "password": "ChangeMe!", + "displayName": "Robert", + "avatar": "", + "email": "r@helu.ca", + "phone": "", + "countryCode": "CA", + "address": [], + "affiliation": "Helu.ca", + "tag": "sysadmin", + "title": "Owner", + "bio": "", + "score": 2000, + "ranking": 2, + "isAdmin": false, + "isForbidden": false, + "isDeleted": false, + "signupApplication": "angelia", + "createdIp": "", + "groups": [] + } + ], + "providers": [ + { + "owner": "admin", + "name": "provider-email-smtp4dev", + "displayName": "smtp4dev Email", + "category": "Email", + "type": "SMTP", + "host": "{{ smtp_host }}", + "port": {{ smtp_port }}, + "disableSsl": true, + "fromAddress": "{{ smtp_from }}", + "fromName": "{{ smtp_from_name }}", + "clientSecret": "" + } + ], + "certs": [ + { + "owner": "admin", + "name": "cert-built-in", + "displayName": "Built-in Certificate", + "scope": "JWT", + "type": "x509", + "cryptoAlgorithm": "RS256", + "bitSize": 4096, + "expireInYears": 20, + "certificate": "", + "privateKey": "" + }, + { + "owner": "admin", + "name": "cert-heluca", + "displayName": "Helu.ca JWT Certificate", + "scope": "JWT", + "type": "x509", + "cryptoAlgorithm": "RS256", + "bitSize": 4096, + "expireInYears": 20, + "certificate": "", + "privateKey": "" + } + ], + "ldaps": [], + "models": [], + "permissions": [], + "roles": [], + "groups": [], + "adapters": [], + "enforcers": [], + "plans": [], + "pricings": [], + "payments": [], + 
"products": [], + "resources": [], + "syncers": [], + "tokens": [], + "webhooks": [] +} \ No newline at end of file diff --git a/ansible/casdoor/init_data.json.template b/ansible/casdoor/init_data.json.template new file mode 100644 index 0000000..653499e --- /dev/null +++ b/ansible/casdoor/init_data.json.template @@ -0,0 +1,524 @@ +{ + "organizations": [ + { + "owner": "", + "name": "", + "displayName": "", + "websiteUrl": "", + "favicon": "", + "passwordType": "bcrypt", + "passwordSalt": "", + "passwordOptions": [ + "AtLeast6" + ], + "countryCodes": [ + "US", + "GB", + "ES", + "FR", + "DE", + "CN", + "JP", + "KR", + "VN", + "ID", + "SG", + "IN", + "IT", + "MY", + "TR", + "DZ", + "IL", + "PH", + "NL", + "PL", + "FI", + "SE", + "UA", + "KZ", + "CZ", + "SK", + "AZ" + ], + "defaultAvatar": "", + "defaultApplication": "", + "tags": [], + "languages": [ + "en", + "es", + "fr", + "de", + "ja", + "zh", + "vi", + "pt", + "tr", + "pl", + "uk" + ], + "masterPassword": "", + "defaultPassword": "", + "initScore": 2000, + "enableSoftDeletion": false, + "isProfilePublic": true, + "disableSignin": false, + "accountItems": [ + {"name": "Organization", "visible": true, "viewRule": "Public", "modifyRule": "Admin"}, + {"name": "ID", "visible": true, "viewRule": "Public", "modifyRule": "Immutable"}, + {"name": "Name", "visible": true, "viewRule": "Public", "modifyRule": "Admin"}, + {"name": "Display name", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Avatar", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "User type", "visible": true, "viewRule": "Public", "modifyRule": "Admin"}, + {"name": "Password", "visible": true, "viewRule": "Self", "modifyRule": "Self"}, + {"name": "Email", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Phone", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Country code", "visible": true, "viewRule": "Public", "modifyRule": "Admin"}, + {"name": 
"Country/Region", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Location", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Address", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Addresses", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Affiliation", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Title", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "ID card type", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "ID card", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Real name", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "ID verification", "visible": true, "viewRule": "Self", "modifyRule": "Self"}, + {"name": "Homepage", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Bio", "visible": true, "viewRule": "Public", "modifyRule": "Self"}, + {"name": "Tag", "visible": true, "viewRule": "Public", "modifyRule": "Admin"}, + {"name": "Signup application", "visible": true, "viewRule": "Public", "modifyRule": "Admin"}, + {"name": "Register type", "visible": true, "viewRule": "Public", "modifyRule": "Admin"}, + {"name": "Register source", "visible": true, "viewRule": "Public", "modifyRule": "Admin"}, + {"name": "Roles", "visible": true, "viewRule": "Public", "modifyRule": "Immutable"}, + {"name": "Permissions", "visible": true, "viewRule": "Public", "modifyRule": "Immutable"}, + {"name": "Groups", "visible": true, "viewRule": "Public", "modifyRule": "Admin"}, + {"name": "3rd-party logins", "visible": true, "viewRule": "Self", "modifyRule": "Self"}, + {"name": "Properties", "visible": true, "viewRule": "Admin", "modifyRule": "Admin"}, + {"name": "Is admin", "visible": true, "viewRule": "Admin", "modifyRule": "Admin"}, + {"name": "Is forbidden", "visible": true, "viewRule": "Admin", "modifyRule": "Admin"}, + 
{"name": "Is deleted", "visible": true, "viewRule": "Admin", "modifyRule": "Admin"}, + {"name": "Multi-factor authentication", "visible": true, "viewRule": "Self", "modifyRule": "Self"}, + {"name": "WebAuthn credentials", "visible": true, "viewRule": "Self", "modifyRule": "Self"}, + {"name": "Managed accounts", "visible": true, "viewRule": "Self", "modifyRule": "Self"}, + {"name": "MFA accounts", "visible": true, "viewRule": "Self", "modifyRule": "Self"} + ] + } + ], + "applications": [ + { + "owner": "", + "name": "", + "displayName": "", + "logo": "", + "homepageUrl": "", + "organization": "", + "cert": "", + "enablePassword": true, + "enableSignUp": true, + "disableSignin": false, + "clientId": "", + "clientSecret": "", + "providers": [ + { + "name": "", + "canSignUp": true, + "canSignIn": true, + "canUnlink": false, + "prompted": false, + "alertType": "None" + } + ], + "signinMethods": [ + { + "name": "Password", + "displayName": "Password", + "rule": "All" + }, + { + "name": "Verification code", + "displayName": "Verification code", + "rule": "All" + }, + { + "name": "WebAuthn", + "displayName": "WebAuthn", + "rule": "None" + }, + { + "name": "Face ID", + "displayName": "Face ID", + "rule": "None" + } + ], + "signupItems": [ + { + "name": "ID", + "visible": false, + "required": true, + "prompted": false, + "rule": "Random" + }, + { + "name": "Username", + "visible": true, + "required": true, + "prompted": false, + "rule": "None" + }, + { + "name": "Display name", + "visible": true, + "required": true, + "prompted": false, + "rule": "None" + }, + { + "name": "Password", + "visible": true, + "required": true, + "prompted": false, + "rule": "None" + }, + { + "name": "Confirm password", + "visible": true, + "required": true, + "prompted": false, + "rule": "None" + }, + { + "name": "Email", + "visible": true, + "required": true, + "prompted": false, + "rule": "None" + }, + { + "name": "Phone", + "visible": true, + "required": true, + "prompted": false, + "rule": 
"None" + }, + { + "name": "Agreement", + "visible": true, + "required": true, + "prompted": false, + "rule": "None" + } + ], + "grantTypes": [ + "authorization_code", + "password", + "client_credentials", + "token", + "id_token", + "refresh_token" + ], + "redirectUris": [ + "http://localhost:9000/callback" + ], + "tokenFormat": "JWT", + "tokenFields": [], + "expireInHours": 168, + "failedSigninLimit": 5, + "failedSigninFrozenTime": 15 + } + ], + "users": [ + { + "owner": "", + "name": "", + "type": "normal-user", + "password": "", + "displayName": "", + "avatar": "", + "email": "", + "phone": "", + "countryCode": "", + "address": [], + "addresses": [], + "affiliation": "", + "tag": "", + "score": 2000, + "ranking": 1, + "isAdmin": true, + "isForbidden": false, + "isDeleted": false, + "signupApplication": "", + "createdIp": "", + "groups": [] + } + ], + "providers": [ + { + "owner": "", + "name": "", + "displayName": "", + "category": "", + "type": "" + } + ], + "certs": [ + { + "owner": "", + "name": "", + "displayName": "", + "scope": "JWT", + "type": "x509", + "cryptoAlgorithm": "RS256", + "bitSize": 4096, + "expireInYears": 20, + "certificate": "", + "privateKey": "" + } + ], + "ldaps": [ + { + "id": "", + "owner": "", + "serverName": "", + "host": "", + "port": 389, + "username": "", + "password": "", + "baseDn": "", + "autoSync": 0, + "lastSync": "" + } + ], + "models": [ + { + "owner": "", + "name": "", + "modelText": "", + "displayName": "" + } + ], + "permissions": [ + { + "actions": [], + "displayName": "", + "effect": "", + "isEnabled": true, + "model": "", + "name": "", + "owner": "", + "resourceType": "", + "resources": [], + "roles": [], + "users": [] + } + ], + "payments": [ + { + "currency": "", + "detail": "", + "displayName": "", + "invoiceRemark": "", + "invoiceTaxId": "", + "invoiceTitle": "", + "invoiceType": "", + "invoiceUrl": "", + "message": "", + "name": "", + "organization": "", + "owner": "", + "payUrl": "", + "personEmail": "", + 
"personIdCard": "", + "personName": "", + "personPhone": "", + "price": 0, + "productDisplayName": "", + "productName": "", + "provider": "", + "returnUrl": "", + "state": "", + "tag": "", + "type": "", + "user": "" + } + ], + "products": [ + { + "currency": "", + "detail": "", + "displayName": "", + "image": "", + "name": "", + "owner": "", + "price": 0, + "providers": [], + "quantity": 0, + "returnUrl": "", + "sold": 0, + "state": "", + "tag": "" + } + ], + "resources": [ + { + "owner": "", + "name": "", + "user": "", + "provider": "", + "application": "", + "tag": "", + "parent": "", + "fileName": "", + "fileType": "", + "fileFormat": "", + "url": "", + "description": "" + } + ], + "roles": [ + { + "displayName": "", + "isEnabled": true, + "name": "", + "owner": "", + "roles": [], + "users": [] + } + ], + "syncers": [ + { + "affiliationTable": "", + "avatarBaseUrl": "", + "database": "", + "databaseType": "", + "errorText": "", + "host": "", + "isEnabled": false, + "name": "", + "organization": "", + "owner": "", + "password": "", + "port": 0, + "syncInterval": 0, + "table": "", + "tableColumns": [ + { + "casdoorName": "", + "isHashed": true, + "name": "", + "type": "", + "values": [] + } + ], + "tablePrimaryKey": "", + "type": "", + "user": "" + } + ], + "tokens": [ + { + "accessToken": "", + "application": "", + "code": "", + "codeChallenge": "", + "codeExpireIn": 0, + "codeIsUsed": true, + "createdTime": "", + "expiresIn": 0, + "name": "", + "organization": "", + "owner": "", + "refreshToken": "", + "scope": "", + "tokenType": "", + "user": "" + } + ], + "webhooks": [ + { + "contentType": "", + "events": [], + "headers": [ + { + "name": "", + "value": "" + } + ], + "isEnabled": true, + "isUserExtended": true, + "method": "", + "name": "", + "organization": "", + "owner": "", + "url": "" + } + ], + "groups": [ + { + "owner": "", + "name": "", + "displayName": "", + "manager": "", + "contactEmail": "", + "type": "", + "parent_id": "", + "isTopGroup": true, + 
"title": "", + "key": "", + "children": [], + "isEnabled": true + } + ], + "adapters": [ + { + "owner": "", + "name": "", + "table": "", + "useSameDb": true, + "type": "", + "databaseType": "", + "database": "", + "host": "", + "port": 0, + "user": "", + "password": "" + } + ], + "enforcers": [ + { + "owner": "", + "name": "", + "displayName": "", + "description": "", + "model": "", + "adapter": "", + "enforcer": "" + } + ], + "plans": [ + { + "owner": "", + "name": "", + "displayName": "", + "description": "", + "price": 0, + "currency": "", + "period": "", + "product": "", + "paymentProviders": [], + "isEnabled": true, + "role": "" + } + ], + "pricings": [ + { + "owner": "", + "name": "", + "displayName": "", + "description": "", + "plans": [], + "isEnabled": true, + "trialDuration": 0, + "application": "" + } + ] +} diff --git a/ansible/casdoor/remove.yml b/ansible/casdoor/remove.yml new file mode 100644 index 0000000..cd22cbb --- /dev/null +++ b/ansible/casdoor/remove.yml @@ -0,0 +1,75 @@ +--- +# ----------------------------------------------------------------------------- +# Casdoor Removal Playbook +# ----------------------------------------------------------------------------- +# Removes Casdoor SSO including: +# - Docker containers and volumes +# - Configuration files +# - PostgreSQL data directory +# - Service user and group +# +# WARNING: This will permanently delete all Casdoor data including the database! 
+# ----------------------------------------------------------------------------- + +- name: Remove Casdoor + hosts: ubuntu + tasks: + - name: Check if host has casdoor service + ansible.builtin.set_fact: + has_casdoor_service: "{{ 'casdoor' in services | default([]) }}" + + - name: Skip hosts without casdoor service + ansible.builtin.meta: end_host + when: not has_casdoor_service + + # ------------------------------------------------------------------------- + # Stop and Remove Docker Services + # ------------------------------------------------------------------------- + + - name: Check if docker-compose.yml exists + become: true + ansible.builtin.stat: + path: "{{ casdoor_directory }}/docker-compose.yml" + register: compose_file + + - name: Stop and remove Casdoor containers + become: true + community.docker.docker_compose_v2: + project_src: "{{ casdoor_directory }}" + state: absent + remove_volumes: true + when: compose_file.stat.exists + + # ------------------------------------------------------------------------- + # Remove Data Directory + # ------------------------------------------------------------------------- + + - name: Remove casdoor directory and all data + become: true + ansible.builtin.file: + path: "{{ casdoor_directory }}" + state: absent + + # ------------------------------------------------------------------------- + # Remove User and Group + # ------------------------------------------------------------------------- + + - name: Remove ponos from casdoor group + become: true + ansible.builtin.command: + cmd: gpasswd -d ponos {{ casdoor_group }} + register: gpasswd_result + changed_when: gpasswd_result.rc == 0 + failed_when: false + + - name: Remove casdoor user + become: true + ansible.builtin.user: + name: "{{ casdoor_user }}" + state: absent + + - name: Remove casdoor group + become: true + ansible.builtin.group: + name: "{{ casdoor_group }}" + state: absent diff --git a/ansible/certbot/cert-metrics.sh.j2 b/ansible/certbot/cert-metrics.sh.j2 
new file mode 100644 index 0000000..8667f7e --- /dev/null +++ b/ansible/certbot/cert-metrics.sh.j2 @@ -0,0 +1,71 @@ +#!/bin/bash +# Certificate metrics for Prometheus node_exporter textfile collector +# Managed by Ansible - DO NOT EDIT MANUALLY +# +# Writes metrics to: {{ prometheus_node_exporter_text_directory }}/ssl_cert.prom +# Metrics: +# ssl_certificate_expiry_timestamp - Unix timestamp when cert expires +# ssl_certificate_expiry_seconds - Seconds until expiry +# ssl_certificate_valid - 1 if valid, 0 if expired or missing + +set -euo pipefail + +METRICS_DIR="{{ prometheus_node_exporter_text_directory }}" +METRICS_FILE="${METRICS_DIR}/ssl_cert.prom" +CERT_FILE="{{ haproxy_cert_path }}" +DOMAIN="{{ haproxy_domain }}" + +# Create temp file for atomic write +TEMP_FILE=$(mktemp "${METRICS_DIR}/.ssl_cert.prom.XXXXXX") + +# Write metric headers +cat > "${TEMP_FILE}" << 'EOF' +# HELP ssl_certificate_expiry_timestamp Unix timestamp when the SSL certificate expires +# TYPE ssl_certificate_expiry_timestamp gauge +# HELP ssl_certificate_expiry_seconds Seconds until the SSL certificate expires +# TYPE ssl_certificate_expiry_seconds gauge +# HELP ssl_certificate_valid Whether the SSL certificate is valid (1) or expired/missing (0) +# TYPE ssl_certificate_valid gauge +EOF + +if [[ -f "${CERT_FILE}" ]]; then + # Extract expiry date from certificate + EXPIRY_DATE=$(openssl x509 -enddate -noout -in "${CERT_FILE}" 2>/dev/null | cut -d= -f2) + + if [[ -n "${EXPIRY_DATE}" ]]; then + # Convert to Unix timestamp + EXPIRY_TIMESTAMP=$(date -d "${EXPIRY_DATE}" +%s 2>/dev/null || echo "0") + CURRENT_TIMESTAMP=$(date +%s) + EXPIRY_SECONDS=$((EXPIRY_TIMESTAMP - CURRENT_TIMESTAMP)) + + # Check if certificate is valid (not expired) + if [[ ${EXPIRY_SECONDS} -gt 0 ]]; then + VALID=1 + else + VALID=0 + fi + + # Extract issuer for label + ISSUER=$(openssl x509 -issuer -noout -in "${CERT_FILE}" 2>/dev/null | sed 's/.*O = \([^,]*\).*/\1/' | tr -d '"' || echo "unknown") + + # Write metrics + echo 
"ssl_certificate_expiry_timestamp{domain=\"${DOMAIN}\",issuer=\"${ISSUER}\"} ${EXPIRY_TIMESTAMP}" >> "${TEMP_FILE}" + echo "ssl_certificate_expiry_seconds{domain=\"${DOMAIN}\",issuer=\"${ISSUER}\"} ${EXPIRY_SECONDS}" >> "${TEMP_FILE}" + echo "ssl_certificate_valid{domain=\"${DOMAIN}\",issuer=\"${ISSUER}\"} ${VALID}" >> "${TEMP_FILE}" + else + # Could not parse certificate + echo "ssl_certificate_expiry_timestamp{domain=\"${DOMAIN}\",issuer=\"unknown\"} 0" >> "${TEMP_FILE}" + echo "ssl_certificate_expiry_seconds{domain=\"${DOMAIN}\",issuer=\"unknown\"} 0" >> "${TEMP_FILE}" + echo "ssl_certificate_valid{domain=\"${DOMAIN}\",issuer=\"unknown\"} 0" >> "${TEMP_FILE}" + fi +else + # Certificate file does not exist + echo "ssl_certificate_expiry_timestamp{domain=\"${DOMAIN}\",issuer=\"none\"} 0" >> "${TEMP_FILE}" + echo "ssl_certificate_expiry_seconds{domain=\"${DOMAIN}\",issuer=\"none\"} 0" >> "${TEMP_FILE}" + echo "ssl_certificate_valid{domain=\"${DOMAIN}\",issuer=\"none\"} 0" >> "${TEMP_FILE}" +fi + +# Set permissions and atomic move +chmod 644 "${TEMP_FILE}" +chown prometheus:prometheus "${TEMP_FILE}" 2>/dev/null || true +mv "${TEMP_FILE}" "${METRICS_FILE}" \ No newline at end of file diff --git a/ansible/certbot/deploy.yml b/ansible/certbot/deploy.yml new file mode 100644 index 0000000..a83b495 --- /dev/null +++ b/ansible/certbot/deploy.yml @@ -0,0 +1,323 @@ +--- +# ----------------------------------------------------------------------------- +# Certbot Deployment Playbook +# ----------------------------------------------------------------------------- +# Deploys certbot with Namecheap DNS-01 validation for wildcard certificates +# Host: hippocamp.helu.ca (OCI HAProxy instance) +# +# Secrets are fetched automatically from OCI Vault via group_vars/all/secrets.yml +# ----------------------------------------------------------------------------- + +- name: Deploy Certbot with Namecheap DNS-01 Validation + hosts: ubuntu + vars: + ansible_common_remote_group: "{{ 
certbot_group | default(omit) }}" + allow_world_readable_tmpfiles: true + tags: [certbot, ssl, deploy] + + handlers: + - name: restart certbot-renew timer + become: true + ansible.builtin.systemd: + name: certbot-renew.timer + state: restarted + daemon_reload: true + + tasks: + - name: Check if host has certbot service + ansible.builtin.set_fact: + has_certbot_service: "{{ 'certbot' in services | default([]) }}" + + - name: Skip hosts without certbot service + ansible.builtin.meta: end_host + when: not has_certbot_service + + # ------------------------------------------------------------------------- + # System Setup + # ------------------------------------------------------------------------- + + - name: Create certbot group + become: true + ansible.builtin.group: + name: "{{ certbot_group }}" + system: true + + - name: Create certbot user + become: true + ansible.builtin.user: + name: "{{ certbot_user }}" + comment: "Certbot SSL Certificate Management" + group: "{{ certbot_group }}" + system: true + shell: /usr/sbin/nologin + home: "{{ certbot_directory }}" + create_home: false + + - name: Add ansible user to certbot group + become: true + ansible.builtin.user: + name: "{{ ansible_user }}" + groups: "{{ certbot_group }}" + append: true + + # ------------------------------------------------------------------------- + # Directory Structure + # ------------------------------------------------------------------------- + + - name: Create certbot directories + become: true + ansible.builtin.file: + path: "{{ item }}" + owner: "{{ certbot_user }}" + group: "{{ certbot_group }}" + state: directory + mode: '0750' + loop: + - "{{ certbot_directory }}" + - "{{ certbot_directory }}/config" + - "{{ certbot_directory }}/work" + - "{{ certbot_directory }}/logs" + - "{{ certbot_directory }}/credentials" + - "{{ certbot_directory }}/hooks" + + - name: Create haproxy group for certificate directory + become: true + ansible.builtin.group: + name: "{{ haproxy_group | 
default('haproxy') }}" + system: true + + - name: Create haproxy user for certificate directory + become: true + ansible.builtin.user: + name: "{{ haproxy_user | default('haproxy') }}" + comment: "HAProxy Load Balancer" + group: "{{ haproxy_group | default('haproxy') }}" + system: true + shell: /usr/sbin/nologin + home: /nonexistent + create_home: false + + - name: Create certificate output directory + become: true + ansible.builtin.file: + path: /etc/haproxy/certs + owner: "{{ certbot_user }}" + group: "{{ haproxy_group | default('haproxy') }}" + state: directory + mode: '0750' + + # ------------------------------------------------------------------------- + # Python Virtual Environment + # ------------------------------------------------------------------------- + + - name: Install Python venv package + become: true + ansible.builtin.apt: + name: + - python3-venv + - python3-pip + state: present + update_cache: true + + - name: Create virtual environment + become: true + become_user: "{{ certbot_user }}" + ansible.builtin.command: python3 -m venv {{ certbot_directory }}/.venv + args: + creates: "{{ certbot_directory }}/.venv/bin/activate" + vars: + ansible_common_remote_group: "{{ certbot_group }}" + allow_world_readable_tmpfiles: true + + - name: Upgrade pip in virtualenv + become: true + become_user: "{{ certbot_user }}" + ansible.builtin.pip: + name: pip + state: latest + virtualenv: "{{ certbot_directory }}/.venv" + vars: + ansible_common_remote_group: "{{ certbot_group }}" + allow_world_readable_tmpfiles: true + + - name: Install certbot and Namecheap DNS plugin + become: true + become_user: "{{ certbot_user }}" + ansible.builtin.pip: + name: + - certbot + - certbot-dns-namecheap + state: present + virtualenv: "{{ certbot_directory }}/.venv" + vars: + ansible_common_remote_group: "{{ certbot_group }}" + allow_world_readable_tmpfiles: true + + # ------------------------------------------------------------------------- + # Namecheap Credentials + # 
------------------------------------------------------------------------- + + - name: Get public IP for Namecheap API + ansible.builtin.uri: + url: https://ifconfig.me/ip + return_content: true + register: public_ip_result + delegate_to: localhost + become: false + + - name: Set client IP fact + ansible.builtin.set_fact: + namecheap_client_ip: "{{ public_ip_result.content | trim }}" + + - name: Template Namecheap credentials + become: true + ansible.builtin.template: + src: namecheap.ini.j2 + dest: "{{ certbot_directory }}/credentials/namecheap.ini" + owner: "{{ certbot_user }}" + group: "{{ certbot_group }}" + mode: '0600' + + # ------------------------------------------------------------------------- + # Renewal Hooks + # ------------------------------------------------------------------------- + + - name: Template renewal hook script + become: true + ansible.builtin.template: + src: renewal-hook.sh.j2 + dest: "{{ certbot_directory }}/hooks/renewal-hook.sh" + owner: "{{ certbot_user }}" + group: "{{ certbot_group }}" + mode: '0750' + + - name: Template certificate metrics script + become: true + ansible.builtin.template: + src: cert-metrics.sh.j2 + dest: "{{ certbot_directory }}/hooks/cert-metrics.sh" + owner: "{{ certbot_user }}" + group: "{{ certbot_group }}" + mode: '0750' + + # ------------------------------------------------------------------------- + # Initial Certificate Request + # ------------------------------------------------------------------------- + + - name: Check if certificate already exists + become: true + ansible.builtin.stat: + path: "{{ certbot_directory }}/config/live/{{ certbot_cert_name }}/fullchain.pem" + register: cert_exists + + - name: Build domain arguments for certbot + ansible.builtin.set_fact: + certbot_domain_args: "{{ certbot_domains | map('regex_replace', '^', '-d ') | join(' ') }}" + + - name: Request initial certificate + become: true + become_user: "{{ certbot_user }}" + ansible.builtin.shell: | + source {{ 
certbot_directory }}/.venv/bin/activate + certbot certonly \ + --non-interactive \ + --agree-tos \ + --email {{ certbot_email }} \ + --authenticator dns-namecheap \ + --dns-namecheap-credentials {{ certbot_directory }}/credentials/namecheap.ini \ + --dns-namecheap-propagation-seconds 120 \ + --config-dir {{ certbot_directory }}/config \ + --work-dir {{ certbot_directory }}/work \ + --logs-dir {{ certbot_directory }}/logs \ + --cert-name {{ certbot_cert_name }} \ + {{ certbot_domain_args }} + args: + executable: /bin/bash + when: not cert_exists.stat.exists + register: certbot_request + + - name: Run renewal hook after initial certificate + become: true + ansible.builtin.command: "{{ certbot_directory }}/hooks/renewal-hook.sh" + when: certbot_request.changed + + # ------------------------------------------------------------------------- + # Systemd Timer for Auto-Renewal + # ------------------------------------------------------------------------- + + - name: Create certbot renewal service + become: true + ansible.builtin.copy: + content: | + [Unit] + Description=Certbot Renewal + After=network-online.target + Wants=network-online.target + + [Service] + Type=oneshot + User={{ certbot_user }} + Group={{ certbot_group }} + ExecStart=/bin/bash -c 'source {{ certbot_directory }}/.venv/bin/activate && certbot renew --config-dir {{ certbot_directory }}/config --work-dir {{ certbot_directory }}/work --logs-dir {{ certbot_directory }}/logs --deploy-hook {{ certbot_directory }}/hooks/renewal-hook.sh' + PrivateTmp=true + dest: /etc/systemd/system/certbot-renew.service + mode: '0644' + notify: restart certbot-renew timer + + - name: Create certbot renewal timer + become: true + ansible.builtin.copy: + content: | + [Unit] + Description=Run certbot renewal twice daily + + [Timer] + OnCalendar=*-*-* 00,12:00:00 + RandomizedDelaySec=3600 + Persistent=true + + [Install] + WantedBy=timers.target + dest: /etc/systemd/system/certbot-renew.timer + mode: '0644' + notify: restart 
certbot-renew timer + + - name: Enable and start certbot renewal timer + become: true + ansible.builtin.systemd: + name: certbot-renew.timer + enabled: true + state: started + daemon_reload: true + + # ------------------------------------------------------------------------- + # Initial Metrics Update + # ------------------------------------------------------------------------- + + - name: Ensure prometheus textfile directory exists + become: true + ansible.builtin.file: + path: "{{ prometheus_node_exporter_text_directory }}" + state: directory + owner: prometheus + group: prometheus + mode: '0755' + + - name: Run certificate metrics script + become: true + ansible.builtin.command: "{{ certbot_directory }}/hooks/cert-metrics.sh" + changed_when: false + + # ------------------------------------------------------------------------- + # Verification + # ------------------------------------------------------------------------- + + - name: Verify certificate exists + become: true + ansible.builtin.stat: + path: "{{ haproxy_cert_path }}" + register: final_cert + + - name: Certificate deployment status + ansible.builtin.debug: + msg: "Certificate deployed: {{ final_cert.stat.exists }}" \ No newline at end of file diff --git a/ansible/certbot/namecheap.ini.j2 b/ansible/certbot/namecheap.ini.j2 new file mode 100644 index 0000000..6879c20 --- /dev/null +++ b/ansible/certbot/namecheap.ini.j2 @@ -0,0 +1,8 @@ +# Namecheap API credentials for certbot DNS-01 validation +# Managed by Ansible - DO NOT EDIT MANUALLY + +dns_namecheap_username = {{ namecheap_username }} +dns_namecheap_api_key = {{ namecheap_api_key }} +{% if namecheap_client_ip is defined %} +dns_namecheap_client_ip = {{ namecheap_client_ip }} +{% endif %} \ No newline at end of file diff --git a/ansible/certbot/renewal-hook.sh.j2 b/ansible/certbot/renewal-hook.sh.j2 new file mode 100644 index 0000000..641f76c --- /dev/null +++ b/ansible/certbot/renewal-hook.sh.j2 @@ -0,0 +1,52 @@ +#!/bin/bash +# Certbot post-renewal 
hook for HAProxy +# Managed by Ansible - DO NOT EDIT MANUALLY +# +# This script: +# 1. Combines fullchain.pem + privkey.pem into HAProxy format +# 2. Sets correct permissions +# 3. Reloads HAProxy via Docker +# 4. Updates certificate metrics for Prometheus + +set -euo pipefail + +CERT_NAME="{{ certbot_cert_name }}" +CERT_DIR="{{ certbot_directory }}/config/live/${CERT_NAME}" +HAPROXY_CERT="{{ haproxy_cert_path }}" +HAPROXY_DIR="{{ haproxy_directory }}" + +echo "[$(date '+%Y-%m-%d %H:%M:%S')] Starting renewal hook for ${CERT_NAME}" + +# Check if certificate files exist +if [[ ! -f "${CERT_DIR}/fullchain.pem" ]] || [[ ! -f "${CERT_DIR}/privkey.pem" ]]; then + echo "ERROR: Certificate files not found in ${CERT_DIR}" + exit 1 +fi + +# Combine certificate and private key for HAProxy +# HAProxy requires both in a single PEM file +cat "${CERT_DIR}/fullchain.pem" "${CERT_DIR}/privkey.pem" > "${HAPROXY_CERT}.tmp" + +# Atomic move to avoid HAProxy reading partial file +mv "${HAPROXY_CERT}.tmp" "${HAPROXY_CERT}" + +# Set permissions +chown {{ certbot_user }}:{{ haproxy_group }} "${HAPROXY_CERT}" +chmod 640 "${HAPROXY_CERT}" + +echo "[$(date '+%Y-%m-%d %H:%M:%S')] Certificate combined and written to ${HAPROXY_CERT}" + +# Reload HAProxy if running +if docker ps --format '{{ '{{' }}.Names{{ '}}' }}' | grep -q haproxy; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] Reloading HAProxy..." 
+ cd "${HAPROXY_DIR}" + docker compose kill -s HUP haproxy || docker-compose kill -s HUP haproxy + echo "[$(date '+%Y-%m-%d %H:%M:%S')] HAProxy reloaded" +else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] HAProxy not running, skipping reload" +fi + +# Update certificate metrics +{{ certbot_directory }}/hooks/cert-metrics.sh + +echo "[$(date '+%Y-%m-%d %H:%M:%S')] Renewal hook completed successfully" \ No newline at end of file diff --git a/ansible/docker/deploy.yml b/ansible/docker/deploy.yml new file mode 100644 index 0000000..c1ff810 --- /dev/null +++ b/ansible/docker/deploy.yml @@ -0,0 +1,99 @@ +--- +- name: Deploy Docker + hosts: ubuntu + become: true + tasks: + - name: Check if host has docker service + ansible.builtin.set_fact: + has_docker_service: "{{'docker' in services}}" + + - name: Skip hosts without docker service + ansible.builtin.meta: end_host + when: not has_docker_service + + - name: Add Docker repository + ansible.builtin.deb822_repository: + name: docker + types: [deb] + uris: https://download.docker.com/linux/ubuntu + suites: ["{{ ansible_distribution_release }}"] + components: [stable] + signed_by: https://download.docker.com/linux/ubuntu/gpg + state: present + + - name: Update apt and install docker-ce + ansible.builtin.apt: + name: docker-ce + state: latest + update_cache: true + + - name: Enable and start docker service + ansible.builtin.systemd: + name: docker + enabled: true + state: started + + - name: Add ansible_user to docker group + ansible.builtin.user: + name: "{{ansible_user}}" + groups: docker + append: true + + - name: Check if Docker API should be enabled + ansible.builtin.set_fact: + enable_docker_api: "{{ docker_api_enabled | default(false) }}" + + - name: Configure Docker daemon for API exposure + ansible.builtin.copy: + content: | + { + "hosts": ["unix:///var/run/docker.sock", "tcp://{{ docker_api_host }}:{{ docker_api_port }}"], + "log-driver": "json-file", + "log-opts": { + "max-size": "10m", + "max-file": "3" + } + } + dest: 
/etc/docker/daemon.json + owner: root + group: root + mode: '644' + when: enable_docker_api + notify: restart docker + + - name: Create systemd override directory + ansible.builtin.file: + path: /etc/systemd/system/docker.service.d + state: directory + mode: '755' + + - name: Create AppArmor workaround for Incus nested Docker + ansible.builtin.copy: + content: | + [Service] + Environment=container="setmeandforgetme" + dest: /etc/systemd/system/docker.service.d/apparmor-workaround.conf + owner: root + group: root + mode: '644' + notify: restart docker + + - name: Create systemd override for Docker API + ansible.builtin.copy: + content: | + [Service] + ExecStart= + ExecStart=/usr/bin/dockerd + dest: /etc/systemd/system/docker.service.d/override.conf + owner: root + group: root + mode: '644' + when: enable_docker_api + notify: restart docker + + handlers: + - name: restart docker + ansible.builtin.systemd: + name: docker + state: restarted + daemon_reload: true diff --git a/ansible/fetch_secrets.yml.vault_example b/ansible/fetch_secrets.yml.vault_example new file mode 100644 index 0000000..3892d8c --- /dev/null +++ b/ansible/fetch_secrets.yml.vault_example @@ -0,0 +1,19 @@ +--- +# Example: Ansible Vault Implementation +# Replace fetch_secrets.yml with this file if using Ansible Vault +# +# This implementation does nothing because vault variables are automatically +# loaded from inventory/group_vars/all/vault.yml when using --ask-vault-pass +# +# Usage: +# cp fetch_secrets.yml.vault_example fetch_secrets.yml + +- name: Fetch Secrets (Ansible Vault) + hosts: all + gather_facts: false + tasks: + - name: Verify vault variables are loaded + ansible.builtin.debug: + msg: "Using Ansible Vault - secrets loaded from vault.yml" + run_once: true + when: secret_scope is not defined diff --git a/ansible/gitea/app.ini.j2 b/ansible/gitea/app.ini.j2 new file mode 100644 index 0000000..73e5557 --- /dev/null +++ b/ansible/gitea/app.ini.j2 @@ -0,0 +1,166 @@ +; Gitea Configuration File 
+; Generated by Ansible + +APP_NAME = Gitea: Git with a cup of tea +RUN_MODE = prod + +[server] +PROTOCOL = http +DOMAIN = {{ gitea_domain }} +ROOT_URL = {{ gitea_root_url }} +HTTP_ADDR = 0.0.0.0 +HTTP_PORT = {{ gitea_web_port }} +DISABLE_SSH = false +SSH_DOMAIN = {{ gitea_domain }} +SSH_PORT = {{ gitea_ssh_port }} +SSH_LISTEN_PORT = {{ gitea_ssh_port }} +START_SSH_SERVER = true +LFS_START_SERVER = {{ gitea_lfs_enabled | lower }} +LFS_HTTP_AUTH_EXPIRY = 20m +OFFLINE_MODE = false + +[database] +DB_TYPE = {{ gitea_db_type }} +HOST = {{ gitea_db_host }}:{{ gitea_db_port }} +NAME = {{ gitea_db_name }} +USER = {{ gitea_db_user }} +PASSWD = {{ gitea_db_password }} +SSL_MODE = {{ gitea_db_ssl_mode }} +LOG_SQL = false +AUTO_MIGRATION = true + +[repository] +ROOT = {{ gitea_repo_root }} +DEFAULT_BRANCH = main +DEFAULT_PRIVATE = public +ENABLE_PUSH_CREATE_USER = true +ENABLE_PUSH_CREATE_ORG = false +DISABLED_REPO_UNITS = +DEFAULT_REPO_UNITS = repo.code,repo.releases,repo.issues,repo.pulls,repo.wiki,repo.projects,repo.packages + +[repository.signing] +SIGNING_KEY = default +INITIAL_COMMIT = always + +[repository.local] +LOCAL_COPY_PATH = {{ gitea_data_dir }}/tmp/local-repo + +[repository.upload] +TEMP_PATH = {{ gitea_data_dir }}/tmp/uploads + +[lfs] +PATH = {{ gitea_lfs_dir }} + +[security] +INSTALL_LOCK = true +SECRET_KEY = {{ gitea_secret_key }} +MIN_PASSWORD_LENGTH = 8 +PASSWORD_COMPLEXITY = lower,upper,digit +PASSWORD_HASH_ALGO = argon2 +REVERSE_PROXY_LIMIT = 1 +REVERSE_PROXY_TRUSTED_PROXIES = 127.0.0.0/8,::1/128,10.0.0.0/8 + +[service] +DISABLE_REGISTRATION = {{ gitea_disable_registration | lower }} +REQUIRE_SIGNIN_VIEW = {{ gitea_require_signin_view | lower }} +REGISTER_EMAIL_CONFIRM = false +ENABLE_NOTIFY_MAIL = false +DEFAULT_KEEP_EMAIL_PRIVATE = true +DEFAULT_ALLOW_CREATE_ORGANIZATION = true +DEFAULT_ENABLE_TIMETRACKING = true +NO_REPLY_ADDRESS = noreply.{{ gitea_domain }} + +[service.explore] +REQUIRE_SIGNIN_VIEW = {{ gitea_require_signin_view | lower }} 
+DISABLE_USERS_PAGE = false + +[mailer] +ENABLED = true +SMTP_ADDR = {{ smtp_host }} +SMTP_PORT = {{ smtp_port }} +FROM = {{ smtp_from }} + +[session] +PROVIDER = memcache +PROVIDER_CONFIG = 127.0.0.1:11211 +COOKIE_NAME = gitea_session +COOKIE_SECURE = true + +[picture] +AVATAR_UPLOAD_PATH = {{ gitea_data_dir }}/avatars +REPOSITORY_AVATAR_UPLOAD_PATH = {{ gitea_data_dir }}/repo-avatars +DISABLE_GRAVATAR = false + +[attachment] +PATH = {{ gitea_data_dir }}/attachments +MAX_SIZE = 50 +MAX_FILES = 5 + +[log] +MODE = console +LEVEL = Info +ENABLE_SSH_LOG = true +;; Sub-logger modes using new 1.21+ format +logger.router.MODE = console +logger.access.MODE = console + +[log.console] +LEVEL = Info +STDERR = false + +[git] +PATH = /usr/bin/git +DISABLE_DIFF_HIGHLIGHT = false +MAX_GIT_DIFF_LINES = 1000 +MAX_GIT_DIFF_LINE_CHARACTERS = 5000 +MAX_GIT_DIFF_FILES = 100 +GC_ARGS = + +[git.timeout] +DEFAULT = 360 +MIGRATE = 600 +MIRROR = 300 + +[indexer] +ISSUE_INDEXER_TYPE = bleve +ISSUE_INDEXER_PATH = {{ gitea_data_dir }}/indexers/issues.bleve +REPO_INDEXER_ENABLED = true +REPO_INDEXER_TYPE = bleve +REPO_INDEXER_PATH = {{ gitea_data_dir }}/indexers/repos.bleve + +[queue] +TYPE = level +DATADIR = {{ gitea_data_dir }}/queues + +[metrics] +ENABLED = {{ gitea_metrics_enabled | lower }} +ENABLED_ISSUE_BY_LABEL = false +ENABLED_ISSUE_BY_REPOSITORY = false +TOKEN = {{ gitea_metrics_token }} + +[cache] +ADAPTER = memcache +HOST = 127.0.0.1:11211 +ITEM_TTL = 16h + +[webhook] +ALLOWED_HOST_LIST = * + +[oauth2] +ENABLED = true +JWT_SIGNING_ALGORITHM = RS256 +JWT_SECRET = {{ gitea_lfs_jwt_secret }} + +[oauth2_client] +ENABLE_AUTO_REGISTRATION = true +ACCOUNT_LINKING = auto +OPENID_CONNECT_SCOPES = openid profile email +UPDATE_AVATAR = false + +[packages] +ENABLED = true +CHUNKED_UPLOAD_PATH = {{ gitea_data_dir }}/tmp/package-upload + +[actions] +ENABLED = true +DEFAULT_ACTIONS_URL = https://github.com \ No newline at end of file diff --git a/ansible/gitea/deploy.yml 
b/ansible/gitea/deploy.yml new file mode 100644 index 0000000..54b6630 --- /dev/null +++ b/ansible/gitea/deploy.yml @@ -0,0 +1,229 @@ +--- +- name: Deploy Gitea + hosts: gitea + become: true + tasks: + - name: Check if host has gitea service + ansible.builtin.set_fact: + has_gitea_service: "{{ 'gitea' in services | default([]) }}" + + - name: Skip hosts without gitea service + ansible.builtin.meta: end_host + when: not has_gitea_service + + - name: Install required packages + ansible.builtin.apt: + name: + - git + - git-lfs + - curl + - memcached + state: present + update_cache: true + + - name: Ensure Memcached is running + ansible.builtin.service: + name: memcached + state: started + enabled: true + + - name: Create git system group + ansible.builtin.group: + name: "{{ gitea_group }}" + system: true + state: present + + - name: Create git system user + ansible.builtin.user: + name: "{{ gitea_user }}" + group: "{{ gitea_group }}" + system: true + shell: /bin/bash + home: "{{ gitea_home_dir }}" + create_home: true + comment: "Git Version Control" + + - name: Create Gitea directories + ansible.builtin.file: + path: "{{ item.path }}" + state: directory + owner: "{{ item.owner }}" + group: "{{ item.group }}" + mode: "{{ item.mode }}" + loop: + - { path: "{{ gitea_work_dir }}", owner: "{{ gitea_user }}", group: "{{ gitea_group }}", mode: "0755" } + - { path: "{{ gitea_work_dir }}/custom", owner: "{{ gitea_user }}", group: "{{ gitea_group }}", mode: "0755" } + - { path: "{{ gitea_data_dir }}", owner: "{{ gitea_user }}", group: "{{ gitea_group }}", mode: "0755" } + - { path: "{{ gitea_lfs_dir }}", owner: "{{ gitea_user }}", group: "{{ gitea_group }}", mode: "0755" } + - { path: "{{ gitea_repo_root }}", owner: "{{ gitea_user }}", group: "{{ gitea_group }}", mode: "0755" } + - { path: "/etc/gitea", owner: "root", group: "{{ gitea_group }}", mode: "0770" } + + - name: Get installed Gitea version + ansible.builtin.command: + cmd: /usr/local/bin/gitea --version + register: 
gitea_installed_version + changed_when: false + failed_when: false + + - name: Parse installed version + ansible.builtin.set_fact: + gitea_current_version: "{{ gitea_installed_version.stdout | regex_search('([0-9]+\\.[0-9]+\\.[0-9]+)') | default('0.0.0') }}" + when: gitea_installed_version.rc == 0 + + - name: Set current version to 0.0.0 if not installed + ansible.builtin.set_fact: + gitea_current_version: "0.0.0" + when: gitea_installed_version.rc != 0 + + - name: Get latest Gitea release version from GitHub + ansible.builtin.uri: + url: https://api.github.com/repos/go-gitea/gitea/releases/latest + return_content: true + register: gitea_latest_release + + - name: Extract latest version number + ansible.builtin.set_fact: + gitea_latest_version: "{{ gitea_latest_release.json.tag_name | regex_replace('^v', '') }}" + + - name: Display version information + ansible.builtin.debug: + msg: "Gitea: installed={{ gitea_current_version }}, latest={{ gitea_latest_version }}" + + - name: Stop Gitea before upgrade + ansible.builtin.systemd: + name: gitea + state: stopped + when: + - gitea_current_version != gitea_latest_version + - gitea_current_version != "0.0.0" + + - name: Download Gitea binary + ansible.builtin.get_url: + url: "https://dl.gitea.com/gitea/{{ gitea_latest_version }}/gitea-{{ gitea_latest_version }}-linux-amd64" + dest: /usr/local/bin/gitea + mode: '0755' + owner: root + group: root + force: true + when: gitea_current_version != gitea_latest_version + notify: restart gitea + + - name: Template Gitea configuration + ansible.builtin.template: + src: app.ini.j2 + dest: "{{ gitea_config_file }}" + owner: "{{ gitea_user }}" + group: "{{ gitea_group }}" + mode: '0640' + notify: restart gitea + + - name: Create Gitea systemd service + ansible.builtin.copy: + dest: /etc/systemd/system/gitea.service + mode: '0644' + owner: root + group: root + content: | + [Unit] + Description=Gitea (Git with a cup of tea) + After=syslog.target + After=network.target + 
After=postgresql.service + + [Service] + RestartSec=2s + Type=simple + User={{ gitea_user }} + Group={{ gitea_group }} + WorkingDirectory={{ gitea_work_dir }}/ + ExecStart=/usr/local/bin/gitea web --config {{ gitea_config_file }} + Restart=always + Environment=USER={{ gitea_user }} HOME={{ gitea_home_dir }} GITEA_WORK_DIR={{ gitea_work_dir }} + + [Install] + WantedBy=multi-user.target + notify: restart gitea + + - name: Reload systemd daemon + ansible.builtin.systemd: + daemon_reload: true + + - name: Enable and start Gitea service + ansible.builtin.systemd: + name: gitea + enabled: true + state: started + + # OAuth2 Provider Configuration (Casdoor SSO) + - name: Flush handlers to ensure Gitea is restarted before healthcheck + ansible.builtin.meta: flush_handlers + + - name: Wait for Gitea to be ready + ansible.builtin.uri: + url: "http://127.0.0.1:{{ gitea_web_port }}/api/healthz" + method: GET + status_code: 200 + register: gitea_health + until: gitea_health.status == 200 + retries: 30 + delay: 5 + when: gitea_oauth_enabled | default(false) + + - name: Check if Casdoor OAuth source exists + ansible.builtin.command: + cmd: > + /usr/local/bin/gitea admin auth list + --config {{ gitea_config_file }} + become: true + become_user: "{{ gitea_user }}" + register: gitea_auth_list + changed_when: false + when: gitea_oauth_enabled | default(false) + + - name: Add Casdoor OAuth2 authentication source + ansible.builtin.command: + cmd: > + /usr/local/bin/gitea admin auth add-oauth + --config {{ gitea_config_file }} + --name "{{ gitea_oauth_name }}" + --provider openidConnect + --key "{{ gitea_oauth_client_id }}" + --secret "{{ gitea_oauth_client_secret }}" + --auto-discover-url "https://id.ouranos.helu.ca/.well-known/openid-configuration" + --scopes "{{ gitea_oauth_scopes }}" + --skip-local-2fa + --group-claim-name "" + --admin-group "" + become: true + become_user: "{{ gitea_user }}" + when: + - gitea_oauth_enabled | default(false) + - gitea_oauth_name not in 
gitea_auth_list.stdout + notify: restart gitea + + - name: Update Casdoor OAuth2 authentication source + ansible.builtin.command: + cmd: > + /usr/local/bin/gitea admin auth update-oauth + --config {{ gitea_config_file }} + --id {{ gitea_auth_list.stdout_lines | select('search', gitea_oauth_name) | first | regex_search('^\d+') }} + --name "{{ gitea_oauth_name }}" + --provider openidConnect + --key "{{ gitea_oauth_client_id }}" + --secret "{{ gitea_oauth_client_secret }}" + --auto-discover-url "https://id.ouranos.helu.ca/.well-known/openid-configuration" + --scopes "{{ gitea_oauth_scopes }}" + --skip-local-2fa + become: true + become_user: "{{ gitea_user }}" + when: + - gitea_oauth_enabled | default(false) + - gitea_oauth_name in gitea_auth_list.stdout + notify: restart gitea + + handlers: + - name: restart gitea + ansible.builtin.systemd: + name: gitea + state: restarted + daemon_reload: true diff --git a/ansible/gitea_mcp/deploy.yml b/ansible/gitea_mcp/deploy.yml new file mode 100644 index 0000000..986626a --- /dev/null +++ b/ansible/gitea_mcp/deploy.yml @@ -0,0 +1,56 @@ +--- +- name: Deploy Gitea MCP Server with Docker Compose + hosts: ubuntu + become: true + vars: + required_service: gitea_mcp + tasks: + - name: Check if host has gitea_mcp service + ansible.builtin.set_fact: + has_gitea_mcp_service: "{{ required_service in services | default([]) }}" + + - name: Skip hosts without gitea_mcp service + ansible.builtin.meta: end_host + when: not has_gitea_mcp_service + + - name: Create gitea_mcp group + ansible.builtin.group: + name: "{{gitea_mcp_group}}" + + - name: Create gitea_mcp user + ansible.builtin.user: + name: "{{gitea_mcp_user}}" + comment: "{{gitea_mcp_user}}" + group: "{{gitea_mcp_group}}" + system: true + + - name: Add group gitea_mcp to Ansible remote_user + ansible.builtin.user: + name: "{{remote_user}}" + groups: "{{gitea_mcp_group}}" + append: true + + - name: Create gitea_mcp directory + ansible.builtin.file: + path: "{{gitea_mcp_directory}}" + 
owner: "{{gitea_mcp_user}}" + group: "{{gitea_mcp_group}}" + state: directory + mode: '750' + + - name: Template docker-compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{gitea_mcp_directory}}/docker-compose.yml" + owner: "{{gitea_mcp_user}}" + group: "{{gitea_mcp_group}}" + mode: '550' + + - name: Reset SSH connection to apply group changes + meta: reset_connection + + - name: Start Gitea MCP service + community.docker.docker_compose_v2: + project_src: "{{gitea_mcp_directory}}" + state: present + pull: always diff --git a/ansible/gitea_mcp/docker-compose.yml.j2 b/ansible/gitea_mcp/docker-compose.yml.j2 new file mode 100644 index 0000000..966f363 --- /dev/null +++ b/ansible/gitea_mcp/docker-compose.yml.j2 @@ -0,0 +1,18 @@ +services: + gitea-mcp: + image: docker.gitea.com/gitea-mcp-server:latest + pull_policy: always + container_name: gitea-mcp + restart: unless-stopped + ports: + - "{{gitea_mcp_port}}:8000" + environment: + - GITEA_HOST={{gitea_mcp_host}} + - GITEA_ACCESS_TOKEN={{gitea_mcp_access_token}} + command: ["/app/gitea-mcp", "-t", "http", "--port", "8000"] + logging: + driver: syslog + options: + syslog-address: "tcp://127.0.0.1:{{gitea_mcp_syslog_port}}" + syslog-format: "{{syslog_format}}" + tag: "gitea-mcp" diff --git a/ansible/gitea_mcp/remove.yml b/ansible/gitea_mcp/remove.yml new file mode 100644 index 0000000..77ea749 --- /dev/null +++ b/ansible/gitea_mcp/remove.yml @@ -0,0 +1,36 @@ +--- +- name: Remove Gitea MCP Server + hosts: ubuntu + become: true + tasks: + - name: Check if host has gitea_mcp service + ansible.builtin.set_fact: + has_gitea_mcp_service: "{{ 'gitea_mcp' in services | default([]) }}" + + - name: Skip hosts without gitea_mcp service + ansible.builtin.meta: end_host + when: not has_gitea_mcp_service + + - name: Check if docker-compose.yml exists + ansible.builtin.stat: + path: "{{gitea_mcp_directory}}/docker-compose.yml" + register: compose_file + + - name: Stop and remove Docker containers, volumes, and 
+ images + community.docker.docker_compose_v2: + project_src: "{{gitea_mcp_directory}}" + state: absent + remove_images: all + remove_volumes: true + when: compose_file.stat.exists + + - name: Prune Docker images + community.docker.docker_prune: + images: true + images_filters: + dangling: false + + - name: Remove Gitea MCP directory + ansible.builtin.file: + path: "{{gitea_mcp_directory}}" + state: absent diff --git a/ansible/gitea_runner/config.yaml b/ansible/gitea_runner/config.yaml new file mode 100644 index 0000000..fbfbdfd --- /dev/null +++ b/ansible/gitea_runner/config.yaml @@ -0,0 +1,110 @@ +# Gitea Act Runner configuration +# Managed by Ansible - edit this file, then re-run the playbook. + +log: + # The level of logging, can be trace, debug, info, warn, error, fatal + level: info + +runner: + # Where to store the registration result. + file: .runner + # How many tasks to execute concurrently at the same time. + capacity: 1 + # Extra environment variables to run jobs. + envs: + A_TEST_ENV_NAME_1: a_test_env_value_1 + A_TEST_ENV_NAME_2: a_test_env_value_2 + # Extra environment variables to run jobs from a file. + # It will be ignored if it's empty or the file doesn't exist. + env_file: .env + # The timeout for a job to be finished. + # Please note that the Gitea instance also has a timeout (3h by default) for the job. + # So the job could be stopped by the Gitea instance if its timeout is shorter than this. + timeout: 3h + # The timeout for the runner to wait for running jobs to finish when shutting down. + # Any running jobs that haven't finished after this timeout will be cancelled. + shutdown_timeout: 0s + # Whether to skip verifying the TLS certificate of the Gitea instance. + insecure: false + # The timeout for fetching the job from the Gitea instance. + fetch_timeout: 5s + # The interval for fetching the job from the Gitea instance. 
+ fetch_interval: 2s + # The github_mirror of a runner is used to specify the mirror address of the github that pulls the action repository. + # It works when something like `uses: actions/checkout@v4` is used and DEFAULT_ACTIONS_URL is set to github, + # and github_mirror is not empty. In this case, + # it replaces https://github.com with the value here, which is useful for some special network environments. + github_mirror: '' + # The labels of a runner are used to determine which jobs the runner can run, and how to run them. + # Like: "macos-arm64:host" or "ubuntu-latest:docker://docker.gitea.com/runner-images:ubuntu-latest" + # Find more images provided by Gitea at https://gitea.com/docker.gitea.com/runner-images . + # If it's empty when registering, it will ask for inputting labels. + # If it's empty when execute `daemon`, will use labels in `.runner` file. + labels: + - "ubuntu-latest:docker://docker.gitea.com/runner-images:ubuntu-latest" + - "ubuntu-24.04:docker://docker.gitea.com/runner-images:ubuntu-24.04" + - "ubuntu-22.04:docker://docker.gitea.com/runner-images:ubuntu-22.04" + - "ubuntu-20.04:docker://docker.gitea.com/runner-images:ubuntu-20.04" + - "node-24:docker://node:24-bookworm" + +cache: + # Enable cache server to use actions/cache. + enabled: true + # The directory to store the cache data. + # If it's empty, the cache data will be stored in $HOME/.cache/actcache. + dir: "" + # The host of the cache server. + # It's not for the address to listen, but the address to connect from job containers. + # So 0.0.0.0 is a bad choice, leave it empty to detect automatically. + host: "" + # The port of the cache server. + # 0 means to use a random available port. + port: 0 + # The external cache server URL. Valid only when enable is true. + # If it's specified, act_runner will use this URL as the ACTIONS_CACHE_URL rather than start a server by itself. + # The URL should generally end with "/". 
+ external_server: "" + +container: + # Specifies the network to which the container will connect. + # Could be host, bridge or the name of a custom network. + # If it's empty, act_runner will create a network automatically. + network: "" + # Whether to use privileged mode or not when launching task containers (privileged mode is required for Docker-in-Docker). + privileged: false + # And other options to be used when the container is started (eg, --add-host=my.gitea.url:host-gateway). + options: + # The parent directory of a job's working directory. + # NOTE: There is no need to add the first '/' of the path as act_runner will add it automatically. + # If the path starts with '/', the '/' will be trimmed. + # For example, if the parent directory is /path/to/my/dir, workdir_parent should be path/to/my/dir + # If it's empty, /workspace will be used. + workdir_parent: + # Volumes (including bind mounts) can be mounted to containers. Glob syntax is supported, see https://github.com/gobwas/glob + # You can specify multiple volumes. If the sequence is empty, no volumes can be mounted. + # For example, if you only allow containers to mount the `data` volume and all the json files in `/src`, you should change the config to: + # valid_volumes: + # - data + # - /src/*.json + # If you want to allow any volume, please use the following configuration: + # valid_volumes: + # - '**' + valid_volumes: [] + # overrides the docker client host with the specified one. + # If it's empty, act_runner will find an available docker host automatically. + # If it's "-", act_runner will find an available docker host automatically, but the docker host won't be mounted to the job containers and service containers. + # If it's not empty or "-", the specified docker host will be used. An error will be returned if it doesn't work. 
+ docker_host: "" + # Pull docker image(s) even if already present + force_pull: true + # Rebuild docker image(s) even if already present + force_rebuild: false + # Always require a reachable docker daemon, even if not required by act_runner + require_docker: false + # Timeout to wait for the docker daemon to be reachable, if docker is required by require_docker or act_runner + docker_timeout: 0s + +host: + # The parent directory of a job's working directory. + # If it's empty, $HOME/.cache/act/ will be used. + workdir_parent: diff --git a/ansible/gitea_runner/deploy.yml b/ansible/gitea_runner/deploy.yml new file mode 100644 index 0000000..2b01354 --- /dev/null +++ b/ansible/gitea_runner/deploy.yml @@ -0,0 +1,157 @@ +--- +- name: Deploy Gitea Runner + hosts: ubuntu + become: true + tasks: + + - name: Check if host has gitea_runner service + ansible.builtin.set_fact: + has_gitea_runner_service: "{{ 'gitea_runner' in services }}" + + - name: Skip hosts without gitea_runner service + ansible.builtin.meta: end_host + when: not has_gitea_runner_service + + # ========================================================================= + # Service Account + # ========================================================================= + + - name: Create gitea-runner system group + ansible.builtin.group: + name: gitea-runner + system: true + state: present + + - name: Create gitea-runner system user + ansible.builtin.user: + name: gitea-runner + group: gitea-runner + groups: docker + append: true + system: true + shell: /bin/bash + home: /srv/gitea-runner + create_home: true + comment: "Gitea Act Runner" + + # ========================================================================= + # Binary Installation + # ========================================================================= + + - name: Ensure /usr/local/bin directory exists + ansible.builtin.file: + path: /usr/local/bin + state: directory + mode: '0755' + owner: root + group: root + + - name: Check current act_runner 
version + ansible.builtin.command: /usr/local/bin/act_runner --version + register: act_runner_current_version + changed_when: false + failed_when: false + + - name: Download act_runner binary + ansible.builtin.get_url: + url: "https://gitea.com/gitea/act_runner/releases/download/v{{ act_runner_version }}/act_runner-{{ act_runner_version }}-linux-amd64" + dest: /usr/local/bin/act_runner + mode: '0755' + owner: root + group: root + force: true + when: act_runner_current_version.rc != 0 or act_runner_version not in (act_runner_current_version.stdout | default('')) + notify: restart gitea-runner + + # ========================================================================= + # Configuration + # ========================================================================= + + - name: Copy runner config + ansible.builtin.copy: + src: config.yaml + dest: /srv/gitea-runner/config.yaml + owner: gitea-runner + group: gitea-runner + mode: '0644' + notify: restart gitea-runner + + # ========================================================================= + # Systemd Service + # ========================================================================= + + - name: Template gitea-runner systemd service + ansible.builtin.template: + src: gitea-runner.service.j2 + dest: /etc/systemd/system/gitea-runner.service + owner: root + group: root + mode: '0644' + notify: restart gitea-runner + + - name: Check if runner is registered + ansible.builtin.stat: + path: /srv/gitea-runner/.runner + register: runner_registration + + # ========================================================================= + # Registration + # ========================================================================= + + - name: Prompt for registration token + ansible.builtin.pause: + prompt: | + + Gitea runner registration required. 
+ Get token from: {{ gitea_runner_instance_url }}/-/admin/runners + + Enter registration token + register: runner_token + when: + - not runner_registration.stat.exists + - registration_token is not defined + + - name: Set registration token from prompt or variable + ansible.builtin.set_fact: + runner_registration_token: "{{ registration_token | default(runner_token.user_input) }}" + when: not runner_registration.stat.exists + + - name: Register runner with Gitea instance + ansible.builtin.shell: + cmd: > + sudo -u gitea-runner + /usr/local/bin/act_runner register + --instance {{ gitea_runner_instance_url }} + --token {{ runner_registration_token }} + --name {{ gitea_runner_name }} + --no-interactive + args: + creates: /srv/gitea-runner/.runner + chdir: /srv/gitea-runner + when: not runner_registration.stat.exists + + # ========================================================================= + # Service Management + # ========================================================================= + + - name: Enable gitea-runner service + ansible.builtin.systemd: + name: gitea-runner + enabled: true + daemon_reload: true + + - name: Start gitea-runner service + ansible.builtin.systemd: + name: gitea-runner + state: started + + # =========================================================================== + # Handlers + # =========================================================================== + + handlers: + - name: restart gitea-runner + ansible.builtin.systemd: + name: gitea-runner + state: restarted + daemon_reload: true diff --git a/ansible/gitea_runner/gitea-runner.service.j2 b/ansible/gitea_runner/gitea-runner.service.j2 new file mode 100644 index 0000000..23a5340 --- /dev/null +++ b/ansible/gitea_runner/gitea-runner.service.j2 @@ -0,0 +1,17 @@ +[Unit] +Description=Gitea Runner +After=network.target docker.service +Requires=docker.service + +[Service] +Type=simple +User=gitea-runner +Group=gitea-runner +WorkingDirectory=/srv/gitea-runner 
+ExecStart=/usr/local/bin/act_runner daemon --config /srv/gitea-runner/config.yaml +Restart=on-failure +RestartSec=10 +Environment=HOME=/srv/gitea-runner + +[Install] +WantedBy=multi-user.target diff --git a/ansible/grafana/dashboards/puck_containers.json b/ansible/grafana/dashboards/puck_containers.json new file mode 100644 index 0000000..14c5133 --- /dev/null +++ b/ansible/grafana/dashboards/puck_containers.json @@ -0,0 +1,1003 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Docker container monitoring for puck.incus - Red Panda Approved 🐼", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [ + { + "asDropdown": false, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": true, + "title": "Explore Logs", + "tooltip": "View Docker logs in Loki for puck.incus", + "type": "link", + "url": "/explore?orgId=1&left=%7B%22datasource%22:%22prospero-loki%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%7Bhostname%3D%5C%22puck.incus%5C%22%7D%20%7C%3D%20%5C%22docker%5C%22%20or%20%7C%3D%20%5C%22container%5C%22%22%7D%5D%7D" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Process Monitoring", + "tooltip": "View process-level metrics", + "type": "link", + "url": "/d/puck-process-monitoring" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "panels": [], + "title": "Container Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": 
[], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "count(container_last_seen{instance=~\"puck.*\", name!=\"\", name!=\"POD\"})", + "legendFormat": "Running Containers", + "refId": "A" + } + ], + "title": "Running Containers", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "red", + "value": 2 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "count(container_last_seen{instance=~\"puck.*\", name=~\".*_.*\", name!=\"POD\"}) or vector(0)", + "legendFormat": "Auto-Named", + "refId": "A" + } + ], + "title": "Auto-Named (Orphan Candidates)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "count(container_last_seen{instance=~\"puck.*\", image=~\".*mcp-server.*|.*mcp_server.*\"}) or vector(0)", + "legendFormat": "MCP Containers", + "refId": "A" + } + ], + "title": "⚠️ MCP Containers (Should be 0)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 12, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "sum(increase(container_oom_events_total{instance=~\"puck.*\"}[24h])) or vector(0)", + "legendFormat": "OOM Events", + "refId": "A" + } + ], + "title": "OOM Events (24h)", + "type": "stat" + }, + { + "datasource": { + 
"type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 1 + }, + "id": 5, + "options": { + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "count by (image) (container_last_seen{instance=~\"puck.*\", name!=\"\", name!=\"POD\"})", + "legendFormat": "{{image}}", + "refId": "A" + } + ], + "title": "Containers by Image", + "type": "piechart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 6 + }, + "id": 101, + "panels": [], + "title": "Container Resource Usage", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + 
"color": "yellow", + "value": 80 + }, + { + "color": "red", + "value": 100 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 10, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "sum by (name) (rate(container_cpu_usage_seconds_total{instance=~\"puck.*\", name!=\"\", name!=\"POD\"}[2m])) * 100", + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "title": "Container CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 536870912 + }, + { + "color": "red", + "value": 1073741824 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 11, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + 
"mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "container_memory_usage_bytes{instance=~\"puck.*\", name!=\"\", name!=\"POD\"}", + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "title": "Container Memory Usage", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 15 + }, + "id": 102, + "panels": [], + "title": "Container Details", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "CPU %" + }, + "properties": [ + { + "id": "unit", + "value": "percent" + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "gauge" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 80 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Memory" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "gauge" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 536870912 + }, + { + "color": "red", + "value": 1073741824 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Age" + }, + "properties": [ + { + "id": "unit", + "value": "s" + }, + { + "id": "thresholds", + "value": { + "mode": 
"absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 3600 + }, + { + "color": "orange", + "value": 86400 + }, + { + "color": "red", + "value": 604800 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Container" + }, + "properties": [ + { + "id": "custom.width", + "value": 200 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Image" + }, + "properties": [ + { + "id": "custom.width", + "value": 300 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 20, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Age" + } + ] + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "sum by (name, image) (rate(container_cpu_usage_seconds_total{instance=~\"puck.*\", name!=\"\", name!=\"POD\"}[2m])) * 100", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "CPU" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "container_memory_usage_bytes{instance=~\"puck.*\", name!=\"\", name!=\"POD\"}", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "refId": "Memory" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "time() - container_start_time_seconds{instance=~\"puck.*\", name!=\"\", name!=\"POD\"}", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "refId": "Age" + } + ], + "title": "All Containers", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "name" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "id": true, + 
"image 2": true, + "image 3": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "job 1": true, + "job 2": true, + "job 3": true + }, + "indexByName": {}, + "renameByName": { + "Value #Age": "Age", + "Value #CPU": "CPU %", + "Value #Memory": "Memory", + "image 1": "Image", + "name": "Container" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 103, + "panels": [], + "title": "Orphan Candidates (Auto-Named Containers > 1 hour)", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "description": "Containers with Docker auto-generated names (adjective_scientist pattern) running longer than 1 hour may be orphaned and should be investigated.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "orange", + "value": null + }, + { + "color": "red", + "value": 86400 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Age" + }, + "properties": [ + { + "id": "unit", + "value": "s" + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 30, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Age" + } + ] + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "(time() - container_start_time_seconds{instance=~\"puck.*\", name=~\".*_.*\", name!=\"POD\"}) > 3600", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "🔍 Orphan Candidates", + "transformations": [ + { + 
"id": "organize", + "options": { + "excludeByName": { + "Time": true, + "id": true, + "instance": true, + "job": true + }, + "indexByName": {}, + "renameByName": { + "Value": "Age", + "image": "Image", + "name": "Container" + } + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "tags": ["puck", "docker", "containers", "monitoring", "red-panda-approved"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Puck Docker Containers", + "uid": "puck-docker-containers", + "version": 1, + "weekStart": "" +} diff --git a/ansible/grafana/dashboards/puck_processes.json b/ansible/grafana/dashboards/puck_processes.json new file mode 100644 index 0000000..c2634e0 --- /dev/null +++ b/ansible/grafana/dashboards/puck_processes.json @@ -0,0 +1,1029 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Process-level monitoring for puck.incus - Red Panda Approved 🐼", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [ + { + "asDropdown": false, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": true, + "title": "Explore Logs", + "tooltip": "View logs in Loki for puck.incus", + "type": "link", + "url": "/explore?orgId=1&left=%7B%22datasource%22:%22prospero-loki%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%7Bhostname%3D%5C%22puck.incus%5C%22%7D%22%7D%5D%7D" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "panels": [], + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { 
+ "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 100 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "count(namedprocess_namegroup_num_procs{instance=~\"puck.*\"})", + "legendFormat": "Total Processes", + "refId": "A" + } + ], + "title": "Monitored Processes", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 4, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "100 - (avg(rate(node_cpu_seconds_total{instance=~\"puck.*\", mode=\"idle\"}[5m])) * 100)", + "legendFormat": "CPU Usage", + "refId": "A" + } + ], + "title": "Overall CPU Usage", + "type": "stat" + }, + { + "datasource": { + 
"type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 9, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "(1 - (node_memory_MemAvailable_bytes{instance=~\"puck.*\"} / node_memory_MemTotal_bytes{instance=~\"puck.*\"})) * 100", + "legendFormat": "Memory Usage", + "refId": "A" + } + ], + "title": "Overall Memory Usage", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 14, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "node_memory_MemTotal_bytes{instance=~\"puck.*\"}", + "legendFormat": "Total Memory", + "refId": "A" + } + ], + "title": "Total Memory", + "type": "stat" + }, + { + "datasource": 
{ + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 19, + "y": 1 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "time() - node_boot_time_seconds{instance=~\"puck.*\"}", + "legendFormat": "Uptime", + "refId": "A" + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 101, + "panels": [], + "title": "Top CPU Consumers", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 80 + }, + { + "color": 
"red", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 10, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "topk(10, sum by (groupname) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~\"puck.*\"}[2m])) * 100)", + "legendFormat": "{{groupname}}", + "refId": "A" + } + ], + "title": "Top 10 CPU Processes (Over Time)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "CPU %" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "gauge" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 11, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "CPU %" + } + ] + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "topk(10, sum by (groupname) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~\"puck.*\"}[2m])) * 100)", + "format": "table", + "instant": true, + 
"legendFormat": "__auto", + "refId": "A" + } + ], + "title": "Top 10 CPU Processes (Current)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "instance": true + }, + "indexByName": {}, + "renameByName": { + "Value": "CPU %", + "groupname": "Process" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 102, + "panels": [], + "title": "Top Memory Consumers", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1073741824 + }, + { + "color": "red", + "value": 2147483648 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 20, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "topk(10, namedprocess_namegroup_memory_bytes{instance=~\"puck.*\", 
memtype=\"resident\"})", + "legendFormat": "{{groupname}}", + "refId": "A" + } + ], + "title": "Top 10 Memory Processes (Over Time)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 536870912 + }, + { + "color": "red", + "value": 1073741824 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Resident Memory" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "gauge" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 21, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Resident Memory" + } + ] + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "topk(10, namedprocess_namegroup_memory_bytes{instance=~\"puck.*\", memtype=\"resident\"})", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "Top 10 Memory Processes (Current)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "instance": true, + "job": true, + "memtype": true + }, + "indexByName": {}, + "renameByName": { + "Value": "Resident Memory", + "groupname": "Process" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 103, + "panels": [], + "title": "Process 
Details", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "CPU %" + }, + "properties": [ + { + "id": "unit", + "value": "percent" + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "gauge" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 80 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Memory" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Threads" + }, + "properties": [ + { + "id": "unit", + "value": "short" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 30, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "CPU %" + } + ] + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "sum by (groupname) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~\"puck.*\"}[2m])) * 100", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "CPU" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "namedprocess_namegroup_memory_bytes{instance=~\"puck.*\", memtype=\"resident\"}", + "format": "table", + "hide": false, + 
"instant": true, + "legendFormat": "__auto", + "refId": "Memory" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prospero-prometheus" + }, + "expr": "namedprocess_namegroup_num_threads{instance=~\"puck.*\"}", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "refId": "Threads" + } + ], + "title": "All Processes", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "groupname" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "job 1": true, + "job 2": true, + "job 3": true, + "memtype": true + }, + "indexByName": {}, + "renameByName": { + "Value #CPU": "CPU %", + "Value #Memory": "Memory", + "Value #Threads": "Threads", + "groupname": "Process" + } + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "tags": ["puck", "processes", "monitoring", "red-panda-approved"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Puck Process Monitoring", + "uid": "puck-process-monitoring", + "version": 1, + "weekStart": "" +} diff --git a/ansible/grafana/datasource.yml.j2 b/ansible/grafana/datasource.yml.j2 new file mode 100644 index 0000000..6e729a8 --- /dev/null +++ b/ansible/grafana/datasource.yml.j2 @@ -0,0 +1,15 @@ +apiVersion: 1 +datasources: + - name: {{prometheus_datasource_name}} + type: prometheus + access: proxy + url: http://{{prometheus_host}}:{{prometheus_port}} + isDefault: true + editable: false + uid: {{prometheus_datasource_uid}} + - name: {{loki_datasource_name}} + type: loki + access: proxy + url: http://{{loki_host}}:{{loki_port}} + editable: false + uid: {{loki_datasource_uid}} \ No newline at end of file diff --git a/ansible/grafana/deploy.yml b/ansible/grafana/deploy.yml new file mode 100644 index 0000000..4e6d3e0 --- 
/dev/null +++ b/ansible/grafana/deploy.yml @@ -0,0 +1,113 @@ +--- +- name: Deploy Grafana + hosts: ubuntu + become: true + tasks: + - name: Check if host has grafana service + ansible.builtin.set_fact: + has_grafana_service: "{{'grafana' in services}}" + + - name: Skip hosts without grafana service + ansible.builtin.meta: end_host + when: not has_grafana_service + + - name: Add Grafana repository + ansible.builtin.deb822_repository: + name: grafana + types: [deb] + uris: https://apt.grafana.com + suites: [stable] + components: [main] + signed_by: https://apt.grafana.com/gpg.key + state: present + + - name: Install Grafana + become: true + ansible.builtin.apt: + name: grafana + state: present + update_cache: true + + - name: Create provisioning directories + become: true + ansible.builtin.file: + path: "{{item}}" + state: directory + owner: grafana + group: grafana + mode: '750' + loop: + - /etc/grafana/provisioning/dashboards + - /etc/grafana/provisioning/datasources + - /etc/grafana/provisioning/users + + - name: Create dashboards directory + become: true + ansible.builtin.file: + path: /var/lib/grafana/dashboards + state: directory + owner: grafana + group: grafana + mode: '750' + + - name: Template configuration files + become: true + ansible.builtin.template: + src: "{{item.src}}" + dest: "{{item.dest}}" + owner: grafana + group: grafana + mode: '550' + loop: + - src: "datasource.yml.j2" + dest: "/etc/grafana/provisioning/datasources/prometheus.yml" + - src: "users.yml.j2" + dest: "/etc/grafana/provisioning/users/users.yml" + notify: restart grafana + + - name: Template Grafana main configuration + become: true + ansible.builtin.template: + src: "grafana.ini.j2" + dest: "/etc/grafana/grafana.ini" + owner: grafana + group: grafana + mode: '640' + when: grafana_oauth_enabled | default(false) + notify: restart grafana + + - name: Configure dashboard provisioning + become: true + ansible.builtin.copy: + content: | + apiVersion: 1 + providers: + - name: 'default' + 
orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards + dest: /etc/grafana/provisioning/dashboards/dashboard.yml + owner: grafana + group: grafana + mode: '550' + notify: restart grafana + + - name: Enable and start Grafana service + become: true + ansible.builtin.systemd: + name: grafana-server + enabled: true + state: started + daemon_reload: true + + handlers: + - name: restart grafana + become: true + ansible.builtin.systemd: + name: grafana-server + state: restarted diff --git a/ansible/grafana/grafana.ini.j2 b/ansible/grafana/grafana.ini.j2 new file mode 100644 index 0000000..2eccec3 --- /dev/null +++ b/ansible/grafana/grafana.ini.j2 @@ -0,0 +1,36 @@ +# Grafana Configuration - Managed by Ansible +# Do not edit manually - changes will be overwritten + +[server] +root_url = {{ grafana_root_url }} + +[auth] +# Disable login form for OAuth users (admins can still use local auth) +disable_login_form = false + +[auth.generic_oauth] +enabled = {{ grafana_oauth_enabled | default(false) | lower }} +name = {{ grafana_oauth_name | default('Casdoor') }} +allow_sign_up = {{ grafana_oauth_allow_sign_up | default(true) | lower }} +client_id = {{ grafana_oauth_client_id }} +client_secret = {{ grafana_oauth_client_secret }} +scopes = {{ grafana_oauth_scopes | default('openid profile email') }} +auth_url = {{ grafana_oauth_auth_url }} +token_url = {{ grafana_oauth_token_url }} +api_url = {{ grafana_oauth_api_url }} +# Map Casdoor user attributes to Grafana +email_attribute_path = email +login_attribute_path = preferred_username +name_attribute_path = name +# Default role for new OAuth users +role_attribute_path = contains(groups[*], 'grafana-admin') && 'Admin' || contains(groups[*], 'grafana-editor') && 'Editor' || 'Viewer' +# TLS settings for internal communication +tls_skip_verify_insecure = {{ grafana_oauth_skip_tls_verify | default(true) | lower }} + +[log] +# 
Console-only logging — systemd journal captures output, Alloy ships to Loki +mode = console +level = {{ grafana_log_level | default('info') }} + +[log.console] +format = text diff --git a/ansible/grafana/users.yml.j2 b/ansible/grafana/users.yml.j2 new file mode 100644 index 0000000..b06c5a5 --- /dev/null +++ b/ansible/grafana/users.yml.j2 @@ -0,0 +1,15 @@ +apiVersion: 1 +users: + - name: {{grafana_admin_name}} + orgId: 1 + login: {{grafana_admin_login}} + password: {{grafana_admin_password}} + isAdmin: true + - name: {{grafana_viewer_name}} + orgId: 1 + login: {{grafana_viewer_login}} + password: {{grafana_viewer_password}} + isAdmin: false + permissions: + - permission: 1 # View permission + role: Viewer \ No newline at end of file diff --git a/ansible/grafana_mcp/deploy.yml b/ansible/grafana_mcp/deploy.yml new file mode 100644 index 0000000..8a9a869 --- /dev/null +++ b/ansible/grafana_mcp/deploy.yml @@ -0,0 +1,92 @@ +--- +# Grafana MCP Server - Docker Compose deployment on Miranda +# +# Grafana itself runs inside the PPLG stack on Prospero (see docs/pplg.md). +# This playbook deploys the Grafana MCP server container on Miranda, which +# connects back to Grafana on Prospero via the internal Incus network. 
+# +# Prerequisites: +# - PPLG stack deployed on Prospero (ansible-playbook pplg/deploy.yml) +# - Grafana service account token in vault (vault_grafana_service_account_token) +# - Docker installed on the target host (ansible-playbook docker/deploy.yml) +# +# See also: docs/grafana_mcp.md + +- name: Deploy Grafana MCP Server with Docker Compose + hosts: ubuntu + become: true + vars: + required_service: grafana_mcp + tasks: + - name: Check if host has grafana_mcp service + ansible.builtin.set_fact: + has_grafana_mcp_service: "{{ required_service in services | default([]) }}" + + - name: Skip hosts without grafana_mcp service + ansible.builtin.meta: end_host + when: not has_grafana_mcp_service + + - name: Verify Grafana is reachable on PPLG host + ansible.builtin.uri: + url: "http://{{grafana_mcp_grafana_host}}:{{grafana_mcp_grafana_port}}/api/health" + method: GET + status_code: 200 + register: grafana_health + retries: 3 + delay: 5 + + - name: Create grafana_mcp group + ansible.builtin.group: + name: "{{grafana_mcp_group}}" + + - name: Create grafana_mcp user + ansible.builtin.user: + name: "{{grafana_mcp_user}}" + comment: "{{grafana_mcp_user}}" + group: "{{grafana_mcp_group}}" + system: true + + - name: Add group grafana_mcp to Ansible remote_user + ansible.builtin.user: + name: "{{remote_user}}" + groups: "{{grafana_mcp_group}}" + append: true + + - name: Create grafana_mcp directory + ansible.builtin.file: + path: "{{grafana_mcp_directory}}" + owner: "{{grafana_mcp_user}}" + group: "{{grafana_mcp_group}}" + state: directory + mode: '750' + + - name: Template docker-compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{grafana_mcp_directory}}/docker-compose.yml" + owner: "{{grafana_mcp_user}}" + group: "{{grafana_mcp_group}}" + mode: '550' + + - name: Reset SSH connection to apply group changes + meta: reset_connection + + - name: Start Grafana MCP service + community.docker.docker_compose_v2: + project_src: "{{grafana_mcp_directory}}" 
+ state: present + pull: always + + - name: Verify Grafana MCP container is responding + ansible.builtin.uri: + url: "http://localhost:{{grafana_mcp_port}}/mcp" + method: GET + status_code: [200, 405] + register: grafana_mcp_health + retries: 5 + delay: 5 + ignore_errors: true + + - name: Report Grafana MCP health status + ansible.builtin.debug: + msg: "Grafana MCP container is {{ 'responding' if not grafana_mcp_health.failed else 'not responding - check docker logs grafana-mcp' }}" diff --git a/ansible/grafana_mcp/docker-compose.yml.j2 b/ansible/grafana_mcp/docker-compose.yml.j2 new file mode 100644 index 0000000..d7fe51a --- /dev/null +++ b/ansible/grafana_mcp/docker-compose.yml.j2 @@ -0,0 +1,18 @@ +services: + grafana-mcp: + image: mcp/grafana:latest + pull_policy: always + container_name: grafana-mcp + restart: unless-stopped + ports: + - "{{grafana_mcp_port}}:8000" + environment: + - GRAFANA_URL=http://{{grafana_mcp_grafana_host}}:{{grafana_mcp_grafana_port}} + - GRAFANA_SERVICE_ACCOUNT_TOKEN={{grafana_service_account_token}} + command: ["--transport", "streamable-http", "--address", "0.0.0.0:8000", "--tls-skip-verify"] + logging: + driver: syslog + options: + syslog-address: "tcp://127.0.0.1:{{grafana_mcp_syslog_port}}" + syslog-format: "{{syslog_format}}" + tag: "grafana-mcp" diff --git a/ansible/haproxy/deploy.yml b/ansible/haproxy/deploy.yml new file mode 100644 index 0000000..eb19b08 --- /dev/null +++ b/ansible/haproxy/deploy.yml @@ -0,0 +1,117 @@ +--- +- name: Deploy HAProxy + hosts: ubuntu + tasks: + - name: Check if host has haproxy service + set_fact: + has_haproxy_service: "{{'haproxy' in services}}" + + - name: Skip hosts without haproxy service + meta: end_host + when: not has_haproxy_service + + - name: Create haproxy group + become: true + ansible.builtin.group: + name: "{{haproxy_group}}" + gid: "{{haproxy_gid}}" + system: true + + - name: Create haproxy user + become: true + ansible.builtin.user: + name: "{{haproxy_user}}" + comment: 
"{{haproxy_user}}" + group: "{{haproxy_group}}" + uid: "{{haproxy_uid}}" + system: true + + - name: Add group haproxy to ansible_user + become: true + ansible.builtin.user: + name: "{{ansible_user}}" + groups: "{{haproxy_group}}" + append: true + + - name: Create required directories + become: true + ansible.builtin.file: + path: "{{haproxy_directory}}" + owner: "{{haproxy_user}}" + group: "{{haproxy_group}}" + state: directory + mode: '750' + + - name: Create /etc/haproxy directory + become: true + ansible.builtin.file: + path: /etc/haproxy + owner: root + group: root + state: directory + mode: '755' + + - name: Create certs directory + become: true + ansible.builtin.file: + path: /etc/haproxy/certs + owner: "{{haproxy_user}}" + group: "{{haproxy_group}}" + state: directory + mode: '750' + + - name: Check if certificate already exists + become: true + stat: + path: "{{ haproxy_cert_path }}" + register: cert_file + + - name: Generate self-signed wildcard certificate + become: true + command: > + openssl req -x509 -nodes -days 365 -newkey rsa:2048 + -keyout {{ haproxy_cert_path }} + -out {{ haproxy_cert_path }} + -subj "/C=US/ST=State/L=City/O=Agathos/CN=*.{{ haproxy_domain }}" + -addext "subjectAltName=DNS:*.{{ haproxy_domain }},DNS:{{ haproxy_domain }}" + when: not cert_file.stat.exists and 'certbot' not in services + + - name: Set certificate permissions + become: true + ansible.builtin.file: + path: "{{ haproxy_cert_path }}" + owner: "{{haproxy_user}}" + group: "{{haproxy_group}}" + mode: '640' + + - name: Install HAProxy + become: true + ansible.builtin.apt: + name: haproxy + state: present + update_cache: true + + - name: Template HAProxy configuration + become: true + ansible.builtin.template: + src: "haproxy.cfg.j2" + dest: /etc/haproxy/haproxy.cfg + owner: "{{haproxy_user}}" + group: "{{haproxy_group}}" + mode: "640" + validate: haproxy -c -f %s + register: haproxy_config + + - name: Enable and start HAProxy service + become: true + ansible.builtin.systemd: 
+ name: haproxy + enabled: true + state: started + + - name: Reload HAProxy if configuration changed + become: true + ansible.builtin.systemd: + name: haproxy + state: reloaded + when: haproxy_config.changed diff --git a/ansible/haproxy/haproxy.cfg.j2 b/ansible/haproxy/haproxy.cfg.j2 new file mode 100644 index 0000000..977f630 --- /dev/null +++ b/ansible/haproxy/haproxy.cfg.j2 @@ -0,0 +1,114 @@ +# HAProxy configuration for Agathos Titania +# Managed by Ansible - Red Panda Approved + +global + log 127.0.0.1:{{ haproxy_syslog_port }} local0 + stats timeout 30s + + # Default SSL material locations + ca-base /etc/ssl/certs + crt-base /etc/ssl/private + + # SSL/TLS configuration + ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384 + ssl-default-bind-ciphersuites TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256 + ssl-default-bind-options ssl-min-ver TLSv1.2 no-tls-tickets + +defaults + log global + mode http + option httplog + option dontlognull + # Log format with timing information for latency analysis + log-format "%ci:%cp [%tr] %ft %b/%s %TR/%Tw/%Tc/%Tr/%Ta %ST %B %CC %CS %tsc %ac/%fc/%bc/%sc/%rc %sq/%bq %hr %hs %{+Q}r" + timeout connect 5s + timeout client 50s + timeout server 50s + +# Stats page with Prometheus metrics +listen stats + bind *:{{ haproxy_stats_port }} + mode http + stats enable + stats uri /metrics + stats refresh 15s + stats show-legends + stats show-node + + # Prometheus metrics endpoint + http-request use-service prometheus-exporter if { path /metrics } + +# HTTP frontend - redirect all traffic to HTTPS +frontend http_frontend + bind *:{{ haproxy_http_port }} + mode http + option httplog + http-request redirect scheme https code 301 + +# HTTPS frontend with dynamic routing +frontend https_frontend + bind *:{{ haproxy_https_port }} ssl crt {{ haproxy_cert_path }} + mode http + option httplog + option forwardfor + + # Forward original 
protocol and port for reverse-proxied services + http-request set-header X-Forwarded-Proto https + http-request set-header X-Forwarded-Port %[dst_port] + + # Security headers + http-response set-header Strict-Transport-Security "max-age=31536000; includeSubDomains" + http-response set-header X-Frame-Options "SAMEORIGIN" + http-response set-header X-Content-Type-Options "nosniff" + http-response set-header X-XSS-Protection "1; mode=block" + +{% for backend in haproxy_backends %} +{% if backend.subdomain %} + # ACL for {{ backend.subdomain }}.{{ haproxy_domain }} (matches with or without port) + acl host_{{ backend.subdomain }} hdr_beg(host) -i {{ backend.subdomain }}.{{ haproxy_domain }} +{% if backend.redirect_root is defined %} + # Redirect root path to {{ backend.redirect_root }} (avoids redirect loop by matching exact path) + http-request redirect location {{ backend.redirect_root }} code 302 if host_{{ backend.subdomain }} { path / } +{% endif %} + use_backend backend_{{ backend.subdomain }} if host_{{ backend.subdomain }} +{% else %} + # Default backend for root domain + default_backend backend_root +{% endif %} +{% endfor %} + +# Backend definitions +{% for backend in haproxy_backends %} +{% if backend.subdomain %} +backend backend_{{ backend.subdomain }} +{% else %} +backend backend_root +{% endif %} + mode http + balance roundrobin +{% if backend.ssl_backend | default(false) %} + option httpchk + http-check send meth GET uri {{ backend.health_path }} hdr Host {{ backend.subdomain }}.{{ haproxy_domain }} +{% else %} + option httpchk GET {{ backend.health_path }} +{% endif %} + http-check expect status 200 +{% if backend.timeout_server is defined %} + timeout server {{ backend.timeout_server }} +{% endif %} + server {{ backend.subdomain or 'root' }}_1 {{ backend.backend_host }}:{{ backend.backend_port }} check{% if backend.ssl_backend | default(false) %} ssl verify none{% endif %} + +{% endfor %} +{% for tcp_backend in haproxy_tcp_backends | default([]) %} +# 
TCP passthrough: {{ tcp_backend.name }} +frontend {{ tcp_backend.name }}_frontend + bind *:{{ tcp_backend.listen_port }} + mode tcp + option tcplog + default_backend {{ tcp_backend.name }}_backend + +backend {{ tcp_backend.name }}_backend + mode tcp + server {{ tcp_backend.name }}_1 {{ tcp_backend.backend_host }}:{{ tcp_backend.backend_port }} check + +{% endfor %} diff --git a/ansible/hass/configuration.yaml b/ansible/hass/configuration.yaml new file mode 100644 index 0000000..8682ff7 --- /dev/null +++ b/ansible/hass/configuration.yaml @@ -0,0 +1,17 @@ +# Loads default set of integrations. Do not remove. +default_config: + +# ISAL accelerates aiohttp +isal: + +# Load frontend themes from the themes folder +frontend: + themes: !include_dir_merge_named themes + +automation: !include automations.yaml +script: !include scripts.yaml +scene: !include scenes.yaml + +homeassistant: + media_dirs: + media: /mnt/media diff --git a/ansible/hass/configuration.yaml.j2 b/ansible/hass/configuration.yaml.j2 new file mode 100644 index 0000000..65b2e8f --- /dev/null +++ b/ansible/hass/configuration.yaml.j2 @@ -0,0 +1,33 @@ +# Loads default set of integrations. Do not remove. 
+default_config: + +# ISAL accelerates aiohttp +isal: + +# Load frontend themes from the themes folder +frontend: + themes: !include_dir_merge_named themes + +automation: !include automations.yaml +script: !include scripts.yaml +scene: !include scenes.yaml + +homeassistant: + media_dirs: + media: {{hass_media_directory}} + +# HTTP configuration for reverse proxy (HAProxy on Titania) +http: + server_port: {{hass_port}} + use_x_forwarded_for: true + trusted_proxies: + - 10.0.0.0/8 + +# PostgreSQL recorder (Portia) +recorder: + db_url: "postgresql://{{hass_db_user}}:{{hass_db_password}}@{{hass_db_host}}:{{hass_db_port}}/{{hass_db_name}}" + purge_keep_days: 30 + commit_interval: 1 + +# Prometheus metrics endpoint +prometheus: \ No newline at end of file diff --git a/ansible/hass/deploy.yml b/ansible/hass/deploy.yml new file mode 100644 index 0000000..fbceded --- /dev/null +++ b/ansible/hass/deploy.yml @@ -0,0 +1,139 @@ +--- +- name: Deploy Home Assistant to Dev Environment + hosts: ubuntu + vars: + ansible_common_remote_group: "{{hass_group}}" + allow_world_readable_tmpfiles: true + tasks: + - name: Check if host has hass service + ansible.builtin.set_fact: + has_hass_service: "{{ 'hass' in services | default([]) }}" + + - name: Skip hosts without hass service + ansible.builtin.meta: end_host + when: not has_hass_service + + - name: Create hass user + become: true + ansible.builtin.user: + name: "{{hass_user}}" + comment: "{{hass_user}}" + system: true + create_home: false + + - name: Add group hass to user {{remote_user}} + become: true + ansible.builtin.user: + name: "{{remote_user}}" + groups: "{{hass_group}}" + append: true + + - name: Create required directories + become: true + ansible.builtin.file: + path: "{{item.path}}" + owner: "{{hass_user}}" + group: "{{hass_group}}" + state: directory + mode: '750' + loop: + - path: "{{hass_directory}}" + - path: "{{hass_media_directory}}" + + - name: Add Deadsnakes APT repository + become: true + 
ansible.builtin.apt_repository: + repo: ppa:deadsnakes/ppa + + - name: Install Python 3.13 and build dependencies + become: true + ansible.builtin.apt: + name: + - python3.13-dev + - python3.13-venv + - build-essential + - libffi-dev + - libssl-dev + state: present + update_cache: true + + - name: Create virtual environment + become: true + become_user: "{{hass_user}}" + ansible.builtin.command: + cmd: python3.13 -m venv {{hass_directory}}/env + args: + creates: "{{hass_directory}}/env/bin/activate" + + - name: Template configuration files + become: true + ansible.builtin.template: + src: "{{item.src}}" + dest: "{{hass_directory}}/{{item.dest}}" + owner: "{{hass_user}}" + group: "{{hass_group}}" + mode: '550' + loop: + - src: "configuration.yaml.j2" + dest: "configuration.yaml" + - src: "requirements.txt.j2" + dest: "requirements.txt" + notify: restart hass + + - name: Create systemd service file + become: true + ansible.builtin.template: + src: hass.service.j2 + dest: /etc/systemd/system/hass.service + mode: '644' + notify: restart hass + + - name: Install Python packages from requirements + become: true + become_user: "{{hass_user}}" + ansible.builtin.pip: + requirements: "{{hass_directory}}/requirements.txt" + virtualenv: "{{hass_directory}}/env" + virtualenv_python: python3.13 + vars: + ansible_common_remote_group: "{{hass_group}}" + allow_world_readable_tmpfiles: true + notify: restart hass + + - name: Reset SSH connection to apply group changes + meta: reset_connection + + - name: Enable and start Home Assistant service + become: true + ansible.builtin.systemd: + name: hass + enabled: true + state: started + daemon_reload: true + + post_tasks: + - name: Wait for Home Assistant to initialize + ansible.builtin.pause: + seconds: 30 + prompt: "Waiting for Home Assistant to initialize..." 
+ + - name: Check if Home Assistant is running + ansible.builtin.uri: + url: http://localhost:{{hass_port}}/ + method: GET + status_code: 200 + timeout: 10 + register: hass_status + ignore_errors: true + + - name: Show Home Assistant status + ansible.builtin.debug: + msg: "Home Assistant is {{ 'running' if hass_status.status == 200 else 'not running properly' }}" + + handlers: + - name: restart hass + become: true + ansible.builtin.systemd: + name: hass + state: restarted + daemon_reload: true diff --git a/ansible/hass/hass.service.j2 b/ansible/hass/hass.service.j2 new file mode 100644 index 0000000..5db7f26 --- /dev/null +++ b/ansible/hass/hass.service.j2 @@ -0,0 +1,18 @@ +[Unit] +Description=Home Assistant +After=network.target + +[Service] +Type=simple +User={{hass_user}} +Group={{hass_group}} +WorkingDirectory={{hass_directory}} +ExecStart=/bin/bash -c 'source {{hass_directory}}/env/bin/activate && hass --config {{hass_directory}}' +Restart=always +RestartSec=3 +SyslogIdentifier=hass +StandardOutput=journal +StandardError=journal + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/ansible/hass/requirements.txt.j2 b/ansible/hass/requirements.txt.j2 new file mode 100644 index 0000000..aadded5 --- /dev/null +++ b/ansible/hass/requirements.txt.j2 @@ -0,0 +1,2 @@ +wheel +homeassistant=={{hass_version}} \ No newline at end of file diff --git a/ansible/inventory/group_vars/all/auth_keys.yml b/ansible/inventory/group_vars/all/auth_keys.yml new file mode 100644 index 0000000..8153d80 --- /dev/null +++ b/ansible/inventory/group_vars/all/auth_keys.yml @@ -0,0 +1,36 @@ +--- +# SSH Authorized Keys Configuration +# Manages authorized_keys files across all ubuntu hosts +# +# Usage: +# ansible-playbook ssh_keys.yml +# +# To override exclusive mode (remove unlisted keys): +# ansible-playbook ssh_keys.yml -e "ssh_exclusive_mode=true" + +# When true, removes any keys not in this list (use with caution!) 
+ssh_exclusive_mode: false + +# List of users and their authorized SSH public keys +# Each user entry requires: +# - name: username (must exist on target hosts) +# - keys: list of SSH public key strings +# +# Example: +# ssh_authorized_users: +# - name: robert +# keys: +# - "ssh-ed25519 AAAAC3Nza... user@host" +# - "ssh-rsa AAAAB3Nza... another@host" +# - name: deploy +# keys: +# - "ssh-ed25519 AAAAC3Nza... deploy-key" + +ssh_authorized_users: + - name: robert + keys: + - "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIH0xFMMSa1SeMPbX84zJOKWHAT3HtMRuWmNA7GGKr1uw robert@Hercules" + - "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBTcpW11Vb3w1Bi77WCAM5K9Q2vz9MW5PdBpiAIXhjn3 robert@Norma" + - name: harper + keys: + - "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOVvIshMkRx1f9m2TTJ1lMHzsaBnuxZdoMFm6hmuzZzo harper@caliban" diff --git a/ansible/inventory/group_vars/all/vars.yml b/ansible/inventory/group_vars/all/vars.yml new file mode 100644 index 0000000..b9186be --- /dev/null +++ b/ansible/inventory/group_vars/all/vars.yml @@ -0,0 +1,107 @@ +# Red Panda Approved Sandbox Environment Variables +remote_user: robert +remote_group: robert +deployment_environment: "agathos" +ansible_python_interpreter: /usr/bin/python3 + +# Incus configuration (matches terraform.tfvars) +incus_project_name: agathos +incus_storage_pool: default + +# Gitea Runner +act_runner_version: "0.2.13" +gitea_runner_instance_url: "https://gitea.ouranos.helu.ca" + +# Release versions for staging playbooks +anythingllm_rel: master +athena_rel: master +athena_mcp_rel: master +argos_rel: master +arke_rel: master +angelia_rel: master +kairos_rel: master +kairos_mcp_rel: master +spelunker_rel: master +mcp_switchboard_rel: master +kernos_rel: master +# PyPI release version (no 'v' prefix) - https://pypi.org/project/open-webui/ +openwebui_rel: 0.8.3 + +# MCP URLs +argos_mcp_url: http://miranda.incus:25534/mcp +angelia_mcp_url: https://ouranos.helu.ca/mcp/ +angelia_mcp_auth: "{{ vault_angelia_mcp_auth }}" +caliban_mcp_url: 
http://caliban.incus:22021/mcp +gitea_mcp_url: http://miranda.incus:25535/mcp +gitea_mcp_access_token: "{{ vault_gitea_mcp_access_token }}" +github_personal_access_token: "{{ vault_github_personal_access_token }}" +grafana_mcp_url: http://miranda.incus:25533/mcp +huggingface_mcp_token: "{{ vault_huggingface_mcp_token }}" +neo4j_mcp_url: http://circe.helu.ca:22034/mcp +nike_mcp_url: http://puck.incus:22031/mcp +korax_mcp_url: http://korax.helu.ca:22021/mcp +rommie_mcp_url: http://caliban.incus:22031/mcp + +# Monitoring and Logging (internal endpoints on Prospero) +loki_url: http://prospero.incus:3100/loki/api/v1/push +prometheus_remote_write_url: http://prospero.incus:9090/api/v1/write +syslog_format: "rfc3164" +# Docker configuration +docker_gpg_key_url: https://download.docker.com/linux/debian/gpg +docker_gpg_key_path: /etc/apt/keyrings/docker.asc +docker_gpg_key_checksum: sha256:1500c1f56fa9e26b9b8f42452a553675796ade0807cdce11975eb98170b3a570 + +# RabbitMQ provisioning config +rabbitmq_vhosts: + - name: kairos + - name: spelunker + +rabbitmq_users: + - name: kairos + password: "{{ kairos_rabbitmq_password }}" + tags: [] + - name: spelunker + password: "{{ spelunker_rabbitmq_password }}" + tags: [] + +rabbitmq_permissions: + - vhost: kairos + user: kairos + configure_priv: .* + read_priv: .* + write_priv: .* + - vhost: spelunker + user: spelunker + configure_priv: .* + read_priv: .* + write_priv: .* + +# SMTP (smtp4dev on Oberon) +smtp_host: oberon.incus +smtp_port: 22025 +smtp_from: noreply@ouranos.helu.ca +smtp_from_name: "Agathos" + +# Release directory paths +github_dir: ~/gh +repo_dir: ~/dv +rel_dir: ~/rel + +# Vault Variable Mappings +kairos_rabbitmq_password: "{{ vault_kairos_rabbitmq_password }}" +spelunker_rabbitmq_password: "{{ vault_spelunker_rabbitmq_password }}" +caliban_x11vnc_password: "{{ vault_caliban_x11vnc_password }}" +grafana_service_account_token: "{{ vault_grafana_service_account_token }}" + +# Home Assistant +hass_metrics_token: "{{ 
vault_hass_metrics_token }}" + +# Namecheap DNS API (for certbot DNS-01 validation) +namecheap_username: "{{ vault_namecheap_username }}" +namecheap_api_key: "{{ vault_namecheap_api_key }}" + +# OAuth2-Proxy Vault Mappings (used for SearXNG auth) +# Note: These must be set in vault.yml after configuring Casdoor application +# vault_oauth2_proxy_client_id: "" +# vault_oauth2_proxy_client_secret: "" +# vault_oauth2_proxy_cookie_secret: "" diff --git a/ansible/inventory/group_vars/all/vault.yml b/ansible/inventory/group_vars/all/vault.yml new file mode 100644 index 0000000..e2362dd --- /dev/null +++ b/ansible/inventory/group_vars/all/vault.yml @@ -0,0 +1,415 @@ +$ANSIBLE_VAULT;1.1;AES256 +63343266373930636632373764653162353131386330313565656139663132373764303333623361 +3866643138386134396330643832303263346633653566330a376434643031326663383165393266 +31306366643937396161633864653962313063316133623966333863663832306437393637656335 +3061333530343639620a623663303836373633623266393932393238393338306534323062656363 +32663032333131663138623533613136376666646163346463613563656365393038363733373663 +63323138323338316534616432396636646262393461653761356664623662633962343866366234 +39313330383565623239353031366630373531623033333836316233663436346535356166623962 +65613333613634626634333064613564616462396136373939636433383162366266636331373365 +61383839666563343365393934353764633635626130363562633432643633373431373563656162 +62373236646138313636623838653065333038666364613531306637623731353565313032623765 +66306634646562643234366365643534366430306333323239656435663732333438343262316166 +65306539663363616638643036656136666432373164386636363038376263663636663662656662 +66353837636162653462626430343835306564336365373731643931353766653165363463316466 +32636431643863633531313464303937313564383636623330663061643466393734646462633236 +66643430333731623564336430363061616638356561346262326236663763316563303939393865 
+36366632393034313866386234643832653861613330306337353731396537633162363934653863 +62623030366263343732336634343134383861323130366461343930353335356461353735386161 +63306364363430623136616437643765313835363834313432326439323432353463656661623139 +63313738393832323031373238333065646538303331316132636663616663326139323765333231 +38663362646664663835316164343533353663393666653865326439613431646262376566333063 +64346436363933313639326233383934623539396334363431653439303332316534646464613565 +36383031613162343362643230336634303766613536376666623335663433363035313363633065 +38373530343530336132343038323765656436306537353863326238363263626264636434393564 +35363730626434643538643136653766313966616336663666323034643461373462346466316130 +38343736323730623037393639323065616639373533333265333266366161393962303732353034 +62326534613736643335373461666139346661353335313638333339656238316136323262343330 +64396166336466646635376262323563313431393662663138323335313763623066663561653530 +66333362313833346365383666313461383434623734336336656536343633623163666664373232 +61303635646138346338653730656164303966663533643036323131323862363065323631396364 +35663433366363613962303664383032363065656139656532306162353238653464316331323166 +65373364633834633063626334343365323466383264633763306266333732653935363835623039 +33626437383138343839653539653361373032363536633734666330303131346534323333663131 +33623935663663636261313030306366326631316130363663373133616262633137356132393465 +31353464666365666333313639346439313334313861346461303663366161303038323162366564 +31313032623538353230306339383133363662323761366431366563396464663935316334613730 +65633532306132313032356630656630313135626664306138383264666430633831386661653236 +38376530343635656530326466346337623564303162373536386534626237356639333333656339 +37376630373037643830643334656461323735646438313664353961353464306431353438623631 +33646464383663373734373863383663393561633234656261353139616534646331396465653766 
+37666236643363363637666463616137613932306462363035623039653532303262356363626434 +31343530333235373835363732643232373261376464363363313464306537316530306430653536 +65653563363763633737626334393735643563363730623262363265326561666563396438636637 +36363036333331373361306663613562623931303037333538363663666362616636633963386266 +34303837653032383261333037363765633234663061316231313766396637386464306430613439 +33636332343335636532333662633632663664346133353865613062343331356637323366653961 +31393733333139316462336564363761626636616561336165323830363732323035326138343364 +34316231303533353637393962316561666232636339396533666435633631313234336530336235 +33346339346237336236326330343939366163646138616237643038396136323235383737306537 +30366665316661316463633163663835353435656330633966333863356633643163313734303161 +34373231373439663937656363663662376539643739663331623239366237306365353663323937 +38646239303964633030376639363365346461333336313965636364336632626435363162366131 +37323961326330343734633430666636663633363866656466346236383631633939373531323830 +30393133646431316532333061643164313639636138636536393666343035646166363539623034 +39653932313761633664386335393635366631333334663137313662303031343462346337633238 +35363334373738313830333833613134643066356637366538326264333161366564323861343862 +63616462346535323434353363323537653163643839666534353931653262366666303236383365 +34343066373065373338666135386133366365633138346465323565313864376564343830323564 +36633261353335386438393437353238626333323539666337353932613034326534333466336431 +65353065303433613236313435353164313539353535353564653062343037306137616639323062 +34656535313133373264383236646234323366386238616563336330636632313263663861383432 +61306435653130663938663633383530356365313531393561373530396165383034373933303537 +66313732396162393266313637623063633065653463393165383864343965346136383939363531 +64663438643139336230653464313736393439326430333864353231613932393462623333386539 
+38306538653633656239646364356433323530306138393863386533623636333832333534616237 +65353164383362366464623737336162326162393965646435373532373639386533386132343765 +31353230316432383038623762346164383130323264383933393236643066333166643665383164 +64663965323235643632333435663065333662376537386130313163633361613733306466333338 +65613537353133643632393661353633366631386564636136643635623534353630616337363633 +66346137326335376665383032663039373462363865356532386530396535303234333261373536 +35396137613063336362653561376235373932383465393462306138656539316336643039303864 +37393434336265376161346664393333666335343764333465313165663438643263353633633065 +36353662653566663536396565616366333631643966656632666164343030643734353230323938 +37303531326531386563623365306161396336386634383264386563323365653731323865383930 +64333738353633646366353666643461653965633037333039623366636233356365313765363031 +66373262373935306632663066656263353934343061323761366262643937396164336435636139 +37663366353165646238353239646335643333383233383237633161363762616339643632346663 +36313433643439303036386639343564643061393833663933343032663830383864306363356632 +37326135343364616264353434663234363861313066306630646366356436323939353661383563 +63363031656539626136336130633432646531653831616232643961613462393061383433653939 +62343735306435666231656563616536346639646139336361613637663931393331323138303939 +66343762363032663764336465333264353765613265373536656538666538663866336237303466 +64656534356431643236353133316435353831633339386134333839386138316661383165633166 +64323262616565303065643636383038363235633036343833303163353530666331323363623961 +35613130326330306539306161633764653138383839336466646262373433653466353236356563 +65616432663066376639663539303863396637373533623232303031336365373861366262656532 +64313163633732623030646234386133613935633134613763323536343831626135343164383734 +63646135393461333463343934333362333365363237356430306162666631333235316363376566 
+64383632353736363537653434363037613931313761383866306465326433336465316633303763 +38306364613037363537343839353938326138623063323735653834313639663739323139636437 +38663665333839623736323435386332393738643466316666386631396532633865383665323965 +32373130393438656431323861383035386262353261313534646339626535393538393862366530 +34636136363165363863646538653430376236313733613830353665616262303836353338616232 +66376337633831623531613530356138666330373661646133666666316538386661386536363061 +66336332393439646231303634376364623131653536373464323233333531636238326530333539 +62653937363162646232633134646438643735653237396163396631656439366433323038323438 +36393262303664356637363739633836336631326466363639323765633839373164316139323534 +30313862316135323131633337376566656665613735613934383439306266623938356231626639 +35373934396335333138343263316538613535343162613637313239376235346539393832343939 +64306261323965613066393865663939316566366262626132616664303132616565383838353961 +66303439646565386138366533393564353762323339373366393532383935343665653035346636 +62313661636138393930346362656638333230336537336336616634383561356661366136616631 +63626264666439656439336533633362393930336535326636633436646264613866356562376234 +33616239326633643533323637323638346631613264383931373834666633346437323161303466 +36353466396633656461653432393563366231613565663335666432343838326631623861666136 +34373264383435656665616365666334373135666566363738633962393861303635363935346638 +36323761633535633131356235613462636438616431346465323862373038353530666464323064 +31356233326161633838353334353632616232343164616664396437666563393266653132313939 +65303465303137353132646163376463343563333331666637656361336538333030313736343836 +61646339643833346663396661383735626261316239636265343837393161633333616436373064 +31663635653863613638353236393666323364616535363965633136656262386166656135326363 +35373030336139343062333830363734653839633830356138316431363962356538363837306337 
+64313962666261663435626236356666333834373261393165316436353936616437343035326262 +61313535666233303366376533383237316138373430636662323565646564333333333436636339 +38643731666462613533353030383535666561643637306565616232613666653435316639653362 +31333563336362373061636139373034373337343261343336613165653438393037316562643766 +30613034353133653936616562663039363533366438336638306461636533363633633166646163 +39303765393133333536643636326238363534653465313833323461656531386637323730616139 +38663830653363333732643464366235336661613732643163323232393264363637313032336230 +39373636353231386361326137613732623238613233323131613836663630633634346532633639 +32333239316365666436323565656265643661663036363163393861356138326463353862663063 +66633462636632386438613165613766653965656435313231623739393162663562393033333237 +66363162653936663637626564613063323865616163393739623437313235366662656665333063 +63353234366436333739386339636436626532303261616332643834306238633334303436353139 +30626361623637653731316539313966656538653033383362356366646233363664373566383365 +35306331366161336432613962333436666539643536623165636130326230346364323437353730 +63323866316632353261663965356431323633313234613563306135346265333431653033633430 +30303861343161636264383235656638373832626436633035343239313939626534343739303063 +65633537383935306161386262386561333862313332313639653032373965343635353063636262 +61313733323135643831353266363134326534616634616638383138373630343434623865343035 +62306165623335366434393164663631326535393965393064623133396264366138626363343234 +63383833643937356462653331633766383363336539653061636566353732353130643861396633 +64633336343263376132386134326665613762386435313665353537346238346132306232633937 +33373264333865333031353231316266376530383830626163366564343939623930376565646365 +64626564653761356230346537333037323937323066393463626137656539326565363734626231 +32373264663031343963646535653031366666623061393736393164373137613466393935623835 
+34663735323439643534366263663432393433346533363333386230656237656130383731366330 +36623538343535643062346166613362333532633263316335333262346161613439353639383564 +34646338633537323035623734353933306534646438386537643166633632333365383634376431 +34336661313430383661623739386436613734373837353765313235616632366464613339353532 +32613938623038303834346337383461663963616466313666323639336130623761383133373031 +33643139373466323662616330656562303061613730646461373033363261666632613836613539 +30633539313331373366353638383661613037393137383162313037666163643566346166653761 +31633139356163663033336362393535336163313037616530633365616234393262666433616239 +38653830333063373736323238653430626530323431653133316533613836333736373966376666 +39343738663532343731316661386537336535363764343537303037653261633432363734333362 +64393239333564633837646666343933323834323336666538656665653637653338383463656661 +66643464306338636330323764363437656236383339636532353162663438646335363534626437 +34386231356161623737636436633636306636646162333537663663303532626436656430343161 +64393435656665333837333266373863376265343935666333353765363437653033323866653838 +64643039343263326166613432666365383264663165663536376433333162306265383566336266 +31626239383432333934363734666535303334616535656630323363393436626436616335396662 +32373432656632376333376630653465366336393264643462386162376134396239646439396466 +63373934336437386663633766666634626665353263343361376130333261666162393334383563 +64626436363765353963373665306131343739356539616464363234633739356233646664376566 +66363833336532633439323563316131303065616633336137336232346238656237616235333764 +39303035663635356531393936303766643834333736666461353132623233373862343264363635 +33653439613761383164346637653636653131373030656131333934396431616365353861636461 +65303432616664653534643539386431656338656663313863656138313261373062636337366637 +65643464626435333634313463623130313535653831303765306531623935313563366238363330 
+64383763623131346664643461653764623565616365636662633535376366303566306261386165 +62353532663133326433303638666334616235613937623231656531656361333738323939663238 +62646130623732336332313865376136373937643533666531383332303465353438393733306562 +35326265356361346465346332623262346366306435613531303236653836353466323965316538 +30383439336431346332336332626564333530373461343738346530336562646439306636336433 +66353234663930613835393632336532373531633437666365336231643537643764373431373866 +34306565623530393934363932616164393534396334363766393132306466313338366335643638 +37626562656362393464353061373638393430366331376139643664383836613639383764393230 +38613861323536653864346635343065333734346631386231306630663639343863343033636231 +35363731626533383930313130313438656532323161633736646365353663383166383062616364 +61396631373131356134343563666466633937653766356561353437363566383161386564643333 +36303363616262663066343532336632353262633763393964616438316261343432626264616666 +62643164383234666465313961333966363933323665323730633931336538353537393239386635 +65663263376461636561383032353337346264323662373631616537653930356338656264303766 +38656239363539313961363463396139363133666134303936633061663036336538666163323664 +32613234303935353837616566373163383861336166346466646262386563373661623033623864 +35383534353866303764343661306138646265303439343036393462623163313064643433623965 +37343438636539313862626632383831636334333664636131303234383330393334663837336436 +34333032653630633336383535656666393962383863643333616264353163663939373039303337 +62323965626662613435306636363732376433343132646661396665336432653232646637353230 +65303465373137613266333130623063636566663265613435643464303961633962396334663365 +38356161656563333966623935393633326565336533613666663834363561373334363434643839 +39333765303137656362356233386366303736653031643431663138336566383264373536353234 +39326137653634373235303466363663336662653036363338663363616432393135356231656236 
+64653836663033333639626533376237356163343961323539323964666239343738346230323337 +63663163666537343463623565633337393036653037656331383736393930373239333631343930 +65656663656663646235313364333062663938393537303261313032663161383535386365326662 +38643764373134336636306338323634386438396563643662393132303561666363663464616535 +62653865633736386233386630306238623563306139353038613737363031393232613934626533 +65616139656265306337663165323338316665613138336164653637373738656332376563346137 +62386161653836633732376161316562393436363536333132356136396361316534343135656334 +35626436396464663832383336313235346331626464313835646466393966613835353537663962 +61633433633134373765373839386663316266643834353533353936313633613436633530666339 +64313962383735313665393261666564656430366563633835343565316335383738653539376334 +66663334653333616464613531376562393639343765643435663835383439343230393562376532 +32323337333438323463346466356533386234303465643739663261396637646536353233326332 +61366232653232343834393765323163636432356234393766353365623636353930336163663434 +64343535383239343862653661393962643861313764636666376362653532383936626564353539 +37306133313833623361396535333235663034343264663131313061353766396365643639396663 +33396630353234336336353034636630613365613964613866313331356539623538616138623539 +39663466646638393436336438653039306166303066303761393838353861666165393035623065 +37626265646436636362613033363066326138666261353931343063363736333135366638626338 +39656466393964346565313839343036356538353464663234643164323865313764346661393066 +66663139353335393936613366383835613030616465613162653763333530653665633830643038 +35363662316566356637313463643461663833646563396635353036616330643565386239343139 +39643533313664613634326637333136626137323833326161663635623235666530303466373535 +30366234303134373733383138366462323062396362626662306234353863336337633263353637 +64613531653436393562343936666336343231383935353264313536323037356638663733376165 
+38613333326263646337303630323761386439616333613566333431376638333165613966373962 +30326434663130653434373130393863386163616537343034316462343865616537313364346138 +34363535663539663630383333343836373065623030393135373531663961646661376332363834 +61363331643464323966653737613130376434666362623765386632653665373834396663613963 +63323262323363303733613731663066383261363938303563363462396238643034623437363464 +61383566623764366132386465666630623461656431326333633066303034663262343439613634 +31646662663837663161623036613631363163656364396531363235616133376633323361393535 +32636631613239353637313337643536393538363531373636336563646333333533663563623131 +62393765323432633561613338336362343665633865326130636635366534313837636362373138 +31343231353837366262393237636464313736343063313536383438366263386331393039383033 +39353536326462376263376263303835393331613562633966623763636562613364376239656635 +39363639303938373237393531373538623739626431343939363063623964343138623763616639 +31666566333966306264346263663333343139333765376135383633386137313035373239663833 +39346137333465633239353761393666653231363264383331353435393864626461333863363966 +34663062376537353133346130303330656631386164613263333933333438346132303362353031 +64393338376631343131366362643766396137346431653439323338353338396235333630313233 +36323131643837366237623333643134373666616362663464656364346436323037663135373462 +37303063303033346230373134393366376636393431666136366636383038333966356561386232 +35653766303235336334656336373339303039353935313239303838633236306433666336666664 +66653735633236343235663766383964386237666437386362626336323136393530343839383865 +39343231356164646530393439613832383364316234353733363865616439646239303231653263 +34356564316236343837386261343430323935323066633938613764613465306137653265656132 +61346633616139343430616630643333663636643731356266356530623030636538303737383462 +33373962393235636266393364336331643566366266636162613334333639626430393965343065 
+34393261356164336166613063633039346165633263633336626338653762336338633033313239 +65326334653464613330346430393138356331373861656161323736376434396464376136663434 +32343461333934346534343561613530386661396562343730656630313064643766653030363239 +30313064323234316638613733613939303830323736653931393663346130323361663265396634 +62613831313837646364646363363431633033393137326136353363656637343137656539343730 +66386139356261386137613331336266623239383764306265323635383338646332636337623230 +61346462636134653133333733643231356663633032323332626136663232353964656166613763 +32386565316131316134323933353133343034666135383635396535646435396365626631306665 +61353366633865623765306663383837303037303532666135333461303334636639336665376338 +63653338313463373465323536303732303463356166393365333264623537613362656331356337 +34336564343361663039396264646566383330313739643566663861363661353263383531623632 +31616230643239386561323432356237336361333561643831373132393437323036623962343666 +36376530373938613539613734333465616133663833623635333262306138633639666236613661 +32393236653637376265633131336265633333393836663835313765666130323631343537356332 +38653238646630643535393032653263373131313335663161653264636435643135363063323135 +66636666666437326261396664616164303239333666323463373662313463656361386461636438 +30383238643336613861313265366361636664303565333732326134613537376261656639623739 +66313633633764313632663462323862356265353432306362373138393838313334656137353039 +33363933323036653561303933613832633263626562623836633030326630316363653834656166 +37326139306432636566666534343661326565343330616232393434653634306563383962376633 +62373034646139356662353139326163323439666461363937616565323639663237393939643632 +35643632666365316236633461643965303866653037623564363631383338383830346537353232 +30333262316135633639363764353866353631346430333066626564373133663630383065306233 +63653437613137326165653239343130326565313462363235353035366236346261656532626463 
+38383130613861343736323961393838313337643062663939643236346531316461306432393265 +37343035346366306561343632373262393437396262306135613330303938346161663065653639 +62623133393338373966353965313265636335303733343234643466643233303561376365613137 +65633761613633336536333136336233633363376530643832333438383634316533323632393437 +62653264306335663438303963366438306463366565663739653835306638633761326562653537 +39313861663837316566393665663565333736363166613733306432353039376430306639343538 +35336531313637386666373337643265313734373532313132393961333831376637623366306436 +63376336663461393961643038353864363766653564313662343062316637373335336131653830 +38663633313861306636616231343337313064343034393062646461386137373534353638313039 +62353165613138626434636336323866396536626364393763336330343435323263663664373966 +33363865373764376231343162353038396366323136396337343536343630323963346536356535 +34613031316531356433313665653838343339643533643862376139313431643764396432323234 +31643964396234353933313032313438366663643231386233623163666233343961613838613334 +37373532646366323865636564376564386664663834663436333133383566666234303435643231 +31373264646534636335646537333565663161386437326561386530396135313939623462633031 +37366663313564346339396636343139623764653232653432666631353663333161353763646331 +31376263636366313361313138653365373165643637663134323530653030663837333637633034 +38616634393031353132343630373162393638333932376234643038643938656437303234613231 +61653832393438393330346366316163363033386636363835336436396434363663633262376130 +63363033313435643639626138636637656333633232386362353936366166323835616437393939 +36363737613134386665363963663538376137326432386265653436626233376631326236313131 +62643336613563643132363635333930323233666562353035626530316438353136663663373666 +63623030336364666632336330626630623837316535323563393231633665626166613765393938 +35303333333633346130323930643262616234313564663136643237656462653161613261396231 
+38383064666635393465353634633936643639336638353163656236346666616566636664383936 +61643630326133323439653261633664363833356437613339646230383235663364323137656464 +64626364336133356562313235613235306138626436643331626662383234313363616563386335 +39393334616365343666363763653232393439656638313562346634626431353162316430343931 +38663364623463353339303064373664633536333037346461363636653162323462366263653232 +37303836343163373030373564343139366465343236306235316336333261653964383436643263 +64376632333931346433376632653733613437363934623338346439623738393064333330633936 +31343263376234386238636131663763346466303762363835303231323939643934633662663832 +35366230663237623237313731633539353661613763386438643537363835646336626461313633 +39313235313937383631646463373937353464356633643031316361356331363063393630646337 +61303036643830663763333735303534643935383731313866633863356437343962353964653163 +39623862333162353936643430643038663732356263643635353361626430353833346165633631 +64646166366231643939313164353261373036623761623433666431373230316662373536646338 +30623433383435386133323062633136663437333166366131346164316666616434396530343965 +61643962336438333936303938633536323365386137646235313230313363386561306339373831 +35333134366536653961643434333865343130636565633866366533353361633439333263616636 +37633063386439653937333861626464306163323265343338303235326234303737313365653537 +35326338653638646465376235646639313736616430353739323162373866636361646664303134 +31643830393836626431643064613733313461656437336463363536383737636230333961303466 +36646433643932613166306333346132343366386438353363386134636237323732353433346635 +34373138616664333266386233376363393239366162666534326566386164646138613638656463 +34656335623238623330616137396337353337663838643432393136316133303263376531336431 +32313335636439646664373338623465396132643965653231306634323337393036386437313366 +66373634653536646664393034316234633465363837666134636537666165343437636636363366 
+35356563666539383630303562393131386539346431653031313565323665653937396339346465 +37363731303933343961396430653865656535386263386161613864636662666263323834616539 +33636565383336333437643065626532306232376461313463323530353539323062393664383535 +62303362376237313564333339353933363538363636303961663538313337333464636133626361 +61616466353730656235643139633763313733313738363662393130373330633161376266383563 +39666533336531393662393830396231633536333839646266306464366235626662386634333139 +30343536386135333336313430653136316530393939346636383363666335366266326630313261 +64336535316239646566356633366264303335356637343736373234323138366239623761653032 +31346333623238663539383035646266616635336634373730336263626262346538346137343865 +30626332666565383963356634336532663133626239343234633830366639396365613334613764 +63386364346337383962343731373633376135336531633033346666626631323736366230613036 +38636361643935346563643133386334313730396661323738323637356437356664336333366133 +66373739353533353264636235663034336234303862373732636234623965353061616135663262 +34626538316333613139653632313835313663646535643666323438333965383261663633303730 +37623631623530313330396164376465346531613361633662393338383336663233313934316132 +66616637343933353961326461336466333137633138656239656565346639386565323931316431 +39656132666430326434613032353936653335303163616539376434326365386463373539303235 +62326138623834343437326138386230313634383863613266316638383435656666373266333162 +63333166653862623461633330383131366139646666326266303962623465353238326164633937 +32626431636337343437373834336231323431626665393266353362323164383233633262663432 +66313461306666633038653365326137326563396231323734353733653639396564666137353566 +63633063343232346465373132333032633931396263363932653039633739326433613864346339 +39666332366234326434643265393338316664326532383134373366613964326638346163343838 +36303339323563396134663031386439666439346437316136326662316133366230326431643935 
+32663330393564376439303965383633353336613966373566623830386331636463336336333066 +62636339383062343133366137663332343536626464323162656236336634646566356134636237 +37396562643333623530363065373230663130623735633366333437346333383466303061653333 +66376261623437343964616131383133316438393337656135633136346161333831616634633733 +33646636633035373664383930376131363334303637346438316161306132653666346439363165 +65376561303036366630316531346363303639643961376530646433653765373533616138366234 +35303130326131353961623630623538316239383330656536316364323838616461303237623966 +65633763333332646531356638613439663239313566383865396234626631383135303431656332 +35383132326166366236323839363461633766366636643832356562386332313666643636373031 +61303364303835306438653330366163656632376365383663626337613534346233313336663062 +61313764366633613063646461376436326339653465316339663461353835303562613538623239 +32386463383638366432303362636635626536343438303362383133386161393731376538336465 +64623561326163386330393833636264663833653739613962326634326233666630643936663830 +63393765356338343463653662626564613962336538373733366430643236383932633666353061 +36353632363131333364333962653239623266346439643537613031323763623833326636616461 +31383836393563373437666662643064386233643935333165313739393730336132306432326430 +61623464343664653161376332393333663764623232363938636161623539336263353539666464 +37383963643933353034626331623064643232643662613633663631356537646465376264623532 +65386430376230613730353831646661613362623235396639623035653135643333373065373234 +30326664356564656634316462333066343338653339653861383239323764623931633630616232 +64303834663338386266323064343663653534333033376364666532313830626237373033316233 +31633665363635353139366162376130653538353861666662653939613066613965383364393065 +36316633636338373535653662316239633434343833613036393934303465656635333335616133 +32313438393361366437666538326466306462336538346366643366343762653530663633633737 
+35656134623237396436333437633933663761393636336135353764353631393332613336633466 +63653334646239386365343036323437616330336265643664356263313062373364666461306130 +31323435393765373131326233363530336161393430613965303366653930383565656262396232 +61353562336235343335666439383635306633313063623638343030653665383033623662336430 +34303934313365613263316533386161303034396262363130623661626235666131363665336432 +39343766333363663238313666353864353232363133653431643764653466663739346537636364 +32306335363332623534346362396362663738636561633937343232616634396237336531336139 +66313663636139323331333966373835396632383435383638373539313230626664386233643930 +32373362623933656131653362643861323733373636356266373464326136633332396337376634 +38326662316537353337636332323935323962363365376463333039316639303666336537356166 +63353837376538323266393433303864353735666432303538323730656339306532316639626233 +62636331363235323838636534666339616135333238646330646537346563393134346366346638 +36656363663563373261323566313130393235646362383463323936306131373865623161343061 +39346330643132353033313836633838303931363365303165356338333665643165366261373835 +65313232363932383438623133653330643463623734313830353334353563316163633966303834 +33393235383763373034653364323062376238623064353632346332373364333861343634313562 +35653465663763396330306534313563646261356363343637316631383732623463643662393163 +35306463663063633665366630396135346138656266376465353138396631653239363730336638 +34626461376638663961666236376136383739636264333637393964633438353665326161306437 +63353132306136303132633963616336653031323233373037636136633561613932653333636563 +33336438353065326263646432386265373363316166343931396464616165386630373530393635 +38346434343366616562376238353963306464323535313965663061386436303139373235633562 +37653966396666326533363338386639663436343637376565303032376333623566386131396230 +34653239646531613065386365626564353532356432336365653965643962333536373164303430 
+30343639323136643438306438663531633235323161653237626562356430303230663832303463 +37303562373764323764383762356535633734383731666464303632633637346333646337623535 +33323632623763623836616432393231373364623163333162616365313638316162313036616539 +30336665393034303437646132353336623363663230393335633935656663366565326235363439 +36303230313564393637366434646665346665383931393462663531383131346466613563383031 +61363136303537386666353965653330336236346136356535363437366533306539653636353638 +34313835353038383533323232323730336137666430663865306461643239306362323464323264 +38336230393338363461656639393332353563366431333836363935633565383331656230316131 +63663463343266376330323130613332303534623135386639333834313264623637643634653333 +31666135626664323265663461346135366462316433643161316235363563636432616364653361 +66633661653362393139616163646264346566616337616638613861313937346664323934623435 +35306534353062323234383236343532613533336635303464383533333734353861393330383732 +61663566386333626162396666643737636164323237356533383834303930316631346237343732 +64373832303663333535366566336438636139333434633436396233383238663561396432393135 +30623434336539653732383363633164376634363766353764336431623431363537613833343632 +31646366623439373065626139353939626662333061343038313432616361306533626633653135 +63363865643739656561306331313962376536613832636137613831306431613964363434393538 +62613237356564613739666166336334643639633037623230303134623233343861383934353830 +37363861303963636535623336356132633164316339646231306230313066633536353036363839 +65396434663861636230616530386232633837303462313562353734383134353661653138623537 +31313533353331626235663163663061663631303731363565313262366535303932663239616466 +63313234356366323537653736663630633532666265326665303266623761313939643263653132 +38646634626135653737626563306362383835336361396434313062363563363439323831323566 +63626137616561646663333433363037376332643732663838306361653365383831386230643162 
+34303863323638363566643733313036336233303037316430663930396565366163623539656338 +31613862336166376166356134336634636537646532313035633331343862376332333838333231 +62393838623030353338666563336533333265336231393830623264633762386237653364393030 +33336361356139396561336463663963666663616231313432313565383034643230346162653231 +66326535653235643361386135616439336434333638633664393138643765613066363963373636 +32343530613539313434363561616336643236333032643835396262373933623732303335376162 +63613663336531323137633762343832343634653638343263626662356161336163396132383439 +62383364323361373639373137393562363464656238623565343362353265663636376565616164 +33346537343366616663346263316237373666613634333763353838636663656139326636653066 +62333638623432616437306533316337356438376362303461343934623366656131623632333935 +66646130303535626565653138353137633232613131653664356466393932633762366161376430 +31643938643466306436316365613938666635366430376665336166613763386338613235356434 +36306463376233653264356363353134313663666666623039313039613039663862643663343132 +37643032666135633438313635313961333638643862616265643561346661643862353331613839 +3234656634393561653937393036376466656339323862653662 diff --git a/ansible/inventory/group_vars/all/vault.yml.example b/ansible/inventory/group_vars/all/vault.yml.example new file mode 100644 index 0000000..e2edecc --- /dev/null +++ b/ansible/inventory/group_vars/all/vault.yml.example @@ -0,0 +1,93 @@ +# Ansible Vault Secrets File +# Copy to vault.yml and encrypt with: ansible-vault encrypt vault.yml +# +# All secrets should be prefixed with vault_ and encrypted. 
+# Service variables in vars.yml or host_vars reference these with: +# service_password: "{{ vault_service_password }}" + +# PostgreSQL +vault_postgres_password: changeme + +# Service Database Passwords +vault_arke_db_password: changeme +vault_casdoor_db_password: changeme +vault_mcp_switchboard_db_password: changeme +vault_openwebui_db_password: changeme +vault_spelunker_db_password: changeme + +# Neo4j +vault_neo4j_auth_password: changeme + +# RabbitMQ +vault_rabbitmq_password: changeme +vault_kairos_rabbitmq_password: changeme +vault_spelunker_rabbitmq_password: changeme +vault_mcp_switchboard_rabbitmq_password: changeme + +# Caliban +# Note: VNC passwords are limited to 8 characters maximum +vault_caliban_x11vnc_password: caliban + +# Casdoor +vault_casdoor_auth_state: changeme +vault_casdoor_radius_secret: changeme +vault_casdoor_s3_endpoint: changeme +vault_casdoor_s3_access_key: changeme +vault_casdoor_s3_secret_key: changeme +vault_casdoor_s3_bucket: changeme +vault_casdoor_app_client_secret: changeme +vault_casdoor_admin_password: changeme +vault_casdoor_hostmaster_password: changeme + +# Gitea +vault_gitea_db_password: changeme +vault_gitea_secret_key: changeme +vault_gitea_lfs_jwt_secret: changeme +vault_gitea_metrics_token: changeme +vault_gitea_oauth_client_id: changeme +vault_gitea_oauth_client_secret: changeme + +# OpenWebUI +vault_openwebui_secret_key: changeme +vault_openwebui_openai_api_key: changeme +vault_openwebui_anthropic_api_key: changeme +vault_openwebui_groq_api_key: changeme +vault_openwebui_mistral_api_key: changeme +vault_openwebui_oauth_client_id: changeme +vault_openwebui_oauth_client_secret: changeme + +# MCP Switchboard +vault_mcp_switchboard_secret_key: changeme + +# SearXNG +vault_searxng_secret_key: changeme + +# PgAdmin +vault_pgadmin_email: admin@example.com +vault_pgadmin_password: changeme + +# Grafana +vault_grafana_admin_name: Admin +vault_grafana_admin_login: admin +vault_grafana_admin_password: changeme 
+vault_grafana_viewer_name: Viewer +vault_grafana_viewer_login: viewer +vault_grafana_viewer_password: changeme + +# Pushover (Alertmanager notifications) +vault_pushover_user_key: changeme +vault_pushover_api_token: changeme + +# GitHub MCP +vault_github_personal_access_token: changeme + +# MCP Authentication Tokens +vault_angelia_mcp_auth: changeme +vault_athena_mcp_auth: changeme +vault_kairos_mcp_auth: changeme + +# Arke NTTh API Tokens +vault_ntth_token_1_app_secret: changeme +vault_ntth_token_2_app_secret: changeme +vault_ntth_token_3_app_secret: changeme +vault_ntth_token_4_app_secret: changeme diff --git a/ansible/inventory/host_vars/ariel.incus.yml b/ansible/inventory/host_vars/ariel.incus.yml new file mode 100644 index 0000000..c58c8f5 --- /dev/null +++ b/ansible/inventory/host_vars/ariel.incus.yml @@ -0,0 +1,24 @@ +--- +# Ariel Configuration - Graph Database Host +# Services: alloy, docker, neo4j + +services: + - alloy + - docker + - neo4j + +# Alloy +alloy_log_level: "warn" +neo4j_syslog_port: 22011 + +# Neo4j +neo4j_rel: master +neo4j_version: "5.26.0" +neo4j_user: neo4j +neo4j_group: neo4j +neo4j_directory: /srv/neo4j +neo4j_auth_user: neo4j +neo4j_auth_password: "{{ vault_neo4j_auth_password }}" +neo4j_http_port: 25554 +neo4j_bolt_port: 7687 +neo4j_apoc_unrestricted: "apoc.*" diff --git a/ansible/inventory/host_vars/caliban.incus.yml b/ansible/inventory/host_vars/caliban.incus.yml new file mode 100644 index 0000000..313e602 --- /dev/null +++ b/ansible/inventory/host_vars/caliban.incus.yml @@ -0,0 +1,23 @@ +--- +# Caliban Configuration - Agent Automation Host +# Services: caliban (Agent S), alloy, docker, kernos + +services: + - alloy + - caliban + - docker + - kernos + +# Alloy +alloy_log_level: "warn" + +# Kernos MCP Shell Server Configuration +kernos_user: harper +kernos_group: harper +kernos_directory: /srv/kernos +kernos_port: 22021 +kernos_host: "0.0.0.0" +kernos_log_level: INFO +kernos_log_format: json +kernos_environment: sandbox 
+kernos_allow_commands: "apt,awk,base64,bash,cat,chmod,cp,curl,cut,date,dd,df,dig,dmesg,du,echo,env,file,find,free,git,grep,gunzip,gzip,head,host,hostname,id,jq,kill,less,ln,ls,lsblk,lspci,lsusb,make,mkdir,mv,nc,node,nohup,npm,npx,ping,pip,pkill,pnpm,printenv,ps,pwd,python3,rm,rsync,run-captured,scp,sed,sleep,sort,source,ssh,ssh-keygen,ssh-keyscan,stat,sudo,tail,tar,tee,timeout,touch,tr,tree,uname,uniq,unzip,uptime,wc,wget,which,whoami,xargs,xz,zip" \ No newline at end of file diff --git a/ansible/inventory/host_vars/korax.helu.ca.yml b/ansible/inventory/host_vars/korax.helu.ca.yml new file mode 100644 index 0000000..481fbf2 --- /dev/null +++ b/ansible/inventory/host_vars/korax.helu.ca.yml @@ -0,0 +1,20 @@ +--- +# Korax Configuration +# Services: alloy, kernos + +services: + - alloy + - kernos + +# Alloy +alloy_log_level: "warn" +# Kernos MCP Shell Server Configuration +kernos_user: harper +kernos_group: harper +kernos_directory: /srv/kernos +kernos_port: 22021 +kernos_host: "0.0.0.0" +kernos_log_level: INFO +kernos_log_format: json +kernos_environment: sandbox +kernos_allow_commands: "apt,awk,base64,bash,cat,chmod,cp,curl,cut,date,dd,df,dig,dmesg,du,echo,env,file,find,free,git,grep,gunzip,gzip,head,host,hostname,id,jq,kill,less,ln,ls,lsblk,lspci,lsusb,make,mkdir,mv,nc,node,nohup,npm,npx,ping,pip,pkill,pnpm,printenv,ps,pwd,python3,rm,rsync,run-captured,scp,sed,sleep,sort,source,ssh,ssh-keygen,ssh-keyscan,stat,sudo,tail,tar,tee,timeout,touch,tr,tree,uname,uniq,unzip,uptime,wc,wget,which,whoami,xargs,xz,zip" \ No newline at end of file diff --git a/ansible/inventory/host_vars/miranda.incus.yml b/ansible/inventory/host_vars/miranda.incus.yml new file mode 100644 index 0000000..b679802 --- /dev/null +++ b/ansible/inventory/host_vars/miranda.incus.yml @@ -0,0 +1,74 @@ +--- +# Miranda Configuration - MCP Docker Host +# Services: alloy, argos, docker, gitea_mcp, grafana_mcp, mcpo, neo4j_mcp + +services: + - alloy + - argos + - docker + - gitea_mcp + - grafana_mcp + - mcpo + - neo4j_mcp + +# 
Alloy +alloy_log_level: "warn" +argos_syslog_port: 51434 +neo4j_cypher_syslog_port: 51431 +grafana_mcp_syslog_port: 51433 +gitea_mcp_syslog_port: 51435 + +# Argos MCP Configuration +argos_user: argos +argos_group: argos +argos_directory: /srv/argos +argos_port: 25534 +argos_log_level: INFO +argos_searxng_instances: http://oberon.incus:22083/ +argos_cache_ttl: 300 +argos_max_results: 10 +argos_request_timeout: 30.0 +argos_health_check_timeout: 5.0 +argos_kvdb_host: localhost +argos_kvdb_port: 11211 +argos_kvdb_prefix: argos +argos_enable_startup_health_check: true + +# Docker API Configuration +docker_api_enabled: true +docker_api_port: 2375 +docker_api_host: "0.0.0.0" + +# Neo4j MCP Config +neo4j_mcp_user: neo4j_mcp +neo4j_mcp_group: neo4j_mcp +neo4j_mcp_directory: /srv/neo4j_mcp + +# Grafana MCP Config +grafana_mcp_user: grafana_mcp +grafana_mcp_group: grafana_mcp +grafana_mcp_directory: /srv/grafana_mcp +grafana_mcp_port: 25533 +grafana_mcp_grafana_host: prospero.incus +grafana_mcp_grafana_port: 3000 +grafana_service_account_token: "{{ vault_grafana_service_account_token }}" + +# Gitea MCP Config +gitea_mcp_user: gitea_mcp +gitea_mcp_group: gitea_mcp +gitea_mcp_directory: /srv/gitea_mcp +gitea_mcp_port: 25535 +gitea_mcp_host: https://gitea.ouranos.helu.ca +gitea_mcp_access_token: "{{ vault_gitea_mcp_access_token }}" + +# Neo4j Cypher MCP +neo4j_host: ariel.incus +neo4j_bolt_port: 7687 +neo4j_auth_password: "{{ vault_neo4j_auth_password }}" +neo4j_cypher_mcp_port: 25531 + +# MCPO Config +mcpo_user: mcpo +mcpo_group: mcpo +mcpo_directory: /srv/mcpo +mcpo_port: 25530 diff --git a/ansible/inventory/host_vars/oberon.incus.yml b/ansible/inventory/host_vars/oberon.incus.yml new file mode 100644 index 0000000..e40c73e --- /dev/null +++ b/ansible/inventory/host_vars/oberon.incus.yml @@ -0,0 +1,134 @@ +--- +# Oberon Configuration + +services: + - alloy + - docker + - hass + - mcp_switchboard + - openwebui + - rabbitmq + - searxng + - smtp4dev + +# Alloy +alloy_log_level: 
"warn" +rabbitmq_syslog_port: 51402 +searxng_syslog_port: 51403 + +# MCP Switchboard Configuration +mcp_switchboard_user: mcpsb +mcp_switchboard_group: mcpsb +mcp_switchboard_directory: /srv/mcp_switchboard +mcp_switchboard_port: 22785 +mcp_switchboard_docker_host: "tcp://miranda.incus:2375" +mcp_switchboard_db_host: portia.incus +mcp_switchboard_db_port: 5432 +mcp_switchboard_db_name: mcp_switchboard +mcp_switchboard_db_user: mcpsb +mcp_switchboard_db_password: "{{ vault_mcp_switchboard_db_password }}" +mcp_switchboard_rabbitmq_host: localhost +mcp_switchboard_rabbitmq_port: 5672 +mcp_switchboard_rabbitmq_user: rabbitmq +mcp_switchboard_rabbitmq_password: "{{ vault_mcp_switchboard_rabbitmq_password }}" +mcp_switchboard_secret_key: "{{ vault_mcp_switchboard_secret_key }}" + +# Open WebUI Configuration +openwebui_user: openwebui +openwebui_group: openwebui +openwebui_directory: /srv/openwebui +openwebui_cors_allow_origin: https://openwebui.ouranos.helu.ca +openwebui_port: 22088 +openwebui_host: puck.incus +openwebui_secret_key: "{{ vault_openwebui_secret_key }}" +openwebui_enable_signup: true +openwebui_enable_email_login: false + +# OAuth/OIDC Configuration (Casdoor SSO) +openwebui_oauth_client_id: "{{ vault_openwebui_oauth_client_id }}" +openwebui_oauth_client_secret: "{{ vault_openwebui_oauth_client_secret }}" +openwebui_oauth_provider_name: "Casdoor" +openwebui_oauth_provider_url: "https://id.ouranos.helu.ca/.well-known/openid-configuration" + +# Database Configuration +openwebui_db_host: portia.incus +openwebui_db_port: 5432 +openwebui_db_name: openwebui +openwebui_db_user: openwebui +openwebui_db_password: "{{ vault_openwebui_db_password }}" + +# API Keys +openwebui_openai_api_key: "{{ vault_openwebui_openai_api_key }}" +openwebui_anthropic_api_key: "{{ vault_openwebui_anthropic_api_key }}" +openwebui_groq_api_key: "{{ vault_openwebui_groq_api_key }}" +openwebui_mistral_api_key: "{{ vault_openwebui_mistral_api_key }}" + +# Ollama Configuration 
+ollama_api_base_url: "" +openwebui_ollama_api_key: "" + +# SSL Configuration +openwebui_enable_https: false +openwebui_ssl_cert_path: "" +openwebui_ssl_key_path: "" + +# Logging +openwebui_log_level: info + +# RabbitMQ Config +rabbitmq_user: rabbitmq +rabbitmq_group: rabbitmq +rabbitmq_directory: /srv/rabbitmq +rabbitmq_amqp_port: 5672 +rabbitmq_management_port: 25582 +rabbitmq_password: "{{ vault_rabbitmq_password }}" + +# Redis password +redis_password: "{{ vault_redis_password }}" + +# SearXNG Configuration +searxng_user: searxng +searxng_group: searxng +searxng_directory: /srv/searxng +searxng_port: 22083 +searxng_base_url: http://oberon.incus:22083/ +searxng_instance_name: "Agathos Search" +searxng_secret_key: "{{ vault_searxng_secret_key }}" + +# SearXNG OAuth2-Proxy Sidecar +# Note: Each host supports at most one OAuth2-Proxy sidecar instance +# (binary shared at /usr/local/bin/oauth2-proxy, unique systemd unit per service) +searxng_oauth2_proxy_dir: /etc/oauth2-proxy-searxng +searxng_oauth2_proxy_version: "7.6.0" +searxng_proxy_port: 22073 +searxng_domain: "ouranos.helu.ca" +searxng_oauth2_oidc_issuer_url: "https://id.ouranos.helu.ca" +searxng_oauth2_redirect_url: "https://searxng.ouranos.helu.ca/oauth2/callback" + +# OAuth2 Credentials (from vault) +searxng_oauth2_client_id: "{{ vault_searxng_oauth2_client_id }}" +searxng_oauth2_client_secret: "{{ vault_searxng_oauth2_client_secret }}" +searxng_oauth2_cookie_secret: "{{ vault_searxng_oauth2_cookie_secret }}" + +# smtp4dev Configuration +smtp4dev_user: smtp4dev +smtp4dev_group: smtp4dev +smtp4dev_directory: /srv/smtp4dev +smtp4dev_port: 22085 +smtp4dev_smtp_port: 22025 +smtp4dev_imap_port: 22045 +smtp4dev_syslog_port: 51405 + +# Home Assistant Configuration +hass_user: hass +hass_group: hass +hass_directory: /srv/hass +hass_media_directory: /srv/hass/media +hass_port: 8123 +hass_version: "2026.2.0" +hass_db_host: portia.incus +hass_db_port: 5432 +hass_db_name: hass +hass_db_user: hass +hass_db_password: 
"{{ vault_hass_db_password }}" +hass_metrics_token: "{{ vault_hass_metrics_token }}" diff --git a/ansible/inventory/host_vars/portia.incus.yml b/ansible/inventory/host_vars/portia.incus.yml new file mode 100644 index 0000000..08ed22e --- /dev/null +++ b/ansible/inventory/host_vars/portia.incus.yml @@ -0,0 +1,48 @@ +--- +# Portia Configuration - Relational Database Host +# Services: alloy, postgresql +# Note: PgAdmin moved to Prospero (PPLG stack) + +services: + - alloy + - postgresql + +# Alloy +alloy_log_level: "warn" + +# PostgreSQL Config +postgres_user: postgres +postgres_group: postgres +postgresql_port: 5432 +postgresql_data_dir: /var/lib/postgresql +arke_db_name: arke +arke_db_user: arke +arke_db_password: "{{ vault_arke_db_password }}" +anythingllm_db_name: anythingllm +anythingllm_db_user: anythingllm +anythingllm_db_password: "{{ vault_anythingllm_db_password }}" +# Note: Casdoor uses dedicated PostgreSQL on Titania (not Portia) +gitea_db_name: gitea +gitea_db_user: gitea +gitea_db_password: "{{ vault_gitea_db_password }}" +lobechat_db_name: lobechat +lobechat_db_user: lobechat +lobechat_db_password: "{{ vault_lobechat_db_password }}" +nextcloud_db_name: nextcloud +nextcloud_db_user: nextcloud +nextcloud_db_password: "{{ vault_nextcloud_db_password }}" +openwebui_db_name: openwebui +openwebui_db_user: openwebui +openwebui_db_password: "{{ vault_openwebui_db_password }}" +spelunker_db_name: spelunker +spelunker_db_user: spelunker +spelunker_db_password: "{{ vault_spelunker_db_password }}" +hass_db_name: hass +hass_db_user: hass +hass_db_password: "{{ vault_hass_db_password }}" +nike_db_name: nike +nike_db_user: nike +nike_db_password: "{{ vault_nike_db_password }}" + +# PostgreSQL admin password +postgres_password: "{{ vault_postgres_password }}" diff --git a/ansible/inventory/host_vars/prospero.incus.yml b/ansible/inventory/host_vars/prospero.incus.yml new file mode 100644 index 0000000..13ddeb6 --- /dev/null +++ 
b/ansible/inventory/host_vars/prospero.incus.yml @@ -0,0 +1,141 @@ +--- +# Prospero Configuration - PPLG Observability & Admin Stack +# Services: pplg (PgAdmin, Prometheus, Loki, Grafana + HAProxy + OAuth2-Proxy) + +services: + - alloy + - pplg + +# Alloy +alloy_log_level: "warn" + +# ============================================================================ +# PPLG HAProxy Configuration +# ============================================================================ + +pplg_haproxy_user: haproxy +pplg_haproxy_group: haproxy +pplg_haproxy_uid: 800 +pplg_haproxy_gid: 800 +pplg_haproxy_domain: "ouranos.helu.ca" +pplg_haproxy_cert_path: /etc/haproxy/certs/ouranos.pem +pplg_haproxy_stats_port: 8404 +pplg_haproxy_syslog_port: 51405 + +# ============================================================================ +# Grafana +# ============================================================================ + +# Grafana Datasources +prometheus_datasource_name: Prospero-Prometheus +prometheus_host: prospero.incus +prometheus_port: 9090 +prometheus_datasource_uid: prospero-prometheus +loki_datasource_name: Prospero-Loki +loki_host: prospero.incus +loki_port: 3100 +loki_datasource_uid: prospero-loki + +# Grafana Users +grafana_admin_name: "{{ vault_grafana_admin_name }}" +grafana_admin_login: "{{ vault_grafana_admin_login }}" +grafana_admin_password: "{{ vault_grafana_admin_password }}" +grafana_viewer_name: "{{ vault_grafana_viewer_name }}" +grafana_viewer_login: "{{ vault_grafana_viewer_login }}" +grafana_viewer_password: "{{ vault_grafana_viewer_password }}" + +# Grafana OAuth (Casdoor SSO) +grafana_oauth_enabled: true +grafana_oauth_name: "Casdoor" +grafana_oauth_client_id: "{{ vault_grafana_oauth_client_id }}" +grafana_oauth_client_secret: "{{ vault_grafana_oauth_client_secret }}" +grafana_oauth_auth_url: "https://id.ouranos.helu.ca/login/oauth/authorize" +grafana_oauth_token_url: "https://id.ouranos.helu.ca/api/login/oauth/access_token" +grafana_oauth_api_url: 
"https://id.ouranos.helu.ca/api/userinfo" +grafana_oauth_scopes: "openid profile email" +grafana_root_url: "https://grafana.ouranos.helu.ca" +grafana_oauth_allow_sign_up: true +grafana_oauth_skip_tls_verify: false + +# ============================================================================ +# Prometheus +# ============================================================================ + +prometheus_user: prometheus +prometheus_group: prometheus +prometheus_scrape_interval: 15s +prometheus_evaluation_interval: 15s +alertmanager_host: prospero.incus +alertmanager_port: 9093 +loki_metrics_port: 3100 +prometheus_targets: + - 'oberon.incus:9100' + - 'portia.incus:9100' + - 'ariel.incus:9100' + - 'puck.incus:9100' + - 'puck.incus:25571' + - 'miranda.incus:9100' + - 'sycorax.incus:9100' + - 'prospero.incus:9100' + - 'rosalind.incus:9100' + +# Prometheus OAuth2-Proxy Sidecar +prometheus_proxy_port: 9091 +prometheus_oauth2_proxy_dir: /etc/oauth2-proxy-prometheus +prometheus_oauth2_proxy_version: "7.6.0" +prometheus_oauth2_oidc_issuer_url: "https://id.ouranos.helu.ca" +prometheus_oauth2_client_id: "{{ vault_prometheus_oauth2_client_id }}" +prometheus_oauth2_client_secret: "{{ vault_prometheus_oauth2_client_secret }}" +prometheus_oauth2_cookie_secret: "{{ vault_prometheus_oauth2_cookie_secret }}" + +# ============================================================================ +# Alertmanager +# ============================================================================ + +alertmanager_user: prometheus +alertmanager_group: prometheus +alertmanager_resolve_timeout: 5m +alertmanager_group_wait: 30s +alertmanager_group_interval: 5m +alertmanager_repeat_interval: 4h +pushover_user_key: "{{ vault_pushover_user_key }}" +pushover_api_token: "{{ vault_pushover_api_token }}" +pushover_priority: 1 +pushover_retry: 30 +pushover_expire: 3600 + +# ============================================================================ +# Loki +# 
============================================================================
+
+loki_user: loki
+loki_group: loki
+loki_data_dir: /var/lib/loki
+loki_config_dir: /etc/loki
+loki_config_file: config.yml
+loki_grpc_port: 9096
+
+# ============================================================================
+# PgAdmin (Gunicorn - no Apache)
+# ============================================================================
+
+pgadmin_user: pgadmin
+pgadmin_group: pgadmin
+pgadmin_port: 5050
+pgadmin_data_dir: /var/lib/pgadmin
+pgadmin_log_dir: /var/log/pgadmin
+pgadmin_email: "{{ vault_pgadmin_email }}"
+pgadmin_password: "{{ vault_pgadmin_password }}"
+
+# PgAdmin OAuth (Casdoor SSO)
+pgadmin_oauth_client_id: "{{ vault_pgadmin_oauth_client_id }}"
+pgadmin_oauth_client_secret: "{{ vault_pgadmin_oauth_client_secret }}"
+
+# ============================================================================
+# Casdoor Metrics (for Prometheus scraping)
+# ============================================================================
+
+casdoor_metrics_host: "titania.incus"
+casdoor_metrics_port: 22081
+casdoor_prometheus_access_key: "{{ vault_casdoor_prometheus_access_key }}"
+casdoor_prometheus_access_secret: "{{ vault_casdoor_prometheus_access_secret }}"
diff --git a/ansible/inventory/host_vars/puck.incus.yml b/ansible/inventory/host_vars/puck.incus.yml
new file mode 100644
index 0000000..bc0ddef
--- /dev/null
+++ b/ansible/inventory/host_vars/puck.incus.yml
@@ -0,0 +1,46 @@
+---
+# Puck Configuration - Application Runtime
+# Services: alloy, docker, gitea_runner, jupyterlab
+
+services:
+  - alloy
+  - docker
+  - gitea_runner
+  - jupyterlab
+
+# Gitea Runner
+gitea_runner_name: "puck-runner"
+
+# Alloy
+alloy_log_level: "warn"
+angelia_syslog_port: 51421
+sagittarius_syslog_port: 51431
+athena_syslog_port: 51441
+kairos_syslog_port: 51451
+icarlos_syslog_port: 51461
+spelunker_syslog_port: 51481
+jupyterlab_syslog_port: 51491
+
+# 
=============================================================================
+# JupyterLab Configuration
+# =============================================================================
+jupyterlab_user: robert
+jupyterlab_group: robert
+jupyterlab_notebook_dir: /home/robert
+jupyterlab_venv_dir: /home/robert/env/jupyter
+
+# Ports
+jupyterlab_port: 22081 # JupyterLab (localhost only)
+jupyterlab_proxy_port: 22071 # OAuth2-Proxy (exposed to HAProxy)
+
+# OAuth2-Proxy Configuration
+jupyterlab_oauth2_proxy_dir: /etc/oauth2-proxy-jupyter
+jupyterlab_oauth2_proxy_version: "7.6.0"
+jupyterlab_domain: "ouranos.helu.ca"
+jupyterlab_oauth2_oidc_issuer_url: "https://id.ouranos.helu.ca"
+jupyterlab_oauth2_redirect_url: "https://jupyterlab.ouranos.helu.ca/oauth2/callback"
+
+# OAuth2 Credentials (from vault)
+jupyterlab_oauth_client_id: "{{ vault_jupyterlab_oauth_client_id }}"
+jupyterlab_oauth_client_secret: "{{ vault_jupyterlab_oauth_client_secret }}"
+jupyterlab_oauth2_cookie_secret: "{{ vault_jupyterlab_oauth2_cookie_secret }}"
diff --git a/ansible/inventory/host_vars/rosalind.incus.yml b/ansible/inventory/host_vars/rosalind.incus.yml
new file mode 100644
index 0000000..4322c7b
--- /dev/null
+++ b/ansible/inventory/host_vars/rosalind.incus.yml
@@ -0,0 +1,155 @@
+---
+# Rosalind Configuration - Go, Node.js, PHP Apps
+# Services: alloy, anythingllm, docker, gitea, lobechat, memcached, nextcloud
+
+services:
+  - alloy
+  - anythingllm
+  - docker
+  - gitea
+  - lobechat
+  - memcached
+  - nextcloud
+
+# Alloy
+alloy_log_level: "warn"
+lobechat_syslog_port: 51461
+
+# AnythingLLM Configuration
+anythingllm_user: anythingllm
+anythingllm_group: anythingllm
+anythingllm_directory: /srv/anythingllm
+anythingllm_port: 22084
+
+# AnythingLLM Database (Portia PostgreSQL)
+anythingllm_db_host: portia.incus
+anythingllm_db_port: 5432
+anythingllm_db_name: anythingllm
+anythingllm_db_user: anythingllm
+anythingllm_db_password: "{{ vault_anythingllm_db_password }}"
+
+# AnythingLLM Security
+anythingllm_jwt_secret: "{{ 
vault_anythingllm_jwt_secret }}"
+anythingllm_sig_key: "{{ vault_anythingllm_sig_key }}"
+anythingllm_sig_salt: "{{ vault_anythingllm_sig_salt }}"
+
+# AnythingLLM LLM Provider (Generic OpenAI / llama-cpp)
+anythingllm_llm_base_url: "http://nyx.helu.ca:25540/v1"
+anythingllm_llm_model: "global.anthropic.claude-opus-4-6-v1"
+anythingllm_llm_token_limit: 200000
+anythingllm_llm_api_key: "{{ vault_anythingllm_llm_api_key }}"  # SECURITY: was a committed plaintext key — rotate it and add vault_anythingllm_llm_api_key to the vault
+
+# AnythingLLM Embedding
+anythingllm_embedding_engine: "generic-openai"
+anythingllm_embedding_model: "Qwen3-Embedding-0.6B-Q8_0"
+
+# AnythingLLM TTS (FastKokoro)
+anythingllm_tts_provider: "openai"
+anythingllm_tts_api_key: "not-needed"
+anythingllm_tts_endpoint: "http://pan.helu.ca:22070/v1"
+anythingllm_tts_model: "kokoro"
+anythingllm_tts_voice: "am_echo"
+
+# Gitea User and Directories
+gitea_user: git
+gitea_group: git
+gitea_home_dir: /srv/git
+gitea_work_dir: /var/lib/gitea
+gitea_data_dir: /var/lib/gitea/data
+gitea_lfs_dir: /var/lib/gitea/data/lfs
+gitea_repo_root: /mnt/dv
+gitea_config_file: /etc/gitea/app.ini
+# Ports
+gitea_web_port: 22082
+gitea_ssh_port: 22022
+gitea_metrics_port: 22092
+# Network
+gitea_domain: ouranos.helu.ca
+gitea_root_url: https://gitea.ouranos.helu.ca/
+# Database Configuration
+gitea_db_type: postgres
+gitea_db_host: portia.incus
+gitea_db_port: 5432
+gitea_db_name: gitea
+gitea_db_user: gitea
+gitea_db_password: "{{vault_gitea_db_password}}"
+gitea_db_ssl_mode: disable
+# Features
+gitea_lfs_enabled: true
+gitea_metrics_enabled: true
+# Service Settings
+gitea_disable_registration: true # Use Casdoor SSO instead
+gitea_require_signin_view: false
+# Security (vault secrets)
+gitea_secret_key: "{{vault_gitea_secret_key}}"
+gitea_lfs_jwt_secret: "{{vault_gitea_lfs_jwt_secret}}"
+gitea_metrics_token: "{{vault_gitea_metrics_token}}"
+# OAuth2 (Casdoor SSO)
+gitea_oauth_enabled: true
+gitea_oauth_name: "casdoor"
+gitea_oauth_display_name: "Sign in with Casdoor"
+gitea_oauth_client_id: 
"{{vault_gitea_oauth_client_id}}" +gitea_oauth_client_secret: "{{vault_gitea_oauth_client_secret}}" +# Auth URL uses external HAProxy address (user's browser) +gitea_oauth_auth_url: "https://id.ouranos.helu.ca/login/oauth/authorize" +# Token and userinfo URLs use internal Casdoor address (server-to-server) +gitea_oauth_token_url: "https://id.ouranos.helu.ca/api/login/oauth/access_token" +gitea_oauth_userinfo_url: "https://id.ouranos.helu.ca/api/userinfo" +gitea_oauth_scopes: "openid profile email" + +# LobeChat Configuration +lobechat_user: lobechat +lobechat_group: lobechat +lobechat_directory: /srv/lobechat +lobechat_port: 22081 +# Database Configuration +lobechat_db_host: portia.incus +lobechat_db_port: 5432 +lobechat_db_name: lobechat +lobechat_db_user: lobechat +lobechat_db_password: "{{vault_lobechat_db_password}}" +lobechat_key_vaults_secret: "{{vault_lobechat_key_vaults_secret}}" +# Authentication +# NEXTAUTH_URL must be the public URL users access (not internal) +lobechat_nextauth_url: https://lobechat.ouranos.helu.ca +lobechat_next_auth_secret: "{{vault_lobechat_next_auth_secret}}" +lobechat_next_auth_sso_providers: casdoor +# Issuer must match exactly what Casdoor returns in .well-known/openid-configuration +lobechat_auth_casdoor_issuer: http://titania.incus:22081 +lobechat_auth_casdoor_id: "{{vault_lobechat_auth_casdoor_id}}" +lobechat_auth_casdoor_secret: "{{vault_lobechat_auth_casdoor_secret}}" +# S3 Storage +lobechat_s3_endpoint: https://pan.helu.ca:8555 +lobechat_s3_public_domain: https://pan.helu.ca:8555 +lobechat_s3_access_key: "{{vault_lobechat_s3_access_key}}" +lobechat_s3_secret_key: "{{vault_lobechat_s3_secret_key}}" +lobechat_s3_bucket: lobechat +# Search +lobechat_searxng_url: http://oberon.incus:25599 +# AI Models +lobechat_openai_proxy_url: http://sycorax.incus:25540/v1 +lobechat_openai_key: "{{vault_lobechat_openai_api_key}}" +lobechat_ollama_proxy_url: http://perseus.helu.ca:11434 +lobechat_anthropic_api_key: 
"{{vault_lobechat_anthropic_api_key}}" +lobechat_google_api_key: "{{vault_lobechat_google_api_key}}" +lobechat_app_url: https://lobechat.ouranos.helu.ca/ + +# Nextcloud Configuration +nextcloud_web_port: 22083 +nextcloud_data_dir: /mnt/nextcloud +# Database Configuration +nextcloud_db_type: pgsql +nextcloud_db_host: portia.incus +nextcloud_db_port: 5432 +nextcloud_db_name: nextcloud +nextcloud_db_user: nextcloud +nextcloud_db_password: "{{vault_nextcloud_db_password}}" +# Admin Configuration +nextcloud_admin_user: admin +nextcloud_admin_password: "{{vault_nextcloud_admin_password}}" +# Domain Configuration +nextcloud_domain: nextcloud.ouranos.helu.ca +# Instance secrets (generated during install) +nextcloud_instance_id: "" +nextcloud_password_salt: "" +nextcloud_secret: "" diff --git a/ansible/inventory/host_vars/sycorax.incus.yml b/ansible/inventory/host_vars/sycorax.incus.yml new file mode 100644 index 0000000..db62784 --- /dev/null +++ b/ansible/inventory/host_vars/sycorax.incus.yml @@ -0,0 +1,71 @@ +--- +# Sycorax Configuration - Language Models +# Services: alloy, arke + +services: + - alloy + - arke + +# Alloy +alloy_log_level: "warn" + +# Arke Configuration +arke_user: arke +arke_group: arke +arke_directory: /srv/arke +arke_port: 25540 + +# Server Configuration +arke_reload: false + +# Memcached config +arke_memcached_host: localhost +arke_memcached_port: 11211 + +# Database Configuration +arke_db_host: portia.incus +arke_db_port: 5432 +arke_db_name: arke +arke_db_user: arke +arke_db_password: "{{ vault_arke_db_password }}" + +# NTTh API Configuration +arke_session_limit: 90 +arke_session_ttl: 3600 +arke_token_cache_ttl: 82800 +ntth_token_1_app_name: "{{ vault_ntth_token_1_app_name }}" +ntth_token_1_app_id: "{{ vault_ntth_token_1_app_id }}" +ntth_token_1_app_secret: "{{ vault_ntth_token_1_app_secret }}" +ntth_token_2_app_name: "{{ vault_ntth_token_2_app_name }}" +ntth_token_2_app_id: "{{ vault_ntth_token_2_app_id }}" +ntth_token_2_app_secret: "{{ 
vault_ntth_token_2_app_secret }}" +ntth_token_3_app_name: "{{ vault_ntth_token_3_app_name }}" +ntth_token_3_app_id: "{{ vault_ntth_token_3_app_id }}" +ntth_token_3_app_secret: "{{ vault_ntth_token_3_app_secret }}" +ntth_token_4_app_name: "{{ vault_ntth_token_4_app_name }}" +ntth_token_4_app_id: "{{ vault_ntth_token_4_app_id }}" +ntth_token_4_app_secret: "{{ vault_ntth_token_4_app_secret }}" + +# Embedding Provider Configuration +arke_embedding_provider: openai + +# OpenAI-Compatible Configuration +arke_openai_embedding_base_url: http://pan.helu.ca:22079/v1 +arke_openai_embedding_api_key: 0000 +arke_openai_embedding_model: Qwen3-Embedding-0.6B-Q8_0 + +# Common Embedding Configuration +arke_embedding_batch_size: 16 +arke_embedding_ubatch_size: 512 +arke_embedding_max_context: 8192 +arke_embedding_timeout: 30.0 + +# Memory System Configuration +arke_memory_enabled: true +arke_max_context_tokens: 8000 +arke_similarity_threshold: 0.7 +arke_min_importance_score: 0.7 + +# Monitoring Configuration +arke_prometheus_enabled: true +arke_metrics_port: 25540 diff --git a/ansible/inventory/host_vars/titania.incus.yml b/ansible/inventory/host_vars/titania.incus.yml new file mode 100644 index 0000000..fa7dc9c --- /dev/null +++ b/ansible/inventory/host_vars/titania.incus.yml @@ -0,0 +1,217 @@ +--- +# Titania Configuration - Proxy & SSO Services +# Services: alloy, certbot, docker, haproxy, postgresql_ssl, casdoor + +services: + - alloy + - certbot + - docker + - haproxy + - postgresql_ssl + - casdoor + +# PostgreSQL SSL Configuration (dedicated database for identity services) +postgresql_ssl_postgres_password: "{{ vault_postgresql_ssl_postgres_password }}" +postgresql_ssl_port: 5432 +postgresql_ssl_cert_path: /etc/postgresql/17/main/ssl/server.crt + +# Alloy +alloy_log_level: "warn" +casdoor_syslog_port: 51401 +haproxy_syslog_port: 51404 + +# Certbot Configuration (Let's Encrypt DNS-01 with Namecheap) +certbot_user: certbot +certbot_group: certbot +certbot_directory: /srv/certbot 
+certbot_email: webmaster@helu.ca +certbot_cert_name: ouranos.helu.ca +certbot_domains: + - "*.ouranos.helu.ca" + - "ouranos.helu.ca" +prometheus_node_exporter_text_directory: /var/lib/prometheus/node-exporter + +# HAProxy Configuration +haproxy_user: haproxy +haproxy_group: haproxy +haproxy_uid: 800 +haproxy_gid: 800 +haproxy_directory: /srv/haproxy +haproxy_http_port: 8080 +haproxy_https_port: 8443 +haproxy_stats_port: 8404 +haproxy_domain: "ouranos.helu.ca" +haproxy_cert_path: /etc/haproxy/certs/ouranos.pem + +# HAProxy TCP Backend Definitions (mode tcp passthrough) +haproxy_tcp_backends: + - name: gitea_ssh + listen_port: 22022 + backend_host: "rosalind.incus" + backend_port: 22022 + +# HAProxy Backend Definitions +haproxy_backends: + - subdomain: "" # Root domain (ouranos.helu.ca) + backend_host: "puck.incus" + backend_port: 22281 + health_path: "/" + # timeout_server: "50s" # Optional override + + - subdomain: "id" # Casdoor SSO (id.ouranos.helu.ca) + backend_host: "titania.incus" + backend_port: 22081 + health_path: "/api/health" + redirect_root: "/login/heluca" # Redirect root to branded org login page + + - subdomain: "openwebui" + backend_host: "oberon.incus" + backend_port: 22088 + health_path: "/" + + - subdomain: "anythingllm" + backend_host: "rosalind.incus" + backend_port: 22084 + health_path: "/api/ping" + + - subdomain: "arke" + backend_host: "sycorax.incus" + backend_port: 25540 + health_path: "/health" + + # SearXNG - routed through OAuth2-Proxy sidecar on Oberon + - subdomain: "searxng" + backend_host: "oberon.incus" + backend_port: 22073 + health_path: "/ping" + + - subdomain: "pgadmin" + backend_host: "prospero.incus" + backend_port: 443 + health_path: "/misc/ping" + ssl_backend: true + + - subdomain: "grafana" + backend_host: "prospero.incus" + backend_port: 443 + health_path: "/api/health" + ssl_backend: true + + - subdomain: "prometheus" + backend_host: "prospero.incus" + backend_port: 443 + health_path: "/ping" + ssl_backend: true + + - 
subdomain: "loki" + backend_host: "prospero.incus" + backend_port: 443 + health_path: "/ready" + ssl_backend: true + + - subdomain: "alertmanager" + backend_host: "prospero.incus" + backend_port: 443 + health_path: "/-/healthy" + ssl_backend: true + + - subdomain: "gitea" + backend_host: "rosalind.incus" + backend_port: 22082 + health_path: "/api/healthz" + timeout_server: 120s + + - subdomain: "lobechat" + backend_host: "rosalind.incus" + backend_port: 22081 + health_path: "/chat" + + - subdomain: "nextcloud" + backend_host: "rosalind.incus" + backend_port: 22083 + health_path: "/status.php" + + - subdomain: "angelia" + backend_host: "puck.incus" + backend_port: 22281 + health_path: "/" + + - subdomain: "athena" + backend_host: "puck.incus" + backend_port: 22481 + health_path: "/ready/" + + - subdomain: "kairos" + backend_host: "puck.incus" + backend_port: 22581 + health_path: "/ready/" + + - subdomain: "icarlos" + backend_host: "puck.incus" + backend_port: 22681 + health_path: "/ready/" + + - subdomain: "mcp-switchboard" + backend_host: "puck.incus" + backend_port: 22781 + health_path: "/ready/" + + - subdomain: "spelunker" + backend_host: "puck.incus" + backend_port: 22881 + health_path: "/ready/" + + - subdomain: "peitho" + backend_host: "puck.incus" + backend_port: 22981 + health_path: "/ready/" + + - subdomain: "jupyterlab" + backend_host: "puck.incus" + backend_port: 22071 # OAuth2-Proxy port + health_path: "/ping" + timeout_server: 300s # WebSocket support + + - subdomain: "hass" + backend_host: "oberon.incus" + backend_port: 8123 + health_path: "/api/" + timeout_server: 300s # WebSocket support for HA frontend + + - subdomain: "smtp4dev" + backend_host: "oberon.incus" + backend_port: 22085 + health_path: "/" + +# Casdoor Configuration +casdoor_user: casdoor +casdoor_group: casdoor +casdoor_directory: /srv/casdoor +# Web Configuration +casdoor_port: 22081 +casdoor_runmode: dev +casdoor_copyrequestbody: true +casdoor_drivername: postgres +# Database 
Configuration +casdoor_db_port: 5432 +casdoor_db_name: casdoor +casdoor_db_user: casdoor +casdoor_db_password: "{{ vault_casdoor_db_password }}" +casdoor_db_sslmode: disable +casdoor_showsql: false +# Redis and Storage +casdoor_redis_endpoint: "" +casdoor_default_storage_provider: "" +# Authentication +casdoor_auth_state: "{{ vault_casdoor_auth_state }}" +# Origin must include port for internal OIDC endpoints to work correctly +casdoor_origin: "https://id.ouranos.helu.ca" +casdoor_origin_frontend: "https://id.ouranos.helu.ca" +# Timeouts and Ports +casdoor_inactive_timeout_minutes: 60 +casdoor_ldap_server_port: 0 +casdoor_ldaps_cert_id: "" +casdoor_ldaps_server_port: 0 +casdoor_radius_server_port: 1812 +casdoor_radius_default_organization: "built-in" +casdoor_radius_secret: "{{ vault_casdoor_radius_secret }}" diff --git a/ansible/inventory/hosts b/ansible/inventory/hosts new file mode 100644 index 0000000..69d990b --- /dev/null +++ b/ansible/inventory/hosts @@ -0,0 +1,50 @@ +--- +# Ansible Inventory - Simplified +# Variables moved to: +# - host_vars/{hostname}.yml (host-specific config) +# - group_vars/all/vars.yml (common variables) + +# Red Panda Approved Uranian Hosts +ubuntu: + hosts: + ariel.incus: + caliban.incus: + miranda.incus: + oberon.incus: + portia.incus: + prospero.incus: + puck.incus: + rosalind.incus: + sycorax.incus: + titania.incus: + korax.helu.ca: + +# Service-specific groups for targeted deployments +agent_s: + hosts: + caliban.incus: + +arke: + hosts: + sycorax.incus: + +casdoor: + hosts: + titania.incus: + +kernos: + hosts: + caliban.incus: + korax.helu.ca: + +searxng: + hosts: + oberon.incus: + +gitea: + hosts: + rosalind.incus: + +mcpo: + hosts: + miranda.incus: diff --git a/ansible/jupyterlab/deploy.yml b/ansible/jupyterlab/deploy.yml new file mode 100644 index 0000000..eb04f7f --- /dev/null +++ b/ansible/jupyterlab/deploy.yml @@ -0,0 +1,221 @@ +--- +# JupyterLab Deployment with OAuth2-Proxy Sidecar +# Deploys JupyterLab as systemd service 
with Casdoor SSO via oauth2-proxy +# Red Panda Approved + +- name: Deploy JupyterLab + hosts: ubuntu + become: true + tasks: + - name: Check if host has jupyterlab service + ansible.builtin.set_fact: + has_jupyterlab_service: "{{'jupyterlab' in services}}" + + - name: Skip hosts without jupyterlab service + ansible.builtin.meta: end_host + when: not has_jupyterlab_service + + # ========================================================================= + # System Dependencies + # ========================================================================= + - name: Install system dependencies + ansible.builtin.apt: + name: + - python3 + - python3-venv + - python3-dev + - python3-pip + - nodejs + - npm + - graphviz + - git + - curl + state: present + update_cache: true + + # ========================================================================= + # User Setup + # ========================================================================= + - name: Ensure jupyterlab user exists + ansible.builtin.user: + name: "{{ jupyterlab_user }}" + group: "{{ jupyterlab_group }}" + shell: /bin/bash + create_home: true + state: present + + - name: Create Notebooks directory + ansible.builtin.file: + path: "{{ jupyterlab_notebook_dir }}" + owner: "{{ jupyterlab_user }}" + group: "{{ jupyterlab_group }}" + state: directory + mode: '0755' + + - name: Create JupyterLab config directory + ansible.builtin.file: + path: /etc/jupyterlab + owner: root + group: "{{ jupyterlab_group }}" + state: directory + mode: '0755' + + - name: Create JupyterLab log directory + ansible.builtin.file: + path: /var/log/jupyterlab + owner: "{{ jupyterlab_user }}" + group: "{{ jupyterlab_group }}" + state: directory + mode: '0755' + + # ========================================================================= + # Python Virtual Environment + # ========================================================================= + - name: Create virtual environment directory + ansible.builtin.file: + path: "{{ 
jupyterlab_venv_dir }}" + owner: "{{ jupyterlab_user }}" + group: "{{ jupyterlab_group }}" + state: directory + mode: '0755' + + - name: Create virtual environment for JupyterLab + become_user: "{{ jupyterlab_user }}" + ansible.builtin.command: + cmd: "python3 -m venv {{ jupyterlab_venv_dir }}" + creates: "{{ jupyterlab_venv_dir }}/bin/activate" + + - name: Upgrade pip in virtual environment + become_user: "{{ jupyterlab_user }}" + ansible.builtin.pip: + name: + - pip + - wheel + - setuptools + state: latest + virtualenv: "{{ jupyterlab_venv_dir }}" + + - name: Install JupyterLab and core packages + become_user: "{{ jupyterlab_user }}" + ansible.builtin.pip: + name: + - jupyterlab + - jupyter-ai[all] + - langchain-ollama + - matplotlib + - plotly + - jupyter_contrib_nbextensions + - "jsonschema[format-nongpl]" + - python-mermaid + - ipywidgets + state: present + virtualenv: "{{ jupyterlab_venv_dir }}" + notify: restart jupyterlab + + # ========================================================================= + # Configuration Files + # ========================================================================= + - name: Template JupyterLab configuration + ansible.builtin.template: + src: jupyter_lab_config.py.j2 + dest: /etc/jupyterlab/jupyter_lab_config.py + owner: root + group: "{{ jupyterlab_group }}" + mode: '0644' + notify: restart jupyterlab + + - name: Template JupyterLab systemd service + ansible.builtin.template: + src: jupyterlab.service.j2 + dest: /etc/systemd/system/jupyterlab.service + owner: root + group: root + mode: '0644' + notify: + - reload systemd + - restart jupyterlab + + # ========================================================================= + # OAuth2-Proxy Sidecar + # ========================================================================= + - name: Create oauth2-proxy directory + ansible.builtin.file: + path: "{{ jupyterlab_oauth2_proxy_dir }}" + owner: root + group: root + state: directory + mode: '0755' + + - name: Download 
oauth2-proxy binary + ansible.builtin.get_url: + url: "https://github.com/oauth2-proxy/oauth2-proxy/releases/download/v{{ jupyterlab_oauth2_proxy_version }}/oauth2-proxy-v{{ jupyterlab_oauth2_proxy_version }}.linux-amd64.tar.gz" + dest: "/tmp/oauth2-proxy-v{{ jupyterlab_oauth2_proxy_version }}.tar.gz" + mode: '0644' + + - name: Extract oauth2-proxy binary + ansible.builtin.unarchive: + src: "/tmp/oauth2-proxy-v{{ jupyterlab_oauth2_proxy_version }}.tar.gz" + dest: /tmp + remote_src: true + creates: "/tmp/oauth2-proxy-v{{ jupyterlab_oauth2_proxy_version }}.linux-amd64/oauth2-proxy" + + - name: Install oauth2-proxy binary + ansible.builtin.copy: + src: "/tmp/oauth2-proxy-v{{ jupyterlab_oauth2_proxy_version }}.linux-amd64/oauth2-proxy" + dest: /usr/local/bin/oauth2-proxy + owner: root + group: root + mode: '0755' + remote_src: true + + - name: Template oauth2-proxy configuration + ansible.builtin.template: + src: oauth2-proxy-jupyter.cfg.j2 + dest: "{{ jupyterlab_oauth2_proxy_dir }}/oauth2-proxy.cfg" + owner: root + group: root + mode: '0600' + notify: restart oauth2-proxy-jupyter + + - name: Template oauth2-proxy systemd service + ansible.builtin.template: + src: oauth2-proxy-jupyter.service.j2 + dest: /etc/systemd/system/oauth2-proxy-jupyter.service + owner: root + group: root + mode: '0644' + notify: + - reload systemd + - restart oauth2-proxy-jupyter + + # ========================================================================= + # Service Management + # ========================================================================= + - name: Enable and start JupyterLab service + ansible.builtin.systemd: + name: jupyterlab + enabled: true + state: started + daemon_reload: true + + - name: Enable and start OAuth2-Proxy service + ansible.builtin.systemd: + name: oauth2-proxy-jupyter + enabled: true + state: started + daemon_reload: true + + handlers: + - name: reload systemd + ansible.builtin.systemd: + daemon_reload: true + + - name: restart jupyterlab + 
ansible.builtin.systemd: + name: jupyterlab + state: restarted + + - name: restart oauth2-proxy-jupyter + ansible.builtin.systemd: + name: oauth2-proxy-jupyter + state: restarted diff --git a/ansible/jupyterlab/jupyter_lab_config.py.j2 b/ansible/jupyterlab/jupyter_lab_config.py.j2 new file mode 100644 index 0000000..a3b47db --- /dev/null +++ b/ansible/jupyterlab/jupyter_lab_config.py.j2 @@ -0,0 +1,62 @@ +# JupyterLab Configuration +# Deployed via Ansible - Do not edit manually +# Red Panda Approved + +# ============================================================================= +# Server Settings +# ============================================================================= + +# Allow connections from reverse proxy +c.ServerApp.allow_remote_access = True +c.ServerApp.local_hostnames = ['localhost', '127.0.0.1', 'jupyter.{{ jupyterlab_domain }}'] + +# Disable browser launch +c.ServerApp.open_browser = False + +# Disable token authentication (OAuth2-Proxy handles auth) +c.ServerApp.token = '' +c.ServerApp.password = '' + +# Base URL for reverse proxy +c.ServerApp.base_url = '/' + +# Trust X-Forwarded headers from OAuth2-Proxy +c.ServerApp.trust_xheaders = True + +# ============================================================================= +# WebSocket Configuration (for reverse proxy) +# ============================================================================= + +# Allow WebSocket connections from any origin (handled by OAuth2-Proxy) +c.ServerApp.allow_origin = '*' +c.ServerApp.allow_credentials = True + +# Disable XSRF for API (OAuth2-Proxy handles CSRF) +c.ServerApp.disable_check_xsrf = True + +# ============================================================================= +# Notebook Settings +# ============================================================================= + +# Default notebook directory +c.ServerApp.root_dir = '{{ jupyterlab_notebook_dir }}' + +# Allow hidden files +c.ContentsManager.allow_hidden = True + +# 
============================================================================= +# Terminal Settings +# ============================================================================= + +# Enable terminal +c.ServerApp.terminals_enabled = True + +# ============================================================================= +# Logging +# ============================================================================= + +# Log level +c.Application.log_level = 'INFO' + +# Log format +c.Application.log_format = '[%(levelname)s %(asctime)s %(name)s] %(message)s' diff --git a/ansible/jupyterlab/jupyterlab.service.j2 b/ansible/jupyterlab/jupyterlab.service.j2 new file mode 100644 index 0000000..25b9ad3 --- /dev/null +++ b/ansible/jupyterlab/jupyterlab.service.j2 @@ -0,0 +1,34 @@ +[Unit] +Description=JupyterLab Server +After=network.target +Wants=oauth2-proxy-jupyter.service + +[Service] +Type=simple +User={{ jupyterlab_user }} +Group={{ jupyterlab_group }} +WorkingDirectory={{ jupyterlab_notebook_dir }} + +ExecStart={{ jupyterlab_venv_dir }}/bin/jupyter-lab \ + --config=/etc/jupyterlab/jupyter_lab_config.py \ + --ip=127.0.0.1 \ + --port={{ jupyterlab_port }} \ + --no-browser \ + --notebook-dir={{ jupyterlab_notebook_dir }} + +Environment="PATH={{ jupyterlab_venv_dir }}/bin:/usr/local/bin:/usr/bin:/bin" + +Restart=on-failure +RestartSec=10 + +# Security hardening +NoNewPrivileges=true +PrivateTmp=true + +# Logging +StandardOutput=journal +StandardError=journal +SyslogIdentifier=jupyterlab + +[Install] +WantedBy=multi-user.target diff --git a/ansible/jupyterlab/oauth2-proxy-jupyter.cfg.j2 b/ansible/jupyterlab/oauth2-proxy-jupyter.cfg.j2 new file mode 100644 index 0000000..c153473 --- /dev/null +++ b/ansible/jupyterlab/oauth2-proxy-jupyter.cfg.j2 @@ -0,0 +1,68 @@ +# OAuth2-Proxy Configuration for JupyterLab +# Authenticates users via Casdoor OIDC before proxying to JupyterLab +# Red Panda Approved + +# Provider Configuration (Casdoor OIDC) +provider = "oidc" +provider_display_name 
= "Casdoor" +oidc_issuer_url = "{{ jupyterlab_oauth2_oidc_issuer_url }}" +client_id = "{{ jupyterlab_oauth_client_id }}" +client_secret = "{{ jupyterlab_oauth_client_secret }}" + +# Redirect URL after authentication +redirect_url = "{{ jupyterlab_oauth2_redirect_url }}" + +# Upstream service (JupyterLab on localhost) +upstreams = [ + "http://127.0.0.1:{{ jupyterlab_port }}" +] + +# Session/Cookie Configuration +cookie_secret = "{{ jupyterlab_oauth2_cookie_secret }}" +cookie_name = "_oauth2_proxy_jupyter" +cookie_secure = true +cookie_httponly = true +cookie_samesite = "lax" +cookie_domains = [ + ".{{ jupyterlab_domain }}" +] + +# Authentication settings +email_domains = ["*"] +oidc_email_claim = "email" +oidc_groups_claim = "groups" + +# Session settings +session_store_type = "cookie" +cookie_expire = "168h" +cookie_refresh = "1h" + +# Request settings - pass user info to JupyterLab +pass_access_token = false +pass_authorization_header = false +set_authorization_header = false +set_xauthrequest = true + +# Logging +request_logging = true +auth_logging = true +standard_logging = true + +# Network settings +http_address = "0.0.0.0:{{ jupyterlab_proxy_port }}" +reverse_proxy = true +real_client_ip_header = "X-Forwarded-For" + +# Skip authentication for health check endpoints +skip_auth_routes = [ + "^/api/status$", + "^/healthz$" +] + +# OIDC specific settings +skip_provider_button = true +oidc_extra_audiences = [] +insecure_oidc_allow_unverified_email = true + +# SSL verification (internal Casdoor uses valid certs) +ssl_insecure_skip_verify = false diff --git a/ansible/jupyterlab/oauth2-proxy-jupyter.service.j2 b/ansible/jupyterlab/oauth2-proxy-jupyter.service.j2 new file mode 100644 index 0000000..369ad7d --- /dev/null +++ b/ansible/jupyterlab/oauth2-proxy-jupyter.service.j2 @@ -0,0 +1,23 @@ +[Unit] +Description=OAuth2-Proxy for JupyterLab +After=network.target jupyterlab.service +Requires=jupyterlab.service + +[Service] +Type=simple 
+ExecStart=/usr/local/bin/oauth2-proxy --config={{ jupyterlab_oauth2_proxy_dir }}/oauth2-proxy.cfg + +Restart=on-failure +RestartSec=5 + +# Security hardening +NoNewPrivileges=true +PrivateTmp=true + +# Logging +StandardOutput=journal +StandardError=journal +SyslogIdentifier=oauth2-proxy-jupyter + +[Install] +WantedBy=multi-user.target diff --git a/ansible/kernos/.env.j2 b/ansible/kernos/.env.j2 new file mode 100644 index 0000000..74712e0 --- /dev/null +++ b/ansible/kernos/.env.j2 @@ -0,0 +1,22 @@ +# Kernos Environment Configuration +# HTTP-enabled MCP shell server using FastMCP + +# ============================================================================ +# Server Configuration +# ============================================================================ +HOST={{ kernos_host | default('0.0.0.0') }} +PORT={{ kernos_port }} + +# ============================================================================ +# Logging Configuration +# ============================================================================ +LOG_FORMAT={{ kernos_log_format | default('json') }} +LOG_LEVEL={{ kernos_log_level | default('INFO') }} +ENVIRONMENT={{ kernos_environment | default('production') }} + +# ============================================================================ +# Security Configuration +# ============================================================================ +# Comma-separated whitelist of allowed commands +# Commands after shell operators (;, &&, ||, |) are also validated +ALLOW_COMMANDS={{ kernos_allow_commands }} \ No newline at end of file diff --git a/ansible/kernos/deploy.yml b/ansible/kernos/deploy.yml new file mode 100644 index 0000000..a732aba --- /dev/null +++ b/ansible/kernos/deploy.yml @@ -0,0 +1,180 @@ +--- +- name: Deploy Kernos MCP Shell Server + hosts: kernos + vars: + ansible_common_remote_group: "{{kernos_group}}" + allow_world_readable_tmpfiles: true + tasks: + - name: Create Kernos group + become: true + ansible.builtin.group: + name: 
"{{kernos_group}}" + state: present + + - name: Create kernos user + become: true + ansible.builtin.user: + name: "{{kernos_user}}" + group: "{{kernos_group}}" + home: "/home/{{kernos_user}}" + shell: /bin/bash + system: false + create_home: true + + - name: Add remote_user to kernos group + become: true + ansible.builtin.user: + name: "{{remote_user}}" + groups: "{{kernos_group}}" + append: true + + - name: Reset connection to pick up new group membership + ansible.builtin.meta: reset_connection + + - name: Create required directories + become: true + ansible.builtin.file: + path: "{{kernos_directory}}" + owner: "{{kernos_user}}" + group: "{{kernos_group}}" + state: directory + mode: '750' + + - name: Ensure tar is installed for unarchive task + become: true + ansible.builtin.apt: + name: + - tar + state: present + update_cache: true + + - name: Ensure Python, Python Dev, Venv module is installed + become: true + ansible.builtin.apt: + name: [python3, python3-venv, python3-dev] + state: present + update_cache: true + + - name: Transfer and unarchive git archive + become: true + ansible.builtin.unarchive: + src: "~/rel/kernos_{{kernos_rel}}.tar" + dest: "{{kernos_directory}}" + owner: "{{kernos_user}}" + group: "{{kernos_group}}" + mode: '550' + notify: restart kernos + + - name: Ensure venv directory ownership is correct + become: true + ansible.builtin.file: + path: "{{kernos_directory}}/.venv" + owner: "{{kernos_user}}" + group: "{{kernos_group}}" + state: directory + recurse: true + when: ansible_facts['file'] is defined or true + + - name: Create virtual environment for Kernos + become: true + become_user: "{{kernos_user}}" + ansible.builtin.command: + cmd: "python3 -m venv {{kernos_directory}}/.venv/" + creates: "{{kernos_directory}}/.venv/bin/activate" + + - name: Install wheel in virtual environment + become: true + become_user: "{{kernos_user}}" + ansible.builtin.pip: + name: + - wheel + state: latest + virtualenv: "{{kernos_directory}}/.venv" + + - name: 
Install pyproject.toml dependencies in virtualenv + become: true + become_user: "{{kernos_user}}" + ansible.builtin.pip: + chdir: "{{kernos_directory}}" + name: . + virtualenv: "{{kernos_directory}}/.venv" + virtualenv_command: python3 -m venv + notify: restart kernos + + - name: Template Kernos .env configuration + become: true + ansible.builtin.template: + src: .env.j2 + dest: "{{kernos_directory}}/.env" + owner: "{{kernos_user}}" + group: "{{kernos_group}}" + mode: '640' + notify: restart kernos + + - name: Template systemd service file + become: true + ansible.builtin.template: + src: kernos.service.j2 + dest: /etc/systemd/system/kernos.service + owner: root + group: root + mode: '644' + notify: restart kernos + + - name: Enable and start kernos service + become: true + ansible.builtin.systemd: + name: kernos + enabled: true + state: started + daemon_reload: true + + - name: Flush handlers to restart service before validation + ansible.builtin.meta: flush_handlers + + - name: Validate Kernos liveness endpoint + ansible.builtin.uri: + url: "http://localhost:{{kernos_port}}/live" + status_code: 200 + return_content: true + register: live_check + retries: 5 + delay: 5 + until: live_check.status == 200 + + - name: Validate Kernos readiness endpoint + ansible.builtin.uri: + url: "http://localhost:{{kernos_port}}/ready" + status_code: 200 + return_content: true + register: ready_check + retries: 5 + delay: 5 + until: ready_check.status == 200 + + - name: Validate Kernos health endpoint + ansible.builtin.uri: + url: "http://localhost:{{kernos_port}}/health" + status_code: 200 + return_content: true + register: health_check + retries: 5 + delay: 5 + until: health_check.status == 200 + + - name: Validate Kernos /metrics endpoint + ansible.builtin.uri: + url: "http://localhost:{{kernos_port}}/metrics" + status_code: 200 + return_content: false + register: metrics_check + retries: 5 + delay: 5 + until: metrics_check.status == 200 + + handlers: + - name: restart kernos + 
become: true + ansible.builtin.systemd: + name: kernos + state: restarted diff --git a/ansible/kernos/kernos.service b/ansible/kernos/kernos.service new file mode 100644 index 0000000..3ab83dd --- /dev/null +++ b/ansible/kernos/kernos.service @@ -0,0 +1,23 @@ +[Unit] +Description=Kernos MCP Server +After=network.target + +[Service] +Type=simple +User=nobody +Group=nogroup +WorkingDirectory=/srv/kernos +ExecStart=/srv/kernos/.venv/bin/kernos +EnvironmentFile=/srv/kernos/.env +Restart=on-failure +RestartSec=5 + +# Security hardening +NoNewPrivileges=true +ProtectSystem=strict +ProtectHome=read-only +PrivateTmp=false +ReadWritePaths=/ + +[Install] +WantedBy=multi-user.target diff --git a/ansible/kernos/kernos.service.j2 b/ansible/kernos/kernos.service.j2 new file mode 100644 index 0000000..a9577ab --- /dev/null +++ b/ansible/kernos/kernos.service.j2 @@ -0,0 +1,23 @@ +[Unit] +Description=Kernos MCP Server +After=network.target + +[Service] +Type=simple +User={{kernos_user}} +Group={{kernos_group}} +WorkingDirectory={{kernos_directory}} +ExecStart={{kernos_directory}}/.venv/bin/kernos +EnvironmentFile={{kernos_directory}}/.env +Restart=on-failure +RestartSec=5 + +# Security hardening +NoNewPrivileges=false +ProtectSystem=false +ProtectHome=false +PrivateTmp=false +ReadWritePaths=/ + +[Install] +WantedBy=multi-user.target diff --git a/ansible/kernos/stage.yml b/ansible/kernos/stage.yml new file mode 100644 index 0000000..9e31a50 --- /dev/null +++ b/ansible/kernos/stage.yml @@ -0,0 +1,47 @@ +--- +- name: Stage Kernos release tarball + hosts: localhost + gather_facts: false + vars: + archive_path: "{{rel_dir}}/kernos_{{kernos_rel}}.tar" + kernos_repo_url: "ssh://robert@clio.helu.ca:18677/mnt/dev/kernos" + kernos_repo_dir: "{{repo_dir}}/kernos" + + tasks: + - name: Ensure release directory exists + file: + path: "{{rel_dir}}" + state: directory + mode: '755' + + - name: Ensure repo directory exists + file: + path: "{{repo_dir}}" + state: directory + mode: '755' + + - name: 
Clone Kernos repository if not present + ansible.builtin.git: + repo: "{{kernos_repo_url}}" + dest: "{{kernos_repo_dir}}" + version: "{{kernos_rel}}" + accept_hostkey: true + register: git_clone + ignore_errors: true + + - name: Fetch all remote branches and tags + ansible.builtin.command: git fetch --all + args: + chdir: "{{kernos_repo_dir}}" + when: git_clone is not changed + + - name: Pull latest changes + ansible.builtin.command: git pull + args: + chdir: "{{kernos_repo_dir}}" + when: git_clone is not changed + + - name: Create Kernos archive for specified release + ansible.builtin.command: git archive -o "{{archive_path}}" "{{kernos_rel}}" + args: + chdir: "{{kernos_repo_dir}}" \ No newline at end of file diff --git a/ansible/lobechat/.env.example b/ansible/lobechat/.env.example new file mode 100644 index 0000000..0f22306 --- /dev/null +++ b/ansible/lobechat/.env.example @@ -0,0 +1,241 @@ +# add a access code to lock your lobe-chat application, you can set a long password to avoid leaking. If this value contains a comma, it is a password array. +# ACCESS_CODE=lobe66 + +# Specify your API Key selection method, currently supporting `random` and `turn`. +# API_KEY_SELECT_MODE=random + + +######################################## +########## AI Provider Service ######### +######################################## + +### OpenAI ### + +# you openai api key +OPENAI_API_KEY=sk-xxxxxxxxx + +# use a proxy to connect to the OpenAI API +# OPENAI_PROXY_URL=https://api.openai.com/v1 + +# add your custom model name, multi model separate by comma. for example gpt-3.5-1106,gpt-4-1106 +# OPENAI_MODEL_LIST=gpt-3.5-turbo + + +### Azure OpenAI ### + +# you can learn azure OpenAI Service on https://learn.microsoft.com/en-us/azure/ai-services/openai/overview +# use Azure OpenAI Service by uncomment the following line + +# The API key you applied for on the Azure OpenAI account page, which can be found in the "Keys and Endpoints" section. 
+# AZURE_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +# The endpoint you applied for on the Azure OpenAI account page, which can be found in the "Keys and Endpoints" section. +# AZURE_ENDPOINT=https://docs-test-001.openai.azure.com + +# Azure's API version, follows the YYYY-MM-DD format +# AZURE_API_VERSION=2024-10-21 + + +### Anthropic Service #### + +# ANTHROPIC_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +# use a proxy to connect to the Anthropic API +# ANTHROPIC_PROXY_URL=https://api.anthropic.com + + +### Google AI #### + +# GOOGLE_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxx + + +### AWS Bedrock ### + +# AWS_REGION=us-east-1 +# AWS_ACCESS_KEY_ID=xxxxxxxxxxxxxxxxxxx +# AWS_SECRET_ACCESS_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + + +### Ollama AI #### + +# You can use ollama to get and run LLM locally, learn more about it via https://github.com/ollama/ollama + +# The local/remote ollama service url +# OLLAMA_PROXY_URL=http://127.0.0.1:11434 + +# OLLAMA_MODEL_LIST=your_ollama_model_names + + +### OpenRouter Service ### + +# OPENROUTER_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxx +# OPENROUTER_MODEL_LIST=model1,model2,model3 + + +### Mistral AI ### + +# MISTRAL_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +### Perplexity Service ### + +# PERPLEXITY_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +### Groq Service #### + +# GROQ_API_KEY=gsk_xxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +#### 01.AI Service #### + +# ZEROONE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +### TogetherAI Service ### + +# TOGETHERAI_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +### ZhiPu AI ### + +# ZHIPU_API_KEY=xxxxxxxxxxxxxxxxxxx.xxxxxxxxxxxxx + +### Moonshot AI #### + +# MOONSHOT_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +### Minimax AI #### + +# MINIMAX_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +### DeepSeek AI #### + +# DEEPSEEK_PROXY_URL=https://api.deepseek.com/v1 +# DEEPSEEK_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +### Qiniu AI #### + +# QINIU_PROXY_URL=https://api.qnaigc.com/v1 +# 
QINIU_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +### Qwen AI #### + +# QWEN_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +### Cloudflare Workers AI #### + +# CLOUDFLARE_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxx +# CLOUDFLARE_BASE_URL_OR_ACCOUNT_ID=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +### SiliconCloud AI #### + +# SILICONCLOUD_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + + +### TencentCloud AI #### + +# TENCENT_CLOUD_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +### PPIO #### + +# PPIO_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +### INFINI-AI ### + +# INFINIAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +######################################## +############ Market Service ############ +######################################## + +# The LobeChat agents market index url +# AGENTS_INDEX_URL=https://chat-agents.lobehub.com + +######################################## +############ Plugin Service ############ +######################################## + +# The LobeChat plugins store index url +# PLUGINS_INDEX_URL=https://chat-plugins.lobehub.com + +# set the plugin settings +# the format is `plugin-identifier:key1=value1;key2=value2`, multiple settings fields are separated by semicolons `;`, multiple plugin settings are separated by commas `,`. 
+# PLUGIN_SETTINGS=search-engine:SERPAPI_API_KEY=xxxxx + +######################################## +####### Doc / Changelog Service ######## +######################################## + +# Use in Changelog / Document service cdn url prefix +# DOC_S3_PUBLIC_DOMAIN=https://xxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +# Use in dev cdn workflow +# DOC_S3_ACCESS_KEY_ID=xxxxxxxxxxxxxxxxxxxxxxxxxxxxx +# DOC_S3_SECRET_ACCESS_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxx + + +######################################## +##### S3 Object Storage Service ######## +######################################## + +# S3 keys +# S3_ACCESS_KEY_ID=xxxxxxxxxxxxxxxxxxxxxxxxxxxxx +# S3_SECRET_ACCESS_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +# Bucket name +# S3_BUCKET=lobechat + +# Bucket request endpoint +# S3_ENDPOINT=https://xxxxxxxxxxxxxxxxxxxxxxxxxxxxx.r2.cloudflarestorage.com + +# Public access domain for the bucket +# S3_PUBLIC_DOMAIN=https://s3-for-lobechat.your-domain.com + +# Bucket region, such as us-west-1, generally not needed to add +# but some service providers may require configuration +# S3_REGION=us-west-1 + + +######################################## +############ Auth Service ############## +######################################## + + +# Clerk related configurations + +# Clerk public key and secret key +#NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=pk_live_xxxxxxxxxxx +#CLERK_SECRET_KEY=sk_live_xxxxxxxxxxxxxxxxxxxxxx + +# you need to config the clerk webhook secret key if you want to use the clerk with database +#CLERK_WEBHOOK_SECRET=whsec_xxxxxxxxxxxxxxxxxxxxxx + + +# NextAuth related configurations +# NEXT_PUBLIC_ENABLE_NEXT_AUTH=1 +# NEXT_AUTH_SECRET= + +# Auth0 configurations +# AUTH_AUTH0_ID= +# AUTH_AUTH0_SECRET= +# AUTH_AUTH0_ISSUER=https://your-domain.auth0.com + +######################################## +########## Server Database ############# +######################################## + +# Specify the service mode as server if you want to use the server database +# NEXT_PUBLIC_SERVICE_MODE=server + +# 
Postgres database URL +# DATABASE_URL=postgres://username:password@host:port/database + +# use `openssl rand -base64 32` to generate a key for the encryption of the database +# we use this key to encrypt the user api key and proxy url +#KEY_VAULTS_SECRET=xxxxx/xxxxxxxxxxxxxx= + +# Specify the Embedding model and Reranker model(unImplemented) +# DEFAULT_FILES_CONFIG="embedding_model=openai/embedding-text-3-small,reranker_model=cohere/rerank-english-v3.0,query_mode=full_text" + +######################################## +########## MCP Service Config ########## +######################################## + +# MCP tool call timeout (milliseconds) +# MCP_TOOL_TIMEOUT=60000 diff --git a/ansible/lobechat/deploy.yml b/ansible/lobechat/deploy.yml new file mode 100644 index 0000000..367c3b6 --- /dev/null +++ b/ansible/lobechat/deploy.yml @@ -0,0 +1,82 @@ +--- +- name: Deploy LobeChat to Dev Environment + hosts: ubuntu + tasks: + - name: Check if host has lobechat service + ansible.builtin.set_fact: + has_lobechat_service: "{{ 'lobechat' in services | default([]) }}" + + - name: Skip hosts without lobechat service + ansible.builtin.meta: end_host + when: not has_lobechat_service + + - name: Create lobechat group + become: true + ansible.builtin.group: + name: "{{lobechat_user}}" + + - name: Create lobechat user + become: true + ansible.builtin.user: + name: "{{lobechat_user}}" + comment: "{{lobechat_user}}" + group: "{{lobechat_group}}" + system: true + + - name: Add group lobechat to user ponos + become: true + ansible.builtin.user: + name: ponos + groups: "{{lobechat_group}}" + append: true + + - name: Create lobechat directory + become: true + ansible.builtin.file: + path: "{{lobechat_directory}}" + owner: "{{lobechat_user}}" + group: "{{lobechat_group}}" + state: directory + mode: '750' + + - name: Template docker-compose file + become: true + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{lobechat_directory}}/docker-compose.yml" + owner: 
"{{lobechat_user}}" + group: "{{lobechat_group}}" + mode: '550' + register: lobechat_compose + + - name: Reset SSH connection to apply group changes + meta: reset_connection + + - name: Start LobeChat service + become: true + community.docker.docker_compose_v2: + project_src: "{{lobechat_directory}}" + state: present + pull: always + + - name: Restart LobeChat if configuration changed + become: true + community.docker.docker_compose_v2: + project_src: "{{lobechat_directory}}" + state: restarted + when: lobechat_compose.changed + + - name: Wait for LobeChat to be healthy + ansible.builtin.uri: + url: "http://localhost:{{lobechat_port}}/chat" + method: GET + status_code: 200 + register: lobechat_health + until: lobechat_health.status == 200 + retries: 30 + delay: 5 + delegate_to: "{{inventory_hostname}}" + + - name: Display LobeChat status + ansible.builtin.debug: + msg: "LobeChat is running at http://{{inventory_hostname}}:{{lobechat_port}}" diff --git a/ansible/lobechat/docker-compose.yml.j2 b/ansible/lobechat/docker-compose.yml.j2 new file mode 100644 index 0000000..965802c --- /dev/null +++ b/ansible/lobechat/docker-compose.yml.j2 @@ -0,0 +1,36 @@ +services: + lobe-chat: + image: lobehub/lobe-chat-database:latest + pull_policy: always + environment: + - DATABASE_URL=postgresql://{{lobechat_db_user}}:{{lobechat_db_password}}@{{lobechat_db_host}}:{{lobechat_db_port}}/{{lobechat_db_name}} + - KEY_VAULTS_SECRET={{lobechat_key_vaults_secret}} + - NEXTAUTH_URL={{lobechat_nextauth_url}} + - NEXT_AUTH_SECRET={{lobechat_next_auth_secret}} + - NEXT_AUTH_SSO_PROVIDERS={{lobechat_next_auth_sso_providers}} + - AUTH_CASDOOR_ISSUER={{lobechat_auth_casdoor_issuer}} + - AUTH_CASDOOR_ID={{lobechat_auth_casdoor_id}} + - AUTH_CASDOOR_SECRET={{lobechat_auth_casdoor_secret}} + # Trust self-signed certificates for internal OIDC communication + - NODE_TLS_REJECT_UNAUTHORIZED=0 + - S3_ENDPOINT={{lobechat_s3_endpoint}} + - S3_PUBLIC_DOMAIN={{lobechat_s3_public_domain}} + - 
S3_ACCESS_KEY_ID={{lobechat_s3_access_key}} + - S3_SECRET_ACCESS_KEY={{lobechat_s3_secret_key}} + - S3_BUCKET={{lobechat_s3_bucket}} + - SEARXNG_URL={{lobechat_searxng_url}} + - OPENAI_PROXY_URL={{lobechat_openai_proxy_url}} + - OPENAI_API_KEY={{lobechat_openai_key}} + - OLLAMA_PROXY_URL={{lobechat_ollama_proxy_url}} + - ANTHROPIC_API_KEY={{lobechat_anthropic_api_key}} + - GOOGLE_API_KEY={{lobechat_google_api_key}} + - APP_URL={{lobechat_app_url}} + logging: + driver: syslog + options: + syslog-address: "tcp://127.0.0.1:{{lobechat_syslog_port}}" + syslog-format: "{{syslog_format}}" + tag: "lobechat" + ports: + - "{{lobechat_port}}:3210" + restart: unless-stopped diff --git a/ansible/loki/config.yml.j2 b/ansible/loki/config.yml.j2 new file mode 100644 index 0000000..a65621f --- /dev/null +++ b/ansible/loki/config.yml.j2 @@ -0,0 +1,41 @@ +auth_enabled: false + +server: + http_listen_port: {{ loki_port }} + grpc_listen_port: {{ loki_grpc_port }} + +common: + path_prefix: {{ loki_data_dir }} + storage: + filesystem: + chunks_directory: {{ loki_data_dir }}/chunks + rules_directory: {{ loki_data_dir }}/rules + replication_factor: 1 + ring: + instance_addr: 127.0.0.1 + kvstore: + store: inmemory + +query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 + +schema_config: + configs: + - from: 2024-04-01 + object_store: filesystem + store: tsdb + schema: v13 + index: + prefix: index_ + period: 24h + +ruler: + alertmanager_url: http://{{ alertmanager_host }}:{{ alertmanager_port }} + +# Red Panda Approved Configuration +analytics: + reporting_enabled: false diff --git a/ansible/loki/deploy.yml b/ansible/loki/deploy.yml new file mode 100644 index 0000000..2f1e870 --- /dev/null +++ b/ansible/loki/deploy.yml @@ -0,0 +1,89 @@ +--- +- name: Deploy Loki to Prospero + hosts: ubuntu + become: true + tasks: + - name: Check if host has loki service + ansible.builtin.set_fact: + has_loki_service: "{{'loki' in services}}" + + - name: Skip hosts 
without loki service + ansible.builtin.meta: end_host + when: not has_loki_service + + - name: Add Grafana repository + ansible.builtin.deb822_repository: + name: grafana + types: [deb] + uris: https://apt.grafana.com + suites: [stable] + components: [main] + signed_by: https://apt.grafana.com/gpg.key + state: present + + - name: Install Loki + become: true + ansible.builtin.apt: + name: + - loki + state: present + update_cache: true + + - name: Create loki group + become: true + ansible.builtin.group: + name: "{{loki_group}}" + + - name: Create loki user + become: true + ansible.builtin.user: + name: "{{loki_user}}" + comment: "{{loki_user}}" + group: "{{loki_group}}" + system: true + + - name: Add group loki to ansible_user + become: true + ansible.builtin.user: + name: "{{ansible_user}}" + groups: "{{loki_group}}" + append: true + + - name: Create loki directories + become: true + ansible.builtin.file: + path: "{{item}}" + owner: "{{loki_user}}" + group: "{{loki_group}}" + state: directory + mode: '750' + loop: + - "{{loki_data_dir}}" + - "{{loki_config_dir}}" + + - name: Template Loki configuration + become: true + ansible.builtin.template: + src: "{{loki_config_file}}.j2" + dest: "{{loki_config_dir}}/{{loki_config_file}}" + owner: "{{loki_user}}" + group: "{{loki_group}}" + mode: '550' + notify: restart loki + + - name: Reset SSH connection to apply group changes + meta: reset_connection + + - name: Enable and start Loki service + become: true + ansible.builtin.systemd: + name: loki + enabled: true + state: started + + handlers: + - name: restart loki + become: true + ansible.builtin.systemd: + name: loki + state: restarted diff --git a/ansible/mcp_switchboard/.env.j2 b/ansible/mcp_switchboard/.env.j2 new file mode 100644 index 0000000..4de9821 --- /dev/null +++ b/ansible/mcp_switchboard/.env.j2 @@ -0,0 +1,26 @@ +# Django Configuration +SECRET_KEY={{ vault_mcp_switchboard_secret_key }} +DEBUG=True +ALLOWED_HOSTS=localhost,127.0.0.1,oberon.incus,{{ 
ansible_default_ipv4.address }} + +# Database Configuration +DB_ENGINE=django.db.backends.postgresql +DB_NAME={{ mcp_switchboard_db_name }} +DB_USER={{ mcp_switchboard_db_user }} +DB_PASSWORD={{ mcp_switchboard_db_password }} +DB_HOST={{ mcp_switchboard_db_host }} +DB_PORT={{ mcp_switchboard_db_port }} + +# Docker Configuration +DOCKER_HOST={{ mcp_switchboard_docker_host }} + +# RabbitMQ Configuration +RABBITMQ_HOST={{ mcp_switchboard_rabbitmq_host }} +RABBITMQ_PORT={{ mcp_switchboard_rabbitmq_port }} +RABBITMQ_USER={{ mcp_switchboard_rabbitmq_user }} +RABBITMQ_PASSWORD={{ mcp_switchboard_rabbitmq_password }} +CELERY_BROKER_URL=amqp://{{ mcp_switchboard_rabbitmq_user }}:{{ mcp_switchboard_rabbitmq_password }}@{{ mcp_switchboard_rabbitmq_host }}:{{ mcp_switchboard_rabbitmq_port }}// + +# Application Configuration +BIND_HOST=0.0.0.0 +BIND_PORT={{ mcp_switchboard_port }} diff --git a/ansible/mcp_switchboard/deploy.yml b/ansible/mcp_switchboard/deploy.yml new file mode 100644 index 0000000..c437575 --- /dev/null +++ b/ansible/mcp_switchboard/deploy.yml @@ -0,0 +1,105 @@ +--- +- name: Deploy MCP Switchboard + hosts: ubuntu + tasks: + - name: Check if host has mcp_switchboard service + ansible.builtin.set_fact: + has_mcp_switchboard_service: "{{'mcp_switchboard' in services}}" + + - name: Skip hosts without mcp_switchboard service + ansible.builtin.meta: end_host + when: not has_mcp_switchboard_service + + - name: Create mcp_switchboard user and group + become: true + ansible.builtin.group: + name: "{{mcp_switchboard_group}}" + state: present + + - name: Create mcp_switchboard user + become: true + ansible.builtin.user: + name: "{{mcp_switchboard_user}}" + group: "{{mcp_switchboard_group}}" + home: "{{mcp_switchboard_directory}}" + shell: /bin/bash + system: true + create_home: false + + - name: Add ansible_user to mcp_switchboard group + become: true + ansible.builtin.user: + name: "{{ansible_user}}" + groups: "{{mcp_switchboard_group}}" + append: true + + - name: Add 
mcp_switchboard user to docker group + become: true + ansible.builtin.user: + name: "{{mcp_switchboard_user}}" + groups: docker + append: true + + - name: Create required directories + become: true + ansible.builtin.file: + path: "{{mcp_switchboard_directory}}" + owner: "{{mcp_switchboard_user}}" + group: "{{mcp_switchboard_group}}" + state: directory + mode: '750' + + - name: Transfer and unarchive git archive + become: true + ansible.builtin.unarchive: + src: "~/rel/mcp_switchboard_{{mcp_switchboard_rel}}.tar" + dest: "{{mcp_switchboard_directory}}" + owner: "{{mcp_switchboard_user}}" + group: "{{mcp_switchboard_group}}" + mode: '550' + + - name: Install mcp_switchboard package in virtual environment + become: true + become_user: "{{mcp_switchboard_user}}" + vars: + ansible_common_remote_group: "{{mcp_switchboard_group}}" + allow_world_readable_tmpfiles: true + ansible.builtin.pip: + name: . + virtualenv: "{{mcp_switchboard_directory}}/.venv" + virtualenv_command: python3 -m venv + chdir: "{{mcp_switchboard_directory}}" + + - name: Template .env file + become: true + ansible.builtin.template: + src: .env.j2 + dest: "{{mcp_switchboard_directory}}/.env" + owner: "{{mcp_switchboard_user}}" + group: "{{mcp_switchboard_group}}" + mode: '550' + + - name: Template systemd service file + become: true + ansible.builtin.template: + src: mcp_switchboard.service.j2 + dest: /etc/systemd/system/mcp_switchboard.service + owner: root + group: root + mode: '644' + notify: restart mcp_switchboard + + - name: Enable and start mcp_switchboard service + become: true + ansible.builtin.systemd: + name: mcp_switchboard + enabled: true + state: started + daemon_reload: true + + handlers: + - name: restart mcp_switchboard + become: true + ansible.builtin.systemd: + name: mcp_switchboard + state: restarted diff --git a/ansible/mcp_switchboard/mcp_switchboard.service.j2 b/ansible/mcp_switchboard/mcp_switchboard.service.j2 new file mode 100644 index 0000000..706b771 --- /dev/null +++ 
b/ansible/mcp_switchboard/mcp_switchboard.service.j2 @@ -0,0 +1,17 @@ +[Unit] +Description=MCP Switchboard - Django Application +After=network.target postgresql.service rabbitmq-server.service + +[Service] +Type=notify +User={{ mcp_switchboard_user }} +Group={{ mcp_switchboard_group }} +WorkingDirectory={{ mcp_switchboard_directory }} +Environment="PATH={{ mcp_switchboard_directory }}/.venv/bin:/usr/local/bin:/usr/bin:/bin" +EnvironmentFile={{ mcp_switchboard_directory }}/.env +ExecStart={{ mcp_switchboard_directory }}/.venv/bin/gunicorn mcp_switchboard.wsgi:application --bind {{ mcp_switchboard_bind_host | default('0.0.0.0') }}:{{ mcp_switchboard_port }} --workers 4 --timeout 120 +Restart=always +RestartSec=10 + +[Install] +WantedBy=multi-user.target diff --git a/ansible/mcp_switchboard/stage.yml b/ansible/mcp_switchboard/stage.yml new file mode 100644 index 0000000..ed59377 --- /dev/null +++ b/ansible/mcp_switchboard/stage.yml @@ -0,0 +1,29 @@ +--- +- name: Stage MCP Switchboard release tarball + hosts: localhost + gather_facts: false + vars: + mcp_switchboard_repo_dir: "{{repo_dir}}/mcp_switchboard" + archive_path: "{{rel_dir}}/mcp_switchboard_{{mcp_switchboard_rel}}.tar" + + tasks: + - name: Ensure release directory exists + file: + path: "{{rel_dir}}" + state: directory + mode: '755' + + - name: Fetch all remote branches and tags + ansible.builtin.command: git fetch --all + args: + chdir: "{{mcp_switchboard_repo_dir}}" + + - name: Pull latest changes + ansible.builtin.command: git pull + args: + chdir: "{{mcp_switchboard_repo_dir}}" + + - name: Create MCP Switchboard archive for specified release + ansible.builtin.command: git archive -o "{{archive_path}}" "{{mcp_switchboard_rel}}" + args: + chdir: "{{mcp_switchboard_repo_dir}}" diff --git a/ansible/mcpo/config.json.j2 b/ansible/mcpo/config.json.j2 new file mode 100644 index 0000000..4a1af8e --- /dev/null +++ b/ansible/mcpo/config.json.j2 @@ -0,0 +1,55 @@ +{ + "mcpServers": { + "time": { + "command": 
"/srv/mcpo/.venv/bin/python", + "args": ["/srv/mcpo/.venv/bin/mcp-server-time", "--local-timezone=America/Toronto"] + }, + "upstash-context7": { + "command": "npx", + "args": [ + "-y", + "@upstash/context7-mcp" + ] + }, + "angelia": { + "url": "{{angelia_mcp_url}}", + "headers": { + "Authorization": "Bearer {{angelia_mcp_auth}}" + } + }, + "argos": { + "type": "streamable_http", + "url": "{{argos_mcp_url}}" + }, + "caliban": { + "type": "streamable_http", + "url": "{{caliban_mcp_url}}" + }, + "gitea": { + "type": "streamable_http", + "url": "{{gitea_mcp_url}}" + }, + "github": { + "type": "streamable_http", + "url": "https://api.githubcopilot.com/mcp/", + "headers": { + "Authorization": "Bearer {{github_personal_access_token}}" + } + }, + "huggingface": { + "type": "streamable_http", + "url": "https://huggingface.co/mcp", + "headers": { + "Authorization": "Bearer {{huggingface_mcp_token}}" + } + }, + "korax": { + "type": "streamable_http", + "url": "{{korax_mcp_url}}" + }, + "neo4j-cypher": { + "type": "streamable_http", + "url": "{{neo4j_mcp_url}}" + } + } +} diff --git a/ansible/mcpo/deploy.yml b/ansible/mcpo/deploy.yml new file mode 100644 index 0000000..0053734 --- /dev/null +++ b/ansible/mcpo/deploy.yml @@ -0,0 +1,172 @@ +--- +- name: Deploy MCPO as a system service + hosts: mcpo + vars: + ansible_common_remote_group: ponos + handlers: + - name: restart mcpo + become: true + ansible.builtin.systemd: + name: mcpo + state: restarted + tasks: + - name: Create MCPO group + become: true + ansible.builtin.group: + name: "{{mcpo_group}}" + system: true + + - name: Create MCPO User + become: true + ansible.builtin.user: + name: "{{mcpo_user}}" + group: "{{mcpo_group}}" + comment: "{{mcpo_user}}" + system: true + + - name: Add remote_user to mcpo group + become: true + ansible.builtin.user: + name: "{{remote_user}}" + groups: "{{mcpo_group}}" + append: true + + - name: Create required directories + become: true + ansible.builtin.file: + path: "{{mcpo_directory}}" + 
owner: "{{mcpo_user}}" + group: "{{mcpo_group}}" + state: directory + mode: '750' + + - name: Check if config.json exists + ansible.builtin.stat: + path: "{{mcpo_directory}}/config.json" + register: config_file + + - name: Backup existing config if present + become: true + ansible.builtin.copy: + src: "{{mcpo_directory}}/config.json" + dest: "{{mcpo_directory}}/config.json.bak" + remote_src: yes + owner: "{{mcpo_user}}" + group: "{{mcpo_group}}" + mode: '0660' + when: config_file.stat.exists + + - name: Deploy config.json from template + become: true + ansible.builtin.template: + src: "config.json.j2" + dest: "{{mcpo_directory}}/config.json" + owner: "{{mcpo_user}}" + group: "{{mcpo_group}}" + mode: '0660' + notify: restart mcpo + + - name: Add NodeSource repository for Node.js 22.x + become: true + ansible.builtin.deb822_repository: + name: nodesource + types: [deb] + uris: https://deb.nodesource.com/node_22.x + suites: [nodistro] + components: [main] + signed_by: https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key + state: present + + - name: Install nodejs and Python 3.12 packages + become: true + apt: + name: [nodejs, python3.12, python3.12-venv, python3.12-dev, python3-pip] + state: latest + update_cache: true + + - name: Create virtual environment + become: true + become_user: "{{mcpo_user}}" + ansible.builtin.command: python3.12 -m venv {{mcpo_directory}}/.venv + args: + creates: "{{mcpo_directory}}/.venv/bin/activate" + vars: + ansible_common_remote_group: "{{mcpo_group}}" + allow_world_readable_tmpfiles: true + + - name: Install pip packages in virtual environment + become: true + become_user: "{{mcpo_user}}" + ansible.builtin.pip: + name: + - wheel + - mcpo + - mcp-server-time + state: latest + virtualenv: "{{mcpo_directory}}/.venv" + vars: + ansible_common_remote_group: "{{mcpo_group}}" + allow_world_readable_tmpfiles: true + + - name: Pre-install Context7 MCP package + become: true + become_user: "{{mcpo_user}}" + ansible.builtin.command: npx -y 
@upstash/context7-mcp --help + args: + chdir: "{{mcpo_directory}}" + vars: + ansible_common_remote_group: "{{mcpo_group}}" + allow_world_readable_tmpfiles: true + ignore_errors: true + + - name: Create systemd service file + become: true + ansible.builtin.template: + src: mcpo.service.j2 + dest: /etc/systemd/system/mcpo.service + mode: '0644' + + - name: Reload systemd and enable mcpo service + become: true + ansible.builtin.systemd: + name: mcpo + daemon_reload: true + enabled: true + state: started + register: mcpo_service_status + + - name: Verify MCPO service is running + ansible.builtin.debug: + msg: "MCPO service is running and enabled" + when: mcpo_service_status is succeeded + + - name: Check if MCPO service is responding + ansible.builtin.uri: + url: http://localhost:{{mcpo_port}}/docs + method: GET + status_code: 200 + register: health_check + ignore_errors: yes + retries: 5 + delay: 5 + until: health_check.status == 200 + + - name: Report MCPO health status + ansible.builtin.debug: + msg: "MCPO health check {{ 'succeeded' if health_check.status == 200 else 'failed' }}" + when: not health_check.failed + + - name: Verify Grafana MCP backend is reachable + ansible.builtin.uri: + url: "http://localhost:{{grafana_mcp_port}}/mcp" + method: GET + status_code: [200, 405] + register: grafana_mcp_check + ignore_errors: true + retries: 3 + delay: 3 + until: grafana_mcp_check.status in [200, 405] + when: grafana_mcp_port is defined + + - name: Report Grafana MCP backend status + ansible.builtin.debug: + msg: "Grafana MCP backend {{ 'reachable' if not grafana_mcp_check.failed else 'UNREACHABLE - MCPO grafana proxy will not work until grafana_mcp/deploy.yml is run' }}" + when: grafana_mcp_port is defined and grafana_mcp_check is defined diff --git a/ansible/mcpo/mcpo.service.j2 b/ansible/mcpo/mcpo.service.j2 new file mode 100644 index 0000000..0249761 --- /dev/null +++ b/ansible/mcpo/mcpo.service.j2 @@ -0,0 +1,15 @@ +[Unit] +Description=MCPO +After=network.target + +[Service] +Type=simple +User={{mcpo_user}} +Group={{mcpo_group}} 
+WorkingDirectory={{mcpo_directory}} +ExecStart={{mcpo_directory}}/.venv/bin/mcpo --port {{mcpo_port}} --config {{mcpo_directory}}/config.json +Restart=always +RestartSec=3 + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/ansible/mcpo/requirements.txt b/ansible/mcpo/requirements.txt new file mode 100644 index 0000000..e2673c5 --- /dev/null +++ b/ansible/mcpo/requirements.txt @@ -0,0 +1,3 @@ +wheel +mcpo +mcp-server-time \ No newline at end of file diff --git a/ansible/mcpo/restart.yml b/ansible/mcpo/restart.yml new file mode 100644 index 0000000..8f4b3a9 --- /dev/null +++ b/ansible/mcpo/restart.yml @@ -0,0 +1,32 @@ +--- +- name: Restart MCPO service + hosts: mcpo + tasks: + - name: Restart mcpo service + become: true + ansible.builtin.systemd: + name: mcpo + state: restarted + register: restart_result + + - name: Wait for MCPO to be ready + ansible.builtin.wait_for: + port: "{{mcpo_port}}" + host: localhost + delay: 2 + timeout: 30 + state: started + + - name: Check if MCPO service is responding + ansible.builtin.uri: + url: http://localhost:{{mcpo_port}}/docs + method: GET + status_code: 200 + register: health_check + retries: 5 + delay: 3 + until: health_check.status == 200 + + - name: Report MCPO health status + ansible.builtin.debug: + msg: "✔ MCPO service successfully restarted and is healthy" diff --git a/ansible/neo4j/deploy.yml b/ansible/neo4j/deploy.yml new file mode 100644 index 0000000..bf81b7e --- /dev/null +++ b/ansible/neo4j/deploy.yml @@ -0,0 +1,55 @@ +--- +- name: Deploy Neo4j with Docker Compose + hosts: ubuntu + become: true + vars: + required_service: neo4j + tasks: + - name: Check if host has neo4j service + ansible.builtin.set_fact: + has_neo4j_service: "{{ required_service in services | default([]) }}" + + - name: Skip hosts without neo4j service + ansible.builtin.meta: end_host + when: not has_neo4j_service + + - name: Create neo4j group + ansible.builtin.group: + name: "{{neo4j_group}}" + + - name: Create 
neo4j user + ansible.builtin.user: + name: "{{neo4j_user}}" + comment: "{{neo4j_user}}" + group: "{{neo4j_group}}" + system: true + + - name: Add group neo4j to user ponos + ansible.builtin.user: + name: ponos + groups: "{{neo4j_group}}" + append: true + + - name: Create neo4j directory + ansible.builtin.file: + path: "{{neo4j_directory}}" + owner: "{{neo4j_user}}" + group: "{{neo4j_group}}" + state: directory + mode: '750' + + - name: Template docker-compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{neo4j_directory}}/docker-compose.yml" + owner: "{{neo4j_user}}" + group: "{{neo4j_group}}" + mode: '550' + + - name: Reset SSH connection to apply group changes + meta: reset_connection + + - name: Start Neo4j service + community.docker.docker_compose_v2: + project_src: "{{neo4j_directory}}" + state: present diff --git a/ansible/neo4j/docker-compose.yml.j2 b/ansible/neo4j/docker-compose.yml.j2 new file mode 100644 index 0000000..a7c7cbd --- /dev/null +++ b/ansible/neo4j/docker-compose.yml.j2 @@ -0,0 +1,31 @@ +services: + neo4j: + image: neo4j:{{neo4j_version}} + container_name: neo4j + restart: unless-stopped + ports: + - "{{neo4j_http_port}}:7474" + - "{{neo4j_bolt_port}}:7687" + volumes: + - neo4j_data:/data + - neo4j_logs:/logs + - neo4j_plugins:/plugins + environment: + NEO4J_AUTH: "{{neo4j_auth_user}}/{{neo4j_auth_password}}" + # APOC Plugin + NEO4J_PLUGINS: '["apoc"]' + NEO4J_apoc_export_file_enabled: "true" + NEO4J_apoc_import_file_enabled: "true" + NEO4J_apoc_import_file_use__neo4j__config: "true" + NEO4J_dbms_security_procedures_unrestricted: "{{neo4j_apoc_unrestricted}}" + logging: + driver: syslog + options: + syslog-address: "tcp://127.0.0.1:{{neo4j_syslog_port}}" + syslog-format: "{{syslog_format}}" + tag: "neo4j" + +volumes: + neo4j_data: + neo4j_logs: + neo4j_plugins: \ No newline at end of file diff --git a/ansible/neo4j_mcp/deploy.yml b/ansible/neo4j_mcp/deploy.yml new file mode 100644 index 0000000..23172d9 --- /dev/null 
+++ b/ansible/neo4j_mcp/deploy.yml @@ -0,0 +1,56 @@ +--- +- name: Deploy Neo4j MCP Servers with Docker Compose + hosts: ubuntu + become: true + vars: + required_service: neo4j_mcp + tasks: + - name: Check if host has neo4j_mcp service + ansible.builtin.set_fact: + has_neo4j_mcp_service: "{{ required_service in services | default([]) }}" + + - name: Skip hosts without neo4j_mcp service + ansible.builtin.meta: end_host + when: not has_neo4j_mcp_service + + - name: Create neo4j_mcp group + ansible.builtin.group: + name: "{{neo4j_mcp_group}}" + + - name: Create neo4j_mcp user + ansible.builtin.user: + name: "{{neo4j_mcp_user}}" + comment: "{{neo4j_mcp_user}}" + group: "{{neo4j_mcp_group}}" + system: true + + - name: Add group neo4j_mcp to user ponos + ansible.builtin.user: + name: ponos + groups: "{{neo4j_mcp_group}}" + append: true + + - name: Create neo4j_mcp directory + ansible.builtin.file: + path: "{{neo4j_mcp_directory}}" + owner: "{{neo4j_mcp_user}}" + group: "{{neo4j_mcp_group}}" + state: directory + mode: '750' + + - name: Template docker-compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{neo4j_mcp_directory}}/docker-compose.yml" + owner: "{{neo4j_mcp_user}}" + group: "{{neo4j_mcp_group}}" + mode: '550' + + - name: Reset SSH connection to apply group changes + meta: reset_connection + + - name: Start Neo4j MCP services + community.docker.docker_compose_v2: + project_src: "{{neo4j_mcp_directory}}" + state: present + pull: always diff --git a/ansible/neo4j_mcp/docker-compose.yml.j2 b/ansible/neo4j_mcp/docker-compose.yml.j2 new file mode 100644 index 0000000..a983dee --- /dev/null +++ b/ansible/neo4j_mcp/docker-compose.yml.j2 @@ -0,0 +1,27 @@ +services: + neo4j-cypher: + image: mcp/neo4j-cypher:latest + pull_policy: always + container_name: neo4j-cypher + restart: unless-stopped + ports: + - "{{neo4j_cypher_mcp_port}}:8000" + environment: + - NEO4J_URI=bolt://{{neo4j_host}}:{{neo4j_bolt_port}} + - NEO4J_USERNAME=neo4j + - 
NEO4J_PASSWORD={{neo4j_auth_password}} + - NEO4J_DATABASE=neo4j + - NEO4J_TRANSPORT=http + - NEO4J_MCP_SERVER_HOST=0.0.0.0 + - NEO4J_MCP_SERVER_PORT=8000 + - NEO4J_MCP_SERVER_PATH=/mcp + - NEO4J_NAMESPACE=local + - NEO4J_MCP_SERVER_ALLOWED_HOSTS=localhost,127.0.0.1,miranda.incus,rosalind.incus,miranda.incus:{{neo4j_cypher_mcp_port}} + - NEO4J_MCP_SERVER_ALLOW_ORIGINS= + - NEO4J_READ_TIMEOUT=30 + logging: + driver: syslog + options: + syslog-address: "tcp://127.0.0.1:{{neo4j_cypher_syslog_port}}" + syslog-format: "{{syslog_format}}" + tag: "neo4j-cypher" diff --git a/ansible/nextcloud/config.php.j2 b/ansible/nextcloud/config.php.j2 new file mode 100644 index 0000000..8d84b54 --- /dev/null +++ b/ansible/nextcloud/config.php.j2 @@ -0,0 +1,40 @@ + '{{ nextcloud_instance_id | default("") }}', + 'passwordsalt' => '{{ nextcloud_password_salt | default("") }}', + 'secret' => '{{ nextcloud_secret | default("") }}', + 'trusted_domains' => + array ( + 0 => 'rosalind.incus', + 1 => '{{ nextcloud_domain }}', + ), + 'datadirectory' => '{{ nextcloud_data_dir }}', + 'dbtype' => 'pgsql', + 'version' => '', + 'overwrite.cli.url' => 'https://{{ nextcloud_domain }}', + 'dbname' => '{{ nextcloud_db_name }}', + 'dbhost' => '{{ nextcloud_db_host }}:{{ nextcloud_db_port }}', + 'dbport' => '', + 'dbtableprefix' => 'oc_', + 'dbuser' => '{{ nextcloud_db_user }}', + 'dbpassword' => '{{ nextcloud_db_password }}', + 'installed' => true, + 'memcache.local' => '\\OC\\Memcache\\Memcached', + 'memcache.distributed' => '\\OC\\Memcache\\Memcached', + 'memcached_servers' => + array ( + 0 => + array ( + 0 => 'localhost', + 1 => 11211, + ), + ), + 'memcached_options' => + array ( + 'prefix' => 'nc_', + ), + 'maintenance' => false, + 'theme' => '', + 'loglevel' => 2, + 'default_phone_region' => 'US', +); diff --git a/ansible/nextcloud/deploy.yml b/ansible/nextcloud/deploy.yml new file mode 100644 index 0000000..38d449f --- /dev/null +++ b/ansible/nextcloud/deploy.yml @@ -0,0 +1,195 @@ +--- +- name: 
Deploy Nextcloud + hosts: ubuntu + become: true + tasks: + - name: Check if host has nextcloud service + ansible.builtin.set_fact: + has_nextcloud_service: "{{ 'nextcloud' in services | default([]) }}" + + - name: Skip hosts without nextcloud service + ansible.builtin.meta: end_host + when: not has_nextcloud_service + + - name: Install required packages for Nextcloud + ansible.builtin.apt: + name: + - apache2 + - libapache2-mod-php + - php-gd + - php-pgsql + - php-curl + - php-mbstring + - php-intl + - php-gmp + - php-bcmath + - php-xml + - php-imagick + - php-zip + - php-memcached + - memcached + - libmagickcore-6.q16-6-extra + - unzip + - curl + - bzip2 + - acl + state: present + update_cache: true + + - name: Ensure Memcached is running + ansible.builtin.service: + name: memcached + state: started + enabled: true + + - name: Create Nextcloud data directory + ansible.builtin.file: + path: "{{ nextcloud_data_dir }}" + state: directory + owner: www-data + group: www-data + mode: '0750' + + - name: Check if Nextcloud is already installed + ansible.builtin.stat: + path: /var/www/nextcloud/version.php + register: nextcloud_installed + + - name: Download Nextcloud tarball + ansible.builtin.get_url: + url: https://download.nextcloud.com/server/releases/latest.tar.bz2 + dest: /tmp/nextcloud-latest.tar.bz2 + mode: '0644' + when: not nextcloud_installed.stat.exists + + - name: Extract Nextcloud tarball + ansible.builtin.unarchive: + src: /tmp/nextcloud-latest.tar.bz2 + dest: /tmp/ + remote_src: true + when: not nextcloud_installed.stat.exists + + - name: Copy Nextcloud to web root + ansible.builtin.copy: + src: /tmp/nextcloud/ + dest: /var/www/nextcloud/ + remote_src: true + owner: www-data + group: www-data + mode: preserve + when: not nextcloud_installed.stat.exists + + - name: Set proper ownership for Nextcloud directory + ansible.builtin.file: + path: /var/www/nextcloud + state: directory + owner: www-data + group: www-data + recurse: true + + - name: Template Apache 
VirtualHost configuration + ansible.builtin.template: + src: nextcloud.conf.j2 + dest: /etc/apache2/sites-available/nextcloud.conf + owner: root + group: root + mode: '0644' + notify: reload apache + + - name: Disable default Apache site + ansible.builtin.command: + cmd: a2dissite 000-default.conf + args: + removes: /etc/apache2/sites-enabled/000-default.conf + notify: reload apache + + - name: Enable Nextcloud Apache site + ansible.builtin.command: + cmd: a2ensite nextcloud.conf + args: + creates: /etc/apache2/sites-enabled/nextcloud.conf + notify: reload apache + + - name: Enable required Apache modules + ansible.builtin.command: + cmd: "a2enmod {{ item }}" + loop: + - rewrite + - headers + - env + - dir + - mime + notify: reload apache + register: apache_mods + changed_when: "'already enabled' not in apache_mods.stdout" + + - name: Check if Nextcloud is already configured + ansible.builtin.stat: + path: /var/www/nextcloud/config/config.php + register: nextcloud_config + + - name: Run Nextcloud installation + become_user: www-data + ansible.builtin.command: + cmd: > + php /var/www/nextcloud/occ maintenance:install + --database "pgsql" + --database-name "{{ nextcloud_db_name }}" + --database-host "{{ nextcloud_db_host }}" + --database-port "{{ nextcloud_db_port }}" + --database-user "{{ nextcloud_db_user }}" + --database-pass "{{ nextcloud_db_password }}" + --admin-user "{{ nextcloud_admin_user }}" + --admin-pass "{{ nextcloud_admin_password }}" + --data-dir "{{ nextcloud_data_dir }}" + when: not nextcloud_config.stat.exists + no_log: true + + - name: Add trusted domain + become_user: www-data + ansible.builtin.command: + cmd: "php /var/www/nextcloud/occ config:system:set trusted_domains 1 --value={{ nextcloud_domain }}" + changed_when: false + + - name: Configure Memcached + become_user: www-data + ansible.builtin.command: + cmd: "php /var/www/nextcloud/occ config:system:set memcache.local --value='\\OC\\Memcache\\Memcached'" + changed_when: false + + - name: 
Configure Memcached server + become_user: www-data + ansible.builtin.command: + cmd: "php /var/www/nextcloud/occ config:system:set memcache.distributed --value='\\OC\\Memcache\\Memcached'" + changed_when: false + + - name: Configure cron job for Nextcloud + ansible.builtin.cron: + name: "Nextcloud background jobs" + minute: "*/5" + user: www-data + job: "php /var/www/nextcloud/cron.php" + state: present + + - name: Set Nextcloud background job mode to cron + become_user: www-data + ansible.builtin.command: + cmd: "php /var/www/nextcloud/occ background:cron" + changed_when: false + + - name: Ensure Apache is running + ansible.builtin.service: + name: apache2 + state: started + enabled: true + + handlers: + - name: reload apache + ansible.builtin.service: + name: apache2 + state: reloaded + + - name: restart apache + ansible.builtin.service: + name: apache2 + state: restarted diff --git a/ansible/nextcloud/nextcloud.conf.j2 b/ansible/nextcloud/nextcloud.conf.j2 new file mode 100644 index 0000000..5b30a27 --- /dev/null +++ b/ansible/nextcloud/nextcloud.conf.j2 @@ -0,0 +1,22 @@ +Listen {{ nextcloud_web_port }} + + + ServerAdmin webmaster@{{ nextcloud_domain }} + ServerName {{ nextcloud_domain }} + ServerAlias rosalind.incus + + DocumentRoot /var/www/nextcloud + + + Require all granted + AllowOverride All + Options FollowSymLinks MultiViews + + + Dav off + + + + ErrorLog ${APACHE_LOG_DIR}/nextcloud_error.log + CustomLog ${APACHE_LOG_DIR}/nextcloud_access.log combined + diff --git a/ansible/oauth2_proxy/deploy.yml b/ansible/oauth2_proxy/deploy.yml new file mode 100644 index 0000000..baf7eb7 --- /dev/null +++ b/ansible/oauth2_proxy/deploy.yml @@ -0,0 +1,75 @@ +--- +# OAuth2-Proxy Deployment for SearXNG Authentication +# Provides OIDC authentication layer using Casdoor as identity provider +# Red Panda Approved + +- name: Deploy OAuth2-Proxy for SearXNG + hosts: ubuntu + become: true + tasks: + - name: Check if host has oauth2_proxy service + ansible.builtin.set_fact: + 
has_oauth2_proxy_service: "{{'oauth2_proxy' in services}}" + + - name: Skip hosts without oauth2_proxy service + ansible.builtin.meta: end_host + when: not has_oauth2_proxy_service + + - name: Create oauth2-proxy group + ansible.builtin.group: + name: "{{ oauth2_proxy_group }}" + gid: "{{ oauth2_proxy_gid }}" + system: true + + - name: Create oauth2-proxy user + ansible.builtin.user: + name: "{{ oauth2_proxy_user }}" + uid: "{{ oauth2_proxy_uid }}" + comment: "OAuth2 Proxy Service" + group: "{{ oauth2_proxy_group }}" + system: true + create_home: false + shell: /usr/sbin/nologin + + - name: Add oauth2-proxy group to ansible user + ansible.builtin.user: + name: "{{ ansible_user }}" + groups: "{{ oauth2_proxy_group }}" + append: true + + - name: Create oauth2-proxy directory + ansible.builtin.file: + path: "{{ oauth2_proxy_directory }}" + owner: "{{ oauth2_proxy_user }}" + group: "{{ oauth2_proxy_group }}" + state: directory + mode: '0750' + + - name: Template configuration files + ansible.builtin.template: + src: "{{ item.src }}" + dest: "{{ oauth2_proxy_directory }}/{{ item.dest }}" + owner: "{{ oauth2_proxy_user }}" + group: "{{ oauth2_proxy_group }}" + mode: "{{ item.mode | default('0640') }}" + loop: + - src: "docker-compose.yml.j2" + dest: "docker-compose.yml" + - src: "oauth2-proxy.cfg.j2" + dest: "oauth2-proxy.cfg" + mode: "0600" + notify: Restart oauth2-proxy + + - name: Reset SSH connection to apply group changes + meta: reset_connection + + - name: Start OAuth2-Proxy service + community.docker.docker_compose_v2: + project_src: "{{ oauth2_proxy_directory }}" + state: present + + handlers: + - name: Restart oauth2-proxy + community.docker.docker_compose_v2: + project_src: "{{ oauth2_proxy_directory }}" + state: restarted diff --git a/ansible/oauth2_proxy/docker-compose.yml.j2 b/ansible/oauth2_proxy/docker-compose.yml.j2 new file mode 100644 index 0000000..38eb484 --- /dev/null +++ b/ansible/oauth2_proxy/docker-compose.yml.j2 @@ -0,0 +1,29 @@ +--- +# 
OAuth2-Proxy Docker Compose Configuration +# Provides OIDC authentication for protected services +# Red Panda Approved + +services: + oauth2-proxy: + image: quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + container_name: oauth2-proxy + user: "{{ oauth2_proxy_uid }}:{{ oauth2_proxy_gid }}" + ports: + - "{{ oauth2_proxy_port }}:4180" + volumes: + - ./oauth2-proxy.cfg:/etc/oauth2-proxy/oauth2-proxy.cfg:ro + command: + - --config=/etc/oauth2-proxy/oauth2-proxy.cfg + restart: unless-stopped + logging: + driver: syslog + options: + syslog-address: "tcp://127.0.0.1:{{ oauth2_proxy_syslog_port }}" + syslog-format: "{{ syslog_format }}" + tag: "oauth2-proxy" + healthcheck: + test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:4180/ping"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 10s diff --git a/ansible/oauth2_proxy/oauth2-proxy.cfg.j2 b/ansible/oauth2_proxy/oauth2-proxy.cfg.j2 new file mode 100644 index 0000000..20b6ada --- /dev/null +++ b/ansible/oauth2_proxy/oauth2-proxy.cfg.j2 @@ -0,0 +1,67 @@ +# OAuth2-Proxy Configuration +# Authenticates users via Casdoor OIDC before proxying to upstream services +# Red Panda Approved + +# Provider Configuration (Casdoor OIDC) +provider = "oidc" +provider_display_name = "Casdoor" +oidc_issuer_url = "{{ oauth2_proxy_oidc_issuer_url }}" +client_id = "{{ oauth2_proxy_client_id }}" +client_secret = "{{ oauth2_proxy_client_secret }}" + +# Redirect URL after authentication +redirect_url = "{{ oauth2_proxy_redirect_url }}" + +# Upstream service +upstreams = [ + "{{ oauth2_proxy_upstream_url }}" +] + +# Session/Cookie Configuration +cookie_secret = "{{ oauth2_proxy_cookie_secret }}" +cookie_name = "{{ oauth2_proxy_cookie_name | default('_oauth2_proxy') }}" +cookie_secure = true +cookie_httponly = true +cookie_expire = "{{ oauth2_proxy_cookie_expire | default('168h') }}" +cookie_refresh = "{{ oauth2_proxy_cookie_refresh | default('1h') }}" +cookie_domains = ".{{ oauth2_proxy_cookie_domain }}" 
+session_store_type = "cookie" + +# Authentication settings +email_domains = {{ oauth2_proxy_email_domains | to_json }} +oidc_email_claim = "email" +oidc_groups_claim = "groups" + +# Allow specific groups (if configured in Casdoor) +{% if oauth2_proxy_allowed_groups is defined and oauth2_proxy_allowed_groups | length > 0 %} +allowed_groups = {{ oauth2_proxy_allowed_groups | to_json }} +{% endif %} + +# Request settings +pass_access_token = false +pass_authorization_header = false +set_authorization_header = false +set_xauthrequest = true + +# Logging +request_logging = true +auth_logging = true +standard_logging = true + +# Network settings +http_address = "0.0.0.0:4180" +reverse_proxy = true +real_client_ip_header = "X-Forwarded-For" + +# Skip authentication for health check endpoints +skip_auth_routes = [ + "^/healthz$", + "^/ping$" +] + +# OIDC specific settings +skip_provider_button = true +oidc_extra_audiences = [] + +# SSL verification +ssl_insecure_skip_verify = {{ oauth2_proxy_skip_ssl_verify | default(false) | lower }} diff --git a/ansible/oauth2_proxy/stage.yml b/ansible/oauth2_proxy/stage.yml new file mode 100644 index 0000000..562fdff --- /dev/null +++ b/ansible/oauth2_proxy/stage.yml @@ -0,0 +1,93 @@ +--- +# OAuth2-Proxy Staging Playbook +# Use this to validate configuration before deployment +# Red Panda Approved + +- name: Stage OAuth2-Proxy Configuration (Dry Run) + hosts: ubuntu + become: true + tasks: + - name: Check if host has oauth2_proxy service + ansible.builtin.set_fact: + has_oauth2_proxy_service: "{{'oauth2_proxy' in services}}" + + - name: Skip hosts without oauth2_proxy service + ansible.builtin.meta: end_host + when: not has_oauth2_proxy_service + + - name: Validate required OAuth2-Proxy variables are defined + ansible.builtin.assert: + that: + - oauth2_proxy_client_id is defined + - oauth2_proxy_client_secret is defined + - oauth2_proxy_cookie_secret is defined + fail_msg: | + Missing required OAuth2-Proxy variables. 
Ensure service-specific vault variables + are mapped to oauth2_proxy_client_id, oauth2_proxy_client_secret, and + oauth2_proxy_cookie_secret in the host_vars file. + + Generate cookie secret with: python3 -c 'import secrets; print(secrets.token_urlsafe(32))' + + - name: Validate OIDC issuer URL is accessible + ansible.builtin.uri: + url: "{{ oauth2_proxy_oidc_issuer_url }}/.well-known/openid-configuration" + method: GET + return_content: true + status_code: 200 + register: oidc_discovery + failed_when: false + + - name: Report OIDC discovery status + ansible.builtin.debug: + msg: "{{ 'OIDC Discovery: OK' if oidc_discovery.status == 200 else 'OIDC Discovery: FAILED - Casdoor may not be running or accessible' }}" + + - name: Validate upstream URL is accessible + ansible.builtin.uri: + url: "{{ oauth2_proxy_upstream_url }}" + method: GET + return_content: false + status_code: [200, 301, 302] + register: upstream_check + failed_when: false + + - name: Report upstream status + ansible.builtin.debug: + msg: "{{ 'Upstream SearXNG: OK' if upstream_check.status in [200, 301, 302] else 'Upstream SearXNG: FAILED - SearXNG may not be running' }}" + + - name: Generate configuration preview + ansible.builtin.template: + src: "oauth2-proxy.cfg.j2" + dest: "/tmp/oauth2-proxy.cfg.preview" + mode: "0600" + + - name: Display configuration preview + ansible.builtin.command: cat /tmp/oauth2-proxy.cfg.preview + register: config_preview + changed_when: false + + - name: Show configuration + ansible.builtin.debug: + msg: "{{ config_preview.stdout_lines }}" + + - name: Clean up preview file + ansible.builtin.file: + path: /tmp/oauth2-proxy.cfg.preview + state: absent + + - name: Configuration Summary + ansible.builtin.debug: + msg: | + OAuth2-Proxy Staging Summary + ============================ + Host: {{ inventory_hostname }} + Port: {{ oauth2_proxy_port }} + OIDC Issuer: {{ oauth2_proxy_oidc_issuer_url }} + Redirect URL: {{ oauth2_proxy_redirect_url }} + Upstream: {{ 
oauth2_proxy_upstream_url }} + Cookie Domain: {{ oauth2_proxy_cookie_domain }} + Email Domains: {{ oauth2_proxy_email_domains | join(', ') }} + + OIDC Discovery: {{ 'OK' if oidc_discovery.status == 200 else 'FAILED' }} + Upstream Check: {{ 'OK' if upstream_check.status in [200, 301, 302] else 'FAILED' }} + + To deploy: ansible-playbook oauth2_proxy/deploy.yml diff --git a/ansible/openwebui/deploy.yml b/ansible/openwebui/deploy.yml new file mode 100644 index 0000000..bbda481 --- /dev/null +++ b/ansible/openwebui/deploy.yml @@ -0,0 +1,127 @@ +--- +- name: Install OpenWebUI configured with PostgreSQL + hosts: ubuntu + vars: + ansible_common_remote_group: "{{ openwebui_group | default(omit) }}" + allow_world_readable_tmpfiles: true + tasks: + - name: Check if host has openwebui service + ansible.builtin.set_fact: + has_openwebui_service: "{{ 'openwebui' in services | default([]) }}" + + - name: Skip hosts without openwebui service + ansible.builtin.meta: end_host + when: not has_openwebui_service + + - name: Create OpenWebUI User + become: true + ansible.builtin.user: + name: "{{openwebui_user}}" + comment: "{{openwebui_user}}" + system: true + + - name: Add "remote_user" user to OpenWebUI group + become: true + ansible.builtin.user: + name: "{{remote_user}}" + groups: "{{openwebui_group}}" + append: true + + - name: Create OpenWebUI directory + become: true + ansible.builtin.file: + path: "{{openwebui_directory}}" + owner: "{{openwebui_user}}" + group: "{{openwebui_group}}" + state: directory + mode: '0750' + + - name: Install required packages + become: true + ansible.builtin.apt: + name: [postgresql-client, ffmpeg] + state: present + update_cache: true + + - name: Install Python 3.12 and venv + become: true + ansible.builtin.apt: + name: [python3.12, python3.12-venv, python3.12-dev] + state: latest + update_cache: true + + - name: Create virtual environment + become: true + become_user: "{{openwebui_user}}" + ansible.builtin.command: python3.12 -m venv 
{{openwebui_directory}}/.venv + args: + creates: "{{openwebui_directory}}/.venv/bin/activate" + + - name: Install wheel and openwebui in virtual environment + become: true + become_user: "{{openwebui_user}}" + ansible.builtin.pip: + name: + - wheel + - open-webui[all]=={{openwebui_rel}} + - psycopg2-binary + state: latest + virtualenv: "{{openwebui_directory}}/.venv" + virtualenv_python: python3.12 + vars: + ansible_common_remote_group: "{{openwebui_group}}" + allow_world_readable_tmpfiles: true + notify: Restart OpenWebUI + + - name: Create environment file for OpenWebUI + become: true + ansible.builtin.template: + src: openwebui.env.j2 + dest: "{{openwebui_directory}}/.env" + owner: "{{openwebui_user}}" + group: "{{openwebui_group}}" + mode: '0600' + notify: Restart OpenWebUI + + - name: Create systemd service file + become: true + ansible.builtin.template: + src: openwebui.service.j2 + dest: /etc/systemd/system/openwebui.service + mode: '0644' + notify: Restart OpenWebUI + + - name: Enable openwebui service + become: true + ansible.builtin.systemd: + name: openwebui + daemon_reload: true + enabled: true + + handlers: + - name: Restart OpenWebUI + become: true + ansible.builtin.systemd: + name: openwebui + daemon_reload: true + state: restarted + + post_tasks: + - name: Wait for OpenWebUI to initialize database schema + ansible.builtin.pause: + seconds: 20 + prompt: "Waiting for OpenWebUI to initialize the database schema..." 
+ + - name: Check if OpenWebUI is running + ansible.builtin.uri: + url: http://localhost:{{openwebui_port}}/ + method: GET + status_code: 200 + timeout: 5 + register: openwebui_status + ignore_errors: true + + - name: Show OpenWebUI status + ansible.builtin.debug: + msg: "OpenWebUI is {{ 'running' if openwebui_status.status == 200 else 'not running properly' }}" + diff --git a/ansible/openwebui/openwebui.env.j2 b/ansible/openwebui/openwebui.env.j2 new file mode 100644 index 0000000..92785c3 --- /dev/null +++ b/ansible/openwebui/openwebui.env.j2 @@ -0,0 +1,42 @@ +# OpenWebUI Environment Configuration + +# Server settings +PORT={{openwebui_port}} +HOST={{openwebui_host}} +WEBUI_SECRET_KEY={{openwebui_secret_key}} +CORS_ALLOW_ORIGIN={{openwebui_cors_allow_origin}} + +# Database configuration +DATABASE_URL=postgresql://{{openwebui_db_user}}:{{openwebui_db_password}}@{{openwebui_db_host}}:{{openwebui_db_port}}/{{openwebui_db_name}} +DATABASE_TYPE=postgres +VECTOR_DB=pgvector +PGVECTOR_CREATE_EXTENSION=false + +# Authentication settings +ENABLE_SIGNUP={{openwebui_enable_signup}} +ENABLE_EMAIL_LOGIN={{openwebui_enable_email_login}} + +# OAuth/OIDC Configuration (Casdoor SSO) +ENABLE_OAUTH_SIGNUP=true +OAUTH_CLIENT_ID={{openwebui_oauth_client_id}} +OAUTH_CLIENT_SECRET={{openwebui_oauth_client_secret}} +OAUTH_PROVIDER_NAME={{openwebui_oauth_provider_name}} +OPENID_PROVIDER_URL={{openwebui_oauth_provider_url}} + +# API settings +OPENAI_API_KEY={{openwebui_openai_api_key}} +ANTHROPIC_API_KEY={{openwebui_anthropic_api_key}} +MISTRAL_API_KEY={{openwebui_mistral_api_key}} +GROQ_API_KEY={{openwebui_groq_api_key}} + +# Ollama LLM settings +OLLAMA_API_BASE_URL={{ollama_api_base_url}} +OLLAMA_API_KEY={{openwebui_ollama_api_key}} + +# Security settings +ENABLE_HTTPS={{openwebui_enable_https}} +SSL_CERT_PATH={{openwebui_ssl_cert_path}} +SSL_KEY_PATH={{openwebui_ssl_key_path}} + +# Logging +LOG_LEVEL={{openwebui_log_level}} \ No newline at end of file diff --git 
a/ansible/openwebui/openwebui.service.j2 b/ansible/openwebui/openwebui.service.j2 new file mode 100644 index 0000000..6a38b07 --- /dev/null +++ b/ansible/openwebui/openwebui.service.j2 @@ -0,0 +1,19 @@ +[Unit] +Description=Open WebUI +After=network.target + +[Service] +Type=simple +User={{openwebui_user}} +Group={{openwebui_group}} +WorkingDirectory={{openwebui_directory}} +EnvironmentFile={{openwebui_directory}}/.env +ExecStart={{openwebui_directory}}/.venv/bin/open-webui serve --port {{openwebui_port}} +Restart=always +RestartSec=3 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=openwebui + +[Install] +WantedBy=multi-user.target diff --git a/ansible/pgadmin/deploy.yml b/ansible/pgadmin/deploy.yml new file mode 100644 index 0000000..bb9b25d --- /dev/null +++ b/ansible/pgadmin/deploy.yml @@ -0,0 +1,61 @@ +--- +- name: Deploy PgAdmin + hosts: ubuntu + tasks: + - name: Check if host has pgadmin service + ansible.builtin.set_fact: + has_pgadmin_service: "{{ 'pgadmin' in services | default([]) }}" + + - name: Skip hosts without pgadmin service + ansible.builtin.meta: end_host + when: not has_pgadmin_service + + - name: Add PgAdmin repository + become: true + ansible.builtin.deb822_repository: + name: pgadmin4 + types: [deb] + uris: https://ftp.postgresql.org/pub/pgadmin/pgadmin4/apt/{{ansible_distribution_release}} + suites: [pgadmin4] + components: [main] + signed_by: https://www.pgadmin.org/static/packages_pgadmin_org.pub + state: present + + - name: Install PgAdmin + become: true + ansible.builtin.apt: + name: pgadmin4-web + state: present + update_cache: true + + # ------------------------------------------------------------------------- + # SSL Certificate Distribution for External PostgreSQL Connections + # ------------------------------------------------------------------------- + + - name: Create PGadmin certs directory + become: true + ansible.builtin.file: + path: /var/lib/pgadmin/certs + state: directory + owner: www-data + group: www-data + mode: '0750' + 
+ - name: Fetch Titania PostgreSQL SSL cert + become: true + ansible.builtin.fetch: + src: /etc/postgresql/17/main/ssl/server.crt + dest: /tmp/titania-postgres-ca.crt + flat: yes + delegate_to: titania.incus + when: "'titania.incus' in groups['ubuntu']" + + - name: Copy Titania PostgreSQL SSL cert to PGadmin + become: true + ansible.builtin.copy: + src: /tmp/titania-postgres-ca.crt + dest: /var/lib/pgadmin/certs/titania-postgres-ca.crt + owner: www-data + group: www-data + mode: '0644' + when: "'titania.incus' in groups['ubuntu']" diff --git a/ansible/postgresql/deploy.yml b/ansible/postgresql/deploy.yml new file mode 100644 index 0000000..7b5f47d --- /dev/null +++ b/ansible/postgresql/deploy.yml @@ -0,0 +1,244 @@ +--- +- name: Deploy PostgreSQL + hosts: ubuntu + become: true + tasks: + - name: Check if host has postgresql service + ansible.builtin.set_fact: + has_postgresql_service: "{{ 'postgresql' in services | default([]) }}" + + - name: Skip hosts without postgresql service + ansible.builtin.meta: end_host + when: not has_postgresql_service + + - name: Install build dependencies + become: true + ansible.builtin.apt: + name: [curl, git, build-essential, vim, python3-psycopg2] + state: present + update_cache: true + + - name: Install PostgreSQL Common + become: true + ansible.builtin.apt: + name: postgresql-common + state: present + + - name: Install the public key for the PostgreSQL repository + become: true + ansible.builtin.shell: /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y + register: pg_repo_output + + - name: Install PostgreSQL packages + become: true + ansible.builtin.apt: + name: + - postgresql-client-17 + - postgresql-doc-17 + - postgresql-17 + - libpq-dev + - postgresql-server-dev-17 + state: present + update_cache: true + + - name: Show PostgreSQL repository output + ansible.builtin.debug: + var: pg_repo_output + + - name: Create gh directory + become: true + ansible.builtin.file: + path: /home/{{ remote_user }}/gh + state: directory + 
owner: "{{ remote_user }}" + group: "{{ remote_user }}" + mode: '755' + + - name: Clone pgvector repository + become: true + become_user: "{{ remote_user }}" + ansible.builtin.git: + repo: https://github.com/pgvector/pgvector.git + dest: /home/{{ remote_user }}/gh/pgvector + version: v0.8.0 + force: true + + - name: Build pgvector + become: true + become_user: "{{ remote_user }}" + ansible.builtin.make: + chdir: /home/{{ remote_user }}/gh/pgvector + + - name: Install pgvector + become: true + ansible.builtin.make: + chdir: /home/{{ remote_user }}/gh/pgvector + target: install + + - name: Ensure PostgreSQL is running + ansible.builtin.systemd: + name: postgresql + state: started + enabled: true + + - name: Restart PostgreSQL after pgvector installation + ansible.builtin.systemd: + name: postgresql + state: restarted + + - name: Check actual PostgreSQL version + ansible.builtin.shell: | + sudo -u postgres psql -c "SELECT version();" + become: true + register: pg_version_check + + - name: Display PostgreSQL version + ansible.builtin.debug: + var: pg_version_check.stdout + + - name: Check PostgreSQL config directory + ansible.builtin.shell: | + sudo -u postgres psql -c "SHOW config_file;" + become: true + register: pg_config_check + + - name: Display PostgreSQL config location + ansible.builtin.debug: + var: pg_config_check.stdout + + - name: Check available pg_config versions + ansible.builtin.shell: | + ls -la /usr/bin/pg_config* + which pg_config + pg_config --version + become: true + register: pg_config_versions + + - name: Display pg_config information + ansible.builtin.debug: + var: pg_config_versions.stdout + + - name: Build pgvector with correct pg_config + become: true + ansible.builtin.shell: | + cd /home/{{ remote_user }}/gh/pgvector + make clean + # Use the specific pg_config for the installed version + PG_CONFIG_PATH=$(ls /usr/bin/pg_config-* | head -1) + if [ -z "$PG_CONFIG_PATH" ]; then + PG_CONFIG_PATH=$(which pg_config) + fi + echo "Using pg_config: 
$PG_CONFIG_PATH" + $PG_CONFIG_PATH --version + make PG_CONFIG=$PG_CONFIG_PATH + make install PG_CONFIG=$PG_CONFIG_PATH + register: pgvector_build_output + + - name: Display pgvector build output + ansible.builtin.debug: + var: pgvector_build_output.stdout + + - name: Restart PostgreSQL after proper pgvector installation + ansible.builtin.systemd: + name: postgresql + state: restarted + + - name: Verify pgvector extension is available + ansible.builtin.shell: | + sudo -u postgres psql -c "SELECT * FROM pg_available_extensions WHERE name = 'vector';" + become: true + register: pgvector_check + + - name: Display pgvector availability check + ansible.builtin.debug: + var: pgvector_check.stdout + + - name: Set PostgreSQL data directory permissions + ansible.builtin.file: + path: "{{ postgresql_data_dir }}" + owner: "{{ postgres_user }}" + group: "{{ postgres_group }}" + mode: '700' + recurse: true + + - name: Configure PostgreSQL to listen on all addresses + ansible.builtin.lineinfile: + path: /etc/postgresql/17/main/postgresql.conf + regexp: "^#?listen_addresses" + line: "listen_addresses = '*'" + backup: true + notify: restart postgresql + + - name: Configure PostgreSQL client authentication + ansible.builtin.lineinfile: + path: /etc/postgresql/17/main/pg_hba.conf + line: "host all all 0.0.0.0/0 md5" + backup: true + notify: restart postgresql + + - name: Set postgres user password + ansible.builtin.shell: | + sudo -u postgres psql -c "ALTER USER postgres PASSWORD '{{ postgres_password }}'" + become: true + + - name: Create application database users + community.postgresql.postgresql_user: + name: "{{ item.user }}" + password: "{{ item.password }}" + state: present + login_user: postgres + login_password: "{{ postgres_password }}" + login_host: localhost + loop: + - { user: "{{ arke_db_user }}", password: "{{ arke_db_password }}" } + - { user: "{{ openwebui_db_user }}", password: "{{ openwebui_db_password }}" } + - { user: "{{ spelunker_db_user }}", password: "{{ 
spelunker_db_password }}" } + # Note: Casdoor uses dedicated PostgreSQL on Titania + - { user: "{{ gitea_db_user }}", password: "{{ gitea_db_password }}" } + - { user: "{{ lobechat_db_user }}", password: "{{ lobechat_db_password }}" } + - { user: "{{ nextcloud_db_user }}", password: "{{ nextcloud_db_password }}" } + - { user: "{{ anythingllm_db_user }}", password: "{{ anythingllm_db_password }}" } + - { user: "{{ hass_db_user }}", password: "{{ hass_db_password }}" } + - { user: "{{ nike_db_user }}", password: "{{ nike_db_password }}" } + no_log: true + + - name: Create application databases with owners + community.postgresql.postgresql_db: + name: "{{ item.name }}" + owner: "{{ item.owner }}" + state: present + login_user: postgres + login_password: "{{ postgres_password }}" + login_host: localhost + loop: + - { name: "{{ arke_db_name }}", owner: "{{ arke_db_user }}" } + - { name: "{{ openwebui_db_name }}", owner: "{{ openwebui_db_user }}" } + - { name: "{{ spelunker_db_name }}", owner: "{{ spelunker_db_user }}" } + # Note: Casdoor uses dedicated PostgreSQL on Titania + - { name: "{{ gitea_db_name }}", owner: "{{ gitea_db_user }}" } + - { name: "{{ lobechat_db_name }}", owner: "{{ lobechat_db_user }}" } + - { name: "{{ nextcloud_db_name }}", owner: "{{ nextcloud_db_user }}" } + - { name: "{{ anythingllm_db_name }}", owner: "{{ anythingllm_db_user }}" } + - { name: "{{ hass_db_name }}", owner: "{{ hass_db_user }}" } + - { name: "{{ nike_db_name }}", owner: "{{ nike_db_user }}" } + + - name: Enable pgvector extension in databases + community.postgresql.postgresql_ext: + name: vector + login_db: "{{ item }}" + state: present + login_user: postgres + login_password: "{{ postgres_password }}" + login_host: localhost + loop: + - "{{ arke_db_name }}" + - "{{ lobechat_db_name }}" + - "{{ openwebui_db_name }}" + - "{{ spelunker_db_name }}" + - "{{ anythingllm_db_name }}" + + handlers: + - name: restart postgresql + ansible.builtin.systemd: + name: postgresql + state: 
restarted diff --git a/ansible/postgresql_ssl/deploy.yml b/ansible/postgresql_ssl/deploy.yml new file mode 100644 index 0000000..2ba9711 --- /dev/null +++ b/ansible/postgresql_ssl/deploy.yml @@ -0,0 +1,218 @@ +--- +# ----------------------------------------------------------------------------- +# PostgreSQL with SSL Deployment Playbook +# ----------------------------------------------------------------------------- +# Deploys PostgreSQL 17 with SSL enabled for secure external connections +# Used for security-critical services like Casdoor identity provider +# +# Features: +# - Native PostgreSQL 17 installation (no Docker) +# - Self-signed SSL certificates for external access +# - Local connections without SSL for same-host services +# - Single-purpose database (not shared with other applications) +# ----------------------------------------------------------------------------- + +- name: Deploy PostgreSQL with SSL + hosts: ubuntu + become: true + tasks: + - name: Check if host has postgresql_ssl service + ansible.builtin.set_fact: + has_postgresql_ssl_service: "{{ 'postgresql_ssl' in services | default([]) }}" + + - name: Skip hosts without postgresql_ssl service + ansible.builtin.meta: end_host + when: not has_postgresql_ssl_service + + # ------------------------------------------------------------------------- + # Install PostgreSQL + # ------------------------------------------------------------------------- + + - name: Install build dependencies + ansible.builtin.apt: + name: + - curl + - python3-psycopg2 + - python3-cryptography + state: present + update_cache: true + + - name: Install PostgreSQL Common + ansible.builtin.apt: + name: postgresql-common + state: present + + - name: Install the public key for the PostgreSQL repository + ansible.builtin.shell: /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y + args: + creates: /etc/apt/sources.list.d/pgdg.sources + register: pg_repo_output + + - name: Install PostgreSQL packages + ansible.builtin.apt: + 
name: + - postgresql-client-17 + - postgresql-17 + state: present + update_cache: true + + - name: Ensure PostgreSQL is running + ansible.builtin.systemd: + name: postgresql + state: started + enabled: true + + # ------------------------------------------------------------------------- + # Generate SSL Certificates + # ------------------------------------------------------------------------- + + - name: Create PostgreSQL SSL directory + ansible.builtin.file: + path: /etc/postgresql/17/main/ssl + state: directory + owner: postgres + group: postgres + mode: '0700' + + - name: Generate PostgreSQL SSL private key + community.crypto.openssl_privatekey: + path: /etc/postgresql/17/main/ssl/server.key + size: 4096 + owner: postgres + group: postgres + mode: '0600' + + - name: Generate PostgreSQL SSL certificate signing request + community.crypto.openssl_csr: + path: /etc/postgresql/17/main/ssl/server.csr + privatekey_path: /etc/postgresql/17/main/ssl/server.key + common_name: "{{ inventory_hostname }}" + subject_alt_name: + - "DNS:{{ inventory_hostname }}" + - "DNS:localhost" + - "IP:127.0.0.1" + owner: postgres + group: postgres + mode: '0600' + + - name: Generate self-signed PostgreSQL SSL certificate + community.crypto.x509_certificate: + path: /etc/postgresql/17/main/ssl/server.crt + privatekey_path: /etc/postgresql/17/main/ssl/server.key + csr_path: /etc/postgresql/17/main/ssl/server.csr + provider: selfsigned + selfsigned_not_after: "+3650d" # 10 years + owner: postgres + group: postgres + mode: '0644' + + # ------------------------------------------------------------------------- + # Configure PostgreSQL + # ------------------------------------------------------------------------- + + - name: Configure PostgreSQL to listen on all addresses + ansible.builtin.lineinfile: + path: /etc/postgresql/17/main/postgresql.conf + regexp: "^#?listen_addresses" + line: "listen_addresses = '*'" + backup: true + notify: restart postgresql + + - name: Enable SSL in PostgreSQL + 
ansible.builtin.lineinfile: + path: /etc/postgresql/17/main/postgresql.conf + regexp: "^#?ssl =" + line: "ssl = on" + backup: true + notify: restart postgresql + + - name: Configure SSL certificate file + ansible.builtin.lineinfile: + path: /etc/postgresql/17/main/postgresql.conf + regexp: "^#?ssl_cert_file" + line: "ssl_cert_file = '/etc/postgresql/17/main/ssl/server.crt'" + backup: true + notify: restart postgresql + + - name: Configure SSL key file + ansible.builtin.lineinfile: + path: /etc/postgresql/17/main/postgresql.conf + regexp: "^#?ssl_key_file" + line: "ssl_key_file = '/etc/postgresql/17/main/ssl/server.key'" + backup: true + notify: restart postgresql + + # ------------------------------------------------------------------------- + # Configure Client Authentication (pg_hba.conf) + # ------------------------------------------------------------------------- + + - name: Configure local connections (no SSL, Unix socket) + ansible.builtin.lineinfile: + path: /etc/postgresql/17/main/pg_hba.conf + regexp: "^local\\s+all\\s+all\\s+peer" + line: "local all all peer" + backup: true + notify: restart postgresql + + - name: Configure localhost connections (no SSL required) + ansible.builtin.lineinfile: + path: /etc/postgresql/17/main/pg_hba.conf + line: "host all all 127.0.0.1/32 md5" + insertafter: "^local" + backup: true + notify: restart postgresql + + - name: Configure Incus private network connections (no SSL required) + ansible.builtin.lineinfile: + path: /etc/postgresql/17/main/pg_hba.conf + line: "host all all 10.10.0.0/16 md5" + insertafter: "^host.*127.0.0.1" + backup: true + notify: restart postgresql + + - name: Configure external connections (SSL required) + ansible.builtin.lineinfile: + path: /etc/postgresql/17/main/pg_hba.conf + line: "hostssl all all 0.0.0.0/0 md5" + insertafter: "^host.*10.10.0.0" + backup: true + notify: restart postgresql + + # ------------------------------------------------------------------------- + # Set Admin Password + # 
------------------------------------------------------------------------- + + - name: Set postgres user password + ansible.builtin.shell: | + sudo -u postgres psql -c "ALTER USER postgres PASSWORD '{{ postgresql_ssl_postgres_password }}'" + changed_when: false + no_log: true + + # ------------------------------------------------------------------------- + # Create Application Database and User + # ------------------------------------------------------------------------- + + - name: Create Casdoor database user + community.postgresql.postgresql_user: + name: "{{ casdoor_db_user }}" + password: "{{ casdoor_db_password }}" + state: present + login_user: postgres + login_password: "{{ postgresql_ssl_postgres_password }}" + login_host: localhost + no_log: true + + - name: Create Casdoor database + community.postgresql.postgresql_db: + name: "{{ casdoor_db_name }}" + owner: "{{ casdoor_db_user }}" + state: present + login_user: postgres + login_password: "{{ postgresql_ssl_postgres_password }}" + login_host: localhost + + handlers: + - name: restart postgresql + ansible.builtin.systemd: + name: postgresql + state: restarted \ No newline at end of file diff --git a/ansible/pplg/alert_rules.yml.j2 b/ansible/pplg/alert_rules.yml.j2 new file mode 100644 index 0000000..5efc81a --- /dev/null +++ b/ansible/pplg/alert_rules.yml.j2 @@ -0,0 +1,249 @@ +# Prometheus Alert Rules +# Red Panda Approved 🐼 +# Deployed to: /etc/prometheus/alert_rules.yml +{% raw %} +groups: + # ============================================================================ + # Node/Infrastructure Alerts + # ============================================================================ + - name: node_alerts + rules: + - alert: InstanceDown + expr: up == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Instance {{ $labels.instance }} is down" + description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes." 
+ + - alert: HighCPUUsage + expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "High CPU usage on {{ $labels.instance }}" + description: "CPU usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)" + + - alert: CriticalCPUUsage + expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95 + for: 2m + labels: + severity: critical + annotations: + summary: "Critical CPU usage on {{ $labels.instance }}" + description: "CPU usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)" + + - alert: HighMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage on {{ $labels.instance }}" + description: "Memory usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)" + + - alert: CriticalMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95 + for: 2m + labels: + severity: critical + annotations: + summary: "Critical memory usage on {{ $labels.instance }}" + description: "Memory usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)" + + - alert: DiskSpaceLow + expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 20 + for: 5m + labels: + severity: warning + annotations: + summary: "Low disk space on {{ $labels.instance }}" + description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 20% free space (current value: {{ $value | printf \"%.1f\" }}%)" + + - alert: DiskSpaceCritical + expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 10 + for: 2m + labels: + severity: 
critical + annotations: + summary: "Critical disk space on {{ $labels.instance }}" + description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 10% free space (current value: {{ $value | printf \"%.1f\" }}%)" + + - alert: HighLoadAverage + expr: node_load15 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2 + for: 10m + labels: + severity: warning + annotations: + summary: "High load average on {{ $labels.instance }}" + description: "15-minute load average is {{ $value | printf \"%.2f\" }} times the CPU count on {{ $labels.instance }}" + + # ============================================================================ + # Process-Level Alerts (puck.incus) + # ============================================================================ + - name: puck_process_alerts + rules: + - alert: PuckHighCPUProcess + expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[2m])) * 100 > 80 + for: 2m + labels: + severity: warning + annotations: + summary: "High CPU process on puck: {{ $labels.groupname }}" + description: "Process {{ $labels.groupname }} is using {{ $value | printf \"%.1f\" }}% CPU for more than 2 minutes" + + - alert: PuckCriticalCPUProcess + expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[1m])) * 100 > 95 + for: 1m + labels: + severity: critical + annotations: + summary: "Critical CPU process on puck: {{ $labels.groupname }}" + description: "Process {{ $labels.groupname }} is using {{ $value | printf \"%.1f\" }}% CPU - immediate attention required" + + - alert: PuckHighMemoryProcess + expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 1073741824 + for: 2m + labels: + severity: warning + annotations: + summary: "High memory process on puck: {{ $labels.groupname }}" + description: "Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory" + + - alert: 
PuckCriticalMemoryProcess + expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 2147483648 + for: 1m + labels: + severity: critical + annotations: + summary: "Critical memory process on puck: {{ $labels.groupname }}" + description: "Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory - immediate attention required" + + - alert: PuckProcessCrashLoop + expr: increase(namedprocess_namegroup_num_procs{instance=~"puck.*"}[5m]) < -1 + for: 1m + labels: + severity: warning + annotations: + summary: "Process count dropped on puck: {{ $labels.groupname }}" + description: "Process {{ $labels.groupname }} count has decreased, indicating possible crash or restart" + + # ============================================================================ + # Docker Container Alerts (puck.incus) + # ============================================================================ + - name: puck_container_alerts + rules: + - alert: PuckHighContainerCount + expr: count(container_last_seen{instance=~"puck.*", name!=""}) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "High container count on puck" + description: "puck.incus has {{ $value }} running containers, which exceeds the threshold of 5" + + - alert: PuckDuplicateContainers + expr: count by (image, instance) (container_last_seen{instance=~"puck.*", name!=""}) > 2 + for: 5m + labels: + severity: warning + annotations: + summary: "Duplicate containers on puck: {{ $labels.image }}" + description: "{{ $value }} containers running the same image {{ $labels.image }} on puck" + + - alert: PuckOrphanedContainer + expr: (time() - container_start_time_seconds{instance=~"puck.*", name=~".*_.*"}) > 3600 + for: 10m + labels: + severity: warning + annotations: + summary: "Possible orphaned container on puck: {{ $labels.name }}" + description: "Container {{ $labels.name }} with auto-generated name has been running for {{ $value | humanizeDuration }}" + + - alert: 
PuckMCPContainerOnPuck + expr: container_last_seen{instance=~"puck.*", image=~".*mcp-server.*|.*mcp_server.*"} + for: 1m + labels: + severity: critical + annotations: + summary: "MCP container detected on puck (WRONG HOST)" + description: "Container {{ $labels.name }} ({{ $labels.image }}) is running on puck but MCP servers should run on miranda.incus" + + - alert: PuckContainerHighCPU + expr: sum by (name, instance) (rate(container_cpu_usage_seconds_total{instance=~"puck.*", name!=""}[2m])) * 100 > 80 + for: 2m + labels: + severity: warning + annotations: + summary: "High CPU container on puck: {{ $labels.name }}" + description: "Container {{ $labels.name }} is using {{ $value | printf \"%.1f\" }}% CPU" + + - alert: PuckContainerHighMemory + expr: container_memory_usage_bytes{instance=~"puck.*", name!=""} > 1073741824 + for: 2m + labels: + severity: warning + annotations: + summary: "High memory container on puck: {{ $labels.name }}" + description: "Container {{ $labels.name }} is using {{ $value | humanize }} memory" + + - alert: PuckContainerOOMKilled + expr: increase(container_oom_events_total{instance=~"puck.*", name!=""}[5m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: "Container OOM killed on puck: {{ $labels.name }}" + description: "Container {{ $labels.name }} was killed by OOM killer" + + # ============================================================================ + # Service/Application Alerts + # ============================================================================ + - name: service_alerts + rules: + - alert: PrometheusTargetMissing + expr: up == 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus target missing: {{ $labels.instance }}" + description: "A Prometheus target has been down for more than 5 minutes." 
+ + - alert: PrometheusJobMissing + expr: absent(up{job="node-exporter"}) + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus job missing" + description: "A Prometheus job has disappeared from target discovery." + + - alert: AlertmanagerDown + expr: absent(up{job="alertmanager"}) + for: 5m + labels: + severity: critical + annotations: + summary: "Alertmanager is down" + description: "Alertmanager is not responding. Alerts may not be delivered." + + # ============================================================================ + # Loki/Logging Alerts + # ============================================================================ + - name: loki_alerts + rules: + - alert: LokiHighLogVolume + expr: sum(rate(loki_distributor_bytes_received_total[5m])) > 10485760 + for: 10m + labels: + severity: warning + annotations: + summary: "High log ingestion rate" + description: "Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging" + +# Red Panda Seal of Approval 🐼 +# "If the metrics aren't red, go back to bed" +{% endraw %} diff --git a/ansible/pplg/alertmanager.yml.j2 b/ansible/pplg/alertmanager.yml.j2 new file mode 100644 index 0000000..0f29df0 --- /dev/null +++ b/ansible/pplg/alertmanager.yml.j2 @@ -0,0 +1,148 @@ +global: + resolve_timeout: 5m + smtp_smarthost: '{{ smtp_host }}:{{ smtp_port }}' + smtp_from: '{{ smtp_from }}' + smtp_require_tls: false + +route: + group_by: ['alertname', 'instance', 'severity'] + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + receiver: 'email' + routes: + - match: + severity: critical + receiver: 'email-critical' + continue: true + - match: + severity: warning + receiver: 'email-warning' + continue: true + - match: + severity: info + receiver: 'email-info' + repeat_interval: 24h + +inhibit_rules: + - source_match: + severity: 'critical' + target_match: + severity: 'warning' + equal: ['alertname', 'instance'] + +receivers: +- name: 'email-critical' + email_configs: + - 
to: 'hostmaster+critical@ouranos.helu.ca' + send_resolved: true + html: true + headers: + Subject: '🚨 [CRITICAL] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}' + text: |- + {{ "{{" }} range .Alerts {{ "}}" }} + {{ "{{" }} .Annotations.description {{ "}}" }} + + Instance: {{ "{{" }} .Labels.instance {{ "}}" }} + {{ "{{" }} end {{ "}}" }} + +- name: 'email-warning' + email_configs: + - to: 'hostmaster+warning@ouranos.helu.ca' + send_resolved: true + html: true + headers: + Subject: '⚠️ [WARNING] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}' + text: |- + {{ "{{" }} range .Alerts {{ "}}" }} + {{ "{{" }} .Annotations.description {{ "}}" }} + + Instance: {{ "{{" }} .Labels.instance {{ "}}" }} + {{ "{{" }} end {{ "}}" }} + +- name: 'email-info' + email_configs: + - to: 'hostmaster+info@ouranos.helu.ca' + send_resolved: false + html: true + headers: + Subject: '{{ "{{" }} .GroupLabels.alertname {{ "}}" }}' + text: '{{ "{{" }} range .Alerts {{ "}}" }}{{ "{{" }} .Annotations.description {{ "}}" }}{{ "{{" }} end {{ "}}" }}' + +- name: 'email' + email_configs: + - to: 'hostmaster+alerts@ouranos.helu.ca' + send_resolved: true + html: true + headers: + Subject: '[{{ "{{" }} .GroupLabels.severity | default "ALERT" {{ "}}" }}] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}' + text: |- + {{ "{{" }} range .Alerts {{ "}}" }} + {{ "{{" }} .Annotations.description {{ "}}" }} + + Instance: {{ "{{" }} .Labels.instance {{ "}}" }} + Severity: {{ "{{" }} .Labels.severity {{ "}}" }} + {{ "{{" }} end {{ "}}" }} + +# --- Pushover receivers (disabled for smtp4dev testing) --- +# To re-enable: uncomment these receivers and update the route receiver names +# from email-*/email back to pushover-*/pushover +# +# - name: 'pushover-critical' +# pushover_configs: +# - user_key: '{{ pushover_user_key }}' +# token: '{{ pushover_api_token }}' +# send_resolved: true +# html: true +# priority: '2' +# retry: 30 +# expire: 3600 +# title: '🚨 [CRITICAL] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}' +# message: |- 
+# {{ "{{" }} range .Alerts {{ "}}" }} +# {{ "{{" }} .Annotations.description {{ "}}" }} +# Instance: {{ "{{" }} .Labels.instance {{ "}}" }} +# {{ "{{" }} end {{ "}}" }} +# +# - name: 'pushover-warning' +# pushover_configs: +# - user_key: '{{ pushover_user_key }}' +# token: '{{ pushover_api_token }}' +# send_resolved: true +# html: true +# priority: '1' +# retry: 30 +# expire: 3600 +# title: '⚠️ [WARNING] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}' +# message: |- +# {{ "{{" }} range .Alerts {{ "}}" }} +# {{ "{{" }} .Annotations.description {{ "}}" }} +# Instance: {{ "{{" }} .Labels.instance {{ "}}" }} +# {{ "{{" }} end {{ "}}" }} +# +# - name: 'pushover-info' +# pushover_configs: +# - user_key: '{{ pushover_user_key }}' +# token: '{{ pushover_api_token }}' +# send_resolved: false +# html: true +# priority: '0' +# title: '{{ "{{" }} .GroupLabels.alertname {{ "}}" }}' +# message: '{{ "{{" }} range .Alerts {{ "}}" }}{{ "{{" }} .Annotations.description {{ "}}" }}{{ "{{" }} end {{ "}}" }}' +# +# - name: 'pushover' +# pushover_configs: +# - user_key: '{{ pushover_user_key }}' +# token: '{{ pushover_api_token }}' +# send_resolved: true +# html: true +# priority: '1' +# retry: 30 +# expire: 3600 +# title: '[{{ "{{" }} .GroupLabels.severity | default "ALERT" {{ "}}" }}] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}' +# message: |- +# {{ "{{" }} range .Alerts {{ "}}" }} +# {{ "{{" }} .Annotations.description {{ "}}" }} +# Instance: {{ "{{" }} .Labels.instance {{ "}}" }} +# Severity: {{ "{{" }} .Labels.severity {{ "}}" }} +# {{ "{{" }} end {{ "}}" }} \ No newline at end of file diff --git a/ansible/pplg/config.yml.j2 b/ansible/pplg/config.yml.j2 new file mode 100644 index 0000000..a65621f --- /dev/null +++ b/ansible/pplg/config.yml.j2 @@ -0,0 +1,41 @@ +auth_enabled: false + +server: + http_listen_port: {{ loki_port }} + grpc_listen_port: {{ loki_grpc_port }} + +common: + path_prefix: {{ loki_data_dir }} + storage: + filesystem: + chunks_directory: {{ loki_data_dir }}/chunks 
+ rules_directory: {{ loki_data_dir }}/rules + replication_factor: 1 + ring: + instance_addr: 127.0.0.1 + kvstore: + store: inmemory + +query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 + +schema_config: + configs: + - from: 2024-04-01 + object_store: filesystem + store: tsdb + schema: v13 + index: + prefix: index_ + period: 24h + +ruler: + alertmanager_url: http://{{ alertmanager_host }}:{{ alertmanager_port }} + +# Red Panda Approved Configuration +analytics: + reporting_enabled: false diff --git a/ansible/pplg/config_local.py.j2 b/ansible/pplg/config_local.py.j2 new file mode 100644 index 0000000..4d2cb8f --- /dev/null +++ b/ansible/pplg/config_local.py.j2 @@ -0,0 +1,55 @@ +# PgAdmin4 Local Configuration - Managed by Ansible +# Gunicorn-based deployment (no Apache) with Casdoor OAuth SSO +# Red Panda Approved + +import os + +# Server settings +DEFAULT_SERVER = '0.0.0.0' +DEFAULT_SERVER_PORT = {{pgadmin_port}} + +# Data directory +DATA_DIR = '{{pgadmin_data_dir}}' +SESSION_DB_PATH = os.path.join(DATA_DIR, 'sessions') +STORAGE_DIR = os.path.join(DATA_DIR, 'storage') +SQLITE_PATH = os.path.join(DATA_DIR, 'pgadmin4.db') + +# Log settings +LOG_FILE = '{{pgadmin_log_dir}}/pgadmin4.log' + +# Default admin credentials (for initial setup) +SETUP_EMAIL = '{{pgadmin_email}}' +SETUP_PASSWORD = '{{pgadmin_password}}' + +# Authentication - OAuth2 (Casdoor) + internal fallback +AUTHENTICATION_SOURCES = ['oauth2', 'internal'] + +# Master password disabled (use OAuth) +MASTER_PASSWORD_REQUIRED = False + +# Reverse proxy settings (Titania HAProxy -> Prospero HAProxy -> Gunicorn) +ENHANCED_COOKIE_PROTECTION = False +PROXY_X_FOR_COUNT = 2 +PROXY_X_PROTO_COUNT = 2 +PROXY_X_HOST_COUNT = 2 +X_FRAME_OPTIONS = 'SAMEORIGIN' +SESSION_COOKIE_SECURE = True +SESSION_COOKIE_SAMESITE = 'Lax' +WTF_CSRF_SSL_STRICT = False + +# OAuth2 Configuration (Casdoor OIDC) +OAUTH2_AUTO_CREATE_USER = True +OAUTH2_CONFIG = [{ + 'OAUTH2_NAME': 'Casdoor', + 
'OAUTH2_DISPLAY_NAME': 'Casdoor SSO', + 'OAUTH2_CLIENT_ID': '{{pgadmin_oauth_client_id}}', + 'OAUTH2_CLIENT_SECRET': '{{pgadmin_oauth_client_secret}}', + 'OAUTH2_TOKEN_URL': 'https://id.ouranos.helu.ca/api/login/oauth/access_token', + 'OAUTH2_AUTHORIZATION_URL': 'https://id.ouranos.helu.ca/login/oauth/authorize', + 'OAUTH2_API_BASE_URL': 'https://id.ouranos.helu.ca/', + 'OAUTH2_USERINFO_ENDPOINT': 'api/userinfo', + 'OAUTH2_SERVER_METADATA_URL': 'https://id.ouranos.helu.ca/.well-known/openid-configuration', + 'OAUTH2_SCOPE': 'openid profile email', + 'OAUTH2_ICON': 'fa-openid', + 'OAUTH2_BUTTON_COLOR': '#2db7f5', +}] diff --git a/ansible/pplg/datasource.yml.j2 b/ansible/pplg/datasource.yml.j2 new file mode 100644 index 0000000..2a37a86 --- /dev/null +++ b/ansible/pplg/datasource.yml.j2 @@ -0,0 +1,15 @@ +apiVersion: 1 +datasources: + - name: {{prometheus_datasource_name}} + type: prometheus + access: proxy + url: http://{{prometheus_host}}:{{prometheus_port}} + isDefault: true + editable: false + uid: {{prometheus_datasource_uid}} + - name: {{loki_datasource_name}} + type: loki + access: proxy + url: http://{{loki_host}}:{{loki_port}} + editable: false + uid: {{loki_datasource_uid}} diff --git a/ansible/pplg/deploy.yml b/ansible/pplg/deploy.yml new file mode 100644 index 0000000..e531a28 --- /dev/null +++ b/ansible/pplg/deploy.yml @@ -0,0 +1,495 @@ +--- +# PPLG - Consolidated Observability & Admin Stack for Prospero +# PgAdmin, Prometheus, Loki, Grafana + HAProxy (TLS) + OAuth2-Proxy (Prometheus UI) +# Red Panda Approved + +- name: Deploy PPLG Stack + hosts: ubuntu + become: true + tasks: + - name: Check if host has pplg service + ansible.builtin.set_fact: + has_pplg_service: "{{'pplg' in services}}" + + - name: Skip hosts without pplg service + ansible.builtin.meta: end_host + when: not has_pplg_service + + # =========================================================================== + # APT Repositories + # 
=========================================================================== + + - name: Add Grafana APT repository (Grafana + Loki) + ansible.builtin.deb822_repository: + name: grafana + types: [deb] + uris: https://apt.grafana.com + suites: [stable] + components: [main] + signed_by: https://apt.grafana.com/gpg.key + state: present + + - name: Add PgAdmin APT repository + ansible.builtin.deb822_repository: + name: pgadmin4 + types: [deb] + uris: https://ftp.postgresql.org/pub/pgadmin/pgadmin4/apt/{{ansible_distribution_release}} + suites: [pgadmin4] + components: [main] + signed_by: https://www.pgadmin.org/static/packages_pgadmin_org.pub + state: present + + # =========================================================================== + # Package Installation + # =========================================================================== + + - name: Install PPLG packages + ansible.builtin.apt: + name: + - acl + - haproxy + - prometheus + - loki + - grafana + - pgadmin4-web + state: present + update_cache: true + + - name: Stop and disable Apache (pulled in by pgadmin4-web) + ansible.builtin.systemd: + name: apache2 + state: stopped + enabled: false + + # =========================================================================== + # Prometheus + # =========================================================================== + + - name: Fix Prometheus directory permissions + ansible.builtin.file: + path: /var/lib/prometheus + owner: prometheus + group: prometheus + mode: '750' + recurse: true + + - name: Create textfile collector directory + ansible.builtin.file: + path: /var/lib/prometheus/node-exporter + state: directory + owner: prometheus + group: prometheus + mode: '750' + + - name: Template prometheus.yml + ansible.builtin.template: + src: prometheus.yml.j2 + dest: /etc/prometheus/prometheus.yml + owner: prometheus + group: prometheus + mode: '640' + notify: restart prometheus + + - name: Template alert_rules.yml + ansible.builtin.template: + src: 
alert_rules.yml.j2 + dest: /etc/prometheus/alert_rules.yml + owner: prometheus + group: prometheus + mode: '640' + notify: restart prometheus + + - name: Create Prometheus systemd override directory + ansible.builtin.file: + path: /etc/systemd/system/prometheus.service.d + state: directory + mode: '755' + + - name: Enable remote write receiver + ansible.builtin.copy: + content: | + [Service] + ExecStart= + ExecStart=/usr/bin/prometheus --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus/metrics2/ --web.console.templates=/etc/prometheus/consoles --web.console.libraries=/etc/prometheus/console_libraries --web.listen-address=0.0.0.0:9090 --web.external-url= --web.enable-remote-write-receiver + dest: /etc/systemd/system/prometheus.service.d/override.conf + mode: '644' + notify: restart prometheus + + - name: Start and enable Prometheus service + ansible.builtin.systemd: + name: prometheus + state: started + enabled: true + daemon_reload: true + + # =========================================================================== + # Prometheus Alertmanager + # =========================================================================== + + - name: Install Alertmanager + ansible.builtin.apt: + name: prometheus-alertmanager + state: present + + - name: Create alertmanager configuration directory + ansible.builtin.file: + path: /etc/alertmanager + state: directory + owner: prometheus + group: prometheus + mode: '750' + + - name: Template alertmanager.yml + ansible.builtin.template: + src: alertmanager.yml.j2 + dest: /etc/alertmanager/alertmanager.yml + owner: prometheus + group: prometheus + mode: '640' + notify: restart alertmanager + + - name: Start and enable Alertmanager service + ansible.builtin.systemd: + name: prometheus-alertmanager + state: started + enabled: true + daemon_reload: true + + # =========================================================================== + # Loki + # 
=========================================================================== + + - name: Create loki group + ansible.builtin.group: + name: "{{loki_group}}" + + - name: Create loki user + ansible.builtin.user: + name: "{{loki_user}}" + comment: "{{loki_user}}" + group: "{{loki_group}}" + system: true + + - name: Create loki directories + ansible.builtin.file: + path: "{{item}}" + owner: "{{loki_user}}" + group: "{{loki_group}}" + state: directory + mode: '750' + loop: + - "{{loki_data_dir}}" + - "{{loki_config_dir}}" + + - name: Template Loki configuration + ansible.builtin.template: + src: "{{loki_config_file}}.j2" + dest: "{{loki_config_dir}}/{{loki_config_file}}" + owner: "{{loki_user}}" + group: "{{loki_group}}" + mode: '550' + notify: restart loki + + - name: Enable and start Loki service + ansible.builtin.systemd: + name: loki + enabled: true + state: started + + # =========================================================================== + # Grafana + # =========================================================================== + + - name: Create dashboards directory + ansible.builtin.file: + path: /var/lib/grafana/dashboards + state: directory + owner: grafana + group: grafana + mode: '750' + + - name: Template Grafana main configuration + ansible.builtin.template: + src: "grafana.ini.j2" + dest: "/etc/grafana/grafana.ini" + owner: grafana + group: grafana + mode: '640' + when: grafana_oauth_enabled | default(false) + notify: restart grafana + + - name: Enable and start Grafana service + ansible.builtin.systemd: + name: grafana-server + enabled: true + state: started + daemon_reload: true + + # =========================================================================== + # PgAdmin (Gunicorn - no Apache) + # =========================================================================== + + - name: Create pgadmin group + ansible.builtin.group: + name: "{{pgadmin_group}}" + system: true + + - name: Create pgadmin user + ansible.builtin.user: + name: 
"{{pgadmin_user}}" + comment: "PgAdmin Service" + group: "{{pgadmin_group}}" + system: true + create_home: false + shell: /usr/sbin/nologin + + - name: Create PgAdmin directories + ansible.builtin.file: + path: "{{item}}" + state: directory + owner: "{{pgadmin_user}}" + group: "{{pgadmin_group}}" + mode: '750' + loop: + - "{{pgadmin_data_dir}}" + - "{{pgadmin_data_dir}}/sessions" + - "{{pgadmin_data_dir}}/storage" + - "{{pgadmin_data_dir}}/certs" + - "{{pgadmin_log_dir}}" + + - name: Install gunicorn into PgAdmin venv + ansible.builtin.command: + cmd: /usr/pgadmin4/venv/bin/pip install gunicorn + register: pip_gunicorn + changed_when: "'Successfully installed' in pip_gunicorn.stdout" + + - name: Initialize PgAdmin database + ansible.builtin.command: + cmd: /usr/pgadmin4/venv/bin/python3 /usr/pgadmin4/web/setup.py setup-db + creates: "{{pgadmin_data_dir}}/pgadmin4.db" + become_user: "{{pgadmin_user}}" + + - name: Template PgAdmin local config + ansible.builtin.template: + src: config_local.py.j2 + dest: /usr/pgadmin4/web/config_local.py + owner: "{{pgadmin_user}}" + group: "{{pgadmin_group}}" + mode: '640' + notify: restart pgadmin + + - name: Fetch Titania PostgreSQL SSL cert + ansible.builtin.fetch: + src: /etc/postgresql/17/main/ssl/server.crt + dest: /tmp/titania-postgres-ca.crt + flat: yes + delegate_to: titania.incus + when: "'titania.incus' in groups['ubuntu']" + + - name: Copy Titania PostgreSQL SSL cert to PgAdmin + ansible.builtin.copy: + src: /tmp/titania-postgres-ca.crt + dest: "{{pgadmin_data_dir}}/certs/titania-postgres-ca.crt" + owner: "{{pgadmin_user}}" + group: "{{pgadmin_group}}" + mode: '0644' + when: "'titania.incus' in groups['ubuntu']" + + - name: Template PgAdmin systemd service + ansible.builtin.template: + src: pgadmin.service.j2 + dest: /etc/systemd/system/pgadmin.service + owner: root + group: root + mode: '0644' + notify: restart pgadmin + + - name: Enable and start PgAdmin service + ansible.builtin.systemd: + name: pgadmin + enabled: 
true + state: started + daemon_reload: true + + # =========================================================================== + # OAuth2-Proxy Sidecar (Prometheus UI) + # =========================================================================== + + - name: Create oauth2-proxy config directory + ansible.builtin.file: + path: "{{prometheus_oauth2_proxy_dir}}" + owner: root + group: root + state: directory + mode: '0755' + + - name: Download oauth2-proxy binary + ansible.builtin.get_url: + url: "https://github.com/oauth2-proxy/oauth2-proxy/releases/download/v{{prometheus_oauth2_proxy_version}}/oauth2-proxy-v{{prometheus_oauth2_proxy_version}}.linux-amd64.tar.gz" + dest: "/tmp/oauth2-proxy-v{{prometheus_oauth2_proxy_version}}.tar.gz" + mode: '0644' + + - name: Extract oauth2-proxy binary + ansible.builtin.unarchive: + src: "/tmp/oauth2-proxy-v{{prometheus_oauth2_proxy_version}}.tar.gz" + dest: /tmp + remote_src: true + creates: "/tmp/oauth2-proxy-v{{prometheus_oauth2_proxy_version}}.linux-amd64/oauth2-proxy" + + - name: Install oauth2-proxy binary + ansible.builtin.copy: + src: "/tmp/oauth2-proxy-v{{prometheus_oauth2_proxy_version}}.linux-amd64/oauth2-proxy" + dest: /usr/local/bin/oauth2-proxy + owner: root + group: root + mode: '0755' + remote_src: true + + - name: Template oauth2-proxy configuration for Prometheus + ansible.builtin.template: + src: oauth2-proxy-prometheus.cfg.j2 + dest: "{{prometheus_oauth2_proxy_dir}}/oauth2-proxy.cfg" + owner: root + group: root + mode: '0600' + notify: restart oauth2-proxy-prometheus + + - name: Template oauth2-proxy systemd service for Prometheus + ansible.builtin.template: + src: oauth2-proxy-prometheus.service.j2 + dest: /etc/systemd/system/oauth2-proxy-prometheus.service + owner: root + group: root + mode: '0644' + notify: + - reload systemd + - restart oauth2-proxy-prometheus + + - name: Enable and start OAuth2-Proxy for Prometheus + ansible.builtin.systemd: + name: oauth2-proxy-prometheus + enabled: true + state: started + 
daemon_reload: true + + # =========================================================================== + # SSL Certificate Distribution (from Titania) + # =========================================================================== + + - name: Create haproxy group + ansible.builtin.group: + name: "{{pplg_haproxy_group}}" + gid: "{{pplg_haproxy_gid}}" + system: true + + - name: Create haproxy user + ansible.builtin.user: + name: "{{pplg_haproxy_user}}" + comment: "PPLG HAProxy" + group: "{{pplg_haproxy_group}}" + uid: "{{pplg_haproxy_uid}}" + system: true + + - name: Create HAProxy directories + ansible.builtin.file: + path: "{{item}}" + state: directory + owner: "{{pplg_haproxy_user}}" + group: "{{pplg_haproxy_group}}" + mode: '750' + loop: + - /etc/haproxy + - /etc/haproxy/certs + + - name: Fetch wildcard certificate from Titania + ansible.builtin.fetch: + src: /etc/haproxy/certs/ouranos.pem + dest: /tmp/ouranos-haproxy.pem + flat: yes + delegate_to: titania.incus + when: "'titania.incus' in groups['ubuntu']" + + - name: Deploy wildcard certificate + ansible.builtin.copy: + src: /tmp/ouranos-haproxy.pem + dest: "{{pplg_haproxy_cert_path}}" + owner: "{{pplg_haproxy_user}}" + group: "{{pplg_haproxy_group}}" + mode: '0640' + when: "'titania.incus' in groups['ubuntu']" + + - name: Generate self-signed wildcard certificate (fallback) + command: > + openssl req -x509 -nodes -days 365 -newkey rsa:2048 + -keyout {{pplg_haproxy_cert_path}} + -out {{pplg_haproxy_cert_path}} + -subj "/C=US/ST=State/L=City/O=Agathos/CN=*.{{pplg_haproxy_domain}}" + -addext "subjectAltName=DNS:*.{{pplg_haproxy_domain}},DNS:{{pplg_haproxy_domain}}" + when: "'titania.incus' not in groups['ubuntu']" + args: + creates: "{{pplg_haproxy_cert_path}}" + + # =========================================================================== + # HAProxy (TLS Termination) + # =========================================================================== + + - name: Template HAProxy configuration + 
ansible.builtin.template: + src: pplg-haproxy.cfg.j2 + dest: /etc/haproxy/haproxy.cfg + owner: "{{pplg_haproxy_user}}" + group: "{{pplg_haproxy_group}}" + mode: "640" + validate: haproxy -c -f %s + notify: restart haproxy + + - name: Enable and start HAProxy service + ansible.builtin.systemd: + name: haproxy + enabled: true + state: started + + # =========================================================================== + # Handlers + # =========================================================================== + handlers: + - name: restart prometheus + ansible.builtin.systemd: + name: prometheus + state: restarted + daemon_reload: true + + - name: restart alertmanager + ansible.builtin.systemd: + name: prometheus-alertmanager + state: restarted + + - name: restart loki + ansible.builtin.systemd: + name: loki + state: restarted + + - name: restart grafana + ansible.builtin.systemd: + name: grafana-server + state: restarted + + - name: restart pgadmin + ansible.builtin.systemd: + name: pgadmin + state: restarted + daemon_reload: true + + - name: reload systemd + ansible.builtin.systemd: + daemon_reload: true + + - name: restart haproxy + ansible.builtin.systemd: + name: haproxy + state: reloaded + + - name: restart oauth2-proxy-prometheus + ansible.builtin.systemd: + name: oauth2-proxy-prometheus + state: restarted diff --git a/ansible/pplg/grafana.ini.j2 b/ansible/pplg/grafana.ini.j2 new file mode 100644 index 0000000..da1065b --- /dev/null +++ b/ansible/pplg/grafana.ini.j2 @@ -0,0 +1,36 @@ +# Grafana Configuration - Managed by Ansible +# Do not edit manually - changes will be overwritten + +[server] +root_url = {{ grafana_root_url }} + +[auth] +# Disable login form for OAuth users (admins can still use local auth) +disable_login_form = false + +[auth.generic_oauth] +enabled = {{ grafana_oauth_enabled }} +name = {{ grafana_oauth_name | default('Casdoor') }} +allow_sign_up = {{ grafana_oauth_allow_sign_up | default(true) | lower }} +client_id = {{ 
grafana_oauth_client_id }} +client_secret = {{ grafana_oauth_client_secret }} +scopes = {{ grafana_oauth_scopes | default('openid profile email') }} +auth_url = {{ grafana_oauth_auth_url }} +token_url = {{ grafana_oauth_token_url }} +api_url = {{ grafana_oauth_api_url }} +# Map Casdoor user attributes to Grafana +email_attribute_path = email +login_attribute_path = preferred_username +name_attribute_path = name +# Default role for new OAuth users +role_attribute_path = contains(groups[*], 'grafana-admin') && 'Admin' || contains(groups[*], 'grafana-editor') && 'Editor' || 'Viewer' +# TLS settings for internal communication +tls_skip_verify_insecure = {{ grafana_oauth_skip_tls_verify | default(true) | lower }} + +[log] +# Console-only logging — systemd journal captures output, Alloy ships to Loki +mode = console +level = {{ grafana_log_level | default('info') }} + +[log.console] +format = text diff --git a/ansible/pplg/oauth2-proxy-prometheus.cfg.j2 b/ansible/pplg/oauth2-proxy-prometheus.cfg.j2 new file mode 100644 index 0000000..caa894b --- /dev/null +++ b/ansible/pplg/oauth2-proxy-prometheus.cfg.j2 @@ -0,0 +1,62 @@ +# OAuth2-Proxy Configuration for Prometheus UI +# Authenticates users via Casdoor OIDC before proxying to Prometheus +# Red Panda Approved + +# Provider Configuration (Casdoor OIDC) +provider = "oidc" +provider_display_name = "Casdoor" +oidc_issuer_url = "{{prometheus_oauth2_oidc_issuer_url}}" +client_id = "{{prometheus_oauth2_client_id}}" +client_secret = "{{prometheus_oauth2_client_secret}}" + +# Redirect URL after authentication +redirect_url = "https://prometheus.{{pplg_haproxy_domain}}/oauth2/callback" + +# Upstream service (Prometheus) +upstreams = [ + "http://127.0.0.1:9090" +] + +# Session/Cookie Configuration +cookie_secret = "{{prometheus_oauth2_cookie_secret}}" +cookie_name = "_oauth2_proxy_prometheus" +cookie_secure = true +cookie_httponly = true +cookie_expire = "168h" +cookie_refresh = "1h" +cookie_domains = ".{{pplg_haproxy_domain}}" 
+session_store_type = "cookie" + +# Authentication settings +email_domains = ["*"] +oidc_email_claim = "email" +oidc_groups_claim = "groups" +insecure_oidc_allow_unverified_email = true + +# Request settings +pass_access_token = false +pass_authorization_header = false +set_authorization_header = false +set_xauthrequest = true + +# Logging +request_logging = true +auth_logging = true +standard_logging = true + +# Network settings +http_address = "0.0.0.0:{{prometheus_proxy_port}}" +reverse_proxy = true +real_client_ip_header = "X-Forwarded-For" + +# Skip authentication for health check endpoints +skip_auth_routes = [ + "^/ping$" +] + +# OIDC specific settings +skip_provider_button = true +oidc_extra_audiences = [] + +# SSL verification +ssl_insecure_skip_verify = false diff --git a/ansible/pplg/oauth2-proxy-prometheus.service.j2 b/ansible/pplg/oauth2-proxy-prometheus.service.j2 new file mode 100644 index 0000000..33e0e71 --- /dev/null +++ b/ansible/pplg/oauth2-proxy-prometheus.service.j2 @@ -0,0 +1,18 @@ +[Unit] +Description=OAuth2-Proxy for Prometheus UI +After=network.target prometheus.service +Wants=prometheus.service + +[Service] +Type=simple +ExecStart=/usr/local/bin/oauth2-proxy --config={{prometheus_oauth2_proxy_dir}}/oauth2-proxy.cfg +Restart=on-failure +RestartSec=5 +NoNewPrivileges=true +PrivateTmp=true +StandardOutput=journal +StandardError=journal +SyslogIdentifier=oauth2-proxy-prometheus + +[Install] +WantedBy=multi-user.target diff --git a/ansible/pplg/pgadmin.service.j2 b/ansible/pplg/pgadmin.service.j2 new file mode 100644 index 0000000..7d299f5 --- /dev/null +++ b/ansible/pplg/pgadmin.service.j2 @@ -0,0 +1,27 @@ +[Unit] +Description=PgAdmin4 Web Interface (Gunicorn) +After=network.target +Wants=network.target + +[Service] +Type=simple +User={{pgadmin_user}} +Group={{pgadmin_group}} +WorkingDirectory=/usr/pgadmin4/web +ExecStart=/usr/pgadmin4/venv/bin/python3 -m gunicorn pgAdmin4:app \ + --bind 127.0.0.1:{{pgadmin_port}} \ + --workers 1 \ + 
--threads 4 \ + --timeout 120 \ + --access-logfile - \ + --error-logfile - +Restart=on-failure +RestartSec=5 +NoNewPrivileges=true +PrivateTmp=true +StandardOutput=journal +StandardError=journal +SyslogIdentifier=pgadmin + +[Install] +WantedBy=multi-user.target diff --git a/ansible/pplg/pplg-haproxy.cfg.j2 b/ansible/pplg/pplg-haproxy.cfg.j2 new file mode 100644 index 0000000..4fb4a7f --- /dev/null +++ b/ansible/pplg/pplg-haproxy.cfg.j2 @@ -0,0 +1,127 @@ +# PPLG HAProxy - Internal TLS Termination for Prospero +# Services: Grafana, PgAdmin, Prometheus (via OAuth2-Proxy), Loki, Alertmanager +# Managed by Ansible - Red Panda Approved + +global + log 127.0.0.1:{{pplg_haproxy_syslog_port}} local0 + stats timeout 30s + + # Default SSL material locations + ca-base /etc/ssl/certs + crt-base /etc/ssl/private + + # SSL/TLS configuration + ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384 + ssl-default-bind-ciphersuites TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256 + ssl-default-bind-options ssl-min-ver TLSv1.2 no-tls-tickets + +defaults + log global + mode http + option httplog + option dontlognull + log-format "%ci:%cp [%tr] %ft %b/%s %TR/%Tw/%Tc/%Tr/%Ta %ST %B %CC %CS %tsc %ac/%fc/%bc/%sc/%rc %sq/%bq %hr %hs %{+Q}r" + timeout connect 5s + timeout client 50s + timeout server 50s + +# Stats page with Prometheus metrics +listen stats + bind *:{{pplg_haproxy_stats_port}} + mode http + stats enable + stats uri /metrics + stats refresh 15s + stats show-legends + stats show-node + + # Prometheus metrics endpoint + http-request use-service prometheus-exporter if { path /metrics } + +# HTTP frontend - redirect all traffic to HTTPS +frontend http_frontend + bind *:80 + mode http + option httplog + http-request redirect scheme https code 301 + +# HTTPS frontend with subdomain-based routing +frontend https_frontend + bind *:443 ssl crt {{pplg_haproxy_cert_path}} + 
mode http + option httplog + option forwardfor + + # Forward original protocol and host + http-request set-header X-Forwarded-Proto https + http-request set-header X-Forwarded-Port %[dst_port] + http-request set-header X-Forwarded-Host %[req.hdr(Host)] + + # Security headers + http-response set-header Strict-Transport-Security "max-age=31536000; includeSubDomains" + http-response set-header X-Frame-Options "SAMEORIGIN" + http-response set-header X-Content-Type-Options "nosniff" + http-response set-header X-XSS-Protection "1; mode=block" + + # Subdomain ACLs + acl host_grafana hdr_beg(host) -i grafana.{{pplg_haproxy_domain}} + acl host_pgadmin hdr_beg(host) -i pgadmin.{{pplg_haproxy_domain}} + acl host_prometheus hdr_beg(host) -i prometheus.{{pplg_haproxy_domain}} + acl host_loki hdr_beg(host) -i loki.{{pplg_haproxy_domain}} + acl host_alertmanager hdr_beg(host) -i alertmanager.{{pplg_haproxy_domain}} + + # Prometheus write API - bypass OAuth2-Proxy (machine-to-machine) + acl is_prometheus_write path_beg /api/v1/write + + use_backend backend_grafana if host_grafana + use_backend backend_pgadmin if host_pgadmin + use_backend backend_prometheus_direct if host_prometheus is_prometheus_write + use_backend backend_prometheus if host_prometheus + use_backend backend_loki if host_loki + use_backend backend_alertmanager if host_alertmanager + +# Grafana - Native Casdoor OAuth SSO +backend backend_grafana + mode http + balance roundrobin + option httpchk GET /api/health + http-check expect status 200 + server grafana_1 127.0.0.1:3000 check + +# PgAdmin - Native Casdoor OAuth SSO +backend backend_pgadmin + mode http + balance roundrobin + option httpchk GET /misc/ping + http-check expect status 200 + server pgadmin_1 127.0.0.1:{{pgadmin_port}} check + +# Prometheus UI - via OAuth2-Proxy sidecar +backend backend_prometheus + mode http + balance roundrobin + option httpchk GET /ping + http-check expect status 200 + server prometheus_1 127.0.0.1:{{prometheus_proxy_port}} check + 
+# Prometheus Write API - direct (no auth, machine-to-machine) +backend backend_prometheus_direct + mode http + balance roundrobin + server prometheus_write_1 127.0.0.1:9090 check + +# Loki - no auth (machine-to-machine log ingestion) +backend backend_loki + mode http + balance roundrobin + option httpchk GET /ready + http-check expect status 200 + server loki_1 127.0.0.1:{{loki_port}} check + +# Alertmanager - internal only +backend backend_alertmanager + mode http + balance roundrobin + option httpchk GET /-/healthy + http-check expect status 200 + server alertmanager_1 127.0.0.1:{{alertmanager_port}} check diff --git a/ansible/pplg/prometheus.yml.j2 b/ansible/pplg/prometheus.yml.j2 new file mode 100644 index 0000000..1a369c8 --- /dev/null +++ b/ansible/pplg/prometheus.yml.j2 @@ -0,0 +1,48 @@ +global: + scrape_interval: {{ prometheus_scrape_interval }} + evaluation_interval: {{ prometheus_evaluation_interval }} + +alerting: + alertmanagers: + - static_configs: + - targets: + - {{ alertmanager_host }}:{{ alertmanager_port }} + +rule_files: + - "alert_rules.yml" + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'node-exporter' + static_configs: + - targets: {{ prometheus_targets | to_json }} + + - job_name: 'alertmanager' + static_configs: + - targets: ['{{ alertmanager_host }}:{{ alertmanager_port }}'] + + - job_name: 'haproxy' + static_configs: + - targets: ['titania.incus:8404'] + metrics_path: '/metrics' + + - job_name: 'gitea' + static_configs: + - targets: ['oberon.incus:22084'] + metrics_path: '/metrics' + authorization: + type: Bearer + credentials: '{{ vault_gitea_metrics_token }}' + + - job_name: 'casdoor' + static_configs: + - targets: ['{{ casdoor_metrics_host }}:{{ casdoor_metrics_port }}'] + metrics_path: '/api/metrics' + params: + accessKey: ['{{ casdoor_prometheus_access_key }}'] + accessSecret: ['{{ casdoor_prometheus_access_secret }}'] + +# Red Panda Approved Prometheus Configuration 
diff --git a/ansible/prometheus/alert_rules.yml.j2 b/ansible/prometheus/alert_rules.yml.j2 new file mode 100644 index 0000000..5efc81a --- /dev/null +++ b/ansible/prometheus/alert_rules.yml.j2 @@ -0,0 +1,249 @@ +# Prometheus Alert Rules +# Red Panda Approved 🐼 +# Deployed to: /etc/prometheus/alert_rules.yml +{% raw %} +groups: + # ============================================================================ + # Node/Infrastructure Alerts + # ============================================================================ + - name: node_alerts + rules: + - alert: InstanceDown + expr: up == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Instance {{ $labels.instance }} is down" + description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes." + + - alert: HighCPUUsage + expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "High CPU usage on {{ $labels.instance }}" + description: "CPU usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)" + + - alert: CriticalCPUUsage + expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95 + for: 2m + labels: + severity: critical + annotations: + summary: "Critical CPU usage on {{ $labels.instance }}" + description: "CPU usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)" + + - alert: HighMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage on {{ $labels.instance }}" + description: "Memory usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)" + + - alert: CriticalMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95 + for: 2m + labels: + severity: 
critical + annotations: + summary: "Critical memory usage on {{ $labels.instance }}" + description: "Memory usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)" + + - alert: DiskSpaceLow + expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 20 + for: 5m + labels: + severity: warning + annotations: + summary: "Low disk space on {{ $labels.instance }}" + description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 20% free space (current value: {{ $value | printf \"%.1f\" }}%)" + + - alert: DiskSpaceCritical + expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 10 + for: 2m + labels: + severity: critical + annotations: + summary: "Critical disk space on {{ $labels.instance }}" + description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 10% free space (current value: {{ $value | printf \"%.1f\" }}%)" + + - alert: HighLoadAverage + expr: node_load15 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2 + for: 10m + labels: + severity: warning + annotations: + summary: "High load average on {{ $labels.instance }}" + description: "15-minute load average is {{ $value | printf \"%.2f\" }} times the CPU count on {{ $labels.instance }}" + + # ============================================================================ + # Process-Level Alerts (puck.incus) + # ============================================================================ + - name: puck_process_alerts + rules: + - alert: PuckHighCPUProcess + expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[2m])) * 100 > 80 + for: 2m + labels: + severity: warning + annotations: + summary: "High CPU process on puck: {{ $labels.groupname }}" + description: "Process {{ $labels.groupname }} is using {{ $value | printf \"%.1f\" }}% CPU for 
more than 2 minutes" + + - alert: PuckCriticalCPUProcess + expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[1m])) * 100 > 95 + for: 1m + labels: + severity: critical + annotations: + summary: "Critical CPU process on puck: {{ $labels.groupname }}" + description: "Process {{ $labels.groupname }} is using {{ $value | printf \"%.1f\" }}% CPU - immediate attention required" + + - alert: PuckHighMemoryProcess + expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 1073741824 + for: 2m + labels: + severity: warning + annotations: + summary: "High memory process on puck: {{ $labels.groupname }}" + description: "Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory" + + - alert: PuckCriticalMemoryProcess + expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 2147483648 + for: 1m + labels: + severity: critical + annotations: + summary: "Critical memory process on puck: {{ $labels.groupname }}" + description: "Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory - immediate attention required" + + - alert: PuckProcessCrashLoop + expr: increase(namedprocess_namegroup_num_procs{instance=~"puck.*"}[5m]) < -1 + for: 1m + labels: + severity: warning + annotations: + summary: "Process count dropped on puck: {{ $labels.groupname }}" + description: "Process {{ $labels.groupname }} count has decreased, indicating possible crash or restart" + + # ============================================================================ + # Docker Container Alerts (puck.incus) + # ============================================================================ + - name: puck_container_alerts + rules: + - alert: PuckHighContainerCount + expr: count(container_last_seen{instance=~"puck.*", name!=""}) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "High container count on puck" + description: "puck.incus has {{ $value 
}} running containers, which exceeds the threshold of 5" + + - alert: PuckDuplicateContainers + expr: count by (image, instance) (container_last_seen{instance=~"puck.*", name!=""}) > 2 + for: 5m + labels: + severity: warning + annotations: + summary: "Duplicate containers on puck: {{ $labels.image }}" + description: "{{ $value }} containers running the same image {{ $labels.image }} on puck" + + - alert: PuckOrphanedContainer + expr: (time() - container_start_time_seconds{instance=~"puck.*", name=~".*_.*"}) > 3600 + for: 10m + labels: + severity: warning + annotations: + summary: "Possible orphaned container on puck: {{ $labels.name }}" + description: "Container {{ $labels.name }} with auto-generated name has been running for {{ $value | humanizeDuration }}" + + - alert: PuckMCPContainerOnPuck + expr: container_last_seen{instance=~"puck.*", image=~".*mcp-server.*|.*mcp_server.*"} + for: 1m + labels: + severity: critical + annotations: + summary: "MCP container detected on puck (WRONG HOST)" + description: "Container {{ $labels.name }} ({{ $labels.image }}) is running on puck but MCP servers should run on miranda.incus" + + - alert: PuckContainerHighCPU + expr: sum by (name, instance) (rate(container_cpu_usage_seconds_total{instance=~"puck.*", name!=""}[2m])) * 100 > 80 + for: 2m + labels: + severity: warning + annotations: + summary: "High CPU container on puck: {{ $labels.name }}" + description: "Container {{ $labels.name }} is using {{ $value | printf \"%.1f\" }}% CPU" + + - alert: PuckContainerHighMemory + expr: container_memory_usage_bytes{instance=~"puck.*", name!=""} > 1073741824 + for: 2m + labels: + severity: warning + annotations: + summary: "High memory container on puck: {{ $labels.name }}" + description: "Container {{ $labels.name }} is using {{ $value | humanize }} memory" + + - alert: PuckContainerOOMKilled + expr: increase(container_oom_events_total{instance=~"puck.*", name!=""}[5m]) > 0 + for: 0m + labels: + severity: critical + annotations: + 
summary: "Container OOM killed on puck: {{ $labels.name }}" + description: "Container {{ $labels.name }} was killed by OOM killer" + + # ============================================================================ + # Service/Application Alerts + # ============================================================================ + - name: service_alerts + rules: + - alert: PrometheusTargetMissing + expr: up == 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus target missing: {{ $labels.instance }}" + description: "A Prometheus target has been down for more than 5 minutes." + + - alert: PrometheusJobMissing + expr: absent(up{job="node-exporter"}) + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus job missing" + description: "A Prometheus job has disappeared from target discovery." + + - alert: AlertmanagerDown + expr: absent(up{job="alertmanager"}) + for: 5m + labels: + severity: critical + annotations: + summary: "Alertmanager is down" + description: "Alertmanager is not responding. Alerts may not be delivered." 
+
+  # ============================================================================
+  # Loki/Logging Alerts
+  # ============================================================================
+  - name: loki_alerts
+    rules:
+      - alert: LokiHighLogVolume
+        expr: sum(rate(loki_distributor_bytes_received_total[5m])) > 10485760
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High log ingestion rate"
+          description: "Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging"
+
+# Red Panda Seal of Approval 🐼
+# "If the metrics aren't red, go back to bed"
+{% endraw %}
diff --git a/ansible/prometheus/alertmanager.yml b/ansible/prometheus/alertmanager.yml
new file mode 100644
index 0000000..912352e
--- /dev/null
+++ b/ansible/prometheus/alertmanager.yml
@@ -0,0 +1,92 @@
+global:
+  resolve_timeout: 5m
+
+route:
+  group_by: ['alertname', 'instance', 'severity']
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 4h
+  receiver: 'pushover'
+  routes:
+    - match:
+        severity: critical
+      receiver: 'pushover-critical'
+      continue: true
+    - match:
+        severity: warning
+      receiver: 'pushover-warning'
+      continue: true
+    - match:
+        severity: info
+      receiver: 'pushover-info'
+      repeat_interval: 24h
+
+inhibit_rules:
+  - source_match:
+      severity: 'critical'
+    target_match:
+      severity: 'warning'
+    equal: ['alertname', 'instance']
+
+receivers:
+- name: 'pushover-critical'
+  pushover_configs:
+  - user_key: '{{ pushover_user_key }}'
+    token: '{{ pushover_api_token }}'
+    send_resolved: true
+    html: true
+    priority: '2' # Emergency priority
+    retry: 30
+    expire: 3600
+    title: '🚨 [CRITICAL] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
+    message: |-
+      {{ "{{" }} range .Alerts {{ "}}" }}
+      {{ "{{" }} .Annotations.description {{ "}}" }}
+
+      Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
+      {{ "{{" }} end {{ "}}" }}
+
+- name: 'pushover-warning'
+  pushover_configs:
+  - user_key: '{{ pushover_user_key }}'
+    token: '{{ pushover_api_token }}'
send_resolved: true + html: true + priority: '1' # High priority + retry: 30 + expire: 3600 + title: '⚠️ [WARNING] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}' + message: |- + {{ "{{" }} range .Alerts {{ "}}" }} + {{ "{{" }} .Annotations.description {{ "}}" }} + + Instance: {{ "{{" }} .Labels.instance {{ "}}" }} + {{ "{{" }} end {{ "}}" }} + +- name: 'pushover-info' + pushover_configs: + - user_key: '{{ pushover_user_key }}' + token: '{{ pushover_api_token }}' + send_resolved: false + html: true + priority: '0' # Normal priority + title: '{{ "{{" }} .GroupLabels.alertname {{ "}}" }}' + message: '{{ "{{" }} range .Alerts {{ "}}" }}{{ "{{" }} .Annotations.description {{ "}}" }}{{ "{{" }} end {{ "}}" }}' + +- name: 'pushover' + pushover_configs: + - user_key: '{{ pushover_user_key }}' + token: '{{ pushover_api_token }}' + send_resolved: true + html: true + priority: '1' + retry: 30 + expire: 3600 + title: '[{{ "{{" }} .GroupLabels.severity | default "ALERT" {{ "}}" }}] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}' + message: |- + {{ "{{" }} range .Alerts {{ "}}" }} + {{ "{{" }} .Annotations.description {{ "}}" }} + + Instance: {{ "{{" }} .Labels.instance {{ "}}" }} + Severity: {{ "{{" }} .Labels.severity {{ "}}" }} + {{ "{{" }} end {{ "}}" }} \ No newline at end of file diff --git a/ansible/prometheus/alertmanager_deploy.yml b/ansible/prometheus/alertmanager_deploy.yml new file mode 100644 index 0000000..be99433 --- /dev/null +++ b/ansible/prometheus/alertmanager_deploy.yml @@ -0,0 +1,48 @@ +--- +- name: Deploy Prometheus Alertmanager with Pushover + hosts: ubuntu + become: true + tasks: + - name: Check if host has alertmanager service + ansible.builtin.set_fact: + has_alertmanager_service: "{{'alertmanager' in services}}" + + - name: Skip hosts without alertmanager service + ansible.builtin.meta: end_host + when: not has_alertmanager_service + + - name: Install Alertmanager + ansible.builtin.apt: + name: prometheus-alertmanager + state: present + update_cache: true 
+ + - name: Create alertmanager config directory + ansible.builtin.file: + path: /etc/alertmanager + state: directory + owner: prometheus + group: prometheus + mode: '750' + + - name: Template alertmanager configuration + ansible.builtin.template: + src: prometheus/alertmanager.yml + dest: /etc/alertmanager/alertmanager.yml + owner: prometheus + group: prometheus + mode: '550' + notify: restart alertmanager + + - name: Start and enable Alertmanager service + ansible.builtin.systemd: + name: prometheus-alertmanager + state: started + enabled: true + daemon_reload: true + + handlers: + - name: restart alertmanager + ansible.builtin.systemd: + name: prometheus-alertmanager + state: restarted \ No newline at end of file diff --git a/ansible/prometheus/deploy.yml b/ansible/prometheus/deploy.yml new file mode 100644 index 0000000..36b0281 --- /dev/null +++ b/ansible/prometheus/deploy.yml @@ -0,0 +1,82 @@ +--- +- name: Deploy Prometheus + hosts: ubuntu + become: true + tasks: + - name: Check if host has prometheus service + ansible.builtin.set_fact: + has_prometheus_service: "{{'prometheus' in services}}" + + - name: Skip hosts without prometheus service + ansible.builtin.meta: end_host + when: not has_prometheus_service + + - name: Install Prometheus + ansible.builtin.apt: + name: prometheus + state: present + update_cache: true + + - name: Fix Prometheus directory permissions + ansible.builtin.file: + path: /var/lib/prometheus + owner: prometheus + group: prometheus + mode: '750' + recurse: true + + - name: Create textfile collector directory + ansible.builtin.file: + path: /var/lib/prometheus/node-exporter + state: directory + owner: prometheus + group: prometheus + mode: '750' + + - name: Template prometheus.yml to Prospero + ansible.builtin.template: + src: prometheus.yml.j2 + dest: /etc/prometheus/prometheus.yml + owner: prometheus + group: prometheus + mode: '640' + notify: restart prometheus + + - name: Template alert_rules.yml to Prospero + 
ansible.builtin.template: + src: alert_rules.yml.j2 + dest: /etc/prometheus/alert_rules.yml + owner: prometheus + group: prometheus + mode: '640' + notify: restart prometheus + + - name: Create Prometheus systemd override directory + ansible.builtin.file: + path: /etc/systemd/system/prometheus.service.d + state: directory + mode: '755' + + - name: Enable remote write receiver + ansible.builtin.copy: + content: | + [Service] + ExecStart= + ExecStart=/usr/bin/prometheus --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus/metrics2/ --web.console.templates=/etc/prometheus/consoles --web.console.libraries=/etc/prometheus/console_libraries --web.listen-address=0.0.0.0:9090 --web.external-url= --web.enable-remote-write-receiver + dest: /etc/systemd/system/prometheus.service.d/override.conf + mode: '644' + notify: restart prometheus + + - name: Start and enable Prometheus service + ansible.builtin.systemd: + name: prometheus + state: started + enabled: true + daemon_reload: true + + handlers: + - name: restart prometheus + ansible.builtin.systemd: + name: prometheus + state: restarted + daemon_reload: true \ No newline at end of file diff --git a/ansible/prometheus/node_deploy.yml b/ansible/prometheus/node_deploy.yml new file mode 100644 index 0000000..110e6e6 --- /dev/null +++ b/ansible/prometheus/node_deploy.yml @@ -0,0 +1,14 @@ +--- +- hosts: ubuntu + become: true + tasks: + - name: Aptitude Update + apt: + name: "*" + state: latest + update_cache: true + + - name: Install Prometheus Node Exporter + apt: + name: prometheus-node-exporter + state: present diff --git a/ansible/prometheus/prometheus.yml.j2 b/ansible/prometheus/prometheus.yml.j2 new file mode 100644 index 0000000..1a369c8 --- /dev/null +++ b/ansible/prometheus/prometheus.yml.j2 @@ -0,0 +1,48 @@ +global: + scrape_interval: {{ prometheus_scrape_interval }} + evaluation_interval: {{ prometheus_evaluation_interval }} + +alerting: + alertmanagers: + - static_configs: + - targets: 
+ - {{ alertmanager_host }}:{{ alertmanager_port }} + +rule_files: + - "alert_rules.yml" + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'node-exporter' + static_configs: + - targets: {{ prometheus_targets | to_json }} + + - job_name: 'alertmanager' + static_configs: + - targets: ['{{ alertmanager_host }}:{{ alertmanager_port }}'] + + - job_name: 'haproxy' + static_configs: + - targets: ['titania.incus:8404'] + metrics_path: '/metrics' + + - job_name: 'gitea' + static_configs: + - targets: ['oberon.incus:22084'] + metrics_path: '/metrics' + authorization: + type: Bearer + credentials: '{{ vault_gitea_metrics_token }}' + + - job_name: 'casdoor' + static_configs: + - targets: ['{{ casdoor_metrics_host }}:{{ casdoor_metrics_port }}'] + metrics_path: '/api/metrics' + params: + accessKey: ['{{ casdoor_prometheus_access_key }}'] + accessSecret: ['{{ casdoor_prometheus_access_secret }}'] + +# Red Panda Approved Prometheus Configuration diff --git a/ansible/rabbitmq/deploy.yml b/ansible/rabbitmq/deploy.yml new file mode 100644 index 0000000..b589ab0 --- /dev/null +++ b/ansible/rabbitmq/deploy.yml @@ -0,0 +1,103 @@ +--- +- name: Deploy RabbitMQ with Docker Compose + hosts: ubuntu + become: true + vars: + required_service: rabbitmq + tasks: + - name: Check if host has rabbitmq service + ansible.builtin.set_fact: + has_rabbitmq_service: "{{required_service in services}}" + + - name: Skip hosts without rabbitmq service + ansible.builtin.meta: end_host + when: not has_rabbitmq_service + + - name: Create rabbitmq group + ansible.builtin.group: + name: "{{rabbitmq_group}}" + + - name: Create rabbitmq user + ansible.builtin.user: + name: "{{rabbitmq_user}}" + comment: "{{rabbitmq_user}}" + group: "{{rabbitmq_group}}" + system: true + + - name: Add group rabbitmq to user ponos + ansible.builtin.user: + name: ponos + groups: "{{rabbitmq_group}}" + append: true + + - name: Create rabbitmq directory + 
ansible.builtin.file: + path: "{{rabbitmq_directory}}" + owner: "{{rabbitmq_user}}" + group: "{{rabbitmq_group}}" + state: directory + mode: '750' + + - name: Template docker-compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{rabbitmq_directory}}/docker-compose.yml" + owner: "{{rabbitmq_user}}" + group: "{{rabbitmq_group}}" + mode: '550' + + - name: Reset SSH connection to apply group changes + meta: reset_connection + + - name: Start RabbitMQ service + community.docker.docker_compose_v2: + project_src: "{{rabbitmq_directory}}" + state: present + pull: always + + - name: Always copy rabbitmqadmin out of RabbitMQ container to host (overwrite if newer) + ansible.builtin.command: + cmd: "docker cp rabbitmq:/usr/local/bin/rabbitmqadmin /usr/local/bin/rabbitmqadmin" + become: true + register: rabbitmqadmin_copy + changed_when: rabbitmqadmin_copy.rc == 0 + failed_when: rabbitmqadmin_copy.rc != 0 + + - name: Ensure rabbitmqadmin is executable + ansible.builtin.file: + path: /usr/local/bin/rabbitmqadmin + mode: '0755' + owner: root + group: root + state: file + + # --- RabbitMQ provisioning tasks (auto from inventory, run inside docker container) --- + + - name: Ensure RabbitMQ vhosts exist + ansible.builtin.command: + cmd: "docker exec rabbitmq rabbitmqctl add_vhost {{ item.name }}" + loop: "{{ rabbitmq_vhosts }}" + register: vhost_result + changed_when: vhost_result.rc == 0 + failed_when: vhost_result.rc != 0 and 'already exists' not in vhost_result.stderr + + - name: Ensure RabbitMQ users exist + ansible.builtin.command: + cmd: "docker exec rabbitmq rabbitmqctl add_user {{ item.name }} {{ item.password }}" + loop: "{{ rabbitmq_users }}" + register: user_result + changed_when: user_result.rc == 0 + failed_when: user_result.rc != 0 and 'already exists' not in user_result.stderr + no_log: true + + - name: Set user tags + ansible.builtin.command: + cmd: "docker exec rabbitmq rabbitmqctl set_user_tags {{ item.name }} {{ item.tags | default([]) | 
join(' ') }}" + loop: "{{ rabbitmq_users }}" + when: item.tags is defined + no_log: true + + - name: Ensure RabbitMQ user permissions are set + ansible.builtin.command: + cmd: "docker exec rabbitmq rabbitmqctl set_permissions -p {{ item.vhost }} {{ item.user }} '{{ item.configure_priv }}' '{{ item.write_priv }}' '{{ item.read_priv }}'" + loop: "{{ rabbitmq_permissions }}" diff --git a/ansible/rabbitmq/docker-compose.yml.j2 b/ansible/rabbitmq/docker-compose.yml.j2 new file mode 100644 index 0000000..dcf0886 --- /dev/null +++ b/ansible/rabbitmq/docker-compose.yml.j2 @@ -0,0 +1,23 @@ +services: + rabbitmq: + image: rabbitmq:3-management-alpine + pull_policy: always + container_name: rabbitmq + restart: unless-stopped + ports: + - "{{rabbitmq_amqp_port}}:5672" + - "{{rabbitmq_management_port}}:15672" + volumes: + - rabbitmq_data:/var/lib/rabbitmq + environment: + RABBITMQ_DEFAULT_USER: "{{rabbitmq_user}}" + RABBITMQ_DEFAULT_PASS: "{{rabbitmq_password}}" + logging: + driver: syslog + options: + syslog-address: "tcp://127.0.0.1:{{rabbitmq_syslog_port}}" + syslog-format: "{{syslog_format}}" + tag: "rabbitmq" + +volumes: + rabbitmq_data: diff --git a/ansible/roles/incus_storage_bucket/defaults/main.yml b/ansible/roles/incus_storage_bucket/defaults/main.yml new file mode 100644 index 0000000..abbf0b4 --- /dev/null +++ b/ansible/roles/incus_storage_bucket/defaults/main.yml @@ -0,0 +1,17 @@ +--- +# Default variables for incus_storage_bucket role + +# Incus configuration (should match terraform.tfvars) +storage_pool: default +project_name: agathos +bucket_role: admin + +# Service-specific variables (must be provided) +# bucket_name: casdoor +# service_name: casdoor + +# Path to S3 credentials vault file (separate from main vault) +s3_credentials_file: "{{ playbook_dir }}/inventory/group_vars/all/s3_vault.yml" + +# Task selection +task_action: create # create, regenerate, or remove diff --git a/ansible/roles/incus_storage_bucket/meta/main.yml 
b/ansible/roles/incus_storage_bucket/meta/main.yml new file mode 100644 index 0000000..57b1647 --- /dev/null +++ b/ansible/roles/incus_storage_bucket/meta/main.yml @@ -0,0 +1,22 @@ +--- +# Role metadata and dependencies + +galaxy_info: + author: Agathos Project + description: Manages Incus S3-compatible storage buckets with Ansible Vault credential storage + license: MIT + min_ansible_version: "2.9" + platforms: + - name: Ubuntu + versions: + - noble + - plucky + - questing + +dependencies: [] + +# Requirements: +# - User running the playbook must be a member of the 'incus' group +# - Incus CLI must be configured and accessible +# - ANSIBLE_VAULT_PASSWORD_FILE environment variable must be set +# - ansible-vault command must be available in PATH diff --git a/ansible/roles/incus_storage_bucket/tasks/main.yml b/ansible/roles/incus_storage_bucket/tasks/main.yml new file mode 100644 index 0000000..52c4f7f --- /dev/null +++ b/ansible/roles/incus_storage_bucket/tasks/main.yml @@ -0,0 +1,92 @@ +--- +# Main task file for incus_storage_bucket role +# Creates Incus S3 buckets and outputs credentials to console + +- name: Validate required variables + ansible.builtin.assert: + that: + - bucket_name is defined + fail_msg: "Required variable not defined: bucket_name" + +- name: Check if bucket already exists + ansible.builtin.command: + cmd: incus storage bucket list {{ storage_pool }} --project={{ project_name }} --format=json + register: bucket_list + changed_when: false + failed_when: false + +- name: Parse bucket list + ansible.builtin.set_fact: + existing_buckets: "{{ bucket_list.stdout | from_json | map(attribute='name') | list }}" + when: bucket_list.rc == 0 + +- name: Create storage bucket + ansible.builtin.command: + cmd: > + incus storage bucket create {{ storage_pool }} {{ bucket_name }} + --project={{ project_name }} + when: bucket_name not in (existing_buckets | default([])) + register: bucket_created + +- name: Set key name + ansible.builtin.set_fact: + key_name: 
"{{ bucket_name }}-access" + +- name: Check if bucket key already exists + ansible.builtin.command: + cmd: > + incus storage bucket key list {{ storage_pool }} {{ bucket_name }} + --project={{ project_name }} --format=json + register: key_list + changed_when: false + failed_when: false + +- name: Parse key list + ansible.builtin.set_fact: + existing_keys: "{{ key_list.stdout | from_json | map(attribute='name') | list }}" + when: key_list.rc == 0 + +- name: Create bucket access key + ansible.builtin.command: + cmd: > + incus storage bucket key create {{ storage_pool }} {{ bucket_name }} {{ key_name }} + --role={{ bucket_role }} --project={{ project_name }} + register: key_created + when: key_name not in (existing_keys | default([])) + +- name: Show bucket key (for existing key) + ansible.builtin.command: + cmd: > + incus storage bucket key show {{ storage_pool }} {{ bucket_name }} {{ key_name }} + --project={{ project_name }} + register: key_show + changed_when: false + when: key_name in (existing_keys | default([])) + +- name: Parse credentials from YAML output + ansible.builtin.set_fact: + bucket_credentials: "{{ (key_created.stdout | default(key_show.stdout)) | from_yaml }}" + +- name: Get bucket info for endpoint + ansible.builtin.command: + cmd: > + incus storage bucket show {{ storage_pool }} {{ bucket_name }} + --project={{ project_name }} + register: bucket_info + changed_when: false + +- name: Parse bucket info from YAML + ansible.builtin.set_fact: + bucket_data: "{{ bucket_info.stdout | from_yaml }}" + +- name: Display S3 bucket credentials + ansible.builtin.debug: + msg: + - "============================================" + - "S3 BUCKET PROVISIONED: {{ bucket_name }}" + - "============================================" + - "Endpoint: {{ bucket_data.s3_url }}" + - "Bucket: {{ bucket_name }}" + - "Access Key: {{ bucket_credentials['access-key'] }}" + - "Secret Key: {{ bucket_credentials['secret-key'] }}" + - "============================================" diff 
--git a/ansible/roles/incus_storage_bucket/tasks/regenerate.yml b/ansible/roles/incus_storage_bucket/tasks/regenerate.yml new file mode 100644 index 0000000..3081c36 --- /dev/null +++ b/ansible/roles/incus_storage_bucket/tasks/regenerate.yml @@ -0,0 +1,58 @@ +--- +# Regenerate bucket access key - outputs new credentials to console +# Use with caution - invalidates existing credentials + +- name: Validate required variables + ansible.builtin.assert: + that: + - bucket_name is defined + fail_msg: "Required variable not defined: bucket_name" + +- name: Set key name + ansible.builtin.set_fact: + key_name: "{{ bucket_name }}-access" + +- name: Delete existing bucket key + ansible.builtin.command: + cmd: > + incus storage bucket key delete {{ storage_pool }} {{ bucket_name }} {{ key_name }} + --project={{ project_name }} + register: key_deleted + failed_when: false + +- name: Create new bucket access key + ansible.builtin.command: + cmd: > + incus storage bucket key create {{ storage_pool }} {{ bucket_name }} {{ key_name }} + --role={{ bucket_role }} --project={{ project_name }} + register: key_created + +- name: Parse new credentials from text output + ansible.builtin.set_fact: + bucket_credentials: + access-key: "{{ key_created.stdout | regex_search('Access key: (.+)', '\\1') | first }}" + secret-key: "{{ key_created.stdout | regex_search('Secret key: (.+)', '\\1') | first }}" + +- name: Get bucket info for endpoint + ansible.builtin.command: + cmd: > + incus storage bucket show {{ storage_pool }} {{ bucket_name }} + --project={{ project_name }} + register: bucket_info + changed_when: false + +- name: Parse bucket info from YAML + ansible.builtin.set_fact: + bucket_data: "{{ bucket_info.stdout | from_yaml }}" + +- name: Display new S3 bucket credentials + ansible.builtin.debug: + msg: + - "============================================" + - "S3 BUCKET KEY REGENERATED: {{ bucket_name }}" + - "============================================" + - "Endpoint: {{ 
bucket_data.s3_url }}" + - "Bucket: {{ bucket_name }}" + - "New Access Key: {{ bucket_credentials['access-key'] }}" + - "New Secret Key: {{ bucket_credentials['secret-key'] }}" + - "============================================" diff --git a/ansible/roles/incus_storage_bucket/tasks/remove.yml b/ansible/roles/incus_storage_bucket/tasks/remove.yml new file mode 100644 index 0000000..5374b77 --- /dev/null +++ b/ansible/roles/incus_storage_bucket/tasks/remove.yml @@ -0,0 +1,48 @@ +--- +# Remove bucket - outputs confirmation to console +# Use with extreme caution - data loss is permanent + +- name: Validate required variables + ansible.builtin.assert: + that: + - bucket_name is defined + fail_msg: "Required variable not defined: bucket_name" + +- name: Set key name + ansible.builtin.set_fact: + key_name: "{{ bucket_name }}-access" + +- name: Confirm deletion + ansible.builtin.pause: + prompt: "WARNING: This will permanently delete bucket '{{ bucket_name }}' and all its data. Type 'yes' to continue" + register: confirm_delete + +- name: Abort if not confirmed + ansible.builtin.fail: + msg: "Deletion aborted by user" + when: confirm_delete.user_input != 'yes' + +- name: Delete bucket key + ansible.builtin.command: + cmd: > + incus storage bucket key delete {{ storage_pool }} {{ bucket_name }} {{ key_name }} + --project={{ project_name }} + register: key_deleted + failed_when: false + +- name: Delete storage bucket + ansible.builtin.command: + cmd: > + incus storage bucket delete {{ storage_pool }} {{ bucket_name }} + --project={{ project_name }} + register: bucket_deleted + +- name: Display removal confirmation + ansible.builtin.debug: + msg: + - "============================================" + - "S3 BUCKET REMOVED: {{ bucket_name }}" + - "============================================" + - "Remember to remove credentials from vault.yml" + - "============================================" + when: bucket_deleted is succeeded diff --git a/ansible/s3_provision.yml 
b/ansible/s3_provision.yml new file mode 100644 index 0000000..f943e4a --- /dev/null +++ b/ansible/s3_provision.yml @@ -0,0 +1,28 @@ +--- +# Provision S3 Storage Bucket +# Creates Incus S3-compatible storage bucket and outputs credentials to console +# +# Prerequisites: +# - User must be member of 'incus' group +# - Incus must be configured and accessible +# +# Usage: +# ansible-playbook s3_provision.yml -e bucket_name=casdoor + +- name: Provision S3 Storage Bucket + hosts: localhost + connection: local + gather_facts: false + + pre_tasks: + - name: Validate required parameters + ansible.builtin.assert: + that: + - bucket_name is defined + - bucket_name | length > 0 + fail_msg: | + Missing required parameters. Usage: + ansible-playbook s3_provision.yml -e bucket_name= + + roles: + - role: incus_storage_bucket diff --git a/ansible/s3_regenerate_key.yml b/ansible/s3_regenerate_key.yml new file mode 100644 index 0000000..ff41fac --- /dev/null +++ b/ansible/s3_regenerate_key.yml @@ -0,0 +1,34 @@ +--- +# Regenerate S3 Bucket Access Key +# Rotates credentials for an existing Incus S3 bucket +# +# WARNING: This will invalidate existing credentials immediately +# +# Prerequisites: +# - User must be member of 'incus' group +# - Incus must be configured and accessible +# - Bucket must already exist +# +# Usage: +# ansible-playbook s3_regenerate_key.yml -e bucket_name=casdoor + +- name: Regenerate S3 Bucket Access Key + hosts: localhost + connection: local + gather_facts: false + + pre_tasks: + - name: Validate required parameters + ansible.builtin.assert: + that: + - bucket_name is defined + - bucket_name | length > 0 + fail_msg: | + Missing required parameters. 
Usage: + ansible-playbook s3_regenerate_key.yml -e bucket_name= + + tasks: + - name: Include regenerate tasks + ansible.builtin.include_role: + name: incus_storage_bucket + tasks_from: regenerate diff --git a/ansible/s3_remove.yml b/ansible/s3_remove.yml new file mode 100644 index 0000000..4776186 --- /dev/null +++ b/ansible/s3_remove.yml @@ -0,0 +1,33 @@ +--- +# Remove S3 Storage Bucket +# Permanently deletes Incus S3 bucket +# +# WARNING: This operation cannot be undone. All data in the bucket will be lost. +# +# Prerequisites: +# - User must be member of 'incus' group +# - Incus must be configured and accessible +# +# Usage: +# ansible-playbook s3_remove.yml -e bucket_name=casdoor + +- name: Remove S3 Storage Bucket + hosts: localhost + connection: local + gather_facts: false + + pre_tasks: + - name: Validate required parameters + ansible.builtin.assert: + that: + - bucket_name is defined + - bucket_name | length > 0 + fail_msg: | + Missing required parameters. Usage: + ansible-playbook s3_remove.yml -e bucket_name= + + tasks: + - name: Include remove tasks + ansible.builtin.include_role: + name: incus_storage_bucket + tasks_from: remove diff --git a/ansible/sandbox_down.yml b/ansible/sandbox_down.yml new file mode 100644 index 0000000..4d60290 --- /dev/null +++ b/ansible/sandbox_down.yml @@ -0,0 +1,27 @@ +--- +- name: Stop Agathos Sandbox Uranian Hosts + hosts: localhost + gather_facts: false + vars: + uranian_hosts: + - oberon + - portia + - ariel + - puck + - miranda + - sycorax + - prospero + - rosalind + - titania + tasks: + - name: Stop Uranian host containers + ansible.builtin.command: + cmd: incus stop {{ item }} --project agathos + loop: "{{ uranian_hosts }}" + register: stop_result + failed_when: stop_result.rc != 0 and 'not running' not in stop_result.stderr + changed_when: stop_result.rc == 0 + + - name: Display completion message + ansible.builtin.debug: + msg: "🐾 Uranian hosts stopped gracefully" \ No newline at end of file diff --git 
a/ansible/sandbox_up.yml b/ansible/sandbox_up.yml new file mode 100644 index 0000000..f7365d5 --- /dev/null +++ b/ansible/sandbox_up.yml @@ -0,0 +1,39 @@ +--- +- name: Start Agathos Sandbox Uranian Hosts + hosts: localhost + gather_facts: false + vars: + uranian_hosts: + - oberon + - portia + - ariel + - puck + - miranda + - sycorax + - prospero + - rosalind + - titania + tasks: + - name: Start Uranian host containers + ansible.builtin.command: + cmd: incus start {{ item }} --project agathos + loop: "{{ uranian_hosts }}" + register: start_result + failed_when: start_result.rc != 0 and 'already running' not in start_result.stderr + changed_when: start_result.rc == 0 + + - name: Wait for containers to be ready + ansible.builtin.pause: + seconds: 4 + + - name: Configure DNS resolution for Incus bridge + ansible.builtin.command: + cmd: "{{ item }}" + loop: + - "resolvectl dns incusbr0 10.10.0.1" + - "resolvectl domain incusbr0 '~incus'" + become: true + + - name: Display Red Panda approval message + ansible.builtin.debug: + msg: "🐾 Uranian hosts started with Red Panda approval!" 
\ No newline at end of file diff --git a/ansible/searxng/deploy.yml b/ansible/searxng/deploy.yml new file mode 100644 index 0000000..4fa6644 --- /dev/null +++ b/ansible/searxng/deploy.yml @@ -0,0 +1,134 @@ +--- +- name: Deploy SearXNG with Docker Compose + hosts: ubuntu + become: true + tasks: + - name: Check if host has searxng service + ansible.builtin.set_fact: + has_searxng_service: "{{'searxng' in services}}" + + - name: Skip hosts without searxng service + ansible.builtin.meta: end_host + when: not has_searxng_service + + - name: Create searxng group + ansible.builtin.group: + name: "{{searxng_group}}" + + - name: Create searxng user + ansible.builtin.user: + name: "{{searxng_user}}" + comment: "{{searxng_user}}" + group: "{{searxng_group}}" + system: true + + - name: Add group searxng to ansible_user + ansible.builtin.user: + name: "{{ansible_user}}" + groups: "{{searxng_group}}" + append: true + + - name: Create searxng directory + ansible.builtin.file: + path: "{{searxng_directory}}" + owner: "{{searxng_user}}" + group: "{{searxng_group}}" + state: directory + mode: '750' + + - name: Template configuration files + ansible.builtin.template: + src: "{{item.src}}" + dest: "{{searxng_directory}}/{{item.dest}}" + owner: "{{searxng_user}}" + group: "{{searxng_group}}" + mode: '550' + loop: + - src: "docker-compose.yml.j2" + dest: "docker-compose.yml" + - src: "searxng-settings.yml.j2" + dest: "searxng-settings.yml" + + - name: Reset SSH connection to apply group changes + meta: reset_connection + + - name: Start SearXNG service + community.docker.docker_compose_v2: + project_src: "{{searxng_directory}}" + state: present + pull: always + + # =========================================================================== + # OAuth2-Proxy Sidecar + # Note: Each host supports at most one OAuth2-Proxy sidecar instance + # (binary shared at /usr/local/bin/oauth2-proxy, unique systemd unit per service) + # 
=========================================================================== + - name: Create oauth2-proxy directory + ansible.builtin.file: + path: "{{ searxng_oauth2_proxy_dir }}" + owner: root + group: root + state: directory + mode: '0755' + + - name: Download oauth2-proxy binary + ansible.builtin.get_url: + url: "https://github.com/oauth2-proxy/oauth2-proxy/releases/download/v{{ searxng_oauth2_proxy_version }}/oauth2-proxy-v{{ searxng_oauth2_proxy_version }}.linux-amd64.tar.gz" + dest: "/tmp/oauth2-proxy-v{{ searxng_oauth2_proxy_version }}.tar.gz" + mode: '0644' + + - name: Extract oauth2-proxy binary + ansible.builtin.unarchive: + src: "/tmp/oauth2-proxy-v{{ searxng_oauth2_proxy_version }}.tar.gz" + dest: /tmp + remote_src: true + creates: "/tmp/oauth2-proxy-v{{ searxng_oauth2_proxy_version }}.linux-amd64/oauth2-proxy" + + - name: Install oauth2-proxy binary + ansible.builtin.copy: + src: "/tmp/oauth2-proxy-v{{ searxng_oauth2_proxy_version }}.linux-amd64/oauth2-proxy" + dest: /usr/local/bin/oauth2-proxy + owner: root + group: root + mode: '0755' + remote_src: true + + - name: Template oauth2-proxy configuration + ansible.builtin.template: + src: oauth2-proxy-searxng.cfg.j2 + dest: "{{ searxng_oauth2_proxy_dir }}/oauth2-proxy.cfg" + owner: root + group: root + mode: '0600' + notify: restart oauth2-proxy-searxng + + - name: Template oauth2-proxy systemd service + ansible.builtin.template: + src: oauth2-proxy-searxng.service.j2 + dest: /etc/systemd/system/oauth2-proxy-searxng.service + owner: root + group: root + mode: '0644' + notify: + - reload systemd + - restart oauth2-proxy-searxng + + # =========================================================================== + # Service Management + # =========================================================================== + - name: Enable and start OAuth2-Proxy service + ansible.builtin.systemd: + name: oauth2-proxy-searxng + enabled: true + state: started + daemon_reload: true + + handlers: + - name: reload systemd 
+ ansible.builtin.systemd: + daemon_reload: true + + - name: restart oauth2-proxy-searxng + ansible.builtin.systemd: + name: oauth2-proxy-searxng + state: restarted diff --git a/ansible/searxng/docker-compose.yml.j2 b/ansible/searxng/docker-compose.yml.j2 new file mode 100644 index 0000000..b9fa302 --- /dev/null +++ b/ansible/searxng/docker-compose.yml.j2 @@ -0,0 +1,18 @@ +services: + searxng: + image: searxng/searxng:latest + pull_policy: always + container_name: searxng + ports: + - "{{ searxng_port }}:8080" + volumes: + - ./searxng-settings.yml:/etc/searxng/settings.yml:ro + restart: unless-stopped + logging: + driver: syslog + options: + syslog-address: "tcp://127.0.0.1:{{searxng_syslog_port}}" + syslog-format: "{{syslog_format}}" + tag: "searxng" + +# Red Panda Approved SearXNG Configuration \ No newline at end of file diff --git a/ansible/searxng/oauth2-proxy-searxng.cfg.j2 b/ansible/searxng/oauth2-proxy-searxng.cfg.j2 new file mode 100644 index 0000000..e38b501 --- /dev/null +++ b/ansible/searxng/oauth2-proxy-searxng.cfg.j2 @@ -0,0 +1,70 @@ +# OAuth2-Proxy Configuration for SearXNG +# Authenticates users via Casdoor OIDC before proxying to SearXNG +# Red Panda Approved + +# Provider Configuration (Casdoor OIDC) +provider = "oidc" +provider_display_name = "Casdoor" +oidc_issuer_url = "{{ searxng_oauth2_oidc_issuer_url }}" +client_id = "{{ searxng_oauth2_client_id }}" +client_secret = "{{ searxng_oauth2_client_secret }}" + +# Redirect URL after authentication +redirect_url = "{{ searxng_oauth2_redirect_url }}" + +# Upstream service (SearXNG on localhost) +upstreams = [ + "http://127.0.0.1:{{ searxng_port }}" +] + +# Session/Cookie Configuration +cookie_secret = "{{ searxng_oauth2_cookie_secret }}" +cookie_name = "_oauth2_proxy_searxng" +cookie_secure = true +cookie_httponly = true +cookie_samesite = "lax" +cookie_domains = [ + ".{{ searxng_domain }}" +] + +# Authentication settings +email_domains = ["*"] +oidc_email_claim = "email" +oidc_groups_claim = 
"groups" + +# Session settings +session_store_type = "cookie" +cookie_expire = "168h" +cookie_refresh = "1h" + +# Request settings - pass user info to SearXNG +pass_access_token = false +pass_authorization_header = false +set_authorization_header = false +set_xauthrequest = true + +# Logging +request_logging = true +auth_logging = true +standard_logging = true + +# Network settings +http_address = "0.0.0.0:{{ searxng_proxy_port }}" +reverse_proxy = true +real_client_ip_header = "X-Forwarded-For" + +# Skip authentication for health check endpoints +skip_auth_routes = [ + "^/healthz$", + "^/ping$" +] + +# OIDC specific settings +skip_provider_button = true +oidc_extra_audiences = [] +insecure_oidc_allow_unverified_email = true +cookie_csrf_per_request = true +cookie_csrf_expire = "5m" + +# SSL verification (internal Casdoor uses valid certs) +ssl_insecure_skip_verify = false diff --git a/ansible/searxng/oauth2-proxy-searxng.service.j2 b/ansible/searxng/oauth2-proxy-searxng.service.j2 new file mode 100644 index 0000000..0c01fc9 --- /dev/null +++ b/ansible/searxng/oauth2-proxy-searxng.service.j2 @@ -0,0 +1,23 @@ +[Unit] +Description=OAuth2-Proxy for SearXNG +After=network.target docker.service +Wants=docker.service + +[Service] +Type=simple +ExecStart=/usr/local/bin/oauth2-proxy --config={{ searxng_oauth2_proxy_dir }}/oauth2-proxy.cfg + +Restart=on-failure +RestartSec=5 + +# Security hardening +NoNewPrivileges=true +PrivateTmp=true + +# Logging +StandardOutput=journal +StandardError=journal +SyslogIdentifier=oauth2-proxy-searxng + +[Install] +WantedBy=multi-user.target diff --git a/ansible/searxng/searxng-settings.yml.j2 b/ansible/searxng/searxng-settings.yml.j2 new file mode 100644 index 0000000..986eee0 --- /dev/null +++ b/ansible/searxng/searxng-settings.yml.j2 @@ -0,0 +1,42 @@ +use_default_settings: true + +general: + instance_name: "{{ searxng_instance_name }}" + contact_url: false + enable_metrics: false + +search: + safe_search: 0 + autocomplete: "google" + 
default_lang: "en" + formats: + - html + - json + +server: + port: 8080 + bind_address: "0.0.0.0" + secret_key: "{{ searxng_secret_key }}" + base_url: "{{ searxng_base_url }}" + limiter: true + public_instance: false + method: "GET" + image_proxy: true + +ui: + static_use_hash: true + default_theme: simple + default_locale: "" + theme_args: + simple_style: auto + +# Red Panda Approved Search Configuration +engines: + - name: google + disabled: false + - name: duckduckgo + disabled: false + - name: bing + disabled: false + - name: startpage + disabled: false \ No newline at end of file diff --git a/ansible/searxng/settings.yml b/ansible/searxng/settings.yml new file mode 100644 index 0000000..be0d825 --- /dev/null +++ b/ansible/searxng/settings.yml @@ -0,0 +1,2778 @@ +general: + # Debug mode, only for development. Is overwritten by ${SEARXNG_DEBUG} + debug: false + # displayed name + instance_name: "SearXNG" + # For example: https://example.com/privacy + privacypolicy_url: false + # use true to use your own donation page written in searx/info/en/donate.md + # use false to disable the donation link + donation_url: false + # mailto:contact@example.com + contact_url: false + # record stats + enable_metrics: true + # expose stats in open metrics format at /metrics + # leave empty to disable (no password set) + # open_metrics: + open_metrics: '' + +brand: + new_issue_url: https://github.com/searxng/searxng/issues/new + docs_url: https://docs.searxng.org/ + public_instances: https://searx.space + wiki_url: https://github.com/searxng/searxng/wiki + issue_url: https://github.com/searxng/searxng/issues + # custom: + # # Custom entries in the footer: [title]: [link] + # links: + # Uptime: https://uptime.searxng.org/history/darmarit-org + # About: "https://searxng.org" + +search: + # Filter results. 
0: None, 1: Moderate, 2: Strict
+  safe_search: 0
+  # Existing autocomplete backends: "360search", "baidu", "brave", "dbpedia", "duckduckgo", "google", "yandex",
+  # "mwmbl", "naver", "seznam", "sogou", "startpage", "stract", "swisscows", "quark", "qwant", "wikipedia" -
+  # leave blank to turn it off by default.
+  autocomplete: ""
+  # minimum characters to type before autocompleter starts
+  autocomplete_min: 4
+  # backend for the favicon near URL in search results.
+  # Available resolvers: "allesedv", "duckduckgo", "google", "yandex" - leave blank to turn it off by default.
+  favicon_resolver: ""
+  # Default search language - leave blank to detect from browser information or
+  # use codes from 'languages.py'
+  default_lang: "auto"
+  # max_page: 0 # if engine supports paging, 0 means an unlimited number of pages
+  # Available languages
+  # languages:
+  #   - all
+  #   - en
+  #   - en-US
+  #   - de
+  #   - it-IT
+  #   - fr
+  #   - fr-BE
+  # ban time in seconds after engine errors
+  ban_time_on_fail: 5
+  # max ban time in seconds after engine errors
+  max_ban_time_on_fail: 120
+  suspended_times:
+    # Engine suspension time after error (in seconds; set to 0 to disable)
+    # For error "Access denied" and "HTTP error [402, 403]"
+    SearxEngineAccessDenied: 86400
+    # For error "CAPTCHA"
+    SearxEngineCaptcha: 86400
+    # For error "Too many requests" and "HTTP error 429"
+    SearxEngineTooManyRequests: 3600
+    # Cloudflare CAPTCHA
+    cf_SearxEngineCaptcha: 1296000
+    cf_SearxEngineAccessDenied: 86400
+    # ReCAPTCHA
+    recaptcha_SearxEngineCaptcha: 604800
+
+  # remove format to deny access, use lower case.
+  # formats: [html, csv, json, rss]
+  formats:
+    - html
+
+server:
+  # Is overwritten by ${SEARXNG_PORT} and ${SEARXNG_BIND_ADDRESS}
+  port: 8888
+  bind_address: "127.0.0.1"
+  # public URL of the instance, to ensure correct inbound links. Is overwritten
+  # by ${SEARXNG_BASE_URL}.
+  base_url: false  # "http://example.com/location"
+  # rate limit the number of requests on the instance, block some bots.
+  # Is overwritten by ${SEARXNG_LIMITER}
+  limiter: false
+  # enable features designed only for public instances.
+  # Is overwritten by ${SEARXNG_PUBLIC_INSTANCE}
+  public_instance: false
+
+  # If your instance owns a /etc/searxng/settings.yml file, then set the following
+  # values there.
+
+  secret_key: "ultrasecretkey"  # Is overwritten by ${SEARXNG_SECRET}
+  # Proxy image results through SearXNG. Is overwritten by ${SEARXNG_IMAGE_PROXY}
+  image_proxy: false
+  # 1.0 and 1.1 are supported
+  http_protocol_version: "1.0"
+  # POST queries are "more secure!" but are also the source of hard-to-locate
+  # annoyances, which is why GET may be better for end users and their browsers.
+  # see https://github.com/searxng/searxng/pull/3619
+  # Is overwritten by ${SEARXNG_METHOD}
+  method: "POST"
+  default_http_headers:
+    X-Content-Type-Options: nosniff
+    X-Download-Options: noopen
+    X-Robots-Tag: noindex, nofollow
+    Referrer-Policy: no-referrer
+
+valkey:
+  # URL to connect valkey database. Is overwritten by ${SEARXNG_VALKEY_URL}.
+  # https://docs.searxng.org/admin/settings/settings_valkey.html#settings-valkey
+  # url: valkey://localhost:6379/0
+  url: false
+
+ui:
+  # Custom static path - leave it blank if you didn't change
+  static_path: ""
+  # Custom templates path - leave it blank if you didn't change
+  templates_path: ""
+  # query_in_title: When true, the result page's titles contain the query.
+  # It decreases privacy, since the browser can record the page titles.
+  query_in_title: false
+  # ui theme
+  default_theme: simple
+  # center the results ?
+  center_alignment: false
+  # URL prefix of the internet archive, don't forget the trailing slash (if needed).
+ # cache_url: "https://webcache.googleusercontent.com/search?q=cache:" + # Default interface locale - leave blank to detect from browser information or + # use codes from the 'locales' config section + default_locale: "" + # Open result links in a new tab by default + # results_on_new_tab: false + theme_args: + # style of simple theme: auto, light, dark, black + simple_style: auto + # Perform search immediately if a category selected. + # Disable to select multiple categories at once and start the search manually. + search_on_category_select: true + # Hotkeys: default or vim + hotkeys: default + # URL formatting: pretty, full or host + url_formatting: pretty + +# Lock arbitrary settings on the preferences page. +# +# preferences: +# lock: +# - categories +# - language +# - autocomplete +# - favicon +# - safesearch +# - method +# - doi_resolver +# - locale +# - theme +# - results_on_new_tab +# - search_on_category_select +# - method +# - image_proxy +# - query_in_title + +# communication with search engines +# +outgoing: + # default timeout in seconds, can be override by engine + request_timeout: 3.0 + # the maximum timeout in seconds + # max_request_timeout: 10.0 + # suffix of searxng_useragent, could contain information like an email address + # to the administrator + useragent_suffix: "" + # The maximum number of concurrent connections that may be established. + pool_connections: 100 + # Allow the connection pool to maintain keep-alive connections below this + # point. 
+ pool_maxsize: 20 + # See https://www.python-httpx.org/http2/ + enable_http2: true + # uncomment below section if you want to use a custom server certificate + # see https://www.python-httpx.org/advanced/#changing-the-verification-defaults + # and https://www.python-httpx.org/compatibility/#ssl-configuration + # verify: ~/.mitmproxy/mitmproxy-ca-cert.cer + # + # uncomment below section if you want to use a proxyq see: SOCKS proxies + # https://2.python-requests.org/en/latest/user/advanced/#proxies + # are also supported: see + # https://2.python-requests.org/en/latest/user/advanced/#socks + # + # proxies: + # all://: + # - http://proxy1:8080 + # - http://proxy2:8080 + # + # using_tor_proxy: true + # + # Extra seconds to add in order to account for the time taken by the proxy + # + # extra_proxy_timeout: 10 + # + # uncomment below section only if you have more than one network interface + # which can be the source of outgoing search requests + # + # source_ips: + # - 1.1.1.1 + # - 1.1.1.2 + # - fe80::/126 + + +# Plugin configuration, for more details see +# https://docs.searxng.org/admin/settings/settings_plugins.html +# +plugins: + + searx.plugins.calculator.SXNGPlugin: + active: true + + searx.plugins.infinite_scroll.SXNGPlugin: + active: false + + searx.plugins.hash_plugin.SXNGPlugin: + active: true + + searx.plugins.self_info.SXNGPlugin: + active: true + + searx.plugins.unit_converter.SXNGPlugin: + active: true + + searx.plugins.ahmia_filter.SXNGPlugin: + active: true + + searx.plugins.hostnames.SXNGPlugin: + active: true + + searx.plugins.time_zone.SXNGPlugin: + active: true + + searx.plugins.oa_doi_rewrite.SXNGPlugin: + active: false + + searx.plugins.tor_check.SXNGPlugin: + active: false + + searx.plugins.tracker_url_remover.SXNGPlugin: + active: true + + +# Configuration of the "Hostnames plugin": +# +# hostnames: +# replace: +# '(.*\.)?youtube\.com$': 'yt.example.com' +# '(.*\.)?youtu\.be$': 'yt.example.com' +# '(.*\.)?reddit\.com$': 'teddit.example.com' 
+# '(.*\.)?redd\.it$': 'teddit.example.com' +# '(www\.)?twitter\.com$': 'nitter.example.com' +# remove: +# - '(.*\.)?facebook.com$' +# low_priority: +# - '(.*\.)?google(\..*)?$' +# high_priority: +# - '(.*\.)?wikipedia.org$' +# +# Alternatively you can use external files for configuring the "Hostnames plugin": +# +# hostnames: +# replace: 'rewrite-hosts.yml' +# +# Content of 'rewrite-hosts.yml' (place the file in the same directory as 'settings.yml'): +# '(.*\.)?youtube\.com$': 'yt.example.com' +# '(.*\.)?youtu\.be$': 'yt.example.com' +# + +checker: + # disable checker when in debug mode + off_when_debug: true + + # use "scheduling: {}" to disable scheduling + # scheduling: interval or int + + # to activate the scheduler: + # * uncomment "scheduling" section + # * add "cache2 = name=searxngcache,items=2000,blocks=2000,blocksize=4096,bitmap=1" + # to your uwsgi.ini + + # scheduling: + # start_after: [300, 1800] # delay to start the first run of the checker + # every: [86400, 90000] # how often the checker runs + + # additional tests: only for the YAML anchors (see the engines section) + # + additional_tests: + rosebud: &test_rosebud + matrix: + query: rosebud + lang: en + result_container: + - not_empty + - ['one_title_contains', 'citizen kane'] + test: + - unique_results + + android: &test_android + matrix: + query: ['android'] + lang: ['en', 'de', 'fr', 'zh-CN'] + result_container: + - not_empty + - ['one_title_contains', 'google'] + test: + - unique_results + + # tests: only for the YAML anchors (see the engines section) + tests: + infobox: &tests_infobox + infobox: + matrix: + query: ["linux", "new york", "bbc"] + result_container: + - has_infobox + +categories_as_tabs: + general: + images: + videos: + news: + map: + music: + it: + science: + files: + social media: + +engines: + - name: 360search + engine: 360search + shortcut: 360so + disabled: true + + - name: 360search videos + engine: 360search_videos + shortcut: 360sov + disabled: true + + - name: 9gag + 
engine: 9gag + shortcut: 9g + disabled: true + + - name: acfun + engine: acfun + shortcut: acf + disabled: true + + - name: adobe stock + engine: adobe_stock + shortcut: asi + categories: ["images"] + # https://docs.searxng.org/dev/engines/online/adobe_stock.html + adobe_order: relevance + adobe_content_types: ["photo", "illustration", "zip_vector", "template", "3d", "image"] + timeout: 6 + disabled: true + + - name: adobe stock video + engine: adobe_stock + shortcut: asv + network: adobe stock + categories: ["videos"] + adobe_order: relevance + adobe_content_types: ["video"] + timeout: 6 + disabled: true + + - name: adobe stock audio + engine: adobe_stock + shortcut: asa + network: adobe stock + categories: ["music"] + adobe_order: relevance + adobe_content_types: ["audio"] + timeout: 6 + disabled: true + + - name: astrophysics data system + engine: astrophysics_data_system + shortcut: ads + # read https://docs.searxng.org/dev/engines/online/astrophysics_data_system.html + api_key: "" + inactive: true + + - name: alpine linux packages + engine: alpinelinux + disabled: true + shortcut: alp + + - name: annas archive + engine: annas_archive + disabled: true + shortcut: aa + timeout: 5 + + - name: ansa + engine: ansa + shortcut: ans + disabled: true + + # - name: annas articles + # engine: annas_archive + # shortcut: aaa + # # https://docs.searxng.org/dev/engines/online/annas_archive.html + # aa_content: 'magazine' # book_fiction, book_unknown, book_nonfiction, book_comic + # aa_ext: 'pdf' # pdf, epub, .. 
+ # aa_sort: oldest' # newest, oldest, largest, smallest + + - name: apk mirror + engine: apkmirror + timeout: 4.0 + shortcut: apkm + disabled: true + + - name: apple app store + engine: apple_app_store + shortcut: aps + disabled: true + + # Requires Tor + - name: ahmia + engine: ahmia + categories: onions + enable_http: true + shortcut: ah + + - name: anaconda + engine: xpath + paging: true + first_page_num: 0 + search_url: https://anaconda.org/search?q={query}&page={pageno} + results_xpath: //tbody/tr + url_xpath: ./td/h5/a[last()]/@href + title_xpath: ./td/h5 + content_xpath: ./td[h5]/text() + categories: it + timeout: 6.0 + shortcut: conda + disabled: true + + - name: arch linux wiki + engine: archlinux + shortcut: al + + - name: nixos wiki + engine: mediawiki + shortcut: nixw + base_url: https://wiki.nixos.org/ + search_type: text + disabled: true + categories: [it, software wikis] + + - name: artic + engine: artic + shortcut: arc + timeout: 4.0 + + - name: arxiv + engine: arxiv + shortcut: arx + + - name: ask + engine: ask + shortcut: ask + disabled: true + + # - name: azure + # engine: azure + # shortcut: az + # categories: [it, cloud] + # azure_tenant_id: "your_tenant_id" + # azure_client_id: "your_client_id" + # azure_client_secret: "your_client_secret" + # disabled: true + + # tmp suspended: dh key too small + # - name: base + # engine: base + # shortcut: bs + + - name: bandcamp + engine: bandcamp + shortcut: bc + categories: music + + - name: baidu + baidu_category: general + categories: [general] + engine: baidu + shortcut: bd + disabled: true + + - name: baidu images + baidu_category: images + categories: [images] + engine: baidu + shortcut: bdi + disabled: true + + - name: baidu kaifa + baidu_category: it + categories: [it] + engine: baidu + shortcut: bdk + disabled: true + + - name: wikipedia + engine: wikipedia + shortcut: wp + # add "list" to the array to get results in the results list + display_type: ["infobox"] + categories: [general] + + - 
name: bilibili + engine: bilibili + shortcut: bil + disabled: true + + - name: bing + engine: bing + shortcut: bi + disabled: true + + - name: bing images + engine: bing_images + shortcut: bii + + - name: bing news + engine: bing_news + shortcut: bin + + - name: bing videos + engine: bing_videos + shortcut: biv + + - name: bitchute + engine: bitchute + shortcut: bit + disabled: true + + - name: bitbucket + engine: xpath + paging: true + search_url: https://bitbucket.org/repo/all/{pageno}?name={query} + url_xpath: //article[@class="repo-summary"]//a[@class="repo-link"]/@href + title_xpath: //article[@class="repo-summary"]//a[@class="repo-link"] + content_xpath: //article[@class="repo-summary"]/p + categories: [it, repos] + timeout: 4.0 + disabled: true + shortcut: bb + about: + website: https://bitbucket.org/ + wikidata_id: Q2493781 + official_api_documentation: https://developer.atlassian.com/bitbucket + use_official_api: false + require_api_key: false + results: HTML + + - name: bpb + engine: bpb + shortcut: bpb + disabled: true + + - name: btdigg + engine: btdigg + shortcut: bt + disabled: true + + - name: openverse + engine: openverse + categories: images + shortcut: opv + + - name: media.ccc.de + engine: ccc_media + shortcut: c3tv + # We don't set language: de here because media.ccc.de is not just + # for a German audience. It contains many English videos and many + # German videos have English subtitles. 
+ disabled: true + + - name: chefkoch + engine: chefkoch + shortcut: chef + # to show premium or plus results too: + # skip_premium: false + + # WARNING: links from chinaso.com voilate users privacy + # Before activate these engines its mandatory to read + # - https://github.com/searxng/searxng/issues/4694 + # - https://docs.searxng.org/dev/engines/online/chinaso.html + + - name: chinaso news + engine: chinaso + shortcut: chinaso + categories: [news] + chinaso_category: news + chinaso_news_source: all + disabled: true + inactive: true + + - name: chinaso images + engine: chinaso + network: chinaso news + shortcut: chinasoi + categories: [images] + chinaso_category: images + disabled: true + inactive: true + + - name: chinaso videos + engine: chinaso + network: chinaso news + shortcut: chinasov + categories: [videos] + chinaso_category: videos + disabled: true + inactive: true + + - name: cloudflareai + engine: cloudflareai + shortcut: cfai + # get api token and accont id from https://developers.cloudflare.com/workers-ai/get-started/rest-api/ + cf_account_id: 'your_cf_accout_id' + cf_ai_api: 'your_cf_api' + # create your ai gateway by https://developers.cloudflare.com/ai-gateway/get-started/creating-gateway/ + cf_ai_gateway: 'your_cf_ai_gateway_name' + # find the model name from https://developers.cloudflare.com/workers-ai/models/#text-generation + cf_ai_model: 'ai_model_name' + # custom your preferences + # cf_ai_model_display_name: 'Cloudflare AI' + # cf_ai_model_assistant: 'prompts_for_assistant_role' + # cf_ai_model_system: 'prompts_for_system_role' + timeout: 30 + disabled: true + + - name: core.ac.uk + engine: core + shortcut: cor + # read https://docs.searxng.org/dev/engines/online/core.html + api_key: "" + inactive: true + + - name: crossref + engine: crossref + shortcut: cr + timeout: 30 + disabled: true + + - name: crowdview + engine: json_engine + shortcut: cv + categories: general + paging: false + search_url: 
https://crowdview-next-js.onrender.com/api/search-v3?query={query} + results_query: results + url_query: link + title_query: title + content_query: snippet + title_html_to_text: true + content_html_to_text: true + disabled: true + about: + website: https://crowdview.ai/ + + - name: yep + engine: yep + shortcut: yep + categories: general + search_type: web + timeout: 5 + disabled: true + + - name: yep images + engine: yep + shortcut: yepi + categories: images + search_type: images + disabled: true + + - name: yep news + engine: yep + shortcut: yepn + categories: news + search_type: news + disabled: true + + - name: currency + engine: currency_convert + shortcut: cc + + - name: deezer + engine: deezer + shortcut: dz + disabled: true + + - name: destatis + engine: destatis + shortcut: destat + disabled: true + + - name: deviantart + engine: deviantart + shortcut: da + timeout: 3.0 + + - name: devicons + engine: devicons + shortcut: di + timeout: 3.0 + + - name: ddg definitions + engine: duckduckgo_definitions + shortcut: ddd + weight: 2 + disabled: true + tests: *tests_infobox + + # cloudflare protected + # - name: digbt + # engine: digbt + # shortcut: dbt + # timeout: 6.0 + # disabled: true + + - name: docker hub + engine: docker_hub + shortcut: dh + categories: [it, packages] + + - name: encyclosearch + engine: json_engine + shortcut: es + categories: general + paging: true + search_url: https://encyclosearch.org/encyclosphere/search?q={query}&page={pageno}&resultsPerPage=15 + results_query: Results + url_query: SourceURL + title_query: Title + content_query: Description + disabled: true + about: + website: https://encyclosearch.org + official_api_documentation: https://encyclosearch.org/docs/#/rest-api + use_official_api: true + require_api_key: false + results: JSON + + - name: erowid + engine: xpath + paging: true + first_page_num: 0 + page_size: 30 + search_url: https://www.erowid.org/search.php?q={query}&s={pageno} + url_xpath: 
//dl[@class="results-list"]/dt[@class="result-title"]/a/@href + title_xpath: //dl[@class="results-list"]/dt[@class="result-title"]/a/text() + content_xpath: //dl[@class="results-list"]/dd[@class="result-details"] + categories: [] + shortcut: ew + disabled: true + about: + website: https://www.erowid.org/ + wikidata_id: Q1430691 + official_api_documentation: + use_official_api: false + require_api_key: false + results: HTML + + # - name: elasticsearch + # shortcut: els + # engine: elasticsearch + # base_url: http://localhost:9200 + # username: elastic + # password: changeme + # index: my-index + # enable_http: true + # # available options: match, simple_query_string, term, terms, custom + # query_type: match + # # if query_type is set to custom, provide your query here + # # custom_query_json: {"query":{"match_all": {}}} + # # show_metadata: false + # disabled: true + + - name: wikidata + engine: wikidata + shortcut: wd + timeout: 3.0 + weight: 2 + # add "list" to the array to get results in the results list + display_type: ["infobox"] + tests: *tests_infobox + categories: [general] + + - name: duckduckgo + engine: duckduckgo + shortcut: ddg + + - name: duckduckgo images + engine: duckduckgo_extra + categories: [images, web] + ddg_category: images + shortcut: ddi + disabled: true + + - name: duckduckgo videos + engine: duckduckgo_extra + categories: [videos, web] + ddg_category: videos + shortcut: ddv + disabled: true + + - name: duckduckgo news + engine: duckduckgo_extra + categories: [news, web] + ddg_category: news + shortcut: ddn + disabled: true + + - name: duckduckgo weather + engine: duckduckgo_weather + shortcut: ddw + disabled: true + + - name: apple maps + engine: apple_maps + shortcut: apm + disabled: true + timeout: 5.0 + + - name: emojipedia + engine: emojipedia + timeout: 4.0 + shortcut: em + disabled: true + + - name: tineye + engine: tineye + shortcut: tin + timeout: 9.0 + disabled: true + + - name: etymonline + engine: xpath + paging: true + 
search_url: https://etymonline.com/search?page={pageno}&q={query} + url_xpath: //a[contains(@class, "word__name--")]/@href + title_xpath: //a[contains(@class, "word__name--")] + content_xpath: //section[contains(@class, "word__defination")] + first_page_num: 1 + shortcut: et + categories: [dictionaries] + about: + website: https://www.etymonline.com/ + wikidata_id: Q1188617 + official_api_documentation: + use_official_api: false + require_api_key: false + results: HTML + + # - name: ebay + # engine: ebay + # shortcut: eb + # base_url: 'https://www.ebay.com' + # disabled: true + # timeout: 5 + + - name: 1x + engine: www1x + shortcut: 1x + timeout: 3.0 + disabled: true + + - name: fdroid + engine: fdroid + shortcut: fd + disabled: true + + - name: findthatmeme + engine: findthatmeme + shortcut: ftm + disabled: true + + - name: flickr + categories: images + shortcut: fl + # You can use the engine using the official stable API, but you need an API + # key, see: https://www.flickr.com/services/apps/create/ + # engine: flickr + # api_key: 'apikey' # required! 
+ # Or you can use the html non-stable engine, activated by default + engine: flickr_noapi + + - name: free software directory + engine: mediawiki + shortcut: fsd + categories: [it, software wikis] + base_url: https://directory.fsf.org/ + search_type: title + timeout: 5.0 + disabled: true + about: + website: https://directory.fsf.org/ + wikidata_id: Q2470288 + + # - name: freesound + # engine: freesound + # shortcut: fnd + # disabled: true + # timeout: 15.0 + # API key required, see: https://freesound.org/docs/api/overview.html + # api_key: MyAPIkey + + - name: frinkiac + engine: frinkiac + shortcut: frk + disabled: true + + - name: fyyd + engine: fyyd + shortcut: fy + timeout: 8.0 + disabled: true + + - name: geizhals + engine: geizhals + shortcut: geiz + disabled: true + + - name: genius + engine: genius + shortcut: gen + + - name: gentoo + engine: mediawiki + shortcut: ge + categories: ["it", "software wikis"] + base_url: "https://wiki.gentoo.org/" + api_path: "api.php" + search_type: text + timeout: 10 + + - name: gitlab + engine: gitlab + base_url: https://gitlab.com + shortcut: gl + disabled: true + about: + website: https://gitlab.com/ + wikidata_id: Q16639197 + + # - name: gnome + # engine: gitlab + # base_url: https://gitlab.gnome.org + # shortcut: gn + # about: + # website: https://gitlab.gnome.org + # wikidata_id: Q44316 + + - name: github + engine: github + shortcut: gh + + - name: github code + engine: github_code + shortcut: ghc + disabled: true + ghc_auth: + # type is one of: + # * none + # * personal_access_token + # * bearer + # When none is passed, the token is not requried. 
+ type: "none" + token: "token" + # specify whether to highlight the matching lines to the query + ghc_highlight_matching_lines: true + ghc_strip_new_lines: true + ghc_strip_whitespace: false + timeout: 10.0 + + - name: codeberg + # https://docs.searxng.org/dev/engines/online/gitea.html + engine: gitea + base_url: https://codeberg.org + shortcut: cb + disabled: true + + - name: gitea.com + engine: gitea + base_url: https://gitea.com + shortcut: gitea + disabled: true + + - name: goodreads + engine: goodreads + shortcut: good + timeout: 4.0 + disabled: true + + - name: google + engine: google + shortcut: go + # additional_tests: + # android: *test_android + + - name: google images + engine: google_images + shortcut: goi + # additional_tests: + # android: *test_android + # dali: + # matrix: + # query: ['Dali Christ'] + # lang: ['en', 'de', 'fr', 'zh-CN'] + # result_container: + # - ['one_title_contains', 'Salvador'] + + - name: google news + engine: google_news + shortcut: gon + # additional_tests: + # android: *test_android + + - name: google videos + engine: google_videos + shortcut: gov + # additional_tests: + # android: *test_android + + - name: google scholar + engine: google_scholar + shortcut: gos + + - name: google play apps + engine: google_play + categories: [files, apps] + shortcut: gpa + play_categ: apps + disabled: true + + - name: google play movies + engine: google_play + categories: videos + shortcut: gpm + play_categ: movies + disabled: true + + - name: grokipedia + engine: grokipedia + shortcut: gp + disabled: true + inactive: true + + - name: material icons + engine: material_icons + shortcut: mi + disabled: true + + - name: habrahabr + engine: xpath + paging: true + search_url: https://habr.com/en/search/page{pageno}/?q={query} + results_xpath: //article[contains(@class, "tm-articles-list__item")] + url_xpath: .//a[@class="tm-title__link"]/@href + title_xpath: .//a[@class="tm-title__link"] + content_xpath: .//div[contains(@class, 
"article-formatted-body")] + categories: it + timeout: 4.0 + disabled: true + shortcut: habr + about: + website: https://habr.com/ + wikidata_id: Q4494434 + official_api_documentation: https://habr.com/en/docs/help/api/ + use_official_api: false + require_api_key: false + results: HTML + + - name: hackernews + engine: hackernews + shortcut: hn + disabled: true + + - name: hex + engine: hex + shortcut: hex + disabled: true + # Valid values: name inserted_at updated_at total_downloads recent_downloads + sort_criteria: "recent_downloads" + page_size: 10 + + - name: crates.io + engine: crates + shortcut: crates + disabled: true + timeout: 6.0 + + - name: hoogle + engine: xpath + search_url: https://hoogle.haskell.org/?hoogle={query} + results_xpath: '//div[@class="result"]' + title_xpath: './/div[@class="ans"]//a' + url_xpath: './/div[@class="ans"]//a/@href' + content_xpath: './/div[@class="from"]' + page_size: 20 + categories: [it, packages] + shortcut: ho + about: + website: https://hoogle.haskell.org/ + wikidata_id: Q34010 + official_api_documentation: https://hackage.haskell.org/api + use_official_api: false + require_api_key: false + results: JSON + + - name: il post + engine: il_post + shortcut: pst + disabled: true + + - name: huggingface + engine: huggingface + shortcut: hf + disabled: true + + - name: huggingface datasets + huggingface_endpoint: datasets + engine: huggingface + shortcut: hfd + disabled: true + + - name: huggingface spaces + huggingface_endpoint: spaces + engine: huggingface + shortcut: hfs + disabled: true + + - name: imdb + engine: imdb + shortcut: imdb + timeout: 6.0 + disabled: true + + - name: imgur + engine: imgur + shortcut: img + disabled: true + + - name: ina + engine: ina + shortcut: in + timeout: 6.0 + disabled: true + + # - name: invidious + # engine: invidious + # # if you want to use invidious with SearXNG you should setup one locally + # # https://github.com/searxng/searxng/issues/2722#issuecomment-2884993248 + # base_url: + # - 
https://invidious.example1.com + # - https://invidious.example2.com + # shortcut: iv + # timeout: 3.0 + + - name: ipernity + engine: ipernity + shortcut: ip + disabled: true + + - name: iqiyi + engine: iqiyi + shortcut: iq + disabled: true + + - name: jisho + engine: jisho + shortcut: js + timeout: 3.0 + disabled: true + + - name: kickass + engine: kickass + base_url: + - https://kickasstorrents.to + - https://kickasstorrents.cr + - https://kickasstorrent.cr + - https://kickass.sx + - https://kat.am + shortcut: kc + timeout: 4.0 + + - name: lemmy communities + engine: lemmy + lemmy_type: Communities + shortcut: leco + + - name: lemmy users + engine: lemmy + network: lemmy communities + lemmy_type: Users + shortcut: leus + + - name: lemmy posts + engine: lemmy + network: lemmy communities + lemmy_type: Posts + shortcut: lepo + + - name: lemmy comments + engine: lemmy + network: lemmy communities + lemmy_type: Comments + shortcut: lecom + + - name: library genesis + engine: xpath + # search_url: https://libgen.is/search.php?req={query} + search_url: https://libgen.rs/search.php?req={query} + url_xpath: //a[contains(@href,"book/index.php?md5")]/@href + title_xpath: //a[contains(@href,"book/")]/text()[1] + content_xpath: //td/a[1][contains(@href,"=author")]/text() + categories: files + timeout: 7.0 + disabled: true + shortcut: lg + about: + website: https://libgen.fun/ + wikidata_id: Q22017206 + official_api_documentation: + use_official_api: false + require_api_key: false + results: HTML + + - name: z-library + engine: zlibrary + shortcut: zlib + timeout: 7.0 + disabled: true + # https://github.com/searxng/searxng/issues/3610 + inactive: true + + - name: library of congress + engine: loc + shortcut: loc + categories: images + disabled: true + + - name: libretranslate + engine: libretranslate + # https://github.com/LibreTranslate/LibreTranslate?tab=readme-ov-file#mirrors + base_url: + - https://libretranslate.com/translate + # api_key: abc123 + shortcut: lt + disabled: 
true + + - name: lingva + engine: lingva + shortcut: lv + # set lingva instance in url, by default it will use the official instance + # url: https://lingva.thedaviddelta.com + + - name: lobste.rs + engine: xpath + search_url: https://lobste.rs/search?q={query}&what=stories&order=relevance + results_xpath: //li[contains(@class, "story")] + url_xpath: .//a[@class="u-url"]/@href + title_xpath: .//a[@class="u-url"] + content_xpath: .//a[@class="domain"] + categories: it + shortcut: lo + timeout: 5.0 + disabled: true + about: + website: https://lobste.rs/ + wikidata_id: Q60762874 + official_api_documentation: + use_official_api: false + require_api_key: false + results: HTML + + - name: lucide + engine: lucide + shortcut: luc + timeout: 3.0 + + - name: marginalia + engine: marginalia + shortcut: mar + # To get an API key, please follow the instructions at + # - https://about.marginalia-search.com/article/api/ + # api_key: ... + disabled: true + inactive: true + + - name: mastodon users + engine: mastodon + mastodon_type: accounts + base_url: https://mastodon.social + shortcut: mau + + - name: mastodon hashtags + engine: mastodon + mastodon_type: hashtags + base_url: https://mastodon.social + shortcut: mah + + # - name: matrixrooms + # engine: mrs + # # https://docs.searxng.org/dev/engines/online/mrs.html + # # base_url: https://mrs-api-host + # shortcut: mtrx + # disabled: true + + - name: mdn + shortcut: mdn + engine: json_engine + categories: [it] + paging: true + search_url: https://developer.mozilla.org/api/v1/search?q={query}&page={pageno} + results_query: documents + url_query: mdn_url + url_prefix: https://developer.mozilla.org + title_query: title + content_query: summary + about: + website: https://developer.mozilla.org + wikidata_id: Q3273508 + official_api_documentation: null + use_official_api: false + require_api_key: false + results: JSON + + - name: metacpan + engine: metacpan + shortcut: cpan + disabled: true + number_of_results: 20 + + # 
https://docs.searxng.org/dev/engines/offline/search-indexer-engines.html#module-searx.engines.meilisearch + # - name: meilisearch + # engine: meilisearch + # shortcut: mes + # enable_http: true + # base_url: http://localhost:7700 + # index: my-index + # auth_key: Bearer XXXX + + - name: microsoft learn + engine: microsoft_learn + shortcut: msl + disabled: true + + - name: mixcloud + engine: mixcloud + shortcut: mc + + # MongoDB engine + # Required dependency: pymongo + # - name: mymongo + # engine: mongodb + # shortcut: md + # exact_match_only: false + # host: '127.0.0.1' + # port: 27017 + # enable_http: true + # results_per_page: 20 + # database: 'business' + # collection: 'reviews' # name of the db collection + # key: 'name' # key in the collection to search for + + - name: mozhi + engine: mozhi + base_url: + - https://mozhi.aryak.me + - https://translate.bus-hit.me + - https://nyc1.mz.ggtyler.dev + # mozhi_engine: google - see https://mozhi.aryak.me for supported engines + timeout: 4.0 + shortcut: mz + disabled: true + + - name: mwmbl + engine: mwmbl + # api_url: https://api.mwmbl.org + shortcut: mwm + disabled: true + + - name: niconico + engine: niconico + shortcut: nico + disabled: true + + - name: npm + engine: npm + shortcut: npm + timeout: 5.0 + disabled: true + + - name: nyaa + engine: nyaa + shortcut: nt + disabled: true + + - name: mankier + engine: json_engine + search_url: https://www.mankier.com/api/v2/mans/?q={query} + results_query: results + url_query: url + title_query: name + content_query: description + categories: it + shortcut: man + about: + website: https://www.mankier.com/ + official_api_documentation: https://www.mankier.com/api + use_official_api: true + require_api_key: false + results: JSON + + - name: odysee + engine: odysee + shortcut: od + disabled: true + + - name: ollama + engine: ollama + shortcut: ollama + disabled: true + + - name: openairedatasets + engine: json_engine + paging: true + search_url: 
https://api.openaire.eu/search/datasets?format=json&page={pageno}&size=10&title={query} + results_query: response/results/result + url_query: metadata/oaf:entity/oaf:result/children/instance/webresource/url/$ + title_query: metadata/oaf:entity/oaf:result/title/$ + content_query: metadata/oaf:entity/oaf:result/description/$ + content_html_to_text: true + categories: "science" + shortcut: oad + timeout: 5.0 + about: + website: https://www.openaire.eu/ + wikidata_id: Q25106053 + official_api_documentation: https://api.openaire.eu/ + use_official_api: false + require_api_key: false + results: JSON + + - name: openairepublications + engine: json_engine + paging: true + search_url: https://api.openaire.eu/search/publications?format=json&page={pageno}&size=10&title={query} + results_query: response/results/result + url_query: metadata/oaf:entity/oaf:result/children/instance/webresource/url/$ + title_query: metadata/oaf:entity/oaf:result/title/$ + content_query: metadata/oaf:entity/oaf:result/description/$ + content_html_to_text: true + categories: science + shortcut: oap + timeout: 5.0 + about: + website: https://www.openaire.eu/ + wikidata_id: Q25106053 + official_api_documentation: https://api.openaire.eu/ + use_official_api: false + require_api_key: false + results: JSON + + - name: openalex + engine: openalex + shortcut: oa + # https://docs.searxng.org/dev/engines/online/openalex.html + # Recommended by OpenAlex: join the polite pool with an email address + # mailto: "[email protected]" + timeout: 5.0 + disabled: true + + - name: openclipart + engine: openclipart + shortcut: ocl + inactive: true + disabled: true + timeout: 30 + + - name: openlibrary + engine: openlibrary + shortcut: ol + timeout: 10 + disabled: true + + - name: openmeteo + engine: open_meteo + shortcut: om + disabled: true + + # - name: opensemanticsearch + # engine: opensemantic + # shortcut: oss + # base_url: 'http://localhost:8983/solr/opensemanticsearch/' + + - name: openstreetmap + engine: 
openstreetmap + shortcut: osm + + - name: openrepos + engine: xpath + paging: true + search_url: https://openrepos.net/search/node/{query}?page={pageno} + url_xpath: //li[@class="search-result"]//h3[@class="title"]/a/@href + title_xpath: //li[@class="search-result"]//h3[@class="title"]/a + content_xpath: //li[@class="search-result"]//div[@class="search-snippet-info"]//p[@class="search-snippet"] + categories: files + timeout: 4.0 + disabled: true + shortcut: or + about: + website: https://openrepos.net/ + wikidata_id: + official_api_documentation: + use_official_api: false + require_api_key: false + results: HTML + + - name: packagist + engine: json_engine + paging: true + search_url: https://packagist.org/search.json?q={query}&page={pageno} + results_query: results + url_query: url + title_query: name + content_query: description + categories: [it, packages] + disabled: true + timeout: 5.0 + shortcut: pack + about: + website: https://packagist.org + wikidata_id: Q108311377 + official_api_documentation: https://packagist.org/apidoc + use_official_api: true + require_api_key: false + results: JSON + + - name: pdbe + engine: pdbe + shortcut: pdb + # Hide obsolete PDB entries. 
Default is not to hide obsolete structures + # hide_obsolete: false + + - name: photon + engine: photon + shortcut: ph + + - name: pinterest + engine: pinterest + shortcut: pin + + - name: piped + engine: piped + shortcut: ppd + categories: videos + piped_filter: videos + timeout: 3.0 + inactive: true + + # URL to use as link and for embeds + frontend_url: https://srv.piped.video + # Instance will be selected randomly, for more see https://piped-instances.kavin.rocks/ + backend_url: + - https://pipedapi.ducks.party + - https://api.piped.private.coffee + + - name: piped.music + engine: piped + network: piped + shortcut: ppdm + categories: music + piped_filter: music_songs + timeout: 3.0 + inactive: true + + - name: piratebay + engine: piratebay + shortcut: tpb + # You may need to change this URL to a proxy if piratebay is blocked in your + # country + url: https://thepiratebay.org/ + timeout: 3.0 + + - name: pixabay images + engine: pixabay + pixabay_type: images + categories: images + shortcut: pixi + disabled: true + + - name: pixabay videos + engine: pixabay + pixabay_type: videos + categories: videos + shortcut: pixv + disabled: true + + - name: pixiv + shortcut: pv + engine: pixiv + disabled: true + inactive: true + pixiv_image_proxies: + - https://pximg.example.org + # A proxy is required to load the images. Hosting an image proxy server + # for Pixiv: + # --> https://pixivfe.pages.dev/hosting-image-proxy-server/ + # Proxies from public instances. Ask the public instances owners if they + # agree to receive traffic from SearXNG! 
+ # --> https://codeberg.org/VnPower/PixivFE#instances
+ # --> https://github.com/searxng/searxng/pull/3192#issuecomment-1941095047
+ # image proxy of https://pixiv.cat
+ # - https://i.pixiv.cat
+ # image proxy of https://www.pixiv.pics
+ # - https://pximg.cocomi.eu.org
+ # image proxy of https://pixivfe.exozy.me
+ # - https://pximg.exozy.me
+ # image proxy of https://pixivfe.ducks.party
+ # - https://pixiv.ducks.party
+ # image proxy of https://pixiv.perennialte.ch
+ # - https://pximg.perennialte.ch
+
+ - name: podcastindex
+ engine: podcastindex
+ shortcut: podcast
+
+ # Required dependency: psycopg2
+ # - name: postgresql
+ # engine: postgresql
+ # database: postgres
+ # username: postgres
+ # password: postgres
+ # limit: 10
+ # query_str: 'SELECT * from my_table WHERE my_column = %(query)s'
+ # shortcut : psql
+
+ - name: presearch
+ engine: presearch
+ search_type: search
+ categories: [general, web]
+ shortcut: ps
+ timeout: 4.0
+ disabled: true
+
+ - name: presearch images
+ engine: presearch
+ network: presearch
+ search_type: images
+ categories: [images, web]
+ timeout: 4.0
+ shortcut: psimg
+ disabled: true
+
+ - name: presearch videos
+ engine: presearch
+ network: presearch
+ search_type: videos
+ categories: [general, web]
+ timeout: 4.0
+ shortcut: psvid
+ disabled: true
+
+ - name: presearch news
+ engine: presearch
+ network: presearch
+ search_type: news
+ categories: [news, web]
+ timeout: 4.0
+ shortcut: psnews
+ disabled: true
+
+ - name: pub.dev
+ engine: xpath
+ shortcut: pd
+ search_url: https://pub.dev/packages?q={query}&page={pageno}
+ paging: true
+ results_xpath: //div[contains(@class,"packages-item")]
+ url_xpath: ./div/h3/a/@href
+ title_xpath: ./div/h3/a
+ content_xpath: ./div/div/div[contains(@class,"packages-description")]/span
+ categories: [packages, it]
+ timeout: 3.0
+ disabled: true
+ first_page_num: 1
+ about:
+ website: https://pub.dev/
+ official_api_documentation: https://pub.dev/help/api
+ use_official_api: false
+ 
require_api_key: false + results: HTML + + - name: public domain image archive + engine: public_domain_image_archive + shortcut: pdia + disabled: true + + - name: pubmed + engine: pubmed + shortcut: pub + + - name: pypi + shortcut: pypi + engine: pypi + + - name: quark + quark_category: general + categories: [general] + engine: quark + shortcut: qk + disabled: true + + - name: quark images + quark_category: images + categories: [images] + engine: quark + shortcut: qki + disabled: true + + - name: qwant + qwant_categ: web + engine: qwant + shortcut: qw + categories: [general, web] + disabled: true + additional_tests: + rosebud: *test_rosebud + + - name: qwant news + qwant_categ: news + engine: qwant + shortcut: qwn + categories: news + network: qwant + + - name: qwant images + qwant_categ: images + engine: qwant + shortcut: qwi + categories: [images, web] + network: qwant + + - name: qwant videos + qwant_categ: videos + engine: qwant + shortcut: qwv + categories: [videos, web] + network: qwant + + # - name: library + # engine: recoll + # shortcut: lib + # base_url: 'https://recoll.example.org/' + # search_dir: '' + # mount_prefix: /export + # dl_prefix: 'https://download.example.org' + # timeout: 30.0 + # categories: files + # disabled: true + + # - name: recoll library reference + # engine: recoll + # base_url: 'https://recoll.example.org/' + # search_dir: reference + # mount_prefix: /export + # dl_prefix: 'https://download.example.org' + # shortcut: libr + # timeout: 30.0 + # categories: files + # disabled: true + + - name: radio browser + engine: radio_browser + shortcut: rb + + - name: reddit + engine: reddit + shortcut: re + page_size: 25 + disabled: true + + - name: reuters + engine: reuters + shortcut: reu + # https://docs.searxng.org/dev/engines/online/reuters.html + # sort_order = "relevance" + + - name: right dao + engine: xpath + paging: true + page_size: 12 + search_url: https://rightdao.com/search?q={query}&start={pageno} + results_xpath: 
//div[contains(@class, "description")] + url_xpath: ../div[contains(@class, "title")]/a/@href + title_xpath: ../div[contains(@class, "title")] + content_xpath: . + categories: general + shortcut: rd + disabled: true + about: + website: https://rightdao.com/ + use_official_api: false + require_api_key: false + results: HTML + + - name: rottentomatoes + engine: rottentomatoes + shortcut: rt + disabled: true + + # Required dependency: valkey + # - name: myvalkey + # shortcut : rds + # engine: valkey_server + # exact_match_only: false + # host: '127.0.0.1' + # port: 6379 + # enable_http: true + # password: '' + # db: 0 + + # tmp suspended: bad certificate + # - name: scanr structures + # shortcut: scs + # engine: scanr_structures + # disabled: true + + - name: searchmysite + engine: xpath + shortcut: sms + categories: general + paging: true + search_url: https://searchmysite.net/search/?q={query}&page={pageno} + results_xpath: //div[contains(@class,'search-result')] + url_xpath: .//a[contains(@class,'result-link')]/@href + title_xpath: .//span[contains(@class,'result-title-txt')]/text() + content_xpath: ./p[@id='result-hightlight'] + disabled: true + about: + website: https://searchmysite.net + + - name: selfhst icons + engine: selfhst + shortcut: si + disabled: true + + - name: sepiasearch + engine: sepiasearch + shortcut: sep + + - name: sogou + engine: sogou + shortcut: sogou + disabled: true + + - name: sogou images + engine: sogou_images + shortcut: sogoui + disabled: true + + - name: sogou videos + engine: sogou_videos + shortcut: sogouv + disabled: true + + - name: sogou wechat + engine: sogou_wechat + shortcut: sogouw + disabled: true + + - name: soundcloud + engine: soundcloud + shortcut: sc + + - name: stackoverflow + engine: stackexchange + shortcut: st + api_site: 'stackoverflow' + categories: [it, q&a] + + - name: askubuntu + engine: stackexchange + shortcut: ubuntu + api_site: 'askubuntu' + categories: [it, q&a] + + - name: superuser + engine: 
stackexchange + shortcut: su + api_site: 'superuser' + categories: [it, q&a] + + - name: discuss.python + engine: discourse + shortcut: dpy + base_url: 'https://discuss.python.org' + categories: [it, q&a] + disabled: true + + - name: caddy.community + engine: discourse + shortcut: caddy + base_url: 'https://caddy.community' + categories: [it, q&a] + disabled: true + + - name: pi-hole.community + engine: discourse + shortcut: pi + categories: [it, q&a] + base_url: 'https://discourse.pi-hole.net' + disabled: true + + - name: searchcode code + engine: searchcode_code + shortcut: scc + disabled: true + inactive: true + + # - name: searx + # engine: searx_engine + # shortcut: se + # instance_urls : + # - http://127.0.0.1:8888/ + # - ... + # disabled: true + + - name: semantic scholar + engine: semantic_scholar + shortcut: se + + # Spotify needs API credentials + # - name: spotify + # engine: spotify + # shortcut: stf + # api_client_id: ******* + # api_client_secret: ******* + + # - name: solr + # engine: solr + # shortcut: slr + # base_url: http://localhost:8983 + # collection: collection_name + # sort: '' # sorting: asc or desc + # field_list: '' # comma separated list of field names to display on the UI + # default_fields: '' # default field to query + # query_fields: '' # query fields + # enable_http: true + + - name: springer nature + engine: springer + shortcut: springer + timeout: 5 + # read https://docs.searxng.org/dev/engines/online/springer.html + api_key: "" + inactive: true + + - name: startpage + engine: startpage + shortcut: sp + startpage_categ: web + categories: [general, web] + additional_tests: + rosebud: *test_rosebud + + - name: startpage news + engine: startpage + startpage_categ: news + categories: [news, web] + shortcut: spn + + - name: startpage images + engine: startpage + startpage_categ: images + categories: [images, web] + shortcut: spi + + - name: steam + engine: steam + shortcut: stm + disabled: true + + - name: tokyotoshokan + engine: 
tokyotoshokan + shortcut: tt + timeout: 6.0 + disabled: true + + - name: solidtorrents + engine: solidtorrents + shortcut: solid + timeout: 4.0 + base_url: + - https://solidtorrents.to + - https://bitsearch.to + + # For this demo of the sqlite engine download: + # https://liste.mediathekview.de/filmliste-v2.db.bz2 + # and unpack into searx/data/filmliste-v2.db + # Query to test: "!mediathekview concert" + # + # - name: mediathekview + # engine: sqlite + # shortcut: mediathekview + # categories: [general, videos] + # result_type: MainResult + # database: searx/data/filmliste-v2.db + # query_str: >- + # SELECT title || ' (' || time(duration, 'unixepoch') || ')' AS title, + # COALESCE( NULLIF(url_video_hd,''), NULLIF(url_video_sd,''), url_video) AS url, + # description AS content + # FROM film + # WHERE title LIKE :wildcard OR description LIKE :wildcard + # ORDER BY duration DESC + + - name: tagesschau + engine: tagesschau + # when set to false, display URLs from Tagesschau, and not the actual source + # (e.g. NDR, WDR, SWR, HR, ...) + use_source_url: true + shortcut: ts + disabled: true + + - name: tmdb + engine: xpath + paging: true + categories: movies + search_url: https://www.themoviedb.org/search?page={pageno}&query={query} + results_xpath: //div[contains(@class,"movie") or contains(@class,"tv")]//div[contains(@class,"card")] + url_xpath: .//div[contains(@class,"poster")]/a/@href + thumbnail_xpath: .//img/@src + title_xpath: .//div[contains(@class,"title")]//h2 + content_xpath: .//div[contains(@class,"overview")] + shortcut: tm + disabled: true + + # Requires Tor + - name: torch + engine: xpath + paging: true + search_url: + http://xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd.onion/cgi-bin/omega/omega?P={query}&DEFAULTOP=and + results_xpath: //table//tr + url_xpath: ./td[2]/a + title_xpath: ./td[2]/b + content_xpath: ./td[2]/small + categories: onions + enable_http: true + shortcut: tch + + # TubeArchivist is a self-hosted Youtube archivist software. 
+ # https://docs.searxng.org/dev/engines/online/tubearchivist.html + # + # - name: tubearchivist + # engine: tubearchivist + # shortcut: tuba + # base_url: + # ta_token: + # ta_link_to_mp4: false + + # torznab engine lets you query any torznab compatible indexer. Using this + # engine in combination with Jackett opens the possibility to query a lot of + # public and private indexers directly from SearXNG. More details at: + # https://docs.searxng.org/dev/engines/online/torznab.html + # + # - name: Torznab EZTV + # engine: torznab + # shortcut: eztv + # base_url: http://localhost:9117/api/v2.0/indexers/eztv/results/torznab + # enable_http: true # if using localhost + # api_key: xxxxxxxxxxxxxxx + # show_magnet_links: true + # show_torrent_files: false + # # https://github.com/Jackett/Jackett/wiki/Jackett-Categories + # torznab_categories: # optional + # - 2000 + # - 5000 + + # tmp suspended - too slow, too many errors + # - name: urbandictionary + # engine : xpath + # search_url : https://www.urbandictionary.com/define.php?term={query} + # url_xpath : //*[@class="word"]/@href + # title_xpath : //*[@class="def-header"] + # content_xpath: //*[@class="meaning"] + # shortcut: ud + + - name: unsplash + engine: unsplash + shortcut: us + + - name: yandex + engine: yandex + categories: general + search_type: web + shortcut: yd + disabled: true + + - name: yandex images + engine: yandex + network: yandex + categories: images + search_type: images + shortcut: ydi + disabled: true + + - name: yandex music + engine: yandex_music + network: yandex + shortcut: ydm + disabled: true + # https://yandex.com/support/music/access.html + + - name: yahoo + engine: yahoo + shortcut: yh + disabled: true + + - name: yahoo news + engine: yahoo_news + shortcut: yhn + + - name: youtube + shortcut: yt + # You can use the engine using the official stable API, but you need an API + # key See: https://console.developers.google.com/project + # + # engine: youtube_api + # api_key: 'apikey' # 
required! + # + # Or you can use the html non-stable engine, activated by default + engine: youtube_noapi + + - name: dailymotion + engine: dailymotion + shortcut: dm + + - name: vimeo + engine: vimeo + shortcut: vm + + - name: wiby + engine: json_engine + paging: true + search_url: https://wiby.me/json/?q={query}&p={pageno} + url_query: URL + title_query: Title + content_query: Snippet + categories: [general, web] + shortcut: wib + disabled: true + about: + website: https://wiby.me/ + + - name: wikibooks + engine: mediawiki + weight: 0.5 + shortcut: wb + categories: [general, wikimedia] + base_url: "https://{language}.wikibooks.org/" + search_type: text + disabled: true + about: + website: https://www.wikibooks.org/ + wikidata_id: Q367 + + - name: wikinews + engine: mediawiki + shortcut: wn + categories: [news, wikimedia] + base_url: "https://{language}.wikinews.org/" + search_type: text + srsort: create_timestamp_desc + about: + website: https://www.wikinews.org/ + wikidata_id: Q964 + + - name: wikiquote + engine: mediawiki + weight: 0.5 + shortcut: wq + categories: [general, wikimedia] + base_url: "https://{language}.wikiquote.org/" + search_type: text + disabled: true + additional_tests: + rosebud: *test_rosebud + about: + website: https://www.wikiquote.org/ + wikidata_id: Q369 + + - name: wikisource + engine: mediawiki + weight: 0.5 + shortcut: ws + categories: [general, wikimedia] + base_url: "https://{language}.wikisource.org/" + search_type: text + disabled: true + about: + website: https://www.wikisource.org/ + wikidata_id: Q263 + + - name: wikispecies + engine: mediawiki + shortcut: wsp + categories: [general, science, wikimedia] + base_url: "https://species.wikimedia.org/" + search_type: text + disabled: true + about: + website: https://species.wikimedia.org/ + wikidata_id: Q13679 + tests: + wikispecies: + matrix: + query: "Campbell, L.I. et al. 
2011: MicroRNAs" + lang: en + result_container: + - not_empty + - ['one_title_contains', 'Tardigrada'] + test: + - unique_results + + - name: wiktionary + engine: mediawiki + shortcut: wt + categories: [dictionaries, wikimedia] + base_url: "https://{language}.wiktionary.org/" + search_type: text + about: + website: https://www.wiktionary.org/ + wikidata_id: Q151 + + - name: wikiversity + engine: mediawiki + weight: 0.5 + shortcut: wv + categories: [general, wikimedia] + base_url: "https://{language}.wikiversity.org/" + search_type: text + disabled: true + about: + website: https://www.wikiversity.org/ + wikidata_id: Q370 + + - name: wikivoyage + engine: mediawiki + weight: 0.5 + shortcut: wy + categories: [general, wikimedia] + base_url: "https://{language}.wikivoyage.org/" + search_type: text + disabled: true + about: + website: https://www.wikivoyage.org/ + wikidata_id: Q373 + + - name: wikicommons.images + engine: wikicommons + shortcut: wci + categories: images + wc_search_type: image + + - name: wikicommons.videos + engine: wikicommons + shortcut: wcv + categories: videos + wc_search_type: video + + - name: wikicommons.audio + engine: wikicommons + shortcut: wca + categories: music + wc_search_type: audio + + - name: wikicommons.files + engine: wikicommons + shortcut: wcf + categories: files + wc_search_type: file + + - name: wolframalpha + shortcut: wa + # You can use the engine using the official stable API, but you need an API + # key. 
See: https://products.wolframalpha.com/api/ + # + # engine: wolframalpha_api + # api_key: '' + # + # Or you can use the html non-stable engine, activated by default + engine: wolframalpha_noapi + timeout: 6.0 + categories: general + disabled: true + + - name: dictzone + engine: dictzone + shortcut: dc + + - name: mymemory translated + engine: translated + shortcut: tl + timeout: 5.0 + # You can use without an API key, but you are limited to 1000 words/day + # See: https://mymemory.translated.net/doc/usagelimits.php + # api_key: '' + + # Required dependency: mysql-connector-python + # - name: mysql + # engine: mysql_server + # database: mydatabase + # username: user + # password: pass + # limit: 10 + # query_str: 'SELECT * from mytable WHERE fieldname=%(query)s' + # shortcut: mysql + + # Required dependency: mariadb + # - name: mariadb + # engine: mariadb_server + # database: mydatabase + # username: user + # password: pass + # limit: 10 + # query_str: 'SELECT * from mytable WHERE fieldname=%(query)s' + # shortcut: mdb + + - name: 1337x + engine: 1337x + shortcut: 1337x + disabled: true + + - name: duden + engine: duden + shortcut: du + disabled: true + + - name: seznam + shortcut: szn + engine: seznam + disabled: true + + # - name: deepl + # engine: deepl + # shortcut: dpl + # # You can use the engine using the official stable API, but you need an API key + # # See: https://www.deepl.com/pro-api?cta=header-pro-api + # api_key: '' # required! 
+ # timeout: 5.0 + # disabled: true + + - name: mojeek + shortcut: mjk + engine: mojeek + categories: [general, web] + disabled: true + + - name: mojeek images + shortcut: mjkimg + engine: mojeek + categories: [images, web] + search_type: images + paging: false + disabled: true + + - name: mojeek news + shortcut: mjknews + engine: mojeek + categories: [news, web] + search_type: news + paging: false + disabled: true + + - name: moviepilot + engine: moviepilot + shortcut: mp + disabled: true + + - name: naver + categories: [general, web] + engine: naver + shortcut: nvr + disabled: true + + - name: naver images + naver_category: images + categories: [images] + engine: naver + shortcut: nvri + disabled: true + + - name: naver news + naver_category: news + categories: [news] + engine: naver + shortcut: nvrn + disabled: true + + - name: naver videos + naver_category: videos + categories: [videos] + engine: naver + shortcut: nvrv + disabled: true + + - name: rubygems + shortcut: rbg + engine: xpath + paging: true + search_url: https://rubygems.org/search?page={pageno}&query={query} + results_xpath: /html/body/main/div/a[@class="gems__gem"] + url_xpath: ./@href + title_xpath: ./span/h2 + content_xpath: ./span/p + suggestion_xpath: /html/body/main/div/div[@class="search__suggestions"]/p/a + first_page_num: 1 + categories: [it, packages] + disabled: true + about: + website: https://rubygems.org/ + wikidata_id: Q1853420 + official_api_documentation: https://guides.rubygems.org/rubygems-org-api/ + use_official_api: false + require_api_key: false + results: HTML + + - name: peertube + engine: peertube + shortcut: ptb + paging: true + # alternatives see: https://instances.joinpeertube.org/instances + # base_url: https://tube.4aem.com + categories: videos + disabled: true + timeout: 6.0 + + - name: mediathekviewweb + engine: mediathekviewweb + shortcut: mvw + disabled: true + + - name: yacy + # https://docs.searxng.org/dev/engines/online/yacy.html + engine: yacy + categories: 
general + search_type: text + # see https://github.com/searxng/searxng/pull/3631#issuecomment-2240903027 + base_url: + - https://yacy.searchlab.eu + shortcut: ya + disabled: true + # if you aren't using HTTPS for your local yacy instance disable https + # enable_http: false + search_mode: 'global' + # timeout can be reduced in 'local' search mode + timeout: 5.0 + + - name: yacy images + engine: yacy + network: yacy + categories: images + search_type: image + shortcut: yai + disabled: true + # timeout can be reduced in 'local' search mode + timeout: 5.0 + + - name: rumble + engine: rumble + shortcut: ru + base_url: https://rumble.com/ + paging: true + categories: videos + disabled: true + + - name: repology + engine: repology + shortcut: rep + disabled: true + inactive: true + + - name: livespace + engine: livespace + shortcut: ls + categories: videos + disabled: true + timeout: 5.0 + + - name: wordnik + engine: wordnik + shortcut: wnik + timeout: 5.0 + + - name: woxikon.de synonyme + engine: xpath + shortcut: woxi + categories: [dictionaries] + timeout: 5.0 + disabled: true + search_url: https://synonyme.woxikon.de/synonyme/{query}.php + url_xpath: //div[@class="upper-synonyms"]/a/@href + content_xpath: //div[@class="synonyms-list-group"] + title_xpath: //div[@class="upper-synonyms"]/a + no_result_for_http_status: [404] + about: + website: https://www.woxikon.de/ + wikidata_id: # No Wikidata ID + use_official_api: false + require_api_key: false + results: HTML + language: de + + - name: seekr news + engine: seekr + shortcut: senews + categories: news + seekr_category: news + disabled: true + + - name: seekr images + engine: seekr + network: seekr news + shortcut: seimg + categories: images + seekr_category: images + disabled: true + + - name: seekr videos + engine: seekr + network: seekr news + shortcut: sevid + categories: videos + seekr_category: videos + disabled: true + + - name: stract + engine: stract + shortcut: str + disabled: true + + - name: svgrepo + 
engine: svgrepo + shortcut: svg + timeout: 10.0 + disabled: true + + - name: tootfinder + engine: tootfinder + shortcut: toot + + - name: uxwing + engine: uxwing + shortcut: ux + disabled: true + + - name: voidlinux + engine: voidlinux + shortcut: void + disabled: true + + - name: wallhaven + engine: wallhaven + # api_key: abcdefghijklmnopqrstuvwxyz + shortcut: wh + disabled: true + + # wikimini: online encyclopedia for children + # The fulltext and title parameter is necessary for Wikimini because + # sometimes it will not show the results and redirect instead + - name: wikimini + engine: xpath + shortcut: wkmn + search_url: https://fr.wikimini.org/w/index.php?search={query}&title=Sp%C3%A9cial%3ASearch&fulltext=Search + url_xpath: //li/div[@class="mw-search-result-heading"]/a/@href + title_xpath: //li//div[@class="mw-search-result-heading"]/a + content_xpath: //li/div[@class="searchresult"] + categories: general + disabled: true + about: + website: https://wikimini.org/ + wikidata_id: Q3568032 + use_official_api: false + require_api_key: false + results: HTML + language: fr + + - name: wttr.in + engine: wttr + shortcut: wttr + timeout: 9.0 + + - name: brave + engine: brave + shortcut: br + time_range_support: true + paging: true + categories: [general, web] + brave_category: search + # brave_spellcheck: true + + - name: brave.images + engine: brave + network: brave + shortcut: brimg + categories: [images, web] + brave_category: images + + - name: brave.videos + engine: brave + network: brave + shortcut: brvid + categories: [videos, web] + brave_category: videos + + - name: brave.news + engine: brave + network: brave + shortcut: brnews + categories: news + brave_category: news + + # - name: brave.goggles + # engine: brave + # network: brave + # shortcut: brgog + # time_range_support: true + # paging: true + # categories: [general, web] + # brave_category: goggles + # Goggles: # required! 
This should be a URL ending in .goggle + + - name: lib.rs + shortcut: lrs + engine: lib_rs + disabled: true + + - name: sourcehut + shortcut: srht + engine: sourcehut + # https://docs.searxng.org/dev/engines/online/sourcehut.html + # sourcehut_sort_order: longest-active + disabled: true + + - name: bt4g + engine: bt4g + shortcut: bt4g + + - name: pkg.go.dev + engine: pkg_go_dev + shortcut: pgo + disabled: true + + - name: senscritique + engine: senscritique + shortcut: scr + timeout: 4.0 + disabled: true + + - name: minecraft wiki + engine: mediawiki + shortcut: mcw + categories: ["software wikis"] + base_url: https://minecraft.wiki/ + api_path: "api.php" + search_type: text + disabled: true + about: + website: https://minecraft.wiki/ + wikidata_id: Q105533483 + +# Doku engine lets you access to any Doku wiki instance: +# A public one or a privete/corporate one. +# - name: ubuntuwiki +# engine: doku +# shortcut: uw +# base_url: 'https://doc.ubuntu-fr.org' + +# Be careful when enabling this engine if you are +# running a public instance. Do not expose any sensitive +# information. You can restrict access by configuring a list +# of access tokens under tokens. +# - name: git grep +# engine: command +# command: ['git', 'grep', '{{QUERY}}'] +# shortcut: gg +# tokens: [] +# disabled: true +# delimiter: +# chars: ':' +# keys: ['filepath', 'code'] + +# Be careful when enabling this engine if you are +# running a public instance. Do not expose any sensitive +# information. You can restrict access by configuring a list +# of access tokens under tokens. +# - name: locate +# engine: command +# command: ['locate', '{{QUERY}}'] +# shortcut: loc +# tokens: [] +# disabled: true +# delimiter: +# chars: ' ' +# keys: ['line'] + +# Be careful when enabling this engine if you are +# running a public instance. Do not expose any sensitive +# information. You can restrict access by configuring a list +# of access tokens under tokens. 
+# - name: find +# engine: command +# command: ['find', '.', '-name', '{{QUERY}}'] +# query_type: path +# shortcut: fnd +# tokens: [] +# disabled: true +# delimiter: +# chars: ' ' +# keys: ['line'] + +# Be careful when enabling this engine if you are +# running a public instance. Do not expose any sensitive +# information. You can restrict access by configuring a list +# of access tokens under tokens. +# - name: pattern search in files +# engine: command +# command: ['fgrep', '{{QUERY}}'] +# shortcut: fgr +# tokens: [] +# disabled: true +# delimiter: +# chars: ' ' +# keys: ['line'] + +# Be careful when enabling this engine if you are +# running a public instance. Do not expose any sensitive +# information. You can restrict access by configuring a list +# of access tokens under tokens. +# - name: regex search in files +# engine: command +# command: ['grep', '{{QUERY}}'] +# shortcut: gr +# tokens: [] +# disabled: true +# delimiter: +# chars: ' ' +# keys: ['line'] + +doi_resolvers: + oadoi.org: 'https://oadoi.org/' + doi.org: 'https://doi.org/' + sci-hub.se: 'https://sci-hub.se/' + sci-hub.st: 'https://sci-hub.st/' + sci-hub.ru: 'https://sci-hub.ru/' + +default_doi_resolver: 'oadoi.org' diff --git a/ansible/site.yml b/ansible/site.yml new file mode 100644 index 0000000..de4cb0e --- /dev/null +++ b/ansible/site.yml @@ -0,0 +1,56 @@ +--- +# Optional: Uncomment to pre-fetch all secrets at once +# - name: Fetch secrets +# import_playbook: fetch_secrets.yml + +- name: Update All Hosts + import_playbook: apt_update.yml + +- name: Deploy Alloy + import_playbook: alloy/deploy.yml + +- name: Deploy Prometheus Node Exporter + import_playbook: prometheus/node_deploy.yml + +- name: Deploy Docker + import_playbook: docker/deploy.yml + +- name: Deploy smtp4dev + import_playbook: smtp4dev/deploy.yml + +- name: Deploy PPLG Stack (PgAdmin, Prometheus, Loki, Grafana + HAProxy) + import_playbook: pplg/deploy.yml + +- name: Deploy PostgreSQL + import_playbook: postgresql/deploy.yml + +- 
name: Deploy PostgreSQL SSL + import_playbook: postgresql_ssl/deploy.yml + +- name: Deploy Neo4j + import_playbook: neo4j/deploy.yml + +- name: Deploy SearXNG + import_playbook: searxng/deploy.yml + +- name: Deploy HAProxy + import_playbook: haproxy/deploy.yml + +- name: Deploy Casdoor + import_playbook: casdoor/deploy.yml + +- name: Deploy MCPO + import_playbook: mcpo/deploy.yml + +- name: Deploy OpenWebUI + import_playbook: openwebui/deploy.yml + +- name: Deploy Home Assistant + import_playbook: hass/deploy.yml + +- name: Deploy Gitea + import_playbook: gitea/deploy.yml + +- name: Deploy Nextcloud + import_playbook: nextcloud/deploy.yml + diff --git a/ansible/smtp4dev/deploy.yml b/ansible/smtp4dev/deploy.yml new file mode 100644 index 0000000..332008d --- /dev/null +++ b/ansible/smtp4dev/deploy.yml @@ -0,0 +1,56 @@ +--- +- name: Deploy smtp4dev with Docker Compose + hosts: ubuntu + become: true + vars: + required_service: smtp4dev + tasks: + - name: Check if host has smtp4dev service + ansible.builtin.set_fact: + has_smtp4dev_service: "{{required_service in services}}" + + - name: Skip hosts without smtp4dev service + ansible.builtin.meta: end_host + when: not has_smtp4dev_service + + - name: Create smtp4dev group + ansible.builtin.group: + name: "{{smtp4dev_group}}" + + - name: Create smtp4dev user + ansible.builtin.user: + name: "{{smtp4dev_user}}" + comment: "smtp4dev" + group: "{{smtp4dev_group}}" + system: true + + - name: Add group smtp4dev to user ponos + ansible.builtin.user: + name: ponos + groups: "{{smtp4dev_group}}" + append: true + + - name: Create smtp4dev directory + ansible.builtin.file: + path: "{{smtp4dev_directory}}" + owner: "{{smtp4dev_user}}" + group: "{{smtp4dev_group}}" + state: directory + mode: '750' + + - name: Template docker-compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{smtp4dev_directory}}/docker-compose.yml" + owner: "{{smtp4dev_user}}" + group: "{{smtp4dev_group}}" + mode: '550' + + - name: Reset 
SSH connection to apply group changes + meta: reset_connection + + - name: Start smtp4dev service + community.docker.docker_compose_v2: + project_src: "{{smtp4dev_directory}}" + state: present + pull: always diff --git a/ansible/smtp4dev/docker-compose.yml.j2 b/ansible/smtp4dev/docker-compose.yml.j2 new file mode 100644 index 0000000..71c6531 --- /dev/null +++ b/ansible/smtp4dev/docker-compose.yml.j2 @@ -0,0 +1,24 @@ +services: + smtp4dev: + image: rnwood/smtp4dev + pull_policy: always + container_name: smtp4dev + restart: unless-stopped + ports: + - "{{smtp4dev_port}}:80" + - "{{smtp4dev_smtp_port}}:25" + - "{{smtp4dev_imap_port}}:143" + volumes: + - smtp4dev_data:/smtp4dev + environment: + ServerOptions__BasePath: "/" + ServerOptions__Hostname: smtp4dev + logging: + driver: syslog + options: + syslog-address: "tcp://127.0.0.1:{{smtp4dev_syslog_port}}" + syslog-format: "{{syslog_format}}" + tag: "smtp4dev" + +volumes: + smtp4dev_data: diff --git a/ansible/validate_puck_monitoring.yml b/ansible/validate_puck_monitoring.yml new file mode 100644 index 0000000..ba649e5 --- /dev/null +++ b/ansible/validate_puck_monitoring.yml @@ -0,0 +1,170 @@ +--- +# Red Panda Approved Validation Playbook 🐼 +# Validates process and container monitoring deployment +# +# Usage: ansible-playbook validate_puck_monitoring.yml + +- name: Validate Puck Process & Container Monitoring + hosts: puck.incus + gather_facts: false + tasks: + - name: "🐼 Check Alloy service is running" + become: true + ansible.builtin.systemd: + name: alloy + state: started + check_mode: true + register: alloy_status + + - name: "🐼 Verify Alloy is active" + ansible.builtin.assert: + that: + - alloy_status.status.ActiveState == "active" + fail_msg: "Alloy service is not running on puck" + success_msg: "✅ Alloy service is active" + + - name: "🐼 Check Alloy can access Docker socket" + become: true + ansible.builtin.command: id alloy + register: alloy_groups + changed_when: false + + - name: "🐼 Verify alloy is in docker 
group" + ansible.builtin.assert: + that: + - "'docker' in alloy_groups.stdout" + fail_msg: "Alloy user is not in docker group - cAdvisor won't work" + success_msg: "✅ Alloy user is in docker group" + + - name: "🐼 Wait for metrics to be available (30s)" + ansible.builtin.pause: + seconds: 30 + when: alloy_status.changed | default(false) + + - name: "🐼 Check Alloy health endpoint" + ansible.builtin.uri: + url: "http://localhost:12345/-/ready" + return_content: true + register: alloy_health + failed_when: false + + - name: "🐼 Report Alloy health" + ansible.builtin.debug: + msg: "{{ 'Alloy health: ' + alloy_health.status | string }}" + +- name: Validate Prometheus on Prospero + hosts: prospero.incus + gather_facts: false + tasks: + - name: "🐼 Check Prometheus service is running" + become: true + ansible.builtin.systemd: + name: prometheus + state: started + check_mode: true + register: prometheus_status + + - name: "🐼 Verify Prometheus is active" + ansible.builtin.assert: + that: + - prometheus_status.status.ActiveState == "active" + fail_msg: "Prometheus service is not running on prospero" + success_msg: "✅ Prometheus service is active" + + - name: "🐼 Check Prometheus can query puck process metrics" + ansible.builtin.uri: + url: "http://localhost:9090/api/v1/query?query=namedprocess_namegroup_num_procs{instance=~\"puck.*\"}" + return_content: true + register: process_metrics + failed_when: false + + - name: "🐼 Verify process metrics are available" + ansible.builtin.assert: + that: + - process_metrics.status == 200 + - process_metrics.json.status == "success" + - process_metrics.json.data.result | length > 0 + fail_msg: "No process metrics found from puck - check Alloy remote_write config" + success_msg: "✅ Process metrics are being received from puck ({{ process_metrics.json.data.result | length }} series)" + when: process_metrics.status == 200 + + - name: "🐼 Check Prometheus can query puck container metrics" + ansible.builtin.uri: + url: 
"http://localhost:9090/api/v1/query?query=container_last_seen{instance=~\"puck.*\"}" + return_content: true + register: container_metrics + failed_when: false + + - name: "🐼 Verify container metrics are available" + ansible.builtin.assert: + that: + - container_metrics.status == 200 + - container_metrics.json.status == "success" + fail_msg: "No container metrics found from puck - check cAdvisor config" + success_msg: "✅ Container metrics are being received from puck" + when: container_metrics.status == 200 + + - name: "🐼 Check alert rules are loaded" + ansible.builtin.uri: + url: "http://localhost:9090/api/v1/rules" + return_content: true + register: alert_rules + + - name: "🐼 Verify alert rules are present" + ansible.builtin.assert: + that: + - alert_rules.status == 200 + - alert_rules.json.data.groups | length > 0 + fail_msg: "No alert rules loaded in Prometheus" + success_msg: "✅ Alert rules are loaded ({{ alert_rules.json.data.groups | length }} groups)" + + - name: "🐼 Check for puck process alerts" + ansible.builtin.set_fact: + has_puck_alerts: "{{ alert_rules.json.data.groups | selectattr('name', 'equalto', 'puck_process_alerts') | list | length > 0 }}" + + - name: "🐼 Verify puck process alert group exists" + ansible.builtin.assert: + that: + - has_puck_alerts + fail_msg: "puck_process_alerts group not found in Prometheus rules" + success_msg: "✅ puck_process_alerts group is loaded" + + - name: "🐼 Check Alertmanager is reachable" + ansible.builtin.uri: + url: "http://localhost:9093/-/healthy" + return_content: true + register: alertmanager_health + failed_when: false + + - name: "🐼 Verify Alertmanager is healthy" + ansible.builtin.assert: + that: + - alertmanager_health.status == 200 + fail_msg: "Alertmanager is not healthy" + success_msg: "✅ Alertmanager is healthy" + when: alertmanager_health.status is defined + +- name: Summary + hosts: localhost + gather_facts: false + tasks: + - name: "🐼 Validation Complete" + ansible.builtin.debug: + msg: | + + 
╔═══════════════════════════════════════════════════════════════╗ + ║ 🐼 RED PANDA MONITORING VALIDATION COMPLETE 🐼 ║ + ╠═══════════════════════════════════════════════════════════════╣ + ║ ║ + ║ Next Steps: ║ + ║ 1. Import dashboards to Grafana: ║ + ║ - ansible/grafana/dashboards/puck_processes.json ║ + ║ - ansible/grafana/dashboards/puck_containers.json ║ + ║ ║ + ║ 2. Verify alerts in Prometheus UI: ║ + ║ http://prospero.incus:9090/alerts ║ + ║ ║ + ║ 3. Test alert routing: ║ + ║ http://prospero.incus:9093/#/alerts ║ + ║ ║ + ╚═══════════════════════════════════════════════════════════════╝ diff --git a/bookmarks.html b/bookmarks.html new file mode 100644 index 0000000..a511ee2 --- /dev/null +++ b/bookmarks.html @@ -0,0 +1,25 @@ + + + +Bookmarks +

Bookmarks

+

+

Bookmarks bar

+

+

pgAdmin 4 +
Kairos +
MCP Switchboard +
Kairos +
Icarlos +
Angelia +
Athena +
Arke +
Prometheus +
Open WebUI +
Grafana +
MCP OpenAPI Proxy - Swagger UI +
Neo4j Browser +

+

diff --git a/docs/Scalable Twelve Factor App.md b/docs/Scalable Twelve Factor App.md new file mode 100644 index 0000000..368755e --- /dev/null +++ b/docs/Scalable Twelve Factor App.md @@ -0,0 +1,38 @@ +# Scalable Twelve-Factor App +https://12factor.net/ + +The twelve-factor app is a methodology for building software-as-a-service apps that: + +Use declarative formats for setup automation, to minimize time and cost for new developers joining the project; +Have a clean contract with the underlying operating system, offering maximum portability between execution environments; +Are suitable for deployment on modern cloud platforms, obviating the need for servers and systems administration; +Minimize divergence between development and production, enabling continuous deployment for maximum agility; +And can scale up without significant changes to tooling, architecture, or development practices. + +I. Codebase +One codebase tracked in revision control, many deploys +II. Dependencies +Explicitly declare and isolate dependencies +III. Config +Store config in the environment +IV. Backing services +Treat backing services as attached resources +V. Build, release, run +Strictly separate build and run stages +VI. Processes +Execute the app as one or more stateless processes +VII. Port binding +Export services via port binding +VIII. Concurrency +Scale out via the process model +IX. Disposability +Maximize robustness with fast startup and graceful shutdown +X. Dev/prod parity +Keep development, staging, and production as similar as possible +XI. Logs +Treat logs as event streams +XII. 
Admin processes +Run admin/management tasks as one-off processes + +# Django Logging +https://lincolnloop.com/blog/django-logging-right-way/ diff --git a/docs/Semantic Versioning.md b/docs/Semantic Versioning.md new file mode 100644 index 0000000..7e2d496 --- /dev/null +++ b/docs/Semantic Versioning.md @@ -0,0 +1,13 @@ +# Semantic Versioning 2.0.0 +https://semver.org/ + +Given a version number MAJOR.MINOR.PATCH, increment the: + +MAJOR version when you make incompatible API changes +MINOR version when you add functionality in a backwards compatible manner +PATCH version when you make backwards compatible bug fixes +Additional labels for pre-release and build metadata are available as extensions to the MAJOR.MINOR.PATCH format. + + +GitHub Actions: Gitbump +https://betterprogramming.pub/how-to-version-your-code-in-2020-60bdd221278b diff --git a/docs/_template.md b/docs/_template.md new file mode 100644 index 0000000..253a0ed --- /dev/null +++ b/docs/_template.md @@ -0,0 +1,184 @@ +# Service Documentation Template + +This is a template for documenting services deployed in the Agathos sandbox. Copy this file and replace placeholders with service-specific information. + +--- + +# {Service Name} + +## Overview + +Brief description of the service, its purpose, and role in the infrastructure. + +**Host:** {hostname} (e.g., oberon, miranda, prospero) +**Role:** {role from Terraform} (e.g., container_orchestration, observability) +**Port Range:** {exposed ports} (e.g., 25580-25599) + +## Architecture + +``` +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ Client │────▶│ Service │────▶│ Database │ +└─────────────┘ └─────────────┘ └─────────────┘ +``` + +Describe the service architecture, data flow, and integration points. 
+ +## Terraform Resources + +### Host Definition + +The service runs on `{hostname}`, defined in `terraform/containers.tf`: + +| Attribute | Value | +|-----------|-------| +| Image | {noble/plucky/questing} | +| Role | {terraform role} | +| Security Nesting | {true/false} | +| Proxy Devices | {port mappings} | + +### Dependencies + +| Resource | Relationship | +|----------|--------------| +| {other host} | {description of dependency} | + +## Ansible Deployment + +### Playbook + +```bash +cd ansible +ansible-playbook {service}/deploy.yml +``` + +### Files + +| File | Purpose | +|------|---------| +| `{service}/deploy.yml` | Main deployment playbook | +| `{service}/*.j2` | Jinja2 templates | + +### Variables + +#### Group Variables (`group_vars/all/main.yml`) + +| Variable | Description | Default | +|----------|-------------|---------| +| `{service}_version` | Version to deploy | `latest` | + +#### Host Variables (`host_vars/{hostname}.yml`) + +| Variable | Description | +|----------|-------------| +| `{service}_port` | Service port | +| `{service}_data_dir` | Data directory | + +#### Vault Variables (`group_vars/all/vault.yml`) + +| Variable | Description | +|----------|-------------| +| `vault_{service}_password` | Service password | +| `vault_{service}_api_key` | API key (if applicable) | + +## Configuration + +### Environment Variables + +| Variable | Description | Source | +|----------|-------------|--------| +| `{VAR_NAME}` | Description | `{{ vault_{service}_var }}` | + +### Configuration Files + +| File | Location | Template | +|------|----------|----------| +| `config.yml` | `/etc/{service}/` | `{service}/config.yml.j2` | + +## Monitoring + +### Prometheus Metrics + +| Metric | Description | +|--------|-------------| +| `{service}_requests_total` | Total requests | +| `{service}_errors_total` | Total errors | + +**Scrape Target:** Configured in `ansible/prometheus/` or via Alloy. 
+ +### Loki Logs + +| Log Source | Labels | +|------------|--------| +| Application log | `{job="{service}", host="{hostname}"}` | +| Access log | `{job="{service}_access", host="{hostname}"}` | + +**Collection:** Alloy agent on host ships logs to Loki on Prospero. + +### Grafana Dashboard + +Dashboard provisioned at: `ansible/grafana/dashboards/{service}.json` + +## Operations + +### Start/Stop + +```bash +# Via systemd (if applicable) +sudo systemctl start {service} +sudo systemctl stop {service} + +# Via Docker (if applicable) +docker compose -f /opt/{service}/docker-compose.yml up -d +docker compose -f /opt/{service}/docker-compose.yml down +``` + +### Health Check + +```bash +curl http://{hostname}.incus:{port}/health +``` + +### Logs + +```bash +# Systemd +journalctl -u {service} -f + +# Docker +docker logs -f {container_name} + +# Loki (via Grafana Explore) +{job="{service}"} +``` + +### Backup + +Describe backup procedures, scripts, and schedules. + +### Restore + +Describe restore procedures and verification steps. + +## Troubleshooting + +### Common Issues + +| Symptom | Cause | Resolution | +|---------|-------|------------| +| Service won't start | Missing config | Check `{config_file}` exists | +| Connection refused | Firewall/proxy | Verify Incus proxy device | + +### Debug Mode + +```bash +# Enable debug logging +{service} --debug +``` + +## References + +- Official Documentation: {url} +- [Terraform Practices](../terraform.md) +- [Ansible Practices](../ansible.md) +- [Sandbox Overview](../sandbox.html) diff --git a/docs/ansible.md b/docs/ansible.md new file mode 100644 index 0000000..710b31b --- /dev/null +++ b/docs/ansible.md @@ -0,0 +1,705 @@ +# Ansible Project Structure - Best Practices + +This document describes the clean, maintainable Ansible structure implemented in the Agathos project. Use this as a reference template for other Ansible projects. 
+ +## Overview + +This structure emphasizes: +- **Simplicity**: Minimal files at root level +- **Organization**: Services contain all related files (playbooks + templates) +- **Separation**: Variables live in dedicated files, not inline in inventory +- **Discoverability**: Clear naming and logical grouping + +## Directory Structure + +``` +ansible/ +├── ansible.cfg # Ansible configuration +├── .vault_pass # Vault password file +│ +├── site.yml # Master orchestration playbook +├── apt_update.yml # Utility: Update all hosts +├── sandbox_up.yml # Utility: Start infrastructure +├── sandbox_down.yml # Utility: Stop infrastructure +│ +├── inventory/ # Inventory organization +│ ├── hosts # Simple host/group membership +│ │ +│ ├── group_vars/ # Variables for groups +│ │ └── all/ +│ │ ├── vars.yml # Common variables +│ │ └── vault.yml # Encrypted secrets +│ │ +│ └── host_vars/ # Variables per host +│ ├── hostname1.yml # All vars for hostname1 +│ ├── hostname2.yml # All vars for hostname2 +│ └── ... +│ +└── service_name/ # Per-service directories + ├── deploy.yml # Main deployment playbook + ├── stage.yml # Staging playbook (if needed) + ├── template1.j2 # Jinja2 templates + ├── template2.j2 + └── files/ # Static files (if needed) +``` + +## Key Components + +### 1. Simplified Inventory (`inventory/hosts`) + +**Purpose**: Define ONLY host/group membership, no variables + +**Example**: +```yaml +--- +# Ansible Inventory - Simplified + +# Main infrastructure group +ubuntu: + hosts: + server1.example.com: + server2.example.com: + server3.example.com: + +# Service-specific groups +web_servers: + hosts: + server1.example.com: + +database_servers: + hosts: + server2.example.com: +``` + +**Before**: 361 lines with variables inline +**After**: 34 lines of pure structure + +### 2. 
Host Variables (`inventory/host_vars/`) + +**Purpose**: All configuration specific to a single host + +**File naming**: `{hostname}.yml` (matches inventory hostname exactly) + +**Example** (`inventory/host_vars/server1.example.com.yml`): +```yaml +--- +# Server1 Configuration - Web Server +# Services: nginx, php-fpm, redis + +services: + - nginx + - php + - redis + +# Nginx Configuration +nginx_user: www-data +nginx_worker_processes: auto +nginx_port: 80 +nginx_ssl_port: 443 + +# PHP-FPM Configuration +php_version: 8.2 +php_max_children: 50 + +# Redis Configuration +redis_port: 6379 +redis_password: "{{vault_redis_password}}" +``` + +### 3. Group Variables (`inventory/group_vars/`) + +**Purpose**: Variables shared across multiple hosts + +**Structure**: +``` +group_vars/ +├── all/ # Variables for ALL hosts +│ ├── vars.yml # Common non-sensitive config +│ └── vault.yml # Encrypted secrets (ansible-vault) +│ +└── web_servers/ # Variables for web_servers group + └── vars.yml +``` + +**Example** (`inventory/group_vars/all/vars.yml`): +```yaml +--- +# Common Variables for All Hosts + +remote_user: ansible +deployment_environment: production +ansible_python_interpreter: /usr/bin/python3 + +# Release versions +app_release: v1.2.3 +api_release: v2.0.1 + +# Monitoring endpoints +prometheus_url: http://monitoring.example.com:9090 +loki_url: http://monitoring.example.com:3100 +``` + +### 4. 
Service Directories + +**Purpose**: Group all files related to a service deployment + +**Pattern**: `{service_name}/` + +**Contents**: +- `deploy.yml` - Main deployment playbook +- `stage.yml` - Staging/update playbook (optional) +- `*.j2` - Jinja2 templates +- `files/` - Static files (if needed) +- `tasks/` - Task files (if splitting large playbooks) + +**Example Structure**: +``` +nginx/ +├── deploy.yml # Deployment playbook +├── nginx.conf.j2 # Main config template +├── site.conf.j2 # Virtual host template +├── nginx.service.j2 # Systemd service file +└── files/ + └── ssl_params.conf # Static SSL configuration +``` + +### 5. Master Playbook (`site.yml`) + +**Purpose**: Orchestrate full-stack deployment + +**Pattern**: Import service playbooks in dependency order + +**Example**: +```yaml +--- +- name: Update All Hosts + import_playbook: apt_update.yml + +- name: Deploy Docker + import_playbook: docker/deploy.yml + +- name: Deploy PostgreSQL + import_playbook: postgresql/deploy.yml + +- name: Deploy Application + import_playbook: myapp/deploy.yml + +- name: Deploy Monitoring + import_playbook: prometheus/deploy.yml +``` + +### 6. 
Service Playbook Pattern + +**Location**: `{service}/deploy.yml` + +**Standard Structure**: +```yaml +--- +- name: Deploy Service Name + hosts: target_group + tasks: + + # Service detection (if using services list) + - name: Check if host has service_name service + ansible.builtin.set_fact: + has_service: "{{ 'service_name' in services | default([]) }}" + + - name: Skip hosts without service + ansible.builtin.meta: end_host + when: not has_service + + # Actual deployment tasks + - name: Create service user + become: true + ansible.builtin.user: + name: "{{service_user}}" + group: "{{service_group}}" + system: true + + - name: Template configuration + become: true + ansible.builtin.template: + src: config.j2 + dest: "{{service_directory}}/config.yml" + notify: restart service + + # Handlers + handlers: + - name: restart service + become: true + ansible.builtin.systemd: + name: service_name + state: restarted + daemon_reload: true +``` + +**IMPORTANT: Template Path Convention** +- When playbooks are inside service directories, template `src:` paths are relative to that directory +- Use `src: config.j2` NOT `src: service_name/config.j2` +- The service directory prefix was correct when playbooks were at the ansible root, but is wrong now + +**Host-Specific Templates** +Some services need different configuration per host. 
Store these in subdirectories named by hostname: + +``` +service_name/ +├── deploy.yml +├── config.j2 # Default template +├── hostname1/ # Host-specific overrides +│ └── config.j2 +├── hostname2/ +│ └── config.j2 +└── hostname3/ + └── config.j2 +``` + +Use conditional logic to select the correct template: + +```yaml +- name: Check for host-specific configuration + ansible.builtin.stat: + path: "{{playbook_dir}}/{{inventory_hostname_short}}/config.j2" + delegate_to: localhost + register: host_specific_config + become: false + +- name: Template host-specific configuration + become: true + ansible.builtin.template: + src: "{{playbook_dir}}/{{inventory_hostname_short}}/config.j2" + dest: "{{service_directory}}/config" + when: host_specific_config.stat.exists + +- name: Template default configuration + become: true + ansible.builtin.template: + src: config.j2 + dest: "{{service_directory}}/config" + when: not host_specific_config.stat.exists +``` + +**Real Example: Alloy Service** +``` +alloy/ +├── deploy.yml +├── config.alloy.j2 # Default configuration +├── ariel/ # Neo4j monitoring +│ └── config.alloy.j2 +├── miranda/ # Docker monitoring +│ └── config.alloy.j2 +├── oberon/ # Web services monitoring +│ └── config.alloy.j2 +└── puck/ # Application monitoring + └── config.alloy.j2 +``` + +## Service Detection Pattern + +**Purpose**: Allow hosts to selectively run service playbooks + +**How it works**: +1. Each host defines a `services:` list in `host_vars/` +2. Each playbook checks if its service is in the list +3. 
Playbook skips host if service not needed + +**Example**: + +`inventory/host_vars/server1.yml`: +```yaml +services: + - docker + - nginx + - redis +``` + +`nginx/deploy.yml`: +```yaml +- name: Deploy Nginx + hosts: ubuntu + tasks: + - name: Check if host has nginx service + ansible.builtin.set_fact: + has_nginx: "{{ 'nginx' in services | default([]) }}" + + - name: Skip hosts without nginx + ansible.builtin.meta: end_host + when: not has_nginx + + # Rest of tasks only run if nginx in services list +``` + +## Ansible Vault Integration + +**Setup**: +```bash +# Create vault password file (one-time) +echo "your_vault_password" > .vault_pass +chmod 600 .vault_pass + +# Configure ansible.cfg +echo "vault_password_file = .vault_pass" >> ansible.cfg +``` + +**Usage**: +```bash +# Edit vault file +ansible-vault edit inventory/group_vars/all/vault.yml + +# View vault file +ansible-vault view inventory/group_vars/all/vault.yml + +# Encrypt new file +ansible-vault encrypt secrets.yml +``` + +**Variable naming convention**: +- Prefix vault variables with `vault_` +- Reference in regular vars: `db_password: "{{vault_db_password}}"` + +## Running Playbooks + +**Full deployment**: +```bash +ansible-playbook site.yml +``` + +**Single service**: +```bash +ansible-playbook nginx/deploy.yml +``` + +**Specific hosts**: +```bash +ansible-playbook nginx/deploy.yml --limit server1.example.com +``` + +**Check mode (dry-run)**: +```bash +ansible-playbook site.yml --check +``` + +**With extra verbosity**: +```bash +ansible-playbook nginx/deploy.yml -vv +``` + +## Benefits of This Structure + +### 1. Cleaner Root Directory +- **Before**: 29+ playbook files cluttering root +- **After**: 3-4 utility playbooks + site.yml + +### 2. Simplified Inventory +- **Before**: 361 lines with inline variables +- **After**: 34 lines of pure structure +- Variables organized logically by host/group + +### 3. 
Service Cohesion +- Everything related to a service in one place +- Easy to find templates when editing playbooks +- Natural grouping for git operations + +### 4. Scalability +- Easy to add new services (create directory, add playbook) +- Easy to add new hosts (create host_vars file) +- No risk of playbook name conflicts + +### 5. Reusability +- Service directories can be copied to other projects +- host_vars pattern works for any inventory size +- Clear separation of concerns + +### 6. Maintainability +- Changes isolated to service directories +- Inventory file rarely needs editing +- Clear audit trail in git (changes per service) + +## Migration Checklist + +Moving an existing Ansible project to this structure: + +- [ ] Create service directories for each playbook +- [ ] Move `{service}_deploy.yml` → `{service}/deploy.yml` +- [ ] Move templates into service directories +- [ ] Extract host variables from inventory to `host_vars/` +- [ ] Extract group variables to `group_vars/all/vars.yml` +- [ ] Move secrets to `group_vars/all/vault.yml` (encrypted) +- [ ] Update `site.yml` import_playbook paths +- [ ] Backup original inventory: `cp hosts hosts.backup` +- [ ] Create simplified inventory with only group/host structure +- [ ] Test with `ansible-playbook site.yml --check` +- [ ] Verify with limited deployment: `--limit test_host` + +## Example: Adding a New Service + +**1. Create service directory**: +```bash +mkdir ansible/myapp +``` + +**2. Create deployment playbook** (`ansible/myapp/deploy.yml`): +```yaml +--- +- name: Deploy MyApp + hosts: ubuntu + tasks: + - name: Check if host has myapp service + ansible.builtin.set_fact: + has_myapp: "{{ 'myapp' in services | default([]) }}" + + - name: Skip hosts without myapp + ansible.builtin.meta: end_host + when: not has_myapp + + - name: Deploy myapp + # ... deployment tasks +``` + +**3. 
Create template** (`ansible/myapp/config.yml.j2`): +```yaml +app_name: MyApp +port: {{myapp_port}} +database: {{myapp_db_host}} +``` + +**4. Add variables to host** (`inventory/host_vars/server1.yml`): +```yaml +services: + - myapp # Add to services list + +# MyApp configuration +myapp_port: 8080 +myapp_db_host: db.example.com +``` + +**5. Add to site.yml**: +```yaml +- name: Deploy MyApp + import_playbook: myapp/deploy.yml +``` + +**6. Deploy**: +```bash +ansible-playbook myapp/deploy.yml +``` + +## Best Practices + +### Naming Conventions +- Service directories: lowercase, underscores (e.g., `mcp_switchboard/`) +- Playbooks: `deploy.yml`, `stage.yml`, `remove.yml` +- Templates: descriptive name + `.j2` extension +- Variables: service prefix (e.g., `nginx_port`, `redis_password`) +- Vault variables: `vault_` prefix + +### File Organization +- Keep playbooks under 100 lines (split into task files if larger) +- Group related templates in service directory +- Use comments to document non-obvious variables +- Add README.md to complex service directories + +### Variable Organization +- Host-specific: `host_vars/{hostname}.yml` +- Service-specific across hosts: `group_vars/{service_group}/vars.yml` +- Global configuration: `group_vars/all/vars.yml` +- Secrets: `group_vars/all/vault.yml` (encrypted) + +### Idempotency +- Use `creates:` parameter for one-time operations +- Use `state:` explicitly (present/absent/restarted) +- Check conditions before destructive operations +- Test with `--check` mode before applying + +### Documentation +- Comment complex task logic +- Document required variables in playbook header +- Add README.md for service directories with many files +- Keep docs/ separate from ansible/ directory + +## Related Documentation + +- [Ansible Best Practices](https://docs.ansible.com/ansible/latest/tips_tricks/ansible_tips_tricks.html) +- [Ansible Vault Guide](https://docs.ansible.com/ansible/latest/vault_guide/index.html) +- [Inventory 
Organization](https://docs.ansible.com/ansible/latest/inventory_guide/intro_inventory.html) + +## Secret Management Patterns + +### Ansible Vault (Sandbox Environment) + +**Purpose**: Store sensitive values encrypted at rest in version control + +**File Location**: `inventory/group_vars/all/vault.yml` + +**Variable Naming Convention**: Prefix all vault variables with `vault_` + +**Example vault.yml**: +Note the entire vault file is encrypted +```yaml +--- +# Database passwords +vault_postgres_admin_password: # Avoid special characters & non-ASCII +vault_casdoor_db_password: +# S3 credentials +vault_casdoor_s3_access_key: +vault_casdoor_s3_secret_key: +vault_casdoor_s3_bucket: +``` + +**Host Variables Reference Vault**: +```yaml +# In host_vars/oberon.incus.yml +casdoor_db_password: "{{ vault_casdoor_db_password }}" +casdoor_s3_access_key: "{{ vault_casdoor_s3_access_key }}" +casdoor_s3_secret_key: "{{ vault_casdoor_s3_secret_key }}" +casdoor_s3_bucket: "{{ vault_casdoor_s3_bucket }}" + +# Non-sensitive values stay as plain variables +casdoor_s3_endpoint: "https://ariel.incus:9000" +casdoor_s3_region: "us-east-1" +``` + +**Prerequisites**: +- Set `ANSIBLE_VAULT_PASSWORD_FILE` environment variable +- Create `.vault_pass` file with vault password +- Add `.vault_pass` to `.gitignore` + +**Encrypting New Values**: +```bash +# Encrypt a string and add to vault.yml +echo -n "secret_value" | ansible-vault encrypt_string --stdin-name 'vault_variable_name' + +# Edit vault file directly +ansible-vault edit inventory/group_vars/all/vault.yml +``` + +### OCI Vault (Production Environment) + +**Purpose**: Use Oracle Cloud Infrastructure Vault for centralized secret management + +**Variable Pattern**: Use Ansible lookups to fetch secrets at runtime + +**Example host_vars for OCI**: +```yaml +# In host_vars/production-server.yml + +# Database passwords from OCI Vault +casdoor_db_password: "{{ lookup('community.oci.oci_secret', 'casdoor-db-password', 
compartment_id=oci_compartment_id, vault_id=oci_services_vault_id) }}" + +# S3 credentials from OCI Vault +casdoor_s3_access_key: "{{ lookup('community.oci.oci_secret', 'casdoor-s3-access-key', compartment_id=oci_compartment_id, vault_id=oci_services_vault_id) }}" +casdoor_s3_secret_key: "{{ lookup('community.oci.oci_secret', 'casdoor-s3-secret-key', compartment_id=oci_compartment_id, vault_id=oci_services_vault_id) }}" +casdoor_s3_bucket: "{{ lookup('community.oci.oci_secret', 'casdoor-s3-bucket', compartment_id=oci_compartment_id, vault_id=oci_services_vault_id) }}" + +# Non-sensitive values remain as plain variables +casdoor_s3_endpoint: "https://objectstorage.us-phoenix-1.oraclecloud.com" +casdoor_s3_region: "us-phoenix-1" +``` + +**OCI Vault Organization**: +``` +OCI Compartment: production +├── Vault: agathos-databases +│ ├── Secret: postgres-admin-password +│ └── Secret: casdoor-db-password +│ +├── Vault: agathos-services +│ ├── Secret: casdoor-s3-access-key +│ ├── Secret: casdoor-s3-secret-key +│ ├── Secret: casdoor-s3-bucket +│ └── Secret: openwebui-db-password +│ +└── Vault: agathos-integrations + ├── Secret: apikey-openai + └── Secret: apikey-anthropic +``` + +**Secret Naming Convention**: +- Ansible Vault: `vault_service_secret` (underscores) +- OCI Vault: `service-secret` (hyphens) + +**Benefits of Two-Tier Pattern**: +1. **Portability**: Service playbooks remain unchanged across environments +2. **Flexibility**: Switch secret backends by changing only host_vars +3. **Clarity**: Variable names clearly indicate their purpose +4. 
**Security**: Secrets never appear in playbooks or templates + +### S3 Bucket Provisioning with Ansible + +**Purpose**: Provision Incus S3 buckets and manage credentials in Ansible Vault + +**Playbooks**: +- `provision_s3.yml` - Create bucket and store credentials +- `regenerate_s3_key.yml` - Rotate credentials +- `remove_s3.yml` - Delete bucket and clean vault + +**Usage**: +```bash +# Provision new S3 bucket for a service +ansible-playbook provision_s3.yml -e bucket_name=casdoor -e service_name=casdoor + +# Regenerate access credentials (invalidates old keys) +ansible-playbook regenerate_s3_key.yml -e bucket_name=casdoor -e service_name=casdoor + +# Remove bucket and credentials +ansible-playbook remove_s3.yml -e bucket_name=casdoor -e service_name=casdoor +``` + +**Requirements**: +- User must be member of `incus` group +- `ANSIBLE_VAULT_PASSWORD_FILE` must be set +- Incus CLI must be configured and accessible + +**What Gets Created**: +1. Incus storage bucket in project `agathos`, pool `default` +2. Admin access key for the bucket +3. Encrypted vault entries: `vault_{service}_s3_access_key`, `vault_{service}_s3_secret_key`, `vault_{service}_s3_bucket` (where `{service}` is the value passed via `-e service_name=...`) + +**Behind the Scenes**: +- Role: `incus_storage_bucket` +- Idempotent: Checks if bucket/key exists before creating +- Atomic: Credentials captured and encrypted in single operation +- Variables sourced from: `inventory/group_vars/all/vars.yml` + +## Troubleshooting + +### Template Not Found Errors + +**Symptom**: `Could not find or access 'service_name/template.j2'` + +**Cause**: When playbooks were moved from ansible root into service directories, template paths weren't updated. 
+ +**Solution**: Remove the service directory prefix from template paths: +```yaml +# WRONG (old path from when playbook was at root) +src: service_name/config.j2 + +# CORRECT (playbook is now in service_name/ directory) +src: config.j2 +``` + +### Host-Specific Template Path Issues + +**Symptom**: Playbook fails to find host-specific templates + +**Cause**: Host-specific directories are at the wrong level + +**Expected Structure**: +``` +service_name/ +├── deploy.yml +├── config.j2 # Default +└── hostname/ # Host-specific (inside service dir) + └── config.j2 +``` + +**Use `{{playbook_dir}}` for relative paths**: +```yaml +# This finds templates relative to the playbook location +src: "{{playbook_dir}}/{{inventory_hostname_short}}/config.j2" +``` + +--- + +**Last Updated**: December 2025 +**Project**: Agathos Infrastructure +**Approval**: Red Panda Approved™ diff --git a/docs/anythingllm.md b/docs/anythingllm.md new file mode 100644 index 0000000..36b9beb --- /dev/null +++ b/docs/anythingllm.md @@ -0,0 +1,334 @@ +# AnythingLLM + +## Overview + +AnythingLLM is a full-stack application that provides a unified interface for interacting with Large Language Models (LLMs). It supports multi-provider LLM access, document intelligence (RAG with pgvector), AI agents with tools, and Model Context Protocol (MCP) extensions. 
+ +**Host:** Rosalind +**Role:** go_nodejs_php_apps +**Port:** 22084 (internal), accessible via `anythingllm.ouranos.helu.ca` (HAProxy) + +## Architecture + +``` +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ Client │────▶│ HAProxy │────▶│ AnythingLLM │ +│ (Browser/API) │ │ (Titania) │ │ (Rosalind) │ +└─────────────────┘ └─────────────────┘ └────────┬────────┘ + │ + ┌────────────────────────────────┼────────────────────────────────┐ + │ │ │ + ▼ ▼ ▼ + ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ + │ PostgreSQL │ │ LLM Backend │ │ TTS Service │ + │ + pgvector │ │ (pan.helu.ca) │ │ (FastKokoro) │ + │ (Portia) │ │ llama-cpp │ │ pan.helu.ca │ + └─────────────────┘ └─────────────────┘ └─────────────────┘ +``` + +### Directory Structure + +AnythingLLM uses a native Node.js deployment with the following directory layout: + +``` +/srv/anythingllm/ +├── app/ # Cloned git repository +│ ├── server/ # Backend API server +│ │ ├── .env # Environment configuration +│ │ └── node_modules/ +│ ├── collector/ # Document processing service +│ │ ├── hotdir -> ../hotdir # SYMLINK (critical!) +│ │ └── node_modules/ +│ └── frontend/ # React frontend (built into server) +├── storage/ # Persistent data +│ ├── documents/ # Processed documents +│ ├── vector-cache/ # Embedding cache +│ └── plugins/ # MCP server configs +└── hotdir/ # Upload staging directory (actual location) + +/srv/collector/ +└── hotdir -> /srv/anythingllm/hotdir # SYMLINK (critical!) +``` + +### Hotdir Path Resolution (Critical) + +The server and collector use **different path resolution** for the upload directory: + +| Component | Code Location | Resolves To | +|-----------|--------------|-------------| +| **Server** (multer) | `STORAGE_DIR/../../collector/hotdir` | `/srv/collector/hotdir` | +| **Collector** | `__dirname/../hotdir` | `/srv/anythingllm/app/collector/hotdir` | + +Both paths must point to the same physical directory. This is achieved with **two symlinks**: + +1. 
`/srv/collector/hotdir` → `/srv/anythingllm/hotdir` +2. `/srv/anythingllm/app/collector/hotdir` → `/srv/anythingllm/hotdir` + +⚠️ **Important**: The collector ships with an empty `hotdir/` directory. The Ansible deploy must **remove** this directory before creating the symlink, or file uploads will fail with "File does not exist in upload directory." + +### Key Integrations + +| Component | Host | Purpose | +|-----------|------|---------| +| PostgreSQL + pgvector | Portia | Vector database for RAG embeddings | +| LLM Provider | pan.helu.ca:22071 | Generic OpenAI-compatible llama-cpp | +| TTS Service | pan.helu.ca:22070 | FastKokoro text-to-speech | +| HAProxy | Titania | TLS termination and routing | +| Loki | Prospero | Log aggregation | + +## Terraform Resources + +### Host Definition + +AnythingLLM runs on **Rosalind**, which is already defined in `terraform/containers.tf`: + +| Attribute | Value | +|-----------|-------| +| Image | noble | +| Role | go_nodejs_php_apps | +| Security Nesting | true | +| AppArmor | unconfined | +| Port Range | 22080-22099 | + +No Terraform changes required—AnythingLLM uses port 22084 within Rosalind's existing range. 
+ +## Ansible Deployment + +### Playbook + +```bash +cd ansible +source ~/env/agathos/bin/activate + +# Deploy PostgreSQL database first (if not already done) +ansible-playbook postgresql/deploy.yml + +# Deploy AnythingLLM +ansible-playbook anythingllm/deploy.yml + +# Redeploy HAProxy to pick up new backend +ansible-playbook haproxy/deploy.yml + +# Redeploy Alloy to pick up new log source +ansible-playbook alloy/deploy.yml +``` + +### Files + +| File | Purpose | +|------|---------| +| `anythingllm/deploy.yml` | Main deployment playbook | +| `anythingllm/anythingllm-server.service.j2` | Systemd service for server | +| `anythingllm/anythingllm-collector.service.j2` | Systemd service for collector | +| `anythingllm/env.j2` | Environment variables template | + +### Variables + +#### Host Variables (`host_vars/rosalind.incus.yml`) + +| Variable | Description | Default | +|----------|-------------|---------| +| `anythingllm_user` | Service account user | `anythingllm` | +| `anythingllm_group` | Service account group | `anythingllm` | +| `anythingllm_directory` | Installation directory | `/srv/anythingllm` | +| `anythingllm_port` | Service port | `22084` | +| `anythingllm_db_host` | PostgreSQL host | `portia.incus` | +| `anythingllm_db_port` | PostgreSQL port | `5432` | +| `anythingllm_db_name` | Database name | `anythingllm` | +| `anythingllm_db_user` | Database user | `anythingllm` | +| `anythingllm_llm_base_url` | LLM API endpoint | `http://pan.helu.ca:22071/v1` | +| `anythingllm_llm_model` | Default LLM model | `llama-3-8b` | +| `anythingllm_embedding_engine` | Embedding engine | `native` | +| `anythingllm_tts_provider` | TTS provider | `openai` | +| `anythingllm_tts_endpoint` | TTS API endpoint | `http://pan.helu.ca:22070/v1` | + +#### Vault Variables (`group_vars/all/vault.yml`) + +| Variable | Description | +|----------|-------------| +| `vault_anythingllm_db_password` | PostgreSQL password | +| `vault_anythingllm_jwt_secret` | JWT signing secret (32+ chars) | +| 
`vault_anythingllm_sig_key` | Signature key (32+ chars) | +| `vault_anythingllm_sig_salt` | Signature salt (32+ chars) | + +Generate secrets with: +```bash +openssl rand -hex 32 +``` + +## Configuration + +### Environment Variables + +| Variable | Description | Source | +|----------|-------------|--------| +| `JWT_SECRET` | JWT signing secret | `vault_anythingllm_jwt_secret` | +| `SIG_KEY` | Signature key | `vault_anythingllm_sig_key` | +| `SIG_SALT` | Signature salt | `vault_anythingllm_sig_salt` | +| `VECTOR_DB` | Vector database type | `pgvector` | +| `PGVECTOR_CONNECTION_STRING` | PostgreSQL connection | Composed from host_vars | +| `LLM_PROVIDER` | LLM provider type | `generic-openai` | +| `EMBEDDING_ENGINE` | Embedding engine | `native` | +| `TTS_PROVIDER` | TTS provider | `openai` | + +### External Access + +AnythingLLM is accessible via HAProxy on Titania: + +| URL | Backend | +|-----|---------| +| `https://anythingllm.ouranos.helu.ca` | `rosalind.incus:22084` | + +The HAProxy backend is configured in `host_vars/titania.incus.yml`. + +## Monitoring + +### Loki Logs + +| Log Source | Labels | +|------------|--------| +| Server logs | `{unit="anythingllm-server.service"}` | +| Collector logs | `{unit="anythingllm-collector.service"}` | + +Logs are collected via systemd journal → Alloy on Rosalind → Loki on Prospero. 
+ +**Grafana Query:** +```logql +{unit=~"anythingllm.*"} |= `` +``` + +### Health Check + +```bash +# From any sandbox host +curl http://rosalind.incus:22084/api/ping + +# Via HAProxy (external) +curl -k https://anythingllm.ouranos.helu.ca/api/ping +``` + +## Operations + +### Start/Stop + +```bash +# SSH to Rosalind +ssh rosalind.incus + +# Manage via systemd +sudo systemctl start anythingllm-server # Start server +sudo systemctl start anythingllm-collector # Start collector +sudo systemctl stop anythingllm-server # Stop server +sudo systemctl stop anythingllm-collector # Stop collector +sudo systemctl restart anythingllm-server # Restart server +sudo systemctl restart anythingllm-collector # Restart collector +``` + +### Logs + +```bash +# Real-time server logs +journalctl -u anythingllm-server -f + +# Real-time collector logs +journalctl -u anythingllm-collector -f + +# Grafana (historical) +# Query: {unit=~"anythingllm.*"} +``` + +### Upgrade + +Pull latest code and redeploy: + +```bash +ansible-playbook anythingllm/deploy.yml +``` + +## Vault Setup + +Add the following secrets to `ansible/inventory/group_vars/all/vault.yml`: + +```bash +ansible-vault edit ansible/inventory/group_vars/all/vault.yml +``` + +```yaml +# AnythingLLM Secrets +vault_anythingllm_db_password: "your-secure-password" +vault_anythingllm_jwt_secret: "your-32-char-jwt-secret" +vault_anythingllm_sig_key: "your-32-char-signature-key" +vault_anythingllm_sig_salt: "your-32-char-signature-salt" +``` + +## Follow-On Tasks + +### MCP Server Integration + +AnythingLLM supports Model Context Protocol (MCP) for extending AI agent capabilities. Future integration with existing MCP servers: + +| MCP Server | Host | Tools | +|------------|------|-------| +| MCPO | Miranda | Docker management | +| Neo4j MCP | Miranda | Graph database queries | +| GitHub MCP | (external) | Repository operations | + +Configure MCP connections via AnythingLLM Admin UI after initial deployment. 
+ +### Casdoor SSO + +For single sign-on integration, configure AnythingLLM to authenticate via Casdoor OAuth2. This requires: +1. Creating an application in Casdoor admin +2. Configuring OAuth2 environment variables in AnythingLLM +3. Optionally using OAuth2-Proxy for transparent authentication + +## Troubleshooting + +### File Upload Fails with "File does not exist in upload directory" + +**Symptom:** Uploading files via the UI returns 500 Internal Server Error with message "File does not exist in upload directory." + +**Cause:** The server uploads files to `/srv/collector/hotdir`, but the collector looks for them in `/srv/anythingllm/app/collector/hotdir`. If these aren't the same physical directory, uploads fail. + +**Solution:** Verify symlinks are correctly configured: + +```bash +# Check symlinks +ls -la /srv/collector/hotdir +# Should show: /srv/collector/hotdir -> /srv/anythingllm/hotdir + +ls -la /srv/anythingllm/app/collector/hotdir +# Should show: /srv/anythingllm/app/collector/hotdir -> /srv/anythingllm/hotdir + +# If collector/hotdir is a directory (not symlink), fix it: +sudo rm -rf /srv/anythingllm/app/collector/hotdir +sudo ln -s /srv/anythingllm/hotdir /srv/anythingllm/app/collector/hotdir +sudo chown -h anythingllm:anythingllm /srv/anythingllm/app/collector/hotdir +sudo systemctl restart anythingllm-collector +``` + +### Container Won't Start + +Check Docker logs: +```bash +sudo docker logs anythingllm +``` + +Verify PostgreSQL connectivity: +```bash +psql -h portia.incus -U anythingllm -d anythingllm +``` + +### Database Connection Issues + +Ensure pgvector extension is enabled: +```bash +psql -h portia.incus -U postgres -d anythingllm -c "SELECT * FROM pg_extension WHERE extname = 'vector';" +``` + +### LLM Provider Issues + +Test LLM endpoint directly: +```bash +curl http://pan.helu.ca:22071/v1/models +``` diff --git a/docs/anythingllm_mcp.md b/docs/anythingllm_mcp.md new file mode 100644 index 0000000..a634665 --- /dev/null +++ 
b/docs/anythingllm_mcp.md @@ -0,0 +1,207 @@ +# AnythingLLM MCP Server Configuration + +## Overview + +AnythingLLM supports [Model Context Protocol (MCP)](https://modelcontextprotocol.io) servers, allowing AI agents to call tools provided by local processes or remote services. MCP servers are managed by the internal `MCPHypervisor` singleton and configured via a single JSON file. + +## Configuration File Location + +| Environment | Path | +|-------------|------| +| Development | `server/storage/plugins/anythingllm_mcp_servers.json` | +| Production / Docker | `$STORAGE_DIR/plugins/anythingllm_mcp_servers.json` | + +The file and its parent directory are created automatically with an empty `{ "mcpServers": {} }` object if they do not already exist. + +## File Format + +```json +{ + "mcpServers": { + "<server-name>": { ... }, + "<another-server-name>": { ... } + } +} +``` + +Each key inside `mcpServers` is the unique name used to identify the server within AnythingLLM. The value is the server definition, whose required fields depend on the transport type (see below). + +--- + +## Transport Types + +### `stdio` — Local Process + +Spawns a local process and communicates over stdin/stdout. The transport type is inferred automatically when a `command` field is present. + +```json +{ + "mcpServers": { + "filesystem": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-filesystem", "/home/user/docs"], + "env": { + "SOME_VAR": "value" + } + } + } +} +``` + +| Field | Required | Description | +|-------|----------|-------------| + | `command` | ✅ | Executable to run (e.g. `npx`, `node`, `python3`) | +| `args` | ❌ | Array of arguments passed to the command | +| `env` | ❌ | Extra environment variables merged into the process environment | + +> **Note:** The process inherits PATH and NODE_PATH from the shell environment that started AnythingLLM. If a command such as `npx` is not found, ensure it is available in that shell's PATH. 
+ +--- + +### `sse` — Server-Sent Events (legacy) + +Connects to a remote MCP server using the legacy SSE transport. The type is inferred automatically when only a `url` field is present (no `command`), or when `"type": "sse"` is set explicitly. + +```json +{ + "mcpServers": { + "my-sse-server": { + "url": "https://example.com/mcp", + "type": "sse", + "headers": { + "Authorization": "Bearer <token>" + } + } + } +} +``` + +--- + +### `streamable` / `http` — Streamable HTTP + +Connects to a remote MCP server using the newer Streamable HTTP transport. + +```json +{ + "mcpServers": { + "my-http-server": { + "url": "https://example.com/mcp", + "type": "streamable", + "headers": { + "Authorization": "Bearer <token>" + } + } + } +} +``` + +Both `"type": "streamable"` and `"type": "http"` select this transport. + +| Field | Required | Description | +|-------|----------|-------------| +| `url` | ✅ | Full URL of the MCP endpoint | +| `type` | ✅ | `"sse"`, `"streamable"`, or `"http"` | +| `headers` | ❌ | HTTP headers sent with every request (useful for auth) | + +--- + +## AnythingLLM-Specific Options + +An optional `anythingllm` block inside any server definition can control AnythingLLM-specific behaviour: + +```json +{ + "mcpServers": { + "my-server": { + "command": "npx", + "args": ["-y", "some-mcp-package"], + "anythingllm": { + "autoStart": false + } + } + } +} +``` + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `autoStart` | boolean | `true` | When `false`, the server is skipped at startup and must be started manually from the Admin UI | + +--- + +## Full Example + +```json +{ + "mcpServers": { + "filesystem": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-filesystem", "/home/user/documents"] + }, + "github": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-github"], + "env": { + "GITHUB_PERSONAL_ACCESS_TOKEN": "ghp_xxxxxxxxxxxx" + } + }, + "remote-tools": { + "url": "https://mcp.example.com/mcp", + 
"type": "streamable", + "headers": { + "Authorization": "Bearer my-secret-token" + } + }, + "optional-server": { + "command": "node", + "args": ["/opt/mcp/server.js"], + "anythingllm": { + "autoStart": false + } + } + } +} +``` + +--- + +## Managing Servers via the Admin UI + +MCP servers can be managed without editing the JSON file directly: + +1. Log in as an Admin. +2. Go to **Admin → Agents → MCP Servers**. +3. From this page you can: + - View all configured servers and the tools each one exposes. + - Start or stop individual servers. + - Delete a server (removes it from the JSON file). + - Force-reload all servers (stops all, re-reads the file, restarts them). + +Any changes made through the UI are persisted back to `anythingllm_mcp_servers.json`. + +--- + +## How Servers Are Started + +- At startup, `MCPHypervisor` reads the config file and starts all servers whose `anythingllm.autoStart` is not `false`. +- Each server has a **30-second connection timeout**. If a server fails to connect within that window it is marked as failed and its process is cleaned up. +- Servers are exposed to agents via the `@agent` directive using the naming convention `@@mcp_<server-name>`. 
+ +--- + +## Troubleshooting + +| Symptom | Likely Cause | Fix | +|---------|-------------|-----| +| `ENOENT` / command not found | The executable is not in PATH | Use the full absolute path for `command`, or ensure the binary is accessible in the shell that starts AnythingLLM | +| Connection timeout after 30 s | Server process started but did not respond | Check the server's own logs; verify arguments are correct | +| Tools not visible to agent | Server failed to start | Check the status badge in **Admin → Agents → MCP Servers** for the error message | +| Auth / 401 errors on remote servers | Missing or incorrect credentials | Verify `headers` or `env` values in the config | + +--- + +## Further Reading + +- [AnythingLLM MCP Compatibility Docs](https://docs.anythingllm.com/mcp-compatibility/overview) +- [Model Context Protocol Specification](https://modelcontextprotocol.io) diff --git a/docs/anythingllm_overview.md b/docs/anythingllm_overview.md new file mode 100644 index 0000000..a562068 --- /dev/null +++ b/docs/anythingllm_overview.md @@ -0,0 +1,726 @@ +# AnythingLLM: Your AI-Powered Knowledge Hub + +## 🎯 What is AnythingLLM? + +AnythingLLM is a **full-stack application** that transforms how you interact with Large Language Models (LLMs). Think of it as your personal AI assistant platform that can: + +- 💬 Chat with multiple LLM providers +- 📚 Query your own documents and data (RAG - Retrieval Augmented Generation) +- 🤖 Run autonomous AI agents with tools +- 🔌 Extend capabilities via Model Context Protocol (MCP) +- 👥 Support multiple users and workspaces +- 🎨 Provide a beautiful, intuitive web interface + +**In simple terms:** It's like ChatGPT, but you control everything - the data, the models, the privacy, and the capabilities. + +--- + +## 🌟 Key Capabilities + +### 1. **Multi-Provider LLM Support** + +AnythingLLM isn't locked to a single AI provider. 
It supports **30+ LLM providers**: + +#### Your Environment: +``` +┌─────────────────────────────────────────┐ +│ Your LLM Infrastructure │ +├─────────────────────────────────────────┤ +│ ✅ Llama CPP Router (pan.helu.ca) │ +│ - Load-balanced inference │ +│ - High availability │ +│ │ +│ ✅ Direct Llama CPP (nyx.helu.ca) │ +│ - Direct connection option │ +│ - Lower latency │ +│ │ +│ ✅ LLM Proxy - Arke (circe.helu.ca) │ +│ - Unified API gateway │ +│ - Request routing │ +│ │ +│ ✅ AWS Bedrock (optional) │ +│ - Claude, Titan models │ +│ - Enterprise-grade │ +└─────────────────────────────────────────┘ +``` + +**What this means:** +- Switch between providers without changing your application +- Use different models for different workspaces +- Fallback to alternative providers if one fails +- Compare model performance side-by-side + +### 2. **Document Intelligence (RAG)** + +AnythingLLM can ingest and understand your documents: + +**Supported Formats:** +- 📄 PDF, DOCX, TXT, MD +- 🌐 Websites (scraping) +- 📊 CSV, JSON +- 🎥 YouTube transcripts +- 🔗 GitHub repositories +- 📝 Confluence, Notion exports + +**How it works:** +``` +Your Document → Text Extraction → Chunking → Embeddings → Vector DB (PostgreSQL) + ↓ +User Question → Embedding → Similarity Search → Relevant Chunks → LLM → Answer +``` + +**Example Use Case:** +``` +You: "What's our refund policy?" +AnythingLLM: [Searches your policy documents] + "According to your Terms of Service (page 12), + refunds are available within 30 days..." +``` + +### 3. **AI Agents with Tools** 🤖 + +This is where AnythingLLM becomes **truly powerful**. 
Agents can: + +#### Built-in Agent Tools: +- 🌐 **Web Browsing** - Navigate websites, fill forms, take screenshots +- 🔍 **Web Scraping** - Extract data from web pages +- 📊 **SQL Agent** - Query databases (PostgreSQL, MySQL, MSSQL) +- 📈 **Chart Generation** - Create visualizations +- 💾 **File Operations** - Save and manage files +- 📝 **Document Summarization** - Condense long documents +- 🧠 **Memory** - Remember context across conversations + +#### Agent Workflow Example: +``` +User: "Check our database for users who signed up last week + and send them a welcome email" + +Agent: + 1. Uses SQL Agent to query PostgreSQL + 2. Retrieves user list + 3. Generates personalized email content + 4. (With email MCP) Sends emails + 5. Reports back with results +``` + +### 4. **Model Context Protocol (MCP)** 🔌 + +MCP is AnythingLLM's **superpower** - it allows you to extend the AI with custom tools and data sources. + +#### What is MCP? + +MCP is a **standardized protocol** for connecting AI systems to external tools and data. Think of it as "plugins for AI." 
+ +#### Your MCP Possibilities: + +**Example 1: Docker Management** +```javascript +// MCP Server: docker-mcp +Tools Available: + - list_containers() + - start_container(name) + - stop_container(name) + - view_logs(container) + - exec_command(container, command) + +User: "Show me all running containers and restart the one using most memory" +Agent: [Uses docker-mcp tools to check, analyze, and restart] +``` + +**Example 2: GitHub Integration** +```javascript +// MCP Server: github-mcp +Tools Available: + - create_issue(repo, title, body) + - search_code(query) + - create_pr(repo, branch, title) + - list_repos() + +User: "Create a GitHub issue for the bug I just described" +Agent: [Uses github-mcp to create issue with details] +``` + +**Example 3: Custom Business Tools** +```javascript +// Your Custom MCP Server +Tools Available: + - query_crm(customer_id) + - check_inventory(product_sku) + - create_order(customer, items) + - send_notification(user, message) + +User: "Check if we have product XYZ in stock and notify me if it's low" +Agent: [Uses your custom MCP tools] +``` + +#### MCP Architecture in AnythingLLM: + +``` +┌─────────────────────────────────────────────────────────┐ +│ AnythingLLM │ +│ ┌─────────────────────────────────────────────────┐ │ +│ │ Agent System │ │ +│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │ +│ │ │ Built-in │ │ MCP │ │ Custom │ │ │ +│ │ │ Tools │ │ Tools │ │ Flows │ │ │ +│ │ └──────────┘ └──────────┘ └──────────┘ │ │ +│ └─────────────────────────────────────────────────┘ │ +│ ↓ │ +│ ┌─────────────────────────────────────────────────┐ │ +│ │ MCP Hypervisor │ │ +│ │ - Manages MCP server lifecycle │ │ +│ │ - Handles stdio/http/sse transports │ │ +│ │ - Auto-discovers tools │ │ +│ └─────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────┐ +│ MCP Servers (Running Locally or Remote) │ +│ ┌──────────┐ ┌──────────┐ 
┌──────────┐ │ +│ │ Docker │ │ GitHub │ │ Custom │ │ +│ │ MCP │ │ MCP │ │ MCP │ │ +│ └──────────┘ └──────────┘ └──────────┘ │ +└─────────────────────────────────────────────────────────┘ +``` + +**Key Features:** +- ✅ **Hot-reload** - Add/remove MCP servers without restarting +- ✅ **Multiple transports** - stdio, HTTP, Server-Sent Events +- ✅ **Auto-discovery** - Tools automatically appear in agent +- ✅ **Process management** - Automatic start/stop/restart +- ✅ **Error handling** - Graceful failures with logging + +### 5. **Agent Flows** 🔄 + +Create **no-code agent workflows** for complex tasks: + +``` +┌─────────────────────────────────────────┐ +│ Example Flow: "Daily Report Generator" │ +├─────────────────────────────────────────┤ +│ 1. Query database for yesterday's data │ +│ 2. Generate summary statistics │ +│ 3. Create visualization charts │ +│ 4. Write report to document │ +│ 5. Send via email (MCP) │ +└─────────────────────────────────────────┘ +``` + +Flows can be: +- Triggered manually +- Scheduled (via external cron) +- Called from other agents +- Shared across workspaces + +--- + +## 🏗️ How AnythingLLM Fits Your Environment + +### Your Complete Stack: + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Internet │ +└────────────────────────────┬────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────────┐ +│ HAProxy (SSL Termination & Load Balancing) │ +│ - HTTPS/WSS support │ +│ - Security headers │ +│ - Health checks │ +└────────────────────────────┬────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────────┐ +│ AnythingLLM Application │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌────────────────┐ │ +│ │ Web UI │ │ API Server │ │ Agent Engine │ │ +│ │ - React │ │ - Express.js │ │ - AIbitat │ │ +│ │ - WebSocket │ │ - REST API │ │ - MCP Support │ │ +│ └─────────────────┘ └─────────────────┘ └────────────────┘ │ 
+└────────────────────────────┬────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────────┐ +│ Data Layer │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ PostgreSQL 17 + pgvector │ │ +│ │ - User data & workspaces │ │ +│ │ - Chat history │ │ +│ │ - Vector embeddings (for RAG) │ │ +│ │ - Agent invocations │ │ +│ └──────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────────┐ +│ External LLM Services │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ Llama Router │ │ Direct Llama │ │ LLM Proxy │ │ +│ │ pan.helu.ca │ │ nyx.helu.ca │ │ circe.helu.ca│ │ +│ └──────────────┘ └──────────────┘ └──────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────────┐ +│ TTS Service │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ FastKokoro (OpenAI-compatible TTS) │ │ +│ │ pan.helu.ca:22070 │ │ +│ │ - Text-to-speech generation │ │ +│ │ - Multiple voices │ │ +│ └──────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Observability Stack: + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Monitoring & Logging │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ Grafana (Unified Dashboard) │ │ +│ │ - Metrics visualization │ │ +│ │ - Log exploration │ │ +│ │ - Alerting │ │ +│ └────────────┬─────────────────────────────┬────────────────┘ │ +│ ↓ ↓ │ +│ ┌────────────────────────┐ ┌────────────────────────┐ │ +│ │ Prometheus │ │ Loki │ │ +│ │ - Metrics storage │ │ - Log aggregation │ │ +│ │ - Alert rules │ │ - 31-day retention │ │ +│ │ - 30-day retention │ │ - Query language │ │ +│ └────────────────────────┘ 
└────────────────────────┘ │ +│ ↑ ↑ │ +│ ┌────────────┴─────────────────────────────┴────────────────┐ │ +│ │ Data Collection │ │ +│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ +│ │ │ cAdvisor │ │ Postgres │ │ Alloy │ │ │ +│ │ │ (Container) │ │ Exporter │ │ (Logs) │ │ │ +│ │ └──────────────┘ └──────────────┘ └──────────────┘ │ │ +│ └────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 🎨 Real-World Use Cases + +### Use Case 1: **Internal Knowledge Base** + +**Scenario:** Your team needs quick access to company documentation + +**Setup:** +1. Upload all company docs to AnythingLLM workspace +2. Documents are embedded and stored in PostgreSQL +3. Team members ask questions naturally + +**Example:** +``` +Employee: "What's the process for requesting time off?" +AnythingLLM: [Searches HR documents] + "According to the Employee Handbook, you need to: + 1. Submit request via HR portal + 2. Get manager approval + 3. Minimum 2 weeks notice for vacations..." +``` + +**Benefits:** +- ✅ No more searching through SharePoint +- ✅ Instant answers with source citations +- ✅ Always up-to-date (re-sync documents) +- ✅ Multi-user access with permissions + +### Use Case 2: **DevOps Assistant** + +**Scenario:** Manage infrastructure with natural language + +**Setup:** +1. Install Docker MCP server +2. Install GitHub MCP server +3. Connect to your monitoring stack + +**Example Conversation:** +``` +You: "Show me all containers and their resource usage" +Agent: [Uses docker-mcp + Prometheus data] + "Here are your containers: + - anythingllm: 2.1GB RAM, 45% CPU + - postgres: 1.8GB RAM, 12% CPU + - prometheus: 1.2GB RAM, 8% CPU + + anythingllm is using high CPU. Would you like me to investigate?" + +You: "Yes, check the logs for errors" +Agent: [Uses docker-mcp to fetch logs] + "Found 15 errors in the last hour related to LLM timeouts. + Should I create a GitHub issue?" 
+ +You: "Yes, and restart the container" +Agent: [Creates GitHub issue, restarts container] + "Done! Issue #123 created and container restarted. + CPU usage now at 15%." +``` + +### Use Case 3: **Customer Support Automation** + +**Scenario:** AI-powered support that can take action + +**Setup:** +1. Upload product documentation +2. Connect CRM via custom MCP +3. Enable SQL agent for database queries + +**Example:** +``` +Support Agent: "Customer John Doe says his order #12345 hasn't arrived" +AnythingLLM: [Queries database via SQL agent] + "Order #12345 shipped on Jan 5th via FedEx. + Tracking shows it's delayed due to weather. + + Would you like me to: + 1. Send customer an update email + 2. Offer expedited shipping on next order + 3. Issue a partial refund" + +Support Agent: "Send update email" +AnythingLLM: [Uses email MCP] + "Email sent to john@example.com with tracking info + and apology for delay." +``` + +### Use Case 4: **Data Analysis Assistant** + +**Scenario:** Query your database with natural language + +**Setup:** +1. Enable SQL Agent +2. Connect to PostgreSQL +3. Grant read-only access + +**Example:** +``` +You: "Show me user signups by month for the last 6 months" +Agent: [Generates and executes SQL] + SELECT + DATE_TRUNC('month', created_at) as month, + COUNT(*) as signups + FROM users + WHERE created_at >= NOW() - INTERVAL '6 months' + GROUP BY month + ORDER BY month; + + Results: + - July 2025: 145 signups + - August 2025: 203 signups + - September 2025: 187 signups + ... 
+ +You: "Create a chart of this" +Agent: [Uses chart generation tool] + [Displays bar chart visualization] +``` + +--- + +## 🔐 Security & Privacy + +### Why Self-Hosted Matters: + +**Your Data Stays Yours:** +- ✅ Documents never leave your infrastructure +- ✅ Chat history stored in your PostgreSQL +- ✅ No data sent to third parties (except chosen LLM provider) +- ✅ Full audit trail in logs (via Loki) + +**Access Control:** +- ✅ Multi-user authentication +- ✅ Role-based permissions (Admin, User) +- ✅ Workspace-level isolation +- ✅ API key management + +**Network Security:** +- ✅ HAProxy SSL termination +- ✅ Security headers (HSTS, CSP, etc.) +- ✅ Internal network isolation +- ✅ Firewall-friendly (only ports 80/443 exposed) + +**Monitoring:** +- ✅ All access logged to Loki +- ✅ Failed login attempts tracked +- ✅ Resource usage monitored +- ✅ Alerts for suspicious activity + +--- + +## 📊 Monitoring Integration + +Your observability stack provides **complete visibility**: + +### What You Can Monitor: + +**Application Health:** +``` +Grafana Dashboard: "AnythingLLM Overview" +├─ Request Rate: 1,234 req/min +├─ Response Time: 245ms avg +├─ Error Rate: 0.3% +├─ Active Users: 23 +└─ Agent Invocations: 45/hour +``` + +**Resource Usage:** +``` +Container Metrics (via cAdvisor): +├─ CPU: 45% (2 cores) +├─ Memory: 2.1GB / 4GB +├─ Network: 15MB/s in, 8MB/s out +└─ Disk I/O: 120 IOPS +``` + +**Database Performance:** +``` +PostgreSQL Metrics (via postgres-exporter): +├─ Connections: 45 / 100 +├─ Query Time: 12ms avg +├─ Cache Hit Ratio: 98.5% +├─ Database Size: 2.3GB +└─ Vector Index Size: 450MB +``` + +**LLM Provider Performance:** +``` +Custom Metrics (via HAProxy): +├─ Llama Router: 234ms avg latency +├─ Direct Llama: 189ms avg latency +├─ Arke Proxy: 267ms avg latency +└─ Success Rate: 99.2% +``` + +**Log Analysis (Loki):** +```logql +# Find slow LLM responses +{service="anythingllm"} + | json + | duration > 5000 + +# Track agent tool usage +{service="anythingllm"} + |= 
"agent" + |= "tool_call" + +# Monitor errors by type +{service="anythingllm"} + |= "ERROR" + | json + | count by error_type +``` + +### Alerting Examples: + +**Critical Alerts:** +- 🚨 AnythingLLM container down +- 🚨 PostgreSQL connection failures +- 🚨 Disk space > 95% +- 🚨 Memory usage > 90% + +**Warning Alerts:** +- ⚠️ High LLM response times (> 5s) +- ⚠️ Database connections > 80% +- ⚠️ Error rate > 1% +- ⚠️ Agent failures + +--- + +## 🚀 Getting Started + +### Quick Start: + +```bash +cd deployment + +# 1. Configure environment +cp .env.example .env +nano .env # Set your LLM endpoints, passwords, etc. + +# 2. Setup SSL certificates +# (See README.md for Let's Encrypt instructions) + +# 3. Deploy +docker-compose up -d + +# 4. Access services +# - AnythingLLM: https://your-domain.com +# - Grafana: http://localhost:3000 +# - Prometheus: http://localhost:9090 +``` + +### First Steps in AnythingLLM: + +1. **Create Account** - First user becomes admin +2. **Create Workspace** - Organize by project/team +3. **Upload Documents** - Add your knowledge base +4. **Configure LLM** - Choose your provider (already set via .env) +5. **Enable Agents** - Turn on agent mode for tools +6. **Add MCP Servers** - Extend with custom tools +7. 
**Start Chatting!** - Ask questions, run agents + +--- + +## 🎯 Why AnythingLLM is Powerful + +### Compared to ChatGPT: + +| Feature | ChatGPT | AnythingLLM | +|---------|---------|-------------| +| **Data Privacy** | ❌ Data sent to OpenAI | ✅ Self-hosted, private | +| **Custom Documents** | ⚠️ Limited (ChatGPT Plus) | ✅ Unlimited RAG | +| **LLM Choice** | ❌ OpenAI only | ✅ 30+ providers | +| **Agents** | ⚠️ Limited tools | ✅ Unlimited via MCP | +| **Multi-User** | ❌ Individual accounts | ✅ Team workspaces | +| **API Access** | ⚠️ Paid tier | ✅ Full REST API | +| **Monitoring** | ❌ No visibility | ✅ Complete observability | +| **Cost** | 💰 $20/user/month | ✅ Self-hosted (compute only) | + +### Compared to LangChain/LlamaIndex: + +| Feature | LangChain | AnythingLLM | +|---------|-----------|-------------| +| **Setup** | 🔧 Code required | ✅ Web UI, no code | +| **User Interface** | ❌ Build your own | ✅ Beautiful UI included | +| **Multi-User** | ❌ Build your own | ✅ Built-in | +| **Agents** | ✅ Powerful | ✅ Equally powerful + UI | +| **MCP Support** | ❌ No | ✅ Native support | +| **Monitoring** | ❌ DIY | ✅ Integrated | +| **Learning Curve** | 📚 Steep | ✅ Gentle | + +--- + +## 🎓 Advanced Capabilities + +### 1. **Workspace Isolation** + +Create separate workspaces for different use cases: + +``` +├─ Engineering Workspace +│ ├─ Documents: Code docs, API specs +│ ├─ LLM: Direct Llama (fast) +│ └─ Agents: GitHub MCP, Docker MCP +│ +├─ Customer Support Workspace +│ ├─ Documents: Product docs, FAQs +│ ├─ LLM: Llama Router (reliable) +│ └─ Agents: CRM MCP, Email MCP +│ +└─ Executive Workspace + ├─ Documents: Reports, analytics + ├─ LLM: AWS Bedrock Claude (best quality) + └─ Agents: SQL Agent, Chart generation +``` + +### 2. 
**Embedding Strategies** + +AnythingLLM supports multiple embedding models: + +- **Native** (Xenova) - Fast, runs locally +- **OpenAI** - High quality, requires API +- **Azure OpenAI** - Enterprise option +- **Local AI** - Self-hosted alternative + +**Your Setup:** Using native embeddings for privacy and speed + +### 3. **Agent Chaining** + +Agents can call other agents: + +``` +Main Agent + ├─> Research Agent (web scraping) + ├─> Analysis Agent (SQL queries) + └─> Report Agent (document generation) +``` + +### 4. **API Integration** + +Full REST API for programmatic access: + +```bash +# Send chat message +curl -X POST https://your-domain.com/api/v1/workspace/chat \ + -H "Authorization: Bearer YOUR_API_KEY" \ + -d '{"message": "What is our refund policy?"}' + +# Upload document +curl -X POST https://your-domain.com/api/v1/document/upload \ + -H "Authorization: Bearer YOUR_API_KEY" \ + -F "file=@policy.pdf" + +# Invoke agent +curl -X POST https://your-domain.com/api/v1/agent/invoke \ + -H "Authorization: Bearer YOUR_API_KEY" \ + -d '{"prompt": "Check server status"}' +``` + +--- + +## 🔮 Future Possibilities + +With your infrastructure, you could: + +### 1. **Voice Interface** +- Use FastKokoro TTS for responses +- Add speech-to-text (Whisper) +- Create voice-controlled assistant + +### 2. **Slack/Discord Bot** +- Create MCP server for messaging +- Deploy bot that uses AnythingLLM +- Team can chat with AI in Slack + +### 3. **Automated Workflows** +- Scheduled agent runs (cron) +- Webhook triggers +- Event-driven automation + +### 4. **Custom Dashboards** +- Embed AnythingLLM in your apps +- White-label the interface +- Custom branding + +### 5. 
**Multi-Modal AI** +- Image analysis (with vision models) +- Document OCR +- Video transcription + +--- + +## 📚 Summary + +**AnythingLLM is your AI platform that:** + +✅ **Respects Privacy** - Self-hosted, your data stays yours +✅ **Flexible** - 30+ LLM providers, switch anytime +✅ **Intelligent** - RAG for document understanding +✅ **Powerful** - AI agents with unlimited tools via MCP +✅ **Observable** - Full monitoring with Prometheus/Loki +✅ **Scalable** - PostgreSQL + HAProxy for production +✅ **Extensible** - MCP protocol for custom integrations +✅ **User-Friendly** - Beautiful web UI, no coding required + +**In your environment, it provides:** + +🎯 **Unified AI Interface** - One place for all AI interactions +🔧 **DevOps Automation** - Manage infrastructure with natural language +📊 **Data Intelligence** - Query databases, analyze trends +🤖 **Autonomous Agents** - Tasks that run themselves +📈 **Complete Visibility** - Every metric, every log, every alert +🔒 **Enterprise Security** - SSL, auth, audit trails, monitoring + +**Think of it as:** Your personal AI assistant platform that can see your data, use your tools, and help your team - all while you maintain complete control. + +--- + +## 🆘 Learn More + +- **Deployment Guide**: [README.md](README.md) +- **Monitoring Explained**: [PROMETHEUS_EXPLAINED.md](PROMETHEUS_EXPLAINED.md) +- **Official Docs**: https://docs.anythingllm.com +- **GitHub**: https://github.com/Mintplex-Labs/anything-llm +- **Discord Community**: https://discord.gg/6UyHPeGZAC diff --git a/docs/arke.md b/docs/arke.md new file mode 100644 index 0000000..3990a51 --- /dev/null +++ b/docs/arke.md @@ -0,0 +1,94 @@ +# Arke Vault Variables Documentation + +This document lists the vault variables that need to be added to `ansible/inventory/group_vars/all/vault.yml` for the Arke deployment. 
+ +## Required Vault Variables + +### Existing Variables +These should already be present in your vault: + +```yaml +vault_arke_db_password: "your_secure_password" +vault_arke_ntth_tokens: '[{"app_id":"your_app_id","app_secret":"your_secret","name":"Production"}]' +``` + +### New Variables to Add + +```yaml +# OpenAI-Compatible Embedding API Key (optional - can be empty string if not using OpenAI provider) +vault_arke_openai_embedding_api_key: "" +``` + +## Usage Notes + +### vault_arke_openai_embedding_api_key +- **Required when**: `arke_embedding_provider` is set to `openai` in the inventory +- **Can be empty**: If using llama-cpp, LocalAI, or other services that don't require authentication +- **Must be set**: If using actual OpenAI API or services requiring authentication +- **Default in inventory**: Empty string (`""`) + +### vault_arke_ntth_tokens +- **Format**: JSON array of objects +- **Required fields per object**: + - `app_id`: The application ID + - `app_secret`: The application secret + - `name`: (optional) A descriptive name for the token + +**Example with multiple tokens**: +```yaml +vault_arke_ntth_tokens: '[{"app_id":"id1","app_secret":"secret1","name":"Production-Primary"},{"app_id":"id2","app_secret":"secret2","name":"Production-Backup"}]' +``` + +## Editing the Vault + +To edit the vault file: + +```bash +ansible-vault edit ansible/inventory/group_vars/all/vault.yml +``` + +Make sure you have the vault password available (stored in `ansible/.vault_pass` by default). + +## Configuration Examples + +### Using Ollama (Current Default) +No additional vault variables needed beyond the existing ones. 
The following inventory settings are used: + +```yaml +arke_embedding_provider: ollama +arke_ollama_host: "pan.helu.ca" +``` + +### Using OpenAI API +Add to vault: +```yaml +vault_arke_openai_embedding_api_key: "sk-your-openai-api-key" +``` + +Update inventory to: +```yaml +arke_embedding_provider: openai +arke_openai_embedding_base_url: "https://api.openai.com" +arke_openai_embedding_model: "text-embedding-3-small" +``` + +### Using llama-cpp or LocalAI (No Auth Required) +Vault variable can remain empty: +```yaml +vault_arke_openai_embedding_api_key: "" +``` + +Update inventory to: +```yaml +arke_embedding_provider: openai +arke_openai_embedding_base_url: "http://your-server:8080" +arke_openai_embedding_model: "text-embedding-ada-002" +``` + +## Security Best Practices + +1. Always use `ansible-vault` to encrypt sensitive data +2. Never commit unencrypted secrets to version control +3. Keep the vault password secure and separate from the repository +4. Rotate API keys and secrets regularly +5. Use unique tokens for different environments (dev/staging/production) diff --git a/docs/auditd.md b/docs/auditd.md new file mode 100644 index 0000000..3ac1974 --- /dev/null +++ b/docs/auditd.md @@ -0,0 +1,204 @@ +## Auditd + Laurel: Host-Based Detection Done Right + +### What They Are + +**Auditd** is the Linux Audit Framework—a kernel-level system that logs security-relevant events: file access, system calls, process execution, user authentication, privilege changes. It's been in the kernel since 2.6 and is rock solid. + +**Laurel** is a plugin that transforms auditd's notoriously awkward multi-line log format into clean, structured JSON—perfect for shipping to Loki. + +### Why This Combination Works + +Auditd alone has two problems: +1. The log format is painful (events split across multiple lines, encoded arguments) +2. High-volume logging can impact performance if not tuned + +Laurel solves the first problem elegantly. Proper rule tuning solves the second. 
+ +### Installation + +```bash +# Auditd (likely already installed) +sudo apt install auditd audispd-plugins + +# Laurel - grab the latest release +wget https://github.com/threathunters-io/laurel/releases/latest/download/laurel-x86_64-musl +sudo mv laurel-x86_64-musl /usr/local/sbin/laurel +sudo chmod 755 /usr/local/sbin/laurel + +# Create laurel user and directories +sudo useradd -r -s /usr/sbin/nologin laurel +sudo mkdir -p /var/log/laurel /etc/laurel +sudo chown laurel:laurel /var/log/laurel +``` + +### Configuration + +**/etc/laurel/config.toml:** +```toml +[auditlog] +# Output JSON logs here - point Promtail/Loki agent at this +file = "/var/log/laurel/audit.json" +size = 100000000 # 100MB rotation +generations = 5 + +[transform] +# Enrich with useful context +execve-argv = "array" +execve-env = "delete" # Don't log environment (secrets risk) + +[filter] +# Drop noisy low-value events +filter-keys = ["exclude-noise"] +``` + +**/etc/audit/plugins.d/laurel.conf:** +```ini +active = yes +direction = out +path = /usr/local/sbin/laurel +type = always +args = --config /etc/laurel/config.toml +format = string +``` + +### High-Value Audit Rules + +Here's a starter set focused on actual intrusion indicators—not compliance checkbox noise: + +**/etc/audit/rules.d/intrusion-detection.rules:** +```bash +# Clear existing rules +-D + +# Buffer size (tune based on your load) +-b 8192 + +# Failed file access (credential hunting) +-a always,exit -F arch=b64 -S open,openat -F exit=-EACCES -F key=access-denied +-a always,exit -F arch=b64 -S open,openat -F exit=-EPERM -F key=access-denied + +# Credential file access +-w /etc/passwd -p wa -k credential-files +-w /etc/shadow -p wa -k credential-files +-w /etc/gshadow -p wa -k credential-files +-w /etc/sudoers -p wa -k credential-files +-w /etc/sudoers.d -p wa -k credential-files + +# SSH key access +-w /root/.ssh -p wa -k ssh-keys +-w /home -p wa -k ssh-keys + +# Privilege escalation +-a always,exit -F arch=b64 -S 
setuid,setgid,setreuid,setregid -F key=priv-escalation +-w /usr/bin/sudo -p x -k priv-escalation +-w /usr/bin/su -p x -k priv-escalation + +# Process injection / debugging +-a always,exit -F arch=b64 -S ptrace -F key=process-injection + +# Suspicious process execution +-a always,exit -F arch=b64 -S execve -F euid=0 -F key=root-exec +-w /tmp -p x -k exec-from-tmp +-w /var/tmp -p x -k exec-from-tmp +-w /dev/shm -p x -k exec-from-shm + +# Network connections from unexpected processes +-a always,exit -F arch=b64 -S connect -F key=network-connect + +# Kernel module loading +-a always,exit -F arch=b64 -S init_module,finit_module -F key=kernel-modules + +# Audit log tampering (high priority) +-w /var/log/audit -p wa -k audit-tampering +-w /etc/audit -p wa -k audit-tampering + +# Cron/scheduled task modification +-w /etc/crontab -p wa -k persistence +-w /etc/cron.d -p wa -k persistence +-w /var/spool/cron -p wa -k persistence + +# Systemd service creation (persistence mechanism) +-w /etc/systemd/system -p wa -k persistence +-w /usr/lib/systemd/system -p wa -k persistence + +# Make config immutable (remove -e 2 while tuning) +# -e 2 +``` + +Load the rules: +```bash +sudo augenrules --load +sudo systemctl restart auditd +``` + +### Shipping to Loki + +**Promtail config snippet:** +```yaml +scrape_configs: + - job_name: laurel + static_configs: + - targets: + - localhost + labels: + job: auditd + host: your-hostname + __path__: /var/log/laurel/audit.json + pipeline_stages: + - json: + expressions: + event_type: SYSCALL.SYSCALL + key: SYSCALL.key + exe: SYSCALL.exe + uid: SYSCALL.UID + success: SYSCALL.success + - labels: + event_type: + key: +``` + +### Grafana Alerting Examples + +Once in Loki, create alerts for the high-value events: + +```logql +# Credential file tampering +{job="auditd"} |= `credential-files` | json | success = "yes" + +# Execution from /tmp (classic attack pattern) +{job="auditd"} |= `exec-from-tmp` | json + +# Root execution by non-root user (priv esc) 
+{job="auditd"} |= `priv-escalation` | json
+
+# Kernel module loading (rootkit indicator)
+{job="auditd"} |= `kernel-modules` | json
+
+# Audit log tampering (covering tracks)
+{job="auditd"} |= `audit-tampering` | json
+```
+
+### Performance Tuning
+
+If you see a performance impact:
+1. **Add exclusions** for known-noisy processes:
+   ```bash
+   -a never,exit -F exe=/usr/bin/prometheus -F key=exclude-noise
+   ```
+2. **Reduce network logging** — the `connect` syscall is high-volume; consider removing or filtering
+3. **Increase buffer** if you see `audit: backlog limit exceeded`
+
+### What You'll Catch
+
+With this setup, you'll detect:
+- Credential harvesting attempts
+- Privilege escalation (successful and attempted)
+- Persistence mechanisms (cron, systemd services)
+- Execution from world-writable directories
+- Process injection/debugging
+- Rootkit installation attempts
+- Evidence tampering
+
+All with structured JSON flowing into your existing Loki/Grafana stack. No Suricata noise, just host-level events that actually matter.
+
+Tune the rules for the specific services you run, and add matching Grafana alert rules for the events that matter most.
\ No newline at end of file
diff --git a/docs/casdoor.md b/docs/casdoor.md
new file mode 100644
index 0000000..911eea0
--- /dev/null
+++ b/docs/casdoor.md
@@ -0,0 +1,542 @@
+# Casdoor SSO Identity Provider
+
+Casdoor provides Single Sign-On (SSO) authentication for Agathos services. This document covers the design decisions, architecture, and deployment procedures.
+
+## Design Philosophy
+
+### Security Isolation
+
+Casdoor handles identity and authentication - the most security-sensitive data in any system. For this reason, Casdoor uses a **dedicated PostgreSQL instance** on Titania rather than sharing the PostgreSQL server on Portia with other applications.
+ +This isolation provides: +- **Data separation**: Authentication data is physically separated from application data +- **Access control**: The `casdoor` database user only has access to the `casdoor` database +- **Blast radius reduction**: A compromise of the shared database on Portia doesn't expose identity data +- **Production alignment**: Dev/UAT/Prod environments use the same architecture + +### Native PostgreSQL with Docker Casdoor + +The architecture splits cleanly: + +``` +┌──────────────────────────────────────────────────────────────┐ +│ titania.incus │ +│ │ +│ ┌────────────────────────────────────────────────────────┐ │ +│ │ Native PostgreSQL 17 (systemd) │ │ +│ │ - SSL enabled for external connections │ │ +│ │ - Local connections without SSL │ │ +│ │ - Managed like any standard PostgreSQL install │ │ +│ │ - Port 5432 │ │ +│ └────────────────────────────────────────────────────────┘ │ +│ ▲ │ +│ │ localhost:5432 │ +│ │ sslmode=disable │ +│ │ │ +│ ┌────────┴───────────────────────────────────────────────┐ │ +│ │ Casdoor Docker Container (network_mode: host) │ │ +│ │ - Runs as casdoor:casdoor user │ │ +│ │ - Only has access to its database │ │ +│ │ - Cannot touch PostgreSQL server config │ │ +│ │ - Port 22081 (via HAProxy) │ │ +│ └────────────────────────────────────────────────────────┘ │ +│ │ +└──────────────────────────────────────────────────────────────┘ + │ + │ External: SSL required + │ sslmode=verify-ca + ▼ + ┌─────────────┐ + │ PGadmin │ + │ on Portia │ + └─────────────┘ +``` + +### Why Not Docker for PostgreSQL? 
+ +Docker makes PostgreSQL permission management unnecessarily complex: +- UID/GID mapping between host and container +- Volume permission issues +- SSL certificate ownership problems +- More difficult backups and maintenance + +Native PostgreSQL is: +- Easier to manage (standard Linux administration) +- Better integrated with systemd +- Simpler backup procedures +- Well-documented and understood + +### SSL Strategy + +PostgreSQL connections follow a **split SSL policy**: + +| Connection Source | SSL Requirement | Rationale | +|-------------------|-----------------|-----------| +| Casdoor (localhost) | `sslmode=disable` | Same host, trusted | +| PGadmin (Portia) | `sslmode=verify-ca` | External network, requires encryption | +| Other external | `hostssl` required | Enforced by pg_hba.conf | + +This is controlled by `pg_hba.conf`: +``` +# Local connections (Unix socket) +local all all peer + +# Localhost connections (no SSL required) +host all all 127.0.0.1/32 md5 + +# External connections (SSL required) +hostssl all all 0.0.0.0/0 md5 +``` + +### System User Pattern + +The Casdoor service user is created without hardcoded UID/GID: + +```yaml +- name: Create casdoor user + ansible.builtin.user: + name: "{{ casdoor_user }}" + system: true # System account, UID assigned by OS +``` + +The playbook queries the assigned UID/GID at runtime for Docker container user mapping. + +## Architecture + +### Components + +| Component | Location | Purpose | +|-----------|----------|---------| +| PostgreSQL 17 | Native on Titania | Dedicated identity database | +| Casdoor | Docker on Titania | SSO identity provider | +| HAProxy | Titania | TLS termination, routing | +| Alloy | Titania | Syslog collection | + +### Deployment Order + +``` +1. postgresql_ssl/deploy.yml → Install PostgreSQL, SSL, create casdoor DB +2. casdoor/deploy.yml → Deploy Casdoor container +3. 
pgadmin/deploy.yml → Distribute SSL cert to PGadmin (optional) +``` + +### Network Ports + +| Port | Service | Access | +|------|---------|--------| +| 22081 | Casdoor HTTP | Via HAProxy (network_mode: host) | +| 5432 | PostgreSQL | SSL for external, plain for localhost | +| 51401 | Syslog | Local only (Alloy) | + +### Data Persistence + +PostgreSQL data (native install): +``` +/var/lib/postgresql/17/main/ # Database files +/etc/postgresql/17/main/ # Configuration +/etc/postgresql/17/main/ssl/ # SSL certificates +``` + +Casdoor configuration: +``` +/srv/casdoor/ +├── conf/ +│ └── app.conf # Casdoor configuration +└── docker-compose.yml # Service definition +``` + +## Prerequisites + +### 1. Terraform (S3 Buckets) + +Casdoor can use S3-compatible storage for avatars and attachments: + +```bash +cd terraform +terraform apply +``` + +### 2. Ansible Vault Secrets + +Add to `ansible/inventory/group_vars/all/vault.yml`: + +```yaml +# PostgreSQL SSL postgres user password (for Titania's dedicated PostgreSQL) +vault_postgresql_ssl_postgres_password: "secure-postgres-password" + +# Casdoor database password +vault_casdoor_db_password: "secure-db-password" + +# Casdoor application secrets +vault_casdoor_auth_state: "random-32-char-string" +vault_casdoor_app_client_secret: "generated-client-secret" + +# Casdoor initial user passwords (changed after first login) +vault_casdoor_admin_password: "initial-admin-password" +vault_casdoor_hostmaster_password: "initial-hostmaster-password" + +# Optional (for RADIUS protocol) +vault_casdoor_radius_secret: "radius-secret" +``` + +Generate secrets: +```bash +# Database password +openssl rand -base64 24 + +# Auth state +openssl rand -hex 16 +``` + +### 3. Alloy Log Collection + +Ensure Alloy is deployed to receive syslog: + +```bash +ansible-playbook alloy/deploy.yml --limit titania.incus +``` + +## Deployment + +### Fresh Installation + +```bash +cd ansible + +# 1. 
Deploy PostgreSQL with SSL +ansible-playbook postgresql_ssl/deploy.yml + +# 2. Deploy Casdoor +ansible-playbook casdoor/deploy.yml + +# 3. Update PGadmin with SSL certificate (optional) +ansible-playbook pgadmin/deploy.yml +``` + +### Verify Deployment + +```bash +# Check PostgreSQL status +ssh titania.incus "sudo systemctl status postgresql" + +# Check Casdoor container +ssh titania.incus "cd /srv/casdoor && docker compose ps" + +# Check logs +ssh titania.incus "cd /srv/casdoor && docker compose logs --tail=50" + +# Test health endpoint +curl -s http://titania.incus:22081/api/health +``` + +### Redeployment + +To redeploy Casdoor only (database preserved): + +```bash +ansible-playbook casdoor/remove.yml +ansible-playbook casdoor/deploy.yml +``` + +To completely reset (including database): +```bash +ansible-playbook casdoor/remove.yml +ssh titania.incus "sudo -u postgres dropdb casdoor" +ssh titania.incus "sudo -u postgres dropuser casdoor" +ansible-playbook postgresql_ssl/deploy.yml +ansible-playbook casdoor/deploy.yml +``` + +## Configuration Reference + +### Host Variables + +Located in `ansible/inventory/host_vars/titania.incus.yml`: + +```yaml +# PostgreSQL SSL (dedicated identity database) +postgresql_ssl_postgres_password: "{{ vault_postgresql_ssl_postgres_password }}" +postgresql_ssl_port: 5432 +postgresql_ssl_cert_path: /etc/postgresql/17/main/ssl/server.crt + +# Casdoor service account (system-assigned UID/GID) +casdoor_user: casdoor +casdoor_group: casdoor +casdoor_directory: /srv/casdoor + +# Web +casdoor_port: 22081 +casdoor_runmode: dev # or 'prod' + +# Database (connects to localhost PostgreSQL) +casdoor_db_port: 5432 +casdoor_db_name: casdoor +casdoor_db_user: casdoor +casdoor_db_password: "{{ vault_casdoor_db_password }}" +casdoor_db_sslmode: disable # Localhost, no SSL needed + +# Logging +casdoor_syslog_port: 51401 +``` + +### SSL Certificate + +The self-signed certificate is generated automatically with: +- **Common Name**: `titania.incus` +- 
**Subject Alt Names**: `titania.incus`, `localhost`, `127.0.0.1` +- **Validity**: 10 years (`+3650d`) +- **Key Size**: 4096 bits +- **Location**: `/etc/postgresql/17/main/ssl/` + +To regenerate certificates: +```bash +ssh titania.incus "sudo rm -rf /etc/postgresql/17/main/ssl/*" +ansible-playbook postgresql_ssl/deploy.yml +ansible-playbook pgadmin/deploy.yml # Update cert on Portia +``` + +## PGadmin Connection + +To connect from PGadmin on Portia: + +1. Navigate to https://pgadmin.ouranos.helu.ca +2. Add Server: + - **General tab** + - Name: `Titania PostgreSQL (Casdoor)` + - **Connection tab** + - Host: `titania.incus` + - Port: `5432` + - Database: `casdoor` + - Username: `casdoor` + - Password: *(from vault)* + - **SSL tab** + - SSL Mode: `Verify-CA` + - Root certificate: `/var/lib/pgadmin/certs/titania-postgres-ca.crt` + +The certificate is automatically distributed by `ansible-playbook pgadmin/deploy.yml`. + +## Application Branding & CSS Customization + +Casdoor allows extensive customization of login/signup pages through CSS and HTML fields in the **Application** settings. + +### Available CSS/HTML Fields + +| Field | Purpose | Where Applied | +|-------|---------|---------------| +| `formCss` | Custom CSS for desktop login forms | Login, signup, consent pages | +| `formCssMobile` | Mobile-specific CSS overrides | Mobile views | +| `headerHtml` | Custom HTML in page header | All auth pages (can inject `", + "footerHtml": "

Powered by Helu.ca
", + "headerHtml": "", + "formBackgroundUrl": "https://example.com/bg.jpg" + } + ] +} +``` + +### Example: Custom Theme CSS + +The `formCss` field contains CSS to customize the Ant Design components: + +```css + +``` + +### Example: Custom Footer + +Replace the default "Powered by Casdoor" footer: + +```html + +``` + +### Organization-Level Theme + +Organization settings also affect theming. Configure in the **Organization** settings: + +| Setting | Purpose | +|---------|---------| +| `themeData.colorPrimary` | Primary color (Ant Design) | +| `themeData.borderRadius` | Border radius for components | +| `themeData.isCompact` | Compact mode toggle | +| `logo` | Organization logo | +| `favicon` | Browser favicon | +| `websiteUrl` | Organization website | + +### Updating Existing Applications + +Changes to `init_data.json` only apply during **initial Casdoor setup**. For existing deployments: + +1. **Via Admin UI**: Applications → Edit → Update CSS/HTML fields +2. **Via API**: Use Casdoor's REST API to update application settings +3. **Database reset**: Redeploy with `initDataNewOnly = false` (overwrites existing data) + +### CSS Class Reference + +Common CSS classes for targeting Casdoor UI elements: + +| Class | Element | +|-------|---------| +| `.login-panel` | Main login form container | +| `.login-logo-box` | Logo container | +| `.login-username` | Username input wrapper | +| `.login-password` | Password input wrapper | +| `.login-button-box` | Submit button container | +| `.login-forget-password` | Forgot password link | +| `.login-signup-link` | Signup link | +| `.login-languages` | Language selector | +| `.back-button` | Back button | +| `.provider-img` | OAuth provider icons | +| `.signin-methods` | Sign-in method tabs | +| `.verification-code` | Verification code input | +| `.login-agreement` | Terms agreement checkbox | + +## Initial Setup + +After deployment, access Casdoor at https://id.ouranos.helu.ca: + +1. 
**Login** with default credentials: `admin` / `123` +2. **Change admin password immediately** +3. **Create organization** for your domain +4. **Create applications** for services that need SSO: + - SearXNG (via OAuth2-Proxy) + - Grafana + - Other internal services + +### OAuth2 Application Setup + +For each service: +1. Applications → Add +2. Configure OAuth2 settings: + - Redirect URI: `https://service.ouranos.helu.ca/oauth2/callback` + - Grant types: Authorization Code +3. Note the Client ID and Client Secret for service configuration + +## Troubleshooting + +### PostgreSQL Issues + +```bash +# Check PostgreSQL status +ssh titania.incus "sudo systemctl status postgresql" + +# View PostgreSQL logs +ssh titania.incus "sudo journalctl -u postgresql -f" + +# Check SSL configuration +ssh titania.incus "sudo -u postgres psql -c 'SHOW ssl;'" +ssh titania.incus "sudo -u postgres psql -c 'SHOW ssl_cert_file;'" + +# Test SSL connection externally +openssl s_client -connect titania.incus:5432 -starttls postgres +``` + +### Casdoor Container Issues + +```bash +# View container status +ssh titania.incus "cd /srv/casdoor && docker compose ps" + +# View logs +ssh titania.incus "cd /srv/casdoor && docker compose logs casdoor" + +# Restart +ssh titania.incus "cd /srv/casdoor && docker compose restart" +``` + +### Database Connection + +```bash +# Connect as postgres admin +ssh titania.incus "sudo -u postgres psql" + +# Connect as casdoor user +ssh titania.incus "sudo -u postgres psql -U casdoor -d casdoor -h localhost" + +# List databases +ssh titania.incus "sudo -u postgres psql -c '\l'" + +# List users +ssh titania.incus "sudo -u postgres psql -c '\du'" +``` + +### Health Check + +```bash +# Casdoor health +curl -s http://titania.incus:22081/api/health | jq + +# PostgreSQL accepting connections +ssh titania.incus "pg_isready -h localhost" +``` + +## Security Considerations + +1. **Change default admin password** immediately after deployment +2. 
**Rotate database passwords** periodically (update vault, redeploy)
+3. **Monitor authentication logs** in Grafana (via Alloy/Loki)
+4. **SSL certificates** have 10-year validity, regenerate if compromised
+5. **Backup PostgreSQL data** regularly - contains all identity data:
+   ```bash
+   ssh titania.incus "sudo -u postgres pg_dump casdoor > casdoor_backup.sql"
+   ```
+
+## Related Documentation
+
+- [Ansible Practices](ansible.md) - Playbook and variable patterns
+- [Terraform Practices](terraform.md) - S3 bucket provisioning
+- [OAuth2-Proxy](services/oauth2_proxy.md) - Protecting services with Casdoor SSO
\ No newline at end of file
diff --git a/docs/cerbot.md b/docs/cerbot.md
new file mode 100644
index 0000000..5d360c0
--- /dev/null
+++ b/docs/cerbot.md
@@ -0,0 +1,191 @@
+# Certbot DNS-01 with Namecheap
+
+This playbook deploys certbot with the Namecheap DNS plugin for DNS-01 validation, enabling wildcard SSL certificates.
+
+## Overview
+
+| Component | Value |
+|-----------|-------|
+| Installation | Python virtualenv in `/srv/certbot/.venv` |
+| DNS Plugin | `certbot-dns-namecheap` |
+| Validation | DNS-01 (supports wildcards) |
+| Renewal | Systemd timer (twice daily) |
+| Certificate Output | `/etc/haproxy/certs/{domain}.pem` |
+| Metrics | Prometheus textfile collector |
+## Deployments
+
+### Titania (ouranos.helu.ca)
+
+Production deployment providing Let's Encrypt certificates for the Agathos sandbox HAProxy reverse proxy.
+
+| Setting | Value |
+|---------|-------|
+| **Host** | titania.incus |
+| **Domain** | ouranos.helu.ca |
+| **Wildcard** | *.ouranos.helu.ca |
+| **Email** | webmaster@helu.ca |
+| **HAProxy** | Port 443 (HTTPS), Port 80 (HTTP redirect) |
+| **Renewal** | Twice daily, automatic HAProxy reload |
+
+### Other Deployments
+
+The playbook can be deployed to any host with HAProxy; reuse the Titania host-variable pattern shown in the Setup section (for example, hippocamp.helu.ca serving the d.helu.ca domain).
+## Prerequisites
+
+1. **Namecheap API Access** enabled on your account
+2. 
**Namecheap API key** generated
+3. **IP whitelisted** in Namecheap API settings
+4. **Ansible Vault** configured with Namecheap credentials
+
+## Setup
+
+### 1. Add Secrets to Ansible Vault
+
+Add Namecheap credentials to `ansible/inventory/group_vars/all/vault.yml`:
+
+```bash
+ansible-vault edit inventory/group_vars/all/vault.yml
+```
+
+Add the following variables:
+```yaml
+vault_namecheap_username: "your_namecheap_username"
+vault_namecheap_api_key: "your_namecheap_api_key"
+```
+
+Map these in `inventory/group_vars/all/vars.yml`:
+```yaml
+namecheap_username: "{{ vault_namecheap_username }}"
+namecheap_api_key: "{{ vault_namecheap_api_key }}"
+```
+
+### 2. Configure Host Variables
+
+For Titania, the configuration is in `inventory/host_vars/titania.incus.yml`:
+```yaml
+services:
+  - certbot
+  - haproxy
+  # ...
+
+certbot_email: webmaster@helu.ca
+certbot_cert_name: ouranos.helu.ca
+certbot_domains:
+  - "*.ouranos.helu.ca"
+  - "ouranos.helu.ca"
+```
+
+### 3. Deploy
+
+```bash
+cd ansible
+ansible-playbook certbot/deploy.yml --limit titania.incus
+```
+
+## Files Created
+
+| Path | Purpose |
+|------|---------|
+| `/srv/certbot/.venv/` | Python virtualenv with certbot |
+| `/srv/certbot/config/` | Certbot configuration and certificates |
+| `/srv/certbot/credentials/namecheap.ini` | Namecheap API credentials (600 perms) |
+| `/srv/certbot/hooks/renewal-hook.sh` | Post-renewal script |
+| `/srv/certbot/hooks/cert-metrics.sh` | Prometheus metrics script |
+| `/etc/haproxy/certs/ouranos.helu.ca.pem` | Combined cert for HAProxy (Titania) |
+| `/etc/systemd/system/certbot-renew.service` | Renewal service unit |
+| `/etc/systemd/system/certbot-renew.timer` | Twice-daily renewal timer |
+| `/var/lib/prometheus/node-exporter/ssl_cert.prom` | Prometheus metrics output |
+
+## Renewal Process
+
+1. Systemd timer triggers at 00:00 and 12:00 (with random delay up to 1 hour)
+2. Certbot checks if certificate needs renewal (within 30 days of expiry)
+3. 
If renewal needed: + - Creates DNS TXT record via Namecheap API + - Waits 120 seconds for propagation + - Validates and downloads new certificate + - Runs `renewal-hook.sh` +4. Renewal hook: + - Combines fullchain + privkey into HAProxy format + - Reloads HAProxy via `docker compose kill -s HUP haproxy` + - Updates Prometheus metrics + +## Prometheus Metrics + +Metrics written to `/var/lib/prometheus/node-exporter/ssl_cert.prom`: + +| Metric | Description | +|--------|-------------| +| `ssl_certificate_expiry_timestamp` | Unix timestamp when cert expires | +| `ssl_certificate_expiry_seconds` | Seconds until cert expires | +| `ssl_certificate_valid` | 1 if valid, 0 if expired/missing | + +Example alert rule: +```yaml +- alert: SSLCertificateExpiringSoon + expr: ssl_certificate_expiry_seconds < 604800 # 7 days + for: 1h + labels: + severity: warning + annotations: + summary: "SSL certificate expiring soon" + description: "Certificate for {{ $labels.domain }} expires in {{ $value | humanizeDuration }}" +``` + +## Troubleshooting + +### View Certificate Status + +```bash +# Check certificate expiry (Titania example) +openssl x509 -enddate -noout -in /etc/haproxy/certs/ouranos.helu.ca.pem + +# Check certbot certificates +sudo -u certbot /srv/certbot/.venv/bin/certbot certificates \ + --config-dir /srv/certbot/config +``` + +### Manual Renewal Test + +```bash +# Dry run renewal +sudo -u certbot /srv/certbot/.venv/bin/certbot renew \ + --config-dir /srv/certbot/config \ + --work-dir /srv/certbot/work \ + --logs-dir /srv/certbot/logs \ + --dry-run + +# Force renewal (if needed) +sudo -u certbot /srv/certbot/.venv/bin/certbot renew \ + --config-dir /srv/certbot/config \ + --work-dir /srv/certbot/work \ + --logs-dir /srv/certbot/logs \ + --force-renewal +``` + +### Check Systemd Timer + +```bash +# Timer status +systemctl status certbot-renew.timer + +# Last run +journalctl -u certbot-renew.service --since "1 day ago" + +# List timers +systemctl list-timers 
certbot-renew.timer +``` + +### DNS Propagation Issues + +If certificate requests fail due to DNS propagation: + +1. Check Namecheap API is accessible +2. Verify IP is whitelisted +3. Increase propagation wait time (default 120s) +4. Check certbot logs: `/srv/certbot/logs/letsencrypt.log` + +## Related Playbooks + +- `haproxy/deploy.yml` - Depends on certificate from certbot +- `prometheus/node_deploy.yml` - Deploys node_exporter for metrics collection \ No newline at end of file diff --git a/docs/django_mcp_standards.html b/docs/django_mcp_standards.html new file mode 100644 index 0000000..3a3ad12 --- /dev/null +++ b/docs/django_mcp_standards.html @@ -0,0 +1,1275 @@ + + + + + + + Django MCP Server Development Standards v1.0 + + + + + + + +
+ + + + +
+
+

Django MCP Server Development Standards v1.0

+

Production-Ready Standards for Building Django-Backed MCP Servers

+
+
+ +
+

📘 Purpose: This document defines standards for building MCP servers that integrate with Django REST APIs, using the Python MCP SDK (FastMCP) for consistent, maintainable, production-ready implementations.

+

Audience: Python developers building MCP servers that connect to Django backends.

+
+ +
+

Overview

+ +
+

Core Technology Stack

+
    +
  • MCP Framework: FastMCP (Python SDK) with Streamable HTTP transport
  • +
  • Backend API: Django REST Framework
  • +
  • Server: Starlette ASGI + Uvicorn
  • +
  • Authentication: JWT with OAuth 2.1 patterns
  • +
  • Container: Docker with non-root user
  • +
+
+ +

Design Principles

+
    +
  1. Security First: JWT authentication, rate limiting, input validation
  2. +
  3. Production Ready: Structured logging, health checks, graceful shutdown
  4. +
  5. Django Integration: Clean API client abstraction with correlation tracking
  6. +
  7. Stateless HTTP: Streamable HTTP transport for broad client compatibility
  8. +
  9. Observable: Prometheus metrics, structured JSON logs, health endpoints
  10. +
+
+ +
+

Architecture Pattern

+ +

Request Flow

+
AI Client (Claude Desktop)
+    ↓ JWT Bearer Token
+CORS Middleware
+    ↓
+Correlation ID Middleware (adds X-Correlation-ID)
+    ↓
+Rate Limit Middleware (per-client IP throttling)
+    ↓
+FastMCP Server (tools/resources routing)
+    ↓ Context object (correlation_id, session, request_id)
+Tool/Resource Handler
+    ↓ API Key + Correlation ID
+Django API Client (async httpx)
+    ↓ Authorization: Api-Key {key}
+Django REST API
+    ↓
+Database
+ +
+

Key Components

+
    +
  • FastMCP: Handles MCP protocol, tool/resource registration
  • +
  • AthenaMCP: Server class managing lifecycle, tools, resources
  • +
  • SecureAPIClient: Async HTTP client for Django API with retry logic
  • +
  • Security Components: RateLimiter, SecureTokenVerifier, Middleware
  • +
  • Configuration: Pydantic models with environment variable loading
  • +
+
+
+ +
+

Python SDK Integration (FastMCP)

+ +

Server Initialization REQUIRED

+ +
With Authentication (Production):
+
from mcp.server.fastmcp import FastMCP
+from mcp.server.auth.settings import AuthSettings
+from pydantic import AnyHttpUrl
+
+from security import SecureTokenVerifier
+
+# Create token verifier
+token_verifier = SecureTokenVerifier(
+    jwt_secret=config.security.jwt_secret,
+    jwt_algorithm=config.security.jwt_algorithm
+)
+
+# Create FastMCP server with authentication
+mcp = FastMCP(
+    name="my-django-mcp-server",
+    token_verifier=token_verifier,
+    auth=AuthSettings(
+        issuer_url=AnyHttpUrl("https://auth.mydomain.com"),
+        resource_server_url=AnyHttpUrl(f"http://{config.server.host}:{config.server.port}"),
+        required_scopes=["read", "write"]
+    ),
+    stateless_http=True,  # Enable Streamable HTTP transport
+    json_response=True    # Return JSON responses (not SSE streams)
+)
+ +
Without Authentication (Development/Testing):
+
# For testing in trusted networks only
+mcp = FastMCP(
+    name="my-django-mcp-server",
+    stateless_http=True,
+    json_response=True
+)
+ +
+ ⚠️ Security: Only disable authentication in development/testing on private networks. Always enable for production deployments. +
+ +

Tool Registration REQUIRED

+ +
Basic Tool Pattern:
+
from mcp.server.fastmcp import Context
+from pydantic import Field
+from typing import Optional, Dict, Any
+
+@mcp.tool()
+async def get_opportunities(
+    ctx: Context,
+    status: Optional[str] = None,
+    client_id: Optional[int] = None,
+    limit: int = Field(default=10, ge=1, le=100)
+) -> Dict[str, Any]:
+    """Retrieve business opportunities with optional filtering.
+    
+    Args:
+        ctx: Context object with correlation_id, session, request_id
+        status: Filter by status (active, won, lost, dropped)
+        client_id: Filter by client ID
+        limit: Maximum number of results (1-100)
+    
+    Returns:
+        Dictionary with count and opportunities list
+    """
+    # Extract correlation ID from context
+    correlation_id = getattr(ctx.request_context, 'correlation_id', None)
+    
+    try:
+        # Build query parameters
+        params = {"limit": limit}
+        if status and status in ["active", "won", "lost", "dropped"]:
+            params["status"] = status
+        if client_id and client_id > 0:
+            params["client"] = client_id
+        
+        # Call Django API with correlation tracking
+        result = await api_client.request(
+            "GET",
+            "/api/v1/orbit/opportunities/",
+            params=params,
+            correlation_id=correlation_id
+        )
+        
+        opportunities = result.get("results", [])
+        return {
+            "count": len(opportunities),
+            "opportunities": [
+                {
+                    "id": opp["id"],
+                    "name": opp["name"],
+                    "status": opp["status"],
+                    "client": opp.get("client", {}).get("name"),
+                    "value": opp.get("value"),
+                    "stage": opp.get("stage")
+                }
+                for opp in opportunities
+            ]
+        }
+    except Exception as e:
+        logger.error(
+            f"Error retrieving opportunities: {str(e)}",
+            extra={"correlation_id": correlation_id}
+        )
+        return {
+            "error": "Failed to retrieve opportunities",
+            "count": 0,
+            "opportunities": []
+        }
+ +
+
Context Object - Critical Pattern
+

ALWAYS include ctx: Context as the first parameter in tool functions to access:

+
    +
  • ctx.request_context.correlation_id - Request tracking ID
  • +
  • ctx.request_id - MCP request ID
  • +
  • ctx.session - Session for sending notifications
  • +
  • ctx.request_context - Full request context
  • +
+
+ +

Resource Registration RECOMMENDED

+ +
@mcp.resource("client://{client_id}")
+async def get_client_details(client_id: int) -> str:
+    """Get detailed information about a specific client.
+    
+    Returns formatted text content for display.
+    """
+    try:
+        if client_id <= 0:
+            return "Error: Invalid client ID"
+        
+        result = await self.api_client.request(
+            "GET",
+            f"/api/v1/orbit/clients/{client_id}/"
+        )
+        
+        client = result
+        output = f"Client Details: {client['name']}\n"
+        output += "=" * (len(client['name']) + 16) + "\n\n"
+        output += f"ID: {client['id']}\n"
+        output += f"Legal Name: {client.get('legal_name', 'N/A')}\n"
+        output += f"Type: {client.get('client_type', 'N/A')}\n"
+        output += f"Vertical: {client.get('vertical', 'N/A')}\n"
+        
+        if client.get('overview'):
+            output += f"\nOverview:\n{client['overview']}\n"
+        
+        return output
+    except Exception as e:
+        logger.error(f"Error retrieving client {client_id}: {str(e)}")
+        return "Error: Unable to retrieve client details"
+
+ +
+

Django API Integration

+ +

API Client Pattern REQUIRED

+ +
"""Django API client with retry logic and correlation tracking."""
+import logging
+import time
+from typing import Any, Dict, Optional
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+class SecureAPIClient:
+    """Async HTTP client for Django REST API communication."""
+    
+    def __init__(
+        self,
+        base_url: str,
+        api_key: str,
+        timeout: int = 30,
+        max_retries: int = 3,
+        connection_pool_size: int = 20
+    ):
+        self.base_url = base_url.rstrip('/')
+        self.api_key = api_key
+        self.timeout = timeout
+        self.max_retries = max_retries
+        self.connection_pool_size = connection_pool_size
+        self.client: Optional[httpx.AsyncClient] = None
+        self._health_status = True
+        self._last_health_check = 0.0
+    
+    async def __aenter__(self):
+        """Async context manager entry."""
+        self.client = httpx.AsyncClient(
+            base_url=self.base_url,
+            timeout=httpx.Timeout(self.timeout),
+            limits=httpx.Limits(max_connections=self.connection_pool_size),
+            headers={
+                "Authorization": f"Api-Key {self.api_key}",
+                "User-Agent": "My-MCP-Server/1.0.0",
+                "Accept": "application/json",
+                "Content-Type": "application/json"
+            }
+        )
+        return self
+    
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        if self.client:
+            await self.client.aclose()
+    
+    async def health_check(self) -> bool:
+        """Check API health using a lightweight endpoint."""
+        try:
+            if not self.client:
+                return False
+            
+            # Use Django's stats or health endpoint
+            response = await self.client.get("/api/v1/core/stats/", timeout=5.0)
+            self._health_status = response.status_code == 200
+            self._last_health_check = time.time()
+            return self._health_status
+        except Exception as e:
+            logger.error(f"Health check failed: {str(e)}")
+            self._health_status = False
+            return False
+    
+    async def request(
+        self,
+        method: str,
+        endpoint: str,
+        params: Optional[Dict] = None,
+        json_data: Optional[Dict] = None,
+        correlation_id: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Make authenticated request to Django API with retry logic."""
+        import asyncio
+        
+        if not self.client:
+            raise RuntimeError("Client not initialized")
+        
+        # Add correlation ID to headers
+        headers = {}
+        if correlation_id:
+            headers["X-Correlation-ID"] = correlation_id
+        
+        # Retry with exponential backoff
+        for attempt in range(self.max_retries + 1):
+            try:
+                if method.upper() == "GET":
+                    response = await self.client.get(
+                        endpoint,
+                        params=params,
+                        headers=headers
+                    )
+                elif method.upper() == "POST":
+                    response = await self.client.post(
+                        endpoint,
+                        json=json_data,
+                        headers=headers
+                    )
+                else:
+                    raise ValueError(f"Unsupported HTTP method: {method}")
+                
+                response.raise_for_status()
+                return response.json()
+                
+            except httpx.HTTPStatusError as e:
+                logger.error(
+                    f"HTTP {e.response.status_code} from {endpoint}",
+                    extra={"correlation_id": correlation_id}
+                )
+                if e.response.status_code < 500:
+                    # Client error - don't retry
+                    raise
+                if attempt == self.max_retries:
+                    raise
+                
+            except Exception as e:
+                logger.error(
+                    f"Request failed (attempt {attempt + 1}/{self.max_retries + 1}): {str(e)}",
+                    extra={"correlation_id": correlation_id}
+                )
+                if attempt == self.max_retries:
+                    raise
+            
+            # Exponential backoff
+            if attempt < self.max_retries:
+                wait_time = 2 ** attempt
+                await asyncio.sleep(wait_time)
+ +
+

Django API Key Authentication

+

Use Django REST Framework's built-in API key authentication:

+
# In Django settings.py — use the djangorestframework-api-key package,
+# which matches the "Authorization: Api-Key <key>" scheme used by the client
+REST_FRAMEWORK = {
+    'DEFAULT_PERMISSION_CLASSES': [
+        'rest_framework_api_key.permissions.HasAPIKey',
+    ],
+}
+
+# Authorization header format
+Authorization: Api-Key {django_api_key}
+
+ +

Correlation ID Propagation RECOMMENDED

+ +

Propagate correlation IDs from MCP context to Django API for end-to-end tracing:

+ +
from starlette.middleware.base import BaseHTTPMiddleware
+import uuid
+
+class CorrelationMiddleware(BaseHTTPMiddleware):
+    """Add correlation IDs to all requests for tracing."""
+    
+    async def dispatch(self, request, call_next):
+        # Generate or extract correlation ID
+        correlation_id = request.headers.get(
+            "x-correlation-id",
+            str(uuid.uuid4())
+        )
+        
+        # Add to request state for tool handlers
+        request.state.correlation_id = correlation_id
+        
+        response = await call_next(request)
+        
+        # Add to response headers
+        response.headers["X-Correlation-ID"] = correlation_id
+        return response
+
+ +
+

Security Implementation

+ +

JWT Token Verification REQUIRED

+ +
"""JWT token verifier for FastMCP authentication."""
+import jwt
+import logging
+from typing import Set
+from datetime import datetime, UTC
+
+from mcp.server.auth.provider import AccessToken, TokenVerifier
+
+logger = logging.getLogger(__name__)
+
+class SecureTokenVerifier(TokenVerifier):
+    """Secure JWT token verifier implementation."""
+    
+    def __init__(self, jwt_secret: str, jwt_algorithm: str = "HS256"):
+        self.jwt_secret = jwt_secret
+        self.jwt_algorithm = jwt_algorithm
+        self.blocked_tokens: Set[str] = set()
+    
+    async def verify_token(self, token: str) -> AccessToken | None:
+        """Verify JWT token and return access token if valid."""
+        try:
+            # Check if token is blocked
+            if token in self.blocked_tokens:
+                return None
+            
+            # Decode and verify JWT
+            payload = jwt.decode(
+                token,
+                self.jwt_secret,
+                algorithms=[self.jwt_algorithm]
+            )
+            
+            # Validate expiration
+            exp = payload.get("exp")
+            if exp and datetime.fromtimestamp(exp, tz=UTC) < datetime.now(UTC):
+                return None
+            
+            # Return access token
+            return AccessToken(
+                access_token=token,
+                scopes=payload.get("scopes", []),
+                user_id=payload.get("sub")
+            )
+        except jwt.InvalidTokenError:
+            return None
+        except Exception as e:
+            logger.error(f"Token verification error: {str(e)}")
+            return None
+ +

Rate Limiting REQUIRED

+ +
"""Rate limiter with middleware integration."""
+import asyncio
+import time
+from dataclasses import dataclass, field
+from typing import Dict
+
+from starlette.middleware.base import BaseHTTPMiddleware
+from starlette.responses import JSONResponse
+
+
+@dataclass
+class RateLimitEntry:
+    """Rate limiting entry for tracking requests."""
+    count: int = 0
+    window_start: float = field(default_factory=time.time)
+    
+    def is_expired(self, window_size: int) -> bool:
+        return time.time() - self.window_start > window_size
+    
+    def increment(self) -> None:
+        self.count += 1
+
+
+class RateLimiter:
+    """Thread-safe rate limiter."""
+    
+    def __init__(self, max_requests: int, window_size: int):
+        self.max_requests = max_requests
+        self.window_size = window_size
+        self.clients: Dict[str, RateLimitEntry] = {}
+        self._lock = asyncio.Lock()
+    
+    async def is_allowed(self, client_id: str) -> bool:
+        """Check if client is allowed to make a request."""
+        async with self._lock:
+            now = time.time()
+            
+            # Clean up expired entries
+            expired = [
+                cid for cid, entry in self.clients.items()
+                if entry.is_expired(self.window_size)
+            ]
+            for cid in expired:
+                del self.clients[cid]
+            
+            # Check current client
+            if client_id not in self.clients:
+                self.clients[client_id] = RateLimitEntry()
+            
+            entry = self.clients[client_id]
+            
+            # Reset window if expired
+            if entry.is_expired(self.window_size):
+                entry.count = 0
+                entry.window_start = now
+            
+            # Check limit
+            if entry.count >= self.max_requests:
+                return False
+            
+            entry.increment()
+            return True
+
+
+class RateLimitMiddleware(BaseHTTPMiddleware):
+    """Rate limiting middleware."""
+    
+    def __init__(self, app, rate_limiter: RateLimiter):
+        super().__init__(app)
+        self.rate_limiter = rate_limiter
+    
+    async def dispatch(self, request, call_next):
+        # Get client ID from IP address
+        client_ip = request.client.host if request.client else "unknown"
+        
+        # Skip health check endpoints
+        if request.url.path in ["/health", "/live/", "/ready/", "/metrics"]:
+            return await call_next(request)
+        
+        # Check rate limit
+        if not await self.rate_limiter.is_allowed(client_ip):
+            return JSONResponse(
+                {
+                    "error": "Rate limit exceeded",
+                    "message": "Too many requests. Please try again later."
+                },
+                status_code=429
+            )
+        
+        return await call_next(request)
+
+ +
+

Configuration Management

+ +

Pydantic Configuration Models REQUIRED

+ +
"""Configuration models with validation."""
+from typing import List
+from pydantic import BaseModel, Field, AnyHttpUrl, field_validator
+
+
+class SecurityConfig(BaseModel):
+    """Security configuration settings."""
+    jwt_secret: str = Field(..., min_length=32, description="JWT signing secret")
+    jwt_algorithm: str = Field(default="HS256")
+    rate_limit_requests: int = Field(default=100, ge=1)
+    rate_limit_window: int = Field(default=3600, ge=60)
+    allowed_origins: List[str] = Field(default=["*"])
+
+
+class DatabaseConfig(BaseModel):
+    """Django API configuration."""
+    base_url: AnyHttpUrl = Field(..., description="Django API base URL")
+    api_key: str = Field(..., description="Django API key")
+    timeout: int = Field(default=30, ge=5, le=300)
+    max_retries: int = Field(default=3, ge=0, le=10)
+    connection_pool_size: int = Field(default=20, ge=1, le=100)
+    
+    @field_validator('base_url')
+    @classmethod
+    def validate_base_url(cls, v):
+        return str(v).rstrip('/')
+
+
+class ServerConfig(BaseModel):
+    """Server configuration."""
+    name: str = Field(default="my-mcp-server")
+    version: str = Field(default="1.0.0")
+    host: str = Field(default="0.0.0.0")
+    port: int = Field(default=8080, ge=1024, le=65535)
+    log_level: str = Field(default="INFO")
+    environment: str = Field(default="production")
+
+
+class Config(BaseModel):
+    """Complete application configuration."""
+    security: SecurityConfig
+    database: DatabaseConfig
+    server: ServerConfig = ServerConfig()
+ +

Environment Variable Loading REQUIRED

+ +
"""Load configuration from environment variables."""
+import json
+import os
+import secrets
+from pathlib import Path
+from typing import Optional
+
+
+def load_config(config_path: Optional[str] = None) -> Config:
+    """Load configuration from file or environment variables."""
+    # Try loading from JSON file first
+    if config_path and Path(config_path).exists():
+        with open(config_path) as f:
+            config_data = json.load(f)
+        return Config(**config_data)
+    
+    # Load from environment variables
+    return Config(
+        security=SecurityConfig(
+            jwt_secret=os.getenv("JWT_SECRET", secrets.token_hex(32)),
+            rate_limit_requests=int(os.getenv("RATE_LIMIT_REQUESTS", "100")),
+            rate_limit_window=int(os.getenv("RATE_LIMIT_WINDOW", "3600")),
+        ),
+        database=DatabaseConfig(
+            base_url=os.getenv("DJANGO_BASE_URL", "http://localhost:8000"),
+            api_key=os.getenv("DJANGO_API_KEY", ""),
+        ),
+        server=ServerConfig(
+            host=os.getenv("MCP_HOST", "0.0.0.0"),
+            port=int(os.getenv("PORT", "8080")),
+            log_level=os.getenv("LOG_LEVEL", "INFO"),
+            environment=os.getenv("ENVIRONMENT", "production"),
+        )
+    )
+ +

Environment Variables Standard REQUIRED

+ +
# Required Variables
+DJANGO_BASE_URL=https://api.mydomain.com
+DJANGO_API_KEY=your-django-api-key
+JWT_SECRET=minimum-32-character-secret-here
+
+# Standard MCP Docker Spec Variables
+PORT=8080
+LOG_LEVEL=INFO
+LOG_FORMAT=json
+ENVIRONMENT=production
+
+# Optional: Security
+RATE_LIMIT_REQUESTS=100
+RATE_LIMIT_WINDOW=3600
+AUTH_ENABLED=true
+
+# Optional: Testing/Development
+DEBUG=false
+
+ +
+

Health Check Endpoints

+ +

Kubernetes-Style Health Checks REQUIRED

+ +
"""Health check endpoints following Kubernetes standards."""
+from datetime import datetime, UTC
+from starlette.responses import JSONResponse
+from starlette.routing import Route
+
+
+async def live_endpoint(request):
+    """Liveness probe - is the process alive?"""
+    return JSONResponse({
+        "status": "alive",
+        "timestamp": datetime.now(UTC).isoformat(),
+        "version": "1.0.0"
+    })
+
+
+async def ready_endpoint(request):
+    """Readiness probe - is the service ready to serve traffic?"""
+    checks = {}
+    
+    # Check Django API health
+    api_client = request.app.state.api_client
+    api_healthy = await api_client.health_check() if api_client else False
+    checks["api_backend"] = "ok" if api_healthy else "error"
+    
+    # Add more checks as needed (database, cache, etc.)
+    
+    all_ok = all(v == "ok" for v in checks.values())
+    status = "ready" if all_ok else "not_ready"
+    code = 200 if all_ok else 503
+    
+    return JSONResponse({
+        "status": status,
+        "checks": checks,
+        "timestamp": datetime.now(UTC).isoformat()
+    }, status_code=code)
+
+
+async def health_endpoint(request):
+    """Combined health check endpoint."""
+    api_client = request.app.state.api_client
+    api_healthy = await api_client.health_check() if api_client else False
+    
+    return JSONResponse({
+        "status": "healthy" if api_healthy else "degraded",
+        "timestamp": datetime.now(UTC).isoformat(),
+        "version": "1.0.0",
+        "api_status": "healthy" if api_healthy else "unhealthy"
+    }, status_code=200 if api_healthy else 503)
+ +

Prometheus Metrics RECOMMENDED

+ +
"""Prometheus metrics endpoint."""
+from datetime import datetime, UTC
+from starlette.responses import JSONResponse, PlainTextResponse
+
+
+async def metrics_endpoint(request):
+    """Prometheus-format metrics endpoint."""
+    # Accept text/plain for Prometheus scraping
+    want_prom = "text/plain" in request.headers.get("accept", "")
+    
+    timestamp = int(datetime.now(UTC).timestamp())
+    
+    # Collect metrics
+    metrics = {
+        "requests_total": getattr(request.app.state, "request_count", 0),
+        "errors_total": getattr(request.app.state, "error_count", 0),
+        "rate_limit_clients": len(request.app.state.rate_limiter.clients),
+    }
+    
+    if want_prom:
+        lines = [
+            '# HELP mcp_requests_total Total HTTP requests processed',
+            '# TYPE mcp_requests_total counter',
+            f'mcp_requests_total {metrics["requests_total"]}',
+            '# HELP mcp_errors_total Total HTTP errors',
+            '# TYPE mcp_errors_total counter',
+            f'mcp_errors_total {metrics["errors_total"]}',
+            '# HELP mcp_rate_limit_clients Number of rate limited clients',
+            '# TYPE mcp_rate_limit_clients gauge',
+            f'mcp_rate_limit_clients {metrics["rate_limit_clients"]}',
+            '# HELP mcp_metrics_timestamp Metrics timestamp',
+            '# TYPE mcp_metrics_timestamp gauge',
+            f'mcp_metrics_timestamp {timestamp}'
+        ]
+        return PlainTextResponse("\n".join(lines) + "\n", media_type="text/plain")
+    else:
+        return JSONResponse(metrics)
+
+ +
+

Structured Logging

+ +

JSON Formatter REQUIRED

+ +
"""Structured JSON logging for production."""
+import json
+import logging
+import os
+from datetime import datetime, UTC
+
+
+class JSONFormatter(logging.Formatter):
+    """JSON formatter for structured logging."""
+    
+    def __init__(self, service_name="mcp-server", version="1.0.0"):
+        super().__init__()
+        self.service_name = service_name
+        self.version = version
+        self.environment = os.getenv("ENVIRONMENT", "unknown")
+    
+    def format(self, record):
+        """Format log record as JSON."""
+        log_data = {
+            "timestamp": datetime.fromtimestamp(record.created, tz=UTC).isoformat(),
+            "level": record.levelname,
+            "logger": record.name,
+            "message": record.getMessage(),
+            "environment": self.environment,
+            "service": self.service_name,
+            "version": self.version,
+        }
+        
+        # Add correlation_id if available
+        if hasattr(record, 'correlation_id'):
+            log_data["correlation_id"] = record.correlation_id
+        
+        # Add exception info if present
+        if record.exc_info:
+            log_data["exception"] = self.formatException(record.exc_info)
+        
+        return json.dumps(log_data)
+
+
+# Configure logging
+log_format = os.getenv("LOG_FORMAT", "json").lower()
+
+if log_format == "json":
+    handler = logging.StreamHandler()
+    handler.setFormatter(JSONFormatter())
+    logging.root.addHandler(handler)
+    logging.root.setLevel(logging.INFO)
+else:
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+ +

Correlation ID Logging RECOMMENDED

+ +
# In tool handlers
+logger.error(
+    f"Error retrieving data: {str(e)}",
+    extra={"correlation_id": correlation_id}
+)
+
+ +
+

Docker Implementation

+ +

Dockerfile REQUIRED

+ +
# Production-ready Dockerfile for Django MCP Server
+FROM python:3.12-slim
+
+WORKDIR /app
+
+# Install runtime dependencies
+RUN apt-get update --yes --quiet && \
+    apt-get install --yes --quiet --no-install-recommends curl && \
+    rm -rf /var/lib/apt/lists/*
+
+# Create non-root user
+RUN useradd --system --create-home --uid 1000 mcpuser && \
+    chown -R mcpuser:mcpuser /app
+
+# Copy application source first — `pip install .` needs the full
+# project tree, not just pyproject.toml, to build the package
+COPY --chown=mcpuser:mcpuser . .
+
+# Install dependencies
+RUN pip install --upgrade pip setuptools wheel && \
+    pip install --no-cache-dir .
+
+# Create /tmp volume for MCP spec compliance
+RUN mkdir -p /tmp/mcp_server && \
+    chown mcpuser:mcpuser /tmp/mcp_server && \
+    chmod 755 /tmp/mcp_server
+
+# Declare volume
+VOLUME ["/tmp/mcp_server"]
+
+USER mcpuser
+
+EXPOSE 8080
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8080/health || exit 1
+
+# Run application
+CMD ["python", "server.py"]
+ +

docker-compose.yml RECOMMENDED

+ +
version: '3.8'
+
+services:
+  mcp-server:
+    build: .
+    image: my-django-mcp-server:latest
+    container_name: mcp-server
+    ports:
+      - "8080:8080"
+    env_file:
+      - .env
+    volumes:
+      - mcp_tmp:/tmp/mcp_server
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 10s
+
+volumes:
+  mcp_tmp:
+ +

/tmp Cleanup Task REQUIRED

+ +
"""Background task for /tmp cleanup."""
+import asyncio
+import logging
+import shutil
+import time
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+async def cleanup_tmp_files(
+    tmp_dir: Path,
+    max_age_hours: int = 24,
+    interval_hours: int = 6
+):
+    """Background task to clean up old temporary files."""
+    while True:
+        try:
+            await asyncio.sleep(interval_hours * 3600)
+            
+            cutoff_time = time.time() - (max_age_hours * 3600)
+            cleaned_count = 0
+            cleaned_size = 0
+            
+            for item in tmp_dir.iterdir():
+                try:
+                    stat = item.stat()
+                    if stat.st_mtime < cutoff_time:
+                        size = stat.st_size
+                        if item.is_file():
+                            item.unlink()
+                        elif item.is_dir():
+                            shutil.rmtree(item)
+                        cleaned_count += 1
+                        cleaned_size += size
+                except (PermissionError, OSError):
+                    continue
+            
+            if cleaned_count > 0:
+                logger.info(
+                    f"Cleaned {cleaned_count} items from {tmp_dir} "
+                    f"({cleaned_size / 1024 / 1024:.2f} MB)"
+                )
+        except Exception as e:
+            logger.error(f"Cleanup error: {str(e)}")
+
+ +
+

Testing Strategy

+ +

Test Categories RECOMMENDED

+ +
+

Unit Tests (>80% coverage target)

+
    +
  • Configuration validation
  • +
  • Rate limiter logic
  • +
  • Token verifier
  • +
  • API client retry logic
  • +
+ +

Integration Tests

+
    +
  • Django API communication
  • +
  • Health check endpoints
  • +
  • Tool/resource execution
  • +
  • Authentication flow
  • +
+ +

E2E Tests

+
    +
  • Full MCP protocol flow
  • +
  • Client integration (Claude Desktop)
  • +
  • Error handling scenarios
  • +
+
+
+ +
+

Complete Implementation Example

+ +

Main Server File (server.py)

+ +
#!/usr/bin/env python3
+"""Django MCP Server - Production Implementation."""
+import asyncio
+import logging
+import os
+from contextlib import asynccontextmanager
+from pathlib import Path
+from typing import Optional
+
+import uvicorn
+from pydantic import AnyHttpUrl
+from starlette.applications import Starlette
+from starlette.middleware.cors import CORSMiddleware
+from starlette.routing import Route
+
+from mcp.server.fastmcp import FastMCP, Context
+from mcp.server.auth.settings import AuthSettings
+
+from config import load_config
+from security import (
+    RateLimiter,
+    SecureTokenVerifier,
+    CorrelationMiddleware,
+    RateLimitMiddleware
+)
+from api_client import SecureAPIClient
+from cleanup import cleanup_tmp_files
+from health import health_endpoint, live_endpoint, metrics_endpoint, ready_endpoint
+from logging_config import setup_logging
+
+
+logger = logging.getLogger(__name__)
+
+
+class DjangoMCPServer:
+    """Django-backed MCP Server implementation."""
+    
+    def __init__(self, config):
+        self.config = config
+        self.api_client: Optional[SecureAPIClient] = None
+        self._cleanup_task: Optional[asyncio.Task] = None
+        
+        # Security components
+        self.rate_limiter = RateLimiter(
+            config.security.rate_limit_requests,
+            config.security.rate_limit_window
+        )
+        self.token_verifier = SecureTokenVerifier(
+            config.security.jwt_secret,
+            config.security.jwt_algorithm
+        )
+        
+        # Create FastMCP server
+        auth_enabled = os.getenv("AUTH_ENABLED", "true").lower() == "true"
+        
+        if auth_enabled:
+            self.mcp = FastMCP(
+                name=config.server.name,
+                token_verifier=self.token_verifier,
+                auth=AuthSettings(
+                    issuer_url=AnyHttpUrl("https://auth.mydomain.com"),
+                    resource_server_url=AnyHttpUrl(
+                        f"http://{config.server.host}:{config.server.port}"
+                    ),
+                    required_scopes=["read", "write"]
+                ),
+                stateless_http=True,
+                json_response=True
+            )
+        else:
+            logger.warning("AUTH DISABLED - Testing mode only!")
+            self.mcp = FastMCP(
+                name=config.server.name,
+                stateless_http=True,
+                json_response=True
+            )
+        
+        self._setup_tools()
+        self._setup_resources()
+    
+    def _setup_tools(self):
+        """Register MCP tools."""
+        # Tools implementation here...
+        pass
+    
+    def _setup_resources(self):
+        """Register MCP resources."""
+        # Resources implementation here...
+        pass
+    
+    @asynccontextmanager
+    async def lifespan(self, app: Starlette):
+        """Manage application lifecycle."""
+        # Startup
+        logger.info(f"Starting {self.config.server.name}...")
+        
+        self.api_client = SecureAPIClient(
+            base_url=self.config.database.base_url,
+            api_key=self.config.database.api_key,
+            timeout=self.config.database.timeout,
+            max_retries=self.config.database.max_retries
+        )
+        await self.api_client.__aenter__()
+        
+        # Health check
+        if not await self.api_client.health_check():
+            logger.warning("Django API health check failed")
+        
+        # Start cleanup task
+        tmp_dir = Path("/tmp/mcp_server")
+        tmp_dir.mkdir(exist_ok=True)
+        self._cleanup_task = asyncio.create_task(
+            cleanup_tmp_files(tmp_dir)
+        )
+        
+        async with self.mcp.session_manager.run():
+            logger.info(f"Server ready on {self.config.server.host}:{self.config.server.port}")
+            try:
+                yield
+            finally:
+                # Shutdown
+                logger.info("Shutting down...")
+                
+                if self._cleanup_task:
+                    self._cleanup_task.cancel()
+                    try:
+                        await self._cleanup_task
+                    except asyncio.CancelledError:
+                        pass
+                
+                if self.api_client:
+                    await self.api_client.__aexit__(None, None, None)
+                
+                logger.info("Shutdown complete")
+    
+    def create_app(self) -> Starlette:
+        """Create Starlette application."""
+        app = self.mcp.streamable_http_app()
+        
+        # Add health endpoints (module-level handlers from health.py —
+        # this class does not define them as methods)
+        app.routes.insert(0, Route("/live/", live_endpoint, methods=["GET"]))
+        app.routes.insert(0, Route("/ready/", ready_endpoint, methods=["GET"]))
+        app.routes.insert(0, Route("/health", health_endpoint, methods=["GET"]))
+        app.routes.insert(0, Route("/metrics", metrics_endpoint, methods=["GET"]))
+        
+        # Replace lifespan
+        app.router.lifespan_context = self.lifespan
+        
+        # Add middleware
+        app.add_middleware(CorrelationMiddleware)
+        app.add_middleware(RateLimitMiddleware, rate_limiter=self.rate_limiter)
+        # NOTE: browsers reject allow_credentials=True combined with a
+        # wildcard ("*") origin — configure explicit origins in production
+        app.add_middleware(
+            CORSMiddleware,
+            allow_origins=self.config.security.allowed_origins,
+            allow_credentials=True,
+            allow_methods=["GET", "POST", "OPTIONS"],
+            allow_headers=["*"]
+        )
+        
+        # Store references
+        # NOTE: api_client is still None at this point — it is created in
+        # lifespan(); set app.state.api_client there too so /ready/ and
+        # /health see the live client
+        app.state.api_client = self.api_client
+        app.state.rate_limiter = self.rate_limiter
+        
+        return app
+
+
+def main():
+    """Main entry point."""
+    from dotenv import load_dotenv
+    load_dotenv()
+    
+    # Load configuration
+    config = load_config()
+    
+    # Setup logging
+    setup_logging(config)
+    
+    # Validate required variables
+    if not config.database.api_key:
+        logger.error("DJANGO_API_KEY is required")
+        return 1
+    
+    # Create server
+    server = DjangoMCPServer(config)
+    app = server.create_app()
+    
+    # Run server
+    uvicorn.run(
+        app,
+        host=config.server.host,
+        port=config.server.port,
+        log_level=config.server.log_level.lower(),
+        access_log=False
+    )
+
+
+if __name__ == "__main__":
+    exit(main())
+
+
+ +
+

Implementation Checklist

+ +
+

Required Components

+
    +
  • ☑ FastMCP server with Streamable HTTP transport
  • +
  • ☑ JWT authentication with SecureTokenVerifier
  • +
  • ☑ Rate limiting middleware
  • +
  • ☑ Django API client with async/retry logic
  • +
  • ☑ Correlation ID propagation
  • +
  • ☑ Three health endpoints (/live/, /ready/, /health)
  • +
  • ☑ Structured JSON logging
  • +
  • ☑ Pydantic configuration models
  • +
  • ☑ Graceful shutdown handling
  • +
  • ☑ Non-root Docker container
  • +
  • ☑ /tmp volume with cleanup task
  • +
  • ☑ Tool registration with Context parameter
  • +
  • ☑ Environment variable configuration
  • +
+ +

Recommended Components

+
    +
  • ☑ Resource registration
  • +
  • ☑ Prometheus metrics endpoint
  • +
  • ☑ Unit test suite (>80% coverage)
  • +
  • ☑ Integration tests
  • +
  • ☑ Docker Compose configuration
  • +
  • ☑ .env.example file
  • +
  • ☑ README with quick start
  • +
+
+
+ +
+

Django MCP Server Development Standards v1.0

+

Last Updated: November 2025

+
+
+ + + + + + + + + + diff --git a/docs/documentation_style_guide.html b/docs/documentation_style_guide.html new file mode 100644 index 0000000..297ff01 --- /dev/null +++ b/docs/documentation_style_guide.html @@ -0,0 +1,505 @@ + + + + + + Documentation Style Guide + + + + + + + +
+ + + + +
+
+

+ Documentation Style Guide + Complete +

+

This guide explains the approach and principles used to create comprehensive HTML documentation for infrastructure and software projects.

+
+
+ + +
+
Icon Legend
+
+ Critical/Danger + Warning/Important + Success/Complete + Information + Active/Key + Integration +
+
+ +
+

Philosophy

+ +
+
+
+
+

+ Documentation as Architecture +

+

Documentation should mirror and reinforce the software architecture. Each component gets its own focused document that clearly explains its purpose, boundaries, and relationships.

+
+
+
+ +
+
+
+

+ User-Centric Design +

+

Documentation serves multiple audiences:

+
    +
  • Developers need technical details and implementation guidance
  • +
  • Stakeholders need high-level overviews and business context
  • +
  • Red Panda needs approval checkpoints and critical decisions highlighted
  • +
+
+
+
+ +
+
+
+

+ Living Documentation +

+

Documentation evolves with the codebase and captures both current state and architectural decisions.

+
+
+
+
+
+ +
+

Structure Principles

+ +
+

1. Hierarchical Information Architecture

+
Main Documentation (project.html)
+├── Component Docs (component1.html, component2.html, etc.)
+├── Standards References (docs/standards/)
+└── Supporting Materials (README.md, style guides)
+
+ +
+

2. Consistent Navigation

+

Every document includes:

+
    +
  • Navigation bar with key sections
  • +
  • Cross-references to related components
  • +
  • Return links to main documentation
  • +
+
+ +
+

3. Progressive Disclosure

+

Information flows from general to specific:

+

Overview → Architecture → Implementation → Details

+
+
+ +
+

Visual Design Principles

+ +
+

1. Clean Typography

+
    +
  • System fonts for readability
  • +
  • Generous line spacing (1.6)
  • +
  • Clear hierarchy with consistent heading sizes
  • +
+
+ +
+

2. Color-Coded Information Types

+

Bootstrap Alert Classes (Preferred):

+
    +
  • alert alert-danger - Critical decisions requiring immediate attention
  • +
  • alert alert-warning - Important context and warnings
  • +
  • alert alert-success - Completed features and positive outcomes
  • +
  • alert alert-info - Technical architecture information
  • +
  • alert alert-primary - Key workflows and processes
  • +
  • alert alert-secondary - Cross-component integration details
  • +
+

Legacy Custom Classes (Backward Compatible):

+
    +
  • .tech-stack - Technical architecture information
  • +
  • .critical - Important decisions requiring attention
  • +
  • .workflow - Process and workflow information
  • +
  • .integration - Cross-component integration details
  • +
+
+ +
+

3. Responsive Layout

+
    +
  • Bootstrap grid system for all screen sizes
  • +
  • Consistent spacing with utility classes
  • +
  • Card-based information grouping
  • +
+
+
+ +
+

Bootstrap Icons Integration

+ +
+

Setup

+

Add Bootstrap Icons CDN to your HTML documents:

+
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.0/font/bootstrap-icons.css"> +
+

Benefits:

+
    +
  • Minimal overhead (~75KB)
  • +
  • 2000+ icons matching Bootstrap design
  • +
  • CDN caching for fast loading
  • +
+
+ +
+

Common Icon Patterns

+
+
+
Status & Progress
+
    +
  • bi-check-square - Completed
  • +
  • bi-square - Pending
  • +
  • bi-hourglass-split - In Progress
  • +
  • bi-x-circle - Failed/Error
  • +
+
+
+
Navigation
+
    +
  • bi-house-door - Home
  • +
  • bi-arrow-left - Back
  • +
  • bi-box-arrow-up-right - External
  • +
  • bi-link-45deg - Link
  • +
+
+
+
Alerts
+
    +
  • bi-exclamation-triangle-fill - Danger
  • +
  • bi-exclamation-circle-fill - Warning
  • +
  • bi-info-circle-fill - Info
  • +
  • bi-check-circle-fill - Success
  • +
+
+
+
Technical
+
    +
  • bi-code-slash - Code
  • +
  • bi-database - Database
  • +
  • bi-cpu - System
  • +
  • bi-plug - API/Integration
  • +
+
+
+
+ +
+

Usage Examples

+ +
Section Headers with Icons
+
+ <h2><i class="bi bi-book section-icon"></i>Section Title</h2> +
+ +
Alert Boxes with Icons
+
+ <div class="alert alert-info border-start border-4 border-info">
+  <h3><i class="bi bi-info-circle-fill alert-icon"></i>Information</h3>
+</div>
+
+ +
Badges with Icons
+
+ <span class="badge bg-success"><i class="bi bi-check-circle-fill"></i> Complete</span> +
+ +
List Items with Icons
+
+ <li><i class="bi bi-check-circle"></i> Completed task</li>
+<li><i class="bi bi-arrow-right-short"></i> Action item</li>
+
+
+ +
+

Best Practices

+
    +
  • Use semantic icons that match content meaning
  • +
  • Maintain consistent icon usage across documents
  • +
  • Don't overuse icons — they should enhance the content, not clutter it
  • +
  • Ensure icons are visible and meaningful at all screen sizes
  • +
  • Icons should supplement text, not replace it (accessibility)
  • +
+
+
+ +
+

Implementation Guidelines

+ +
+

HTML Document Template

+
<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Document Title</title>
+    <!-- Bootstrap CSS -->
+    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
+    <!-- Bootstrap Icons -->
+    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.0/font/bootstrap-icons.css">
+</head>
+<body>
+    <div class="container-fluid">
+        <!-- Navigation -->
+        <nav class="navbar navbar-dark bg-dark rounded mb-4">
+            <a class="navbar-brand" href="main.html">
+                <i class="bi bi-arrow-left"></i> Back
+            </a>
+        </nav>
+        
+        <!-- Breadcrumb -->
+        <nav aria-label="breadcrumb">
+            <ol class="breadcrumb">
+                <li class="breadcrumb-item"><a href="main.html"><i class="bi bi-house-door"></i> Main</a></li>
+                <li class="breadcrumb-item active">Current Page</li>
+            </ol>
+        </nav>
+        
+        <!-- Content -->
+        <h1><i class="bi bi-journal-code"></i> Page Title</h1>
+        
+        <!-- Sections -->
+    </div>
+    
+    <!-- Bootstrap JS -->
+    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
+    
+    <!-- Dark mode support -->
+    <script>
+        if (window.matchMedia('(prefers-color-scheme: dark)').matches) {
+            document.documentElement.setAttribute('data-bs-theme', 'dark');
+        }
+    </script>
+</body>
+</html>
+
+ +
+

Dark Mode Support

+

Bootstrap 5.3+ includes built-in dark mode support. Add this script to automatically detect system preferences:

+
+ <script>
+  if (window.matchMedia('(prefers-color-scheme: dark)').matches) {
+    document.documentElement.setAttribute('data-bs-theme', 'dark');
+  }
+</script>
+
+
+ +
+

Scroll to Top Button

+

Add a floating button for easy navigation in long documents:

+
+ <button id="scrollTopBtn" class="btn btn-primary">
+  <i class="bi bi-arrow-up-circle"></i>
+</button>

+<script>
+  window.onscroll = function() {
+    if (document.documentElement.scrollTop > 300) {
+      document.getElementById('scrollTopBtn').style.display = 'block';
+    } else {
+      document.getElementById('scrollTopBtn').style.display = 'none';
+    }
+  };
+  document.getElementById('scrollTopBtn').onclick = function() {
+    window.scrollTo({top: 0, behavior: 'smooth'});
+  };
+</script>
+
+
+
+ +
+

Quality Standards

+ +
+
+ Style Guide Implementation: 100% Complete +
+
+ +
+
+
+
+

+ Technical Accuracy +

+
    +
  • All code examples must work
  • +
  • All URLs must be valid
  • +
  • All described component relationships must be correct
  • +
+
+
+
+ +
+
+
+

+ Clarity and Completeness +

+
    +
  • Each section serves a specific purpose
  • +
  • Information is neither duplicated nor missing
  • +
  • Cross-references are accurate
  • +
+
+
+
+ +
+
+
+

+ Professional Presentation +

+
    +
  • Consistent formatting throughout
  • +
  • Clean visual hierarchy
  • +
  • Responsive design for all devices
  • +
+
+
+
+
+
+ +
+

+ This style guide ensures consistent, professional, and maintainable documentation that serves both technical and business needs while supporting the long-term success of your projects. +

+
+ + + +
+ + + + + + + + diff --git a/docs/gitea.md b/docs/gitea.md new file mode 100644 index 0000000..55012aa --- /dev/null +++ b/docs/gitea.md @@ -0,0 +1,386 @@ +# Gitea - Git with a Cup of Tea + +## Overview +Gitea is a lightweight, self-hosted Git service providing a GitHub-like web interface with repository management, issue tracking, pull requests, and code review capabilities. Deployed on **Rosalind** with PostgreSQL backend on Portia and Memcached caching. + +**Host:** rosalind.incus +**Role:** Collaboration (PHP, Go, Node.js runtimes) +**Container Port:** 22083 (HTTP), 22022 (SSH), 22093 (Metrics) +**External Access:** https://gitea.ouranos.helu.ca/ (via HAProxy on Titania) +**SSH Access:** `ssh -p 22022 git@gitea.ouranos.helu.ca` (TCP passthrough via HAProxy) + +## Architecture + +``` +┌──────────┐ ┌────────────┐ ┌──────────┐ ┌───────────┐ +│ Client │─────▶│ HAProxy │─────▶│ Gitea │─────▶│PostgreSQL │ +│ │ │ (Titania) │ │(Rosalind)│ │ (Portia) │ +└──────────┘ └────────────┘ └──────────┘ └───────────┘ + │ + ▼ + ┌───────────┐ + │ Memcached │ + │ (Local) │ + └───────────┘ +``` + +## Deployment + +### Playbook + +```bash +cd ansible +ansible-playbook gitea/deploy.yml +``` + +### Files + +| File | Purpose | +|------|---------| +| `gitea/deploy.yml` | Main deployment playbook | +| `gitea/app.ini.j2` | Gitea configuration template | + +### Deployment Steps + +1. **Install Dependencies**: git, git-lfs, curl, memcached +2. **Create System User**: `git:git` with home directory +3. **Create Directories**: Work dir, data, LFS storage, repository root, logs +4. **Download Gitea Binary**: Latest release from GitHub (architecture-specific) +5. **Template Configuration**: Apply `app.ini.j2` with variables +6. **Create Systemd Service**: Custom service unit for Gitea +7. **Start Service**: Enable and start gitea.service +8. 
**Configure OAuth2**: Register Casdoor as OpenID Connect provider + +## Configuration + +### Key Features + +- **Git LFS Support**: Large file storage enabled +- **SSH Server**: Built-in SSH server on port 22022 +- **Prometheus Metrics**: Metrics endpoint on port 22094 +- **Memcached Caching**: Session and cache storage with `gt_` prefix +- **Repository Settings**: Push-to-create, all units enabled +- **Security**: Argon2 password hashing, reverse proxy trusted + +### Storage Locations + +| Path | Purpose | Owner | +|------|---------|-------| +| `/var/lib/gitea` | Working directory | git:git | +| `/var/lib/gitea/data` | Application data | git:git | +| `/var/lib/gitea/data/lfs` | Git LFS objects | git:git | +| `/mnt/dv` | Git repositories | git:git | +| `/var/log/gitea` | Application logs | git:git | +| `/etc/gitea` | Configuration files | root:git | + +### Logging + +- **Console Output**: Info level to systemd journal +- **File Logs**: `/var/log/gitea/gitea.log` +- **Rotation**: Daily rotation, 7-day retention +- **SSH Logs**: Enabled for debugging + +## Access After Deployment + +1. **Web Interface**: https://gitea.ouranos.helu.ca/ +2. **First-Time Setup**: Create admin account on first visit +3. **Git Clone**: + ```bash + git clone https://gitea.ouranos.helu.ca/username/repo.git + ``` +4. 
**SSH Clone**: + ```bash + git clone git@gitea.ouranos.helu.ca:username/repo.git + ``` + Note: SSH requires port 22022 configured in `~/.ssh/config` + +## Monitoring + +### Alloy Configuration +**File:** `ansible/alloy/rosalind/config.alloy.j2` + +- **Log Collection**: `/var/log/gitea/gitea.log` → Loki +- **Metrics**: Port 22094 → Prometheus (token-protected) +- **System Metrics**: Process exporter tracks Gitea process + +### Metrics Endpoint +- **URL**: `http://rosalind.incus:22083/metrics` +- **Authentication**: Bearer token required (`vault_gitea_metrics_token`) +- **Note**: Metrics are exposed on the main web port, not a separate metrics port + +## Required Vault Secrets + +Add to `ansible/inventory/group_vars/all/vault.yml`: + +### 1. Database Password +```yaml +vault_gitea_db_password: "YourSecurePassword123!" +``` +**Requirements:** +- Minimum 12 characters recommended +- Used by PostgreSQL authentication + +### 2. Secret Key (Session Encryption) +```yaml +vault_gitea_secret_key: "RandomString64CharactersLongForSessionCookieEncryptionSecurity123" +``` +**Requirements:** +- **Length**: Recommended 64+ characters +- **Format**: Base64 or hex string +- **Generation**: + ```bash + openssl rand -base64 48 + ``` + +### 3. LFS JWT Secret +```yaml +vault_gitea_lfs_jwt_secret: "AnotherRandomString64CharsForLFSJWTTokenSigning1234567890ABC" +``` +**Requirements:** +- **Length**: Recommended 64+ characters +- **Purpose**: Signs JWT tokens for Git LFS authentication +- **Generation**: + ```bash + openssl rand -base64 48 + ``` + +### 4. Metrics Token +```yaml +vault_gitea_metrics_token: "RandomTokenForPrometheusMetricsAccess123" +``` +**Requirements:** +- **Length**: 32+ characters recommended +- **Purpose**: Bearer token for Prometheus scraping +- **Generation**: + ```bash + openssl rand -hex 32 + ``` + +### 5. 
OAuth Client ID +```yaml +vault_gitea_oauth_client_id: "gitea-oauth-client" +``` +**Requirements:** +- **Purpose**: Client ID for Casdoor OAuth2 application +- **Source**: Must match `clientId` in Casdoor application configuration + +### 6. OAuth Client Secret +```yaml +vault_gitea_oauth_client_secret: "YourRandomOAuthSecret123!" +``` +**Requirements:** +- **Length**: 32+ characters recommended +- **Purpose**: Client secret for Casdoor OAuth2 authentication +- **Generation**: + ```bash + openssl rand -base64 32 + ``` +- **Source**: Must match `clientSecret` in Casdoor application configuration + +## Host Variables + +**File:** `ansible/inventory/host_vars/rosalind.incus.yml` + +```yaml +# Gitea User and Directories +gitea_user: git +gitea_group: git +gitea_work_dir: /var/lib/gitea +gitea_data_dir: /var/lib/gitea/data +gitea_lfs_dir: /var/lib/gitea/data/lfs +gitea_repo_root: /mnt/dv +gitea_config_file: /etc/gitea/app.ini + +# Ports +gitea_web_port: 22083 +gitea_ssh_port: 22022 +gitea_metrics_port: 22094 + +# Network +gitea_domain: ouranos.helu.ca +gitea_root_url: https://gitea.ouranos.helu.ca/ + +# Database Configuration +gitea_db_type: postgres +gitea_db_host: portia.incus +gitea_db_port: 5432 +gitea_db_name: gitea +gitea_db_user: gitea +gitea_db_password: "{{vault_gitea_db_password}}" +gitea_db_ssl_mode: disable + +# Features +gitea_lfs_enabled: true +gitea_metrics_enabled: true + +# Service Settings +gitea_disable_registration: true # Use Casdoor SSO +gitea_require_signin_view: false + +# Security (vault secrets) +gitea_secret_key: "{{vault_gitea_secret_key}}" +gitea_lfs_jwt_secret: "{{vault_gitea_lfs_jwt_secret}}" +gitea_metrics_token: "{{vault_gitea_metrics_token}}" + +# OAuth2 (Casdoor SSO) +gitea_oauth_enabled: true +gitea_oauth_name: "casdoor" +gitea_oauth_display_name: "Sign in with Casdoor" +gitea_oauth_client_id: "{{vault_gitea_oauth_client_id}}" +gitea_oauth_client_secret: "{{vault_gitea_oauth_client_secret}}" +gitea_oauth_auth_url: 
"https://id.ouranos.helu.ca/login/oauth/authorize" +gitea_oauth_token_url: "http://titania.incus:22081/api/login/oauth/access_token" +gitea_oauth_userinfo_url: "http://titania.incus:22081/api/userinfo" +gitea_oauth_scopes: "openid profile email" +``` + +## OAuth2 / Casdoor SSO + +Gitea integrates with Casdoor for Single Sign-On using OpenID Connect. + +### Architecture + +``` +┌──────────┐ ┌────────────┐ ┌──────────┐ ┌──────────┐ +│ Browser │─────▶│ HAProxy │─────▶│ Gitea │─────▶│ Casdoor │ +│ │ │ (Titania) │ │(Rosalind)│ │(Titania) │ +└──────────┘ └────────────┘ └──────────┘ └──────────┘ + │ │ │ + │ 1. Click "Sign in with Casdoor" │ │ + │◀─────────────────────────────────────│ │ + │ 2. Redirect to Casdoor login │ │ + │─────────────────────────────────────────────────────▶│ + │ 3. User authenticates │ │ + │◀─────────────────────────────────────────────────────│ + │ 4. Redirect back with auth code │ │ + │─────────────────────────────────────▶│ │ + │ │ 5. Exchange code for token + │ │────────────────▶│ + │ │◀────────────────│ + │ 6. 
User logged into Gitea │ │ + │◀─────────────────────────────────────│ │ +``` + +### Casdoor Application Configuration + +A Gitea application is defined in `ansible/casdoor/init_data.json.j2`: + +| Setting | Value | +|---------|-------| +| **Name** | `app-gitea` | +| **Client ID** | `vault_gitea_oauth_client_id` | +| **Redirect URI** | `https://gitea.ouranos.helu.ca/user/oauth2/casdoor/callback` | +| **Grant Types** | `authorization_code`, `refresh_token` | + +### URL Strategy + +| URL Type | Address | Used By | +|----------|---------|---------| +| **Auth URL** | `https://id.ouranos.helu.ca/...` | User's browser (external) | +| **Token URL** | `http://titania.incus:22081/...` | Gitea server (internal) | +| **Userinfo URL** | `http://titania.incus:22081/...` | Gitea server (internal) | +| **Discovery URL** | `http://titania.incus:22081/.well-known/openid-configuration` | Gitea server (internal) | + +The auth URL uses the external HAProxy address because it runs in the user's browser. Token/userinfo URLs use internal addresses for server-to-server communication. + +### User Auto-Registration + +With `ENABLE_AUTO_REGISTRATION = true` in `[oauth2_client]`, users who authenticate via Casdoor are automatically created in Gitea. Account linking uses `auto` mode to match by email address. + +### Deployment Order + +1. **Deploy Casdoor first** (if not already running): + ```bash + ansible-playbook casdoor/deploy.yml + ``` + +2. **Deploy Gitea** (registers OAuth provider): + ```bash + ansible-playbook gitea/deploy.yml + ``` + +### Verify OAuth Configuration + +```bash +# List authentication sources +ssh rosalind.incus "sudo -u git /usr/local/bin/gitea admin auth list --config /etc/gitea/app.ini" + +# Should show: casdoor (OpenID Connect) +``` + +## Database Setup + +Gitea requires a PostgreSQL database on Portia. This is automatically created by the `postgresql/deploy.yml` playbook. 
+ +**Database Details:** +- **Name**: gitea +- **User**: gitea +- **Owner**: gitea +- **Extensions**: None required + +## Integration with Other Services + +### HAProxy Routing +**Backend Configuration** (`titania.incus.yml`): +```yaml +- subdomain: "gitea" + backend_host: "rosalind.incus" + backend_port: 22083 + health_path: "/api/healthz" + timeout_server: 120s +``` + +### Memcached Integration +- **Host**: localhost:11211 +- **Session Prefix**: N/A (Memcache adapter doesn't require prefix) +- **Cache Prefix**: N/A + +### Prometheus Monitoring +- **Scrape Target**: `rosalind.incus:22094` +- **Job Name**: gitea +- **Authentication**: Bearer token + +## Troubleshooting + +### Service Status +```bash +ssh rosalind.incus +sudo systemctl status gitea +``` + +### View Logs +```bash +# Application logs +sudo tail -f /var/log/gitea/gitea.log + +# Systemd journal +sudo journalctl -u gitea -f +``` + +### Test Database Connection +```bash +psql -h portia.incus -U gitea -d gitea +``` + +### Check Memcached +```bash +echo "stats" | nc localhost 11211 +``` + +### Verify Metrics Endpoint +```bash +curl -H "Authorization: Bearer YOUR_TOKEN" http://localhost:22094/metrics +``` + +## Version Information + +- **Installation Method**: Binary download from GitHub releases +- **Version Selection**: Latest stable release (dynamic) +- **Update Process**: Re-run deployment playbook to fetch latest binary +- **Architecture**: linux-amd64 + +## References + +- **Official Documentation**: https://docs.gitea.com/ +- **GitHub Repository**: https://github.com/go-gitea/gitea +- **Configuration Reference**: https://docs.gitea.com/administration/config-cheat-sheet diff --git a/docs/gitea_mcp.md b/docs/gitea_mcp.md new file mode 100644 index 0000000..e6df56e --- /dev/null +++ b/docs/gitea_mcp.md @@ -0,0 +1,759 @@ +# Gitea MCP Server - Red Panda Approved™ + +Model Context Protocol (MCP) server providing programmatic access to Gitea repositories, issues, and pull requests. 
Deployed as a Docker container on Miranda (MCP Docker Host) in the Agathos sandbox. + +--- + +## Overview + +The Gitea MCP Server exposes Gitea's functionality through the MCP protocol, enabling AI assistants and automation tools to interact with Git repositories, issues, pull requests, and other Gitea features. + +| Property | Value | +|----------|-------| +| **Host** | Miranda (10.10.0.156) | +| **Service Port** | 25535 | +| **Container Port** | 8000 | +| **Transport** | HTTP | +| **Image** | `docker.gitea.com/gitea-mcp-server:latest` | +| **Gitea Instance** | https://gitea.ouranos.helu.ca | +| **Logging** | Syslog to port 51435 → Alloy → Loki | + +### Purpose + +- **Repository Operations**: Clone, read, and analyze repository contents +- **Issue Management**: Create, read, update, and search issues +- **Pull Request Workflow**: Manage PRs, reviews, and merges +- **Code Search**: Search across repositories and file contents +- **User/Organization Info**: Query user profiles and organization details + +### Integration Points + +``` +AI Assistant (Cline/Claude Desktop) + ↓ (MCP Protocol) +MCP Switchboard (Oberon) + ↓ (HTTP) +Gitea MCP Server (Miranda:25535) + ↓ (Gitea API) +Gitea Instance (Rosalind:22083) +``` + +--- + +## Architecture + +### Deployment Model + +**Container-Based**: Single Docker container managed via Docker Compose + +**Directory Structure**: +``` +/srv/gitea_mcp/ +└── docker-compose.yml # Container orchestration +``` + +**System Integration**: +- **User/Group**: `gitea_mcp:gitea_mcp` (system user) +- **Ansible User Access**: Remote user added to gitea_mcp group +- **Permissions**: Directory mode 750, compose file mode 550 + +### Network Configuration + +| Component | Port | Protocol | Purpose | +|-----------|------|----------|---------| +| External Access | 25535 | HTTP | MCP protocol endpoint | +| Container Internal | 8000 | HTTP | Service listening port | +| Syslog | 51435 | TCP | Log forwarding to Alloy | + +### Logging Pipeline + +``` +Gitea 
MCP Container + ↓ (Docker syslog driver) +Local Syslog (127.0.0.1:51435) + ↓ (Alloy collection) +Loki (Prospero) + ↓ (Grafana queries) +Grafana Dashboards +``` + +**Log Format**: RFC5424 (syslog_format variable) +**Log Tag**: `gitea-mcp` + +--- + +## Prerequisites + +### Infrastructure Requirements + +1. **Miranda Host**: Docker engine installed and running +2. **Gitea Instance**: Accessible Gitea server (gitea.ouranos.helu.ca) +3. **Access Token**: Gitea personal access token with required permissions +4. **Monitoring Stack**: Alloy configured for syslog collection (port 51435) + +### Required Permissions + +**Gitea Access Token Scopes**: +- `repo`: Full repository access (read/write) +- `user`: Read user information +- `org`: Read organization information +- `issue`: Manage issues +- `pull_request`: Manage pull requests + +**Token Creation**: +1. Log into Gitea → User Settings → Applications +2. Generate New Token → Select scopes +3. Copy token (shown only once) +4. Store in Ansible Vault as `vault_gitea_mcp_access_token` + +### Ansible Dependencies + +- `community.docker.docker_compose_v2` collection +- Docker Python SDK on Miranda +- Ansible Vault configured with password file + +--- + +## Configuration + +### Host Variables + +All configuration is defined in `ansible/inventory/host_vars/miranda.incus.yml`: + +```yaml +services: + - gitea_mcp # Enable service on this host + +# Gitea MCP Configuration +gitea_mcp_user: gitea_mcp +gitea_mcp_group: gitea_mcp +gitea_mcp_directory: /srv/gitea_mcp +gitea_mcp_port: 25535 +gitea_mcp_host: https://gitea.ouranos.helu.ca +gitea_mcp_access_token: "{{ vault_gitea_mcp_access_token }}" +gitea_mcp_syslog_port: 51435 +``` + +### Variable Reference + +| Variable | Purpose | Example | +|----------|---------|---------| +| `gitea_mcp_user` | Service system user | `gitea_mcp` | +| `gitea_mcp_group` | Service system group | `gitea_mcp` | +| `gitea_mcp_directory` | Service root directory | `/srv/gitea_mcp` | +| `gitea_mcp_port` | 
External port binding | `25535` | +| `gitea_mcp_host` | Gitea instance URL | `https://gitea.ouranos.helu.ca` | +| `gitea_mcp_access_token` | Gitea API token (vault) | `{{ vault_gitea_mcp_access_token }}` | +| `gitea_mcp_syslog_port` | Local syslog port | `51435` | + +### Vault Configuration + +Store the Gitea access token securely in `ansible/inventory/group_vars/all/vault.yml`: + +```yaml +--- +# Gitea MCP Server Access Token +vault_gitea_mcp_access_token: "your_gitea_access_token_here" +``` + +**Encrypt vault file**: +```bash +ansible-vault encrypt ansible/inventory/group_vars/all/vault.yml +``` + +**Edit vault file**: +```bash +ansible-vault edit ansible/inventory/group_vars/all/vault.yml +``` + +--- + +## Deployment + +### Initial Deployment + +**Prerequisites Check**: +```bash +# Verify Miranda has Docker +ansible miranda.incus -m command -a "docker --version" + +# Verify Miranda is in inventory +ansible miranda.incus -m ping + +# Check Gitea accessibility +curl -I https://gitea.ouranos.helu.ca +``` + +**Deploy Service**: +```bash +cd ansible/ + +# Deploy only Gitea MCP service +ansible-playbook gitea_mcp/deploy.yml + +# Or deploy as part of full stack +ansible-playbook site.yml +``` + +**Deployment Process**: +1. ✓ Check service is enabled in host's `services` list +2. ✓ Create gitea_mcp system user and group +3. ✓ Add Ansible remote user to gitea_mcp group +4. ✓ Create /srv/gitea_mcp directory (mode 750) +5. ✓ Template docker-compose.yml (mode 550) +6. ✓ Reset SSH connection (apply group changes) +7. 
✓ Start Docker container via docker-compose + +### Deployment Output + +**Expected Success**: +``` +PLAY [Deploy Gitea MCP Server with Docker Compose] **************************** + +TASK [Check if host has gitea_mcp service] ************************************ +ok: [miranda.incus] + +TASK [Create gitea_mcp group] ************************************************* +changed: [miranda.incus] + +TASK [Create gitea_mcp user] ************************************************** +changed: [miranda.incus] + +TASK [Add group gitea_mcp to Ansible remote_user] ***************************** +changed: [miranda.incus] + +TASK [Create gitea_mcp directory] ********************************************* +changed: [miranda.incus] + +TASK [Template docker-compose file] ******************************************* +changed: [miranda.incus] + +TASK [Reset SSH connection to apply group changes] **************************** +changed: [miranda.incus] + +TASK [Start Gitea MCP service] ************************************************ +changed: [miranda.incus] + +PLAY RECAP ******************************************************************** +miranda.incus : ok=8 changed=7 unreachable=0 failed=0 +``` + +--- + +## Verification + +### Container Status + +**Check container is running**: +```bash +# Via Ansible +ansible miranda.incus -m command -a "docker ps | grep gitea-mcp" + +# Direct SSH +ssh miranda.incus +docker ps | grep gitea-mcp +``` + +**Expected Output**: +``` +CONTAINER ID IMAGE STATUS PORTS +abc123def456 docker.gitea.com/gitea-mcp-server:latest Up 2 minutes 0.0.0.0:25535->8000/tcp +``` + +### Service Connectivity + +**Test MCP endpoint**: +```bash +# From Miranda +curl -v http://localhost:25535 + +# From other hosts +curl -v http://miranda.incus:25535 +``` + +**Expected Response**: HTTP response indicating MCP server is listening + +### Log Inspection + +**Docker logs**: +```bash +ssh miranda.incus +docker logs gitea-mcp +``` + +**Centralized logs via Loki**: +```bash +# Via logcli (if 
installed) +logcli query '{job="syslog", container_name="gitea-mcp"}' --limit=50 + +# Via Grafana Explore +# Navigate to: https://grafana.ouranos.helu.ca +# Select Loki datasource +# Query: {job="syslog", container_name="gitea-mcp"} +``` + +### Functional Testing + +**Test Gitea API access**: +```bash +# Enter container +ssh miranda.incus +docker exec -it gitea-mcp sh + +# Test Gitea API connectivity (if curl available in container) +# Note: Container may not have shell utilities +``` + +**MCP Protocol Test** (from client): +```bash +# Using MCP inspector or client tool +mcp connect http://miranda.incus:25535 + +# Or test via MCP Switchboard +curl -X POST http://oberon.incus:22781/mcp/invoke \ + -H "Content-Type: application/json" \ + -d '{"server":"gitea","method":"list_repositories"}' +``` + +--- + +## Management + +### Updating the Service + +**Update container image**: +```bash +cd ansible/ + +# Re-run deployment (pulls latest image) +ansible-playbook gitea_mcp/deploy.yml +``` + +**Docker Compose will**: +1. Pull latest `docker.gitea.com/gitea-mcp-server:latest` image +2. Recreate container if image changed +3. Preserve configuration from docker-compose.yml + +### Restarting the Service + +**Via Docker Compose**: +```bash +ssh miranda.incus +cd /srv/gitea_mcp +docker compose restart +``` + +**Via Docker**: +```bash +ssh miranda.incus +docker restart gitea-mcp +``` + +**Via Ansible** (re-run deployment): +```bash +ansible-playbook gitea_mcp/deploy.yml +``` + +### Removing the Service + +**Complete removal**: +```bash +cd ansible/ +ansible-playbook gitea_mcp/remove.yml +``` + +**Remove playbook actions**: +1. Stop and remove Docker containers +2. Remove Docker volumes +3. Remove Docker images +4. Prune unused Docker images +5. 
Remove /srv/gitea_mcp directory + +**Manual cleanup** (if needed): +```bash +ssh miranda.incus + +# Stop and remove container +cd /srv/gitea_mcp +docker compose down -v --rmi all + +# Remove directory +sudo rm -rf /srv/gitea_mcp + +# Remove user/group (optional) +sudo userdel gitea_mcp +sudo groupdel gitea_mcp +``` + +### Configuration Changes + +**Update Gitea host or port**: +1. Edit `ansible/inventory/host_vars/miranda.incus.yml` +2. Modify `gitea_mcp_host` or `gitea_mcp_port` +3. Re-run deployment: `ansible-playbook gitea_mcp/deploy.yml` + +**Rotate access token**: +1. Generate new token in Gitea +2. Update vault: `ansible-vault edit ansible/inventory/group_vars/all/vault.yml` +3. Update `vault_gitea_mcp_access_token` value +4. Re-run deployment to update environment variable + +--- + +## Troubleshooting + +### Container Won't Start + +**Symptom**: Container exits immediately or won't start + +**Diagnosis**: +```bash +ssh miranda.incus + +# Check container logs +docker logs gitea-mcp + +# Check container status +docker ps -a | grep gitea-mcp + +# Inspect container +docker inspect gitea-mcp +``` + +**Common Causes**: +- **Invalid Access Token**: Check `GITEA_ACCESS_TOKEN` in docker-compose.yml +- **Gitea Host Unreachable**: Verify `GITEA_HOST` is accessible from Miranda +- **Port Conflict**: Check if port 25535 is already in use +- **Image Pull Failure**: Check Docker registry connectivity + +**Solutions**: +```bash +# Test Gitea connectivity +curl -I https://gitea.ouranos.helu.ca + +# Check port availability +ss -tlnp | grep 25535 + +# Pull image manually +docker pull docker.gitea.com/gitea-mcp-server:latest + +# Re-run deployment with verbose logging +ansible-playbook gitea_mcp/deploy.yml -vv +``` + +### Authentication Errors + +**Symptom**: "401 Unauthorized" or "403 Forbidden" in logs + +**Diagnosis**: +```bash +# Check token is correctly passed +ssh miranda.incus +docker exec gitea-mcp env | grep GITEA_ACCESS_TOKEN + +# Test token manually 
+TOKEN="your_token_here" +curl -H "Authorization: token $TOKEN" https://gitea.ouranos.helu.ca/api/v1/user +``` + +**Solutions**: +1. Verify token scopes in Gitea (repo, user, org, issue, pull_request) +2. Regenerate token if expired or revoked +3. Update vault with new token +4. Re-run deployment + +### Network Connectivity Issues + +**Symptom**: Cannot connect to Gitea or MCP endpoint unreachable + +**Diagnosis**: +```bash +# Test Gitea from Miranda +ssh miranda.incus +curl -v https://gitea.ouranos.helu.ca + +# Test MCP endpoint from other hosts +curl -v http://miranda.incus:25535 + +# Check Docker network +docker network inspect bridge +``` + +**Solutions**: +- Verify Miranda can resolve and reach `gitea.ouranos.helu.ca` +- Check firewall rules on Miranda +- Verify port 25535 is not blocked +- Check Docker network configuration + +### Logs Not Appearing in Loki + +**Symptom**: No logs in Grafana from gitea-mcp container + +**Diagnosis**: +```bash +# Check Alloy is listening on syslog port +ssh miranda.incus +ss -tlnp | grep 51435 + +# Check Alloy configuration +sudo systemctl status alloy + +# Verify syslog driver is configured +docker inspect gitea-mcp | grep -A 10 LogConfig +``` + +**Solutions**: +1. Verify Alloy is running: `sudo systemctl status alloy` +2. Check Alloy syslog source configuration +3. Verify `gitea_mcp_syslog_port` matches Alloy config +4. Restart Alloy: `sudo systemctl restart alloy` +5. 
Restart container to reconnect syslog + +### Permission Denied Errors + +**Symptom**: Cannot access /srv/gitea_mcp or docker-compose.yml + +**Diagnosis**: +```bash +ssh miranda.incus + +# Check directory permissions +ls -la /srv/gitea_mcp + +# Check user group membership +groups # Should show gitea_mcp group + +# Check file ownership +ls -la /srv/gitea_mcp/docker-compose.yml +``` + +**Solutions**: +```bash +# Re-run deployment to fix permissions +ansible-playbook gitea_mcp/deploy.yml + +# Manually fix if needed +sudo chown -R gitea_mcp:gitea_mcp /srv/gitea_mcp +sudo chmod 750 /srv/gitea_mcp +sudo chmod 550 /srv/gitea_mcp/docker-compose.yml + +# Re-login to apply group changes +exit +ssh miranda.incus +``` + +### MCP Switchboard Integration Issues + +**Symptom**: Switchboard cannot connect to Gitea MCP server + +**Diagnosis**: +```bash +# Check switchboard configuration +ssh oberon.incus +cat /srv/mcp-switchboard/config.json | jq '.servers.gitea' + +# Test connectivity from Oberon +curl -v http://miranda.incus:25535 +``` + +**Solutions**: +1. Verify Gitea MCP server URL in switchboard config +2. Check network connectivity: Oberon → Miranda +3. Verify port 25535 is accessible +4. 
Restart MCP Switchboard after config changes + +--- + +## MCP Protocol Integration + +### Server Capabilities + +The Gitea MCP Server exposes these resources and tools via the MCP protocol: + +**Resources**: +- Repository information +- File contents +- Issue details +- Pull request data +- User profiles +- Organization information + +**Tools**: +- `list_repositories`: List accessible repositories +- `get_repository`: Get repository details +- `list_issues`: Search and list issues +- `create_issue`: Create new issue +- `update_issue`: Modify existing issue +- `list_pull_requests`: List PRs in repository +- `create_pull_request`: Open new PR +- `search_code`: Search code across repositories + +### Switchboard Configuration + +**MCP Switchboard** on Oberon routes MCP requests to Gitea MCP Server. + +**Configuration** (`/srv/mcp-switchboard/config.json`): +```json +{ + "servers": { + "gitea": { + "command": null, + "args": [], + "url": "http://miranda.incus:25535", + "transport": "http" + } + } +} +``` + +### Client Usage + +**From AI Assistant** (Claude Desktop, Cline, etc.): + +The assistant can interact with Gitea repositories through natural language: +- "List all repositories in the organization" +- "Show me open issues in the agathos repository" +- "Create an issue about improving documentation" +- "Search for 'ansible' in repository code" + +**Direct MCP Client**: +```json +POST http://oberon.incus:22781/mcp/invoke +Content-Type: application/json + +{ + "server": "gitea", + "method": "list_repositories", + "params": {} +} +``` + +--- + +## Security Considerations + +### Access Token Management + +**Best Practices**: +- Store token in Ansible Vault (never in plain text) +- Use minimum required scopes for token +- Rotate tokens periodically +- Revoke tokens when no longer needed +- Use separate tokens for different services + +**Token Rotation**: +```bash +# 1. Generate new token in Gitea +# 2. 
Update vault +ansible-vault edit ansible/inventory/group_vars/all/vault.yml + +# 3. Re-deploy to update environment variable +ansible-playbook gitea_mcp/deploy.yml + +# 4. Revoke old token in Gitea +``` + +### Network Security + +**Isolation**: +- Service only accessible within Incus network (10.10.0.0/24) +- No direct external exposure (proxied through Switchboard) +- TLS handled by HAProxy (upstream) for external access + +**Access Control**: +- Gitea enforces user/repository permissions +- MCP protocol authenticated by Switchboard +- Container runs as non-root user + +### Audit and Monitoring + +**Logging**: +- All requests logged to Loki via syslog +- Grafana dashboards for monitoring access patterns +- Alert on authentication failures + +**Monitoring Queries**: +```logql +# All Gitea MCP logs +{job="syslog", container_name="gitea-mcp"} + +# Authentication errors +{job="syslog", container_name="gitea-mcp"} |= "401" or |= "403" + +# Error rate +rate({job="syslog", container_name="gitea-mcp"} |= "error" [5m]) +``` + +--- + +## Performance Considerations + +### Resource Usage + +**Container Resources**: +- **Memory**: ~50-100 MB baseline +- **CPU**: Minimal (< 1% idle, spikes during API calls) +- **Disk**: ~100 MB for image, minimal runtime storage + +**Scaling Considerations**: +- Single container sufficient for development/sandbox +- For production: Consider multiple replicas behind load balancer +- Gitea API rate limits apply to token (typically 5000 requests/hour) + +### Optimization + +**Caching**: +- Gitea MCP Server may cache repository metadata +- Restart container to clear cache if needed + +**Connection Pooling**: +- Server maintains connection pool to Gitea API +- Reuses connections for better performance + +--- + +## Related Documentation + +### Agathos Infrastructure +- [Agathos Overview](agathos.md) - Complete infrastructure documentation +- [Ansible Best Practices](ansible.md) - Deployment patterns and structure +- [Miranda 
Host](agathos.md#miranda---mcp-docker-host) - MCP Docker host details + +### Related Services +- [Gitea Service](gitea.md) - Gitea server deployment and configuration +- [MCP Switchboard](../ansible/mcp_switchboard/README.md) - MCP request routing +- [Grafana MCP](grafana_mcp.md) - Similar MCP server deployment + +### External References +- [Gitea API Documentation](https://docs.gitea.com/api/1.21/) - Gitea REST API reference +- [Model Context Protocol Specification](https://spec.modelcontextprotocol.io/) - MCP protocol details +- [Gitea MCP Server Repository](https://gitea.com/gitea/mcp-server) - Upstream project +- [Docker Compose Documentation](https://docs.docker.com/compose/) - Container orchestration + +--- + +## Maintenance Schedule + +**Regular Tasks**: +- **Weekly**: Review logs for errors or anomalies +- **Monthly**: Update container image to latest version +- **Quarterly**: Rotate Gitea access token +- **As Needed**: Review and adjust token permissions + +**Update Procedure**: +```bash +# Pull latest image and restart +ansible-playbook gitea_mcp/deploy.yml + +# Verify new version +ssh miranda.incus +docker inspect gitea-mcp | jq '.[0].Config.Image' +``` + +--- + +**Last Updated**: February 2026 +**Project**: Agathos Infrastructure +**Host**: Miranda (MCP Docker Host) +**Status**: Red Panda Approved™ ✓ + diff --git a/docs/gitea_runner.md b/docs/gitea_runner.md new file mode 100644 index 0000000..2128a49 --- /dev/null +++ b/docs/gitea_runner.md @@ -0,0 +1,200 @@ +# Gitea Act Runner + +## Overview + +Gitea Actions is Gitea's built-in CI/CD system, compatible with GitHub Actions workflows. The **Act Runner** is the agent that executes these workflows. It picks up jobs from a Gitea instance, spins up Docker containers for each workflow step, runs the commands, and reports results back. + +The name "act" comes from [nektos/act](https://github.com/nektos/act), an open-source tool originally built to run GitHub Actions locally. 
Gitea forked and adapted it into their runner, so `act_runner` is a lineage artifact — the binary keeps the upstream name, but everything else in our infrastructure uses `gitea-runner`. + +### How it works + +1. The runner daemon polls the Gitea instance for queued workflow jobs +2. When a job is picked up, the runner pulls the Docker image specified by the workflow label (e.g., `ubuntu-24.04` maps to `docker.gitea.com/runner-images:ubuntu-24.04`) +3. Each workflow step executes inside an ephemeral container +4. Logs and status are streamed back to Gitea in real time +5. The container is destroyed after the job completes + +### Architecture in Agathos + +``` +Gitea (Rosalind) Act Runner (Puck) +┌──────────────┐ poll/report ┌──────────────────┐ +│ gitea.ouranos │◄──────────────────│ act_runner daemon │ +│ .helu.ca │ │ (gitea-runner) │ +└──────────────┘ └────────┬─────────┘ + │ spawns + ┌────────▼─────────┐ + │ Docker containers │ + │ (workflow steps) │ + └──────────────────┘ +``` + +### Naming conventions + +The **binary** is `act_runner` — that's the upstream package name and renaming it would break updates. Everything else uses `gitea-runner`: + +| Component | Name | +|-----------|------| +| Binary | `/usr/local/bin/act_runner` (upstream, don't rename) | +| Service account | `gitea-runner` | +| Home directory | `/srv/gitea-runner/` | +| Config file | `/srv/gitea-runner/config.yaml` | +| Registration state | `/srv/gitea-runner/.runner` (created by registration) | +| Systemd service | `gitea-runner.service` | +| Runner name | `puck-runner` (shown in Gitea UI) | + +--- + +## Ansible Deployment + +The runner is deployed via the `gitea_runner` Ansible service to **Puck** (application runtime host with Docker already available). 
+ +### Prerequisites + +- Docker must be installed on the target host (`docker` in services list) +- Gitea must be running and accessible at `https://gitea.ouranos.helu.ca` + +### Deploy + +```bash +# Deploy to all hosts with gitea_runner in their services list +ansible-playbook gitea_runner/deploy.yml + +# Dry run (skip registration prompt) +ansible-playbook gitea_runner/deploy.yml --check + +# Limit to a specific host +ansible-playbook gitea_runner/deploy.yml --limit puck.incus + +# Non-interactive mode (for CI/CD) +ansible-playbook gitea_runner/deploy.yml -e registration_token=YOUR_TOKEN +``` + +The playbook is also included in the full-stack deployment via `site.yml`, running after the Gitea playbook. + +**Registration Prompt**: On first deployment, the playbook will pause and prompt for a registration token. Get the token from `https://gitea.ouranos.helu.ca/-/admin/runners` before running the playbook. + +### What the playbook does + +1. Filters hosts — only runs on hosts with `gitea_runner` in their `services` list +2. Creates `gitea-runner` system group and user (added to `docker` group) +3. Downloads `act_runner` binary from Gitea releases (version pinned as `act_runner_version` in `group_vars/all/vars.yml`) +4. Skips download if the installed version already matches (idempotent) +5. Copies the managed `config.yaml` from the Ansible controller (edit `ansible/gitea_runner/config.yaml` to change runner settings) +6. Templates `gitea-runner.service` systemd unit +7. **Registers the runner** — prompts for registration token on first deployment +8. 
Enables and starts the service + +### Systemd unit + +```ini +# /etc/systemd/system/gitea-runner.service +[Unit] +Description=Gitea Runner +After=network.target docker.service +Requires=docker.service + +[Service] +Type=simple +User=gitea-runner +Group=gitea-runner +WorkingDirectory=/srv/gitea-runner +ExecStart=/usr/local/bin/act_runner daemon --config /srv/gitea-runner/config.yaml +Restart=on-failure +RestartSec=10 +Environment=HOME=/srv/gitea-runner + +[Install] +WantedBy=multi-user.target +``` + +### Registration Flow + +On first deployment, the playbook will automatically prompt for a registration token: + +``` +TASK [Prompt for registration token] + +Gitea runner registration required. +Get token from: https://gitea.ouranos.helu.ca/-/admin/runners + +Enter registration token: +[Enter token here] +``` + +**Steps**: +1. Before running the playbook, obtain a registration token: + - Navigate to `https://gitea.ouranos.helu.ca/-/admin/runners` + - Click "Create new Runner" + - Copy the displayed token +2. Run the deployment playbook +3. Paste the token when prompted + +The registration is **idempotent** — if the runner is already registered (`.runner` file exists), the prompt is skipped. + +**Non-interactive mode**: Pass the token as an extra variable: +```bash +ansible-playbook gitea_runner/deploy.yml -e registration_token=YOUR_TOKEN +``` + +**Manual registration** (if needed): The traditional method still works if you prefer manual control. 
Labels are picked up from `config.yaml` at daemon start, so `--labels` is not needed at registration:
+```bash
+ssh puck.incus
+sudo -iu gitea-runner
+act_runner register \
+  --instance https://gitea.ouranos.helu.ca \
+  --token <REGISTRATION_TOKEN> \
+  --name puck-runner \
+  --no-interactive
+```
+
+### Verify
+
+```bash
+# Check service status
+sudo systemctl status gitea-runner
+
+# Check runner version
+act_runner --version
+
+# View runner logs
+sudo journalctl -u gitea-runner -f
+```
+
+`puck-runner` should show as **online** at `https://gitea.ouranos.helu.ca/-/admin/runners`.
+
+### Runner labels
+
+Labels map workflow `runs-on` values to Docker images. They are configured in `ansible/gitea_runner/config.yaml` under `runner.labels`:
+
+| Label | Docker Image | Use case |
+|-------|-------------|----------|
+| `ubuntu-latest` | `docker.gitea.com/runner-images:ubuntu-latest` | General CI (Gitea official image) |
+| `ubuntu-24.04` | `docker.gitea.com/runner-images:ubuntu-24.04` | Ubuntu 24.04 builds |
+| `ubuntu-22.04` | `docker.gitea.com/runner-images:ubuntu-22.04` | Ubuntu 22.04 builds |
+| `ubuntu-20.04` | `docker.gitea.com/runner-images:ubuntu-20.04` | Ubuntu 20.04 builds |
+| `node-24` | `node:24-bookworm` | Node.js CI |
+
+To add or change labels, edit `ansible/gitea_runner/config.yaml` and re-run the playbook. 
+ +### Configuration reference + +| Variable | Location | Value | +|----------|----------|-------| +| `act_runner_version` | `group_vars/all/vars.yml` | `0.2.13` | +| `gitea_runner_instance_url` | `group_vars/all/vars.yml` | `https://gitea.ouranos.helu.ca` | +| `gitea_runner_name` | `host_vars/puck.incus.yml` | `puck-runner` | +| Runner labels | `ansible/gitea_runner/config.yaml` | See `runner.labels` section | + +### Upgrading + +To upgrade the runner binary, update `act_runner_version` in `group_vars/all/vars.yml` and re-run the playbook: + +```bash +# Edit the version +vim inventory/group_vars/all/vars.yml +# act_runner_version: "0.2.14" + +# Re-deploy — only the binary download and service restart will trigger +ansible-playbook gitea_runner/deploy.yml +``` \ No newline at end of file diff --git a/docs/github_mcp.md b/docs/github_mcp.md new file mode 100644 index 0000000..d46f6c7 --- /dev/null +++ b/docs/github_mcp.md @@ -0,0 +1,344 @@ +# GitHub MCP Server + +## Overview + +The GitHub MCP server provides read-only access to GitHub repositories through the Model Context Protocol (MCP). It enables AI assistants and other MCP clients to explore repository contents, search code, read issues, and analyze pull requests without requiring local clones. 
+ +**Deployment Host:** miranda.incus (10.10.0.156) +**Port:** 25533 (HTTP MCP endpoint) +**MCPO Proxy:** http://miranda.incus:25530/github + +--- + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ MCP CLIENTS │ +│ VS Code/Cline │ OpenWebUI │ Custom Applications │ +└─────────────────────────┬───────────────────────────────────┘ + │ + ┌───────────┴──────────────┐ + │ │ + ▼ ▼ + Direct MCP (port 25533) MCPO Proxy (port 25530) + streamable-http OpenAI-compatible API + │ │ + └──────────┬───────────────┘ + ▼ + ┌──────────────────────┐ + │ GitHub MCP Server │ + │ Docker Container │ + │ miranda.incus │ + └──────────┬───────────┘ + │ + ▼ + ┌──────────────────────┐ + │ GitHub API │ + │ (Read-Only PAT) │ + └──────────────────────┘ +``` + +--- + +## GitHub Personal Access Token + +### Required Scopes + +The GitHub MCP server requires a **read-only Personal Access Token (PAT)** with the following scopes: + +| Scope | Purpose | +|-------|---------| +| `public_repo` | Read access to public repositories | +| `repo` | Read access to private repositories (if needed) | +| `read:org` | Read organization membership and teams | +| `read:user` | Read user profile information | + +### Creating a PAT + +1. Navigate to GitHub Settings → Developer settings → Personal access tokens → Tokens (classic) +2. Click "Generate new token (classic)" +3. Set name: `Agathos GitHub MCP - Read Only` +4. Set expiration: Custom or 90 days (recommended) +5. Select scopes: `public_repo`, `read:org`, `read:user` +6. Click "Generate token" +7. Copy the token immediately (it won't be shown again) +8. 
Store in Ansible vault: `ansible-vault edit ansible/inventory/group_vars/all/vault.yml` + - Add: `vault_github_personal_access_token: "ghp_xxxxxxxxxxxxx"` + +--- + +## Available Tools + +The GitHub MCP server provides the following tools: + +### Repository Operations +- `get_file_contents` - Read file contents from repository +- `search_repositories` - Search for repositories on GitHub +- `list_commits` - List commits in a repository +- `create_branch` - Create a new branch (requires write access) +- `push_files` - Push files to repository (requires write access) + +### Issue Management +- `create_issue` - Create a new issue (requires write access) +- `list_issues` - List issues in a repository +- `get_issue` - Get details of a specific issue +- `update_issue` - Update an issue (requires write access) + +### Pull Request Management +- `create_pull_request` - Create a new PR (requires write access) +- `list_pull_requests` - List pull requests in a repository +- `get_pull_request` - Get details of a specific PR + +### Search Operations +- `search_code` - Search code across repositories +- `search_users` - Search for GitHub users + +**Note:** With a read-only PAT, write operations (`create_*`, `update_*`, `push_*`) will fail. The primary use case is repository exploration and code reading. + +--- + +## Client Configuration + +### MCP Native Clients (Cline, Claude Desktop) + +Add the following to your MCP settings (e.g., `~/.config/Code/User/globalStorage/saoudrizwan.claude-dev/settings/cline_mcp_settings.json`): + +```json +{ + "mcpServers": { + "github": { + "type": "streamable-http", + "url": "http://miranda.incus:25533/mcp" + } + } +} +``` + +### OpenWebUI Configuration + +1. Navigate to **Settings → Tools → OpenAPI Servers** +2. Click **Add OpenAPI Server** +3. Configure: + - **Name:** GitHub MCP + - **URL:** `http://miranda.incus:25530/github` + - **Authentication:** None (MCPO handles upstream auth) +4. 
Save and enable desired GitHub tools + +### Custom Applications + +**Direct MCP Connection:** +```python +import mcp + +client = mcp.Client("http://miranda.incus:25533/mcp") +tools = await client.list_tools() +``` + +**Via MCPO (OpenAI-compatible):** +```python +import openai + +client = openai.OpenAI( + base_url="http://miranda.incus:25530/github", + api_key="not-required" # MCPO doesn't require auth for GitHub MCP +) +``` + +--- + +## Deployment + +### Prerequisites + +- Miranda container running with Docker installed +- Ansible vault containing `vault_github_personal_access_token` +- Network connectivity from clients to miranda.incus + +### Deploy GitHub MCP Server + +```bash +cd /home/robert/dv/agathos/ansible +ansible-playbook github_mcp/deploy.yml +``` + +This playbook: +1. Creates `github_mcp` user and group +2. Creates `/srv/github_mcp` directory +3. Templates docker-compose.yml with PAT from vault +4. Starts github-mcp-server container on port 25533 + +### Update MCPO Configuration + +```bash +ansible-playbook mcpo/deploy.yml +``` + +This restarts MCPO with the updated config including GitHub MCP server. + +### Update Alloy Logging + +```bash +ansible-playbook alloy/deploy.yml --limit miranda.incus +``` + +This reconfigures Alloy to collect GitHub MCP server logs. 
+ +--- + +## Verification + +### Test Direct MCP Endpoint + +```bash +# Check container is running +ssh miranda.incus docker ps | grep github-mcp-server + +# Test MCP endpoint responds +curl http://miranda.incus:25533/mcp + +# List available tools (expect JSON response) +curl -X POST http://miranda.incus:25533/mcp \ + -H "Content-Type: application/json" \ + -d '{"jsonrpc": "2.0", "id": 1, "method": "tools/list"}' +``` + +### Test MCPO Proxy + +```bash +# List GitHub tools via MCPO +curl http://miranda.incus:25530/github/tools + +# Test repository file reading +curl -X POST http://miranda.incus:25530/github/tools/get_file_contents \ + -H "Content-Type: application/json" \ + -d '{ + "owner": "github", + "repo": "docs", + "path": "README.md" + }' +``` + +### View Logs + +```bash +# Container logs +ssh miranda.incus docker logs github-mcp-server + +# Loki logs (via Grafana on prospero.incus) +# Navigate to Explore → Loki +# Query: {job="github-mcp-server"} +``` + +--- + +## Troubleshooting + +### Container Won't Start + +**Check Docker Compose:** +```bash +ssh miranda.incus +sudo -u github_mcp docker compose -f /srv/github_mcp/docker-compose.yml logs +``` + +**Common Issues:** +- Missing or invalid GitHub PAT in vault +- Port 25533 already in use +- Docker image pull failure + +### MCP Endpoint Returns Errors + +**Check GitHub PAT validity:** +```bash +curl -H "Authorization: token YOUR_PAT" https://api.github.com/user +``` + +**Verify PAT scopes:** +```bash +curl -i -H "Authorization: token YOUR_PAT" https://api.github.com/user \ + | grep X-OAuth-Scopes +``` + +### MCPO Not Exposing GitHub Tools + +**Verify MCPO config:** +```bash +ssh miranda.incus cat /srv/mcpo/config.json | jq '.mcpServers.github' +``` + +**Restart MCPO:** +```bash +ssh miranda.incus sudo systemctl restart mcpo +ssh miranda.incus sudo systemctl status mcpo +``` + +--- + +## Monitoring + +### Prometheus Metrics + +GitHub MCP server exposes Prometheus metrics (if supported by the container). 
Add to Prometheus scrape config:
+
+```yaml
+scrape_configs:
+  - job_name: 'github-mcp'
+    static_configs:
+      - targets: ['miranda.incus:25533']
+```
+
+### Grafana Dashboard
+
+Import or create a dashboard on prospero.incus to visualize:
+- Request rate and latency
+- GitHub API rate limits
+- Tool invocation counts
+- Error rates
+
+### Log Queries
+
+Useful Loki queries in Grafana:
+
+```logql
+# All GitHub MCP logs
+{job="github-mcp-server"}
+
+# Errors only (case variants matched via regex line filter)
+{job="github-mcp-server"} |~ "error|ERROR"
+
+# GitHub API rate limit warnings
+{job="github-mcp-server"} |= "rate limit"
+
+# Tool invocations
+{job="github-mcp-server"} |= "tool"
+```
+
+---
+
+## Security Considerations
+
+✔ **Read-Only PAT** - Server uses minimal scopes, cannot modify repositories
+✔ **Network Isolation** - Only accessible within Agathos network (miranda.incus)
+✔ **Vault Storage** - PAT stored encrypted in Ansible Vault
+✔ **No Public Exposure** - MCP endpoint not exposed to internet
+⚠️ **PAT Rotation** - Consider rotating PAT every 90 days
+⚠️ **Access Control** - MCPO currently doesn't require authentication
+
+### Recommended Enhancements
+
+1. Add authentication to MCPO endpoints
+2. Implement request rate limiting
+3. Monitor GitHub API quota usage
+4. Set up PAT expiration alerts
+5. 
Restrict network access to miranda via firewall rules + +--- + +## References + +- [GitHub MCP Server Repository](https://github.com/github/github-mcp-server) +- [Model Context Protocol Specification](https://modelcontextprotocol.io/) +- [MCPO Documentation](https://github.com/open-webui/mcpo) +- [Agathos README](../../README.md) +- [Agathos Sandbox Documentation](../sandbox.html) diff --git a/docs/grafana_mcp.md b/docs/grafana_mcp.md new file mode 100644 index 0000000..080ad71 --- /dev/null +++ b/docs/grafana_mcp.md @@ -0,0 +1,422 @@ +# Grafana MCP Server + +## Overview + +The Grafana MCP server provides AI/LLM access to Grafana dashboards, datasources, and APIs through the Model Context Protocol (MCP). It runs as a Docker container on **Miranda** and connects to the Grafana instance inside the [PPLG stack](pplg.md) on **Prospero** via the internal Incus network. + +**Deployment Host:** miranda.incus +**Port:** 25533 (HTTP MCP endpoint) +**MCPO Proxy:** http://miranda.incus:25530/grafana +**Grafana Backend:** http://prospero.incus:3000 (PPLG stack) + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ MCP CLIENTS │ +│ VS Code/Cline │ OpenWebUI │ LobeChat │ Custom Applications │ +└───────────────────────────┬─────────────────────────────────────────┘ + │ + ┌───────────┴──────────────┐ + │ │ + ▼ ▼ + Direct MCP (port 25533) MCPO Proxy (port 25530) + streamable-http OpenAI-compatible API + │ │ + └──────────┬───────────────┘ + ▼ +┌──────────────────────────────────────────────────────────────────────┐ +│ Miranda (miranda.incus) │ +│ ┌────────────────────────────────────────────────┐ │ +│ │ Grafana MCP Server (Docker) │ │ +│ │ mcp/grafana:latest │ │ +│ │ Container: grafana-mcp │ │ +│ │ :25533 → :8000 │ │ +│ └─────────────────────┬──────────────────────────┘ │ +│ │ HTTP (internal network) │ +└────────────────────────┼─────────────────────────────────────────────┘ + │ + ▼ 
+┌──────────────────────────────────────────────────────────────────────┐ +│ Prospero (prospero.incus) — PPLG Stack │ +│ ┌────────────────────────────────────────────────┐ │ +│ │ Grafana :3000 │ │ +│ │ Authenticated via Service Account Token │ │ +│ └────────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────────────────────────┘ +``` + +### Cross-Host Dependency + +The Grafana MCP server on Miranda communicates with Grafana on Prospero over the Incus internal network (`prospero.incus:3000`). This means: + +- **PPLG must be deployed first** — Grafana must be running before deploying the MCP server +- The connection uses Grafana's **internal HTTP port** (3000), not the external HTTPS endpoint +- Authentication is handled by a **Grafana service account token**, not Casdoor OAuth + +## Terraform Resources + +### Host Definition + +Grafana MCP runs on Miranda, defined in `terraform/containers.tf`: + +| Attribute | Value | +|-----------|-------| +| Image | noble | +| Role | mcp_docker_host | +| Security Nesting | true | +| AppArmor | unconfined | +| Proxy: mcp_containers | `0.0.0.0:25530-25539` → `127.0.0.1:25530-25539` | + +### Dependencies + +| Resource | Relationship | +|----------|--------------| +| prospero (PPLG) | Grafana backend — service account token auth on `:3000` | +| miranda (MCPO) | MCPO proxies Grafana MCP at `localhost:25533/mcp` | + +## Ansible Deployment + +### Prerequisites + +1. **PPLG stack**: Grafana must be running on Prospero (`ansible-playbook pplg/deploy.yml`) +2. **Docker**: Docker must be installed on the target host (`ansible-playbook docker/deploy.yml`) +3. 
**Vault Secret**: `vault_grafana_service_account_token` must be set (see [Required Vault Secrets](#required-vault-secrets)) + +### Playbook + +```bash +cd ansible +ansible-playbook grafana_mcp/deploy.yml +``` + +### Files + +| File | Purpose | +|------|---------| +| `grafana_mcp/deploy.yml` | Main deployment playbook | +| `grafana_mcp/docker-compose.yml.j2` | Docker Compose template for the MCP server | + +### Deployment Steps + +1. **Pre-flight Check**: Verify Grafana is reachable on Prospero (`/api/health`) +2. **Create System User**: `grafana_mcp:grafana_mcp` system account +3. **Create Directory**: `/srv/grafana_mcp` with restricted permissions (750) +4. **Template Docker Compose**: Renders `docker-compose.yml.j2` with Grafana URL and service account token +5. **Start Container**: `docker compose up` via `community.docker.docker_compose_v2` +6. **Health Check**: Verifies the MCP endpoint is responding on `localhost:25533/mcp` + +### Deployment Order + +Grafana MCP must be deployed **after** PPLG and **before** MCPO: + +``` +pplg → docker → grafana_mcp → mcpo +``` + +This ensures Grafana is available before the MCP server starts, and MCPO can proxy to it. 
+
+## Docker Compose Configuration
+
+The container is defined in `grafana_mcp/docker-compose.yml.j2`:
+
+```yaml
+services:
+  grafana-mcp:
+    image: mcp/grafana:latest
+    container_name: grafana-mcp
+    restart: unless-stopped
+    ports:
+      - "25533:8000"
+    environment:
+      - GRAFANA_URL=http://prospero.incus:3000
+      - GRAFANA_SERVICE_ACCOUNT_TOKEN={{ grafana_service_account_token }}
+    command: ["--transport", "streamable-http", "--address", "0.0.0.0:8000", "--tls-skip-verify"]
+    logging:
+      driver: syslog
+      options:
+        syslog-address: "tcp://127.0.0.1:51433"
+        syslog-format: rfc5424
+        tag: "grafana-mcp"
+```
+
+Key configuration:
+- **Transport**: `streamable-http` — standard MCP HTTP transport
+- **TLS Skip Verify**: Enabled because Grafana is accessed over internal HTTP (not HTTPS)
+- **Syslog**: Logs shipped to Alloy on localhost for forwarding to Loki
+
+## Available Tools
+
+The Grafana MCP server exposes tools for interacting with Grafana's API:
+
+### Dashboard Operations
+- Search and list dashboards
+- Get dashboard details and panels
+- Query panel data
+
+### Datasource Operations
+- List configured datasources
+- Query datasources directly
+
+### Alerting
+- List alert rules
+- Get alert rule details and status
+
+### General
+- Get Grafana health status
+- Search across Grafana resources
+
+> **Note:** The specific tools available depend on the `mcp/grafana` Docker image version. Use the MCPO Swagger docs at `http://miranda.incus:25530/docs` to see the current tool inventory. 
+ +## Client Configuration + +### MCP Native Clients (Cline, Claude Desktop) + +```json +{ + "mcpServers": { + "grafana": { + "type": "streamable-http", + "url": "http://miranda.incus:25533/mcp" + } + } +} +``` + +### Via MCPO (OpenAI-Compatible) + +Grafana MCP is automatically available through MCPO at: + +``` +http://miranda.incus:25530/grafana +``` + +This endpoint is OpenAI-compatible and can be used by OpenWebUI, LobeChat, or any OpenAI SDK client: + +```python +import openai + +client = openai.OpenAI( + base_url="http://miranda.incus:25530/grafana", + api_key="not-required" +) +``` + +### OpenWebUI / LobeChat + +1. Navigate to **Settings → Tools → OpenAPI Servers** +2. Click **Add OpenAPI Server** +3. Configure: + - **Name:** Grafana MCP + - **URL:** `http://miranda.incus:25530/grafana` + - **Authentication:** None (MCPO handles upstream auth) +4. Save and enable the Grafana tools + +## Required Vault Secrets + +Add to `ansible/inventory/group_vars/all/vault.yml`: + +| Variable | Purpose | +|----------|---------| +| `vault_grafana_service_account_token` | Grafana service account token for MCP API access | + +### Creating a Grafana Service Account Token + +1. Log in to Grafana at `https://grafana.ouranos.helu.ca` (Casdoor SSO or local admin) +2. Navigate to **Administration → Service Accounts** +3. Click **Add service account** + - **Name:** `mcp-server` + - **Role:** `Viewer` (or `Editor` if write tools are needed) +4. Click **Add service account token** + - **Name:** `mcp-token` + - **Expiration:** No expiration (or set a rotation schedule) +5. Copy the generated token +6. 
Store in vault: + +```bash +cd ansible +ansible-vault edit inventory/group_vars/all/vault.yml +``` + +```yaml +vault_grafana_service_account_token: "glsa_xxxxxxxxxxxxxxxxxxxx" +``` + +## Host Variables + +**File:** `ansible/inventory/host_vars/miranda.incus.yml` + +```yaml +# Grafana MCP Config +grafana_mcp_user: grafana_mcp +grafana_mcp_group: grafana_mcp +grafana_mcp_directory: /srv/grafana_mcp +grafana_mcp_port: 25533 +grafana_mcp_grafana_host: prospero.incus +grafana_mcp_grafana_port: 3000 +grafana_service_account_token: "{{ vault_grafana_service_account_token }}" +``` + +Miranda's services list includes `grafana_mcp`: + +```yaml +services: + - alloy + - argos + - docker + - gitea_mcp + - grafana_mcp + - mcpo + - neo4j_mcp +``` + +## Monitoring + +### Syslog to Loki + +The Grafana MCP container ships logs via Docker's syslog driver to Alloy on Miranda: + +| Server | Syslog Port | Loki Tag | +|--------|-------------|----------| +| grafana-mcp | 51433 | `grafana-mcp` | + +### Grafana Log Queries + +Useful Loki queries in Grafana Explore: + +```logql +# All Grafana MCP logs +{hostname="miranda.incus", job="grafana_mcp"} + +# Errors only +{hostname="miranda.incus", job="grafana_mcp"} |= "error" or |= "ERROR" + +# Tool invocations +{hostname="miranda.incus", job="grafana_mcp"} |= "tool" +``` + +### MCPO Aggregation + +Grafana MCP is registered in MCPO's `config.json` as: + +```json +{ + "grafana": { + "type": "streamable-http", + "url": "http://localhost:25533/mcp" + } +} +``` + +MCPO exposes it at `http://miranda.incus:25530/grafana` with OpenAI-compatible API and Swagger documentation. 
+ +## Operations + +### Start / Stop + +```bash +ssh miranda.incus + +# Docker container +sudo -u grafana_mcp docker compose -f /srv/grafana_mcp/docker-compose.yml up -d +sudo -u grafana_mcp docker compose -f /srv/grafana_mcp/docker-compose.yml down + +# Or redeploy via Ansible +cd ansible +ansible-playbook grafana_mcp/deploy.yml +``` + +### Health Check + +```bash +# Container status +ssh miranda.incus docker ps --filter name=grafana-mcp + +# MCP endpoint +curl http://miranda.incus:25533/mcp + +# Via MCPO +curl http://miranda.incus:25530/grafana/tools + +# Grafana backend (from Miranda) +curl http://prospero.incus:3000/api/health +``` + +### Logs + +```bash +# Docker container logs +ssh miranda.incus docker logs -f grafana-mcp + +# Loki logs (via Grafana on Prospero) +# Query: {hostname="miranda.incus", job="grafana_mcp"} +``` + +## Troubleshooting + +### Container Won't Start + +```bash +ssh miranda.incus +sudo -u grafana_mcp docker compose -f /srv/grafana_mcp/docker-compose.yml logs +``` + +**Common causes:** +- Grafana on Prospero not running → check `ssh prospero.incus sudo systemctl status grafana-server` +- Invalid or expired service account token → regenerate in Grafana UI +- Port 25533 already in use → `ss -tlnp | grep 25533` +- Docker image pull failure → check Docker Hub access + +### MCP Endpoint Returns Errors + +**Verify service account token:** +```bash +curl -H "Authorization: Bearer YOUR_TOKEN" http://prospero.incus:3000/api/org +``` + +**Check container environment:** +```bash +ssh miranda.incus docker inspect grafana-mcp | jq '.[0].Config.Env' +``` + +### MCPO Not Exposing Grafana Tools + +**Verify MCPO config:** +```bash +ssh miranda.incus cat /srv/mcpo/config.json | jq '.mcpServers.grafana' +``` + +**Restart MCPO:** +```bash +ssh miranda.incus sudo systemctl restart mcpo +``` + +### Grafana Unreachable from Miranda + +**Test network connectivity:** +```bash +ssh miranda.incus curl -s http://prospero.incus:3000/api/health +``` + +If this fails, 
check: +- Prospero container is running: `incus list prospero` +- Grafana service is up: `ssh prospero.incus sudo systemctl status grafana-server` +- No firewall rules blocking inter-container traffic + +## Security Considerations + +✔ **Service Account Token** — Scoped to Viewer role, cannot modify Grafana configuration +✔ **Internal Network** — MCP server only accessible within the Incus network +✔ **Vault Storage** — Token stored encrypted in Ansible Vault +✔ **No Public Exposure** — Neither the MCP endpoint nor the MCPO proxy are internet-facing +⚠️ **Token Rotation** — Consider rotating the service account token periodically +⚠️ **Access Control** — MCPO currently doesn't require authentication for tool access + +## References + +- [PPLG Stack Documentation](pplg.md) — Grafana deployment on Prospero +- [MCPO Documentation](mcpo.md) — MCP gateway that proxies Grafana MCP +- [Grafana MCP Server](https://github.com/grafana/mcp-grafana) — Upstream project +- [Model Context Protocol Specification](https://modelcontextprotocol.io/) +- [Ansible Practices](ansible.md) +- [Agathos Overview](agathos.md) diff --git a/docs/hass.md b/docs/hass.md new file mode 100644 index 0000000..2e7b55e --- /dev/null +++ b/docs/hass.md @@ -0,0 +1,222 @@ +# Home Assistant + +## Overview + +[Home Assistant](https://github.com/home-assistant/core) is an open-source home automation platform. In the Agathos sandbox it runs as a native Python application inside a virtual environment, backed by PostgreSQL for state recording and fronted by HAProxy for TLS termination. 
+ +**Host:** Oberon +**Role:** container_orchestration +**Port:** 8123 +**URL:** https://hass.ouranos.helu.ca + +## Architecture + +``` +┌──────────┐ HTTPS ┌──────────────┐ HTTP ┌──────────────┐ +│ Client │────────▶│ HAProxy │────────▶│ Home │ +│ │ │ (Titania) │ │ Assistant │ +└──────────┘ │ :443 TLS │ │ (Oberon) │ + └──────────────┘ │ :8123 │ + └──────┬───────┘ + │ + ┌─────────────────┼─────────────────┐ + │ │ │ + ┌────▼─────┐ ┌──────▼──────┐ ┌─────▼─────┐ + │PostgreSQL│ │ Alloy │ │ Prometheus│ + │(Portia) │ │ (Oberon) │ │(Prospero) │ + │ :5432 │ │ scrape │ │ remote │ + │ recorder │ │ /api/prom │ │ write │ + └──────────┘ └─────────────┘ └───────────┘ +``` + +## Ansible Deployment + +### Playbook + +```bash +cd ansible +ansible-playbook hass/deploy.yml +``` + +### Files + +| File | Purpose | +|------|---------| +| `hass/deploy.yml` | Main deployment playbook | +| `hass/configuration.yaml.j2` | Home Assistant configuration | +| `hass/requirements.txt.j2` | Python package pinning | +| `hass/hass.service.j2` | Systemd service unit | + +### Variables + +#### Host Variables (`host_vars/oberon.incus.yml`) + +| Variable | Description | Value | +|----------|-------------|-------| +| `hass_user` | System user | `hass` | +| `hass_group` | System group | `hass` | +| `hass_directory` | Install directory | `/srv/hass` | +| `hass_media_directory` | Media storage | `/srv/hass/media` | +| `hass_port` | HTTP listen port | `8123` | +| `hass_version` | Pinned HA release | `2026.2.0` | +| `hass_db_host` | PostgreSQL host | `portia.incus` | +| `hass_db_port` | PostgreSQL port | `5432` | +| `hass_db_name` | Database name | `hass` | +| `hass_db_user` | Database user | `hass` | +| `hass_db_password` | Database password | `{{ vault_hass_db_password }}` | +| `hass_metrics_token` | Prometheus bearer token | `{{ vault_hass_metrics_token }}` | + +#### Host Variables (`host_vars/portia.incus.yml`) + +| Variable | Description | +|----------|-------------| +| `hass_db_name` | Database name on 
Portia | +| `hass_db_user` | Database user on Portia | +| `hass_db_password` | `{{ vault_hass_db_password }}` | + +#### Vault Variables (`group_vars/all/vault.yml`) + +| Variable | Description | +|----------|-------------| +| `vault_hass_db_password` | PostgreSQL password for hass database | +| `vault_hass_metrics_token` | Long-Lived Access Token for Prometheus scraping | + +## Configuration + +### PostgreSQL Recorder + +Home Assistant uses the `recorder` integration to persist entity states and events to PostgreSQL on Portia instead of the default SQLite. Configured in `configuration.yaml.j2`: + +```yaml +recorder: + db_url: "postgresql://hass:@portia.incus:5432/hass" + purge_keep_days: 30 + commit_interval: 1 +``` + +The database and user are provisioned by `postgresql/deploy.yml` alongside other service databases. + +### HTTP / Reverse Proxy + +HAProxy on Titania terminates TLS and forwards to Oberon:8123. The `http` block in `configuration.yaml.j2` configures trusted proxies so HA correctly reads `X-Forwarded-For` headers: + +```yaml +http: + server_port: 8123 + use_x_forwarded_for: true + trusted_proxies: + - 10.0.0.0/8 +``` + +### HAProxy Backend + +Defined in `host_vars/titania.incus.yml` under `haproxy_backends`: + +| Setting | Value | +|---------|-------| +| Subdomain | `hass` | +| Backend | `oberon.incus:8123` | +| Health path | `/api/` | +| Timeout | 300s (WebSocket support) | + +The wildcard TLS certificate (`*.ouranos.helu.ca`) covers `hass.ouranos.helu.ca` automatically — no certificate changes required. + +## Authentication + +Home Assistant uses its **native `homeassistant` auth provider** (built-in username/password). HA does not support OIDC/OAuth2 natively, so Casdoor SSO integration is not available. + +On first deployment, HA will present an onboarding wizard to create the initial admin user. + +## Monitoring + +### Prometheus Metrics + +Home Assistant exposes Prometheus metrics at `/api/prometheus`. 
The Alloy agent on Oberon scrapes this endpoint with bearer token authentication and remote-writes to Prometheus on Prospero. + +| Setting | Value | +|---------|-------| +| Metrics path | `/api/prometheus` | +| Scrape interval | 60s | +| Auth | Bearer token (Long-Lived Access Token) | + +**⚠️ Two-Phase Metrics Bootstrapping:** + +The `vault_hass_metrics_token` must be a Home Assistant **Long-Lived Access Token**, which can only be generated from the HA web UI after the initial deployment: + +1. Deploy Home Assistant: `ansible-playbook hass/deploy.yml` +2. Complete the onboarding wizard at `https://hass.ouranos.helu.ca` +3. Navigate to **Profile → Security → Long-Lived Access Tokens → Create Token** +4. Store the token in vault: `vault_hass_metrics_token: ""` +5. Redeploy Alloy to pick up the token: `ansible-playbook alloy/deploy.yml` + +Until the token is created, the Alloy hass scrape will fail silently. + +### Loki Logs + +Systemd journal logs are collected by Alloy's `loki.source.journal` and shipped to Loki on Prospero. + +```bash +# Query in Grafana Explore +{job="systemd", hostname="oberon"} |= "hass" +``` + +## Operations + +### Start / Stop + +```bash +sudo systemctl start hass +sudo systemctl stop hass +sudo systemctl restart hass +``` + +### Health Check + +```bash +curl http://localhost:8123/api/ +``` + +### Logs + +```bash +journalctl -u hass -f +``` + +### Version Upgrade + +1. Update `hass_version` in `host_vars/oberon.incus.yml` +2. Run: `ansible-playbook hass/deploy.yml` + +The playbook will reinstall the pinned version via pip and restart the service. 
+ +## Troubleshooting + +### Common Issues + +| Symptom | Cause | Resolution | +|---------|-------|------------| +| Service won't start | Missing Python deps | Check `pip install` output in deploy log | +| Database connection error | Portia unreachable | Verify PostgreSQL is running: `ansible-playbook postgresql/deploy.yml` | +| 502 via HAProxy | HA not listening | Check `systemctl status hass` on Oberon | +| Metrics scrape failing | Missing/invalid token | Generate Long-Lived Access Token from HA UI (see Monitoring section) | + +### Debug Mode + +```bash +# Check service status +sudo systemctl status hass + +# View recent logs +journalctl -u hass --since "5 minutes ago" + +# Test database connectivity from Oberon +psql -h portia.incus -U hass -d hass -c "SELECT 1" +``` + +## References + +- [Home Assistant Documentation](https://www.home-assistant.io/docs/) +- [Home Assistant GitHub](https://github.com/home-assistant/core) +- [Recorder Integration](https://www.home-assistant.io/integrations/recorder/) +- [Prometheus Integration](https://www.home-assistant.io/integrations/prometheus/) +- [HTTP Integration](https://www.home-assistant.io/integrations/http/) diff --git a/docs/jupyterlab.md b/docs/jupyterlab.md new file mode 100644 index 0000000..9bbad1c --- /dev/null +++ b/docs/jupyterlab.md @@ -0,0 +1,342 @@ +# JupyterLab - Interactive Computing Environment + +## Overview +JupyterLab is a web-based interactive development environment for notebooks, code, and data. Deployed on **Puck** as a systemd service running in a Python virtual environment, with OAuth2-Proxy sidecar providing Casdoor SSO authentication. 
+ +**Host:** puck.incus +**Role:** Application Runtime (Python App Host) +**Container Port:** 22181 (JupyterLab), 22182 (OAuth2-Proxy) +**External Access:** https://jupyter.ouranos.helu.ca/ (via HAProxy on Titania) + +## Architecture + +``` +┌──────────┐ ┌────────────┐ ┌─────────────┐ ┌────────────┐ +│ Client │─────▶│ HAProxy │─────▶│ OAuth2-Proxy│─────▶│ JupyterLab │ +│ │ │ (Titania) │ │ (Puck) │ │ (Puck) │ +└──────────┘ └────────────┘ └─────────────┘ └────────────┘ + │ + ▼ + ┌───────────┐ + │ Casdoor │ + │ (Titania) │ + └───────────┘ +``` + +### Authentication Flow + +``` +┌──────────┐ ┌────────────┐ ┌─────────────┐ ┌──────────┐ +│ Browser │─────▶│ HAProxy │─────▶│ OAuth2-Proxy│─────▶│ Casdoor │ +│ │ │ (Titania) │ │ (Puck) │ │(Titania) │ +└──────────┘ └────────────┘ └─────────────┘ └──────────┘ + │ │ │ + │ 1. Access jupyter.ouranos.helu.ca │ │ + │─────────────────────────────────────▶│ │ + │ 2. No session - redirect to Casdoor │ │ + │◀─────────────────────────────────────│ │ + │ 3. User authenticates │ │ + │─────────────────────────────────────────────────────────▶│ + │ 4. Redirect with auth code │ │ + │◀─────────────────────────────────────────────────────────│ + │ 5. Exchange code, set session cookie│ │ + │◀─────────────────────────────────────│ │ + │ 6. Proxy to JupyterLab │ │ + │◀─────────────────────────────────────│ │ +``` + +## Deployment + +### Playbook + +```bash +cd ansible +ansible-playbook jupyterlab/deploy.yml +``` + +### Files + +| File | Purpose | +|------|---------| +| `jupyterlab/deploy.yml` | Main deployment playbook | +| `jupyterlab/jupyterlab.service.j2` | Systemd unit for JupyterLab | +| `jupyterlab/oauth2-proxy-jupyter.service.j2` | Systemd unit for OAuth2-Proxy sidecar | +| `jupyterlab/oauth2-proxy-jupyter.cfg.j2` | OAuth2-Proxy configuration | +| `jupyterlab/jupyter_lab_config.py.j2` | JupyterLab server configuration | + +### Deployment Steps + +1. **Install Dependencies**: python3-venv, nodejs, npm, graphviz +2. 
**Ensure User Exists**: `robert:robert` with home directory +3. **Create Directories**: Notebooks dir, config dir, log dir +4. **Create Virtual Environment**: `/home/robert/env/jupyter` +5. **Install Python Packages**: jupyterlab, jupyter-ai, langchain-ollama, matplotlib, plotly +6. **Install Jupyter Extensions**: contrib nbextensions +7. **Template Configuration**: Apply JupyterLab config +8. **Download OAuth2-Proxy**: Binary from GitHub releases +9. **Template OAuth2-Proxy Config**: With Casdoor OIDC settings +10. **Start Services**: Enable and start both systemd units + +## Configuration + +### Key Features + +- **Jupyter AI**: AI assistance via jupyter-ai[all] with LangChain Ollama integration +- **Visualization**: matplotlib, plotly for data visualization +- **Diagrams**: Mermaid support via jupyterlab-mermaid +- **Extensions**: Jupyter contrib nbextensions +- **SSO**: Casdoor authentication via OAuth2-Proxy sidecar +- **WebSocket**: Full WebSocket support through reverse proxy + +### Storage Locations + +| Path | Purpose | Owner | +|------|---------|-------| +| `/home/robert/Notebooks` | Notebook files | robert:robert | +| `/home/robert/env/jupyter` | Python virtual environment | robert:robert | +| `/etc/jupyterlab` | Configuration files | root:robert | +| `/var/log/jupyterlab` | Application logs | robert:robert | +| `/etc/oauth2-proxy-jupyter` | OAuth2-Proxy config | root:root | + +### Installed Python Packages + +| Package | Purpose | +|---------|---------| +| `jupyterlab` | Core JupyterLab server | +| `jupyter-ai[all]` | AI assistant integration | +| `langchain-ollama` | Ollama LLM integration | +| `matplotlib` | Data visualization | +| `plotly` | Interactive charts | +| `jupyter_contrib_nbextensions` | Community extensions | +| `jupyterlab-mermaid` | Mermaid diagram support | +| `ipywidgets` | Interactive widgets | + +### Logging + +- **JupyterLab**: systemd journal via `SyslogIdentifier=jupyterlab` +- **OAuth2-Proxy**: systemd journal via 
`SyslogIdentifier=oauth2-proxy-jupyter` +- **Alloy Forwarding**: Syslog port 51491 → Loki + +## Access After Deployment + +1. **Web Interface**: https://jupyter.ouranos.helu.ca/ +2. **Authentication**: Redirects to Casdoor SSO login +3. **After Login**: Full JupyterLab interface with notebook access + +## Monitoring + +### Alloy Configuration +**File:** `ansible/alloy/puck/config.alloy.j2` + +- **Log Collection**: Syslog port 51491 → Loki +- **Job Label**: `jupyterlab` +- **System Metrics**: Process exporter tracks JupyterLab process + +### Health Check +- **URL**: `http://puck.incus:22182/ping` (OAuth2-Proxy) +- **JupyterLab API**: `http://127.0.0.1:22181/api/status` (localhost only) + +## Required Vault Secrets + +Add to `ansible/inventory/group_vars/all/vault.yml`: + +### 1. OAuth Client ID +```yaml +vault_jupyter_oauth_client_id: "jupyter-oauth-client" +``` +**Requirements:** +- **Purpose**: Client ID for Casdoor OAuth2 application +- **Source**: Must match `clientId` in Casdoor application configuration + +### 2. OAuth Client Secret +```yaml +vault_jupyter_oauth_client_secret: "YourRandomOAuthSecret123!" +``` +**Requirements:** +- **Length**: 32+ characters recommended +- **Purpose**: Client secret for Casdoor OAuth2 authentication +- **Generation**: + ```bash + openssl rand -base64 32 + ``` + +### 3. 
Cookie Secret +```yaml +vault_jupyter_oauth2_cookie_secret: "32CharacterRandomStringHere1234" +``` +**Requirements:** +- **Length**: Exactly 32 characters (or 16/24 for AES) +- **Purpose**: Encrypts OAuth2-Proxy session cookies +- **Generation**: + ```bash + openssl rand -base64 32 | head -c 32 + ``` + +## Host Variables + +**File:** `ansible/inventory/host_vars/puck.incus.yml` + +```yaml +# JupyterLab Configuration +jupyterlab_user: robert +jupyterlab_group: robert +jupyterlab_notebook_dir: /home/robert/Notebooks +jupyterlab_venv_dir: /home/robert/env/jupyter + +# Ports +jupyterlab_port: 22181 # JupyterLab (localhost only) +jupyterlab_proxy_port: 22182 # OAuth2-Proxy (exposed to HAProxy) + +# OAuth2-Proxy Configuration +jupyterlab_oauth2_proxy_dir: /etc/oauth2-proxy-jupyter +jupyterlab_oauth2_proxy_version: "7.6.0" +jupyterlab_domain: "ouranos.helu.ca" +jupyterlab_oauth2_oidc_issuer_url: "https://id.ouranos.helu.ca" +jupyterlab_oauth2_redirect_url: "https://jupyter.ouranos.helu.ca/oauth2/callback" + +# OAuth2 Credentials (from vault) +jupyterlab_oauth_client_id: "{{ vault_jupyter_oauth_client_id }}" +jupyterlab_oauth_client_secret: "{{ vault_jupyter_oauth_client_secret }}" +jupyterlab_oauth2_cookie_secret: "{{ vault_jupyter_oauth2_cookie_secret }}" + +# Alloy Logging +jupyterlab_syslog_port: 51491 +``` + +## OAuth2 / Casdoor SSO + +JupyterLab uses OAuth2-Proxy as a sidecar to handle Casdoor authentication. This pattern is simpler than native OAuth for single-user setups. + +### Why OAuth2-Proxy Sidecar? 
+ +| Approach | Pros | Cons | +|----------|------|------| +| **OAuth2-Proxy (chosen)** | Simple setup, no JupyterLab modification | Extra service to manage | +| **Native JupyterHub OAuth** | Integrated solution | More complex, overkill for single user | +| **Token-only auth** | Simplest | Less secure, no SSO integration | + +### Casdoor Application Configuration + +A JupyterLab application is defined in `ansible/casdoor/init_data.json.j2`: + +| Setting | Value | +|---------|-------| +| **Name** | `app-jupyter` | +| **Client ID** | `vault_jupyter_oauth_client_id` | +| **Redirect URI** | `https://jupyter.ouranos.helu.ca/oauth2/callback` | +| **Grant Types** | `authorization_code`, `refresh_token` | + +### URL Strategy + +| URL Type | Address | Used By | +|----------|---------|---------| +| **OIDC Issuer** | `https://id.ouranos.helu.ca` | OAuth2-Proxy (external) | +| **Redirect URL** | `https://jupyter.ouranos.helu.ca/oauth2/callback` | Browser callback | +| **Upstream** | `http://127.0.0.1:22181` | OAuth2-Proxy → JupyterLab | + +### Deployment Order + +1. **Deploy Casdoor first** (if not already running): + ```bash + ansible-playbook casdoor/deploy.yml + ``` + +2. **Update HAProxy** (add jupyter backend): + ```bash + ansible-playbook haproxy/deploy.yml + ``` + +3. **Deploy JupyterLab**: + ```bash + ansible-playbook jupyterlab/deploy.yml + ``` + +4. 
**Update Alloy** (for log forwarding): + ```bash + ansible-playbook alloy/deploy.yml + ``` + +## Integration with Other Services + +### HAProxy Routing +**Backend Configuration** (`titania.incus.yml`): +```yaml +- subdomain: "jupyter" + backend_host: "puck.incus" + backend_port: 22182 # OAuth2-Proxy port + health_path: "/ping" + timeout_server: 300s # WebSocket support +``` + +### Alloy Log Forwarding +**Syslog Configuration** (`puck/config.alloy.j2`): +```hcl +loki.source.syslog "jupyterlab_logs" { + listener { + address = "127.0.0.1:51491" + protocol = "tcp" + labels = { + job = "jupyterlab", + } + } + forward_to = [loki.write.default.receiver] +} +``` + +## Troubleshooting + +### Service Status +```bash +ssh puck.incus +sudo systemctl status jupyterlab +sudo systemctl status oauth2-proxy-jupyter +``` + +### View Logs +```bash +# JupyterLab logs +sudo journalctl -u jupyterlab -f + +# OAuth2-Proxy logs +sudo journalctl -u oauth2-proxy-jupyter -f +``` + +### Test JupyterLab Directly (bypass OAuth) +```bash +# From puck container +curl http://127.0.0.1:22181/api/status +``` + +### Test OAuth2-Proxy Health +```bash +curl http://puck.incus:22182/ping +``` + +### Verify Virtual Environment +```bash +ssh puck.incus +sudo -u robert /home/robert/env/jupyter/bin/jupyter --version +``` + +### Common Issues + +| Issue | Solution | +|-------|----------| +| WebSocket disconnects | Verify `timeout_server: 300s` in HAProxy backend | +| OAuth redirect loop | Check `redirect_url` matches Casdoor app config | +| 502 Bad Gateway | Ensure JupyterLab service is running on port 22181 | +| Cookie errors | Verify `cookie_secret` is exactly 32 characters | + +## Version Information + +- **Installation Method**: Python pip in virtual environment +- **JupyterLab Version**: Latest stable (pip managed) +- **OAuth2-Proxy Version**: 7.6.0 (binary from GitHub) +- **Update Process**: Re-run deployment playbook + +## References + +- **JupyterLab Documentation**: https://jupyterlab.readthedocs.io/ 
+- **OAuth2-Proxy Documentation**: https://oauth2-proxy.github.io/oauth2-proxy/ +- **Jupyter AI**: https://jupyter-ai.readthedocs.io/ +- **Casdoor OIDC**: https://casdoor.org/docs/integration/oidc diff --git a/docs/kb/Docker Compose doesn't pull newer images for existing tags.md b/docs/kb/Docker Compose doesn't pull newer images for existing tags.md new file mode 100644 index 0000000..17e2e78 --- /dev/null +++ b/docs/kb/Docker Compose doesn't pull newer images for existing tags.md @@ -0,0 +1,127 @@ +Docker Compose doesn't pull newer images for existing tags +----------------------------------------------------------- + +# Issue + +Running `docker compose up` on a service tagged `:latest` does not check the registry for a newer image. The container keeps running the old image even though a newer one has been pushed upstream. + +## Symptoms + +- `docker compose up` starts the container immediately using the locally cached image +- `docker compose pull` or `docker pull :latest` successfully downloads a newer image +- After pulling manually, `docker compose up` recreates the container with the new image +- The `community.docker.docker_compose_v2` Ansible module with `state: present` behaves identically — no pull check + +# Explanation + +Docker's default behaviour is: **if an image with the requested tag exists locally, use it without checking the registry.** The `:latest` tag is not special — it's just a regular mutable tag. Docker does not treat it as "always fetch the newest." It is simply the default tag applied when no tag is specified. + +When you run `docker compose up`: + +1. Docker checks if `image:latest` exists in the local image store +2. If yes → use it, no registry check +3. If no → pull from registry + +This means a stale `:latest` can sit on your host indefinitely while the upstream registry has a completely different image behind the same tag. 
The only way Docker knows to pull is if: +- The image doesn't exist locally at all +- You explicitly tell it to pull + +The same applies to the Ansible `community.docker.docker_compose_v2` module — `state: present` maps to `docker compose up` behaviour, so no pull check occurs unless you tell it to. + +# Solution + +Two complementary fixes ensure images are always checked against the registry. + +## 1. Docker Compose — `pull_policy: always` + +Add `pull_policy: always` to the service definition in `docker-compose.yml`: + +```yaml +services: + my-service: + image: registry.example.com/my-image:latest + pull_policy: always # Check registry on every `up` + container_name: my-service + ... +``` + +With this set, `docker compose up` will always contact the registry and compare the local image digest with the remote one. If they match, no download occurs — it's a lightweight check. If they differ, the new image layers are pulled. + +Valid values for `pull_policy`: + +| Value | Behaviour | +|-------|-----------| +| `always` | Always check the registry before starting | +| `missing` | Only pull if the image doesn't exist locally (default) | +| `never` | Never pull, fail if image doesn't exist locally | +| `build` | Always build the image (for services with `build:`) | + +## 2. Ansible — `pull: always` on `docker_compose_v2` + +Add `pull: always` to the `community.docker.docker_compose_v2` task: + +```yaml +- name: Start service + community.docker.docker_compose_v2: + project_src: "{{ service_directory }}" + state: present + pull: always # Check registry during deploy +``` + +Valid values for `pull`: + +| Value | Behaviour | +|-------|-----------| +| `always` | Always pull before starting (like `docker compose pull && up`) | +| `missing` | Only pull if image doesn't exist locally | +| `never` | Never pull | +| `policy` | Defer to `pull_policy` defined in the compose file | + +## Why use both? 
+ +- **`pull_policy` in compose file** — Protects against manual `docker compose up` on the host +- **`pull: always` in Ansible** — Ensures automated deployments always get the freshest image + +They are independent mechanisms. The Ansible `pull` parameter runs a pull step before compose up, regardless of what the compose file says. Belt and suspenders. + +# Agathos Fix + +Applied to `ansible/gitea_mcp/` as the first instance. The same pattern should be applied to any service using mutable tags (`:latest`, `:stable`, etc.). + +**docker-compose.yml.j2:** +```yaml +services: + gitea-mcp: + image: docker.gitea.com/gitea-mcp-server:latest + pull_policy: always + ... +``` + +**deploy.yml:** +```yaml +- name: Start Gitea MCP service + community.docker.docker_compose_v2: + project_src: "{{ gitea_mcp_directory }}" + state: present + pull: always +``` + +# When you DON'T need this + +- **Pinned image tags** (e.g., `postgres:16.2`, `grafana/grafana:11.1.0`) — The tag is immutable, so there's nothing newer to pull. Using `pull: always` here just adds a redundant registry check on every deploy. +- **Locally built images** — If the image is built by `docker compose build`, use `pull_policy: build` instead. +- **Air-gapped / offline hosts** — `pull: always` will fail if the registry is unreachable. Use `missing` or `never`. 
+ +# Verification + +```bash +# Check what image a running container is using +docker inspect --format='{{.Image}}' gitea-mcp + +# Compare local digest with remote +docker images --digests docker.gitea.com/gitea-mcp-server + +# Force pull and check if image ID changes +docker compose pull +docker compose up -d +``` diff --git a/docs/kb/Docker won't start inside Incus container.md b/docs/kb/Docker won't start inside Incus container.md new file mode 100644 index 0000000..effb5ea --- /dev/null +++ b/docs/kb/Docker won't start inside Incus container.md @@ -0,0 +1,134 @@ +Docker won't start inside Incus container +------------------------------------------ + +# Issue +Running Docker inside Incus has worked for years, but a recent Ubuntu package update caused it to fail. + +## Symptoms + +Docker containers won't start with the following error: + +``` +docker compose up +Attaching to neo4j +Error response from daemon: failed to create task for container: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: error during container init: open sysctl net.ipv4.ip_unprivileged_port_start file: reopen fd 8: permission denied +``` + +The issue is AppArmor on Incus containers. The host has AppArmor, and Incus applies an AppArmor profile to containers with `security.nesting=true` that blocks Docker from writing to `/proc/sys/net/ipv4/ip_unprivileged_port_start`. + +# Solution (Automated) + +The fix requires **both** host-side and container-side changes. These are now automated in our infrastructure: + +## 1. Terraform - Host-side fix + +In `terraform/containers.tf`, all containers with `security.nesting=true` now include: + +```terraform +config = { + "security.nesting" = true + "raw.lxc" = "lxc.apparmor.profile=unconfined" +} +``` + +This tells Incus not to load any AppArmor profile for the container. + +## 2. 
Ansible - Container-side fix + +In `ansible/docker/deploy.yml`, Docker deployment now creates a systemd override: + +```yaml +- name: Create AppArmor workaround for Incus nested Docker + ansible.builtin.copy: + content: | + [Service] + Environment=container="setmeandforgetme" + dest: /etc/systemd/system/docker.service.d/apparmor-workaround.conf +``` + +This tells Docker to skip loading its own AppArmor profile. + +# Manual Workaround + +If you need to fix this manually (e.g., before running Terraform/Ansible): + +## Step 1: Force unconfined mode from the Incus host + +```bash +# On the HOST (pan.helu.ca), not in the container +incus config set raw.lxc "lxc.apparmor.profile=unconfined" --project agathos +incus restart --project agathos +``` + +## Step 2: Disable AppArmor for Docker inside the container + +```bash +# Inside the container +sudo mkdir -p /etc/systemd/system/docker.service.d +sudo tee /etc/systemd/system/docker.service.d/apparmor-workaround.conf < "RETURN 1" +``` + +### Logs + +```bash +# Docker container logs +docker logs -f neo4j + +# Via Loki (Grafana Explore) +{job="neo4j", hostname="ariel.incus"} +``` + +### Cypher Shell Access + +```bash +# SSH to Ariel and exec into container +ssh ariel.incus +docker exec -it neo4j cypher-shell -u neo4j -p +``` + +### Backup + +Neo4j data persists in Docker volumes. Backup procedures: + +```bash +# Stop container for consistent backup +docker compose -f /srv/neo4j/docker-compose.yml stop + +# Backup volumes +docker run --rm -v neo4j_data:/data -v /backup:/backup alpine \ + tar czf /backup/neo4j_data_$(date +%Y%m%d).tar.gz -C /data . 
+ +# Start container +docker compose -f /srv/neo4j/docker-compose.yml up -d +``` + +### Restore + +```bash +# Stop container +docker compose -f /srv/neo4j/docker-compose.yml down + +# Remove existing volume +docker volume rm neo4j_data + +# Create new volume and restore +docker volume create neo4j_data +docker run --rm -v neo4j_data:/data -v /backup:/backup alpine \ + tar xzf /backup/neo4j_data_YYYYMMDD.tar.gz -C /data + +# Start container +docker compose -f /srv/neo4j/docker-compose.yml up -d +``` + +## Troubleshooting + +### Common Issues + +| Symptom | Cause | Resolution | +|---------|-------|------------| +| Container won't start | Auth format issue | Check `NEO4J_AUTH` format is `user/password` | +| APOC procedures fail | Security restrictions | Verify `neo4j_apoc_unrestricted` includes procedure | +| Connection refused | Port not exposed | Check Incus proxy device configuration | +| Bolt connection fails | Wrong port | Use port 7687, not 25554 | + +### Debug Mode + +```bash +# View container startup logs +docker logs neo4j + +# Check Neo4j internal logs +docker exec neo4j cat /logs/debug.log +``` + +### Verify APOC Installation + +```cypher +CALL apoc.help("apoc") +YIELD name, text +RETURN name, text LIMIT 10; +``` + +## Related Services + +### Neo4j MCP Servers (Miranda) + +Two MCP servers run on Miranda to provide AI agent access to Neo4j: + +| Server | Port | Purpose | +|--------|------|---------| +| neo4j-cypher | 25531 | Direct Cypher query execution | +| neo4j-memory | 25532 | Knowledge graph memory operations | + +See [Neo4j MCP documentation](#neo4j-mcp-servers) for deployment details. 
+ +## References + +- [Neo4j Documentation](https://neo4j.com/docs/) +- [APOC Library Documentation](https://neo4j.com/labs/apoc/) +- [Terraform Practices](../terraform.md) +- [Ansible Practices](../ansible.md) +- [Sandbox Overview](../agathos.html) diff --git a/docs/nextcloud.md b/docs/nextcloud.md new file mode 100644 index 0000000..5fd25bc --- /dev/null +++ b/docs/nextcloud.md @@ -0,0 +1,380 @@ +# Nextcloud - Self-Hosted Cloud Collaboration + +## Overview +Nextcloud is a self-hosted cloud collaboration platform providing file storage, sharing, calendar, contacts, and productivity tools. Deployed as a **native LAPP stack** (Linux, Apache, PostgreSQL, PHP) on **Rosalind** with Memcached caching and Incus storage volume for data. + +**Host:** rosalind.incus +**Role:** Collaboration (PHP, Go, Node.js runtimes) +**Container Port:** 22083 +**External Access:** https://nextcloud.ouranos.helu.ca/ (via HAProxy on Titania) +**Installation Method:** Native (tar.bz2 extraction to /var/www/nextcloud) + +## Architecture + +``` +┌──────────┐ ┌────────────┐ ┌───────────┐ ┌───────────┐ +│ Client │─────▶│ HAProxy │─────▶│ Apache2 │─────▶│PostgreSQL │ +│ │ │ (Titania) │ │ Nextcloud │ │ (Portia) │ +└──────────┘ └────────────┘ │(Rosalind) │ └───────────┘ + └───────────┘ + │ + ├─────────▶ Memcached (Local) + │ + └─────────▶ /mnt/nextcloud (Volume) +``` + +## Deployment + +### Playbook + +```bash +cd ansible +ansible-playbook nextcloud/deploy.yml +``` + +### Files + +| File | Purpose | +|------|---------| +| `nextcloud/deploy.yml` | Main deployment playbook | +| `nextcloud/nextcloud.conf.j2` | Apache VirtualHost template | + +2. **Create Data Directory**: `/mnt/nextcloud` on Incus storage volume +3. **Download Nextcloud**: Latest tarball from official site (if not already present) +4. **Extract to Web Root**: `/var/www/nextcloud` (if new installation) +5. **Set Permissions**: `www-data:www-data` ownership +6. 
**Configure Apache**: Template vhost with port 22083, enable mods, disable default site +7. **Run Installation**: OCC command-line installer (generates config.php with secrets) +8. **Configure via OCC**: Set trusted domains, Memcached, background job mode +9. **Setup Cron**: Background jobs every 5 minutes as www-data + +**⚠️ Important**: The playbook does NOT template over config.php after installation. All configuration changes are made via OCC commands to preserve auto-generated secrets (instanceid, passwordsalt, secret). + +## Configuration + +### Key Features + +- **PostgreSQL Backend**: Database on Portia +- **Memcached Caching**: Local distributed cache with `nc_` prefix +- **Incus Storage Volume**: Dedicated 100GB volume at /mnt/nextcloud +- **Apache Web Server**: mod_php with rewrite/headers modules +- **Cron Background Jobs**: System cron (not Docker/AJAX) +- **Native Installation**: No Docker overhead, matches production pattern + +### Storage Configuration + +| Path | Purpose | Owner | Mount | +|------|---------|-------|-------| +| `/var/www/nextcloud` | Application files | www-data | Local | +| `/mnt/nextcloud` | User data directory | www-data | Incus volume | +| `/var/log/apache2` | Web server logs | root | Local | + +### Apache Modules + +Required modules enabled by playbook: +- `rewrite` - URL rewriting +- `headers` - HTTP header manipulation +- `env` - Environment variable passing +- `dir` - Directory index handling +- `mime` - MIME type configuration + +### PHP Configuration + +Installed PHP extensions: +- `php-gd` - Image manipulation +- `php-pgsql` - PostgreSQL database +- `php-curl` - HTTP client +- `php-mbstring` - Multibyte string handling +- `php-intl` - Internationalization +- `php-gmp` - GNU Multiple Precision +- `php-bcmath` - Binary calculator +- `php-xml` - XML processing +- `php-imagick` - ImageMagick integration +- `php-zip` - ZIP archive handling +- `php-memcached` - Memcached caching + +### Memcached Configuration + +- **Host**: 
localhost:11211 +- **Prefix**: `nc_` (Nextcloud-specific keys) +- **Local**: `\OC\Memcache\Memcached` +- **Distributed**: `\OC\Memcache\Memcached` + +### Cron Jobs + +Background jobs configured via system cron: +```cron +*/5 * * * * php /var/www/nextcloud/cron.php +``` + +Runs as `www-data` user every 5 minutes. + +## Access After Deployment + +1. **Web Interface**: https://nextcloud.ouranos.helu.ca/ +2. **First Login**: Use admin credentials from vault +3. **Initial Setup**: Configure apps and settings via web UI +4. **Client Apps**: Download desktop/mobile clients from Nextcloud website + +### Desktop/Mobile Sync + +- **Server URL**: https://nextcloud.ouranos.helu.ca +- **Username**: admin (or created user) +- **Password**: From vault +- **Desktop Client**: https://nextcloud.com/install/#install-clients +- **Mobile Apps**: iOS App Store / Google Play Store + +### WebDAV Access + +- **WebDAV URL**: `https://nextcloud.ouranos.helu.ca/remote.php/dav/files/USERNAME/` +- **Use Cases**: File sync, calendar (CalDAV), contacts (CardDAV) + +## Monitoring + +### Alloy Configuration +**File:** `ansible/alloy/rosalind/config.alloy.j2` + +- **Apache Access Logs**: `/var/log/apache2/access.log` → Loki +- **Apache Error Logs**: `/var/log/apache2/error.log` → Loki +- **System Metrics**: Process exporter tracks Apache/PHP processes +- **Labels**: job=apache_access, job=apache_error + +### Health Checks + +**HAProxy Health Endpoint**: `/status.php` + +**Manual Health Check**: +```bash +curl http://rosalind.incus:22082/status.php +``` + +Expected response: JSON with status information + +## Required Vault Secrets + +Add to `ansible/inventory/group_vars/all/vault.yml`: + +### 1. Database Password +```yaml +vault_nextcloud_db_password: "PostgresSecurePassword123!" +``` +**Requirements:** +- Minimum 12 characters +- Used by PostgreSQL authentication + +### 2. Admin Password +```yaml +vault_nextcloud_admin_password: "AdminSecurePassword123!" 
+``` +**Requirements:** +- Minimum 8 characters (Nextcloud requirement) +- Used for admin user login +- **Important**: Store securely, used for web interface access + +### 3. Instance Secrets (Auto-Generated) +These are automatically generated during installation by the OCC installer and stored in `/var/www/nextcloud/config/config.php`. The host_vars should leave these empty: +```yaml +nextcloud_instance_id: "" # Auto-generated, leave empty +nextcloud_password_salt: "" # Auto-generated, leave empty +nextcloud_secret: "" # Auto-generated, leave empty +``` + +**ℹ️ These secrets persist in config.php and do not need to be stored in vault or host_vars.** They are only referenced in these variables for consistency with the original template design. + +## Host Variables + +**File:** `ansible/inventory/host_vars/rosalind.incus.yml` + +```yaml +# Nextcloud Configuration +nextcloud_web_port: 22083 +nextcloud_data_dir: /mnt/nextcloud + +# Database Configuration +nextcloud_db_type: pgsql +nextcloud_db_host: portia.incus +nextcloud_db_port: 5432 +nextcloud_db_name: nextcloud +nextcloud_db_user: nextcloud +nextcloud_db_password: "{{vault_nextcloud_db_password}}" + +# Admin Configuration +nextcloud_admin_user: admin +nextcloud_admin_password: "{{vault_nextcloud_admin_password}}" + +# Domain Configuration +nextcloud_domain: nextcloud.ouranos.helu.ca + +# Instance secrets (generated during install) +nextcloud_instance_id: "" +nextcloud_password_salt: "" +nextcloud_secret: "" +``` + +## Database Setup + +Nextcloud requires a PostgreSQL database on Portia. This is automatically created by the `postgresql/deploy.yml` playbook. 
+ +**Database Details:** +- **Name**: nextcloud +- **User**: nextcloud +- **Owner**: nextcloud +- **Extensions**: None required + +## Storage Setup + +### Incus Storage Volume +**Terraform Resource:** `terraform/storage.tf` +```hcl +resource "incus_storage_volume" "nextcloud_data" { + name = "nextcloud-data" + pool = "default" + project = "agathos" + config = { size = "100GB" } +} +``` + +Mounted at `/mnt/nextcloud` on Rosalind container. This volume stores all Nextcloud user data, including uploaded files, app data, and user-specific configurations. + +## Integration with Other Services + +### HAProxy Routing +**Backend Configuration** (`titania.incus.yml`): +```yaml +- subdomain: "nextcloud" + backend_host: "rosalind.incus" + backend_port: 22083 + health_path: "/status.php" +``` + +### Memcached Integration +- **Host**: localhost:11211 +- **Prefix**: `nc_` +- **Shared Instance**: Rosalind hosts Memcached for all services + +## Troubleshooting + +### Service Status +```bash +ssh rosalind.incus +sudo systemctl status apache2 +``` + +### View Logs +```bash +# Apache access logs +sudo tail -f /var/log/apache2/access.log + +# Apache error logs +sudo tail -f /var/log/apache2/error.log + +# Nextcloud logs (via web UI) +# Settings → Logging +``` + +### OCC Command-Line Tool +```bash +# As www-data user +sudo -u www-data php /var/www/nextcloud/occ + +# Examples: +sudo -u www-data php /var/www/nextcloud/occ status +sudo -u www-data php /var/www/nextcloud/occ config:list +sudo -u www-data php /var/www/nextcloud/occ maintenance:mode --on +``` + +### Database Connection +```bash +psql -h portia.incus -U nextcloud -d nextcloud +``` + +### Check Memcached +```bash +echo "stats" | nc localhost 11211 +``` + +### Verify Storage Volume +```bash +# Reset ownership +sudo chown -R www-data:www-data /var/www/nextcloud +sudo chown -R www-data:www-data /mnt/nextcloud + +# Reset permissions +sudo chmod -R 0750 /var/www/nextcloud +``` + +### Maintenance Mode +```bash +# Enable maintenance 
mode +sudo -u www-data php /var/www/nextcloud/occ maintenance:mode --on + +# Disable maintenance mode +sudo -u www-data php /var/www/nextcloud/occ maintenance:mode --off +``` + +## Updates and Maintenance + +### Updating Nextcloud + +**⚠️ Important**: Always backup before updating! + +```bash +# 1. Enable maintenance mode +sudo -u www-data php /var/www/nextcloud/occ maintenance:mode --on + +# 2. Backup config and database +sudo cp -r /var/www/nextcloud/config /backup/nextcloud-config-$(date +%Y%m%d) +pg_dump -h portia.incus -U nextcloud nextcloud > /backup/nextcloud-db-$(date +%Y%m%d).sql + +# 3. Download new version +wget https://download.nextcloud.com/server/releases/latest.tar.bz2 + +# 4. Extract and replace (preserve config/) +tar -xjf latest.tar.bz2 +sudo rsync -av --delete --exclude config/ nextcloud/ /var/www/nextcloud/ + +# 5. Run upgrade +sudo -u www-data php /var/www/nextcloud/occ upgrade + +# 6. Disable maintenance mode +sudo -u www-data php /var/www/nextcloud/occ maintenance:mode --off +``` + +### Database Maintenance +```bash +# Add missing indices +sudo -u www-data php /var/www/nextcloud/occ db:add-missing-indices + +# Convert to bigint +sudo -u www-data php /var/www/nextcloud/occ db:convert-filecache-bigint +``` + +## Version Information + +- **Installation Method**: Tarball extraction (official releases) +- **Current Version**: Check web UI → Settings → Overview +- **Update Channel**: Stable (latest.tar.bz2) +- **PHP Version**: Installed by apt (Ubuntu repository version) + +## Docker vs Native Comparison + +**Why Native Installation?** + +| Aspect | Native (Chosen) | Docker | +|--------|-----------------|--------| +| **Performance** | Better (no container overhead) | Good | +| **Updates** | Manual tarball extraction | Container image pull | +| **Cron Jobs** | System cron (reliable) | Requires sidecar/exec | +| **App Updates** | Direct via web UI | Limited/complex | +| **Customization** | Full PHP/Apache control | Constrained by image | +| 
**Production Match** | Yes (same pattern) | No | +| **Complexity** | Lower for LAMP stack | Higher for orchestration | + +**Recommendation**: Native installation matches production deployment pattern and avoids Docker-specific limitations with Nextcloud's app ecosystem and cron requirements. + +## References + +- **Official Documentation**: https://docs.nextcloud.com/ +- **Admin Manual**: https://docs.nextcloud.com/server/latest/admin_manual/ +- **Installation Guide**: https://docs.nextcloud.com/server/latest/admin_manual/installation/ +- **OCC Commands**: https://docs.nextcloud.com/server/latest/admin_manual/configuration_server/occ_command.html diff --git a/docs/oauth2_proxy.md b/docs/oauth2_proxy.md new file mode 100644 index 0000000..54e9ee9 --- /dev/null +++ b/docs/oauth2_proxy.md @@ -0,0 +1,314 @@ +# OAuth2-Proxy Authentication Gateway +# Red Panda Approved + +## Overview + +OAuth2-Proxy provides authentication for services that don't natively support SSO/OIDC. +It acts as a reverse proxy that requires users to authenticate via Casdoor before +accessing the upstream service. + +This document describes the generic approach for adding OAuth2-Proxy authentication +to any service in the Agathos infrastructure. + +## Architecture + +``` +┌──────────────┐ ┌───────────────┐ ┌────────────────┐ ┌───────────────┐ +│ Browser │────▶│ HAProxy │────▶│ OAuth2-Proxy │────▶│ Your Service │ +│ │ │ (titania) │ │ (titania) │ │ (any host) │ +└──────────────┘ └───────┬───────┘ └───────┬────────┘ └───────────────┘ + │ │ + │ ┌───────────────▼───────────────┐ + └────▶│ Casdoor │ + │ (OIDC Provider - titania) │ + └───────────────────────────────┘ +``` + +## How It Works + +1. User requests `https://service.ouranos.helu.ca/` +2. HAProxy routes to OAuth2-Proxy (titania:22082) +3. OAuth2-Proxy checks for valid session cookie +4. **No session?** → Redirect to Casdoor login → After login, redirect back with cookie +5. 
**Valid session?** → Forward request to upstream service + +## File Structure + +``` +ansible/oauth2_proxy/ +├── deploy.yml # Main deployment playbook +├── docker-compose.yml.j2 # Docker Compose template +├── oauth2-proxy.cfg.j2 # OAuth2-Proxy configuration +└── stage.yml # Validation/staging playbook +``` + +Monitoring configuration is integrated into the host-specific Alloy config: +- `ansible/alloy/titania/config.alloy.j2` - Contains OAuth2-Proxy log collection and metrics scraping + +## Variable Architecture + +The OAuth2-Proxy template uses **generic variables** (`oauth2_proxy_*`) that are +mapped from **service-specific variables** in host_vars: + +``` +Vault (service-specific) Host Vars (mapping) Template (generic) +──────────────────────── ─────────────────── ────────────────── +vault__oauth2_* ──► _oauth2_* ──► oauth2_proxy_* +``` + +This allows: +- Multiple services to use the same OAuth2-Proxy template +- Service-specific credentials in vault +- Clear naming conventions + +## Configuration Steps + +### Step 1: Create Casdoor Application + +1. Login to Casdoor at `https://id.ouranos.helu.ca/` (Casdoor SSO) +2. Navigate to **Applications** → **Add** +3. Configure: + - **Name**: `` (e.g., `searxng`, `jupyter`) + - **Organization**: `heluca` (or your organization) + - **Redirect URLs**: `https://.ouranos.helu.ca/oauth2/callback` + - **Grant Types**: `authorization_code`, `refresh_token` +4. Save and note the **Client ID** and **Client Secret** + +### Step 2: Add Vault Secrets + +```bash +ansible-vault edit ansible/inventory/group_vars/all/vault.yml +``` + +Add service-specific credentials: +```yaml +# SearXNG OAuth2 credentials +vault_searxng_oauth2_client_id: "abc123..." +vault_searxng_oauth2_client_secret: "secret..." 
+vault_searxng_oauth2_cookie_secret: "" +``` + +Generate cookie secret: +```bash +openssl rand -base64 32 +``` + +### Step 3: Configure Host Variables + +Add to the host that will run OAuth2-Proxy (typically `titania.incus.yml`): + +```yaml +# ============================================================================= +# OAuth2 Configuration (Service-Specific) +# ============================================================================= +_oauth2_client_id: "{{ vault__oauth2_client_id }}" +_oauth2_client_secret: "{{ vault__oauth2_client_secret }}" +_oauth2_cookie_secret: "{{ vault__oauth2_cookie_secret }}" + +# ============================================================================= +# OAuth2-Proxy Configuration (Generic Template Variables) +# ============================================================================= +oauth2_proxy_user: oauth2proxy +oauth2_proxy_group: oauth2proxy +oauth2_proxy_uid: 802 +oauth2_proxy_gid: 802 +oauth2_proxy_directory: /srv/oauth2-proxy +oauth2_proxy_port: 22082 + +# OIDC Configuration +oauth2_proxy_oidc_issuer_url: "http://titania.incus:{{ casdoor_port }}" + +# Map service-specific credentials to generic template variables +oauth2_proxy_client_id: "{{ _oauth2_client_id }}" +oauth2_proxy_client_secret: "{{ _oauth2_client_secret }}" +oauth2_proxy_cookie_secret: "{{ _oauth2_cookie_secret }}" + +# Service-specific URLs +oauth2_proxy_redirect_url: "https://.{{ haproxy_domain }}/oauth2/callback" +oauth2_proxy_upstream_url: "http://:" +oauth2_proxy_cookie_domain: "{{ haproxy_domain }}" + +# Access Control +oauth2_proxy_email_domains: + - "*" # Or restrict to specific domains + +# Session Configuration +oauth2_proxy_cookie_expire: "168h" +oauth2_proxy_cookie_refresh: "1h" + +# SSL Verification +oauth2_proxy_skip_ssl_verify: true # Set false for production +``` + +### Step 4: Update HAProxy Backend + +Change the service backend to route through OAuth2-Proxy: + +```yaml +haproxy_backends: + - subdomain: "" + backend_host: 
"titania.incus" # OAuth2-Proxy host + backend_port: 22082 # OAuth2-Proxy port + health_path: "/ping" # OAuth2-Proxy health endpoint +``` + +### Step 5: Deploy + +```bash +cd ansible + +# Validate configuration +ansible-playbook oauth2_proxy/stage.yml + +# Deploy OAuth2-Proxy +ansible-playbook oauth2_proxy/deploy.yml + +# Update HAProxy routing +ansible-playbook haproxy/deploy.yml +``` + +## Complete Example: SearXNG + +### Vault Variables +```yaml +vault_searxng_oauth2_client_id: "searxng-client-id-from-casdoor" +vault_searxng_oauth2_client_secret: "searxng-client-secret-from-casdoor" +vault_searxng_oauth2_cookie_secret: "ABCdef123..." +``` + +### Host Variables (titania.incus.yml) +```yaml +# SearXNG OAuth2 (service-specific) +searxng_oauth2_client_id: "{{ vault_searxng_oauth2_client_id }}" +searxng_oauth2_client_secret: "{{ vault_searxng_oauth2_client_secret }}" +searxng_oauth2_cookie_secret: "{{ vault_searxng_oauth2_cookie_secret }}" + +# OAuth2-Proxy (generic mapping) +oauth2_proxy_client_id: "{{ searxng_oauth2_client_id }}" +oauth2_proxy_client_secret: "{{ searxng_oauth2_client_secret }}" +oauth2_proxy_cookie_secret: "{{ searxng_oauth2_cookie_secret }}" +oauth2_proxy_redirect_url: "https://searxng.{{ haproxy_domain }}/oauth2/callback" +oauth2_proxy_upstream_url: "http://oberon.incus:25599" +``` + +### HAProxy Backend +```yaml +- subdomain: "searxng" + backend_host: "titania.incus" + backend_port: 22082 + health_path: "/ping" +``` + +## Adding a Second Service (e.g., Jupyter) + +When adding authentication to another service, you would: + +1. Create a new Casdoor application for Jupyter +2. Add vault variables: + ```yaml + vault_jupyter_oauth2_client_id: "..." + vault_jupyter_oauth2_client_secret: "..." + vault_jupyter_oauth2_cookie_secret: "..." + ``` +3. 
Either: + - **Option A**: Deploy a second OAuth2-Proxy instance on a different port + - **Option B**: Configure the same OAuth2-Proxy with multiple upstreams (more complex) + +For multiple services, **Option A** is recommended for isolation and simplicity. + +## Monitoring + +OAuth2-Proxy monitoring is handled by Grafana Alloy, which runs on each host. + +### Architecture + +``` +OAuth2-Proxy ─────► Grafana Alloy ─────► Prometheus (prospero) +(titania) (local agent) (remote_write) + │ + └─────────────► Loki (prospero) + (log forwarding) +``` + +### Metrics (via Prometheus) + +Alloy scrapes OAuth2-Proxy metrics at `/metrics` and forwards them to Prometheus: +- `oauth2_proxy_requests_total` - Total requests processed +- `oauth2_proxy_errors_total` - Total errors +- `oauth2_proxy_upstream_latency_seconds` - Latency to upstream service + +Configuration in `ansible/alloy/titania/config.alloy.j2`: +```alloy +prometheus.scrape "oauth2_proxy" { + targets = [{"__address__" = "127.0.0.1:{{oauth2_proxy_port}}"}] + scrape_interval = "30s" + forward_to = [prometheus.remote_write.default.receiver] + job_name = "oauth2-proxy" +} +``` + +### Logs (via Loki) + +OAuth2-Proxy logs are collected via syslog and forwarded to Loki: +```alloy +loki.source.syslog "oauth2_proxy_logs" { + listener { + address = "127.0.0.1:{{oauth2_proxy_syslog_port}}" + protocol = "tcp" + labels = { job = "oauth2-proxy", hostname = "{{inventory_hostname}}" } + } + forward_to = [loki.write.default.receiver] +} +``` + +### Deploy Alloy After Changes + +If you update the Alloy configuration: +```bash +ansible-playbook alloy/deploy.yml --limit titania.incus +``` + +## Security Considerations + +1. **Cookie Security**: + - `cookie_secure = true` - HTTPS only + - `cookie_httponly = true` - No JavaScript access + - `cookie_samesite = "lax"` - CSRF protection + +2. 
**Access Control**: + - Use `oauth2_proxy_email_domains` to restrict by email domain + - Use `oauth2_proxy_allowed_groups` to restrict by Casdoor groups + +3. **SSL Verification**: + - Set `oauth2_proxy_skip_ssl_verify: false` in production + - Ensure Casdoor has valid SSL certificates + +## Troubleshooting + +### Check OAuth2-Proxy Logs +```bash +ssh titania.incus +docker logs oauth2-proxy +``` + +### Test OIDC Discovery +```bash +curl http://titania.incus:22081/.well-known/openid-configuration +``` + +### Verify Cookie Domain +Ensure `oauth2_proxy_cookie_domain` matches your HAProxy domain. + +### Common Issues + +| Issue | Cause | Solution | +|-------|-------|----------| +| Redirect loop | Cookie domain mismatch | Check `oauth2_proxy_cookie_domain` | +| 403 Forbidden | Email domain not allowed | Update `oauth2_proxy_email_domains` | +| OIDC discovery failed | Casdoor not accessible | Check network/firewall | +| Invalid redirect URI | Mismatch in Casdoor app | Verify redirect URL in Casdoor | + +## Related Documentation + +- [SearXNG Authentication](services/searxng-auth.md) - Specific implementation details +- [Casdoor Documentation](casdoor.md) - Identity provider configuration \ No newline at end of file diff --git a/docs/openwebui.md b/docs/openwebui.md new file mode 100644 index 0000000..2c94ad4 --- /dev/null +++ b/docs/openwebui.md @@ -0,0 +1,331 @@ +# Open WebUI + +Open WebUI is an extensible, self-hosted AI interface that provides a web-based chat experience for interacting with LLMs. This document covers deployment, Casdoor SSO integration, and configuration. 
+ +## Architecture + +### Components + +| Component | Location | Purpose | +|-----------|----------|---------| +| Open WebUI | Native on Oberon | AI chat interface | +| PostgreSQL | Portia | Database with pgvector extension | +| Casdoor | Titania | SSO identity provider | +| HAProxy | Ariel | TLS termination, routing | + +### Network Diagram + +``` +┌────────────────────────────────────────────────────────────────────┐ +│ External Access │ +│ https://openwebui.ouranos.helu.ca │ +└───────────────────────────────┬────────────────────────────────────┘ + │ + ▼ +┌────────────────────────────────────────────────────────────────────┐ +│ ariel.incus (HAProxy) │ +│ TLS termination → proxy to oberon.incus:25588 │ +└───────────────────────────────┬────────────────────────────────────┘ + │ + ▼ +┌────────────────────────────────────────────────────────────────────┐ +│ oberon.incus │ +│ │ +│ ┌────────────────────────────────────────────────────────────────┐ │ +│ │ Open WebUI (systemd) │ │ +│ │ - Python 3.12 virtual environment │ │ +│ │ - Port 25588 │ │ +│ │ - OAuth/OIDC via Casdoor │ │ +│ └────────────────────────────────────────────────────────────────┘ │ +│ │ │ │ +│ │ PostgreSQL │ OIDC │ +│ ▼ ▼ │ +│ portia.incus:5432 titania.incus:22081 │ +│ (openwebui database) (Casdoor SSO) │ +└────────────────────────────────────────────────────────────────────┘ +``` + +### Network Ports + +| Port | Service | Access | +|------|---------|--------| +| 25588 | Open WebUI HTTP | Via HAProxy | +| 5432 | PostgreSQL | Internal (Portia) | +| 22081 | Casdoor | Internal (Titania) | + +## Casdoor SSO Integration + +Open WebUI uses native OAuth/OIDC to authenticate against Casdoor. Local signup is disabled—all users must authenticate through Casdoor. + +### How It Works + +1. User visits `https://openwebui.ouranos.helu.ca` +2. Open WebUI redirects to Casdoor login page +3. User authenticates with Casdoor credentials +4. Casdoor redirects back with authorization code +5. 
Open WebUI exchanges code for tokens and creates/updates user session +6. User email from Casdoor becomes their Open WebUI identity + +### Configuration + +OAuth settings are defined in host variables and rendered into the environment file: + +**Host Variables** (`inventory/host_vars/oberon.incus.yml`): +```yaml +# OAuth/OIDC Configuration (Casdoor SSO) +openwebui_oauth_client_id: "{{ vault_openwebui_oauth_client_id }}" +openwebui_oauth_client_secret: "{{ vault_openwebui_oauth_client_secret }}" +openwebui_oauth_provider_name: "Casdoor" +openwebui_oauth_provider_url: "https://id.ouranos.helu.ca/.well-known/openid-configuration" + +# Disable local authentication +openwebui_enable_signup: false +openwebui_enable_email_login: false +``` + +**Environment Variables** (rendered from `openwebui.env.j2`): +```bash +ENABLE_SIGNUP=false +ENABLE_EMAIL_LOGIN=false +ENABLE_OAUTH_SIGNUP=true +OAUTH_CLIENT_ID= +OAUTH_CLIENT_SECRET= +OAUTH_PROVIDER_NAME=Casdoor +OPENID_PROVIDER_URL=https://id.ouranos.helu.ca/.well-known/openid-configuration +``` + +### Casdoor Application + +The `app-openwebui` application is defined in `ansible/casdoor/init_data.json.j2`: + +| Setting | Value | +|---------|-------| +| Name | `app-openwebui` | +| Display Name | Open WebUI | +| Redirect URI | `https://openwebui.ouranos.helu.ca/oauth/oidc/callback` | +| Grant Types | `authorization_code`, `refresh_token` | +| Token Format | JWT | +| Token Expiry | 168 hours (7 days) | + +## Prerequisites + +### 1. PostgreSQL Database + +The `openwebui` database must exist on Portia with the `pgvector` extension: + +```bash +ansible-playbook postgresql/deploy.yml +``` + +### 2. Casdoor SSO + +Casdoor must be deployed and the `app-openwebui` application configured: + +```bash +ansible-playbook casdoor/deploy.yml +``` + +### 3. 
Vault Secrets + +Add to `ansible/inventory/group_vars/all/vault.yml`: + +```yaml +# OpenWebUI +vault_openwebui_secret_key: "" +vault_openwebui_db_password: "" +vault_openwebui_oauth_client_id: "" +vault_openwebui_oauth_client_secret: "" + +# API Keys (optional) +vault_openwebui_openai_api_key: "" +vault_openwebui_anthropic_api_key: "" +vault_openwebui_groq_api_key: "" +vault_openwebui_mistral_api_key: "" +``` + +Generate secrets: +```bash +# Secret key +openssl rand -hex 32 + +# Database password +openssl rand -base64 24 +``` + +## Deployment + +### Fresh Installation + +```bash +cd ansible + +# 1. Ensure PostgreSQL is deployed +ansible-playbook postgresql/deploy.yml + +# 2. Deploy Casdoor (if not already deployed) +ansible-playbook casdoor/deploy.yml + +# 3. Get OAuth credentials from Casdoor admin UI +# - Navigate to https://id.ouranos.helu.ca +# - Go to Applications → app-openwebui +# - Copy Client ID and Client Secret +# - Update vault.yml with these values + +# 4. Deploy Open WebUI +ansible-playbook openwebui/deploy.yml +``` + +### Verify Deployment + +```bash +# Check service status +ssh oberon.incus "sudo systemctl status openwebui" + +# View logs +ssh oberon.incus "sudo journalctl -u openwebui -f" + +# Test health endpoint +curl -s http://oberon.incus:25588/health + +# Test via HAProxy +curl -s https://openwebui.ouranos.helu.ca/health +``` + +### Redeployment + +To redeploy Open WebUI (preserves database): + +```bash +ansible-playbook openwebui/deploy.yml +``` + +## Configuration Reference + +### Host Variables + +Located in `ansible/inventory/host_vars/oberon.incus.yml`: + +```yaml +# Service account +openwebui_user: openwebui +openwebui_group: openwebui +openwebui_directory: /srv/openwebui +openwebui_port: 25588 +openwebui_host: puck.incus + +# Database +openwebui_db_host: portia.incus +openwebui_db_port: 5432 +openwebui_db_name: openwebui +openwebui_db_user: openwebui +openwebui_db_password: "{{ vault_openwebui_db_password }}" + +# Authentication (SSO 
only) +openwebui_enable_signup: false +openwebui_enable_email_login: false + +# OAuth/OIDC (Casdoor) +openwebui_oauth_client_id: "{{ vault_openwebui_oauth_client_id }}" +openwebui_oauth_client_secret: "{{ vault_openwebui_oauth_client_secret }}" +openwebui_oauth_provider_name: "Casdoor" +openwebui_oauth_provider_url: "https://id.ouranos.helu.ca/.well-known/openid-configuration" + +# API Keys +openwebui_openai_api_key: "{{ vault_openwebui_openai_api_key }}" +openwebui_anthropic_api_key: "{{ vault_openwebui_anthropic_api_key }}" +openwebui_groq_api_key: "{{ vault_openwebui_groq_api_key }}" +openwebui_mistral_api_key: "{{ vault_openwebui_mistral_api_key }}" +``` + +### Data Persistence + +Open WebUI data locations: +``` +/srv/openwebui/ +├── .venv/ # Python virtual environment +├── .env # Environment configuration +└── data/ # User uploads, cache +``` + +Database (on Portia): +``` +PostgreSQL: openwebui database with pgvector extension +``` + +## User Management + +### First-Time Setup + +After deployment, the first user to authenticate via Casdoor becomes an admin. Subsequent users get standard user roles. + +### Promoting Users to Admin + +1. Log in as an existing admin +2. Navigate to Admin Panel → Users +3. 
Select the user and change their role to Admin + +### Existing Users Migration + +If users were created before SSO was enabled: +- Users with matching email addresses will be linked automatically +- Users without matching emails must be recreated through Casdoor + +## Troubleshooting + +### Service Issues + +```bash +# Check service status +ssh oberon.incus "sudo systemctl status openwebui" + +# View logs +ssh oberon.incus "sudo journalctl -u openwebui -n 100" + +# Restart service +ssh oberon.incus "sudo systemctl restart openwebui" +``` + +### OAuth/OIDC Issues + +```bash +# Verify Casdoor is accessible +curl -s https://id.ouranos.helu.ca/.well-known/openid-configuration | jq + +# Check redirect URI matches +# Must be: https://openwebui.ouranos.helu.ca/oauth/oidc/callback + +# Verify client credentials in environment +ssh oberon.incus "sudo grep OAUTH /srv/openwebui/.env" +``` + +### Database Issues + +```bash +# Test database connection +ssh oberon.incus "PGPASSWORD= psql -h portia.incus -U openwebui -d openwebui -c '\dt'" + +# Check pgvector extension +ssh portia.incus "sudo -u postgres psql -d openwebui -c '\dx'" +``` + +### Common Errors + +| Error | Cause | Solution | +|-------|-------|----------| +| "Invalid redirect_uri" | Mismatch between Casdoor config and Open WebUI | Verify redirect URI in Casdoor matches exactly | +| "Invalid client credentials" | Wrong client ID/secret | Update vault with correct values from Casdoor | +| "OIDC discovery failed" | Casdoor unreachable | Check Casdoor is running on Titania | +| "Database connection failed" | PostgreSQL unreachable | Verify PostgreSQL on Portia, check network | + +## Security Considerations + +1. **SSO-only authentication** - Local signup disabled, all users authenticate through Casdoor +2. **API keys in vault** - All API keys stored encrypted in Ansible vault +3. **Database credentials** - Stored in vault, rendered to environment file with restrictive permissions (0600) +4. 
**Session security** - JWT tokens with 7-day expiry, managed by Casdoor + +## Related Documentation + +- [Casdoor SSO](services/casdoor.md) - Identity provider configuration +- [PostgreSQL](../ansible.md) - Database deployment +- [HAProxy](../terraform.md) - TLS termination and routing \ No newline at end of file diff --git a/docs/ouranos.html b/docs/ouranos.html new file mode 100644 index 0000000..65ad79c --- /dev/null +++ b/docs/ouranos.html @@ -0,0 +1,808 @@ + + + + + + Ouranos Lab - Red Panda Approved Infrastructure + + + + + + + +
+ + + + + +
+
+
+
+

Ouranos Lab

+

Red Panda Approved™ Infrastructure as Code

+

10 Incus containers named after moons of Uranus, provisioned with Terraform and configured with Ansible. Accessible at ouranos.helu.ca

+
+
+
+ Red Panda Approved™ +
+
+
+
+
+ + +
+

Project Overview

+ +
+

Ouranos is a comprehensive infrastructure-as-code project that provisions and manages a complete development sandbox environment. All infrastructure and configuration is tracked in Git for reproducible deployments.

+

DNS Domain: Incus resolves containers via the .incus suffix (e.g., oberon.incus). IPv4 addresses are dynamically assigned — always use DNS names, never hardcode IPs.

+
+ +
+
+
+
+
Terraform
+
+
+

Provisions the Uranian host containers with:

+
    +
  • 10 specialised Incus containers (LXC)
  • +
  • DNS-resolved networking (.incus domain)
  • +
  • Security policies and nested Docker support
  • +
  • Port proxy devices and resource dependencies
  • +
  • Incus S3 buckets for object storage (Casdoor, LobeChat)
  • +
+
+ +
+
+
+
+
+
Ansible
+
+
+

Deploys and configures all services:

+
    +
  • Docker engine on nested-capable hosts
  • +
  • Databases: PostgreSQL (Portia), Neo4j (Ariel)
  • +
  • Observability: Prometheus, Loki, Grafana (Prospero)
  • +
  • Application runtimes and LLM proxies
  • +
  • HAProxy TLS termination and Casdoor SSO (Titania)
  • +
+
+ +
+
+
+
+ + +
+

Uranian Host Architecture

+ +
+
+
Hosts Summary
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameRoleKey ServicesNesting
arielgraph_databaseNeo4j 5.26.0
calibanagent_automationAgent S MCP Server, Kernos, MATE Desktop, GPU
mirandamcp_docker_hostMCPO, Grafana MCP, Gitea MCP, Neo4j MCP, Argos MCP
oberoncontainer_orchestrationMCP Switchboard, RabbitMQ, Open WebUI, SearXNG, Home Assistant, smtp4dev
portiadatabasePostgreSQL 16
prosperoobservabilityPrometheus, Loki, Grafana, PgAdmin, AlertManager
puckapplication_runtimeJupyterLab, Gitea Runner, Django apps (6×)
rosalindcollaborationGitea, LobeChat, Nextcloud, AnythingLLM
sycoraxlanguage_modelsArke LLM Proxy
titaniaproxy_ssoHAProxy, Casdoor SSO, certbot
+
+
+
+ + +
+
+
+
+
oberon — Container Orchestration
+
+
+

King of the Fairies orchestrating containers and managing MCP infrastructure.

+
    +
  • Docker engine
  • +
  • MCP Switchboard (port 22785) — Django app routing MCP tool calls
  • +
  • RabbitMQ message queue
  • +
  • Open WebUI LLM interface (port 22088, PostgreSQL backend on Portia)
  • +
  • SearXNG privacy search (port 22073, behind OAuth2-Proxy)
  • +
  • Home Assistant (port 8123)
  • +
  • smtp4dev SMTP test server (port 22025)
  • +
+
+
+
+ +
+
+
+
portia — Relational Database
+
+
+

Intelligent and resourceful — embodying the reliability of relational databases.

+
    +
  • PostgreSQL 17 (port 5432)
  • +
  • Databases: arke, anythingllm, gitea, hass, lobechat, mcp_switchboard, nextcloud, openwebui, spelunker
  • +
+
+
+
+ +
+
+
+
ariel — Graph Database
+
+
+

Air spirit — ethereal, interconnected nature mirroring graph relationships.

+
    +
  • Neo4j 5.26.0 (Docker)
  • +
  • HTTP API: port 25554
  • +
  • Bolt: port 7687
  • +
+
+
+
+ +
+
+
+
puck — Application Runtime
+
+
+

Shape-shifting trickster embodying Python's versatility.

+
    +
  • Docker engine
  • +
  • JupyterLab (port 22071 via OAuth2-Proxy)
  • +
  • Gitea Runner CI/CD agent
  • +
  • Django apps: Angelia (22281), Athena (22481), Kairos (22581), Icarlos (22681), Spelunker (22881), Peitho (22981)
  • +
+
+
+
+ +
+
+
+
prospero — Observability Stack
+
+
+

Master magician observing all events.

+
    +
  • PPLG stack via Docker Compose: Prometheus, Loki, Grafana, PgAdmin
  • +
  • Internal HAProxy with OAuth2-Proxy for all dashboards
  • +
  • AlertManager with Pushover notifications
  • +
  • Prometheus node-exporter metrics from all hosts
  • +
  • Loki log aggregation via Alloy (all hosts)
  • +
  • Grafana with Casdoor SSO integration
  • +
+
+
+
+ +
+
+
+
miranda — MCP Docker Host
+
+
+

Curious bridge between worlds — hosting MCP server containers.

+
    +
  • Docker engine (API on port 2375 for MCP Switchboard)
  • +
  • MCPO OpenAI-compatible MCP proxy
  • +
  • Grafana MCP Server — Grafana API integration (port 25533)
  • +
  • Gitea MCP Server (port 25535)
  • +
  • Neo4j MCP Server
  • +
  • Argos MCP Server — web search via SearXNG (port 25534)
  • +
+
+
+
+ +
+
+
+
sycorax — Language Models
+
+
+

Original magical power wielding language magic.

+
    +
  • Arke LLM API Proxy (port 25540)
  • +
  • Multi-provider support (OpenAI, Anthropic, etc.)
  • +
  • Session management with Memcached
  • +
  • Database backend on Portia
  • +
+
+
+
+ +
+
+
+
caliban — Agent Automation
+
+
+

Autonomous computer agent learning through environmental interaction.

+
    +
  • Docker engine
  • +
  • Agent S MCP Server (MATE desktop, AT-SPI automation)
  • +
  • Kernos MCP Shell Server (port 22021)
  • +
  • GPU passthrough for vision tasks
  • +
  • RDP access (port 25521)
  • +
+
+
+
+ +
+
+
+
rosalind — Collaboration Services
+
+
+

Witty and resourceful moon for PHP, Go, and Node.js runtimes.

+
    +
  • Gitea self-hosted Git (port 22082, SSH on 22022)
  • +
  • LobeChat AI chat interface (port 22081)
  • +
  • Nextcloud file sharing and collaboration (port 22083)
  • +
  • AnythingLLM document AI workspace (port 22084)
  • +
  • Nextcloud data on dedicated Incus storage volume
  • +
+
+
+
+ +
+
+
+
titania — Proxy & SSO Services
+
+
+

Queen of the Fairies managing access control and authentication.

+
    +
  • HAProxy 3.x with TLS termination (port 443)
  • +
  • Let's Encrypt wildcard certificate via certbot DNS-01 (Namecheap)
  • +
  • HTTP to HTTPS redirect (port 80)
  • +
  • Gitea SSH proxy (port 22022)
  • +
  • Casdoor SSO (port 22081, local PostgreSQL)
  • +
  • Prometheus metrics at :8404/metrics
  • +
+
+
+
+
+
+ + +
+

External Access via HAProxy

+ +
+

Titania provides TLS termination and reverse proxy for all services. Base domain: ouranos.helu.ca — HTTPS port 443, HTTP port 80 (redirects to HTTPS). Certificate: Let's Encrypt wildcard via certbot DNS-01 (Namecheap).

+
+ +
+
+
Route Table
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
SubdomainBackendService
ouranos.helu.ca rootpuck.incus:22281Angelia (Django)
alertmanager.ouranos.helu.caprospero.incus:443 SSLAlertManager
angelia.ouranos.helu.capuck.incus:22281Angelia (Django)
anythingllm.ouranos.helu.carosalind.incus:22084AnythingLLM
arke.ouranos.helu.casycorax.incus:25540Arke LLM Proxy
athena.ouranos.helu.capuck.incus:22481Athena (Django)
gitea.ouranos.helu.carosalind.incus:22082Gitea
grafana.ouranos.helu.caprospero.incus:443 SSLGrafana
hass.ouranos.helu.caoberon.incus:8123Home Assistant
id.ouranos.helu.catitania.incus:22081Casdoor SSO
icarlos.ouranos.helu.capuck.incus:22681Icarlos (Django)
jupyterlab.ouranos.helu.capuck.incus:22071JupyterLab OAuth2-Proxy
kairos.ouranos.helu.capuck.incus:22581Kairos (Django)
lobechat.ouranos.helu.carosalind.incus:22081LobeChat
loki.ouranos.helu.caprospero.incus:443 SSLLoki
mcp-switchboard.ouranos.helu.caoberon.incus:22785MCP Switchboard
nextcloud.ouranos.helu.carosalind.incus:22083Nextcloud
openwebui.ouranos.helu.caoberon.incus:22088Open WebUI
peitho.ouranos.helu.capuck.incus:22981Peitho (Django)
pgadmin.ouranos.helu.caprospero.incus:443 SSLPgAdmin 4
prometheus.ouranos.helu.caprospero.incus:443 SSLPrometheus
searxng.ouranos.helu.caoberon.incus:22073SearXNG OAuth2-Proxy
smtp4dev.ouranos.helu.caoberon.incus:22085smtp4dev
spelunker.ouranos.helu.capuck.incus:22881Spelunker (Django)
+
+
+
+
+ + +
+

Infrastructure Management

+ +
+
+
+
+
Quick Start
+
+
+
# Provision containers
+cd terraform
+terraform init
+terraform plan
+terraform apply
+
+# Start all containers
+cd ../ansible
+source ~/env/agathos/bin/activate
+ansible-playbook sandbox_up.yml
+
+# Deploy all services
+ansible-playbook site.yml
+
+# Stop all containers
+ansible-playbook sandbox_down.yml
+
+
+
+
+
+
+
Vault Management
+
+
+
# Edit secrets
+ansible-vault edit \
+  inventory/group_vars/all/vault.yml
+
+# View secrets
+ansible-vault view \
+  inventory/group_vars/all/vault.yml
+
+# Encrypt a new file
+ansible-vault encrypt new_secrets.yml
+
+
+
+
+ +
+
+
+
Terraform Workflow
+
    +
  1. Define — Containers, networks, and resources in *.tf files
  2. +
  3. Plan — Review changes with terraform plan
  4. +
  5. Apply — Provision with terraform apply
  6. +
  7. Verify — Check outputs and container status
  8. +
+
+
+
+
+
Ansible Workflow
+
    +
  1. Bootstrap — Update packages, install essentials (apt_update.yml)
  2. +
  3. Agents — Deploy Alloy and Node Exporter on all hosts
  4. +
  5. Services — Configure databases, Docker, applications, observability
  6. +
  7. Verify — Check service health and connectivity
  8. +
+
+
+
+ +
+
S3 Storage Provisioning
+

Terraform provisions Incus S3 buckets for services requiring object storage:

+
+ + + + + + + + +
ServiceHostPurpose
CasdoorTitaniaUser avatars and SSO resource storage
LobeChatRosalindFile uploads and attachments
+
+

S3 credentials are stored as sensitive Terraform outputs and in Ansible Vault with the vault_*_s3_* prefix.

+
+
+ + +
+

Ansible Automation

+ +
+ + +
+

+ +

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + +
PlaybookHost(s)Purpose
apt_update.ymlAllUpdate packages and install essentials
alloy/deploy.ymlAllGrafana Alloy log/metrics collection
prometheus/node_deploy.ymlAllNode Exporter metrics
docker/deploy.ymlOberon, Ariel, Miranda, Puck, Rosalind, Sycorax, Caliban, TitaniaDocker engine
smtp4dev/deploy.ymlOberonSMTP test server
pplg/deploy.ymlProsperoFull observability stack + internal HAProxy + OAuth2-Proxy
postgresql/deploy.ymlPortiaPostgreSQL with all databases
postgresql_ssl/deploy.ymlTitaniaDedicated PostgreSQL for Casdoor
neo4j/deploy.ymlArielNeo4j graph database
searxng/deploy.ymlOberonSearXNG privacy search
haproxy/deploy.ymlTitaniaHAProxy TLS termination and routing
casdoor/deploy.ymlTitaniaCasdoor SSO
mcpo/deploy.ymlMirandaMCPO MCP proxy
openwebui/deploy.ymlOberonOpen WebUI LLM interface
hass/deploy.ymlOberonHome Assistant
gitea/deploy.ymlRosalindGitea self-hosted Git
nextcloud/deploy.ymlRosalindNextcloud collaboration
+
+
+
+
+ + +
+

+ +

+
+
+
+ + + + + + + + + + + + + + + + + + + +
PlaybookHostService
anythingllm/deploy.ymlRosalindAnythingLLM document AI
arke/deploy.ymlSycoraxArke LLM proxy
argos/deploy.ymlMirandaArgos MCP web search server
caliban/deploy.ymlCalibanAgent S MCP Server
certbot/deploy.ymlTitaniaLet's Encrypt certificate renewal
gitea_mcp/deploy.ymlMirandaGitea MCP Server
gitea_runner/deploy.ymlPuckGitea CI/CD runner
grafana_mcp/deploy.ymlMirandaGrafana MCP Server
jupyterlab/deploy.ymlPuckJupyterLab + OAuth2-Proxy
kernos/deploy.ymlCalibanKernos MCP shell server
lobechat/deploy.ymlRosalindLobeChat AI chat
neo4j_mcp/deploy.ymlMirandaNeo4j MCP Server
rabbitmq/deploy.ymlOberonRabbitMQ message queue
+
+
+
+
+ + +
+

+ +

+
+
+
+
+
+
+ +
sandbox_up.yml
+

Start all Uranian host containers

+
+
+
+
+
+
+ +
site.yml
+

Full deployment orchestration

+
+
+
+
+
+
+ +
apt_update.yml
+

Update packages on all hosts

+
+
+
+
+
+
+ +
sandbox_down.yml
+

Gracefully stop all containers

+
+
+
+
+
+
+
+
+
+ + +
+

Data Flow Architecture

+ +
+
+
Observability Pipeline
+
+
+
+flowchart LR + subgraph hosts["All Hosts"] + alloy["Alloy\n(syslog + journal)"] + node_exp["Node Exporter\n(metrics)"] + end + subgraph prospero["Prospero"] + loki["Loki\n(logs)"] + prom["Prometheus\n(metrics)"] + grafana["Grafana\n(dashboards)"] + alert["AlertManager"] + end + pushover["Pushover\n(notifications)"] + alloy -->|"HTTP push"| loki + node_exp -->|"scrape 15s"| prom + loki --> grafana + prom --> grafana + grafana --> alert + alert -->|"webhook"| pushover +
+
+
+ +
+
+
Service Integration Points
+
+
+
+ + + + + + + + + + + + + + +
ConsumerProviderConnection
All LLM appsArke (Sycorax)http://sycorax.incus:25540
Open WebUI, Arke, Gitea, Nextcloud, LobeChatPostgreSQL (Portia)portia.incus:5432
Neo4j MCPNeo4j (Ariel)ariel.incus:7687 (Bolt)
MCP SwitchboardDocker API (Miranda)tcp://miranda.incus:2375
MCP Switchboard, Kairos, SpelunkerRabbitMQ (Oberon)oberon.incus:5672
All apps (SMTP)smtp4dev (Oberon)oberon.incus:22025
All hosts (logs)Loki (Prospero)http://prospero.incus:3100
All hosts (metrics)Prometheus (Prospero)http://prospero.incus:9090
+
+
+
+
+ + +
+

Important Notes

+ +
+
Alloy Host Variables Required
+

Every host with alloy in its services list must define alloy_log_level in inventory/host_vars/<host>.incus.yml. The playbook will fail with an undefined variable error if this is missing.

+
+ +
+
Alloy Syslog Listeners Required for Docker Services
+

Any Docker Compose service using the syslog logging driver must have a corresponding loki.source.syslog listener in the host's Alloy config template (ansible/alloy/<hostname>/config.alloy.j2). Missing listeners cause Docker containers to fail on start because the syslog driver cannot connect to its configured port.

+
+ +
+
Local Terraform State
+

This project uses local Terraform state (no remote backend). Do not run terraform apply from multiple machines simultaneously.

+
+ +
+
Nested Docker
+

Docker runs inside Incus containers (nested), requiring security.nesting = true and lxc.apparmor.profile=unconfined AppArmor override on all Docker-enabled hosts.

+
+ +
+
Deployment Order
+

Prospero (observability) must be fully deployed before other hosts, as Alloy on every host pushes logs and metrics to prospero.incus. Run pplg/deploy.yml before site.yml on a fresh environment.

+
+
+ + +
+
+

Built with love and approved by red pandas

+ Ouranos Lab — ouranos.helu.ca — Infrastructure as Code for Development Excellence +
+
+ + + + +
+ + + + + + + + + + diff --git a/docs/ouranos.md b/docs/ouranos.md new file mode 100644 index 0000000..66e6583 --- /dev/null +++ b/docs/ouranos.md @@ -0,0 +1,333 @@ +# Ouranos Lab + +Infrastructure-as-Code project managing the **Ouranos Lab** — a development sandbox at [ouranos.helu.ca](https://ouranos.helu.ca). Uses **Terraform** for container provisioning and **Ansible** for configuration management, themed around the moons of Uranus. + +--- + +## Project Overview + +| Component | Purpose | +|-----------|---------| +| **Terraform** | Provisions 10 specialised Incus containers (LXC) with DNS-resolved networking, security policies, and resource dependencies | +| **Ansible** | Deploys Docker, databases (PostgreSQL, Neo4j), observability stack (Prometheus, Grafana, Loki), and application runtimes across all hosts | + +> **DNS Domain**: Incus resolves containers via the `.incus` domain suffix (e.g., `oberon.incus`, `portia.incus`). IPv4 addresses are dynamically assigned — always use DNS names, never hardcode IPs. + +--- + +## Uranian Host Architecture + +All containers are named after moons of Uranus and resolved via the `.incus` DNS suffix. 
+ +| Name | Role | Description | Nesting | +|------|------|-------------|---------| +| **ariel** | graph_database | Neo4j — Ethereal graph connections | ✔ | +| **caliban** | agent_automation | Agent S MCP Server with MATE Desktop | ✔ | +| **miranda** | mcp_docker_host | Dedicated Docker Host for MCP Servers | ✔ | +| **oberon** | container_orchestration | Docker Host — MCP Switchboard, RabbitMQ, Open WebUI | ✔ | +| **portia** | database | PostgreSQL — Relational database host | ❌ | +| **prospero** | observability | PPLG stack — Prometheus, Grafana, Loki, PgAdmin | ❌ | +| **puck** | application_runtime | Python App Host — JupyterLab, Django apps, Gitea Runner | ✔ | +| **rosalind** | collaboration | Gitea, LobeChat, Nextcloud, AnythingLLM | ✔ | +| **sycorax** | language_models | Arke LLM Proxy | ✔ | +| **titania** | proxy_sso | HAProxy TLS termination + Casdoor SSO | ✔ | + +### oberon — Container Orchestration + +King of the Fairies orchestrating containers and managing MCP infrastructure. + +- Docker engine +- MCP Switchboard (port 22785) — Django app routing MCP tool calls +- RabbitMQ message queue +- Open WebUI LLM interface (port 22088, PostgreSQL backend on Portia) +- SearXNG privacy search (port 22083, behind OAuth2-Proxy) +- smtp4dev SMTP test server (port 22025) + +### portia — Relational Database + +Intelligent and resourceful — the reliability of relational databases. + +- PostgreSQL 17 (port 5432) +- Databases: `arke`, `anythingllm`, `gitea`, `hass`, `lobechat`, `mcp_switchboard`, `nextcloud`, `openwebui`, `spelunker` + +### ariel — Graph Database + +Air spirit — ethereal, interconnected nature mirroring graph relationships. + +- Neo4j 5.26.0 (Docker) +- HTTP API: port 25584 +- Bolt: port 25554 + +### puck — Application Runtime + +Shape-shifting trickster embodying Python's versatility. 
+ +- Docker engine +- JupyterLab (port 22071 via OAuth2-Proxy) +- Gitea Runner (CI/CD agent) +- Home Assistant (port 8123) +- Django applications: Angelia (22281), Athena (22481), Kairos (22581), Icarlos (22681), Spelunker (22881), Peitho (22981) + +### prospero — Observability Stack + +Master magician observing all events. + +- PPLG stack via Docker Compose: Prometheus, Loki, Grafana, PgAdmin +- Internal HAProxy with OAuth2-Proxy for all dashboards +- AlertManager with Pushover notifications +- Prometheus metrics collection (`node-exporter`, HAProxy, Loki) +- Loki log aggregation via Alloy (all hosts) +- Grafana dashboard suite with Casdoor SSO integration + +### miranda — MCP Docker Host + +Curious bridge between worlds — hosting MCP server containers. + +- Docker engine (API exposed on port 2375 for MCP Switchboard) +- MCPO OpenAI-compatible MCP proxy +- Grafana MCP Server (port 25533) +- Gitea MCP Server (port 25535) +- Neo4j MCP Server +- Argos MCP Server — web search via SearXNG (port 25534) + +### sycorax — Language Models + +Original magical power wielding language magic. + +- Arke LLM API Proxy (port 25540) +- Multi-provider support (OpenAI, Anthropic, etc.) +- Session management with Memcached +- Database backend on Portia + +### caliban — Agent Automation + +Autonomous computer agent learning through environmental interaction. + +- Docker engine +- Agent S MCP Server (MATE desktop, AT-SPI automation) +- Kernos MCP Shell Server (port 22021) +- GPU passthrough for vision tasks +- RDP access (port 25521) + +### rosalind — Collaboration Services + +Witty and resourceful moon for PHP, Go, and Node.js runtimes. 
+ +- Gitea self-hosted Git (port 22082, SSH on 22022) +- LobeChat AI chat interface (port 22081) +- Nextcloud file sharing and collaboration (port 22083) +- AnythingLLM document AI workspace (port 22084) +- Nextcloud data on dedicated Incus storage volume + +### titania — Proxy & SSO Services + +Queen of the Fairies managing access control and authentication. + +- HAProxy 3.x with TLS termination (port 443) +- Let's Encrypt wildcard certificate via certbot DNS-01 (Namecheap) +- HTTP to HTTPS redirect (port 80) +- Gitea SSH proxy (port 22022) +- Casdoor SSO (port 22081, local PostgreSQL) +- Prometheus metrics at `:8404/metrics` + +--- + +## External Access via HAProxy + +Titania provides TLS termination and reverse proxy for all services. + +- **Base domain**: `ouranos.helu.ca` +- **HTTPS**: port 443 (standard) +- **HTTP**: port 80 (redirects to HTTPS) +- **Certificate**: Let's Encrypt wildcard via certbot DNS-01 + +### Route Table + +| Subdomain | Backend | Service | +|-----------|---------|---------| +| `ouranos.helu.ca` (root) | puck.incus:22281 | Angelia (Django) | +| `alertmanager.ouranos.helu.ca` | prospero.incus:443 (SSL) | AlertManager | +| `angelia.ouranos.helu.ca` | puck.incus:22281 | Angelia (Django) | +| `anythingllm.ouranos.helu.ca` | rosalind.incus:22084 | AnythingLLM | +| `arke.ouranos.helu.ca` | sycorax.incus:25540 | Arke LLM Proxy | +| `athena.ouranos.helu.ca` | puck.incus:22481 | Athena (Django) | +| `gitea.ouranos.helu.ca` | rosalind.incus:22082 | Gitea | +| `grafana.ouranos.helu.ca` | prospero.incus:443 (SSL) | Grafana | +| `hass.ouranos.helu.ca` | oberon.incus:8123 | Home Assistant | +| `id.ouranos.helu.ca` | titania.incus:22081 | Casdoor SSO | +| `icarlos.ouranos.helu.ca` | puck.incus:22681 | Icarlos (Django) | +| `jupyterlab.ouranos.helu.ca` | puck.incus:22071 | JupyterLab (OAuth2-Proxy) | +| `kairos.ouranos.helu.ca` | puck.incus:22581 | Kairos (Django) | +| `lobechat.ouranos.helu.ca` | rosalind.incus:22081 | LobeChat | +| 
`loki.ouranos.helu.ca` | prospero.incus:443 (SSL) | Loki | +| `mcp-switchboard.ouranos.helu.ca` | oberon.incus:22785 | MCP Switchboard | +| `nextcloud.ouranos.helu.ca` | rosalind.incus:22083 | Nextcloud | +| `openwebui.ouranos.helu.ca` | oberon.incus:22088 | Open WebUI | +| `peitho.ouranos.helu.ca` | puck.incus:22981 | Peitho (Django) | +| `pgadmin.ouranos.helu.ca` | prospero.incus:443 (SSL) | PgAdmin 4 | +| `prometheus.ouranos.helu.ca` | prospero.incus:443 (SSL) | Prometheus | +| `searxng.ouranos.helu.ca` | oberon.incus:22073 | SearXNG (OAuth2-Proxy) | +| `smtp4dev.ouranos.helu.ca` | oberon.incus:22085 | smtp4dev | +| `spelunker.ouranos.helu.ca` | puck.incus:22881 | Spelunker (Django) | + +--- + +## Infrastructure Management + +### Quick Start + +```bash +# Provision containers +cd terraform +terraform init +terraform plan +terraform apply + +# Start all containers +cd ../ansible +source ~/env/agathos/bin/activate +ansible-playbook sandbox_up.yml + +# Deploy all services +ansible-playbook site.yml + +# Stop all containers +ansible-playbook sandbox_down.yml +``` + +### Terraform Workflow + +1. **Define** — Containers, networks, and resources in `*.tf` files +2. **Plan** — Review changes with `terraform plan` +3. **Apply** — Provision with `terraform apply` +4. **Verify** — Check outputs and container status + +### Ansible Workflow + +1. **Bootstrap** — Update packages, install essentials (`apt_update.yml`) +2. **Agents** — Deploy Alloy (log/metrics) and Node Exporter on all hosts +3. **Services** — Configure databases, Docker, applications, observability +4. 
**Verify** — Check service health and connectivity + +### Vault Management + +```bash +# Edit secrets +ansible-vault edit inventory/group_vars/all/vault.yml + +# View secrets +ansible-vault view inventory/group_vars/all/vault.yml + +# Encrypt a new file +ansible-vault encrypt new_secrets.yml +``` + +--- + +## S3 Storage Provisioning + +Terraform provisions Incus S3 buckets for services requiring object storage: + +| Service | Host | Purpose | +|---------|------|---------| +| **Casdoor** | Titania | User avatars and SSO resource storage | +| **LobeChat** | Rosalind | File uploads and attachments | + +> S3 credentials (access key, secret key, endpoint) are stored as sensitive Terraform outputs and managed in Ansible Vault with the `vault_*_s3_*` prefix. + +--- + +## Ansible Automation + +### Full Deployment (`site.yml`) + +Playbooks run in dependency order: + +| Playbook | Hosts | Purpose | +|----------|-------|---------| +| `apt_update.yml` | All | Update packages and install essentials | +| `alloy/deploy.yml` | All | Grafana Alloy log/metrics collection | +| `prometheus/node_deploy.yml` | All | Node Exporter metrics | +| `docker/deploy.yml` | Oberon, Ariel, Miranda, Puck, Rosalind, Sycorax, Caliban, Titania | Docker engine | +| `smtp4dev/deploy.yml` | Oberon | SMTP test server | +| `pplg/deploy.yml` | Prospero | Full observability stack + HAProxy + OAuth2-Proxy | +| `postgresql/deploy.yml` | Portia | PostgreSQL with all databases | +| `postgresql_ssl/deploy.yml` | Titania | Dedicated PostgreSQL for Casdoor | +| `neo4j/deploy.yml` | Ariel | Neo4j graph database | +| `searxng/deploy.yml` | Oberon | SearXNG privacy search | +| `haproxy/deploy.yml` | Titania | HAProxy TLS termination and routing | +| `casdoor/deploy.yml` | Titania | Casdoor SSO | +| `mcpo/deploy.yml` | Miranda | MCPO MCP proxy | +| `openwebui/deploy.yml` | Oberon | Open WebUI LLM interface | +| `hass/deploy.yml` | Oberon | Home Assistant | +| `gitea/deploy.yml` | Rosalind | Gitea self-hosted Git | +| 
`nextcloud/deploy.yml` | Rosalind | Nextcloud collaboration | + +### Individual Service Deployments + +Services with standalone deploy playbooks (not in `site.yml`): + +| Playbook | Host | Service | +|----------|------|---------| +| `anythingllm/deploy.yml` | Rosalind | AnythingLLM document AI | +| `arke/deploy.yml` | Sycorax | Arke LLM proxy | +| `argos/deploy.yml` | Miranda | Argos MCP web search server | +| `caliban/deploy.yml` | Caliban | Agent S MCP Server | +| `certbot/deploy.yml` | Titania | Let's Encrypt certificate renewal | +| `gitea_mcp/deploy.yml` | Miranda | Gitea MCP Server | +| `gitea_runner/deploy.yml` | Puck | Gitea CI/CD runner | +| `grafana_mcp/deploy.yml` | Miranda | Grafana MCP Server | +| `jupyterlab/deploy.yml` | Puck | JupyterLab + OAuth2-Proxy | +| `kernos/deploy.yml` | Caliban | Kernos MCP shell server | +| `lobechat/deploy.yml` | Rosalind | LobeChat AI chat | +| `neo4j_mcp/deploy.yml` | Miranda | Neo4j MCP Server | +| `rabbitmq/deploy.yml` | Oberon | RabbitMQ message queue | + +### Lifecycle Playbooks + +| Playbook | Purpose | +|----------|---------| +| `sandbox_up.yml` | Start all Uranian host containers | +| `sandbox_down.yml` | Gracefully stop all containers | +| `apt_update.yml` | Update packages on all hosts | +| `site.yml` | Full deployment orchestration | + +--- + +## Data Flow Architecture + +### Observability Pipeline + +``` +All Hosts Prospero Alerts +Alloy + Node Exporter → Prometheus + Loki + Grafana → AlertManager + Pushover +collect metrics & logs storage & visualisation notifications +``` + +### Integration Points + +| Consumer | Provider | Connection | +|----------|----------|-----------| +| All LLM apps | Arke (Sycorax) | `http://sycorax.incus:25540` | +| Open WebUI, Arke, Gitea, Nextcloud, LobeChat | PostgreSQL (Portia) | `portia.incus:5432` | +| Neo4j MCP | Neo4j (Ariel) | `ariel.incus:7687` (Bolt) | +| MCP Switchboard | Docker API (Miranda) | `tcp://miranda.incus:2375` | +| MCP Switchboard | RabbitMQ (Oberon) | 
`oberon.incus:5672` | +| Kairos, Spelunker | RabbitMQ (Oberon) | `oberon.incus:5672` | +| SMTP (all apps) | smtp4dev (Oberon) | `oberon.incus:22025` | +| All hosts | Loki (Prospero) | `http://prospero.incus:3100` | +| All hosts | Prometheus (Prospero) | `http://prospero.incus:9090` | + +--- + +## Important Notes + +⚠️ **Alloy Host Variables Required** — Every host with `alloy` in its `services` list must define `alloy_log_level` in `inventory/host_vars/.incus.yml`. The playbook will fail with an undefined variable error if this is missing. + +⚠️ **Alloy Syslog Listeners Required for Docker Services** — Any Docker Compose service using the syslog logging driver must have a corresponding `loki.source.syslog` listener in the host's Alloy config template (`ansible/alloy//config.alloy.j2`). Missing listeners cause Docker containers to fail on start. + +⚠️ **Local Terraform State** — This project uses local Terraform state (no remote backend). Do not run `terraform apply` from multiple machines simultaneously. + +⚠️ **Nested Docker** — Docker runs inside Incus containers (nested), requiring `security.nesting = true` and `lxc.apparmor.profile=unconfined` AppArmor override on all Docker-enabled hosts. + +⚠️ **Deployment Order** — Prospero (observability) must be fully deployed before other hosts, as Alloy on every host pushes logs and metrics to `prospero.incus`. Run `pplg/deploy.yml` before `site.yml` on a fresh environment. diff --git a/docs/pgadmin.md b/docs/pgadmin.md new file mode 100644 index 0000000..4d41e10 --- /dev/null +++ b/docs/pgadmin.md @@ -0,0 +1,192 @@ +# PgAdmin - PostgreSQL Web Administration + +## Overview + +PgAdmin 4 is a web-based administration and management tool for PostgreSQL. It is deployed on **Portia** alongside the shared PostgreSQL instance, providing a graphical interface for database management, query execution, and server monitoring across both PostgreSQL deployments (Portia and Titania). 
+ +**Host:** portia.incus +**Role:** database +**Container Port:** 80 (Apache / pgAdmin4 web app) +**External Access:** https://pgadmin.ouranos.helu.ca/ (via HAProxy on Titania, proxied through host port 25555) + +## Architecture + +``` +┌──────────┐ ┌────────────┐ ┌──────────────────────────────────┐ +│ Client │─────▶│ HAProxy │─────▶│ Portia │ +│ │ │ (Titania) │ │ │ +│ │ │ :443 │ │ :25555 ──▶ :80 (Apache) │ +└──────────┘ └────────────┘ │ │ │ + │ ┌────▼─────┐ │ + │ │ PgAdmin4 │ │ + │ │ (web) │ │ + │ └────┬─────┘ │ + │ │ │ + │ ┌────────▼────────┐ │ + │ │ PostgreSQL 17 │ │ + │ │ (localhost) │ │ + │ └─────────────────┘ │ + └──────────┬─────────────────────┘ + │ SSL + ▼ + ┌─────────────────────┐ + │ PostgreSQL 17 (SSL) │ + │ (Titania) │ + └─────────────────────┘ +``` + +PgAdmin connects to: +- **Portia's PostgreSQL** — locally via `localhost:5432` (no SSL) +- **Titania's PostgreSQL** — over the Incus network via SSL, using the fetched certificate stored at `/var/lib/pgadmin/certs/titania-postgres-ca.crt` + +## Terraform Resources + +### Host Definition + +PgAdmin runs on Portia, defined in `terraform/containers.tf`: + +| Attribute | Value | +|-----------|-------| +| Image | noble | +| Role | database | +| Security Nesting | false | +| Proxy Devices | `25555 → 80` (Apache/PgAdmin web UI) | + +The Incus proxy device maps host port 25555 to Apache on port 80 inside the container, where PgAdmin4 is served as a WSGI application. + +## Ansible Deployment + +### Playbook + +```bash +cd ansible +ansible-playbook pgadmin/deploy.yml +``` + +### Files + +| File | Purpose | +|------|---------| +| `pgadmin/deploy.yml` | PgAdmin installation and SSL cert distribution | + +### Deployment Steps + +1. **Add PgAdmin repository** — Official pgAdmin4 APT repository with GPG key +2. **Install PgAdmin** — `pgadmin4-web` package (includes Apache configuration) +3. **Create certs directory** — `/var/lib/pgadmin/certs/` owned by `www-data` +4. 
**Fetch Titania SSL certificate** — Retrieves the self-signed PostgreSQL SSL cert from Titania +5. **Distribute certificate** — Copies to `/var/lib/pgadmin/certs/titania-postgres-ca.crt` for SSL connections + +### ⚠️ Manual Post-Deployment Step Required + +After running the playbook, you **must** SSH into Portia and run the PgAdmin web setup script manually: + +```bash +# SSH into Portia +ssh portia.incus + +# Run the setup script +sudo /usr/pgadmin4/bin/setup-web.sh +``` + +This interactive script: +- Prompts for the **admin email address** and **password** (use the values from `pgadmin_email` and `pgadmin_password` vault variables) +- Configures Apache virtual host for PgAdmin4 +- Sets file permissions and ownership +- Restarts Apache to activate the configuration + +This step cannot be automated via Ansible because the script requires interactive input and performs Apache configuration that depends on the local environment. + +### Variables + +#### Host Variables (`host_vars/portia.incus.yml`) + +| Variable | Description | +|----------|-------------| +| `pgadmin_user` | System user (`pgadmin`) | +| `pgadmin_group` | System group (`pgadmin`) | +| `pgadmin_directory` | Data directory (`/srv/pgadmin`) | +| `pgadmin_port` | External port (`25555`) | +| `pgadmin_email` | Admin login email (`{{ vault_pgadmin_email }}`) | +| `pgadmin_password` | Admin login password (`{{ vault_pgadmin_password }}`) | + +#### Vault Variables (`group_vars/all/vault.yml`) + +| Variable | Description | +|----------|-------------| +| `vault_pgadmin_email` | PgAdmin admin email address | +| `vault_pgadmin_password` | PgAdmin admin password | + +## Configuration + +### SSL Certificate for Titania Connection + +The playbook fetches the self-signed PostgreSQL SSL certificate from Titania and places it at `/var/lib/pgadmin/certs/titania-postgres-ca.crt`. When adding Titania's PostgreSQL as a server in PgAdmin: + +1. Navigate to **Servers → Register → Server** +2. 
On the **Connection** tab: + - Host: `titania.incus` + - Port: `5432` + - Username: `postgres` +3. On the **SSL** tab: + - SSL mode: `verify-ca` or `require` + - Root certificate: `/var/lib/pgadmin/certs/titania-postgres-ca.crt` + +### Registered Servers + +After setup, register both PostgreSQL instances: + +| Server Name | Host | Port | SSL | +|-------------|------|------|-----| +| Portia (local) | `localhost` | `5432` | Off | +| Titania (Casdoor) | `titania.incus` | `5432` | verify-ca | + +## Operations + +### Start/Stop + +```bash +# PgAdmin runs under Apache +sudo systemctl start apache2 +sudo systemctl stop apache2 +sudo systemctl restart apache2 +``` + +### Health Check + +```bash +# Check Apache is serving PgAdmin +curl -s -o /dev/null -w "%{http_code}" http://localhost/pgadmin4/login + +# Check from external host +curl -s -o /dev/null -w "%{http_code}" http://portia.incus/pgadmin4/login +``` + +### Logs + +```bash +# Apache error log +tail -f /var/log/apache2/error.log + +# PgAdmin application log +tail -f /var/log/pgadmin/pgadmin4.log +``` + +## Troubleshooting + +### Common Issues + +| Symptom | Cause | Resolution | +|---------|-------|------------| +| 502/503 on pgadmin.ouranos.helu.ca | Apache not running on Portia | `sudo systemctl restart apache2` on Portia | +| Login page loads but can't authenticate | Setup script not run | SSH to Portia and run `sudo /usr/pgadmin4/bin/setup-web.sh` | +| Can't connect to Titania PostgreSQL | Missing SSL certificate | Re-run `ansible-playbook pgadmin/deploy.yml` to fetch cert | +| SSL certificate error for Titania | Certificate expired or regenerated | Re-fetch cert by re-running the playbook | +| Port 25555 unreachable | Incus proxy device missing | Verify proxy device in `terraform/containers.tf` for Portia | + +## References + +- [PgAdmin 4 Documentation](https://www.pgadmin.org/docs/pgadmin4/latest/) +- [PostgreSQL Deployment](postgresql.md) +- [Terraform Practices](terraform.md) +- [Ansible 
Practices](ansible.md) diff --git a/docs/postgresql.md b/docs/postgresql.md new file mode 100644 index 0000000..a3850cc --- /dev/null +++ b/docs/postgresql.md @@ -0,0 +1,287 @@ +# PostgreSQL - Dual-Deployment Database Layer + +## Overview + +PostgreSQL 17 serves as the primary relational database engine for the Agathos sandbox. There are **two separate deployment playbooks**, each targeting a different host with a distinct purpose: + +| Playbook | Host | Purpose | +|----------|------|---------| +| `postgresql/deploy.yml` | **Portia** | Shared multi-tenant database with **pgvector** for AI/vector workloads | +| `postgresql_ssl/deploy.yml` | **Titania** | Dedicated SSL-enabled database for the **Casdoor** identity provider | + +**Portia** acts as the central database server for most applications, while **Titania** runs an isolated PostgreSQL instance exclusively for Casdoor, hardened with self-signed SSL certificates for secure external connections. + +## Architecture + +``` + ┌────────────────────────────────────────────────────┐ + │ Portia (postgresql) │ + ┌──────────┐ │ ┌──────────────────────────────────────────────┐ │ + │ Arke │───────────▶│ │ PostgreSQL 17 + pgvector v0.8.0 │ │ + │(Caliban) │ │ │ │ │ + ├──────────┤ │ │ Databases: │ │ + │ Gitea │───────────▶│ │ arke ─── openwebui ─── spelunker │ │ + │(Rosalind)│ │ │ gitea ── lobechat ──── nextcloud │ │ + ├──────────┤ │ │ anythingllm ────────── hass │ │ + │ Open │───────────▶│ │ │ │ + │ WebUI │ │ │ pgvector enabled in: │ │ + ├──────────┤ │ │ arke, lobechat, openwebui, │ │ + │ LobeChat │───────────▶│ │ spelunker, anythingllm │ │ + ├──────────┤ │ └──────────────────────────────────────────────┘ │ + │ HASS │───────────▶│ │ + │ + others │ │ PgAdmin available on :25555 │ + └──────────┘ └────────────────────────────────────────────────────┘ + + ┌────────────────────────────────────────────────────┐ + │ Titania (postgresql_ssl) │ + ┌──────────┐ │ ┌──────────────────────────────────────────────┐ │ + │ Casdoor 
│──SSL──────▶│ │ PostgreSQL 17 + SSL (self-signed) │ │ + │(Titania) │ (local) │ │ │ │ + └──────────┘ │ │ Database: casdoor (single-purpose) │ │ + │ └──────────────────────────────────────────────┘ │ + └────────────────────────────────────────────────────┘ +``` + +## Terraform Resources + +### Portia – Shared Database Host + +Defined in `terraform/containers.tf`: + +| Attribute | Value | +|-----------|-------| +| Image | noble | +| Role | database | +| Security Nesting | false | +| Proxy Devices | `25555 → 80` (PgAdmin web UI) | + +PostgreSQL port 5432 is **not** exposed externally—applications connect over the private Incus network (`10.10.0.0/16`). + +### Titania – Proxy & SSO Host + +| Attribute | Value | +|-----------|-------| +| Image | noble | +| Role | proxy_sso | +| Security Nesting | true | +| Proxy Devices | `443 → 8443`, `80 → 8080` (HAProxy) | + +Titania runs PostgreSQL alongside Casdoor on the same host. Casdoor connects via localhost, so SSL is not required for the local connection despite being available for external clients. + +## Ansible Deployment + +### Playbook 1: Shared PostgreSQL with pgvector (Portia) + +```bash +cd ansible +ansible-playbook postgresql/deploy.yml +``` + +#### Files + +| File | Purpose | +|------|---------| +| `postgresql/deploy.yml` | Multi-tenant PostgreSQL with pgvector | + +#### Deployment Steps + +1. **Install build dependencies** — `curl`, `git`, `build-essential`, `vim`, `python3-psycopg2` +2. **Add PGDG repository** — Official PostgreSQL APT repository +3. **Install PostgreSQL 17** — Client, server, docs, `libpq-dev`, `server-dev` +4. **Clone & build pgvector v0.8.0** — Compiled from source against the installed PG version +5. **Start PostgreSQL** and restart after pgvector installation +6. **Set data directory permissions** — `700` owned by `postgres:postgres` +7. **Configure networking** — `listen_addresses = '*'` +8. **Configure authentication** — `host all all 0.0.0.0/0 md5` in `pg_hba.conf` +9. 
**Set admin password** — `postgres` superuser password from vault +10. **Create application users** — 9 database users (see table below) +11. **Create application databases** — 9 databases with matching owners +12. **Enable pgvector** — `CREATE EXTENSION vector` in 5 databases + +### Playbook 2: SSL-Enabled PostgreSQL (Titania) + +```bash +cd ansible +ansible-playbook postgresql_ssl/deploy.yml +``` + +#### Files + +| File | Purpose | +|------|---------| +| `postgresql_ssl/deploy.yml` | Single-purpose SSL PostgreSQL for Casdoor | + +#### Deployment Steps + +1. **Install dependencies** — `curl`, `python3-psycopg2`, `python3-cryptography` +2. **Add PGDG repository** — Official PostgreSQL APT repository +3. **Install PostgreSQL 17** — Client and server only (no dev packages needed) +4. **Generate SSL certificates** — 4096-bit RSA key, self-signed, 10-year validity +5. **Configure networking** — `listen_addresses = '*'` +6. **Enable SSL** — `ssl = on` with cert/key file paths +7. **Configure tiered authentication** in `pg_hba.conf`: + - `local` → `peer` (Unix socket, no password) + - `host 127.0.0.1/32` → `md5` (localhost, no SSL) + - `host 10.10.0.0/16` → `md5` (Incus network, no SSL) + - `hostssl 0.0.0.0/0` → `md5` (external, SSL required) +8. **Set admin password** — `postgres` superuser password from vault +9. **Create Casdoor user and database** — Single-purpose + +## User & Database Creation via Host Variables + +Both playbooks derive all database names, usernames, and passwords from **host variables** defined in the Ansible inventory. No database credentials appear in `group_vars`—everything is scoped to the host that runs PostgreSQL. + +### Portia Host Variables (`inventory/host_vars/portia.incus.yml`) + +The `postgresql/deploy.yml` playbook loops over variable pairs to create users and databases. 
Each application gets three variables defined in Portia's host_vars: + +| Variable Pattern | Example | Description | +|-----------------|---------|-------------| +| `{app}_db_name` | `arke_db_name: arke` | Database name | +| `{app}_db_user` | `arke_db_user: arke` | Database owner/user | +| `{app}_db_password` | `arke_db_password: "{{ vault_arke_db_password }}"` | Password (from vault) | + +#### Application Database Matrix (Portia) + +| Application | DB Name Variable | DB User Variable | pgvector | +|-------------|-----------------|-----------------|----------| +| Arke | `arke_db_name` | `arke_db_user` | ✔ | +| Open WebUI | `openwebui_db_name` | `openwebui_db_user` | ✔ | +| Spelunker | `spelunker_db_name` | `spelunker_db_user` | ✔ | +| Gitea | `gitea_db_name` | `gitea_db_user` | | +| LobeChat | `lobechat_db_name` | `lobechat_db_user` | ✔ | +| Nextcloud | `nextcloud_db_name` | `nextcloud_db_user` | | +| AnythingLLM | `anythingllm_db_name` | `anythingllm_db_user` | ✔ | +| HASS | `hass_db_name` | `hass_db_user` | | +| Nike | `nike_db_name` | `nike_db_user` | | + +#### Additional Portia Variables + +| Variable | Description | +|----------|-------------| +| `postgres_user` | System user (`postgres`) | +| `postgres_group` | System group (`postgres`) | +| `postgresql_port` | Port (`5432`) | +| `postgresql_data_dir` | Data directory (`/var/lib/postgresql`) | +| `postgres_password` | Admin password (`{{ vault_postgres_password }}`) | + +### Titania Host Variables (`inventory/host_vars/titania.incus.yml`) + +The `postgresql_ssl/deploy.yml` playbook creates a single database for Casdoor: + +| Variable | Value | Description | +|----------|-------|-------------| +| `postgresql_ssl_postgres_password` | `{{ vault_postgresql_ssl_postgres_password }}` | Admin password | +| `postgresql_ssl_port` | `5432` | PostgreSQL port | +| `postgresql_ssl_cert_path` | `/etc/postgresql/17/main/ssl/server.crt` | SSL certificate | +| `casdoor_db_name` | `casdoor` | Database name | +| 
`casdoor_db_user` | `casdoor` | Database user | +| `casdoor_db_password` | `{{ vault_casdoor_db_password }}` | Password (from vault) | +| `casdoor_db_sslmode` | `disable` | Local connection skips SSL | + +### Adding a New Application Database + +To add a new application database on Portia: + +1. **Add variables** to `inventory/host_vars/portia.incus.yml`: + ```yaml + myapp_db_name: myapp + myapp_db_user: myapp + myapp_db_password: "{{ vault_myapp_db_password }}" + ``` + +2. **Add the vault secret** to `inventory/group_vars/all/vault.yml`: + ```yaml + vault_myapp_db_password: "s3cure-passw0rd" + ``` + +3. **Add the user** to the `Create application database users` loop in `postgresql/deploy.yml`: + ```yaml + - { user: "{{ myapp_db_user }}", password: "{{ myapp_db_password }}" } + ``` + +4. **Add the database** to the `Create application databases with owners` loop: + ```yaml + - { name: "{{ myapp_db_name }}", owner: "{{ myapp_db_user }}" } + ``` + +5. **(Optional)** If the application uses vector embeddings, add the database to the `Enable pgvector extension in databases` loop: + ```yaml + - "{{ myapp_db_name }}" + ``` + +## Operations + +### Start/Stop + +```bash +# On either host +sudo systemctl start postgresql +sudo systemctl stop postgresql +sudo systemctl restart postgresql +``` + +### Health Check + +```bash +# From any Incus host → Portia +psql -h portia.incus -U postgres -c "SELECT 1;" + +# From Titania localhost +sudo -u postgres psql -c "SELECT 1;" + +# Check pgvector availability +sudo -u postgres psql -c "SELECT * FROM pg_available_extensions WHERE name = 'vector';" +``` + +### Logs + +```bash +# Systemd journal +journalctl -u postgresql -f + +# PostgreSQL log files +tail -f /var/log/postgresql/postgresql-17-main.log + +# Loki (via Grafana Explore) +{job="postgresql"} +``` + +### Backup + +```bash +# Dump a single database +sudo -u postgres pg_dump myapp > myapp_backup.sql + +# Dump all databases +sudo -u postgres pg_dumpall > full_backup.sql +``` + 
+### Restore + +```bash +# Restore a single database +sudo -u postgres psql myapp < myapp_backup.sql + +# Restore all databases +sudo -u postgres psql < full_backup.sql +``` + +## Troubleshooting + +### Common Issues + +| Symptom | Cause | Resolution | +|---------|-------|------------| +| Connection refused from app host | `pg_hba.conf` missing entry | Verify client IP is covered by HBA rules | +| pgvector extension not found | Built against wrong PG version | Re-run the `Build pgvector with correct pg_config` task | +| SSL handshake failure (Titania) | Expired or missing certificate | Check `/etc/postgresql/17/main/ssl/server.crt` validity | +| `FATAL: password authentication failed` | Wrong password in host_vars | Verify vault variable matches and re-run playbook | +| PgAdmin unreachable on :25555 | Incus proxy device missing | Check `terraform/containers.tf` proxy for Portia | + +## References + +- [PostgreSQL 17 Documentation](https://www.postgresql.org/docs/17/) +- [pgvector GitHub](https://github.com/pgvector/pgvector) +- [Terraform Practices](terraform.md) +- [Ansible Practices](ansible.md) diff --git a/docs/pplg.md b/docs/pplg.md new file mode 100644 index 0000000..b6bb5c5 --- /dev/null +++ b/docs/pplg.md @@ -0,0 +1,583 @@ +# PPLG - Consolidated Observability & Admin Stack + +## Overview + +PPLG is the consolidated observability and administration stack running on **Prospero**. It bundles PgAdmin, Prometheus, Loki, and Grafana behind an internal HAProxy for TLS termination, with Casdoor SSO for user-facing services and OAuth2-Proxy as a sidecar for Prometheus UI authentication. 
+ +**Host:** prospero.incus +**Role:** Observability +**Incus Ports:** 25510 → 443 (HTTPS), 25511 → 80 (HTTP redirect) +**External Access:** Via Titania HAProxy → `prospero.incus:443` + +| Subdomain | Service | Auth Method | +|-----------|---------|-------------| +| `grafana.ouranos.helu.ca` | Grafana | Native Casdoor OAuth | +| `pgadmin.ouranos.helu.ca` | PgAdmin | Native Casdoor OAuth | +| `prometheus.ouranos.helu.ca` | Prometheus | OAuth2-Proxy sidecar | +| `loki.ouranos.helu.ca` | Loki | None (machine-to-machine) | +| `alertmanager.ouranos.helu.ca` | Alertmanager | None (internal) | + +## Architecture + +``` +┌──────────┐ ┌────────────┐ ┌─────────────────────────────────────────────────┐ +│ Client │─────▶│ HAProxy │─────▶│ Prospero (PPLG) │ +│ │ │ (Titania) │ │ │ +└──────────┘ │ :443 → :443 │ ┌──────────────────────────────────────────┐ │ + └────────────┘ │ │ HAProxy (systemd, :443/:80) │ │ + │ │ TLS termination + subdomain routing │ │ +┌──────────┐ │ └───┬──────┬──────┬──────┬──────┬──────────┘ │ +│ Alloy │──push──────────────────────────▶│ │ │ │ │ +│ (agents) │ loki.ouranos.helu.ca │ │ │ │ │ │ +│ │ prometheus.ouranos.helu.ca │ │ │ │ │ +└──────────┘ │ ▼ ▼ ▼ ▼ ▼ │ + │ Grafana PgAdmin OAuth2 Loki Alertmanager │ + │ :3000 :5050 Proxy :3100 :9093 │ + │ :9091 │ + │ │ │ + │ ▼ │ + │ Prometheus │ + │ :9090 │ + └─────────────────────────────────────────────────┘ +``` + +### Traffic Flow + +| Source | Destination | Path | Auth | +|--------|-------------|------|------| +| Browser → Grafana | Titania :443 → Prospero :443 → HAProxy → :3000 | Subdomain ACL | Casdoor OAuth | +| Browser → PgAdmin | Titania :443 → Prospero :443 → HAProxy → :5050 | Subdomain ACL | Casdoor OAuth | +| Browser → Prometheus | Titania :443 → Prospero :443 → HAProxy → OAuth2-Proxy :9091 → :9090 | Subdomain ACL | OAuth2-Proxy → Casdoor | +| Alloy → Loki | `https://loki.ouranos.helu.ca` → HAProxy :443 → :3100 | Subdomain ACL | None | +| Alloy → Prometheus | 
`https://prometheus.ouranos.helu.ca/api/v1/write` → HAProxy :443 → :9090 | `skip_auth_route` | None | + +## Deployment + +### Prerequisites + +1. **Terraform**: Prospero container must have updated port mappings (`terraform apply`) +2. **Certbot**: Wildcard cert must exist on Titania (`ansible-playbook certbot/deploy.yml`) +3. **Vault Secrets**: All vault variables must be set (see [Required Vault Secrets](#required-vault-secrets)) +4. **Casdoor Applications**: Register PgAdmin and Prometheus apps in Casdoor (see [Casdoor SSO](#casdoor-sso)) + +### Playbook + +```bash +cd ansible +ansible-playbook pplg/deploy.yml +``` + +### Files + +| File | Purpose | +|------|---------| +| `pplg/deploy.yml` | Main consolidated deployment playbook | +| `pplg/pplg-haproxy.cfg.j2` | HAProxy TLS termination config (5 backends) | +| `pplg/prometheus.yml.j2` | Prometheus scrape configuration | +| `pplg/alert_rules.yml.j2` | Prometheus alerting rules | +| `pplg/alertmanager.yml.j2` | Alertmanager routing and Pushover notifications | +| `pplg/config.yml.j2` | Loki server configuration | +| `pplg/grafana.ini.j2` | Grafana main config with Casdoor OAuth | +| `pplg/datasource.yml.j2` | Grafana provisioned datasources | +| `pplg/users.yml.j2` | Grafana provisioned users | +| `pplg/config_local.py.j2` | PgAdmin config with Casdoor OAuth | +| `pplg/pgadmin.service.j2` | PgAdmin gunicorn systemd unit | +| `pplg/oauth2-proxy-prometheus.cfg.j2` | OAuth2-Proxy config for Prometheus UI | +| `pplg/oauth2-proxy-prometheus.service.j2` | OAuth2-Proxy systemd unit | + +### Deployment Steps + +1. **APT Repositories**: Add Grafana and PgAdmin repos +2. **Install Packages**: haproxy, prometheus, loki, grafana, pgadmin4-web, gunicorn +3. **Prometheus**: Config, alert rules, systemd override for remote write receiver +4. **Alertmanager**: Install, config with Pushover integration +5. **Loki**: Create user/dirs, template config +6. **Grafana**: Provisioning (datasources, users, dashboards), OAuth config +7. 
**PgAdmin**: Create user/dirs, gunicorn systemd service, Casdoor OAuth config +8. **OAuth2-Proxy**: Download binary (v7.6.0), config for Prometheus sidecar +9. **SSL Certificate**: Fetch Let's Encrypt wildcard cert from Titania (self-signed fallback) +10. **HAProxy**: Template config, enable and start systemd service + +### Deployment Order + +PPLG must be deployed **before** services that push metrics/logs: + +``` +apt_update → alloy → node_exporter → pplg → postgresql → ... +``` + +This order is enforced in `site.yml`. + +## Required Vault Secrets + +Add to `ansible/inventory/group_vars/all/vault.yml`: + +⚠️ **All vault variables below must be set before running the playbook.** Missing variables will cause template failures like: + +``` +TASK [Template prometheus.yml] **** +[ERROR]: 'vault_casdoor_prometheus_access_key' is undefined +``` + +### Prometheus Scrape Credentials + +These are used in `prometheus.yml.j2` to scrape metrics from Casdoor and Gitea. + +#### 1. Casdoor Prometheus Access Key +```yaml +vault_casdoor_prometheus_access_key: "YourCasdoorAccessKey" +``` + +#### 2. Casdoor Prometheus Access Secret +```yaml +vault_casdoor_prometheus_access_secret: "YourCasdoorAccessSecret" +``` + +**Requirements (both):** +- **Source**: API key pair from the `built-in/admin` Casdoor user +- **Used by**: `prometheus.yml.j2` Casdoor scrape job (`accessKey` / `accessSecret` query params) +- **How to obtain**: Generate via Casdoor API (the "API key" account item is not exposed in the UI by default): + ```bash + # 1. Login to get session cookie + curl -sk -c /tmp/casdoor-cookie.txt -X POST "https://id.ouranos.helu.ca/api/login" \ + -H "Content-Type: application/json" \ + -d '{"application":"app-built-in","organization":"built-in","username":"admin","password":"YOUR_PASSWORD","type":"login"}' + + # 2. 
The initial local admin account is created manually (it is not templated from vault) by running the setup script on the host:
+`/usr/pgadmin4/venv/bin/python3 /usr/pgadmin4/web/setup.py setup-db`
PgAdmin OAuth (Casdoor SSO) +```yaml +vault_pgadmin_oauth_client_id: "pgadmin-oauth-client" +vault_pgadmin_oauth_client_secret: "YourPgAdminOAuthSecret" +``` +**Requirements:** +- **Source**: Must match the Casdoor application `app-pgadmin` +- **Redirect URI**: `https://pgadmin.ouranos.helu.ca/oauth2/redirect` + +### Prometheus OAuth2-Proxy + +#### 9. Prometheus OAuth2-Proxy (Casdoor SSO) +```yaml +vault_prometheus_oauth2_client_id: "prometheus-oauth-client" +vault_prometheus_oauth2_client_secret: "YourPrometheusOAuthSecret" +vault_prometheus_oauth2_cookie_secret: "GeneratedCookieSecret" +``` +**Requirements:** +- Client ID/Secret must match the Casdoor application `app-prometheus` +- **Redirect URI**: `https://prometheus.ouranos.helu.ca/oauth2/callback` +- **Cookie secret generation**: + ```bash + python3 -c 'import secrets; print(secrets.token_urlsafe(32))' + ``` + +### Alertmanager (Pushover) + +#### 10. Pushover Notification Credentials +```yaml +vault_pushover_user_key: "YourPushoverUserKey" +vault_pushover_api_token: "YourPushoverAPIToken" +``` +**Requirements:** +- **Source**: [pushover.net](https://pushover.net/) account +- **User Key**: Found on Pushover dashboard +- **API Token**: Create an application in Pushover + +### Quick Reference + +| Vault Variable | Used By | Source | +|---------------|---------|--------| +| `vault_casdoor_prometheus_access_key` | prometheus.yml.j2 | Casdoor `built-in/admin` API key | +| `vault_casdoor_prometheus_access_secret` | prometheus.yml.j2 | Casdoor `built-in/admin` API key | +| `vault_gitea_metrics_token` | prometheus.yml.j2 | Gitea app.ini | +| `vault_grafana_admin_name` | users.yml.j2 | Choose any | +| `vault_grafana_admin_login` | users.yml.j2 | Choose any | +| `vault_grafana_admin_password` | users.yml.j2 | Choose any | +| `vault_grafana_viewer_name` | users.yml.j2 | Choose any | +| `vault_grafana_viewer_login` | users.yml.j2 | Choose any | +| `vault_grafana_viewer_password` | users.yml.j2 | Choose any | +| 
`vault_grafana_oauth_client_id` | grafana.ini.j2 | Casdoor app | +| `vault_grafana_oauth_client_secret` | grafana.ini.j2 | Casdoor app | +| `vault_pgadmin_email` | config_local.py.j2 | Choose any | +| `vault_pgadmin_password` | config_local.py.j2 | Choose any | +| `vault_pgadmin_oauth_client_id` | config_local.py.j2 | Casdoor app | +| `vault_pgadmin_oauth_client_secret` | config_local.py.j2 | Casdoor app | +| `vault_prometheus_oauth2_client_id` | oauth2-proxy-prometheus.cfg.j2 | Casdoor app | +| `vault_prometheus_oauth2_client_secret` | oauth2-proxy-prometheus.cfg.j2 | Casdoor app | +| `vault_prometheus_oauth2_cookie_secret` | oauth2-proxy-prometheus.cfg.j2 | Generate | +| `vault_pushover_user_key` | alertmanager.yml.j2 | Pushover account | +| `vault_pushover_api_token` | alertmanager.yml.j2 | Pushover account | + +## Casdoor SSO + +Three Casdoor applications are required. Grafana's should already exist; PgAdmin and Prometheus need to be created. + +### Applications to Register + +Register in Casdoor Admin UI (`https://id.ouranos.helu.ca`) or add to `ansible/casdoor/init_data.json.j2`: + +| Application | Client ID | Redirect URI | Grant Types | +|-------------|-----------|-------------|-------------| +| `app-grafana` | `vault_grafana_oauth_client_id` | `https://grafana.ouranos.helu.ca/login/generic_oauth` | `authorization_code`, `refresh_token` | +| `app-pgadmin` | `vault_pgadmin_oauth_client_id` | `https://pgadmin.ouranos.helu.ca/oauth2/redirect` | `authorization_code`, `refresh_token` | +| `app-prometheus` | `vault_prometheus_oauth2_client_id` | `https://prometheus.ouranos.helu.ca/oauth2/callback` | `authorization_code`, `refresh_token` | + +### URL Strategy + +| URL Type | Address | Used By | +|----------|---------|---------| +| **Auth URL** | `https://id.ouranos.helu.ca/login/oauth/authorize` | User's browser (external) | +| **Token URL** | `https://id.ouranos.helu.ca/api/login/oauth/access_token` | Server-to-server | +| **Userinfo URL** | 
`https://id.ouranos.helu.ca/api/userinfo` | Server-to-server | +| **OIDC Discovery** | `https://id.ouranos.helu.ca/.well-known/openid-configuration` | OAuth2-Proxy | + +### Auth Methods per Service + +| Service | Auth Method | Details | +|---------|-------------|---------| +| **Grafana** | Native `[auth.generic_oauth]` | Built-in OAuth support in `grafana.ini` | +| **PgAdmin** | Native `OAUTH2_CONFIG` | Built-in OAuth support in `config_local.py` | +| **Prometheus** | OAuth2-Proxy sidecar | Binary on `:9091` proxying to `:9090` | +| **Loki** | None | Machine-to-machine (Alloy agents push logs) | +| **Alertmanager** | None | Internal only | + +## HAProxy Configuration + +### Backends + +| Backend | Upstream | Health Check | Auth | +|---------|----------|-------------|------| +| `backend_grafana` | `127.0.0.1:3000` | `GET /api/health` | Grafana OAuth | +| `backend_pgadmin` | `127.0.0.1:5050` | `GET /misc/ping` | PgAdmin OAuth | +| `backend_prometheus` | `127.0.0.1:9091` (OAuth2-Proxy) | `GET /ping` | OAuth2-Proxy | +| `backend_prometheus_direct` | `127.0.0.1:9090` | — | None (write API) | +| `backend_loki` | `127.0.0.1:3100` | `GET /ready` | None | +| `backend_alertmanager` | `127.0.0.1:9093` | `GET /-/healthy` | None | + +### skip_auth_route Pattern + +The Prometheus write API (`/api/v1/write`) is accessed by Alloy agents for machine-to-machine metric pushes. HAProxy uses an ACL to bypass OAuth2-Proxy: + +``` +acl is_prometheus_write path_beg /api/v1/write +use_backend backend_prometheus_direct if host_prometheus is_prometheus_write +``` + +This routes `https://prometheus.ouranos.helu.ca/api/v1/write` directly to Prometheus on `:9090`, while all other Prometheus traffic goes through OAuth2-Proxy on `:9091`. 
+ +### SSL Certificate + +- **Primary**: Let's Encrypt wildcard cert (`*.ouranos.helu.ca`) fetched from Titania +- **Fallback**: Self-signed cert generated on Prospero (if Titania unavailable) +- **Path**: `/etc/haproxy/certs/ouranos.pem` + +## Host Variables + +**File:** `ansible/inventory/host_vars/prospero.incus.yml` + +Services list: +```yaml +services: + - alloy + - pplg +``` + +Key variable groups defined in `prospero.incus.yml`: +- PPLG HAProxy (user, group, uid/gid 800, syslog port) +- Grafana (datasources, users, OAuth config) +- Prometheus (scrape targets, OAuth2-Proxy sidecar config) +- Alertmanager (Pushover integration) +- Loki (user, data/config directories) +- PgAdmin (user, data/log directories, OAuth config) +- Casdoor Metrics (access key/secret for Prometheus scraping) + +## Terraform + +### Prospero Port Mapping + +```hcl +devices = [ + { + name = "https_internal" + type = "proxy" + properties = { + listen = "tcp:0.0.0.0:25510" + connect = "tcp:127.0.0.1:443" + } + }, + { + name = "http_redirect" + type = "proxy" + properties = { + listen = "tcp:0.0.0.0:25511" + connect = "tcp:127.0.0.1:80" + } + } +] +``` + +Run `terraform apply` before deploying if port mappings changed. 
+ +### Titania Backend Routing + +Titania's HAProxy routes external subdomains to Prospero's HTTPS port: + +```yaml +# In titania.incus.yml haproxy_backends +- subdomain: "grafana" + backend_host: "prospero.incus" + backend_port: 443 + health_path: "/api/health" + ssl_backend: true + +- subdomain: "pgadmin" + backend_host: "prospero.incus" + backend_port: 443 + health_path: "/misc/ping" + ssl_backend: true + +- subdomain: "prometheus" + backend_host: "prospero.incus" + backend_port: 443 + health_path: "/ping" + ssl_backend: true +``` + +## Monitoring + +### Alloy Configuration + +**File:** `ansible/alloy/prospero/config.alloy.j2` + +- **HAProxy Syslog**: `loki.source.syslog` on `127.0.0.1:51405` (TCP) receives Docker syslog from HAProxy container +- **Journal Labels**: Dedicated job labels for `grafana-server`, `prometheus`, `loki`, `alertmanager`, `pgadmin`, `oauth2-proxy-prometheus` +- **System Logs**: `/var/log/syslog`, `/var/log/auth.log` → Loki +- **Metrics**: Node exporter + process exporter → Prometheus remote write + +### Prometheus Scrape Targets + +| Job | Target | Auth | +|-----|--------|------| +| `prometheus` | `localhost:9090` | None | +| `node-exporter` | All Uranian hosts `:9100` | None | +| `alertmanager` | `prospero.incus:9093` | None | +| `haproxy` | `titania.incus:8404` | None | +| `gitea` | `oberon.incus:22084` | Bearer token | +| `casdoor` | `titania.incus:22081` | Access key/secret params | + +### Alert Rules + +Groups defined in `alert_rules.yml.j2`: + +| Group | Alerts | Scope | +|-------|--------|-------| +| `node_alerts` | InstanceDown, HighCPU, HighMemory, DiskSpace, LoadAverage | All hosts | +| `puck_process_alerts` | HighCPU/Memory per process, CrashLoop | puck.incus | +| `puck_container_alerts` | HighContainerCount, Duplicates, Orphans, OOM | puck.incus | +| `service_alerts` | TargetMissing, JobMissing, AlertmanagerDown | Infrastructure | +| `loki_alerts` | HighLogVolume | Loki | + +### Alertmanager Routing + +Alerts are routed to 
Pushover with severity-based priority: + +| Severity | Pushover Priority | Emoji | +|----------|-------------------|-------| +| Critical | 2 (Emergency) | 🚨 | +| Warning | 1 (High) | ⚠️ | +| Info | 0 (Normal) | — | + +## Grafana MCP Server + +Grafana has an associated **MCP (Model Context Protocol) server** that provides AI/LLM access to dashboards, datasources, and alerting APIs. The Grafana MCP server runs as a Docker container on **Miranda** and connects back to Grafana on Prospero via the internal network (`prospero.incus:3000`) using a service account token. + +| Property | Value | +|----------|-------| +| MCP Host | miranda.incus | +| MCP Port | 25533 | +| MCPO Proxy | `http://miranda.incus:25530/grafana` | +| Auth | Grafana service account token (`vault_grafana_service_account_token`) | + +The Grafana MCP server is deployed separately from PPLG but depends on Grafana being running first. Deploy order: `pplg → grafana_mcp → mcpo`. + +For full details — deployment, configuration, available tools, troubleshooting — see **[Grafana MCP Server](grafana_mcp.md)**. 
+ +## Access After Deployment + +| Service | URL | Login | +|---------|-----|-------| +| Grafana | https://grafana.ouranos.helu.ca | Casdoor SSO or local admin | +| PgAdmin | https://pgadmin.ouranos.helu.ca | Casdoor SSO or local admin | +| Prometheus | https://prometheus.ouranos.helu.ca | Casdoor SSO | +| Alertmanager | https://alertmanager.ouranos.helu.ca | No auth (internal) | + +## Troubleshooting + +### Service Status + +```bash +ssh prospero.incus +sudo systemctl status prometheus grafana-server loki prometheus-alertmanager pgadmin oauth2-proxy-prometheus +``` + +### HAProxy Service + +```bash +ssh prospero.incus +sudo systemctl status haproxy +sudo journalctl -u haproxy -f +``` + +### View Logs + +```bash +# All PPLG services via journal +sudo journalctl -u prometheus -u grafana-server -u loki -u prometheus-alertmanager -u pgadmin -u oauth2-proxy-prometheus -f + +# HAProxy logs (shipped via syslog to Alloy → Loki) +# Query in Grafana: {job="pplg-haproxy"} +``` + +### Test Endpoints (from Prospero) + +```bash +# Grafana +curl -s http://127.0.0.1:3000/api/health + +# PgAdmin +curl -s http://127.0.0.1:5050/misc/ping + +# Prometheus +curl -s http://127.0.0.1:9090/-/healthy + +# Loki +curl -s http://127.0.0.1:3100/ready + +# Alertmanager +curl -s http://127.0.0.1:9093/-/healthy + +# HAProxy stats +curl -s http://127.0.0.1:8404/metrics | head +``` + +### Test TLS (from any host) + +```bash +# Direct to Prospero container +curl -sk https://prospero.incus/api/health +# Via Titania HAProxy +curl -s https://grafana.ouranos.helu.ca/api/health +``` + +### Common Errors + +#### `vault_casdoor_prometheus_access_key` is undefined + +``` +TASK [Template prometheus.yml] +[ERROR]: 'vault_casdoor_prometheus_access_key' is undefined +``` + +**Cause**: The Casdoor metrics scrape job in `prometheus.yml.j2` requires access credentials. 
+ +**Fix**: Generate API keys for the `built-in/admin` Casdoor user (see [Casdoor Prometheus Access Key](#1-casdoor-prometheus-access-key) for the full procedure), then add to vault: +```bash +cd ansible +ansible-vault edit inventory/group_vars/all/vault.yml +``` +```yaml +vault_casdoor_prometheus_access_key: "your-casdoor-access-key" +vault_casdoor_prometheus_access_secret: "your-casdoor-access-secret" +``` + +#### Certificate fetch fails + +**Cause**: Titania not running or certbot hasn't provisioned the cert yet. + +**Fix**: Ensure Titania is up and certbot has run: +```bash +ansible-playbook sandbox_up.yml +ansible-playbook certbot/deploy.yml +``` + +The playbook falls back to a self-signed certificate if Titania is unavailable. + +#### OAuth2 redirect loops + +**Cause**: Casdoor application redirect URI doesn't match the service URL. + +**Fix**: Verify redirect URIs match exactly: +- Grafana: `https://grafana.ouranos.helu.ca/login/generic_oauth` +- PgAdmin: `https://pgadmin.ouranos.helu.ca/oauth2/redirect` +- Prometheus: `https://prometheus.ouranos.helu.ca/oauth2/callback` + +## Migration Notes + +PPLG replaces the following standalone playbooks (kept as reference): + +| Original Playbook | Replaced By | +|-------------------|-------------| +| `prometheus/deploy.yml` | `pplg/deploy.yml` | +| `prometheus/alertmanager_deploy.yml` | `pplg/deploy.yml` | +| `loki/deploy.yml` | `pplg/deploy.yml` | +| `grafana/deploy.yml` | `pplg/deploy.yml` | +| `pgadmin/deploy.yml` | `pplg/deploy.yml` | + +PgAdmin was previously hosted on **Portia** (port 25555). It now runs on **Prospero** via gunicorn (no Apache). 
RabbitMQ 3 (management-alpine) serves as the central message broker for the Ouranos sandbox, providing AMQP-compliant message queuing for asynchronous communication between services.
+ +## Terraform Resources + +### Oberon Host Definition + +RabbitMQ runs on Oberon, defined in `terraform/containers.tf`: + +| Attribute | Value | +|-----------|-------| +| Description | Docker Host + MCP Switchboard - King of Fairies orchestrating containers | +| Image | noble | +| Role | container_orchestration | +| Security Nesting | `true` (required for Docker) | +| AppArmor Profile | unconfined | +| Proxy Devices | `25580-25599 → 25580-25599` (application port range) | + +### Container Dependencies + +| Resource | Relationship | +|----------|--------------| +| Docker | RabbitMQ runs as a Docker container on Oberon | +| Alloy | Collects syslog logs from RabbitMQ on port 51402 | +| Prospero | Receives logs via Loki for observability | + +## Ansible Deployment + +### Playbook + +```bash +cd ansible +ansible-playbook rabbitmq/deploy.yml +``` + +### Files + +| File | Purpose | +|------|---------| +| `rabbitmq/deploy.yml` | Main deployment playbook | +| `rabbitmq/docker-compose.yml.j2` | Docker Compose template | + +### Deployment Steps + +The playbook performs the following operations: + +1. **User and Group Management** + - Creates `rabbitmq` system user and group + - Adds `ponos` user to `rabbitmq` group for operational access + +2. **Directory Setup** + - Creates service directory at `/srv/rabbitmq` + - Sets ownership to `rabbitmq:rabbitmq` + - Configures permissions (mode 750) + +3. **Docker Compose Deployment** + - Templates `docker-compose.yml` from Jinja2 template + - Deploys RabbitMQ container with `docker compose up` + +4. **rabbitmqadmin CLI Setup** + - Extracts `rabbitmqadmin` from container to `/usr/local/bin/` + - Makes it executable for host-level management + +5. 
**Automatic Provisioning** (idempotent) + - Creates virtual hosts: `kairos`, `spelunker` + - Creates users with passwords from vault + - Sets user tags (currently none, expandable for admin/monitoring roles) + - Configures full permissions for each user on their respective vhost + +### Variables + +#### Host Variables (`host_vars/oberon.incus.yml`) + +| Variable | Description | Default | +|----------|-------------|---------| +| `rabbitmq_user` | Service user | `rabbitmq` | +| `rabbitmq_group` | Service group | `rabbitmq` | +| `rabbitmq_directory` | Installation directory | `/srv/rabbitmq` | +| `rabbitmq_amqp_port` | AMQP protocol port | `5672` | +| `rabbitmq_management_port` | Management web interface | `25582` | +| `rabbitmq_syslog_port` | Syslog forwarding port to Alloy | `51402` | +| `rabbitmq_password` | Default admin password | `{{ vault_rabbitmq_password }}` | + +#### Group Variables (`group_vars/all/vars.yml`) + +Defines the provisioning configuration for vhosts, users, and permissions: + +```yaml +rabbitmq_vhosts: + - name: kairos + - name: spelunker + +rabbitmq_users: + - name: kairos + password: "{{ kairos_rabbitmq_password }}" + tags: [] + - name: spelunker + password: "{{ spelunker_rabbitmq_password }}" + tags: [] + +rabbitmq_permissions: + - vhost: kairos + user: kairos + configure_priv: .* + read_priv: .* + write_priv: .* + - vhost: spelunker + user: spelunker + configure_priv: .* + read_priv: .* + write_priv: .* +``` + +**Vault Variable Mappings**: +```yaml +kairos_rabbitmq_password: "{{ vault_kairos_rabbitmq_password }}" +spelunker_rabbitmq_password: "{{ vault_spelunker_rabbitmq_password }}" +``` + +#### Vault Variables (`group_vars/all/vault.yml`) + +All sensitive credentials are encrypted in the vault: + +| Variable | Description | +|----------|-------------| +| `vault_rabbitmq_password` | Default admin account password | +| `vault_kairos_rabbitmq_password` | Kairos service user password | +| `vault_spelunker_rabbitmq_password` | Spelunker service user password | + +## Configuration + +### Docker Compose Template + 
+The deployment uses a minimal Docker Compose configuration: + +```yaml +services: + rabbitmq: + image: rabbitmq:3-management-alpine + container_name: rabbitmq + restart: unless-stopped + ports: + - "{{rabbitmq_amqp_port}}:5672" # AMQP protocol + - "{{rabbitmq_management_port}}:15672" # Management UI + volumes: + - rabbitmq_data:/var/lib/rabbitmq # Persistent data + environment: + RABBITMQ_DEFAULT_USER: "{{rabbitmq_user}}" + RABBITMQ_DEFAULT_PASS: "{{rabbitmq_password}}" + logging: + driver: syslog + options: + syslog-address: "tcp://127.0.0.1:{{rabbitmq_syslog_port}}" + syslog-format: "{{syslog_format}}" + tag: "rabbitmq" +``` + +### Data Persistence + +- **Volume**: `rabbitmq_data` (Docker-managed volume) +- **Location**: `/var/lib/rabbitmq` inside container +- **Contents**: + - Message queues and persistent messages + - Virtual host metadata + - User credentials and permissions + - Configuration overrides + +## Virtual Hosts and Users + +### Default Admin Account + +**Username**: `rabbitmq` +**Password**: `{{ vault_rabbitmq_password }}` (from vault) +**Privileges**: Full administrative access to all virtual hosts + +The default admin account is created automatically when the container starts and can access: +- All virtual hosts (including `/`, `kairos`, `spelunker`) +- Management web interface +- All RabbitMQ management commands + +### Kairos Virtual Host + +**VHost**: `kairos` +**User**: `kairos` +**Password**: `{{ vault_kairos_rabbitmq_password }}` +**Permissions**: Full (configure, read, write) on all resources matching `.*` + +Intended for the **Kairos** service (event-driven time-series processing system, planned future deployment). + +### Spelunker Virtual Host + +**VHost**: `spelunker` +**User**: `spelunker` +**Password**: `{{ vault_spelunker_rabbitmq_password }}` +**Permissions**: Full (configure, read, write) on all resources matching `.*` + +Intended for the **Spelunker** service (log exploration and analytics platform, planned future deployment). 
+ +### Permission Model + +Both service users have full access within their respective virtual hosts: + +| Permission | Pattern | Description | +|------------|---------|-------------| +| Configure | `.*` | Create/delete queues, exchanges, bindings | +| Write | `.*` | Publish messages to exchanges | +| Read | `.*` | Consume messages from queues | + +This isolation ensures: +- ✔ Each service operates in its own namespace +- ✔ Messages cannot cross between services +- ✔ Resource limits can be applied per-vhost +- ✔ Service credentials can be rotated independently + +## Access and Administration + +### Management Web Interface + +**URL**: `http://oberon.incus:25582` +**External**: `http://{oberon-ip}:25582` +**Login**: `rabbitmq` / `{{ vault_rabbitmq_password }}` + +Features: +- Queue inspection and message browsing +- Exchange and binding management +- Connection and channel monitoring +- User and permission administration +- Virtual host management +- Performance metrics and charts + +### CLI Administration + +#### On Host Machine (using rabbitmqadmin) + +```bash +# List vhosts +rabbitmqadmin -H oberon.incus -P 25582 -u rabbitmq -p PASSWORD list vhosts + +# List queues in a vhost +rabbitmqadmin -H oberon.incus -P 25582 -u rabbitmq -p PASSWORD -V kairos list queues + +# Publish a test message +rabbitmqadmin -H oberon.incus -P 25582 -u rabbitmq -p PASSWORD -V kairos publish \ + exchange=amq.default routing_key=test payload="test message" +``` + +#### Inside Container + +```bash +# Enter the container +docker exec -it rabbitmq /bin/sh + +# List vhosts +rabbitmqctl list_vhosts + +# List users +rabbitmqctl list_users + +# List permissions for a user +rabbitmqctl list_user_permissions kairos + +# List queues in a vhost +rabbitmqctl list_queues -p kairos + +# Check node status +rabbitmqctl status +``` + +### Connection Strings + +#### AMQP Connection (from other containers on Oberon) + +``` +amqp://kairos:PASSWORD@localhost:5672/kairos 
+amqp://spelunker:PASSWORD@localhost:5672/spelunker +``` + +#### AMQP Connection (from other hosts) + +``` +amqp://kairos:PASSWORD@oberon.incus:5672/kairos +amqp://spelunker:PASSWORD@oberon.incus:5672/spelunker +``` + +#### Management API + +``` +http://rabbitmq:PASSWORD@oberon.incus:25582/api/ +``` + +## Monitoring and Observability + +### Logging + +- **Driver**: syslog (Docker logging driver) +- **Destination**: `tcp://127.0.0.1:51402` (Alloy on Oberon) +- **Tag**: `rabbitmq` +- **Format**: `{{ syslog_format }}` (from Alloy configuration) + +Logs are collected by Alloy and forwarded to Loki on Prospero for centralized log aggregation. + +### Key Metrics (via Management UI) + +| Metric | Description | +|--------|-------------| +| Connections | Active AMQP client connections | +| Channels | Active channels within connections | +| Queues | Total queues across all vhosts | +| Messages | Ready, unacknowledged, and total message counts | +| Message Rate | Publish/deliver rates (msg/s) | +| Memory Usage | Container memory consumption | +| Disk Usage | Persistent storage utilization | + +### Health Check + +```bash +# Check if RabbitMQ is running +docker ps | grep rabbitmq + +# Check container logs +docker logs rabbitmq + +# Check RabbitMQ node status +docker exec rabbitmq rabbitmqctl status + +# Check cluster health (single-node, should show 1 node) +docker exec rabbitmq rabbitmqctl cluster_status +``` + +## Operational Tasks + +### Restart RabbitMQ + +```bash +# Via Docker Compose +cd /srv/rabbitmq +sudo -u rabbitmq docker compose restart + +# Via Docker directly +docker restart rabbitmq +``` + +### Recreate Container (preserves data) + +```bash +cd /srv/rabbitmq +sudo -u rabbitmq docker compose down +sudo -u rabbitmq docker compose up -d +``` + +### Add New Virtual Host and User + +1. 
Update `group_vars/all/vars.yml`: + ```yaml + rabbitmq_vhosts: + - name: newservice + + rabbitmq_users: + - name: newservice + password: "{{ newservice_rabbitmq_password }}" + tags: [] + + rabbitmq_permissions: + - vhost: newservice + user: newservice + configure_priv: .* + read_priv: .* + write_priv: .* + + # Add mapping + newservice_rabbitmq_password: "{{ vault_newservice_rabbitmq_password }}" + ``` + +2. Add password to `group_vars/all/vault.yml`: + ```bash + ansible-vault edit inventory/group_vars/all/vault.yml + # Add: vault_newservice_rabbitmq_password: "secure_password" + ``` + +3. Run the playbook: + ```bash + ansible-playbook rabbitmq/deploy.yml + ``` + +The provisioning tasks are idempotent—existing vhosts and users are skipped, only new ones are created. + +### Rotate User Password + +```bash +# Inside container +docker exec rabbitmq rabbitmqctl change_password kairos "new_password" + +# Update vault +ansible-vault edit inventory/group_vars/all/vault.yml +# Update vault_kairos_rabbitmq_password +``` + +### Clear All Messages in a Queue + +```bash +docker exec rabbitmq rabbitmqctl purge_queue queue_name -p kairos +``` + +## Troubleshooting + +### Container Won't Start + +Check Docker logs for errors: +```bash +docker logs rabbitmq +``` + +Common issues: +- Port conflict on 5672 or 25582 +- Permission issues on `/srv/rabbitmq` directory +- Corrupted data volume + +### Cannot Connect to Management UI + +1. Verify port mapping: `docker port rabbitmq` +2. Check firewall rules on Oberon +3. Verify container is running: `docker ps | grep rabbitmq` +4. 
Check if management plugin is enabled (should be in `-management-alpine` image) + +### User Authentication Failing + +```bash +# List users and verify they exist +docker exec rabbitmq rabbitmqctl list_users + +# Check user permissions +docker exec rabbitmq rabbitmqctl list_user_permissions kairos + +# Verify vhost exists +docker exec rabbitmq rabbitmqctl list_vhosts +``` + +### High Memory Usage + +RabbitMQ may consume significant memory with many messages. Check: +```bash +# Memory usage +docker exec rabbitmq rabbitmqctl status | grep memory + +# Queue depths +docker exec rabbitmq rabbitmqctl list_queues -p kairos messages + +# Consider setting memory limits in docker-compose.yml +``` + +## Security Considerations + +### Network Isolation + +- RabbitMQ AMQP port (5672) is **only** exposed on the Incus network (`10.10.0.0/16`) +- Management UI (25582) is exposed externally for administration +- For production: Place HAProxy in front of management UI with authentication +- Consider enabling SSL/TLS for AMQP connections in production + +### Credential Management + +- ✔ All passwords stored in Ansible Vault +- ✔ Service accounts have isolated virtual hosts +- ✔ Default admin account uses strong password from vault +- ⚠️ Credentials passed as environment variables (visible in `docker inspect`) +- Consider using Docker secrets or Vault integration for enhanced security + +### Virtual Host Isolation + +Each service operates in its own virtual host: +- Messages cannot cross between vhosts +- Resource quotas can be applied per-vhost +- Credentials can be rotated without affecting other services + +## Future Enhancements + +- [ ] **SSL/TLS Support**: Enable encrypted AMQP connections +- [ ] **Cluster Mode**: Add additional RabbitMQ nodes for high availability +- [ ] **Federation**: Connect to external RabbitMQ clusters +- [ ] **Prometheus Exporter**: Add metrics export for Grafana monitoring +- [ ] **Shovel Plugin**: Configure message forwarding between brokers +- [ ] 
**HAProxy Integration**: Reverse proxy for management UI with authentication +- [ ] **Docker Secrets**: Replace environment variables with Docker secrets + +## References + +- [RabbitMQ Official Documentation](https://www.rabbitmq.com/documentation.html) +- [RabbitMQ Management Plugin](https://www.rabbitmq.com/management.html) +- [AMQP 0-9-1 Protocol Reference](https://www.rabbitmq.com/amqp-0-9-1-reference.html) +- [Virtual Hosts](https://www.rabbitmq.com/vhosts.html) +- [Access Control (Authentication, Authorisation)](https://www.rabbitmq.com/access-control.html) +- [Monitoring RabbitMQ](https://www.rabbitmq.com/monitoring.html) + +--- + +**Last Updated**: February 12, 2026 +**Project**: Ouranos Infrastructure +**Approval**: Red Panda Approved™ diff --git a/docs/red_panda_standards.md b/docs/red_panda_standards.md new file mode 100644 index 0000000..e6ede19 --- /dev/null +++ b/docs/red_panda_standards.md @@ -0,0 +1,148 @@ +# Red Panda Approval™ Standards + +Quality and observability standards for the Ouranos Lab. All infrastructure code, application code, and LLM-generated code deployed into this environment must meet these standards. + +--- + +## 🐾 Red Panda Approval™ + +All implementations must meet the 5 Sacred Criteria: + +1. **Fresh Environment Test** — Clean runs on new systems without drift. No leftover state, no manual steps. +2. **Elegant Simplicity** — Modular, reusable, no copy-paste sprawl. One playbook per concern. +3. **Observable & Auditable** — Clear task names, proper logging, check mode compatible. You can see what happened. +4. **Idempotent Patterns** — Run multiple times with consistent results. No side effects on re-runs. +5. **Actually Provisions & Configures** — Resources work, dependencies resolve, services integrate. It does the thing. + +--- + +## Vault Security + +All sensitive information is encrypted using Ansible Vault with AES256 encryption. 
+ +**Encrypted secrets:** +- Database passwords (PostgreSQL, Neo4j) +- API keys (OpenAI, Anthropic, Mistral, Groq) +- Application secrets (Grafana, SearXNG, Arke) +- Monitoring alerts (Pushover integration) + +**Security rules:** +- AES256 encryption with `ansible-vault` +- Password file for automation — never pass `--vault-password-file` inline in scripts +- Vault variables use the `vault_` prefix; map to friendly names in `group_vars/all/vars.yml` +- No secrets in plain text files, ever + +--- + +## Log Level Standards + +All services in the Ouranos Lab MUST follow these log level conventions. These rules apply to application code, infrastructure services, and any LLM-generated code deployed into this environment. Log output flows through Alloy → Loki → Grafana, so disciplined leveling is not cosmetic — it directly determines alert quality, dashboard usefulness, and on-call signal-to-noise ratio. + +### Level Definitions + +| Level | When to Use | What MUST Be Included | Loki / Grafana Role | +|-------|-------------|----------------------|---------------------| +| **ERROR** | Something is broken and requires human intervention. The service cannot fulfil the current request or operation. | Exception class, message, stack trace, and relevant context (request ID, user, resource identifier). Never a bare `"something failed"`. | AlertManager rules fire on `level=~"error\|fatal\|critical"`. These trigger Pushover notifications. | +| **WARNING** | Degraded but self-recovering: retries succeeding, fallback paths taken, thresholds approaching, deprecated features invoked. | What degraded, what recovery action was taken, current metric value vs. threshold. | Grafana dashboard panels. Rate-based alerting (e.g., >N warnings/min). | +| **INFO** | Significant lifecycle and business events: service start/stop, configuration loaded, deployment markers, user authentication, job completion, schema migrations. | The event and its outcome. 
This level tells the *story* of what the system did. | Default production visibility. The go-to level for post-incident timelines. | +| **DEBUG** | Diagnostic detail for active troubleshooting: request/response payloads, SQL queries, internal state, variable values. | **Actionable context is mandatory.** A DEBUG line with no detail is worse than no line at all. Include variable values, object states, or decision paths. | Never enabled in production by default. Used on-demand via per-service level override. | + +### Anti-Patterns + +These are explicit violations of Ouranos logging standards: + +| ❌ Anti-Pattern | Why It's Wrong | ✅ Correct Approach | +|----------------|---------------|-------------------| +| Health checks logged at INFO (`GET /health → 200 OK`) | Routine HAProxy/Prometheus probes flood syslog with thousands of identical lines per hour, burying real events. | Suppress health endpoints from access logs entirely, or demote to DEBUG. | +| DEBUG with no context (`logger.debug("error occurred")`) | Provides zero diagnostic value. If DEBUG is noisy *and* useless, nobody will ever enable it. | `logger.debug("PaymentService.process failed: order_id=%s, provider=%s, response=%r", oid, provider, resp)` | +| ERROR without exception details (`logger.error("task failed")`) | Cannot be triaged without reproduction steps. Wastes on-call time. | `logger.error("Celery task invoice_gen failed: order_id=%s", oid, exc_info=True)` | +| Logging sensitive data at any level | Passwords, tokens, API keys, and PII in Loki are a security incident. | Mask or redact: `api_key=sk-...a3f2`, `password=*****`. | +| Inconsistent level casing | Breaks LogQL filters and Grafana label selectors. | **Python / Django**: UPPERCASE (`INFO`, `WARNING`, `ERROR`, `DEBUG`). **Go / infrastructure** (HAProxy, Alloy, Gitea): lowercase (`info`, `warn`, `error`, `debug`). | +| Logging expected conditions as ERROR | A user entering a wrong password is not an error — it is normal business logic. 
| Use WARNING or INFO for expected-but-notable conditions. Reserve ERROR for things that are actually broken. | + +### Health Check Rule + +> All services exposed through HAProxy MUST suppress or demote health check endpoints (`/health`, `/healthz`, `/api/health`, `/metrics`, `/ping`) to DEBUG or below. Health check success is the *absence* of errors, not the presence of 200s. If your syslog shows a successful health probe, your log level is wrong. + +**Implementation guidance:** +- **Django / Gunicorn**: Filter health paths in the access log handler or use middleware that skips logging for probe user-agents. +- **Docker services**: Configure the application's internal logging to exclude health routes — the syslog driver forwards everything it receives. +- **HAProxy**: HAProxy's own health check logs (`option httpchk`) should remain at the HAProxy level for connection debugging, but backend application responses to those probes must not surface at INFO. + +### Background Worker & Queue Monitoring + +> **The most dangerous failure is the one that produces no logs.** + +When a background worker (Celery task consumer, RabbitMQ subscriber, Gitea Runner, cron job) fails to start or crashes on startup, it generates no ongoing log output. Error-rate dashboards stay green because there is no process running to produce errors. Meanwhile, queues grow unbounded and work silently stops being processed. + +**Required practices:** + +1. **Heartbeat logging** — Every long-running background worker MUST emit a periodic INFO-level heartbeat (e.g., `"worker alive, processed N jobs in last 5m, queue depth: M"`). The *absence* of this heartbeat is the alertable condition. + +2. **Startup and shutdown at INFO** — Worker start, ready, graceful shutdown, and crash-exit are significant lifecycle events. These MUST log at INFO. + +3. **Queue depth as a metric** — RabbitMQ queue depths and any application-level task queues MUST be exposed as Prometheus metrics. 
A growing queue with zero consumer activity is an **ERROR**-level alert, not a warning. + +4. **Grafana "last seen" alerts** — For every background worker, configure a Grafana alert using `absent_over_time()` or equivalent staleness detection: *"Worker X has not logged a heartbeat in >10 minutes"* → ERROR severity → Pushover notification. + +5. **Crash-on-start is ERROR** — If a worker exits within seconds of starting (missing config, failed DB connection, import error), the exit MUST be captured at ERROR level by the service manager (`systemd OnFailure=`, Docker restart policy logs). Do not rely on the crashing application to log its own death — it may never get the chance. + +### Production Defaults + +| Service Category | Default Level | Rationale | +|-----------------|---------------|-----------| +| Django apps (Angelia, Athena, Kairos, Icarlos, Spelunker, Peitho, MCP Switchboard) | `WARNING` | Business logic — only degraded or broken conditions surface. Lifecycle events (start/stop/deploy) still log at INFO via Gunicorn and systemd. | +| Gunicorn access logs | Suppress 2xx/3xx health probes | Routine request logging deferred to HAProxy access logs in Loki. | +| Infrastructure agents (Alloy, Prometheus, Node Exporter) | `warn` | Stable — do not change without cause. | +| HAProxy (Titania) | `warning` | Connection-level logging handled by HAProxy's own log format → Alloy → Loki. | +| Databases (PostgreSQL, Neo4j) | `warning` | Query-level logging only enabled for active troubleshooting. | +| Docker services (Gitea, LobeChat, Nextcloud, AnythingLLM, SearXNG) | `warn` / `warning` | Per-service default. Tune individually if needed. | +| LLM Proxy (Arke) | `info` | Token usage tracking and provider routing decisions justify INFO. Review periodically for noise. | +| Observability stack (Grafana, Loki, AlertManager) | `warn` | Should be quiet unless something is wrong with observability itself. 
| + +### Loki & Grafana Alignment + +**Label normalization**: Alloy pipelines (syslog listeners and journal relabeling) MUST extract and forward a `level` label on every log line. Without a `level` label, the log entry is invisible to level-based dashboard filters and alert rules. + +**LogQL conventions for dashboards:** +```logql +# Production error monitoring (default dashboard view) +{job="syslog", hostname="puck"} | json | level=~"error|fatal|critical" + +# Warning-and-above for a specific service +{service_name="haproxy"} | logfmt | level=~"warn|error|fatal" + +# Debug-level troubleshooting (temporary, never permanent dashboards) +{container="angelia"} | json | level="debug" +``` + +**Alerting rules** — Grafana alert rules MUST key off the normalized `level` label: +- `level=~"error|fatal|critical"` → Immediate Pushover notification via AlertManager +- `absent_over_time({service_name="celery_worker"}[10m])` → Worker heartbeat staleness → ERROR severity +- Rate-based: `rate({service_name="arke"} | json | level="error" [5m]) > 0.1` → Sustained error rate + +**Retention alignment**: Loki retention policies should preserve ERROR and WARNING logs longer than DEBUG. DEBUG-level logs generated during troubleshooting sessions should have a short TTL or be explicitly cleaned up. + +--- + +## Documentation Standards + +Place documentation in the `/docs/` directory of the repository. + +### HTML Documents + +HTML documents must follow [docs/documentation_style_guide.html](documentation_style_guide.html). 
+ +- Use Bootstrap CDN with Bootswatch theme **Flatly** +- Include a dark mode toggle button in the navbar +- Use Bootstrap Icons for icons +- Use Bootstrap CSS for styles — avoid custom CSS +- Use **Mermaid** for diagrams + +### Markdown Documents + +Only these status symbols are approved: +- ✔ Success/Complete +- ❌ Error/Failed +- ⚠️ Warning/Caution +- ℹ️ Information/Note \ No newline at end of file diff --git a/docs/searxng-auth.md b/docs/searxng-auth.md new file mode 100644 index 0000000..dc92e37 --- /dev/null +++ b/docs/searxng-auth.md @@ -0,0 +1,253 @@ +# SearXNG Authentication Design Document +# Red Panda Approved + +## Overview + +This document describes the design for adding Casdoor-based authentication to SearXNG, +which doesn't natively support SSO/OIDC authentication. + +## Architecture + +``` +┌──────────────┐ ┌───────────────┐ ┌─────────────────────────────────────┐ +│ Browser │────▶│ HAProxy │────▶│ Oberon │ +│ │ │ (titania) │ │ ┌────────────────┐ ┌───────────┐ │ +└──────────────┘ └───────┬───────┘ │ │ OAuth2-Proxy │─▶│ SearXNG │ │ + │ │ │ (port 22073) │ │ (22083) │ │ + │ │ └───────┬────────┘ └───────────┘ │ + │ └──────────┼─────────────────────────┘ + │ │ OIDC + │ ┌──────────────────▼────────────────┐ + └────▶│ Casdoor │ + │ (OIDC Provider - titania) │ + └───────────────────────────────────┘ +``` + +The OAuth2-Proxy runs as a **native binary sidecar** on Oberon alongside SearXNG, +following the same pattern used for JupyterLab on Puck. The upstream connection is +`localhost` — eliminating the cross-host hop from the previous Docker-based deployment +on Titania. + +> ℹ️ Each host supports at most one OAuth2-Proxy sidecar instance. The binary is +> shared at `/usr/local/bin/oauth2-proxy`; each service gets a unique config directory +> and systemd unit name. + +## Components + +### 1. 
OAuth2-Proxy (Sidecar on Oberon) +- **Purpose**: Acts as authentication gateway for SearXNG +- **Port**: 22073 (exposed to HAProxy) +- **Binary**: Native `oauth2-proxy` v7.6.0 (systemd service `oauth2-proxy-searxng`) +- **Config**: `/etc/oauth2-proxy-searxng/oauth2-proxy.cfg` +- **Upstream**: `http://127.0.0.1:22083` (localhost sidecar to SearXNG) +- **Logging**: systemd journal (`SyslogIdentifier=oauth2-proxy-searxng`) + +### 2. Casdoor (Existing on Titania) +- **Purpose**: OIDC Identity Provider +- **Port**: 22081 +- **URL**: https://id.ouranos.helu.ca/ (via HAProxy) +- **Required Setup**: + - Create Application for SearXNG + - Configure redirect URI + - Generate client credentials + +### 3. HAProxy Updates (Titania) +- Route `searxng.ouranos.helu.ca` to OAuth2-Proxy on Oberon (`oberon.incus:22073`) +- OAuth2-Proxy handles authentication before proxying to SearXNG on localhost + +### 4. SearXNG (Existing on Oberon) +- **No changes required** - remains unaware of authentication +- Receives pre-authenticated requests from OAuth2-Proxy + +## Authentication Flow + +1. User navigates to `https://searxng.ouranos.helu.ca/` +2. HAProxy routes to OAuth2-Proxy on oberon:22073 +3. OAuth2-Proxy checks for valid session cookie (`_oauth2_proxy_searxng`) +4. **If no valid session**: + - Redirect to Casdoor login: `https://id.ouranos.helu.ca/login/oauth/authorize` + - User authenticates with Casdoor (username/password, social login, etc.) + - Casdoor redirects back with authorization code + - OAuth2-Proxy exchanges code for tokens + - OAuth2-Proxy sets session cookie +5. **If valid session**: + - OAuth2-Proxy adds `X-Forwarded-User` header + - Request proxied to SearXNG at `127.0.0.1:22083` (localhost sidecar) + +## Casdoor Configuration + +### Application Setup (Manual via Casdoor UI) + +1. Login to Casdoor at https://id.ouranos.helu.ca/ +2. Navigate to Applications → Add +3. 
Configure: + - **Name**: `searxng` + - **Display Name**: `SearXNG Search` + - **Organization**: `built-in` (or your organization) + - **Redirect URLs**: + - `https://searxng.ouranos.helu.ca/oauth2/callback` + - **Grant Types**: `authorization_code`, `refresh_token` + - **Response Types**: `code` +4. Save and note the `Client ID` and `Client Secret` + +### Cookie Secret Generation + +Generate a 32-byte random secret for OAuth2-Proxy cookies: + +```bash +openssl rand -base64 32 +``` + +## Environment Variables + +### Development (Sandbox) +```yaml +# In inventory/host_vars/oberon.incus.yml +searxng_oauth2_proxy_dir: /etc/oauth2-proxy-searxng +searxng_oauth2_proxy_version: "7.6.0" +searxng_proxy_port: 22073 +searxng_domain: "ouranos.helu.ca" +searxng_oauth2_oidc_issuer_url: "https://id.ouranos.helu.ca" +searxng_oauth2_redirect_url: "https://searxng.ouranos.helu.ca/oauth2/callback" + +# OAuth2 Credentials (from vault) +searxng_oauth2_client_id: "{{ vault_searxng_oauth2_client_id }}" +searxng_oauth2_client_secret: "{{ vault_searxng_oauth2_client_secret }}" +searxng_oauth2_cookie_secret: "{{ vault_searxng_oauth2_cookie_secret }}" +``` + +> ℹ️ Variables use the `searxng_` prefix, following the same naming pattern as +> `jupyterlab_oauth2_*` variables on Puck. The upstream URL (`http://127.0.0.1:22083`) +> is derived from `searxng_port` in the config template — no cross-host URL needed. + +## Deployment Steps + +### 1. Add Vault Secrets +```bash +ansible-vault edit inventory/group_vars/all/vault.yml +``` + +Add: +```yaml +vault_searxng_oauth2_client_id: "" +vault_searxng_oauth2_client_secret: "" +vault_searxng_oauth2_cookie_secret: "" +``` + +Note: The `searxng_` prefix allows service-specific credentials. The Oberon host_vars +maps these directly to `searxng_oauth2_*` variables used by the sidecar config template. + +### 2. Update Host Variables +OAuth2-Proxy variables are defined in `inventory/host_vars/oberon.incus.yml` alongside +the existing SearXNG configuration. 
No separate service entry is needed — the OAuth2-Proxy +sidecar is deployed as part of the `searxng` service. + +```yaml +# SearXNG OAuth2-Proxy Sidecar (in oberon.incus.yml) +searxng_oauth2_proxy_dir: /etc/oauth2-proxy-searxng +searxng_oauth2_proxy_version: "7.6.0" +searxng_proxy_port: 22073 +searxng_domain: "ouranos.helu.ca" +searxng_oauth2_oidc_issuer_url: "https://id.ouranos.helu.ca" +searxng_oauth2_redirect_url: "https://searxng.ouranos.helu.ca/oauth2/callback" +``` + +### 3. Update HAProxy Backend +Route SearXNG traffic through OAuth2-Proxy on Oberon: +```yaml +# In inventory/host_vars/titania.incus.yml +haproxy_backends: + - subdomain: "searxng" + backend_host: "oberon.incus" # Same host as SearXNG + backend_port: 22073 # OAuth2-Proxy port + health_path: "/ping" # OAuth2-Proxy health endpoint +``` + +### 4. Deploy +```bash +cd ansible + +# Deploy SearXNG + OAuth2-Proxy sidecar +ansible-playbook searxng/deploy.yml + +# Update HAProxy configuration +ansible-playbook haproxy/deploy.yml +``` + +## Monitoring + +### Logs +OAuth2-Proxy logs to systemd journal on Oberon. Alloy's default `systemd_logs` +source captures these logs automatically, filterable by `SyslogIdentifier=oauth2-proxy-searxng`. + +```bash +# View logs on Oberon +ssh oberon.incus +journalctl -u oauth2-proxy-searxng -f +``` + +### Metrics +OAuth2-Proxy exposes Prometheus metrics at `/metrics` on port 22073: +- `oauth2_proxy_requests_total` - Total requests +- `oauth2_proxy_errors_total` - Error count +- `oauth2_proxy_upstream_latency_seconds` - Upstream latency + +## Security Considerations + +1. **Cookie Security**: + - `cookie_secure = true` enforces HTTPS-only cookies + - `cookie_httponly = true` prevents JavaScript access + - `cookie_samesite = "lax"` provides CSRF protection + +2. **Email Domain Restriction**: + - Configure `oauth2_proxy_email_domains` to limit who can access + - Example: `["yourdomain.com"]` or `["*"]` for any + +3. 
**Group-Based Access**: + - Optional: Configure `oauth2_proxy_allowed_groups` in Casdoor + - Only users in specified groups can access SearXNG + +## Troubleshooting + +### Check OAuth2-Proxy Status +```bash +ssh oberon.incus +systemctl status oauth2-proxy-searxng +journalctl -u oauth2-proxy-searxng --no-pager -n 50 +``` + +### Test OIDC Discovery +```bash +curl https://id.ouranos.helu.ca/.well-known/openid-configuration +``` + +### Test Health Endpoint +```bash +curl http://oberon.incus:22073/ping +``` + +### Verify Cookie Domain +Ensure the cookie domain (`.ouranos.helu.ca`) matches your HAProxy domain. +Cookies won't work across different domains. + +## Files + +| File | Purpose | +|------|---------| +| `ansible/searxng/deploy.yml` | SearXNG + OAuth2-Proxy sidecar deployment | +| `ansible/searxng/oauth2-proxy-searxng.cfg.j2` | OAuth2-Proxy OIDC configuration | +| `ansible/searxng/oauth2-proxy-searxng.service.j2` | Systemd unit for OAuth2-Proxy | +| `ansible/inventory/host_vars/oberon.incus.yml` | Host variables (`searxng_oauth2_*`) | +| `docs/searxng-auth.md` | This design document | + +### Generic OAuth2-Proxy Module (Retained) + +The standalone `ansible/oauth2_proxy/` directory is retained as a generic, reusable +Docker-based OAuth2-Proxy module for future services: + +| File | Purpose | +|------|---------| +| `ansible/oauth2_proxy/deploy.yml` | Generic Docker Compose deployment | +| `ansible/oauth2_proxy/docker-compose.yml.j2` | Docker Compose template | +| `ansible/oauth2_proxy/oauth2-proxy.cfg.j2` | Generic OIDC configuration template | +| `ansible/oauth2_proxy/stage.yml` | Validation / dry-run playbook | diff --git a/docs/smtp4dev.md b/docs/smtp4dev.md new file mode 100644 index 0000000..191793b --- /dev/null +++ b/docs/smtp4dev.md @@ -0,0 +1,191 @@ +# smtp4dev - Development SMTP Server + +## Overview + +smtp4dev is a fake SMTP server for development and testing. 
It accepts all incoming email without delivering it, capturing messages for inspection via a web UI and IMAP client. All services in the Agathos sandbox that send email (Casdoor, Gitea, etc.) are wired to smtp4dev so email flows can be tested without a real mail server. + +**Host:** Oberon (container_orchestration) +**Web UI Port:** 22085 → `https://smtp4dev.ouranos.helu.ca` +**SMTP Port:** 22025 (used by all services as `smtp_host:smtp_port`) +**IMAP Port:** 22045 +**Syslog Port:** 51405 (Alloy) + +## Architecture + +``` +┌─────────────────────────────────────────────────────────┐ +│ Oberon Host │ +│ │ +│ ┌──────────────────────────────────────────────────┐ │ +│ │ smtp4dev Container (Docker) │ │ +│ │ │ │ +│ │ Port 80 → host 22085 (Web UI) │ │ +│ │ Port 25 → host 22025 (SMTP) │ │ +│ │ Port 143 → host 22045 (IMAP) │ │ +│ │ │ │ +│ │ Volume: smtp4dev_data → /smtp4dev │ │ +│ │ Logs: syslog → Alloy:51405 → Loki │ │ +│ └──────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────┘ + ▲ ▲ + │ SMTP :22025 │ SMTP :22025 + ┌──────┴──────┐ ┌──────┴──────┐ + │ Casdoor │ │ Gitea │ + │ (Titania) │ │ (Rosalind) │ + └─────────────┘ └─────────────┘ + +External access: + https://smtp4dev.ouranos.helu.ca → HAProxy (Titania) → oberon.incus:22085 +``` + +## Shared SMTP Variables + +smtp4dev connection details are defined once in `ansible/inventory/group_vars/all/vars.yml` and consumed by all service templates: + +| Variable | Value | Purpose | +|----------|-------|---------| +| `smtp_host` | `oberon.incus` | SMTP server hostname | +| `smtp_port` | `22025` | SMTP server port | +| `smtp_from` | `noreply@ouranos.helu.ca` | Default sender address | +| `smtp_from_name` | `Agathos` | Default sender display name | + +Any service that needs to send email references these shared variables rather than defining its own SMTP config. This means switching to a real SMTP server only requires changing `group_vars/all/vars.yml`. 
+ +## Ansible Deployment + +### Playbook + +```bash +# Deploy smtp4dev on Oberon +ansible-playbook smtp4dev/deploy.yml + +# Redeploy HAProxy to activate the smtp4dev.ouranos.helu.ca backend +ansible-playbook haproxy/deploy.yml +``` + +### Files + +| File | Purpose | +|------|---------| +| `ansible/smtp4dev/deploy.yml` | Main deployment playbook | +| `ansible/smtp4dev/docker-compose.yml.j2` | Docker Compose template | + +### Deployment Steps + +The `deploy.yml` playbook: + +1. Filters hosts — only runs on hosts with `smtp4dev` in their `services` list (Oberon) +2. Creates `smtp4dev` system group and user +3. Adds `ponos` user to the `smtp4dev` group (for `docker compose` access) +4. Creates `/srv/smtp4dev` directory owned by `smtp4dev:smtp4dev` +5. Templates `docker-compose.yml` into `/srv/smtp4dev/` +6. Resets SSH connection to apply group membership +7. Starts the service with `community.docker.docker_compose_v2: state: present` + +### Host Variables + +Defined in `ansible/inventory/host_vars/oberon.incus.yml`: + +```yaml +# smtp4dev Configuration +smtp4dev_user: smtp4dev +smtp4dev_group: smtp4dev +smtp4dev_directory: /srv/smtp4dev +smtp4dev_port: 22085 # Web UI (container port 80) +smtp4dev_smtp_port: 22025 # SMTP (container port 25) +smtp4dev_imap_port: 22045 # IMAP (container port 143) +smtp4dev_syslog_port: 51405 # Alloy syslog collector +``` + +## Service Integrations + +### Casdoor + +The Casdoor email provider is declared in `ansible/casdoor/init_data.json.j2` and seeded automatically on a **fresh** Casdoor deployment: + +```json +{ + "owner": "admin", + "name": "provider-email-smtp4dev", + "displayName": "smtp4dev Email", + "category": "Email", + "type": "SMTP", + "host": "oberon.incus", + "port": 22025, + "disableSsl": true, + "fromAddress": "noreply@ouranos.helu.ca", + "fromName": "Agathos" +} +``` + +> ⚠️ For **existing** Casdoor installs, create the provider manually: +> 1. Log in to `https://id.ouranos.helu.ca` as admin +> 2. 
Navigate to **Identity → Providers → Add** +> 3. Set **Category**: `Email`, **Type**: `SMTP` +> 4. Fill host `oberon.incus`, port `22025`, disable SSL, from `noreply@ouranos.helu.ca` +> 5. Save and assign the provider to the `heluca` organization under **Organizations → heluca → Edit → Default email provider** + +### Gitea + +Configured directly in `ansible/gitea/app.ini.j2`: + +```ini +[mailer] +ENABLED = true +SMTP_ADDR = {{ smtp_host }} +SMTP_PORT = {{ smtp_port }} +FROM = {{ smtp_from }} +``` + +Redeploy Gitea to apply: + +```bash +ansible-playbook gitea/deploy.yml +``` + +## External Access + +smtp4dev's web UI is exposed via HAProxy on Titania at `https://smtp4dev.ouranos.helu.ca`. + +Backend entry in `ansible/inventory/host_vars/titania.incus.yml`: + +```yaml +- subdomain: "smtp4dev" + backend_host: "oberon.incus" + backend_port: 22085 + health_path: "/" +``` + +## Verification + +```bash +# Check container is running +ssh oberon.incus "cd /srv/smtp4dev && docker compose ps" + +# Check logs +ssh oberon.incus "cd /srv/smtp4dev && docker compose logs --tail=50" + +# Test SMTP delivery (sends a test message) +ssh oberon.incus "echo 'Subject: test' | sendmail -S oberon.incus:22025 test@example.com" + +# Check web UI is reachable internally +curl -s -o /dev/null -w "%{http_code}" http://oberon.incus:22085 + +# Check external HTTPS route +curl -sk -o /dev/null -w "%{http_code}" https://smtp4dev.ouranos.helu.ca +``` + +## site.yml Order + +smtp4dev is deployed after Docker (it requires the Docker engine) and before Casdoor (so the SMTP endpoint exists when Casdoor initialises): + +```yaml +- name: Deploy Docker + import_playbook: docker/deploy.yml + +- name: Deploy smtp4dev + import_playbook: smtp4dev/deploy.yml + +- name: Deploy PPLG Stack # ...continues +``` diff --git a/docs/sunwait.txt b/docs/sunwait.txt new file mode 100644 index 0000000..803be28 --- /dev/null +++ b/docs/sunwait.txt @@ -0,0 +1,70 @@ +Calculate sunrise and sunset times for the current or 
targetted day. +The times can be adjusted either for twilight or fixed durations. + +The program can either: wait for sunrise or sunset (function: wait), + or return the time (GMT or local) the event occurs (function: list), + or report the day length and twilight timings (function: report), + or simply report if it is DAY or NIGHT (function: poll). + +You should specify the latitude and longitude of your target location. + + +Usage: sunwait [major options] [minor options] [twilight type] [rise|set] [offset] [latitude] [longitude] + +Major options, either: + poll Returns immediately indicating DAY or NIGHT. See 'program exit codes'. Default. + wait Sleep until specified event occurs. Else exit immediate. + list [X] Report twilight times for next 'X' days (inclusive). Default: 1. + report [date] Generate a report about the days sunrise and sunset timings. Default: the current day + +Minor options, any of: + [no]debug Print extra info and returns in one minute. Default: nodebug. + [no]version Print the version number. Default: noversion. + [no]help Print this help. Default: nohelp. + [no]gmt Print times in GMT or local-time. Default: nogmt. + +Twilight types, either: + daylight Top of sun just below the horizon. Default. + civil Civil Twilight. -6 degrees below horizon. + nautical Nautical twilight. -12 degrees below horizon. + astronomical Astronomical twilight. -18 degrees below horizon. + angle [X.XX] User-specified twilight-angle (degrees). Default: 0. + +Sunrise/sunset. Only useful with major-options: 'wait' and 'list'. Any of: (default: both) + rise Wait for the sun to rise past specified twilight & offset. + set Wait for the sun to set past specified twilight & offset. + +Offset: + offset [MM|HH:MM] Time interval (+ve towards noon) to adjust twilight calculation. + +Target date. Only useful with major-options: 'report' or 'list'. Default: today + d [DD] Set the target Day-of-Month to calculate for. 1 to 31. + m [MM] Set the target Month to calculate for. 
1 to 12. + y [YYYY] Set the target Year to calculate for. 2000 to 2099. + +latitude/longitude coordinates: floating-point degrees, with [NESW] appended. Default: Bingham, England. + +Exit (return) codes: + 0 OK: exit from 'wait' or 'list' only. + 1 Error. + 2 Exit from 'poll': it is DAY or twilight. + 3 Exit from 'poll': it is NIGHT (after twilight). + +Example 1: sunwait wait rise offset -1:15:10 51.477932N 0.000000E +Wait until 1 hour 15 minutes 10 secs before the sun rises in Greenwich, London. + +Example 2: sunwait list 7 civil 55.752163N 37.617524E +List civil sunrise and sunset times for today and next 6 days. Moscow. + +Example 3: sunwait poll exit angle 10 54.897786N -1.517536E +Indicate by program exit-code if is Day or Night using a custom twilight angle of 10 degrees above horizon. Washington, UK. + +Example 4: sunwait list 7 gmt sunrise angle 3 +List next 7 days sunrise times, custom +3 degree twilight angle, default location. +Uses GMT; as any change in daylight saving over the specified period is not considered. + +Example 5: sunwait report y 20 m 3 d 15 10.49S 105.55E +Produce a report of the different sunrises and sunsets on an arbitrary day (2022/03/15) for an arbitrary location (Christmas Island) + +Note that program uses C library functions to determine time and localtime. +Error for timings are estimated at: +/- 4 minutes. \ No newline at end of file diff --git a/docs/terraform.md b/docs/terraform.md new file mode 100644 index 0000000..f831b2b --- /dev/null +++ b/docs/terraform.md @@ -0,0 +1,296 @@ +# Terraform Practices & Patterns + +This document describes the Terraform design philosophy, patterns, and practices used across our infrastructure. The audience includes LLMs assisting with development, new team members, and existing team members seeking a reference. + +## Design Philosophy + +### Incus-First Infrastructure + +Incus containers form the foundational layer of all environments. 
Management and monitoring infrastructure (Prospero, Titania) must exist before application hosts. This is a **critical dependency** that must be explicitly codified. + +**Why?** Terraform isn't magic. Implicit ordering can lead to race conditions or failed deployments. Always use explicit `depends_on` for critical infrastructure chains. + +```hcl +# Example: Application host depends on monitoring infrastructure +resource "incus_instance" "app_host" { + # ... + depends_on = [incus_instance.uranian_hosts["prospero"]] +} +``` + +### Explicit Dependencies + +Never rely solely on implicit resource ordering for critical infrastructure. Codify dependencies explicitly to: + +- ✔ Prevent race conditions during parallel applies +- ✔ Document architectural relationships in code +- ✔ Ensure consistent deployment ordering across environments + +## Repository Strategy + +### Agathos (Sandbox) + +Agathos is the **Sandbox repository** — isolated, safe for external demos, and uses local state. + +| Aspect | Decision | +|--------|----------| +| Purpose | Evaluation, demos, pattern experimentation, new software testing | +| State | Local (no remote backend) | +| Secrets | No production credentials or references | +| Security | Safe to use on external infrastructure for demos | + +### Production Repository (Separate) + +A separate repository manages Dev, UAT, and Prod environments: + +``` +terraform/ +├── modules/incus_host/ # Reusable container module +├── environments/ +│ ├── dev/ # Local Incus only +│ └── prod/ # OCI + Incus (parameterized via tfvars) +``` + +| Aspect | Decision | +|--------|----------| +| State | PostgreSQL backend on `eris.helu.ca:6432` with SSL | +| Schemas | Separate per environment: `dev`, `uat`, `prod` | +| UAT/Prod | Parameterized twins via `-var-file` | + +## Module Design + +### When to Extract a Module + +A pattern is a good module candidate when it meets these criteria: + +| Criterion | Description | +|-----------|-------------| +| **Reuse** | Pattern 
used across multiple environments (Sandbox, Dev, UAT, Prod) | +| **Stable Interface** | Inputs/outputs won't change frequently | +| **Testable** | Can validate module independently before promotion | +| **Encapsulates Complexity** | Hides `dynamic` blocks, `for_each`, cloud-init generation | + +### When NOT to Extract + +- Single-use patterns +- Tightly coupled to specific environment +- Adds indirection without measurable benefit + +### The `incus_host` Module + +The standard container provisioning pattern extracted from Agathos: + +**Inputs:** +- `hosts` — Map of host definitions (name, role, image, devices, config) +- `project` — Incus project name +- `profile` — Incus profile name +- `cloud_init_template` — Cloud-init configuration template +- `ssh_key_path` — Path to SSH authorized keys +- `depends_on_resources` — Explicit dependencies for infrastructure ordering + +**Outputs:** +- `host_details` — Name, IPv4, role, description for each host +- `inventory` — Documentation reference for DHCP/DNS provisioning + +## Environment Strategy + +### Environment Purposes + +| Environment | Purpose | Infrastructure | +|-------------|---------|----------------| +| **Sandbox** | Evaluation, demos, pattern experimentation | Local Incus only | +| **Dev** | Integration testing, container builds, security testing | Local Incus only | +| **UAT** | User acceptance testing, bug resolution | OCI + Incus (hybrid) | +| **Prod** | Production workloads | OCI + Incus (hybrid) | + +### Parameterized Twins (UAT/Prod) + +UAT and Prod are architecturally identical. 
Use a single environment directory with variable files: + +```bash +# UAT deployment +terraform apply -var-file=uat.tfvars + +# Prod deployment +terraform apply -var-file=prod.tfvars +``` + +Key differences in tfvars: +- Hostnames and DNS domains +- Resource sizing (CPU, memory limits) +- OCI compartment IDs +- Credential references + +## State Management + +### Sandbox (Agathos) + +Local state is acceptable because: +- Environment is ephemeral +- Single-user workflow +- No production secrets to protect +- Safe for external demos + +### Production Environments + +PostgreSQL backend on `eris.helu.ca`: + +```hcl +terraform { + backend "pg" { + conn_str = "postgres://eris.helu.ca:6432/terraform_state?sslmode=verify-full" + schema_name = "dev" # or "uat", "prod" + } +} +``` + +**Connection requirements:** +- Port 6432 (pgBouncer) +- SSL with `sslmode=verify-full` +- Credentials via environment variables (`PGUSER`, `PGPASSWORD`) +- Separate schema per environment for isolation + +## Integration Points + +### Terraform → DHCP/DNS + +The `agathos_inventory` output provides host information for DHCP/DNS provisioning: + +1. Terraform creates containers with cloud-init +2. `agathos_inventory` output includes hostnames and IPs +3. MAC addresses registered in DHCP server +4. DHCP server creates DNS entries (`hostname.incus` domain) +5. Ansible uses DNS names for host connectivity + +### Terraform → Ansible + +Ansible does **not** consume Terraform outputs directly. Instead: + +1. Terraform provisions containers +2. Incus DNS resolution provides `hostname.incus` domain +3. Ansible inventory uses static DNS names +4. 
`sandbox_up.yml` configures DNS resolution on the hypervisor + +```yaml +# Ansible inventory uses DNS names, not Terraform outputs +ubuntu: + hosts: + oberon.incus: + ariel.incus: + prospero.incus: +``` + +### Terraform → Bash Scripts + +The `ssh_key_update.sh` script demonstrates proper integration: + +```bash +terraform output -json agathos_inventory | jq -r \ + '.uranian_hosts.hosts | to_entries[] | "\(.key) \(.value.ipv4)"' | \ + while read hostname ip; do + ssh-keyscan -H "$ip" >> ~/.ssh/known_hosts + ssh-keyscan -H "$hostname.incus" >> ~/.ssh/known_hosts + done +``` + +## Promotion Workflow + +All infrastructure changes flow through this pipeline: + +``` +Agathos (Sandbox) + ↓ Validate pattern works + ↓ Extract to module if reusable +Dev + ↓ Integration testing + ↓ Container builds + ↓ Security testing +UAT + ↓ User acceptance testing + ↓ Bug fixes return to Dev + ↓ Delete environment, test restore +Prod + ↓ Deploy from tested artifacts +``` + +**Critical:** Nothing starts in Prod. Every change originates in Agathos, is validated through the pipeline, and only then deployed to production. 
+ +### Promotion Includes + +When promoting Terraform changes, always update corresponding: +- Ansible playbooks and templates +- Service documentation in `/docs/services/` +- Host variables if new services added + +## Output Conventions + +### `agathos_inventory` + +The primary output for documentation and DNS integration: + +```hcl +output "agathos_inventory" { + description = "Host inventory for documentation and DHCP/DNS provisioning" + value = { + uranian_hosts = { + hosts = { + for name, instance in incus_instance.uranian_hosts : name => { + name = instance.name + ipv4 = instance.ipv4_address + role = local.uranian_hosts[name].role + description = local.uranian_hosts[name].description + security_nesting = lookup(local.uranian_hosts[name].config, "security.nesting", false) + } + } + } + } +} +``` + +**Purpose:** +- Update [sandbox.html](sandbox.html) documentation +- Reference for DHCP server MAC/IP registration +- DNS entry creation via DHCP + +## Layered Configuration + +### Single Config with Conditional Resources + +Avoid multiple separate Terraform configurations. Use one config with conditional resources: + +``` +environments/prod/ +├── main.tf # Incus project, profile, images (always) +├── incus_hosts.tf # Module call for Incus containers (always) +├── oci_resources.tf # OCI compute (conditional) +├── variables.tf +├── dev.tfvars # Dev: enable_oci = false +├── uat.tfvars # UAT: enable_oci = true +└── prod.tfvars # Prod: enable_oci = true +``` + +```hcl +variable "enable_oci" { + description = "Enable OCI resources (false for Dev, true for UAT/Prod)" + type = bool + default = false +} + +resource "oci_core_instance" "hosts" { + for_each = var.enable_oci ? var.oci_hosts : {} + # ... 
+} +``` + +## Best Practices Summary + +| Practice | Rationale | +|----------|-----------| +| ✔ Explicit `depends_on` for critical chains | Terraform isn't magic | +| ✔ Local map for host definitions | Single source of truth, easy iteration | +| ✔ `for_each` over `count` | Stable resource addresses | +| ✔ `dynamic` blocks for optional devices | Clean, declarative device configuration | +| ✔ Merge base config with overrides | DRY principle for common settings | +| ✔ Separate tfvars for environment twins | Minimal duplication, clear parameterization | +| ✔ Document module interfaces | Enable promotion across environments | +| ✔ Never start in Prod | Always validate through pipeline | diff --git a/docs/xrdp.md b/docs/xrdp.md new file mode 100644 index 0000000..5220d31 --- /dev/null +++ b/docs/xrdp.md @@ -0,0 +1,38 @@ +Purpose +This script automates the installation and configuration of xRDP (X Remote Desktop Protocol) on Ubuntu-based systems, providing a complete remote desktop solution with enhanced user experience. + +Key Features +Multi-Distribution Support: +Ubuntu 22.04, 24.04, 24.10, 25.04 +Linux Mint, Pop!OS, Zorin OS, Elementary OS +Debian support (best effort) +LMDE (Linux Mint Debian Edition) + +Installation Modes: +Standard installation (from repositories) +Custom installation (compile from source) +Removal/cleanup option + +Advanced Capabilities: +Sound redirection - Compiles audio modules for remote audio playback +H.264 encoding/decoding support (latest version) +Desktop environment detection - Handles GNOME, KDE, Budgie, etc. 
+Sound server detection - Works with both PulseAudio and PipeWire +Custom login screen - Branded xRDP login with custom colors/backgrounds + +Smart Features: +SSH session detection - Warns when installing over SSH +Version compatibility checks - Prevents incompatible installations +Conflict resolution - Disables conflicting GNOME remote desktop services +Permission fixes - Handles SSL certificates and user groups +Polkit rules - Enables proper shutdown/reboot from remote sessions + +What Makes It Special +Extensive OS/version support with graceful handling of EOL versions +Intelligent detection of desktop environments and sound systems +Post-installation optimization for better remote desktop experience +Comprehensive error handling and user feedback +Modular design with separate functions for different tasks +Active maintenance - regularly updated with new Ubuntu releases + +The script essentially transforms a basic Ubuntu system into a fully-functional remote desktop server with professional-grade features, handling all the complex configuration that would normally require manual intervention. \ No newline at end of file diff --git a/terraform/.terraform.lock.hcl b/terraform/.terraform.lock.hcl new file mode 100644 index 0000000..049c6b9 --- /dev/null +++ b/terraform/.terraform.lock.hcl @@ -0,0 +1,22 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/lxc/incus" { + version = "1.0.2" + hashes = [ + "h1:skSyqJPnvwhbfSrmVVY05I/js7qvX8T8Cd182tnTetc=", + "zh:0f312afd0bc27c111c5b4e41b6274dfe4401c3b5c60e4bd519425c547c5c2316", + "zh:396587c30adce1b57400ecf1a43df8d4fcbdf5172e3e359f58f7147520891546", + "zh:40310405f58493af0e68b1040d62286cd5e6d25b96b5e2d1534d155a98375eba", + "zh:4991adf7f290ffc840a1123b300163b8db25a6c4b096648c7b576a6661980ed5", + "zh:5d71a5c949a5ad01d075f856475e7de95df16b50d52e546a2257e5c56bfa9150", + "zh:60e5fde27aa605abab8487d6ed8a8bb66de88f5e1ba31bb05364b4379fde5f83", + "zh:63f9b65382bcb88efd0d9aa8422987405fcf00d4f5b63fbe1ae030438fb55eb7", + "zh:79acebe8ed9627dffc369058e54bbb933b5568fee02de3cc353274d728c07597", + "zh:97170106b7520d7c025ccfe392a0b7c2d172e63f00f656989b08d0b6ece56573", + "zh:9c8fc5d4b26dc21e6d75d6ac127502a797d7e9253bd10b236914db51fa1fc4d7", + "zh:b2b8cabdfa681efffa3599468257b185f7a7e24ec6e624e57f75920aa1e7c134", + "zh:d32129503b83790752482e0d794ffb9b04f7a893cc113d834654a8ddb028402f", + "zh:ebd2fb8d94d72bc28c5655c29c6e6048cc31ef3650d0e166aaf3d82a31673cd5", + ] +} diff --git a/terraform/containers.tf b/terraform/containers.tf new file mode 100644 index 0000000..8c01599 --- /dev/null +++ b/terraform/containers.tf @@ -0,0 +1,281 @@ +locals { + # Common cloud-init configuration + base_cloud_init = < { + name = instance.name + ipv4 = instance.ipv4_address + description = local.uranian_hosts[name].description + role = local.uranian_hosts[name].role + security_nesting = lookup(local.uranian_hosts[name].config, "security.nesting", false) + } + } +} + +output "project_info" { + description = "Agathos project information" + value = { + name = incus_project.agathos.name + description = incus_project.agathos.description + } +} + +output "agathos_inventory" { + description = "Host inventory for documentation (sandbox.html) and DHCP/DNS provisioning reference" + value = { + uranian_hosts = { + hosts = { + for name, instance in incus_instance.uranian_hosts : name => { + 
name = instance.name + ipv4 = instance.ipv4_address + role = local.uranian_hosts[name].role + description = local.uranian_hosts[name].description + image = local.uranian_hosts[name].image + security_nesting = lookup(local.uranian_hosts[name].config, "security.nesting", false) + } + } + } + } +} diff --git a/terraform/ssh_key_update.sh b/terraform/ssh_key_update.sh new file mode 100755 index 0000000..e9ed2fd --- /dev/null +++ b/terraform/ssh_key_update.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# Get host info from terraform output and add SSH keys for both IP and hostname.incus +terraform output -json uranian_hosts | jq -r 'to_entries[] | "\(.key) \(.value.ipv4)"' | while read hostname ip; do + # Add key for IP address + ssh-keyscan -H "$ip" >> ~/.ssh/known_hosts + # Add key for hostname.incus + ssh-keyscan -H "$hostname.incus" >> ~/.ssh/known_hosts +done diff --git a/terraform/storage.tf b/terraform/storage.tf new file mode 100644 index 0000000..61a0db4 --- /dev/null +++ b/terraform/storage.tf @@ -0,0 +1,98 @@ +# Storage Resources for Agathos Containers +# Provisions Incus storage volumes and S3 buckets with access keys + +# Storage volume for Nextcloud data +resource "incus_storage_volume" "nextcloud_data" { + name = "nextcloud-data" + pool = var.storage_pool + project = var.project_name + + config = { + size = "100GB" + } +} + +# S3 bucket for Lobechat file storage +resource "incus_storage_bucket" "lobechat" { + name = "lobechat" + pool = var.storage_pool + project = var.project_name + description = "Lobechat file storage bucket" +} + +# Access key for Lobechat S3 bucket +resource "incus_storage_bucket_key" "lobechat_key" { + name = "lobechat-access" + pool = incus_storage_bucket.lobechat.pool + storage_bucket = incus_storage_bucket.lobechat.name + project = var.project_name + role = "admin" +} + +# S3 bucket for Casdoor file storage +resource "incus_storage_bucket" "casdoor" { + name = "casdoor" + pool = var.storage_pool + project = var.project_name + description = 
"Casdoor file storage bucket" +} + +# Access key for Casdoor S3 bucket +resource "incus_storage_bucket_key" "casdoor_key" { + name = "casdoor-access" + pool = incus_storage_bucket.casdoor.pool + storage_bucket = incus_storage_bucket.casdoor.name + project = var.project_name + role = "admin" +} + +# S3 bucket for Spelunker file storage +resource "incus_storage_bucket" "spelunker" { + name = "spelunker" + pool = var.storage_pool + project = var.project_name + description = "Spelunker file storage bucket" +} + +# Access key for Spelunker S3 bucket +resource "incus_storage_bucket_key" "spelunker_key" { + name = "spelunker-access" + pool = incus_storage_bucket.spelunker.pool + storage_bucket = incus_storage_bucket.spelunker.name + project = var.project_name + role = "admin" +} + +# Outputs for S3 credentials (to be stored in Ansible vault) +output "lobechat_s3_credentials" { + description = "Lobechat S3 bucket credentials - store in vault as vault_lobechat_s3_*" + value = { + bucket = incus_storage_bucket.lobechat.name + access_key = incus_storage_bucket_key.lobechat_key.access_key + secret_key = incus_storage_bucket_key.lobechat_key.secret_key + endpoint = "https://${incus_storage_bucket.lobechat.location}" + } + sensitive = true +} + +output "casdoor_s3_credentials" { + description = "Casdoor S3 bucket credentials - store in vault as vault_casdoor_s3_*" + value = { + bucket = incus_storage_bucket.casdoor.name + access_key = incus_storage_bucket_key.casdoor_key.access_key + secret_key = incus_storage_bucket_key.casdoor_key.secret_key + endpoint = "https://${incus_storage_bucket.casdoor.location}" + } + sensitive = true +} + +output "spelunker_s3_credentials" { + description = "Spelunker S3 bucket credentials - store in vault as vault_spelunker_s3_*" + value = { + bucket = incus_storage_bucket.spelunker.name + access_key = incus_storage_bucket_key.spelunker_key.access_key + secret_key = incus_storage_bucket_key.spelunker_key.secret_key + endpoint = 
"https://${incus_storage_bucket.spelunker.location}" + } + sensitive = true +} diff --git a/terraform/variables.tf b/terraform/variables.tf new file mode 100644 index 0000000..0a47d67 --- /dev/null +++ b/terraform/variables.tf @@ -0,0 +1,41 @@ +variable "project_name" { + description = "Name of the Incus project for sandbox environment" + type = string + default = "agathos" +} + +variable "profile_name" { + description = "Name of the Incus profile for sandbox hosts" + type = string + default = "sandbox" +} + +variable "network_name" { + description = "Name of the network bridge" + type = string + default = "incusbr0" +} + +variable "storage_pool" { + description = "Name of the storage pool" + type = string + default = "default" +} + +variable "system_user" { + description = "System user name for sandbox hosts" + type = string + default = "robert" +} + +variable "user_uid" { + description = "System user UID" + type = number + default = 1000 +} + +variable "ssh_key_path" { + description = "Path to SSH authorized keys file" + type = string + default = "~/.ssh/authorized_keys" +} diff --git a/terraform/versions.tf b/terraform/versions.tf new file mode 100644 index 0000000..37f72c4 --- /dev/null +++ b/terraform/versions.tf @@ -0,0 +1,13 @@ +terraform { + required_version = ">= 1.0" + + required_providers { + incus = { + source = "lxc/incus" + } + } +} + +provider "incus" { + # Configuration will be read from environment or default socket +} diff --git a/utils/neo4j-personal-schema-init.py b/utils/neo4j-personal-schema-init.py new file mode 100644 index 0000000..9cd2e55 --- /dev/null +++ b/utils/neo4j-personal-schema-init.py @@ -0,0 +1,587 @@ +""" +Neo4j Life Graph Schema Initialization +======================================= +Creates the foundational schema for a personal knowledge graph used by +seven AI assistants: Hypatia, Marcus, Seneca, Nate, Bowie, Bourdain, Cousteau + +Requirements: + pip install neo4j + +Usage: + python neo4j-personal-schema-init.py + python 
neo4j-personal-schema-init.py --uri bolt://ariel.incus:7687 + python neo4j-personal-schema-init.py --test-only + +Environment Variables (optional): + NEO4J_URI - Bolt URI (default: bolt://localhost:7687) + NEO4J_USER - Username (default: neo4j) + NEO4J_PASSWORD - Password (will prompt if not set) +""" + +import argparse +import getpass +import os +import sys +from neo4j import GraphDatabase +from neo4j.exceptions import AuthError, ServiceUnavailable +import logging + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class LifeGraphSchema: + def __init__(self, uri, user, password): + """Initialize connection to Neo4j database""" + self.driver = GraphDatabase.driver(uri, auth=(user, password)) + self.uri = uri + + def close(self): + """Close the database connection""" + self.driver.close() + + def verify_connection(self): + """ + Verify the connection to Neo4j is working. + Returns True if successful, raises exception otherwise. + """ + with self.driver.session() as session: + result = session.run("RETURN 1 AS test") + record = result.single() + if record and record["test"] == 1: + logger.info(f"✓ Connected to Neo4j at {self.uri}") + return True + raise ConnectionError("Failed to verify Neo4j connection") + + def create_constraints(self): + """ + Create uniqueness constraints on key node properties. + This ensures data integrity and creates indexes automatically. 
+ """ + constraints = [ + # Core entities + "CREATE CONSTRAINT person_id IF NOT EXISTS FOR (p:Person) REQUIRE p.id IS UNIQUE", + "CREATE CONSTRAINT location_id IF NOT EXISTS FOR (l:Location) REQUIRE l.id IS UNIQUE", + "CREATE CONSTRAINT event_id IF NOT EXISTS FOR (e:Event) REQUIRE e.id IS UNIQUE", + + # Media types (Bowie, Bourdain, Hypatia domains) + "CREATE CONSTRAINT book_id IF NOT EXISTS FOR (b:Book) REQUIRE b.id IS UNIQUE", + "CREATE CONSTRAINT film_id IF NOT EXISTS FOR (f:Film) REQUIRE f.id IS UNIQUE", + "CREATE CONSTRAINT music_id IF NOT EXISTS FOR (m:Music) REQUIRE m.id IS UNIQUE", + "CREATE CONSTRAINT recipe_id IF NOT EXISTS FOR (r:Recipe) REQUIRE r.id IS UNIQUE", + + # Activity/Practice nodes + "CREATE CONSTRAINT training_id IF NOT EXISTS FOR (t:Training) REQUIRE t.id IS UNIQUE", + "CREATE CONSTRAINT trip_id IF NOT EXISTS FOR (t:Trip) REQUIRE t.id IS UNIQUE", + "CREATE CONSTRAINT reflection_id IF NOT EXISTS FOR (r:Reflection) REQUIRE r.id IS UNIQUE", + + # Knowledge/Learning (Hypatia domain) + "CREATE CONSTRAINT topic_id IF NOT EXISTS FOR (t:Topic) REQUIRE t.id IS UNIQUE", + "CREATE CONSTRAINT concept_id IF NOT EXISTS FOR (c:Concept) REQUIRE c.id IS UNIQUE", + + # Nature (Cousteau domain) + "CREATE CONSTRAINT species_id IF NOT EXISTS FOR (s:Species) REQUIRE s.id IS UNIQUE", + "CREATE CONSTRAINT plant_id IF NOT EXISTS FOR (p:Plant) REQUIRE p.id IS UNIQUE", + ] + + with self.driver.session() as session: + for constraint in constraints: + try: + session.run(constraint) + logger.info(f"Created constraint: {constraint.split('FOR')[1].split('REQUIRE')[0].strip()}") + except Exception as e: + logger.warning(f"Constraint may already exist: {e}") + + def create_indexes(self): + """ + Create indexes for frequently queried properties. + These improve query performance for searches and filters. 
+ """ + indexes = [ + # Text search indexes + "CREATE INDEX person_name IF NOT EXISTS FOR (p:Person) ON (p.name)", + "CREATE INDEX location_name IF NOT EXISTS FOR (l:Location) ON (l.name)", + "CREATE INDEX book_title IF NOT EXISTS FOR (b:Book) ON (b.title)", + "CREATE INDEX film_title IF NOT EXISTS FOR (f:Film) ON (f.title)", + "CREATE INDEX music_title IF NOT EXISTS FOR (m:Music) ON (m.title)", + "CREATE INDEX recipe_name IF NOT EXISTS FOR (r:Recipe) ON (r.name)", + + # Date-based indexes for temporal queries + "CREATE INDEX event_date IF NOT EXISTS FOR (e:Event) ON (e.date)", + "CREATE INDEX training_date IF NOT EXISTS FOR (t:Training) ON (t.date)", + "CREATE INDEX trip_start IF NOT EXISTS FOR (t:Trip) ON (t.start_date)", + "CREATE INDEX reflection_date IF NOT EXISTS FOR (r:Reflection) ON (r.date)", + + # Category/type indexes for filtering + "CREATE INDEX event_type IF NOT EXISTS FOR (e:Event) ON (e.type)", + "CREATE INDEX location_category IF NOT EXISTS FOR (l:Location) ON (l.category)", + "CREATE INDEX music_genre IF NOT EXISTS FOR (m:Music) ON (m.genre)", + ] + + with self.driver.session() as session: + for index in indexes: + try: + session.run(index) + logger.info(f"Created index: {index.split('FOR')[1].split('ON')[0].strip()}") + except Exception as e: + logger.warning(f"Index may already exist: {e}") + + def verify_schema(self): + """ + Verify that constraints and indexes were created successfully. + Returns a dict with counts and status. 
+ """ + results = {"constraints": 0, "indexes": 0, "nodes": 0, "success": True} + + with self.driver.session() as session: + # Count constraints + constraint_result = session.run("SHOW CONSTRAINTS") + constraints = list(constraint_result) + results["constraints"] = len(constraints) + + # Count indexes (excluding constraint-created ones) + index_result = session.run("SHOW INDEXES WHERE type = 'RANGE'") + indexes = list(index_result) + results["indexes"] = len(indexes) + + # Count nodes + node_result = session.run("MATCH (n) RETURN count(n) AS count") + results["nodes"] = node_result.single()["count"] + + return results + + def run_tests(self, include_schema_tests=True): + """ + Run comprehensive tests to verify schema and APOC functionality. + Returns True if all tests pass, False otherwise. + + Args: + include_schema_tests: If True, also verify constraints/indexes exist + """ + tests_passed = 0 + tests_failed = 0 + + test_cases = [ + ("Connection test", "RETURN 1 AS result", lambda r: r.single()["result"] == 1), + ("APOC available", "RETURN apoc.version() AS version", lambda r: r.single()["version"] is not None), + ("Create test node", + "CREATE (t:_Test {id: 'test_' + toString(timestamp())}) RETURN t.id AS id", + lambda r: r.single()["id"] is not None), + ("Query test node", + "MATCH (t:_Test) RETURN count(t) AS count", + lambda r: r.single()["count"] >= 1), + ("APOC collection functions", + "RETURN apoc.coll.sum([1,2,3]) AS total", + lambda r: r.single()["total"] == 6), + ("APOC date functions", + "RETURN apoc.date.format(timestamp(), 'ms', 'yyyy-MM-dd') AS today", + lambda r: len(r.single()["today"]) == 10), + ] + + # Schema-specific tests (only run after schema creation) + schema_tests = [ + ("Constraint exists (Person)", + "SHOW CONSTRAINTS WHERE name = 'person_id'", + lambda r: len(list(r)) == 1), + ("Index exists (person_name)", + "SHOW INDEXES WHERE name = 'person_name'", + lambda r: len(list(r)) == 1), + ] + + if include_schema_tests: + 
test_cases.extend(schema_tests) + + logger.info("\n" + "=" * 60) + logger.info("RUNNING SCHEMA VERIFICATION TESTS") + logger.info("=" * 60) + + with self.driver.session() as session: + for test_name, query, validator in test_cases: + try: + result = session.run(query) + if validator(result): + logger.info(f" ✓ {test_name}") + tests_passed += 1 + else: + logger.error(f" ✗ {test_name} - Validation failed") + tests_failed += 1 + except Exception as e: + logger.error(f" ✗ {test_name} - {e}") + tests_failed += 1 + + # Cleanup test nodes + try: + session.run("MATCH (t:_Test) DELETE t") + logger.info(" ✓ Cleanup test nodes") + except Exception as e: + logger.warning(f" ⚠ Cleanup failed: {e}") + + logger.info("=" * 60) + logger.info(f"Tests: {tests_passed} passed, {tests_failed} failed") + logger.info("=" * 60 + "\n") + + return tests_failed == 0 + + def create_sample_nodes(self): + """ + Create sample nodes to demonstrate the schema. + Replace this with your actual data import logic. + """ + queries = [ + # Central person node (you) + """ + MERGE (p:Person {id: 'user_main'}) + SET p.name = 'Main User', + p.relationship_type = 'self', + p.created_at = datetime() + """, + + # Sample interest/preference + """ + MERGE (i:Interest {id: 'interest_cooking'}) + SET i.category = 'culinary', + i.name = 'Cooking', + i.intensity = 'high', + i.notes = 'Especially interested in techniques and cultural context' + """, + + # Sample location + """ + MERGE (l:Location {id: 'location_costarica'}) + SET l.name = 'Costa Rica', + l.country = 'Costa Rica', + l.category = 'travel_destination', + l.notes = 'Planning future trip' + """, + ] + + with self.driver.session() as session: + for query in queries: + session.run(query) + logger.info("Created sample nodes") + + def document_schema(self): + """ + Document the schema design for reference. + This prints the node types and their intended use by each assistant. 
+ """ + schema_doc = """ + + ════════════════════════════════════════════════════════════════ + LIFE GRAPH SCHEMA - NODE TYPES AND ASSISTANT RESPONSIBILITIES + ════════════════════════════════════════════════════════════════ + + CORE NODES (Used by all assistants): + ──────────────────────────────────────────────────────────────── + Person - People in your life (family, friends, contacts) + Properties: name, relationship_type, birthday, + contact_info, notes + + Location - Places (home, travel, favorites) + Properties: name, city, country, coordinates, + category, notes + + Event - Life events (vacations, gatherings, milestones) + Properties: name, date, location, description, type + + Interest - Preferences, hobbies, goals + Properties: category, name, intensity, notes + + ════════════════════════════════════════════════════════════════ + HYPATIA (Learning & Knowledge): + ──────────────────────────────────────────────────────────────── + Book - Books read or to-read + Properties: title, author, isbn, status, rating, + date_started, date_finished, notes + + Topic - Subject areas of study + Properties: name, field, depth, resources + + Concept - Ideas and principles learned + Properties: name, definition, examples, connections + + ════════════════════════════════════════════════════════════════ + MARCUS (Fitness & Training): + ──────────────────────────────────────────────────────────────── + Training - Individual workout sessions + Properties: date, type, duration, exercises, + volume, intensity, notes, feeling + + Exercise - Specific movements/activities + Properties: name, category, equipment, + target_muscles, technique_notes + + ════════════════════════════════════════════════════════════════ + SENECA (Reflection & Wellness): + ──────────────────────────────────────────────────────────────── + Reflection - Journal entries and insights + Properties: date, content, mood, themes, + insights, questions + + Goal - Life objectives and aspirations + Properties: name, 
category, timeline, status, + progress, reflections + + ════════════════════════════════════════════════════════════════ + NATE (Travel & Adventure): + ──────────────────────────────────────────────────────────────── + Trip - Travel plans and experiences + Properties: name, start_date, end_date, + destinations, purpose, budget, highlights + + Activity - Things to do at destinations + Properties: name, type, location, cost, + difficulty, notes + + ════════════════════════════════════════════════════════════════ + BOWIE (Arts, Culture & Style): + ──────────────────────────────────────────────────────────────── + Film - Movies and TV shows + Properties: title, year, director, genre, + status, rating, date_watched, notes + + Music - Songs, albums, artists + Properties: title, artist, album, genre, year, + rating, play_count, notes + + Artwork - Visual art, exhibitions, collections + Properties: title, artist, medium, year, location, + notes + + ════════════════════════════════════════════════════════════════ + BOURDAIN (Food & Drink): + ──────────────────────────────────────────────────────────────── + Recipe - Dishes to cook + Properties: name, cuisine, difficulty, time, + ingredients, instructions, source, notes + + Restaurant - Dining destinations + Properties: name, location, cuisine, price_range, + rating, dishes_tried, notes + + Ingredient - Foods and cooking components + Properties: name, category, season, source, + substitutes, notes + + ════════════════════════════════════════════════════════════════ + COUSTEAU (Nature & Living Things): + ──────────────────────────────────────────────────────────────── + Species - Animals, fish, marine life + Properties: name, scientific_name, category, + habitat, conservation_status, notes + + Plant - Garden plants, houseplants + Properties: name, scientific_name, type, + care_requirements, location, health_status + + Ecosystem - Environments and habitats + Properties: name, type, location, characteristics, + species_present, 
conservation_notes + + ════════════════════════════════════════════════════════════════ + KEY RELATIONSHIP PATTERNS: + ──────────────────────────────────────────────────────────────── + + Cross-domain connections: + - Training -[PREPARATION_FOR]-> Trip + - Reflection -[ABOUT]-> Event/Training/Trip + - Book -[INSPIRED]-> Trip/Recipe/Concept + - Recipe -[FROM_LOCATION]-> Location + - Music -[PLAYED_AT]-> Event/Location + - Film -[SET_IN]-> Location + - Species -[OBSERVED_AT]-> Location + - Plant -[GROWS_IN]-> Location + + Personal connections: + - Person -[ATTENDED]-> Event + - Person -[TRAVELED_WITH]-> Trip + - Person -[TRAINED_WITH]-> Training + - Person -[SHARED_MEAL]-> Recipe/Restaurant + - Person -[RECOMMENDED]-> Book/Film/Music/Restaurant + + Learning connections: + - Book -[ABOUT]-> Topic + - Topic -[CONTAINS]-> Concept + - Concept -[RELATES_TO]-> Concept + - Training -[TEACHES]-> Concept (movement patterns, discipline) + + ════════════════════════════════════════════════════════════════ + """ + + print(schema_doc) + logger.info("Schema documentation displayed") + + +def get_credentials(args): + """ + Collect Neo4j credentials from environment variables, CLI args, or prompts. 
+ Priority: CLI args > Environment variables > Interactive prompts + """ + # URI + uri = args.uri or os.environ.get("NEO4J_URI") + if not uri: + uri = input("Neo4j URI [bolt://localhost:7687]: ").strip() + if not uri: + uri = "bolt://localhost:7687" + + # Username + user = args.user or os.environ.get("NEO4J_USER") + if not user: + user = input("Neo4j username [neo4j]: ").strip() + if not user: + user = "neo4j" + + # Password (never from CLI for security) + password = os.environ.get("NEO4J_PASSWORD") + if not password: + password = getpass.getpass("Neo4j password: ") + if not password: + logger.error("Password is required") + sys.exit(1) + + return uri, user, password + + +def parse_args(): + """Parse command line arguments""" + parser = argparse.ArgumentParser( + description="Initialize Neo4j Life Graph schema for AI assistants", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s # Interactive prompts + %(prog)s --uri bolt://ariel.incus:7687 # Specify URI, prompt for rest + %(prog)s --test-only # Run tests without creating schema + %(prog)s --skip-samples # Create schema without sample data + +Environment Variables: + NEO4J_URI Bolt connection URI + NEO4J_USER Database username + NEO4J_PASSWORD Database password (recommended for scripts) + """ + ) + + parser.add_argument( + "--uri", "-u", + help="Neo4j Bolt URI (default: bolt://localhost:7687)" + ) + parser.add_argument( + "--user", "-U", + help="Neo4j username (default: neo4j)" + ) + parser.add_argument( + "--test-only", "-t", + action="store_true", + help="Only run verification tests, don't create schema" + ) + parser.add_argument( + "--skip-samples", + action="store_true", + help="Skip creating sample nodes" + ) + parser.add_argument( + "--skip-docs", + action="store_true", + help="Skip displaying schema documentation" + ) + parser.add_argument( + "--quiet", "-q", + action="store_true", + help="Reduce output verbosity" + ) + + return parser.parse_args() + + +def main(): + 
""" + Main execution function. + Collects credentials via prompts or environment variables. + """ + args = parse_args() + + # Set log level + if args.quiet: + logging.getLogger().setLevel(logging.WARNING) + + # Get credentials + uri, user, password = get_credentials(args) + + logger.info(f"Connecting to Neo4j at {uri}...") + + try: + schema = LifeGraphSchema(uri, user, password) + except Exception as e: + logger.error(f"Failed to create database driver: {e}") + sys.exit(1) + + try: + # Verify connection first + try: + schema.verify_connection() + except AuthError: + logger.error("✗ Authentication failed - check username/password") + sys.exit(1) + except ServiceUnavailable: + logger.error(f"✗ Cannot connect to Neo4j at {uri}") + sys.exit(1) + + if args.test_only: + # Just run basic tests (no schema verification) + success = schema.run_tests(include_schema_tests=False) + sys.exit(0 if success else 1) + + # Display schema documentation + if not args.skip_docs: + schema.document_schema() + + # Create constraints (includes automatic indexes) + logger.info("Creating constraints...") + schema.create_constraints() + + # Create additional indexes + logger.info("Creating indexes...") + schema.create_indexes() + + # Create sample nodes to validate schema + if not args.skip_samples: + logger.info("Creating sample nodes...") + schema.create_sample_nodes() + + # Run verification tests (including schema tests) + logger.info("Verifying schema...") + test_success = schema.run_tests(include_schema_tests=True) + + # Summary + stats = schema.verify_schema() + logger.info("=" * 60) + logger.info("SCHEMA INITIALIZATION COMPLETE") + logger.info("=" * 60) + logger.info(f" Constraints: {stats['constraints']}") + logger.info(f" Indexes: {stats['indexes']}") + logger.info(f" Nodes: {stats['nodes']}") + logger.info("=" * 60) + + if test_success: + logger.info("✓ All tests passed!") + logger.info("\nNext steps:") + logger.info(" 1. 
Import your Plex library (Film, Music nodes)") + logger.info(" 2. Import your Calibre library (Book nodes)") + logger.info(" 3. Configure your AI assistants to write to this graph") + else: + logger.warning("⚠ Some tests failed - review output above") + sys.exit(1) + + except KeyboardInterrupt: + logger.info("\nOperation cancelled by user") + sys.exit(130) + except Exception as e: + logger.error(f"Error during schema initialization: {e}") + sys.exit(1) + finally: + schema.close() + + +if __name__ == "__main__": + main()