fix(certbot): harden renewal hook and fix permission errors
The renewal deploy-hook ran as the certbot user but lacked permission to write the combined PEM to /etc/haproxy/certs and to reload HAProxy. The hook failed silently, leaving a stale certificate in production until it expired.

- Add the certbot user to the haproxy group so it can write the combined PEM
- Grant certbot NOPASSWD sudo for `systemctl reload haproxy` only
- Make the Prometheus textfile directory group-owned by certbot (0775) so cert-metrics.sh can atomically update ssl_cert.prom
- Refactor renewal-hook.sh to always refresh cert metrics on exit via a trap, so expiry alerts still fire even when the hook itself is broken
- Replace `set -e` with explicit error handling and structured logging
This commit is contained in:
@@ -86,6 +86,19 @@
|
||||
groups: "{{ certbot_group }}"
|
||||
append: true
|
||||
|
||||
# The renewal deploy-hook runs as the certbot user and writes the combined
|
||||
# PEM into the group-writable /etc/haproxy/certs (mode 0770, owned by the
|
||||
# haproxy group). certbot must be a member of that group, otherwise the
|
||||
# hook fails with "Permission denied" and HAProxy serves a stale cert until
|
||||
# it expires.
|
||||
- name: Add certbot user to the haproxy group
|
||||
become: true
|
||||
ansible.builtin.user:
|
||||
name: "{{ certbot_user }}"
|
||||
groups: "{{ haproxy_group }}"
|
||||
append: true
|
||||
when: "'haproxy' in services | default([])"
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Directory Structure
|
||||
# -------------------------------------------------------------------------
|
||||
@@ -178,14 +191,32 @@
|
||||
group: "{{ certbot_group }}"
|
||||
mode: '0750'
|
||||
|
||||
# Group-owned by certbot and group-writable so cert-metrics.sh (run as the
|
||||
# certbot user from the renewal hook) can atomically write ssl_cert.prom.
|
||||
# node-exporter only needs to read these files, which 0775 still allows.
|
||||
# The renewal hook reloads HAProxy after installing a new cert, but runs as
|
||||
# the unprivileged certbot user. Grant exactly `systemctl reload haproxy`
|
||||
# via sudo — nothing more. visudo validation prevents a malformed drop-in
|
||||
# from locking out sudo.
|
||||
- name: Allow certbot to reload HAProxy via sudo
|
||||
become: true
|
||||
ansible.builtin.copy:
|
||||
dest: /etc/sudoers.d/certbot-haproxy-reload
|
||||
content: "{{ certbot_user }} ALL=(root) NOPASSWD: /usr/bin/systemctl reload haproxy\n"
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0440'
|
||||
validate: visudo -cf %s
|
||||
when: "'haproxy' in services | default([])"
|
||||
|
||||
- name: Create Prometheus textfile directory
|
||||
become: true
|
||||
ansible.builtin.file:
|
||||
path: "{{ prometheus_node_exporter_text_directory }}"
|
||||
state: directory
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
group: "{{ certbot_group }}"
|
||||
mode: '0775'
|
||||
|
||||
- name: Template certificate metrics script
|
||||
become: true
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
# 3. Reloads HAProxy via systemd
|
||||
# 4. Updates certificate metrics for Prometheus
|
||||
|
||||
set -euo pipefail
|
||||
set -uo pipefail
|
||||
|
||||
# RENEWED_LINEAGE is set by certbot --deploy-hook or passed explicitly by deploy.yml
|
||||
CERT_DIR="${RENEWED_LINEAGE:?RENEWED_LINEAGE must be set}"
|
||||
@@ -16,37 +16,70 @@ CERT_NAME=$(basename "${CERT_DIR}")
|
||||
HAPROXY_CERT="{{ haproxy_cert_path }}"
|
||||
HAPROXY_DIR="{{ haproxy_directory }}"
|
||||
|
||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Starting renewal hook for ${CERT_NAME}"
|
||||
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
|
||||
fail() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $*" >&2; }
|
||||
|
||||
# Always refresh Prometheus cert metrics on exit, even if installation below
|
||||
# fails. The metrics drive the SSLCertificateExpired/ExpiringSoon alerts, so
|
||||
# they must reflect reality precisely when the hook is broken — otherwise a
|
||||
# failed renewal rots silently (which is exactly how the cert expired before).
|
||||
# A non-zero exit is reported by certbot as a WARNING, surfacing the failure.
|
||||
hook_status=0
|
||||
finish() {
|
||||
{{ certbot_directory }}/hooks/cert-metrics.sh || fail "cert-metrics.sh failed"
|
||||
if [[ ${hook_status} -ne 0 ]]; then
|
||||
fail "Renewal hook FAILED for ${CERT_NAME} — HAProxy is serving a STALE certificate"
|
||||
fi
|
||||
exit "${hook_status}"
|
||||
}
|
||||
trap finish EXIT
|
||||
|
||||
log "Starting renewal hook for ${CERT_NAME}"
|
||||
|
||||
# Check if certificate files exist
|
||||
if [[ ! -f "${CERT_DIR}/fullchain.pem" ]] || [[ ! -f "${CERT_DIR}/privkey.pem" ]]; then
|
||||
echo "ERROR: Certificate files not found in ${CERT_DIR}"
|
||||
fail "Certificate files not found in ${CERT_DIR}"
|
||||
hook_status=1
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Combine certificate and private key for HAProxy
|
||||
# HAProxy requires both in a single PEM file
|
||||
cat "${CERT_DIR}/fullchain.pem" "${CERT_DIR}/privkey.pem" > "${HAPROXY_CERT}.tmp"
|
||||
# Combine certificate and private key for HAProxy (single PEM), writing to a
|
||||
# temp file in the same directory and moving atomically so HAProxy never reads
|
||||
# a partial file. A permission failure here is the documented failure mode.
|
||||
if ! cat "${CERT_DIR}/fullchain.pem" "${CERT_DIR}/privkey.pem" > "${HAPROXY_CERT}.tmp"; then
|
||||
fail "Could not write ${HAPROXY_CERT}.tmp — check ownership/permissions of $(dirname "${HAPROXY_CERT}")"
|
||||
rm -f "${HAPROXY_CERT}.tmp"
|
||||
hook_status=1
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Atomic move to avoid HAProxy reading partial file
|
||||
mv "${HAPROXY_CERT}.tmp" "${HAPROXY_CERT}"
|
||||
if ! mv "${HAPROXY_CERT}.tmp" "${HAPROXY_CERT}"; then
|
||||
fail "Could not move combined PEM into place at ${HAPROXY_CERT}"
|
||||
rm -f "${HAPROXY_CERT}.tmp"
|
||||
hook_status=1
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Set permissions
|
||||
chown {{ certbot_user }}:{{ haproxy_group }} "${HAPROXY_CERT}"
|
||||
chmod 640 "${HAPROXY_CERT}"
|
||||
|
||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Certificate combined and written to ${HAPROXY_CERT}"
|
||||
log "Certificate combined and written to ${HAPROXY_CERT}"
|
||||
|
||||
# Reload HAProxy if running
|
||||
# Reload HAProxy if running. The hook runs as the unprivileged certbot user,
|
||||
# so the reload goes through sudo (a scoped sudoers rule grants exactly this
|
||||
# command). sudo -n fails fast rather than blocking on a password prompt.
|
||||
if systemctl is-active --quiet haproxy; then
|
||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Reloading HAProxy..."
|
||||
systemctl reload haproxy
|
||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] HAProxy reloaded"
|
||||
log "Reloading HAProxy..."
|
||||
if sudo -n systemctl reload haproxy; then
|
||||
log "HAProxy reloaded"
|
||||
else
|
||||
fail "HAProxy reload failed"
|
||||
hook_status=1
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] HAProxy not running, skipping reload"
|
||||
log "HAProxy not running, skipping reload"
|
||||
fi
|
||||
|
||||
# Update certificate metrics
|
||||
{{ certbot_directory }}/hooks/cert-metrics.sh
|
||||
|
||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Renewal hook completed successfully"
|
||||
log "Renewal hook completed successfully"
|
||||
|
||||
Reference in New Issue
Block a user