fix(certbot): harden renewal hook and fix permission errors

The renewal deploy-hook ran as the certbot user but lacked permissions to
write the combined PEM to /etc/haproxy/certs and to reload HAProxy,
causing silent failures that left a stale certificate in production until
expiry.

- Add certbot user to the haproxy group so it can write the combined PEM
- Grant certbot NOPASSWD sudo for `systemctl reload haproxy` only
- Make the Prometheus textfile directory group-owned by certbot (0775)
  so cert-metrics.sh can atomically update ssl_cert.prom
- Refactor renewal-hook.sh to always refresh cert metrics on exit via a
  trap, so expiry alerts still fire even when the hook itself is broken
- Replace `set -e` with explicit error handling and structured logging
This commit is contained in:
2026-06-17 09:58:46 -04:00
parent 2f5a15eef5
commit 343b0e13d6
10 changed files with 665 additions and 46 deletions

View File

@@ -8,7 +8,7 @@
# 3. Reloads HAProxy via systemd
# 4. Updates certificate metrics for Prometheus
set -euo pipefail
set -uo pipefail
# RENEWED_LINEAGE is set by certbot --deploy-hook or passed explicitly by deploy.yml
CERT_DIR="${RENEWED_LINEAGE:?RENEWED_LINEAGE must be set}"
@@ -16,37 +16,70 @@ CERT_NAME=$(basename "${CERT_DIR}")
HAPROXY_CERT="{{ haproxy_cert_path }}"
HAPROXY_DIR="{{ haproxy_directory }}"
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Starting renewal hook for ${CERT_NAME}"
# Print an informational message to stdout, prefixed with a
# "[YYYY-MM-DD HH:MM:SS]" timestamp. All arguments are joined into one line.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "${stamp}" "$*"
}
# Print an error message to stderr, prefixed with a "[YYYY-MM-DD HH:MM:SS]"
# timestamp and the literal tag "ERROR:". Does not exit; callers decide that.
fail() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] ERROR: %s\n' "${stamp}" "$*" >&2
}
# Always refresh Prometheus cert metrics on exit, even if installation below
# fails. The metrics drive the SSLCertificateExpired/ExpiringSoon alerts, so
# they must reflect reality precisely when the hook is broken — otherwise a
# failed renewal rots silently (which is exactly how the cert expired before).
# A non-zero exit is reported by certbot as a WARNING, surfacing the failure.
hook_status=0
# EXIT-trap handler: refresh the certificate metrics, report a failed hook,
# and terminate with the hook's final status.
# Globals:  hook_status (read, possibly written), CERT_NAME (read)
# Outputs:  error messages through fail()
# Exits:    with hook_status; if hook_status is still 0 but the shell was
#           already exiting non-zero, with that pending status instead
finish() {
  # Must stay the first command: $? here is the status the shell was about to
  # exit with. Without this, an exit path that never set hook_status (e.g. an
  # unbound-variable abort under `set -u`) would be rewritten to exit 0.
  local pending_status=$?
  if [[ ${hook_status} -eq 0 && ${pending_status} -ne 0 ]]; then
    hook_status=${pending_status}
  fi

  # A metrics refresh failure is logged but does not change the exit status.
  {{ certbot_directory }}/hooks/cert-metrics.sh || fail "cert-metrics.sh failed"

  if [[ ${hook_status} -ne 0 ]]; then
    fail "Renewal hook FAILED for ${CERT_NAME} — HAProxy is serving a STALE certificate"
  fi
  exit "${hook_status}"
}
trap finish EXIT
log "Starting renewal hook for ${CERT_NAME}"
# Check if certificate files exist
if [[ ! -f "${CERT_DIR}/fullchain.pem" ]] || [[ ! -f "${CERT_DIR}/privkey.pem" ]]; then
echo "ERROR: Certificate files not found in ${CERT_DIR}"
fail "Certificate files not found in ${CERT_DIR}"
hook_status=1
exit 1
fi
# Combine certificate and private key for HAProxy
# HAProxy requires both in a single PEM file
cat "${CERT_DIR}/fullchain.pem" "${CERT_DIR}/privkey.pem" > "${HAPROXY_CERT}.tmp"
# Combine certificate and private key for HAProxy (single PEM), writing to a
# temp file in the same directory and moving atomically so HAProxy never reads
# a partial file. A permission failure here is the documented failure mode.
#
# The temp file contains the private key, so it must never be readable by
# other users: remove any stale copy first (a pre-existing file would keep its
# old mode when truncated), then create it under umask 077. The umask change
# is confined to a subshell so the rest of the script is unaffected.
rm -f -- "${HAPROXY_CERT}.tmp"
if ! (umask 077 && cat -- "${CERT_DIR}/fullchain.pem" "${CERT_DIR}/privkey.pem" > "${HAPROXY_CERT}.tmp"); then
  fail "Could not write ${HAPROXY_CERT}.tmp — check ownership/permissions of $(dirname "${HAPROXY_CERT}")"
  rm -f -- "${HAPROXY_CERT}.tmp"
  hook_status=1
  exit 1
fi
# Atomic move to avoid HAProxy reading partial file
mv "${HAPROXY_CERT}.tmp" "${HAPROXY_CERT}"
# Promote the temp file to its final path. If the move fails: log the error,
# discard the temp file, mark the hook as failed and abort.
mv "${HAPROXY_CERT}.tmp" "${HAPROXY_CERT}" || {
  fail "Could not move combined PEM into place at ${HAPROXY_CERT}"
  rm -f "${HAPROXY_CERT}.tmp"
  hook_status=1
  exit 1
}
# Set ownership and mode on the installed PEM. The script does not run under
# `set -e`, so each step is checked explicitly; otherwise a failure here would
# go unnoticed and the hook would still report success.
if ! chown {{ certbot_user }}:{{ haproxy_group }} "${HAPROXY_CERT}"; then
  fail "Could not set ownership on ${HAPROXY_CERT}"
  hook_status=1
  exit 1
fi
if ! chmod 640 "${HAPROXY_CERT}"; then
  fail "Could not set mode 640 on ${HAPROXY_CERT}"
  hook_status=1
  exit 1
fi
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Certificate combined and written to ${HAPROXY_CERT}"
log "Certificate combined and written to ${HAPROXY_CERT}"
# Reload HAProxy if running
# Reload HAProxy if running. The hook runs as the unprivileged certbot user,
# so the reload goes through sudo (a scoped sudoers rule grants exactly this
# command). sudo -n fails fast rather than blocking on a password prompt.
if systemctl is-active --quiet haproxy; then
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Reloading HAProxy..."
systemctl reload haproxy
echo "[$(date '+%Y-%m-%d %H:%M:%S')] HAProxy reloaded"
log "Reloading HAProxy..."
if sudo -n systemctl reload haproxy; then
log "HAProxy reloaded"
else
fail "HAProxy reload failed"
hook_status=1
exit 1
fi
else
echo "[$(date '+%Y-%m-%d %H:%M:%S')] HAProxy not running, skipping reload"
log "HAProxy not running, skipping reload"
fi
# Update certificate metrics
{{ certbot_directory }}/hooks/cert-metrics.sh
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Renewal hook completed successfully"
log "Renewal hook completed successfully"