From d97730174dcfadc45b38822b4ad68d63d9f6a421 Mon Sep 17 00:00:00 2001
From: sstent
Date: Mon, 9 Feb 2026 05:52:28 -0800
Subject: [PATCH] feat(entrypoint): Implement TTL heartbeat registration and robust supervision

---
Review notes (text between the '---' line and the diffstat is not part of
the commit message):

  * This patch was recovered from a copy whose line breaks had been
    collapsed. The hunk line counts were re-checked against the hunk
    headers, but leading whitespace is reconstructed: a 4-space indent is
    assumed for removed/context lines and a 2-space indent for added
    lines. Confirm against entrypoint.sh, or apply with
    --ignore-whitespace.
  * Three added lines differ from commit d9773017 (line counts unchanged,
    so the hunk headers are still valid; the post-image blob id on the
    "index" line no longer matches):
      - register_service: a failed registration is logged instead of
        aborting the supervisor under `set -e` right after Navidrome
        was started.
      - pass_ttl: uses `curl -f` so an HTTP error (e.g. the check was
        reaped by DeregisterCriticalServiceAfter) is detected, falls back
        to re-registering, and never aborts the supervisor.
      - deregister_service: restores the `|| true` the old code had, so
        cleanup still reaches `exit 0` when Consul is unreachable.
    One comment in register_service was reworded for accuracy.

 entrypoint.sh | 157 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 98 insertions(+), 59 deletions(-)

diff --git a/entrypoint.sh b/entrypoint.sh
index 6de4e89..0868c5d 100644
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -3,31 +3,85 @@ set -e
 
 # Configuration from environment
 SERVICE_NAME="navidrome"
+# Use Nomad allocation ID for a unique service ID
+SERVICE_ID="${SERVICE_NAME}-${NOMAD_ALLOC_ID:-$(hostname)}"
 PORT=4533
 CONSUL_HTTP_ADDR="${CONSUL_URL:-http://localhost:8500}"
 NODE_IP="${ADVERTISE_IP}"
+DB_LOCK_FILE="/data/.primary"
+NAVIDROME_PID=0
 
 # Tags for the Primary service (Traefik enabled)
 PRIMARY_TAGS='["navidrome","web","traefik.enable=true","urlprefix-/navidrome","tools","traefik.http.routers.navidromelan.rule=Host(`navidrome.service.dc1.consul`)","traefik.http.routers.navidromewan.rule=Host(`m.fbleagh.duckdns.org`)","traefik.http.routers.navidromewan.middlewares=dex@consulcatalog","traefik.http.routers.navidromewan.tls=true"]'
 
-NAVIDROME_PID=""
-SERVICE_ID="navidrome-${NODE_IP}-${SERVICE_NAME}"
+# --- Helper Functions ---
 
-cleanup() {
-    echo "Caught signal, shutting down..."
-    if [ -n "$NAVIDROME_PID" ]; then
-        echo "Stopping Navidrome (PID: $NAVIDROME_PID)..."
-        kill -TERM "$NAVIDROME_PID"
-        wait "$NAVIDROME_PID" || true
-    fi
-    echo "Deregistering service ${SERVICE_ID} from Consul..."
-    curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/deregister/${SERVICE_ID}" || true
-    exit 0
+# Register Service with TTL Check
+register_service() {
+  echo "Promoted! Registering service ${SERVICE_ID}..."
+  # PRIMARY_TAGS already holds a JSON array literal, so it is spliced into the payload unquoted
+  curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/register" -d "{
+    \"ID\": \"${SERVICE_ID}\",
+    \"Name\": \"${SERVICE_NAME}\",
+    \"Tags\": ${PRIMARY_TAGS},
+    \"Address\": \"${NODE_IP}\",
+    \"Port\": ${PORT},
+    \"Check\": {
+      \"DeregisterCriticalServiceAfter\": \"1m\",
+      \"TTL\": \"15s\"
+    }
+  }" || echo "WARN: Consul registration failed; the next heartbeat will retry" >&2
 }
 
-trap cleanup SIGTERM SIGINT
+# Send Heartbeat to Consul
+pass_ttl() {
+  curl -sf -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/check/pass/service:${SERVICE_ID}" > /dev/null || register_service || true
+}
 
-echo "Starting leadership-aware entrypoint..."
+# Deregister Service
+deregister_service() {
+  echo "Demoted/Stopping. Deregistering service ${SERVICE_ID}..."
+  curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/deregister/${SERVICE_ID}" || true
+}
+
+# Start Navidrome in Background
+start_app() {
+  echo "Node is Primary. Starting Navidrome..."
+
+  # Ensure DB path and local data folder are set
+  export ND_DATABASE_PATH="/data/navidrome.db"
+  export ND_DATAFOLDER="/local/data"
+  mkdir -p /local/data
+
+  /app/navidrome &
+  NAVIDROME_PID=$!
+  echo "Navidrome started with PID ${NAVIDROME_PID}"
+}
+
+# Stop Navidrome
+stop_app() {
+  if [ "${NAVIDROME_PID}" -gt 0 ]; then
+    echo "Stopping Navidrome (PID ${NAVIDROME_PID})..."
+    kill -SIGTERM "${NAVIDROME_PID}"
+    wait "${NAVIDROME_PID}" 2>/dev/null || true
+    NAVIDROME_PID=0
+  fi
+}
+
+# --- Signal Handling (The Safety Net) ---
+# If Nomad stops the container, we stop the app and deregister.
+cleanup() {
+  echo "Caught signal, shutting down..."
+  stop_app
+  deregister_service
+  exit 0
+}
+
+trap cleanup TERM INT
+
+# --- Main Loop ---
+
+echo "Starting Supervisor. Waiting for leadership settle..."
 echo "Node IP: $NODE_IP"
 echo "Consul: $CONSUL_HTTP_ADDR"
 
@@ -35,51 +89,36 @@ echo "Consul: $CONSUL_HTTP_ADDR"
 sleep 5
 
 while true; do
-    # In LiteFS 0.5, .primary file exists ONLY on replicas.
-    if [ ! -f /data/.primary ]; then
-        # PRIMARY STATE
-        if [ -z "$NAVIDROME_PID" ] || ! kill -0 "$NAVIDROME_PID" 2>/dev/null; then
-            echo "Node is Primary. Initializing Navidrome..."
-
-            # Register in Consul
-            echo "Registering as primary in Consul..."
-            curl -s -X PUT -d "{
-                \"ID\": \"${SERVICE_ID}\",
-                \"Name\": \"${SERVICE_NAME}\",
-                \"Tags\": ${PRIMARY_TAGS},
-                \"Address\": \"${NODE_IP}\",
-                \"Port\": ${PORT},
-                \"Check\": {
-                    \"HTTP\": \"http://${NODE_IP}:${PORT}/app\",
-                    \"Interval\": \"10s\",
-                    \"Timeout\": \"2s\"
-                }
-            }" "${CONSUL_HTTP_ADDR}/v1/agent/service/register"
-
-            echo "Starting Navidrome with ND_DATABASE_PATH=/data/navidrome.db"
-            export ND_DATABASE_PATH="/data/navidrome.db"
-            export ND_DATAFOLDER="/local/data"
+  # In LiteFS 0.5, .primary file exists ONLY on replicas.
+  if [ ! -f "$DB_LOCK_FILE" ]; then
+    # === WE ARE PRIMARY ===
+
+    # 1. If App is not running, start it and register
+    if [ "${NAVIDROME_PID}" -eq 0 ] || ! kill -0 "${NAVIDROME_PID}" 2>/dev/null; then
+      if [ "${NAVIDROME_PID}" -gt 0 ]; then
+        echo "CRITICAL: Navidrome crashed! Restarting..."
+      fi
+      start_app
+      register_service
+    fi
 
-            # Start Navidrome
-            /app/navidrome &
-            NAVIDROME_PID=$!
-            echo "Navidrome started with PID $NAVIDROME_PID"
-        fi
-    else
-        # REPLICA STATE
-        if [ -n "$NAVIDROME_PID" ] && kill -0 "$NAVIDROME_PID" 2>/dev/null; then
-            echo "Node transitioned to Replica. Stopping Navidrome..."
-            kill -TERM "$NAVIDROME_PID"
-            wait "$NAVIDROME_PID" || true
-            NAVIDROME_PID=""
-
-            echo "Deregistering primary service from Consul..."
-            curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/deregister/${SERVICE_ID}" || true
-        fi
-
-        # We don't register anything for replicas in this version to keep it simple.
-        # But we stay alive so LiteFS keeps running.
+    # 2. Maintain the heartbeat (TTL)
+    pass_ttl
+
+  else
+    # === WE ARE REPLICA ===
+
+    # If App is running (we were just demoted), stop it
+    if [ "${NAVIDROME_PID}" -gt 0 ]; then
+      echo "Lost leadership. Demoting..."
+      stop_app
+      deregister_service
     fi
 
-    sleep 5
-done
+    # No service registration exists for replicas to keep Consul clean.
+  fi
+
+  # Sleep short enough to update TTL (every 5s is safe for 15s TTL)
+  sleep 5 &
+  wait $! # Wait allows the 'trap' to interrupt the sleep instantly
+done
\ No newline at end of file