feat(entrypoint): Implement TTL heartbeat registration and robust supervision

This commit is contained in:
2026-02-09 05:52:28 -08:00
parent 27b10a39b8
commit d97730174d

View File

@@ -3,31 +3,85 @@ set -e
# Configuration from environment # Configuration from environment
SERVICE_NAME="navidrome" SERVICE_NAME="navidrome"
# Use Nomad allocation ID for a unique service ID
SERVICE_ID="${SERVICE_NAME}-${NOMAD_ALLOC_ID:-$(hostname)}"
PORT=4533 PORT=4533
CONSUL_HTTP_ADDR="${CONSUL_URL:-http://localhost:8500}" CONSUL_HTTP_ADDR="${CONSUL_URL:-http://localhost:8500}"
NODE_IP="${ADVERTISE_IP}" NODE_IP="${ADVERTISE_IP}"
DB_LOCK_FILE="/data/.primary"
NAVIDROME_PID=0
# Tags for the Primary service (Traefik enabled) # Tags for the Primary service (Traefik enabled)
PRIMARY_TAGS='["navidrome","web","traefik.enable=true","urlprefix-/navidrome","tools","traefik.http.routers.navidromelan.rule=Host(`navidrome.service.dc1.consul`)","traefik.http.routers.navidromewan.rule=Host(`m.fbleagh.duckdns.org`)","traefik.http.routers.navidromewan.middlewares=dex@consulcatalog","traefik.http.routers.navidromewan.tls=true"]' PRIMARY_TAGS='["navidrome","web","traefik.enable=true","urlprefix-/navidrome","tools","traefik.http.routers.navidromelan.rule=Host(`navidrome.service.dc1.consul`)","traefik.http.routers.navidromewan.rule=Host(`m.fbleagh.duckdns.org`)","traefik.http.routers.navidromewan.middlewares=dex@consulcatalog","traefik.http.routers.navidromewan.tls=true"]'
NAVIDROME_PID="" # --- Helper Functions ---
SERVICE_ID="navidrome-${NODE_IP}-${SERVICE_NAME}"
cleanup() { # Register Service with TTL Check
echo "Caught signal, shutting down..." register_service() {
if [ -n "$NAVIDROME_PID" ]; then echo "Promoted! Registering service ${SERVICE_ID}..."
echo "Stopping Navidrome (PID: $NAVIDROME_PID)..." # Convert bash list string to JSON array if needed, but PRIMARY_TAGS is already JSON-like
kill -TERM "$NAVIDROME_PID" curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/register" -d "{
wait "$NAVIDROME_PID" || true \"ID\": \"${SERVICE_ID}\",
fi \"Name\": \"${SERVICE_NAME}\",
echo "Deregistering service ${SERVICE_ID} from Consul..." \"Tags\": ${PRIMARY_TAGS},
curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/deregister/${SERVICE_ID}" || true \"Address\": \"${NODE_IP}\",
exit 0 \"Port\": ${PORT},
\"Check\": {
\"DeregisterCriticalServiceAfter\": \"1m\",
\"TTL\": \"15s\"
}
}"
} }
trap cleanup SIGTERM SIGINT # Send Heartbeat to Consul
# Send one TTL heartbeat ("pass") for this service's Consul check.
# Globals:  CONSUL_HTTP_ADDR (read), SERVICE_ID (read)
# Outputs:  nothing on stdout; a warning on stderr when the heartbeat fails
# Returns:  always 0. A single missed heartbeat is survivable, and under
#           `set -e` a non-zero return from a bare call would terminate the
#           whole supervisor.
pass_ttl() {
  # --fail makes HTTP 4xx/5xx (e.g. the check no longer exists) count as a
  # failure instead of a silent success; --max-time keeps a hung agent from
  # stalling the caller for longer than its polling interval.
  if ! curl -s --fail --max-time 4 -X PUT \
    "${CONSUL_HTTP_ADDR}/v1/agent/check/pass/service:${SERVICE_ID}" > /dev/null; then
    echo "WARNING: TTL heartbeat for ${SERVICE_ID} failed" >&2
  fi
  return 0
}
echo "Starting leadership-aware entrypoint..." # Deregister Service
# Remove this service's registration from the Consul agent (best-effort).
# Globals:  CONSUL_HTTP_ADDR (read), SERVICE_ID (read)
# Outputs:  a status line (and any response body from curl) on stdout;
#           a warning on stderr when the request fails
# Returns:  always 0. Deregistration runs on shutdown/demotion paths where
#           a failure must not abort the caller under `set -e`.
#           NOTE(review): if the registration sets
#           DeregisterCriticalServiceAfter, Consul should reap a stale entry
#           on its own once heartbeats stop - confirm.
deregister_service() {
  echo "Demoted/Stopping. Deregistering service ${SERVICE_ID}..."
  # --max-time bounds the call so shutdown cannot hang on an unresponsive
  # agent; --fail surfaces HTTP errors so they can be logged.
  if ! curl -s --fail --max-time 5 -X PUT \
    "${CONSUL_HTTP_ADDR}/v1/agent/service/deregister/${SERVICE_ID}"; then
    echo "WARNING: could not deregister ${SERVICE_ID} from Consul" >&2
  fi
  return 0
}
# Start Navidrome in Background
# Launch Navidrome as a background job and remember its PID.
# Globals:  ND_DATABASE_PATH, ND_DATAFOLDER (set and exported),
#           NAVIDROME_PID (written with the PID of the background job)
# Outputs:  two status lines on stdout
start_app() {
  printf '%s\n' "Node is Primary. Starting Navidrome..."

  # Database path and local data folder, exported for the Navidrome process.
  ND_DATABASE_PATH="/data/navidrome.db"
  ND_DATAFOLDER="/local/data"
  export ND_DATABASE_PATH ND_DATAFOLDER
  mkdir -p "${ND_DATAFOLDER}"

  /app/navidrome &
  NAVIDROME_PID=$!
  printf 'Navidrome started with PID %s\n' "${NAVIDROME_PID}"
}
# Stop Navidrome
# Stop the supervised Navidrome process, if one is recorded.
# Globals:  NAVIDROME_PID (read; reset to 0 after the stop attempt)
# Outputs:  a status line on stdout when a stop is attempted
# Returns:  0
stop_app() {
  # An empty/unset PID is treated as 0 so the numeric test cannot error out.
  if [ "${NAVIDROME_PID:-0}" -gt 0 ]; then
    echo "Stopping Navidrome (PID ${NAVIDROME_PID})..."
    # The process may already be gone (e.g. it crashed). A failed kill must
    # not abort the caller under `set -e` before the PID is reset and the
    # caller's remaining shutdown steps run.
    kill -TERM "${NAVIDROME_PID}" 2>/dev/null || true
    # Reap the child; its exit status (or "not a child") is irrelevant here.
    wait "${NAVIDROME_PID}" 2>/dev/null || true
    NAVIDROME_PID=0
  fi
}
# --- Signal Handling (The Safety Net) ---
# If Nomad stops the container, we stop the app and deregister.
# Signal handler: stop Navidrome, deregister the service, then exit.
# Outputs:  a status line on stdout, plus whatever stop_app and
#           deregister_service print
# Returns:  never returns; terminates the script with status 0
cleanup() {
  echo "Caught signal, shutting down..."
  # Every step is best-effort: under `set -e` a failure while stopping the
  # app would otherwise end the handler before deregistration and before
  # the clean `exit 0`.
  stop_app || true
  deregister_service || true
  exit 0
}
# Run the handler on the termination and interrupt signals.
trap cleanup TERM INT
# --- Main Loop ---
echo "Starting Supervisor. Waiting for leadership settle..."
echo "Node IP: $NODE_IP" echo "Node IP: $NODE_IP"
echo "Consul: $CONSUL_HTTP_ADDR" echo "Consul: $CONSUL_HTTP_ADDR"
@@ -35,51 +89,36 @@ echo "Consul: $CONSUL_HTTP_ADDR"
sleep 5 sleep 5
while true; do while true; do
# In LiteFS 0.5, .primary file exists ONLY on replicas. # In LiteFS 0.5, .primary file exists ONLY on replicas.
if [ ! -f /data/.primary ]; then if [ ! -f "$DB_LOCK_FILE" ]; then
# PRIMARY STATE # === WE ARE PRIMARY ===
if [ -z "$NAVIDROME_PID" ] || ! kill -0 "$NAVIDROME_PID" 2>/dev/null; then
echo "Node is Primary. Initializing Navidrome..." # 1. If App is not running, start it and register
if [ "${NAVIDROME_PID}" -eq 0 ] || ! kill -0 "${NAVIDROME_PID}" 2>/dev/null; then
# Register in Consul if [ "${NAVIDROME_PID}" -gt 0 ]; then
echo "Registering as primary in Consul..." echo "CRITICAL: Navidrome crashed! Restarting..."
curl -s -X PUT -d "{ fi
\"ID\": \"${SERVICE_ID}\", start_app
\"Name\": \"${SERVICE_NAME}\", register_service
\"Tags\": ${PRIMARY_TAGS}, fi
\"Address\": \"${NODE_IP}\",
\"Port\": ${PORT},
\"Check\": {
\"HTTP\": \"http://${NODE_IP}:${PORT}/app\",
\"Interval\": \"10s\",
\"Timeout\": \"2s\"
}
}" "${CONSUL_HTTP_ADDR}/v1/agent/service/register"
echo "Starting Navidrome with ND_DATABASE_PATH=/data/navidrome.db"
export ND_DATABASE_PATH="/data/navidrome.db"
export ND_DATAFOLDER="/local/data"
# Start Navidrome # 2. Maintain the heartbeat (TTL)
/app/navidrome & pass_ttl
NAVIDROME_PID=$!
echo "Navidrome started with PID $NAVIDROME_PID" else
fi # === WE ARE REPLICA ===
else
# REPLICA STATE # If App is running (we were just demoted), stop it
if [ -n "$NAVIDROME_PID" ] && kill -0 "$NAVIDROME_PID" 2>/dev/null; then if [ "${NAVIDROME_PID}" -gt 0 ]; then
echo "Node transitioned to Replica. Stopping Navidrome..." echo "Lost leadership. Demoting..."
kill -TERM "$NAVIDROME_PID" stop_app
wait "$NAVIDROME_PID" || true deregister_service
NAVIDROME_PID=""
echo "Deregistering primary service from Consul..."
curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/deregister/${SERVICE_ID}" || true
fi
# We don't register anything for replicas in this version to keep it simple.
# But we stay alive so LiteFS keeps running.
fi fi
sleep 5 # No service registration exists for replicas to keep Consul clean.
done fi
# Sleep short enough to update TTL (every 5s is safe for 15s TTL)
sleep 5 &
wait $! # Wait allows the 'trap' to interrupt the sleep instantly
done