#!/bin/bash
#
# Supervisor for Navidrome on a LiteFS-replicated database.
#
# Runs Navidrome only while this node is the LiteFS primary, and keeps a
# Consul service (with a TTL check) registered for exactly that period.
# On demotion or on SIGTERM/SIGINT the app is stopped and the service is
# deregistered.
#
# Environment:
#   NOMAD_ALLOC_ID  - used to build a unique service ID (default: hostname)
#   CONSUL_URL      - Consul agent base URL (default: http://localhost:8500)
#   ADVERTISE_IP    - address advertised in the service registration
set -e

# Configuration from environment
SERVICE_NAME="navidrome"
# Use Nomad allocation ID for a unique service ID
SERVICE_ID="${SERVICE_NAME}-${NOMAD_ALLOC_ID:-$(hostname)}"
PORT=4533
CONSUL_HTTP_ADDR="${CONSUL_URL:-http://localhost:8500}"
NODE_IP="${ADVERTISE_IP}"
DB_LOCK_FILE="/data/.primary"
NAVIDROME_PID=0

# 1 once the service has been registered and the last heartbeat succeeded;
# 0 whenever a (re-)registration is required.
SERVICE_REGISTERED=0

# Common curl options for every Consul call:
#   -sS         quiet, but still print an error message on failure
#   --fail      turn HTTP 4xx/5xx into a non-zero exit status
#   --max-time  bound each call so a hung agent cannot stall the loop
#               (and with it demotion detection and signal handling)
CURL_OPTS=(-sS --fail --max-time 3)

# Tags for the Primary service (Traefik enabled)
PRIMARY_TAGS='["navidrome","web","traefik.enable=true","urlprefix-/navidrome","tools","traefik.http.routers.navidromelan.rule=Host(`navidrome.service.dc1.consul`)","traefik.http.routers.navidromewan.rule=Host(`m.fbleagh.duckdns.org`)","traefik.http.routers.navidromewan.middlewares=dex@consulcatalog","traefik.http.routers.navidromewan.tls=true"]'

# --- Helper Functions ---

# Register the service with a TTL check.
# PRIMARY_TAGS is already a JSON array, so it is embedded verbatim.
# Returns curl's exit status; callers decide whether a failure is fatal.
register_service() {
  echo "Promoted! Registering service ${SERVICE_ID}..."
  curl "${CURL_OPTS[@]}" -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/register" -d "{
    \"ID\": \"${SERVICE_ID}\",
    \"Name\": \"${SERVICE_NAME}\",
    \"Tags\": ${PRIMARY_TAGS},
    \"Address\": \"${NODE_IP}\",
    \"Port\": ${PORT},
    \"Check\": {
      \"DeregisterCriticalServiceAfter\": \"1m\",
      \"TTL\": \"15s\"
    }
  }"
}

# Send a heartbeat for the service's TTL check.
# Returns non-zero if Consul is unreachable or rejects the request
# (for example because the check no longer exists).
pass_ttl() {
  curl "${CURL_OPTS[@]}" -X PUT \
    "${CONSUL_HTTP_ADDR}/v1/agent/check/pass/service:${SERVICE_ID}" > /dev/null
}

# Deregister the service (best effort).
# A failure is only logged so that it can never abort a shutdown or a
# demotion under `set -e`; the registration asks Consul to drop the service
# on its own after it has been critical for 1m.
deregister_service() {
  echo "Demoted/Stopping. Deregistering service ${SERVICE_ID}..."
  SERVICE_REGISTERED=0
  curl "${CURL_OPTS[@]}" -X PUT \
    "${CONSUL_HTTP_ADDR}/v1/agent/service/deregister/${SERVICE_ID}" \
    || echo "WARNING: failed to deregister ${SERVICE_ID} from Consul" >&2
}

# Keep the Consul registration alive: register if needed, then heartbeat.
# Never fails; a failed registration or heartbeat is logged and the service
# is registered again on the next call.
maintain_registration() {
  if [ "${SERVICE_REGISTERED}" -eq 0 ]; then
    if ! register_service; then
      echo "WARNING: Consul registration failed; will retry" >&2
      return 0
    fi
    SERVICE_REGISTERED=1
  fi

  if ! pass_ttl; then
    echo "WARNING: TTL heartbeat failed; will re-register ${SERVICE_ID}" >&2
    SERVICE_REGISTERED=0
  fi
}

# Start Navidrome in the background and remember its PID.
start_app() {
  echo "Node is Primary. Starting Navidrome..."

  # Ensure shared directories exist
  mkdir -p /shared_data/plugins /shared_data/cache /shared_data/backup

  /app/navidrome &
  NAVIDROME_PID=$!
  echo "Navidrome started with PID ${NAVIDROME_PID}"
}

# Stop Navidrome if we started it, and reap the process.
stop_app() {
  if [ "${NAVIDROME_PID}" -gt 0 ]; then
    echo "Stopping Navidrome (PID ${NAVIDROME_PID})..."
    # The process may already be gone (crashed); that must not abort the
    # supervisor under `set -e` before the service has been deregistered.
    kill -TERM "${NAVIDROME_PID}" 2>/dev/null || true
    wait "${NAVIDROME_PID}" 2>/dev/null || true
    NAVIDROME_PID=0
  fi
}

# --- Signal Handling (The Safety Net) ---
# If Nomad stops the container, we stop the app and deregister.
cleanup() {
  echo "Caught signal, shutting down..."
  stop_app
  deregister_service
  exit 0
}

trap cleanup TERM INT

# --- Main Loop ---

echo "Starting Supervisor. Waiting for leadership settle..."
echo "Node IP: $NODE_IP"
echo "Consul: $CONSUL_HTTP_ADDR"

# Small sleep to let LiteFS settle and leadership election complete
sleep 5

while true; do
  # In LiteFS 0.5, .primary file exists ONLY on replicas.
  if [ ! -f "$DB_LOCK_FILE" ]; then
    # === WE ARE PRIMARY ===

    # 1. If App is not running, start it and force a fresh registration
    if [ "${NAVIDROME_PID}" -eq 0 ] || ! kill -0 "${NAVIDROME_PID}" 2>/dev/null; then
      if [ "${NAVIDROME_PID}" -gt 0 ]; then
        echo "CRITICAL: Navidrome crashed! Restarting..."
      fi
      start_app
      SERVICE_REGISTERED=0
    fi

    # 2. Register if needed and maintain the heartbeat (TTL)
    maintain_registration
  else
    # === WE ARE REPLICA ===

    # If App is running (we were just demoted), stop it
    if [ "${NAVIDROME_PID}" -gt 0 ]; then
      echo "Lost leadership. Demoting..."
      stop_app
      deregister_service
    fi

    # No service registration exists for replicas to keep Consul clean.
  fi

  # Sleep short enough to update TTL (every 5s is safe for 15s TTL).
  # Sleeping in the background and waiting lets the trap interrupt the
  # sleep instantly.
  sleep 5 &
  wait $! || true
done