diff --git a/entrypoint.sh b/entrypoint.sh index f1155e3..7b668dc 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -3,84 +3,19 @@ set -e # Configuration from environment SERVICE_NAME="navidrome" -# Use Nomad allocation ID for a unique service ID SERVICE_ID="${SERVICE_NAME}-${NOMAD_ALLOC_ID:-$(hostname)}" PORT=4533 CONSUL_HTTP_ADDR="${CONSUL_URL:-http://localhost:8500}" NODE_IP="${ADVERTISE_IP}" -DB_LOCK_FILE="/litefs/.primary" -NAVIDROME_PID=0 - -# --- Helper Functions --- - -# Check if this node is the LiteFS Primary -check_primary() { - # Use the local LiteFS API to get the most accurate status - local status=$(curl -s http://localhost:20202/status || echo "{}") - - # Support both LiteFS 0.5 (flat) and potential future/other versions (nested) - # We use jq to find the first truthy isPrimary value - local is_primary=$(echo "$status" | jq -r 'if type == "object" then (.isPrimary // .info.isPrimary // false) else false end' 2>/dev/null || echo "false") - - if [ "$is_primary" = "true" ]; then - return 0 # We are the primary - fi - return 1 # We are a replica -} - -# Wait for LiteFS to be fully initialized and connected -wait_for_litefs() { - echo "Waiting for LiteFS to settle..." - local timeout=60 - local count=0 - while [ $count -lt $timeout ]; do - local status=$(curl -s http://localhost:20202/status || echo "null") - - # Check if we got a valid JSON object with a definitive isPrimary status - local is_primary_val=$(echo "$status" | jq -r 'if type == "object" then (.isPrimary // .info.isPrimary // "null") else "null" end' 2>/dev/null || echo "null") - - if [ "$is_primary_val" != "null" ]; then - local role="replica" - if [ "$is_primary_val" = "true" ]; then role="primary"; fi - echo "LiteFS initialized. Role: $role" - return 0 - fi - sleep 2 - count=$((count + 2)) - echo -n "." 
- done - echo "ERROR: LiteFS failed to settle after ${timeout}s" - return 1 -} # Tags for the Primary service (Traefik enabled) PRIMARY_TAGS='["navidrome","web","traefik.enable=true","urlprefix-/navidrome","tools","traefik.http.routers.navidromelan.rule=Host(`navidrome.service.dc1.consul`)","traefik.http.routers.navidromewan.rule=Host(`m.fbleagh.duckdns.org`)","traefik.http.routers.navidromewan.middlewares=dex@consulcatalog","traefik.http.routers.navidromewan.tls=true"]' # --- Helper Functions --- -# Backup Database (Only on Primary) -run_backup() { - local backup_dir="/data/backup" - local timestamp=$(date +%Y%m%d_%H%M%S) - local backup_file="${backup_dir}/navidrome.db_${timestamp}.bak" - - echo "Backing up database to ${backup_file}..." - mkdir -p "$backup_dir" - - if litefs export -name navidrome.db "$backup_file"; then - echo "Backup successful." - # Keep only last 7 days - find "$backup_dir" -name "navidrome.db_*.bak" -mtime +7 -delete - echo "Old backups cleaned." - else - echo "ERROR: Backup failed!" - fi -} - # Register Service with TTL Check register_service() { - echo "Promoted! Registering service ${SERVICE_ID}..." - # Convert bash list string to JSON array if needed, but PRIMARY_TAGS is already JSON-like + echo "Registering service ${SERVICE_ID} with Consul..." curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/register" -d "{ \"ID\": \"${SERVICE_ID}\", \"Name\": \"${SERVICE_NAME}\", @@ -101,115 +36,41 @@ pass_ttl() { # Deregister Service deregister_service() { - echo "Demoted/Stopping. Deregistering service ${SERVICE_ID}..." + echo "Deregistering service ${SERVICE_ID} from Consul..." curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/deregister/${SERVICE_ID}" } -# Start Navidrome in Background -start_app() { - echo "Node is Primary. Starting Navidrome..." 
- - # Ensure shared directories exist on persistent host volume - mkdir -p /data/plugins /data/cache /data/backup - - # Explicitly wait for the DB file to appear in the LiteFS mount - local db_file="/litefs/navidrome.db" - local timeout=30 - local count=0 - - echo "Waiting for LiteFS database at ${db_file}..." - while [ ! -f "$db_file" ] && [ $count -lt $timeout ]; do - sleep 1 - count=$((count + 1)) - done - - if [ ! -f "$db_file" ]; then - echo "WARNING: Database file ${db_file} not found after ${timeout}s. LiteFS may still be initializing." - # We continue anyway as Navidrome might create it, but logging this is vital for debugging. - fi - - # Tell Navidrome to use the database directly from the LiteFS mount. - # We include standard Navidrome SQLite params for stability. - export ND_DBPATH="${db_file}?cache=shared&_busy_timeout=15000&_journal_mode=WAL&_foreign_keys=on" - - /app/navidrome & - NAVIDROME_PID=$! - echo "Navidrome started with PID ${NAVIDROME_PID} using DB at ${ND_DBPATH}" -} - -# Stop Navidrome -stop_app() { - if [ "${NAVIDROME_PID}" -gt 0 ]; then - echo "Stopping Navidrome (PID ${NAVIDROME_PID})..." - kill -SIGTERM "${NAVIDROME_PID}" - wait "${NAVIDROME_PID}" 2>/dev/null || true - NAVIDROME_PID=0 - fi -} - -# --- Signal Handling (The Safety Net) --- -# If Nomad stops the container, we stop the app and deregister. +# --- Cleanup --- cleanup() { - echo "Caught signal, shutting down..." - stop_app + echo "Shutting down..." deregister_service exit 0 } trap cleanup TERM INT -# --- Main Loop --- +# --- Main Logic --- -echo "Starting Supervisor. Waiting for leadership settle..." -echo "Node IP: $NODE_IP" -echo "Consul: $CONSUL_HTTP_ADDR" +echo "Starting Navidrome Primary Node..." -# Wait for LiteFS to be fully ready before making decisions -wait_for_litefs || exit 1 +# 1. Ensure shared directories exist on persistent host volume +mkdir -p /data/plugins /data/cache /data/backup -LAST_BACKUP_TIME=0 -BACKUP_INTERVAL=86400 # 24 hours +# 2. 
Tell Navidrome to use the database directly from the LiteFS mount. +export ND_DBPATH="/litefs/navidrome.db?cache=shared&_busy_timeout=15000&_journal_mode=WAL&_foreign_keys=on" -while true; do - # Use explicit LiteFS status check instead of file existence - if check_primary; then - # === WE ARE PRIMARY === - - # 1. If App is not running, start it and register - if [ "${NAVIDROME_PID}" -eq 0 ] || ! kill -0 "${NAVIDROME_PID}" 2>/dev/null; then - if [ "${NAVIDROME_PID}" -gt 0 ]; then - echo "CRITICAL: Navidrome crashed! Restarting..." - fi - start_app - register_service - fi +# 3. Register with Consul +register_service - # 2. Maintain the heartbeat (TTL) - pass_ttl - - # 3. Handle periodic backup - CURRENT_TIME=$(date +%s) - if [ $((CURRENT_TIME - LAST_BACKUP_TIME)) -ge $BACKUP_INTERVAL ]; then - run_backup - LAST_BACKUP_TIME=$CURRENT_TIME - fi - - else - # === WE ARE REPLICA === - - # If App is running (we were just demoted), stop it - if [ "${NAVIDROME_PID}" -gt 0 ]; then - echo "Lost leadership. Demoting..." - stop_app - deregister_service - # Reset backup timer so the next primary can start fresh or we start fresh if promoted again - LAST_BACKUP_TIME=0 - fi - - # No service registration exists for replicas to keep Consul clean. - fi +# 4. Start Navidrome and maintain TTL (the sleep is backgrounded and waited on so the TERM/INT trap is not delayed) +/app/navidrome & +NAVIDROME_PID=$! - # Sleep short enough to update TTL (every 5s is safe for 15s TTL) - sleep 5 & - wait $! # Wait allows the 'trap' to interrupt the sleep instantly +echo "Navidrome running (PID: $NAVIDROME_PID)" + +while kill -0 "$NAVIDROME_PID" 2>/dev/null; do + pass_ttl + sleep 10 & wait $! done + +cleanup diff --git a/litefs.yml b/litefs.yml index 0f27a80..fe23657 100644 --- a/litefs.yml +++ b/litefs.yml @@ -8,6 +8,8 @@ data: # Use Consul for leader election lease: type: "consul" + candidate: true + promote: true advertise-url: "http://${ADVERTISE_IP}:20202" consul: url: "${CONSUL_URL}" @@ -34,3 +36,4 @@ proxy: # Commands to run only on the primary node. 
exec: - cmd: "/usr/local/bin/entrypoint.sh" + if-candidate: true