fix: move to native LiteFS leadership management with if-candidate: true
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 41s
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 41s
This commit is contained in:
181
entrypoint.sh
181
entrypoint.sh
@@ -3,84 +3,19 @@ set -e
|
|||||||
|
|
||||||
# Configuration from environment
|
# Configuration from environment
|
||||||
SERVICE_NAME="navidrome"
|
SERVICE_NAME="navidrome"
|
||||||
# Use Nomad allocation ID for a unique service ID
|
|
||||||
SERVICE_ID="${SERVICE_NAME}-${NOMAD_ALLOC_ID:-$(hostname)}"
|
SERVICE_ID="${SERVICE_NAME}-${NOMAD_ALLOC_ID:-$(hostname)}"
|
||||||
PORT=4533
|
PORT=4533
|
||||||
CONSUL_HTTP_ADDR="${CONSUL_URL:-http://localhost:8500}"
|
CONSUL_HTTP_ADDR="${CONSUL_URL:-http://localhost:8500}"
|
||||||
NODE_IP="${ADVERTISE_IP}"
|
NODE_IP="${ADVERTISE_IP}"
|
||||||
DB_LOCK_FILE="/litefs/.primary"
|
|
||||||
NAVIDROME_PID=0
|
|
||||||
|
|
||||||
# --- Helper Functions ---
|
|
||||||
|
|
||||||
# Check if this node is the LiteFS Primary
|
|
||||||
# check_primary: report whether this node is currently the LiteFS primary.
# Reads the LiteFS status endpoint on localhost:20202 and looks at `isPrimary`
# (top level first, then `.info.isPrimary`).
# Returns: 0 when the extracted value is "true"; 1 in every other case,
# including a failed curl (status falls back to "{}") or a failed jq
# (value falls back to "false").
check_primary() {
|
|
||||||
# Ask the local LiteFS HTTP API rather than inspecting files on the mount.
|
|
||||||
local status=$(curl -s http://localhost:20202/status || echo "{}")
|
|
||||||
|
|
||||||
# Accept both the flat layout (.isPrimary) and a nested one (.info.isPrimary).
|
|
||||||
# jq's `//` skips null AND false, so a missing or false flag ends up as false.
|
|
||||||
local is_primary=$(echo "$status" | jq -r 'if type == "object" then (.isPrimary // .info.isPrimary // false) else false end' 2>/dev/null || echo "false")
|
|
||||||
|
|
||||||
if [ "$is_primary" = "true" ]; then
|
|
||||||
return 0 # This node is the primary
|
|
||||||
fi
|
|
||||||
return 1 # Replica, or status could not be determined
|
|
||||||
}
|
|
||||||
|
|
||||||
# Wait for LiteFS to be fully initialized and connected
|
|
||||||
# wait_for_litefs: block until the local LiteFS API reports a definitive role.
#
# Polls http://localhost:20202/status every 2 seconds until the response is a
# JSON object carrying a boolean `isPrimary` (top level, or under `.info`).
#
# Arguments: $1 - optional timeout in seconds (default 60)
# Outputs:   progress and the detected role ("primary"/"replica") on stdout
# Returns:   0 once a role is known, 1 if the timeout elapses first
#
# NOTE: the role must be extracted with explicit null checks. jq's `//`
# operator treats `false` like a missing value, so `.isPrimary // ...` turned
# a replica's `isPrimary: false` into "null" and replicas never settled.
wait_for_litefs() {
  local timeout="${1:-60}"
  local count=0
  local status is_primary_val role
  local jq_filter='if type == "object" then
      if .isPrimary != null then .isPrimary
      elif (.info | type) == "object" and .info.isPrimary != null then .info.isPrimary
      else "null" end
    else "null" end'

  echo "Waiting for LiteFS to settle..."
  while [ "$count" -lt "$timeout" ]; do
    # --max-time keeps a hung API call from defeating the overall timeout.
    status=$(curl -s --max-time 2 http://localhost:20202/status || echo "null")
    is_primary_val=$(echo "$status" | jq -r "$jq_filter" 2>/dev/null || echo "null")

    # Only a literal true/false is definitive; empty output (empty body),
    # "null" or anything else means LiteFS is not ready yet.
    case "$is_primary_val" in
      true | false)
        role="replica"
        if [ "$is_primary_val" = "true" ]; then role="primary"; fi
        echo "LiteFS initialized. Role: $role"
        return 0
        ;;
    esac

    sleep 2
    count=$((count + 2))
    echo -n "."
  done

  echo "ERROR: LiteFS failed to settle after ${timeout}s"
  return 1
}
|
|
||||||
|
|
||||||
# Tags for the Primary service (Traefik enabled)
|
# Tags for the Primary service (Traefik enabled)
|
||||||
PRIMARY_TAGS='["navidrome","web","traefik.enable=true","urlprefix-/navidrome","tools","traefik.http.routers.navidromelan.rule=Host(`navidrome.service.dc1.consul`)","traefik.http.routers.navidromewan.rule=Host(`m.fbleagh.duckdns.org`)","traefik.http.routers.navidromewan.middlewares=dex@consulcatalog","traefik.http.routers.navidromewan.tls=true"]'
|
PRIMARY_TAGS='["navidrome","web","traefik.enable=true","urlprefix-/navidrome","tools","traefik.http.routers.navidromelan.rule=Host(`navidrome.service.dc1.consul`)","traefik.http.routers.navidromewan.rule=Host(`m.fbleagh.duckdns.org`)","traefik.http.routers.navidromewan.middlewares=dex@consulcatalog","traefik.http.routers.navidromewan.tls=true"]'
|
||||||
|
|
||||||
# --- Helper Functions ---
|
# --- Helper Functions ---
|
||||||
|
|
||||||
# Backup Database (Only on Primary)
|
|
||||||
# run_backup: export the LiteFS-managed database to a timestamped file.
# Writes /data/backup/navidrome.db_<YYYYmmdd_HHMMSS>.bak via `litefs export`
# and, after a successful export, prunes old backup files.
# A failed export is only logged; the function does not return an error
# status for it (the last command on that path is the echo).
run_backup() {
|
|
||||||
local backup_dir="/data/backup"
|
|
||||||
local timestamp=$(date +%Y%m%d_%H%M%S)
|
|
||||||
local backup_file="${backup_dir}/navidrome.db_${timestamp}.bak"
|
|
||||||
|
|
||||||
echo "Backing up database to ${backup_file}..."
|
|
||||||
mkdir -p "$backup_dir"
|
|
||||||
|
|
||||||
# Export through LiteFS by database name instead of copying the file.
|
|
||||||
if litefs export -name navidrome.db "$backup_file"; then
|
|
||||||
echo "Backup successful."
|
|
||||||
# Prune backups whose modification time matches `find -mtime +7`
|
|
||||||
find "$backup_dir" -name "navidrome.db_*.bak" -mtime +7 -delete
|
|
||||||
echo "Old backups cleaned."
|
|
||||||
else
|
|
||||||
echo "ERROR: Backup failed!"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Register Service with TTL Check
|
# Register Service with TTL Check
|
||||||
register_service() {
|
register_service() {
|
||||||
echo "Promoted! Registering service ${SERVICE_ID}..."
|
echo "Registering service ${SERVICE_ID} with Consul..."
|
||||||
# Convert bash list string to JSON array if needed, but PRIMARY_TAGS is already JSON-like
|
|
||||||
curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/register" -d "{
|
curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/register" -d "{
|
||||||
\"ID\": \"${SERVICE_ID}\",
|
\"ID\": \"${SERVICE_ID}\",
|
||||||
\"Name\": \"${SERVICE_NAME}\",
|
\"Name\": \"${SERVICE_NAME}\",
|
||||||
@@ -101,115 +36,41 @@ pass_ttl() {
|
|||||||
|
|
||||||
# Deregister Service
|
# Deregister Service
|
||||||
deregister_service() {
|
deregister_service() {
|
||||||
echo "Demoted/Stopping. Deregistering service ${SERVICE_ID}..."
|
echo "Deregistering service ${SERVICE_ID} from Consul..."
|
||||||
curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/deregister/${SERVICE_ID}"
|
curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/deregister/${SERVICE_ID}"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Start Navidrome in Background
|
# --- Cleanup ---
|
||||||
# start_app: launch Navidrome in the background against the LiteFS database.
# Creates the shared data directories, waits up to 30 seconds for
# /litefs/navidrome.db to appear, exports ND_DBPATH, starts /app/navidrome
# and records its PID in the global NAVIDROME_PID.
# A missing database file after the wait only produces a warning; the app is
# started regardless.
start_app() {
|
|
||||||
echo "Node is Primary. Starting Navidrome..."
|
|
||||||
|
|
||||||
# Ensure shared directories exist on persistent host volume
|
|
||||||
mkdir -p /data/plugins /data/cache /data/backup
|
|
||||||
|
|
||||||
# Explicitly wait for the DB file to appear in the LiteFS mount
|
|
||||||
local db_file="/litefs/navidrome.db"
|
|
||||||
local timeout=30
|
|
||||||
local count=0
|
|
||||||
|
|
||||||
echo "Waiting for LiteFS database at ${db_file}..."
|
|
||||||
# Poll once per second until the file exists or the timeout is reached.
|
|
||||||
while [ ! -f "$db_file" ] && [ $count -lt $timeout ]; do
|
|
||||||
sleep 1
|
|
||||||
count=$((count + 1))
|
|
||||||
done
|
|
||||||
|
|
||||||
if [ ! -f "$db_file" ]; then
|
|
||||||
echo "WARNING: Database file ${db_file} not found after ${timeout}s. LiteFS may still be initializing."
|
|
||||||
# Continue anyway: Navidrome might create the file; the warning is kept for debugging.
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Point Navidrome at the database inside the LiteFS mount.
|
|
||||||
# The query string carries the SQLite parameters passed along with the path.
|
|
||||||
export ND_DBPATH="${db_file}?cache=shared&_busy_timeout=15000&_journal_mode=WAL&_foreign_keys=on"
|
|
||||||
|
|
||||||
# Run in the background and remember the PID in the global NAVIDROME_PID.
|
|
||||||
/app/navidrome &
|
|
||||||
NAVIDROME_PID=$!
|
|
||||||
echo "Navidrome started with PID ${NAVIDROME_PID} using DB at ${ND_DBPATH}"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Stop Navidrome
|
|
||||||
# stop_app: terminate the background Navidrome process, if one is recorded.
#
# Globals:  NAVIDROME_PID (read, then reset to 0)
# Outputs:  a log line on stdout when a process is being stopped
# Returns:  always 0
#
# The script runs under `set -e`, so `kill` must not be allowed to fail the
# shell: when Navidrome has already exited (e.g. it crashed), an unguarded
# `kill` aborts the caller before it can deregister the service from Consul.
stop_app() {
  local pid="${NAVIDROME_PID:-0}"

  # The redirect silences "integer expression expected" for a malformed PID.
  if [ "$pid" -gt 0 ] 2>/dev/null; then
    echo "Stopping Navidrome (PID ${pid})..."
    # -TERM (no SIG prefix) is the portable spelling; ignore "no such process".
    kill -TERM "$pid" 2>/dev/null || true
    # Reap the child; its exit status (143 after SIGTERM) is not an error here.
    wait "$pid" 2>/dev/null || true
  fi

  NAVIDROME_PID=0
  return 0
}
|
|
||||||
|
|
||||||
# --- Signal Handling (The Safety Net) ---
|
|
||||||
# If Nomad stops the container, we stop the app and deregister.
|
|
||||||
cleanup() {
|
cleanup() {
|
||||||
echo "Caught signal, shutting down..."
|
echo "Shutting down..."
|
||||||
stop_app
|
|
||||||
deregister_service
|
deregister_service
|
||||||
exit 0
|
exit 0
|
||||||
}
|
}
|
||||||
|
|
||||||
trap cleanup TERM INT
|
trap cleanup TERM INT
|
||||||
|
|
||||||
# --- Main Loop ---
|
# --- Main Logic ---
|
||||||
|
|
||||||
echo "Starting Supervisor. Waiting for leadership settle..."
|
echo "Starting Navidrome Primary Node..."
|
||||||
echo "Node IP: $NODE_IP"
|
|
||||||
echo "Consul: $CONSUL_HTTP_ADDR"
|
|
||||||
|
|
||||||
# Wait for LiteFS to be fully ready before making decisions
|
# 1. Ensure shared directories exist on persistent host volume
|
||||||
wait_for_litefs || exit 1
|
mkdir -p /data/plugins /data/cache /data/backup
|
||||||
|
|
||||||
LAST_BACKUP_TIME=0
|
# 2. Tell Navidrome to use the database directly from the LiteFS mount.
|
||||||
BACKUP_INTERVAL=86400 # 24 hours
|
export ND_DBPATH="/litefs/navidrome.db?cache=shared&_busy_timeout=15000&_journal_mode=WAL&_foreign_keys=on"
|
||||||
|
|
||||||
while true; do
|
# 3. Register with Consul
|
||||||
# Use explicit LiteFS status check instead of file existence
|
register_service
|
||||||
if check_primary; then
|
|
||||||
# === WE ARE PRIMARY ===
|
|
||||||
|
|
||||||
# 1. If App is not running, start it and register
|
# 4. Start Navidrome and Maintain TTL
|
||||||
if [ "${NAVIDROME_PID}" -eq 0 ] || ! kill -0 "${NAVIDROME_PID}" 2>/dev/null; then
|
/app/navidrome &
|
||||||
if [ "${NAVIDROME_PID}" -gt 0 ]; then
|
NAVIDROME_PID=$!
|
||||||
echo "CRITICAL: Navidrome crashed! Restarting..."
|
|
||||||
fi
|
|
||||||
start_app
|
|
||||||
register_service
|
|
||||||
fi
|
|
||||||
|
|
||||||
# 2. Maintain the heartbeat (TTL)
|
echo "Navidrome running (PID: $NAVIDROME_PID)"
|
||||||
pass_ttl
|
|
||||||
|
|
||||||
# 3. Handle periodic backup
|
while kill -0 $NAVIDROME_PID 2>/dev/null; do
|
||||||
CURRENT_TIME=$(date +%s)
|
pass_ttl
|
||||||
if [ $((CURRENT_TIME - LAST_BACKUP_TIME)) -ge $BACKUP_INTERVAL ]; then
|
sleep 10
|
||||||
run_backup
|
|
||||||
LAST_BACKUP_TIME=$CURRENT_TIME
|
|
||||||
fi
|
|
||||||
|
|
||||||
else
|
|
||||||
# === WE ARE REPLICA ===
|
|
||||||
|
|
||||||
# If App is running (we were just demoted), stop it
|
|
||||||
if [ "${NAVIDROME_PID}" -gt 0 ]; then
|
|
||||||
echo "Lost leadership. Demoting..."
|
|
||||||
stop_app
|
|
||||||
deregister_service
|
|
||||||
# Reset backup timer so the next primary can start fresh or we start fresh if promoted again
|
|
||||||
LAST_BACKUP_TIME=0
|
|
||||||
fi
|
|
||||||
|
|
||||||
# No service registration exists for replicas to keep Consul clean.
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Sleep short enough to update TTL (every 5s is safe for 15s TTL)
|
|
||||||
sleep 5 &
|
|
||||||
wait $! # Wait allows the 'trap' to interrupt the sleep instantly
|
|
||||||
done
|
done
|
||||||
|
|
||||||
|
cleanup
|
||||||
|
|||||||
@@ -8,6 +8,8 @@ data:
|
|||||||
# Use Consul for leader election
|
# Use Consul for leader election
|
||||||
lease:
|
lease:
|
||||||
type: "consul"
|
type: "consul"
|
||||||
|
candidate: true
|
||||||
|
promote: true
|
||||||
advertise-url: "http://${ADVERTISE_IP}:20202"
|
advertise-url: "http://${ADVERTISE_IP}:20202"
|
||||||
consul:
|
consul:
|
||||||
url: "${CONSUL_URL}"
|
url: "${CONSUL_URL}"
|
||||||
@@ -34,3 +36,4 @@ proxy:
|
|||||||
# Commands to run only on the primary node.
|
# Commands to run only on the primary node.
|
||||||
exec:
|
exec:
|
||||||
- cmd: "/usr/local/bin/entrypoint.sh"
|
- cmd: "/usr/local/bin/entrypoint.sh"
|
||||||
|
if-candidate: true
|
||||||
|
|||||||
Reference in New Issue
Block a user