Compare commits
38 Commits
045fc6e82b
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 97733cf7b8 | |||
| 5c1fedd379 | |||
| bb18672bfc | |||
| 48a005cfbc | |||
| 94d8e290bf | |||
| 3232d6568d | |||
| 1117fb178b | |||
| e678120572 | |||
| 92f9209dcd | |||
| 33b84be0a5 | |||
| 45e40bf273 | |||
| 8acb098918 | |||
| dd413d1342 | |||
| 7ea127f9cb | |||
| 9232aeccc5 | |||
| 0200afdc0f | |||
| e0262dc88b | |||
| 107e37cb3e | |||
| 5311f0069a | |||
| af8ce0ef2b | |||
| 5f9e4d23fb | |||
| 6e7c729c5e | |||
| 37f0dcb1e7 | |||
| 402553a674 | |||
| c04c00143e | |||
| 3e6a4d1704 | |||
| 362f838f7c | |||
| a8e02ae063 | |||
| 538ee01b72 | |||
| 25885ea4f0 | |||
| a586d60682 | |||
| 59f406d3b7 | |||
| f08c715d75 | |||
| 8f1565b1af | |||
| 4538ad5909 | |||
| f0b02904a8 | |||
| 7f1f3321e0 | |||
| 23a65be4d8 |
54
.github/workflows/deploy.yml
vendored
Normal file
54
.github/workflows/deploy.yml
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
# Deploys the navidrome-litefs Nomad job.
#
# Triggers:
#   * automatically after the "Build and Push Docker Image" workflow completes
#     (the job only runs when that workflow concluded with success), or
#   * manually via workflow_dispatch, optionally pinning a container SHA.
name: Deploy to Nomad

on:
  workflow_run:
    workflows: ["Build and Push Docker Image"]
    types:
      - completed
  workflow_dispatch:
    inputs:
      container_sha:
        description: 'Container SHA to deploy (leave empty for latest commit)'
        required: false
        type: string

jobs:
  nomad:
    runs-on: ubuntu-latest
    name: Deploy to Nomad
    if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}

    steps:
      # NOTE(review): for workflow_run events this checks out the default
      # branch tip, which may be newer than workflow_run.head_sha — confirm
      # that deploying the job file from the branch tip is intended.
      - name: Checkout Repository
        uses: actions/checkout@v4

      - name: Setup Nomad CLI
        run: |
          NOMAD_VERSION="1.10.5"
          # -f makes curl fail on HTTP errors instead of saving an error page as nomad.zip
          curl -fsSL "https://releases.hashicorp.com/nomad/${NOMAD_VERSION}/nomad_${NOMAD_VERSION}_linux_amd64.zip" -o nomad.zip
          unzip nomad.zip
          sudo mv nomad /usr/local/bin/
          rm nomad.zip
          nomad version

      # Resolve which image tag to deploy:
      #   manual run with an explicit input -> that input
      #   workflow_run                      -> commit the build workflow ran on
      #   otherwise                         -> commit this workflow runs on
      - name: Set Container Version
        id: container_version
        env:
          # Contexts are passed through the environment rather than being
          # interpolated into the script, so a crafted input cannot inject
          # shell commands.
          EVENT_NAME: ${{ github.event_name }}
          REQUESTED_SHA: ${{ inputs.container_sha }}
          WORKFLOW_RUN_SHA: ${{ github.event.workflow_run.head_sha }}
          DEFAULT_SHA: ${{ github.sha }}
        run: |
          if [ "$EVENT_NAME" = "workflow_dispatch" ] && [ -n "$REQUESTED_SHA" ]; then
            sha="$REQUESTED_SHA"
          elif [ "$EVENT_NAME" = "workflow_run" ]; then
            sha="$WORKFLOW_RUN_SHA"
          else
            sha="$DEFAULT_SHA"
          fi
          echo "sha=${sha}" >> "$GITHUB_OUTPUT"

      - name: Deploy Nomad Job
        id: deploy
        env:
          NOMAD_ADDR: http://192.168.4.36:4646
          NOMAD_TOKEN: ${{ secrets.NOMAD_TOKEN }}
          # Step output is derived from user input, so it is also passed via env.
          CONTAINER_SHA: ${{ steps.container_version.outputs.sha }}
        run: |
          echo "Deploying container version: ${CONTAINER_SHA}"
          nomad job run \
            -var="container_sha=${CONTAINER_SHA}" \
            navidrome-litefs.nomad
|
||||
@@ -18,6 +18,9 @@ RUN chmod +x /usr/local/bin/entrypoint.sh
|
||||
# Copy LiteFS configuration
|
||||
COPY litefs.yml /etc/litefs.yml
|
||||
|
||||
# Create mount points and data directories
|
||||
RUN mkdir -p /litefs /data
|
||||
|
||||
# LiteFS becomes the supervisor.
|
||||
|
||||
# It will mount the FUSE fs and then execute the command defined in litefs.yml's exec section.
|
||||
|
||||
5
conductor/archive/fix_navidrome_paths_20260209/index.md
Normal file
5
conductor/archive/fix_navidrome_paths_20260209/index.md
Normal file
@@ -0,0 +1,5 @@
|
||||
# Track fix_navidrome_paths_20260209 Context
|
||||
|
||||
- [Specification](./spec.md)
|
||||
- [Implementation Plan](./plan.md)
|
||||
- [Metadata](./metadata.json)
|
||||
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"track_id": "fix_navidrome_paths_20260209",
|
||||
"type": "bug",
|
||||
"status": "new",
|
||||
"created_at": "2026-02-09T14:30:00Z",
|
||||
"updated_at": "2026-02-09T14:30:00Z",
|
||||
"description": "Fix Navidrome database location to ensure it uses LiteFS mount and resolve process path conflicts."
|
||||
}
|
||||
17
conductor/archive/fix_navidrome_paths_20260209/plan.md
Normal file
17
conductor/archive/fix_navidrome_paths_20260209/plan.md
Normal file
@@ -0,0 +1,17 @@
|
||||
# Plan: Correct Navidrome Database and Plugins Location (`fix_navidrome_paths`)
|
||||
|
||||
## Phase 1: Configuration Updates [x]
|
||||
- [x] Task: Update `navidrome-litefs-v2.nomad` with corrected paths (76398de)
|
||||
- [x] Task: Update `entrypoint.sh` to handle plugins folder and environment cleanup (decb9f5)
|
||||
- [x] Task: Conductor - User Manual Verification 'Phase 1: Configuration Updates' (Protocol in workflow.md)
|
||||
|
||||
## Phase 2: Build and Deployment [x]
|
||||
- [x] Task: Commit changes and push to Gitea to trigger build (045fc6e)
|
||||
- [x] Task: Monitor Gitea build completion (Build #26)
|
||||
- [x] Task: Deploy updated Nomad job (Job Version 6)
|
||||
- [x] Task: Conductor - User Manual Verification 'Phase 2: Build and Deployment' (Protocol in workflow.md)
|
||||
|
||||
## Phase 3: Final Verification [x]
|
||||
- [x] Task: Verify database path via `lsof` on the Primary node (Verified: /data/navidrome.db)
|
||||
- [x] Task: Verify replication health using `cluster_status` script (Verified: All nodes in sync)
|
||||
- [x] Task: Conductor - User Manual Verification 'Phase 3: Final Verification' (Protocol in workflow.md)
|
||||
25
conductor/archive/fix_navidrome_paths_20260209/spec.md
Normal file
25
conductor/archive/fix_navidrome_paths_20260209/spec.md
Normal file
@@ -0,0 +1,25 @@
|
||||
# Specification: Correct Navidrome Database and Plugins Location (`fix_navidrome_paths`)
|
||||
|
||||
## Overview
|
||||
Force Navidrome to use the `/data` LiteFS mount for its SQLite database by setting the `DATAFOLDER` to `/data`. To avoid the "Operation not permitted" error caused by LiteFS's restriction on directory creation, redirect the Navidrome plugins folder to persistent shared storage.
|
||||
|
||||
## Functional Requirements
|
||||
- **Nomad Job Configuration (`navidrome-litefs-v2.nomad`):**
|
||||
- Set `ND_DATAFOLDER="/data"`. This will force Navidrome to create and use `navidrome.db` on the LiteFS mount.
|
||||
- Set `ND_PLUGINS_FOLDER="/shared_data/plugins"`. This prevents Navidrome from attempting to create a `plugins` directory in the read-only/virtual `/data` mount.
|
||||
- Keep `ND_CACHEFOLDER` and `ND_BACKUP_PATH` pointing to `/shared_data` subdirectories.
|
||||
- **Entrypoint Logic (`entrypoint.sh`):**
|
||||
- Ensure it creates `/shared_data/plugins` if it doesn't exist.
|
||||
- Remove the explicit `export ND_DATABASE_PATH` if it conflicts with the new `DATAFOLDER` logic, or keep it as an explicit override.
|
||||
- **Verification:**
|
||||
- Confirm via `lsof` that Navidrome is finally using `/data/navidrome.db`.
|
||||
- Confirm that LiteFS `/debug/vars` now reports the database in its active set.
|
||||
|
||||
## Non-Functional Requirements
|
||||
- **Persistence:** Ensure all non-database files (plugins, cache, backups) are stored on the shared host mount (`/shared_data`) to survive container restarts and migrations.
|
||||
|
||||
## Acceptance Criteria
|
||||
- [ ] Navidrome successfully starts with `/data` as its data folder.
|
||||
- [ ] No "Operation not permitted" errors occur during startup.
|
||||
- [ ] `lsof` confirms `/data/navidrome.db` is open by the Navidrome process.
|
||||
- [ ] LiteFS `txid` increases on the Primary and replicates to Replicas when Navidrome writes to the DB.
|
||||
@@ -8,7 +8,10 @@
|
||||
## Storage & Database
|
||||
- **SQLite:** The primary relational database used by Navidrome for metadata and state.
|
||||
- **LiteFS:** A FUSE-based filesystem that provides synchronous replication of the SQLite database across the cluster.
|
||||
- **Process Management:** LiteFS-supervised with a robust TTL-heartbeat registration script ensuring zero-downtime failover and clean service catalog management.
|
||||
- **Process Management:** LiteFS-supervised with TTL-based self-registration for clean and resilient service catalog management.
|
||||
- **Hybrid Storage Model:**
|
||||
- **Replicated:** SQLite database on LiteFS FUSE mount (`/data`).
|
||||
- **Shared:** Plugins, cache, and backups on persistent network storage (`/shared_data`).
|
||||
|
||||
## Automation & Delivery
|
||||
- **Gitea Actions:** Automates the multi-arch (AMD64/ARM64) building and pushing of the custom supervised container image.
|
||||
|
||||
@@ -2,7 +2,4 @@
|
||||
|
||||
This file tracks all major tracks for the project. Each track has its own detailed plan in its respective folder.
|
||||
---
|
||||
---
|
||||
|
||||
- [x] **Track: Update Monitor Discovery Logic**
|
||||
*Link: [./tracks/update_monitor_discovery_20260208/](./update_monitor_discovery_20260208/)*
|
||||
---
|
||||
@@ -1,30 +0,0 @@
|
||||
# Plan: Cluster Diagnosis and Script Enhancement (`diagnose_and_enhance`)
|
||||
|
||||
## Phase 1: Enhanced Diagnostics (Consul) [x] [checkpoint: a686c5b]
|
||||
- [x] Task: Update `consul_client.py` to fetch detailed health check output
|
||||
- [x] Write tests for fetching `Output` field from Consul checks
|
||||
- [x] Implement logic to extract and store the `Output` (error message)
|
||||
- [x] Task: Update aggregator and formatter to display Consul errors
|
||||
- [x] Update aggregation logic to include `consul_error`
|
||||
- [x] Update table formatter to indicate an error (maybe a flag or color)
|
||||
- [x] Add a "Diagnostics" section to the output to print full error details
|
||||
- [x] Task: Conductor - User Manual Verification 'Phase 1: Enhanced Diagnostics (Consul)' (Protocol in workflow.md)
|
||||
|
||||
## Phase 2: Nomad Integration and Logs [x] [checkpoint: 6d77729]
|
||||
- [x] Task: Implement `nomad_client.py` wrapper
|
||||
- [x] Write tests for `get_allocation_logs`, `get_node_status`, and `restart_allocation` (mocking subprocess)
|
||||
- [x] Implement `subprocess.run(["nomad", ...])` logic to fetch logs and restart allocations
|
||||
- [x] Task: Integrate Nomad logs into diagnosis
|
||||
- [x] Update aggregator to call Nomad client for critical nodes
|
||||
- [x] Update "Diagnostics" section to display the last 20 lines of stderr
|
||||
- [x] Task: Conductor - User Manual Verification 'Phase 2: Nomad Integration and Logs' (Protocol in workflow.md)
|
||||
|
||||
## Phase 3: Advanced LiteFS Status [ ]
|
||||
- [ ] Task: Implement `litefs_status` via `nomad alloc exec`
|
||||
- [ ] Write tests for executing remote commands via Nomad
|
||||
- [ ] Update `litefs_client.py` to fallback to `nomad alloc exec` if HTTP fails
|
||||
- [ ] Parse `litefs status` output (text/json) to extract uptime and replication lag
|
||||
- [ ] Task: Final Polish and Diagnosis Run
|
||||
- [ ] Ensure all pieces work together
|
||||
- [ ] Run the script to diagnose `odroid8`
|
||||
- [ ] Task: Conductor - User Manual Verification 'Phase 3: Advanced LiteFS Status' (Protocol in workflow.md)
|
||||
@@ -1,22 +0,0 @@
|
||||
# Plan: Fix LiteFS Configuration and Process Management (`fix_litefs_config`)
|
||||
|
||||
## Phase 1: Configuration and Image Structure [ ]
|
||||
- [x] Task: Update `litefs.yml` to include the `exec` block (396dfeb)
|
||||
- [x] Task: Update `Dockerfile` to use LiteFS as the supervisor (`ENTRYPOINT ["litefs", "mount"]`) (ef91b8e)
|
||||
- [x] Task: Update `navidrome-litefs-v2.nomad` with corrected storage paths (`ND_DATAFOLDER`, `ND_CACHEFOLDER`, `ND_BACKUP_PATH`) (5cbb657)
|
||||
- [ ] Task: Conductor - User Manual Verification 'Phase 1: Configuration and Image Structure' (Protocol in workflow.md)
|
||||
|
||||
## Phase 2: Entrypoint and Registration Logic [x] [checkpoint: 9cd5455]
|
||||
- [x] Task: Refactor `entrypoint.sh` to handle leadership-aware process management (9cd5455)
|
||||
- [x] Integrate Consul registration logic (from `register.sh`)
|
||||
- [x] Implement loop to start/stop Navidrome based on `/data/.primary` existence
|
||||
- [x] Ensure proper signal handling for Navidrome shutdown
|
||||
- [x] Task: Clean up redundant scripts (e.g., `register.sh` if fully integrated) (9cd5455)
|
||||
- [ ] Task: Conductor - User Manual Verification 'Phase 2: Entrypoint and Registration Logic' (Protocol in workflow.md)
|
||||
|
||||
## Phase 3: Deployment and Failover Verification [ ]
|
||||
- [ ] Task: Build and push the updated Docker image via Gitea Actions (if possible) or manual trigger
|
||||
- [ ] Task: Deploy the updated Nomad job
|
||||
- [ ] Task: Verify cluster health and process distribution using `cluster_status` script
|
||||
- [ ] Task: Perform a manual failover (stop primary allocation) and verify Navidrome migrates correctly
|
||||
- [ ] Task: Conductor - User Manual Verification 'Phase 3: Deployment and Failover Verification' (Protocol in workflow.md)
|
||||
@@ -1,17 +0,0 @@
|
||||
# Plan: Correct Navidrome Database and Plugins Location (`fix_navidrome_paths`)
|
||||
|
||||
## Phase 1: Configuration Updates [x]
|
||||
- [x] Task: Update `navidrome-litefs-v2.nomad` with corrected paths (76398de)
|
||||
- [x] Task: Update `entrypoint.sh` to handle plugins folder and environment cleanup (decb9f5)
|
||||
- [x] Task: Conductor - User Manual Verification 'Phase 1: Configuration Updates' (Protocol in workflow.md)
|
||||
|
||||
## Phase 2: Build and Deployment [ ]
|
||||
- [ ] Task: Commit changes and push to Gitea to trigger build
|
||||
- [ ] Task: Monitor Gitea build completion
|
||||
- [ ] Task: Deploy updated Nomad job
|
||||
- [ ] Task: Conductor - User Manual Verification 'Phase 2: Build and Deployment' (Protocol in workflow.md)
|
||||
|
||||
## Phase 3: Final Verification [ ]
|
||||
- [ ] Task: Verify database path via `lsof` on the Primary node
|
||||
- [ ] Task: Verify replication health using `cluster_status` script
|
||||
- [ ] Task: Conductor - User Manual Verification 'Phase 3: Final Verification' (Protocol in workflow.md)
|
||||
@@ -1,26 +0,0 @@
|
||||
# Plan: Fix Odroid8 and Script Robustness (`fix_odroid8_and_script`)
|
||||
|
||||
## Phase 1: Script Robustness [x] [checkpoint: 860000b]
|
||||
- [x] Task: Update `nomad_client.py` to handle subprocess errors gracefully
|
||||
- [x] Write tests for handling Nomad CLI absence/failure
|
||||
- [x] Update implementation to return descriptive error objects or `None` without crashing
|
||||
- [x] Task: Update aggregator and formatter to handle Nomad errors
|
||||
- [x] Update `cluster_aggregator.py` to gracefully skip Nomad calls if they fail
|
||||
- [x] Update `output_formatter.py` to display "Nomad Error" in relevant cells
|
||||
- [x] Add a global "Nomad Connectivity Warning" to the summary
|
||||
- [x] Task: Conductor - User Manual Verification 'Phase 1: Script Robustness' (Protocol in workflow.md)
|
||||
|
||||
## Phase 2: Odroid8 Recovery [ ]
|
||||
- [x] Task: Identify and verify `odroid8` LiteFS data path
|
||||
- [x] Run `nomad alloc status` to find the volume mount for `odroid8`
|
||||
- [x] Provide the user with the exact host path to the LiteFS data
|
||||
- [x] Task: Guide user through manual cleanup
|
||||
- [x] Provide steps to stop the allocation
|
||||
- [x] Provide the `rm` command to clear the LiteFS metadata
|
||||
- [x] Provide steps to restart and verify the node
|
||||
- [~] Task: Conductor - User Manual Verification 'Phase 2: Odroid8 Recovery' (Protocol in workflow.md)
|
||||
|
||||
## Phase 3: Final Verification [x]
|
||||
- [x] Task: Final verification run of the script
|
||||
- [x] Task: Verify cluster health in Consul and LiteFS API
|
||||
- [x] Task: Conductor - User Manual Verification 'Phase 3: Final Verification' (Protocol in workflow.md)
|
||||
@@ -1,22 +0,0 @@
|
||||
# Plan: Implement TTL Heartbeat Service Registration (`implement_ttl_heartbeat`)
|
||||
|
||||
## Phase 1: Container Environment Preparation [x] [checkpoint: 51b8fce]
|
||||
- [x] Task: Update `Dockerfile` to install `curl` and `jq` (f7fe258)
|
||||
- [x] Task: Verify `litefs.yml` points to `entrypoint.sh` (should already be correct) (verified)
|
||||
- [x] Task: Conductor - User Manual Verification 'Phase 1: Container Environment Preparation' (Protocol in workflow.md)
|
||||
|
||||
## Phase 2: Script Implementation [x] [checkpoint: 139016f]
|
||||
- [x] Task: Refactor `entrypoint.sh` with the TTL Heartbeat logic (d977301)
|
||||
- [x] Implement `register_service` with TTL check definition
|
||||
- [x] Implement `pass_ttl` loop
|
||||
- [x] Implement robust `stop_app` and signal trapping
|
||||
- [x] Ensure correct Primary/Replica detection logic (LiteFS 0.5: Primary = No `.primary` file)
|
||||
- [x] Task: Conductor - User Manual Verification 'Phase 2: Script Implementation' (Protocol in workflow.md)
|
||||
|
||||
## Phase 3: Deployment and Verification [ ]
|
||||
- [ ] Task: Commit changes and push to Gitea to trigger build
|
||||
- [ ] Task: Monitor Gitea build completion
|
||||
- [ ] Task: Deploy updated Nomad job (forcing update if necessary)
|
||||
- [ ] Task: Verify "Clean" state in Consul (only one primary registered)
|
||||
- [ ] Task: Verify Failover/Stop behavior (immediate deregistration vs TTL expiry)
|
||||
- [ ] Task: Conductor - User Manual Verification 'Phase 3: Deployment and Verification' (Protocol in workflow.md)
|
||||
@@ -1,23 +0,0 @@
|
||||
# Plan: Update Monitor Discovery Logic (`update_monitor_discovery`)
|
||||
|
||||
## Phase 1: Nomad Discovery Enhancement [x] [checkpoint: 353683e]
|
||||
- [x] Task: Update `nomad_client.py` to fetch job allocations with IPs (353683e)
|
||||
- [x] Write tests for parsing allocation IPs from `nomad job status` or `nomad alloc status`
|
||||
- [x] Implement `get_job_allocations(job_id)` returning a list of dicts (id, node, ip)
|
||||
- [x] Task: Conductor - User Manual Verification 'Phase 1: Nomad Discovery Enhancement' (Protocol in workflow.md)
|
||||
|
||||
## Phase 2: Aggregator Refactor [x] [checkpoint: 655a9b2]
|
||||
- [x] Task: Refactor `cluster_aggregator.py` to drive discovery via Nomad (655a9b2)
|
||||
- [x] Update `get_cluster_status` to call `nomad_client.get_job_allocations` first
|
||||
- [x] Update loop to iterate over allocations and supplement with LiteFS and Consul data
|
||||
- [x] Task: Update `consul_client.py` to fetch all services once and allow lookup by IP/ID (655a9b2)
|
||||
- [x] Task: Update tests for the new discovery flow (655a9b2)
|
||||
- [x] Task: Conductor - User Manual Verification 'Phase 2: Aggregator Refactor' (Protocol in workflow.md)
|
||||
|
||||
## Phase 3: UI and Health Logic [x] [checkpoint: 21e9c3d]
|
||||
- [x] Task: Update `output_formatter.py` for "Standby" nodes (21e9c3d)
|
||||
- [x] Update table formatting to handle missing Consul status for replicas
|
||||
- [x] Task: Update Cluster Health calculation (21e9c3d)
|
||||
- [x] "Healthy" = 1 Primary (Consul passing) + N Replicas (LiteFS connected)
|
||||
- [x] Task: Final verification run (21e9c3d)
|
||||
- [x] Task: Conductor - User Manual Verification 'Phase 3: Final Verification' (Protocol in workflow.md)
|
||||
116
entrypoint.sh
116
entrypoint.sh
@@ -1,14 +1,10 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# Configuration from environment
|
||||
SERVICE_NAME="navidrome"
|
||||
# Use Nomad allocation ID for a unique service ID
|
||||
SERVICE_ID="${SERVICE_NAME}-${NOMAD_ALLOC_ID:-$(hostname)}"
|
||||
PORT=4533
|
||||
CONSUL_HTTP_ADDR="${CONSUL_URL:-http://localhost:8500}"
|
||||
NODE_IP="${ADVERTISE_IP}"
|
||||
DB_LOCK_FILE="/data/.primary"
|
||||
NAVIDROME_PID=0
|
||||
|
||||
# Tags for the Primary service (Traefik enabled)
|
||||
@@ -16,10 +12,43 @@ PRIMARY_TAGS='["navidrome","web","traefik.enable=true","urlprefix-/navidrome","t
|
||||
|
||||
# --- Helper Functions ---
|
||||
|
||||
# Report whether this node currently holds the LiteFS primary role.
# Exit status: 0 when the local LiteFS /info endpoint reports isPrimary=true,
# 1 in every other case (replica, endpoint unreachable, unparsable response).
check_primary() {
    local info_json
    local primary_flag

    # A failed request is treated as an empty object, i.e. "not primary".
    info_json=$(curl -s http://localhost:20202/info || echo "{}")

    # Anything that is not a JSON object, or that jq cannot parse, counts as "false".
    primary_flag=$(
        echo "$info_json" \
            | jq -r 'if type == "object" then (.isPrimary // false) else false end' 2>/dev/null \
            || echo "false"
    )

    [ "$primary_flag" = "true" ] && return 0  # We are the primary
    return 1                                  # We are a replica
}
|
||||
|
||||
# Wait for LiteFS to settle and determine its role.
#
# Polls the local LiteFS /info endpoint every 2 seconds until the response is
# a JSON object carrying a non-null `isPrimary` value, then prints the role.
# Returns 0 once the role is known, 1 if nothing usable arrives within 60s.
wait_for_litefs() {
    echo "Waiting for LiteFS to settle..."
    local timeout=60
    local count=0
    local status
    local is_primary_val
    local role

    while [ "$count" -lt "$timeout" ]; do
        status=$(curl -s http://localhost:20202/info || echo "null")

        # jq's `//` operator falls through on `false` as well as `null`, so
        # `.isPrimary // "null"` would turn a replica's `false` into "null"
        # and the replica would never be seen as initialized. Compare against
        # null explicitly so that `false` is preserved.
        is_primary_val=$(echo "$status" | jq -r 'if type == "object" then (.isPrimary | if . == null then "null" else . end) else "null" end' 2>/dev/null || echo "null")

        if [ "$is_primary_val" != "null" ]; then
            role="replica"
            if [ "$is_primary_val" = "true" ]; then role="primary"; fi
            echo "LiteFS initialized. Role: $role"
            return 0
        fi

        sleep 2
        count=$((count + 2))
        echo -n "."
    done

    echo "ERROR: LiteFS failed to settle after ${timeout}s"
    return 1
}
|
||||
|
||||
# Register Service with TTL Check
|
||||
register_service() {
|
||||
echo "Promoted! Registering service ${SERVICE_ID}..."
|
||||
# Convert bash list string to JSON array if needed, but PRIMARY_TAGS is already JSON-like
|
||||
echo "Registering service ${SERVICE_ID} with Consul..."
|
||||
curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/register" -d "{
|
||||
\"ID\": \"${SERVICE_ID}\",
|
||||
\"Name\": \"${SERVICE_NAME}\",
|
||||
@@ -40,7 +69,7 @@ pass_ttl() {
|
||||
|
||||
# Deregister Service
|
||||
deregister_service() {
|
||||
echo "Demoted/Stopping. Deregistering service ${SERVICE_ID}..."
|
||||
echo "Deregistering service ${SERVICE_ID} from Consul..."
|
||||
curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/deregister/${SERVICE_ID}"
|
||||
}
|
||||
|
||||
@@ -49,11 +78,48 @@ start_app() {
|
||||
echo "Node is Primary. Starting Navidrome..."
|
||||
|
||||
# Ensure shared directories exist
|
||||
mkdir -p /shared_data/plugins /shared_data/cache /shared_data/backup
|
||||
mkdir -p /shared_data/plugins /shared_data/cache /shared_data/backup /shared_data/artist_images /shared_data/artwork
|
||||
|
||||
# SEEDING LOGIC: If DB doesn't exist in cluster, restore from backup
|
||||
if [ ! -f /data/navidrome.db ]; then
|
||||
echo "Database /data/navidrome.db not found. Looking for backups to seed..."
|
||||
local latest_backup=$(ls -t /shared_data/backup/navidrome.db_*.bak 2>/dev/null | head -n 1)
|
||||
if [ -n "$latest_backup" ]; then
|
||||
echo "Seeding from $latest_backup..."
|
||||
litefs import -name navidrome.db "$latest_backup"
|
||||
else
|
||||
echo "No backups found. Navidrome will start with a fresh database."
|
||||
fi
|
||||
fi
|
||||
|
||||
# Wait for LiteFS to expose the DB file
|
||||
echo "Waiting for /data/navidrome.db to be exposed by LiteFS..."
|
||||
local db_timeout=30
|
||||
local db_count=0
|
||||
while [ ! -f /data/navidrome.db ] && [ $db_count -lt $db_timeout ]; do
|
||||
sleep 1
|
||||
db_count=$((db_count + 1))
|
||||
done
|
||||
|
||||
# Setup local data folder with BIND MOUNT for the DB
|
||||
# This allows SQLite to create -wal/-shm files in the local writable directory
|
||||
# while the main DB file is managed by LiteFS.
|
||||
rm -rf /local/navidrome_data
|
||||
mkdir -p /local/navidrome_data
|
||||
|
||||
touch /local/navidrome_data/navidrome.db
|
||||
mount --bind /data/navidrome.db /local/navidrome_data/navidrome.db
|
||||
|
||||
# Configuration
|
||||
export ND_DATAFOLDER="/local/navidrome_data"
|
||||
export ND_CACHEFOLDER="/shared_data/cache"
|
||||
export ND_BACKUP_PATH="/shared_data/backup"
|
||||
export ND_PLUGINS_FOLDER="/shared_data/plugins"
|
||||
export ND_ARTISTIMAGEFOLDER="artist_images"
|
||||
|
||||
/app/navidrome &
|
||||
NAVIDROME_PID=$!
|
||||
echo "Navidrome started with PID ${NAVIDROME_PID}"
|
||||
echo "Navidrome running (PID: $NAVIDROME_PID) with DataFolder at /local/navidrome_data (DB bind-mounted)"
|
||||
}
|
||||
|
||||
# Stop Navidrome
|
||||
@@ -63,13 +129,13 @@ stop_app() {
|
||||
kill -SIGTERM "${NAVIDROME_PID}"
|
||||
wait "${NAVIDROME_PID}" 2>/dev/null || true
|
||||
NAVIDROME_PID=0
|
||||
umount /local/navidrome_data/navidrome.db 2>/dev/null || true
|
||||
fi
|
||||
}
|
||||
|
||||
# --- Signal Handling (The Safety Net) ---
|
||||
# If Nomad stops the container, we stop the app and deregister.
|
||||
# --- Cleanup ---
|
||||
cleanup() {
|
||||
echo "Caught signal, shutting down..."
|
||||
echo "Shutting down..."
|
||||
stop_app
|
||||
deregister_service
|
||||
exit 0
|
||||
@@ -80,43 +146,23 @@ trap cleanup TERM INT
|
||||
# --- Main Loop ---
|
||||
|
||||
echo "Starting Supervisor. Waiting for leadership settle..."
|
||||
echo "Node IP: $NODE_IP"
|
||||
echo "Consul: $CONSUL_HTTP_ADDR"
|
||||
|
||||
# Small sleep to let LiteFS settle and leadership election complete
|
||||
sleep 5
|
||||
wait_for_litefs || exit 1
|
||||
|
||||
while true; do
|
||||
# In LiteFS 0.5, .primary file exists ONLY on replicas.
|
||||
if [ ! -f "$DB_LOCK_FILE" ]; then
|
||||
if check_primary; then
|
||||
# === WE ARE PRIMARY ===
|
||||
|
||||
# 1. If App is not running, start it and register
|
||||
if [ "${NAVIDROME_PID}" -eq 0 ] || ! kill -0 "${NAVIDROME_PID}" 2>/dev/null; then
|
||||
if [ "${NAVIDROME_PID}" -gt 0 ]; then
|
||||
echo "CRITICAL: Navidrome crashed! Restarting..."
|
||||
fi
|
||||
start_app
|
||||
register_service
|
||||
fi
|
||||
|
||||
# 2. Maintain the heartbeat (TTL)
|
||||
pass_ttl
|
||||
|
||||
else
|
||||
# === WE ARE REPLICA ===
|
||||
|
||||
# If App is running (we were just demoted), stop it
|
||||
if [ "${NAVIDROME_PID}" -gt 0 ]; then
|
||||
echo "Lost leadership. Demoting..."
|
||||
stop_app
|
||||
deregister_service
|
||||
fi
|
||||
|
||||
# No service registration exists for replicas to keep Consul clean.
|
||||
fi
|
||||
|
||||
# Sleep short enough to update TTL (every 5s is safe for 15s TTL)
|
||||
sleep 5 &
|
||||
wait $! # Wait allows the 'trap' to interrupt the sleep instantly
|
||||
sleep 10
|
||||
done
|
||||
|
||||
20
litefs.yml
20
litefs.yml
@@ -8,29 +8,19 @@ data:
|
||||
# Use Consul for leader election
|
||||
lease:
|
||||
type: "consul"
|
||||
candidate: true
|
||||
promote: true
|
||||
advertise-url: "http://${ADVERTISE_IP}:20202"
|
||||
consul:
|
||||
url: "${CONSUL_URL}"
|
||||
key: "litefs/navidrome"
|
||||
key: "litefs/navidrome-v8"
|
||||
ttl: "30s"
|
||||
lock-delay: "5s"
|
||||
|
||||
# Internal HTTP API for replication
|
||||
http:
|
||||
addr: "0.0.0.0:20202"
|
||||
|
||||
# The HTTP Proxy routes traffic to handle write-forwarding
|
||||
proxy:
|
||||
addr: ":8080"
|
||||
target: "localhost:4533"
|
||||
db: "navidrome.db"
|
||||
passthrough:
|
||||
- "*.js"
|
||||
- "*.css"
|
||||
- "*.png"
|
||||
- "*.jpg"
|
||||
- "*.jpeg"
|
||||
- "*.gif"
|
||||
- "*.svg"
|
||||
|
||||
# Command that LiteFS supervises after mounting. entrypoint.sh checks the node's role itself and only starts Navidrome on the primary.
|
||||
exec:
|
||||
- cmd: "/usr/local/bin/entrypoint.sh"
|
||||
|
||||
@@ -1,79 +0,0 @@
|
||||
job "navidrome-litefs" {
|
||||
datacenters = ["dc1"]
|
||||
type = "service"
|
||||
|
||||
constraint {
|
||||
attribute = "${attr.kernel.name}"
|
||||
value = "linux"
|
||||
}
|
||||
|
||||
group "navidrome" {
|
||||
count = 4
|
||||
|
||||
update {
|
||||
max_parallel = 1
|
||||
min_healthy_time = "30s"
|
||||
healthy_deadline = "5m"
|
||||
auto_revert = false
|
||||
}
|
||||
|
||||
constraint {
|
||||
distinct_hosts = true
|
||||
}
|
||||
|
||||
network {
|
||||
# Request static ports on the host
|
||||
port "http" {
|
||||
static = 4533
|
||||
to = 4533 # Direct to Navidrome
|
||||
}
|
||||
port "litefs" {
|
||||
static = 20202
|
||||
to = 20202 # Maps host 20202 to container 20202 (LiteFS Replication)
|
||||
}
|
||||
}
|
||||
|
||||
task "navidrome" {
|
||||
driver = "docker"
|
||||
|
||||
config {
|
||||
image = "gitea.service.dc1.fbleagh.duckdns.org/sstent/navidrome-litefs:e56fb94fdc0ac1f70abdb613b64ce6b4d7a770cf"
|
||||
privileged = true # Still needed for FUSE
|
||||
ports = ["http", "litefs"]
|
||||
force_pull = true
|
||||
|
||||
volumes = [
|
||||
"/mnt/configs/navidrome_litefs:/var/lib/litefs",
|
||||
"/mnt/Public/configs/navidrome:/shared_data",
|
||||
"/mnt/Public/Downloads/Clean_Music:/music/CleanMusic:ro",
|
||||
"/mnt/Public/Downloads/news/slskd/downloads:/music/slskd:ro",
|
||||
"/mnt/Public/Downloads/incoming_music:/music/incomingmusic:ro"
|
||||
]
|
||||
}
|
||||
|
||||
env {
|
||||
# LiteFS Config
|
||||
CONSUL_URL = "http://${attr.unique.network.ip-address}:8500"
|
||||
ADVERTISE_IP = "${attr.unique.network.ip-address}"
|
||||
PORT = "8080" # Internal proxy port (unused but kept)
|
||||
|
||||
# Navidrome Config
|
||||
ND_DATAFOLDER = "/data"
|
||||
ND_PLUGINS_FOLDER = "/shared_data/plugins"
|
||||
ND_CACHEFOLDER = "/shared_data/cache"
|
||||
ND_BACKUP_PATH = "/shared_data/backup"
|
||||
|
||||
ND_LOGLEVEL = "info"
|
||||
ND_REVERSEPROXYWHITELIST = "0.0.0.0/0"
|
||||
ND_REVERSEPROXYUSERHEADER = "X-Forwarded-User"
|
||||
}
|
||||
|
||||
# NO service block here! Managed by register.sh inside the container.
|
||||
|
||||
resources {
|
||||
cpu = 500
|
||||
memory = 512
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,179 +1,84 @@
|
||||
variable "container_sha" {
|
||||
type = string
|
||||
default = "045fc6e82b9ecb6bebc1f095f62498935df70bbf"
|
||||
}
|
||||
|
||||
job "navidrome-litefs" {
|
||||
datacenters = ["dc1"]
|
||||
type = "service"
|
||||
|
||||
# We pin to Linux because LiteFS requires FUSE
|
||||
constraint {
|
||||
attribute = "${attr.kernel.name}"
|
||||
value = "linux"
|
||||
}
|
||||
|
||||
group "navidrome" {
|
||||
count = 2
|
||||
count = 4
|
||||
|
||||
update {
|
||||
max_parallel = 1
|
||||
min_healthy_time = "30s"
|
||||
healthy_deadline = "5m"
|
||||
auto_revert = false
|
||||
}
|
||||
|
||||
constraint {
|
||||
distinct_hosts = true
|
||||
}
|
||||
|
||||
network {
|
||||
mode = "host"
|
||||
port "http" {}
|
||||
}
|
||||
|
||||
# --- Setup Task ---
|
||||
task "setup" {
|
||||
driver = "docker"
|
||||
lifecycle {
|
||||
hook = "prestart"
|
||||
sidecar = false
|
||||
# Request static ports on the host
|
||||
port "http" {
|
||||
static = 4533
|
||||
to = 4533 # Direct to Navidrome
|
||||
}
|
||||
config {
|
||||
image = "busybox"
|
||||
command = "mkdir"
|
||||
args = ["-p", "/alloc/sqlite"]
|
||||
network_mode = "host"
|
||||
port "litefs" {
|
||||
static = 20202
|
||||
to = 20202 # Maps host 20202 to container 20202 (LiteFS Replication)
|
||||
}
|
||||
}
|
||||
|
||||
# --- LiteFS Task ---
|
||||
task "litefs" {
|
||||
driver = "docker"
|
||||
|
||||
config {
|
||||
image = "flyio/litefs:0.5"
|
||||
privileged = true # Needed for FUSE
|
||||
ports = ["http"]
|
||||
network_mode = "host"
|
||||
|
||||
# 1. Bind mount for LiteFS internal data (chunks/WAL)
|
||||
# 2. Bind mount for the config
|
||||
# 3. Mount the shared alloc dir so we can mount FUSE on it
|
||||
volumes = [
|
||||
"/mnt/configs/navidrome_litefs:/var/lib/litefs",
|
||||
"local/litefs.yml:/etc/litefs.yml"
|
||||
]
|
||||
|
||||
mounts = [
|
||||
{
|
||||
type = "bind"
|
||||
source = "../alloc/sqlite"
|
||||
target = "/mnt/sqlite"
|
||||
bind_options = {
|
||||
propagation = "shared"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
# Create the config file
|
||||
template {
|
||||
left_delimiter = "[["
|
||||
right_delimiter = "]]"
|
||||
data = <<EOF
|
||||
fuse:
|
||||
# This matches the internal mount point in the container
|
||||
dir: "/mnt/sqlite"
|
||||
|
||||
data:
|
||||
# Internal data storage
|
||||
dir: "/var/lib/litefs"
|
||||
|
||||
# Use Consul for leader election
|
||||
lease:
|
||||
type: "consul"
|
||||
consul:
|
||||
url: "http://[[ env `attr.unique.network.ip-address` ]]:8500"
|
||||
key: "litefs/navidrome"
|
||||
|
||||
# The HTTP Proxy routes traffic
|
||||
proxy:
|
||||
addr: ":[[ env `NOMAD_PORT_http` ]]"
|
||||
target: "127.0.0.1:4533" # Navidrome's internal port
|
||||
db: "navidrome.db" # The DB to track for transaction consistency
|
||||
passthrough: # Paths that don't need write-forwarding (optional optimizations)
|
||||
- "*.js"
|
||||
- "*.css"
|
||||
- "*.png"
|
||||
EOF
|
||||
destination = "local/litefs.yml"
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 200
|
||||
memory = 256
|
||||
}
|
||||
}
|
||||
|
||||
# --- Navidrome Task (The App) ---
|
||||
task "navidrome" {
|
||||
driver = "docker"
|
||||
|
||||
config {
|
||||
image = "ghcr.io/navidrome/navidrome:latest"
|
||||
memory_hard_limit = "2048"
|
||||
ports = [] # No ports exposed directly!
|
||||
network_mode = "host"
|
||||
image = "gitea.service.dc1.fbleagh.duckdns.org/sstent/navidrome-litefs:${var.container_sha}"
|
||||
privileged = true # Still needed for FUSE
|
||||
ports = ["http", "litefs"]
|
||||
force_pull = true
|
||||
|
||||
# We mount the sqlite dir from the allocation directory
|
||||
mounts = [
|
||||
{
|
||||
type = "bind"
|
||||
source = "../alloc/sqlite"
|
||||
target = "/data"
|
||||
bind_options = {
|
||||
propagation = "shared"
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
# Shared Music and Configs
|
||||
volumes = [
|
||||
"/mnt/configs/navidrome_litefs:/var/lib/litefs",
|
||||
"/mnt/Public/configs/navidrome:/shared_data",
|
||||
"/mnt/Public/Downloads/Clean_Music:/music/CleanMusic:ro",
|
||||
"/mnt/Public/Downloads/news/slskd/downloads:/music/slskd:ro",
|
||||
"/mnt/Public/Downloads/incoming_music:/music/incomingmusic:ro",
|
||||
"/mnt/Public/configs/navidrome:/shared_data"
|
||||
"/mnt/Public/Downloads/incoming_music:/music/incomingmusic:ro"
|
||||
]
|
||||
}
|
||||
|
||||
env {
|
||||
ND_DATAFOLDER = "/local/data"
|
||||
ND_CACHEFOLDER = "/shared_data/cache"
|
||||
ND_CONFIGFILE= "/local/data/navidrome.toml"
|
||||
# LiteFS Config
|
||||
CONSUL_URL = "http://${attr.unique.network.ip-address}:8500"
|
||||
ADVERTISE_IP = "${attr.unique.network.ip-address}"
|
||||
PORT = "8080" # Internal proxy port (unused but kept)
|
||||
|
||||
# Navidrome Config
|
||||
ND_DATAFOLDER = "/shared_data"
|
||||
ND_PLUGINS_FOLDER = "/shared_data/plugins"
|
||||
ND_CACHEFOLDER = "/shared_data/cache"
|
||||
ND_BACKUP_PATH = "/shared_data/backup"
|
||||
ND_ARTISTIMAGEFOLDER = "artist_images"
|
||||
ND_BACKUPSCHEDULE = ""
|
||||
|
||||
# Important: LiteFS handles locking, but we still want WAL mode.
|
||||
ND_DBPATH = "/data/navidrome.db?_busy_timeout=30000&_journal_mode=WAL&_foreign_keys=on&synchronous=NORMAL"
|
||||
|
||||
# Disable internal scheduling to prevent redundant scans on secondary nodes.
|
||||
ND_SCANSCHEDULE = "0"
|
||||
ND_SCANNER_FSWATCHER_ENABLED = "false"
|
||||
|
||||
ND_LOGLEVEL = "info"
|
||||
ND_REVERSEPROXYWHITELIST = "0.0.0.0/0"
|
||||
ND_REVERSEPROXYUSERHEADER = "X-Forwarded-User"
|
||||
ND_FORCE_REDEPLOY = "5"
|
||||
ND_LOGLEVEL = "info"
|
||||
ND_REVERSEPROXYWHITELIST = "0.0.0.0/0"
|
||||
ND_REVERSEPROXYUSERHEADER = "X-Forwarded-User"
|
||||
}
|
||||
|
||||
service {
|
||||
name = "navidrome"
|
||||
tags = [
|
||||
"navidrome",
|
||||
"web",
|
||||
"traefik.enable=true",
|
||||
"urlprefix-/navidrome",
|
||||
"tools",
|
||||
"traefik.http.routers.navidromelan.rule=Host(`navidrome.service.dc1.consul`)",
|
||||
"traefik.http.routers.navidromewan.rule=Host(`m.fbleagh.duckdns.org`)",
|
||||
"traefik.http.routers.navidromewan.middlewares=dex@consulcatalog",
|
||||
"traefik.http.routers.navidromewan.tls=true",
|
||||
]
|
||||
port = "http" # This maps to the LiteFS proxy port defined in network block
|
||||
|
||||
check {
|
||||
type = "http"
|
||||
path = "/app" # LiteFS proxy passes this through
|
||||
interval = "10s"
|
||||
timeout = "2s"
|
||||
}
|
||||
}
|
||||
# NO service block here! Consul registration is managed by entrypoint.sh inside the container.
|
||||
|
||||
resources {
|
||||
cpu = 500
|
||||
@@ -181,4 +86,4 @@ EOF
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user