Compare commits

...

38 Commits

Author SHA1 Message Date
97733cf7b8 fix: use navidrome-v8 and pin to opti1 for migration
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 44s
2026-04-28 13:28:35 -07:00
5c1fedd379 fix: use file-bind-mount for DB to allow local WAL files
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 43s
2026-04-28 13:14:58 -07:00
bb18672bfc fix: use local DataFolder with symlinks to LiteFS DB
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 39s
2026-04-28 13:08:56 -07:00
48a005cfbc fix: add auto-seeding from backup
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 41s
2026-04-28 12:25:01 -07:00
94d8e290bf fix(entrypoint): wait for DB file before bind mounting
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 4m0s
2026-04-28 11:33:21 -07:00
3232d6568d fix: use bind mount for DB to support SMB shares
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 47s
2026-04-27 14:24:16 -07:00
1117fb178b fix: use symlink for DB and move DataFolder to /shared_data to avoid LiteFS root write errors
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 46s
2026-04-27 14:14:11 -07:00
e678120572 fix: revert to original data paths and add ND_ARTISTIMAGEFOLDER
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 43s
2026-04-27 14:07:08 -07:00
92f9209dcd fix(entrypoint): restore consul registration and cleanup logging
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 43s
2026-04-27 11:04:00 -07:00
33b84be0a5 test(entrypoint): use local data folder and new DB name
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 41s
2026-04-27 10:36:54 -07:00
45e40bf273 debug(entrypoint): add logging to check_primary
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 40s
2026-04-27 10:23:30 -07:00
8acb098918 fix(litefs): increase consul lease TTL and lock-delay
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 41s
2026-04-27 10:19:36 -07:00
dd413d1342 fix(cluster): use new litefs key and local volume, exclude odroid7
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 41s
2026-04-27 10:15:49 -07:00
7ea127f9cb test(entrypoint): disable consul registration to isolate leadership issue
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 40s
2026-04-27 10:10:23 -07:00
9232aeccc5 test(entrypoint): use /data/navidrome.db to bypass LiteFS
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 40s
2026-04-27 10:08:49 -07:00
0200afdc0f test(entrypoint): use test.db to isolate issue
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 43s
2026-04-27 10:06:46 -07:00
e0262dc88b fix(litefs): disable proxy to avoid DB locks
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 43s
2026-04-27 10:00:45 -07:00
107e37cb3e fix(entrypoint): simplify DB connection string
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 50s
2026-04-27 09:56:37 -07:00
5311f0069a fix(entrypoint): use ND_DBPATH env var and remove set -e
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 44s
2026-04-27 09:33:34 -07:00
af8ce0ef2b fix(entrypoint): use /info instead of /status for LiteFS 0.5 status API
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 48s
2026-04-27 09:26:14 -07:00
5f9e4d23fb fix: use --dbpath CLI flag to isolate database on LiteFS mount
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 48s
2026-04-27 08:57:19 -07:00
6e7c729c5e fix: use standard Navidrome variables to isolate DB on LiteFS and metadata on host volume
Some checks failed
Build and Push Docker Image / build-and-push (push) Has been cancelled
2026-04-27 08:56:22 -07:00
37f0dcb1e7 fix: revert to robust manual leadership detection to prevent multiple Navidrome instances
Some checks failed
Build and Push Docker Image / build-and-push (push) Has been cancelled
2026-04-27 08:54:55 -07:00
402553a674 fix: move to native LiteFS leadership management with if-candidate: true
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 41s
2026-04-27 08:52:41 -07:00
c04c00143e fix: support both flat and nested LiteFS status JSON and add robust type checking
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 42s
2026-04-27 08:41:21 -07:00
3e6a4d1704 fix: correct jq path for LiteFS 0.5 status API and add robust error handling
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 42s
2026-04-27 08:31:47 -07:00
362f838f7c fix: robust leadership detection via LiteFS API and resolve Navidrome deprecation warnings
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 43s
2026-04-27 08:25:40 -07:00
a8e02ae063 fix: improve leadership detection using 'litefs status' to prevent redundant Consul registrations
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 47s
2026-04-27 08:22:43 -07:00
538ee01b72 fix: add SQLite connection parameters to ND_DBPATH and wait for DB file
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 43s
2026-04-27 08:15:22 -07:00
25885ea4f0 fix: use ND_DBPATH to point to LiteFS database directly, avoiding symlink errors
Some checks failed
Build and Push Docker Image / build-and-push (push) Has been cancelled
2026-04-27 08:13:37 -07:00
a586d60682 debug: add verbose logging and error checks to setup_data_dir
Some checks failed
Build and Push Docker Image / build-and-push (push) Has been cancelled
2026-04-27 08:11:24 -07:00
59f406d3b7 fix: relocate LiteFS mount to /litefs and use /data for persistent artwork
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 4m5s
2026-04-27 08:04:06 -07:00
f08c715d75 fix(nomad): Move variable definition to top-level
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 25s
2026-04-08 10:58:39 -07:00
8f1565b1af fix(deploy): Replace failing setup-nomad action with manual install
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 28s
2026-04-08 10:56:20 -07:00
4538ad5909 feat: Add automated LiteFS backups and GitHub deployment workflow
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 3m52s
2026-04-08 10:38:23 -07:00
f0b02904a8 chore(conductor): Mark track 'fix_navidrome_paths' as complete 2026-02-09 07:14:50 -08:00
7f1f3321e0 chore(conductor): Archive track 'fix_navidrome_paths' 2026-02-09 07:14:22 -08:00
23a65be4d8 docs(conductor): Synchronize tech-stack and finalize track 'fix_navidrome_paths' 2026-02-09 07:14:05 -08:00
18 changed files with 248 additions and 414 deletions

.github/workflows/deploy.yml vendored Normal file
View File

@@ -0,0 +1,54 @@
name: Deploy to Nomad
on:
  workflow_run:
    workflows: ["Build and Push Docker Image"]
    types:
      - completed
  workflow_dispatch:
    inputs:
      container_sha:
        description: 'Container SHA to deploy (leave empty for latest commit)'
        required: false
        type: string
jobs:
  nomad:
    runs-on: ubuntu-latest
    name: Deploy to Nomad
    if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4
      - name: Setup Nomad CLI
        run: |
          NOMAD_VERSION="1.10.5"
          curl -sSL https://releases.hashicorp.com/nomad/${NOMAD_VERSION}/nomad_${NOMAD_VERSION}_linux_amd64.zip -o nomad.zip
          unzip nomad.zip
          sudo mv nomad /usr/local/bin/
          rm nomad.zip
          nomad version
      - name: Set Container Version
        id: container_version
        run: |
          if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ -n "${{ inputs.container_sha }}" ]; then
            echo "sha=${{ inputs.container_sha }}" >> $GITHUB_OUTPUT
          elif [ "${{ github.event_name }}" = "workflow_run" ]; then
            echo "sha=${{ github.event.workflow_run.head_sha }}" >> $GITHUB_OUTPUT
          else
            echo "sha=${{ github.sha }}" >> $GITHUB_OUTPUT
          fi
      - name: Deploy Nomad Job
        id: deploy
        env:
          NOMAD_ADDR: http://192.168.4.36:4646
          NOMAD_TOKEN: ${{ secrets.NOMAD_TOKEN }}
        run: |
          echo "Deploying container version: ${{ steps.container_version.outputs.sha }}"
          nomad job run \
            -var="container_sha=${{ steps.container_version.outputs.sha }}" \
            navidrome-litefs.nomad

View File

@@ -18,6 +18,9 @@ RUN chmod +x /usr/local/bin/entrypoint.sh
# Copy LiteFS configuration
COPY litefs.yml /etc/litefs.yml

+ # Create mount points and data directories
+ RUN mkdir -p /litefs /data

# LiteFS becomes the supervisor.
# It will mount the FUSE fs and then execute the command defined in litefs.yml's exec section.

View File

@@ -0,0 +1,5 @@
# Track fix_navidrome_paths_20260209 Context
- [Specification](./spec.md)
- [Implementation Plan](./plan.md)
- [Metadata](./metadata.json)

View File

@@ -0,0 +1,8 @@
{
  "track_id": "fix_navidrome_paths_20260209",
  "type": "bug",
  "status": "new",
  "created_at": "2026-02-09T14:30:00Z",
  "updated_at": "2026-02-09T14:30:00Z",
  "description": "Fix Navidrome database location to ensure it uses LiteFS mount and resolve process path conflicts."
}

View File

@@ -0,0 +1,17 @@
# Plan: Correct Navidrome Database and Plugins Location (`fix_navidrome_paths`)
## Phase 1: Configuration Updates [x]
- [x] Task: Update `navidrome-litefs-v2.nomad` with corrected paths (76398de)
- [x] Task: Update `entrypoint.sh` to handle plugins folder and environment cleanup (decb9f5)
- [x] Task: Conductor - User Manual Verification 'Phase 1: Configuration Updates' (Protocol in workflow.md)
## Phase 2: Build and Deployment [x]
- [x] Task: Commit changes and push to Gitea to trigger build (045fc6e)
- [x] Task: Monitor Gitea build completion (Build #26)
- [x] Task: Deploy updated Nomad job (Job Version 6)
- [x] Task: Conductor - User Manual Verification 'Phase 2: Build and Deployment' (Protocol in workflow.md)
## Phase 3: Final Verification [x]
- [x] Task: Verify database path via `lsof` on the Primary node (Verified: /data/navidrome.db)
- [x] Task: Verify replication health using `cluster_status` script (Verified: All nodes in sync)
- [x] Task: Conductor - User Manual Verification 'Phase 3: Final Verification' (Protocol in workflow.md)

View File

@@ -0,0 +1,25 @@
# Specification: Correct Navidrome Database and Plugins Location (`fix_navidrome_paths`)
## Overview
Force Navidrome to use the `/data` LiteFS mount for its SQLite database by setting the `DATAFOLDER` to `/data`. To avoid the "Operation not permitted" error caused by LiteFS's restriction on directory creation, redirect the Navidrome plugins folder to persistent shared storage.
## Functional Requirements
- **Nomad Job Configuration (`navidrome-litefs-v2.nomad`):**
- Set `ND_DATAFOLDER="/data"`. This will force Navidrome to create and use `navidrome.db` on the LiteFS mount.
- Set `ND_PLUGINSFOLDER="/shared_data/plugins"`. This prevents Navidrome from attempting to create a `plugins` directory in the read-only/virtual `/data` mount.
- Keep `ND_CACHEFOLDER` and `ND_BACKUP_PATH` pointing to `/shared_data` subdirectories.
- **Entrypoint Logic (`entrypoint.sh`):**
- Ensure it creates `/shared_data/plugins` if it doesn't exist.
- Remove the explicit `export ND_DATABASE_PATH` if it conflicts with the new `DATAFOLDER` logic, or keep it as an explicit override.
- **Verification:**
- Confirm via `lsof` that Navidrome is finally using `/data/navidrome.db`.
- Confirm that LiteFS `/debug/vars` now reports the database in its active set.
## Non-Functional Requirements
- **Persistence:** Ensure all non-database files (plugins, cache, backups) are stored on the shared host mount (`/shared_data`) to survive container restarts and migrations.
## Acceptance Criteria
- [ ] Navidrome successfully starts with `/data` as its data folder.
- [ ] No "Operation not permitted" errors occur during startup.
- [ ] `lsof` confirms `/data/navidrome.db` is open by the Navidrome process.
- [ ] LiteFS `txid` increases on the Primary and replicates to Replicas when Navidrome writes to the DB.
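For reference, a minimal sketch of the env stanza these requirements imply; the ND_* names and paths come from the spec above, while the stanza placement and everything else are assumptions and not part of this changeset:

env {
  # Illustrative sketch of the spec's requirements, not the actual job file
  ND_DATAFOLDER    = "/data"                 # SQLite DB lands on the LiteFS FUSE mount
  ND_PLUGINSFOLDER = "/shared_data/plugins"  # keeps Navidrome from creating dirs on the LiteFS mount
  ND_CACHEFOLDER   = "/shared_data/cache"
  ND_BACKUP_PATH   = "/shared_data/backup"
}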

View File

@@ -8,7 +8,10 @@
## Storage & Database
- **SQLite:** The primary relational database used by Navidrome for metadata and state.
- **LiteFS:** A FUSE-based filesystem that provides synchronous replication of the SQLite database across the cluster.
- - **Process Management:** LiteFS-supervised with a robust TTL-heartbeat registration script ensuring zero-downtime failover and clean service catalog management.
+ - **Process Management:** LiteFS-supervised with TTL-based self-registration for clean and resilient service catalog management.
+ - **Hybrid Storage Model:**
+   - **Replicated:** SQLite database on LiteFS FUSE mount (`/data`).
+   - **Shared:** Plugins, cache, and backups on persistent network storage (`/shared_data`).
## Automation & Delivery
- **Gitea Actions:** Automates the multi-arch (AMD64/ARM64) building and pushing of the custom supervised container image.
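A hedged sketch of how the hybrid model maps onto the container's mounts; the host paths are the ones used in the job files elsewhere in this comparison, and the comments are illustrative rather than quoted from any file:

# Sketch only -- volume mapping implied by the notes above, not a quoted job file
volumes = [
  "/mnt/configs/navidrome_litefs:/var/lib/litefs", # per-node LiteFS internal data
  "/mnt/Public/configs/navidrome:/shared_data"     # shared plugins, cache, and backups
]
# /data is not a bind mount: LiteFS exposes it as a FUSE filesystem inside the container,
# so only the SQLite database is replicated across nodes.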

View File

@@ -2,7 +2,4 @@
This file tracks all major tracks for the project. Each track has its own detailed plan in its respective folder.
---
---
- - [x] **Track: Update Monitor Discovery Logic**
-   *Link: [./tracks/update_monitor_discovery_20260208/](./update_monitor_discovery_20260208/)*

View File

@@ -1,30 +0,0 @@
# Plan: Cluster Diagnosis and Script Enhancement (`diagnose_and_enhance`)
## Phase 1: Enhanced Diagnostics (Consul) [x] [checkpoint: a686c5b]
- [x] Task: Update `consul_client.py` to fetch detailed health check output
- [x] Write tests for fetching `Output` field from Consul checks
- [x] Implement logic to extract and store the `Output` (error message)
- [x] Task: Update aggregator and formatter to display Consul errors
- [x] Update aggregation logic to include `consul_error`
- [x] Update table formatter to indicate an error (maybe a flag or color)
- [x] Add a "Diagnostics" section to the output to print full error details
- [x] Task: Conductor - User Manual Verification 'Phase 1: Enhanced Diagnostics (Consul)' (Protocol in workflow.md)
## Phase 2: Nomad Integration and Logs [x] [checkpoint: 6d77729]
- [x] Task: Implement `nomad_client.py` wrapper
- [x] Write tests for `get_allocation_logs`, `get_node_status`, and `restart_allocation` (mocking subprocess)
- [x] Implement `subprocess.run(["nomad", ...])` logic to fetch logs and restart allocations
- [x] Task: Integrate Nomad logs into diagnosis
- [x] Update aggregator to call Nomad client for critical nodes
- [x] Update "Diagnostics" section to display the last 20 lines of stderr
- [x] Task: Conductor - User Manual Verification 'Phase 2: Nomad Integration and Logs' (Protocol in workflow.md)
## Phase 3: Advanced LiteFS Status [ ]
- [ ] Task: Implement `litefs_status` via `nomad alloc exec`
- [ ] Write tests for executing remote commands via Nomad
- [ ] Update `litefs_client.py` to fallback to `nomad alloc exec` if HTTP fails
- [ ] Parse `litefs status` output (text/json) to extract uptime and replication lag
- [ ] Task: Final Polish and Diagnosis Run
- [ ] Ensure all pieces work together
- [ ] Run the script to diagnose `odroid8`
- [ ] Task: Conductor - User Manual Verification 'Phase 3: Advanced LiteFS Status' (Protocol in workflow.md)

View File

@@ -1,22 +0,0 @@
# Plan: Fix LiteFS Configuration and Process Management (`fix_litefs_config`)
## Phase 1: Configuration and Image Structure [ ]
- [x] Task: Update `litefs.yml` to include the `exec` block (396dfeb)
- [x] Task: Update `Dockerfile` to use LiteFS as the supervisor (`ENTRYPOINT ["litefs", "mount"]`) (ef91b8e)
- [x] Task: Update `navidrome-litefs-v2.nomad` with corrected storage paths (`ND_DATAFOLDER`, `ND_CACHEFOLDER`, `ND_BACKUP_PATH`) (5cbb657)
- [ ] Task: Conductor - User Manual Verification 'Phase 1: Configuration and Image Structure' (Protocol in workflow.md)
## Phase 2: Entrypoint and Registration Logic [x] [checkpoint: 9cd5455]
- [x] Task: Refactor `entrypoint.sh` to handle leadership-aware process management (9cd5455)
- [x] Integrate Consul registration logic (from `register.sh`)
- [x] Implement loop to start/stop Navidrome based on `/data/.primary` existence
- [x] Ensure proper signal handling for Navidrome shutdown
- [x] Task: Clean up redundant scripts (e.g., `register.sh` if fully integrated) (9cd5455)
- [ ] Task: Conductor - User Manual Verification 'Phase 2: Entrypoint and Registration Logic' (Protocol in workflow.md)
## Phase 3: Deployment and Failover Verification [ ]
- [ ] Task: Build and push the updated Docker image via Gitea Actions (if possible) or manual trigger
- [ ] Task: Deploy the updated Nomad job
- [ ] Task: Verify cluster health and process distribution using `cluster_status` script
- [ ] Task: Perform a manual failover (stop primary allocation) and verify Navidrome migrates correctly
- [ ] Task: Conductor - User Manual Verification 'Phase 3: Deployment and Failover Verification' (Protocol in workflow.md)

View File

@@ -1,17 +0,0 @@
# Plan: Correct Navidrome Database and Plugins Location (`fix_navidrome_paths`)
## Phase 1: Configuration Updates [x]
- [x] Task: Update `navidrome-litefs-v2.nomad` with corrected paths (76398de)
- [x] Task: Update `entrypoint.sh` to handle plugins folder and environment cleanup (decb9f5)
- [x] Task: Conductor - User Manual Verification 'Phase 1: Configuration Updates' (Protocol in workflow.md)
## Phase 2: Build and Deployment [ ]
- [ ] Task: Commit changes and push to Gitea to trigger build
- [ ] Task: Monitor Gitea build completion
- [ ] Task: Deploy updated Nomad job
- [ ] Task: Conductor - User Manual Verification 'Phase 2: Build and Deployment' (Protocol in workflow.md)
## Phase 3: Final Verification [ ]
- [ ] Task: Verify database path via `lsof` on the Primary node
- [ ] Task: Verify replication health using `cluster_status` script
- [ ] Task: Conductor - User Manual Verification 'Phase 3: Final Verification' (Protocol in workflow.md)

View File

@@ -1,26 +0,0 @@
# Plan: Fix Odroid8 and Script Robustness (`fix_odroid8_and_script`)
## Phase 1: Script Robustness [x] [checkpoint: 860000b]
- [x] Task: Update `nomad_client.py` to handle subprocess errors gracefully
- [x] Write tests for handling Nomad CLI absence/failure
- [x] Update implementation to return descriptive error objects or `None` without crashing
- [x] Task: Update aggregator and formatter to handle Nomad errors
- [x] Update `cluster_aggregator.py` to gracefully skip Nomad calls if they fail
- [x] Update `output_formatter.py` to display "Nomad Error" in relevant cells
- [x] Add a global "Nomad Connectivity Warning" to the summary
- [x] Task: Conductor - User Manual Verification 'Phase 1: Script Robustness' (Protocol in workflow.md)
## Phase 2: Odroid8 Recovery [ ]
- [x] Task: Identify and verify `odroid8` LiteFS data path
- [x] Run `nomad alloc status` to find the volume mount for `odroid8`
- [x] Provide the user with the exact host path to the LiteFS data
- [x] Task: Guide user through manual cleanup
- [x] Provide steps to stop the allocation
- [x] Provide the `rm` command to clear the LiteFS metadata
- [x] Provide steps to restart and verify the node
- [~] Task: Conductor - User Manual Verification 'Phase 2: Odroid8 Recovery' (Protocol in workflow.md)
## Phase 3: Final Verification [x]
- [x] Task: Final verification run of the script
- [x] Task: Verify cluster health in Consul and LiteFS API
- [x] Task: Conductor - User Manual Verification 'Phase 3: Final Verification' (Protocol in workflow.md)

View File

@@ -1,22 +0,0 @@
# Plan: Implement TTL Heartbeat Service Registration (`implement_ttl_heartbeat`)
## Phase 1: Container Environment Preparation [x] [checkpoint: 51b8fce]
- [x] Task: Update `Dockerfile` to install `curl` and `jq` (f7fe258)
- [x] Task: Verify `litefs.yml` points to `entrypoint.sh` (should already be correct) (verified)
- [x] Task: Conductor - User Manual Verification 'Phase 1: Container Environment Preparation' (Protocol in workflow.md)
## Phase 2: Script Implementation [x] [checkpoint: 139016f]
- [x] Task: Refactor `entrypoint.sh` with the TTL Heartbeat logic (d977301)
- [x] Implement `register_service` with TTL check definition
- [x] Implement `pass_ttl` loop
- [x] Implement robust `stop_app` and signal trapping
- [x] Ensure correct Primary/Replica detection logic (LiteFS 0.5: Primary = No `.primary` file)
- [x] Task: Conductor - User Manual Verification 'Phase 2: Script Implementation' (Protocol in workflow.md)
## Phase 3: Deployment and Verification [ ]
- [ ] Task: Commit changes and push to Gitea to trigger build
- [ ] Task: Monitor Gitea build completion
- [ ] Task: Deploy updated Nomad job (forcing update if necessary)
- [ ] Task: Verify "Clean" state in Consul (only one primary registered)
- [ ] Task: Verify Failover/Stop behavior (immediate deregistration vs TTL expiry)
- [ ] Task: Conductor - User Manual Verification 'Phase 3: Deployment and Verification' (Protocol in workflow.md)

View File

@@ -1,23 +0,0 @@
# Plan: Update Monitor Discovery Logic (`update_monitor_discovery`)
## Phase 1: Nomad Discovery Enhancement [x] [checkpoint: 353683e]
- [x] Task: Update `nomad_client.py` to fetch job allocations with IPs (353683e)
- [x] Write tests for parsing allocation IPs from `nomad job status` or `nomad alloc status`
- [x] Implement `get_job_allocations(job_id)` returning a list of dicts (id, node, ip)
- [x] Task: Conductor - User Manual Verification 'Phase 1: Nomad Discovery Enhancement' (Protocol in workflow.md)
## Phase 2: Aggregator Refactor [x] [checkpoint: 655a9b2]
- [x] Task: Refactor `cluster_aggregator.py` to drive discovery via Nomad (655a9b2)
- [x] Update `get_cluster_status` to call `nomad_client.get_job_allocations` first
- [x] Update loop to iterate over allocations and supplement with LiteFS and Consul data
- [x] Task: Update `consul_client.py` to fetch all services once and allow lookup by IP/ID (655a9b2)
- [x] Task: Update tests for the new discovery flow (655a9b2)
- [x] Task: Conductor - User Manual Verification 'Phase 2: Aggregator Refactor' (Protocol in workflow.md)
## Phase 3: UI and Health Logic [x] [checkpoint: 21e9c3d]
- [x] Task: Update `output_formatter.py` for "Standby" nodes (21e9c3d)
- [x] Update table formatting to handle missing Consul status for replicas
- [x] Task: Update Cluster Health calculation (21e9c3d)
- [x] "Healthy" = 1 Primary (Consul passing) + N Replicas (LiteFS connected)
- [x] Task: Final verification run (21e9c3d)
- [x] Task: Conductor - User Manual Verification 'Phase 3: Final Verification' (Protocol in workflow.md)

View File

@@ -1,14 +1,10 @@
#!/bin/bash
- set -e
# Configuration from environment
SERVICE_NAME="navidrome"
- # Use Nomad allocation ID for a unique service ID
SERVICE_ID="${SERVICE_NAME}-${NOMAD_ALLOC_ID:-$(hostname)}"
PORT=4533
CONSUL_HTTP_ADDR="${CONSUL_URL:-http://localhost:8500}"
NODE_IP="${ADVERTISE_IP}"
- DB_LOCK_FILE="/data/.primary"
NAVIDROME_PID=0
# Tags for the Primary service (Traefik enabled)
@@ -16,10 +12,43 @@ PRIMARY_TAGS='["navidrome","web","traefik.enable=true","urlprefix-/navidrome","t
# --- Helper Functions ---

+ # Check if this node is the LiteFS Primary
+ check_primary() {
+     local status=$(curl -s http://localhost:20202/info || echo "{}")
+     local is_primary=$(echo "$status" | jq -r 'if type == "object" then (.isPrimary // false) else false end' 2>/dev/null || echo "false")
+     if [ "$is_primary" = "true" ]; then
+         return 0 # We are the primary
+     fi
+     return 1 # We are a replica
+ }
+
+ # Wait for LiteFS to settle and determine its role
+ wait_for_litefs() {
+     echo "Waiting for LiteFS to settle..."
+     local timeout=60
+     local count=0
+     while [ $count -lt $timeout ]; do
+         local status=$(curl -s http://localhost:20202/info || echo "null")
+         local is_primary_val=$(echo "$status" | jq -r 'if type == "object" then (.isPrimary // "null") else "null" end' 2>/dev/null || echo "null")
+         if [ "$is_primary_val" != "null" ]; then
+             local role="replica"
+             if [ "$is_primary_val" = "true" ]; then role="primary"; fi
+             echo "LiteFS initialized. Role: $role"
+             return 0
+         fi
+         sleep 2
+         count=$((count + 2))
+         echo -n "."
+     done
+     echo "ERROR: LiteFS failed to settle after ${timeout}s"
+     return 1
+ }

# Register Service with TTL Check
register_service() {
-     echo "Promoted! Registering service ${SERVICE_ID}..."
-     # Convert bash list string to JSON array if needed, but PRIMARY_TAGS is already JSON-like
+     echo "Registering service ${SERVICE_ID} with Consul..."
    curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/register" -d "{
        \"ID\": \"${SERVICE_ID}\",
        \"Name\": \"${SERVICE_NAME}\",
@@ -40,7 +69,7 @@ pass_ttl() {
# Deregister Service
deregister_service() {
-     echo "Demoted/Stopping. Deregistering service ${SERVICE_ID}..."
+     echo "Deregistering service ${SERVICE_ID} from Consul..."
    curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/deregister/${SERVICE_ID}"
}
@@ -49,11 +78,48 @@ start_app() {
echo "Node is Primary. Starting Navidrome..." echo "Node is Primary. Starting Navidrome..."
# Ensure shared directories exist # Ensure shared directories exist
mkdir -p /shared_data/plugins /shared_data/cache /shared_data/backup mkdir -p /shared_data/plugins /shared_data/cache /shared_data/backup /shared_data/artist_images /shared_data/artwork
# SEEDING LOGIC: If DB doesn't exist in cluster, restore from backup
if [ ! -f /data/navidrome.db ]; then
echo "Database /data/navidrome.db not found. Looking for backups to seed..."
local latest_backup=$(ls -t /shared_data/backup/navidrome.db_*.bak 2>/dev/null | head -n 1)
if [ -n "$latest_backup" ]; then
echo "Seeding from $latest_backup..."
litefs import -name navidrome.db "$latest_backup"
else
echo "No backups found. Navidrome will start with a fresh database."
fi
fi
# Wait for LiteFS to expose the DB file
echo "Waiting for /data/navidrome.db to be exposed by LiteFS..."
local db_timeout=30
local db_count=0
while [ ! -f /data/navidrome.db ] && [ $db_count -lt $db_timeout ]; do
sleep 1
db_count=$((db_count + 1))
done
# Setup local data folder with BIND MOUNT for the DB
# This allows SQLite to create -wal/-shm files in the local writable directory
# while the main DB file is managed by LiteFS.
rm -rf /local/navidrome_data
mkdir -p /local/navidrome_data
touch /local/navidrome_data/navidrome.db
mount --bind /data/navidrome.db /local/navidrome_data/navidrome.db
# Configuration
export ND_DATAFOLDER="/local/navidrome_data"
export ND_CACHEFOLDER="/shared_data/cache"
export ND_BACKUP_PATH="/shared_data/backup"
export ND_PLUGINS_FOLDER="/shared_data/plugins"
export ND_ARTISTIMAGEFOLDER="artist_images"
/app/navidrome & /app/navidrome &
NAVIDROME_PID=$! NAVIDROME_PID=$!
echo "Navidrome started with PID ${NAVIDROME_PID}" echo "Navidrome running (PID: $NAVIDROME_PID) with DataFolder at /local/navidrome_data (DB bind-mounted)"
} }
# Stop Navidrome # Stop Navidrome
@@ -63,13 +129,13 @@ stop_app() {
        kill -SIGTERM "${NAVIDROME_PID}"
        wait "${NAVIDROME_PID}" 2>/dev/null || true
        NAVIDROME_PID=0
+         umount /local/navidrome_data/navidrome.db 2>/dev/null || true
    fi
}

- # --- Signal Handling (The Safety Net) ---
- # If Nomad stops the container, we stop the app and deregister.
+ # --- Cleanup ---
cleanup() {
-     echo "Caught signal, shutting down..."
+     echo "Shutting down..."
    stop_app
    deregister_service
    exit 0
@@ -80,43 +146,23 @@ trap cleanup TERM INT
# --- Main Loop ---
echo "Starting Supervisor. Waiting for leadership settle..."
- echo "Node IP: $NODE_IP"
- echo "Consul: $CONSUL_HTTP_ADDR"
- # Small sleep to let LiteFS settle and leadership election complete
- sleep 5
+ wait_for_litefs || exit 1

while true; do
-     # In LiteFS 0.5, .primary file exists ONLY on replicas.
-     if [ ! -f "$DB_LOCK_FILE" ]; then
+     if check_primary; then
        # === WE ARE PRIMARY ===
-         # 1. If App is not running, start it and register
        if [ "${NAVIDROME_PID}" -eq 0 ] || ! kill -0 "${NAVIDROME_PID}" 2>/dev/null; then
-             if [ "${NAVIDROME_PID}" -gt 0 ]; then
-                 echo "CRITICAL: Navidrome crashed! Restarting..."
-             fi
            start_app
            register_service
        fi
-         # 2. Maintain the heartbeat (TTL)
        pass_ttl
    else
        # === WE ARE REPLICA ===
-         # If App is running (we were just demoted), stop it
        if [ "${NAVIDROME_PID}" -gt 0 ]; then
            echo "Lost leadership. Demoting..."
            stop_app
            deregister_service
        fi
-         # No service registration exists for replicas to keep Consul clean.
    fi
-     # Sleep short enough to update TTL (every 5s is safe for 15s TTL)
-     sleep 5 &
-     wait $! # Wait allows the 'trap' to interrupt the sleep instantly
+     sleep 10
done

View File

@@ -8,29 +8,19 @@ data:
# Use Consul for leader election
lease:
  type: "consul"
+   candidate: true
+   promote: true
  advertise-url: "http://${ADVERTISE_IP}:20202"
  consul:
    url: "${CONSUL_URL}"
-     key: "litefs/navidrome"
+     key: "litefs/navidrome-v8"
+     ttl: "30s"
+     lock-delay: "5s"

# Internal HTTP API for replication
http:
  addr: "0.0.0.0:20202"

- # The HTTP Proxy routes traffic to handle write-forwarding
- proxy:
-   addr: ":8080"
-   target: "localhost:4533"
-   db: "navidrome.db"
-   passthrough:
-     - "*.js"
-     - "*.css"
-     - "*.png"
-     - "*.jpg"
-     - "*.jpeg"
-     - "*.gif"
-     - "*.svg"

# Commands to run only on the primary node.
exec:
  - cmd: "/usr/local/bin/entrypoint.sh"

View File

@@ -1,79 +0,0 @@
job "navidrome-litefs" {
datacenters = ["dc1"]
type = "service"
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
group "navidrome" {
count = 4
update {
max_parallel = 1
min_healthy_time = "30s"
healthy_deadline = "5m"
auto_revert = false
}
constraint {
distinct_hosts = true
}
network {
# Request static ports on the host
port "http" {
static = 4533
to = 4533 # Direct to Navidrome
}
port "litefs" {
static = 20202
to = 20202 # Maps host 20202 to container 20202 (LiteFS Replication)
}
}
task "navidrome" {
driver = "docker"
config {
image = "gitea.service.dc1.fbleagh.duckdns.org/sstent/navidrome-litefs:e56fb94fdc0ac1f70abdb613b64ce6b4d7a770cf"
privileged = true # Still needed for FUSE
ports = ["http", "litefs"]
force_pull = true
volumes = [
"/mnt/configs/navidrome_litefs:/var/lib/litefs",
"/mnt/Public/configs/navidrome:/shared_data",
"/mnt/Public/Downloads/Clean_Music:/music/CleanMusic:ro",
"/mnt/Public/Downloads/news/slskd/downloads:/music/slskd:ro",
"/mnt/Public/Downloads/incoming_music:/music/incomingmusic:ro"
]
}
env {
# LiteFS Config
CONSUL_URL = "http://${attr.unique.network.ip-address}:8500"
ADVERTISE_IP = "${attr.unique.network.ip-address}"
PORT = "8080" # Internal proxy port (unused but kept)
# Navidrome Config
ND_DATAFOLDER = "/data"
ND_PLUGINS_FOLDER = "/shared_data/plugins"
ND_CACHEFOLDER = "/shared_data/cache"
ND_BACKUP_PATH = "/shared_data/backup"
ND_LOGLEVEL = "info"
ND_REVERSEPROXYWHITELIST = "0.0.0.0/0"
ND_REVERSEPROXYUSERHEADER = "X-Forwarded-User"
}
# NO service block here! Managed by register.sh inside the container.
resources {
cpu = 500
memory = 512
}
}
}
}

View File

@@ -1,179 +1,84 @@
variable "container_sha" {
type = string
default = "045fc6e82b9ecb6bebc1f095f62498935df70bbf"
}
job "navidrome-litefs" { job "navidrome-litefs" {
datacenters = ["dc1"] datacenters = ["dc1"]
type = "service" type = "service"
# We pin to Linux because LiteFS requires FUSE
constraint { constraint {
attribute = "${attr.kernel.name}" attribute = "${attr.kernel.name}"
value = "linux" value = "linux"
} }
group "navidrome" { group "navidrome" {
count = 2 count = 4
update {
max_parallel = 1
min_healthy_time = "30s"
healthy_deadline = "5m"
auto_revert = false
}
constraint { constraint {
distinct_hosts = true distinct_hosts = true
} }
network { network {
mode = "host" # Request static ports on the host
port "http" {} port "http" {
} static = 4533
to = 4533 # Direct to Navidrome
# --- Setup Task ---
task "setup" {
driver = "docker"
lifecycle {
hook = "prestart"
sidecar = false
} }
config { port "litefs" {
image = "busybox" static = 20202
command = "mkdir" to = 20202 # Maps host 20202 to container 20202 (LiteFS Replication)
args = ["-p", "/alloc/sqlite"]
network_mode = "host"
} }
} }
# --- LiteFS Task ---
task "litefs" {
driver = "docker"
config {
image = "flyio/litefs:0.5"
privileged = true # Needed for FUSE
ports = ["http"]
network_mode = "host"
# 1. Bind mount for LiteFS internal data (chunks/WAL)
# 2. Bind mount for the config
# 3. Mount the shared alloc dir so we can mount FUSE on it
volumes = [
"/mnt/configs/navidrome_litefs:/var/lib/litefs",
"local/litefs.yml:/etc/litefs.yml"
]
mounts = [
{
type = "bind"
source = "../alloc/sqlite"
target = "/mnt/sqlite"
bind_options = {
propagation = "shared"
}
}
]
}
# Create the config file
template {
left_delimiter = "[["
right_delimiter = "]]"
data = <<EOF
fuse:
# This matches the internal mount point in the container
dir: "/mnt/sqlite"
data:
# Internal data storage
dir: "/var/lib/litefs"
# Use Consul for leader election
lease:
type: "consul"
consul:
url: "http://[[ env `attr.unique.network.ip-address` ]]:8500"
key: "litefs/navidrome"
# The HTTP Proxy routes traffic
proxy:
addr: ":[[ env `NOMAD_PORT_http` ]]"
target: "127.0.0.1:4533" # Navidrome's internal port
db: "navidrome.db" # The DB to track for transaction consistency
passthrough: # Paths that don't need write-forwarding (optional optimizations)
- "*.js"
- "*.css"
- "*.png"
EOF
destination = "local/litefs.yml"
}
resources {
cpu = 200
memory = 256
}
}
# --- Navidrome Task (The App) ---
task "navidrome" { task "navidrome" {
driver = "docker" driver = "docker"
config { config {
image = "ghcr.io/navidrome/navidrome:latest" image = "gitea.service.dc1.fbleagh.duckdns.org/sstent/navidrome-litefs:${var.container_sha}"
memory_hard_limit = "2048" privileged = true # Still needed for FUSE
ports = [] # No ports exposed directly! ports = ["http", "litefs"]
network_mode = "host" force_pull = true
# We mount the sqlite dir from the allocation directory
mounts = [
{
type = "bind"
source = "../alloc/sqlite"
target = "/data"
bind_options = {
propagation = "shared"
}
}
]
# Shared Music and Configs
volumes = [ volumes = [
"/mnt/configs/navidrome_litefs:/var/lib/litefs",
"/mnt/Public/configs/navidrome:/shared_data",
"/mnt/Public/Downloads/Clean_Music:/music/CleanMusic:ro", "/mnt/Public/Downloads/Clean_Music:/music/CleanMusic:ro",
"/mnt/Public/Downloads/news/slskd/downloads:/music/slskd:ro", "/mnt/Public/Downloads/news/slskd/downloads:/music/slskd:ro",
"/mnt/Public/Downloads/incoming_music:/music/incomingmusic:ro", "/mnt/Public/Downloads/incoming_music:/music/incomingmusic:ro"
"/mnt/Public/configs/navidrome:/shared_data"
] ]
} }
env { env {
ND_DATAFOLDER = "/local/data" # LiteFS Config
ND_CACHEFOLDER = "/shared_data/cache" CONSUL_URL = "http://${attr.unique.network.ip-address}:8500"
ND_CONFIGFILE= "/local/data/navidrome.toml" ADVERTISE_IP = "${attr.unique.network.ip-address}"
PORT = "8080" # Internal proxy port (unused but kept)
# Navidrome Config
ND_DATAFOLDER = "/shared_data"
ND_PLUGINS_FOLDER = "/shared_data/plugins"
ND_CACHEFOLDER = "/shared_data/cache"
ND_BACKUP_PATH = "/shared_data/backup"
ND_ARTISTIMAGEFOLDER = "artist_images"
ND_BACKUPSCHEDULE = ""
# Important: LiteFS handles locking, but we still want WAL mode.
ND_DBPATH = "/data/navidrome.db?_busy_timeout=30000&_journal_mode=WAL&_foreign_keys=on&synchronous=NORMAL"
# Disable internal scheduling to prevent redundant scans on secondary nodes.
ND_SCANSCHEDULE = "0" ND_SCANSCHEDULE = "0"
ND_SCANNER_FSWATCHER_ENABLED = "false" ND_SCANNER_FSWATCHER_ENABLED = "false"
ND_FORCE_REDEPLOY = "5"
ND_LOGLEVEL = "info" ND_LOGLEVEL = "info"
ND_REVERSEPROXYWHITELIST = "0.0.0.0/0" ND_REVERSEPROXYWHITELIST = "0.0.0.0/0"
ND_REVERSEPROXYUSERHEADER = "X-Forwarded-User" ND_REVERSEPROXYUSERHEADER = "X-Forwarded-User"
} }
service { # NO service block here! Managed by register.sh inside the container.
name = "navidrome"
tags = [
"navidrome",
"web",
"traefik.enable=true",
"urlprefix-/navidrome",
"tools",
"traefik.http.routers.navidromelan.rule=Host(`navidrome.service.dc1.consul`)",
"traefik.http.routers.navidromewan.rule=Host(`m.fbleagh.duckdns.org`)",
"traefik.http.routers.navidromewan.middlewares=dex@consulcatalog",
"traefik.http.routers.navidromewan.tls=true",
]
port = "http" # This maps to the LiteFS proxy port defined in network block
check {
type = "http"
path = "/app" # LiteFS proxy passes this through
interval = "10s"
timeout = "2s"
}
}
resources { resources {
cpu = 500 cpu = 500
@@ -181,4 +86,4 @@ EOF
} }
} }
} }
} }