diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
new file mode 100644
index 0000000..67eec1c
--- /dev/null
+++ b/.github/workflows/deploy.yml
@@ -0,0 +1,63 @@
+# Deploys the navidrome-litefs Nomad job after a successful image build,
+# or on demand with an optional explicit container SHA.
+name: Deploy to Nomad
+
+on:
+  workflow_run:
+    workflows: ["Build and Push Docker Image"]
+    types:
+      - completed
+  workflow_dispatch:
+    inputs:
+      container_sha:
+        description: 'Container SHA to deploy (leave empty for latest commit)'
+        required: false
+        type: string
+
+jobs:
+  nomad:
+    runs-on: ubuntu-latest
+    name: Deploy to Nomad
+    # Run only when the triggering build succeeded, or on a manual dispatch.
+    if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
+
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Setup Nomad CLI
+        uses: hashicorp/setup-nomad@v2
+        with:
+          version: '1.10.5'
+
+      - name: Set Container Version
+        id: container_version
+        # Context values reach the script through env vars rather than being
+        # expanded into the script text, so a crafted `container_sha` input
+        # cannot inject shell commands.
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          INPUT_SHA: ${{ inputs.container_sha }}
+          RUN_HEAD_SHA: ${{ github.event.workflow_run.head_sha }}
+          COMMIT_SHA: ${{ github.sha }}
+        run: |
+          if [ "$EVENT_NAME" = "workflow_dispatch" ] && [ -n "$INPUT_SHA" ]; then
+            sha="$INPUT_SHA"
+          elif [ "$EVENT_NAME" = "workflow_run" ]; then
+            sha="$RUN_HEAD_SHA"
+          else
+            sha="$COMMIT_SHA"
+          fi
+          echo "sha=$sha" >> "$GITHUB_OUTPUT"
+
+      - name: Deploy Nomad Job
+        id: deploy
+        env:
+          NOMAD_ADDR: http://192.168.4.36:4646
+          NOMAD_TOKEN: ${{ secrets.NOMAD_TOKEN }}
+          CONTAINER_SHA: ${{ steps.container_version.outputs.sha }}
+        run: |
+          echo "Deploying container version: $CONTAINER_SHA"
+          nomad job run \
+            -var="container_sha=$CONTAINER_SHA" \
+            navidrome-litefs.nomad
diff --git a/conductor/tracks/diagnose_and_enhance_20260208/plan.md b/conductor/tracks/diagnose_and_enhance_20260208/plan.md
deleted file mode 100644
index 909b58b..0000000
--- a/conductor/tracks/diagnose_and_enhance_20260208/plan.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# Plan: Cluster Diagnosis and Script Enhancement (`diagnose_and_enhance`)
-
-## Phase 1: Enhanced Diagnostics (Consul) [x] [checkpoint: a686c5b]
-- [x] Task: Update `consul_client.py` to fetch
detailed health check output - - [x] Write tests for fetching `Output` field from Consul checks - - [x] Implement logic to extract and store the `Output` (error message) -- [x] Task: Update aggregator and formatter to display Consul errors - - [x] Update aggregation logic to include `consul_error` - - [x] Update table formatter to indicate an error (maybe a flag or color) - - [x] Add a "Diagnostics" section to the output to print full error details -- [x] Task: Conductor - User Manual Verification 'Phase 1: Enhanced Diagnostics (Consul)' (Protocol in workflow.md) - -## Phase 2: Nomad Integration and Logs [x] [checkpoint: 6d77729] -- [x] Task: Implement `nomad_client.py` wrapper - - [x] Write tests for `get_allocation_logs`, `get_node_status`, and `restart_allocation` (mocking subprocess) - - [x] Implement `subprocess.run(["nomad", ...])` logic to fetch logs and restart allocations -- [x] Task: Integrate Nomad logs into diagnosis - - [x] Update aggregator to call Nomad client for critical nodes - - [x] Update "Diagnostics" section to display the last 20 lines of stderr -- [x] Task: Conductor - User Manual Verification 'Phase 2: Nomad Integration and Logs' (Protocol in workflow.md) - -## Phase 3: Advanced LiteFS Status [ ] -- [ ] Task: Implement `litefs_status` via `nomad alloc exec` - - [ ] Write tests for executing remote commands via Nomad - - [ ] Update `litefs_client.py` to fallback to `nomad alloc exec` if HTTP fails - - [ ] Parse `litefs status` output (text/json) to extract uptime and replication lag -- [ ] Task: Final Polish and Diagnosis Run - - [ ] Ensure all pieces work together - - [ ] Run the script to diagnose `odroid8` -- [ ] Task: Conductor - User Manual Verification 'Phase 3: Advanced LiteFS Status' (Protocol in workflow.md) diff --git a/conductor/tracks/fix_litefs_config_20260208/plan.md b/conductor/tracks/fix_litefs_config_20260208/plan.md deleted file mode 100644 index 840c4b9..0000000 --- a/conductor/tracks/fix_litefs_config_20260208/plan.md +++ 
/dev/null @@ -1,22 +0,0 @@ -# Plan: Fix LiteFS Configuration and Process Management (`fix_litefs_config`) - -## Phase 1: Configuration and Image Structure [ ] -- [x] Task: Update `litefs.yml` to include the `exec` block (396dfeb) -- [x] Task: Update `Dockerfile` to use LiteFS as the supervisor (`ENTRYPOINT ["litefs", "mount"]`) (ef91b8e) -- [x] Task: Update `navidrome-litefs-v2.nomad` with corrected storage paths (`ND_DATAFOLDER`, `ND_CACHEFOLDER`, `ND_BACKUP_PATH`) (5cbb657) -- [ ] Task: Conductor - User Manual Verification 'Phase 1: Configuration and Image Structure' (Protocol in workflow.md) - -## Phase 2: Entrypoint and Registration Logic [x] [checkpoint: 9cd5455] -- [x] Task: Refactor `entrypoint.sh` to handle leadership-aware process management (9cd5455) - - [x] Integrate Consul registration logic (from `register.sh`) - - [x] Implement loop to start/stop Navidrome based on `/data/.primary` existence - - [x] Ensure proper signal handling for Navidrome shutdown -- [x] Task: Clean up redundant scripts (e.g., `register.sh` if fully integrated) (9cd5455) -- [ ] Task: Conductor - User Manual Verification 'Phase 2: Entrypoint and Registration Logic' (Protocol in workflow.md) - -## Phase 3: Deployment and Failover Verification [ ] -- [ ] Task: Build and push the updated Docker image via Gitea Actions (if possible) or manual trigger -- [ ] Task: Deploy the updated Nomad job -- [ ] Task: Verify cluster health and process distribution using `cluster_status` script -- [ ] Task: Perform a manual failover (stop primary allocation) and verify Navidrome migrates correctly -- [ ] Task: Conductor - User Manual Verification 'Phase 3: Deployment and Failover Verification' (Protocol in workflow.md) diff --git a/conductor/tracks/fix_navidrome_paths_20260209/index.md b/conductor/tracks/fix_navidrome_paths_20260209/index.md deleted file mode 100644 index 6e18d5b..0000000 --- a/conductor/tracks/fix_navidrome_paths_20260209/index.md +++ /dev/null @@ -1,5 +0,0 @@ -# Track 
fix_navidrome_paths_20260209 Context - -- [Specification](./spec.md) -- [Implementation Plan](./plan.md) -- [Metadata](./metadata.json) diff --git a/conductor/tracks/fix_navidrome_paths_20260209/metadata.json b/conductor/tracks/fix_navidrome_paths_20260209/metadata.json deleted file mode 100644 index e065794..0000000 --- a/conductor/tracks/fix_navidrome_paths_20260209/metadata.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "track_id": "fix_navidrome_paths_20260209", - "type": "bug", - "status": "new", - "created_at": "2026-02-09T14:30:00Z", - "updated_at": "2026-02-09T14:30:00Z", - "description": "Fix Navidrome database location to ensure it uses LiteFS mount and resolve process path conflicts." -} diff --git a/conductor/tracks/fix_navidrome_paths_20260209/plan.md b/conductor/tracks/fix_navidrome_paths_20260209/plan.md deleted file mode 100644 index d3fafa5..0000000 --- a/conductor/tracks/fix_navidrome_paths_20260209/plan.md +++ /dev/null @@ -1,17 +0,0 @@ -# Plan: Correct Navidrome Database and Plugins Location (`fix_navidrome_paths`) - -## Phase 1: Configuration Updates [x] -- [x] Task: Update `navidrome-litefs-v2.nomad` with corrected paths (76398de) -- [x] Task: Update `entrypoint.sh` to handle plugins folder and environment cleanup (decb9f5) -- [x] Task: Conductor - User Manual Verification 'Phase 1: Configuration Updates' (Protocol in workflow.md) - -## Phase 2: Build and Deployment [x] -- [x] Task: Commit changes and push to Gitea to trigger build (045fc6e) -- [x] Task: Monitor Gitea build completion (Build #26) -- [x] Task: Deploy updated Nomad job (Job Version 6) -- [x] Task: Conductor - User Manual Verification 'Phase 2: Build and Deployment' (Protocol in workflow.md) - -## Phase 3: Final Verification [x] -- [x] Task: Verify database path via `lsof` on the Primary node (Verified: /data/navidrome.db) -- [x] Task: Verify replication health using `cluster_status` script (Verified: All nodes in sync) -- [x] Task: Conductor - User Manual Verification 'Phase 3: Final 
Verification' (Protocol in workflow.md) \ No newline at end of file diff --git a/conductor/tracks/fix_navidrome_paths_20260209/spec.md b/conductor/tracks/fix_navidrome_paths_20260209/spec.md deleted file mode 100644 index 4a97bb8..0000000 --- a/conductor/tracks/fix_navidrome_paths_20260209/spec.md +++ /dev/null @@ -1,25 +0,0 @@ -# Specification: Correct Navidrome Database and Plugins Location (`fix_navidrome_paths`) - -## Overview -Force Navidrome to use the `/data` LiteFS mount for its SQLite database by setting the `DATAFOLDER` to `/data`. To avoid the "Operation not permitted" error caused by LiteFS's restriction on directory creation, redirect the Navidrome plugins folder to persistent shared storage. - -## Functional Requirements -- **Nomad Job Configuration (`navidrome-litefs-v2.nomad`):** - - Set `ND_DATAFOLDER="/data"`. This will force Navidrome to create and use `navidrome.db` on the LiteFS mount. - - Set `ND_PLUGINSFOLDER="/shared_data/plugins"`. This prevents Navidrome from attempting to create a `plugins` directory in the read-only/virtual `/data` mount. - - Keep `ND_CACHEFOLDER` and `ND_BACKUP_PATH` pointing to `/shared_data` subdirectories. -- **Entrypoint Logic (`entrypoint.sh`):** - - Ensure it creates `/shared_data/plugins` if it doesn't exist. - - Remove the explicit `export ND_DATABASE_PATH` if it conflicts with the new `DATAFOLDER` logic, or keep it as an explicit override. -- **Verification:** - - Confirm via `lsof` that Navidrome is finally using `/data/navidrome.db`. - - Confirm that LiteFS `/debug/vars` now reports the database in its active set. - -## Non-Functional Requirements -- **Persistence:** Ensure all non-database files (plugins, cache, backups) are stored on the shared host mount (`/shared_data`) to survive container restarts and migrations. - -## Acceptance Criteria -- [ ] Navidrome successfully starts with `/data` as its data folder. -- [ ] No "Operation not permitted" errors occur during startup. 
-- [ ] `lsof` confirms `/data/navidrome.db` is open by the Navidrome process. -- [ ] LiteFS `txid` increases on the Primary and replicates to Replicas when Navidrome writes to the DB. diff --git a/conductor/tracks/fix_odroid8_and_script_20260208/plan.md b/conductor/tracks/fix_odroid8_and_script_20260208/plan.md deleted file mode 100644 index 96565c0..0000000 --- a/conductor/tracks/fix_odroid8_and_script_20260208/plan.md +++ /dev/null @@ -1,26 +0,0 @@ -# Plan: Fix Odroid8 and Script Robustness (`fix_odroid8_and_script`) - -## Phase 1: Script Robustness [x] [checkpoint: 860000b] -- [x] Task: Update `nomad_client.py` to handle subprocess errors gracefully - - [x] Write tests for handling Nomad CLI absence/failure - - [x] Update implementation to return descriptive error objects or `None` without crashing -- [x] Task: Update aggregator and formatter to handle Nomad errors - - [x] Update `cluster_aggregator.py` to gracefully skip Nomad calls if they fail - - [x] Update `output_formatter.py` to display "Nomad Error" in relevant cells - - [x] Add a global "Nomad Connectivity Warning" to the summary -- [x] Task: Conductor - User Manual Verification 'Phase 1: Script Robustness' (Protocol in workflow.md) - -## Phase 2: Odroid8 Recovery [ ] -- [x] Task: Identify and verify `odroid8` LiteFS data path - - [x] Run `nomad alloc status` to find the volume mount for `odroid8` - - [x] Provide the user with the exact host path to the LiteFS data -- [x] Task: Guide user through manual cleanup - - [x] Provide steps to stop the allocation - - [x] Provide the `rm` command to clear the LiteFS metadata - - [x] Provide steps to restart and verify the node -- [~] Task: Conductor - User Manual Verification 'Phase 2: Odroid8 Recovery' (Protocol in workflow.md) - -## Phase 3: Final Verification [x] -- [x] Task: Final verification run of the script -- [x] Task: Verify cluster health in Consul and LiteFS API -- [x] Task: Conductor - User Manual Verification 'Phase 3: Final Verification' (Protocol 
in workflow.md) diff --git a/conductor/tracks/implement_ttl_heartbeat_20260208/plan.md b/conductor/tracks/implement_ttl_heartbeat_20260208/plan.md deleted file mode 100644 index 20c2300..0000000 --- a/conductor/tracks/implement_ttl_heartbeat_20260208/plan.md +++ /dev/null @@ -1,22 +0,0 @@ -# Plan: Implement TTL Heartbeat Service Registration (`implement_ttl_heartbeat`) - -## Phase 1: Container Environment Preparation [x] [checkpoint: 51b8fce] -- [x] Task: Update `Dockerfile` to install `curl` and `jq` (f7fe258) -- [x] Task: Verify `litefs.yml` points to `entrypoint.sh` (should already be correct) (verified) -- [x] Task: Conductor - User Manual Verification 'Phase 1: Container Environment Preparation' (Protocol in workflow.md) - -## Phase 2: Script Implementation [x] [checkpoint: 139016f] -- [x] Task: Refactor `entrypoint.sh` with the TTL Heartbeat logic (d977301) - - [x] Implement `register_service` with TTL check definition - - [x] Implement `pass_ttl` loop - - [x] Implement robust `stop_app` and signal trapping - - [x] Ensure correct Primary/Replica detection logic (LiteFS 0.5: Primary = No `.primary` file) -- [x] Task: Conductor - User Manual Verification 'Phase 2: Script Implementation' (Protocol in workflow.md) - -## Phase 3: Deployment and Verification [ ] -- [ ] Task: Commit changes and push to Gitea to trigger build -- [ ] Task: Monitor Gitea build completion -- [ ] Task: Deploy updated Nomad job (forcing update if necessary) -- [ ] Task: Verify "Clean" state in Consul (only one primary registered) -- [ ] Task: Verify Failover/Stop behavior (immediate deregistration vs TTL expiry) -- [ ] Task: Conductor - User Manual Verification 'Phase 3: Deployment and Verification' (Protocol in workflow.md) diff --git a/conductor/tracks/update_monitor_discovery_20260208/plan.md b/conductor/tracks/update_monitor_discovery_20260208/plan.md deleted file mode 100644 index 3043be8..0000000 --- a/conductor/tracks/update_monitor_discovery_20260208/plan.md +++ /dev/null @@ -1,23 
+0,0 @@ -# Plan: Update Monitor Discovery Logic (`update_monitor_discovery`) - -## Phase 1: Nomad Discovery Enhancement [x] [checkpoint: 353683e] -- [x] Task: Update `nomad_client.py` to fetch job allocations with IPs (353683e) - - [x] Write tests for parsing allocation IPs from `nomad job status` or `nomad alloc status` - - [x] Implement `get_job_allocations(job_id)` returning a list of dicts (id, node, ip) -- [x] Task: Conductor - User Manual Verification 'Phase 1: Nomad Discovery Enhancement' (Protocol in workflow.md) - -## Phase 2: Aggregator Refactor [x] [checkpoint: 655a9b2] -- [x] Task: Refactor `cluster_aggregator.py` to drive discovery via Nomad (655a9b2) - - [x] Update `get_cluster_status` to call `nomad_client.get_job_allocations` first - - [x] Update loop to iterate over allocations and supplement with LiteFS and Consul data -- [x] Task: Update `consul_client.py` to fetch all services once and allow lookup by IP/ID (655a9b2) -- [x] Task: Update tests for the new discovery flow (655a9b2) -- [x] Task: Conductor - User Manual Verification 'Phase 2: Aggregator Refactor' (Protocol in workflow.md) - -## Phase 3: UI and Health Logic [x] [checkpoint: 21e9c3d] -- [x] Task: Update `output_formatter.py` for "Standby" nodes (21e9c3d) - - [x] Update table formatting to handle missing Consul status for replicas -- [x] Task: Update Cluster Health calculation (21e9c3d) - - [x] "Healthy" = 1 Primary (Consul passing) + N Replicas (LiteFS connected) -- [x] Task: Final verification run (21e9c3d) -- [x] Task: Conductor - User Manual Verification 'Phase 3: Final Verification' (Protocol in workflow.md) diff --git a/entrypoint.sh b/entrypoint.sh index 73173f9..c2c491d 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -16,6 +16,25 @@ PRIMARY_TAGS='["navidrome","web","traefik.enable=true","urlprefix-/navidrome","t # --- Helper Functions --- +# Backup Database (Only on Primary) +run_backup() { + local backup_dir="/shared_data/backup" + local timestamp=$(date +%Y%m%d_%H%M%S) + 
local backup_file="${backup_dir}/navidrome.db_${timestamp}.bak" + + echo "Backing up database to ${backup_file}..." + mkdir -p "$backup_dir" + + if litefs export -name navidrome.db "$backup_file"; then + echo "Backup successful." + # Keep only last 7 days + find "$backup_dir" -name "navidrome.db_*.bak" -mtime +7 -delete + echo "Old backups cleaned." + else + echo "ERROR: Backup failed!" + fi +} + # Register Service with TTL Check register_service() { echo "Promoted! Registering service ${SERVICE_ID}..." @@ -86,6 +105,9 @@ echo "Consul: $CONSUL_HTTP_ADDR" # Small sleep to let LiteFS settle and leadership election complete sleep 5 +LAST_BACKUP_TIME=0 +BACKUP_INTERVAL=86400 # 24 hours + while true; do # In LiteFS 0.5, .primary file exists ONLY on replicas. if [ ! -f "$DB_LOCK_FILE" ]; then @@ -103,6 +125,13 @@ while true; do # 2. Maintain the heartbeat (TTL) pass_ttl + # 3. Handle periodic backup + CURRENT_TIME=$(date +%s) + if [ $((CURRENT_TIME - LAST_BACKUP_TIME)) -ge $BACKUP_INTERVAL ]; then + run_backup + LAST_BACKUP_TIME=$CURRENT_TIME + fi + else # === WE ARE REPLICA === @@ -111,6 +140,8 @@ while true; do echo "Lost leadership. Demoting..." stop_app deregister_service + # Reset backup timer so the next primary can start fresh or we start fresh if promoted again + LAST_BACKUP_TIME=0 fi # No service registration exists for replicas to keep Consul clean. 
diff --git a/navidrome-litefs-v2.nomad b/navidrome-litefs-v2.nomad deleted file mode 100644 index 5635586..0000000 --- a/navidrome-litefs-v2.nomad +++ /dev/null @@ -1,82 +0,0 @@ -job "navidrome-litefs" { - datacenters = ["dc1"] - type = "service" - - constraint { - attribute = "${attr.kernel.name}" - value = "linux" - } - - group "navidrome" { - count = 4 - - update { - max_parallel = 1 - min_healthy_time = "30s" - healthy_deadline = "5m" - auto_revert = false - } - - constraint { - distinct_hosts = true - } - - network { - # Request static ports on the host - port "http" { - static = 4533 - to = 4533 # Direct to Navidrome - } - port "litefs" { - static = 20202 - to = 20202 # Maps host 20202 to container 20202 (LiteFS Replication) - } - } - - task "navidrome" { - driver = "docker" - - config { - image = "gitea.service.dc1.fbleagh.duckdns.org/sstent/navidrome-litefs:045fc6e82b9ecb6bebc1f095f62498935df70bbf" - privileged = true # Still needed for FUSE - ports = ["http", "litefs"] - force_pull = true - - volumes = [ - "/mnt/configs/navidrome_litefs:/var/lib/litefs", - "/mnt/Public/configs/navidrome:/shared_data", - "/mnt/Public/Downloads/Clean_Music:/music/CleanMusic:ro", - "/mnt/Public/Downloads/news/slskd/downloads:/music/slskd:ro", - "/mnt/Public/Downloads/incoming_music:/music/incomingmusic:ro" - ] - } - - env { - # LiteFS Config - CONSUL_URL = "http://${attr.unique.network.ip-address}:8500" - ADVERTISE_IP = "${attr.unique.network.ip-address}" - PORT = "8080" # Internal proxy port (unused but kept) - - # Navidrome Config - ND_DATAFOLDER = "/data" - ND_PLUGINS_FOLDER = "/shared_data/plugins" - ND_CACHEFOLDER = "/shared_data/cache" - ND_BACKUP_PATH = "/shared_data/backup" - - ND_SCANSCHEDULE = "0" - ND_SCANNER_FSWATCHER_ENABLED = "false" - ND_FORCE_REDEPLOY = "5" - ND_LOGLEVEL = "info" - ND_REVERSEPROXYWHITELIST = "0.0.0.0/0" - ND_REVERSEPROXYUSERHEADER = "X-Forwarded-User" - } - - # NO service block here! Managed by register.sh inside the container. 
- - resources { - cpu = 500 - memory = 512 - } - } - } -} diff --git a/navidrome-litefs.nomad b/navidrome-litefs.nomad index 0ccbed0..7cf2481 100644 --- a/navidrome-litefs.nomad +++ b/navidrome-litefs.nomad @@ -2,178 +2,82 @@ job "navidrome-litefs" { datacenters = ["dc1"] type = "service" - # We pin to Linux because LiteFS requires FUSE + variable "container_sha" { + type = string + default = "045fc6e82b9ecb6bebc1f095f62498935df70bbf" + } + constraint { attribute = "${attr.kernel.name}" value = "linux" } group "navidrome" { - count = 2 + count = 4 + + update { + max_parallel = 1 + min_healthy_time = "30s" + healthy_deadline = "5m" + auto_revert = false + } constraint { distinct_hosts = true } network { - mode = "host" - port "http" {} - } - - # --- Setup Task --- - task "setup" { - driver = "docker" - lifecycle { - hook = "prestart" - sidecar = false + # Request static ports on the host + port "http" { + static = 4533 + to = 4533 # Direct to Navidrome } - config { - image = "busybox" - command = "mkdir" - args = ["-p", "/alloc/sqlite"] - network_mode = "host" + port "litefs" { + static = 20202 + to = 20202 # Maps host 20202 to container 20202 (LiteFS Replication) } } - # --- LiteFS Task --- - task "litefs" { - driver = "docker" - - config { - image = "flyio/litefs:0.5" - privileged = true # Needed for FUSE - ports = ["http"] - network_mode = "host" - - # 1. Bind mount for LiteFS internal data (chunks/WAL) - # 2. Bind mount for the config - # 3. Mount the shared alloc dir so we can mount FUSE on it - volumes = [ - "/mnt/configs/navidrome_litefs:/var/lib/litefs", - "local/litefs.yml:/etc/litefs.yml" - ] - - mounts = [ - { - type = "bind" - source = "../alloc/sqlite" - target = "/mnt/sqlite" - bind_options = { - propagation = "shared" - } - } - ] - } - - # Create the config file - template { - left_delimiter = "[[" - right_delimiter = "]]" - data = <