Compare commits

..

29 Commits

Author SHA1 Message Date
f08c715d75 fix(nomad): Move variable definition to top-level
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 25s
2026-04-08 10:58:39 -07:00
8f1565b1af fix(deploy): Replace failing setup-nomad action with manual install
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 28s
2026-04-08 10:56:20 -07:00
4538ad5909 feat: Add automated LiteFS backups and GitHub deployment workflow
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 3m52s
2026-04-08 10:38:23 -07:00
f0b02904a8 chore(conductor): Mark track 'fix_navidrome_paths' as complete 2026-02-09 07:14:50 -08:00
7f1f3321e0 chore(conductor): Archive track 'fix_navidrome_paths' 2026-02-09 07:14:22 -08:00
23a65be4d8 docs(conductor): Synchronize tech-stack and finalize track 'fix_navidrome_paths' 2026-02-09 07:14:05 -08:00
045fc6e82b fix(nomad): Correct env var name to ND_PLUGINS_FOLDER
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 53s
2026-02-09 07:04:12 -08:00
e56fb94fdc conductor(plan): Mark Phase 1 as complete
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 39s
2026-02-09 06:54:24 -08:00
decb9f5860 feat(entrypoint): Ensure shared directories exist and clean up start_app 2026-02-09 06:53:31 -08:00
76398dec99 feat(nomad): Correct storage paths and remove conflicting /data bind mount 2026-02-09 06:52:10 -08:00
2746b4a550 chore(conductor): Archive track 'update_monitor_discovery' 2026-02-09 06:43:23 -08:00
8c58004d1c chore(conductor): Mark track 'update_monitor_discovery' as complete 2026-02-09 06:43:23 -08:00
ad49e12368 docs(conductor): Synchronize tech-stack and commit monitor script updates 2026-02-09 06:42:38 -08:00
1c693aade4 conductor(plan): Mark phase 'Phase 3: UI and Health Logic' as complete 2026-02-09 06:15:15 -08:00
21e9c3d72d conductor(checkpoint): Checkpoint end of Phase 3 - UI and Health Logic 2026-02-09 06:14:47 -08:00
c5a3cbfeb8 conductor(plan): Mark phase 'Phase 2: Aggregator Refactor' as complete 2026-02-09 06:13:35 -08:00
655a9b2571 conductor(checkpoint): Checkpoint end of Phase 2 - Aggregator Refactor 2026-02-09 06:13:09 -08:00
079498caba conductor(plan): Mark phase 'Phase 1: Nomad Discovery Enhancement' as complete 2026-02-09 06:11:07 -08:00
353683e2bf conductor(checkpoint): Checkpoint end of Phase 1 - Nomad Discovery Enhancement 2026-02-09 06:10:41 -08:00
9b6159a40c chore(conductor): Archive track 'implement_ttl_heartbeat' 2026-02-09 06:06:48 -08:00
ed8e7608f1 docs(conductor): Synchronize docs for track 'implement_ttl_heartbeat' 2026-02-09 06:05:58 -08:00
3c5968690c conductor(plan): Mark phase 'Phase 2: Script Implementation' as complete
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 48s
2026-02-09 05:54:39 -08:00
139016f121 conductor(checkpoint): Checkpoint end of Phase 2 2026-02-09 05:54:22 -08:00
d97730174d feat(entrypoint): Implement TTL heartbeat registration and robust supervision 2026-02-09 05:52:28 -08:00
27b10a39b8 conductor(plan): Mark phase 'Phase 1: Container Environment Preparation' as complete 2026-02-09 05:51:50 -08:00
51b8fce10b conductor(checkpoint): Checkpoint end of Phase 1 2026-02-09 05:51:28 -08:00
f7fe258480 feat(image): Add jq to Dockerfile for service registration logic 2026-02-09 05:48:43 -08:00
5a557145ac chore(conductor): Archive track 'fix_litefs_config' 2026-02-08 17:50:13 -08:00
01fa06e7dc docs(conductor): Synchronize docs for track 'fix_litefs_config' 2026-02-08 16:44:30 -08:00
35 changed files with 801 additions and 528 deletions

54
.github/workflows/deploy.yml vendored Normal file
View File

@@ -0,0 +1,54 @@
name: Deploy to Nomad
on:
workflow_run:
workflows: ["Build and Push Docker Image"]
types:
- completed
workflow_dispatch:
inputs:
container_sha:
description: 'Container SHA to deploy (leave empty for latest commit)'
required: false
type: string
jobs:
nomad:
runs-on: ubuntu-latest
name: Deploy to Nomad
if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
steps:
- name: Checkout Repository
uses: actions/checkout@v4
- name: Setup Nomad CLI
run: |
NOMAD_VERSION="1.10.5"
curl -sSL https://releases.hashicorp.com/nomad/${NOMAD_VERSION}/nomad_${NOMAD_VERSION}_linux_amd64.zip -o nomad.zip
unzip nomad.zip
sudo mv nomad /usr/local/bin/
rm nomad.zip
nomad version
- name: Set Container Version
id: container_version
run: |
if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ -n "${{ inputs.container_sha }}" ]; then
echo "sha=${{ inputs.container_sha }}" >> $GITHUB_OUTPUT
elif [ "${{ github.event_name }}" = "workflow_run" ]; then
echo "sha=${{ github.event.workflow_run.head_sha }}" >> $GITHUB_OUTPUT
else
echo "sha=${{ github.sha }}" >> $GITHUB_OUTPUT
fi
- name: Deploy Nomad Job
id: deploy
env:
NOMAD_ADDR: http://192.168.4.36:4646
NOMAD_TOKEN: ${{ secrets.NOMAD_TOKEN }}
run: |
echo "Deploying container version: ${{ steps.container_version.outputs.sha }}"
nomad job run \
-var="container_sha=${{ steps.container_version.outputs.sha }}" \
navidrome-litefs.nomad

View File

@@ -6,7 +6,7 @@ FROM ghcr.io/navidrome/navidrome:latest
# Install dependencies
USER root
RUN apk add --no-cache fuse3 ca-certificates bash curl
RUN apk add --no-cache fuse3 ca-certificates bash curl jq
# Copy LiteFS binary
COPY --from=litefs /usr/local/bin/litefs /usr/local/bin/litefs

View File

@@ -0,0 +1,5 @@
# Track fix_litefs_config_20260208 Context
- [Specification](./spec.md)
- [Implementation Plan](./plan.md)
- [Metadata](./metadata.json)

View File

@@ -0,0 +1,8 @@
{
"track_id": "fix_litefs_config_20260208",
"type": "feature",
"status": "new",
"created_at": "2026-02-08T18:00:00Z",
"updated_at": "2026-02-08T18:00:00Z",
"description": "Fix LiteFS configuration to use 'exec' for Navidrome and ensure it only runs on the Primary node. Also fix DB path configuration."
}

View File

@@ -15,8 +15,8 @@
- [ ] Task: Conductor - User Manual Verification 'Phase 2: Entrypoint and Registration Logic' (Protocol in workflow.md)
## Phase 3: Deployment and Failover Verification [ ]
- [ ] Task: Build and push the updated Docker image via Gitea Actions (if possible) or manual trigger
- [ ] Task: Deploy the updated Nomad job
- [~] Task: Build and push the updated Docker image via Gitea Actions (if possible) or manual trigger
- [~] Task: Deploy the updated Nomad job
- [ ] Task: Verify cluster health and process distribution using `cluster_status` script
- [ ] Task: Perform a manual failover (stop primary allocation) and verify Navidrome migrates correctly
- [ ] Task: Conductor - User Manual Verification 'Phase 3: Deployment and Failover Verification' (Protocol in workflow.md)

View File

@@ -0,0 +1,38 @@
# Specification: Fix LiteFS Configuration and Process Management (`fix_litefs_config`)
## Overview
Reconfigure the Navidrome/LiteFS process management to ensure Navidrome and its Consul service registration only occur on the Primary node. This will be achieved by leveraging the LiteFS `exec` block and updating the `entrypoint.sh` logic. Additionally, correct the Navidrome database and storage paths to properly utilize the LiteFS replicated mount.
## Functional Requirements
- **LiteFS Configuration (`litefs.yml`):**
- Enable the `exec` block to trigger `/usr/local/bin/entrypoint.sh`.
- This allows LiteFS to manage the lifecycle of the application.
- **Entrypoint Logic (`entrypoint.sh`):**
- Implement a supervision loop that monitors leadership via the `/data/.primary` file.
- **On Primary:**
- Register the node as the `navidrome` (primary) service in Consul.
- Start the Navidrome process.
- **On Replica:**
- Ensure Navidrome is NOT running.
- Deregister the `navidrome` primary service if previously registered.
- (Optional) Register as a replica service or simply wait.
- **On Transition:** Handle graceful shutdown of Navidrome if the node loses leadership.
- **Storage and Path Configuration (`navidrome-litefs-v2.nomad`):**
- Set `ND_DATAFOLDER` to `/data` (the LiteFS FUSE mount).
- Set `ND_CACHEFOLDER` to `/shared_data/cache` (shared persistent storage).
- Set `ND_BACKUP_PATH` to `/shared_data/backup` (shared persistent storage).
- **Dockerfile Updates:**
- Update `ENTRYPOINT` to `["litefs", "mount"]` to allow LiteFS to act as the supervisor.
## Non-Functional Requirements
- **Robustness:** Use a simple bash loop for process management to avoid extra dependencies.
- **Signal Handling:** Ensure signals (SIGTERM) are correctly forwarded to Navidrome for graceful shutdown.
## Acceptance Criteria
- [ ] Navidrome process runs ONLY on the Primary node.
- [ ] Consul service `navidrome` correctly points to the current Primary.
- [ ] Navidrome database (`navidrome.db`) is confirmed to be on the `/data` mount.
- [ ] Cluster failover correctly stops Navidrome on the old primary and starts it on the new one.
## Out of Scope
- Implementation of complex init systems like `tini` (bash loop selected by user).

View File

@@ -0,0 +1,5 @@
# Track fix_navidrome_paths_20260209 Context
- [Specification](./spec.md)
- [Implementation Plan](./plan.md)
- [Metadata](./metadata.json)

View File

@@ -0,0 +1,8 @@
{
"track_id": "fix_navidrome_paths_20260209",
"type": "bug",
"status": "new",
"created_at": "2026-02-09T14:30:00Z",
"updated_at": "2026-02-09T14:30:00Z",
"description": "Fix Navidrome database location to ensure it uses LiteFS mount and resolve process path conflicts."
}

View File

@@ -0,0 +1,17 @@
# Plan: Correct Navidrome Database and Plugins Location (`fix_navidrome_paths`)
## Phase 1: Configuration Updates [x]
- [x] Task: Update `navidrome-litefs-v2.nomad` with corrected paths (76398de)
- [x] Task: Update `entrypoint.sh` to handle plugins folder and environment cleanup (decb9f5)
- [x] Task: Conductor - User Manual Verification 'Phase 1: Configuration Updates' (Protocol in workflow.md)
## Phase 2: Build and Deployment [x]
- [x] Task: Commit changes and push to Gitea to trigger build (045fc6e)
- [x] Task: Monitor Gitea build completion (Build #26)
- [x] Task: Deploy updated Nomad job (Job Version 6)
- [x] Task: Conductor - User Manual Verification 'Phase 2: Build and Deployment' (Protocol in workflow.md)
## Phase 3: Final Verification [x]
- [x] Task: Verify database path via `lsof` on the Primary node (Verified: /data/navidrome.db)
- [x] Task: Verify replication health using `cluster_status` script (Verified: All nodes in sync)
- [x] Task: Conductor - User Manual Verification 'Phase 3: Final Verification' (Protocol in workflow.md)

View File

@@ -0,0 +1,25 @@
# Specification: Correct Navidrome Database and Plugins Location (`fix_navidrome_paths`)
## Overview
Force Navidrome to use the `/data` LiteFS mount for its SQLite database by setting `ND_DATAFOLDER` to `/data`. To avoid the "Operation not permitted" error caused by LiteFS's restriction on directory creation, redirect the Navidrome plugins folder to persistent shared storage.
## Functional Requirements
- **Nomad Job Configuration (`navidrome-litefs-v2.nomad`):**
- Set `ND_DATAFOLDER="/data"`. This will force Navidrome to create and use `navidrome.db` on the LiteFS mount.
- Set `ND_PLUGINSFOLDER="/shared_data/plugins"`. This prevents Navidrome from attempting to create a `plugins` directory in the read-only/virtual `/data` mount.
- Keep `ND_CACHEFOLDER` and `ND_BACKUP_PATH` pointing to `/shared_data` subdirectories.
- **Entrypoint Logic (`entrypoint.sh`):**
- Ensure it creates `/shared_data/plugins` if it doesn't exist.
- Remove the explicit `export ND_DATABASE_PATH` if it conflicts with the new `ND_DATAFOLDER` logic, or keep it as an explicit override.
- **Verification:**
- Confirm via `lsof` that Navidrome is finally using `/data/navidrome.db`.
- Confirm that LiteFS `/debug/vars` now reports the database in its active set.
## Non-Functional Requirements
- **Persistence:** Ensure all non-database files (plugins, cache, backups) are stored on the shared host mount (`/shared_data`) to survive container restarts and migrations.
## Acceptance Criteria
- [ ] Navidrome successfully starts with `/data` as its data folder.
- [ ] No "Operation not permitted" errors occur during startup.
- [ ] `lsof` confirms `/data/navidrome.db` is open by the Navidrome process.
- [ ] LiteFS `txid` increases on the Primary and replicates to Replicas when Navidrome writes to the DB.

View File

@@ -0,0 +1,5 @@
# Track implement_ttl_heartbeat_20260208 Context
- [Specification](./spec.md)
- [Implementation Plan](./plan.md)
- [Metadata](./metadata.json)

View File

@@ -0,0 +1,8 @@
{
"track_id": "implement_ttl_heartbeat_20260208",
"type": "enhancement",
"status": "new",
"created_at": "2026-02-08T19:00:00Z",
"updated_at": "2026-02-08T19:00:00Z",
"description": "Implement TTL Heartbeat architecture for robust Consul service registration and cleaner failure handling."
}

View File

@@ -0,0 +1,22 @@
# Plan: Implement TTL Heartbeat Service Registration (`implement_ttl_heartbeat`)
## Phase 1: Container Environment Preparation [x] [checkpoint: 51b8fce]
- [x] Task: Update `Dockerfile` to install `curl` and `jq` (f7fe258)
- [x] Task: Verify `litefs.yml` points to `entrypoint.sh` (should already be correct) (verified)
- [x] Task: Conductor - User Manual Verification 'Phase 1: Container Environment Preparation' (Protocol in workflow.md)
## Phase 2: Script Implementation [x] [checkpoint: 139016f]
- [x] Task: Refactor `entrypoint.sh` with the TTL Heartbeat logic (d977301)
- [x] Implement `register_service` with TTL check definition
- [x] Implement `pass_ttl` loop
- [x] Implement robust `stop_app` and signal trapping
- [x] Ensure correct Primary/Replica detection logic (LiteFS 0.5: Primary = No `.primary` file)
- [x] Task: Conductor - User Manual Verification 'Phase 2: Script Implementation' (Protocol in workflow.md)
## Phase 3: Deployment and Verification [ ]
- [~] Task: Commit changes and push to Gitea to trigger build
- [ ] Task: Monitor Gitea build completion
- [ ] Task: Deploy updated Nomad job (forcing update if necessary)
- [ ] Task: Verify "Clean" state in Consul (only one primary registered)
- [ ] Task: Verify Failover/Stop behavior (immediate deregistration vs TTL expiry)
- [ ] Task: Conductor - User Manual Verification 'Phase 3: Deployment and Verification' (Protocol in workflow.md)

View File

@@ -0,0 +1,32 @@
# Specification: Implement TTL Heartbeat Service Registration (`implement_ttl_heartbeat`)
## Overview
Replace the current "register and forget" Consul registration logic with a robust "TTL Heartbeat" pattern. This ensures that only the active Primary node is registered in Consul, and service entries are automatically removed (deregistered) if the node crashes, failover occurs, or Nomad stops the allocation.
## Functional Requirements
- **Supervisor Script (`entrypoint.sh`):**
- Refactor to implement the "Self-Registration" pattern with TTL checks.
- **Leadership Detection:** Monitor `/data/.primary` (LiteFS 0.5).
- **Primary:** Absence of file. Start Navidrome, register service with TTL.
- **Replica:** Presence of file. Stop Navidrome, deregister service.
- **Heartbeat:** Periodically (e.g., every 5-10s) PUT to Consul to pass the TTL check while Primary.
- **Signal Handling:** Trap `SIGTERM`/`SIGINT` to gracefully stop Navidrome and deregister immediately.
- **Docker Image:**
- Ensure `curl` and `jq` are installed (prerequisites for the script).
- **Nomad Configuration:**
- Ensure `NOMAD_IP_http` and `NOMAD_PORT_http` are accessible to the task (standard, but verifying).
## Non-Functional Requirements
- **Resilience:** The script must handle Consul unavailability gracefully (retries) without crashing the application loop.
- **Cleanliness:** No "ghost" services. Replicas must not appear in the service catalog.
## Acceptance Criteria
- [ ] Navidrome runs ONLY on the Primary.
- [ ] Only ONE `navidrome` service is registered in Consul (pointing to the Primary).
- [ ] Stopping the Primary allocation results in immediate deregistration (via trap).
- [ ] Hard killing the Primary allocation results in deregistration after TTL expires (approx 15s).
- [ ] Replicas do not register any service.
## Implementation Details
- **Script Name:** We will stick with `entrypoint.sh` for consistency with `litefs.yml` configuration, refactoring its content.
- **Service ID:** Use `navidrome-${NOMAD_ALLOC_ID}` to ensure uniqueness and traceability.

View File

@@ -0,0 +1,5 @@
# Track update_monitor_discovery_20260208 Context
- [Specification](./spec.md)
- [Implementation Plan](./plan.md)
- [Metadata](./metadata.json)

View File

@@ -0,0 +1,8 @@
{
"track_id": "update_monitor_discovery_20260208",
"type": "enhancement",
"status": "new",
"created_at": "2026-02-08T20:00:00Z",
"updated_at": "2026-02-08T20:00:00Z",
"description": "Update cluster monitoring script to discover nodes via Nomad instead of Consul, ensuring all replicas are visible."
}

View File

@@ -0,0 +1,25 @@
# Plan: Update Monitor Discovery Logic (`update_monitor_discovery`)
## Phase 1: Nomad Discovery Enhancement [x] [checkpoint: 353683e]
- [x] Task: Update `nomad_client.py` to fetch job allocations with IPs (353683e)
- [x] Write tests for parsing allocation IPs from `nomad job status` or `nomad alloc status`
- [x] Implement `get_job_allocations(job_id)` returning a list of dicts (id, node, ip)
- [x] Task: Conductor - User Manual Verification 'Phase 1: Nomad Discovery Enhancement' (Protocol in workflow.md)
## Phase 2: Aggregator Refactor [x] [checkpoint: 655a9b2]
- [x] Task: Refactor `cluster_aggregator.py` to drive discovery via Nomad (655a9b2)
- [x] Update `get_cluster_status` to call `nomad_client.get_job_allocations` first
- [x] Update loop to iterate over allocations and supplement with LiteFS and Consul data
- [x] Task: Update `consul_client.py` to fetch all services once and allow lookup by IP/ID (655a9b2)
- [x] Task: Update tests for the new discovery flow (655a9b2)
- [x] Task: Conductor - User Manual Verification 'Phase 2: Aggregator Refactor' (Protocol in workflow.md)
## Phase 3: UI and Health Logic [x] [checkpoint: 21e9c3d]
- [x] Task: Update `output_formatter.py` for "Standby" nodes (21e9c3d)
- [x] Update table formatting to handle missing Consul status for replicas
- [x] Task: Update Cluster Health calculation (21e9c3d)
- [x] "Healthy" = 1 Primary (Consul passing) + N Replicas (LiteFS connected)
- [~] Task: Extract Uptime from Nomad and internal LiteFS states (txid, checksum)
- [~] Task: Update aggregator and formatter to display detailed database info
- [x] Task: Final verification run (21e9c3d)
- [x] Task: Conductor - User Manual Verification 'Phase 3: Final Verification' (Protocol in workflow.md)

View File

@@ -0,0 +1,30 @@
# Specification: Update Monitor Discovery Logic (`update_monitor_discovery`)
## Overview
Refactor the cluster monitoring script (`scripts/cluster_status`) to use Nomad as the primary source of discovery. Currently, the script queries Consul for services, which only shows the Primary node in the new architecture. By querying Nomad allocations first, we can identify all running LiteFS nodes (Primary and Replicas) and then inspect their individual health and replication status.
## Functional Requirements
- **Nomad Client (`nomad_client.py`):**
- Add a function to list all active allocations for a job and extract their host IPs and node names.
- **Consul Client (`consul_client.py`):**
- Modify to allow checking the registration status of a *specific* node/allocation ID rather than just listing all services.
- **Aggregator (`cluster_aggregator.py`):**
- **New Discovery Flow:**
1. Query Nomad for all allocations of `navidrome-litefs`.
2. For each allocation:
- Get the Node Name and IP.
- Query the LiteFS API (`:20202`) on that IP for role/DB info.
- Query Consul to see if a matching service registration exists (and its health).
- **Formatter (`output_formatter.py`):**
- Handle nodes that are "Standby" (running in Nomad and LiteFS, but not registered in Consul).
- Ensure the table correctly displays all 4 nodes.
## Non-Functional Requirements
- **Efficiency:** Minimize CLI calls by batching Nomad/Consul queries where possible.
- **Robustness:** Gracefully handle cases where an allocation has no IP yet (starting state).
## Acceptance Criteria
- [ ] Script output shows all 4 Nomad allocations.
- [ ] Primary node is clearly identified with its Consul health status.
- [ ] Replica nodes are shown with their LiteFS role and DB status, even if not in Consul.
- [ ] Overall cluster health is calculated based on the existence of exactly one Primary and healthy replication on all nodes.

View File

@@ -14,3 +14,4 @@ A highly available and durable personal music streaming service built on Navidro
- **High-Quality Streaming:** Support for advanced audio formats and on-the-fly transcoding (Opus/FLAC) to ensure the best possible listening experience.
- **Universal Compatibility:** Full support for the Subsonic API to allow connection from a wide variety of mobile and desktop music clients.
- **Automated Infrastructure:** Managed by Nomad and Consul for seamless cluster operations and service discovery.
- **Robust High Availability:** Automatic failover with TTL-based self-registration for clean and resilient service catalog management.

View File

@@ -8,6 +8,10 @@
## Storage & Database
- **SQLite:** The primary relational database used by Navidrome for metadata and state.
- **LiteFS:** A FUSE-based filesystem that provides synchronous replication of the SQLite database across the cluster.
- **Process Management:** LiteFS-supervised with TTL-based self-registration for clean and resilient service catalog management.
- **Hybrid Storage Model:**
- **Replicated:** SQLite database on LiteFS FUSE mount (`/data`).
- **Shared:** Plugins, cache, and backups on persistent network storage (`/shared_data`).
## Automation & Delivery
- **Gitea Actions:** Automates the multi-arch (AMD64/ARM64) building and pushing of the custom supervised container image.
@@ -18,4 +22,4 @@
- **LiteFS Proxy:** Handles transparent write-forwarding to the cluster leader.
## Monitoring & Tooling
- **Python (Cluster Status Script):** A local CLI tool for monitoring Consul service registration, LiteFS replication status, and diagnosing Nomad allocation logs.
- **Python (Cluster Status Script):** A local CLI tool for hybrid monitoring using Nomad (discovery & uptime), Consul (service health), and LiteFS HTTP API (internal replication state).

View File

@@ -2,3 +2,4 @@
This file tracks all major tracks for the project. Each track has its own detailed plan in its respective folder.
---
---

View File

@@ -1,30 +0,0 @@
# Plan: Cluster Diagnosis and Script Enhancement (`diagnose_and_enhance`)
## Phase 1: Enhanced Diagnostics (Consul) [x] [checkpoint: a686c5b]
- [x] Task: Update `consul_client.py` to fetch detailed health check output
- [x] Write tests for fetching `Output` field from Consul checks
- [x] Implement logic to extract and store the `Output` (error message)
- [x] Task: Update aggregator and formatter to display Consul errors
- [x] Update aggregation logic to include `consul_error`
- [x] Update table formatter to indicate an error (maybe a flag or color)
- [x] Add a "Diagnostics" section to the output to print full error details
- [x] Task: Conductor - User Manual Verification 'Phase 1: Enhanced Diagnostics (Consul)' (Protocol in workflow.md)
## Phase 2: Nomad Integration and Logs [x] [checkpoint: 6d77729]
- [x] Task: Implement `nomad_client.py` wrapper
- [x] Write tests for `get_allocation_logs`, `get_node_status`, and `restart_allocation` (mocking subprocess)
- [x] Implement `subprocess.run(["nomad", ...])` logic to fetch logs and restart allocations
- [x] Task: Integrate Nomad logs into diagnosis
- [x] Update aggregator to call Nomad client for critical nodes
- [x] Update "Diagnostics" section to display the last 20 lines of stderr
- [x] Task: Conductor - User Manual Verification 'Phase 2: Nomad Integration and Logs' (Protocol in workflow.md)
## Phase 3: Advanced LiteFS Status [ ]
- [ ] Task: Implement `litefs_status` via `nomad alloc exec`
- [ ] Write tests for executing remote commands via Nomad
- [ ] Update `litefs_client.py` to fallback to `nomad alloc exec` if HTTP fails
- [ ] Parse `litefs status` output (text/json) to extract uptime and replication lag
- [ ] Task: Final Polish and Diagnosis Run
- [ ] Ensure all pieces work together
- [ ] Run the script to diagnose `odroid8`
- [ ] Task: Conductor - User Manual Verification 'Phase 3: Advanced LiteFS Status' (Protocol in workflow.md)

View File

@@ -1,26 +0,0 @@
# Plan: Fix Odroid8 and Script Robustness (`fix_odroid8_and_script`)
## Phase 1: Script Robustness [x] [checkpoint: 860000b]
- [x] Task: Update `nomad_client.py` to handle subprocess errors gracefully
- [x] Write tests for handling Nomad CLI absence/failure
- [x] Update implementation to return descriptive error objects or `None` without crashing
- [x] Task: Update aggregator and formatter to handle Nomad errors
- [x] Update `cluster_aggregator.py` to gracefully skip Nomad calls if they fail
- [x] Update `output_formatter.py` to display "Nomad Error" in relevant cells
- [x] Add a global "Nomad Connectivity Warning" to the summary
- [x] Task: Conductor - User Manual Verification 'Phase 1: Script Robustness' (Protocol in workflow.md)
## Phase 2: Odroid8 Recovery [ ]
- [x] Task: Identify and verify `odroid8` LiteFS data path
- [x] Run `nomad alloc status` to find the volume mount for `odroid8`
- [x] Provide the user with the exact host path to the LiteFS data
- [x] Task: Guide user through manual cleanup
- [x] Provide steps to stop the allocation
- [x] Provide the `rm` command to clear the LiteFS metadata
- [x] Provide steps to restart and verify the node
- [~] Task: Conductor - User Manual Verification 'Phase 2: Odroid8 Recovery' (Protocol in workflow.md)
## Phase 3: Final Verification [x]
- [x] Task: Final verification run of the script
- [x] Task: Verify cluster health in Consul and LiteFS API
- [x] Task: Conductor - User Manual Verification 'Phase 3: Final Verification' (Protocol in workflow.md)

View File

@@ -3,83 +3,151 @@ set -e
# Configuration from environment
SERVICE_NAME="navidrome"
# Use Nomad allocation ID for a unique service ID
SERVICE_ID="${SERVICE_NAME}-${NOMAD_ALLOC_ID:-$(hostname)}"
PORT=4533
CONSUL_HTTP_ADDR="${CONSUL_URL:-http://localhost:8500}"
NODE_IP="${ADVERTISE_IP}"
DB_LOCK_FILE="/data/.primary"
NAVIDROME_PID=0
# Tags for the Primary service (Traefik enabled)
PRIMARY_TAGS='["navidrome","web","traefik.enable=true","urlprefix-/navidrome","tools","traefik.http.routers.navidromelan.rule=Host(`navidrome.service.dc1.consul`)","traefik.http.routers.navidromewan.rule=Host(`m.fbleagh.duckdns.org`)","traefik.http.routers.navidromewan.middlewares=dex@consulcatalog","traefik.http.routers.navidromewan.tls=true"]'
NAVIDROME_PID=""
SERVICE_ID="navidrome-${NODE_IP}-${SERVICE_NAME}"
# --- Helper Functions ---
cleanup() {
echo "Caught signal, shutting down..."
if [ -n "$NAVIDROME_PID" ]; then
echo "Stopping Navidrome (PID: $NAVIDROME_PID)..."
kill -TERM "$NAVIDROME_PID"
wait "$NAVIDROME_PID" || true
fi
echo "Deregistering service ${SERVICE_ID} from Consul..."
curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/deregister/${SERVICE_ID}" || true
exit 0
# Backup Database (Only on Primary)
run_backup() {
local backup_dir="/shared_data/backup"
local timestamp=$(date +%Y%m%d_%H%M%S)
local backup_file="${backup_dir}/navidrome.db_${timestamp}.bak"
echo "Backing up database to ${backup_file}..."
mkdir -p "$backup_dir"
if litefs export -name navidrome.db "$backup_file"; then
echo "Backup successful."
# Keep only last 7 days
find "$backup_dir" -name "navidrome.db_*.bak" -mtime +7 -delete
echo "Old backups cleaned."
else
echo "ERROR: Backup failed!"
fi
}
trap cleanup SIGTERM SIGINT
# Register Service with TTL Check
register_service() {
echo "Promoted! Registering service ${SERVICE_ID}..."
# Convert bash list string to JSON array if needed, but PRIMARY_TAGS is already JSON-like
curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/register" -d "{
\"ID\": \"${SERVICE_ID}\",
\"Name\": \"${SERVICE_NAME}\",
\"Tags\": ${PRIMARY_TAGS},
\"Address\": \"${NODE_IP}\",
\"Port\": ${PORT},
\"Check\": {
\"DeregisterCriticalServiceAfter\": \"1m\",
\"TTL\": \"15s\"
}
}"
}
echo "Starting leadership-aware entrypoint..."
# Send Heartbeat to Consul
pass_ttl() {
curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/check/pass/service:${SERVICE_ID}" > /dev/null
}
# Deregister Service
deregister_service() {
echo "Demoted/Stopping. Deregistering service ${SERVICE_ID}..."
curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/deregister/${SERVICE_ID}"
}
# Start Navidrome in Background
start_app() {
echo "Node is Primary. Starting Navidrome..."
# Ensure shared directories exist
mkdir -p /shared_data/plugins /shared_data/cache /shared_data/backup
/app/navidrome &
NAVIDROME_PID=$!
echo "Navidrome started with PID ${NAVIDROME_PID}"
}
# Stop Navidrome
stop_app() {
if [ "${NAVIDROME_PID}" -gt 0 ]; then
echo "Stopping Navidrome (PID ${NAVIDROME_PID})..."
kill -SIGTERM "${NAVIDROME_PID}"
wait "${NAVIDROME_PID}" 2>/dev/null || true
NAVIDROME_PID=0
fi
}
# --- Signal Handling (The Safety Net) ---
# If Nomad stops the container, we stop the app and deregister.
cleanup() {
echo "Caught signal, shutting down..."
stop_app
deregister_service
exit 0
}
trap cleanup TERM INT
# --- Main Loop ---
echo "Starting Supervisor. Waiting for leadership settle..."
echo "Node IP: $NODE_IP"
echo "Consul: $CONSUL_HTTP_ADDR"
# Small sleep to let LiteFS settle and leadership election complete
sleep 5
LAST_BACKUP_TIME=0
BACKUP_INTERVAL=86400 # 24 hours
while true; do
# In LiteFS 0.5, .primary file exists ONLY on replicas.
if [ ! -f /data/.primary ]; then
# PRIMARY STATE
if [ -z "$NAVIDROME_PID" ] || ! kill -0 "$NAVIDROME_PID" 2>/dev/null; then
echo "Node is Primary. Initializing Navidrome..."
# In LiteFS 0.5, .primary file exists ONLY on replicas.
if [ ! -f "$DB_LOCK_FILE" ]; then
# === WE ARE PRIMARY ===
# Register in Consul
echo "Registering as primary in Consul..."
curl -s -X PUT -d "{
\"ID\": \"${SERVICE_ID}\",
\"Name\": \"${SERVICE_NAME}\",
\"Tags\": ${PRIMARY_TAGS},
\"Address\": \"${NODE_IP}\",
\"Port\": ${PORT},
\"Check\": {
\"HTTP\": \"http://${NODE_IP}:${PORT}/app\",
\"Interval\": \"10s\",
\"Timeout\": \"2s\"
}
}" "${CONSUL_HTTP_ADDR}/v1/agent/service/register"
echo "Starting Navidrome with ND_DATABASE_PATH=/data/navidrome.db"
export ND_DATABASE_PATH="/data/navidrome.db"
export ND_DATAFOLDER="/local/data"
# Start Navidrome
/app/navidrome &
NAVIDROME_PID=$!
echo "Navidrome started with PID $NAVIDROME_PID"
fi
else
# REPLICA STATE
if [ -n "$NAVIDROME_PID" ] && kill -0 "$NAVIDROME_PID" 2>/dev/null; then
echo "Node transitioned to Replica. Stopping Navidrome..."
kill -TERM "$NAVIDROME_PID"
wait "$NAVIDROME_PID" || true
NAVIDROME_PID=""
echo "Deregistering primary service from Consul..."
curl -s -X PUT "${CONSUL_HTTP_ADDR}/v1/agent/service/deregister/${SERVICE_ID}" || true
fi
# We don't register anything for replicas in this version to keep it simple.
# But we stay alive so LiteFS keeps running.
# 1. If App is not running, start it and register
if [ "${NAVIDROME_PID}" -eq 0 ] || ! kill -0 "${NAVIDROME_PID}" 2>/dev/null; then
if [ "${NAVIDROME_PID}" -gt 0 ]; then
echo "CRITICAL: Navidrome crashed! Restarting..."
fi
start_app
register_service
fi
sleep 5
# 2. Maintain the heartbeat (TTL)
pass_ttl
# 3. Handle periodic backup
CURRENT_TIME=$(date +%s)
if [ $((CURRENT_TIME - LAST_BACKUP_TIME)) -ge $BACKUP_INTERVAL ]; then
run_backup
LAST_BACKUP_TIME=$CURRENT_TIME
fi
else
# === WE ARE REPLICA ===
# If App is running (we were just demoted), stop it
if [ "${NAVIDROME_PID}" -gt 0 ]; then
echo "Lost leadership. Demoting..."
stop_app
deregister_service
# Reset backup timer so the next primary can start fresh or we start fresh if promoted again
LAST_BACKUP_TIME=0
fi
# No service registration exists for replicas to keep Consul clean.
fi
# Sleep short enough to update TTL (every 5s is safe for 15s TTL)
sleep 5 &
wait $! # Wait allows the 'trap' to interrupt the sleep instantly
done

View File

@@ -1,85 +0,0 @@
job "navidrome-litefs" {
datacenters = ["dc1"]
type = "service"
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
group "navidrome" {
count = 4
update {
max_parallel = 1
min_healthy_time = "30s"
healthy_deadline = "5m"
auto_revert = false
}
constraint {
distinct_hosts = true
}
network {
# Request static ports on the host
port "http" {
static = 4533
to = 4533 # Direct to Navidrome
}
port "litefs" {
static = 20202
to = 20202 # Maps host 20202 to container 20202 (LiteFS Replication)
}
}
task "navidrome" {
driver = "docker"
config {
image = "gitea.service.dc1.fbleagh.duckdns.org/sstent/navidrome-litefs:latest"
privileged = true # Still needed for FUSE
ports = ["http", "litefs"]
force_pull = true
volumes = [
"/mnt/configs/navidrome_litefs:/var/lib/litefs",
"/mnt/Public/configs/navidrome:/shared_data",
"/mnt/Public/Downloads/Clean_Music:/music/CleanMusic:ro",
"/mnt/Public/Downloads/news/slskd/downloads:/music/slskd:ro",
"/mnt/Public/Downloads/incoming_music:/music/incomingmusic:ro"
]
}
env {
# LiteFS Config
CONSUL_URL = "http://${attr.unique.network.ip-address}:8500"
ADVERTISE_IP = "${attr.unique.network.ip-address}"
PORT = "8080" # Internal proxy port (unused but kept)
# Navidrome Config
ND_DATAFOLDER = "/local/data"
ND_CACHEFOLDER = "/shared_data/cache"
ND_BACKUP_PATH = "/shared_data/backup"
ND_CONFIGFILE = "/local/data/navidrome.toml"
# Database is on the LiteFS FUSE mount. Forced v35.
ND_DATABASE_PATH = "/data/navidrome.db"
ND_SCANSCHEDULE = "0"
ND_SCANNER_FSWATCHER_ENABLED = "false"
ND_FORCE_REDEPLOY = "2"
ND_LOGLEVEL = "info"
ND_REVERSEPROXYWHITELIST = "0.0.0.0/0"
ND_REVERSEPROXYUSERHEADER = "X-Forwarded-User"
}
# NO service block here! Managed by register.sh inside the container.
resources {
cpu = 500
memory = 512
}
}
}
}

View File

@@ -1,179 +1,83 @@
variable "container_sha" {
type = string
default = "045fc6e82b9ecb6bebc1f095f62498935df70bbf"
}
job "navidrome-litefs" {
datacenters = ["dc1"]
type = "service"
# We pin to Linux because LiteFS requires FUSE
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
group "navidrome" {
count = 2
count = 4
update {
max_parallel = 1
min_healthy_time = "30s"
healthy_deadline = "5m"
auto_revert = false
}
constraint {
distinct_hosts = true
}
network {
mode = "host"
port "http" {}
}
# --- Setup Task ---
task "setup" {
driver = "docker"
lifecycle {
hook = "prestart"
sidecar = false
# Request static ports on the host
port "http" {
static = 4533
to = 4533 # Direct to Navidrome
}
config {
image = "busybox"
command = "mkdir"
args = ["-p", "/alloc/sqlite"]
network_mode = "host"
port "litefs" {
static = 20202
to = 20202 # Maps host 20202 to container 20202 (LiteFS Replication)
}
}
# --- LiteFS Task ---
task "litefs" {
driver = "docker"
config {
image = "flyio/litefs:0.5"
privileged = true # Needed for FUSE
ports = ["http"]
network_mode = "host"
# 1. Bind mount for LiteFS internal data (chunks/WAL)
# 2. Bind mount for the config
# 3. Mount the shared alloc dir so we can mount FUSE on it
volumes = [
"/mnt/configs/navidrome_litefs:/var/lib/litefs",
"local/litefs.yml:/etc/litefs.yml"
]
mounts = [
{
type = "bind"
source = "../alloc/sqlite"
target = "/mnt/sqlite"
bind_options = {
propagation = "shared"
}
}
]
}
# Create the config file
template {
left_delimiter = "[["
right_delimiter = "]]"
data = <<EOF
fuse:
# This matches the internal mount point in the container
dir: "/mnt/sqlite"
data:
# Internal data storage
dir: "/var/lib/litefs"
# Use Consul for leader election
lease:
type: "consul"
consul:
url: "http://[[ env `attr.unique.network.ip-address` ]]:8500"
key: "litefs/navidrome"
# The HTTP Proxy routes traffic
proxy:
addr: ":[[ env `NOMAD_PORT_http` ]]"
target: "127.0.0.1:4533" # Navidrome's internal port
db: "navidrome.db" # The DB to track for transaction consistency
passthrough: # Paths that don't need write-forwarding (optional optimizations)
- "*.js"
- "*.css"
- "*.png"
EOF
destination = "local/litefs.yml"
}
resources {
cpu = 200
memory = 256
}
}
# --- Navidrome Task (The App) ---
task "navidrome" {
driver = "docker"
config {
image = "ghcr.io/navidrome/navidrome:latest"
memory_hard_limit = "2048"
ports = [] # No ports exposed directly!
network_mode = "host"
image = "gitea.service.dc1.fbleagh.duckdns.org/sstent/navidrome-litefs:${var.container_sha}"
privileged = true # Still needed for FUSE
ports = ["http", "litefs"]
force_pull = true
# We mount the sqlite dir from the allocation directory
mounts = [
{
type = "bind"
source = "../alloc/sqlite"
target = "/data"
bind_options = {
propagation = "shared"
}
}
]
# Shared Music and Configs
volumes = [
"/mnt/configs/navidrome_litefs:/var/lib/litefs",
"/mnt/Public/configs/navidrome:/shared_data",
"/mnt/Public/Downloads/Clean_Music:/music/CleanMusic:ro",
"/mnt/Public/Downloads/news/slskd/downloads:/music/slskd:ro",
"/mnt/Public/Downloads/incoming_music:/music/incomingmusic:ro",
"/mnt/Public/configs/navidrome:/shared_data"
"/mnt/Public/Downloads/incoming_music:/music/incomingmusic:ro"
]
}
env {
ND_DATAFOLDER = "/local/data"
ND_CACHEFOLDER = "/shared_data/cache"
ND_CONFIGFILE= "/local/data/navidrome.toml"
# LiteFS Config
CONSUL_URL = "http://${attr.unique.network.ip-address}:8500"
ADVERTISE_IP = "${attr.unique.network.ip-address}"
PORT = "8080" # Internal proxy port (unused but kept)
# Important: LiteFS handles locking, but we still want WAL mode.
ND_DBPATH = "/data/navidrome.db?_busy_timeout=30000&_journal_mode=WAL&_foreign_keys=on&synchronous=NORMAL"
# Navidrome Config
ND_DATAFOLDER = "/data"
ND_PLUGINS_FOLDER = "/shared_data/plugins"
ND_CACHEFOLDER = "/shared_data/cache"
ND_BACKUP_PATH = "/shared_data/backup"
ND_BACKUPSCHEDULE = ""
# Disable internal scheduling to prevent redundant scans on secondary nodes.
ND_SCANSCHEDULE = "0"
ND_SCANNER_FSWATCHER_ENABLED = "false"
ND_LOGLEVEL = "info"
ND_REVERSEPROXYWHITELIST = "0.0.0.0/0"
ND_REVERSEPROXYUSERHEADER = "X-Forwarded-User"
ND_FORCE_REDEPLOY = "5"
ND_LOGLEVEL = "info"
ND_REVERSEPROXYWHITELIST = "0.0.0.0/0"
ND_REVERSEPROXYUSERHEADER = "X-Forwarded-User"
}
service {
name = "navidrome"
tags = [
"navidrome",
"web",
"traefik.enable=true",
"urlprefix-/navidrome",
"tools",
"traefik.http.routers.navidromelan.rule=Host(`navidrome.service.dc1.consul`)",
"traefik.http.routers.navidromewan.rule=Host(`m.fbleagh.duckdns.org`)",
"traefik.http.routers.navidromewan.middlewares=dex@consulcatalog",
"traefik.http.routers.navidromewan.tls=true",
]
port = "http" # This maps to the LiteFS proxy port defined in network block
check {
type = "http"
path = "/app" # LiteFS proxy passes this through
interval = "10s"
timeout = "2s"
}
}
# NO service block here! Managed by register.sh inside the container.
resources {
cpu = 500

View File

@@ -4,51 +4,72 @@ import nomad_client
def get_cluster_status(consul_url, job_id="navidrome-litefs"):
"""
Aggregates cluster data from Consul, LiteFS, and Nomad.
Aggregates cluster data from Nomad (Discovery), LiteFS (Role), and Consul (Routing Health).
"""
consul_nodes = consul_client.get_cluster_services(consul_url)
aggregated_nodes = []
# 1. Discover all nodes via Nomad Allocations
allocations = nomad_client.get_job_allocations(job_id)
nomad_available = bool(nomad_client.get_node_map())
# 2. Get all Consul registrations for 'navidrome'
consul_services = consul_client.get_cluster_services(consul_url)
# Create a map for easy lookup by IP
consul_map = {s["address"]: s for s in consul_services}
aggregated_nodes = []
is_healthy = True
primary_count = 0
# Check Nomad connectivity
node_map = nomad_client.get_node_map()
nomad_available = bool(node_map)
for alloc in allocations:
node_name = alloc["node"]
address = alloc["ip"]
alloc_id = alloc["id"]
for node in consul_nodes:
# Fetch allocation ID first to enable nomad exec fallback
alloc_id = nomad_client.get_allocation_id(node["node"], job_id)
# 3. Get LiteFS Status
litefs_status = litefs_client.get_node_status(address, alloc_id=alloc_id)
litefs_status = litefs_client.get_node_status(node["address"], alloc_id=alloc_id)
# 4. Match with Consul info
consul_info = consul_map.get(address)
# Merge data
node_data = {
**node,
"node": node_name,
"address": address,
"alloc_id": alloc_id,
"litefs_primary": litefs_status.get("is_primary", False),
"uptime": litefs_status.get("uptime", "N/A"),
"advertise_url": litefs_status.get("advertise_url", ""),
"candidate": litefs_status.get("candidate", False),
"uptime": alloc.get("uptime", "N/A"),
"replication_lag": litefs_status.get("replication_lag", "N/A"),
"litefs_error": litefs_status.get("error", None),
"nomad_logs": None,
"alloc_id": alloc_id
"dbs": litefs_status.get("dbs", {}),
"litefs_error": litefs_status.get("error"),
"nomad_logs": None
}
if node["status"] != "passing":
is_healthy = False
# Fetch Nomad logs for critical nodes
if alloc_id:
node_data["nomad_logs"] = nomad_client.get_allocation_logs(alloc_id)
# Legacy compat for formatter
node_data["active_dbs"] = list(node_data["dbs"].keys())
if node_data["litefs_primary"]:
primary_count += 1
# Check for active databases
node_dbs = litefs_status.get("dbs", {})
if node_dbs:
node_data["active_dbs"] = list(node_dbs.keys())
node_data["role"] = "primary"
else:
node_data["active_dbs"] = []
node_data["role"] = "replica"
# 5. Determine Consul status
if consul_info:
node_data["status"] = consul_info["status"]
node_data["check_output"] = consul_info["check_output"]
if node_data["status"] != "passing":
is_healthy = False
node_data["nomad_logs"] = nomad_client.get_allocation_logs(alloc_id)
else:
# Not in Consul
if node_data["litefs_primary"]:
# If it's primary in LiteFS but not in Consul, that's an error (unless just started)
node_data["status"] = "unregistered"
is_healthy = False
node_data["nomad_logs"] = nomad_client.get_allocation_logs(alloc_id)
else:
# Replicas are expected to be unregistered in the new model
node_data["status"] = "standby"
node_data["check_output"] = "Clean catalog (expected for replica)"
aggregated_nodes.append(node_data)
@@ -56,7 +77,8 @@ def get_cluster_status(consul_url, job_id="navidrome-litefs"):
health = "Healthy"
if not is_healthy:
health = "Unhealthy"
elif primary_count == 0:
if primary_count == 0:
health = "No Primary Detected"
elif primary_count > 1:
health = "Split Brain Detected (Multiple Primaries)"

View File

@@ -2,55 +2,45 @@ import requests
def get_cluster_services(consul_url):
"""
Queries Consul health API for navidrome and replica-navidrome services.
Queries Consul health API for all 'navidrome' services.
Returns a list of dictionaries with node info.
"""
services = []
# Define roles to fetch
role_map = {
"navidrome": "primary",
"replica-navidrome": "replica"
}
url = f"{consul_url}/v1/health/service/navidrome"
try:
response = requests.get(url, timeout=5)
response.raise_for_status()
data = response.json()
for service_name, role in role_map.items():
url = f"{consul_url}/v1/health/service/{service_name}"
try:
response = requests.get(url, timeout=5)
response.raise_for_status()
data = response.json()
for item in data:
node_name = item["Node"]["Node"]
address = item["Node"]["Address"]
port = item["Service"]["Port"]
for item in data:
node_name = item["Node"]["Node"]
address = item["Node"]["Address"]
port = item["Service"]["Port"]
# Determine overall status from checks and extract output
checks = item.get("Checks", [])
status = "passing"
check_output = ""
for check in checks:
if check["Status"] != "passing":
status = check["Status"]
# Determine overall status from checks and extract output
checks = item.get("Checks", [])
status = "passing"
check_output = ""
for check in checks:
if check["Status"] != "passing":
status = check["Status"]
check_output = check.get("Output", "")
break
else:
if not check_output:
check_output = check.get("Output", "")
break
else:
# Even if passing, store the output of the first check if it's the only one
if not check_output:
check_output = check.get("Output", "")
services.append({
"node": node_name,
"address": address,
"port": port,
"role": role,
"status": status,
"service_id": item["Service"]["ID"],
"check_output": check_output
})
except Exception as e:
# For now, we just don't add the service if it fails to fetch
# In a real script we might want to report the error
print(f"Error fetching {service_name}: {e}")
services.append({
"node": node_name,
"address": address,
"port": port,
"role": "primary", # If it's in Consul as 'navidrome', it's intended to be primary
"status": status,
"service_id": item["Service"]["ID"],
"check_output": check_output
})
except Exception as e:
print(f"Error fetching navidrome services from Consul: {e}")
return services

View File

@@ -59,7 +59,8 @@ def get_node_status(node_address, port=20202, alloc_id=None):
store = data.get("store", {})
status = {
"is_primary": store.get("isPrimary", False),
"uptime": "N/A",
"candidate": store.get("candidate", False),
"uptime": "N/A", # Will be filled by Nomad uptime
"advertise_url": f"http://{node_address}:{port}",
"dbs": store.get("dbs", {})
}

View File

@@ -1,6 +1,7 @@
import subprocess
import re
import sys
from datetime import datetime, timezone
def get_node_map():
"""
@@ -29,6 +30,93 @@ def get_node_map():
print(f"Error getting node map: {e}", file=sys.stderr)
return {}
def get_job_allocations(job_id):
"""
Returns a list of all active allocations for a job with their IPs and uptimes.
"""
try:
# 1. Get list of allocations
result = subprocess.run(
["nomad", "job", "status", job_id],
capture_output=True, text=True, check=True
)
alloc_ids = []
lines = result.stdout.splitlines()
start_parsing = False
for line in lines:
if "Allocations" in line:
start_parsing = True
continue
if start_parsing and line.strip() and not line.startswith("ID") and not line.startswith("=="):
parts = re.split(r"\s+", line.strip())
if len(parts) >= 5:
alloc_id = parts[0]
# Status is usually the 6th or 8th column depending on verbose
# We'll look for 'running' in any part from 3 onwards
if any(p == "running" for p in parts[3:]):
alloc_ids.append(alloc_id)
# 2. For each allocation, get its IP and Uptime
allocations = []
now = datetime.now(timezone.utc)
for alloc_id in alloc_ids:
res_alloc = subprocess.run(
["nomad", "alloc", "status", alloc_id],
capture_output=True, text=True, check=True
)
node_name = ""
ip = ""
full_id = alloc_id
uptime = "N/A"
for l in res_alloc.stdout.splitlines():
if l.startswith("ID") and "=" in l:
full_id = l.split("=")[1].strip()
if l.startswith("Node Name") and "=" in l:
node_name = l.split("=")[1].strip()
# Extract IP from Allocation Addresses
if "*litefs" in l:
# e.g. "*litefs yes 1.1.1.1:20202 -> 20202"
m = re.search(r"(\d+\.\d+\.\d+\.\d+):", l)
if m:
ip = m.group(1)
# Extract Uptime from Started At
if "Started At" in l and "=" in l:
# e.g. "Started At = 2026-02-09T14:04:28Z"
ts_str = l.split("=")[1].strip()
if ts_str and ts_str != "N/A":
try:
# Parse ISO timestamp
started_at = datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
duration = now - started_at
# Format duration
secs = int(duration.total_seconds())
if secs < 60:
uptime = f"{secs}s"
elif secs < 3600:
uptime = f"{secs//60}m{secs%60}s"
else:
uptime = f"{secs//3600}h{(secs%3600)//60}m"
except Exception:
uptime = ts_str
allocations.append({
"id": full_id,
"node": node_name,
"ip": ip,
"uptime": uptime
})
return allocations
except Exception as e:
print(f"Error getting job allocations: {e}", file=sys.stderr)
return []
def get_allocation_id(node_name, job_id):
"""
Finds the FULL allocation ID for a specific node and job.

View File

@@ -38,13 +38,21 @@ def format_node_table(nodes, use_color=True):
"""
Formats the node list as a table.
"""
headers = ["Node", "Role", "Consul Status", "LiteFS Role", "Uptime", "Lag", "DBs", "LiteFS Info"]
headers = ["Node", "Role", "Consul Status", "LiteFS Role", "Cand", "Uptime", "Lag", "DBs", "LiteFS Info"]
table_data = []
for node in nodes:
# Consul status color
status = node["status"]
status_color = GREEN if status == "passing" else RED
if status == "passing":
status_color = GREEN
elif status == "standby":
status_color = CYAN
elif status == "unregistered":
status_color = YELLOW
else:
status_color = RED
colored_status = colorize(status, status_color, use_color)
# Role color
@@ -56,6 +64,11 @@ def format_node_table(nodes, use_color=True):
litefs_primary = node["litefs_primary"]
litefs_role = "primary" if litefs_primary else "replica"
# Candidate status
candidate = "" if node.get("candidate") else ""
candidate_color = GREEN if node.get("candidate") else RESET
colored_candidate = colorize(candidate, candidate_color, use_color)
# Highlight discrepancy if consul and litefs disagree
litefs_role_color = RESET
if (role == "primary" and not litefs_primary) or (role == "replica" and litefs_primary):
@@ -66,21 +79,33 @@ def format_node_table(nodes, use_color=True):
colored_litefs_role = colorize(litefs_role, litefs_role_color, use_color)
# Database details (name, txid, checksum)
db_infos = []
dbs = node.get("dbs", {})
for db_name, db_data in dbs.items():
txid = db_data.get("txid", "0")
chk = db_data.get("checksum", "0")
# Only show first 8 chars of checksum for space
db_infos.append(f"{db_name} (tx:{int(txid, 16)}, chk:{chk[:8]})")
db_str = "\n".join(db_infos) if db_infos else "None"
# Error info
info = ""
if node.get("litefs_error"):
info = colorize("LiteFS API Error", RED, use_color)
else:
info = node.get("advertise_url", "")
info = node.get("address", "")
table_data.append([
colorize(node["node"], BOLD, use_color),
colored_role,
colored_status,
colored_litefs_role,
colored_candidate,
node.get("uptime", "N/A"),
node.get("replication_lag", "N/A"),
", ".join(node.get("active_dbs", [])),
db_str,
info
])
@@ -90,7 +115,9 @@ def format_diagnostics(nodes, use_color=True):
"""
Formats detailed diagnostic information for nodes with errors.
"""
error_nodes = [n for n in nodes if n["status"] != "passing" or n.get("litefs_error")]
# Only show diagnostics if status is critical/unregistered OR if there is a LiteFS error
# Ignore 'standby' since it is expected for replicas
error_nodes = [n for n in nodes if (n["status"] not in ["passing", "standby"]) or n.get("litefs_error")]
if not error_nodes:
return ""

View File

@@ -1,29 +1,32 @@
import pytest
from unittest.mock import patch
from unittest.mock import patch, MagicMock
import cluster_aggregator
@patch("consul_client.get_cluster_services")
@patch("litefs_client.get_node_status")
@patch("nomad_client.get_allocation_id")
@patch("nomad_client.get_allocation_logs")
@patch("nomad_client.get_job_allocations")
@patch("nomad_client.get_node_map")
def test_aggregate_cluster_status(mock_node_map, mock_nomad_logs, mock_nomad_id, mock_litefs, mock_consul):
"""Test aggregating Consul and LiteFS data."""
def test_aggregate_cluster_status(mock_node_map, mock_nomad_allocs, mock_litefs, mock_consul):
"""Test aggregating Nomad, Consul and LiteFS data."""
mock_node_map.return_value = {"id": "name"}
# Mock Consul data
# Mock Nomad allocations
mock_nomad_allocs.return_value = [
{"id": "alloc1", "node": "node1", "ip": "1.1.1.1"},
{"id": "alloc2", "node": "node2", "ip": "1.1.1.2"}
]
# Mock Consul data (only node1 is registered as primary)
mock_consul.return_value = [
{"node": "node1", "address": "1.1.1.1", "role": "primary", "status": "passing"},
{"node": "node2", "address": "1.1.1.2", "role": "replica", "status": "passing"}
{"node": "node1", "address": "1.1.1.1", "role": "primary", "status": "passing", "check_output": "OK"}
]
# Mock LiteFS data
def litefs_side_effect(addr, **kwargs):
if addr == "1.1.1.1":
return {"is_primary": True, "uptime": 100, "advertise_url": "url1", "dbs": {"db1": {}}}
return {"is_primary": False, "uptime": 50, "advertise_url": "url2", "replication_lag": 10, "dbs": {"db1": {}}}
return {"is_primary": True, "candidate": True, "uptime": 100, "dbs": {"db1": {"txid": "0000000000000001", "checksum": "abc"}}}
return {"is_primary": False, "candidate": True, "uptime": 50, "dbs": {"db1": {"txid": "0000000000000001", "checksum": "abc"}}}
mock_litefs.side_effect = litefs_side_effect
mock_nomad_id.return_value = None
cluster_data = cluster_aggregator.get_cluster_status("http://consul:8500")
@@ -32,27 +35,27 @@ def test_aggregate_cluster_status(mock_node_map, mock_nomad_logs, mock_nomad_id,
node1 = next(n for n in cluster_data["nodes"] if n["node"] == "node1")
assert node1["litefs_primary"] is True
assert node1["role"] == "primary"
node2 = next(n for n in cluster_data["nodes"] if n["node"] == "node2")
assert node2["litefs_primary"] is False
assert node2["replication_lag"] == 10
assert node1["candidate"] is True
assert "db1" in node1["dbs"]
@patch("consul_client.get_cluster_services")
@patch("litefs_client.get_node_status")
@patch("nomad_client.get_allocation_id")
@patch("nomad_client.get_job_allocations")
@patch("nomad_client.get_allocation_logs")
@patch("nomad_client.get_node_map")
def test_aggregate_cluster_status_unhealthy(mock_node_map, mock_nomad_logs, mock_nomad_id, mock_litefs, mock_consul):
"""Test health calculation when nodes are critical."""
mock_node_map.return_value = {}
mock_consul.return_value = [
{"node": "node1", "address": "1.1.1.1", "role": "primary", "status": "critical"}
def test_aggregate_cluster_status_unhealthy(mock_node_map, mock_nomad_logs, mock_nomad_allocs, mock_litefs, mock_consul):
"""Test health calculation when primary is unregistered or failing."""
mock_node_map.return_value = {"id": "name"}
mock_nomad_allocs.return_value = [
{"id": "alloc1", "node": "node1", "ip": "1.1.1.1"}
]
mock_litefs.return_value = {"is_primary": True, "uptime": 100, "dbs": {"db1": {}}}
mock_nomad_id.return_value = "alloc1"
# Primary in LiteFS but missing in Consul
mock_litefs.return_value = {"is_primary": True, "candidate": True, "uptime": 100, "dbs": {"db1": {"txid": "1", "checksum": "abc"}}}
mock_consul.return_value = []
mock_nomad_logs.return_code = 0
mock_nomad_logs.return_value = "error logs"
cluster_data = cluster_aggregator.get_cluster_status("http://consul:8500")
assert cluster_data["health"] == "Unhealthy"
assert cluster_data["nodes"][0]["status"] == "unregistered"
assert cluster_data["nodes"][0]["nomad_logs"] == "error logs"

View File

@@ -1,11 +1,12 @@
import pytest
from unittest.mock import patch, MagicMock
import consul_client
import requests
@patch("requests.get")
def test_get_cluster_services(mock_get):
"""Test fetching healthy services from Consul."""
# Mock responses for navidrome and replica-navidrome
# Mock responses for navidrome
mock_navidrome = [
{
"Node": {"Node": "node1", "Address": "192.168.1.101"},
@@ -13,55 +14,19 @@ def test_get_cluster_services(mock_get):
"Checks": [{"Status": "passing"}]
}
]
mock_replicas = [
{
"Node": {"Node": "node2", "Address": "192.168.1.102"},
"Service": {"Service": "replica-navidrome", "Port": 4533, "ID": "replica-1"},
"Checks": [{"Status": "passing"}]
},
{
"Node": {"Node": "node3", "Address": "192.168.1.103"},
"Service": {"Service": "replica-navidrome", "Port": 4533, "ID": "replica-2"},
"Checks": [{"Status": "critical"}] # One failing check
}
]
def side_effect(url, params=None, timeout=None):
if "health/service/navidrome" in url:
m = MagicMock()
m.json.return_value = mock_navidrome
m.raise_for_status.return_value = None
return m
elif "health/service/replica-navidrome" in url:
m = MagicMock()
m.json.return_value = mock_replicas
m.raise_for_status.return_value = None
return m
return MagicMock()
mock_get.side_effect = side_effect
m = MagicMock()
m.json.return_value = mock_navidrome
m.raise_for_status.return_value = None
mock_get.return_value = m
consul_url = "http://consul:8500"
services = consul_client.get_cluster_services(consul_url)
# Should find 3 nodes total (node1 primary, node2 healthy replica, node3 critical replica)
assert len(services) == 3
# Check node1 (primary)
node1 = next(s for s in services if s["node"] == "node1")
assert node1["role"] == "primary"
assert node1["status"] == "passing"
assert node1["address"] == "192.168.1.101"
# Check node2 (healthy replica)
node2 = next(s for s in services if s["node"] == "node2")
assert node2["role"] == "replica"
assert node2["status"] == "passing"
# Check node3 (critical replica)
node3 = next(s for s in services if s["node"] == "node3")
assert node3["role"] == "replica"
assert node3["status"] == "critical"
# Should find 1 node (primary)
assert len(services) == 1
assert services[0]["node"] == "node1"
assert services[0]["status"] == "passing"
@patch("requests.get")
def test_get_cluster_services_with_errors(mock_get):
@@ -71,38 +36,18 @@ def test_get_cluster_services_with_errors(mock_get):
"Node": {"Node": "node1", "Address": "192.168.1.101"},
"Service": {"Service": "navidrome", "Port": 4533, "ID": "navidrome-1"},
"Checks": [
{"Status": "passing", "Output": "HTTP GET http://192.168.1.101:4533/app: 200 OK"}
]
}
]
mock_replicas = [
{
"Node": {"Node": "node3", "Address": "192.168.1.103"},
"Service": {"Service": "replica-navidrome", "Port": 4533, "ID": "replica-2"},
"Checks": [
{"Status": "critical", "Output": "HTTP GET http://192.168.1.103:4533/app: 500 Internal Server Error"}
{"Status": "critical", "Output": "HTTP GET http://192.168.1.101:4533/app: 500 Internal Server Error"}
]
}
]
def side_effect(url, params=None, timeout=None):
if "health/service/navidrome" in url:
m = MagicMock()
m.json.return_value = mock_navidrome
m.raise_for_status.return_value = None
return m
elif "health/service/replica-navidrome" in url:
m = MagicMock()
m.json.return_value = mock_replicas
m.raise_for_status.return_value = None
return m
return MagicMock()
mock_get.side_effect = side_effect
m = MagicMock()
m.json.return_value = mock_navidrome
m.raise_for_status.return_value = None
mock_get.return_value = m
services = consul_client.get_cluster_services("http://consul:8500")
node3 = next(s for s in services if s["node"] == "node3")
assert node3["status"] == "critical"
assert "500 Internal Server Error" in node3["check_output"]
node1 = next(s for s in services if s["node"] == "node1")
assert node1["status"] == "critical"
assert "500 Internal Server Error" in node1["check_output"]

View File

@@ -21,15 +21,19 @@ def test_format_node_table():
"node": "node1",
"role": "primary",
"status": "passing",
"uptime": 100,
"candidate": True,
"uptime": "1h",
"replication_lag": "N/A",
"litefs_primary": True
"litefs_primary": True,
"dbs": {"db1": {"txid": "1", "checksum": "abc"}}
}
]
table = output_formatter.format_node_table(nodes, use_color=False)
assert "node1" in table
assert "primary" in table
assert "passing" in table
assert "db1" in table
assert "Cand" in table
def test_format_diagnostics():
"""Test the diagnostics section generation."""
@@ -54,8 +58,25 @@ def test_format_diagnostics_empty():
"node": "node1",
"status": "passing",
"litefs_error": None
},
{
"node": "node2",
"status": "standby", # Should also be empty
"litefs_error": None
}
]
diagnostics = output_formatter.format_diagnostics(nodes, use_color=False)
assert diagnostics == ""
def test_format_node_table_status_colors():
"""Test that different statuses are handled."""
nodes = [
{"node": "n1", "role": "primary", "status": "passing", "litefs_primary": True},
{"node": "n2", "role": "replica", "status": "standby", "litefs_primary": False},
{"node": "n3", "role": "primary", "status": "unregistered", "litefs_primary": True},
]
table = output_formatter.format_node_table(nodes, use_color=False)
assert "passing" in table
assert "standby" in table
assert "unregistered" in table

View File

@@ -89,3 +89,47 @@ def test_get_node_map_failure(mock_run):
# It should not raise
node_map = nomad_client.get_node_map()
assert node_map == {}
@patch("subprocess.run")
def test_get_job_allocations(mock_run):
"""Test getting all allocations for a job with their IPs."""
# Mock 'nomad job status navidrome-litefs'
mock_job_status = MagicMock()
mock_job_status.stdout = """
Allocations
ID Node ID Node Name Status Created
abc12345 node1 host1 running 1h ago
def67890 node2 host2 running 1h ago
"""
# Mock 'nomad alloc status' for each alloc
mock_alloc1 = MagicMock()
mock_alloc1.stdout = """
ID = abc12345
Node Name = host1
Allocation Addresses:
Label Dynamic Address
*http yes 1.1.1.1:4533 -> 4533
*litefs yes 1.1.1.1:20202 -> 20202
Task Events:
Started At = 2026-02-09T14:00:00Z
"""
mock_alloc2 = MagicMock()
mock_alloc2.stdout = """
ID = def67890
Node Name = host2
Allocation Addresses:
Label Dynamic Address
*http yes 2.2.2.2:4533 -> 4533
*litefs yes 2.2.2.2:20202 -> 20202
Task Events:
Started At = 2026-02-09T14:00:00Z
"""
mock_run.side_effect = [mock_job_status, mock_alloc1, mock_alloc2]
allocs = nomad_client.get_job_allocations("navidrome-litefs")
assert len(allocs) == 2
assert allocs[0]["ip"] == "1.1.1.1"
assert "uptime" in allocs[0]
assert allocs[0]["uptime"] != "N/A"