conductor(checkpoint): Checkpoint end of Phase 2

2026-02-08 07:54:49 -08:00
parent a686c5b225
commit 6d77729a4a
7 changed files with 196 additions and 6 deletions

cli.py

@@ -4,11 +4,13 @@ import sys
import config
import cluster_aggregator
import output_formatter
import nomad_client

def parse_args():
    parser = argparse.ArgumentParser(description="Monitor Navidrome LiteFS/Consul cluster status.")
    parser.add_argument("--consul-url", help="Override Consul API URL (default from env or hardcoded)")
    parser.add_argument("--no-color", action="store_true", help="Disable colorized output")
    parser.add_argument("--restart", help="Restart the allocation on the specified node")
    return parser.parse_args()

def main():
@@ -17,6 +19,19 @@ def main():
    # Resolve Consul URL
    consul_url = config.get_consul_url(args.consul_url)

    # Handle restart if requested
    if args.restart:
        print(f"Attempting to restart allocation on node: {args.restart}...")
        alloc_id = nomad_client.get_allocation_id(args.restart, "navidrome-litefs")
        if alloc_id:
            if nomad_client.restart_allocation(alloc_id):
                print(f"Successfully sent restart signal to allocation {alloc_id}")
            else:
                print(f"Failed to restart allocation {alloc_id}")
        else:
            print(f"Could not find allocation for node {args.restart}")
        print("-" * 30)

    try:
        # Fetch and aggregate data
        cluster_data = cluster_aggregator.get_cluster_status(consul_url)
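For reference, the same restart path can be driven straight from Python; a minimal sketch, assuming the modules above are importable and that a node named "node1" runs the job:

import nomad_client

alloc_id = nomad_client.get_allocation_id("node1", "navidrome-litefs")
if alloc_id:
    nomad_client.restart_allocation(alloc_id)
else:
    print("Could not find allocation for node1")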

cluster_aggregator.py

@@ -1,9 +1,10 @@
import consul_client
import litefs_client
import nomad_client

def get_cluster_status(consul_url, job_id="navidrome-litefs"):
    """
    Aggregates cluster data from Consul, LiteFS, and Nomad.
    """
    consul_nodes = consul_client.get_cluster_services(consul_url)
    aggregated_nodes = []
@@ -21,11 +22,17 @@ def get_cluster_status(consul_url):
"uptime": litefs_status.get("uptime", "N/A"),
"advertise_url": litefs_status.get("advertise_url", ""),
"replication_lag": litefs_status.get("replication_lag", "N/A"),
"litefs_error": litefs_status.get("error", None)
"litefs_error": litefs_status.get("error", None),
"nomad_logs": None
}
if node["status"] != "passing":
is_healthy = False
# Fetch Nomad logs for critical nodes
alloc_id = nomad_client.get_allocation_id(node["node"], job_id)
if alloc_id:
node_data["alloc_id"] = alloc_id
node_data["nomad_logs"] = nomad_client.get_allocation_logs(alloc_id)
if node_data["litefs_primary"]:
primary_count += 1
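For orientation, a sketch of the entry the aggregator builds for a failing node; the keys come from the code above, the values are illustrative:

node_data = {
    "litefs_primary": False,
    "uptime": "N/A",
    "advertise_url": "",
    "replication_lag": "N/A",
    "litefs_error": None,
    # alloc_id and nomad_logs are only filled in when the Consul check is not passing
    "alloc_id": "abc12345-full-id",
    "nomad_logs": "Error: database is locked\nSome other error",
}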

nomad_client.py

@@ -0,0 +1,98 @@
import subprocess
import re

def get_node_map():
    """
    Returns a mapping of Node ID to Node Name.
    """
    try:
        result = subprocess.run(
            ["nomad", "node", "status"],
            capture_output=True, text=True, check=True
        )
        lines = result.stdout.splitlines()
        node_map = {}
        for line in lines:
            # Skip the header row and separator lines, then split the
            # whitespace-aligned table; the node name is assumed to be
            # the fourth column.
            if line.strip() and not line.startswith("ID") and not line.startswith("=="):
                parts = re.split(r"\s+", line.strip())
                if len(parts) >= 4:
                    node_map[parts[0]] = parts[3]
        return node_map
    except Exception as e:
        print(f"Error getting node map: {e}")
        return {}

def get_allocation_id(node_name, job_id):
    """
    Finds the FULL allocation ID for a specific node and job.
    """
    node_map = get_node_map()
    try:
        result = subprocess.run(
            ["nomad", "job", "status", job_id],
            capture_output=True, text=True, check=True
        )
        lines = result.stdout.splitlines()
        start_parsing = False
        for line in lines:
            # Only parse rows below the "Allocations" section of the output.
            if "Allocations" in line:
                start_parsing = True
                continue
            if start_parsing and line.strip() and not line.startswith("ID") and not line.startswith("=="):
                parts = re.split(r"\s+", line.strip())
                if len(parts) >= 2:
                    alloc_id = parts[0]
                    node_id = parts[1]
                    resolved_name = node_map.get(node_id, "")
                    if node_id == node_name or resolved_name == node_name:
                        # The table only shows a short ID; resolve the FULL ID
                        # via 'nomad alloc status'.
                        res_alloc = subprocess.run(
                            ["nomad", "alloc", "status", alloc_id],
                            capture_output=True, text=True, check=True
                        )
                        for l in res_alloc.stdout.splitlines():
                            if l.startswith("ID"):
                                return l.split("=")[1].strip()
                        return alloc_id
    except Exception as e:
        print(f"Error getting allocation ID: {e}")
    return None

def get_allocation_logs(alloc_id, tail=20):
    """
    Fetches the last N lines of stderr for an allocation.
    """
    try:
        # Try with task name first, then without
        try:
            result = subprocess.run(
                ["nomad", "alloc", "logs", "-stderr", "-task", "navidrome", "-n", str(tail), alloc_id],
                capture_output=True, text=True, check=True
            )
            return result.stdout
        except subprocess.CalledProcessError:
            result = subprocess.run(
                ["nomad", "alloc", "logs", "-stderr", "-n", str(tail), alloc_id],
                capture_output=True, text=True, check=True
            )
            return result.stdout
    except Exception as e:
        return f"Error fetching logs: {e}"

def restart_allocation(alloc_id):
    """
    Restarts a specific allocation.
    """
    try:
        subprocess.run(
            ["nomad", "alloc", "restart", alloc_id],
            capture_output=True, text=True, check=True
        )
        return True
    except Exception as e:
        print(f"Error restarting allocation: {e}")
        return False
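Worth flagging: get_node_map() and get_allocation_id() scrape the whitespace-aligned tables the nomad CLI prints, so they depend on column positions. A hedged illustration of that dependency (the row layout and values here are assumptions, not verbatim Nomad output):

import re

# A data row assumed to be shaped like:  ID  Node Pool  DC  Name  ...
row = "node_id1  default  dc1  node1"
parts = re.split(r"\s+", row.strip())
assert parts[3] == "node1"  # get_node_map() takes the 4th column as the node name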

output_formatter.py

@@ -100,6 +100,9 @@ def format_diagnostics(nodes, use_color=True):
if node.get("check_output"):
output.append(f" {BOLD}Consul Check Output:{RESET}\n {node['check_output'].strip()}")
if node.get("nomad_logs"):
output.append(f" {BOLD}Nomad Stderr Logs (last 20 lines):{RESET}\n{node['nomad_logs']}")
if node.get("litefs_error"):
output.append(f" {BOLD}LiteFS API Error:{RESET} {colorize(node['litefs_error'], RED, use_color)}")

test_cluster_aggregator.py

@@ -4,7 +4,9 @@ import cluster_aggregator
@patch("consul_client.get_cluster_services")
@patch("litefs_client.get_node_status")
def test_aggregate_cluster_status(mock_litefs, mock_consul):
@patch("nomad_client.get_allocation_id")
@patch("nomad_client.get_allocation_logs")
def test_aggregate_cluster_status(mock_nomad_logs, mock_nomad_id, mock_litefs, mock_consul):
"""Test aggregating Consul and LiteFS data."""
# Mock Consul data
mock_consul.return_value = [
@@ -19,6 +21,7 @@ def test_aggregate_cluster_status(mock_litefs, mock_consul):
return {"is_primary": False, "uptime": 50, "advertise_url": "url2", "replication_lag": 10}
mock_litefs.side_effect = litefs_side_effect
mock_nomad_id.return_value = None
cluster_data = cluster_aggregator.get_cluster_status("http://consul:8500")
@@ -35,12 +38,17 @@ def test_aggregate_cluster_status(mock_litefs, mock_consul):
@patch("consul_client.get_cluster_services")
@patch("litefs_client.get_node_status")
def test_aggregate_cluster_status_unhealthy(mock_litefs, mock_consul):
@patch("nomad_client.get_allocation_id")
@patch("nomad_client.get_allocation_logs")
def test_aggregate_cluster_status_unhealthy(mock_nomad_logs, mock_nomad_id, mock_litefs, mock_consul):
"""Test health calculation when nodes are critical."""
mock_consul.return_value = [
{"node": "node1", "address": "1.1.1.1", "role": "primary", "status": "critical"}
]
mock_litefs.return_value = {"is_primary": True, "uptime": 100}
mock_nomad_id.return_value = "alloc1"
mock_nomad_logs.return_value = "error logs"
cluster_data = cluster_aggregator.get_cluster_status("http://consul:8500")
assert cluster_data["health"] == "Unhealthy"
assert cluster_data["nodes"][0]["nomad_logs"] == "error logs"

test_cli.py

@@ -12,7 +12,8 @@ def test_arg_parsing_default():
def test_arg_parsing_custom():
    """Test that custom arguments are parsed correctly."""
    with patch.object(sys, 'argv', ['cli.py', '--consul-url', 'http://custom:8500', '--no-color', '--restart', 'node1']):
        args = cli.parse_args()
        assert args.consul_url == 'http://custom:8500'
        assert args.no_color is True
        assert args.restart == 'node1'

test_nomad_client.py

@@ -0,0 +1,58 @@
import pytest
from unittest.mock import patch, MagicMock
import nomad_client
import subprocess

@patch("subprocess.run")
@patch("nomad_client.get_node_map")
def test_get_allocation_id(mock_node_map, mock_run):
    """Test getting allocation ID for a node."""
    mock_node_map.return_value = {"node_id1": "node1"}

    # Mock 'nomad job status navidrome-litefs' output
    mock_job_status = MagicMock()
    mock_job_status.stdout = """
Allocations
ID        Node ID   Task Group  Version  Desired  Status   Created  Modified
abc12345  node_id1  navidrome   1        run      running  1h ago   1h ago
"""

    # Mock 'nomad alloc status abc12345' output
    mock_alloc_status = MagicMock()
    mock_alloc_status.stdout = "ID = abc12345-full-id"

    mock_run.side_effect = [mock_job_status, mock_alloc_status]

    alloc_id = nomad_client.get_allocation_id("node1", "navidrome-litefs")
    assert alloc_id == "abc12345-full-id"

@patch("subprocess.run")
def test_get_logs(mock_run):
    """Test fetching logs for an allocation."""
    mock_stderr = "Error: database is locked\nSome other error"
    m = MagicMock()
    m.stdout = mock_stderr
    m.returncode = 0
    mock_run.return_value = m

    logs = nomad_client.get_allocation_logs("abc12345", tail=20)
    assert "database is locked" in logs

    # It should have tried with -task navidrome first
    mock_run.assert_any_call(
        ["nomad", "alloc", "logs", "-stderr", "-task", "navidrome", "-n", "20", "abc12345"],
        capture_output=True, text=True, check=True
    )

@patch("subprocess.run")
def test_restart_allocation(mock_run):
    """Test restarting an allocation."""
    m = MagicMock()
    m.returncode = 0
    mock_run.return_value = m

    success = nomad_client.restart_allocation("abc12345")
    assert success is True

    mock_run.assert_called_with(
        ["nomad", "alloc", "restart", "abc12345"],
        capture_output=True, text=True, check=True
    )