conductor(checkpoint): Checkpoint end of Phase 2

2026-02-08 07:54:49 -08:00
parent a686c5b225
commit 6d77729a4a
7 changed files with 196 additions and 6 deletions
--- a/scripts/cluster_status/nomad_client.py
+++ b/scripts/cluster_status/nomad_client.py
@@ -0,0 +1,98 @@
+import subprocess
+import re
+
+def get_node_map():
+    """Return a dict mapping node ID to node name.
+
+    Runs ``nomad node status`` and parses its table output. The position of
+    the ``Name`` column is read from the header row rather than hard-coded,
+    because the columns that precede it are not the same in every table
+    layout (e.g. with or without a ``Node Pool`` column). If no header row is
+    seen, the historical column index 3 is used.
+
+    Returns:
+        dict: ``{node_id: node_name}``, with IDs exactly as the CLI printed
+        them. ``{}`` if the command cannot be run or exits non-zero; the
+        error is printed in that case.
+    """
+    try:
+        result = subprocess.run(
+            ["nomad", "node", "status"],
+            capture_output=True, text=True, check=True,
+        )
+        node_map = {}
+        name_col = 3  # fallback when the output carries no header row
+        for line in result.stdout.splitlines():
+            if not line.strip() or line.startswith("=="):
+                continue
+            if line.startswith("ID"):
+                # Header cells may contain a single space ("Node Pool"), so
+                # the header is split on runs of two or more spaces.
+                headers = re.split(r"\s{2,}", line.strip())
+                if "Name" in headers:
+                    name_col = headers.index("Name")
+                continue
+            parts = re.split(r"\s+", line.strip())
+            if len(parts) > name_col:
+                node_map[parts[0]] = parts[name_col]
+        return node_map
+    except Exception as e:
+        # Best effort: callers treat an empty map as "names unknown".
+        print(f"Error getting node map: {e}")
+        return {}
+
+def get_allocation_id(node_name, job_id):
+    """Find the full allocation ID of a job's allocation on a given node.
+
+    Scans the rows that follow the "Allocations" line of
+    ``nomad job status <job_id>`` and picks the first row whose node matches.
+
+    Args:
+        node_name: Node name, or the node ID as it appears in the second
+            column of the allocation table.
+        job_id: Job identifier passed to ``nomad job status``.
+
+    Returns:
+        The value of the ``ID = ...`` line of
+        ``nomad alloc status -verbose`` for the matching allocation; the ID
+        taken from the job status table if no such line is found; ``None``
+        if no row matches or a command fails (the error is printed).
+    """
+    node_map = get_node_map()
+    try:
+        result = subprocess.run(
+            ["nomad", "job", "status", job_id],
+            capture_output=True, text=True, check=True,
+        )
+
+        in_allocations = False
+        for line in result.stdout.splitlines():
+            if "Allocations" in line:
+                in_allocations = True
+                continue
+            if not in_allocations or not line.strip():
+                continue
+            if line.startswith("ID") or line.startswith("=="):
+                continue
+
+            parts = re.split(r"\s+", line.strip())
+            if len(parts) < 2:
+                continue
+            alloc_id, node_id = parts[0], parts[1]
+
+            # Accept either the node ID itself or the name it resolves to.
+            if node_name not in (node_id, node_map.get(node_id, "")):
+                continue
+
+            # -verbose is required here: without it the CLI abbreviates IDs,
+            # so the "ID" line would only repeat the short ID we already have.
+            res_alloc = subprocess.run(
+                ["nomad", "alloc", "status", "-verbose", alloc_id],
+                capture_output=True, text=True, check=True,
+            )
+            for status_line in res_alloc.stdout.splitlines():
+                if status_line.startswith("ID"):
+                    _, sep, value = status_line.partition("=")
+                    if sep:
+                        return value.strip()
+            return alloc_id
+
+    except Exception as e:
+        print(f"Error getting allocation ID: {e}")
+
+    return None
+
+def get_allocation_logs(alloc_id, tail=20, task="navidrome"):
+    """Fetch the last ``tail`` lines of stderr for an allocation.
+
+    ``-tail`` is passed together with ``-n`` so the line count is applied
+    relative to the end of the log, as described in the Nomad CLI reference
+    for ``alloc logs``.
+
+    Args:
+        alloc_id: Allocation ID (or prefix) handed to ``nomad alloc logs``.
+        tail: Number of trailing lines to request.
+        task: Task name to try first. If that attempt exits non-zero, or if
+            ``task`` is empty/``None``, the command is run without ``-task``.
+
+    Returns:
+        str: The command's stdout, or ``"Error fetching logs: ..."`` if the
+        final attempt fails. This function does not raise.
+    """
+    base_cmd = ["nomad", "alloc", "logs", "-stderr", "-tail"]
+    tail_args = ["-n", str(tail), alloc_id]
+    try:
+        if task:
+            try:
+                result = subprocess.run(
+                    base_cmd + ["-task", task] + tail_args,
+                    capture_output=True, text=True, check=True,
+                )
+                return result.stdout
+            except subprocess.CalledProcessError:
+                # The named task may not exist in this allocation; retry and
+                # let the CLI choose the task.
+                pass
+        result = subprocess.run(
+            base_cmd + tail_args,
+            capture_output=True, text=True, check=True,
+        )
+        return result.stdout
+    except Exception as e:
+        return f"Error fetching logs: {e}"
+
+def restart_allocation(alloc_id):
+    """Ask Nomad to restart the given allocation.
+
+    Runs ``nomad alloc restart <alloc_id>``.
+
+    Returns:
+        bool: ``True`` if the command exited with status 0; ``False`` if it
+        could not be run or exited non-zero (the error is printed).
+    """
+    command = ["nomad", "alloc", "restart", alloc_id]
+    try:
+        subprocess.run(command, capture_output=True, text=True, check=True)
+    except Exception as err:
+        print(f"Error restarting allocation: {err}")
+        return False
+    return True