import subprocess
import re
import sys
from datetime import datetime, timezone

# Column of the node name in `nomad node status` rows when the header row
# cannot be used to locate it (ID, Node Pool, DC, Name, ...).
_DEFAULT_NODE_NAME_COLUMN = 3

# Captures the IPv4 host part of an "ip:port" address.
_IPV4_BEFORE_PORT = re.compile(r"(\d+\.\d+\.\d+\.\d+):")


def _run_nomad(*args):
    """
    Runs `nomad <args>` and returns its captured stdout.

    Raises FileNotFoundError when the binary is not on PATH and
    subprocess.CalledProcessError when the command exits non-zero.
    """
    result = subprocess.run(
        ["nomad", *args],
        capture_output=True, text=True, check=True
    )
    return result.stdout


def _allocation_rows(job_status_output):
    """
    Yields the whitespace-split data rows that follow the "Allocations"
    heading of `nomad job status` output, skipping the header and separators.
    """
    in_table = False
    for line in job_status_output.splitlines():
        if "Allocations" in line:
            in_table = True
            continue
        if not in_table or not line.strip():
            continue
        if line.startswith(("ID", "==")):
            continue
        yield line.split()


def _field_value(line, key):
    """
    Returns the value of a "key = value" line, or None when the line is not
    exactly that key. Everything after the first "=" is kept.
    """
    name, sep, value = line.partition("=")
    if sep and name.strip() == key:
        return value.strip()
    return None


def _format_uptime(ts_str, now):
    """
    Formats the time elapsed between an ISO timestamp and `now` as "42s",
    "3m5s" or "2h14m". Returns the raw string when it cannot be processed.
    """
    try:
        # Rewrite a trailing "Z" as an explicit offset before parsing.
        started_at = datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
        secs = int((now - started_at).total_seconds())
    except (ValueError, TypeError, OverflowError):
        # Unparseable, timezone-naive, or out-of-range timestamp.
        return ts_str

    # A start time slightly ahead of the local clock must not yield "-3s".
    secs = max(secs, 0)
    if secs < 60:
        return f"{secs}s"
    if secs < 3600:
        return f"{secs // 60}m{secs % 60}s"
    return f"{secs // 3600}h{(secs % 3600) // 60}m"


def _parse_node_table(output):
    """
    Builds a Node ID -> Node Name dict from `nomad node status` output.
    """
    name_idx = _DEFAULT_NODE_NAME_COLUMN
    node_map = {}
    for line in output.splitlines():
        stripped = line.strip()
        if not stripped or line.startswith("=="):
            continue
        if line.startswith("ID"):
            # Locate the Name column from the header instead of assuming its
            # position. Headers are split on 2+ spaces because a title such
            # as "Node Pool" contains a single space.
            # NOTE(review): assumes columns are separated by at least two
            # spaces and cell values contain no spaces - confirm against the
            # Nomad versions in use.
            headers = re.split(r"\s{2,}", stripped)
            if "Name" in headers:
                name_idx = headers.index("Name")
            continue
        parts = stripped.split()
        if len(parts) > name_idx:
            node_map[parts[0]] = parts[name_idx]
    return node_map


def _parse_alloc_status(output, alloc_id, now):
    """
    Extracts id, node name, litefs IP and uptime from `nomad alloc status`
    output. `alloc_id` is used as the id when no "ID = ..." line is present.
    """
    full_id = alloc_id
    node_name = ""
    ip = ""
    uptime = "N/A"

    for line in output.splitlines():
        value = _field_value(line, "ID")
        if value:
            full_id = value

        value = _field_value(line, "Node Name")
        if value is not None:
            node_name = value

        # Extract IP from Allocation Addresses
        if "*litefs" in line:
            # e.g. "*litefs yes 1.1.1.1:20202 -> 20202"
            match = _IPV4_BEFORE_PORT.search(line)
            if match:
                ip = match.group(1)

        # Extract Uptime from Started At
        # e.g. "Started At = 2026-02-09T14:04:28Z"
        value = _field_value(line, "Started At")
        if value and value != "N/A":
            uptime = _format_uptime(value, now)

    return {
        "id": full_id,
        "node": node_name,
        "ip": ip,
        "uptime": uptime
    }


def get_node_map():
    """
    Returns a mapping of Node ID to Node Name.

    Returns an empty dict (after printing a message to stderr) when the
    `nomad` binary is missing or the query fails.
    """
    try:
        return _parse_node_table(_run_nomad("node", "status"))
    except FileNotFoundError:
        print("Warning: 'nomad' binary not found in PATH.", file=sys.stderr)
        return {}
    except subprocess.CalledProcessError as e:
        print(f"Warning: Failed to query Nomad nodes: {e}", file=sys.stderr)
        return {}
    except Exception as e:
        print(f"Error getting node map: {e}", file=sys.stderr)
        return {}


def get_job_allocations(job_id):
    """
    Returns a list of all active allocations for a job with their IPs and uptimes.

    Each entry is a dict with the keys "id", "node", "ip" and "uptime".
    An allocation whose status query fails is skipped with a warning instead
    of discarding the allocations that were read successfully. Returns an
    empty list (after printing to stderr) when the job itself cannot be
    queried.
    """
    try:
        # 1. Get list of allocations
        status_output = _run_nomad("job", "status", job_id)
        # Status is usually the 6th or 8th column depending on verbose,
        # so look for 'running' in any column from the 4th onwards.
        running_ids = [
            row[0]
            for row in _allocation_rows(status_output)
            if len(row) >= 5 and "running" in row[3:]
        ]

        # 2. For each allocation, get its IP and Uptime
        now = datetime.now(timezone.utc)
        allocations = []
        for alloc_id in running_ids:
            try:
                alloc_output = _run_nomad("alloc", "status", alloc_id)
            except subprocess.CalledProcessError as e:
                print(
                    f"Warning: Failed to query allocation {alloc_id}: {e}",
                    file=sys.stderr
                )
                continue
            allocations.append(_parse_alloc_status(alloc_output, alloc_id, now))
        return allocations
    except Exception as e:
        print(f"Error getting job allocations: {e}", file=sys.stderr)
        return []


def get_allocation_id(node_name, job_id):
    """
    Finds the FULL allocation ID for a specific node and job.

    `node_name` may be either a node ID as listed in the job's allocation
    table or the node name that ID resolves to. The first matching row wins.
    Returns the listed (short) ID when the full one cannot be read, and None
    when no allocation matches or Nomad cannot be queried.
    """
    node_map = get_node_map()
    try:
        status_output = _run_nomad("job", "status", job_id)
        for row in _allocation_rows(status_output):
            if len(row) < 2:
                continue
            alloc_id, node_id = row[0], row[1]
            # No "" default here: an unmapped node must not match an empty
            # node_name.
            if node_name not in (node_id, node_map.get(node_id)):
                continue

            # Now get the FULL ID using nomad alloc status. Only a genuine
            # "ID = <value>" line counts; other lines starting with "ID"
            # (such as table headers) carry no "=" and are ignored.
            alloc_output = _run_nomad("alloc", "status", alloc_id)
            for line in alloc_output.splitlines():
                full_id = _field_value(line, "ID")
                if full_id:
                    return full_id
            return alloc_id
        return None
    except FileNotFoundError:
        return None  # Warning already printed by get_node_map likely
    except Exception as e:
        print(f"Error getting allocation ID: {e}", file=sys.stderr)
        return None


def get_allocation_logs(alloc_id, tail=20, task="navidrome"):
    """
    Fetches the last N lines of stderr for an allocation.

    Tries the named task first and retries without a task name if that
    command fails. Returns the log text, or a "Nomad Error: ..." string on
    failure.
    """
    try:
        # Try with task name first, then without
        try:
            return _run_nomad(
                "alloc", "logs", "-stderr", "-task", task,
                "-n", str(tail), alloc_id
            )
        except subprocess.CalledProcessError:
            return _run_nomad(
                "alloc", "logs", "-stderr", "-n", str(tail), alloc_id
            )
    except Exception as e:
        # Don't print stack trace, just the error
        return f"Nomad Error: {str(e)}"


def exec_command(alloc_id, command, task="navidrome"):
    """
    Executes a command inside a specific allocation and task.

    `command` is a list of arguments appended to `nomad alloc exec`.
    Returns the command's stdout, or a "Nomad Error: ..." string on failure.
    """
    try:
        args = ["alloc", "exec", "-task", task, alloc_id] + command
        return _run_nomad(*args)
    except Exception as e:
        # Don't print stack trace, just return error string
        return f"Nomad Error: {str(e)}"


def restart_allocation(alloc_id):
    """
    Restarts a specific allocation.

    Returns True on success; prints the error to stderr and returns False
    otherwise.
    """
    try:
        _run_nomad("alloc", "restart", alloc_id)
        return True
    except Exception as e:
        print(f"Error restarting allocation: {e}", file=sys.stderr)
        return False