# File-viewer header captured together with this source (not part of the program):
# Files · 211 lines · 7.7 KiB · Python

import subprocess
import re
import sys
from datetime import datetime, timezone
def get_node_map():
    """Return a mapping of Nomad node ID to node name.

    Runs ``nomad node status`` and parses the table it prints. The position
    of the ``Name`` column is looked up in the header row instead of being
    assumed, because the column layout is not the same across Nomad versions
    (for example, releases that print a ``Node Pool`` column shift ``Name``
    one position to the right). If no header containing ``Name`` is seen, the
    fourth whitespace-separated field is used as a fallback.

    Returns:
        dict: ``{node_id: node_name}`` with IDs exactly as printed by the
        CLI. An empty dict is returned, and a message is written to stderr,
        when the ``nomad`` binary is missing or the command fails.
    """
    try:
        result = subprocess.run(
            ["nomad", "node", "status"],
            capture_output=True, text=True, check=True,
        )
        node_map = {}
        name_index = None
        for line in result.stdout.splitlines():
            stripped = line.strip()
            if not stripped or line.startswith("=="):
                continue
            if line.startswith("ID"):
                # Header cells may contain a single space ("Node Pool"), so
                # split on runs of two or more spaces to keep them intact.
                header = re.split(r"\s{2,}", stripped)
                name_index = header.index("Name") if "Name" in header else None
                continue
            if name_index is not None:
                cells = re.split(r"\s{2,}", stripped)
                if len(cells) > name_index:
                    node_map[cells[0]] = cells[name_index]
            else:
                # No usable header: fall back to the fixed position.
                parts = stripped.split()
                if len(parts) >= 4:
                    node_map[parts[0]] = parts[3]
        return node_map
    except FileNotFoundError:
        print("Warning: 'nomad' binary not found in PATH.", file=sys.stderr)
        return {}
    except subprocess.CalledProcessError as e:
        print(f"Warning: Failed to query Nomad nodes: {e}", file=sys.stderr)
        return {}
    except Exception as e:
        print(f"Error getting node map: {e}", file=sys.stderr)
        return {}
def _running_alloc_ids(job_status_output):
    """Return IDs of rows marked ``running`` in the Allocations table."""
    alloc_ids = []
    in_allocations = False
    for line in job_status_output.splitlines():
        if "Allocations" in line:
            in_allocations = True
            continue
        if not in_allocations or not line.strip():
            continue
        if line.startswith(("ID", "==")):
            continue
        parts = line.split()
        # The status column position varies with output verbosity, so look
        # for the literal "running" anywhere from the fourth field onwards.
        if len(parts) >= 5 and "running" in parts[3:]:
            alloc_ids.append(parts[0])
    return alloc_ids


def _format_uptime(total_seconds):
    """Render a duration in seconds as ``Ns``, ``NmNs`` or ``NhNm``."""
    # A start time slightly in the future (clock skew) is shown as 0s rather
    # than as a negative duration.
    secs = max(int(total_seconds), 0)
    if secs < 60:
        return f"{secs}s"
    if secs < 3600:
        return f"{secs // 60}m{secs % 60}s"
    return f"{secs // 3600}h{(secs % 3600) // 60}m"


def _parse_alloc_status(status_output, fallback_id, now):
    """Extract id, node name, litefs IP and uptime from ``nomad alloc status`` text."""
    info = {"id": fallback_id, "node": "", "ip": "", "uptime": "N/A"}
    for line in status_output.splitlines():
        if "=" in line:
            # Split once so a value that itself contains "=" is kept whole.
            value = line.split("=", 1)[1].strip()
            if line.startswith("ID"):
                info["id"] = value
            if line.startswith("Node Name"):
                info["node"] = value
            # e.g. "Started At = 2026-02-09T14:04:28Z"; when several lines
            # match, the last one wins.
            if "Started At" in line and value and value != "N/A":
                try:
                    started_at = datetime.fromisoformat(value.replace("Z", "+00:00"))
                    info["uptime"] = _format_uptime((now - started_at).total_seconds())
                except (ValueError, TypeError):
                    # Unparseable or timezone-naive timestamp: show it raw.
                    info["uptime"] = value
        # e.g. "*litefs  yes  1.1.1.1:20202 -> 20202"
        if "*litefs" in line:
            match = re.search(r"(\d+\.\d+\.\d+\.\d+):", line)
            if match:
                info["ip"] = match.group(1)
    return info


def get_job_allocations(job_id):
    """Return details of the running allocations of a Nomad job.

    Runs ``nomad job status <job_id>`` to find allocations whose row contains
    ``running``, then ``nomad alloc status`` for each of them.

    Args:
        job_id: Job identifier passed to the ``nomad`` CLI.

    Returns:
        list[dict]: One dict per allocation with the keys ``"id"`` (ID from
        the alloc status output, or the ID from the job table if none is
        printed), ``"node"`` (node name, ``""`` if absent), ``"ip"`` (IPv4
        address of the ``*litefs`` address line, ``""`` if absent) and
        ``"uptime"`` (formatted duration since ``Started At``, the raw
        timestamp if it cannot be parsed, or ``"N/A"``).
        An allocation whose status query fails is skipped with a warning on
        stderr. If the job query itself fails, or ``nomad`` is not
        installed, an empty list is returned and the error is printed to
        stderr.
    """
    try:
        result = subprocess.run(
            ["nomad", "job", "status", job_id],
            capture_output=True, text=True, check=True,
        )
        allocations = []
        now = datetime.now(timezone.utc)
        for alloc_id in _running_alloc_ids(result.stdout):
            try:
                res_alloc = subprocess.run(
                    ["nomad", "alloc", "status", alloc_id],
                    capture_output=True, text=True, check=True,
                )
            except subprocess.CalledProcessError as e:
                # One allocation disappearing between the two queries should
                # not discard the results already collected for the others.
                print(
                    f"Warning: Failed to query allocation {alloc_id}: {e}",
                    file=sys.stderr,
                )
                continue
            allocations.append(_parse_alloc_status(res_alloc.stdout, alloc_id, now))
        return allocations
    except Exception as e:
        print(f"Error getting job allocations: {e}", file=sys.stderr)
        return []
def get_allocation_id(node_name, job_id):
    """Find the full allocation ID of a job's allocation on a given node.

    The Allocations table of ``nomad job status <job_id>`` is scanned from
    the top; the first row whose node matches is used, whatever its status.
    ``nomad alloc status`` is then run on that row's allocation ID to read
    the full ID from its ``ID = ...`` line.

    Args:
        node_name: Either the node ID as printed in the job status table or
            the node name as reported by ``get_node_map()``.
        job_id: Job identifier passed to the ``nomad`` CLI.

    Returns:
        str | None: The full allocation ID; the ID from the job table if the
        alloc status output has no ``ID = ...`` line; ``None`` if no row
        matches or a command fails (failures other than a missing ``nomad``
        binary are reported on stderr).
    """
    node_map = get_node_map()
    try:
        result = subprocess.run(
            ["nomad", "job", "status", job_id],
            capture_output=True, text=True, check=True,
        )
        in_allocations = False
        for line in result.stdout.splitlines():
            if "Allocations" in line:
                in_allocations = True
                continue
            if not in_allocations or not line.strip():
                continue
            if line.startswith(("ID", "==")):
                continue
            parts = line.split()
            if len(parts) < 2:
                continue
            alloc_id, node_id = parts[0], parts[1]
            # No "" default here: an empty node_name must not match every
            # allocation whose node is missing from the map.
            resolved_name = node_map.get(node_id)
            if node_name != node_id and node_name != resolved_name:
                continue
            # Matching row found: ask Nomad for the full ID.
            res_alloc = subprocess.run(
                ["nomad", "alloc", "status", alloc_id],
                capture_output=True, text=True, check=True,
            )
            for status_line in res_alloc.stdout.splitlines():
                # Require "=" so a stray line starting with "ID" cannot raise
                # IndexError; split once to keep the value whole.
                if status_line.startswith("ID") and "=" in status_line:
                    return status_line.split("=", 1)[1].strip()
            return alloc_id
    except FileNotFoundError:
        return None  # Warning already printed by get_node_map likely
    except Exception as e:
        print(f"Error getting allocation ID: {e}", file=sys.stderr)
        return None
    return None
def get_allocation_logs(alloc_id, tail=20, task="navidrome"):
    """Fetch the last ``tail`` lines of stderr for an allocation.

    ``nomad alloc logs -stderr`` is first run with ``-task <task>``; if that
    command exits non-zero it is retried without a task name.

    Args:
        alloc_id: Allocation ID passed to the ``nomad`` CLI.
        tail: Number of trailing lines to request (``-n``).
        task: Task name for the first attempt. Pass ``None`` or ``""`` to
            skip the task-qualified attempt.

    Returns:
        str: The captured log text, or a string starting with
        ``"Nomad Error: "`` describing the failure. No exception is raised.
    """
    base = ["nomad", "alloc", "logs", "-stderr"]
    selector = ["-n", str(tail), alloc_id]
    try:
        if task:
            try:
                result = subprocess.run(
                    [*base, "-task", task, *selector],
                    capture_output=True, text=True, check=True,
                )
                return result.stdout
            except subprocess.CalledProcessError:
                # Fall through to the attempt without a task name.
                pass
        result = subprocess.run(
            [*base, *selector],
            capture_output=True, text=True, check=True,
        )
        return result.stdout
    except Exception as e:
        # Don't print stack trace, just the error
        return f"Nomad Error: {str(e)}"
def exec_command(alloc_id, command, task="navidrome"):
    """Execute a command inside a specific allocation and task.

    Runs ``nomad alloc exec -task <task> <alloc_id> <command...>``.

    Args:
        alloc_id: Allocation ID passed to the ``nomad`` CLI.
        command: The command and its arguments as a list, tuple or other
            iterable of strings. A single string is also accepted and is
            split into arguments with ``shlex.split``.
        task: Name of the task to execute in.

    Returns:
        str: The command's captured stdout, or a string starting with
        ``"Nomad Error: "`` describing the failure. No exception is raised.
    """
    import shlex

    try:
        if isinstance(command, str):
            # Unpacking a bare string would pass one argument per character.
            command = shlex.split(command)
        args = ["nomad", "alloc", "exec", "-task", task, alloc_id, *command]
        result = subprocess.run(
            args,
            capture_output=True, text=True, check=True,
        )
        return result.stdout
    except Exception as e:
        # Don't print stack trace, just return error string
        return f"Nomad Error: {str(e)}"
def restart_allocation(alloc_id):
    """Ask Nomad to restart the allocation identified by ``alloc_id``.

    Returns True when ``nomad alloc restart`` exits successfully. On any
    failure the error is printed to stderr and False is returned.
    """
    restart_cmd = ["nomad", "alloc", "restart", alloc_id]
    try:
        subprocess.run(restart_cmd, capture_output=True, text=True, check=True)
    except Exception as err:
        print(f"Error restarting allocation: {err}", file=sys.stderr)
        return False
    else:
        return True