conductor(checkpoint): Checkpoint end of Phase 2
This commit is contained in:
98
scripts/cluster_status/nomad_client.py
Normal file
98
scripts/cluster_status/nomad_client.py
Normal file
@@ -0,0 +1,98 @@
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
def get_node_map():
    """
    Returns a mapping of Node ID to Node Name.

    Runs ``nomad node status`` and parses the table it prints. The IDs are
    whatever appears in the first column of that table.

    Returns:
        dict: ``{node_id: node_name}``; an empty dict if the command cannot
        be run, exits non-zero, or produces no parsable rows.
    """
    try:
        result = subprocess.run(
            ["nomad", "node", "status"],
            capture_output=True, text=True, check=True,
        )
        return _parse_node_table(result.stdout)
    except Exception as e:
        # Deliberate best-effort boundary: callers expect a dict, never an
        # exception, so any failure is reported and mapped to "no nodes".
        print(f"Error getting node map: {e}")
        return {}


def _parse_node_table(output):
    """Parse the text table from ``nomad node status`` into ``{id: name}``.

    The position of the ``Name`` column is taken from the header row when one
    is present, instead of assuming it is always the fourth column. If no
    usable header is found, the fourth column is used.
    """
    # NOTE(review): assumes the CLI separates columns with two or more spaces
    # (so a header such as "Node Pool" stays one cell) -- confirm against the
    # Nomad version in use. Rows that do not split that way fall back to a
    # plain whitespace split.
    column_gap = re.compile(r"\s{2,}")
    name_index = 3
    node_map = {}

    for raw_line in output.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("=="):
            continue

        if line.startswith("ID"):
            # Header row: locate the Name column rather than hard-coding it.
            headers = column_gap.split(line)
            if "Name" in headers:
                name_index = headers.index("Name")
            continue

        cells = column_gap.split(line)
        if len(cells) <= name_index:
            cells = line.split()
        if len(cells) > name_index:
            node_map[cells[0]] = cells[name_index]

    return node_map
|
||||
|
||||
def get_allocation_id(node_name, job_id):
    """
    Finds the FULL allocation ID for a specific node and job.

    Scans the rows that follow the "Allocations" heading in the output of
    ``nomad job status <job_id>`` and picks the first row whose node column
    equals ``node_name`` directly, or resolves to ``node_name`` through
    ``get_node_map()``. The ID from that row is then expanded by reading the
    ``ID = ...`` line of ``nomad alloc status``.

    Args:
        node_name: Node name, or node ID as it appears in the job status table.
        job_id: Job identifier passed to ``nomad job status``.

    Returns:
        The ID reported by ``nomad alloc status``; the ID from the job status
        table if no ``ID = ...`` line is found; ``None`` if no allocation
        matches or any command fails.
    """
    node_map = get_node_map()
    try:
        job_status = subprocess.run(
            ["nomad", "job", "status", job_id],
            capture_output=True, text=True, check=True,
        )

        in_allocations = False
        for raw_line in job_status.stdout.splitlines():
            if "Allocations" in raw_line:
                in_allocations = True
                continue

            line = raw_line.strip()
            if not in_allocations or not line or line.startswith(("ID", "==")):
                continue

            parts = line.split()
            if len(parts) < 2:
                continue
            alloc_id, node_id = parts[0], parts[1]

            # Only compare against a resolved name when the node is actually
            # in the map, so an empty node_name cannot match unknown nodes.
            resolved_name = node_map.get(node_id)
            matches = node_id == node_name or (
                resolved_name is not None and resolved_name == node_name
            )
            if not matches:
                continue

            # Now get the FULL ID using nomad alloc status
            alloc_status = subprocess.run(
                ["nomad", "alloc", "status", alloc_id],
                capture_output=True, text=True, check=True,
            )
            for status_line in alloc_status.stdout.splitlines():
                # partition() never raises on a line without "=", so a table
                # header starting with "ID" is skipped instead of aborting.
                key, sep, value = status_line.partition("=")
                if sep and key.strip() == "ID" and value.strip():
                    return value.strip()
            return alloc_id

    except Exception as e:
        # Deliberate best-effort boundary: report and fall through to None.
        print(f"Error getting allocation ID: {e}")

    return None
|
||||
|
||||
def get_allocation_logs(alloc_id, tail=20, task="navidrome"):
    """
    Fetches the last N lines of stderr for an allocation.

    Runs ``nomad alloc logs -stderr`` with ``-task <task>`` first. If that
    command exits non-zero, it is retried without ``-task``.

    Args:
        alloc_id: Allocation ID passed to ``nomad alloc logs``.
        tail: Number of lines requested via ``-n``.
        task: Task name for the first attempt. Defaults to ``"navidrome"``
            (the previously hard-coded value). Pass ``None`` or ``""`` to go
            straight to the attempt without ``-task``.

    Returns:
        str: The command's stdout on success, otherwise a string starting
        with ``"Error fetching logs: "``. This function does not raise.
    """
    base_cmd = ["nomad", "alloc", "logs", "-stderr"]
    try:
        tail_args = ["-n", str(tail), alloc_id]

        if task:
            try:
                result = subprocess.run(
                    base_cmd + ["-task", task] + tail_args,
                    capture_output=True, text=True, check=True,
                )
                return result.stdout
            except subprocess.CalledProcessError:
                # Task-specific lookup failed; retry below without -task.
                pass

        result = subprocess.run(
            base_cmd + tail_args,
            capture_output=True, text=True, check=True,
        )
        return result.stdout
    except Exception as e:
        # Deliberate best-effort boundary: callers get text either way.
        return f"Error fetching logs: {e}"
|
||||
|
||||
def restart_allocation(alloc_id):
    """
    Restarts a specific allocation.

    Runs ``nomad alloc restart <alloc_id>``.

    Args:
        alloc_id: Allocation ID to restart. A missing ID (``None`` or ``""``)
            is rejected without running the command.

    Returns:
        bool: ``True`` if the command exited with status 0, ``False``
        otherwise. Failures are printed; this function does not raise.
    """
    if not alloc_id:
        print("Error restarting allocation: no allocation ID provided")
        return False

    try:
        subprocess.run(
            ["nomad", "alloc", "restart", alloc_id],
            capture_output=True, text=True, check=True,
        )
    except Exception as e:
        # The exception text for a non-zero exit omits the captured stderr,
        # so append it to show why the command failed.
        detail = (getattr(e, "stderr", None) or "").strip()
        suffix = f" ({detail})" if detail else ""
        print(f"Error restarting allocation: {e}{suffix}")
        return False

    return True
|
||||
Reference in New Issue
Block a user