conductor(checkpoint): Checkpoint end of Phase 2

This commit is contained in:
2026-02-08 07:54:49 -08:00
parent a686c5b225
commit 6d77729a4a
7 changed files with 196 additions and 6 deletions

View File

@@ -0,0 +1,98 @@
import subprocess
import re
def get_node_map():
    """
    Build a lookup of Nomad node ID -> node name from `nomad node status`.

    Every non-blank output row that does not start with "ID" or "==" is
    split on whitespace; the first column is used as the key and the fourth
    column as the value (the column this file treats as the node name).
    Rows with fewer than four columns are ignored.

    Returns:
        dict: Mapping of first column to fourth column. An empty dict is
        returned (and the error printed) if the command cannot be run or
        exits with a non-zero status.
    """
    try:
        completed = subprocess.run(
            ["nomad", "node", "status"],
            capture_output=True,
            text=True,
            check=True,
        )
        # Tokenise only the data rows; header and separator rows are dropped.
        data_rows = (
            row.split()
            for row in completed.stdout.splitlines()
            if row.strip() and not row.startswith(("ID", "=="))
        )
        return {
            columns[0]: columns[3]
            for columns in data_rows
            if len(columns) >= 4
        }
    except Exception as err:
        print(f"Error getting node map: {err}")
        return {}
def get_allocation_id(node_name, job_id):
    """
    Look up the full allocation ID of `job_id` on the given node.

    Runs `nomad job status <job_id>`, scans the rows that follow the first
    line containing "Allocations", and picks the first row whose second
    column equals `node_name` directly or resolves to it through
    get_node_map(). The ID from that row is then expanded by reading the
    "ID = ..." line of `nomad alloc status <id>`.

    Returns:
        The full allocation ID; the ID as listed in the job status table if
        the alloc status output has no line starting with "ID"; or None when
        no row matches or any command fails (the error is printed).
    """
    names_by_node_id = get_node_map()
    try:
        job_status = subprocess.run(
            ["nomad", "job", "status", job_id],
            capture_output=True,
            text=True,
            check=True,
        )
        in_alloc_table = False
        for row in job_status.stdout.splitlines():
            if "Allocations" in row:
                in_alloc_table = True
                continue
            if not in_alloc_table or not row.strip():
                continue
            # Skip the table header and any "==" separator rows.
            if row.startswith(("ID", "==")):
                continue

            columns = row.split()
            if len(columns) < 2:
                continue

            listed_alloc_id, node_id = columns[0], columns[1]
            node_label = names_by_node_id.get(node_id, "")
            if node_id != node_name and node_label != node_name:
                continue

            # Now get the FULL ID using nomad alloc status
            alloc_status = subprocess.run(
                ["nomad", "alloc", "status", listed_alloc_id],
                capture_output=True,
                text=True,
                check=True,
            )
            full_ids = (
                detail.split("=")[1].strip()
                for detail in alloc_status.stdout.splitlines()
                if detail.startswith("ID")
            )
            # Fall back to the listed ID when no "ID = ..." line is present.
            return next(full_ids, listed_alloc_id)
    except Exception as err:
        print(f"Error getting allocation ID: {err}")
    return None
def get_allocation_logs(alloc_id, tail=20, task="navidrome"):
    """
    Fetch the last N lines of stderr for an allocation.

    The task-scoped form (`-task <task>`) is tried first. If that command
    exits with a non-zero status, or if `task` is falsy, the request is
    issued without a task name.

    Args:
        alloc_id: Allocation ID passed to `nomad alloc logs`.
        tail: Number of trailing log lines to request (`-n`).
        task: Task name to request logs for. Defaults to "navidrome", the
            value that was previously hard-coded. Pass None or "" to skip
            the task-scoped attempt.

    Returns:
        str: The command's stdout on success, otherwise a message starting
        with "Error fetching logs: ". This function never raises.
    """
    # get_allocation_id() can return None; do not spawn nomad without an ID.
    if not alloc_id:
        return "Error fetching logs: no allocation ID provided"

    try:
        base_cmd = ["nomad", "alloc", "logs", "-stderr"]
        tail_args = ["-n", str(tail), alloc_id]

        if task:
            try:
                result = subprocess.run(
                    base_cmd + ["-task", task] + tail_args,
                    capture_output=True,
                    text=True,
                    check=True,
                )
                return result.stdout
            except subprocess.CalledProcessError:
                # Deliberate best-effort: the task name may not exist in
                # this allocation, so retry below without `-task`.
                pass

        result = subprocess.run(
            base_cmd + tail_args,
            capture_output=True,
            text=True,
            check=True,
        )
        return result.stdout
    except Exception as e:
        # Kept broad on purpose: callers receive a string, never an exception.
        return f"Error fetching logs: {e}"
def restart_allocation(alloc_id):
    """
    Restart one allocation via `nomad alloc restart <alloc_id>`.

    Returns:
        bool: True when the command exits successfully; False when it cannot
        be run or exits with a non-zero status (the error is printed).
    """
    command = ["nomad", "alloc", "restart", alloc_id]
    try:
        subprocess.run(command, capture_output=True, text=True, check=True)
    except Exception as err:
        print(f"Error restarting allocation: {err}")
        return False
    return True