# navidrome-litefs/scripts/cluster_status/nomad_client.py

import subprocess
import re
import sys

def _parse_node_table(output):
    """
    Extract ``{node_id: node_name}`` from ``nomad node status`` table text.

    The index of the ``Name`` column is read from the header row (the line
    starting with "ID") instead of being hard-coded, because the set of
    columns differs between Nomad releases: layouts without a "Node Pool"
    column place ``Name`` at index 2, layouts with it at index 3.
    """
    # Cells are separated by at least two spaces, while a header such as
    # "Node Pool" contains a single space; splitting on 2+ whitespace
    # characters keeps such a header as one cell.
    column_sep = re.compile(r"\s{2,}")
    name_index = None
    node_map = {}

    for line in output.splitlines():
        row = line.strip()
        if not row or line.startswith("=="):
            continue
        if line.startswith("ID"):
            headers = column_sep.split(row)
            if "Name" in headers:
                name_index = headers.index("Name")
            continue

        if name_index is None:
            # No usable header seen: keep the historical behaviour of taking
            # the fourth whitespace-separated token as the node name.
            fields = row.split()
            wanted = 3
        else:
            fields = column_sep.split(row)
            wanted = name_index

        if len(fields) > wanted:
            node_map[fields[0]] = fields[wanted]

    return node_map


def get_node_map():
    """
    Returns a mapping of Node ID to Node Name.

    Runs ``nomad node status`` and parses its table output. Keys are the node
    IDs exactly as printed in the first column of that table.

    Returns:
        dict: ``{node_id: node_name}``. An empty dict is returned (and a
        message is written to stderr) when the ``nomad`` binary is missing,
        the command exits non-zero, or any other error occurs.
    """
    try:
        result = subprocess.run(
            ["nomad", "node", "status"],
            capture_output=True, text=True, check=True
        )
        return _parse_node_table(result.stdout)
    except FileNotFoundError:
        print("Warning: 'nomad' binary not found in PATH.", file=sys.stderr)
        return {}
    except subprocess.CalledProcessError as e:
        print(f"Warning: Failed to query Nomad nodes: {e}", file=sys.stderr)
        return {}
    except Exception as e:
        print(f"Error getting node map: {e}", file=sys.stderr)
        return {}

def get_allocation_id(node_name, job_id):
    """
    Look up the FULL allocation ID of a job's allocation on a given node.

    Scans the rows that follow the "Allocations" line in the output of
    ``nomad job status <job_id>``. The first row whose second column equals
    ``node_name``, or maps to ``node_name`` via get_node_map(), is selected.
    Its first column is then passed to ``nomad alloc status`` and the value
    after "=" on the first line starting with "ID" is returned.

    Returns:
        str | None: the ID from ``nomad alloc status``; the ID as printed by
        ``nomad job status`` if no "ID" line is found; None if no row matches
        or an error occurs (non-FileNotFoundError errors are reported on
        stderr).
    """
    names_by_node_id = get_node_map()
    try:
        job_status = subprocess.run(
            ["nomad", "job", "status", job_id],
            capture_output=True, text=True, check=True
        )

        in_alloc_section = False
        for row in job_status.stdout.splitlines():
            if "Allocations" in row:
                in_alloc_section = True
                continue
            if not in_alloc_section:
                continue
            # Skip blank lines, the table header and "==" separator lines.
            if not row.strip() or row.startswith(("ID", "==")):
                continue

            fields = re.split(r"\s+", row.strip())
            if len(fields) < 2:
                continue

            listed_alloc_id, node_id = fields[0], fields[1]
            candidates = (node_id, names_by_node_id.get(node_id, ""))
            if node_name not in candidates:
                continue

            # Ask "nomad alloc status" for the ID it reports for this row.
            alloc_status = subprocess.run(
                ["nomad", "alloc", "status", listed_alloc_id],
                capture_output=True, text=True, check=True
            )
            for status_line in alloc_status.stdout.splitlines():
                if status_line.startswith("ID"):
                    return status_line.split("=")[1].strip()
            return listed_alloc_id

    except FileNotFoundError:
        return None  # Warning already printed by get_node_map likely
    except Exception as err:
        print(f"Error getting allocation ID: {err}", file=sys.stderr)

    return None

def get_allocation_logs(alloc_id, tail=20, task="navidrome"):
    """
    Fetches the last N lines of stderr for an allocation.

    Args:
        alloc_id: allocation ID passed to ``nomad alloc logs``.
        tail: number of trailing lines to request (``-n``).
        task: task name to request logs for (``-task``). Defaults to
            "navidrome", matching exec_command(). A falsy value skips the
            task-specific attempt.

    Returns:
        str: the command's stdout, or a string starting with "Nomad Error: "
        describing the failure. No exception is propagated.
    """
    try:
        base_cmd = ["nomad", "alloc", "logs", "-stderr"]
        tail_args = ["-n", str(tail), alloc_id]

        # Try with the task name first; if Nomad rejects that invocation,
        # retry once without "-task".
        attempts = []
        if task:
            attempts.append(base_cmd + ["-task", task] + tail_args)
        attempts.append(base_cmd + tail_args)

        for position, cmd in enumerate(attempts):
            is_last = position == len(attempts) - 1
            try:
                result = subprocess.run(
                    cmd,
                    capture_output=True, text=True, check=True
                )
                return result.stdout
            except subprocess.CalledProcessError:
                if is_last:
                    # Nothing left to fall back to; report via outer handler.
                    raise
    except Exception as e:
        # Don't print stack trace, just the error
        return f"Nomad Error: {e}"

def exec_command(alloc_id, command, task="navidrome"):
    """
    Run a command inside one task of an allocation via ``nomad alloc exec``.

    Args:
        alloc_id: allocation ID to execute in.
        command: list of argument strings appended after the allocation ID.
        task: task name passed with ``-task``.

    Returns:
        str: the command's stdout, or a string starting with "Nomad Error: "
        if anything goes wrong (including a non-list ``command``).
    """
    nomad_prefix = ["nomad", "alloc", "exec", "-task", task, alloc_id]
    try:
        completed = subprocess.run(
            nomad_prefix + command,
            capture_output=True, text=True, check=True
        )
    except Exception as err:
        # Report the failure as text rather than raising or printing a trace.
        return "Nomad Error: " + str(err)
    return completed.stdout

def restart_allocation(alloc_id):
    """
    Restart one allocation with ``nomad alloc restart``.

    Returns:
        bool: True when the command exits successfully; False otherwise, in
        which case the error is written to stderr.
    """
    restart_cmd = ["nomad", "alloc", "restart", alloc_id]
    try:
        subprocess.run(restart_cmd, capture_output=True, text=True, check=True)
    except Exception as err:
        print(f"Error restarting allocation: {err}", file=sys.stderr)
        return False
    else:
        return True