import logging
import re
import time
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import requests


class NomadClient:
    """Client for interacting with the Nomad HTTP API.

    All public methods are best-effort: failures are logged through
    ``self.logger`` and reported via the return value (``False``, ``None``
    or a partial dict) instead of being raised to the caller.
    """

    # Timeout, in seconds, applied to every HTTP request made by this client.
    REQUEST_TIMEOUT = 10

    # Placeholder timestamp that the code treats as "never happened".
    _UNSET_TIMESTAMP = '0001-01-01T00:00:00Z'

    # Matches the fractional-seconds part of an ISO 8601 timestamp.
    _FRACTION_RE = re.compile(r'\.(\d+)')

    def __init__(self, base_url: str = 'http://127.0.0.1:4646',
                 logger: Optional[logging.Logger] = None):
        """
        Initialize Nomad client

        Args:
            base_url: Base URL for Nomad API (a trailing '/' is stripped)
            logger: Optional logger instance (uses module logger if not provided)
        """
        self.base_url = base_url.rstrip('/')
        self.logger = logger or logging.getLogger(__name__)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _auth_headers(token: Optional[str]) -> Dict[str, str]:
        """Return the request headers, including the ACL token when given."""
        return {'X-Nomad-Token': token} if token else {}

    def _get_json(self, path: str, namespace: str, token: Optional[str]) -> Any:
        """GET ``path`` under the base URL and return the decoded JSON body.

        Raises whatever ``requests`` raises (connection errors, HTTP error
        statuses via ``raise_for_status``, JSON decoding errors); callers are
        expected to handle them.
        """
        response = requests.get(
            f"{self.base_url}{path}",
            headers=self._auth_headers(token),
            params={'namespace': namespace},
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _find_running_task(
        allocations: Optional[list], task_name: str
    ) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, Any]]]:
        """Return ``(allocation, task_state)`` for the first running allocation
        that lists ``task_name``, or ``(None, None)`` when there is none."""
        for alloc in allocations or []:
            if alloc.get('ClientStatus') != 'running':
                continue
            # ``or {}`` guards against a key that is present but null, which
            # ``.get(key, {})`` alone would hand back as None.
            task_states = alloc.get('TaskStates') or {}
            if task_name in task_states:
                return alloc, task_states[task_name]
        return None, None

    @classmethod
    def _parse_timestamp(cls, value: str) -> datetime:
        """Parse an ISO 8601 timestamp string into a ``datetime``.

        Before Python 3.11 ``datetime.fromisoformat`` rejects a trailing 'Z'
        and any fractional-seconds part that is not exactly 3 or 6 digits, so
        the 'Z' is rewritten as '+00:00' and the fraction is truncated or
        zero-padded to 6 digits first.

        Raises:
            ValueError: if the text is still not a valid ISO 8601 timestamp.
        """
        text = value.replace('Z', '+00:00')
        text = cls._FRACTION_RE.sub(
            lambda match: '.' + match.group(1)[:6].ljust(6, '0'),
            text,
            count=1,
        )
        return datetime.fromisoformat(text)

    @staticmethod
    def _humanize(seconds: float) -> str:
        """Format a duration in seconds using the project's time helper."""
        # Imported at call time, so this module can be imported even when
        # utils.time_utils cannot be resolved.
        from utils.time_utils import format_human_readable_time
        return format_human_readable_time(seconds)

    def _format_elapsed(self, timestamp: Any, now: float,
                        suffix: str = '') -> Optional[str]:
        """Describe how long ago ``timestamp`` was, relative to ``now``.

        Returns:
            None when the timestamp is missing or is the unset placeholder,
            'Invalid timestamp' when it cannot be parsed, otherwise the
            human-readable duration followed by ``suffix``.
        """
        if not timestamp or timestamp == self._UNSET_TIMESTAMP:
            return None
        try:
            moment = self._parse_timestamp(timestamp)
            return f"{self._humanize(now - moment.timestamp())}{suffix}"
        except (ValueError, TypeError, AttributeError):
            # AttributeError covers a non-string timestamp value.
            return 'Invalid timestamp'

    @staticmethod
    def _last_restart_reason(events: Optional[list]) -> str:
        """Return the reason given by the most recent restart-related event.

        Events are scanned newest-first. The first event that is 'Killed',
        'Restart', 'Restart Signaled', or 'Terminated' with a non-zero exit
        code decides the result. 'Terminated' events with a zero or missing
        exit code are skipped. Returns 'Normal operation' when nothing matches.
        """
        for event in reversed(events or []):
            event_type = event.get('Type')
            if event_type == 'Terminated':
                exit_code = event.get('ExitCode')
                # Only a non-zero exit code counts as a failure.
                if exit_code is not None and exit_code != 0:
                    reason = f"Exit Code: {exit_code}"
                    exit_message = event.get('ExitMessage', '')
                    if exit_message:
                        reason += f", {exit_message}"
                    return reason
            elif event_type == 'Killed':
                return event.get('KillReason') or 'Task killed'
            elif event_type == 'Restart':
                return event.get('RestartReason') or 'Restart initiated'
            elif event_type == 'Restart Signaled':
                return event.get('DriverMessage') or 'Restart signaled'
        return 'Normal operation'

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def restart_task_via_allocation(self, job_id: str, task_name: str,
                                    namespace: str = "default",
                                    wait_time: int = 60,
                                    token: Optional[str] = None) -> bool:
        """
        Restart a specific task in a Nomad job by restarting just that task.

        The first running allocation of the job that lists the task is
        selected, and a restart request for that task is posted to it.

        Args:
            job_id: The ID of the job containing the task
            task_name: The name of the task to restart
            namespace: The namespace of the job (default: 'default')
            wait_time: Seconds to wait after a successful restart
                (default: 60); values <= 0 skip the wait
            token: Optional Nomad ACL token

        Returns:
            bool: True if restart succeeded, False otherwise
        """
        try:
            self.logger.info(f"Fetching allocations for job '{job_id}'...")
            allocations = self._get_json(
                f"/v1/job/{job_id}/allocations", namespace, token
            )

            target_alloc, _ = self._find_running_task(allocations, task_name)
            if target_alloc is None:
                self.logger.error(
                    f"No running allocation found for task '{task_name}' in job '{job_id}'"
                )
                return False

            # Restart just the specific task
            alloc_id = target_alloc['ID']
            restart_url = f"{self.base_url}/v1/client/allocation/{alloc_id}/restart"

            self.logger.info(f"Restarting task '{task_name}' in job '{job_id}'...")
            response = requests.post(
                restart_url,
                headers=self._auth_headers(token),
                params={'namespace': namespace},
                json={"TaskName": task_name},
                timeout=self.REQUEST_TIMEOUT,
            )

            if response.status_code in (200, 204):
                self.logger.info(
                    f"Successfully restarted task '{task_name}' in job '{job_id}'"
                )
                # time.sleep() raises ValueError on negative input, which
                # would otherwise turn a successful restart into a failure.
                if wait_time > 0:
                    time.sleep(wait_time)
                return True

            self.logger.error(f"Failed: {response.status_code} - {response.text}")
            return False

        except Exception as e:
            self.logger.error(f"Request failed: {e}")
            return False

    def check_job_status(self, job_id: str, namespace: str = "default",
                         token: Optional[str] = None) -> Optional[Dict[str, Any]]:
        """
        Check the status of a Nomad job

        Args:
            job_id: The ID of the job to check
            namespace: The namespace of the job
            token: Optional Nomad ACL token

        Returns:
            Decoded JSON response for the job, or None if the request failed
        """
        try:
            return self._get_json(f"/v1/job/{job_id}", namespace, token)
        except Exception as e:
            self.logger.error(f"Failed to get job status: {e}")
            return None

    def get_allocations(self, job_id: str, namespace: str = "default",
                        token: Optional[str] = None) -> Optional[list]:
        """
        Get allocations for a specific job

        Args:
            job_id: The ID of the job
            namespace: The namespace of the job
            token: Optional Nomad ACL token

        Returns:
            List of allocations or None if failed
        """
        try:
            return self._get_json(
                f"/v1/job/{job_id}/allocations", namespace, token
            )
        except Exception as e:
            self.logger.error(f"Failed to get allocations: {e}")
            return None

    def get_container_info(self, job_id: str, task_names: List[str],
                           namespace: str = "default",
                           token: Optional[str] = None) -> Dict[str, Any]:
        """
        Get container information including uptime and restart details for specified tasks

        Args:
            job_id: The ID of the job containing the tasks
            task_names: List of task names to get information for
            namespace: The namespace of the job
            token: Optional Nomad ACL token

        Returns:
            Dictionary keyed by task name. Each value has the keys 'uptime',
            'last_restart_time', 'last_restart_reason' and 'restart_count'.
            Tasks not found in any running allocation keep the placeholder
            values. The dict is empty when no allocations could be fetched and
            may be partial if an unexpected error interrupts processing.
        """
        container_info: Dict[str, Any] = {}

        try:
            allocations = self.get_allocations(job_id, namespace, token)
            if not allocations:
                self.logger.warning(f"No allocations found for job '{job_id}'")
                return container_info

            current_time = time.time()

            for task_name in task_names:
                task_info = {
                    'uptime': 'Unknown',
                    'last_restart_time': 'Never',
                    'last_restart_reason': 'None',
                    'restart_count': 0
                }

                _, task_state = self._find_running_task(allocations, task_name)
                if task_state is not None:
                    uptime = self._format_elapsed(
                        task_state.get('StartedAt'), current_time
                    )
                    if uptime is not None:
                        task_info['uptime'] = uptime

                    since_restart = self._format_elapsed(
                        task_state.get('LastRestart'), current_time, suffix=' ago'
                    )
                    if since_restart is not None:
                        task_info['last_restart_time'] = since_restart

                    task_info['restart_count'] = task_state.get('Restarts', 0)
                    task_info['last_restart_reason'] = self._last_restart_reason(
                        task_state.get('Events')
                    )

                container_info[task_name] = task_info

            return container_info

        except Exception as e:
            self.logger.error(f"Failed to get container info: {e}")
            return container_info