"""Client helpers for interacting with the Nomad HTTP API."""

import logging
import re
import time
from datetime import datetime
from typing import Dict, Any, Optional

import requests


class NomadClient:
    """Client for interacting with the Nomad HTTP API.

    All public methods are best-effort: failures are logged on
    ``self.logger`` and reported through the return value (``False``,
    ``None`` or a partial result) instead of being raised.
    """

    # Placeholder timestamp that means "not set"; it is skipped, never parsed.
    _ZERO_TIMESTAMP = '0001-01-01T00:00:00Z'
    # Matches the first fractional-seconds group of an ISO-8601 timestamp.
    _FRACTION_RE = re.compile(r'\.(\d+)')

    def __init__(self, base_url: str = 'http://127.0.0.1:4646',
                 logger: Optional[logging.Logger] = None,
                 timeout: float = 10):
        """
        Initialize Nomad client

        Args:
            base_url: Base URL for Nomad API (a trailing '/' is stripped)
            logger: Optional logger instance (uses module logger if not provided)
            timeout: Per-request timeout in seconds (default: 10)
        """
        self.base_url = base_url.rstrip('/')
        self.logger = logger or logging.getLogger(__name__)
        self.timeout = timeout

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _auth_headers(token: Optional[str]) -> Dict[str, str]:
        """Return the ACL header for ``token``, or no headers without one."""
        return {'X-Nomad-Token': token} if token else {}

    def _get_json(self, path: str, namespace: str, token: Optional[str]) -> Any:
        """GET ``base_url + path`` and return the decoded JSON body.

        Raises whatever ``requests`` raises (including on HTTP error
        statuses); callers are responsible for catching and logging.
        """
        response = requests.get(
            f"{self.base_url}{path}",
            headers=self._auth_headers(token),
            params={'namespace': namespace},
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _find_running_allocation(allocations: list,
                                 task_name: str) -> Optional[Dict[str, Any]]:
        """Return the first running allocation that has state for ``task_name``."""
        for alloc in allocations:
            if alloc.get('ClientStatus') != 'running':
                continue
            # 'TaskStates' may be present but null in the JSON, so ``.get``
            # with a default is not enough to guarantee a dict.
            if task_name in (alloc.get('TaskStates') or {}):
                return alloc
        return None

    @staticmethod
    def _default_task_info() -> Dict[str, Any]:
        """Return the placeholder record used when nothing is known about a task."""
        return {
            'uptime': 'Unknown',
            'last_restart_time': 'Never',
            'last_restart_reason': 'None',
            'restart_count': 0,
        }

    @classmethod
    def _parse_timestamp(cls, timestamp: str) -> datetime:
        """Parse an ISO-8601 timestamp, tolerating a 'Z' suffix and any
        number of fractional-second digits.

        ``datetime.fromisoformat`` before Python 3.11 only accepts exactly
        3 or 6 fractional digits, so the fraction is truncated/padded to 6.

        Raises:
            ValueError: if the text is not a valid timestamp.
        """
        normalized = cls._FRACTION_RE.sub(
            lambda match: '.' + match.group(1)[:6].ljust(6, '0'),
            timestamp.replace('Z', '+00:00'),
            count=1,
        )
        return datetime.fromisoformat(normalized)

    def _format_elapsed(self, timestamp: str, now: float) -> str:
        """Return the time between ``timestamp`` and ``now`` (epoch seconds)
        as human-readable text.

        Raises:
            ValueError, TypeError: if the timestamp cannot be parsed or
                formatted.
        """
        elapsed_seconds = now - self._parse_timestamp(timestamp).timestamp()
        # Imported lazily, as in the original code, so the project-local
        # module is only required once a valid timestamp has been parsed.
        from utils.time_utils import format_human_readable_time
        return format_human_readable_time(elapsed_seconds)

    @staticmethod
    def _last_restart_reason(events: Optional[list]) -> str:
        """Describe the most recent restart-related event.

        Events are scanned newest-first and the first match wins,
        regardless of its type. 'Terminated' events with a missing or zero
        exit code are not treated as restarts and are skipped.

        Returns:
            The reason text, or 'Normal operation' when no event matches.
        """
        for event in reversed(events or []):
            event_type = event.get('Type')

            if event_type == 'Terminated':
                exit_code = event.get('ExitCode')
                if exit_code is None or exit_code == 0:
                    continue  # clean exit: keep looking at older events
                reason = f"Exit Code: {exit_code}"
                exit_message = event.get('ExitMessage', '')
                if exit_message:
                    reason += f", {exit_message}"
                return reason

            if event_type == 'Killed':
                return event.get('KillReason') or 'Task killed'

            if event_type == 'Restart':
                return event.get('RestartReason') or 'Restart initiated'

            if event_type == 'Restart Signaled':
                return event.get('DriverMessage') or 'Restart signaled'

        return 'Normal operation'

    def _describe_task(self, task_state: Dict[str, Any], now: float) -> Dict[str, Any]:
        """Build the info record for one task from its state dictionary."""
        task_info = self._default_task_info()

        started_at = task_state.get('StartedAt')
        if started_at and started_at != self._ZERO_TIMESTAMP:
            try:
                task_info['uptime'] = self._format_elapsed(started_at, now)
            except (ValueError, TypeError):
                task_info['uptime'] = 'Invalid timestamp'

        last_restart = task_state.get('LastRestart')
        if last_restart and last_restart != self._ZERO_TIMESTAMP:
            try:
                task_info['last_restart_time'] = (
                    f"{self._format_elapsed(last_restart, now)} ago"
                )
            except (ValueError, TypeError):
                task_info['last_restart_time'] = 'Invalid timestamp'

        task_info['restart_count'] = task_state.get('Restarts', 0)
        task_info['last_restart_reason'] = self._last_restart_reason(
            task_state.get('Events')
        )
        return task_info

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def restart_task_via_allocation(self, job_id: str, task_name: str,
                                    namespace: str = "default", wait_time: int = 60,
                                    token: Optional[str] = None) -> bool:
        """
        Restart a specific task in a Nomad job by restarting just that task.

        The task is restarted in the first running allocation of the job
        that reports state for it.

        Args:
            job_id: The ID of the job containing the task
            task_name: The name of the task to restart
            namespace: The namespace of the job (default: 'default')
            wait_time: Seconds to block after a successful restart
                (default: 60); zero, negative or None skips the wait
            token: Optional Nomad ACL token

        Returns:
            bool: True if restart succeeded, False otherwise
        """
        try:
            self.logger.info("Fetching allocations for job '%s'...", job_id)
            allocations = self.get_allocations(job_id, namespace, token)
            if allocations is None:
                # get_allocations has already logged the failure.
                return False

            target_alloc = self._find_running_allocation(allocations, task_name)
            if not target_alloc:
                self.logger.error(
                    "No running allocation found for task '%s' in job '%s'",
                    task_name, job_id,
                )
                return False

            restart_url = (
                f"{self.base_url}/v1/client/allocation/{target_alloc['ID']}/restart"
            )
            self.logger.info("Restarting task '%s' in job '%s'...", task_name, job_id)
            response = requests.post(
                restart_url,
                headers=self._auth_headers(token),
                params={'namespace': namespace},
                json={"TaskName": task_name},
                timeout=self.timeout,
            )

            if response.status_code not in (200, 204):
                self.logger.error("Failed: %s - %s", response.status_code, response.text)
                return False

            self.logger.info(
                "Successfully restarted task '%s' in job '%s'", task_name, job_id
            )
            # time.sleep() rejects negative values; a bad wait_time must not
            # turn a successful restart into a reported failure.
            if wait_time and wait_time > 0:
                time.sleep(wait_time)
            return True

        except Exception as e:
            self.logger.error("Request failed: %s", e)
            return False

    def check_job_status(self, job_id: str, namespace: str = "default",
                         token: Optional[str] = None) -> Optional[Dict[str, Any]]:
        """
        Check the status of a Nomad job

        Args:
            job_id: The ID of the job to check
            namespace: The namespace of the job
            token: Optional Nomad ACL token

        Returns:
            Job status information or None if failed
        """
        try:
            return self._get_json(f"/v1/job/{job_id}", namespace, token)
        except Exception as e:
            self.logger.error("Failed to get job status: %s", e)
            return None

    def get_allocations(self, job_id: str, namespace: str = "default",
                        token: Optional[str] = None) -> Optional[list]:
        """
        Get allocations for a specific job

        Args:
            job_id: The ID of the job
            namespace: The namespace of the job
            token: Optional Nomad ACL token

        Returns:
            List of allocations or None if failed
        """
        try:
            return self._get_json(f"/v1/job/{job_id}/allocations", namespace, token)
        except Exception as e:
            self.logger.error("Failed to get allocations: %s", e)
            return None

    def get_container_info(self, job_id: str, task_names: list, namespace: str = "default",
                           token: Optional[str] = None) -> Dict[str, Any]:
        """
        Get container information including uptime and restart details for specified tasks

        Each task is looked up in the first running allocation that reports
        state for it. Tasks with no such allocation get the placeholder
        record ('Unknown' / 'Never' / 'None' / 0).

        Args:
            job_id: The ID of the job containing the tasks
            task_names: List of task names to get information for
            namespace: The namespace of the job
            token: Optional Nomad ACL token

        Returns:
            Dictionary mapping each task name to a dict with the keys
            'uptime', 'last_restart_time', 'last_restart_reason' and
            'restart_count'. Empty when the job has no allocations; may be
            partial if an unexpected error interrupts processing.
        """
        container_info: Dict[str, Any] = {}

        try:
            allocations = self.get_allocations(job_id, namespace, token)
            if not allocations:
                self.logger.warning("No allocations found for job '%s'", job_id)
                return container_info

            # One reference instant so every task is measured consistently.
            current_time = time.time()

            for task_name in task_names:
                alloc = self._find_running_allocation(allocations, task_name)
                if alloc is None:
                    container_info[task_name] = self._default_task_info()
                    continue
                task_state = alloc['TaskStates'][task_name] or {}
                container_info[task_name] = self._describe_task(task_state, current_time)

            return container_info

        except Exception as e:
            self.logger.error("Failed to get container info: %s", e)
            return container_info