Files
qbitcheck/api/nomad_client.py
2025-11-21 13:38:02 -08:00

276 lines
13 KiB
Python

import logging
import time
from datetime import datetime
from typing import Any, Dict, Optional

import requests
class NomadClient:
    """Client for interacting with the Nomad HTTP API.

    The public methods are best-effort: request, HTTP-status and decoding
    errors are logged on ``self.logger`` and reported through the return
    value (``False``, ``None`` or a partial dict) rather than raised.
    """

    # Timestamp value this client treats as "never set".
    _ZERO_TIMESTAMP = '0001-01-01T00:00:00Z'
    # Timeout, in seconds, applied to every HTTP request.
    _REQUEST_TIMEOUT = 10

    def __init__(self, base_url: str = 'http://127.0.0.1:4646',
                 logger: Optional[logging.Logger] = None):
        """
        Initialize Nomad client

        Args:
            base_url: Base URL for Nomad API (a trailing '/' is stripped)
            logger: Optional logger instance (uses module logger if not provided)
        """
        self.base_url = base_url.rstrip('/')
        self.logger = logger or logging.getLogger(__name__)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _auth_headers(token: Optional[str]) -> Dict[str, str]:
        """Return request headers, carrying the ACL token when one is given."""
        return {'X-Nomad-Token': token} if token else {}

    def _get_json(self, path: str, namespace: str, token: Optional[str]) -> Any:
        """GET ``path`` below the base URL and return the decoded JSON body.

        Lets every ``requests`` error (connection failure, non-2xx status via
        ``raise_for_status``, undecodable body) propagate to the caller.
        """
        response = requests.get(
            f"{self.base_url}{path}",
            headers=self._auth_headers(token),
            params={'namespace': namespace},
            timeout=self._REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _find_running_task(allocations: Any, task_name: str) -> Optional[tuple]:
        """Return ``(allocation, task_state)`` for the first running allocation
        whose ``TaskStates`` contains ``task_name``, or ``None`` if there is none.
        """
        for alloc in allocations or []:
            if alloc.get('ClientStatus') != 'running':
                continue
            # 'TaskStates' may be present but null; treat that as "no tasks".
            task_states = alloc.get('TaskStates') or {}
            if task_name in task_states:
                return alloc, (task_states[task_name] or {})
        return None

    @classmethod
    def _parse_timestamp(cls, value: Any) -> Optional[datetime]:
        """Parse an ISO-8601 timestamp string as found in task states.

        Returns ``None`` for a missing value or the zero timestamp.
        Raises ``ValueError`` / ``TypeError`` for malformed input.
        """
        if not value or value == cls._ZERO_TIMESTAMP:
            return None
        if not isinstance(value, str):
            raise TypeError(
                f"timestamp must be a string, got {type(value).__name__}"
            )
        text = value.replace('Z', '+00:00')
        # datetime.fromisoformat() before Python 3.11 only accepts exactly 3
        # or 6 fractional digits, so normalise any other precision (e.g.
        # nanoseconds) to microseconds before parsing.
        head, dot, tail = text.partition('.')
        if dot:
            digit_count = len(tail) - len(tail.lstrip('0123456789'))
            if digit_count:
                fraction = tail[:digit_count][:6].ljust(6, '0')
                text = f"{head}.{fraction}{tail[digit_count:]}"
        return datetime.fromisoformat(text)

    def _describe_elapsed(self, raw_timestamp: Any, now: float,
                          suffix: str = '') -> Optional[str]:
        """Render the time elapsed between ``raw_timestamp`` and ``now``.

        Returns ``None`` when the timestamp is unset, ``'Invalid timestamp'``
        when it cannot be parsed or formatted, and otherwise the
        human-readable duration followed by ``suffix``.
        """
        try:
            moment = self._parse_timestamp(raw_timestamp)
            if moment is None:
                return None
            # Project-local helper, imported lazily so that it is only
            # required when there is actually a timestamp to format.
            from utils.time_utils import format_human_readable_time
            return f"{format_human_readable_time(now - moment.timestamp())}{suffix}"
        except (ValueError, TypeError):
            return 'Invalid timestamp'

    @staticmethod
    def _last_restart_reason(events: Any) -> str:
        """Derive a restart reason from a task's event list.

        Events are scanned newest-first and the first restart-related event
        wins, so the result reflects recency rather than a ranking of event
        types. 'Terminated' events with a missing or zero exit code are
        skipped. Returns 'Normal operation' when nothing matches.

        NOTE(review): Nomad appears to report restarts with the event type
        'Restarting'; the 'Restart' type matched below is kept as-is to
        preserve current output - confirm which one is intended.
        """
        for event in reversed(events or []):
            event_type = event.get('Type')
            if event_type == 'Terminated':
                exit_code = event.get('ExitCode')
                # Only a non-zero exit code counts as a failure.
                if exit_code is not None and exit_code != 0:
                    reason = f"Exit Code: {exit_code}"
                    exit_message = event.get('ExitMessage', '')
                    if exit_message:
                        reason += f", {exit_message}"
                    return reason
            elif event_type == 'Killed':
                return event.get('KillReason') or 'Task killed'
            elif event_type == 'Restart':
                return event.get('RestartReason') or 'Restart initiated'
            elif event_type == 'Restart Signaled':
                return event.get('DriverMessage') or 'Restart signaled'
        # A second "abnormal termination" pass used to follow here; it could
        # only match events the loop above already returns on, so it was
        # unreachable and has been dropped.
        return 'Normal operation'

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def restart_task_via_allocation(self, job_id: str, task_name: str,
                                    namespace: str = "default", wait_time: int = 60,
                                    token: Optional[str] = None) -> bool:
        """
        Restart a specific task in a Nomad job by restarting just that task.

        The task is looked up in the first running allocation of the job that
        lists it, and restarted through that allocation's restart endpoint.

        Args:
            job_id: The ID of the job containing the task
            task_name: The name of the task to restart
            namespace: The namespace of the job (default: 'default')
            wait_time: Seconds to block after a successful restart
                (default: 60); zero, negative or None skips the wait
            token: Optional Nomad ACL token

        Returns:
            bool: True if restart succeeded, False otherwise
        """
        try:
            self.logger.info("Fetching allocations for job '%s'...", job_id)
            allocations = self._get_json(
                f"/v1/job/{job_id}/allocations", namespace, token
            )

            match = self._find_running_task(allocations, task_name)
            if match is None:
                self.logger.error(
                    "No running allocation found for task '%s' in job '%s'",
                    task_name, job_id,
                )
                return False
            allocation, _task_state = match

            # Restart just the specific task
            restart_url = (
                f"{self.base_url}/v1/client/allocation/{allocation['ID']}/restart"
            )
            self.logger.info(
                "Restarting task '%s' in job '%s'...", task_name, job_id
            )
            response = requests.post(
                restart_url,
                headers=self._auth_headers(token),
                params={'namespace': namespace},
                json={"TaskName": task_name},
                timeout=self._REQUEST_TIMEOUT,
            )
            if response.status_code not in (200, 204):
                self.logger.error(
                    "Failed: %s - %s", response.status_code, response.text
                )
                return False

            self.logger.info(
                "Successfully restarted task '%s' in job '%s'", task_name, job_id
            )
            # time.sleep() raises on negative values; without this guard a
            # successful restart would be reported as a failure.
            if wait_time and wait_time > 0:
                time.sleep(wait_time)
            return True
        except Exception as e:  # deliberate best-effort boundary
            self.logger.error("Request failed: %s", e)
            return False

    def check_job_status(self, job_id: str, namespace: str = "default",
                         token: Optional[str] = None) -> Optional[Dict[str, Any]]:
        """
        Check the status of a Nomad job

        Args:
            job_id: The ID of the job to check
            namespace: The namespace of the job
            token: Optional Nomad ACL token

        Returns:
            Decoded JSON body of the job endpoint, or None if the request failed
        """
        try:
            return self._get_json(f"/v1/job/{job_id}", namespace, token)
        except Exception as e:  # deliberate best-effort boundary
            self.logger.error("Failed to get job status: %s", e)
            return None

    def get_allocations(self, job_id: str, namespace: str = "default",
                        token: Optional[str] = None) -> Optional[list]:
        """
        Get allocations for a specific job

        Args:
            job_id: The ID of the job
            namespace: The namespace of the job
            token: Optional Nomad ACL token

        Returns:
            Decoded JSON body of the job's allocations endpoint, or None if
            the request failed
        """
        try:
            return self._get_json(
                f"/v1/job/{job_id}/allocations", namespace, token
            )
        except Exception as e:  # deliberate best-effort boundary
            self.logger.error("Failed to get allocations: %s", e)
            return None

    def get_container_info(self, job_id: str, task_names: list, namespace: str = "default",
                           token: Optional[str] = None) -> Dict[str, Any]:
        """
        Get container information including uptime and restart details for specified tasks

        Args:
            job_id: The ID of the job containing the tasks
            task_names: List of task names to get information for
            namespace: The namespace of the job
            token: Optional Nomad ACL token

        Returns:
            Dictionary mapping each task name to a dict with the keys
            'uptime', 'last_restart_time', 'last_restart_reason' and
            'restart_count'. Tasks not found in any running allocation keep
            the defaults ('Unknown', 'Never', 'None', 0). The dictionary is
            empty when no allocations could be fetched, and may be partial
            if an unexpected error interrupts processing.
        """
        container_info: Dict[str, Any] = {}
        try:
            allocations = self.get_allocations(job_id, namespace, token)
            if not allocations:
                self.logger.warning("No allocations found for job '%s'", job_id)
                return container_info

            now = time.time()
            for task_name in task_names:
                task_info = {
                    'uptime': 'Unknown',
                    'last_restart_time': 'Never',
                    'last_restart_reason': 'None',
                    'restart_count': 0,
                }

                match = self._find_running_task(allocations, task_name)
                if match is not None:
                    _allocation, task_state = match

                    uptime = self._describe_elapsed(task_state.get('StartedAt'), now)
                    if uptime is not None:
                        task_info['uptime'] = uptime

                    since_restart = self._describe_elapsed(
                        task_state.get('LastRestart'), now, suffix=' ago'
                    )
                    if since_restart is not None:
                        task_info['last_restart_time'] = since_restart

                    task_info['restart_count'] = task_state.get('Restarts', 0)
                    task_info['last_restart_reason'] = self._last_restart_reason(
                        task_state.get('Events')
                    )

                container_info[task_name] = task_info
            return container_info
        except Exception as e:  # deliberate best-effort boundary
            self.logger.error("Failed to get container info: %s", e)
            return container_info