first commit
This commit is contained in:
189
monitoring/remediation_manager.py
Normal file
189
monitoring/remediation_manager.py
Normal file
@@ -0,0 +1,189 @@
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
from requests import Session
|
||||
|
||||
from api.qbittorrent_client import QBittorrentClient
|
||||
from api.nomad_client import NomadClient
|
||||
from persistence.state_manager import StateManager
|
||||
from utils.time_utils import format_human_readable_time
|
||||
|
||||
class RemediationManager:
|
||||
"""Manages the remediation state machine for connection issues"""
|
||||
|
||||
def __init__(self,
|
||||
qbittorrent_client: QBittorrentClient,
|
||||
nomad_client: NomadClient,
|
||||
state_manager: StateManager,
|
||||
tracker_name: str,
|
||||
max_consecutive_failures: int = 20,
|
||||
stability_wait_time: int = 1800,
|
||||
stability_duration_required: int = 3600,
|
||||
logger: logging.Logger = None):
|
||||
"""
|
||||
Initialize remediation manager
|
||||
|
||||
Args:
|
||||
qbittorrent_client: QBittorrent client instance
|
||||
nomad_client: Nomad client instance
|
||||
state_manager: State manager instance
|
||||
tracker_name: Tracker name for torrent operations
|
||||
max_consecutive_failures: Maximum failures before remediation
|
||||
stability_wait_time: Time to wait for stability
|
||||
stability_duration_required: Required stability duration
|
||||
logger: Optional logger instance (uses module logger if not provided)
|
||||
"""
|
||||
self.qbittorrent_client = qbittorrent_client
|
||||
self.nomad_client = nomad_client
|
||||
self.state_manager = state_manager
|
||||
self.tracker_name = tracker_name
|
||||
self.max_consecutive_failures = max_consecutive_failures
|
||||
self.stability_wait_time = stability_wait_time
|
||||
self.stability_duration_required = stability_duration_required
|
||||
self.logger = logger or logging.getLogger(__name__)
|
||||
|
||||
# Remediation session
|
||||
self.remediation_session: Optional[Session] = None
|
||||
|
||||
def start_remediation(self) -> bool:
|
||||
"""
|
||||
Start the remediation process (non-blocking)
|
||||
|
||||
Returns:
|
||||
True if remediation started successfully
|
||||
"""
|
||||
self.logger.warning("Connection instability detected. Starting remediation...")
|
||||
|
||||
# Login to qBittorrent
|
||||
self.remediation_session = self.qbittorrent_client.login()
|
||||
if not self.remediation_session:
|
||||
self.logger.error("Could not log in to qBittorrent. Aborting remediation.")
|
||||
return False
|
||||
|
||||
# Update state
|
||||
self.state_manager.start_remediation()
|
||||
self.logger.info(f"Remediation started. State: {self.state_manager.remediation_state}")
|
||||
return True
|
||||
|
||||
def process_remediation(self) -> bool:
|
||||
"""
|
||||
Process the current remediation state (non-blocking)
|
||||
|
||||
Returns:
|
||||
True if remediation completed successfully, False otherwise
|
||||
"""
|
||||
if self.state_manager.remediation_state is None:
|
||||
return False
|
||||
|
||||
try:
|
||||
# Log detailed remediation state information
|
||||
remediation_duration = time.time() - self.state_manager.remediation_start_time
|
||||
self.logger.debug(f"Processing remediation state: {self.state_manager.remediation_state} "
|
||||
f"(duration: {format_human_readable_time(remediation_duration)})")
|
||||
|
||||
if self.state_manager.remediation_state == 'stopping_torrents':
|
||||
return self._process_stopping_torrents()
|
||||
|
||||
elif self.state_manager.remediation_state == 'restarting_nomad':
|
||||
return self._process_restarting_nomad()
|
||||
|
||||
elif self.state_manager.remediation_state == 'waiting_for_stability':
|
||||
return self._process_waiting_for_stability()
|
||||
|
||||
elif self.state_manager.remediation_state == 'restarting_torrents':
|
||||
return self._process_restarting_torrents()
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Unexpected error during remediation: {e}")
|
||||
self.logger.error("Resetting remediation state due to unexpected error")
|
||||
self.state_manager.reset_remediation_state()
|
||||
return False
|
||||
|
||||
return False
|
||||
|
||||
def _process_stopping_torrents(self) -> bool:
|
||||
"""Process stopping torrents state"""
|
||||
if self.qbittorrent_client.stop_tracker_torrents(self.remediation_session, self.tracker_name):
|
||||
self.logger.info("Torrents stopped successfully, proceeding to restart Nomad task")
|
||||
self.state_manager.update_remediation_state('restarting_nomad')
|
||||
return False
|
||||
else:
|
||||
self.logger.error("Failed to stop torrents - retrying in next cycle")
|
||||
return False
|
||||
|
||||
def _process_restarting_nomad(self) -> bool:
|
||||
"""Process restarting nomad state"""
|
||||
if self.nomad_client.restart_task_via_allocation(
|
||||
job_id="qbittorrent",
|
||||
task_name="qbittorrent"
|
||||
):
|
||||
self.logger.info("Nomad task restarted successfully, waiting for stable connectivity")
|
||||
self.state_manager.update_remediation_state('waiting_for_stability')
|
||||
return False
|
||||
else:
|
||||
self.logger.error("Nomad task restart failed - retrying in next cycle")
|
||||
return False
|
||||
|
||||
def _process_waiting_for_stability(self) -> bool:
|
||||
"""Process waiting for stability state"""
|
||||
# Check if we've exceeded the stabilization timeout
|
||||
elapsed_time = time.time() - self.state_manager.remediation_start_time
|
||||
if elapsed_time > self.stability_wait_time:
|
||||
self.logger.error(f"Stabilization timeout reached after {format_human_readable_time(elapsed_time)}")
|
||||
self.state_manager.reset_remediation_state()
|
||||
return False
|
||||
|
||||
# This state just waits - stability checking is handled in main monitor
|
||||
return False
|
||||
|
||||
def _process_restarting_torrents(self) -> bool:
|
||||
"""Process restarting torrents state"""
|
||||
try:
|
||||
if self.qbittorrent_client.restart_tracker_torrents(self.remediation_session, self.tracker_name):
|
||||
self.logger.info("Remediation completed successfully")
|
||||
self.state_manager.reset_remediation_state()
|
||||
return True
|
||||
else:
|
||||
self.logger.error("Failed to restart torrents")
|
||||
self.state_manager.reset_remediation_state()
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to restart torrents: {e}")
|
||||
self.state_manager.reset_remediation_state()
|
||||
return False
|
||||
|
||||
def should_start_remediation(self) -> bool:
|
||||
"""
|
||||
Check if remediation should be started
|
||||
|
||||
Returns:
|
||||
True if remediation should be started
|
||||
"""
|
||||
return (self.state_manager.consecutive_failures >= self.max_consecutive_failures and
|
||||
self.state_manager.remediation_state is None)
|
||||
|
||||
def check_stability_requirement_met(self) -> bool:
|
||||
"""
|
||||
Check if stability requirement has been met
|
||||
|
||||
Returns:
|
||||
True if stability requirement met
|
||||
"""
|
||||
if (self.state_manager.remediation_state == 'waiting_for_stability' and
|
||||
self.state_manager.stability_start_time is not None):
|
||||
|
||||
elapsed_stable_time = time.time() - self.state_manager.stability_start_time
|
||||
if elapsed_stable_time >= self.stability_duration_required:
|
||||
self.logger.info("1 hour of stable connectivity achieved, proceeding to restart torrents")
|
||||
self.state_manager.update_remediation_state('restarting_torrents')
|
||||
self.state_manager.reset_stability_timer()
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def reset_stability_on_connection_loss(self):
|
||||
"""Reset stability timer when connection is lost"""
|
||||
if self.state_manager.stability_start_time is not None:
|
||||
self.logger.warning("Connection lost during stabilization, resetting 1-hour timer")
|
||||
self.state_manager.reset_stability_timer()
|
||||
Reference in New Issue
Block a user