# Source file: qbitcheck/monitoring/remediation_manager.py
# Last modified: 2025-11-21 13:38:02 -08:00
# Size: 189 lines, 8.1 KiB (Python)

import logging
import time
from typing import Optional
from requests import Session
from api.qbittorrent_client import QBittorrentClient
from api.nomad_client import NomadClient
from persistence.state_manager import StateManager
from utils.time_utils import format_human_readable_time
class RemediationManager:
    """Manages the remediation state machine for connection issues.

    Each call to :meth:`process_remediation` performs at most one step,
    chosen by ``state_manager.remediation_state``:

    * ``'stopping_torrents'``     - stop the torrents of ``tracker_name``
    * ``'restarting_nomad'``      - restart the qBittorrent Nomad task
    * ``'waiting_for_stability'`` - wait for a sustained stable connection
    * ``'restarting_torrents'``   - start the tracker's torrents again

    The state itself is owned by the injected state manager; this class only
    runs the action for the current state and requests transitions.
    """

    def __init__(self,
                 qbittorrent_client: QBittorrentClient,
                 nomad_client: NomadClient,
                 state_manager: StateManager,
                 tracker_name: str,
                 max_consecutive_failures: int = 20,
                 stability_wait_time: int = 1800,
                 stability_duration_required: int = 3600,
                 logger: Optional[logging.Logger] = None,
                 nomad_job_id: str = "qbittorrent",
                 nomad_task_name: str = "qbittorrent"):
        """
        Initialize remediation manager

        Args:
            qbittorrent_client: QBittorrent client instance
            nomad_client: Nomad client instance
            state_manager: State manager instance
            tracker_name: Tracker name for torrent operations
            max_consecutive_failures: Consecutive failures required before
                remediation may start
            stability_wait_time: Seconds (measured from the remediation start
                time) to wait for a stability window to begin before the
                remediation is abandoned
            stability_duration_required: Seconds of uninterrupted stability
                required before torrents are restarted
            logger: Optional logger instance (uses module logger if not provided)
            nomad_job_id: Nomad job to restart (defaults to "qbittorrent")
            nomad_task_name: Task within the job to restart (defaults to
                "qbittorrent")
        """
        self.qbittorrent_client = qbittorrent_client
        self.nomad_client = nomad_client
        self.state_manager = state_manager
        self.tracker_name = tracker_name
        self.max_consecutive_failures = max_consecutive_failures
        self.stability_wait_time = stability_wait_time
        self.stability_duration_required = stability_duration_required
        self.logger = logger or logging.getLogger(__name__)
        self.nomad_job_id = nomad_job_id
        self.nomad_task_name = nomad_task_name

        # Authenticated qBittorrent session used for torrent operations
        self.remediation_session: Optional[Session] = None

    def _refresh_session(self) -> bool:
        """Log in to qBittorrent and store the new session.

        The previously stored session is kept untouched when the login does
        not yield a session.

        Returns:
            True if a new session was obtained and stored
        """
        fresh_session = self.qbittorrent_client.login()
        if not fresh_session:
            return False
        self.remediation_session = fresh_session
        return True

    def start_remediation(self) -> bool:
        """
        Start the remediation process (non-blocking)

        Logs in to qBittorrent and, on success, asks the state manager to
        enter remediation.

        Returns:
            True if remediation started successfully
        """
        self.logger.warning("Connection instability detected. Starting remediation...")

        # Login to qBittorrent
        self.remediation_session = self.qbittorrent_client.login()
        if not self.remediation_session:
            self.logger.error("Could not log in to qBittorrent. Aborting remediation.")
            return False

        # Update state
        self.state_manager.start_remediation()
        self.logger.info(f"Remediation started. State: {self.state_manager.remediation_state}")
        return True

    def process_remediation(self) -> bool:
        """
        Process the current remediation state (non-blocking)

        Any exception raised by a step is logged with its traceback and the
        remediation state is reset.

        Returns:
            True if remediation completed successfully, False otherwise
        """
        state = self.state_manager.remediation_state
        if state is None:
            return False

        handlers = {
            'stopping_torrents': self._process_stopping_torrents,
            'restarting_nomad': self._process_restarting_nomad,
            'waiting_for_stability': self._process_waiting_for_stability,
            'restarting_torrents': self._process_restarting_torrents,
        }

        try:
            # Only build the duration string when it will actually be logged
            if self.logger.isEnabledFor(logging.DEBUG):
                remediation_duration = time.time() - self.state_manager.remediation_start_time
                self.logger.debug(f"Processing remediation state: {state} "
                                  f"(duration: {format_human_readable_time(remediation_duration)})")

            handler = handlers.get(state)
            if handler is None:
                # Previously ignored silently, which made a stuck state invisible
                self.logger.warning(f"Unknown remediation state {state!r}; nothing to process")
                return False
            return handler()
        except Exception as e:
            # Top-level boundary of the state machine: keep the traceback
            self.logger.exception(f"Unexpected error during remediation: {e}")
            self.logger.error("Resetting remediation state due to unexpected error")
            self.state_manager.reset_remediation_state()
            return False

    def _process_stopping_torrents(self) -> bool:
        """Process stopping torrents state.

        Always returns False: stopping torrents never completes remediation.
        """
        # The state can be present without a session created by this
        # instance; log in on demand instead of passing None to the client.
        if not self.remediation_session and not self._refresh_session():
            self.logger.error("Could not log in to qBittorrent - retrying in next cycle")
            return False

        if self.qbittorrent_client.stop_tracker_torrents(self.remediation_session, self.tracker_name):
            self.logger.info("Torrents stopped successfully, proceeding to restart Nomad task")
            self.state_manager.update_remediation_state('restarting_nomad')
        else:
            self.logger.error("Failed to stop torrents - retrying in next cycle")
        return False

    def _process_restarting_nomad(self) -> bool:
        """Process restarting nomad state.

        Always returns False: restarting the task never completes remediation.
        """
        restarted = self.nomad_client.restart_task_via_allocation(
            job_id=self.nomad_job_id,
            task_name=self.nomad_task_name
        )
        if restarted:
            self.logger.info("Nomad task restarted successfully, waiting for stable connectivity")
            self.state_manager.update_remediation_state('waiting_for_stability')
        else:
            self.logger.error("Nomad task restart failed - retrying in next cycle")
        return False

    def _process_waiting_for_stability(self) -> bool:
        """Process waiting for stability state.

        The timeout only applies while no stability window is running.
        Otherwise, with the default settings (1800 s timeout, 3600 s of
        required stability), the timeout would always abort the remediation
        before the stability requirement could be met.

        Always returns False; the transition out of this state is made by
        check_stability_requirement_met().
        """
        if self.state_manager.stability_start_time is not None:
            # Connectivity is currently stable; let the window run its course
            return False

        # Check if we've exceeded the stabilization timeout
        elapsed_time = time.time() - self.state_manager.remediation_start_time
        if elapsed_time > self.stability_wait_time:
            self.logger.error(f"Stabilization timeout reached after {format_human_readable_time(elapsed_time)}")
            self.state_manager.reset_remediation_state()

        # This state just waits - stability checking is handled in main monitor
        return False

    def _process_restarting_torrents(self) -> bool:
        """Process restarting torrents state.

        Returns:
            True if the torrents were restarted (remediation complete)
        """
        try:
            # The stored session was created before the qBittorrent task was
            # restarted and before the stability wait, so it has most likely
            # expired; authenticate again before touching torrents.
            if not self._refresh_session():
                if not self.remediation_session:
                    self.logger.error("Failed to restart torrents: could not log in to qBittorrent")
                    self.state_manager.reset_remediation_state()
                    return False
                self.logger.warning("Could not refresh qBittorrent session, trying the existing one")

            if self.qbittorrent_client.restart_tracker_torrents(self.remediation_session, self.tracker_name):
                self.logger.info("Remediation completed successfully")
                self.state_manager.reset_remediation_state()
                return True

            # NOTE(review): the state is reset on failure, so the torrents
            # stay stopped until something else restarts them - confirm this
            # is the intended policy.
            self.logger.error("Failed to restart torrents")
            self.state_manager.reset_remediation_state()
            return False
        except Exception as e:
            self.logger.error(f"Failed to restart torrents: {e}")
            self.state_manager.reset_remediation_state()
            return False

    def should_start_remediation(self) -> bool:
        """
        Check if remediation should be started

        Returns:
            True if the failure threshold is reached and no remediation is
            currently in progress
        """
        return (self.state_manager.consecutive_failures >= self.max_consecutive_failures and
                self.state_manager.remediation_state is None)

    def check_stability_requirement_met(self) -> bool:
        """
        Check if stability requirement has been met

        When it has, the state moves to 'restarting_torrents' and the
        stability timer is reset.

        Returns:
            True if stability requirement met
        """
        if self.state_manager.remediation_state != 'waiting_for_stability':
            return False
        if self.state_manager.stability_start_time is None:
            return False

        elapsed_stable_time = time.time() - self.state_manager.stability_start_time
        if elapsed_stable_time < self.stability_duration_required:
            return False

        # Report the configured duration rather than a hard-coded "1 hour"
        self.logger.info(f"{format_human_readable_time(self.stability_duration_required)} of stable "
                         f"connectivity achieved, proceeding to restart torrents")
        self.state_manager.update_remediation_state('restarting_torrents')
        self.state_manager.reset_stability_timer()
        return True

    def reset_stability_on_connection_loss(self):
        """Reset stability timer when connection is lost"""
        if self.state_manager.stability_start_time is not None:
            self.logger.warning("Connection lost during stabilization, resetting stability timer")
            self.state_manager.reset_stability_timer()