first commit
This commit is contained in:
8
monitoring/__init__.py
Normal file
8
monitoring/__init__.py
Normal file
@@ -0,0 +1,8 @@
"""
Monitoring components for qBittorrent connection monitoring.

This package re-exports the two classes defined in its submodules so that
callers can import them directly from ``monitoring``:

* ``ConnectionMonitor`` (from ``connection_monitor``)
* ``RemediationManager`` (from ``remediation_manager``)
"""

from .connection_monitor import ConnectionMonitor
from .remediation_manager import RemediationManager

# Names exported by ``from monitoring import *``.
__all__ = ['ConnectionMonitor', 'RemediationManager']
|
||||
BIN
monitoring/__pycache__/__init__.cpython-313.pyc
Normal file
BIN
monitoring/__pycache__/__init__.cpython-313.pyc
Normal file
Binary file not shown.
BIN
monitoring/__pycache__/connection_monitor.cpython-313.pyc
Normal file
BIN
monitoring/__pycache__/connection_monitor.cpython-313.pyc
Normal file
Binary file not shown.
BIN
monitoring/__pycache__/remediation_manager.cpython-313.pyc
Normal file
BIN
monitoring/__pycache__/remediation_manager.cpython-313.pyc
Normal file
Binary file not shown.
428
monitoring/connection_monitor.py
Normal file
428
monitoring/connection_monitor.py
Normal file
@@ -0,0 +1,428 @@
|
||||
import logging
import time
from typing import Any, Dict, Optional

from api.qbittorrent_client import QBittorrentClient
from api.nomad_client import NomadClient
from api.vpn_client import VPNClient
from persistence.state_manager import StateManager
from monitoring.remediation_manager import RemediationManager
from utils.time_utils import format_human_readable_time
from utils.formatters import format_connection_status
|
||||
|
||||
class ConnectionMonitor:
    """Main connection monitoring class.

    Periodically polls the qBittorrent client for its connection status and
    the VPN client for VPN status / public IP, classifies the connection as
    ``'stable'`` or ``'unstable'``, records the result in the state manager
    and hands persistent failures over to the ``RemediationManager``.
    """

    # Nomad job, tasks and namespace whose container status is logged.
    _NOMAD_JOB_ID = "qbittorrent"
    _NOMAD_TASK_NAMES = ("dante", "qbittorrent", "qbittorrent-vpn")
    _NOMAD_NAMESPACE = "default"

    # Class-level defaults so the IP-tracking attributes always exist, even
    # before the first successful public-IP lookup.
    _last_public_ip: Optional[str] = None
    _public_ip_changed: bool = False

    def __init__(self,
                 qbittorrent_url: str = 'http://127.0.0.1:8080',
                 nomad_url: str = 'http://127.0.0.1:4646',
                 tracker_name: str = 'myanon',
                 consul_url: str = 'http://consul.service.dc1.consul:8500',
                 check_interval: int = 30,
                 max_consecutive_failures: int = 20,
                 stability_wait_time: int = 3720,
                 stability_duration_required: int = 3600,
                 logger: Optional[logging.Logger] = None,
                 qbittorrent_username: str = 'admin',
                 qbittorrent_password: str = 'adminpass',
                 vpn_url: Optional[str] = None):
        """
        Initialize connection monitoring with configurable parameters

        Args:
            qbittorrent_url: qBittorrent API URL
            nomad_url: Nomad API URL
            tracker_name: Tracker name for torrent operations
            consul_url: Consul server URL for persistence
            check_interval: Check interval in seconds
            max_consecutive_failures: Maximum failures before remediation
            stability_wait_time: Time to wait for stability during remediation
            stability_duration_required: Required stability duration after remediation
            logger: Optional logger instance (uses module logger if not provided)
            qbittorrent_username: qBittorrent login user. The default is kept
                only for backward compatibility; pass the real value from
                configuration rather than relying on it.
            qbittorrent_password: qBittorrent login password (same caveat as
                ``qbittorrent_username``).
            vpn_url: Base URL of the VPN client API. When omitted it is
                derived from ``qbittorrent_url`` by replacing ``:8080`` with
                ``:8000``.
        """
        self.logger = logger or logging.getLogger(__name__)

        # Initialize clients
        self.qbittorrent_client = QBittorrentClient(
            base_url=qbittorrent_url,
            username=qbittorrent_username,
            password=qbittorrent_password,
            logger=self.logger
        )
        self.nomad_client = NomadClient(base_url=nomad_url, logger=self.logger)

        # Initialize VPN client. Without an explicit vpn_url, fall back to
        # the same host as qBittorrent but port 8000. NOTE: that fallback
        # only rewrites the literal ":8080"; a qBittorrent URL on any other
        # port needs vpn_url to be passed explicitly.
        if vpn_url is None:
            vpn_url = qbittorrent_url.replace(':8080', ':8000')
        self.vpn_client = VPNClient(base_url=vpn_url, logger=self.logger)

        # Initialize state management
        self.state_manager = StateManager(consul_url=consul_url)

        # Initialize remediation manager
        self.remediation_manager = RemediationManager(
            qbittorrent_client=self.qbittorrent_client,
            nomad_client=self.nomad_client,
            state_manager=self.state_manager,
            tracker_name=tracker_name,
            max_consecutive_failures=max_consecutive_failures,
            stability_wait_time=stability_wait_time,
            stability_duration_required=stability_duration_required,
            logger=self.logger
        )

        # Configuration
        self.check_interval = check_interval
        self.tracker_name = tracker_name

    def _determine_connection_state(self, status: Dict[str, Any]) -> str:
        """
        Determine if connection is stable or unstable based on status

        The connection counts as stable only when ``connection_status`` is
        ``'connected'`` and at least one DHT node is reported. The
        ``consecutive_stable_checks`` counter on the state manager is
        incremented on a stable result and reset to 0 otherwise.

        Args:
            status: Connection status dictionary

        Returns:
            'stable' or 'unstable'
        """
        connection_status = status.get('connection_status', 'unknown')
        dht_nodes = status.get('dht_nodes', 0)

        # The key may be present with a None / non-numeric value; treat that
        # as "no DHT nodes" instead of raising on the comparison below.
        try:
            dht_count = int(dht_nodes)
        except (TypeError, ValueError):
            dht_count = 0

        if connection_status == 'connected' and dht_count > 0:
            self.state_manager.consecutive_stable_checks += 1
            # Always return 'stable' when connection is good, regardless of
            # remediation state. The stability-duration requirement is
            # handled in the stability tracking logic, not here.
            return 'stable'

        self.state_manager.consecutive_stable_checks = 0
        # Log why the connection is considered unstable
        if connection_status == 'error':
            self.logger.warning(f"Connection unstable due to API error: {status.get('error_type', 'Unknown')} - {status.get('error_details', 'No details')}")
        elif connection_status != 'connected':
            self.logger.warning(f"Connection unstable: Status is '{connection_status}' (expected 'connected')")
        else:
            self.logger.warning(f"Connection unstable: DHT nodes is {dht_nodes} (expected > 0)")
        return 'unstable'

    def _handle_state_transition(self, current_state: str):
        """
        Handle state transitions and call appropriate handlers

        Does nothing when ``current_state`` equals the stored state.
        Otherwise runs the matching transition handler, stores the new state
        and the transition time, and saves the state.

        Args:
            current_state: Current connection state
        """
        if current_state == self.state_manager.connection_state:
            return

        if current_state == 'unstable':
            self._on_stable_to_unstable()
        else:
            self._on_unstable_to_stable()

        self.state_manager.connection_state = current_state
        self.state_manager.last_state_change_time = time.time()
        self.state_manager.save_state()

    def _log_container_status(self, header: str, empty_message: str, failure_prefix: str):
        """
        Log uptime/restart details for the qbittorrent Nomad job's tasks

        Best effort: any exception from the Nomad client is logged and
        swallowed so that status logging can never break monitoring.

        Args:
            header: Info line logged before the per-task lines
            empty_message: Warning logged when no container info is returned
            failure_prefix: Prefix of the error logged when the lookup raises
        """
        try:
            container_info = self.nomad_client.get_container_info(
                job_id=self._NOMAD_JOB_ID,
                task_names=list(self._NOMAD_TASK_NAMES),
                namespace=self._NOMAD_NAMESPACE
            )

            if container_info:
                self.logger.info(header)
                for task_name, info in container_info.items():
                    self.logger.info(
                        f"- {task_name}: Uptime: {info['uptime']}, "
                        f"Last restart: {info['last_restart_time']} "
                        f"(reason: {info['last_restart_reason']}), "
                        f"Restarts: {info['restart_count']}"
                    )
            else:
                self.logger.warning(empty_message)

        except Exception as e:
            self.logger.error(f"{failure_prefix}: {e}")

    def _on_stable_to_unstable(self):
        """Called when connection transitions from stable to unstable state"""
        self.logger.warning("STATE TRANSITION: STABLE → UNSTABLE")
        # Log detailed transition information
        current_time = time.time()
        stable_duration = current_time - self.state_manager.last_state_change_time
        self.logger.debug(f"Stable duration: {format_human_readable_time(stable_duration)}, "
                          f"Failures: {self.state_manager.consecutive_failures}/{self.remediation_manager.max_consecutive_failures}")

        # Get container information for qbittorrent job tasks
        self._log_container_status(
            header="Container Status:",
            empty_message="Could not retrieve container information",
            failure_prefix="Failed to get container information"
        )

        self.state_manager.save_state()

    def _on_unstable_to_stable(self):
        """Called when connection transitions from unstable to stable state"""
        self.logger.info("STATE TRANSITION: UNSTABLE → STABLE")
        unstable_duration = time.time() - self.state_manager.last_state_change_time
        self.logger.debug(f"Unstable duration: {format_human_readable_time(unstable_duration)}, "
                          f"Total failures: {self.state_manager.consecutive_failures}")

        # Only reset failures if not in remediation (remediation handles its
        # own failure tracking)
        if self.state_manager.remediation_state is None:
            self.state_manager.consecutive_failures = 0
            self.state_manager.last_failure_time = None
        self.state_manager.save_state()

    def _record_public_ip(self, current_ip: str) -> Optional[str]:
        """
        Remember the latest public IP and whether it differs from the last one

        The "changed" verdict is stored in ``_public_ip_changed`` at the
        moment of recording, so later readers (the check summary) still see
        it after ``_last_public_ip`` has been overwritten.

        Args:
            current_ip: Public IP observed in this check

        Returns:
            The previously recorded IP, or None on the first observation
        """
        previous_ip = self._last_public_ip
        self._public_ip_changed = previous_ip is not None and current_ip != previous_ip
        self._last_public_ip = current_ip
        return previous_ip

    def _log_vpn_status(self, vpn_status: Dict[str, Any], public_ip_info: Dict[str, Any]):
        """
        Log VPN status and public IP information with change detection

        Args:
            vpn_status: VPN status dictionary
            public_ip_info: Public IP information dictionary
        """
        # Log VPN status and update state
        vpn_state = vpn_status.get('status', 'unknown')
        if vpn_state == 'error':
            self.logger.error(f"VPN Status Error: {vpn_status.get('error_type', 'Unknown')} - {vpn_status.get('error_details', 'No details')}")
        else:
            self.logger.info(f"VPN Status: {vpn_state}")
        self.state_manager.update_vpn_status(vpn_state)

        # Log public IP information with change detection
        current_ip = public_ip_info.get('public_ip', 'unknown')
        if current_ip == 'error':
            self.logger.error(f"Public IP Error: {public_ip_info.get('error_type', 'Unknown')} - {public_ip_info.get('error_details', 'No details')}")
            return

        # Update state with public IP information
        self.state_manager.update_public_ip(current_ip, public_ip_info)

        # Check if IP has changed for logging purposes
        previous_ip = self._record_public_ip(current_ip)
        if previous_ip is None:
            # First time seeing IP, just log it
            self._log_public_ip_details(public_ip_info, "Initial")
        elif current_ip != previous_ip:
            # IP has changed
            self.logger.warning(f"PUBLIC IP CHANGE DETECTED: {previous_ip} → {current_ip}")
            self._log_public_ip_details(public_ip_info, "Changed")
        else:
            # IP unchanged, log details at debug level
            self.logger.debug(f"Public IP unchanged: {current_ip}")

    def _log_public_ip_details(self, ip_info: Dict[str, Any], context: str):
        """
        Log detailed public IP information

        Args:
            ip_info: Public IP information dictionary
            context: Context for logging (e.g., "Initial", "Changed")
        """
        public_ip = ip_info.get('public_ip', 'unknown')
        region = ip_info.get('region', 'unknown')
        country = ip_info.get('country', 'unknown')
        city = ip_info.get('city', 'unknown')
        organization = ip_info.get('organization', 'unknown')

        self.logger.info(f"{context} Public IP Details: {public_ip} ({organization}) - {city}, {region}, {country}")

    def _log_check_summary(self, status: Dict[str, Any], vpn_status: Dict[str, Any],
                           public_ip_info: Dict[str, Any], current_state: str):
        """
        Log a comprehensive check summary in hierarchical format

        Args:
            status: Connection status dictionary
            vpn_status: VPN status dictionary
            public_ip_info: Public IP information dictionary
            current_state: Current connection state (accepted for interface
                compatibility; not used in the summary text)
        """
        # Build summary components
        summary_lines = []

        # Connection status
        connection_status = status.get('connection_status', 'unknown')
        dht_nodes = status.get('dht_nodes', 0)
        summary_lines.append(f"Status: {connection_status} (DHT: {dht_nodes} nodes)")

        # VPN status
        vpn_state = vpn_status.get('status', 'unknown')
        if vpn_state == 'error':
            error_type = vpn_status.get('error_type', 'Unknown')
            error_details = vpn_status.get('error_details', 'No details')
            summary_lines.append(f"VPN: {vpn_state} - {error_type}: {error_details}")
        else:
            summary_lines.append(f"VPN: {vpn_state}")

        # Public IP
        current_ip = public_ip_info.get('public_ip', 'unknown')
        if current_ip == 'error':
            error_type = public_ip_info.get('error_type', 'Unknown')
            error_details = public_ip_info.get('error_details', 'No details')
            summary_lines.append(f"IP: {current_ip} - {error_type}: {error_details}")
        else:
            # Use the verdict captured when the IP was recorded. Comparing
            # against _last_public_ip here would always say "unchanged",
            # because the state update earlier in the cycle has already
            # overwritten it with current_ip.
            if current_ip == self._last_public_ip:
                ip_change = "changed" if self._public_ip_changed else "unchanged"
            else:
                # IP was not recorded via the state-update path this cycle.
                ip_change = "changed"
            summary_lines.append(f"IP: {current_ip} ({ip_change})")

        # Get system state metrics
        debug_metrics = self.state_manager.get_debug_metrics()

        # Log as structured hierarchical message
        self.logger.debug("Connection Check Summary:\n " + "\n ".join(summary_lines) +
                          f"\n\n System State:{debug_metrics['multiline']}")

    def _update_vpn_and_ip_state(self, vpn_status: Dict[str, Any], public_ip_info: Dict[str, Any]):
        """
        Update VPN and public IP state without logging (for use with consolidated logging)

        Args:
            vpn_status: VPN status dictionary
            public_ip_info: Public IP information dictionary
        """
        # Update VPN status without logging ('error' is stored like any
        # other status value)
        self.state_manager.update_vpn_status(vpn_status.get('status', 'unknown'))

        # Update public IP information without logging
        current_ip = public_ip_info.get('public_ip', 'unknown')
        if current_ip == 'error':
            # Lookup failed: keep the last known IP and report no change
            self._public_ip_changed = False
            return

        self.state_manager.update_public_ip(current_ip, public_ip_info)
        # Track IP changes for logging purposes in summary
        self._record_public_ip(current_ip)

    def _track_stable_check(self):
        """Update stability timer and failure counter after a stable check"""
        state = self.state_manager

        if state.remediation_state == 'waiting_for_stability':
            # Start tracking stability time if not already started
            if state.stability_start_time is None:
                state.start_stability_timer()
                self.logger.info("Stable connectivity detected, starting 1-hour timer")

            # Calculate elapsed stable time
            required = self.remediation_manager.stability_duration_required
            elapsed_stable_time = time.time() - state.stability_start_time
            remaining_time = max(0, required - elapsed_stable_time)
            self.logger.info(f"Stable for {format_human_readable_time(elapsed_stable_time)}/"
                             f"{format_human_readable_time(required)}, "
                             f"Remaining: {format_human_readable_time(remaining_time)}")

            # Advances the remediation state once the required stable
            # duration is reached. The cycle then simply continues, so the
            # new remediation state is processed in this same cycle and the
            # normal inter-check sleep still applies.
            self.remediation_manager.check_stability_requirement_met()
        elif state.stability_start_time is not None:
            # Reset stability timer if not in waiting_for_stability state
            self.logger.debug("Resetting stability timer (not in waiting_for_stability state)")
            state.reset_stability_timer()

        # Reset failure count when stable outside of remediation
        if state.consecutive_failures > 0 and state.remediation_state is None:
            self.logger.info(f"Connection restored, resetting failure count from {state.consecutive_failures} to 0")
            state.consecutive_failures = 0
            state.last_failure_time = None

    def _track_unstable_check(self):
        """Count a failed check and drop any running stability timer"""
        # Increment failure counter for unstable state
        self.state_manager.consecutive_failures += 1
        self.state_manager.last_failure_time = time.time()
        self.logger.warning(f"Connection unstable. Failures: {self.state_manager.consecutive_failures}/{self.remediation_manager.max_consecutive_failures}")

        # Reset stability timer if connection is lost
        self.remediation_manager.reset_stability_on_connection_loss()

    def _drive_remediation(self):
        """Start remediation when due and advance it when active"""
        max_failures = self.remediation_manager.max_consecutive_failures

        # Check if remediation is needed (only if not already in progress)
        if self.remediation_manager.should_start_remediation():
            self.logger.error(f"Persistent connection failure ({self.state_manager.consecutive_failures}/{max_failures}). Initiating remediation.")
            if self.remediation_manager.start_remediation():
                self.logger.info("Remediation started successfully")
            else:
                self.logger.error("Failed to start remediation")
                self.state_manager.consecutive_failures = max_failures

        # Process remediation if active (even if connection is down)
        if self.state_manager.remediation_state is not None:
            if self.remediation_manager.process_remediation():
                self.logger.info("Remediation completed successfully")
                self.state_manager.consecutive_failures = 0
            elif self.state_manager.remediation_state is None:
                # The state was cleared without success during processing
                self.logger.warning("Remediation failed or was cancelled")
                self.state_manager.consecutive_failures = max_failures

    def _run_check_cycle(self):
        """Run one full check: poll, classify, log, track, remediate"""
        # Get current connection status (suppress individual debug logs)
        status = self.qbittorrent_client.get_connection_status(verbose_debug=False)

        # Get VPN status and public IP information (suppress individual debug logs)
        vpn_status = self.vpn_client.get_vpn_status(verbose_debug=False)
        public_ip_info = self.vpn_client.get_public_ip_info(verbose_debug=False)

        # Update VPN and public IP state (without individual logging)
        self._update_vpn_and_ip_state(vpn_status, public_ip_info)

        # Determine current connection state and handle state transitions
        current_state = self._determine_connection_state(status)
        self._handle_state_transition(current_state)

        # Log comprehensive check summary in hierarchical format
        self._log_check_summary(status, vpn_status, public_ip_info, current_state)

        # Handle connection status and failure tracking
        if current_state == 'stable':
            self._track_stable_check()
        else:
            self._track_unstable_check()

        self._drive_remediation()

    def monitor_connection(self):
        """
        Main connection monitoring loop

        Runs one check cycle every ``check_interval`` seconds until
        interrupted with Ctrl-C. An unexpected exception inside a cycle is
        logged and the loop carries on after the usual interval.
        """
        self.logger.info("Starting connection monitoring...")
        self.logger.info("Monitoring parameters:")
        self.logger.info(f"- API URL: {self.qbittorrent_client.api_url}")
        self.logger.info(f"- Tracker: {self.tracker_name}")
        self.logger.info(f"- Check Interval: {self.check_interval} seconds")
        self.logger.info(f"- Max Consecutive Failures: {self.remediation_manager.max_consecutive_failures}")

        # Log initial container status
        self._log_container_status(
            header="Initial Container Status:",
            empty_message="Could not retrieve initial container information",
            failure_prefix="Failed to get initial container information"
        )

        # KeyboardInterrupt is handled around the whole loop so that Ctrl-C
        # is honoured wherever it lands, including during the sleep that
        # follows a failed cycle.
        try:
            while True:
                try:
                    self._run_check_cycle()
                except Exception as e:
                    self.logger.error(f"Unexpected error in monitoring loop: {e}")

                # Wait before next check
                time.sleep(self.check_interval)
        except KeyboardInterrupt:
            self.logger.info("Connection monitoring stopped by user.")
|
||||
189
monitoring/remediation_manager.py
Normal file
189
monitoring/remediation_manager.py
Normal file
@@ -0,0 +1,189 @@
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
from requests import Session
|
||||
|
||||
from api.qbittorrent_client import QBittorrentClient
|
||||
from api.nomad_client import NomadClient
|
||||
from persistence.state_manager import StateManager
|
||||
from utils.time_utils import format_human_readable_time
|
||||
|
||||
class RemediationManager:
    """Manages the remediation state machine for connection issues.

    The states handled here, in the order the code advances them, are
    ``'stopping_torrents'`` -> ``'restarting_nomad'`` ->
    ``'waiting_for_stability'`` -> ``'restarting_torrents'``. The current
    state lives on the state manager; each call to ``process_remediation``
    performs at most one step so the caller's polling loop is never blocked.
    """

    def __init__(self,
                 qbittorrent_client: "QBittorrentClient",
                 nomad_client: "NomadClient",
                 state_manager: "StateManager",
                 tracker_name: str,
                 max_consecutive_failures: int = 20,
                 stability_wait_time: int = 1800,
                 stability_duration_required: int = 3600,
                 logger: Optional[logging.Logger] = None):
        """
        Initialize remediation manager

        Args:
            qbittorrent_client: QBittorrent client instance
            nomad_client: Nomad client instance
            state_manager: State manager instance
            tracker_name: Tracker name for torrent operations
            max_consecutive_failures: Maximum failures before remediation
            stability_wait_time: Maximum time, in seconds since remediation
                started, to stay in the waiting-for-stability state before
                giving up
            stability_duration_required: Required stability duration, in
                seconds, before torrents are restarted
            logger: Optional logger instance (uses module logger if not provided)
        """
        self.qbittorrent_client = qbittorrent_client
        self.nomad_client = nomad_client
        self.state_manager = state_manager
        self.tracker_name = tracker_name
        self.max_consecutive_failures = max_consecutive_failures
        self.stability_wait_time = stability_wait_time
        self.stability_duration_required = stability_duration_required
        self.logger = logger or logging.getLogger(__name__)

        # The wait timeout is measured from the start of remediation, so it
        # has to exceed the required stable duration; otherwise the timeout
        # always fires first and remediation can never complete. Defaults
        # are left untouched for compatibility, so only warn.
        if self.stability_wait_time <= self.stability_duration_required:
            self.logger.warning(
                f"stability_wait_time ({self.stability_wait_time}s) does not exceed "
                f"stability_duration_required ({self.stability_duration_required}s); "
                f"remediation will time out before stability can be confirmed"
            )

        # Remediation session
        self.remediation_session: Optional[Session] = None

    def _ensure_session(self) -> bool:
        """
        Make sure a qBittorrent session is available for torrent operations

        The session is normally created by ``start_remediation``. It is
        missing when this object was created while a remediation state was
        already set (presumably restored from persisted state after a
        restart; confirm against StateManager). In that case log in again.

        Returns:
            True if a session is available
        """
        if self.remediation_session:
            return True

        self.logger.info("No active qBittorrent session for remediation, logging in")
        self.remediation_session = self.qbittorrent_client.login()
        if not self.remediation_session:
            self.logger.error("Could not log in to qBittorrent during remediation")
            return False
        return True

    def start_remediation(self) -> bool:
        """
        Start the remediation process (non-blocking)

        Logs in to qBittorrent and, on success, asks the state manager to
        enter remediation.

        Returns:
            True if remediation started successfully
        """
        self.logger.warning("Connection instability detected. Starting remediation...")

        # Login to qBittorrent
        self.remediation_session = self.qbittorrent_client.login()
        if not self.remediation_session:
            self.logger.error("Could not log in to qBittorrent. Aborting remediation.")
            return False

        # Update state
        self.state_manager.start_remediation()
        self.logger.info(f"Remediation started. State: {self.state_manager.remediation_state}")
        return True

    def process_remediation(self) -> bool:
        """
        Process the current remediation state (non-blocking)

        Any exception raised by a step is logged and resets the remediation
        state.

        Returns:
            True if remediation completed successfully, False otherwise
        """
        current_state = self.state_manager.remediation_state
        if current_state is None:
            return False

        handlers = {
            'stopping_torrents': self._process_stopping_torrents,
            'restarting_nomad': self._process_restarting_nomad,
            'waiting_for_stability': self._process_waiting_for_stability,
            'restarting_torrents': self._process_restarting_torrents,
        }

        try:
            # Log detailed remediation state information
            remediation_duration = time.time() - self.state_manager.remediation_start_time
            self.logger.debug(f"Processing remediation state: {current_state} "
                              f"(duration: {format_human_readable_time(remediation_duration)})")

            handler = handlers.get(current_state)
            if handler is None:
                # Nothing can advance an unknown state; make that visible
                # instead of silently idling.
                self.logger.warning(f"Unknown remediation state: {current_state!r}; no action taken")
                return False
            return handler()

        except Exception as e:
            self.logger.error(f"Unexpected error during remediation: {e}")
            self.logger.error("Resetting remediation state due to unexpected error")
            self.state_manager.reset_remediation_state()
            return False

    def _process_stopping_torrents(self) -> bool:
        """Process stopping torrents state (always returns False: not finished yet)"""
        if not self._ensure_session():
            # No session, nothing to send; stay in this state and retry
            return False

        if self.qbittorrent_client.stop_tracker_torrents(self.remediation_session, self.tracker_name):
            self.logger.info("Torrents stopped successfully, proceeding to restart Nomad task")
            self.state_manager.update_remediation_state('restarting_nomad')
        else:
            self.logger.error("Failed to stop torrents - retrying in next cycle")
        return False

    def _process_restarting_nomad(self) -> bool:
        """Process restarting nomad state (always returns False: not finished yet)"""
        restarted = self.nomad_client.restart_task_via_allocation(
            job_id="qbittorrent",
            task_name="qbittorrent"
        )
        if restarted:
            self.logger.info("Nomad task restarted successfully, waiting for stable connectivity")
            self.state_manager.update_remediation_state('waiting_for_stability')
        else:
            self.logger.error("Nomad task restart failed - retrying in next cycle")
        return False

    def _process_waiting_for_stability(self) -> bool:
        """Process waiting for stability state (always returns False)"""
        # Check if we've exceeded the stabilization timeout, measured from
        # the start of remediation
        elapsed_time = time.time() - self.state_manager.remediation_start_time
        if elapsed_time > self.stability_wait_time:
            self.logger.error(f"Stabilization timeout reached after {format_human_readable_time(elapsed_time)}")
            self.state_manager.reset_remediation_state()

        # Otherwise this state just waits - stability checking is handled in
        # main monitor
        return False

    def _process_restarting_torrents(self) -> bool:
        """
        Process restarting torrents state

        The remediation state is reset whether or not the restart succeeds.

        Returns:
            True if the torrents were restarted, False otherwise
        """
        try:
            restarted = (
                self._ensure_session() and
                self.qbittorrent_client.restart_tracker_torrents(self.remediation_session, self.tracker_name)
            )
        except Exception as e:
            self.logger.error(f"Failed to restart torrents: {e}")
            self.state_manager.reset_remediation_state()
            return False

        if restarted:
            self.logger.info("Remediation completed successfully")
        else:
            self.logger.error("Failed to restart torrents")
        self.state_manager.reset_remediation_state()
        return bool(restarted)

    def should_start_remediation(self) -> bool:
        """
        Check if remediation should be started

        Returns:
            True if the failure threshold is reached and no remediation is
            currently in progress
        """
        return (self.state_manager.consecutive_failures >= self.max_consecutive_failures and
                self.state_manager.remediation_state is None)

    def check_stability_requirement_met(self) -> bool:
        """
        Check if stability requirement has been met

        When it has, the remediation state moves on to
        ``'restarting_torrents'`` and the stability timer is reset.

        Returns:
            True if stability requirement met
        """
        state = self.state_manager
        if state.remediation_state != 'waiting_for_stability':
            return False
        if state.stability_start_time is None:
            return False

        elapsed_stable_time = time.time() - state.stability_start_time
        if elapsed_stable_time < self.stability_duration_required:
            return False

        self.logger.info("1 hour of stable connectivity achieved, proceeding to restart torrents")
        state.update_remediation_state('restarting_torrents')
        state.reset_stability_timer()
        return True

    def reset_stability_on_connection_loss(self):
        """Reset stability timer when connection is lost"""
        if self.state_manager.stability_start_time is not None:
            self.logger.warning("Connection lost during stabilization, resetting 1-hour timer")
            self.state_manager.reset_stability_timer()
|
||||
Reference in New Issue
Block a user