first commit

2025-11-21 13:38:02 -08:00
parent 1142b96b52
commit 04da8cd466
35 changed files with 73503 additions and 0 deletions

80
REMEDIATION_BUG_FIX.md Normal file

@@ -0,0 +1,80 @@
# Remediation System Bug Fix - Connection Detection Logic
## Problem Summary
The remediation system had a critical logic inversion bug that prevented it from detecting stable connections during the `waiting_for_stability` state, causing unnecessary timeouts.
### Symptoms
- Connection showed as "connected (DHT: 357 nodes)" in the logs, yet the system treated it as unstable
- Remediation timed out after more than an hour despite the connection being stable
- The system accumulated failures even while the connection was working
### Root Cause
**File**: [`monitoring/connection_monitor.py`](monitoring/connection_monitor.py)
**Lines**: 96-100 (before fix)
The bug was in the `_determine_connection_state` method:
```python
# BUGGY CODE (before fix):
if self.state_manager.remediation_state != 'waiting_for_stability':
return 'stable'
else:
# In remediation, maintain current state until 1-hour requirement is met
return self.state_manager.connection_state # ❌ WRONG
```
This logic returned the **previous connection state** instead of the actual current state when in remediation, creating a feedback loop where the system could never detect stability.
## The Fix
**Changed to**:
```python
# FIXED CODE (shown with the surrounding context that defines is_connected):
connection_status = status.get('connection_status', 'unknown')
dht_nodes = status.get('dht_nodes', 0)
is_connected = (
    connection_status == 'connected' and
    dht_nodes > 0
)
if is_connected:
    self.state_manager.consecutive_stable_checks += 1
    # Always return 'stable' when the connection is good, regardless of remediation state.
    # The 1-hour stability requirement is handled in the stability tracking logic, not here.
    return 'stable'
```
## Impact
### Before Fix
- System could not detect stable connections during remediation
- Remediation always timed out after 62 minutes (3720 seconds)
- Connection quality metrics were inaccurate
### After Fix
- System correctly detects stable connections immediately
- Remediation can complete successfully when connection stabilizes
- Stability timer starts properly when connection becomes stable
## Testing
A verification test was created ([`test_fix_verification.py`](test_fix_verification.py)) that simulates the exact problematic scenario:
```bash
python test_fix_verification.py
```
The test confirms that:
1. Good connections return 'stable' during remediation
2. Bad connections return 'unstable'
3. Error conditions are handled properly
4. The specific log scenario now works correctly
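
For illustration, the core assertion pattern might look like the following (an editor's sketch of the idea only; the actual `test_fix_verification.py` may differ):

```python
# Hypothetical sketch: exercises the fixed decision logic in isolation.
class FakeStateManager:
    remediation_state = 'waiting_for_stability'  # the formerly problematic state
    connection_state = 'unstable'                # stale value the old code echoed back
    consecutive_stable_checks = 0

def determine_state(status, sm):
    """Mirrors the fixed _determine_connection_state logic."""
    is_connected = (status.get('connection_status') == 'connected'
                    and status.get('dht_nodes', 0) > 0)
    if is_connected:
        sm.consecutive_stable_checks += 1
        return 'stable'  # regardless of remediation state
    sm.consecutive_stable_checks = 0
    return 'unstable'

sm = FakeStateManager()
assert determine_state({'connection_status': 'connected', 'dht_nodes': 357}, sm) == 'stable'
assert determine_state({'connection_status': 'disconnected', 'dht_nodes': 0}, sm) == 'unstable'
assert determine_state({'connection_status': 'error', 'dht_nodes': 0}, sm) == 'unstable'
```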
## Files Modified
1. **`monitoring/connection_monitor.py`** - Fixed logic inversion bug in `_determine_connection_state`
2. **`test_fix_verification.py`** - Added verification test for the fix
## Future Enhancements
While this fixes the critical bug, additional improvements could be made:
1. **Sliding window detection** - Track connection quality over a window of recent checks (see the sketch below)
2. **Graceful transitions** - Require several consecutive consistent checks before committing a state change
3. **Enhanced logging** - Richer connection quality metrics
These can be implemented as separate enhancements now that the core detection logic is fixed.
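
As an illustration of the first idea, a sliding-window detector could look something like this (editor's sketch, not part of this commit):

```python
from collections import deque

class SlidingWindowDetector:
    """Report 'stable' only when most of the last N checks were connected."""
    def __init__(self, window_size: int = 10, threshold: float = 0.8):
        self.window = deque(maxlen=window_size)  # rolling record of recent checks
        self.threshold = threshold               # fraction of good checks required

    def record(self, is_connected: bool) -> str:
        self.window.append(is_connected)
        quality = sum(self.window) / len(self.window)
        return 'stable' if quality >= self.threshold else 'unstable'
```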


8
api/__init__.py Normal file

@@ -0,0 +1,8 @@
"""
API clients for external services
"""
from .qbittorrent_client import QBittorrentClient
from .nomad_client import NomadClient
__all__ = ['QBittorrentClient', 'NomadClient']





276
api/nomad_client.py Normal file

@@ -0,0 +1,276 @@
import requests
import time
import logging
from typing import Dict, Any, Optional
class NomadClient:
"""Client for interacting with Nomad API"""
def __init__(self, base_url: str = 'http://127.0.0.1:4646',
logger: logging.Logger = None):
"""
Initialize Nomad client
Args:
base_url: Base URL for Nomad API
logger: Optional logger instance (uses module logger if not provided)
"""
self.base_url = base_url.rstrip('/')
self.logger = logger or logging.getLogger(__name__)
def restart_task_via_allocation(self, job_id: str, task_name: str,
namespace: str = "default", wait_time: int = 60,
token: Optional[str] = None) -> bool:
"""
Restart a specific task in a Nomad job by restarting just that task.
Args:
job_id: The ID of the job containing the task
task_name: The name of the task to restart
namespace: The namespace of the job (default: 'default')
wait_time: Seconds to wait after restart (default: 60)
token: Optional Nomad ACL token
Returns:
bool: True if restart succeeded, False otherwise
"""
headers = {}
if token:
headers['X-Nomad-Token'] = token
try:
# Get allocations for the job
allocs_url = f"{self.base_url}/v1/job/{job_id}/allocations"
params = {'namespace': namespace}
self.logger.info(f"Fetching allocations for job '{job_id}'...")
response = requests.get(allocs_url, headers=headers, params=params, timeout=10)
response.raise_for_status()
allocations = response.json()
# Find allocation containing the task
target_alloc = None
for alloc in allocations:
if alloc['ClientStatus'] == 'running':
task_states = alloc.get('TaskStates', {})
if task_name in task_states:
target_alloc = alloc
break
if not target_alloc:
self.logger.error(f"No running allocation found for task '{task_name}' in job '{job_id}'")
return False
# Restart just the specific task
alloc_id = target_alloc['ID']
restart_url = f"{self.base_url}/v1/client/allocation/{alloc_id}/restart"
payload = {"TaskName": task_name}
self.logger.info(f"Restarting task '{task_name}' in job '{job_id}'...")
response = requests.post(restart_url, headers=headers, params=params, json=payload, timeout=10)
if response.status_code in [200, 204]:
self.logger.info(f"Successfully restarted task '{task_name}' in job '{job_id}'")
time.sleep(wait_time)
return True
else:
self.logger.error(f"Failed: {response.status_code} - {response.text}")
return False
except Exception as e:
self.logger.error(f"Request failed: {e}")
return False
def check_job_status(self, job_id: str, namespace: str = "default",
token: Optional[str] = None) -> Optional[Dict[str, Any]]:
"""
Check the status of a Nomad job
Args:
job_id: The ID of the job to check
namespace: The namespace of the job
token: Optional Nomad ACL token
Returns:
Job status information or None if failed
"""
headers = {}
if token:
headers['X-Nomad-Token'] = token
try:
job_url = f"{self.base_url}/v1/job/{job_id}"
params = {'namespace': namespace}
response = requests.get(job_url, headers=headers, params=params, timeout=10)
response.raise_for_status()
return response.json()
except Exception as e:
self.logger.error(f"Failed to get job status: {e}")
return None
def get_allocations(self, job_id: str, namespace: str = "default",
token: Optional[str] = None) -> Optional[list]:
"""
Get allocations for a specific job
Args:
job_id: The ID of the job
namespace: The namespace of the job
token: Optional Nomad ACL token
Returns:
List of allocations or None if failed
"""
headers = {}
if token:
headers['X-Nomad-Token'] = token
try:
allocs_url = f"{self.base_url}/v1/job/{job_id}/allocations"
params = {'namespace': namespace}
response = requests.get(allocs_url, headers=headers, params=params, timeout=10)
response.raise_for_status()
return response.json()
except Exception as e:
self.logger.error(f"Failed to get allocations: {e}")
return None
def get_container_info(self, job_id: str, task_names: list, namespace: str = "default",
token: Optional[str] = None) -> Dict[str, Any]:
"""
Get container information including uptime and restart details for specified tasks
Args:
job_id: The ID of the job containing the tasks
task_names: List of task names to get information for
namespace: The namespace of the job
token: Optional Nomad ACL token
Returns:
Dictionary with container information for each task
"""
from datetime import datetime  # note: `time` is already imported at module level
headers = {}
if token:
headers['X-Nomad-Token'] = token
container_info = {}
try:
# Get allocations for the job
allocations = self.get_allocations(job_id, namespace, token)
if not allocations:
self.logger.warning(f"No allocations found for job '{job_id}'")
return container_info
current_time = time.time()
for task_name in task_names:
task_info = {
'uptime': 'Unknown',
'last_restart_time': 'Never',
'last_restart_reason': 'None',
'restart_count': 0
}
# Find allocation with the task
for alloc in allocations:
if alloc.get('ClientStatus') == 'running':
task_states = alloc.get('TaskStates', {})
if task_name in task_states:
task_state = task_states[task_name]
# Calculate uptime from StartedAt
started_at = task_state.get('StartedAt')
if started_at and started_at != '0001-01-01T00:00:00Z':
try:
start_dt = datetime.fromisoformat(started_at.replace('Z', '+00:00'))
uptime_seconds = current_time - start_dt.timestamp()
from utils.time_utils import format_human_readable_time
task_info['uptime'] = format_human_readable_time(uptime_seconds)
except (ValueError, TypeError):
task_info['uptime'] = 'Invalid timestamp'
# Get last restart information
last_restart = task_state.get('LastRestart')
if last_restart and last_restart != '0001-01-01T00:00:00Z':
try:
restart_dt = datetime.fromisoformat(last_restart.replace('Z', '+00:00'))
restart_seconds_ago = current_time - restart_dt.timestamp()
from utils.time_utils import format_human_readable_time
task_info['last_restart_time'] = f"{format_human_readable_time(restart_seconds_ago)} ago"
except (ValueError, TypeError):
task_info['last_restart_time'] = 'Invalid timestamp'
# Get restart count
task_info['restart_count'] = task_state.get('Restarts', 0)
# Find restart reasons from events - analyze multiple event types
events = task_state.get('Events', [])
restart_reasons = []
# Look for the most recent restart-related events in priority order
for event in reversed(events): # Check most recent events first
event_type = event.get('Type')
# Highest priority: Termination events with exit codes (actual failure)
if event_type == 'Terminated':
exit_code = event.get('ExitCode')
exit_message = event.get('ExitMessage', '')
if exit_code is not None:
if exit_code != 0: # Only consider non-zero exit codes as failures
reason = f"Exit Code: {exit_code}"
if exit_message:
reason += f", {exit_message}"
restart_reasons.append(reason)
break
# Second priority: Killed events (external termination)
elif event_type == 'Killed':
reason = event.get('KillReason') or 'Task killed'
restart_reasons.append(reason)
break
# Third priority: Explicit restart events
elif event_type == 'Restart':
reason = event.get('RestartReason') or 'Restart initiated'
restart_reasons.append(reason)
break
# Fourth priority: Restart signaling (user or system initiated)
elif event_type == 'Restart Signaled':
reason = event.get('DriverMessage') or 'Restart signaled'
restart_reasons.append(reason)
break
# If no specific restart events found, look for any abnormal events
if not restart_reasons:
for event in reversed(events):
event_type = event.get('Type')
if event_type in ['Terminated', 'Killed']:
exit_code = event.get('ExitCode')
if exit_code and exit_code != 0: # Non-zero exit code
exit_message = event.get('ExitMessage', '')
reason = f"Abnormal termination (Exit: {exit_code})"
if exit_message:
reason += f": {exit_message}"
restart_reasons.append(reason)
break
task_info['last_restart_reason'] = restart_reasons[0] if restart_reasons else 'Normal operation'
break # Found the task, move to next task
container_info[task_name] = task_info
return container_info
except Exception as e:
self.logger.error(f"Failed to get container info: {e}")
return container_info
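# --- Editor's usage sketch (illustrative only, not part of this commit) ---
# The job and task names below come from the remediation flow elsewhere in this repo.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    client = NomadClient(base_url='http://127.0.0.1:4646')
    status = client.check_job_status('qbittorrent')
    if status:
        print(f"Job status: {status.get('Status')}")
    client.restart_task_via_allocation(job_id='qbittorrent', task_name='qbittorrent', wait_time=0)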

196
api/qbittorrent_client.py Normal file

@@ -0,0 +1,196 @@
import requests
import logging
from typing import Dict, Any, Optional, List
from requests import Session
class QBittorrentClient:
"""Client for interacting with qBittorrent API"""
def __init__(self, base_url: str = 'http://127.0.0.1:8080',
username: str = 'admin',
password: str = 'adminpass',
logger: logging.Logger = None):
"""
Initialize qBittorrent client
Args:
base_url: Base URL for qBittorrent API
username: qBittorrent username
password: qBittorrent password
logger: Optional logger instance (uses module logger if not provided)
"""
self.base_url = base_url.rstrip('/')
self.username = username
self.password = password
self.logger = logger or logging.getLogger(__name__)
# API endpoints
self.api_url = f'{self.base_url}/api/v2/transfer/info'
self.login_url = f'{self.base_url}/api/v2/auth/login'
self.torrents_url = f'{self.base_url}/api/v2/torrents/info'
self.stop_url = f'{self.base_url}/api/v2/torrents/stop'
self.start_url = f'{self.base_url}/api/v2/torrents/start'
# API request retry configuration
self.api_retry_attempts = 3
self.api_retry_delay = 2
self.api_retry_backoff = 2
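# Editor's note: with these defaults the backoff schedule is attempt 1 fails -> wait 2s,
# attempt 2 fails -> wait 4s, attempt 3 is final (delay = retry_delay * backoff ** attempt).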
def login(self) -> Optional[Session]:
"""
Authenticate with qBittorrent
Returns:
Authenticated session or None if login failed
"""
try:
session = requests.Session()
login_data = {
'username': self.username,
'password': self.password
}
response = session.post(self.login_url, data=login_data, timeout=10)
response.raise_for_status()
self.logger.info("Successfully logged into qBittorrent")
return session
except requests.RequestException as e:
self.logger.error(f"qBittorrent login failed: {e}")
return None
def get_connection_status(self, verbose_debug: bool = True) -> Dict[str, Any]:
"""
Retrieve connection status from qBittorrent API with retry logic
Args:
verbose_debug: Whether to log detailed debug information
Returns:
Connection status dictionary
"""
last_exception = None
for attempt in range(self.api_retry_attempts):
try:
response = requests.get(self.api_url, timeout=10)
response.raise_for_status()
status_data = response.json()
# Log the actual status values for debugging (optional)
if verbose_debug:
connection_status = status_data.get('connection_status', 'unknown')
dht_nodes = status_data.get('dht_nodes', 0)
self.logger.debug(f"API response - Status: {connection_status}, DHT Nodes: {dht_nodes}")
return status_data
except Exception as e:
last_exception = e
if attempt < self.api_retry_attempts - 1: # Not the last attempt
delay = self.api_retry_delay * (self.api_retry_backoff ** attempt)
self.logger.warning(f"API request attempt {attempt + 1}/{self.api_retry_attempts} failed: {type(e).__name__}: {e}. Retrying in {delay} seconds...")
import time
time.sleep(delay)
else:
# More detailed error logging for final failure
error_type = type(last_exception).__name__
error_details = str(last_exception)
self.logger.error(f"API request failed after {self.api_retry_attempts} attempts: {error_type}: {error_details}")
# Return error details for better debugging
return {
'connection_status': 'error',
'dht_nodes': 0,
'error_type': error_type,
'error_details': error_details,
'api_url': self.api_url
}
# Fallback return if all attempts fail (shouldn't normally reach here)
return {
'connection_status': 'error',
'dht_nodes': 0,
'error_type': 'Unknown',
'error_details': 'All retry attempts exhausted',
'api_url': self.api_url
}
def stop_tracker_torrents(self, session: Session, tracker_name: str) -> bool:
"""
Stop torrents matching specific tracker
Args:
session: Authenticated session
tracker_name: Tracker name to match
Returns:
True if successful, False otherwise
"""
try:
# Get list of torrents
torrents = session.get(self.torrents_url).json()
# Find and stop torrents with matching tracker
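# NOTE (editor): matching against str(torrent) searches the whole serialized torrent
# dict (name, tracker URL, etc.), which is deliberately broad; a stricter alternative
# could compare against torrent.get('tracker') if that suits your setup.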
tracker_torrents = [
torrent['hash'] for torrent in torrents
if tracker_name.lower() in str(torrent).lower()
]
if tracker_torrents:
hashes_str = '|'.join(tracker_torrents)
self.logger.debug(f"Stopping torrents: {hashes_str}")
response = session.post(self.stop_url, data={'hashes': hashes_str})
response.raise_for_status()
self.logger.info(f"Stopped {len(tracker_torrents)} torrents for tracker {tracker_name}")
return True
else:
self.logger.info(f"No torrents found for tracker {tracker_name}")
return True
except requests.RequestException as e:
self.logger.error(f"Failed to stop torrents: {e}")
return False
except Exception as e:
self.logger.error(f"Unexpected error stopping torrents: {e}")
return False
def restart_tracker_torrents(self, session: Session, tracker_name: str) -> bool:
"""
Restart torrents for specific tracker
Args:
session: Authenticated session
tracker_name: Tracker name to match
Returns:
True if successful, False otherwise
"""
try:
# Get list of torrents
torrents = session.get(self.torrents_url).json()
# Find and resume torrents with matching tracker
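# NOTE (editor): depending on the qBittorrent version, paused torrents may report
# states such as 'pausedDL'/'pausedUP' rather than exactly 'paused'; verify the
# state strings against your API version.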
tracker_torrents = [
torrent['hash'] for torrent in torrents
if (tracker_name.lower() in str(torrent).lower()
and torrent.get('state') == 'paused')
]
if tracker_torrents:
hashes_str = '|'.join(tracker_torrents)
self.logger.debug(f"Restarting torrents: {hashes_str}")
# Note: Start endpoint commented out in original code
# response = session.post(self.start_url, data={'hashes': hashes_str})
# response.raise_for_status()
self.logger.info(f"Restarted {len(tracker_torrents)} torrents for tracker {tracker_name}")
return True
else:
self.logger.info(f"No paused torrents found for tracker {tracker_name}")
return True
except requests.RequestException as e:
self.logger.error(f"Failed to restart torrents: {e}")
return False
except Exception as e:
self.logger.error(f"Unexpected error restarting torrents: {e}")
return False
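# --- Editor's usage sketch (illustrative only, not part of this commit) ---
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    client = QBittorrentClient()  # defaults: http://127.0.0.1:8080, admin/adminpass
    session = client.login()
    if session:
        status = client.get_connection_status()
        print(status.get('connection_status'), status.get('dht_nodes'))
        client.stop_tracker_torrents(session, 'myanon')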

132
api/vpn_client.py Normal file

@@ -0,0 +1,132 @@
import requests
import logging
import time
from typing import Dict, Any, Optional
class VPNClient:
"""Client for monitoring VPN server status and public IP information"""
def __init__(self, base_url: str = 'http://192.168.4.36:8000',
logger: logging.Logger = None):
"""
Initialize VPN client
Args:
base_url: Base URL for VPN monitoring API
logger: Optional logger instance (uses module logger if not provided)
"""
self.base_url = base_url.rstrip('/')
self.logger = logger or logging.getLogger(__name__)
# API endpoints
self.vpn_status_url = f'{self.base_url}/v1/vpn/status'
self.public_ip_url = f'{self.base_url}/v1/publicip/ip'
# API request retry configuration
self.api_retry_attempts = 3
self.api_retry_delay = 2
self.api_retry_backoff = 2
def get_vpn_status(self, verbose_debug: bool = True) -> Dict[str, Any]:
"""
Retrieve VPN status from VPN monitoring API with retry logic
Args:
verbose_debug: Whether to log detailed debug information
Returns:
VPN status dictionary with 'status' field
"""
last_exception = None
for attempt in range(self.api_retry_attempts):
try:
response = requests.get(self.vpn_status_url, timeout=10)
response.raise_for_status()
status_data = response.json()
# Log the VPN status for debugging (optional)
if verbose_debug:
vpn_status = status_data.get('status', 'unknown')
self.logger.debug(f"VPN status: {vpn_status}")
return status_data
except Exception as e:
last_exception = e
if attempt < self.api_retry_attempts - 1: # Not the last attempt
delay = self.api_retry_delay * (self.api_retry_backoff ** attempt)
self.logger.warning(f"VPN status request attempt {attempt + 1}/{self.api_retry_attempts} failed: {type(e).__name__}: {e}. Retrying in {delay} seconds...")
time.sleep(delay)
else:
# More detailed error logging for final failure
error_type = type(last_exception).__name__
error_details = str(last_exception)
self.logger.error(f"VPN status request failed after {self.api_retry_attempts} attempts: {error_type}: {error_details}")
# Return error details for better debugging
return {
'status': 'error',
'error_type': error_type,
'error_details': error_details,
'api_url': self.vpn_status_url
}
# Fallback return if all attempts fail (shouldn't normally reach here)
return {
'status': 'error',
'error_type': 'Unknown',
'error_details': 'All retry attempts exhausted',
'api_url': self.vpn_status_url
}
def get_public_ip_info(self, verbose_debug: bool = True) -> Dict[str, Any]:
"""
Retrieve public IP information from VPN monitoring API with retry logic
Args:
verbose_debug: Whether to log detailed debug information
Returns:
Public IP information dictionary with 'public_ip' and geographic details
"""
last_exception = None
for attempt in range(self.api_retry_attempts):
try:
response = requests.get(self.public_ip_url, timeout=10)
response.raise_for_status()
ip_data = response.json()
# Log the public IP for debugging (optional)
if verbose_debug:
public_ip = ip_data.get('public_ip', 'unknown')
self.logger.debug(f"Public IP: {public_ip}")
return ip_data
except Exception as e:
last_exception = e
if attempt < self.api_retry_attempts - 1: # Not the last attempt
delay = self.api_retry_delay * (self.api_retry_backoff ** attempt)
self.logger.warning(f"Public IP request attempt {attempt + 1}/{self.api_retry_attempts} failed: {type(e).__name__}: {e}. Retrying in {delay} seconds...")
time.sleep(delay)
else:
# More detailed error logging for final failure
error_type = type(last_exception).__name__
error_details = str(last_exception)
self.logger.error(f"Public IP request failed after {self.api_retry_attempts} attempts: {error_type}: {error_details}")
# Return error details for better debugging
return {
'public_ip': 'error',
'error_type': error_type,
'error_details': error_details,
'api_url': self.public_ip_url
}
# Fallback return if all attempts fail (shouldn't normally reach here)
return {
'public_ip': 'error',
'error_type': 'Unknown',
'error_details': 'All retry attempts exhausted',
'api_url': self.public_ip_url
}
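# --- Editor's usage sketch (illustrative only, not part of this commit) ---
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    vpn = VPNClient()  # default: http://192.168.4.36:8000
    print(vpn.get_vpn_status().get('status'))
    print(vpn.get_public_ip_info().get('public_ip'))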

819
checker_backup_original.py Normal file

@@ -0,0 +1,819 @@
import requests
import time
import logging
import sys
import json
from typing import Dict, Any, Optional
from pprint import pprint
from time import sleep
try:
import consul
CONSUL_AVAILABLE = True
except ImportError:
consul = None
CONSUL_AVAILABLE = False
logger = logging.getLogger(__name__)
logger.warning("python-consul package not available. State persistence disabled.")
# Configure logging with colors
class ColoredFormatter(logging.Formatter):
"""Custom formatter with colors for different log levels"""
# ANSI color codes
GREY = '\033[90m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD_RED = '\033[1;91m'
RESET = '\033[0m'
def format(self, record):
# Add color based on log level
if record.levelno == logging.DEBUG:
color = self.GREY
elif record.levelno == logging.INFO:
color = self.GREEN
elif record.levelno == logging.WARNING:
color = self.YELLOW
elif record.levelno in (logging.ERROR, logging.CRITICAL):
color = self.RED
else:
color = self.RESET
# Format the message with color
message = super().format(record)
return f"{color}{message}{self.RESET}"
# Configure logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Remove any existing handlers
logger.handlers = []
# Console handler with colors
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setFormatter(ColoredFormatter(
'%(asctime)s - %(levelname)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
))
# File handler (no colors)
file_handler = logging.FileHandler('connection_monitor.log')
file_handler.setFormatter(logging.Formatter(
'%(asctime)s - %(levelname)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
))
# Add both handlers
logger.addHandler(console_handler)
logger.addHandler(file_handler)
def format_human_readable_time(seconds: float) -> str:
"""
Convert seconds to human-readable time format using datetime.timedelta
"""
from datetime import timedelta
td = timedelta(seconds=seconds)
# Extract components
days = td.days
hours, remainder = divmod(td.seconds, 3600)
minutes, seconds_remaining = divmod(remainder, 60)
# Build the human-readable string
parts = []
if days > 0:
parts.append(f"{days} day" if days == 1 else f"{days} days")
if hours > 0:
parts.append(f"{hours} hour" if hours == 1 else f"{hours} hours")
if minutes > 0:
parts.append(f"{minutes} minute" if minutes == 1 else f"{minutes} minutes")
if seconds_remaining > 0 or not parts: # Include seconds if no other parts or if seconds exist
parts.append(f"{seconds_remaining} second" if seconds_remaining == 1 else f"{seconds_remaining} seconds")
return " ".join(parts)
class ConnectionMonitor:
def __init__(self,
qbittorrent_url: str = 'http://127.0.0.1:8080',
nomad_url: str = 'http://127.0.0.1:4646',
tracker_name: str = 'myanon',
consul_url: str = 'http://consul.service.dc1.consul:8500'):
"""
Initialize connection monitoring with configurable parameters
"""
self.api_url = f'{qbittorrent_url}/api/v2/transfer/info'
self.qbittorrent_base_url = qbittorrent_url
self.nomad_url = nomad_url
self.tracker_name = tracker_name
self.consul_url = consul_url
# Initialize Consul client if available
self.consul_client = None
if CONSUL_AVAILABLE:
try:
self.consul_client = consul.Consul(host=consul_url.split('://')[1].split(':')[0],
port=int(consul_url.split(':')[2]))
logger.info(f"Consul client initialized for {consul_url}")
except Exception as e:
logger.error(f"Failed to initialize Consul client: {e}")
self.consul_client = None
# Tracking variables
self.consecutive_failures = 0
self.max_consecutive_failures = 20 # 10 minutes (30s * 20)
self.stability_wait_time = 1800 # 30 minutes
self.check_interval = 30 # seconds
# API request retry configuration
self.api_retry_attempts = 3 # Number of retry attempts
self.api_retry_delay = 2 # Initial delay in seconds
self.api_retry_backoff = 2 # Exponential backoff multiplier
# Connection state tracking
self.connection_state = 'stable' # 'stable' or 'unstable'
self.last_state_change_time = time.time()
self.consecutive_stable_checks = 0
self.last_failure_time = None # Track when last failure occurred
# Remediation state tracking
self.remediation_state = None # None, 'stopping_torrents', 'restarting_nomad', 'waiting_for_stability', 'restarting_torrents'
self.remediation_start_time = None
self.stabilization_checks = 0
self.remediation_session = None
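# Editor's summary of the remediation state machine driven by process_remediation():
#   None -> 'stopping_torrents' -> 'restarting_nomad' -> 'waiting_for_stability'
#        -> 'restarting_torrents' -> None (completed)
# A stabilization timeout or unexpected error resets the state back to None.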
# Stability tracking for 1-hour requirement
self.stability_start_time = None # When stable connectivity begins
self.stability_duration_required = 3600 # 1 hour in seconds
# Authentication (update with your credentials)
self.qbittorrent_username = 'admin'
self.qbittorrent_password = 'adminpass'
# Load state from Consul if available
self._load_state_from_consul()
def _save_state_to_consul(self):
"""
Save current state to Consul KV store
"""
if not self.consul_client:
return False
try:
base_key = "qbitcheck/connection_monitor/"
# Connection state
state_data = {
'connection_state': self.connection_state,
'last_state_change_time': self.last_state_change_time,
'consecutive_failures': self.consecutive_failures,
'consecutive_stable_checks': self.consecutive_stable_checks,
'last_failure_time': self.last_failure_time
}
# Remediation state
remediation_data = {
'state': self.remediation_state,
'start_time': self.remediation_start_time,
'stabilization_checks': self.stabilization_checks
}
# Stability tracking
stability_data = {
'start_time': self.stability_start_time
}
# Save each section to Consul
self.consul_client.kv.put(f"{base_key}state", json.dumps(state_data))
self.consul_client.kv.put(f"{base_key}remediation", json.dumps(remediation_data))
self.consul_client.kv.put(f"{base_key}stability", json.dumps(stability_data))
logger.debug("State successfully saved to Consul")
return True
except Exception as e:
logger.error(f"Failed to save state to Consul: {e}")
return False
def _load_state_from_consul(self):
"""
Load state from Consul KV store
"""
if not self.consul_client:
return False
try:
base_key = "qbitcheck/connection_monitor/"
# Load connection state
_, state_data = self.consul_client.kv.get(f"{base_key}state")
if state_data:
state = json.loads(state_data['Value'].decode('utf-8'))
self.connection_state = state.get('connection_state', 'stable')
self.last_state_change_time = state.get('last_state_change_time', time.time())
self.consecutive_failures = state.get('consecutive_failures', 0)
self.consecutive_stable_checks = state.get('consecutive_stable_checks', 0)
self.last_failure_time = state.get('last_failure_time')
# Load remediation state
_, remediation_data = self.consul_client.kv.get(f"{base_key}remediation")
if remediation_data:
remediation = json.loads(remediation_data['Value'].decode('utf-8'))
self.remediation_state = remediation.get('state')
self.remediation_start_time = remediation.get('start_time')
self.stabilization_checks = remediation.get('stabilization_checks', 0)
# Load stability tracking
_, stability_data = self.consul_client.kv.get(f"{base_key}stability")
if stability_data:
stability = json.loads(stability_data['Value'].decode('utf-8'))
self.stability_start_time = stability.get('start_time')
logger.info("State successfully loaded from Consul")
logger.debug(f"Loaded state: connection={self.connection_state}, "
f"remediation={self.remediation_state}, "
f"failures={self.consecutive_failures}")
return True
except Exception as e:
logger.error(f"Failed to load state from Consul: {e}")
return False
def qbittorrent_login(self) -> requests.Session:
"""
Authenticate with qBittorrent
"""
try:
session = requests.Session()
login_url = f'{self.qbittorrent_base_url}/api/v2/auth/login'
login_data = {
'username': self.qbittorrent_username,
'password': self.qbittorrent_password
}
response = session.post(login_url, data=login_data)
response.raise_for_status()
logger.info("Successfully logged into qBittorrent")
return session
except requests.RequestException as e:
logger.error(f"qBittorrent login failed: {e}")
return None
def get_connection_status(self) -> Dict[str, Any]:
"""
Retrieve connection status from qBittorrent API with retry logic
"""
last_exception = None
for attempt in range(self.api_retry_attempts):
try:
response = requests.get(self.api_url, timeout=10)
response.raise_for_status()
status_data = response.json()
# Log the actual status values for debugging
connection_status = status_data.get('connection_status', 'unknown')
dht_nodes = status_data.get('dht_nodes', 0)
logger.debug(f"API response - Status: {connection_status}, DHT Nodes: {dht_nodes}")
return status_data
except Exception as e:
last_exception = e
if attempt < self.api_retry_attempts - 1: # Not the last attempt
delay = self.api_retry_delay * (self.api_retry_backoff ** attempt)
logger.warning(f"API request attempt {attempt + 1}/{self.api_retry_attempts} failed: {type(e).__name__}: {e}. Retrying in {delay} seconds...")
time.sleep(delay)
else:
# More detailed error logging for final failure
error_type = type(last_exception).__name__
error_details = str(last_exception)
logger.error(f"API request failed after {self.api_retry_attempts} attempts: {error_type}: {error_details}")
# Return error details for better debugging
return {
'connection_status': 'error',
'dht_nodes': 0,
'error_type': error_type,
'error_details': error_details,
'api_url': self.api_url
}
# Fallback return if all attempts fail (shouldn't normally reach here)
return {
'connection_status': 'error',
'dht_nodes': 0,
'error_type': 'Unknown',
'error_details': 'All retry attempts exhausted',
'api_url': self.api_url
}
def stop_tracker_torrents(self, session: requests.Session):
"""
Stop torrents matching specific tracker
"""
try:
# Get list of torrents
torrents_url = f'{self.qbittorrent_base_url}/api/v2/torrents/info'
torrents = session.get(torrents_url).json()
# Find and stop torrents with matching tracker
tracker_torrents = [
torrent['hash'] for torrent in torrents
if self.tracker_name.lower() in str(torrent).lower()
]
if tracker_torrents:
hashes_str = '|'.join(tracker_torrents)
logger.debug(f"Stopping torrents: {hashes_str}")
stop_url = f'{self.qbittorrent_base_url}/api/v2/torrents/stop'
response = session.post(stop_url, data={'hashes': hashes_str})
response.raise_for_status()
logger.info(f"Stopped {len(tracker_torrents)} torrents for tracker {self.tracker_name}")
return True
else:
logger.info(f"No torrents found for tracker {self.tracker_name}")
return True
except requests.RequestException as e:
logger.error(f"Failed to stop torrents: {e}")
return False
except Exception as e:
logger.error(f"Unexpected error stopping torrents: {e}")
return False
def restart_nomad_task_via_allocation(self, job_id: str, task_name: str, namespace: str = "default", wait_time: int = 60) -> bool:
"""
Restart a specific task in a Nomad job by restarting just that task.
Args:
job_id: The ID of the job containing the task
task_name: The name of the task to restart
namespace: The namespace of the job (default: 'default')
wait_time: Seconds to wait after restart (default: 60)
Returns:
bool: True if restart succeeded, False otherwise
"""
headers = {}
if hasattr(self, 'token') and self.token:
headers['X-Nomad-Token'] = self.token
try:
# Get allocations for the job
allocs_url = f"{self.nomad_url}/v1/job/{job_id}/allocations"
params = {'namespace': namespace}
logger.info(f"Fetching allocations for job '{job_id}'...")
response = requests.get(allocs_url, headers=headers, params=params, timeout=10)
response.raise_for_status()
allocations = response.json()
# Find allocation containing the task
target_alloc = None
for alloc in allocations:
if alloc['ClientStatus'] == 'running':
task_states = alloc.get('TaskStates', {})
if task_name in task_states:
target_alloc = alloc
break
if not target_alloc:
logger.error(f"No running allocation found for task '{task_name}' in job '{job_id}'")
return False
# Restart just the specific task
alloc_id = target_alloc['ID']
restart_url = f"{self.nomad_url}/v1/client/allocation/{alloc_id}/restart"
payload = {"TaskName": task_name}
logger.info(f"Restarting task '{task_name}' in job '{job_id}'...")
response = requests.post(restart_url, headers=headers, params=params, json=payload, timeout=10)
if response.status_code in [200, 204]:
logger.info(f"Successfully restarted task '{task_name}' in job '{job_id}'")
time.sleep(wait_time)
return True
else:
logger.error(f"Failed: {response.status_code} - {response.text}")
return False
except Exception as e:
logger.error(f"Request failed: {e}")
return False
def restart_tracker_torrents(self, session: requests.Session):
"""
Restart torrents for specific tracker after stability period
"""
try:
# Get list of previously stopped torrents
torrents_url = f'{self.qbittorrent_base_url}/api/v2/torrents/info'
torrents = session.get(torrents_url).json()
# Find and resume torrents with matching tracker
tracker_torrents = [
torrent['hash'] for torrent in torrents
if (self.tracker_name.lower() in str(torrent).lower()
and torrent.get('state') == 'paused')
]
if tracker_torrents:
hashes_str = '|'.join(tracker_torrents)
logger.debug(f"Restarting torrents: {hashes_str}")
#start_url = f'{self.qbittorrent_base_url}/api/v2/torrents/start'
#response = session.post(start_url, data={'hashes': hashes_str})
#response.raise_for_status()
logger.info(f"Restarted {len(tracker_torrents)} torrents for tracker {self.tracker_name}")
else:
logger.info(f"No paused torrents found for tracker {self.tracker_name}")
except requests.RequestException as e:
logger.error(f"Failed to restart torrents: {e}")
def start_remediation(self):
"""
Start the remediation process (non-blocking)
"""
logger.warning("Connection instability detected. Starting remediation...")
self.remediation_state = 'stopping_torrents'
self.remediation_start_time = time.time()
self.stabilization_checks = 0
self.remediation_session = self.qbittorrent_login()
if not self.remediation_session:
logger.error("Could not log in to qBittorrent. Aborting remediation.")
self.remediation_state = None
return False
logger.info(f"Remediation started. State: {self.remediation_state}")
# Save state to Consul
self._save_state_to_consul()
return True
def process_remediation(self):
"""
Process the current remediation state (non-blocking)
"""
if self.remediation_state is None:
return False
try:
# Log detailed remediation state information
remediation_duration = time.time() - self.remediation_start_time
logger.debug(f"Processing remediation state: {self.remediation_state} (duration: {format_human_readable_time(remediation_duration)})")
if self.remediation_state == 'stopping_torrents':
if self.stop_tracker_torrents(self.remediation_session):
logger.info("Torrents stopped successfully, proceeding to restart Nomad task")
self.remediation_state = 'restarting_nomad'
logger.info(f"Remediation state updated: {self.remediation_state}")
# Log state transition with debug metrics
self._log_debug_metrics()
# Save state to Consul
self._save_state_to_consul()
else:
logger.error("Failed to stop torrents - retrying in next cycle")
# Don't reset state, allow retry in next cycle
elif self.remediation_state == 'restarting_nomad':
if self.restart_nomad_task_via_allocation(
job_id="qbittorrent",
task_name="qbittorrent"
):
logger.info("Nomad task restarted successfully, waiting for 30 minutes of stable connectivity")
self.remediation_state = 'waiting_for_stability'
logger.info(f"Remediation state updated: {self.remediation_state}")
# Log state transition with debug metrics
self._log_debug_metrics()
# Save state to Consul
self._save_state_to_consul()
else:
logger.error("Nomad task restart failed - retrying in next cycle")
# Don't reset state, allow retry in next cycle
elif self.remediation_state == 'waiting_for_stability':
# This state just waits - stability checking is handled in main loop
# Check if we've exceeded the stabilization timeout
elapsed_time = time.time() - self.remediation_start_time
if elapsed_time > self.stability_wait_time:
logger.error(f"Stabilization timeout reached after {format_human_readable_time(elapsed_time)}")
self.remediation_state = None
self.stability_start_time = None
# Log timeout with debug metrics
self._log_debug_metrics()
# Save state to Consul (remediation timeout)
self._save_state_to_consul()
return False
elif self.remediation_state == 'restarting_torrents':
try:
self.restart_tracker_torrents(self.remediation_session)
logger.info("Remediation completed successfully")
self.remediation_state = None
# Log successful completion with debug metrics
self._log_debug_metrics()
# Save state to Consul (remediation completed)
self._save_state_to_consul()
return True
except Exception as e:
logger.error(f"Failed to restart torrents: {e}")
self.remediation_state = None
# Log failure with debug metrics
self._log_debug_metrics()
# Save state to Consul (remediation failed)
self._save_state_to_consul()
return False
except Exception as e:
logger.error(f"Unexpected error during remediation: {e}")
logger.error("Resetting remediation state due to unexpected error")
self.remediation_state = None
# Log error with debug metrics
self._log_debug_metrics()
# Save state to Consul (error condition)
self._save_state_to_consul()
return False
return False
def on_stable_to_unstable(self):
"""
Called when connection transitions from stable to unstable state
"""
logger.warning("STATE TRANSITION: STABLE → UNSTABLE")
# Log detailed transition information
current_time = time.time()
stable_duration = current_time - self.last_state_change_time
logger.debug(f"Stable duration: {format_human_readable_time(stable_duration)}, Failures: {self.consecutive_failures}/{self.max_consecutive_failures}")
# Custom logic: logging, notifications, metrics, etc.
# Save state to Consul
self._save_state_to_consul()
def _log_debug_metrics(self):
"""
Log comprehensive debug metrics including remediation state, stability timer,
consecutive failures, state transitions, and time remaining
"""
current_time = time.time()
# Format remediation state information
remediation_info = f"Remediation: {self.remediation_state or 'None'}"
if self.remediation_state:
remediation_duration = current_time - self.remediation_start_time
remediation_info += f" (duration: {format_human_readable_time(remediation_duration)})"
# Format stability timer information
stability_info = "Stability timer: Not active"
if self.stability_start_time:
elapsed_stable = current_time - self.stability_start_time
remaining_stable = max(0, self.stability_duration_required - elapsed_stable)
stability_info = f"Stability: {format_human_readable_time(elapsed_stable)}/{format_human_readable_time(self.stability_duration_required)}, Remaining: {format_human_readable_time(remaining_stable)}"
# Format failure information
failure_info = f"Failures: {self.consecutive_failures}/{self.max_consecutive_failures}"
# Format state transition information
state_duration = current_time - self.last_state_change_time
transition_info = f"State: {self.connection_state} (duration: {format_human_readable_time(state_duration)})"
# Format time since last failure information
last_failure_info = "Last failure: Never"
if self.last_failure_time:
time_since_last_failure = current_time - self.last_failure_time
last_failure_info = f"Last failure: {format_human_readable_time(time_since_last_failure)} ago"
# Log comprehensive debug information
logger.debug(f"SYSTEM STATUS - {transition_info}, {failure_info}, {last_failure_info}, {remediation_info}, {stability_info}")
# Save state to Consul
self._save_state_to_consul()
def on_unstable_to_stable(self):
"""
Called when connection transitions from unstable to stable state
"""
logger.info("STATE TRANSITION: UNSTABLE → STABLE")
unstable_duration = time.time() - self.last_state_change_time
logger.debug(f"Unstable duration: {format_human_readable_time(unstable_duration)}, Total failures: {self.consecutive_failures}")
# Only reset failures if not in remediation (remediation handles its own failure tracking)
if self.remediation_state is None:
self.consecutive_failures = 0
self.last_failure_time = None
# Custom logic: logging, notifications, metrics, etc.
# Save state to Consul
self._save_state_to_consul()
def _determine_connection_state(self, status) -> str:
"""
Determine if connection is stable or unstable based on status
"""
connection_status = status.get('connection_status', 'unknown')
dht_nodes = status.get('dht_nodes', 0)
is_connected = (
connection_status == 'connected' and
dht_nodes > 0
)
if is_connected:
self.consecutive_stable_checks += 1
# For normal operation, consider stable immediately when connected
# The 1-hour requirement is only for remediation stability checking
if self.remediation_state != 'waiting_for_stability':
return 'stable'
else:
# In remediation, maintain current state until 1-hour requirement is met
return self.connection_state
else:
self.consecutive_stable_checks = 0
# Log why the connection is considered unstable
if connection_status == 'error':
logger.warning(f"Connection unstable due to API error: {status.get('error_type', 'Unknown')} - {status.get('error_details', 'No details')}")
elif connection_status != 'connected':
logger.warning(f"Connection unstable: Status is '{connection_status}' (expected 'connected')")
elif dht_nodes <= 0:
logger.warning(f"Connection unstable: DHT nodes is {dht_nodes} (expected > 0)")
return 'unstable'
def monitor_connection(self):
"""
Main connection monitoring loop
"""
logger.info("Starting connection monitoring...")
logger.info(f"Monitoring parameters:")
logger.info(f"- API URL: {self.api_url}")
logger.info(f"- Tracker: {self.tracker_name}")
logger.info(f"- Check Interval: {self.check_interval} seconds")
logger.info(f"- Max Consecutive Failures: {self.max_consecutive_failures}")
while True:
try:
# Get current connection status
status = self.get_connection_status()
# Determine current connection state
current_state = self._determine_connection_state(status)
# Handle state transitions
if current_state != self.connection_state:
if current_state == 'unstable':
self.on_stable_to_unstable()
else:
self.on_unstable_to_stable()
self.connection_state = current_state
self.last_state_change_time = time.time()
# Log state transition with debug metrics
self._log_debug_metrics()
# Log comprehensive debug metrics on each iteration
self._log_debug_metrics()
# Handle connection status logging and failure tracking
if current_state == 'stable':
logger.debug(f"Connection Stable. Status: {status.get('connection_status', 'unknown')}, DHT Nodes: {status.get('dht_nodes', 0)}")
# Handle stability tracking for remediation
if self.remediation_state == 'waiting_for_stability':
# Start tracking stability time if not already started
if self.stability_start_time is None:
self.stability_start_time = time.time()
logger.info("Stable connectivity detected, starting 1-hour timer")
# Log detailed stability info
self._log_debug_metrics()
# Calculate elapsed stable time
elapsed_stable_time = time.time() - self.stability_start_time
remaining_time = max(0, self.stability_duration_required - elapsed_stable_time)
logger.info(f"Stable for {format_human_readable_time(elapsed_stable_time)}/{format_human_readable_time(self.stability_duration_required)}, Remaining: {format_human_readable_time(remaining_time)}")
# Check if we've reached 1 hour of stability
if elapsed_stable_time >= self.stability_duration_required:
logger.info("1 hour of stable connectivity achieved, proceeding to restart torrents")
self.remediation_state = 'restarting_torrents'
self.stability_start_time = None
# Save state to Consul
self._save_state_to_consul()
else:
# Reset stability timer if not in waiting_for_stability state
if self.stability_start_time is not None:
logger.debug("Resetting stability timer (not in waiting_for_stability state)")
self.stability_start_time = None
# Reset failure count when stable outside of remediation
if self.consecutive_failures > 0 and self.remediation_state is None:
logger.info(f"Connection restored, resetting failure count from {self.consecutive_failures} to 0")
self.consecutive_failures = 0
self.last_failure_time = None
else:
# Increment failure counter for unstable state
self.consecutive_failures += 1
self.last_failure_time = time.time() # Record when failure occurred
logger.warning(f"Connection unstable. Failures: {self.consecutive_failures}/{self.max_consecutive_failures}")
# Reset stability timer if connection is lost
if self.stability_start_time is not None:
logger.warning("Connection lost during stabilization, resetting 1-hour timer")
self.stability_start_time = None
# Log the reset with debug metrics
self._log_debug_metrics()
# Check if remediation is needed (only if not already in progress)
if (self.consecutive_failures >= self.max_consecutive_failures and
self.remediation_state is None):
logger.error(f"Persistent connection failure ({self.consecutive_failures}/{self.max_consecutive_failures}). Initiating remediation.")
if self.start_remediation():
logger.info("Remediation started successfully")
# Log remediation start with debug metrics
self._log_debug_metrics()
else:
logger.error("Failed to start remediation")
self.consecutive_failures = self.max_consecutive_failures
# Process remediation if active (even if connection is down)
if self.remediation_state is not None:
remediation_result = self.process_remediation()
if remediation_result:
logger.info("Remediation completed successfully")
self.consecutive_failures = 0
# Log successful remediation with debug metrics
self._log_debug_metrics()
elif self.remediation_state is None:
logger.warning("Remediation failed or was cancelled")
self.consecutive_failures = self.max_consecutive_failures
# Log remediation failure with debug metrics
self._log_debug_metrics()
# Wait before next check
time.sleep(self.check_interval)
except Exception as e:
logger.error(f"Unexpected error in monitoring loop: {e}")
time.sleep(self.check_interval)
except KeyboardInterrupt:
logger.info("Connection monitoring stopped by user.")
break
def main():
"""
Main entry point for the connection monitoring script
"""
# Create monitor instance with optional custom parameters
monitor = ConnectionMonitor(
qbittorrent_url='http://sp.service.dc1.consul:8080', # Customize as needed
nomad_url='http://192.168.4.36:4646', # Customize as needed
tracker_name='https://t.myanonamouse.net/tracker.php/VPRYYAL-WpTwnr9G9aIN6044YVZ7x8Ao/announce', # Customize tracker name
consul_url='http://consul.service.dc1.consul:8500' # Consul server URL
)
try:
# Start connection monitoring
monitor.monitor_connection()
except Exception as e:
logger.critical(f"Critical error in main execution: {e}")
def debug_system_info():
"""
Log system and environment information for troubleshooting
"""
import platform
import socket
logger.info("System Diagnostic Information:")
logger.info(f"Python Version: {platform.python_version()}")
logger.info(f"Operating System: {platform.platform()}")
logger.info(f"Hostname: {socket.gethostname()}")
# Network connectivity check
try:
import requests
response = requests.get('https://www.google.com', timeout=5)
logger.info(f"Internet Connectivity: OK (Status Code: {response.status_code})")
except Exception as e:
logger.warning(f"Internet Connectivity Check Failed: {e}")
if __name__ == '__main__':
# Run system diagnostics before main script
debug_system_info()
# Configure additional logging
try:
# Attempt to set up more detailed logging
import logging
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('requests').setLevel(logging.WARNING)
except Exception as e:
logger.error(f"Failed to configure additional logging: {e}")
# Run the main monitoring script
main()

7
config/__init__.py Normal file

@@ -0,0 +1,7 @@
"""
Configuration module for qBittorrent connection monitoring
"""
from .logging_config import setup_logging, configure_third_party_logging, ColoredFormatter
__all__ = ['setup_logging', 'configure_third_party_logging', 'ColoredFormatter']



83
config/logging_config.py Normal file

@@ -0,0 +1,83 @@
import logging
import sys
from typing import Optional
class ColoredFormatter(logging.Formatter):
"""Custom formatter with colors for different log levels"""
# ANSI color codes
GREY = '\033[90m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD_RED = '\033[1;91m'
RESET = '\033[0m'
def format(self, record):
# Add color based on log level
if record.levelno == logging.DEBUG:
color = self.GREY
elif record.levelno == logging.INFO:
color = self.GREEN
elif record.levelno == logging.WARNING:
color = self.YELLOW
elif record.levelno in (logging.ERROR, logging.CRITICAL):
color = self.RED
else:
color = self.RESET
# Format the message with color
message = super().format(record)
return f"{color}{message}{self.RESET}"
def setup_logging(logger_name: str = __name__,
log_file: str = 'connection_monitor.log',
console_level: int = logging.DEBUG,
file_level: int = logging.DEBUG) -> logging.Logger:
"""
Configure logging with console and file handlers
Args:
logger_name: Name of the logger
log_file: Path to log file
console_level: Log level for console output
file_level: Log level for file output
Returns:
Configured logger instance
"""
logger = logging.getLogger(logger_name)
logger.setLevel(logging.DEBUG)
# Remove any existing handlers
logger.handlers = []
# Console handler with colors
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(console_level)
console_handler.setFormatter(ColoredFormatter(
'%(asctime)s - %(levelname)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
))
# File handler (no colors)
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(file_level)
file_handler.setFormatter(logging.Formatter(
'%(asctime)s - %(levelname)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
))
# Add both handlers
logger.addHandler(console_handler)
logger.addHandler(file_handler)
return logger
def configure_third_party_logging():
"""Configure logging for third-party libraries"""
try:
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('requests').setLevel(logging.WARNING)
except Exception as e:
logging.getLogger(__name__).error(f"Failed to configure additional logging: {e}")
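# --- Editor's usage sketch (illustrative only, not part of this commit) ---
if __name__ == '__main__':
    logger = setup_logging('example', log_file='example.log')
    configure_third_party_logging()
    logger.info('Logging configured')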

File diff suppressed because it is too large

64
main.py Normal file

@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""
Main entry point for qBittorrent connection monitoring
"""
import platform
import socket
import logging
import requests
from config.logging_config import setup_logging, configure_third_party_logging
from monitoring.connection_monitor import ConnectionMonitor
def debug_system_info(logger):
"""
Log system and environment information for troubleshooting
Args:
logger: Configured logger instance
"""
logger.info("System Diagnostic Information:")
logger.info(f"Python Version: {platform.python_version()}")
logger.info(f"Operating System: {platform.platform()}")
logger.info(f"Hostname: {socket.gethostname()}")
# Network connectivity check
try:
response = requests.get('https://www.google.com', timeout=5)
logger.info(f"Internet Connectivity: OK (Status Code: {response.status_code})")
except Exception as e:
logger.warning(f"Internet Connectivity Check Failed: {e}")
def main():
"""
Main entry point for the connection monitoring script
"""
# Configure logging
logger = setup_logging(__name__)
# Run system diagnostics
debug_system_info(logger)
# Configure third-party logging
configure_third_party_logging()
# Create monitor instance with custom parameters
monitor = ConnectionMonitor(
qbittorrent_url='http://sp.service.dc1.consul:8080', # Customize as needed
nomad_url='http://192.168.4.36:4646', # Customize as needed
tracker_name='https://t.myanonamouse.net/tracker.php/VPRYYAL-WpTwnr9G9aIN6044YVZ7x8Ao/announce', # Customize tracker name
consul_url='http://consul.service.dc1.consul:8500', # Consul server URL
logger=logger # Pass the configured logger
)
try:
# Start connection monitoring
monitor.monitor_connection()
except Exception as e:
logger.critical(f"Critical error in main execution: {e}")
raise
if __name__ == '__main__':
main()

8
monitoring/__init__.py Normal file

@@ -0,0 +1,8 @@
"""
Monitoring components for qBittorrent connection monitoring
"""
from .connection_monitor import ConnectionMonitor
from .remediation_manager import RemediationManager
__all__ = ['ConnectionMonitor', 'RemediationManager']


428
monitoring/connection_monitor.py Normal file

@@ -0,0 +1,428 @@
import time
import logging
from typing import Dict, Any
from api.qbittorrent_client import QBittorrentClient
from api.nomad_client import NomadClient
from api.vpn_client import VPNClient
from persistence.state_manager import StateManager
from monitoring.remediation_manager import RemediationManager
from utils.time_utils import format_human_readable_time
from utils.formatters import format_connection_status
class ConnectionMonitor:
"""Main connection monitoring class"""
def __init__(self,
qbittorrent_url: str = 'http://127.0.0.1:8080',
nomad_url: str = 'http://127.0.0.1:4646',
tracker_name: str = 'myanon',
consul_url: str = 'http://consul.service.dc1.consul:8500',
check_interval: int = 30,
max_consecutive_failures: int = 20,
stability_wait_time: int = 3720,
stability_duration_required: int = 3600,
logger: logging.Logger = None):
"""
Initialize connection monitoring with configurable parameters
Args:
qbittorrent_url: qBittorrent API URL
nomad_url: Nomad API URL
tracker_name: Tracker name for torrent operations
consul_url: Consul server URL for persistence
check_interval: Check interval in seconds
max_consecutive_failures: Maximum failures before remediation
stability_wait_time: Time to wait for stability during remediation
stability_duration_required: Required stability duration after remediation
logger: Optional logger instance (uses module logger if not provided)
"""
self.logger = logger or logging.getLogger(__name__)
# Initialize clients
self.qbittorrent_client = QBittorrentClient(
base_url=qbittorrent_url,
username='admin',
password='adminpass',
logger=self.logger # Pass the logger
)
self.nomad_client = NomadClient(base_url=nomad_url, logger=self.logger)
# Initialize VPN client (using same host as qBittorrent but port 8000)
vpn_base_url = qbittorrent_url.replace(':8080', ':8000')
self.vpn_client = VPNClient(base_url=vpn_base_url, logger=self.logger)
# Initialize state management
self.state_manager = StateManager(consul_url=consul_url)
# Initialize remediation manager
self.remediation_manager = RemediationManager(
qbittorrent_client=self.qbittorrent_client,
nomad_client=self.nomad_client,
state_manager=self.state_manager,
tracker_name=tracker_name,
max_consecutive_failures=max_consecutive_failures,
stability_wait_time=stability_wait_time,
stability_duration_required=stability_duration_required,
logger=self.logger # Pass the logger
)
# Configuration
self.check_interval = check_interval
self.tracker_name = tracker_name
def _determine_connection_state(self, status: Dict[str, Any]) -> str:
"""
Determine if connection is stable or unstable based on status
Args:
status: Connection status dictionary
Returns:
'stable' or 'unstable'
"""
connection_status = status.get('connection_status', 'unknown')
dht_nodes = status.get('dht_nodes', 0)
is_connected = (
connection_status == 'connected' and
dht_nodes > 0
)
if is_connected:
self.state_manager.consecutive_stable_checks += 1
# Always return 'stable' when connection is good, regardless of remediation state
# The 1-hour stability requirement is handled in the stability tracking logic, not here
return 'stable'
else:
self.state_manager.consecutive_stable_checks = 0
# Log why the connection is considered unstable
if connection_status == 'error':
self.logger.warning(f"Connection unstable due to API error: {status.get('error_type', 'Unknown')} - {status.get('error_details', 'No details')}")
elif connection_status != 'connected':
self.logger.warning(f"Connection unstable: Status is '{connection_status}' (expected 'connected')")
elif dht_nodes <= 0:
self.logger.warning(f"Connection unstable: DHT nodes is {dht_nodes} (expected > 0)")
return 'unstable'
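# Illustrative inputs and results (hedged examples, not exercised in this module):
#   {'connection_status': 'connected', 'dht_nodes': 357} -> 'stable' (even mid-remediation)
#   {'connection_status': 'connected', 'dht_nodes': 0}   -> 'unstable'
#   {'connection_status': 'error'}                       -> 'unstable' (logged with error details)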
def _handle_state_transition(self, current_state: str):
"""
Handle state transitions and call appropriate handlers
Args:
current_state: Current connection state
"""
if current_state != self.state_manager.connection_state:
if current_state == 'unstable':
self._on_stable_to_unstable()
else:
self._on_unstable_to_stable()
self.state_manager.connection_state = current_state
self.state_manager.last_state_change_time = time.time()
self.state_manager.save_state()
def _on_stable_to_unstable(self):
"""Called when connection transitions from stable to unstable state"""
self.logger.warning("STATE TRANSITION: STABLE → UNSTABLE")
# Log detailed transition information
current_time = time.time()
stable_duration = current_time - self.state_manager.last_state_change_time
self.logger.debug(f"Stable duration: {format_human_readable_time(stable_duration)}, "
f"Failures: {self.state_manager.consecutive_failures}/{self.remediation_manager.max_consecutive_failures}")
# Get container information for qbittorrent job tasks
try:
container_info = self.nomad_client.get_container_info(
job_id="qbittorrent",
task_names=["dante", "qbittorrent", "qbittorrent-vpn"],
namespace="default"
)
if container_info:
self.logger.info("Container Status:")
for task_name, info in container_info.items():
self.logger.info(
f"- {task_name}: Uptime: {info['uptime']}, "
f"Last restart: {info['last_restart_time']} "
f"(reason: {info['last_restart_reason']}), "
f"Restarts: {info['restart_count']}"
)
else:
self.logger.warning("Could not retrieve container information")
except Exception as e:
self.logger.error(f"Failed to get container information: {e}")
self.state_manager.save_state()
def _on_unstable_to_stable(self):
"""Called when connection transitions from unstable to stable state"""
self.logger.info("STATE TRANSITION: UNSTABLE → STABLE")
unstable_duration = time.time() - self.state_manager.last_state_change_time
self.logger.debug(f"Unstable duration: {format_human_readable_time(unstable_duration)}, "
f"Total failures: {self.state_manager.consecutive_failures}")
# Only reset failures if not in remediation (remediation handles its own failure tracking)
if self.state_manager.remediation_state is None:
self.state_manager.consecutive_failures = 0
self.state_manager.last_failure_time = None
self.state_manager.save_state()
def _log_vpn_status(self, vpn_status: Dict[str, Any], public_ip_info: Dict[str, Any]):
"""
Log VPN status and public IP information with change detection
Args:
vpn_status: VPN status dictionary
public_ip_info: Public IP information dictionary
"""
# Log VPN status and update state
vpn_state = vpn_status.get('status', 'unknown')
if vpn_state == 'error':
self.logger.error(f"VPN Status Error: {vpn_status.get('error_type', 'Unknown')} - {vpn_status.get('error_details', 'No details')}")
self.state_manager.update_vpn_status('error')
else:
self.logger.info(f"VPN Status: {vpn_state}")
self.state_manager.update_vpn_status(vpn_state)
# Log public IP information with change detection
current_ip = public_ip_info.get('public_ip', 'unknown')
if current_ip == 'error':
self.logger.error(f"Public IP Error: {public_ip_info.get('error_type', 'Unknown')} - {public_ip_info.get('error_details', 'No details')}")
else:
# Update state with public IP information
self.state_manager.update_public_ip(current_ip, public_ip_info)
# Check if IP has changed for logging purposes
previous_ip = getattr(self, '_last_public_ip', None)
if previous_ip is None:
# First time seeing IP, just log it
self._last_public_ip = current_ip
self._log_public_ip_details(public_ip_info, "Initial")
elif current_ip != previous_ip:
# IP has changed
self.logger.warning(f"PUBLIC IP CHANGE DETECTED: {previous_ip}{current_ip}")
self._last_public_ip = current_ip
self._log_public_ip_details(public_ip_info, "Changed")
else:
# IP unchanged, log details at debug level
self.logger.debug(f"Public IP unchanged: {current_ip}")
def _log_public_ip_details(self, ip_info: Dict[str, Any], context: str):
"""
Log detailed public IP information
Args:
ip_info: Public IP information dictionary
context: Context for logging (e.g., "Initial", "Changed")
"""
public_ip = ip_info.get('public_ip', 'unknown')
region = ip_info.get('region', 'unknown')
country = ip_info.get('country', 'unknown')
city = ip_info.get('city', 'unknown')
organization = ip_info.get('organization', 'unknown')
self.logger.info(f"{context} Public IP Details: {public_ip} ({organization}) - {city}, {region}, {country}")
def _log_check_summary(self, status: Dict[str, Any], vpn_status: Dict[str, Any],
public_ip_info: Dict[str, Any], current_state: str):
"""
Log a comprehensive check summary in hierarchical format
Args:
status: Connection status dictionary
vpn_status: VPN status dictionary
public_ip_info: Public IP information dictionary
current_state: Current connection state
"""
# Build summary components
summary_lines = []
# Connection status
connection_status = status.get('connection_status', 'unknown')
dht_nodes = status.get('dht_nodes', 0)
summary_lines.append(f"Status: {connection_status} (DHT: {dht_nodes} nodes)")
# VPN status
vpn_state = vpn_status.get('status', 'unknown')
if vpn_state == 'error':
error_type = vpn_status.get('error_type', 'Unknown')
error_details = vpn_status.get('error_details', 'No details')
summary_lines.append(f"VPN: {vpn_state} - {error_type}: {error_details}")
else:
summary_lines.append(f"VPN: {vpn_state}")
# Public IP
current_ip = public_ip_info.get('public_ip', 'unknown')
if current_ip == 'error':
error_type = public_ip_info.get('error_type', 'Unknown')
error_details = public_ip_info.get('error_details', 'No details')
summary_lines.append(f"IP: {current_ip} - {error_type}: {error_details}")
else:
ip_change = "unchanged" if current_ip == getattr(self, '_last_public_ip', None) else "changed"
summary_lines.append(f"IP: {current_ip} ({ip_change})")
# Get system state metrics
debug_metrics = self.state_manager.get_debug_metrics()
# Log as structured hierarchical message
self.logger.debug(f"Connection Check Summary:\n " + "\n ".join(summary_lines) +
f"\n\n System State:{debug_metrics['multiline']}")
def _update_vpn_and_ip_state(self, vpn_status: Dict[str, Any], public_ip_info: Dict[str, Any]):
"""
Update VPN and public IP state without logging (for use with consolidated logging)
Args:
vpn_status: VPN status dictionary
public_ip_info: Public IP information dictionary
"""
# Update VPN status without logging
vpn_state = vpn_status.get('status', 'unknown')
if vpn_state == 'error':
self.state_manager.update_vpn_status('error')
else:
self.state_manager.update_vpn_status(vpn_state)
# Update public IP information without logging
current_ip = public_ip_info.get('public_ip', 'unknown')
if current_ip == 'error':
# Error sentinel from the VPN client: keep the last known IP state;
# the consolidated check summary reports the error details
pass
else:
# Update state with public IP information
self.state_manager.update_public_ip(current_ip, public_ip_info)
# Track IP changes for logging purposes in summary
previous_ip = getattr(self, '_last_public_ip', None)
if previous_ip is None:
self._last_public_ip = current_ip
elif current_ip != previous_ip:
self._last_public_ip = current_ip
def monitor_connection(self):
"""
Main connection monitoring loop
"""
self.logger.info("Starting connection monitoring...")
self.logger.info(f"Monitoring parameters:")
self.logger.info(f"- API URL: {self.qbittorrent_client.api_url}")
self.logger.info(f"- Tracker: {self.tracker_name}")
self.logger.info(f"- Check Interval: {self.check_interval} seconds")
self.logger.info(f"- Max Consecutive Failures: {self.remediation_manager.max_consecutive_failures}")
# Log initial container status
try:
container_info = self.nomad_client.get_container_info(
job_id="qbittorrent",
task_names=["dante", "qbittorrent", "qbittorrent-vpn"],
namespace="default"
)
if container_info:
self.logger.info("Initial Container Status:")
for task_name, info in container_info.items():
self.logger.info(
f"- {task_name}: Uptime: {info['uptime']}, "
f"Last restart: {info['last_restart_time']} "
f"(reason: {info['last_restart_reason']}), "
f"Restarts: {info['restart_count']}"
)
else:
self.logger.warning("Could not retrieve initial container information")
except Exception as e:
self.logger.error(f"Failed to get initial container information: {e}")
while True:
try:
# Get current connection status (suppress individual debug logs)
status = self.qbittorrent_client.get_connection_status(verbose_debug=False)
# Get VPN status and public IP information (suppress individual debug logs)
vpn_status = self.vpn_client.get_vpn_status(verbose_debug=False)
public_ip_info = self.vpn_client.get_public_ip_info(verbose_debug=False)
# Update VPN and public IP state (without individual logging)
self._update_vpn_and_ip_state(vpn_status, public_ip_info)
# Determine current connection state
current_state = self._determine_connection_state(status)
# Handle state transitions
self._handle_state_transition(current_state)
# Log comprehensive check summary in hierarchical format (only this debug log)
self._log_check_summary(status, vpn_status, public_ip_info, current_state)
# Handle connection status and failure tracking
if current_state == 'stable':
# Handle stability tracking for remediation
if self.state_manager.remediation_state == 'waiting_for_stability':
# Start tracking stability time if not already started
if self.state_manager.stability_start_time is None:
self.state_manager.start_stability_timer()
self.logger.info("Stable connectivity detected, starting 1-hour timer")
# Calculate elapsed stable time
elapsed_stable_time = time.time() - self.state_manager.stability_start_time
remaining_time = max(0, self.remediation_manager.stability_duration_required - elapsed_stable_time)
self.logger.info(f"Stable for {format_human_readable_time(elapsed_stable_time)}/"
f"{format_human_readable_time(self.remediation_manager.stability_duration_required)}, "
f"Remaining: {format_human_readable_time(remaining_time)}")
# Check whether the required stability duration has been met
if self.remediation_manager.check_stability_requirement_met():
# State has advanced to 'restarting_torrents'; loop immediately so the
# restart is processed without waiting out another check interval
continue
else:
# Reset stability timer if not in waiting_for_stability state
if self.state_manager.stability_start_time is not None:
self.logger.debug("Resetting stability timer (not in waiting_for_stability state)")
self.state_manager.reset_stability_timer()
# Reset failure count when stable outside of remediation
if (self.state_manager.consecutive_failures > 0 and
self.state_manager.remediation_state is None):
self.logger.info(f"Connection restored, resetting failure count from {self.state_manager.consecutive_failures} to 0")
self.state_manager.consecutive_failures = 0
self.state_manager.last_failure_time = None
else:
# Increment failure counter for unstable state
self.state_manager.consecutive_failures += 1
self.state_manager.last_failure_time = time.time()
self.logger.warning(f"Connection unstable. Failures: {self.state_manager.consecutive_failures}/{self.remediation_manager.max_consecutive_failures}")
# Reset stability timer if connection is lost
self.remediation_manager.reset_stability_on_connection_loss()
# Check if remediation is needed (only if not already in progress)
if self.remediation_manager.should_start_remediation():
self.logger.error(f"Persistent connection failure ({self.state_manager.consecutive_failures}/{self.remediation_manager.max_consecutive_failures}). Initiating remediation.")
if self.remediation_manager.start_remediation():
self.logger.info("Remediation started successfully")
else:
self.logger.error("Failed to start remediation")
# Hold the counter at the threshold so remediation is retried next cycle
self.state_manager.consecutive_failures = self.remediation_manager.max_consecutive_failures
# Process remediation if active (even if connection is down)
if self.state_manager.remediation_state is not None:
remediation_result = self.remediation_manager.process_remediation()
if remediation_result:
self.logger.info("Remediation completed successfully")
self.state_manager.consecutive_failures = 0
elif self.state_manager.remediation_state is None:
self.logger.warning("Remediation failed or was cancelled")
# Hold the counter at the threshold so a fresh remediation can start
self.state_manager.consecutive_failures = self.remediation_manager.max_consecutive_failures
# Wait before next check
time.sleep(self.check_interval)
except Exception as e:
self.logger.error(f"Unexpected error in monitoring loop: {e}")
time.sleep(self.check_interval)
except KeyboardInterrupt:
self.logger.info("Connection monitoring stopped by user.")
break
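# A minimal entry-point sketch (hedged: the argument values simply mirror the
# constructor defaults above and are illustrative, not confirmed deployment
# settings). monitor_connection() blocks until interrupted.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    ConnectionMonitor(tracker_name='myanon', check_interval=30).monitor_connection()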

189
monitoring/remediation_manager.py Normal file
View File

@@ -0,0 +1,189 @@
import logging
import time
from typing import Optional
from requests import Session
from api.qbittorrent_client import QBittorrentClient
from api.nomad_client import NomadClient
from persistence.state_manager import StateManager
from utils.time_utils import format_human_readable_time
class RemediationManager:
"""Manages the remediation state machine for connection issues"""
def __init__(self,
qbittorrent_client: QBittorrentClient,
nomad_client: NomadClient,
state_manager: StateManager,
tracker_name: str,
max_consecutive_failures: int = 20,
stability_wait_time: int = 1800,
stability_duration_required: int = 3600,
logger: logging.Logger = None):
"""
Initialize remediation manager
Args:
qbittorrent_client: QBittorrent client instance
nomad_client: Nomad client instance
state_manager: State manager instance
tracker_name: Tracker name for torrent operations
max_consecutive_failures: Maximum failures before remediation
stability_wait_time: Time to wait for stability
stability_duration_required: Required stability duration
logger: Optional logger instance (uses module logger if not provided)
"""
self.qbittorrent_client = qbittorrent_client
self.nomad_client = nomad_client
self.state_manager = state_manager
self.tracker_name = tracker_name
self.max_consecutive_failures = max_consecutive_failures
self.stability_wait_time = stability_wait_time
self.stability_duration_required = stability_duration_required
self.logger = logger or logging.getLogger(__name__)
# Remediation session
self.remediation_session: Optional[Session] = None
def start_remediation(self) -> bool:
"""
Start the remediation process (non-blocking)
Returns:
True if remediation started successfully
"""
self.logger.warning("Connection instability detected. Starting remediation...")
# Login to qBittorrent
self.remediation_session = self.qbittorrent_client.login()
if not self.remediation_session:
self.logger.error("Could not log in to qBittorrent. Aborting remediation.")
return False
# Update state
self.state_manager.start_remediation()
self.logger.info(f"Remediation started. State: {self.state_manager.remediation_state}")
return True
def process_remediation(self) -> bool:
"""
Process the current remediation state (non-blocking)
Returns:
True if remediation completed successfully, False otherwise
"""
if self.state_manager.remediation_state is None:
return False
try:
# Log detailed remediation state information
remediation_duration = time.time() - self.state_manager.remediation_start_time
self.logger.debug(f"Processing remediation state: {self.state_manager.remediation_state} "
f"(duration: {format_human_readable_time(remediation_duration)})")
if self.state_manager.remediation_state == 'stopping_torrents':
return self._process_stopping_torrents()
elif self.state_manager.remediation_state == 'restarting_nomad':
return self._process_restarting_nomad()
elif self.state_manager.remediation_state == 'waiting_for_stability':
return self._process_waiting_for_stability()
elif self.state_manager.remediation_state == 'restarting_torrents':
return self._process_restarting_torrents()
except Exception as e:
self.logger.error(f"Unexpected error during remediation: {e}")
self.logger.error("Resetting remediation state due to unexpected error")
self.state_manager.reset_remediation_state()
return False
return False
def _process_stopping_torrents(self) -> bool:
"""Process stopping torrents state"""
if self.qbittorrent_client.stop_tracker_torrents(self.remediation_session, self.tracker_name):
self.logger.info("Torrents stopped successfully, proceeding to restart Nomad task")
self.state_manager.update_remediation_state('restarting_nomad')
return False
else:
self.logger.error("Failed to stop torrents - retrying in next cycle")
return False
def _process_restarting_nomad(self) -> bool:
"""Process restarting nomad state"""
if self.nomad_client.restart_task_via_allocation(
job_id="qbittorrent",
task_name="qbittorrent"
):
self.logger.info("Nomad task restarted successfully, waiting for stable connectivity")
self.state_manager.update_remediation_state('waiting_for_stability')
return False
else:
self.logger.error("Nomad task restart failed - retrying in next cycle")
return False
def _process_waiting_for_stability(self) -> bool:
"""Process waiting for stability state"""
# Check if we've exceeded the stabilization timeout
elapsed_time = time.time() - self.state_manager.remediation_start_time
if elapsed_time > self.stability_wait_time:
self.logger.error(f"Stabilization timeout reached after {format_human_readable_time(elapsed_time)}")
self.state_manager.reset_remediation_state()
return False
# This state just waits - stability checking is handled in main monitor
return False
def _process_restarting_torrents(self) -> bool:
"""Process restarting torrents state"""
try:
if self.qbittorrent_client.restart_tracker_torrents(self.remediation_session, self.tracker_name):
self.logger.info("Remediation completed successfully")
self.state_manager.reset_remediation_state()
return True
else:
self.logger.error("Failed to restart torrents")
self.state_manager.reset_remediation_state()
return False
except Exception as e:
self.logger.error(f"Failed to restart torrents: {e}")
self.state_manager.reset_remediation_state()
return False
def should_start_remediation(self) -> bool:
"""
Check if remediation should be started
Returns:
True if remediation should be started
"""
return (self.state_manager.consecutive_failures >= self.max_consecutive_failures and
self.state_manager.remediation_state is None)
def check_stability_requirement_met(self) -> bool:
"""
Check if stability requirement has been met
Returns:
True if stability requirement met
"""
if (self.state_manager.remediation_state == 'waiting_for_stability' and
self.state_manager.stability_start_time is not None):
elapsed_stable_time = time.time() - self.state_manager.stability_start_time
if elapsed_stable_time >= self.stability_duration_required:
self.logger.info("1 hour of stable connectivity achieved, proceeding to restart torrents")
self.state_manager.update_remediation_state('restarting_torrents')
self.state_manager.reset_stability_timer()
return True
return False
def reset_stability_on_connection_loss(self):
"""Reset stability timer when connection is lost"""
if self.state_manager.stability_start_time is not None:
self.logger.warning("Connection lost during stabilization, resetting 1-hour timer")
self.state_manager.reset_stability_timer()
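# A minimal sketch of walking the state machine with mocked clients (hedged:
# only the methods exercised below are stubbed; illustrative, not a shipped test):
if __name__ == "__main__":
    from unittest.mock import MagicMock
    state = StateManager()  # no consul_url -> in-memory state only
    qbt = MagicMock()
    qbt.login.return_value = object()              # fake authenticated session
    qbt.stop_tracker_torrents.return_value = True
    nomad = MagicMock()
    nomad.restart_task_via_allocation.return_value = True
    manager = RemediationManager(qbt, nomad, state, tracker_name='myanon')
    state.consecutive_failures = manager.max_consecutive_failures
    assert manager.should_start_remediation()
    manager.start_remediation()                    # -> 'stopping_torrents'
    manager.process_remediation()                  # -> 'restarting_nomad'
    manager.process_remediation()                  # -> 'waiting_for_stability'
    print(state.remediation_state)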

8
persistence/__init__.py Normal file
View File

@@ -0,0 +1,8 @@
"""
Persistence layer for state management
"""
from .consul_persistence import ConsulPersistence
from .state_manager import StateManager
__all__ = ['ConsulPersistence', 'StateManager']

Binary file not shown.

Binary file not shown.

144
persistence/consul_persistence.py Normal file
View File

@@ -0,0 +1,144 @@
import json
import logging
from typing import Dict, Any, Optional, Tuple
try:
import consul
CONSUL_AVAILABLE = True
except ImportError:
consul = None
CONSUL_AVAILABLE = False
class ConsulPersistence:
"""Handles Consul-based state persistence for connection monitoring"""
def __init__(self, consul_url: str = 'http://consul.service.dc1.consul:8500'):
"""
Initialize Consul persistence
Args:
consul_url: Consul server URL
"""
self.consul_url = consul_url
self.logger = logging.getLogger(__name__)
self.consul_client = None
self.base_key = "qbitcheck/connection_monitor/"
if CONSUL_AVAILABLE:
self._initialize_consul_client()
else:
self.logger.warning("python-consul package not available. State persistence disabled.")
def _initialize_consul_client(self) -> bool:
"""Initialize Consul client if available"""
if not CONSUL_AVAILABLE:
return False
try:
# Parse URL to extract host and port
url_parts = self.consul_url.split('://')
if len(url_parts) < 2:
raise ValueError(f"Invalid Consul URL format: {self.consul_url}")
host_port = url_parts[1].split(':')
host = host_port[0]
port = int(host_port[1]) if len(host_port) > 1 else 8500
self.consul_client = consul.Consul(host=host, port=port)
self.logger.info(f"Consul client initialized for {self.consul_url}")
return True
except Exception as e:
self.logger.error(f"Failed to initialize Consul client: {e}")
self.consul_client = None
return False
def save_state(self, state_data: Dict[str, Any],
remediation_data: Dict[str, Any],
stability_data: Dict[str, Any],
vpn_data: Dict[str, Any]) -> bool:
"""
Save state to Consul KV store
Args:
state_data: Connection state data
remediation_data: Remediation state data
stability_data: Stability tracking data
vpn_data: VPN status and IP data
Returns:
True if successful, False otherwise
"""
if not self.consul_client:
return False
try:
# Save each section to Consul
self.consul_client.kv.put(f"{self.base_key}state", json.dumps(state_data))
self.consul_client.kv.put(f"{self.base_key}remediation", json.dumps(remediation_data))
self.consul_client.kv.put(f"{self.base_key}stability", json.dumps(stability_data))
self.consul_client.kv.put(f"{self.base_key}vpn", json.dumps(vpn_data))
self.logger.debug("State successfully saved to Consul")
return True
except Exception as e:
self.logger.error(f"Failed to save state to Consul: {e}")
return False
def load_state(self) -> Tuple[Optional[Dict[str, Any]],
Optional[Dict[str, Any]],
Optional[Dict[str, Any]],
Optional[Dict[str, Any]]]:
"""
Load state from Consul KV store
Returns:
Tuple of (state_data, remediation_data, stability_data, vpn_data)
"""
if not self.consul_client:
return None, None, None, None
try:
state_data = None
remediation_data = None
stability_data = None
vpn_data = None
# Load connection state
_, state_kv = self.consul_client.kv.get(f"{self.base_key}state")
if state_kv:
state_data = json.loads(state_kv['Value'].decode('utf-8'))
# Load remediation state
_, remediation_kv = self.consul_client.kv.get(f"{self.base_key}remediation")
if remediation_kv:
remediation_data = json.loads(remediation_kv['Value'].decode('utf-8'))
# Load stability tracking
_, stability_kv = self.consul_client.kv.get(f"{self.base_key}stability")
if stability_kv:
stability_data = json.loads(stability_kv['Value'].decode('utf-8'))
# Load VPN state
_, vpn_kv = self.consul_client.kv.get(f"{self.base_key}vpn")
if vpn_kv:
vpn_data = json.loads(vpn_kv['Value'].decode('utf-8'))
self.logger.info("State successfully loaded from Consul")
return state_data, remediation_data, stability_data, vpn_data
except Exception as e:
self.logger.error(f"Failed to load state from Consul: {e}")
return None, None, None, None
def is_available(self) -> bool:
"""Check if Consul persistence is available"""
return self.consul_client is not None and CONSUL_AVAILABLE
def get_consul_status(self) -> Dict[str, Any]:
"""Get Consul connection status"""
return {
'available': self.is_available(),
'url': self.consul_url,
'client_initialized': self.consul_client is not None
}
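# A minimal round-trip sketch (hedged: assumes a reachable Consul agent at the
# class's default URL; with python-consul missing this degrades to no-ops):
if __name__ == "__main__":
    persistence = ConsulPersistence()
    if persistence.is_available():
        persistence.save_state(
            state_data={'connection_state': 'stable'},
            remediation_data={'state': None},
            stability_data={'start_time': None},
            vpn_data={'vpn_status': 'running'},
        )
        state, remediation, stability, vpn = persistence.load_state()
        print(state)
    else:
        print(persistence.get_consul_status())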

263
persistence/state_manager.py Normal file
View File

@@ -0,0 +1,263 @@
import time
import logging
from typing import Dict, Any, Optional, Tuple
from .consul_persistence import ConsulPersistence
class StateManager:
"""Manages connection monitoring state with optional Consul persistence"""
def __init__(self, consul_url: Optional[str] = None):
"""
Initialize state manager
Args:
consul_url: Optional Consul server URL for persistence
"""
self.logger = logging.getLogger(__name__)
# Initialize Consul persistence if URL provided
self.consul_persistence = ConsulPersistence(consul_url) if consul_url else None
# Default state values
self.connection_state = 'stable'
self.last_state_change_time = time.time()
self.consecutive_failures = 0
self.consecutive_stable_checks = 0
self.last_failure_time = None
# Remediation state
self.remediation_state = None
self.remediation_start_time = None
self.stabilization_checks = 0
# Stability tracking
self.stability_start_time = None
# VPN status tracking
self.vpn_status = 'unknown'
self.last_vpn_status_change = time.time()
self.public_ip = None
self.last_public_ip_change = None
self.public_ip_details = {}
# Load state from Consul if available
self._load_state()
def _load_state(self):
"""Load state from persistence if available"""
if self.consul_persistence and self.consul_persistence.is_available():
state_data, remediation_data, stability_data, vpn_data = self.consul_persistence.load_state()
if state_data:
self.connection_state = state_data.get('connection_state', 'stable')
self.last_state_change_time = state_data.get('last_state_change_time', time.time())
self.consecutive_failures = state_data.get('consecutive_failures', 0)
self.consecutive_stable_checks = state_data.get('consecutive_stable_checks', 0)
self.last_failure_time = state_data.get('last_failure_time')
if remediation_data:
self.remediation_state = remediation_data.get('state')
self.remediation_start_time = remediation_data.get('start_time')
self.stabilization_checks = remediation_data.get('stabilization_checks', 0)
if stability_data:
self.stability_start_time = stability_data.get('start_time')
if vpn_data:
self.vpn_status = vpn_data.get('vpn_status', 'unknown')
self.last_vpn_status_change = vpn_data.get('last_vpn_status_change', time.time())
self.public_ip = vpn_data.get('public_ip')
self.last_public_ip_change = vpn_data.get('last_public_ip_change')
self.public_ip_details = vpn_data.get('public_ip_details', {})
self.logger.debug(f"Loaded state: connection={self.connection_state}, "
f"remediation={self.remediation_state}, "
f"failures={self.consecutive_failures}, "
f"vpn={self.vpn_status}")
def save_state(self):
"""Save current state to persistence if available"""
if self.consul_persistence and self.consul_persistence.is_available():
state_data = {
'connection_state': self.connection_state,
'last_state_change_time': self.last_state_change_time,
'consecutive_failures': self.consecutive_failures,
'consecutive_stable_checks': self.consecutive_stable_checks,
'last_failure_time': self.last_failure_time
}
remediation_data = {
'state': self.remediation_state,
'start_time': self.remediation_start_time,
'stabilization_checks': self.stabilization_checks
}
stability_data = {
'start_time': self.stability_start_time
}
# VPN state data
vpn_data = {
'vpn_status': self.vpn_status,
'last_vpn_status_change': self.last_vpn_status_change,
'public_ip': self.public_ip,
'last_public_ip_change': self.last_public_ip_change,
'public_ip_details': self.public_ip_details
}
return self.consul_persistence.save_state(state_data, remediation_data, stability_data, vpn_data)
return False
def reset_remediation_state(self):
"""Reset remediation state to initial values"""
self.remediation_state = None
self.remediation_start_time = None
self.stabilization_checks = 0
self.stability_start_time = None
self.save_state()
def start_remediation(self):
"""Start remediation process"""
self.remediation_state = 'stopping_torrents'
self.remediation_start_time = time.time()
self.stabilization_checks = 0
self.save_state()
def update_remediation_state(self, new_state: str):
"""Update remediation state"""
self.remediation_state = new_state
self.save_state()
def increment_stabilization_checks(self):
"""Increment stabilization check counter"""
self.stabilization_checks += 1
self.save_state()
def start_stability_timer(self):
"""Start stability timer"""
self.stability_start_time = time.time()
self.save_state()
def reset_stability_timer(self):
"""Reset stability timer"""
self.stability_start_time = None
self.save_state()
def update_vpn_status(self, vpn_status: str):
"""Update VPN status and track changes"""
if vpn_status != self.vpn_status:
self.vpn_status = vpn_status
self.last_vpn_status_change = time.time()
self.save_state()
def update_public_ip(self, public_ip: str, ip_details: Dict[str, Any]):
"""Update public IP and track changes"""
if public_ip != self.public_ip:
self.public_ip = public_ip
self.public_ip_details = ip_details
self.last_public_ip_change = time.time()
self.save_state()
def get_state_summary(self) -> Dict[str, Any]:
"""Get summary of current state"""
return {
'connection_state': self.connection_state,
'consecutive_failures': self.consecutive_failures,
'remediation_state': self.remediation_state,
'stability_start_time': self.stability_start_time,
'consul_available': self.consul_persistence.is_available() if self.consul_persistence else False
}
def get_debug_metrics(self) -> Dict[str, Any]:
"""Get debug metrics for logging in improved format with color support"""
current_time = time.time()
# ANSI color codes
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BLUE = '\033[94m'
MAGENTA = '\033[95m'
CYAN = '\033[96m'
RESET = '\033[0m'
BOLD = '\033[1m'
# Helper function for compact duration formatting
def format_compact_duration(seconds: float) -> str:
if seconds == 0:
return "0s"
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = int(seconds % 60)
if hours > 0:
return f"{hours}h{minutes:02d}m{secs:02d}s"
elif minutes > 0:
return f"{minutes}m{secs:02d}s"
else:
return f"{secs}s"
# Build formatted metrics with colors
metrics = []
colored_metrics = []
# Connection state
state_duration = current_time - self.last_state_change_time
state_color = GREEN if self.connection_state == 'stable' else RED
metrics.append(f"Connection: {self.connection_state} ({format_compact_duration(state_duration)})")
colored_metrics.append(f"{BOLD}Connection:{RESET} {state_color}{self.connection_state} ({format_compact_duration(state_duration)}){RESET}")
# Failures
failure_color = RED if self.consecutive_failures > 0 else GREEN
if self.last_failure_time:
time_since_failure = current_time - self.last_failure_time
failure_recency_color = GREEN if time_since_failure > 300 else YELLOW  # Green after 5 min
last_failure_plain = f" (Last: {format_compact_duration(time_since_failure)} ago)"
last_failure_colored = f" (Last: {failure_recency_color}{format_compact_duration(time_since_failure)} ago{RESET})"
else:
last_failure_plain = " (Last: Never)"
last_failure_colored = f" (Last: {GREEN}Never{RESET})"
metrics.append(f"Failures: {self.consecutive_failures}{last_failure_plain}")
colored_metrics.append(f"{BOLD}Failures:{RESET} {failure_color}{self.consecutive_failures}{RESET}{last_failure_colored}")
# Remediation
remediation_color = YELLOW if self.remediation_state else GREEN
remediation_detail = self.remediation_state or 'None'
if self.remediation_state:
remediation_duration = current_time - self.remediation_start_time
remediation_detail += f" ({format_compact_duration(remediation_duration)})"
metrics.append(f"Remediation: {remediation_detail}")
colored_metrics.append(f"{BOLD}Remediation:{RESET} {remediation_color}{remediation_detail}{RESET}")
# Stability timer
if self.stability_start_time:
elapsed_stable = current_time - self.stability_start_time
stability_color = GREEN if elapsed_stable >= 1800 else YELLOW  # Green after 30 min
stability_detail = format_compact_duration(elapsed_stable)
else:
stability_color = CYAN
stability_detail = "Not active"
metrics.append(f"Stability: {stability_detail}")
colored_metrics.append(f"{BOLD}Stability:{RESET} {stability_color}{stability_detail}{RESET}")
# VPN status (duration is time since the last status change, not process uptime)
vpn_color = GREEN if self.vpn_status == 'running' else RED
vpn_detail = f"{self.vpn_status} ({format_compact_duration(current_time - self.last_vpn_status_change)})"
metrics.append(f"VPN: {vpn_detail}")
colored_metrics.append(f"{BOLD}VPN:{RESET} {vpn_color}{vpn_detail}{RESET}")
# Public IP
ip_color = GREEN if self.public_ip and self.public_ip != 'unknown' else YELLOW
ip_detail = self.public_ip or 'unknown'
if self.public_ip and self.last_public_ip_change:
ip_duration = current_time - self.last_public_ip_change
ip_detail += f" ({format_compact_duration(ip_duration)})"
metrics.append(f"IP: {ip_detail}")
colored_metrics.append(f"{BOLD}IP:{RESET} {ip_color}{ip_detail}{RESET}")
return {
'multiline': "\n ".join([""] + colored_metrics), # For hierarchical format with colors
'compact': " | ".join(metrics) # For single-line format without colors
}
def _format_duration(self, seconds: float) -> str:
"""Format duration in human-readable format"""
from utils.time_utils import format_human_readable_time
return format_human_readable_time(seconds)
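# A quick sketch of the two output shapes (hedged: in-memory state, no Consul;
# 203.0.113.7 is a documentation-range example address):
if __name__ == "__main__":
    sm = StateManager()
    sm.update_vpn_status('running')
    sm.update_public_ip('203.0.113.7', {'country': 'NL'})
    metrics = sm.get_debug_metrics()
    print(metrics['compact'])    # single line, no ANSI colors
    print(metrics['multiline'])  # indented block with ANSI colors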

14
utils/__init__.py Normal file
View File

@@ -0,0 +1,14 @@
"""
Utility functions for qBittorrent connection monitoring
"""
from .time_utils import format_human_readable_time, calculate_time_remaining
from .formatters import format_connection_status, format_debug_metrics, format_torrent_list
__all__ = [
'format_human_readable_time',
'calculate_time_remaining',
'format_connection_status',
'format_debug_metrics',
'format_torrent_list'
]

Binary file not shown.

Binary file not shown.

Binary file not shown.

56
utils/formatters.py Normal file
View File

@@ -0,0 +1,56 @@
from typing import Any, Dict
def format_connection_status(status: Dict[str, Any]) -> str:
"""
Format connection status for logging
Args:
status: Connection status dictionary
Returns:
Formatted status string
"""
connection_status = status.get('connection_status', 'unknown')
dht_nodes = status.get('dht_nodes', 0)
return f"Status: {connection_status}, DHT Nodes: {dht_nodes}"
def format_debug_metrics(metrics: Dict[str, Any]) -> str:
"""
Format debug metrics for logging
Args:
metrics: Dictionary of debug metrics
Returns:
Formatted debug metrics string
"""
parts = []
if 'transition_info' in metrics:
parts.append(metrics['transition_info'])
if 'failure_info' in metrics:
parts.append(metrics['failure_info'])
if 'last_failure_info' in metrics:
parts.append(metrics['last_failure_info'])
if 'remediation_info' in metrics:
parts.append(metrics['remediation_info'])
if 'stability_info' in metrics:
parts.append(metrics['stability_info'])
return " - ".join(parts)
def format_torrent_list(torrents: list) -> str:
"""
Format torrent list for logging
Args:
torrents: List of torrent dictionaries
Returns:
Formatted torrent list string
"""
if not torrents:
return "No torrents"
hashes = [torrent.get('hash', 'unknown') for torrent in torrents]
return f"{len(hashes)} torrents: {'|'.join(hashes[:3])}{'...' if len(hashes) > 3 else ''}"

49
utils/time_utils.py Normal file
View File

@@ -0,0 +1,49 @@
from datetime import timedelta
from typing import Optional
def format_human_readable_time(seconds: float) -> str:
"""
Convert seconds to human-readable time format using datetime.timedelta
Args:
seconds: Number of seconds to format
Returns:
Human-readable time string (e.g., "1 day 2 hours 3 minutes 4 seconds")
"""
td = timedelta(seconds=seconds)
# Extract components
days = td.days
hours, remainder = divmod(td.seconds, 3600)
minutes, seconds_remaining = divmod(remainder, 60)
# Build the human-readable string
parts = []
if days > 0:
parts.append(f"{days} day" if days == 1 else f"{days} days")
if hours > 0:
parts.append(f"{hours} hour" if hours == 1 else f"{hours} hours")
if minutes > 0:
parts.append(f"{minutes} minute" if minutes == 1 else f"{minutes} minutes")
if seconds_remaining > 0 or not parts: # Include seconds if no other parts or if seconds exist
parts.append(f"{seconds_remaining} second" if seconds_remaining == 1 else f"{seconds_remaining} seconds")
return " ".join(parts)
def calculate_time_remaining(start_time: float, duration_required: float, current_time: Optional[float] = None) -> float:
"""
Calculate remaining time for a duration requirement
Args:
start_time: When the timer started
duration_required: Total duration required
current_time: Current time (defaults to time.time())
Returns:
Remaining time in seconds (0 if elapsed time exceeds requirement)
"""
import time
# Treat only None as "use now" so an explicit timestamp of 0 is honored
current = time.time() if current_time is None else current_time
elapsed = current - start_time
return max(0, duration_required - elapsed)
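# Expected outputs (hedged: values chosen purely for illustration):
if __name__ == "__main__":
    print(format_human_readable_time(3725))  # 1 hour 2 minutes 5 seconds
    print(format_human_readable_time(0))     # 0 seconds
    print(calculate_time_remaining(start_time=100.0, duration_required=3600, current_time=1000.0))  # 2700.0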