Files
AICyclingCoach/backend/app/services/health_monitor.py
2025-09-12 09:08:10 -07:00

94 lines
3.6 KiB
Python

import psutil
from datetime import datetime
import logging
from typing import Dict, Any
from sqlalchemy import text
from backend.app.database import get_db
from backend.app.models.garmin_sync_log import GarminSyncLog, SyncStatus
import requests
from backend.app.config import settings
logger = logging.getLogger(__name__)
class HealthMonitor:
def __init__(self):
self.warning_thresholds = {
'cpu_percent': 80,
'memory_percent': 75,
'disk_percent': 85
}
def check_system_health(self) -> Dict[str, Any]:
"""Check vital system metrics and log warnings"""
metrics = {
'timestamp': datetime.utcnow().isoformat(),
'cpu': psutil.cpu_percent(),
'memory': psutil.virtual_memory().percent,
'disk': psutil.disk_usage('/').percent,
'services': self._check_service_health()
}
self._log_anomalies(metrics)
return metrics
def _check_service_health(self) -> Dict[str, str]:
"""Check critical application services"""
return {
'database': self._check_database(),
'garmin_sync': self._check_garmin_sync(),
'ai_service': self._check_ai_service(),
'sync_queue_size': self._get_sync_queue_size(),
'pending_analyses': self._count_pending_analyses()
}
def _get_sync_queue_size(self) -> int:
"""Get number of pending sync operations"""
from backend.app.models.garmin_sync_log import GarminSyncLog, SyncStatus
return GarminSyncLog.query.filter_by(status=SyncStatus.PENDING).count()
def _count_pending_analyses(self) -> int:
"""Count workouts needing analysis"""
from backend.app.models.workout import Workout
return Workout.query.filter_by(analysis_status='pending').count()
def _check_database(self) -> str:
try:
with get_db() as db:
db.execute(text("SELECT 1"))
return "ok"
except Exception as e:
logger.error("Database check failed", extra={"component": "database", "error": str(e)})
return "down"
def _check_garmin_sync(self) -> str:
try:
last_sync = GarminSyncLog.get_latest()
if last_sync and last_sync.status == SyncStatus.FAILED:
logger.warning("Garmin sync has failed status", extra={"component": "garmin_sync", "status": last_sync.status.value})
return "warning"
return "ok"
except Exception as e:
logger.error("Garmin sync check failed", extra={"component": "garmin_sync", "error": str(e)})
return "down"
def _check_ai_service(self) -> str:
try:
response = requests.get(
f"{settings.AI_SERVICE_URL}/ping",
timeout=5,
headers={"Authorization": f"Bearer {settings.OPENROUTER_API_KEY}"}
)
return "ok" if response.ok else "down"
except Exception as e:
logger.error("AI service check failed", extra={"component": "ai_service", "error": str(e)})
return "down"
def _log_anomalies(self, metrics: Dict[str, Any]):
alerts = []
for metric, value in metrics.items():
if metric in self.warning_thresholds and value > self.warning_thresholds[metric]:
alerts.append(f"{metric} {value}%")
logger.warning("System threshold exceeded", extra={"metric": metric, "value": value, "threshold": self.warning_thresholds[metric]})
if alerts:
logger.warning("System thresholds exceeded", extra={"alerts": alerts})