From 20bda0cee0b2f4e5e33d23ff31b382d4226ee4a7 Mon Sep 17 00:00:00 2001 From: sstent Date: Sat, 9 Aug 2025 19:27:29 -0700 Subject: [PATCH] first post --- consul-monitor/Dockerfile | 27 ++ consul-monitor/app.py | 126 ++++++++ consul-monitor/consul_client.py | 82 +++++ consul-monitor/database.py | 121 +++++++ consul-monitor/requirements.txt | 2 + consul-monitor/static/css/style.css | 113 +++++++ consul-monitor/static/js/app.js | 74 +++++ consul-monitor/templates/index.html | 63 ++++ design.md | 318 +++++++++++++++++++ plan_phase1.md | 477 ++++++++++++++++++++++++++++ 10 files changed, 1403 insertions(+) create mode 100644 consul-monitor/Dockerfile create mode 100644 consul-monitor/app.py create mode 100644 consul-monitor/consul_client.py create mode 100644 consul-monitor/database.py create mode 100644 consul-monitor/requirements.txt create mode 100644 consul-monitor/static/css/style.css create mode 100644 consul-monitor/static/js/app.js create mode 100644 consul-monitor/templates/index.html create mode 100644 design.md create mode 100644 plan_phase1.md diff --git a/consul-monitor/Dockerfile b/consul-monitor/Dockerfile new file mode 100644 index 0000000..0c5adcb --- /dev/null +++ b/consul-monitor/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +COPY . . 
+ +# Create non-root user +RUN useradd -m appuser && chown -R appuser:appuser /app +USER appuser + +# Expose port +EXPOSE 5000 + +# Environment variables +ENV FLASK_APP=app.py +ENV FLASK_ENV=production + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD python -c "import requests; requests.get('http://localhost:5000/health', timeout=5)" || exit 1 + +CMD ["python", "-m", "flask", "run", "--host=0.0.0.0"] diff --git a/consul-monitor/app.py b/consul-monitor/app.py new file mode 100644 index 0000000..4f32bf0 --- /dev/null +++ b/consul-monitor/app.py @@ -0,0 +1,126 @@ +from flask import Flask, render_template, jsonify, g +import sqlite3 +import json +from datetime import datetime +import database +import consul_client + +app = Flask(__name__) + +def get_db(): + """Get a thread-local database connection""" + if 'db_conn' not in g: + g.db_conn = database.init_database() + database.create_tables(g.db_conn) + return g.db_conn + +@app.teardown_appcontext +def close_db(e=None): + """Close database connection at end of request""" + db_conn = g.pop('db_conn', None) + if db_conn is not None: + db_conn.close() + +@app.route('/') +def index(): + """Render the main dashboard""" + # Get thread-local database connection + db_conn = get_db() + + # Get initial service data + services = database.get_all_services_with_health(db_conn) + consul_available = consul_client.is_consul_available() + + # Generate URLs for services + for service in services: + if service['port']: + service['url'] = f"http://{service['name']}.service.dc1.consul:{service['port']}" + else: + service['url'] = None + + return render_template('index.html', services=services, consul_available=consul_available) + +@app.route('/api/services') +def get_services(): + """API endpoint to get service data""" + # Get thread-local database connection + db_conn = get_db() + + try: + # Try to get fresh data from Consul + if consul_client.is_consul_available(): + service_data = 
consul_client.fetch_all_service_data() + + # Update database with fresh data + for service_id, data in service_data.items(): + # Upsert service + database.upsert_service(db_conn, { + 'id': service_id, + 'name': data['name'], + 'address': data['address'], + 'port': data['port'], + 'tags': data['tags'], + 'meta': data['meta'] + }) + + # Insert health checks + for check in data['health_checks']: + database.insert_health_check(db_conn, service_id, + check['check_name'], check['status']) + + # Retrieve services from DB with updated data + services = database.get_all_services_with_health(db_conn) + consul_available = True + else: + raise Exception("Consul unavailable") + + except Exception as e: + # Fallback to cached data + services = database.get_all_services_with_health(db_conn) + consul_available = False + error_message = str(e) + + # Generate URLs for services + for service in services: + if service['port']: + service['url'] = f"http://{service['name']}.service.dc1.consul:{service['port']}" + else: + service['url'] = None + + # Prepare response + if consul_available: + response = { + 'status': 'success', + 'consul_available': True, + 'services': services + } + else: + response = { + 'status': 'error', + 'consul_available': False, + 'services': services, + 'error': error_message + } + + return jsonify(response) + +@app.route('/health') +def health_check(): + """Health check endpoint""" + # Get thread-local database connection + db_conn = get_db() + + db_available = database.is_database_available(db_conn) + consul_available = consul_client.is_consul_available() + + status = 'healthy' if db_available and consul_available else 'unhealthy' + + return jsonify({ + 'status': status, + 'consul': 'connected' if consul_available else 'disconnected', + 'database': 'available' if db_available else 'unavailable', + 'timestamp': datetime.utcnow().isoformat() + }) + +if __name__ == '__main__': + app.run(host='0.0.0.0', port=5000) diff --git a/consul-monitor/consul_client.py 
b/consul-monitor/consul_client.py new file mode 100644 index 0000000..fddc096 --- /dev/null +++ b/consul-monitor/consul_client.py @@ -0,0 +1,82 @@ +import requests +import logging + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# Consul configuration +CONSUL_HOST = "consul.service.dc1.consul" +CONSUL_PORT = 8500 +CONSUL_BASE_URL = f"http://{CONSUL_HOST}:{CONSUL_PORT}" + +def get_consul_services(): + """Fetch all registered services from Consul""" + url = f"{CONSUL_BASE_URL}/v1/agent/services" + try: + response = requests.get(url, timeout=5) + response.raise_for_status() + return response.json() + except requests.exceptions.RequestException as e: + logger.error(f"Failed to fetch Consul services: {e}") + raise + +def get_service_health(service_name): + """Fetch health checks for a specific service""" + url = f"{CONSUL_BASE_URL}/v1/health/service/{service_name}" + try: + response = requests.get(url, timeout=5) + response.raise_for_status() + data = response.json() + + # Process health checks + health_checks = [] + for entry in data: + for check in entry.get('Checks', []): + health_checks.append({ + 'check_name': check.get('Name', ''), + 'status': check.get('Status', '') + }) + return health_checks + except requests.exceptions.RequestException as e: + logger.error(f"Failed to fetch health for service {service_name}: {e}") + raise + +def is_consul_available(): + """Check if Consul is reachable""" + try: + response = requests.get(f"{CONSUL_BASE_URL}/v1/agent/self", timeout=2) + return response.status_code == 200 + except requests.exceptions.RequestException: + return False + +def fetch_all_service_data(): + """Fetch service data and health status for all services""" + try: + services = get_consul_services() + service_data = {} + + for service_id, service_info in services.items(): + service_name = service_info.get('Service', '') + health_checks = [] + + try: + 
health_checks = get_service_health(service_name) + except requests.exceptions.RequestException: + # Log but continue with other services + logger.warning(f"Skipping health checks for service {service_name}") + + service_data[service_id] = { + 'id': service_id, + 'name': service_info.get('Service', ''), + 'address': service_info.get('Address', ''), + 'port': service_info.get('Port', None), + 'tags': service_info.get('Tags', []), + 'meta': service_info.get('Meta', {}), + 'health_checks': health_checks + } + + return service_data + except requests.exceptions.RequestException: + logger.error("Failed to fetch service data from Consul") + return {} diff --git a/consul-monitor/database.py b/consul-monitor/database.py new file mode 100644 index 0000000..2eb6e86 --- /dev/null +++ b/consul-monitor/database.py @@ -0,0 +1,121 @@ +import sqlite3 +import json +from datetime import datetime + +def create_tables(conn): + cursor = conn.cursor() + # Create services table + cursor.execute(''' + CREATE TABLE IF NOT EXISTS services ( + id TEXT PRIMARY KEY, + name TEXT NOT NULL, + address TEXT, + port INTEGER, + tags TEXT, + meta TEXT, + first_seen DATETIME DEFAULT CURRENT_TIMESTAMP, + last_seen DATETIME DEFAULT CURRENT_TIMESTAMP + ) + ''') + + # Create health checks table + cursor.execute(''' + CREATE TABLE IF NOT EXISTS health_checks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + service_id TEXT NOT NULL, + check_name TEXT, + status TEXT NOT NULL, + timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (service_id) REFERENCES services (id) + ) + ''') + + # Create indexes + cursor.execute(''' + CREATE INDEX IF NOT EXISTS idx_health_checks_service_timestamp + ON health_checks (service_id, timestamp) + ''') + cursor.execute(''' + CREATE INDEX IF NOT EXISTS idx_health_checks_timestamp + ON health_checks (timestamp) + ''') + + conn.commit() + conn.commit() + +def init_database(): + """Initialize database and return connection""" + return 
sqlite3.connect('file:consul-monitor.db?mode=memory&cache=shared', uri=True) + +def upsert_service(conn, service_data): + cursor = conn.cursor() + cursor.execute(''' + INSERT INTO services (id, name, address, port, tags, meta) + VALUES (?, ?, ?, ?, ?, ?) + ON CONFLICT(id) DO UPDATE SET + name = excluded.name, + address = excluded.address, + port = excluded.port, + tags = excluded.tags, + meta = excluded.meta, + last_seen = CURRENT_TIMESTAMP + ''', ( + service_data['id'], + service_data['name'], + service_data.get('address'), + service_data.get('port'), + json.dumps(service_data.get('tags', [])), + json.dumps(service_data.get('meta', {})) + )) + conn.commit() + +def insert_health_check(conn, service_id, check_name, status): + cursor = conn.cursor() + cursor.execute(''' + INSERT INTO health_checks (service_id, check_name, status) + VALUES (?, ?, ?) + ''', (service_id, check_name, status)) + conn.commit() + +def get_all_services_with_health(conn): + cursor = conn.cursor() + cursor.execute(''' + SELECT s.id, s.name, s.address, s.port, s.tags, s.meta, + h.status, MAX(h.timestamp) AS last_check + FROM services s + LEFT JOIN health_checks h ON s.id = h.service_id + GROUP BY s.id + ''') + + services = [] + for row in cursor.fetchall(): + service = { + 'id': row[0], + 'name': row[1], + 'address': row[2], + 'port': row[3], + 'tags': json.loads(row[4]) if row[4] else [], + 'meta': json.loads(row[5]) if row[5] else {}, + 'current_status': row[6] or 'unknown', + 'last_check': row[7] + } + services.append(service) + return services + +def get_service_history(conn, service_id, hours=24): + cursor = conn.cursor() + cursor.execute(''' + SELECT status, timestamp + FROM health_checks + WHERE service_id = ? + AND timestamp >= datetime('now', ?) 
+ ORDER BY timestamp + ''', (service_id, f'-{hours} hours')) + return cursor.fetchall() + +def is_database_available(conn): + try: + conn.execute('SELECT 1') + return True + except sqlite3.Error: + return False diff --git a/consul-monitor/requirements.txt b/consul-monitor/requirements.txt new file mode 100644 index 0000000..b509766 --- /dev/null +++ b/consul-monitor/requirements.txt @@ -0,0 +1,2 @@ +Flask==2.3.3 +requests==2.31.0 diff --git a/consul-monitor/static/css/style.css b/consul-monitor/static/css/style.css new file mode 100644 index 0000000..6955231 --- /dev/null +++ b/consul-monitor/static/css/style.css @@ -0,0 +1,113 @@ +/* Basic reset and layout */ +* { + margin: 0; + padding: 0; + box-sizing: border-box; +} +body { + font-family: Arial, sans-serif; + background: #f5f5f5; + padding: 0; + margin: 0; +} + +/* Header */ +.header { + background: white; + padding: 1rem 2rem; + box-shadow: 0 2px 4px rgba(0,0,0,0.1); + display: flex; + justify-content: space-between; + align-items: center; + position: sticky; + top: 0; + z-index: 100; +} + +/* Alert banners */ +.error-banner, .warning-banner { + padding: 0.75rem 2rem; + margin: 0; + font-weight: bold; +} +.error-banner { + background: #fee; + color: #c33; +} +.warning-banner { + background: #fff3cd; + color: #856404; +} + +/* Services table */ +.services-container { + padding: 2rem; +} +.services-table { + width: 100%; + background: white; + border-radius: 8px; + box-shadow: 0 2px 4px rgba(0,0,0,0.1); + border-collapse: collapse; + overflow: hidden; +} +.services-table th, .services-table td { + padding: 1rem; + text-align: left; + border-bottom: 1px solid #eee; +} +.services-table th { + background: #f8f9fa; + font-weight: bold; +} + +/* Status indicators */ +.status-icon { + font-size: 1.2rem; + display: inline-block; + width: 24px; + text-align: center; +} +.status-passing { color: #28a745; } +.status-warning { color: #ffc107; } +.status-critical { color: #dc3545; } +.status-unknown { color: #6c757d; } + 
+/* Tags */ +.tag { + display: inline-block; + background: #e9ecef; + padding: 0.25rem 0.5rem; + border-radius: 4px; + font-size: 0.875rem; + margin-right: 0.5rem; + margin-bottom: 0.25rem; +} + +/* Buttons */ +button { + background: #007bff; + color: white; + border: none; + padding: 0.5rem 1rem; + border-radius: 4px; + cursor: pointer; + font-size: 1rem; +} +button:hover { background: #0056b3; } +button:disabled { + background: #6c757d; + cursor: not-allowed; + opacity: 0.7; +} + +/* No services message */ +.no-services { + padding: 2rem; + text-align: center; + background: white; + border-radius: 8px; + margin-top: 1rem; + font-style: italic; + color: #6c757d; +} diff --git a/consul-monitor/static/js/app.js b/consul-monitor/static/js/app.js new file mode 100644 index 0000000..0f72616 --- /dev/null +++ b/consul-monitor/static/js/app.js @@ -0,0 +1,74 @@ +console.log('app.js loading...'); + +// Define the serviceMonitor component +function serviceMonitor() { + console.log('serviceMonitor function called'); + return { + services: [], + loading: false, + error: null, + consulAvailable: true, + + init() { + console.log('Initializing serviceMonitor component'); + this.refreshServices(); + }, + + async refreshServices() { + console.log('Refreshing services'); + this.loading = true; + this.error = null; + + try { + const response = await fetch('/api/services'); + const data = await response.json(); + + if (data.status === 'success') { + this.services = data.services; + this.consulAvailable = data.consul_available; + } else { + this.error = data.error || 'Failed to fetch services'; + this.services = data.services || []; + this.consulAvailable = data.consul_available; + } + } catch (err) { + this.error = 'Network error: ' + err.message; + this.services = []; + this.consulAvailable = false; + } finally { + this.loading = false; + } + }, + + getStatusClass(status) { + return { + 'status-passing': status === 'passing', + 'status-warning': status === 'warning', + 
'status-critical': status === 'critical', + 'status-unknown': !status || status === 'unknown' + }; + }, + + getStatusEmoji(status) { + switch(status) { + case 'passing': return '🟢'; + case 'warning': return '🟡'; + case 'critical': return '🔴'; + default: return '⚪'; + } + } + }; +} + +// Try to register with Alpine.js with fallback to window +try { + console.log('Registering with Alpine.js'); + Alpine.data('serviceMonitor', serviceMonitor); + console.log('Alpine registration successful'); +} catch (error) { + console.error('Alpine registration failed:', error); + window.serviceMonitor = serviceMonitor; + console.log('Fallback to window registration'); +} + +console.log('app.js loaded'); diff --git a/consul-monitor/templates/index.html b/consul-monitor/templates/index.html new file mode 100644 index 0000000..9231583 --- /dev/null +++ b/consul-monitor/templates/index.html @@ -0,0 +1,63 @@ + + + + Consul Service Monitor + + + + + +
+

Consul Service Monitor

+
+ +
+
+ +
+
+ ⚠️ Consul connection failed - showing cached data +
+ +
+ + + + + + + + + + + + +
Service NameStatusURLTags
+ +
+ No services found +
+
+ + diff --git a/design.md b/design.md new file mode 100644 index 0000000..40902d5 --- /dev/null +++ b/design.md @@ -0,0 +1,318 @@ +# Consul Service Monitor - Design Document + +## Overview + +A web-based dashboard application that monitors and visualizes the health status of services registered in HashiCorp Consul. The application provides real-time monitoring with historical health tracking capabilities. + +## Architecture + +### High-Level Components + +1. **Web Frontend** - Interactive dashboard displaying service status +2. **Backend API** - REST API for data retrieval and configuration +3. **Data Collection Service** - Background service polling Consul for health data +4. **SQLite Database** - Historical health check data storage +5. **Consul Integration** - Service discovery and health check monitoring + +### Technology Stack + +- **Frontend**: HTML5, CSS3, JavaScript (with Chart.js for visualizations) +- **Backend**: Python 3.9+ with Flask +- **Database**: SQLite (ephemeral storage) +- **Service Discovery**: HashiCorp Consul (consul.service.dc1.consul) +- **Updates**: Periodic polling (no WebSockets needed) + +## Functional Requirements + +### Core Features + +#### 1. Service List Display +- Display all services registered in Consul +- Show service name, ID, and tags +- Provide clickable links to service URLs +- Support sorting and filtering + +#### 2. Health Status Visualization +- **Current Status Indicator** + - Green icon: All health checks passing + - Red icon: One or more health checks failing + - Yellow icon: Warning state (if supported) +- **Historical Status Chart** + - Mini bar chart showing 24-hour health history + - Time-based visualization (hourly aggregation) + - Color-coded status representation + +#### 3. Auto-refresh Functionality +- Toggle switch to enable/disable auto-refresh +- Configurable refresh interval (30s, 1m, 2m, 5m, 10m) +- Visual indicator when auto-refresh is active +- Manual refresh button + +#### 4. 
Configuration Management +- Session-based storage of user preferences (no persistence needed) +- Configurable history granularity (5m, 15m, 30m, 1h) - default: 15 minutes + +## Database Schema + +### Tables + +```sql +-- Services table +CREATE TABLE services ( + id TEXT PRIMARY KEY, + name TEXT NOT NULL, + address TEXT, + port INTEGER, + tags TEXT, -- JSON array + meta TEXT, -- JSON object + first_seen DATETIME DEFAULT CURRENT_TIMESTAMP, + last_seen DATETIME DEFAULT CURRENT_TIMESTAMP +); + +-- Health checks table +CREATE TABLE health_checks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + service_id TEXT NOT NULL, + check_id TEXT NOT NULL, + check_name TEXT, + status TEXT NOT NULL, -- 'passing', 'warning', 'critical' + output TEXT, + timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (service_id) REFERENCES services (id) +); + +-- Configuration table (session-based, optional for defaults) +CREATE TABLE config ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL, + updated_at DATETIME DEFAULT CURRENT_TIMESTAMP +); + +-- Service URLs are generated using pattern: http://{service_name}.service.dc1.consul:{port} + +-- Indexes for performance +CREATE INDEX idx_health_checks_service_timestamp ON health_checks (service_id, timestamp); +CREATE INDEX idx_health_checks_timestamp ON health_checks (timestamp); +``` + +## API Design + +### REST Endpoints + +```python +# Flask routes +GET / +- Serves main dashboard HTML page + +GET /api/services +- Returns list of all services with current health status +- Generated URLs: http://{service_name}.service.dc1.consul:{port} +- Response: Array of service objects with health summary + +GET /api/services/<service_id>/history +- Returns historical health data for charts +- Query params: ?granularity=15 (minutes: 5,15,30,60) +- Response: Time-series data for Chart.js + +POST /api/config +- Updates session configuration +- Body: { "autoRefresh": true, "refreshInterval": 60, "historyGranularity": 15 } + +GET /api/config +- Returns current session 
configuration +``` + +## Data Collection Service + +### Polling Strategy + +```yaml +Consul Polling: + - Interval: 60 seconds + - Consul Address: consul.service.dc1.consul:8500 + - Endpoints: + - /v1/agent/services (service discovery) + - /v1/health/service/{service} (health checks) + - No authentication required + - Error handling: Log errors, continue polling + - Expected services: 30-40 services + +Data Retention: + - Keep detailed data for 24 hours only (ephemeral storage) + - No long-term aggregation needed + - Database recreated on container restart +``` + +### Health Check Processing + +1. **Data Collection** + - Poll Consul API for service list + - For each service, fetch health check status + - Store raw health check data with timestamps + +2. **Status Aggregation** + - Service-level status: Worst status among all checks + - Historical aggregation: Count of passing/warning/critical per time window + +3. **Change Detection** + - Compare current status with previous poll + - Trigger notifications/updates on status changes + - Maintain service registration/deregistration events + +## Frontend Design + +### Main Dashboard Layout + +``` +┌─────────────────────────────────────────────────┐ +│ Consul Service Monitor [⚙️] [🔄] │ +├─────────────────────────────────────────────────┤ +│ Auto-refresh: [ON/OFF] Interval: [1m ▼] │ +│ History granularity: [15m ▼] │ +├─────────────────────────────────────────────────┤ +│ Service Name │ Status │ URL │ History │ +├─────────────────┼────────┼──────────┼───────────┤ +│ web-api │ 🟢 │ [link] │ ▆▆█▆█▆▆ │ +│ database │ 🔴 │ [link] │ █▆▆▄▂▂▄ │ +│ cache-service │ 🟢 │ [link] │ ████████ │ +└─────────────────────────────────────────────────┘ +``` + +### Interactive Elements + +- **Status Icons**: Visual indicators only (no detailed popup needed) +- **History Charts**: Chart.js mini bar charts with 24-hour data +- **Service Links**: URLs generated as http://{service_name}.service.dc1.consul:{port} +- **Desktop-optimized**: No mobile 
responsive design required + +### Updates + +- Periodic AJAX polling for updates +- Configurable refresh intervals (30s, 1m, 2m, 5m, 10m) +- Visual loading indicators during refresh + +## Configuration Management + +### User Settings (Session-based) + +```json +{ + "autoRefresh": { + "enabled": false, + "interval": 60, + "options": [30, 60, 120, 300, 600] + }, + "display": { + "historyGranularity": 15, + "granularityOptions": [5, 15, 30, 60] + } +} +``` + +### System Configuration + +```python +# Flask configuration +CONSUL_HOST = "consul.service.dc1.consul" +CONSUL_PORT = 8500 +DATABASE_PATH = ":memory:" # Ephemeral SQLite +POLL_INTERVAL = 60 # seconds +MAX_SERVICES = 50 # Safety limit +``` + +## Deployment Considerations + +### Docker Deployment + +```dockerfile +FROM python:3.11-slim + +WORKDIR /app + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +COPY . . + +# Expose port +EXPOSE 5000 + +# Set environment variables +ENV FLASK_APP=app.py +ENV FLASK_ENV=production +ENV CONSUL_HOST=consul.service.dc1.consul +ENV CONSUL_PORT=8500 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:5000/health || exit 1 + +CMD ["python", "-m", "flask", "run", "--host=0.0.0.0"] +``` + +### Python Dependencies (requirements.txt) + +``` +Flask==2.3.3 +requests==2.31.0 +# sqlite3 ships with the Python standard library; do not list it as a requirement +APScheduler==3.10.4 # For background polling +``` + +### Environment Variables + +- `CONSUL_HOST`: Consul server hostname (default: consul.service.dc1.consul) +- `CONSUL_PORT`: Consul server port (default: 8500) +- `FLASK_PORT`: Web server port (default: 5000) +- `POLL_INTERVAL`: Health check polling interval in seconds (default: 60) + +### Health Checks + +The application should expose its own health endpoint: +- `GET /health`: Returns application health status +- `GET /metrics`: Prometheus-style metrics (optional) + +## Security Considerations + +1. 
**Consul Access**: No authentication required for your setup +2. **Database**: Ephemeral SQLite in container memory +3. **Web Interface**: Open dashboard, no authentication needed +4. **Input Validation**: Sanitize service names and configuration inputs +5. **Container Security**: Run as non-root user in container + +## Future Enhancements + +- **Alerting**: Email/Slack notifications on service failures (mentioned as future feature) +- **Service Filtering**: Search and filter capabilities for larger service lists +- **Service Details**: Detailed health check information popup/modal +- **Themes**: Dark/light mode toggle +- **Export**: Export health data as CSV/JSON +- **Custom Time Ranges**: Configurable history periods beyond 24 hours + +## Development Phases + +### Phase 1: Core Functionality +- Basic Consul integration +- SQLite database setup +- Simple web interface +- Manual refresh capability + +### Phase 2: Real-time Features +- Auto-refresh functionality +- WebSocket integration +- Historical data visualization +- Configuration persistence + +### Phase 3: Enhanced UX +- Responsive design +- Advanced filtering +- Performance optimizations +- Error handling improvements + +### Phase 4: Production Ready +- Docker deployment +- Security hardening +- Monitoring and logging +- Documentation and testing \ No newline at end of file diff --git a/plan_phase1.md b/plan_phase1.md new file mode 100644 index 0000000..0077597 --- /dev/null +++ b/plan_phase1.md @@ -0,0 +1,477 @@ +# Phase 1 Implementation Plan - Consul Service Monitor + +## Overview +Implement the core functionality for a Flask-based Consul service monitoring dashboard. This phase focuses on basic Consul integration, SQLite database setup, and a simple web interface with manual refresh capability. 
+ +## Project Structure +Create the following directory structure: +``` +consul-monitor/ +├── app.py # Main Flask application +├── consul_client.py # Consul API integration +├── database.py # SQLite database operations +├── requirements.txt # Python dependencies +├── templates/ +│ └── index.html # Main dashboard template +├── static/ +│ ├── css/ +│ │ └── style.css # Dashboard styles +│ └── js/ +│ └── app.js # Frontend JavaScript with Alpine.js +└── Dockerfile # Container configuration +``` + +## Dependencies (requirements.txt) +``` +Flask==2.3.3 +requests==2.31.0 +``` + +## Database Implementation (database.py) + +### Database Schema +Implement exactly these SQLite tables: + +```sql +-- Services table +CREATE TABLE IF NOT EXISTS services ( + id TEXT PRIMARY KEY, + name TEXT NOT NULL, + address TEXT, + port INTEGER, + tags TEXT, -- Store as JSON string + meta TEXT, -- Store as JSON string + first_seen DATETIME DEFAULT CURRENT_TIMESTAMP, + last_seen DATETIME DEFAULT CURRENT_TIMESTAMP +); + +-- Health checks table +CREATE TABLE IF NOT EXISTS health_checks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + service_id TEXT NOT NULL, + check_name TEXT, + status TEXT NOT NULL, -- 'passing', 'warning', 'critical' + timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (service_id) REFERENCES services (id) +); + +-- Indexes for performance +CREATE INDEX IF NOT EXISTS idx_health_checks_service_timestamp +ON health_checks (service_id, timestamp); +``` + +### Database Functions +Create these specific functions in database.py: + +1. **`init_database()`**: Initialize SQLite database with the above schema +2. **`upsert_service(service_data)`**: Insert or update service record + - Parameters: dictionary with id, name, address, port, tags (as JSON string), meta (as JSON string) + - Update last_seen timestamp on existing records +3. **`insert_health_check(service_id, check_name, status)`**: Insert health check record +4. 
**`get_all_services_with_health()`**: Return all services with their latest health status + - Join services table with latest health_checks record per service + - Return list of dictionaries with service details + current health status +5. **`get_service_history(service_id, hours=24)`**: Get health history for specific service +6. **`is_database_available()`**: Test database connectivity + +## Consul Client Implementation (consul_client.py) + +### Configuration +Set these constants: +```python +CONSUL_HOST = "consul.service.dc1.consul" +CONSUL_PORT = 8500 +CONSUL_BASE_URL = f"http://{CONSUL_HOST}:{CONSUL_PORT}" +``` + +### Consul Functions +Implement these specific functions: + +1. **`get_consul_services()`**: + - Call `/v1/agent/services` endpoint + - Return dictionary of services or raise exception on failure + - Handle HTTP errors and connection timeouts + +2. **`get_service_health(service_name)`**: + - Call `/v1/health/service/{service_name}` endpoint + - Parse health check results + - Return list of health checks with check_name and status + - Handle cases where service has no health checks + +3. **`is_consul_available()`**: + - Test connection to Consul + - Return True/False boolean + +4. **`fetch_all_service_data()`**: + - Orchestrate calls to get_consul_services() and get_service_health() + - Return combined service and health data + - Handle partial failures gracefully + +## Flask Application (app.py) + +### Application Configuration +```python +from flask import Flask, render_template, jsonify +import sqlite3 +import json +from datetime import datetime +``` + +### Flask Routes +Implement exactly these routes: + +1. **`GET /`**: + - Render main dashboard using index.html template + - Pass initial service data to template + - Handle database/consul errors gracefully + +2. 
**`GET /api/services`**: + - Return JSON array of all services with current health status + - Include generated URLs using pattern: `http://{service_name}.service.dc1.consul:{port}` + - Response format: + ```json + { + "status": "success|error", + "consul_available": true|false, + "services": [ + { + "id": "service-id", + "name": "service-name", + "address": "10.0.0.1", + "port": 8080, + "url": "http://service-name.service.dc1.consul:8080", + "tags": ["tag1", "tag2"], + "current_status": "passing|warning|critical|unknown", + "last_check": "2024-01-01T12:00:00" + } + ], + "error": "error message if any" + } + ``` + +3. **`GET /health`**: + - Return application health status + - Test both database and Consul connectivity + - Response format: + ```json + { + "status": "healthy|unhealthy", + "consul": "connected|disconnected", + "database": "available|unavailable", + "timestamp": "2024-01-01T12:00:00" + } + ``` + +### Data Flow Logic +Implement this exact flow in the `/api/services` endpoint: + +1. Try to fetch fresh data from Consul using `fetch_all_service_data()` +2. If successful: + - Update database with new service and health data + - Return fresh data with `consul_available: true` +3. If Consul fails: + - Retrieve cached data from database using `get_all_services_with_health()` + - Return cached data with `consul_available: false` and error message +4. If both fail: + - Return error response with empty services array + +## Frontend Implementation + +### HTML Template (templates/index.html) +Create dashboard with this structure: +```html + + + + Consul Service Monitor + + + + + +
+

Consul Service Monitor

+
+ +
+
+ +
+
+ ⚠️ Consul connection failed - showing cached data +
+ +
+ + + + + + + + + + + + +
Service NameStatusURLTags
+ +
+ No services found +
+
+ + +``` + +### Alpine.js JavaScript (static/js/app.js) +```javascript +function serviceMonitor() { + return { + services: [], + loading: false, + error: null, + consulAvailable: true, + + init() { + this.refreshServices(); + }, + + async refreshServices() { + this.loading = true; + this.error = null; + + try { + const response = await fetch('/api/services'); + const data = await response.json(); + + if (data.status === 'success') { + this.services = data.services; + this.consulAvailable = data.consul_available; + } else { + this.error = data.error || 'Failed to fetch services'; + this.services = data.services || []; + this.consulAvailable = data.consul_available; + } + } catch (err) { + this.error = 'Network error: ' + err.message; + this.services = []; + this.consulAvailable = false; + } finally { + this.loading = false; + } + }, + + getStatusClass(status) { + return { + 'status-passing': status === 'passing', + 'status-warning': status === 'warning', + 'status-critical': status === 'critical', + 'status-unknown': !status || status === 'unknown' + }; + }, + + getStatusEmoji(status) { + switch(status) { + case 'passing': return '🟢'; + case 'warning': return '🟡'; + case 'critical': return '🔴'; + default: return '⚪'; + } + } + } +} +``` + +### CSS Styling (static/css/style.css) +Implement these specific styles: +```css +/* Basic reset and layout */ +* { margin: 0; padding: 0; box-sizing: border-box; } +body { font-family: Arial, sans-serif; background: #f5f5f5; } + +/* Header */ +.header { + background: white; + padding: 1rem 2rem; + box-shadow: 0 2px 4px rgba(0,0,0,0.1); + display: flex; + justify-content: space-between; + align-items: center; +} + +/* Alert banners */ +.error-banner, .warning-banner { + padding: 0.75rem 2rem; + margin: 0; + font-weight: bold; +} +.error-banner { background: #fee; color: #c33; } +.warning-banner { background: #fff3cd; color: #856404; } + +/* Services table */ +.services-container { padding: 2rem; } +.services-table { + width: 100%; 
+ background: white; + border-radius: 8px; + box-shadow: 0 2px 4px rgba(0,0,0,0.1); + border-collapse: collapse; +} +.services-table th, .services-table td { + padding: 1rem; + text-align: left; + border-bottom: 1px solid #eee; +} +.services-table th { background: #f8f9fa; font-weight: bold; } + +/* Status indicators */ +.status-icon { font-size: 1.2rem; } +.status-passing { color: #28a745; } +.status-warning { color: #ffc107; } +.status-critical { color: #dc3545; } +.status-unknown { color: #6c757d; } + +/* Tags */ +.tag { + display: inline-block; + background: #e9ecef; + padding: 0.25rem 0.5rem; + border-radius: 4px; + font-size: 0.875rem; + margin-right: 0.5rem; +} + +/* Buttons */ +button { + background: #007bff; + color: white; + border: none; + padding: 0.5rem 1rem; + border-radius: 4px; + cursor: pointer; +} +button:hover { background: #0056b3; } +button:disabled { background: #6c757d; cursor: not-allowed; } +``` + +## Error Handling Requirements + +### Consul Connection Errors +- Catch `requests.exceptions.ConnectionError` and `requests.exceptions.Timeout` +- Log errors but continue serving cached data +- Display connection status in UI + +### Database Errors +- Handle SQLite database lock errors +- Graceful degradation when database is unavailable +- Return appropriate HTTP status codes + +### Data Validation +- Validate service data structure from Consul API +- Handle missing or malformed service records +- Default to 'unknown' status for services without health checks + +## Testing Checklist +Before considering Phase 1 complete, verify: + +1. **Database Operations**: + - [ ] Database tables created correctly + - [ ] Services can be inserted/updated + - [ ] Health checks are stored with timestamps + - [ ] Queries return expected data structure + +2. 
**Consul Integration**:
+   - [ ] Can fetch service list from Consul
+   - [ ] Can fetch health status for each service
+   - [ ] Handles Consul connection failures gracefully
+   - [ ] Service URLs generated correctly
+
+3. **Web Interface**:
+   - [ ] Dashboard loads without errors
+   - [ ] Services displayed in table format
+   - [ ] Status icons show correct colors
+   - [ ] Refresh button updates data via AJAX
+   - [ ] Error messages display when appropriate
+
+4. **Error Scenarios**:
+   - [ ] App starts when Consul is unavailable
+   - [ ] Shows cached data when Consul fails
+   - [ ] Displays appropriate error messages
+   - [ ] Recovers when Consul comes back online
+
+## Docker Configuration (Dockerfile)
+```dockerfile
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Install dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application
+COPY . .
+
+# Create non-root user
+RUN useradd -m appuser && chown -R appuser:appuser /app
+USER appuser
+
+# Expose port
+EXPOSE 5000
+
+# Environment variables
+ENV FLASK_APP=app.py
+ENV FLASK_ENV=production
+
+# Health check
+# requests.get() does not raise on 4xx/5xx responses, so raise_for_status()
+# is needed for an unhealthy /health reply to produce a non-zero exit code.
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+    CMD python -c "import requests; requests.get('http://localhost:5000/health', timeout=5).raise_for_status()" || exit 1
+
+CMD ["python", "-m", "flask", "run", "--host=0.0.0.0"]
+```
+
+## Implementation Order
+Follow this exact sequence:
+
+1. Create project structure and requirements.txt
+2. Implement database.py with all functions and test database operations
+3. Implement consul_client.py and test Consul connectivity
+4. Create basic Flask app.py with health endpoint
+5. Add /api/services endpoint with full error handling
+6. Create HTML template with Alpine.js integration
+7. Add CSS styling for professional appearance
+8. Test complete workflow: Consul → Database → API → Frontend
+9. Create Dockerfile and test containerized deployment
+10.
Verify all error scenarios work as expected + +## Success Criteria +Phase 1 is complete when: +- Application starts successfully in Docker container +- Dashboard displays list of services from Consul +- Manual refresh button updates service data +- Application gracefully handles Consul outages +- All services show correct health status with colored indicators +- Generated service URLs follow the specified pattern +- Error messages display appropriately in the UI \ No newline at end of file