This commit is contained in:
2025-08-11 19:46:25 -07:00
parent f7adbcc0de
commit 00409ff368
5 changed files with 445 additions and 106 deletions

View File

@@ -50,6 +50,19 @@ def index():
# Generate URLs for each instance in each service
for service in services:
# Create a set of unique ports for this service
unique_ports = set()
for instance in service['instances']:
if instance['port']:
unique_ports.add(instance['port'])
# Create port-based URLs
service['port_urls'] = [
f"http://{service['name']}.service.dc1.consul:{port}"
for port in unique_ports
]
# Keep instance URLs for other display purposes
for instance in service['instances']:
if instance['port']:
instance['url'] = f"http://{service['name']}.service.dc1.consul:{instance['port']}"
@@ -73,13 +86,23 @@ def get_services():
consul_available = consul_client.is_consul_available()
# Generate URLs for each instance in each service
# Generate URLs for each service and its instances
for service in services:
# Create a set of unique ports for port-based URLs
unique_ports = set()
for instance in service['instances']:
if instance['port']:
unique_ports.add(instance['port'])
instance['url'] = f"http://{service['name']}.service.dc1.consul:{instance['port']}"
else:
instance['url'] = None
# Add port-based URLs to service object
service['port_urls'] = [
f"http://{service['name']}.service.dc1.consul:{port}"
for port in unique_ports
]
response = {
'status': 'success',
'consul_available': consul_available,
@@ -276,5 +299,12 @@ def health_check():
'timestamp': datetime.utcnow().isoformat()
})
# Record every request that ends in "not found"
@app.after_request
def log_404(response):
    """Write a warning to the app log for each response with status 404.

    Registered as an after-request hook, so it sees every outgoing response
    and must hand the response object back unchanged.
    """
    not_found = response.status_code == 404
    if not_found:
        app.logger.warning(f"404 for {request.path} from {request.remote_addr}")
    return response
if __name__ == '__main__':
app.run(host='0.0.0.0', port=5000)

View File

@@ -1,5 +1,6 @@
import requests
import logging
from collections import defaultdict
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -10,16 +11,29 @@ CONSUL_HOST = "consul.service.dc1.consul"
CONSUL_PORT = 8500
CONSUL_BASE_URL = f"http://{CONSUL_HOST}:{CONSUL_PORT}"
def get_consul_services():
"""Fetch all registered services from Consul"""
url = f"{CONSUL_BASE_URL}/v1/agent/services"
def get_all_service_names():
    """Fetch all service names from the Consul catalog.

    Returns:
        list: Names of every catalog service except ``consul`` itself.
        An empty list is returned when the request fails or the body is not
        valid JSON, so callers can treat "no data" uniformly.
    """
    url = f"{CONSUL_BASE_URL}/v1/catalog/services"
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        services = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a malformed JSON body on requests versions where
        # the decode error is not a RequestException subclass.
        logger.error(f"Failed to fetch Consul services: {e}")
        return []
    # Filter out the consul service itself and return the remaining names
    return [name for name in services if name != 'consul']
def get_service_instances(service_name):
    """Fetch the catalog entries (instances) registered for one service.

    Args:
        service_name: Name of the service as listed in the Consul catalog.

    Returns:
        list: The decoded JSON response, or an empty list when the request
        fails or the body is not valid JSON.
    """
    from urllib.parse import quote

    # Escape the name so characters such as "/" or spaces cannot change the
    # request path.
    url = f"{CONSUL_BASE_URL}/v1/catalog/service/{quote(service_name, safe='')}"
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # Best-effort: log and report "no instances" instead of propagating,
        # so one failing service does not abort the whole refresh.
        logger.error(f"Failed to fetch instances for service {service_name}: {e}")
        return []
def get_service_health(service_name):
"""Fetch health checks for a specific service"""
@@ -27,20 +41,10 @@ def get_service_health(service_name):
try:
response = requests.get(url, timeout=5)
response.raise_for_status()
data = response.json()
# Process health checks
health_checks = []
for entry in data:
for check in entry.get('Checks', []):
health_checks.append({
'check_name': check.get('Name', ''),
'status': check.get('Status', '')
})
return health_checks
return response.json()
except requests.exceptions.RequestException as e:
logger.error(f"Failed to fetch health for service {service_name}: {e}")
raise
return []
def is_consul_available():
"""Check if Consul is reachable"""
@@ -50,67 +54,90 @@ def is_consul_available():
except requests.exceptions.RequestException:
return False
def calculate_composite_health(services):
    """Calculate the overall health status for a group of services.

    Args:
        services: Iterable of service dicts, each optionally carrying a
            ``'health_checks'`` list of ``{'status': ...}`` dicts.

    Returns:
        str: The most severe status seen (``'critical'`` > ``'warning'`` >
        ``'passing'``). ``'passing'`` is returned when there are no checks.
    """
    status_priority = {'critical': 3, 'warning': 2, 'passing': 1}
    worst_status = 'passing'
    for service in services:
        for check in service.get('health_checks', []):
            status = check.get('status', '')
            # Statuses outside the table rank below 'passing', so they can
            # never raise KeyError or displace a known status.
            if status_priority.get(status, 0) > status_priority[worst_status]:
                worst_status = status
    return worst_status
def group_services_by_instance(services):
    """Bucket services by the address they run on.

    Args:
        services: Mapping whose values are service dicts carrying an
            ``'address'`` key.

    Returns:
        dict: One entry per distinct address, each holding the address, the
        list of services found there and their composite health status.
    """
    grouped = {}
    for svc in services.values():
        addr = svc['address']
        bucket = grouped.setdefault(addr, {
            'address': addr,
            'services': [],
            'health_status': 'passing'
        })
        bucket['services'].append(svc)

    # Second pass: the composite status needs the complete service list
    for bucket in grouped.values():
        bucket['health_status'] = calculate_composite_health(bucket['services'])
    return grouped
def _worst_status(services):
    """Return the most severe check status found across *services*."""
    status_priority = {'critical': 3, 'warning': 2, 'passing': 1}
    worst_status = 'passing'
    for service in services:
        for check in service['health_checks']:
            # Unknown statuses rank 0 and therefore never win.
            if status_priority.get(check['status'], 0) > status_priority.get(worst_status, 0):
                worst_status = check['status']
    return worst_status


def fetch_all_service_data():
    """Fetch service data and health status for all services, grouped by instance.

    Walks the Consul catalog service by service, attaches each instance's
    health checks (matched on node name + service ID) and groups the resulting
    service objects by address.

    Returns:
        dict: ``{'services': {service_id: service_obj},
        'instances': {address: {'address', 'health_status', 'services'}}}``.
        An empty dict is returned when the catalog lists no services or when
        any unexpected error occurs.
    """
    try:
        # Get all service names
        service_names = get_all_service_names()
        if not service_names:
            logger.warning("No services found in Consul catalog")
            return {}

        logger.info(f"Received {len(service_names)} services from Consul")

        # Initialize data structures
        service_data = {}
        instances = defaultdict(lambda: {
            'address': '',
            'health_status': 'passing',
            'services': []
        })

        # Process each service
        for service_name in service_names:
            # Get service instances from catalog
            catalog_instances = get_service_instances(service_name)
            if not catalog_instances:
                continue

            # Get health information
            health_data = get_service_health(service_name)

            # Create a mapping of Node+ServiceID to health checks
            health_map = {
                (entry['Node']['Node'], entry['Service']['ID']): entry['Checks']
                for entry in health_data
            }

            # Process each instance
            for instance in catalog_instances:
                node = instance['Node']
                service_id = instance['ServiceID']
                # Fall back to the node address when the service did not
                # register an address of its own.
                address = instance['ServiceAddress'] or instance['Address']
                port = instance['ServicePort']

                # Get health checks for this instance
                checks = health_map.get((node, service_id), [])
                health_checks = [
                    {'check_name': c.get('Name', ''), 'status': c.get('Status', '')}
                    for c in checks
                ]

                # Create service object
                service_obj = {
                    'id': service_id,
                    'name': service_name,
                    'address': address,
                    'port': port,
                    'tags': instance.get('ServiceTags', []),
                    'meta': instance.get('ServiceMeta', {}),
                    'health_checks': health_checks
                }

                # Add to service data
                # NOTE(review): keyed by service ID only; if two nodes register
                # the same ID the later one overwrites the earlier - confirm
                # IDs are unique across the catalog.
                service_data[service_id] = service_obj

                # Add to instance grouping (defaultdict creates the bucket)
                instances[address]['address'] = address
                instances[address]['services'].append(service_obj)

        # Calculate composite health for each instance
        for instance in instances.values():
            instance['health_status'] = _worst_status(instance['services'])

        return {
            'services': service_data,
            'instances': dict(instances)
        }
    except Exception as e:
        # Top-level boundary: keep the traceback in the log, report "no data".
        logger.exception(f"Error fetching service data: {e}")
        return {}

View File

@@ -176,8 +176,12 @@ def get_all_services_grouped(conn):
services.append(service)
return services
def get_service_history(conn, service_name, instance_address, hours=24):
def get_service_history(conn, service_name, instance_address='', hours=24):
"""Get service history by service name with optional instance filtering"""
cursor = conn.cursor()
if instance_address:
# Get history for specific service instance
cursor.execute('''
SELECT hc.status, hc.timestamp
FROM health_checks hc
@@ -187,6 +191,17 @@ def get_service_history(conn, service_name, instance_address, hours=24):
AND hc.timestamp >= datetime('now', ?)
ORDER BY hc.timestamp ASC
''', (service_name, instance_address, f'-{hours} hours'))
else:
# Get history for all instances of the service
cursor.execute('''
SELECT hc.status, hc.timestamp
FROM health_checks hc
JOIN services s ON hc.service_id = s.id
WHERE s.name = ?
AND hc.timestamp >= datetime('now', ?)
ORDER BY hc.timestamp ASC
''', (service_name, f'-{hours} hours'))
return cursor.fetchall()
def get_service_history_detailed(conn, service_id, hours=24):

View File

@@ -11,14 +11,14 @@
<div class="controls">
<!-- Auto-refresh controls -->
<div class="control-group">
<label class="toggle">
<label class="toggle" title="Enable/disable automatic background refreshing">
<input type="checkbox" x-model="config.autoRefresh.enabled"
@change="updateConfig()">
<span class="toggle-slider"></span>
Auto-refresh
</label>
<select x-model="config.autoRefresh.interval" @change="updateConfig()"
:disabled="!config.autoRefresh.enabled">
:disabled="!config.autoRefresh.enabled" title="Set refresh frequency (30-600 seconds)">
<template x-for="option in config.autoRefresh.options" :key="option">
<option :value="option" x-text="formatInterval(option)"></option>
</template>
@@ -28,7 +28,8 @@
<!-- History granularity -->
<div class="control-group">
<label>History:</label>
<select x-model="config.display.historyGranularity" @change="updateConfig()">
<select x-model="config.display.historyGranularity" @change="updateConfig()"
title="Set time resolution for historical health data">
<template x-for="option in config.display.granularityOptions" :key="option">
<option :value="option" x-text="option + 'm'"></option>
</template>
@@ -36,7 +37,7 @@
</div>
<!-- Manual refresh -->
<button @click="refreshServices" :disabled="loading">
<button @click="refreshServices" :disabled="loading" title="Manually refresh service data now">
<span x-show="!loading">🔄 Refresh</span>
<span x-show="loading">Loading...</span>
</button>
@@ -76,22 +77,16 @@
</div>
</td>
<td>
<div class="instance-details">
<template x-for="(instance, index) in service.instances"
:key="`${service.name}-${instance.address}-${index}`">
<div class="instance">
<span class="status-icon"
:class="getStatusClass(instance.current_status)"
x-text="getStatusEmoji(instance.current_status) + ' '">
</span>
<span x-text="instance.address || 'Unknown address'"></span>
<template x-if="instance.port">
<span>:<span x-text="instance.port"></span></span>
</template>
<a :href="instance.url" target="_blank" x-show="instance.url">🔗</a>
<!-- Port-based links -->
<div class="port-links">
<template x-for="url in service.port_urls" :key="url">
<div>
<a :href="url" target="_blank" title="Open service endpoint">
<span x-text="url"></span>
</a>
</div>
</template>
</div>
</div>
</td>
</tr>
</template>
@@ -236,6 +231,12 @@ if (data.status === 'success') {
const historyData = await response.json();
// Handle error response from API
if (historyData.error) {
console.warn(`No history data for ${serviceName}: ${historyData.error}`);
return;
}
// Process data for Chart.js
const timestamps = historyData.data.map(item => item.timestamp);
const values = historyData.data.map(item => {
@@ -318,8 +319,21 @@ getStatusEmoji(status) {
gap: 8px;
}
.history-chart-container {
height: 60px;
width: 200px;
height: 120px; /* 100% larger */
width: 400px; /* 100% larger */
}
.port-links {
display: flex;
flex-direction: column;
gap: 4px;
}
.port-links a {
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
max-width: 100%;
}
</style>
</body>

253
historyfix.md Normal file
View File

@@ -0,0 +1,253 @@
# Consul Monitor - History Endpoint Fix
## Problem Description
The application is experiencing 404 errors when requesting service history data:
```
"GET /api/service_history/nomad-client HTTP/1.1" 404 -
"GET /api/service_history/traefik-ui HTTP/1.1" 404 -
```
## Root Cause Analysis
1. **Frontend-Backend Mismatch**: The frontend JavaScript is calling history endpoints that don't match the backend route definitions
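2. **Service Name Encoding**: Service names are inserted into the request path without URL-encoding, so names containing reserved characters (e.g. `/` or spaces) would produce a broken URL. Hyphens, as in `nomad-client`, are URL-safe and are not by themselves the cause of the 404s above.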
3. **Database Query Logic**: The history query function doesn't properly handle the service grouping structure
4. **Error Handling**: Missing graceful handling of services without history data
## Solution Overview
Fix the endpoint routing, database queries, and frontend calls to properly handle service history requests by service name.
## Files to Modify
### 1. app.py - Backend History Endpoint
**Location**: Line ~95 (after the config routes)
**Add/Replace the history endpoint:**
```python
@app.route('/api/services/<service_name>/history')
def get_service_history(service_name):
    """Get historical health data for charts.

    Query parameters:
        granularity: Bucket size passed to ``aggregate_health_data``; falls
            back to the session's ``history_granularity`` and then to 15.
        instance: Optional instance address to restrict the history to.

    Returns:
        A JSON response with ``service_name``, ``instance_address``,
        ``granularity`` and ``data``. A 400 response is returned for a
        granularity that is not a positive integer, and a 500 response with an
        ``error`` field when loading or aggregating the history fails.
    """
    # Get instance address from query params (optional - for specific instance)
    instance_address = request.args.get('instance', '')

    # Get granularity from query params or session. Parse defensively: the
    # value is client-supplied and int() would otherwise raise outside any
    # handler and surface as an unhandled 500.
    raw_granularity = request.args.get('granularity',
                                       session.get('history_granularity', 15))
    try:
        granularity = int(raw_granularity)
    except (TypeError, ValueError):
        granularity = 0
    if granularity <= 0:
        return jsonify({
            'error': 'granularity must be a positive integer',
            'service_name': service_name,
            'instance_address': instance_address,
            'data': []
        }), 400

    # Get thread-local database connection
    db_conn = get_db()

    try:
        # Get raw history data (24 hours)
        history = database.get_service_history(db_conn, service_name, instance_address, 24)

        # Aggregate data by granularity for Chart.js
        chart_data = aggregate_health_data(history, granularity)

        return jsonify({
            'service_name': service_name,
            'instance_address': instance_address,
            'granularity': granularity,
            'data': chart_data
        })
    except Exception as e:
        # Keep the traceback server-side; the client gets the same payload
        # shape as a success so the frontend can handle it uniformly.
        app.logger.exception(f"Failed to load history for {service_name}")
        return jsonify({
            'error': str(e),
            'service_name': service_name,
            'instance_address': instance_address,
            'data': []
        }), 500
```
### 2. database.py - Fix History Query Function
**Location**: Replace the existing `get_service_history` function
**Updated function:**
```python
def get_service_history(conn, service_name, instance_address='', hours=24):
    """Return the recent health-check history recorded for a service.

    Args:
        conn: Open database connection exposing ``cursor()``.
        service_name: Name of the service to look up.
        instance_address: When non-empty, restrict the result to the instance
            registered at this address; otherwise include every instance.
        hours: Size of the look-back window, counted back from now.

    Returns:
        list: ``(status, timestamp)`` rows ordered oldest first.
    """
    window = f'-{hours} hours'

    if instance_address:
        # Get history for specific service instance
        sql = '''
            SELECT hc.status, hc.timestamp
            FROM health_checks hc
            JOIN services s ON hc.service_id = s.id
            WHERE s.name = ?
            AND s.address = ?
            AND hc.timestamp >= datetime('now', ?)
            ORDER BY hc.timestamp ASC
        '''
        params = (service_name, instance_address, window)
    else:
        # Get history for all instances of the service
        sql = '''
            SELECT hc.status, hc.timestamp
            FROM health_checks hc
            JOIN services s ON hc.service_id = s.id
            WHERE s.name = ?
            AND hc.timestamp >= datetime('now', ?)
            ORDER BY hc.timestamp ASC
        '''
        params = (service_name, window)

    cursor = conn.cursor()
    cursor.execute(sql, params)
    return cursor.fetchall()
```
### 3. templates/index.html - Fix Frontend History Calls
**Location**: In the JavaScript `loadHistoryChart` function
**Replace the fetch call:**
```javascript
async loadHistoryChart(serviceName) {
// Destroy existing chart if present
if (this.charts[serviceName]) {
this.charts[serviceName].destroy();
delete this.charts[serviceName];
}
try {
// Get granularity from config
const granularity = this.config.display.historyGranularity;
// FIXED: Use correct endpoint with proper URL encoding
const response = await fetch(`/api/services/${encodeURIComponent(serviceName)}/history?granularity=${granularity}`);
// Check for HTTP errors
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
const historyData = await response.json();
// Handle error response from API
if (historyData.error) {
console.warn(`No history data for ${serviceName}: ${historyData.error}`);
return;
}
// Process data for Chart.js
const timestamps = historyData.data.map(item => item.timestamp);
const values = historyData.data.map(item => {
// Calculate composite health score:
// passing=1.0, warning=0.5, critical=0.0
return (item.passing + item.warning * 0.5) / 100;
});
const ctx = document.getElementById(`chart-${serviceName}`);
if (!ctx) return;
// Create and store new chart
this.charts[serviceName] = new Chart(ctx, {
type: 'line',
data: {
labels: timestamps,
datasets: [{
label: 'Health Score',
data: values,
borderColor: 'rgb(75, 192, 192)',
tension: 0.1,
fill: false
}]
},
options: {
responsive: true,
maintainAspectRatio: false,
plugins: {
legend: { display: false }
},
scales: {
y: {
min: 0,
max: 1
}
}
}
});
} catch (error) {
console.error(`Error loading history for ${serviceName}:`, error);
}
},
```
## Implementation Steps
1. **Update app.py**:
- Ensure the history endpoint matches the pattern `/api/services/<service_name>/history`
- Add proper error handling and response structure
- Verify the `aggregate_health_data` function exists
2. **Update database.py**:
- Replace the `get_service_history` function with the fixed version
- Ensure it queries by service name, not service ID
- Add support for optional instance filtering
3. **Update templates/index.html**:
- Fix the fetch URL to use proper encoding with `encodeURIComponent`
- Add error handling for missing history data
- Ensure chart creation handles empty data gracefully
4. **Test the fix**:
- Restart the application
- Check browser console for JavaScript errors
- Verify history endpoints return 200 responses
- Confirm charts display when history data is available
## Verification Commands
**Test the endpoint directly:**
```bash
curl http://localhost:5000/api/services/nomad-client/history
curl http://localhost:5000/api/services/traefik-ui/history
```
**Check Flask logs:**
```bash
# Should see 200 responses instead of 404
docker logs consul-monitor
```
**Browser console:**
```javascript
// Should not see 404 errors for history endpoints
// Charts should appear after background poller collects data
```
## Expected Behavior After Fix
1. **History endpoints respond with 200**: `/api/services/<service_name>/history` returns JSON data
2. **Charts display**: Mini line charts appear in the History column after data collection
3. **No 404 errors**: Browser console and Flask logs show no missing endpoint errors
4. **Graceful handling**: Services without history data don't break the interface
## Notes
- History data will only be available after the background poller has run for some time
- Charts may be empty initially until health check data accumulates
- Service names with special characters are now properly URL-encoded
- The fix maintains backward compatibility with existing functionality
## Troubleshooting
**If endpoints still return 404:**
- Verify the route decorator exactly matches: `@app.route('/api/services/<service_name>/history')`
- Check that Flask is importing the updated app.py
- Restart the application completely
**If charts don't appear:**
- Check that Chart.js is loaded before the Alpine.js script
- Verify canvas elements have unique IDs
- Check browser console for JavaScript errors
**If database queries fail:**
- Ensure services table has a 'name' column
- Verify the health_checks table is populated by the background poller
- Check that services are grouped correctly in the main API response