trying to fix history issues

This commit is contained in:
2025-08-11 12:30:41 -07:00
parent 9028028967
commit f7adbcc0de
6 changed files with 447 additions and 186 deletions

View File

@@ -43,18 +43,23 @@ def index():
# Get thread-local database connection
db_conn = get_db()
# Get initial service data
services = database.get_all_services_with_health(db_conn)
try:
# Get services grouped by name
services = database.get_all_services_grouped(db_conn)
consul_available = consul_client.is_consul_available()
# Generate URLs for services
# Generate URLs for each instance in each service
for service in services:
if service['port']:
service['url'] = f"http://{service['name']}.service.dc1.consul:{service['port']}"
for instance in service['instances']:
if instance['port']:
instance['url'] = f"http://{service['name']}.service.dc1.consul:{instance['port']}"
else:
service['url'] = None
instance['url'] = None
return render_template('index.html', services=services, consul_available=consul_available)
except Exception as e:
# Fallback in case of errors
return render_template('index.html', services=[], consul_available=False, error=str(e))
@app.route('/api/services')
def get_services():
@@ -63,16 +68,17 @@ def get_services():
db_conn = get_db()
try:
# Always use database data since background polling updates it
services = database.get_all_services_with_health(db_conn)
# Get services grouped by name
services = database.get_all_services_grouped(db_conn)
consul_available = consul_client.is_consul_available()
# Generate URLs for services
# Generate URLs for each instance in each service
for service in services:
if service['port']:
service['url'] = f"http://{service['name']}.service.dc1.consul:{service['port']}"
for instance in service['instances']:
if instance['port']:
instance['url'] = f"http://{service['name']}.service.dc1.consul:{instance['port']}"
else:
service['url'] = None
instance['url'] = None
response = {
'status': 'success',
@@ -134,8 +140,8 @@ def update_config():
session.permanent = True
return jsonify({'status': 'success'})
@app.route('/api/services/<service_id>/history')
def get_service_history(service_id):
@app.route('/api/services/<service_name>/history')
def get_service_history(service_name):
"""Get historical health data for charts"""
# Get thread-local database connection
db_conn = get_db()
@@ -144,15 +150,19 @@ def get_service_history(service_id):
granularity = int(request.args.get('granularity',
session.get('history_granularity', 15)))
# Get instance address from query params
instance_address = request.args.get('instance', '')
try:
# Get raw history data (24 hours)
history = database.get_service_history(db_conn, service_id, 24)
history = database.get_service_history(db_conn, service_name, instance_address, 24)
# Aggregate data by granularity for Chart.js
chart_data = aggregate_health_data(history, granularity)
return jsonify({
'service_id': service_id,
'service_name': service_name,
'instance_address': instance_address,
'granularity': granularity,
'data': chart_data
})
@@ -160,7 +170,8 @@ def get_service_history(service_id):
except Exception as e:
return jsonify({
'error': str(e),
'service_id': service_id,
'service_name': service_name,
'instance_address': instance_address,
'data': []
}), 500

View File

@@ -61,8 +61,14 @@ class ConsulPoller:
logger.warning("Consul unavailable during background poll")
return
# Get fresh data from Consul
service_data = consul_client.fetch_all_service_data()
# Get fresh data from Consul (now returns services and instances)
consul_data = consul_client.fetch_all_service_data()
if not consul_data:
logger.warning("No data received from Consul")
return
service_data = consul_data['services']
instances = consul_data['instances']
if not service_data:
logger.warning("No service data received from Consul")
@@ -80,22 +86,24 @@ class ConsulPoller:
services_updated = 0
health_checks_inserted = 0
for service_id, data in service_data.items():
# Upsert service
database.upsert_service(conn, {
'id': service_id,
'name': data['name'],
'address': data['address'],
'port': data['port'],
'tags': data['tags'],
'meta': data['meta']
})
# Process instances
for address, instance in instances.items():
# Upsert instance with composite health
database.upsert_instance(conn, address, instance['health_status'])
# Record instance health
database.insert_instance_health(conn, address, instance['health_status'])
# Process services in this instance
for service in instance['services']:
# Upsert service with instance address
database.upsert_service(conn, service, address)
services_updated += 1
# Insert health checks - raw data points every minute
for check in data['health_checks']:
# Insert health checks
for check in service['health_checks']:
database.insert_health_check(
conn, service_id,
conn, service['id'],
check['check_name'],
check['status']
)

View File

@@ -50,8 +50,38 @@ def is_consul_available():
except requests.exceptions.RequestException:
return False
def calculate_composite_health(services):
    """Calculate the overall (worst) health status for a group of services.

    Args:
        services: Iterable of service dicts. Each may carry a
            ``'health_checks'`` list whose items are dicts with a
            ``'status'`` key.

    Returns:
        ``'critical'``, ``'warning'`` or ``'passing'`` -- the most severe
        recognised status found across every check of every service.
        ``'passing'`` is returned when there are no checks at all.
    """
    status_priority = {'critical': 3, 'warning': 2, 'passing': 1}
    worst_status = 'passing'
    for service in services:
        # A service without a 'health_checks' entry contributes nothing.
        for check in service.get('health_checks') or ():
            status = check.get('status')
            # Statuses outside the known set (or a missing status) rank
            # below 'passing', so they are skipped instead of raising
            # KeyError and aborting the whole aggregation.
            if status_priority.get(status, 0) > status_priority[worst_status]:
                worst_status = status
    return worst_status
def group_services_by_instance(services):
    """Bucket services by the address they are registered on.

    Args:
        services: Mapping whose values are service dicts; each must have an
            ``'address'`` key.

    Returns:
        Dict keyed by address. Each value holds ``'address'``, the list of
        ``'services'`` sharing that address, and a ``'health_status'``
        computed by ``calculate_composite_health`` over those services.
    """
    grouped = {}
    for svc in services.values():
        addr = svc['address']
        # First sighting of an address creates its bucket; later services
        # with the same address are appended to the existing one.
        bucket = grouped.setdefault(addr, {
            'address': addr,
            'services': [],
            'health_status': 'passing'
        })
        bucket['services'].append(svc)

    # Second pass: every bucket is complete, so roll its checks up.
    for bucket in grouped.values():
        bucket['health_status'] = calculate_composite_health(bucket['services'])
    return grouped
def fetch_all_service_data():
"""Fetch service data and health status for all services"""
"""Fetch service data and health status for all services, grouped by instance"""
try:
services = get_consul_services()
service_data = {}
@@ -76,7 +106,11 @@ def fetch_all_service_data():
'health_checks': health_checks
}
return service_data
# Return both individual services and grouped instances
return {
'services': service_data,
'instances': group_services_by_instance(service_data)
}
except requests.exceptions.RequestException:
logger.error("Failed to fetch service data from Consul")
return {}

View File

@@ -4,12 +4,22 @@ from datetime import datetime
def create_tables(conn):
cursor = conn.cursor()
# Create instances table
cursor.execute('''
CREATE TABLE IF NOT EXISTS instances (
address TEXT PRIMARY KEY,
health_status TEXT NOT NULL DEFAULT 'unknown',
first_seen DATETIME DEFAULT CURRENT_TIMESTAMP,
last_seen DATETIME DEFAULT CURRENT_TIMESTAMP
)
''')
# Create services table
cursor.execute('''
CREATE TABLE IF NOT EXISTS services (
id TEXT PRIMARY KEY,
name TEXT NOT NULL,
address TEXT,
address TEXT REFERENCES instances(address) ON DELETE CASCADE,
port INTEGER,
tags TEXT,
meta TEXT,
@@ -30,6 +40,16 @@ def create_tables(conn):
)
''')
# Create instance health table
cursor.execute('''
CREATE TABLE IF NOT EXISTS instance_health (
id INTEGER PRIMARY KEY AUTOINCREMENT,
address TEXT NOT NULL REFERENCES instances(address) ON DELETE CASCADE,
health_status TEXT NOT NULL,
timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
)
''')
# Create indexes
cursor.execute('''
CREATE INDEX IF NOT EXISTS idx_health_checks_service_timestamp
@@ -49,7 +69,20 @@ def init_database():
create_tables(conn)
return conn
def upsert_service(conn, service_data):
def upsert_instance(conn, address, health_status):
    """Create the instance row for ``address`` or refresh an existing one.

    On a primary-key conflict the stored ``health_status`` and ``last_seen``
    are overwritten with the new values; other columns are left alone.
    The change is committed before returning.

    Args:
        conn: Open database connection.
        address: Instance address (the table's primary key).
        health_status: Composite health status to store.
    """
    params = (address, health_status)
    cur = conn.cursor()
    cur.execute('''
        INSERT INTO instances (address, health_status, last_seen)
        VALUES (?, ?, CURRENT_TIMESTAMP)
        ON CONFLICT(address) DO UPDATE SET
            health_status = excluded.health_status,
            last_seen = excluded.last_seen
    ''', params)
    conn.commit()
def upsert_service(conn, service_data, instance_address):
"""Insert or update a service record with instance reference"""
cursor = conn.cursor()
cursor.execute('''
INSERT INTO services (id, name, address, port, tags, meta)
@@ -64,13 +97,22 @@ def upsert_service(conn, service_data):
''', (
service_data['id'],
service_data['name'],
service_data.get('address'),
instance_address,
service_data.get('port'),
json.dumps(service_data.get('tags', [])),
json.dumps(service_data.get('meta', {}))
))
conn.commit()
def insert_instance_health(conn, address, health_status):
    """Append one health sample for an instance to ``instance_health``.

    Only ``address`` and ``health_status`` are supplied; any other column
    takes whatever default the table defines. The insert is committed
    before returning.

    Args:
        conn: Open database connection.
        address: Address of the instance the sample belongs to.
        health_status: Health status observed for that instance.
    """
    sample = (address, health_status)
    cur = conn.cursor()
    cur.execute('''
        INSERT INTO instance_health (address, health_status)
        VALUES (?, ?)
    ''', sample)
    conn.commit()
def insert_health_check(conn, service_id, check_name, status):
cursor = conn.cursor()
cursor.execute('''
@@ -79,40 +121,72 @@ def insert_health_check(conn, service_id, check_name, status):
''', (service_id, check_name, status))
conn.commit()
def get_all_services_with_health(conn):
def get_all_services_grouped(conn):
"""Get all services grouped by name with composite health status"""
cursor = conn.cursor()
cursor.execute('''
SELECT s.id, s.name, s.address, s.port, s.tags, s.meta,
h.status, MAX(h.timestamp) AS last_check
WITH latest_health AS (
SELECT
service_id,
status,
MAX(timestamp) as last_check
FROM health_checks
GROUP BY service_id
)
SELECT
s.name,
json_group_array(json_object(
'address', s.address,
'port', s.port,
'id', s.id,
'tags', s.tags,
'meta', s.meta,
'current_status', lh.status,
'last_check', lh.last_check
)) AS instances,
MIN(CASE
WHEN lh.status = 'critical' THEN 1
WHEN lh.status = 'warning' THEN 2
WHEN lh.status = 'passing' THEN 3
ELSE 4 END) as composite_status_order
FROM services s
LEFT JOIN health_checks h ON s.id = h.service_id
GROUP BY s.id
LEFT JOIN latest_health lh ON s.id = lh.service_id
GROUP BY s.name
ORDER BY s.name
''')
services = []
for row in cursor.fetchall():
service = {
'id': row[0],
'name': row[1],
'address': row[2],
'port': row[3],
'tags': json.loads(row[4]) if row[4] else [],
'meta': json.loads(row[5]) if row[5] else {},
'current_status': row[6] or 'unknown',
'last_check': row[7]
'name': row[0],
'instances': json.loads(row[1]) if row[1] else [],
'composite_status': 'passing' # Default
}
# Determine composite status based on worst status
if any(inst.get('current_status') == 'critical' for inst in service['instances']):
service['composite_status'] = 'critical'
elif any(inst.get('current_status') == 'warning' for inst in service['instances']):
service['composite_status'] = 'warning'
elif all(inst.get('current_status') == 'passing' for inst in service['instances']):
service['composite_status'] = 'passing'
else:
service['composite_status'] = 'unknown'
services.append(service)
return services
def get_service_history(conn, service_id, hours=24):
def get_service_history(conn, service_name, instance_address, hours=24):
cursor = conn.cursor()
cursor.execute('''
SELECT status, timestamp
FROM health_checks
WHERE service_id = ?
AND timestamp >= datetime('now', ?)
ORDER BY timestamp ASC
''', (service_id, f'-{hours} hours'))
SELECT hc.status, hc.timestamp
FROM health_checks hc
JOIN services s ON hc.service_id = s.id
WHERE s.name = ?
AND s.address = ?
AND hc.timestamp >= datetime('now', ?)
ORDER BY hc.timestamp ASC
''', (service_name, instance_address, f'-{hours} hours'))
return cursor.fetchall()
def get_service_history_detailed(conn, service_id, hours=24):
@@ -135,3 +209,42 @@ def is_database_available(conn):
return True
except sqlite3.Error:
return False
# Keep the old function for now but we'll remove it later
def get_all_instances_with_services(conn):
    """Return every instance with its services and their latest check status.

    Instances are LEFT JOINed to services and health checks, so an instance
    with no services still appears (with an empty ``'services'`` list), and a
    service with no checks gets ``'current_status'`` of ``'unknown'``.

    Args:
        conn: Open database connection.

    Returns:
        List of dicts with ``'address'``, ``'health_status'`` and
        ``'services'``; each service dict has ``'id'``, ``'name'``,
        ``'port'``, decoded ``'tags'`` / ``'meta'``, ``'current_status'``
        and ``'last_check'``.
    """
    cur = conn.cursor()
    cur.execute('''
        SELECT i.address, i.health_status,
               s.id, s.name, s.port, s.tags, s.meta,
               h.status, MAX(h.timestamp) AS last_check
        FROM instances i
        LEFT JOIN services s ON i.address = s.address
        LEFT JOIN health_checks h ON s.id = h.service_id
        GROUP BY i.address, s.id
    ''')

    by_address = {}
    for (address, instance_health, svc_id, svc_name, svc_port,
         raw_tags, raw_meta, check_status, last_check) in cur.fetchall():
        entry = by_address.setdefault(address, {
            'address': address,
            'health_status': instance_health,
            'services': []
        })

        # The LEFT JOIN yields an all-NULL service for service-less
        # instances; those rows only register the instance itself.
        if not svc_id:
            continue

        entry['services'].append({
            'id': svc_id,
            'name': svc_name,
            'port': svc_port,
            # tags/meta are stored as JSON text; empty or NULL falls back
            # to an empty container.
            'tags': json.loads(raw_tags) if raw_tags else [],
            'meta': json.loads(raw_meta) if raw_meta else {},
            'current_status': check_status or 'unknown',
            'last_check': last_check
        })
    return list(by_address.values())

View File

@@ -53,34 +53,44 @@
<thead>
<tr>
<th>Service Name</th>
<th>Status</th>
<th>URL</th>
<th>Tags</th>
<th>24h History</th>
<th>Health</th>
<th>Instances</th>
<th>History</th>
<th>Details</th>
</tr>
</thead>
<tbody>
<template x-for="service in services" :key="service.id">
<tr>
<template x-for="service in services" :key="service.name">
<tr class="service-row">
<td x-text="service.name"></td>
<td>
<span class="status-icon"
:class="getStatusClass(service.current_status)"
x-text="getStatusEmoji(service.current_status)">
:class="getStatusClass(service.composite_status)"
x-text="getStatusEmoji(service.composite_status)">
</span>
</td>
<td x-text="service.instances.length"></td>
<td>
<a :href="service.url" target="_blank" x-text="service.url"></a>
<div class="history-chart-container">
<canvas :id="'chart-'+service.name"></canvas>
</div>
</td>
<td>
<template x-for="tag in service.tags">
<span class="tag" x-text="tag"></span>
<div class="instance-details">
<template x-for="(instance, index) in service.instances"
:key="`${service.name}-${instance.address}-${index}`">
<div class="instance">
<span class="status-icon"
:class="getStatusClass(instance.current_status)"
x-text="getStatusEmoji(instance.current_status) + ' '">
</span>
<span x-text="instance.address || 'Unknown address'"></span>
<template x-if="instance.port">
<span>:<span x-text="instance.port"></span></span>
</template>
<a :href="instance.url" target="_blank" x-show="instance.url">🔗</a>
</div>
</template>
</td>
<td>
<div class="chart-container">
<canvas :id="'chart-' + service.id"
width="200" height="50"></canvas>
</div>
</td>
</tr>
@@ -94,12 +104,14 @@
</div>
</div>
<!-- Load Alpine.js and Chart.js -->
<!-- Load Alpine.js -->
<script src="https://unpkg.com/alpinejs@3.x.x/dist/cdn.min.js" defer></script>
<!-- Load Chart.js -->
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script>
document.addEventListener('alpine:init', () => {
Alpine.data('serviceMonitor', () => ({
charts: {}, // Store Chart.js instances by service name
services: [],
loading: false,
error: null,
@@ -116,7 +128,6 @@
}
},
autoRefreshTimer: null,
charts: {},
async init() {
await this.loadConfig();
@@ -141,26 +152,16 @@
headers: {'Content-Type': 'application/json'},
body: JSON.stringify(this.config)
});
// Restart auto-refresh with new interval
this.startAutoRefresh();
// Refresh charts if granularity changed
this.loadHistoryCharts();
} catch (err) {
console.error('Failed to update config:', err);
}
},
startAutoRefresh() {
// Clear existing timer
if (this.autoRefreshTimer) {
clearInterval(this.autoRefreshTimer);
this.autoRefreshTimer = null;
}
// Start new timer if enabled
if (this.config.autoRefresh.enabled) {
this.autoRefreshTimer = setInterval(
() => this.refreshServices(),
@@ -178,12 +179,28 @@
const data = await response.json();
if (data.status === 'success') {
this.services = data.services;
// Add default values for instance properties
this.services = data.services.map(service => ({
...service,
instances: service.instances.map(instance => ({
address: instance.address || 'Unknown address',
port: instance.port || null,
url: instance.url || '',
current_status: instance.current_status || 'unknown'
}))
}));
this.consulAvailable = data.consul_available;
// Load history charts after services update
this.$nextTick(() => this.loadHistoryCharts());
// Destroy existing charts
Object.values(this.charts).forEach(chart => chart.destroy());
this.charts = {};
// Initialize charts after rendering
this.$nextTick(() => {
this.services.forEach(service => {
this.loadHistoryChart(service.name);
});
});
} else {
this.error = data.error || 'Failed to fetch services';
this.services = data.services || [];
@@ -198,80 +215,67 @@
}
},
async loadHistoryCharts() {
for (const service of this.services) {
await this.createChart(service.id);
async loadHistoryChart(serviceName) {
// Destroy existing chart if present
if (this.charts[serviceName]) {
this.charts[serviceName].destroy();
delete this.charts[serviceName];
}
},
async createChart(serviceId) {
try {
const response = await fetch(`/api/services/${serviceId}/history?granularity=${this.config.display.historyGranularity}`);
const data = await response.json();
// Get granularity from config
const granularity = this.config.display.historyGranularity;
const canvas = document.getElementById(`chart-${serviceId}`);
if (!canvas) return;
// Use correct endpoint with query parameter
const response = await fetch(`/api/services/${serviceName}/history?granularity=${granularity}`);
// Destroy existing chart
if (this.charts[serviceId]) {
this.charts[serviceId].destroy();
// Check for HTTP errors
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
const ctx = canvas.getContext('2d');
const historyData = await response.json();
this.charts[serviceId] = new Chart(ctx, {
type: 'bar',
// Process data for Chart.js
const timestamps = historyData.data.map(item => item.timestamp);
const values = historyData.data.map(item => {
// Calculate composite health score:
// passing=1.0, warning=0.5, critical=0.0
return (item.passing + item.warning * 0.5) / 100;
});
const ctx = document.getElementById(`chart-${serviceName}`);
if (!ctx) return;
// Create and store new chart
this.charts[serviceName] = new Chart(ctx, {
type: 'line',
data: {
labels: data.data.map(d => new Date(d.timestamp).toLocaleTimeString([], {hour: '2-digit', minute:'2-digit'})),
datasets: [
{
label: 'Passing',
data: data.data.map(d => d.passing),
backgroundColor: '#28a745',
stack: 'health'
},
{
label: 'Warning',
data: data.data.map(d => d.warning),
backgroundColor: '#ffc107',
stack: 'health'
},
{
label: 'Critical',
data: data.data.map(d => d.critical),
backgroundColor: '#dc3545',
stack: 'health'
}
]
labels: timestamps,
datasets: [{
label: 'Health Score',
data: values,
borderColor: 'rgb(75, 192, 192)',
tension: 0.1,
fill: false
}]
},
options: {
responsive: false,
responsive: true,
maintainAspectRatio: false,
plugins: {
legend: { display: false },
tooltip: {
callbacks: {
title: function(context) {
return new Date(data.data[context[0].dataIndex].timestamp).toLocaleString();
},
label: function(context) {
return context.dataset.label + ': ' + context.parsed.y + '%';
}
}
}
legend: { display: false }
},
scales: {
x: { display: false },
y: {
display: false,
max: 100,
stacked: true
min: 0,
max: 1
}
}
}
});
} catch (err) {
console.error(`Failed to load chart for service ${serviceId}:`, err);
} catch (error) {
console.error(`Error loading history for ${serviceName}:`, error);
}
},
@@ -281,16 +285,18 @@
},
getStatusClass(status) {
const safeStatus = status || 'unknown';
return {
'status-passing': status === 'passing',
'status-warning': status === 'warning',
'status-critical': status === 'critical',
'status-unknown': !status || status === 'unknown'
'status-passing': safeStatus === 'passing',
'status-warning': safeStatus === 'warning',
'status-critical': safeStatus === 'critical',
'status-unknown': safeStatus === 'unknown'
};
},
getStatusEmoji(status) {
switch(status) {
const safeStatus = status || 'unknown';
switch(safeStatus) {
case 'passing': return '🟢';
case 'warning': return '🟡';
case 'critical': return '🔴';
@@ -300,5 +306,21 @@
}));
});
</script>
<style>
.instance-details {
display: flex;
flex-direction: column;
gap: 4px;
}
.instance {
display: flex;
align-items: center;
gap: 8px;
}
.history-chart-container {
height: 60px;
width: 200px;
}
</style>
</body>
</html>

73
plan_phase3.md Normal file
View File

@@ -0,0 +1,73 @@
# Phase 3 Implementation Plan - Service Grouping and Scalability
## Overview
Implemented service grouping with composite health reporting and UI scalability enhancements to support 30+ services.
## Key Features
1. **Service Grouping**: Services are grouped by name into single rows
2. **Composite Health**: Overall service health based on all instances
3. **Scalability**: UI optimizations to support 30+ services
## Implementation Details
### Backend Modifications
1. **Service Grouping Logic** (database.py)
- Added `get_all_services_grouped()` function
- Implemented composite health calculation per service
- Returns aggregated service data with instance lists
2. **Database Queries**
- Created optimized query to group services by name
- Added a composite status ordering column in SQL (the final composite status is derived in Python from the instance statuses)
- Maintained instance details within service groups
3. **API Endpoint Updates** (app.py)
- Modified `/api/services` to return service groups
- Added service-based instance grouping in responses
### Frontend Changes
1. **Table Redesign** (index.html)
- Converted to service-based table structure
- Added expandable rows for instance details
- Implemented service health indicators
2. **Health Reporting UI**
- Added composite status indicators per service
- Maintained instance-level health details
- Preserved history chart functionality
3. **Scalability Features**
- Added expand/collapse functionality
- Optimized UI for 30+ services
- Efficient data loading with grouping
### Health Calculation
1. **Status Algorithm**
- Critical if any instance critical
- Warning if any instance warning (no criticals)
- Passing if all instances passing
- Unknown otherwise (for example, when an instance has no recorded health checks)
## Implementation Sequence
1. Updated database.py for service grouping
2. Modified app.py endpoints to use service groups
3. Redesigned frontend to display service groups
4. Added expand/collapse functionality for instances
5. Maintained URL generation for instances
6. Added error handling for new data model
## Testing Considerations
- Verify service grouping by name
- Test composite health calculation logic
- Validate expand/collapse functionality
- Test with 30+ services to ensure scalability
- Verify history charts still function properly
- Test error handling for Consul unavailability
## Estimated Implementation Time
**Total: 4-5 hours**
## Next Steps
- Implement pagination for large service sets
- Add search/filter functionality
- Optimize database queries for large datasets
- Implement service-level history charts