conductor(checkpoint): Checkpoint end of Phase 2 - Aggregator Refactor

Date: 2026-02-09 06:13:09 -08:00
parent 079498caba
commit 655a9b2571
4 changed files with 128 additions and 169 deletions
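
For orientation: the refactored aggregator in the diff below reads a handful of fields from each of its three clients. The sketch that follows shows the record shapes it appears to expect, inferred only from the field accesses visible in this commit; the example values (and the idea that each client returns plain dicts) are assumptions, not the clients' documented API.

# Illustrative only: shapes inferred from how the new get_cluster_status() reads each record.
example_allocation = {            # one entry from nomad_client.get_job_allocations(job_id)
    "node": "nomad-client-1",     # hypothetical node name
    "ip": "10.0.0.11",            # hypothetical allocation address
    "id": "c0ffee00",             # hypothetical allocation ID
}
example_consul_service = {        # one entry from consul_client.get_cluster_services(consul_url)
    "address": "10.0.0.11",
    "status": "passing",
    "check_output": "HTTP GET 200 OK",   # hypothetical check output
}
example_litefs_status = {         # litefs_client.get_node_status(address, alloc_id=...)
    "is_primary": True,
    "uptime": "3h12m",
    "advertise_url": "http://10.0.0.11:20202",
    "replication_lag": "0ms",
    "dbs": {"navidrome.db": {}},
    "error": None,
}
# The aggregator keys Consul records by IP, as in step 2 of the diff:
consul_map = {s["address"]: s for s in [example_consul_service]}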


@@ -4,59 +4,77 @@ import nomad_client
 def get_cluster_status(consul_url, job_id="navidrome-litefs"):
     """
-    Aggregates cluster data from Consul, LiteFS, and Nomad.
+    Aggregates cluster data from Nomad (Discovery), LiteFS (Role), and Consul (Routing Health).
     """
-    consul_nodes = consul_client.get_cluster_services(consul_url)
-    aggregated_nodes = []
+    # 1. Discover all nodes via Nomad Allocations
+    allocations = nomad_client.get_job_allocations(job_id)
+    nomad_available = bool(nomad_client.get_node_map())
+    # 2. Get all Consul registrations for 'navidrome'
+    consul_services = consul_client.get_cluster_services(consul_url)
+    # Create a map for easy lookup by IP
+    consul_map = {s["address"]: s for s in consul_services}
+    aggregated_nodes = []
     is_healthy = True
     primary_count = 0
-    # Check Nomad connectivity
-    node_map = nomad_client.get_node_map()
-    nomad_available = bool(node_map)
-    for node in consul_nodes:
-        # Fetch allocation ID first to enable nomad exec fallback
-        alloc_id = nomad_client.get_allocation_id(node["node"], job_id)
+    for alloc in allocations:
+        node_name = alloc["node"]
+        address = alloc["ip"]
+        alloc_id = alloc["id"]
-        litefs_status = litefs_client.get_node_status(node["address"], alloc_id=alloc_id)
+        # 3. Get LiteFS Status
+        litefs_status = litefs_client.get_node_status(address, alloc_id=alloc_id)
+        # 4. Match with Consul info
+        consul_info = consul_map.get(address)
         # Merge data
         node_data = {
-            **node,
+            "node": node_name,
+            "address": address,
+            "alloc_id": alloc_id,
             "litefs_primary": litefs_status.get("is_primary", False),
             "uptime": litefs_status.get("uptime", "N/A"),
             "advertise_url": litefs_status.get("advertise_url", ""),
             "replication_lag": litefs_status.get("replication_lag", "N/A"),
-            "litefs_error": litefs_status.get("error", None),
-            "nomad_logs": None,
-            "alloc_id": alloc_id
+            "active_dbs": list(litefs_status.get("dbs", {}).keys()),
+            "litefs_error": litefs_status.get("error"),
+            "nomad_logs": None
         }
-        if node["status"] != "passing":
-            is_healthy = False
-            # Fetch Nomad logs for critical nodes
-            if alloc_id:
-                node_data["nomad_logs"] = nomad_client.get_allocation_logs(alloc_id)
         if node_data["litefs_primary"]:
             primary_count += 1
-        # Check for active databases
-        node_dbs = litefs_status.get("dbs", {})
-        if node_dbs:
-            node_data["active_dbs"] = list(node_dbs.keys())
+            node_data["role"] = "primary"
         else:
-            node_data["active_dbs"] = []
+            node_data["role"] = "replica"
+        # 5. Determine Consul status
+        if consul_info:
+            node_data["status"] = consul_info["status"]
+            node_data["check_output"] = consul_info["check_output"]
+            if node_data["status"] != "passing":
+                is_healthy = False
+                node_data["nomad_logs"] = nomad_client.get_allocation_logs(alloc_id)
+        else:
+            # Not in Consul
+            if node_data["litefs_primary"]:
+                # If it's primary in LiteFS but not in Consul, that's an error (unless just started)
+                node_data["status"] = "unregistered"
+                is_healthy = False
+                node_data["nomad_logs"] = nomad_client.get_allocation_logs(alloc_id)
+            else:
+                # Replicas are expected to be unregistered in the new model
+                node_data["status"] = "standby"
+                node_data["check_output"] = "Clean catalog (expected for replica)"
         aggregated_nodes.append(node_data)
     # Final health check
     health = "Healthy"
     if not is_healthy:
         health = "Unhealthy"
-    elif primary_count == 0:
+    if primary_count == 0:
         health = "No Primary Detected"
     elif primary_count > 1:
         health = "Split Brain Detected (Multiple Primaries)"