conductor(checkpoint): Checkpoint end of Phase 2 - Aggregator Refactor
This commit is contained in:
@@ -4,59 +4,77 @@ import nomad_client
|
||||
|
||||
def get_cluster_status(consul_url, job_id="navidrome-litefs"):
|
||||
"""
|
||||
Aggregates cluster data from Consul, LiteFS, and Nomad.
|
||||
Aggregates cluster data from Nomad (Discovery), LiteFS (Role), and Consul (Routing Health).
|
||||
"""
|
||||
consul_nodes = consul_client.get_cluster_services(consul_url)
|
||||
aggregated_nodes = []
|
||||
# 1. Discover all nodes via Nomad Allocations
|
||||
allocations = nomad_client.get_job_allocations(job_id)
|
||||
nomad_available = bool(nomad_client.get_node_map())
|
||||
|
||||
# 2. Get all Consul registrations for 'navidrome'
|
||||
consul_services = consul_client.get_cluster_services(consul_url)
|
||||
# Create a map for easy lookup by IP
|
||||
consul_map = {s["address"]: s for s in consul_services}
|
||||
|
||||
aggregated_nodes = []
|
||||
is_healthy = True
|
||||
primary_count = 0
|
||||
|
||||
# Check Nomad connectivity
|
||||
node_map = nomad_client.get_node_map()
|
||||
nomad_available = bool(node_map)
|
||||
|
||||
for node in consul_nodes:
|
||||
# Fetch allocation ID first to enable nomad exec fallback
|
||||
alloc_id = nomad_client.get_allocation_id(node["node"], job_id)
|
||||
for alloc in allocations:
|
||||
node_name = alloc["node"]
|
||||
address = alloc["ip"]
|
||||
alloc_id = alloc["id"]
|
||||
|
||||
litefs_status = litefs_client.get_node_status(node["address"], alloc_id=alloc_id)
|
||||
# 3. Get LiteFS Status
|
||||
litefs_status = litefs_client.get_node_status(address, alloc_id=alloc_id)
|
||||
|
||||
# 4. Match with Consul info
|
||||
consul_info = consul_map.get(address)
|
||||
|
||||
# Merge data
|
||||
node_data = {
|
||||
**node,
|
||||
"node": node_name,
|
||||
"address": address,
|
||||
"alloc_id": alloc_id,
|
||||
"litefs_primary": litefs_status.get("is_primary", False),
|
||||
"uptime": litefs_status.get("uptime", "N/A"),
|
||||
"advertise_url": litefs_status.get("advertise_url", ""),
|
||||
"replication_lag": litefs_status.get("replication_lag", "N/A"),
|
||||
"litefs_error": litefs_status.get("error", None),
|
||||
"nomad_logs": None,
|
||||
"alloc_id": alloc_id
|
||||
"active_dbs": list(litefs_status.get("dbs", {}).keys()),
|
||||
"litefs_error": litefs_status.get("error"),
|
||||
"nomad_logs": None
|
||||
}
|
||||
|
||||
if node["status"] != "passing":
|
||||
is_healthy = False
|
||||
# Fetch Nomad logs for critical nodes
|
||||
if alloc_id:
|
||||
node_data["nomad_logs"] = nomad_client.get_allocation_logs(alloc_id)
|
||||
|
||||
if node_data["litefs_primary"]:
|
||||
primary_count += 1
|
||||
|
||||
# Check for active databases
|
||||
node_dbs = litefs_status.get("dbs", {})
|
||||
if node_dbs:
|
||||
node_data["active_dbs"] = list(node_dbs.keys())
|
||||
node_data["role"] = "primary"
|
||||
else:
|
||||
node_data["active_dbs"] = []
|
||||
node_data["role"] = "replica"
|
||||
|
||||
# 5. Determine Consul status
|
||||
if consul_info:
|
||||
node_data["status"] = consul_info["status"]
|
||||
node_data["check_output"] = consul_info["check_output"]
|
||||
if node_data["status"] != "passing":
|
||||
is_healthy = False
|
||||
node_data["nomad_logs"] = nomad_client.get_allocation_logs(alloc_id)
|
||||
else:
|
||||
# Not in Consul
|
||||
if node_data["litefs_primary"]:
|
||||
# If it's primary in LiteFS but not in Consul, that's an error (unless just started)
|
||||
node_data["status"] = "unregistered"
|
||||
is_healthy = False
|
||||
node_data["nomad_logs"] = nomad_client.get_allocation_logs(alloc_id)
|
||||
else:
|
||||
# Replicas are expected to be unregistered in the new model
|
||||
node_data["status"] = "standby"
|
||||
node_data["check_output"] = "Clean catalog (expected for replica)"
|
||||
|
||||
aggregated_nodes.append(node_data)
|
||||
|
||||
# Final health check
|
||||
health = "Healthy"
|
||||
if not is_healthy:
|
||||
health = "Unhealthy"
|
||||
elif primary_count == 0:
|
||||
|
||||
if primary_count == 0:
|
||||
health = "No Primary Detected"
|
||||
elif primary_count > 1:
|
||||
health = "Split Brain Detected (Multiple Primaries)"
|
||||
|
||||
Reference in New Issue
Block a user