conductor(checkpoint): Checkpoint end of Phase 1
This commit is contained in:
@@ -1,11 +1,36 @@
|
||||
import requests
|
||||
import nomad_client
|
||||
import re
|
||||
|
||||
def get_node_status(node_address, port=20202):
|
||||
def parse_litefs_status(output):
    """
    Parses the output of 'litefs status'.

    Scans ``output`` for the labels "Primary:", "Uptime:" and
    "Replication Lag:" and collects the value that follows each one on the
    same line.

    Args:
        output: Text to scan (a ``str``).

    Returns:
        A dict containing only the fields that were found:
          - "is_primary": bool, from a case-insensitive "Primary: true|false".
          - "uptime": str, the first whitespace-delimited token after "Uptime:".
          - "replication_lag": str, the first token after "Replication Lag:".
        An empty dict means none of the labels were recognised.
    """
    # Label/value separator: any whitespace EXCEPT a line break. With a plain
    # r"\s+" a label whose value is empty ("Uptime:\n...") would swallow the
    # newline and capture the first word of the following line instead.
    gap = r"[^\S\r\n]+"

    status = {}

    # Extract Primary (the only field that is converted to a non-string type)
    primary_match = re.search(
        r"Primary:" + gap + r"(true|false)", output, re.IGNORECASE
    )
    if primary_match:
        status["is_primary"] = primary_match.group(1).lower() == "true"

    # Extract Uptime and Replication Lag; both are kept as raw string tokens.
    for key, label in (
        ("uptime", "Uptime"),
        ("replication_lag", "Replication Lag"),
    ):
        field_match = re.search(label + ":" + gap + r"(\S+)", output)
        if field_match:
            status[key] = field_match.group(1)

    return status
|
||||
|
||||
def get_node_status(node_address, port=20202, alloc_id=None):
|
||||
"""
|
||||
Queries the LiteFS HTTP API on a specific node for its status.
|
||||
Tries /status first, then falls back to /debug/vars.
|
||||
Tries /status first, then /debug/vars, then falls back to nomad alloc exec.
|
||||
"""
|
||||
# Try /status first
|
||||
# 1. Try /status
|
||||
url = f"http://{node_address}:{port}/status"
|
||||
try:
|
||||
response = requests.get(url, timeout=3)
|
||||
@@ -14,7 +39,8 @@ def get_node_status(node_address, port=20202):
|
||||
status = {
|
||||
"is_primary": data.get("primary", False),
|
||||
"uptime": data.get("uptime", 0),
|
||||
"advertise_url": data.get("advertiseURL", "")
|
||||
"advertise_url": data.get("advertiseURL", ""),
|
||||
"dbs": data.get("dbs", {})
|
||||
}
|
||||
if "replicationLag" in data:
|
||||
status["replication_lag"] = data["replicationLag"]
|
||||
@@ -24,28 +50,43 @@ def get_node_status(node_address, port=20202):
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to /debug/vars
|
||||
# 2. Try /debug/vars
|
||||
url = f"http://{node_address}:{port}/debug/vars"
|
||||
try:
|
||||
response = requests.get(url, timeout=3)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
store = data.get("store", {})
|
||||
|
||||
status = {
|
||||
"is_primary": store.get("isPrimary", False),
|
||||
"uptime": "N/A", # Not available in /debug/vars
|
||||
"advertise_url": f"http://{node_address}:{port}" # Best guess
|
||||
}
|
||||
|
||||
# Look for lag in dbs or store if it exists in other versions
|
||||
if "replicationLag" in store:
|
||||
status["replication_lag"] = store["replicationLag"]
|
||||
|
||||
return status
|
||||
except Exception as e:
|
||||
return {
|
||||
"error": str(e),
|
||||
"is_primary": False,
|
||||
"uptime": "N/A"
|
||||
}
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
store = data.get("store", {})
|
||||
status = {
|
||||
"is_primary": store.get("isPrimary", False),
|
||||
"uptime": "N/A",
|
||||
"advertise_url": f"http://{node_address}:{port}",
|
||||
"dbs": store.get("dbs", {})
|
||||
}
|
||||
if "replicationLag" in store:
|
||||
status["replication_lag"] = store["replicationLag"]
|
||||
return status
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 3. Fallback to nomad alloc exec
|
||||
if alloc_id:
|
||||
try:
|
||||
output = nomad_client.exec_command(alloc_id, ["litefs", "status"])
|
||||
if output and "Error" not in output:
|
||||
parsed_status = parse_litefs_status(output)
|
||||
if parsed_status:
|
||||
if "is_primary" not in parsed_status:
|
||||
parsed_status["is_primary"] = False
|
||||
if "uptime" not in parsed_status:
|
||||
parsed_status["uptime"] = "N/A"
|
||||
parsed_status["advertise_url"] = f"nomad://{alloc_id}"
|
||||
return parsed_status
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return {
|
||||
"error": "All status retrieval methods failed",
|
||||
"is_primary": False,
|
||||
"uptime": "N/A"
|
||||
}
|
||||
Reference in New Issue
Block a user