mirror of
https://github.com/sstent/nixos-cluster.git
synced 2026-01-25 06:32:08 +00:00
sync
This commit is contained in:
@@ -4,115 +4,274 @@
|
|||||||
config,
|
config,
|
||||||
...
|
...
|
||||||
}: let
|
}: let
|
||||||
# 1. Merged Hosts Template
|
# Script to generate DNS records from Consul services with Traefik tags
|
||||||
consulAllHostsTemplate = pkgs.writeText "consul-all-hosts.ctmpl" ''
|
consulDnsSync = pkgs.writeShellScript "consul-dns-sync" ''
|
||||||
# --- Static Hosts from Consul KV ---
|
#!/usr/bin/env bash
|
||||||
{{ printf "\n" }}
|
set -euo pipefail
|
||||||
{{- range ls "dns/hosts" -}}
|
|
||||||
{{ .Value }} {{ .Key }}
|
|
||||||
{{ printf "\n" }}
|
|
||||||
{{- end -}}
|
|
||||||
|
|
||||||
# --- Dynamic Hosts from Consul Services (Traefik Tags) ---
|
LOCK_FILE="/var/run/consul-dns-sync.lock"
|
||||||
{{ printf "\n" }}
|
HOSTS_FILE="/var/lib/coredns/consul-hosts"
|
||||||
{{- range services -}}
|
TEMP_FILE="/tmp/consul-hosts.tmp"
|
||||||
{{- range service .Name -}}
|
LAST_UPDATE_FILE="/var/run/consul-dns-sync.last"
|
||||||
{{- /* Determine IP: Use Service Address, fall back to Node Address */ -}}
|
MIN_UPDATE_INTERVAL=5 # Minimum seconds between updates
|
||||||
{{- $ip := .Address -}}
|
|
||||||
{{- if eq $ip "" -}}
|
|
||||||
{{- $ip = .NodeAddress -}}
|
|
||||||
{{- end -}}
|
|
||||||
|
|
||||||
{{- /* Scan Tags */ -}}
|
# Simple file-based locking to prevent concurrent runs
|
||||||
{{- range .Tags -}}
|
if [ -f "$LOCK_FILE" ]; then
|
||||||
{{- if . | regexMatch "traefik.http.routers.*.rule=Host" -}}
|
LOCK_AGE=$(($(date +%s) - $(stat -c %Y "$LOCK_FILE" 2>/dev/null || echo 0)))
|
||||||
|
if [ $LOCK_AGE -lt 60 ]; then
|
||||||
|
echo "[$(date)] Sync already running, skipping" >&2
|
||||||
|
exit 0
|
||||||
|
else
|
||||||
|
echo "[$(date)] Stale lock detected, removing" >&2
|
||||||
|
rm -f "$LOCK_FILE"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
{{- /* 1. Extract content inside Host(...) */ -}}
|
# Check if we updated too recently (debouncing)
|
||||||
{{- $content := . | regexReplaceAll ".*Host\\(([^)]+)\\).*" "$1" -}}
|
if [ -f "$LAST_UPDATE_FILE" ]; then
|
||||||
|
LAST_UPDATE=$(cat "$LAST_UPDATE_FILE")
|
||||||
|
CURRENT_TIME=$(date +%s)
|
||||||
|
TIME_SINCE_LAST=$((CURRENT_TIME - LAST_UPDATE))
|
||||||
|
|
||||||
{{- /* 2. Clean up quotes and spaces */ -}}
|
if [ $TIME_SINCE_LAST -lt $MIN_UPDATE_INTERVAL ]; then
|
||||||
{{- $clean := $content | regexReplaceAll "[`'\"\\s]" "" -}}
|
echo "[$(date)] Updated $TIME_SINCE_LAST seconds ago, debouncing (min: $MIN_UPDATE_INTERVAL)" >&2
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
{{- /* 3. Split by comma and print */ -}}
|
touch "$LOCK_FILE"
|
||||||
{{- range split "," $clean -}}
|
trap 'rm -f -- "$LOCK_FILE"' EXIT  # single-quoted: expanded when the trap fires, with the path kept quoted
|
||||||
{{- if ne . "" -}}
|
|
||||||
{{ $ip }} {{ . }}
|
echo "[$(date)] Starting DNS sync from Consul" >&2
|
||||||
{{ printf "\n" }}
|
|
||||||
{{- end -}}
|
# Query Consul API directly (more reliable than stdin during flapping)
|
||||||
{{- end -}}
|
SERVICES=$(${pkgs.curl}/bin/curl -sf http://localhost:8500/v1/health/state/any 2>/dev/null || echo "[]")
|
||||||
{{- end -}}
|
|
||||||
{{- end -}}
|
if [ "$SERVICES" = "[]" ] || [ -z "$SERVICES" ]; then
|
||||||
{{- end -}}
|
echo "[$(date)] Failed to fetch services from Consul, keeping existing hosts" >&2
|
||||||
{{- end -}}
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Generate hosts file from services with traefik tags
|
||||||
|
echo "# Auto-generated from Consul services - $(date)" > "$TEMP_FILE"
|
||||||
|
|
||||||
|
# Parse the Consul data into "<address> <hostname>" lines appended to $TEMP_FILE:
|
||||||
|
# entries whose status is passing/warning contribute every hostname found in their
|
||||||
|
# traefik.http.routers.*.rule=Host(...) tags; duplicate hostnames are dropped by unique_by(.host).
|
||||||
|
# The second jq runs with -c so each object stays on one line and "read" gets one complete JSON value.
|
||||||
|
# NOTE(review): the filter reads .Service/.Node/.Status per entry, but /v1/health/state/any
|
||||||
|
# appears to return flat check records (ServiceName, ServiceTags) - confirm the payload shape.
|
||||||
|
echo "$SERVICES" | ${pkgs.jq}/bin/jq -r '
|
||||||
|
.[] |
|
||||||
|
select(.Service.Tags != null) |
|
||||||
|
{
|
||||||
|
tags: .Service.Tags,
|
||||||
|
# Fall back to the node address when the service address is null or empty ("//" alone only catches null/false).
|
||||||
|
address: (if (.Service.Address // "") != "" then .Service.Address else .Node.Address end),
|
||||||
|
port: .Service.Port,
|
||||||
|
status: .Status
|
||||||
|
} |
|
||||||
|
select(.status == "passing" or .status == "warning") |
|
||||||
|
# Bind the record first: after .tags[] the context is a bare tag string with no address.
|
||||||
|
. as $svc |
|
||||||
|
.tags[] |
|
||||||
|
select(startswith("traefik.http.routers.") and contains(".rule=Host")) |
|
||||||
|
. as $tag |
|
||||||
|
($tag | capture("Host\\((?<hosts>[^)]+)\\)") | .hosts | gsub("[`\"\\s]"; "") | split(",")[]) as $host |
|
||||||
|
{
|
||||||
|
host: $host,
|
||||||
|
address: $svc.address
|
||||||
|
}
|
||||||
|
' 2>/dev/null | ${pkgs.jq}/bin/jq -s -c 'unique_by(.host) | .[]' 2>/dev/null | while read -r line; do
|
||||||
|
HOST=$(echo "$line" | ${pkgs.jq}/bin/jq -r '.host // empty' 2>/dev/null)
|
||||||
|
ADDRESS=$(echo "$line" | ${pkgs.jq}/bin/jq -r '.address // empty' 2>/dev/null)
|
||||||
|
|
||||||
|
if [ -n "$HOST" ] && [ -n "$ADDRESS" ] && [ "$ADDRESS" != "null" ]; then
|
||||||
|
echo "$ADDRESS $HOST" >> "$TEMP_FILE"
|
||||||
|
echo "[$(date)] Added: $ADDRESS -> $HOST" >&2
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# Add static entries for critical services (always accessible even during flapping)
|
||||||
|
# These ensure you can always reach Nomad/Consul UIs
|
||||||
|
cat >> "$TEMP_FILE" << 'EOF'
|
||||||
|
# Static critical services - always available
|
||||||
|
192.168.4.250 consul.fbleagh.duckdns.org
|
||||||
|
192.168.4.250 nomad.fbleagh.duckdns.org
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Only update if there were actual changes
|
||||||
|
if ! cmp -s "$TEMP_FILE" "$HOSTS_FILE" 2>/dev/null; then
|
||||||
|
cp "$TEMP_FILE" "$HOSTS_FILE"
|
||||||
|
date +%s > "$LAST_UPDATE_FILE"
|
||||||
|
echo "[$(date)] DNS hosts file updated, reloading CoreDNS" >&2
|
||||||
|
${pkgs.systemd}/bin/systemctl reload coredns.service 2>/dev/null || true
|
||||||
|
else
|
||||||
|
echo "[$(date)] No changes detected, skipping reload" >&2
|
||||||
|
fi
|
||||||
|
|
||||||
|
rm -f "$TEMP_FILE"
|
||||||
'';
|
'';
|
||||||
|
|
||||||
# 2. Wrapper script to ensure clean execution and environment setup
|
|
||||||
consulTemplateWrapper = pkgs.writeShellScript "consul-template-wrapper" ''
|
|
||||||
# Only render the single merged template file
|
|
||||||
${pkgs.consul-template}/bin/consul-template \
|
|
||||||
-template "${consulAllHostsTemplate}:/etc/coredns/consul-all-hosts:${pkgs.systemd}/bin/systemctl reload coredns" \
|
|
||||||
-log-level info
|
|
||||||
'';
|
|
||||||
|
|
||||||
in {
|
in {
|
||||||
# --- CoreDNS Configuration ---
|
# Create CoreDNS configuration file with increased cache and stability
|
||||||
environment.etc."coredns/Corefile".text = ''
|
environment.etc."coredns/Corefile".text = ''
|
||||||
# Forward Consul DNS queries to the local Consul Agent
|
# Handle .consul domain - forward ALL to Consul
|
||||||
consul:53 {
|
consul:53 {
|
||||||
forward . 127.0.0.1:8600
|
forward . 127.0.0.1:8600
|
||||||
cache 30
|
cache 30
|
||||||
errors
|
errors
|
||||||
log
|
log {
|
||||||
|
class error
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# Handle custom domain
|
# Handle fbleagh.duckdns.org domain
|
||||||
fbleagh.duckdns.org:53 {
|
fbleagh.duckdns.org:53 {
|
||||||
# CRITICAL FIX: Use only one hosts file/plugin definition
|
# Load dynamic hosts from Consul (now in writable location)
|
||||||
hosts /etc/coredns/consul-all-hosts {
|
hosts /var/lib/coredns/consul-hosts {
|
||||||
|
ttl 60
|
||||||
|
reload 5s
|
||||||
fallthrough
|
fallthrough
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Forward service.* queries to Consul with retries
|
||||||
|
forward service.dc1.fbleagh.duckdns.org 127.0.0.1:8600 {
|
||||||
|
max_fails 3
|
||||||
|
expire 10s
|
||||||
|
health_check 5s
|
||||||
|
}
|
||||||
|
|
||||||
|
# Cache aggressively to handle flapping
|
||||||
|
cache 300 {
|
||||||
|
success 4096
|
||||||
|
denial 1024
|
||||||
|
prefetch 10
|
||||||
|
}
|
||||||
|
|
||||||
# Fallback to upstream DNS
|
# Fallback to upstream DNS
|
||||||
forward . 192.168.4.1 8.8.8.8
|
forward . 192.168.4.1 8.8.8.8 {
|
||||||
cache 30
|
max_fails 3
|
||||||
|
expire 10s
|
||||||
|
health_check 5s
|
||||||
|
}
|
||||||
|
|
||||||
errors
|
errors
|
||||||
log
|
log {
|
||||||
|
class error
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# Handle all other DNS queries
|
# Handle all other DNS queries
|
||||||
.:53 {
|
.:53 {
|
||||||
forward . 192.168.4.1 8.8.8.8
|
forward . 192.168.4.1 8.8.8.8 {
|
||||||
cache 30
|
max_fails 3
|
||||||
|
expire 10s
|
||||||
|
health_check 5s
|
||||||
|
}
|
||||||
|
|
||||||
|
cache 300 {
|
||||||
|
success 4096
|
||||||
|
denial 1024
|
||||||
|
}
|
||||||
|
|
||||||
errors
|
errors
|
||||||
log
|
log {
|
||||||
|
class error
|
||||||
|
}
|
||||||
}
|
}
|
||||||
'';
|
'';
|
||||||
|
|
||||||
|
# Create initial hosts file with critical services
|
||||||
|
environment.etc."coredns/consul-hosts".text = ''
|
||||||
|
# Placeholder - will be populated by consul-watch
|
||||||
|
# Static critical services - always available
|
||||||
|
192.168.4.250 consul.fbleagh.duckdns.org
|
||||||
|
192.168.4.250 nomad.fbleagh.duckdns.org
|
||||||
|
'';
|
||||||
|
|
||||||
|
# Create a writable directory for the dynamic hosts file and seed the file with the static entries
|
||||||
|
# The seed is written with \\n: Nix emits a literal \n for tmpfiles.d to unescape, whereas a bare \n would split the rule across several lines
|
||||||
systemd.tmpfiles.rules = [
|
systemd.tmpfiles.rules = [
|
||||||
"f /etc/coredns/consul-all-hosts 0644 root root - #"
|
"d /var/lib/coredns 0755 root root -"
|
||||||
|
"f /var/lib/coredns/consul-hosts 0644 root root - # Placeholder\\n192.168.4.250 consul.fbleagh.duckdns.org\\n192.168.4.250 nomad.fbleagh.duckdns.org"
|
||||||
];
|
];
|
||||||
|
|
||||||
# --- Consul Template Service ---
|
# Systemd service for Consul watch with rate limiting
|
||||||
systemd.services.consul-template = {
|
systemd.services.consul-watch = {
|
||||||
description = "Consul Template for CoreDNS Hosts";
|
description = "Consul watch for DNS updates";
|
||||||
|
after = ["consul.service" "coredns.service"];
|
||||||
|
requires = ["consul.service"];
|
||||||
wantedBy = ["multi-user.target"];
|
wantedBy = ["multi-user.target"];
|
||||||
after = ["consul.service" "coredns.service" "network.target"];
|
|
||||||
requires = ["coredns.service"];
|
|
||||||
|
|
||||||
serviceConfig = {
|
serviceConfig = {
|
||||||
# Use the robust wrapper script
|
Type = "simple";
|
||||||
ExecStart = "${consulTemplateWrapper}";
|
# Watch the service catalog: the "service" watch type takes one exact service name (no patterns), so "-service=.*" never matched; the handler re-queries Consul itself
|
||||||
|
ExecStart = "${pkgs.consul}/bin/consul watch -type=services ${consulDnsSync}";
|
||||||
Restart = "always";
|
Restart = "on-failure";
|
||||||
RestartSec = "10s";
|
RestartSec = "10s";
|
||||||
|
User = "root";
|
||||||
|
|
||||||
|
# Rate limiting: max 10 starts in 30 seconds
|
||||||
|
StartLimitIntervalSec = 30;
|
||||||
|
StartLimitBurst = 10;
|
||||||
|
|
||||||
|
# Logging - send both stdout and stderr to the journal
|
||||||
|
StandardOutput = "journal";
|
||||||
|
StandardError = "journal";
|
||||||
|
|
||||||
|
# Resource limits to prevent runaway during flapping
|
||||||
|
CPUQuota = "25%";
|
||||||
|
MemoryMax = "128M";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
# --- CoreDNS Service ---
|
# Initial sync on boot
|
||||||
|
systemd.services.consul-dns-initial-sync = {
|
||||||
|
description = "Initial DNS sync from Consul";
|
||||||
|
after = ["consul.service" "coredns.service"];
|
||||||
|
requires = ["consul.service" "coredns.service"];
|
||||||
|
wantedBy = ["multi-user.target"];
|
||||||
|
|
||||||
|
serviceConfig = {
|
||||||
|
Type = "oneshot";
|
||||||
|
ExecStart = pkgs.writeShellScript "initial-sync" ''
|
||||||
|
# Wait for Consul to be ready
|
||||||
|
timeout=60
|
||||||
|
elapsed=0
|
||||||
|
until ${pkgs.curl}/bin/curl -sf http://localhost:8500/v1/status/leader > /dev/null 2>&1; do
|
||||||
|
if [ $elapsed -ge $timeout ]; then
|
||||||
|
echo "Timeout waiting for Consul"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Waiting for Consul..."
|
||||||
|
sleep 2
|
||||||
|
elapsed=$((elapsed + 2))
|
||||||
|
done
|
||||||
|
|
||||||
|
# Run initial sync
|
||||||
|
${consulDnsSync}
|
||||||
|
'';
|
||||||
|
User = "root";
|
||||||
|
TimeoutStartSec = "90s";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# Backup timer-based sync as fallback (every 5 minutes)
|
||||||
|
systemd.services.consul-dns-timer-sync = {
|
||||||
|
description = "Periodic DNS sync from Consul (fallback)";
|
||||||
|
serviceConfig = {
|
||||||
|
Type = "oneshot";
|
||||||
|
ExecStart = "${consulDnsSync}";
|
||||||
|
User = "root";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
systemd.timers.consul-dns-timer-sync = {
|
||||||
|
description = "Periodic DNS sync timer";
|
||||||
|
wantedBy = ["timers.target"];
|
||||||
|
timerConfig = {
|
||||||
|
OnBootSec = "5min";
|
||||||
|
OnUnitActiveSec = "5min";
|
||||||
|
RandomizedDelaySec = "30s";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# Create systemd service for CoreDNS
|
||||||
systemd.services.coredns = {
|
systemd.services.coredns = {
|
||||||
description = "CoreDNS DNS server";
|
description = "CoreDNS DNS server";
|
||||||
|
after = ["network.target"];
|
||||||
wantedBy = ["multi-user.target"];
|
wantedBy = ["multi-user.target"];
|
||||||
requires = ["consul.service"];
|
|
||||||
after = ["network.target" "consul.service"];
|
|
||||||
|
|
||||||
serviceConfig = {
|
serviceConfig = {
|
||||||
Type = "simple";
|
Type = "simple";
|
||||||
@@ -120,6 +279,11 @@ in {
|
|||||||
ExecReload = "${pkgs.coreutils}/bin/kill -SIGUSR1 $MAINPID";
|
ExecReload = "${pkgs.coreutils}/bin/kill -SIGUSR1 $MAINPID";
|
||||||
Restart = "on-failure";
|
Restart = "on-failure";
|
||||||
RestartSec = "5s";
|
RestartSec = "5s";
|
||||||
|
|
||||||
|
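# Reload propagation (not rate limiting): an empty list means no other unit's reload is forwarded to CoreDNS. NOTE(review): ReloadPropagatedFrom= is documented as a [Unit] setting - confirm it takes effect under serviceConfig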
|
||||||
|
ReloadPropagatedFrom = [];
|
||||||
|
|
||||||
|
# Security hardening
|
||||||
DynamicUser = true;
|
DynamicUser = true;
|
||||||
AmbientCapabilities = "CAP_NET_BIND_SERVICE";
|
AmbientCapabilities = "CAP_NET_BIND_SERVICE";
|
||||||
CapabilityBoundingSet = "CAP_NET_BIND_SERVICE";
|
CapabilityBoundingSet = "CAP_NET_BIND_SERVICE";
|
||||||
@@ -127,21 +291,11 @@ in {
|
|||||||
ProtectSystem = "strict";
|
ProtectSystem = "strict";
|
||||||
ProtectHome = true;
|
ProtectHome = true;
|
||||||
PrivateTmp = true;
|
PrivateTmp = true;
|
||||||
ReadWritePaths = "/etc/coredns";
|
ReadWritePaths = "/var/lib/coredns";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
# --- Helper Scripts and Firewall ---
|
# Open firewall for CoreDNS
|
||||||
environment.systemPackages = [
|
|
||||||
pkgs.consul-template
|
|
||||||
(pkgs.writeShellScriptBin "debug-consul-template" ''
|
|
||||||
echo "Rendering template to stdout..."
|
|
||||||
${pkgs.consul-template}/bin/consul-template \
|
|
||||||
-template "${consulAllHostsTemplate}:-" \
|
|
||||||
-dry | grep -v "^$"
|
|
||||||
'') # <--- This is the crucial closing of the multi-line string
|
|
||||||
];
|
|
||||||
|
|
||||||
networking.firewall = {
|
networking.firewall = {
|
||||||
allowedTCPPorts = [53];
|
allowedTCPPorts = [53];
|
||||||
allowedUDPPorts = [53];
|
allowedUDPPorts = [53];
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user