This commit is contained in:
2025-11-24 23:57:38 +00:00
parent 2288bd52a4
commit 7030361f2e
2 changed files with 254 additions and 101 deletions

View File

@@ -4,115 +4,274 @@
config, config,
... ...
}: let }: let
# 1. Merged Hosts Template # Script to generate DNS records from Consul services with Traefik tags
consulAllHostsTemplate = pkgs.writeText "consul-all-hosts.ctmpl" '' consulDnsSync = pkgs.writeShellScript "consul-dns-sync" ''
# --- Static Hosts from Consul KV --- #!/usr/bin/env bash
{{ printf "\n" }} set -euo pipefail
{{- range ls "dns/hosts" -}}
{{ .Value }} {{ .Key }}
{{ printf "\n" }}
{{- end -}}
# --- Dynamic Hosts from Consul Services (Traefik Tags) --- LOCK_FILE="/var/run/consul-dns-sync.lock"
{{ printf "\n" }} HOSTS_FILE="/var/lib/coredns/consul-hosts"
{{- range services -}} TEMP_FILE="/tmp/consul-hosts.tmp"
{{- range service .Name -}} LAST_UPDATE_FILE="/var/run/consul-dns-sync.last"
{{- /* Determine IP: Use Service Address, fall back to Node Address */ -}} MIN_UPDATE_INTERVAL=5 # Minimum seconds between updates
{{- $ip := .Address -}}
{{- if eq $ip "" -}}
{{- $ip = .NodeAddress -}}
{{- end -}}
{{- /* Scan Tags */ -}} # Simple file-based locking to prevent concurrent runs
{{- range .Tags -}} if [ -f "$LOCK_FILE" ]; then
{{- if . | regexMatch "traefik.http.routers.*.rule=Host" -}} LOCK_AGE=$(($(date +%s) - $(stat -c %Y "$LOCK_FILE" 2>/dev/null || echo 0)))
if [ $LOCK_AGE -lt 60 ]; then
echo "[$(date)] Sync already running, skipping" >&2
exit 0
else
echo "[$(date)] Stale lock detected, removing" >&2
rm -f "$LOCK_FILE"
fi
fi
{{- /* 1. Extract content inside Host(...) */ -}} # Check if we updated too recently (debouncing)
{{- $content := . | regexReplaceAll ".*Host\\(([^)]+)\\).*" "$1" -}} if [ -f "$LAST_UPDATE_FILE" ]; then
LAST_UPDATE=$(cat "$LAST_UPDATE_FILE")
CURRENT_TIME=$(date +%s)
TIME_SINCE_LAST=$((CURRENT_TIME - LAST_UPDATE))
{{- /* 2. Clean up quotes and spaces */ -}} if [ $TIME_SINCE_LAST -lt $MIN_UPDATE_INTERVAL ]; then
{{- $clean := $content | regexReplaceAll "[`'\"\\s]" "" -}} echo "[$(date)] Updated $TIME_SINCE_LAST seconds ago, debouncing (min: $MIN_UPDATE_INTERVAL)" >&2
exit 0
fi
fi
{{- /* 3. Split by comma and print */ -}} touch "$LOCK_FILE"
{{- range split "," $clean -}} trap "rm -f $LOCK_FILE" EXIT
{{- if ne . "" -}}
{{ $ip }} {{ . }} echo "[$(date)] Starting DNS sync from Consul" >&2
{{ printf "\n" }}
{{- end -}} # Query Consul API directly (more reliable than stdin during flapping)
{{- end -}} SERVICES=$(${pkgs.curl}/bin/curl -sf http://localhost:8500/v1/health/state/any 2>/dev/null || echo "[]")
{{- end -}}
{{- end -}} if [ "$SERVICES" = "[]" ] || [ -z "$SERVICES" ]; then
{{- end -}} echo "[$(date)] Failed to fetch services from Consul, keeping existing hosts" >&2
{{- end -}} exit 1
fi
# Generate hosts file from services with traefik tags.
# The header carries no timestamp on purpose: a changing header would make the
# comparison further down see a difference on every run and reload CoreDNS each time.
echo "# Auto-generated from Consul services" > "$TEMP_FILE"

# Names of every service that carries a Traefik Host(...) router rule.
# /v1/catalog/services returns {"<service>": ["<tag>", ...]}; @uri makes each
# name safe to splice into the per-service URL below.
ROUTED_SERVICES=$(
  ${pkgs.curl}/bin/curl -sf http://localhost:8500/v1/catalog/services 2>/dev/null \
    | ${pkgs.jq}/bin/jq -r '
        to_entries[]
        | select(any(.value[]?; startswith("traefik.http.routers.") and contains(".rule=Host")))
        | .key | @uri
      '
) || {
  echo "[$(date)] Failed to list services from Consul, keeping existing hosts" >&2
  rm -f "$TEMP_FILE"
  exit 1
}

# Resolve each routed service through /v1/health/service/<name>, whose entries
# have the {Node, Service, Checks} shape the filter needs.
#  - an instance is kept when none of its checks is worse than "warning"
#  - Service.Address is an empty string (not null) when unset, so both cases
#    fall back to Node.Address
#  - the address is bound to $address BEFORE iterating the tags, otherwise it
#    is no longer reachable once the tag string becomes the current value
#  - a rule may hold several Host(...) clauses, each a comma-separated list
#  - output is one plain "address host" line per unique hostname
# A failed fetch aborts the whole sync instead of silently dropping that
# service's records from the hosts file.
RECORDS=$(
  for SERVICE_NAME in $ROUTED_SERVICES; do
    ${pkgs.curl}/bin/curl -sf "http://localhost:8500/v1/health/service/$SERVICE_NAME" 2>/dev/null || exit 1
  done | ${pkgs.jq}/bin/jq -rs '
    [ .[][]
      | select(all(.Checks[]?; .Status == "passing" or .Status == "warning"))
      | (if (.Service.Address // "") == "" then .Node.Address else .Service.Address end) as $address
      | select(($address // "") != "")
      | .Service.Tags[]?
      | select(startswith("traefik.http.routers.") and contains(".rule=Host"))
      | scan("Host\\(([^)]+)\\)") | .[0]
      | gsub("[`\"\\s]"; "") | split(",")[]
      | select(. != "")
      | {host: ., address: $address}
    ]
    | unique_by(.host)[]
    | "\(.address) \(.host)"
  '
) || {
  echo "[$(date)] Failed to resolve service instances from Consul, keeping existing hosts" >&2
  rm -f "$TEMP_FILE"
  exit 1
}

if [ -n "$RECORDS" ]; then
  echo "$RECORDS" >> "$TEMP_FILE"
  echo "$RECORDS" | while read -r ADDRESS HOST; do
    echo "[$(date)] Added: $ADDRESS -> $HOST" >&2
  done
fi

# Add static entries for critical services (always accessible even during flapping)
# These ensure you can always reach Nomad/Consul UIs
{
  echo "# Static critical services - always available"
  echo "192.168.4.250 consul.fbleagh.duckdns.org"
  echo "192.168.4.250 nomad.fbleagh.duckdns.org"
} >> "$TEMP_FILE"

# Only update if there were actual changes.
# cmp is called through its store path, like curl/jq/systemctl, so the check
# does not depend on what the calling unit happens to have on PATH.
if ! ${pkgs.diffutils}/bin/cmp -s "$TEMP_FILE" "$HOSTS_FILE" 2>/dev/null; then
  # Stage next to the target and rename, so a reader polling the hosts file
  # never observes a half-written copy.
  cp "$TEMP_FILE" "$HOSTS_FILE.new"
  chmod 0644 "$HOSTS_FILE.new"
  mv -f "$HOSTS_FILE.new" "$HOSTS_FILE"
  date +%s > "$LAST_UPDATE_FILE"
  echo "[$(date)] DNS hosts file updated, reloading CoreDNS" >&2
  ${pkgs.systemd}/bin/systemctl reload coredns.service 2>/dev/null || true
else
  echo "[$(date)] No changes detected, skipping reload" >&2
fi

rm -f "$TEMP_FILE"
''; '';
# 2. Wrapper script to ensure clean execution and environment setup
# Builds a shell script that runs consul-template in the foreground with one
# -template argument of the form "source:destination:command":
#   source      = consulAllHostsTemplate
#   destination = /etc/coredns/consul-all-hosts
#   command     = systemctl reload coredns
consulTemplateWrapper = pkgs.writeShellScript "consul-template-wrapper" ''
# Only render the single merged template file
${pkgs.consul-template}/bin/consul-template \
-template "${consulAllHostsTemplate}:/etc/coredns/consul-all-hosts:${pkgs.systemd}/bin/systemctl reload coredns" \
-log-level info
'';
in { in {
# --- CoreDNS Configuration --- # Create CoreDNS configuration file with increased cache and stability
environment.etc."coredns/Corefile".text = '' environment.etc."coredns/Corefile".text = ''
# Forward Consul DNS queries to the local Consul Agent # Handle .consul domain - forward ALL to Consul
consul:53 { consul:53 {
forward . 127.0.0.1:8600 forward . 127.0.0.1:8600
cache 30 cache 30
errors errors
log log {
class error
}
} }
# Handle custom domain # Handle fbleagh.duckdns.org domain
fbleagh.duckdns.org:53 { fbleagh.duckdns.org:53 {
# CRITICAL FIX: Use only one hosts file/plugin definition # Load dynamic hosts from Consul (now in writable location)
hosts /etc/coredns/consul-all-hosts { hosts /var/lib/coredns/consul-hosts {
ttl 60
reload 5s
fallthrough fallthrough
} }
# Forward service.* queries to Consul with retries
forward service.dc1.fbleagh.duckdns.org 127.0.0.1:8600 {
max_fails 3
expire 10s
health_check 5s
}
# Cache aggressively to handle flapping
cache 300 {
success 4096
denial 1024
prefetch 10
}
# Fallback to upstream DNS # Fallback to upstream DNS
forward . 192.168.4.1 8.8.8.8 forward . 192.168.4.1 8.8.8.8 {
cache 30 max_fails 3
expire 10s
health_check 5s
}
errors errors
log log {
class error
}
} }
# Handle all other DNS queries # Handle all other DNS queries
.:53 { .:53 {
forward . 192.168.4.1 8.8.8.8 forward . 192.168.4.1 8.8.8.8 {
cache 30 max_fails 3
expire 10s
health_check 5s
}
cache 300 {
success 4096
denial 1024
}
errors errors
log log {
class error
}
} }
''; '';
# Create initial hosts file with critical services
# Installs a fixed file at /etc/coredns/consul-hosts holding the two static records.
# NOTE(review): the Corefile in this module points its hosts plugin at
# /var/lib/coredns/consul-hosts, not at this /etc path - confirm something still
# reads this copy, otherwise it is dead configuration.
environment.etc."coredns/consul-hosts".text = ''
# Placeholder - will be populated by consul-watch
# Static critical services - always available
192.168.4.250 consul.fbleagh.duckdns.org
192.168.4.250 nomad.fbleagh.duckdns.org
'';
# Create writable directory for dynamic hosts file
systemd.tmpfiles.rules = [ systemd.tmpfiles.rules = [
"f /etc/coredns/consul-all-hosts 0644 root root - #" "d /var/lib/coredns 0755 root root -"
# Seed the writable hosts file with the static records if it does not exist yet.
# "\\n" keeps a literal backslash-n in the rule text for systemd-tmpfiles to
# expand; a bare "\n" is a real newline inside a Nix double-quoted string and
# would split this single rule across three (invalid) tmpfiles.d lines.
"f /var/lib/coredns/consul-hosts 0644 root root - # Placeholder\\n192.168.4.250 consul.fbleagh.duckdns.org\\n192.168.4.250 nomad.fbleagh.duckdns.org"
]; ];
# --- Consul Template Service --- # Systemd service for Consul watch with rate limiting
systemd.services.consul-template = { systemd.services.consul-watch = {
description = "Consul Template for CoreDNS Hosts"; description = "Consul watch for DNS updates";
after = ["consul.service" "coredns.service"];
requires = ["consul.service"];
wantedBy = ["multi-user.target"]; wantedBy = ["multi-user.target"];
after = ["consul.service" "coredns.service" "network.target"];
requires = ["coredns.service"];
serviceConfig = { serviceConfig = {
# Use the robust wrapper script Type = "simple";
ExecStart = "${consulTemplateWrapper}"; # Use exec mode to avoid shell overhead during flapping
# Run the sync handler whenever any health check changes.
# A `-type=service` watch follows exactly one service by literal name, so
# `-service=.*` only ever watched a service called ".*"; the `checks` watch type
# without a filter covers every check in every state.
ExecStart = "${pkgs.consul}/bin/consul watch -type=checks ${consulDnsSync}";
Restart = "always"; Restart = "on-failure";
RestartSec = "10s"; RestartSec = "10s";
User = "root";
# Rate limiting: max 10 starts in 30 seconds
StartLimitIntervalSec = 30;
StartLimitBurst = 10;
# Logging - only errors to reduce noise
StandardOutput = "journal";
StandardError = "journal";
# Resource limits to prevent runaway during flapping
CPUQuota = "25%";
MemoryMax = "128M";
}; };
}; };
# --- CoreDNS Service --- # Initial sync on boot
systemd.services.consul-dns-initial-sync = {
  description = "Initial DNS sync from Consul";
  wantedBy = ["multi-user.target"];
  requires = ["consul.service" "coredns.service"];
  after = ["consul.service" "coredns.service"];

  serviceConfig = {
    Type = "oneshot";
    User = "root";
    TimeoutStartSec = "90s";
    ExecStart = pkgs.writeShellScript "initial-sync" ''
      # Poll the Consul leader endpoint every 2 seconds, giving up after 60
      limit=60
      waited=0
      while ! ${pkgs.curl}/bin/curl -sf http://localhost:8500/v1/status/leader > /dev/null 2>&1; do
        if [ $waited -ge $limit ]; then
          echo "Timeout waiting for Consul"
          exit 1
        fi
        echo "Waiting for Consul..."
        sleep 2
        waited=$((waited + 2))
      done

      # Consul answered: perform the first hosts-file sync
      ${consulDnsSync}
    '';
  };
};
# Backup timer-based sync as fallback (every 5 minutes)
# One-shot unit that runs the sync script once as root; started by the timer of the same name
systemd.services.consul-dns-timer-sync = {
  description = "Periodic DNS sync from Consul (fallback)";
  serviceConfig.Type = "oneshot";
  serviceConfig.User = "root";
  serviceConfig.ExecStart = "${consulDnsSync}";
};
# Fires 5 minutes after boot and then 5 minutes after each activation,
# with up to 30 seconds of random delay
systemd.timers.consul-dns-timer-sync = {
  description = "Periodic DNS sync timer";
  wantedBy = ["timers.target"];
  timerConfig.RandomizedDelaySec = "30s";
  timerConfig.OnUnitActiveSec = "5min";
  timerConfig.OnBootSec = "5min";
};
# Create systemd service for CoreDNS
systemd.services.coredns = { systemd.services.coredns = {
description = "CoreDNS DNS server"; description = "CoreDNS DNS server";
after = ["network.target"];
wantedBy = ["multi-user.target"]; wantedBy = ["multi-user.target"];
requires = ["consul.service"];
after = ["network.target" "consul.service"];
serviceConfig = { serviceConfig = {
Type = "simple"; Type = "simple";
@@ -120,6 +279,11 @@ in {
ExecReload = "${pkgs.coreutils}/bin/kill -SIGUSR1 $MAINPID"; ExecReload = "${pkgs.coreutils}/bin/kill -SIGUSR1 $MAINPID";
Restart = "on-failure"; Restart = "on-failure";
RestartSec = "5s"; RestartSec = "5s";
# Rate limiting for reloads
ReloadPropagatedFrom = [];
# Security hardening
DynamicUser = true; DynamicUser = true;
AmbientCapabilities = "CAP_NET_BIND_SERVICE"; AmbientCapabilities = "CAP_NET_BIND_SERVICE";
CapabilityBoundingSet = "CAP_NET_BIND_SERVICE"; CapabilityBoundingSet = "CAP_NET_BIND_SERVICE";
@@ -127,21 +291,11 @@ in {
ProtectSystem = "strict"; ProtectSystem = "strict";
ProtectHome = true; ProtectHome = true;
PrivateTmp = true; PrivateTmp = true;
ReadWritePaths = "/etc/coredns"; ReadWritePaths = "/var/lib/coredns";
}; };
}; };
# --- Helper Scripts and Firewall --- # Open firewall for CoreDNS
# Installs consul-template system-wide together with a `debug-consul-template`
# helper command. The helper runs consul-template with -dry against
# consulAllHostsTemplate and pipes the output through grep -v "^$" to drop
# empty lines.
environment.systemPackages = [
pkgs.consul-template
(pkgs.writeShellScriptBin "debug-consul-template" ''
echo "Rendering template to stdout..."
${pkgs.consul-template}/bin/consul-template \
-template "${consulAllHostsTemplate}:-" \
-dry | grep -v "^$"
'') # <--- This is the crucial closing of the multi-line string
];
networking.firewall = { networking.firewall = {
allowedTCPPorts = [53]; allowedTCPPorts = [53];
allowedUDPPorts = [53]; allowedUDPPorts = [53];

File diff suppressed because one or more lines are too long