From 3d34cc9b749869734da4674a2ce361db28bded0a Mon Sep 17 00:00:00 2001
From: Nate Brown
Date: Mon, 20 Apr 2026 16:38:14 -0500
Subject: [PATCH] Try to make smoke less flakey (#1663)

---
 .github/workflows/smoke/build-relay.sh   |  8 ++--
 .github/workflows/smoke/build.sh         | 18 +++++---
 .github/workflows/smoke/smoke-relay.sh   | 57 ++++++++++++++++++++----
 .github/workflows/smoke/smoke-vagrant.sh | 47 +++++++++++++++++--
 .github/workflows/smoke/smoke.sh         | 56 +++++++++++++++++++----
 5 files changed, 156 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/smoke/build-relay.sh b/.github/workflows/smoke/build-relay.sh
index 70b07f4e..249e6c84 100755
--- a/.github/workflows/smoke/build-relay.sh
+++ b/.github/workflows/smoke/build-relay.sh
@@ -16,8 +16,10 @@ relay:
     am_relay: true
 EOF
-    export LIGHTHOUSES="192.168.100.1 172.17.0.2:4242"
-    export REMOTE_ALLOW_LIST='{"172.17.0.4/32": false, "172.17.0.5/32": false}'
+    # TEST-NET-3 placeholder IPs; smoke-relay.sh seds them to real container IPs.
+    # Mapping: .2 lighthouse1, .3 host2, .4 host3, .5 host4.
+    export LIGHTHOUSES="192.168.100.1 203.0.113.2:4242"
+    export REMOTE_ALLOW_LIST='{"203.0.113.4/32": false, "203.0.113.5/32": false}'
     HOST="host2" ../genconfig.sh >host2.yml <host3.yml
diff --git a/.github/workflows/smoke/build.sh b/.github/workflows/smoke/build.sh
index dcd132b0..b23516ee 100755
--- a/.github/workflows/smoke/build.sh
+++ b/.github/workflows/smoke/build.sh
@@ -5,9 +5,15 @@ set -e -x
 rm -rf ./build
 mkdir ./build
 
-# TODO: Assumes your docker bridge network is a /24, and the first container that launches will be .1
-# - We could make this better by launching the lighthouse first and then fetching what IP it is.
-NET="$(docker network inspect bridge -f '{{ range .IPAM.Config }}{{ .Subnet }}{{ end }}' | cut -d. -f1-3)"
+# Smoke containers run on a dedicated docker network whose subnet is allocated
+# at smoke time, not known at build time. Configs are written with TEST-NET-3
+# placeholder IPs (RFC 5737) and smoke.sh / smoke-vagrant.sh / smoke-relay.sh
+# sed the real container IPs in before starting nebula.
+#
+# Placeholder mapping (last octet == fixed container slot):
+#   203.0.113.2 -> lighthouse1, 203.0.113.3 -> host2,
+#   203.0.113.4 -> host3, 203.0.113.5 -> host4.
+LIGHTHOUSE_IP="203.0.113.2"
 
 (
     cd build
@@ -25,16 +31,16 @@ NET="$(docker network inspect bridge -f '{{ range .IPAM.Config }}{{ .Subnet }}{{ end }}' | cut -d. -f1-3)"
     ../genconfig.sh >lighthouse1.yml
 
     HOST="host2" \
-        LIGHTHOUSES="192.168.100.1 $NET.2:4242" \
+        LIGHTHOUSES="192.168.100.1 $LIGHTHOUSE_IP:4242" \
         ../genconfig.sh >host2.yml
 
     HOST="host3" \
-        LIGHTHOUSES="192.168.100.1 $NET.2:4242" \
+        LIGHTHOUSES="192.168.100.1 $LIGHTHOUSE_IP:4242" \
         INBOUND='[{"port": "any", "proto": "icmp", "group": "lighthouse"}]' \
         ../genconfig.sh >host3.yml
 
     HOST="host4" \
-        LIGHTHOUSES="192.168.100.1 $NET.2:4242" \
+        LIGHTHOUSES="192.168.100.1 $LIGHTHOUSE_IP:4242" \
         OUTBOUND='[{"port": "any", "proto": "icmp", "group": "lighthouse"}]' \
         ../genconfig.sh >host4.yml
diff --git a/.github/workflows/smoke/smoke-relay.sh b/.github/workflows/smoke/smoke-relay.sh
index 9c113e18..aa1cd915 100755
--- a/.github/workflows/smoke/smoke-relay.sh
+++ b/.github/workflows/smoke/smoke-relay.sh
@@ -6,6 +6,8 @@ set -o pipefail
 
 mkdir -p logs
 
+NETWORK="nebula-smoke-relay"
+
 cleanup() {
     echo
     echo " *** cleanup"
@@ -16,22 +18,53 @@ cleanup() {
     then
         docker kill lighthouse1 host2 host3 host4
     fi
+    docker network rm "$NETWORK" >/dev/null 2>&1
 }
 
 trap cleanup EXIT
 
-docker run --name lighthouse1 --rm nebula:smoke-relay -config lighthouse1.yml -test
-docker run --name host2 --rm nebula:smoke-relay -config host2.yml -test
-docker run --name host3 --rm nebula:smoke-relay -config host3.yml -test
-docker run --name host4 --rm nebula:smoke-relay -config host4.yml -test
 
+# Create a dedicated smoke network with an explicit subnet (required for --ip
+# below). Probe a short list of candidates so a locally-used range doesn't
+# fail the whole test — we only need one to be free.
+docker network rm "$NETWORK" >/dev/null 2>&1 || true
+for candidate in 172.30.0.0/24 172.31.0.0/24 10.98.0.0/24 10.99.0.0/24 192.168.230.0/24; do
+    if docker network create --subnet "$candidate" "$NETWORK" >/dev/null 2>&1; then
+        break
+    fi
+done
+if ! docker network inspect "$NETWORK" >/dev/null 2>&1; then
+    echo "failed to create $NETWORK: every candidate subnet is in use" >&2
+    exit 1
+fi
-docker run --name lighthouse1 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm nebula:smoke-relay -config lighthouse1.yml 2>&1 | tee logs/lighthouse1 | sed -u 's/^/ [lighthouse1] /' &
+# Derive container IPs from the network's assigned subnet. Slots: .2 lighthouse1,
+# .3 host2, .4 host3, .5 host4 — matches the placeholders in build-relay.sh.
+SUBNET="$(docker network inspect -f '{{(index .IPAM.Config 0).Subnet}}' "$NETWORK")"
+PREFIX="${SUBNET%/*}"
+PREFIX="${PREFIX%.*}"
+LIGHTHOUSE_IP="$PREFIX.2"
+HOST2_IP="$PREFIX.3"
+HOST3_IP="$PREFIX.4"
+HOST4_IP="$PREFIX.5"
+
+# Sed the placeholder TEST-NET-3 IPs in the host configs to the real ones.
+for f in build/host2.yml build/host3.yml build/host4.yml; do
+    sed "s|203\.0\.113\.|$PREFIX.|g" "$f" >"$f.tmp"
+    mv "$f.tmp" "$f"
+done
+
+docker run --name lighthouse1 --rm nebula:smoke-relay -config lighthouse1.yml -test
+docker run --name host2 --rm -v "$PWD/build/host2.yml:/nebula/host2.yml:ro" nebula:smoke-relay -config host2.yml -test
+docker run --name host3 --rm -v "$PWD/build/host3.yml:/nebula/host3.yml:ro" nebula:smoke-relay -config host3.yml -test
+docker run --name host4 --rm -v "$PWD/build/host4.yml:/nebula/host4.yml:ro" nebula:smoke-relay -config host4.yml -test
+
+docker run --name lighthouse1 --network "$NETWORK" --ip "$LIGHTHOUSE_IP" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm nebula:smoke-relay -config lighthouse1.yml 2>&1 | tee logs/lighthouse1 | sed -u 's/^/ [lighthouse1] /' &
 sleep 1
-docker run --name host2 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm nebula:smoke-relay -config host2.yml 2>&1 | tee logs/host2 | sed -u 's/^/ [host2] /' &
+docker run --name host2 --network "$NETWORK" --ip "$HOST2_IP" -v "$PWD/build/host2.yml:/nebula/host2.yml:ro" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm nebula:smoke-relay -config host2.yml 2>&1 | tee logs/host2 | sed -u 's/^/ [host2] /' &
 sleep 1
-docker run --name host3 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm nebula:smoke-relay -config host3.yml 2>&1 | tee logs/host3 | sed -u 's/^/ [host3] /' &
+docker run --name host3 --network "$NETWORK" --ip "$HOST3_IP" -v "$PWD/build/host3.yml:/nebula/host3.yml:ro" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm nebula:smoke-relay -config host3.yml 2>&1 | tee logs/host3 | sed -u 's/^/ [host3] /' &
 sleep 1
-docker run --name host4 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm nebula:smoke-relay -config host4.yml 2>&1 | tee logs/host4 | sed -u 's/^/ [host4] /' &
+docker run --name host4 --network "$NETWORK" --ip "$HOST4_IP" -v "$PWD/build/host4.yml:/nebula/host4.yml:ro" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm nebula:smoke-relay -config host4.yml 2>&1 | tee logs/host4 | sed -u 's/^/ [host4] /' &
 sleep 1
 
 set +x
@@ -76,7 +109,13 @@ docker exec host4 sh -c 'kill 1'
 docker exec host3 sh -c 'kill 1'
 docker exec host2 sh -c 'kill 1'
 docker exec lighthouse1 sh -c 'kill 1'
-sleep 5
+
+# Wait up to 30s for all backgrounded jobs to exit rather than relying on a
+# fixed sleep.
+for _ in $(seq 1 30); do
+    [ -z "$(jobs -r)" ] && break
+    sleep 1
+done
 
 if [ "$(jobs -r)" ]
 then
diff --git a/.github/workflows/smoke/smoke-vagrant.sh b/.github/workflows/smoke/smoke-vagrant.sh
index 1c1e3c50..e3863cb5 100755
--- a/.github/workflows/smoke/smoke-vagrant.sh
+++ b/.github/workflows/smoke/smoke-vagrant.sh
@@ -8,6 +8,8 @@ export VAGRANT_CWD="$PWD/vagrant-$1"
 
 mkdir -p logs
 
+NETWORK="nebula-smoke"
+
 cleanup() {
     echo
     echo " *** cleanup"
@@ -19,21 +21,51 @@ cleanup() {
         docker kill lighthouse1 host2
     fi
     vagrant destroy -f
+    docker network rm "$NETWORK" >/dev/null 2>&1
 }
 
 trap cleanup EXIT
 
+# Create a dedicated smoke network with an explicit subnet (required for --ip
+# below). Probe a short list of candidates so a locally-used range doesn't
+# fail the whole test — we only need one to be free.
+docker network rm "$NETWORK" >/dev/null 2>&1 || true
+for candidate in 172.30.0.0/24 172.31.0.0/24 10.98.0.0/24 10.99.0.0/24 192.168.230.0/24; do
+    if docker network create --subnet "$candidate" "$NETWORK" >/dev/null 2>&1; then
+        break
+    fi
+done
+if ! docker network inspect "$NETWORK" >/dev/null 2>&1; then
+    echo "failed to create $NETWORK: every candidate subnet is in use" >&2
+    exit 1
+fi
+
+# Derive container IPs from the network's assigned subnet. Slots: .2 lighthouse1,
+# .3 host2 — matches the placeholders in build.sh.
+SUBNET="$(docker network inspect -f '{{(index .IPAM.Config 0).Subnet}}' "$NETWORK")"
+PREFIX="${SUBNET%/*}"
+PREFIX="${PREFIX%.*}"
+LIGHTHOUSE_IP="$PREFIX.2"
+HOST2_IP="$PREFIX.3"
+
+# Sed the placeholder TEST-NET-3 IPs in the host configs to the real ones.
+# This must happen before `vagrant up` rsyncs build/ into the VM for host3.
+for f in build/host2.yml build/host3.yml; do
+    sed "s|203\.0\.113\.|$PREFIX.|g" "$f" >"$f.tmp"
+    mv "$f.tmp" "$f"
+done
+
 CONTAINER="nebula:${NAME:-smoke}"
 
 docker run --name lighthouse1 --rm "$CONTAINER" -config lighthouse1.yml -test
-docker run --name host2 --rm "$CONTAINER" -config host2.yml -test
+docker run --name host2 --rm -v "$PWD/build/host2.yml:/nebula/host2.yml:ro" "$CONTAINER" -config host2.yml -test
 
 vagrant up
 vagrant ssh -c "cd /nebula && /nebula/$1-nebula -config host3.yml -test" -- -T
 
-docker run --name lighthouse1 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config lighthouse1.yml 2>&1 | tee logs/lighthouse1 | sed -u 's/^/ [lighthouse1] /' &
+docker run --name lighthouse1 --network "$NETWORK" --ip "$LIGHTHOUSE_IP" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config lighthouse1.yml 2>&1 | tee logs/lighthouse1 | sed -u 's/^/ [lighthouse1] /' &
 sleep 1
-docker run --name host2 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config host2.yml 2>&1 | tee logs/host2 | sed -u 's/^/ [host2] /' &
+docker run --name host2 --network "$NETWORK" --ip "$HOST2_IP" -v "$PWD/build/host2.yml:/nebula/host2.yml:ro" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config host2.yml 2>&1 | tee logs/host2 | sed -u 's/^/ [host2] /' &
 sleep 1
 vagrant ssh -c "cd /nebula && sudo sh -c 'echo \$\$ >/nebula/pid && exec /nebula/$1-nebula -config host3.yml'" 2>&1 -- -T | tee logs/host3 | sed -u 's/^/ [host3] /' &
 sleep 15
@@ -96,7 +128,14 @@ vagrant ssh -c "ping -c1 192.168.100.2" -- -T
 vagrant ssh -c "sudo xargs kill
diff --git a/.github/workflows/smoke/smoke.sh b/.github/workflows/smoke/smoke.sh
--- a/.github/workflows/smoke/smoke.sh
+++ b/.github/workflows/smoke/smoke.sh
+    docker network rm "$NETWORK" >/dev/null 2>&1
 }
 
 trap cleanup EXIT
 
+# Create a dedicated smoke network with an explicit subnet (required for --ip
+# below). Probe a short list of candidates so a locally-used range doesn't
+# fail the whole test — we only need one to be free.
+docker network rm "$NETWORK" >/dev/null 2>&1 || true
+for candidate in 172.30.0.0/24 172.31.0.0/24 10.98.0.0/24 10.99.0.0/24 192.168.230.0/24; do
+    if docker network create --subnet "$candidate" "$NETWORK" >/dev/null 2>&1; then
+        break
+    fi
+done
+if ! docker network inspect "$NETWORK" >/dev/null 2>&1; then
+    echo "failed to create $NETWORK: every candidate subnet is in use" >&2
+    exit 1
+fi
+
+# Derive container IPs from the network's assigned subnet. Slots: .2 lighthouse1,
+# .3 host2, .4 host3, .5 host4 — matches the placeholders in build.sh.
+SUBNET="$(docker network inspect -f '{{(index .IPAM.Config 0).Subnet}}' "$NETWORK")"
+PREFIX="${SUBNET%/*}"
+PREFIX="${PREFIX%.*}"
+LIGHTHOUSE_IP="$PREFIX.2"
+HOST2_IP="$PREFIX.3"
+HOST3_IP="$PREFIX.4"
+HOST4_IP="$PREFIX.5"
+
+# Sed the placeholder TEST-NET-3 IPs in the host configs to the real ones.
+# build/lighthouse1.yml has no IPs to rewrite so it's skipped.
+for f in build/host2.yml build/host3.yml build/host4.yml; do
+    sed "s|203\.0\.113\.|$PREFIX.|g" "$f" >"$f.tmp"
+    mv "$f.tmp" "$f"
+done
+
 CONTAINER="nebula:${NAME:-smoke}"
 
 docker run --name lighthouse1 --rm "$CONTAINER" -config lighthouse1.yml -test
-docker run --name host2 --rm "$CONTAINER" -config host2.yml -test
-docker run --name host3 --rm "$CONTAINER" -config host3.yml -test
-docker run --name host4 --rm "$CONTAINER" -config host4.yml -test
+docker run --name host2 --rm -v "$PWD/build/host2.yml:/nebula/host2.yml:ro" "$CONTAINER" -config host2.yml -test
+docker run --name host3 --rm -v "$PWD/build/host3.yml:/nebula/host3.yml:ro" "$CONTAINER" -config host3.yml -test
+docker run --name host4 --rm -v "$PWD/build/host4.yml:/nebula/host4.yml:ro" "$CONTAINER" -config host4.yml -test
 
-docker run --name lighthouse1 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config lighthouse1.yml 2>&1 | tee logs/lighthouse1 | sed -u 's/^/ [lighthouse1] /' &
+docker run --name lighthouse1 --network "$NETWORK" --ip "$LIGHTHOUSE_IP" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config lighthouse1.yml 2>&1 | tee logs/lighthouse1 | sed -u 's/^/ [lighthouse1] /' &
 sleep 1
-docker run --name host2 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config host2.yml 2>&1 | tee logs/host2 | sed -u 's/^/ [host2] /' &
+docker run --name host2 --network "$NETWORK" --ip "$HOST2_IP" -v "$PWD/build/host2.yml:/nebula/host2.yml:ro" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config host2.yml 2>&1 | tee logs/host2 | sed -u 's/^/ [host2] /' &
 sleep 1
-docker run --name host3 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config host3.yml 2>&1 | tee logs/host3 | sed -u 's/^/ [host3] /' &
+docker run --name host3 --network "$NETWORK" --ip "$HOST3_IP" -v "$PWD/build/host3.yml:/nebula/host3.yml:ro" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config host3.yml 2>&1 | tee logs/host3 | sed -u 's/^/ [host3] /' &
 sleep 1
-docker run --name host4 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config host4.yml 2>&1 | tee logs/host4 | sed -u 's/^/ [host4] /' &
+docker run --name host4 --network "$NETWORK" --ip "$HOST4_IP" -v "$PWD/build/host4.yml:/nebula/host4.yml:ro" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config host4.yml 2>&1 | tee logs/host4 | sed -u 's/^/ [host4] /' &
 sleep 1
 
 # grab tcpdump pcaps for debugging
@@ -131,7 +165,13 @@ docker exec host4 sh -c 'kill 1'
 docker exec host3 sh -c 'kill 1'
 docker exec host2 sh -c 'kill 1'
 docker exec lighthouse1 sh -c 'kill 1'
-sleep 5
+
+# Wait up to 30s for all backgrounded jobs to exit rather than relying on a
+# fixed sleep.
+for _ in $(seq 1 30); do
+    [ -z "$(jobs -r)" ] && break
+    sleep 1
+done
 
 if [ "$(jobs -r)" ]
 then
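
As a standalone illustration (not part of the patch above), the sketch below shows the placeholder-rewrite technique these scripts rely on: take a subnet string like the one returned by docker network inspect, strip it down to the /24 prefix with shell parameter expansion, and sed the TEST-NET-3 placeholders into real addresses. The subnet value and the example-host2.yml file are made-up stand-ins for what docker network create and genconfig.sh would actually produce.

#!/bin/sh
# Illustrative sketch only; the subnet string and the config file below are
# assumptions made for the demo, not output of the real smoke scripts.
set -e

# Stand-in for: docker network inspect -f '{{(index .IPAM.Config 0).Subnet}}' "$NETWORK"
SUBNET="172.30.0.0/24"
PREFIX="${SUBNET%/*}"    # drop the mask       -> 172.30.0.0
PREFIX="${PREFIX%.*}"    # drop the last octet -> 172.30.0

# Stand-in for a generated config that still holds a TEST-NET-3 placeholder.
cat >example-host2.yml <<EOF
static_host_map:
  "192.168.100.1": ["203.0.113.2:4242"]
EOF

# Same substitution the smoke scripts run on build/host*.yml before docker run.
sed "s|203\.0\.113\.|$PREFIX.|g" example-host2.yml >example-host2.yml.tmp
mv example-host2.yml.tmp example-host2.yml

cat example-host2.yml    # the lighthouse entry now reads 172.30.0.2:4242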