Try to make smoke less flakey (#1663)
Some checks failed
gofmt / Run gofmt (push) Failing after 3s
smoke-extra / Run extra smoke tests (push) Failing after 2s
smoke / Run multi node smoke test (push) Failing after 3s
Build and test / Build all and test on ubuntu-linux (push) Failing after 3s
Build and test / Build and test on linux with boringcrypto (push) Failing after 2s
Build and test / Build and test on linux with pkcs11 (push) Failing after 3s
Build and test / Build and test on macos-latest (push) Has been cancelled
Build and test / Build and test on windows-latest (push) Has been cancelled

This commit is contained in:
Nate Brown
2026-04-20 16:38:14 -05:00
committed by GitHub
parent e80b9830a3
commit 3d34cc9b74
5 changed files with 156 additions and 30 deletions

View File

@@ -16,8 +16,10 @@ relay:
am_relay: true
EOF
export LIGHTHOUSES="192.168.100.1 172.17.0.2:4242"
export REMOTE_ALLOW_LIST='{"172.17.0.4/32": false, "172.17.0.5/32": false}'
# TEST-NET-3 (RFC 5737) placeholder IPs; smoke-relay.sh rewrites them to the
# real container IPs. Mapping by last octet: .2 lighthouse1, .3 host2,
# .4 host3, .5 host4.
export LIGHTHOUSES="192.168.100.1 203.0.113.2:4242"
export REMOTE_ALLOW_LIST='{"203.0.113.4/32": false, "203.0.113.5/32": false}'
HOST="host2" ../genconfig.sh >host2.yml <<EOF
relay:
@@ -25,7 +27,7 @@ relay:
- 192.168.100.1
EOF
export REMOTE_ALLOW_LIST='{"172.17.0.3/32": false}'
export REMOTE_ALLOW_LIST='{"203.0.113.3/32": false}'
HOST="host3" ../genconfig.sh >host3.yml

View File

@@ -5,9 +5,15 @@ set -e -x
rm -rf ./build
mkdir ./build
# TODO: Assumes your docker bridge network is a /24, and the first container that launches will be .1
# - We could make this better by launching the lighthouse first and then fetching what IP it is.
NET="$(docker network inspect bridge -f '{{ range .IPAM.Config }}{{ .Subnet }}{{ end }}' | cut -d. -f1-3)"
# Smoke containers run on a dedicated docker network whose subnet is allocated
# at smoke time, not known at build time. Configs are written with TEST-NET-3
# placeholder IPs (RFC 5737) and smoke.sh / smoke-vagrant.sh / smoke-relay.sh
# sed the real container IPs in before starting nebula.
#
# Placeholder mapping (last octet == fixed container slot):
# 203.0.113.2 -> lighthouse1, 203.0.113.3 -> host2,
# 203.0.113.4 -> host3, 203.0.113.5 -> host4.
LIGHTHOUSE_IP="203.0.113.2"
(
cd build
@@ -25,16 +31,16 @@ NET="$(docker network inspect bridge -f '{{ range .IPAM.Config }}{{ .Subnet }}{{
../genconfig.sh >lighthouse1.yml
HOST="host2" \
LIGHTHOUSES="192.168.100.1 $NET.2:4242" \
LIGHTHOUSES="192.168.100.1 $LIGHTHOUSE_IP:4242" \
../genconfig.sh >host2.yml
HOST="host3" \
LIGHTHOUSES="192.168.100.1 $NET.2:4242" \
LIGHTHOUSES="192.168.100.1 $LIGHTHOUSE_IP:4242" \
INBOUND='[{"port": "any", "proto": "icmp", "group": "lighthouse"}]' \
../genconfig.sh >host3.yml
HOST="host4" \
LIGHTHOUSES="192.168.100.1 $NET.2:4242" \
LIGHTHOUSES="192.168.100.1 $LIGHTHOUSE_IP:4242" \
OUTBOUND='[{"port": "any", "proto": "icmp", "group": "lighthouse"}]' \
../genconfig.sh >host4.yml

View File

@@ -6,6 +6,8 @@ set -o pipefail
mkdir -p logs
NETWORK="nebula-smoke-relay"
cleanup() {
echo
echo " *** cleanup"
@@ -16,22 +18,53 @@ cleanup() {
then
docker kill lighthouse1 host2 host3 host4
fi
docker network rm "$NETWORK" >/dev/null 2>&1
}
trap cleanup EXIT
docker run --name lighthouse1 --rm nebula:smoke-relay -config lighthouse1.yml -test
docker run --name host2 --rm nebula:smoke-relay -config host2.yml -test
docker run --name host3 --rm nebula:smoke-relay -config host3.yml -test
docker run --name host4 --rm nebula:smoke-relay -config host4.yml -test
# Create a dedicated smoke network with an explicit subnet (required for --ip
# below). Probe a short list of candidates so a locally-used range doesn't
# fail the whole test — we only need one to be free.
docker network rm "$NETWORK" >/dev/null 2>&1 || true
for candidate in 172.30.0.0/24 172.31.0.0/24 10.98.0.0/24 10.99.0.0/24 192.168.230.0/24; do
if docker network create --subnet "$candidate" "$NETWORK" >/dev/null 2>&1; then
break
fi
done
if ! docker network inspect "$NETWORK" >/dev/null 2>&1; then
echo "failed to create $NETWORK: every candidate subnet is in use" >&2
exit 1
fi
docker run --name lighthouse1 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm nebula:smoke-relay -config lighthouse1.yml 2>&1 | tee logs/lighthouse1 | sed -u 's/^/ [lighthouse1] /' &
# Derive container IPs from the network's assigned subnet. Slots: .2 lighthouse1,
# .3 host2, .4 host3, .5 host4 — matches the placeholders in build-relay.sh.
SUBNET="$(docker network inspect -f '{{(index .IPAM.Config 0).Subnet}}' "$NETWORK")"
PREFIX="${SUBNET%/*}"
PREFIX="${PREFIX%.*}"
LIGHTHOUSE_IP="$PREFIX.2"
HOST2_IP="$PREFIX.3"
HOST3_IP="$PREFIX.4"
HOST4_IP="$PREFIX.5"
# Rewrite the TEST-NET-3 placeholder prefix (203.0.113.) in the host configs to
# the real network's prefix; the last octet (container slot) is left unchanged.
for f in build/host2.yml build/host3.yml build/host4.yml; do
sed "s|203\.0\.113\.|$PREFIX.|g" "$f" >"$f.tmp"
mv "$f.tmp" "$f"
done
docker run --name lighthouse1 --rm nebula:smoke-relay -config lighthouse1.yml -test
docker run --name host2 --rm -v "$PWD/build/host2.yml:/nebula/host2.yml:ro" nebula:smoke-relay -config host2.yml -test
docker run --name host3 --rm -v "$PWD/build/host3.yml:/nebula/host3.yml:ro" nebula:smoke-relay -config host3.yml -test
docker run --name host4 --rm -v "$PWD/build/host4.yml:/nebula/host4.yml:ro" nebula:smoke-relay -config host4.yml -test
docker run --name lighthouse1 --network "$NETWORK" --ip "$LIGHTHOUSE_IP" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm nebula:smoke-relay -config lighthouse1.yml 2>&1 | tee logs/lighthouse1 | sed -u 's/^/ [lighthouse1] /' &
sleep 1
docker run --name host2 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm nebula:smoke-relay -config host2.yml 2>&1 | tee logs/host2 | sed -u 's/^/ [host2] /' &
docker run --name host2 --network "$NETWORK" --ip "$HOST2_IP" -v "$PWD/build/host2.yml:/nebula/host2.yml:ro" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm nebula:smoke-relay -config host2.yml 2>&1 | tee logs/host2 | sed -u 's/^/ [host2] /' &
sleep 1
docker run --name host3 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm nebula:smoke-relay -config host3.yml 2>&1 | tee logs/host3 | sed -u 's/^/ [host3] /' &
docker run --name host3 --network "$NETWORK" --ip "$HOST3_IP" -v "$PWD/build/host3.yml:/nebula/host3.yml:ro" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm nebula:smoke-relay -config host3.yml 2>&1 | tee logs/host3 | sed -u 's/^/ [host3] /' &
sleep 1
docker run --name host4 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm nebula:smoke-relay -config host4.yml 2>&1 | tee logs/host4 | sed -u 's/^/ [host4] /' &
docker run --name host4 --network "$NETWORK" --ip "$HOST4_IP" -v "$PWD/build/host4.yml:/nebula/host4.yml:ro" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm nebula:smoke-relay -config host4.yml 2>&1 | tee logs/host4 | sed -u 's/^/ [host4] /' &
sleep 1
set +x
@@ -76,7 +109,13 @@ docker exec host4 sh -c 'kill 1'
docker exec host3 sh -c 'kill 1'
docker exec host2 sh -c 'kill 1'
docker exec lighthouse1 sh -c 'kill 1'
sleep 5
# Wait up to 30s for all backgrounded jobs to exit rather than relying on a
# fixed sleep.
for _ in $(seq 1 30); do
[ -z "$(jobs -r)" ] && break
sleep 1
done
if [ "$(jobs -r)" ]
then

View File

@@ -8,6 +8,8 @@ export VAGRANT_CWD="$PWD/vagrant-$1"
mkdir -p logs
NETWORK="nebula-smoke"
cleanup() {
echo
echo " *** cleanup"
@@ -19,21 +21,51 @@ cleanup() {
docker kill lighthouse1 host2
fi
vagrant destroy -f
docker network rm "$NETWORK" >/dev/null 2>&1
}
trap cleanup EXIT
# Create a dedicated smoke network with an explicit subnet (required for --ip
# below). Probe a short list of candidates so a locally-used range doesn't
# fail the whole test — we only need one to be free.
docker network rm "$NETWORK" >/dev/null 2>&1 || true
for candidate in 172.30.0.0/24 172.31.0.0/24 10.98.0.0/24 10.99.0.0/24 192.168.230.0/24; do
if docker network create --subnet "$candidate" "$NETWORK" >/dev/null 2>&1; then
break
fi
done
if ! docker network inspect "$NETWORK" >/dev/null 2>&1; then
echo "failed to create $NETWORK: every candidate subnet is in use" >&2
exit 1
fi
# Derive container IPs from the network's assigned subnet. Slots: .2 lighthouse1,
# .3 host2 — matches the placeholders in build.sh.
SUBNET="$(docker network inspect -f '{{(index .IPAM.Config 0).Subnet}}' "$NETWORK")"
PREFIX="${SUBNET%/*}"
PREFIX="${PREFIX%.*}"
LIGHTHOUSE_IP="$PREFIX.2"
HOST2_IP="$PREFIX.3"
# Rewrite the TEST-NET-3 placeholder prefix (203.0.113.) in the host configs to
# the real network's prefix; the last octet (container slot) is left unchanged.
# This must happen before `vagrant up` rsyncs build/ into the VM for host3.
for f in build/host2.yml build/host3.yml; do
sed "s|203\.0\.113\.|$PREFIX.|g" "$f" >"$f.tmp"
mv "$f.tmp" "$f"
done
CONTAINER="nebula:${NAME:-smoke}"
docker run --name lighthouse1 --rm "$CONTAINER" -config lighthouse1.yml -test
docker run --name host2 --rm "$CONTAINER" -config host2.yml -test
docker run --name host2 --rm -v "$PWD/build/host2.yml:/nebula/host2.yml:ro" "$CONTAINER" -config host2.yml -test
vagrant up
vagrant ssh -c "cd /nebula && /nebula/$1-nebula -config host3.yml -test" -- -T
docker run --name lighthouse1 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config lighthouse1.yml 2>&1 | tee logs/lighthouse1 | sed -u 's/^/ [lighthouse1] /' &
docker run --name lighthouse1 --network "$NETWORK" --ip "$LIGHTHOUSE_IP" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config lighthouse1.yml 2>&1 | tee logs/lighthouse1 | sed -u 's/^/ [lighthouse1] /' &
sleep 1
docker run --name host2 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config host2.yml 2>&1 | tee logs/host2 | sed -u 's/^/ [host2] /' &
docker run --name host2 --network "$NETWORK" --ip "$HOST2_IP" -v "$PWD/build/host2.yml:/nebula/host2.yml:ro" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config host2.yml 2>&1 | tee logs/host2 | sed -u 's/^/ [host2] /' &
sleep 1
vagrant ssh -c "cd /nebula && sudo sh -c 'echo \$\$ >/nebula/pid && exec /nebula/$1-nebula -config host3.yml'" 2>&1 -- -T | tee logs/host3 | sed -u 's/^/ [host3] /' &
sleep 15
@@ -96,7 +128,14 @@ vagrant ssh -c "ping -c1 192.168.100.2" -- -T
vagrant ssh -c "sudo xargs kill </nebula/pid" -- -T
docker exec host2 sh -c 'kill 1'
docker exec lighthouse1 sh -c 'kill 1'
sleep 1
# Wait up to 30s for all backgrounded jobs to exit. vagrant ssh in particular
# takes a beat to tear down after nebula exits on the VM, so a fixed sleep is
# racy.
for _ in $(seq 1 30); do
[ -z "$(jobs -r)" ] && break
sleep 1
done
if [ "$(jobs -r)" ]
then

View File

@@ -6,6 +6,8 @@ set -o pipefail
mkdir -p logs
NETWORK="nebula-smoke"
cleanup() {
echo
echo " *** cleanup"
@@ -16,24 +18,56 @@ cleanup() {
then
docker kill lighthouse1 host2 host3 host4
fi
docker network rm "$NETWORK" >/dev/null 2>&1
}
trap cleanup EXIT
# Create a dedicated smoke network with an explicit subnet (required for --ip
# below). Probe a short list of candidates so a locally-used range doesn't
# fail the whole test — we only need one to be free.
docker network rm "$NETWORK" >/dev/null 2>&1 || true
for candidate in 172.30.0.0/24 172.31.0.0/24 10.98.0.0/24 10.99.0.0/24 192.168.230.0/24; do
if docker network create --subnet "$candidate" "$NETWORK" >/dev/null 2>&1; then
break
fi
done
if ! docker network inspect "$NETWORK" >/dev/null 2>&1; then
echo "failed to create $NETWORK: every candidate subnet is in use" >&2
exit 1
fi
# Derive container IPs from the network's assigned subnet. Slots: .2 lighthouse1,
# .3 host2, .4 host3, .5 host4 — matches the placeholders in build.sh.
SUBNET="$(docker network inspect -f '{{(index .IPAM.Config 0).Subnet}}' "$NETWORK")"
PREFIX="${SUBNET%/*}"
PREFIX="${PREFIX%.*}"
LIGHTHOUSE_IP="$PREFIX.2"
HOST2_IP="$PREFIX.3"
HOST3_IP="$PREFIX.4"
HOST4_IP="$PREFIX.5"
# Rewrite the TEST-NET-3 placeholder prefix (203.0.113.) in the host configs to
# the real network's prefix; the last octet (container slot) is left unchanged.
# build/lighthouse1.yml has no IPs to rewrite so it's skipped.
for f in build/host2.yml build/host3.yml build/host4.yml; do
sed "s|203\.0\.113\.|$PREFIX.|g" "$f" >"$f.tmp"
mv "$f.tmp" "$f"
done
CONTAINER="nebula:${NAME:-smoke}"
docker run --name lighthouse1 --rm "$CONTAINER" -config lighthouse1.yml -test
docker run --name host2 --rm "$CONTAINER" -config host2.yml -test
docker run --name host3 --rm "$CONTAINER" -config host3.yml -test
docker run --name host4 --rm "$CONTAINER" -config host4.yml -test
docker run --name host2 --rm -v "$PWD/build/host2.yml:/nebula/host2.yml:ro" "$CONTAINER" -config host2.yml -test
docker run --name host3 --rm -v "$PWD/build/host3.yml:/nebula/host3.yml:ro" "$CONTAINER" -config host3.yml -test
docker run --name host4 --rm -v "$PWD/build/host4.yml:/nebula/host4.yml:ro" "$CONTAINER" -config host4.yml -test
docker run --name lighthouse1 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config lighthouse1.yml 2>&1 | tee logs/lighthouse1 | sed -u 's/^/ [lighthouse1] /' &
docker run --name lighthouse1 --network "$NETWORK" --ip "$LIGHTHOUSE_IP" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config lighthouse1.yml 2>&1 | tee logs/lighthouse1 | sed -u 's/^/ [lighthouse1] /' &
sleep 1
docker run --name host2 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config host2.yml 2>&1 | tee logs/host2 | sed -u 's/^/ [host2] /' &
docker run --name host2 --network "$NETWORK" --ip "$HOST2_IP" -v "$PWD/build/host2.yml:/nebula/host2.yml:ro" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config host2.yml 2>&1 | tee logs/host2 | sed -u 's/^/ [host2] /' &
sleep 1
docker run --name host3 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config host3.yml 2>&1 | tee logs/host3 | sed -u 's/^/ [host3] /' &
docker run --name host3 --network "$NETWORK" --ip "$HOST3_IP" -v "$PWD/build/host3.yml:/nebula/host3.yml:ro" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config host3.yml 2>&1 | tee logs/host3 | sed -u 's/^/ [host3] /' &
sleep 1
docker run --name host4 --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config host4.yml 2>&1 | tee logs/host4 | sed -u 's/^/ [host4] /' &
docker run --name host4 --network "$NETWORK" --ip "$HOST4_IP" -v "$PWD/build/host4.yml:/nebula/host4.yml:ro" --device /dev/net/tun:/dev/net/tun --cap-add NET_ADMIN --rm "$CONTAINER" -config host4.yml 2>&1 | tee logs/host4 | sed -u 's/^/ [host4] /' &
sleep 1
# grab tcpdump pcaps for debugging
@@ -131,7 +165,13 @@ docker exec host4 sh -c 'kill 1'
docker exec host3 sh -c 'kill 1'
docker exec host2 sh -c 'kill 1'
docker exec lighthouse1 sh -c 'kill 1'
sleep 5
# Wait up to 30s for all backgrounded jobs to exit rather than relying on a
# fixed sleep.
for _ in $(seq 1 30); do
[ -z "$(jobs -r)" ] && break
sleep 1
done
if [ "$(jobs -r)" ]
then