Reduce relay log spam (#1733)

This commit is contained in:
Nate Brown
2026-05-22 10:19:06 -05:00
committed by GitHub
parent 72bad1603a
commit 873f94f465
3 changed files with 136 additions and 19 deletions
+2 -1
View File
@@ -83,6 +83,7 @@ type HandshakeHostInfo struct {
initiatingVersionOverride cert.Version // Should we use a non-default cert version for this handshake?
counter int64 // How many attempts have we made so far
lastRemotes []netip.AddrPort // Remotes that we sent to during the previous attempt
lastRelays []netip.Addr // Relays we attempted to use during the previous attempt
packetStore []*cachedPacket // A set of packets to be transmitted once the handshake completes
hostinfo *HostInfo
@@ -323,7 +324,7 @@ func (hm *HandshakeManager) handleOutbound(vpnIp netip.Addr, lighthouseTriggered
)
}
hm.f.relayManager.StartRelays(hm.f, vpnIp, hostinfo, stage0)
hm.f.relayManager.StartRelays(hm.f, vpnIp, hh, stage0)
// If a lighthouse triggered this attempt then we are still in the timer wheel and do not need to re-add
if !lighthouseTriggered {
+37 -18
View File
@@ -7,6 +7,7 @@ import (
"fmt"
"log/slog"
"net/netip"
"slices"
"sync/atomic"
"github.com/slackhq/nebula/cert"
@@ -57,14 +58,25 @@ func (rm *relayManager) GetUseRelays() bool {
// For each candidate relay it either kicks off a handshake to the relay, sends a CreateRelayRequest, retransmits
// one that may have been lost, or, once the relay is Established, forwards the in-progress
// stage 0 handshake packet for vpnIp through it.
func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *HostInfo, stage0 []byte) {
func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hh *HandshakeHostInfo, stage0 []byte) {
hostinfo := hh.hostinfo
if !rm.GetUseRelays() || len(hostinfo.remotes.relays) == 0 {
hh.lastRelays = nil
return
}
hostinfo.logger(rm.l).Info("Attempt to relay through hosts", "relays", hostinfo.remotes.relays)
relays := hostinfo.remotes.relays
listLevel := slog.LevelDebug
prior := hh.lastRelays
if !slices.Equal(relays, prior) {
listLevel = slog.LevelInfo
hh.lastRelays = slices.Clone(relays)
}
hl := hostinfo.logger(rm.l)
hl.Log(context.Background(), listLevel, "Attempt to relay through hosts", "relays", relays)
// Send a RelayRequest to all known relay IPs
for _, relay := range hostinfo.remotes.relays {
for _, relay := range relays {
// Don't relay through the host I'm trying to connect to
if relay == vpnIp {
continue
@@ -75,12 +87,19 @@ func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *Ho
continue
}
// Each relay's per-attempt log fires at Info when the relay was not part of the previous attempt's relay list, and at Debug when it was.
level := slog.LevelInfo
if slices.Contains(prior, relay) {
level = slog.LevelDebug
}
relayHostInfo := rm.hostmap.QueryVpnAddr(relay)
if relayHostInfo == nil || !relayHostInfo.remote.IsValid() {
hostinfo.logger(rm.l).Info("Establish tunnel to relay target", "relay", relay.String())
hl.Log(context.Background(), level, "Establish tunnel to relay target", "relay", relay.String())
f.Handshake(relay)
continue
}
// Check the relay HostInfo to see if we already established a relay through it
existingRelay, ok := relayHostInfo.relayState.QueryRelayForByIp(vpnIp)
if !ok {
@@ -88,7 +107,7 @@ func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *Ho
if relayHostInfo.remote.IsValid() {
idx, err := AddRelay(rm.l, relayHostInfo, rm.hostmap, vpnIp, nil, TerminalType, Requested)
if err != nil {
hostinfo.logger(rm.l).Info("Failed to add relay to hostmap", "relay", relay.String(), "error", err)
hl.Info("Failed to add relay to hostmap", "relay", relay.String(), "error", err)
}
m := NebulaControl{
@@ -99,12 +118,12 @@ func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *Ho
switch relayHostInfo.GetCert().Certificate.Version() {
case cert.Version1:
if !f.myVpnAddrs[0].Is4() {
hostinfo.logger(rm.l).Error("can not establish v1 relay with a v6 network because the relay is not running a current nebula version")
hl.Error("can not establish v1 relay with a v6 network because the relay is not running a current nebula version")
continue
}
if !vpnIp.Is4() {
hostinfo.logger(rm.l).Error("can not establish v1 relay with a v6 remote network because the relay is not running a current nebula version")
hl.Error("can not establish v1 relay with a v6 remote network because the relay is not running a current nebula version")
continue
}
@@ -116,16 +135,16 @@ func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *Ho
m.RelayFromAddr = netAddrToProtoAddr(f.myVpnAddrs[0])
m.RelayToAddr = netAddrToProtoAddr(vpnIp)
default:
hostinfo.logger(rm.l).Error("Unknown certificate version found while creating relay")
hl.Error("Unknown certificate version found while creating relay")
continue
}
msg, err := m.Marshal()
if err != nil {
hostinfo.logger(rm.l).Error("Failed to marshal Control message to create relay", "error", err)
hl.Error("Failed to marshal Control message to create relay", "error", err)
} else {
f.SendMessageToHostInfo(header.Control, 0, relayHostInfo, msg, make([]byte, 12), make([]byte, mtu))
rm.l.Info("send CreateRelayRequest",
rm.l.Log(context.Background(), level, "send CreateRelayRequest",
"relayFrom", f.myVpnAddrs[0],
"relayTo", vpnIp,
"initiatorRelayIndex", idx,
@@ -138,14 +157,14 @@ func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *Ho
switch existingRelay.State {
case Established:
hostinfo.logger(rm.l).Info("Send handshake via relay", "relay", relay.String())
hl.Log(context.Background(), level, "Send handshake via relay", "relay", relay.String())
f.SendVia(relayHostInfo, existingRelay, stage0, make([]byte, 12), make([]byte, mtu), false)
case Disestablished:
// Mark this relay as 'requested'
relayHostInfo.relayState.UpdateRelayForByIpState(vpnIp, Requested)
fallthrough
case Requested:
hostinfo.logger(rm.l).Info("Re-send CreateRelay request", "relay", relay.String())
hl.Log(context.Background(), level, "Re-send CreateRelay request", "relay", relay.String())
// Re-send the CreateRelay request, in case the previous one was lost.
m := NebulaControl{
Type: NebulaControl_CreateRelayRequest,
@@ -155,12 +174,12 @@ func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *Ho
switch relayHostInfo.GetCert().Certificate.Version() {
case cert.Version1:
if !f.myVpnAddrs[0].Is4() {
hostinfo.logger(rm.l).Error("can not establish v1 relay with a v6 network because the relay is not running a current nebula version")
hl.Error("can not establish v1 relay with a v6 network because the relay is not running a current nebula version")
continue
}
if !vpnIp.Is4() {
hostinfo.logger(rm.l).Error("can not establish v1 relay with a v6 remote network because the relay is not running a current nebula version")
hl.Error("can not establish v1 relay with a v6 remote network because the relay is not running a current nebula version")
continue
}
@@ -172,16 +191,16 @@ func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *Ho
m.RelayFromAddr = netAddrToProtoAddr(f.myVpnAddrs[0])
m.RelayToAddr = netAddrToProtoAddr(vpnIp)
default:
hostinfo.logger(rm.l).Error("Unknown certificate version found while creating relay")
hl.Error("Unknown certificate version found while creating relay")
continue
}
msg, err := m.Marshal()
if err != nil {
hostinfo.logger(rm.l).Error("Failed to marshal Control message to create relay", "error", err)
hl.Error("Failed to marshal Control message to create relay", "error", err)
} else {
// This must send over the hostinfo, not over hm.Hosts[ip]
f.SendMessageToHostInfo(header.Control, 0, relayHostInfo, msg, make([]byte, 12), make([]byte, mtu))
rm.l.Info("send CreateRelayRequest",
rm.l.Log(context.Background(), level, "send CreateRelayRequest",
"relayFrom", f.myVpnAddrs[0],
"relayTo", vpnIp,
"initiatorRelayIndex", existingRelay.LocalIndex,
@@ -192,7 +211,7 @@ func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *Ho
// PeerRequested only occurs in Forwarding relays, not Terminal relays, and this is a Terminal relay case.
fallthrough
default:
hostinfo.logger(rm.l).Error("Relay unexpected state",
hl.Error("Relay unexpected state",
"vpnIp", vpnIp,
"state", existingRelay.State,
"relay", relay,
+97
View File
@@ -0,0 +1,97 @@
package nebula
import (
"bytes"
"log/slog"
"net/netip"
"testing"
"github.com/gaissmai/bart"
"github.com/slackhq/nebula/test"
"github.com/stretchr/testify/assert"
)
// TestStartRelaysLogDedupe verifies that repeated attempts with the same relay set drop the log
// chatter to Debug, mirroring how the normal handshake retry loop quiets down once it's already
// announced its targets.
func TestStartRelaysLogDedupe(t *testing.T) {
vpnIp := netip.MustParseAddr("100.64.99.4")
otherRelay := netip.MustParseAddr("100.64.99.5")
newHH := func() *HandshakeHostInfo {
// Use the target's own vpnIp as the "relay" so the loop body skips it without
// touching any sender-side state. That isolates the test to the level-selection
// behavior of the top-level "Attempt to relay through hosts" log.
hostinfo := &HostInfo{
vpnAddrs: []netip.Addr{vpnIp},
localIndexId: 1,
remotes: NewRemoteList([]netip.Addr{vpnIp}, nil),
}
hostinfo.remotes.relays = []netip.Addr{vpnIp}
return &HandshakeHostInfo{hostinfo: hostinfo}
}
// Park any extra relay addresses we'll introduce mid-test in myVpnAddrsTable so the loop
// body always skips before touching f.Handshake (which would need a real handshakeManager).
addrTable := new(bart.Lite)
addrTable.Insert(netip.PrefixFrom(otherRelay, otherRelay.BitLen()))
f := &Interface{myVpnAddrsTable: addrTable}
newRM := func(buf *bytes.Buffer) *relayManager {
l := test.NewLoggerWithOutputAndLevel(buf, slog.LevelDebug)
rm := &relayManager{l: l, hostmap: newHostMap(l)}
rm.useRelays.Store(true)
return rm
}
const msg = `msg="Attempt to relay through hosts"`
t.Run("first attempt logs at Info", func(t *testing.T) {
var buf bytes.Buffer
rm := newRM(&buf)
hh := newHH()
rm.StartRelays(f, vpnIp, hh, nil)
assert.Equal(t, []netip.Addr{vpnIp}, hh.lastRelays, "lastRelays should record the relay set we just attempted")
assert.Contains(t, buf.String(), "level=INFO "+msg, "expected Info level on first attempt")
})
t.Run("repeat attempt with same relays drops to Debug", func(t *testing.T) {
var buf bytes.Buffer
rm := newRM(&buf)
hh := newHH()
rm.StartRelays(f, vpnIp, hh, nil)
first := append([]netip.Addr(nil), hh.lastRelays...)
buf.Reset()
rm.StartRelays(f, vpnIp, hh, nil)
assert.Equal(t, first, hh.lastRelays)
assert.Contains(t, buf.String(), "level=DEBUG "+msg, "expected Debug level on identical retry")
assert.NotContains(t, buf.String(), "level=INFO "+msg, "Info should not fire on identical retry")
})
t.Run("changed relay list bumps back to Info", func(t *testing.T) {
var buf bytes.Buffer
rm := newRM(&buf)
hh := newHH()
rm.StartRelays(f, vpnIp, hh, nil)
buf.Reset()
// The lighthouse handed us a new set this round.
hh.hostinfo.remotes.relays = []netip.Addr{vpnIp, otherRelay}
rm.StartRelays(f, vpnIp, hh, nil)
assert.Equal(t, []netip.Addr{vpnIp, otherRelay}, hh.lastRelays)
assert.Contains(t, buf.String(), "level=INFO "+msg, "expected Info when the relay list changes")
})
t.Run("disabled relays clears lastRelays and emits no Attempt log", func(t *testing.T) {
var buf bytes.Buffer
rm := newRM(&buf)
rm.useRelays.Store(false)
hh := newHH()
hh.lastRelays = []netip.Addr{vpnIp}
rm.StartRelays(f, vpnIp, hh, nil)
assert.Nil(t, hh.lastRelays, "with relays disabled lastRelays should be cleared")
assert.NotContains(t, buf.String(), msg, "should not log when we shortcut out")
})
}