diff --git a/handshake_manager.go b/handshake_manager.go index 87257028..d03814da 100644 --- a/handshake_manager.go +++ b/handshake_manager.go @@ -83,6 +83,7 @@ type HandshakeHostInfo struct { initiatingVersionOverride cert.Version // Should we use a non-default cert version for this handshake? counter int64 // How many attempts have we made so far lastRemotes []netip.AddrPort // Remotes that we sent to during the previous attempt + lastRelays []netip.Addr // Relays we attempted to use during the previous attempt packetStore []*cachedPacket // A set of packets to be transmitted once the handshake completes hostinfo *HostInfo @@ -323,7 +324,7 @@ func (hm *HandshakeManager) handleOutbound(vpnIp netip.Addr, lighthouseTriggered ) } - hm.f.relayManager.StartRelays(hm.f, vpnIp, hostinfo, stage0) + hm.f.relayManager.StartRelays(hm.f, vpnIp, hh, stage0) // If a lighthouse triggered this attempt then we are still in the timer wheel and do not need to re-add if !lighthouseTriggered { diff --git a/relay_manager.go b/relay_manager.go index 25e65871..1fd98963 100644 --- a/relay_manager.go +++ b/relay_manager.go @@ -7,6 +7,7 @@ import ( "fmt" "log/slog" "net/netip" + "slices" "sync/atomic" "github.com/slackhq/nebula/cert" @@ -57,14 +58,25 @@ func (rm *relayManager) GetUseRelays() bool { // For each candidate relay it either kicks off a handshake to the relay, sends a CreateRelayRequest, retransmits // one that may have been lost, or, once the relay is Established, forwards the in-progress // stage 0 handshake packet for vpnIp through it. -func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *HostInfo, stage0 []byte) { +func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hh *HandshakeHostInfo, stage0 []byte) { + hostinfo := hh.hostinfo if !rm.GetUseRelays() || len(hostinfo.remotes.relays) == 0 { + hh.lastRelays = nil return } - hostinfo.logger(rm.l).Info("Attempt to relay through hosts", "relays", hostinfo.remotes.relays) + relays := hostinfo.remotes.relays + listLevel := slog.LevelDebug + prior := hh.lastRelays + if !slices.Equal(relays, prior) { + listLevel = slog.LevelInfo + hh.lastRelays = slices.Clone(relays) + } + hl := hostinfo.logger(rm.l) + hl.Log(context.Background(), listLevel, "Attempt to relay through hosts", "relays", relays) + // Send a RelayRequest to all known Relay IP's - for _, relay := range hostinfo.remotes.relays { + for _, relay := range relays { // Don't relay through the host I'm trying to connect to if relay == vpnIp { continue @@ -75,12 +87,19 @@ func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *Ho continue } + // Each relay's per-attempt log fires at Info on the first time we hit it and Debug after that. + level := slog.LevelInfo + if slices.Contains(prior, relay) { + level = slog.LevelDebug + } + relayHostInfo := rm.hostmap.QueryVpnAddr(relay) if relayHostInfo == nil || !relayHostInfo.remote.IsValid() { - hostinfo.logger(rm.l).Info("Establish tunnel to relay target", "relay", relay.String()) + hl.Log(context.Background(), level, "Establish tunnel to relay target", "relay", relay.String()) f.Handshake(relay) continue } + // Check the relay HostInfo to see if we already established a relay through existingRelay, ok := relayHostInfo.relayState.QueryRelayForByIp(vpnIp) if !ok { @@ -88,7 +107,7 @@ func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *Ho if relayHostInfo.remote.IsValid() { idx, err := AddRelay(rm.l, relayHostInfo, rm.hostmap, vpnIp, nil, TerminalType, Requested) if err != nil { - hostinfo.logger(rm.l).Info("Failed to add relay to hostmap", "relay", relay.String(), "error", err) + hl.Info("Failed to add relay to hostmap", "relay", relay.String(), "error", err) } m := NebulaControl{ @@ -99,12 +118,12 @@ func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *Ho switch relayHostInfo.GetCert().Certificate.Version() { case cert.Version1: if !f.myVpnAddrs[0].Is4() { - hostinfo.logger(rm.l).Error("can not establish v1 relay with a v6 network because the relay is not running a current nebula version") + hl.Error("can not establish v1 relay with a v6 network because the relay is not running a current nebula version") continue } if !vpnIp.Is4() { - hostinfo.logger(rm.l).Error("can not establish v1 relay with a v6 remote network because the relay is not running a current nebula version") + hl.Error("can not establish v1 relay with a v6 remote network because the relay is not running a current nebula version") continue } @@ -116,16 +135,16 @@ func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *Ho m.RelayFromAddr = netAddrToProtoAddr(f.myVpnAddrs[0]) m.RelayToAddr = netAddrToProtoAddr(vpnIp) default: - hostinfo.logger(rm.l).Error("Unknown certificate version found while creating relay") + hl.Error("Unknown certificate version found while creating relay") continue } msg, err := m.Marshal() if err != nil { - hostinfo.logger(rm.l).Error("Failed to marshal Control message to create relay", "error", err) + hl.Error("Failed to marshal Control message to create relay", "error", err) } else { f.SendMessageToHostInfo(header.Control, 0, relayHostInfo, msg, make([]byte, 12), make([]byte, mtu)) - rm.l.Info("send CreateRelayRequest", + rm.l.Log(context.Background(), level, "send CreateRelayRequest", "relayFrom", f.myVpnAddrs[0], "relayTo", vpnIp, "initiatorRelayIndex", idx, @@ -138,14 +157,14 @@ func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *Ho switch existingRelay.State { case Established: - hostinfo.logger(rm.l).Info("Send handshake via relay", "relay", relay.String()) + hl.Log(context.Background(), level, "Send handshake via relay", "relay", relay.String()) f.SendVia(relayHostInfo, existingRelay, stage0, make([]byte, 12), make([]byte, mtu), false) case Disestablished: // Mark this relay as 'requested' relayHostInfo.relayState.UpdateRelayForByIpState(vpnIp, Requested) fallthrough case Requested: - hostinfo.logger(rm.l).Info("Re-send CreateRelay request", "relay", relay.String()) + hl.Log(context.Background(), level, "Re-send CreateRelay request", "relay", relay.String()) // Re-send the CreateRelay request, in case the previous one was lost. m := NebulaControl{ Type: NebulaControl_CreateRelayRequest, @@ -155,12 +174,12 @@ func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *Ho switch relayHostInfo.GetCert().Certificate.Version() { case cert.Version1: if !f.myVpnAddrs[0].Is4() { - hostinfo.logger(rm.l).Error("can not establish v1 relay with a v6 network because the relay is not running a current nebula version") + hl.Error("can not establish v1 relay with a v6 network because the relay is not running a current nebula version") continue } if !vpnIp.Is4() { - hostinfo.logger(rm.l).Error("can not establish v1 relay with a v6 remote network because the relay is not running a current nebula version") + hl.Error("can not establish v1 relay with a v6 remote network because the relay is not running a current nebula version") continue } @@ -172,16 +191,16 @@ func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *Ho m.RelayFromAddr = netAddrToProtoAddr(f.myVpnAddrs[0]) m.RelayToAddr = netAddrToProtoAddr(vpnIp) default: - hostinfo.logger(rm.l).Error("Unknown certificate version found while creating relay") + hl.Error("Unknown certificate version found while creating relay") continue } msg, err := m.Marshal() if err != nil { - hostinfo.logger(rm.l).Error("Failed to marshal Control message to create relay", "error", err) + hl.Error("Failed to marshal Control message to create relay", "error", err) } else { // This must send over the hostinfo, not over hm.Hosts[ip] f.SendMessageToHostInfo(header.Control, 0, relayHostInfo, msg, make([]byte, 12), make([]byte, mtu)) - rm.l.Info("send CreateRelayRequest", + rm.l.Log(context.Background(), level, "send CreateRelayRequest", "relayFrom", f.myVpnAddrs[0], "relayTo", vpnIp, "initiatorRelayIndex", existingRelay.LocalIndex, @@ -192,7 +211,7 @@ func (rm *relayManager) StartRelays(f *Interface, vpnIp netip.Addr, hostinfo *Ho // PeerRequested only occurs in Forwarding relays, not Terminal relays, and this is a Terminal relay case. fallthrough default: - hostinfo.logger(rm.l).Error("Relay unexpected state", + hl.Error("Relay unexpected state", "vpnIp", vpnIp, "state", existingRelay.State, "relay", relay, diff --git a/relay_manager_test.go b/relay_manager_test.go new file mode 100644 index 00000000..8da38940 --- /dev/null +++ b/relay_manager_test.go @@ -0,0 +1,97 @@ +package nebula + +import ( + "bytes" + "log/slog" + "net/netip" + "testing" + + "github.com/gaissmai/bart" + "github.com/slackhq/nebula/test" + "github.com/stretchr/testify/assert" +) + +// TestStartRelaysLogDedupe verifies that repeated attempts with the same relay set drop the log +// chatter to Debug, mirroring how the normal handshake retry loop quiets down once it's already +// announced its targets. +func TestStartRelaysLogDedupe(t *testing.T) { + vpnIp := netip.MustParseAddr("100.64.99.4") + otherRelay := netip.MustParseAddr("100.64.99.5") + + newHH := func() *HandshakeHostInfo { + // Use the target's own vpnIp as the "relay" so the loop body skips it without + // touching any sender-side state. That isolates the test to the level-selection + // behavior of the top-level "Attempt to relay through hosts" log. + hostinfo := &HostInfo{ + vpnAddrs: []netip.Addr{vpnIp}, + localIndexId: 1, + remotes: NewRemoteList([]netip.Addr{vpnIp}, nil), + } + hostinfo.remotes.relays = []netip.Addr{vpnIp} + return &HandshakeHostInfo{hostinfo: hostinfo} + } + + // Park any extra relay addresses we'll introduce mid-test in myVpnAddrsTable so the loop + // body always skips before touching f.Handshake (which would need a real handshakeManager). + addrTable := new(bart.Lite) + addrTable.Insert(netip.PrefixFrom(otherRelay, otherRelay.BitLen())) + f := &Interface{myVpnAddrsTable: addrTable} + + newRM := func(buf *bytes.Buffer) *relayManager { + l := test.NewLoggerWithOutputAndLevel(buf, slog.LevelDebug) + rm := &relayManager{l: l, hostmap: newHostMap(l)} + rm.useRelays.Store(true) + return rm + } + + const msg = `msg="Attempt to relay through hosts"` + + t.Run("first attempt logs at Info", func(t *testing.T) { + var buf bytes.Buffer + rm := newRM(&buf) + hh := newHH() + rm.StartRelays(f, vpnIp, hh, nil) + assert.Equal(t, []netip.Addr{vpnIp}, hh.lastRelays, "lastRelays should record the relay set we just attempted") + assert.Contains(t, buf.String(), "level=INFO "+msg, "expected Info level on first attempt") + }) + + t.Run("repeat attempt with same relays drops to Debug", func(t *testing.T) { + var buf bytes.Buffer + rm := newRM(&buf) + hh := newHH() + rm.StartRelays(f, vpnIp, hh, nil) + first := append([]netip.Addr(nil), hh.lastRelays...) + buf.Reset() + rm.StartRelays(f, vpnIp, hh, nil) + assert.Equal(t, first, hh.lastRelays) + assert.Contains(t, buf.String(), "level=DEBUG "+msg, "expected Debug level on identical retry") + assert.NotContains(t, buf.String(), "level=INFO "+msg, "Info should not fire on identical retry") + }) + + t.Run("changed relay list bumps back to Info", func(t *testing.T) { + var buf bytes.Buffer + rm := newRM(&buf) + hh := newHH() + rm.StartRelays(f, vpnIp, hh, nil) + buf.Reset() + + // The lighthouse handed us a new set this round. + hh.hostinfo.remotes.relays = []netip.Addr{vpnIp, otherRelay} + + rm.StartRelays(f, vpnIp, hh, nil) + assert.Equal(t, []netip.Addr{vpnIp, otherRelay}, hh.lastRelays) + assert.Contains(t, buf.String(), "level=INFO "+msg, "expected Info when the relay list changes") + }) + + t.Run("disabled relays clears lastRelays and emits no Attempt log", func(t *testing.T) { + var buf bytes.Buffer + rm := newRM(&buf) + rm.useRelays.Store(false) + hh := newHH() + hh.lastRelays = []netip.Addr{vpnIp} + + rm.StartRelays(f, vpnIp, hh, nil) + assert.Nil(t, hh.lastRelays, "with relays disabled lastRelays should be cleared") + assert.NotContains(t, buf.String(), msg, "should not log when we shortcut out") + }) +}