From 16a836a73f4cd0dbd04119d4486dd89f0dd9ba24 Mon Sep 17 00:00:00 2001 From: Nate Brown Date: Tue, 5 May 2026 17:05:50 -0500 Subject: [PATCH] PMTUD exploration, start small then grow --- connection_manager.go | 11 +- connection_manager_test.go | 6 + control.go | 4 + header/header.go | 12 +- interface.go | 4 + main.go | 3 + outside.go | 13 +- overlay/device.go | 9 + overlay/overlaytest/noop.go | 8 + overlay/tun_android.go | 8 + overlay/tun_darwin.go | 8 + overlay/tun_disabled.go | 8 + overlay/tun_freebsd.go | 8 + overlay/tun_ios.go | 8 + overlay/tun_linux.go | 68 +++- overlay/tun_netbsd.go | 8 + overlay/tun_openbsd.go | 8 + overlay/tun_tester.go | 8 + overlay/tun_windows.go | 8 + overlay/user.go | 8 + pmtud_manager.go | 623 ++++++++++++++++++++++++++++++++++++ udp/conn.go | 9 + udp/udp_android.go | 14 + udp/udp_bsd.go | 5 + udp/udp_darwin.go | 11 + udp/udp_freebsd.go | 25 ++ udp/udp_generic.go | 38 +++ udp/udp_linux.go | 30 ++ udp/udp_netbsd.go | 15 + udp/udp_openbsd.go | 23 ++ udp/udp_rio_windows.go | 6 + udp/udp_tester.go | 4 + udp/udp_windows.go | 26 ++ 33 files changed, 1036 insertions(+), 11 deletions(-) create mode 100644 pmtud_manager.go create mode 100644 udp/udp_freebsd.go create mode 100644 udp/udp_openbsd.go diff --git a/connection_manager.go b/connection_manager.go index e7fc04cd..85206e8b 100644 --- a/connection_manager.go +++ b/connection_manager.go @@ -145,6 +145,7 @@ func (cm *connectionManager) getAndResetTrafficCheck(h *HostInfo, now time.Time) func (cm *connectionManager) AddTrafficWatch(h *HostInfo) { if h.out.Swap(true) == false { cm.trafficTimer.Add(h.localIndexId, cm.checkInterval) + cm.intf.pmtudManager.OnTunnelUp(h) } } @@ -180,6 +181,7 @@ func (cm *connectionManager) doTrafficCheck(localIndex uint32, p, nb, out []byte switch decision { case deleteTunnel: + cm.intf.pmtudManager.OnTunnelDown(hostinfo) if cm.hostMap.DeleteHostInfo(hostinfo) { // Only clearing the lighthouse cache if this is the last hostinfo for this vpn ip in the hostmap 
cm.intf.lightHouse.DeleteVpnAddrs(hostinfo.vpnAddrs) @@ -199,7 +201,14 @@ func (cm *connectionManager) doTrafficCheck(localIndex uint32, p, nb, out []byte cm.tryRehandshake(hostinfo) case sendTestPacket: - cm.intf.SendMessageToHostInfo(header.Test, header.TestRequest, hostinfo, p, nb, out) + // Defer to pmtud if it has a confirmed PMTU > floor for this peer: + // the probe at the confirmed size verifies both liveness AND that + // the discovered PMTU still fits, so we don't burn a separate test + // packet on top of it. If pmtud declines (disabled, peer unsupported, + // or no confirmed size yet) we fall back to the regular test. + if !cm.intf.pmtudManager.MaybeProbeAsTest(hostinfo) { + cm.intf.SendMessageToHostInfo(header.Test, header.TestRequest, hostinfo, p, nb, out) + } } cm.resetRelayTrafficCheck(hostinfo) diff --git a/connection_manager_test.go b/connection_manager_test.go index 7dc08a45..aa41c4ec 100644 --- a/connection_manager_test.go +++ b/connection_manager_test.go @@ -67,6 +67,8 @@ func Test_NewConnectionManagerTest(t *testing.T) { punchy := NewPunchyFromConfig(test.NewLogger(), conf) nc := newConnectionManagerFromConfig(test.NewLogger(), conf, hostMap, punchy) nc.intf = ifce + ifce.pmtudManager = newPMTUDManagerFromConfig(test.NewLogger(), conf, ifce.inside) + ifce.pmtudManager.intf = ifce p := []byte("") nb := make([]byte, 12, 12) out := make([]byte, mtu) @@ -149,6 +151,8 @@ func Test_NewConnectionManagerTest2(t *testing.T) { punchy := NewPunchyFromConfig(test.NewLogger(), conf) nc := newConnectionManagerFromConfig(test.NewLogger(), conf, hostMap, punchy) nc.intf = ifce + ifce.pmtudManager = newPMTUDManagerFromConfig(test.NewLogger(), conf, ifce.inside) + ifce.pmtudManager.intf = ifce p := []byte("") nb := make([]byte, 12, 12) out := make([]byte, mtu) @@ -361,6 +365,8 @@ func Test_NewConnectionManagerTest_DisconnectInvalid(t *testing.T) { punchy := NewPunchyFromConfig(test.NewLogger(), conf) nc := newConnectionManagerFromConfig(test.NewLogger(), conf, 
hostMap, punchy) nc.intf = ifce + ifce.pmtudManager = newPMTUDManagerFromConfig(test.NewLogger(), conf, ifce.inside) + ifce.pmtudManager.intf = ifce ifce.connectionManager = nc hostinfo := &HostInfo{ diff --git a/control.go b/control.go index ef58988b..6f43c935 100644 --- a/control.go +++ b/control.go @@ -54,6 +54,7 @@ type Control struct { dnsStart func() lighthouseStart func() connectionManagerStart func(context.Context) + pmtudManagerStart func(context.Context) } type ControlHostInfo struct { @@ -107,6 +108,9 @@ func (c *Control) Start() (func() error, error) { if c.connectionManagerStart != nil { go c.connectionManagerStart(c.ctx) } + if c.pmtudManagerStart != nil { + go c.pmtudManagerStart(c.ctx) + } if c.lighthouseStart != nil { c.lighthouseStart() } diff --git a/header/header.go b/header/header.go index f22509b8..3556e1c0 100644 --- a/header/header.go +++ b/header/header.go @@ -55,8 +55,10 @@ const ( ) const ( - TestRequest MessageSubType = 0 - TestReply MessageSubType = 1 + TestRequest MessageSubType = 0 + TestReply MessageSubType = 1 + MTUDProbeRequest MessageSubType = 2 + MTUDProbeReply MessageSubType = 3 ) const ( @@ -67,8 +69,10 @@ const ( var ErrHeaderTooShort = errors.New("header is too short") var subTypeTestMap = map[MessageSubType]string{ - TestRequest: "testRequest", - TestReply: "testReply", + TestRequest: "testRequest", + TestReply: "testReply", + MTUDProbeRequest: "mtudProbeRequest", + MTUDProbeReply: "mtudProbeReply", } var subTypeNoneMap = map[MessageSubType]string{0: "none"} diff --git a/interface.go b/interface.go index 5fedcdd3..512a91b8 100644 --- a/interface.go +++ b/interface.go @@ -34,6 +34,7 @@ type InterfaceConfig struct { HandshakeManager *HandshakeManager lightHouse *LightHouse connectionManager *connectionManager + pmtudManager *pmtudManager DropLocalBroadcast bool DropMulticast bool routines int @@ -57,6 +58,7 @@ type Interface struct { pki *PKI firewall *Firewall connectionManager *connectionManager + pmtudManager *pmtudManager 
handshakeManager *HandshakeManager dnsServer *dnsServer createTime time.Time @@ -195,6 +197,7 @@ func NewInterface(ctx context.Context, c *InterfaceConfig) (*Interface, error) { myBroadcastAddrsTable: cs.myVpnBroadcastAddrsTable, relayManager: c.relayManager, connectionManager: c.connectionManager, + pmtudManager: c.pmtudManager, conntrackCacheTimeout: c.ConntrackCacheTimeout, metricHandshakes: metrics.GetOrRegisterHistogram("handshakes", nil, metrics.NewExpDecaySample(1028, 0.015)), @@ -212,6 +215,7 @@ func NewInterface(ctx context.Context, c *InterfaceConfig) (*Interface, error) { ifce.reQueryWait.Store(int64(c.reQueryWait)) ifce.connectionManager.intf = ifce + ifce.pmtudManager.intf = ifce return ifce, nil } diff --git a/main.go b/main.go index d5e5dcc8..d58c2674 100644 --- a/main.go +++ b/main.go @@ -172,6 +172,7 @@ func Main(c *config.C, configTest bool, buildVersion string, l *slog.Logger, dev hostMap := NewHostMapFromConfig(l, c) punchy := NewPunchyFromConfig(l, c) connManager := newConnectionManagerFromConfig(l, c, hostMap, punchy) + pmtudMgr := newPMTUDManagerFromConfig(l, c, tun) lightHouse, err := NewLightHouseFromConfig(ctx, l, c, pki.getCertState(), udpConns[0], punchy) if err != nil { return nil, util.ContextualizeIfNeeded("Failed to initialize lighthouse handler", err) @@ -208,6 +209,7 @@ func Main(c *config.C, configTest bool, buildVersion string, l *slog.Logger, dev DnsServer: ds, HandshakeManager: handshakeManager, connectionManager: connManager, + pmtudManager: pmtudMgr, lightHouse: lightHouse, tryPromoteEvery: c.GetUint32("counters.try_promote", defaultPromoteEvery), reQueryEvery: c.GetUint32("counters.requery_every_packets", defaultReQueryEvery), @@ -266,6 +268,7 @@ func Main(c *config.C, configTest bool, buildVersion string, l *slog.Logger, dev dnsStart: ds.Start, lighthouseStart: lightHouse.StartUpdateWorker, connectionManagerStart: connManager.Start, + pmtudManagerStart: pmtudMgr.Start, }, nil } diff --git a/outside.go b/outside.go index 
1e00a0a9..3c7ec2f4 100644 --- a/outside.go +++ b/outside.go @@ -183,11 +183,20 @@ func (f *Interface) readOutsidePackets(via ViaSender, out []byte, packet []byte, return } - if h.Subtype == header.TestRequest { + switch h.Subtype { + case header.TestRequest: // This testRequest might be from TryPromoteBest, so we should roam // to the new IP address before responding f.handleHostRoaming(hostinfo, via) f.send(header.Test, header.TestReply, ci, hostinfo, d, nb, out) + case header.MTUDProbeRequest: + // Reply with just the 8-byte ack header so the reverse path doesn't have to + // carry the full probe size; we only verify the forward direction. + if len(d) >= 8 { + f.send(header.Test, header.MTUDProbeReply, ci, hostinfo, d[:8], nb, out) + } + case header.MTUDProbeReply: + f.pmtudManager.HandleReply(hostinfo.localIndexId, d) } // Fallthrough to the bottom to record incoming traffic @@ -257,6 +266,7 @@ func (f *Interface) readOutsidePackets(via ViaSender, out []byte, packet []byte, // closeTunnel closes a tunnel locally, it does not send a closeTunnel packet to the remote func (f *Interface) closeTunnel(hostInfo *HostInfo) { + f.pmtudManager.OnTunnelDown(hostInfo) final := f.hostMap.DeleteHostInfo(hostInfo) if final { // We no longer have any tunnels with this vpn addr, clear learned lighthouse state to lower memory usage @@ -296,6 +306,7 @@ func (f *Interface) handleHostRoaming(hostinfo *HostInfo, via ViaSender) { hostinfo.lastRoam = time.Now() hostinfo.lastRoamRemote = hostinfo.remote hostinfo.SetRemote(via.UdpAddr) + f.pmtudManager.OnRoam(hostinfo) } } diff --git a/overlay/device.go b/overlay/device.go index b6077aba..26060bc0 100644 --- a/overlay/device.go +++ b/overlay/device.go @@ -15,4 +15,13 @@ type Device interface { RoutesFor(netip.Addr) routing.Gateways SupportsMultiqueue() bool NewMultiQueueReader() (io.ReadWriteCloser, error) + // SupportsPerPeerMTU reports whether SetPeerMTU is implemented for real on + // this platform. 
PMTUD requires this; the manager will refuse to enable when + // false even if the operator set tun.max_mtu, because a discovered MTU we + // can't actually install does the operator no good. + SupportsPerPeerMTU() bool + // SetPeerMTU installs a per-peer MTU on the routing table so the kernel will + // surface PTB / EMSGSIZE for inside packets to that peer that would exceed mtu. + // Pass mtu=0 to remove the override and let the device default apply. + SetPeerMTU(addr netip.Addr, mtu int) error } diff --git a/overlay/overlaytest/noop.go b/overlay/overlaytest/noop.go index 956da7dd..03969654 100644 --- a/overlay/overlaytest/noop.go +++ b/overlay/overlaytest/noop.go @@ -39,6 +39,14 @@ func (NoopTun) Write([]byte) (int, error) { return 0, nil } +func (NoopTun) SupportsPerPeerMTU() bool { + return false +} + +func (NoopTun) SetPeerMTU(addr netip.Addr, mtu int) error { + return nil +} + func (NoopTun) SupportsMultiqueue() bool { return false } diff --git a/overlay/tun_android.go b/overlay/tun_android.go index 9cbb64be..5b78c4ce 100644 --- a/overlay/tun_android.go +++ b/overlay/tun_android.go @@ -95,6 +95,14 @@ func (t *tun) Name() string { return "android" } +func (t *tun) SupportsPerPeerMTU() bool { + return false +} + +func (t *tun) SetPeerMTU(addr netip.Addr, mtu int) error { + return nil +} + func (t *tun) SupportsMultiqueue() bool { return false } diff --git a/overlay/tun_darwin.go b/overlay/tun_darwin.go index 524ef0cd..9b8f438b 100644 --- a/overlay/tun_darwin.go +++ b/overlay/tun_darwin.go @@ -548,6 +548,14 @@ func (t *tun) Name() string { return t.Device } +func (t *tun) SupportsPerPeerMTU() bool { + return false +} + +func (t *tun) SetPeerMTU(addr netip.Addr, mtu int) error { + return nil +} + func (t *tun) SupportsMultiqueue() bool { return false } diff --git a/overlay/tun_disabled.go b/overlay/tun_disabled.go index f47880dd..8ed30e26 100644 --- a/overlay/tun_disabled.go +++ b/overlay/tun_disabled.go @@ -106,6 +106,14 @@ func (t *disabledTun) Write(b []byte) 
(int, error) { return len(b), nil } +func (t *disabledTun) SupportsPerPeerMTU() bool { + return false +} + +func (t *disabledTun) SetPeerMTU(addr netip.Addr, mtu int) error { + return nil +} + func (t *disabledTun) SupportsMultiqueue() bool { return true } diff --git a/overlay/tun_freebsd.go b/overlay/tun_freebsd.go index 3d995553..c88adb3d 100644 --- a/overlay/tun_freebsd.go +++ b/overlay/tun_freebsd.go @@ -561,6 +561,14 @@ func (t *tun) Name() string { return t.Device } +func (t *tun) SupportsPerPeerMTU() bool { + return false +} + +func (t *tun) SetPeerMTU(addr netip.Addr, mtu int) error { + return nil +} + func (t *tun) SupportsMultiqueue() bool { return false } diff --git a/overlay/tun_ios.go b/overlay/tun_ios.go index 6bfcbdfb..ecf6e9d2 100644 --- a/overlay/tun_ios.go +++ b/overlay/tun_ios.go @@ -151,6 +151,14 @@ func (t *tun) Name() string { return "iOS" } +func (t *tun) SupportsPerPeerMTU() bool { + return false +} + +func (t *tun) SetPeerMTU(addr netip.Addr, mtu int) error { + return nil +} + func (t *tun) SupportsMultiqueue() bool { return false } diff --git a/overlay/tun_linux.go b/overlay/tun_linux.go index c6cfb686..1c444bb5 100644 --- a/overlay/tun_linux.go +++ b/overlay/tun_linux.go @@ -368,6 +368,13 @@ func (t *tun) reload(c *config.C, initial bool) error { } } + // tun.max_mtu raises the device MTU above tun.mtu so PMTUD has headroom to + // install per-peer routes between tun.mtu (floor) and tun.max_mtu (ceiling). + // When unset (default 0) the device MTU is unchanged from existing behavior. 
+ if pmtudCeiling := c.GetInt("tun.max_mtu", 0); pmtudCeiling > newMaxMTU { + newMaxMTU = pmtudCeiling + } + t.MaxMTU = newMaxMTU t.DefaultMTU = newDefaultMTU @@ -596,7 +603,7 @@ func (t *tun) setDefaultRoute(cidr netip.Prefix) error { LinkIndex: t.deviceIndex, Dst: dr, MTU: t.DefaultMTU, - AdvMSS: t.advMSS(Route{}), + AdvMSS: t.advMSS(Route{Cidr: cidr}), Scope: unix.RT_SCOPE_LINK, Src: net.IP(cidr.Addr().AsSlice()), Protocol: unix.RTPROT_KERNEL, @@ -705,17 +712,68 @@ func (t *tun) Name() string { return t.Device } +func (t *tun) SupportsPerPeerMTU() bool { + return true +} + +// SetPeerMTU installs a host route (/32 for an IPv4 vpn address, /128 for an IPv6 +// vpn address) to addr through this tun device with the given MTU. This causes +// the kernel to reject (or surface PTB to apps for) inside packets to addr that +// would exceed mtu. Pass mtu=0 to remove the override and let the per-vpn-network +// route apply again. PoC: assumes addr is reachable directly via this device. +func (t *tun) SetPeerMTU(addr netip.Addr, mtu int) error { + bits := addr.BitLen() + prefix := netip.PrefixFrom(addr, bits) + + dr := &net.IPNet{ + IP: addr.AsSlice(), + Mask: net.CIDRMask(bits, bits), + } + + if mtu == 0 { + nr := netlink.Route{ + LinkIndex: t.deviceIndex, + Dst: dr, + Scope: unix.RT_SCOPE_LINK, + } + if err := netlink.RouteDel(&nr); err != nil { + return fmt.Errorf("failed to remove per-peer mtu route %v: %w", prefix, err) + } + return nil + } + + nr := netlink.Route{ + LinkIndex: t.deviceIndex, + Dst: dr, + MTU: mtu, + AdvMSS: t.advMSS(Route{Cidr: prefix, MTU: mtu}), + Scope: unix.RT_SCOPE_LINK, + } + if err := netlink.RouteReplace(&nr); err != nil { + return fmt.Errorf("failed to set per-peer mtu route %v mtu=%d: %w", prefix, mtu, err) + } + return nil +} + func (t *tun) advMSS(r Route) int { mtu := r.MTU if r.MTU == 0 { mtu = t.DefaultMTU } - // We only need to set advmss if the route MTU does not match the device MTU - if mtu != t.MaxMTU { - return mtu - 40 + // We 
only need to set advmss if the route MTU does not match the device MTU. + if mtu == t.MaxMTU { + return 0 } - return 0 + + // MSS = MTU - (IP header + TCP header). TCP is always 20 bytes; IP is 20 for + // v4 and 40 for v6. r.Cidr is the route destination so it tells us which + // family this route is in. If Cidr is unset (empty Route) we default to v4. + addr := r.Cidr.Addr() + if addr.Is6() && !addr.Is4In6() { + return mtu - 60 + } + return mtu - 40 } func (t *tun) watchRoutes() { diff --git a/overlay/tun_netbsd.go b/overlay/tun_netbsd.go index c971bb6e..f839ac7c 100644 --- a/overlay/tun_netbsd.go +++ b/overlay/tun_netbsd.go @@ -390,6 +390,14 @@ func (t *tun) Name() string { return t.Device } +func (t *tun) SupportsPerPeerMTU() bool { + return false +} + +func (t *tun) SetPeerMTU(addr netip.Addr, mtu int) error { + return nil +} + func (t *tun) SupportsMultiqueue() bool { return false } diff --git a/overlay/tun_openbsd.go b/overlay/tun_openbsd.go index 81362184..763b1613 100644 --- a/overlay/tun_openbsd.go +++ b/overlay/tun_openbsd.go @@ -310,6 +310,14 @@ func (t *tun) Name() string { return t.Device } +func (t *tun) SupportsPerPeerMTU() bool { + return false +} + +func (t *tun) SetPeerMTU(addr netip.Addr, mtu int) error { + return nil +} + func (t *tun) SupportsMultiqueue() bool { return false } diff --git a/overlay/tun_tester.go b/overlay/tun_tester.go index b2c2a0ea..6d745d1c 100644 --- a/overlay/tun_tester.go +++ b/overlay/tun_tester.go @@ -105,6 +105,14 @@ func (t *TestTun) Name() string { return t.Device } +func (t *TestTun) SupportsPerPeerMTU() bool { + return false +} + +func (t *TestTun) SetPeerMTU(addr netip.Addr, mtu int) error { + return nil +} + func (t *TestTun) Write(b []byte) (n int, err error) { if t.closed.Load() { return 0, io.ErrClosedPipe diff --git a/overlay/tun_windows.go b/overlay/tun_windows.go index 680dddb3..306aee50 100644 --- a/overlay/tun_windows.go +++ b/overlay/tun_windows.go @@ -229,6 +229,14 @@ func (t *winTun) Name() string { 
return t.Device } +func (t *winTun) SupportsPerPeerMTU() bool { + return false +} + +func (t *winTun) SetPeerMTU(addr netip.Addr, mtu int) error { + return nil +} + func (t *winTun) Read(b []byte) (int, error) { return t.tun.Read(b, 0) } diff --git a/overlay/user.go b/overlay/user.go index e5f27f37..faec1598 100644 --- a/overlay/user.go +++ b/overlay/user.go @@ -46,6 +46,14 @@ func (d *UserDevice) RoutesFor(ip netip.Addr) routing.Gateways { return routing.Gateways{routing.NewGateway(ip, 1)} } +func (d *UserDevice) SupportsPerPeerMTU() bool { + return false +} + +func (d *UserDevice) SetPeerMTU(addr netip.Addr, mtu int) error { + return nil +} + func (d *UserDevice) SupportsMultiqueue() bool { return true } diff --git a/pmtud_manager.go b/pmtud_manager.go new file mode 100644 index 00000000..222038bb --- /dev/null +++ b/pmtud_manager.go @@ -0,0 +1,623 @@ +package nebula + +import ( + "context" + "encoding/binary" + "log/slog" + "math/rand/v2" + "net/netip" + "sync" + "sync/atomic" + "time" + + "github.com/slackhq/nebula/config" + "github.com/slackhq/nebula/header" + "github.com/slackhq/nebula/overlay" +) + +// PMTUD PoC: discover the path MTU per-tunnel via authenticated probes that ride +// the existing crypto session. We follow RFC 8899 PLPMTUD: a binary search +// between a known-good floor and a configured ceiling, with N consecutive probe +// losses at a size treated as "doesn't fit." Confirmed PMTU is pushed to the +// overlay device, which on Linux installs a per-host route with the discovered +// MTU. The kernel then surfaces EMSGSIZE / PTB to apps writing to the tun. +// +// Probe payload format (request): +// +// [magic uint32 BE][probeID uint32 BE][padding 0x00...] +// +// Reply is a small ack with the same magic and probeID and no padding. We do not +// verify the reverse-path MTU; only the forward direction matters for the +// receiver's MTU on the inside. 
+ +const ( + pmtudMagic uint32 = 0x504D5544 // 'P' 'M' 'U' 'D' + pmtudFloor = 1280 // IPv6 minimum payload, also a safe internet MTU floor + + // pmtudConverged is the bytes-tolerance for stopping the search. + pmtudConverged = 8 + + // pmtudMaxLoss matches RFC 8899 MAX_PROBES (default 3). + pmtudMaxLoss = 3 + + // pmtudProbeInterval is the time between probe ticks during the search phase. + // Once a peer converges the wheel stops ticking it; re-validation is driven + // by connection_manager via MaybeProbeAsTest at its natural test cadence. + pmtudProbeInterval = 500 * time.Millisecond + + // pmtudWheelMax is the wheel's maximum supported scheduling duration. We + // only ever schedule at pmtudProbeInterval today, but the wheel needs a + // max greater than its tick to allocate its slot ring sensibly. + pmtudWheelMax = 5 * time.Second + + // pmtudOverheadPessimistic assumes IPv6 underlay + relay framing: + // IPv6(40) + UDP(8) + outer nebula(16) + outer AEAD tag(16) + // + inner nebula(16) + inner AEAD tag(16) = 112 bytes. + // TODO: track underlay address family and per-peer relay state on the HostInfo + // so the manager can use the actual overhead for that tunnel and recover the + // 32 bytes we pessimistically give up on direct IPv6 paths and the 52 bytes on + // direct IPv4 paths. + pmtudOverheadPessimistic = 112 + + // pmtudUnsupportedAfter is the number of consecutive lost probes (across any + // sizes) without ever receiving a reply that we treat as evidence the peer + // does not understand the MTUDProbeRequest subtype (i.e. it's running an + // older nebula). After this many failures with everReplied=false we mark the + // peer pmtud-unsupported and stop scheduling probes. K is small enough that + // it fires before the binary search would naturally converge to floor (which + // would otherwise be ~30 wasted probes), but large enough to absorb a few + // transient probe losses on a path that's just starting to settle. 
+ pmtudUnsupportedAfter = 5 +) + +// pmtudPeer tracks the binary-search state for one tunnel. +type pmtudPeer struct { + mu sync.Mutex + addr netip.Addr + localIdx uint32 + + // low is the largest outer IP packet size we have a confirmed ack for. + // high is the smallest size we believe fails (the search ceiling to start). + low, high int + + // inFlightSize is the outer IP packet size of the probe currently awaiting + // an ack. 0 means no probe in flight. + inFlightSize int + // inFlightID matches the probeID echoed in the reply. + inFlightID uint32 + // losses counts consecutive failures at inFlightSize. + losses int + + // firstProbe is true until we have sent the first probe of a search. The + // first probe targets the ceiling directly (RFC 8899 permits this Search + // Algorithm choice); operators who set tun.max_mtu typically have a path + // that supports it, so we converge in one probe in the common case. + firstProbe bool + // everReplied is true once we have ever received any MTUDProbeReply from + // this peer. Combined with consecutiveFailures, this lets us detect peers + // that don't understand the new subtype and stop probing them. + everReplied bool + // consecutiveFailures counts probes lost without an intervening reply. + // Resets to 0 on any successful reply. + consecutiveFailures int + // unsupported is set true once we conclude the peer doesn't speak PMTUD. + // The manager skips probes for unsupported peers. + unsupported bool + + // converged means we have a confirmed PMTU and are in the slow re-validation phase. + converged bool + // applied is the inner MTU we last pushed to the overlay device (0 if never). + applied int +} + +func (p *pmtudPeer) overhead() int { + // TODO: branch on actual underlay family + relay state for this peer. 
+ return pmtudOverheadPessimistic +} + +func (p *pmtudPeer) midpoint() int { + return (p.low + p.high) / 2 +} + +type pmtudManager struct { + intf *Interface + device overlay.Device + + // peers is keyed by HostInfo.localIndexId. + peers sync.Map // map[uint32]*pmtudPeer + + wheel *LockingTimerWheel[uint32] + + // floor is the always-safe inner MTU (= tun.mtu). Per-peer routes start here + // on tunnel-up so unprobed traffic is always small enough to fit. Stored as + // atomic int64 so reload can update it without coordinating with the readers + // in tick/HandleReply/OnTunnelUp. + floor atomic.Int64 + // ceiling is the search ceiling expressed as an outer IP packet size, derived + // from tun.max_mtu (which is the kernel's device MTU on the tun) plus our + // pessimistic overhead. PMTUD will not probe larger than this. + ceiling atomic.Int64 + + enabled atomic.Bool + + l *slog.Logger +} + +func newPMTUDManagerFromConfig(l *slog.Logger, c *config.C, device overlay.Device) *pmtudManager { + m := &pmtudManager{ + device: device, + wheel: NewLockingTimerWheel[uint32](pmtudProbeInterval, pmtudWheelMax), + l: l, + } + c.RegisterReloadCallback(func(c *config.C) { m.reload(c, false) }) + m.reload(c, true) + return m +} + +// reload applies tun.mtu / tun.max_mtu changes to the manager. On the initial +// call (during construction) it just snapshots state; on a live reload it also +// transitions in-flight peers to match the new bounds: clearing per-peer routes +// when newly disabled, seeding peers from the hostmap and flipping DF on +// outside sockets when newly enabled, and rebounding existing searches in +// place when only the ceiling moved. 
+func (m *pmtudManager) reload(c *config.C, initial bool) { + if !initial && !c.HasChanged("tun.mtu") && !c.HasChanged("tun.max_mtu") { + return + } + + floor := c.GetInt("tun.mtu", overlay.DefaultMTU) + maxMTU := c.GetInt("tun.max_mtu", 0) + + enable := maxMTU > floor && m.device.SupportsPerPeerMTU() + var ceiling int + if enable { + ceiling = maxMTU + pmtudOverheadPessimistic + } + + if initial { + m.floor.Store(int64(floor)) + m.ceiling.Store(int64(ceiling)) + m.enabled.Store(enable) + switch { + case enable: + m.l.Info("pmtud enabled", "floor", floor, "ceiling", ceiling, "tun.max_mtu", maxMTU) + case maxMTU > floor: + m.l.Warn("pmtud disabled: this platform does not yet support per-peer MTU routes", + "tun.max_mtu", maxMTU) + } + return + } + + wasEnabled := m.enabled.Load() + m.floor.Store(int64(floor)) + m.ceiling.Store(int64(ceiling)) + m.enabled.Store(enable) + + switch { + case wasEnabled && !enable: + m.disableLive(floor, maxMTU) + case !wasEnabled && enable: + m.enableLive(floor, ceiling, maxMTU) + case wasEnabled && enable: + m.reboundLive(floor, ceiling, maxMTU) + } +} + +// disableLive clears per-peer routes and drops all peer state. We do not +// disable DF on the outside sockets; once on, it stays on for the life of the +// process. Operators flipping pmtud off live get correct routing behavior; if +// they want the historical no-DF behavior back they need to restart. +func (m *pmtudManager) disableLive(floor, maxMTU int) { + m.peers.Range(func(k, v any) bool { + p := v.(*pmtudPeer) + p.mu.Lock() + applied := p.applied + addr := p.addr + p.applied = 0 + p.mu.Unlock() + if applied != 0 { + if err := m.device.SetPeerMTU(addr, 0); err != nil { + m.l.Warn("pmtud: failed to clear per-peer mtu on disable", "addr", addr, "error", err) + } + } + m.peers.Delete(k) + return true + }) + m.l.Info("pmtud disabled (tun.max_mtu <= tun.mtu)", "tun.mtu", floor, "tun.max_mtu", maxMTU) +} + +// enableLive flips DF on every outside socket. 
We don't pre-seed existing +// tunnels here; connection_manager's normal test cadence will eventually call +// MaybeProbeAsTest for each peer, which seeds on miss and lets the wheel pick +// up the search from there. New tunnels established after this point still +// take the OnTunnelUp fast path. +func (m *pmtudManager) enableLive(floor, ceiling, maxMTU int) { + m.enableDF() + m.l.Info("pmtud enabled", "floor", floor, "ceiling", ceiling, "tun.max_mtu", maxMTU) +} + +// reboundLive resets each peer's search state to the new bounds. Peers whose +// confirmed PMTU still fits under the new ceiling keep their applied route in +// place during the new search; peers whose confirmed PMTU exceeds the new +// ceiling get cleared back to floor and re-search from scratch. The unsupported +// flag is preserved because peer software version doesn't change on reload. +func (m *pmtudManager) reboundLive(floor, ceiling, maxMTU int) { + overhead := pmtudOverheadPessimistic + m.peers.Range(func(k, v any) bool { + p := v.(*pmtudPeer) + p.mu.Lock() + if p.applied > 0 && p.applied+overhead > ceiling { + if err := m.device.SetPeerMTU(p.addr, 0); err != nil { + m.l.Warn("pmtud: failed to clear per-peer mtu on rebound", "addr", p.addr, "error", err) + } else { + p.applied = 0 + } + } + p.low = floor + overhead + p.high = ceiling + p.inFlightSize = 0 + p.inFlightID = 0 + p.losses = 0 + p.firstProbe = !p.unsupported + p.converged = false + idx := p.localIdx + unsupported := p.unsupported + p.mu.Unlock() + if !unsupported { + m.wheel.Add(idx, pmtudProbeInterval) + } + return true + }) + m.l.Info("pmtud reloaded", "floor", floor, "ceiling", ceiling, "tun.max_mtu", maxMTU) +} + +// enableDF asks every outside socket to set the don't-fragment bit on outbound +// packets. Idempotent: safe to call from both Start (initial enable) and from a +// live reload that flips pmtud on. 
+func (m *pmtudManager) enableDF() { + for i, w := range m.intf.writers { + if err := w.EnablePathMTUDiscovery(); err != nil { + m.l.Warn("pmtud: failed to enable path mtu discovery on outside socket; pmtud will not work correctly", + "writer", i, "error", err) + } + } +} + +// Start runs the probe scheduler until ctx is done. The loop runs even when PMTUD +// is disabled at startup so a hot reload can turn it on without restarting nebula. +// +// When PMTUD is enabled at startup we ask each outside socket to enable +// path-MTU discovery (DF on every send). This is intentionally gated on the +// feature being on so that operators who haven't opted in keep the historical +// behavior where the kernel may fragment outbound nebula UDP packets. A live +// reload from disabled to enabled will also flip DF on via enableLive; the +// reverse direction does not turn DF off, so flipping pmtud back off live +// keeps DF on until restart. +func (m *pmtudManager) Start(ctx context.Context) { + if m.enabled.Load() { + m.enableDF() + } + + ticker := time.NewTicker(m.wheel.t.tickDuration) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case now := <-ticker.C: + m.wheel.Advance(now) + for { + idx, has := m.wheel.Purge() + if !has { + break + } + m.tick(idx) + } + } + } +} + +// OnTunnelUp is called when a HostInfo becomes traffic-watched. The kernel +// already routes packets to this peer through the per-vpn-network route (mtu = +// tun.mtu), so the floor is in effect implicitly. We just kick off the search +// here; HandleReply will install a per-host /32 (or /128) route once a larger +// size is confirmed. +func (m *pmtudManager) OnTunnelUp(hi *HostInfo) { + if !m.enabled.Load() { + return + } + m.seedPeer(hi) +} + +// seedPeer is the shared body of OnTunnelUp and the live-reload enable path. +// LoadOrStore protects against double-seeding the same localIndexId from a +// race between OnTunnelUp and a reload-driven hostmap walk. 
+func (m *pmtudManager) seedPeer(hi *HostInfo) { + if hi == nil || len(hi.vpnAddrs) == 0 { + return + } + floor := int(m.floor.Load()) + ceiling := int(m.ceiling.Load()) + p := &pmtudPeer{ + addr: hi.vpnAddrs[0], + localIdx: hi.localIndexId, + low: floor + pmtudOverheadPessimistic, + high: ceiling, + firstProbe: true, + } + if _, loaded := m.peers.LoadOrStore(hi.localIndexId, p); loaded { + return + } + m.wheel.Add(hi.localIndexId, pmtudProbeInterval) +} + +// OnTunnelDown is called when a HostInfo is being torn down. Removes any per-host +// MTU override so the device default applies again. +func (m *pmtudManager) OnTunnelDown(hi *HostInfo) { + if hi == nil { + return + } + v, ok := m.peers.LoadAndDelete(hi.localIndexId) + if !ok { + return + } + p := v.(*pmtudPeer) + p.mu.Lock() + applied := p.applied + addr := p.addr + p.applied = 0 + p.mu.Unlock() + if applied != 0 { + if err := m.device.SetPeerMTU(addr, 0); err != nil { + m.l.Warn("pmtud: failed to clear per-peer mtu", "addr", addr, "error", err) + } + } +} + +// OnRoam is called when a HostInfo's remote underlay address changes. The path +// MTU may now be different; drop the per-host route so the kernel falls back to +// the per-vpn-network route (mtu = tun.mtu floor), then restart the search. +// We do not reset the unsupported flag: peer software version doesn't change on +// roam, so once we've decided a peer doesn't speak PMTUD we stay decided. 
+func (m *pmtudManager) OnRoam(hi *HostInfo) { + if !m.enabled.Load() || hi == nil { + return + } + v, ok := m.peers.Load(hi.localIndexId) + if !ok { + return + } + p := v.(*pmtudPeer) + p.mu.Lock() + if p.unsupported { + p.mu.Unlock() + return + } + p.low = int(m.floor.Load()) + pmtudOverheadPessimistic + p.high = int(m.ceiling.Load()) + p.inFlightSize = 0 + p.inFlightID = 0 + p.losses = 0 + p.consecutiveFailures = 0 + p.firstProbe = true + p.converged = false + if p.applied != 0 { + if err := m.device.SetPeerMTU(p.addr, 0); err != nil { + m.l.Warn("pmtud: failed to clear per-peer mtu on roam", "addr", p.addr, "error", err) + } else { + p.applied = 0 + } + } + p.mu.Unlock() + m.wheel.Add(hi.localIndexId, pmtudProbeInterval) +} + +// MaybeProbeAsTest is called by connection_manager when it would otherwise send +// a TestRequest because a tunnel has gone silent. If we have a confirmed PMTU +// for this peer that's larger than the floor, we send a probe at that size +// instead. The reply confirms both liveness (consumed by connection_manager via +// the existing inbound traffic accounting fallthrough in outside.go) and that +// the confirmed PMTU still fits (consumed by HandleReply here). One synthetic +// packet does the work of two. +// +// Returns true if a probe was sent. False means the caller should send a +// regular TestRequest at the floor. +// +// On probe failure, connection_manager's existing pendingDeletion timeout will +// tear the tunnel down. Heavy hammer, but correct: a re-handshake re-runs PMTUD +// discovery against the now-shrunken path. A future EMSGSIZE-capture followup +// can replace this with a soft-drop-and-research flow. +func (m *pmtudManager) MaybeProbeAsTest(hi *HostInfo) bool { + if !m.enabled.Load() || hi == nil { + return false + } + v, ok := m.peers.Load(hi.localIndexId) + if !ok { + // Tunnel pre-dates the manager being aware of it (e.g. pmtud was just + // enabled live, or AddTrafficWatch fired before this call). 
Seed the + // peer so the wheel picks up the search; let connection_manager send + // its regular TestRequest this cycle. + m.seedPeer(hi) + return false + } + p := v.(*pmtudPeer) + p.mu.Lock() + if p.unsupported || p.applied == 0 { + p.mu.Unlock() + return false + } + overhead := p.overhead() + size := p.applied + overhead + id := rand.Uint32() + p.inFlightSize = size + p.inFlightID = id + p.mu.Unlock() + + m.sendProbe(hi, size, id, overhead) + return true +} + +// HandleReply consumes an MTUDProbeReply payload from the receive path. +func (m *pmtudManager) HandleReply(localIdx uint32, payload []byte) { + if !m.enabled.Load() { + return + } + if len(payload) < 8 { + return + } + if binary.BigEndian.Uint32(payload[0:4]) != pmtudMagic { + return + } + id := binary.BigEndian.Uint32(payload[4:8]) + + v, ok := m.peers.Load(localIdx) + if !ok { + return + } + p := v.(*pmtudPeer) + p.mu.Lock() + defer p.mu.Unlock() + + if p.inFlightSize == 0 || p.inFlightID != id { + return + } + + confirmed := p.inFlightSize + p.low = confirmed + p.inFlightSize = 0 + p.losses = 0 + p.everReplied = true + p.consecutiveFailures = 0 + + innerMTU := confirmed - p.overhead() + // Only install a /32 override when it would actually raise the MTU above the + // per-vpn-network floor route. If the discovered MTU is <= floor, the /24 + // already covers it; installing a /32 at floor would just create roam churn. + if innerMTU > int(m.floor.Load()) && p.applied != innerMTU { + if err := m.device.SetPeerMTU(p.addr, innerMTU); err != nil { + m.l.Warn("pmtud: failed to apply per-peer mtu", "addr", p.addr, "innerMTU", innerMTU, "error", err) + } else { + m.l.Info("pmtud probe confirmed", + "addr", p.addr, + "outerMTU", confirmed, + "innerMTU", innerMTU, + "low", p.low, + "high", p.high, + ) + p.applied = innerMTU + } + } + + if p.high-p.low <= pmtudConverged { + p.converged = true + } else { + p.converged = false + } +} + +// tick handles one wheel firing for a single peer. 
+func (m *pmtudManager) tick(localIdx uint32) { + v, ok := m.peers.Load(localIdx) + if !ok { + return + } + p := v.(*pmtudPeer) + p.mu.Lock() + + if p.unsupported { + p.mu.Unlock() + return + } + + // If a probe was outstanding, this tick is the loss timeout. + if p.inFlightSize != 0 { + p.losses++ + p.consecutiveFailures++ + if p.losses >= pmtudMaxLoss { + p.high = p.inFlightSize + p.inFlightSize = 0 + p.losses = 0 + if p.high-p.low <= pmtudConverged { + p.converged = true + } + } + } + + // If we've never gotten a reply from this peer and we've burned through our + // failure budget, conclude the peer doesn't understand the MTUDProbeRequest + // subtype and stop scheduling probes for it. + if !p.everReplied && p.consecutiveFailures >= pmtudUnsupportedAfter { + p.unsupported = true + addr := p.addr + p.mu.Unlock() + m.l.Info("pmtud: peer not responding to probes, marking unsupported", + "addr", addr, "failures", pmtudUnsupportedAfter) + return + } + + hi := m.intf.hostMap.QueryIndex(localIdx) + if hi == nil { + p.mu.Unlock() + m.peers.Delete(localIdx) + return + } + + // Once a peer converges, the wheel stops scheduling for it. Re-validation + // (and the resulting black hole detection) is driven by connection_manager + // via MaybeProbeAsTest at its natural test cadence, so a converged peer + // has nothing for the wheel to do until OnRoam or a tunnel down/up cycle + // triggers a fresh search. + if p.converged { + p.mu.Unlock() + return + } + + ceiling := int(m.ceiling.Load()) + var size int + switch { + case p.firstProbe: + // Probe the ceiling directly. If the path supports it (the common case + // when an operator has explicitly configured tun.max_mtu), we converge + // in one round trip. If it fails, the standard binary search resumes + // on the next tick from the (low, ceiling) bounds. 
+ size = ceiling + p.firstProbe = false + case p.losses > 0 && p.inFlightSize != 0: + size = p.inFlightSize + default: + size = p.midpoint() + } + if size < pmtudFloor { + size = pmtudFloor + } + if size > ceiling { + size = ceiling + } + + id := rand.Uint32() + p.inFlightSize = size + p.inFlightID = id + overhead := p.overhead() + p.mu.Unlock() + + m.sendProbe(hi, size, id, overhead) + m.wheel.Add(localIdx, pmtudProbeInterval) +} + +// sendProbe builds an MTUDProbeRequest payload that will produce an outer IP +// packet of approximately `outerSize` bytes, then sends it. +func (m *pmtudManager) sendProbe(hi *HostInfo, outerSize int, id uint32, overhead int) { + payloadLen := outerSize - overhead + if payloadLen < 8 { + payloadLen = 8 + } + p := make([]byte, payloadLen) + binary.BigEndian.PutUint32(p[0:4], pmtudMagic) + binary.BigEndian.PutUint32(p[4:8], id) + // remaining bytes are zero-padding + + nb := make([]byte, 12) + out := make([]byte, outerSize+128) // headroom for header/tag/relay framing + m.intf.SendMessageToHostInfo(header.Test, header.MTUDProbeRequest, hi, p, nb, out) +} diff --git a/udp/conn.go b/udp/conn.go index 30d89dec..01936099 100644 --- a/udp/conn.go +++ b/udp/conn.go @@ -20,6 +20,12 @@ type Conn interface { WriteTo(b []byte, addr netip.AddrPort) error ReloadConfig(c *config.C) SupportsMultipleReaders() bool + // EnablePathMTUDiscovery sets the don't-fragment bit on outgoing packets for + // this socket. Called by the pmtud manager when PMTUD is enabled. A no-op on + // platforms that don't support it; nebula's default behavior (no DF, kernel + // fragmentation allowed) is preserved on those platforms and on this one when + // PMTUD is disabled. 
+ EnablePathMTUDiscovery() error Close() error } @@ -43,6 +49,9 @@ func (NoopConn) WriteTo(_ []byte, _ netip.AddrPort) error { func (NoopConn) ReloadConfig(_ *config.C) { return } +func (NoopConn) EnablePathMTUDiscovery() error { + return nil +} func (NoopConn) Close() error { return nil } diff --git a/udp/udp_android.go b/udp/udp_android.go index 3fc68003..7b3b8811 100644 --- a/udp/udp_android.go +++ b/udp/udp_android.go @@ -44,3 +44,17 @@ func NewListenConfig(multi bool) net.ListenConfig { func (u *GenericConn) Rebind() error { return nil } + +// EnablePathMTUDiscovery sets the don't-fragment bit on outbound packets. +// Android is Linux underneath, so we use IP_PMTUDISC_PROBE (kernel sets DF but +// does not consume incoming ICMP frag-needed for its PMTU cache; the manager +// drives discovery via authenticated probes). +func (u *GenericConn) EnablePathMTUDiscovery() error { + v4 := u.isV4Socket() + return u.controlFD(func(fd uintptr) error { + if v4 { + return unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_MTU_DISCOVER, unix.IP_PMTUDISC_PROBE) + } + return unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_MTU_DISCOVER, unix.IPV6_PMTUDISC_PROBE) + }) +} diff --git a/udp/udp_bsd.go b/udp/udp_bsd.go index c42a3c18..e94e86b9 100644 --- a/udp/udp_bsd.go +++ b/udp/udp_bsd.go @@ -47,3 +47,8 @@ func NewListenConfig(multi bool) net.ListenConfig { func (u *GenericConn) Rebind() error { return nil } + +// EnablePathMTUDiscovery is split into per-OS files: udp_freebsd.go handles +// FreeBSD (which has both IP_DONTFRAG and IPV6_DONTFRAG in the unix package); +// udp_openbsd.go handles OpenBSD (v6 only; the kernel doesn't expose a v4 DF +// sockopt). diff --git a/udp/udp_darwin.go b/udp/udp_darwin.go index 8a4f5b18..48f3221d 100644 --- a/udp/udp_darwin.go +++ b/udp/udp_darwin.go @@ -187,6 +187,17 @@ func (u *StdConn) SupportsMultipleReaders() bool { return false } +// EnablePathMTUDiscovery sets the don't-fragment bit on every outbound packet. 
+// On darwin we use IP_DONTFRAG (v4) / IPV6_DONTFRAG (v6). The kernel will return +// EMSGSIZE for sends that exceed the local interface MTU; ICMP-driven PMTU +// updates from upstream routers are processed by the kernel as usual. +func (u *StdConn) EnablePathMTUDiscovery() error { + if u.isV4 { + return syscall.SetsockoptInt(int(u.sysFd), syscall.IPPROTO_IP, unix.IP_DONTFRAG, 1) + } + return syscall.SetsockoptInt(int(u.sysFd), syscall.IPPROTO_IPV6, unix.IPV6_DONTFRAG, 1) +} + func (u *StdConn) Rebind() error { var err error if u.isV4 { diff --git a/udp/udp_freebsd.go b/udp/udp_freebsd.go new file mode 100644 index 00000000..25054b92 --- /dev/null +++ b/udp/udp_freebsd.go @@ -0,0 +1,25 @@ +//go:build freebsd && !e2e_testing +// +build freebsd,!e2e_testing + +package udp + +import ( + "golang.org/x/sys/unix" +) + +// EnablePathMTUDiscovery sets the don't-fragment bit on outbound packets. +// FreeBSD exposes IP_DONTFRAG (v4) and IPV6_DONTFRAG (v6) in golang.org/x/sys/unix. +// Unlike Linux, BSDs don't have an explicit "don't consume incoming ICMP +// frag-needed" knob for unconnected UDP sockets; the kernel's PMTU cache will +// be updated from ICMP, which is benign for our usage (the cache only affects +// what EMSGSIZE gets surfaced for; the manager drives its own discovery via +// authenticated probes). 
+func (u *GenericConn) EnablePathMTUDiscovery() error {
+	v4 := u.isV4Socket()
+	return u.controlFD(func(fd uintptr) error {
+		if v4 {
+			return unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_DONTFRAG, 1)
+		}
+		return unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_DONTFRAG, 1)
+	})
+}
diff --git a/udp/udp_generic.go b/udp/udp_generic.go
index 131eb73b..58b867fc 100644
--- a/udp/udp_generic.go
+++ b/udp/udp_generic.go
@@ -100,3 +100,41 @@ func (u *GenericConn) ListenOut(r EncReader) error {
 func (u *GenericConn) SupportsMultipleReaders() bool {
 	return false
 }
+
+// EnablePathMTUDiscovery is implemented per-platform alongside Rebind, in
+// udp_android.go / udp_freebsd.go / udp_openbsd.go / udp_netbsd.go / udp_windows.go.
+
+// controlFD invokes f with the underlying UDP socket file descriptor (or
+// handle, on Windows). Used by platform files for setsockopt calls that the
+// stdlib net.UDPConn does not expose directly.
+func (u *GenericConn) controlFD(f func(fd uintptr) error) error {
+	rc, err := u.UDPConn.SyscallConn()
+	if err != nil {
+		return err
+	}
+	var sockErr error
+	err = rc.Control(func(fd uintptr) {
+		sockErr = f(fd)
+	})
+	if err != nil {
+		return err
+	}
+	return sockErr
+}
+
+// isV4Socket reports whether the local bind address looks like an IPv4 socket.
+// Used by EnablePathMTUDiscovery to pick IPPROTO_IP vs IPPROTO_IPV6 socket
+// options. Assumes pure-v4 or pure-v6 sockets; a dual-stack v6 socket bound to
+// :: will be treated as v6 (correct: setting IPV6_DONTFRAG covers v4-mapped
+// traffic too on most stacks).
+func (u *GenericConn) isV4Socket() bool { + la := u.UDPConn.LocalAddr() + if la == nil { + return false + } + ua, ok := la.(*net.UDPAddr) + if !ok { + return false + } + return ua.IP.To4() != nil +} diff --git a/udp/udp_linux.go b/udp/udp_linux.go index 3e2d726a..d8743502 100644 --- a/udp/udp_linux.go +++ b/udp/udp_linux.go @@ -73,6 +73,21 @@ func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int) return out, nil } +// EnablePathMTUDiscovery sets IP_MTU_DISCOVER=IP_PMTUDISC_PROBE (IPV6 equivalent +// for v6 sockets). This sets the don't-fragment bit on every outbound packet but +// tells the kernel not to consume incoming ICMP frag-needed for its own PMTU +// cache; we drive PMTU discovery from the application via authenticated probes +// (RFC 8899). Called by the pmtud manager when PMTUD is enabled. Without this +// call the socket retains nebula's historical behavior (no DF, kernel may +// fragment), preserving compatibility with deployments that depend on UDP +// fragmentation. +func (u *StdConn) EnablePathMTUDiscovery() error { + if u.isV4 { + return u.setSockOptIPInt(unix.IPPROTO_IP, unix.IP_MTU_DISCOVER, unix.IP_PMTUDISC_PROBE) + } + return u.setSockOptIPInt(unix.IPPROTO_IPV6, unix.IPV6_MTU_DISCOVER, unix.IPV6_PMTUDISC_PROBE) +} + func (u *StdConn) SupportsMultipleReaders() bool { return true } @@ -110,6 +125,21 @@ func (u *StdConn) setSockOptInt(opt int, n int) error { return opErr } +// setSockOptIPInt sets a socket option at a non-SOL_SOCKET level (e.g. IPPROTO_IP). 
+func (u *StdConn) setSockOptIPInt(level, opt, n int) error { + if u.rawConn == nil { + return fmt.Errorf("no UDP connection") + } + var opErr error + err := u.rawConn.Control(func(fd uintptr) { + opErr = unix.SetsockoptInt(int(fd), level, opt, n) + }) + if err != nil { + return err + } + return opErr +} + func (u *StdConn) SetRecvBuffer(n int) error { return u.setSockOptInt(unix.SO_RCVBUFFORCE, n) } diff --git a/udp/udp_netbsd.go b/udp/udp_netbsd.go index 4b2de75a..d7562707 100644 --- a/udp/udp_netbsd.go +++ b/udp/udp_netbsd.go @@ -46,3 +46,18 @@ func NewListenConfig(multi bool) net.ListenConfig { func (u *GenericConn) Rebind() error { return nil } + +// EnablePathMTUDiscovery sets the don't-fragment bit on outbound packets. +// NetBSD exposes IPV6_DONTFRAG via golang.org/x/sys/unix but the kernel does +// not provide a socket-level knob for setting DF on v4 UDP. The only IP-layer +// constant exposed is IP_DF, which is the wire header flag, not a sockopt. +// quic-go skips NetBSD for the same reason. So v4 sockets stay at nebula's +// historical behavior (kernel may fragment); v6 gets DF. +func (u *GenericConn) EnablePathMTUDiscovery() error { + if u.isV4Socket() { + return nil + } + return u.controlFD(func(fd uintptr) error { + return unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_DONTFRAG, 1) + }) +} diff --git a/udp/udp_openbsd.go b/udp/udp_openbsd.go new file mode 100644 index 00000000..6e539d94 --- /dev/null +++ b/udp/udp_openbsd.go @@ -0,0 +1,23 @@ +//go:build openbsd && !e2e_testing +// +build openbsd,!e2e_testing + +package udp + +import ( + "golang.org/x/sys/unix" +) + +// EnablePathMTUDiscovery sets the don't-fragment bit on outbound packets. +// OpenBSD exposes IPV6_DONTFRAG via golang.org/x/sys/unix but the kernel does +// not provide a socket-level knob for setting DF on v4 UDP. The only IP-layer +// constant exposed is IP_DF, which is the wire header flag, not a sockopt. +// quic-go skips OpenBSD for the same reason. 
So v4 sockets stay at nebula's +// historical behavior (kernel may fragment); v6 gets DF. +func (u *GenericConn) EnablePathMTUDiscovery() error { + if u.isV4Socket() { + return nil + } + return u.controlFD(func(fd uintptr) error { + return unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_DONTFRAG, 1) + }) +} diff --git a/udp/udp_rio_windows.go b/udp/udp_rio_windows.go index d110af19..b9308c54 100644 --- a/udp/udp_rio_windows.go +++ b/udp/udp_rio_windows.go @@ -335,6 +335,12 @@ func (u *RIOConn) Rebind() error { return nil } +// EnablePathMTUDiscovery is a no-op on Windows for now. PMTUD is Linux-only in +// the initial PoC; Windows support would set IP_DONTFRAGMENT here. +func (u *RIOConn) EnablePathMTUDiscovery() error { + return nil +} + func (u *RIOConn) ReloadConfig(*config.C) {} func (u *RIOConn) Close() error { diff --git a/udp/udp_tester.go b/udp/udp_tester.go index fcd0967c..64e8dda7 100644 --- a/udp/udp_tester.go +++ b/udp/udp_tester.go @@ -152,6 +152,10 @@ func (u *TesterConn) Rebind() error { return nil } +func (u *TesterConn) EnablePathMTUDiscovery() error { + return nil +} + func (u *TesterConn) Close() error { u.closeOnce.Do(func() { close(u.done) diff --git a/udp/udp_windows.go b/udp/udp_windows.go index 7969f7e8..d5e3b6fa 100644 --- a/udp/udp_windows.go +++ b/udp/udp_windows.go @@ -9,6 +9,8 @@ import ( "net" "net/netip" "syscall" + + "golang.org/x/sys/windows" ) func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int) (Conn, error) { @@ -44,3 +46,27 @@ func NewListenConfig(multi bool) net.ListenConfig { func (u *GenericConn) Rebind() error { return nil } + +// Windows IP_DONTFRAGMENT and IPV6_DONTFRAG are not exposed in the +// golang.org/x/sys/windows package. Defined locally per the values in +// ws2ipdef.h / ws2tcpip.h. These are stable Win32 constants that have not +// changed since at least Windows Vista. 
+const ( + winIPDontFragment = 14 + winIPv6DontFrag = 14 +) + +// EnablePathMTUDiscovery sets the don't-fragment bit on outbound packets. +// Windows uses IP_DONTFRAGMENT (v4) and IPV6_DONTFRAG (v6) at IPPROTO_IP / +// IPPROTO_IPV6 respectively. Note: this only enables DF on the GenericConn +// fallback path. The RIO path (RIOConn) has its own EnablePathMTUDiscovery +// in udp_rio_windows.go and is currently a no-op pending RIO-specific work. +func (u *GenericConn) EnablePathMTUDiscovery() error { + v4 := u.isV4Socket() + return u.controlFD(func(fd uintptr) error { + if v4 { + return windows.SetsockoptInt(windows.Handle(fd), windows.IPPROTO_IP, winIPDontFragment, 1) + } + return windows.SetsockoptInt(windows.Handle(fd), windows.IPPROTO_IPV6, winIPv6DontFrag, 1) + }) +}