PMTUD exploration, start small then grow

This commit is contained in:
Nate Brown
2026-05-05 17:05:50 -05:00
parent 33c2d7277c
commit 16a836a73f
33 changed files with 1036 additions and 11 deletions

View File

@@ -20,6 +20,12 @@ type Conn interface {
WriteTo(b []byte, addr netip.AddrPort) error
ReloadConfig(c *config.C)
SupportsMultipleReaders() bool
// EnablePathMTUDiscovery sets the don't-fragment bit on outgoing packets for
// this socket. Called by the pmtud manager when PMTUD is enabled. A no-op on
// platforms that don't support it; nebula's default behavior (no DF, kernel
// fragmentation allowed) is preserved on those platforms and on this one when
// PMTUD is disabled.
EnablePathMTUDiscovery() error
Close() error
}
@@ -43,6 +49,9 @@ func (NoopConn) WriteTo(_ []byte, _ netip.AddrPort) error {
// ReloadConfig ignores the supplied configuration and does nothing.
func (NoopConn) ReloadConfig(_ *config.C) {}
// EnablePathMTUDiscovery does nothing for NoopConn and always returns nil.
func (NoopConn) EnablePathMTUDiscovery() error {
return nil
}
// Close does nothing for NoopConn and always returns nil.
func (NoopConn) Close() error {
return nil
}

View File

@@ -44,3 +44,17 @@ func NewListenConfig(multi bool) net.ListenConfig {
// Rebind performs no work for GenericConn in this build and always returns nil.
func (u *GenericConn) Rebind() error {
return nil
}
// EnablePathMTUDiscovery sets the don't-fragment bit on outbound packets.
// Android is Linux underneath, so we use IP_PMTUDISC_PROBE (kernel sets DF but
// does not consume incoming ICMP frag-needed for its PMTU cache; the manager
// drives discovery via authenticated probes).
//
// For a v4 socket only IP_MTU_DISCOVER is set. For any other socket both
// IPV6_MTU_DISCOVER and IP_MTU_DISCOVER are set: on Linux the IPv4 option is
// what governs v4-mapped traffic sent from a dual-stack AF_INET6 socket, so
// setting only the IPv6 option would leave IPv4 peers without DF and let PMTUD
// probes be fragmented silently.
//
// Returns the first setsockopt error encountered, or any error from obtaining
// the raw descriptor.
func (u *GenericConn) EnablePathMTUDiscovery() error {
	v4 := u.isV4Socket()
	return u.controlFD(func(fd uintptr) error {
		if !v4 {
			if err := unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_MTU_DISCOVER, unix.IPV6_PMTUDISC_PROBE); err != nil {
				return err
			}
		}
		// Applied to v4 sockets and to v6 sockets alike; on the latter it
		// controls DF for v4-mapped destinations.
		return unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_MTU_DISCOVER, unix.IP_PMTUDISC_PROBE)
	})
}

View File

@@ -47,3 +47,8 @@ func NewListenConfig(multi bool) net.ListenConfig {
// Rebind performs no work for GenericConn in this build and always returns nil.
func (u *GenericConn) Rebind() error {
return nil
}
// EnablePathMTUDiscovery is split into per-OS files: udp_freebsd.go handles
// FreeBSD (which has both IP_DONTFRAG and IPV6_DONTFRAG in the unix package);
// udp_openbsd.go handles OpenBSD (v6 only; the kernel doesn't expose a v4 DF
// sockopt).

View File

@@ -187,6 +187,17 @@ func (u *StdConn) SupportsMultipleReaders() bool {
return false
}
// EnablePathMTUDiscovery turns on the don't-fragment bit for every packet sent
// from this socket. Darwin exposes this as IP_DONTFRAG for v4 sockets and
// IPV6_DONTFRAG for v6 sockets; the option matching u.isV4 is set to 1.
// With DF set the kernel returns EMSGSIZE for sends larger than the local
// interface MTU, and ICMP-driven PMTU updates from upstream routers continue
// to be processed by the kernel as usual.
//
// Returns the error from the setsockopt call, if any.
func (u *StdConn) EnablePathMTUDiscovery() error {
	// Default to the v6 option and switch to the v4 one for v4 sockets.
	level, opt := syscall.IPPROTO_IPV6, unix.IPV6_DONTFRAG
	if u.isV4 {
		level, opt = syscall.IPPROTO_IP, unix.IP_DONTFRAG
	}
	return syscall.SetsockoptInt(int(u.sysFd), level, opt, 1)
}
func (u *StdConn) Rebind() error {
var err error
if u.isV4 {

25
udp/udp_freebsd.go Normal file
View File

@@ -0,0 +1,25 @@
//go:build freebsd && !e2e_testing
// +build freebsd,!e2e_testing
package udp
import (
"golang.org/x/sys/unix"
)
// EnablePathMTUDiscovery turns on the don't-fragment bit for packets sent from
// this socket, using IP_DONTFRAG on v4 sockets and IPV6_DONTFRAG on v6 sockets
// (both are available for FreeBSD in golang.org/x/sys/unix).
//
// Unlike Linux, BSDs have no explicit "don't consume incoming ICMP
// frag-needed" knob for unconnected UDP sockets, so the kernel's PMTU cache
// will still be updated from ICMP. That is benign here: the cache only affects
// which sends surface EMSGSIZE, and the manager drives its own discovery via
// authenticated probes.
//
// Returns the setsockopt error, or any error from obtaining the raw descriptor.
func (u *GenericConn) EnablePathMTUDiscovery() error {
	// Pick the option pair once, before touching the descriptor.
	level, opt := unix.IPPROTO_IPV6, unix.IPV6_DONTFRAG
	if u.isV4Socket() {
		level, opt = unix.IPPROTO_IP, unix.IP_DONTFRAG
	}
	return u.controlFD(func(fd uintptr) error {
		return unix.SetsockoptInt(int(fd), level, opt, 1)
	})
}

View File

@@ -100,3 +100,41 @@ func (u *GenericConn) ListenOut(r EncReader) error {
// SupportsMultipleReaders always reports false for GenericConn.
func (u *GenericConn) SupportsMultipleReaders() bool {
return false
}
// EnablePathMTUDiscovery is implemented per-platform in udp_android.go,
// udp_freebsd.go, udp_openbsd.go, udp_netbsd.go and udp_windows.go.
// controlFD runs f against the UDP socket's underlying file descriptor (a
// handle on Windows). Platform files use it for setsockopt calls that
// net.UDPConn does not expose directly.
//
// Errors are reported in this order: failure to obtain the raw connection,
// failure of the Control call itself, and finally whatever f returned.
func (u *GenericConn) controlFD(f func(fd uintptr) error) error {
	raw, err := u.UDPConn.SyscallConn()
	if err != nil {
		return err
	}

	// Control's callback cannot return a value, so capture f's result here.
	var fnErr error
	if ctlErr := raw.Control(func(fd uintptr) { fnErr = f(fd) }); ctlErr != nil {
		return ctlErr
	}
	return fnErr
}
// isV4Socket reports whether the socket's local bind address is an IPv4
// address. EnablePathMTUDiscovery uses it to choose between IPPROTO_IP and
// IPPROTO_IPV6 socket options.
//
// It returns false when there is no local address, when the address is not a
// *net.UDPAddr, or when the IP has no 4-byte form. A dual-stack v6 socket
// bound to :: therefore counts as v6.
// NOTE(review): whether a v6-level don't-fragment option also applies to
// v4-mapped traffic on such a socket is platform-specific; callers should not
// assume it does.
func (u *GenericConn) isV4Socket() bool {
	// The comma-ok assertion is false for a nil address as well as for any
	// non-UDP address type.
	udpAddr, ok := u.UDPConn.LocalAddr().(*net.UDPAddr)
	return ok && udpAddr.IP.To4() != nil
}

View File

@@ -73,6 +73,21 @@ func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int)
return out, nil
}
// EnablePathMTUDiscovery switches the socket to IP_PMTUDISC_PROBE. This sets
// the don't-fragment bit on every outbound packet but tells the kernel not to
// consume incoming ICMP frag-needed for its own PMTU cache; we drive PMTU
// discovery from the application via authenticated probes (RFC 8899). Called
// by the pmtud manager when PMTUD is enabled. Without this call the socket
// retains nebula's historical behavior (no DF, kernel may fragment),
// preserving compatibility with deployments that depend on UDP fragmentation.
//
// For a v4 socket only IP_MTU_DISCOVER is set. For a v6 socket both
// IPV6_MTU_DISCOVER and IP_MTU_DISCOVER are set: on Linux the IPv4 option is
// what governs v4-mapped traffic sent from a dual-stack AF_INET6 socket, so
// setting only the IPv6 option would leave IPv4 peers without DF and let PMTUD
// probes be fragmented silently.
//
// Returns the first setsockopt error encountered.
func (u *StdConn) EnablePathMTUDiscovery() error {
	if !u.isV4 {
		if err := u.setSockOptIPInt(unix.IPPROTO_IPV6, unix.IPV6_MTU_DISCOVER, unix.IPV6_PMTUDISC_PROBE); err != nil {
			return err
		}
	}
	// Applied to v4 sockets and to v6 sockets alike; on the latter it
	// controls DF for v4-mapped destinations.
	return u.setSockOptIPInt(unix.IPPROTO_IP, unix.IP_MTU_DISCOVER, unix.IP_PMTUDISC_PROBE)
}
// SupportsMultipleReaders always reports true for this StdConn implementation.
func (u *StdConn) SupportsMultipleReaders() bool {
return true
}
@@ -110,6 +125,21 @@ func (u *StdConn) setSockOptInt(opt int, n int) error {
return opErr
}
// setSockOptIPInt applies an integer socket option at an arbitrary protocol
// level (for example IPPROTO_IP) rather than at SOL_SOCKET.
//
// level and opt identify the option and n is the value to store. An error is
// returned when the connection has no raw handle, when the Control call
// fails, or when setsockopt itself fails.
func (u *StdConn) setSockOptIPInt(level, opt, n int) error {
	if u.rawConn == nil {
		return fmt.Errorf("no UDP connection")
	}

	// Control's callback cannot return a value, so capture the result here.
	var setErr error
	if ctlErr := u.rawConn.Control(func(fd uintptr) {
		setErr = unix.SetsockoptInt(int(fd), level, opt, n)
	}); ctlErr != nil {
		return ctlErr
	}
	return setErr
}
// SetRecvBuffer sets the SO_RCVBUFFORCE socket option to n via setSockOptInt
// and returns that call's error.
func (u *StdConn) SetRecvBuffer(n int) error {
return u.setSockOptInt(unix.SO_RCVBUFFORCE, n)
}

View File

@@ -46,3 +46,18 @@ func NewListenConfig(multi bool) net.ListenConfig {
// Rebind performs no work for GenericConn in this build and always returns nil.
func (u *GenericConn) Rebind() error {
return nil
}
// EnablePathMTUDiscovery turns on the don't-fragment bit for outbound packets
// where NetBSD allows it. IPV6_DONTFRAG is available through
// golang.org/x/sys/unix, but the kernel offers no socket-level knob for DF on
// v4 UDP: the only IP-layer constant exposed is IP_DF, which is the wire
// header flag rather than a sockopt (quic-go skips NetBSD for the same
// reason). As a result v4 sockets are left at nebula's historical behavior
// (kernel may fragment) and nil is returned for them; v6 sockets get DF.
//
// Returns the setsockopt error, or any error from obtaining the raw descriptor.
func (u *GenericConn) EnablePathMTUDiscovery() error {
	if !u.isV4Socket() {
		setV6DontFrag := func(fd uintptr) error {
			return unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_DONTFRAG, 1)
		}
		return u.controlFD(setV6DontFrag)
	}
	// Nothing to set for v4 on this platform.
	return nil
}

23
udp/udp_openbsd.go Normal file
View File

@@ -0,0 +1,23 @@
//go:build openbsd && !e2e_testing
// +build openbsd,!e2e_testing
package udp
import (
"golang.org/x/sys/unix"
)
// EnablePathMTUDiscovery turns on the don't-fragment bit for outbound packets
// where OpenBSD allows it. IPV6_DONTFRAG is available through
// golang.org/x/sys/unix, but the kernel offers no socket-level knob for DF on
// v4 UDP: the only IP-layer constant exposed is IP_DF, which is the wire
// header flag rather than a sockopt (quic-go skips OpenBSD for the same
// reason). As a result v4 sockets are left at nebula's historical behavior
// (kernel may fragment) and nil is returned for them; v6 sockets get DF.
//
// Returns the setsockopt error, or any error from obtaining the raw descriptor.
func (u *GenericConn) EnablePathMTUDiscovery() error {
	if !u.isV4Socket() {
		setV6DontFrag := func(fd uintptr) error {
			return unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_DONTFRAG, 1)
		}
		return u.controlFD(setV6DontFrag)
	}
	// Nothing to set for v4 on this platform.
	return nil
}

View File

@@ -335,6 +335,12 @@ func (u *RIOConn) Rebind() error {
return nil
}
// EnablePathMTUDiscovery is currently a no-op for the RIO socket path: it
// changes no socket options and always returns nil, so packets sent through
// RIOConn keep the default fragmentation behavior.
// NOTE(review): an earlier version of this comment described PMTUD as
// Linux-only; presumably RIO support would set IP_DONTFRAGMENT here — confirm
// before relying on PMTUD over RIO.
func (u *RIOConn) EnablePathMTUDiscovery() error {
return nil
}
// ReloadConfig ignores the supplied configuration and does nothing.
func (u *RIOConn) ReloadConfig(*config.C) {}
func (u *RIOConn) Close() error {

View File

@@ -152,6 +152,10 @@ func (u *TesterConn) Rebind() error {
return nil
}
// EnablePathMTUDiscovery does nothing for TesterConn and always returns nil.
func (u *TesterConn) EnablePathMTUDiscovery() error {
return nil
}
func (u *TesterConn) Close() error {
u.closeOnce.Do(func() {
close(u.done)

View File

@@ -9,6 +9,8 @@ import (
"net"
"net/netip"
"syscall"
"golang.org/x/sys/windows"
)
func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int) (Conn, error) {
@@ -44,3 +46,27 @@ func NewListenConfig(multi bool) net.ListenConfig {
// Rebind performs no work for GenericConn in this build and always returns nil.
func (u *GenericConn) Rebind() error {
return nil
}
// Windows IP_DONTFRAGMENT and IPV6_DONTFRAG are not exposed in the
// golang.org/x/sys/windows package. Defined locally per the values in
// ws2ipdef.h / ws2tcpip.h. These are stable Win32 constants that have not
// changed since at least Windows Vista.
//
// Both options share the numeric value 14; they are kept as separate names
// because they are used at different levels (IPPROTO_IP vs IPPROTO_IPV6).
const (
// winIPDontFragment is the IP_DONTFRAGMENT option, used at IPPROTO_IP.
winIPDontFragment = 14
// winIPv6DontFrag is the IPV6_DONTFRAG option, used at IPPROTO_IPV6.
winIPv6DontFrag = 14
)
// EnablePathMTUDiscovery turns on the don't-fragment bit for outbound packets.
// Windows uses IP_DONTFRAGMENT at IPPROTO_IP for v4 sockets and IPV6_DONTFRAG
// at IPPROTO_IPV6 for v6 sockets; the pair matching the socket is set to 1.
//
// This only covers the GenericConn fallback path. The RIO path (RIOConn) has
// its own EnablePathMTUDiscovery in udp_rio_windows.go, which is currently a
// no-op pending RIO-specific work.
//
// NOTE(review): a dual-stack v6 socket only receives the IPv6-level option
// here; confirm whether v4-mapped traffic also needs IP_DONTFRAGMENT.
//
// Returns the setsockopt error, or any error from obtaining the raw handle.
func (u *GenericConn) EnablePathMTUDiscovery() error {
	// Pick the option pair once, before touching the handle.
	level, opt := windows.IPPROTO_IPV6, winIPv6DontFrag
	if u.isV4Socket() {
		level, opt = windows.IPPROTO_IP, winIPDontFragment
	}
	return u.controlFD(func(fd uintptr) error {
		return windows.SetsockoptInt(windows.Handle(fd), level, opt, 1)
	})
}