PMTUD exploration, start small then grow

This commit is contained in:
Nate Brown
2026-05-05 17:05:50 -05:00
parent 33c2d7277c
commit 16a836a73f
33 changed files with 1036 additions and 11 deletions

623
pmtud_manager.go Normal file
View File

@@ -0,0 +1,623 @@
package nebula
import (
"context"
"encoding/binary"
"log/slog"
"math/rand/v2"
"net/netip"
"sync"
"sync/atomic"
"time"
"github.com/slackhq/nebula/config"
"github.com/slackhq/nebula/header"
"github.com/slackhq/nebula/overlay"
)
// PMTUD PoC: discover the path MTU per-tunnel via authenticated probes that ride
// the existing crypto session. We follow RFC 8899 PLPMTUD: a binary search
// between a known-good floor and a configured ceiling, with N consecutive probe
// losses at a size treated as "doesn't fit." Confirmed PMTU is pushed to the
// overlay device, which on Linux installs a per-host route with the discovered
// MTU. The kernel then surfaces EMSGSIZE / PTB to apps writing to the tun.
//
// Probe payload format (request):
//
//	[magic uint32 BE][probeID uint32 BE][padding 0x00...]
//
// Reply is a small ack with the same magic and probeID and no padding. We do not
// verify the reverse-path MTU; only the forward direction matters for the
// receiver's MTU on the inside.
const (
	// pmtudMagic is the ASCII bytes 'P','M','U','D'; it guards against
	// misinterpreting unrelated Test-message payloads as probe traffic.
	pmtudMagic uint32 = 0x504D5544 // 'P' 'M' 'U' 'D'
	pmtudFloor        = 1280       // IPv6 minimum payload, also a safe internet MTU floor
	// pmtudConverged is the bytes-tolerance for stopping the search.
	pmtudConverged = 8
	// pmtudMaxLoss matches RFC 8899 MAX_PROBES (default 3).
	pmtudMaxLoss = 3
	// pmtudProbeInterval is the time between probe ticks during the search phase.
	// Once a peer converges the wheel stops ticking it; re-validation is driven
	// by connection_manager via MaybeProbeAsTest at its natural test cadence.
	pmtudProbeInterval = 500 * time.Millisecond
	// pmtudWheelMax is the wheel's maximum supported scheduling duration. We
	// only ever schedule at pmtudProbeInterval today, but the wheel needs a
	// max greater than its tick to allocate its slot ring sensibly.
	pmtudWheelMax = 5 * time.Second
	// pmtudOverheadPessimistic assumes IPv6 underlay + relay framing:
	// IPv6(40) + UDP(8) + outer nebula(16) + outer AEAD tag(16)
	// + inner nebula(16) + inner AEAD tag(16) = 112 bytes.
	// TODO: track underlay address family and per-peer relay state on the HostInfo
	// so the manager can use the actual overhead for that tunnel and recover the
	// 32 bytes we pessimistically give up on direct IPv6 paths and the 52 bytes on
	// direct IPv4 paths.
	pmtudOverheadPessimistic = 112
	// pmtudUnsupportedAfter is the number of consecutive lost probes (across any
	// sizes) without ever receiving a reply that we treat as evidence the peer
	// does not understand the MTUDProbeRequest subtype (i.e. it's running an
	// older nebula). After this many failures with everReplied=false we mark the
	// peer pmtud-unsupported and stop scheduling probes. K is small enough that
	// it fires before the binary search would naturally converge to floor (which
	// would otherwise be ~30 wasted probes), but large enough to absorb a few
	// transient probe losses on a path that's just starting to settle.
	pmtudUnsupportedAfter = 5
)
// pmtudPeer tracks the binary-search state for one tunnel.
//
// All fields below mu are guarded by it; sizes are outer IP packet sizes
// unless noted otherwise.
type pmtudPeer struct {
	mu sync.Mutex
	// addr is the peer's primary overlay (vpn) address, used for per-host routes.
	addr netip.Addr
	// localIdx is the HostInfo.localIndexId this state belongs to.
	localIdx uint32
	// low is the largest outer IP packet size we have a confirmed ack for.
	// high is the smallest size we believe fails (the search ceiling to start).
	low, high int
	// inFlightSize is the outer IP packet size of the probe currently awaiting
	// an ack. 0 means no probe in flight.
	inFlightSize int
	// inFlightID matches the probeID echoed in the reply.
	inFlightID uint32
	// losses counts consecutive failures at inFlightSize.
	losses int
	// firstProbe is true until we have sent the first probe of a search. The
	// first probe targets the ceiling directly (RFC 8899 permits this Search
	// Algorithm choice); operators who set tun.max_mtu typically have a path
	// that supports it, so we converge in one probe in the common case.
	firstProbe bool
	// everReplied is true once we have ever received any MTUDProbeReply from
	// this peer. Combined with consecutiveFailures, this lets us detect peers
	// that don't understand the new subtype and stop probing them.
	everReplied bool
	// consecutiveFailures counts probes lost without an intervening reply.
	// Resets to 0 on any successful reply.
	consecutiveFailures int
	// unsupported is set true once we conclude the peer doesn't speak PMTUD.
	// The manager skips probes for unsupported peers.
	unsupported bool
	// converged means we have a confirmed PMTU and are in the slow re-validation phase.
	converged bool
	// applied is the inner MTU we last pushed to the overlay device (0 if never).
	applied int
}
// overhead reports the outer-minus-inner framing bytes assumed for this peer's
// tunnel. Today it is always the pessimistic worst case.
func (p *pmtudPeer) overhead() int {
	// TODO: branch on actual underlay family + relay state for this peer.
	return pmtudOverheadPessimistic
}
// midpoint returns the next binary-search probe size, halfway between the
// confirmed low and the believed-failing high.
func (p *pmtudPeer) midpoint() int {
	// Equivalent to (low+high)/2 for non-negative ints, without the sum.
	return p.low + (p.high-p.low)/2
}
// pmtudManager owns all per-peer PMTUD search state and the probe scheduler.
type pmtudManager struct {
	// intf provides the hostmap, outside writers, and the send path.
	// NOTE(review): not set in newPMTUDManagerFromConfig — presumably wired up
	// elsewhere before Start/tick run; verify against callers.
	intf   *Interface
	device overlay.Device
	// peers is keyed by HostInfo.localIndexId.
	peers sync.Map // map[uint32]*pmtudPeer
	// wheel schedules per-peer probe ticks, keyed by localIndexId.
	wheel *LockingTimerWheel[uint32]
	// floor is the always-safe inner MTU (= tun.mtu). Per-peer routes start here
	// on tunnel-up so unprobed traffic is always small enough to fit. Stored as
	// atomic int64 so reload can update it without coordinating with the readers
	// in tick/HandleReply/OnTunnelUp.
	floor atomic.Int64
	// ceiling is the search ceiling expressed as an outer IP packet size, derived
	// from tun.max_mtu (which is the kernel's device MTU on the tun) plus our
	// pessimistic overhead. PMTUD will not probe larger than this.
	ceiling atomic.Int64
	// enabled gates every public entry point; toggled by reload.
	enabled atomic.Bool
	l       *slog.Logger
}
// newPMTUDManagerFromConfig constructs the manager, registers a config reload
// hook, and applies the initial tun.mtu / tun.max_mtu snapshot.
func newPMTUDManagerFromConfig(l *slog.Logger, c *config.C, device overlay.Device) *pmtudManager {
	mgr := &pmtudManager{
		l:      l,
		device: device,
		wheel:  NewLockingTimerWheel[uint32](pmtudProbeInterval, pmtudWheelMax),
	}
	c.RegisterReloadCallback(func(c *config.C) { mgr.reload(c, false) })
	mgr.reload(c, true)
	return mgr
}
// reload applies tun.mtu / tun.max_mtu changes to the manager. On the initial
// call (during construction) it just snapshots state; on a live reload it also
// transitions in-flight peers to match the new bounds: clearing per-peer routes
// when newly disabled, seeding peers from the hostmap and flipping DF on
// outside sockets when newly enabled, and rebounding existing searches in
// place when only the ceiling moved. A live reload that requests pmtud on a
// platform without per-peer MTU routes now logs the same warning the initial
// path does, instead of staying silent.
func (m *pmtudManager) reload(c *config.C, initial bool) {
	if !initial && !c.HasChanged("tun.mtu") && !c.HasChanged("tun.max_mtu") {
		return
	}
	floor := c.GetInt("tun.mtu", overlay.DefaultMTU)
	maxMTU := c.GetInt("tun.max_mtu", 0)
	enable := maxMTU > floor && m.device.SupportsPerPeerMTU()
	var ceiling int
	if enable {
		ceiling = maxMTU + pmtudOverheadPessimistic
	}
	if initial {
		m.floor.Store(int64(floor))
		m.ceiling.Store(int64(ceiling))
		m.enabled.Store(enable)
		switch {
		case enable:
			m.l.Info("pmtud enabled", "floor", floor, "ceiling", ceiling, "tun.max_mtu", maxMTU)
		case maxMTU > floor:
			m.l.Warn("pmtud disabled: this platform does not yet support per-peer MTU routes",
				"tun.max_mtu", maxMTU)
		}
		return
	}
	wasEnabled := m.enabled.Load()
	m.floor.Store(int64(floor))
	m.ceiling.Store(int64(ceiling))
	m.enabled.Store(enable)
	switch {
	case wasEnabled && !enable:
		m.disableLive(floor, maxMTU)
	case !wasEnabled && enable:
		m.enableLive(floor, ceiling, maxMTU)
	case wasEnabled && enable:
		m.reboundLive(floor, ceiling, maxMTU)
	case maxMTU > floor:
		// Still disabled despite tun.max_mtu > tun.mtu: the only way here is a
		// platform without per-peer MTU routes. Mirror the construction-time
		// warning so a live config edit isn't silently ignored.
		m.l.Warn("pmtud disabled: this platform does not yet support per-peer MTU routes",
			"tun.max_mtu", maxMTU)
	}
}
// disableLive tears down every per-peer MTU override and forgets all peer
// state. DF on the outside sockets is deliberately left alone: once enabled it
// stays on for the life of the process, so an operator who wants the historical
// no-DF behavior back must restart.
func (m *pmtudManager) disableLive(floor, maxMTU int) {
	m.peers.Range(func(key, val any) bool {
		peer := val.(*pmtudPeer)
		peer.mu.Lock()
		prevApplied, addr := peer.applied, peer.addr
		peer.applied = 0
		peer.mu.Unlock()
		if prevApplied != 0 {
			// Clear the per-host route so the device default applies again.
			if err := m.device.SetPeerMTU(addr, 0); err != nil {
				m.l.Warn("pmtud: failed to clear per-peer mtu on disable", "addr", addr, "error", err)
			}
		}
		m.peers.Delete(key)
		return true
	})
	m.l.Info("pmtud disabled (tun.max_mtu <= tun.mtu)", "tun.mtu", floor, "tun.max_mtu", maxMTU)
}
// enableLive turns DF on for every outside socket and announces the feature.
// Existing tunnels are not pre-seeded here: connection_manager's regular test
// cadence calls MaybeProbeAsTest per peer, which seeds on miss and hands the
// search to the wheel. Tunnels established after this point take the
// OnTunnelUp fast path as usual.
func (m *pmtudManager) enableLive(floor, ceiling, maxMTU int) {
	m.enableDF()
	m.l.Info("pmtud enabled", "floor", floor, "ceiling", ceiling, "tun.max_mtu", maxMTU)
}
// reboundLive resets each peer's search state to the new bounds. Peers whose
// confirmed PMTU still fits under the new ceiling keep their applied route in
// place during the new search; peers whose confirmed PMTU exceeds the new
// ceiling get cleared back to floor and re-search from scratch. The unsupported
// flag is preserved because peer software version doesn't change on reload.
func (m *pmtudManager) reboundLive(floor, ceiling, maxMTU int) {
	overhead := pmtudOverheadPessimistic
	m.peers.Range(func(k, v any) bool {
		p := v.(*pmtudPeer)
		p.mu.Lock()
		// Applied route no longer fits under the new ceiling: clear it so the
		// per-vpn-network floor route takes over until the search confirms a size.
		if p.applied > 0 && p.applied+overhead > ceiling {
			if err := m.device.SetPeerMTU(p.addr, 0); err != nil {
				m.l.Warn("pmtud: failed to clear per-peer mtu on rebound", "addr", p.addr, "error", err)
			} else {
				// Only forget the applied value once the route is actually gone.
				p.applied = 0
			}
		}
		// Restart the binary search over the new (floor, ceiling) window.
		p.low = floor + overhead
		p.high = ceiling
		p.inFlightSize = 0
		p.inFlightID = 0
		p.losses = 0
		// Unsupported peers skip the first-probe fast path; they won't be probed.
		p.firstProbe = !p.unsupported
		p.converged = false
		idx := p.localIdx
		unsupported := p.unsupported
		p.mu.Unlock()
		// Schedule outside the lock; the wheel has its own locking.
		if !unsupported {
			m.wheel.Add(idx, pmtudProbeInterval)
		}
		return true
	})
	m.l.Info("pmtud reloaded", "floor", floor, "ceiling", ceiling, "tun.max_mtu", maxMTU)
}
// enableDF asks every outside socket to set the don't-fragment bit on outbound
// packets. Idempotent: safe to call from both Start (initial enable) and from a
// live reload that flips pmtud on.
func (m *pmtudManager) enableDF() {
	for idx, sock := range m.intf.writers {
		err := sock.EnablePathMTUDiscovery()
		if err == nil {
			continue
		}
		m.l.Warn("pmtud: failed to enable path mtu discovery on outside socket; pmtud will not work correctly",
			"writer", idx, "error", err)
	}
}
// Start runs the probe scheduler until ctx is done. The loop runs even when PMTUD
// is disabled at startup so a hot reload can turn it on without restarting nebula.
//
// When PMTUD is enabled at startup we ask each outside socket to enable
// path-MTU discovery (DF on every send). This is intentionally gated on the
// feature being on so that operators who haven't opted in keep the historical
// behavior where the kernel may fragment outbound nebula UDP packets. A live
// reload from disabled to enabled will also flip DF on via enableLive; the
// reverse direction does not turn DF off, so flipping pmtud back off live
// keeps DF on until restart.
func (m *pmtudManager) Start(ctx context.Context) {
	if m.enabled.Load() {
		m.enableDF()
	}
	ticker := time.NewTicker(m.wheel.t.tickDuration)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case now := <-ticker.C:
			// Move the wheel forward, then drain every peer index that came due.
			m.wheel.Advance(now)
			for idx, due := m.wheel.Purge(); due; idx, due = m.wheel.Purge() {
				m.tick(idx)
			}
		}
	}
}
// OnTunnelUp is called when a HostInfo becomes traffic-watched. The kernel
// already routes packets to this peer through the per-vpn-network route (mtu =
// tun.mtu), so the floor is in effect implicitly. We just kick off the search
// here; HandleReply will install a per-host /32 (or /128) route once a larger
// size is confirmed.
func (m *pmtudManager) OnTunnelUp(hi *HostInfo) {
	if m.enabled.Load() {
		m.seedPeer(hi)
	}
}
// seedPeer is the shared body of OnTunnelUp and the live-reload enable path.
// LoadOrStore protects against double-seeding the same localIndexId from a
// race between OnTunnelUp and a reload-driven hostmap walk.
func (m *pmtudManager) seedPeer(hi *HostInfo) {
	if hi == nil || len(hi.vpnAddrs) == 0 {
		return
	}
	fresh := &pmtudPeer{
		addr:       hi.vpnAddrs[0],
		localIdx:   hi.localIndexId,
		low:        int(m.floor.Load()) + pmtudOverheadPessimistic,
		high:       int(m.ceiling.Load()),
		firstProbe: true,
	}
	// Only the goroutine that actually stored the entry schedules the wheel.
	if _, alreadySeeded := m.peers.LoadOrStore(hi.localIndexId, fresh); alreadySeeded {
		return
	}
	m.wheel.Add(hi.localIndexId, pmtudProbeInterval)
}
// OnTunnelDown is called when a HostInfo is being torn down. Removes any per-host
// MTU override so the device default applies again.
func (m *pmtudManager) OnTunnelDown(hi *HostInfo) {
	if hi == nil {
		return
	}
	stored, found := m.peers.LoadAndDelete(hi.localIndexId)
	if !found {
		return
	}
	peer := stored.(*pmtudPeer)
	peer.mu.Lock()
	prevApplied, addr := peer.applied, peer.addr
	peer.applied = 0
	peer.mu.Unlock()
	if prevApplied == 0 {
		return
	}
	if err := m.device.SetPeerMTU(addr, 0); err != nil {
		m.l.Warn("pmtud: failed to clear per-peer mtu", "addr", addr, "error", err)
	}
}
// OnRoam is called when a HostInfo's remote underlay address changes. The path
// MTU may now be different; drop the per-host route so the kernel falls back to
// the per-vpn-network route (mtu = tun.mtu floor), then restart the search.
// We do not reset the unsupported flag: peer software version doesn't change on
// roam, so once we've decided a peer doesn't speak PMTUD we stay decided.
func (m *pmtudManager) OnRoam(hi *HostInfo) {
	if !m.enabled.Load() || hi == nil {
		return
	}
	v, ok := m.peers.Load(hi.localIndexId)
	if !ok {
		return
	}
	p := v.(*pmtudPeer)
	p.mu.Lock()
	if p.unsupported {
		p.mu.Unlock()
		return
	}
	// Restart the binary search from the current floor/ceiling bounds.
	p.low = int(m.floor.Load()) + pmtudOverheadPessimistic
	p.high = int(m.ceiling.Load())
	p.inFlightSize = 0
	p.inFlightID = 0
	p.losses = 0
	p.consecutiveFailures = 0
	p.firstProbe = true
	p.converged = false
	// Drop the per-host route; the new path may not support the old size.
	if p.applied != 0 {
		if err := m.device.SetPeerMTU(p.addr, 0); err != nil {
			m.l.Warn("pmtud: failed to clear per-peer mtu on roam", "addr", p.addr, "error", err)
		} else {
			// Only forget the applied value once the route is actually gone.
			p.applied = 0
		}
	}
	p.mu.Unlock()
	// Reschedule outside the lock; the wheel has its own locking.
	m.wheel.Add(hi.localIndexId, pmtudProbeInterval)
}
// MaybeProbeAsTest is called by connection_manager when it would otherwise send
// a TestRequest because a tunnel has gone silent. If we have a confirmed PMTU
// for this peer that's larger than the floor, we send a probe at that size
// instead. The reply confirms both liveness (consumed by connection_manager via
// the existing inbound traffic accounting fallthrough in outside.go) and that
// the confirmed PMTU still fits (consumed by HandleReply here). One synthetic
// packet does the work of two.
//
// Returns true if a probe was sent. False means the caller should send a
// regular TestRequest at the floor.
//
// On probe failure, connection_manager's existing pendingDeletion timeout will
// tear the tunnel down. Heavy hammer, but correct: a re-handshake re-runs PMTUD
// discovery against the now-shrunken path. A future EMSGSIZE-capture followup
// can replace this with a soft-drop-and-research flow.
func (m *pmtudManager) MaybeProbeAsTest(hi *HostInfo) bool {
	if !m.enabled.Load() || hi == nil {
		return false
	}
	v, ok := m.peers.Load(hi.localIndexId)
	if !ok {
		// Tunnel pre-dates the manager being aware of it (e.g. pmtud was just
		// enabled live, or AddTrafficWatch fired before this call). Seed the
		// peer so the wheel picks up the search; let connection_manager send
		// its regular TestRequest this cycle.
		m.seedPeer(hi)
		return false
	}
	p := v.(*pmtudPeer)
	p.mu.Lock()
	// applied == 0 means no confirmed PMTU above the floor; the caller's
	// regular TestRequest covers liveness in that case.
	if p.unsupported || p.applied == 0 {
		p.mu.Unlock()
		return false
	}
	// Re-validate at the confirmed size: applied is an inner MTU, so add the
	// tunnel overhead back to get the outer probe size.
	overhead := p.overhead()
	size := p.applied + overhead
	id := rand.Uint32()
	// NOTE(review): this overwrites any in-flight search probe's size/ID;
	// presumably fine because applied != 0 implies the search has converged
	// and the wheel is idle — confirm against tick's scheduling.
	p.inFlightSize = size
	p.inFlightID = id
	p.mu.Unlock()
	// Send outside the lock; sendProbe does I/O.
	m.sendProbe(hi, size, id, overhead)
	return true
}
// HandleReply consumes an MTUDProbeReply payload from the receive path.
//
// Any well-formed reply from a known peer proves the peer understands the
// MTUDProbeRequest subtype, so everReplied is recorded even when the probeID is
// stale (e.g. the reply arrived after the loss timeout already retired that
// probe). Previously everReplied was only set after the probeID match, so a
// peer whose replies consistently arrived late could be wrongly marked
// unsupported; now only true silence trips that detector.
func (m *pmtudManager) HandleReply(localIdx uint32, payload []byte) {
	if !m.enabled.Load() {
		return
	}
	// Minimum payload is [magic uint32 BE][probeID uint32 BE].
	if len(payload) < 8 {
		return
	}
	if binary.BigEndian.Uint32(payload[0:4]) != pmtudMagic {
		return
	}
	id := binary.BigEndian.Uint32(payload[4:8])
	v, ok := m.peers.Load(localIdx)
	if !ok {
		return
	}
	p := v.(*pmtudPeer)
	p.mu.Lock()
	defer p.mu.Unlock()
	// Even a stale or duplicate reply is evidence the peer speaks PMTUD.
	p.everReplied = true
	if p.inFlightSize == 0 || p.inFlightID != id {
		return
	}
	confirmed := p.inFlightSize
	p.low = confirmed
	p.inFlightSize = 0
	p.losses = 0
	p.consecutiveFailures = 0
	innerMTU := confirmed - p.overhead()
	// Only install a /32 override when it would actually raise the MTU above the
	// per-vpn-network floor route. If the discovered MTU is <= floor, the /24
	// already covers it; installing a /32 at floor would just create roam churn.
	if innerMTU > int(m.floor.Load()) && p.applied != innerMTU {
		if err := m.device.SetPeerMTU(p.addr, innerMTU); err != nil {
			m.l.Warn("pmtud: failed to apply per-peer mtu", "addr", p.addr, "innerMTU", innerMTU, "error", err)
		} else {
			m.l.Info("pmtud probe confirmed",
				"addr", p.addr,
				"outerMTU", confirmed,
				"innerMTU", innerMTU,
				"low", p.low,
				"high", p.high,
			)
			p.applied = innerMTU
		}
	}
	// Converged once the remaining search window is within tolerance.
	p.converged = p.high-p.low <= pmtudConverged
}
// tick handles one wheel firing for a single peer: account for a possible probe
// loss, decide whether the peer is unsupported / gone / converged, and
// otherwise send the next probe of the binary search and reschedule.
func (m *pmtudManager) tick(localIdx uint32) {
	v, ok := m.peers.Load(localIdx)
	if !ok {
		return
	}
	p := v.(*pmtudPeer)
	p.mu.Lock()
	if p.unsupported {
		p.mu.Unlock()
		return
	}
	// If a probe was outstanding, this tick is the loss timeout.
	if p.inFlightSize != 0 {
		p.losses++
		p.consecutiveFailures++
		if p.losses >= pmtudMaxLoss {
			// pmtudMaxLoss consecutive losses at this size: treat it as too
			// big and make it the new search ceiling.
			p.high = p.inFlightSize
			p.inFlightSize = 0
			p.losses = 0
			if p.high-p.low <= pmtudConverged {
				p.converged = true
			}
		}
	}
	// If we've never gotten a reply from this peer and we've burned through our
	// failure budget, conclude the peer doesn't understand the MTUDProbeRequest
	// subtype and stop scheduling probes for it.
	if !p.everReplied && p.consecutiveFailures >= pmtudUnsupportedAfter {
		p.unsupported = true
		addr := p.addr
		p.mu.Unlock()
		m.l.Info("pmtud: peer not responding to probes, marking unsupported",
			"addr", addr, "failures", pmtudUnsupportedAfter)
		return
	}
	// The tunnel may have gone away since this tick was scheduled.
	hi := m.intf.hostMap.QueryIndex(localIdx)
	if hi == nil {
		p.mu.Unlock()
		m.peers.Delete(localIdx)
		return
	}
	// Once a peer converges, the wheel stops scheduling for it. Re-validation
	// (and the resulting black hole detection) is driven by connection_manager
	// via MaybeProbeAsTest at its natural test cadence, so a converged peer
	// has nothing for the wheel to do until OnRoam or a tunnel down/up cycle
	// triggers a fresh search.
	if p.converged {
		p.mu.Unlock()
		return
	}
	ceiling := int(m.ceiling.Load())
	var size int
	switch {
	case p.firstProbe:
		// Probe the ceiling directly. If the path supports it (the common case
		// when an operator has explicitly configured tun.max_mtu), we converge
		// in one round trip. If it fails, the standard binary search resumes
		// on the next tick from the (low, ceiling) bounds.
		size = ceiling
		p.firstProbe = false
	case p.losses > 0 && p.inFlightSize != 0:
		// Probe lost but budget remains: retry the same size.
		size = p.inFlightSize
	default:
		// Standard binary-search step between confirmed low and failing high.
		size = p.midpoint()
	}
	// Clamp to the always-safe floor and the configured ceiling.
	if size < pmtudFloor {
		size = pmtudFloor
	}
	if size > ceiling {
		size = ceiling
	}
	id := rand.Uint32()
	p.inFlightSize = size
	p.inFlightID = id
	overhead := p.overhead()
	// Drop the lock before the send and the wheel re-add; both do their own locking/I/O.
	p.mu.Unlock()
	m.sendProbe(hi, size, id, overhead)
	m.wheel.Add(localIdx, pmtudProbeInterval)
}
// sendProbe builds an MTUDProbeRequest whose resulting outer IP packet is
// approximately `outerSize` bytes, then hands it to the send path.
func (m *pmtudManager) sendProbe(hi *HostInfo, outerSize int, id uint32, overhead int) {
	// Request layout: [magic uint32 BE][probeID uint32 BE][zero padding...].
	// Clamp so there is always room for magic + probeID.
	padded := outerSize - overhead
	if padded < 8 {
		padded = 8
	}
	payload := make([]byte, padded)
	binary.BigEndian.PutUint32(payload[:4], pmtudMagic)
	binary.BigEndian.PutUint32(payload[4:8], id)
	// The rest of payload stays zero-padded.
	nb := make([]byte, 12)
	out := make([]byte, outerSize+128) // headroom for header/tag/relay framing
	m.intf.SendMessageToHostInfo(header.Test, header.MTUDProbeRequest, hi, payload, nb, out)
}