package nebula import ( "context" "encoding/binary" "log/slog" "math/rand/v2" "net/netip" "sync" "sync/atomic" "time" "github.com/slackhq/nebula/config" "github.com/slackhq/nebula/header" "github.com/slackhq/nebula/overlay" ) // PMTUD PoC: discover the path MTU per-tunnel via authenticated probes that ride // the existing crypto session. We follow RFC 8899 PLPMTUD: a binary search // between a known-good floor and a configured ceiling, with N consecutive probe // losses at a size treated as "doesn't fit." Confirmed PMTU is pushed to the // overlay device, which on Linux installs a per-host route with the discovered // MTU. The kernel then surfaces EMSGSIZE / PTB to apps writing to the tun. // // Probe payload format (request): // // [magic uint32 BE][probeID uint32 BE][padding 0x00...] // // Reply is a small ack with the same magic and probeID and no padding. We do not // verify the reverse-path MTU; only the forward direction matters for the // receiver's MTU on the inside. const ( pmtudMagic uint32 = 0x504D5544 // 'P' 'M' 'U' 'D' pmtudFloor = 1280 // IPv6 minimum payload, also a safe internet MTU floor // pmtudConverged is the bytes-tolerance for stopping the search. pmtudConverged = 8 // pmtudMaxLoss matches RFC 8899 MAX_PROBES (default 3). pmtudMaxLoss = 3 // pmtudProbeInterval is the time between probe ticks during the search phase. // Once a peer converges the wheel stops ticking it; re-validation is driven // by connection_manager via MaybeProbeAsTest at its natural test cadence. pmtudProbeInterval = 500 * time.Millisecond // pmtudWheelMax is the wheel's maximum supported scheduling duration. We // only ever schedule at pmtudProbeInterval today, but the wheel needs a // max greater than its tick to allocate its slot ring sensibly. pmtudWheelMax = 5 * time.Second // pmtudOverheadPessimistic assumes IPv6 underlay + relay framing: // IPv6(40) + UDP(8) + outer nebula(16) + outer AEAD tag(16) // + inner nebula(16) + inner AEAD tag(16) = 112 bytes. 
// TODO: track underlay address family and per-peer relay state on the HostInfo // so the manager can use the actual overhead for that tunnel and recover the // 32 bytes we pessimistically give up on direct IPv6 paths and the 52 bytes on // direct IPv4 paths. pmtudOverheadPessimistic = 112 // pmtudUnsupportedAfter is the number of consecutive lost probes (across any // sizes) without ever receiving a reply that we treat as evidence the peer // does not understand the MTUDProbeRequest subtype (i.e. it's running an // older nebula). After this many failures with everReplied=false we mark the // peer pmtud-unsupported and stop scheduling probes. K is small enough that // it fires before the binary search would naturally converge to floor (which // would otherwise be ~30 wasted probes), but large enough to absorb a few // transient probe losses on a path that's just starting to settle. pmtudUnsupportedAfter = 5 ) // pmtudPeer tracks the binary-search state for one tunnel. type pmtudPeer struct { mu sync.Mutex addr netip.Addr localIdx uint32 // low is the largest outer IP packet size we have a confirmed ack for. // high is the smallest size we believe fails (the search ceiling to start). low, high int // inFlightSize is the outer IP packet size of the probe currently awaiting // an ack. 0 means no probe in flight. inFlightSize int // inFlightID matches the probeID echoed in the reply. inFlightID uint32 // losses counts consecutive failures at inFlightSize. losses int // firstProbe is true until we have sent the first probe of a search. The // first probe targets the ceiling directly (RFC 8899 permits this Search // Algorithm choice); operators who set tun.max_mtu typically have a path // that supports it, so we converge in one probe in the common case. firstProbe bool // everReplied is true once we have ever received any MTUDProbeReply from // this peer. 
Combined with consecutiveFailures, this lets us detect peers // that don't understand the new subtype and stop probing them. everReplied bool // consecutiveFailures counts probes lost without an intervening reply. // Resets to 0 on any successful reply. consecutiveFailures int // unsupported is set true once we conclude the peer doesn't speak PMTUD. // The manager skips probes for unsupported peers. unsupported bool // converged means we have a confirmed PMTU and are in the slow re-validation phase. converged bool // applied is the inner MTU we last pushed to the overlay device (0 if never). applied int } func (p *pmtudPeer) overhead() int { // TODO: branch on actual underlay family + relay state for this peer. return pmtudOverheadPessimistic } func (p *pmtudPeer) midpoint() int { return (p.low + p.high) / 2 } type pmtudManager struct { intf *Interface device overlay.Device // peers is keyed by HostInfo.localIndexId. peers sync.Map // map[uint32]*pmtudPeer wheel *LockingTimerWheel[uint32] // floor is the always-safe inner MTU (= tun.mtu). Per-peer routes start here // on tunnel-up so unprobed traffic is always small enough to fit. Stored as // atomic int64 so reload can update it without coordinating with the readers // in tick/HandleReply/OnTunnelUp. floor atomic.Int64 // ceiling is the search ceiling expressed as an outer IP packet size, derived // from tun.max_mtu (which is the kernel's device MTU on the tun) plus our // pessimistic overhead. PMTUD will not probe larger than this. ceiling atomic.Int64 enabled atomic.Bool l *slog.Logger } func newPMTUDManagerFromConfig(l *slog.Logger, c *config.C, device overlay.Device) *pmtudManager { m := &pmtudManager{ device: device, wheel: NewLockingTimerWheel[uint32](pmtudProbeInterval, pmtudWheelMax), l: l, } c.RegisterReloadCallback(func(c *config.C) { m.reload(c, false) }) m.reload(c, true) return m } // reload applies tun.mtu / tun.max_mtu changes to the manager. 
On the initial // call (during construction) it just snapshots state; on a live reload it also // transitions in-flight peers to match the new bounds: clearing per-peer routes // when newly disabled, seeding peers from the hostmap and flipping DF on // outside sockets when newly enabled, and rebounding existing searches in // place when only the ceiling moved. func (m *pmtudManager) reload(c *config.C, initial bool) { if !initial && !c.HasChanged("tun.mtu") && !c.HasChanged("tun.max_mtu") { return } floor := c.GetInt("tun.mtu", overlay.DefaultMTU) maxMTU := c.GetInt("tun.max_mtu", 0) enable := maxMTU > floor && m.device.SupportsPerPeerMTU() var ceiling int if enable { ceiling = maxMTU + pmtudOverheadPessimistic } if initial { m.floor.Store(int64(floor)) m.ceiling.Store(int64(ceiling)) m.enabled.Store(enable) switch { case enable: m.l.Info("pmtud enabled", "floor", floor, "ceiling", ceiling, "tun.max_mtu", maxMTU) case maxMTU > floor: m.l.Warn("pmtud disabled: this platform does not yet support per-peer MTU routes", "tun.max_mtu", maxMTU) } return } wasEnabled := m.enabled.Load() m.floor.Store(int64(floor)) m.ceiling.Store(int64(ceiling)) m.enabled.Store(enable) switch { case wasEnabled && !enable: m.disableLive(floor, maxMTU) case !wasEnabled && enable: m.enableLive(floor, ceiling, maxMTU) case wasEnabled && enable: m.reboundLive(floor, ceiling, maxMTU) } } // disableLive clears per-peer routes and drops all peer state. We do not // disable DF on the outside sockets; once on, it stays on for the life of the // process. Operators flipping pmtud off live get correct routing behavior; if // they want the historical no-DF behavior back they need to restart. 
func (m *pmtudManager) disableLive(floor, maxMTU int) { m.peers.Range(func(k, v any) bool { p := v.(*pmtudPeer) p.mu.Lock() applied := p.applied addr := p.addr p.applied = 0 p.mu.Unlock() if applied != 0 { if err := m.device.SetPeerMTU(addr, 0); err != nil { m.l.Warn("pmtud: failed to clear per-peer mtu on disable", "addr", addr, "error", err) } } m.peers.Delete(k) return true }) m.l.Info("pmtud disabled (tun.max_mtu <= tun.mtu)", "tun.mtu", floor, "tun.max_mtu", maxMTU) } // enableLive flips DF on every outside socket. We don't pre-seed existing // tunnels here; connection_manager's normal test cadence will eventually call // MaybeProbeAsTest for each peer, which seeds on miss and lets the wheel pick // up the search from there. New tunnels established after this point still // take the OnTunnelUp fast path. func (m *pmtudManager) enableLive(floor, ceiling, maxMTU int) { m.enableDF() m.l.Info("pmtud enabled", "floor", floor, "ceiling", ceiling, "tun.max_mtu", maxMTU) } // reboundLive resets each peer's search state to the new bounds. Peers whose // confirmed PMTU still fits under the new ceiling keep their applied route in // place during the new search; peers whose confirmed PMTU exceeds the new // ceiling get cleared back to floor and re-search from scratch. The unsupported // flag is preserved because peer software version doesn't change on reload. 
func (m *pmtudManager) reboundLive(floor, ceiling, maxMTU int) { overhead := pmtudOverheadPessimistic m.peers.Range(func(k, v any) bool { p := v.(*pmtudPeer) p.mu.Lock() if p.applied > 0 && p.applied+overhead > ceiling { if err := m.device.SetPeerMTU(p.addr, 0); err != nil { m.l.Warn("pmtud: failed to clear per-peer mtu on rebound", "addr", p.addr, "error", err) } else { p.applied = 0 } } p.low = floor + overhead p.high = ceiling p.inFlightSize = 0 p.inFlightID = 0 p.losses = 0 p.firstProbe = !p.unsupported p.converged = false idx := p.localIdx unsupported := p.unsupported p.mu.Unlock() if !unsupported { m.wheel.Add(idx, pmtudProbeInterval) } return true }) m.l.Info("pmtud reloaded", "floor", floor, "ceiling", ceiling, "tun.max_mtu", maxMTU) } // enableDF asks every outside socket to set the don't-fragment bit on outbound // packets. Idempotent: safe to call from both Start (initial enable) and from a // live reload that flips pmtud on. func (m *pmtudManager) enableDF() { for i, w := range m.intf.writers { if err := w.EnablePathMTUDiscovery(); err != nil { m.l.Warn("pmtud: failed to enable path mtu discovery on outside socket; pmtud will not work correctly", "writer", i, "error", err) } } } // Start runs the probe scheduler until ctx is done. The loop runs even when PMTUD // is disabled at startup so a hot reload can turn it on without restarting nebula. // // When PMTUD is enabled at startup we ask each outside socket to enable // path-MTU discovery (DF on every send). This is intentionally gated on the // feature being on so that operators who haven't opted in keep the historical // behavior where the kernel may fragment outbound nebula UDP packets. A live // reload from disabled to enabled will also flip DF on via enableLive; the // reverse direction does not turn DF off, so flipping pmtud back off live // keeps DF on until restart. 
func (m *pmtudManager) Start(ctx context.Context) { if m.enabled.Load() { m.enableDF() } ticker := time.NewTicker(m.wheel.t.tickDuration) defer ticker.Stop() for { select { case <-ctx.Done(): return case now := <-ticker.C: m.wheel.Advance(now) for { idx, has := m.wheel.Purge() if !has { break } m.tick(idx) } } } } // OnTunnelUp is called when a HostInfo becomes traffic-watched. The kernel // already routes packets to this peer through the per-vpn-network route (mtu = // tun.mtu), so the floor is in effect implicitly. We just kick off the search // here; HandleReply will install a per-host /32 (or /128) route once a larger // size is confirmed. func (m *pmtudManager) OnTunnelUp(hi *HostInfo) { if !m.enabled.Load() { return } m.seedPeer(hi) } // seedPeer is the shared body of OnTunnelUp and the live-reload enable path. // LoadOrStore protects against double-seeding the same localIndexId from a // race between OnTunnelUp and a reload-driven hostmap walk. func (m *pmtudManager) seedPeer(hi *HostInfo) { if hi == nil || len(hi.vpnAddrs) == 0 { return } floor := int(m.floor.Load()) ceiling := int(m.ceiling.Load()) p := &pmtudPeer{ addr: hi.vpnAddrs[0], localIdx: hi.localIndexId, low: floor + pmtudOverheadPessimistic, high: ceiling, firstProbe: true, } if _, loaded := m.peers.LoadOrStore(hi.localIndexId, p); loaded { return } m.wheel.Add(hi.localIndexId, pmtudProbeInterval) } // OnTunnelDown is called when a HostInfo is being torn down. Removes any per-host // MTU override so the device default applies again. func (m *pmtudManager) OnTunnelDown(hi *HostInfo) { if hi == nil { return } v, ok := m.peers.LoadAndDelete(hi.localIndexId) if !ok { return } p := v.(*pmtudPeer) p.mu.Lock() applied := p.applied addr := p.addr p.applied = 0 p.mu.Unlock() if applied != 0 { if err := m.device.SetPeerMTU(addr, 0); err != nil { m.l.Warn("pmtud: failed to clear per-peer mtu", "addr", addr, "error", err) } } } // OnRoam is called when a HostInfo's remote underlay address changes. 
The path // MTU may now be different; drop the per-host route so the kernel falls back to // the per-vpn-network route (mtu = tun.mtu floor), then restart the search. // We do not reset the unsupported flag: peer software version doesn't change on // roam, so once we've decided a peer doesn't speak PMTUD we stay decided. func (m *pmtudManager) OnRoam(hi *HostInfo) { if !m.enabled.Load() || hi == nil { return } v, ok := m.peers.Load(hi.localIndexId) if !ok { return } p := v.(*pmtudPeer) p.mu.Lock() if p.unsupported { p.mu.Unlock() return } p.low = int(m.floor.Load()) + pmtudOverheadPessimistic p.high = int(m.ceiling.Load()) p.inFlightSize = 0 p.inFlightID = 0 p.losses = 0 p.consecutiveFailures = 0 p.firstProbe = true p.converged = false if p.applied != 0 { if err := m.device.SetPeerMTU(p.addr, 0); err != nil { m.l.Warn("pmtud: failed to clear per-peer mtu on roam", "addr", p.addr, "error", err) } else { p.applied = 0 } } p.mu.Unlock() m.wheel.Add(hi.localIndexId, pmtudProbeInterval) } // MaybeProbeAsTest is called by connection_manager when it would otherwise send // a TestRequest because a tunnel has gone silent. If we have a confirmed PMTU // for this peer that's larger than the floor, we send a probe at that size // instead. The reply confirms both liveness (consumed by connection_manager via // the existing inbound traffic accounting fallthrough in outside.go) and that // the confirmed PMTU still fits (consumed by HandleReply here). One synthetic // packet does the work of two. // // Returns true if a probe was sent. False means the caller should send a // regular TestRequest at the floor. // // On probe failure, connection_manager's existing pendingDeletion timeout will // tear the tunnel down. Heavy hammer, but correct: a re-handshake re-runs PMTUD // discovery against the now-shrunken path. A future EMSGSIZE-capture followup // can replace this with a soft-drop-and-research flow. 
func (m *pmtudManager) MaybeProbeAsTest(hi *HostInfo) bool {
	if !m.enabled.Load() || hi == nil {
		return false
	}
	v, ok := m.peers.Load(hi.localIndexId)
	if !ok {
		// Tunnel pre-dates the manager being aware of it (e.g. pmtud was just
		// enabled live, or AddTrafficWatch fired before this call). Seed the
		// peer so the wheel picks up the search; let connection_manager send
		// its regular TestRequest this cycle.
		m.seedPeer(hi)
		return false
	}
	p := v.(*pmtudPeer)
	p.mu.Lock()
	// Only re-validate once a PMTU above the floor has actually been applied;
	// otherwise a plain TestRequest at the floor is the right liveness check.
	if p.unsupported || p.applied == 0 {
		p.mu.Unlock()
		return false
	}
	overhead := p.overhead()
	// Probe at exactly the confirmed size: applied inner MTU plus the assumed
	// framing overhead gives the outer packet size we previously acked.
	size := p.applied + overhead
	id := rand.Uint32()
	// NOTE(review): this overwrites any search probe currently in flight
	// (its reply will then fail the inFlightID match in HandleReply). Looks
	// intentional/benign since applied>0 means that size was already
	// confirmed — but confirm with the search state machine.
	p.inFlightSize = size
	p.inFlightID = id
	p.mu.Unlock()
	// Send outside the lock; sendProbe does the framing and the actual write.
	m.sendProbe(hi, size, id, overhead)
	return true
}

// HandleReply consumes an MTUDProbeReply payload from the receive path.
// A valid reply confirms the in-flight probe size as the new lower bound,
// possibly installs a per-peer route, and updates the converged flag.
func (m *pmtudManager) HandleReply(localIdx uint32, payload []byte) {
	if !m.enabled.Load() {
		return
	}
	// Reply format is [magic uint32 BE][probeID uint32 BE]; anything shorter
	// or with the wrong magic is ignored.
	if len(payload) < 8 {
		return
	}
	if binary.BigEndian.Uint32(payload[0:4]) != pmtudMagic {
		return
	}
	id := binary.BigEndian.Uint32(payload[4:8])
	v, ok := m.peers.Load(localIdx)
	if !ok {
		return
	}
	p := v.(*pmtudPeer)
	p.mu.Lock()
	defer p.mu.Unlock()
	// Drop stale or duplicate replies: only the probe we are waiting on counts.
	if p.inFlightSize == 0 || p.inFlightID != id {
		return
	}
	confirmed := p.inFlightSize
	// The confirmed size becomes the search's new lower bound, and the reply
	// proves the peer speaks PMTUD, so reset the unsupported-detection counter.
	p.low = confirmed
	p.inFlightSize = 0
	p.losses = 0
	p.everReplied = true
	p.consecutiveFailures = 0
	innerMTU := confirmed - p.overhead()
	// Only install a /32 override when it would actually raise the MTU above the
	// per-vpn-network floor route. If the discovered MTU is <= floor, the /24
	// already covers it; installing a /32 at floor would just create roam churn.
	if innerMTU > int(m.floor.Load()) && p.applied != innerMTU {
		if err := m.device.SetPeerMTU(p.addr, innerMTU); err != nil {
			m.l.Warn("pmtud: failed to apply per-peer mtu", "addr", p.addr, "innerMTU", innerMTU, "error", err)
		} else {
			m.l.Info("pmtud probe confirmed",
				"addr", p.addr,
				"outerMTU", confirmed,
				"innerMTU", innerMTU,
				"low", p.low,
				"high", p.high,
			)
			p.applied = innerMTU
		}
	}
	// Recompute convergence from the (possibly tightened) bounds.
	if p.high-p.low <= pmtudConverged {
		p.converged = true
	} else {
		p.converged = false
	}
}

// tick handles one wheel firing for a single peer: it accounts a loss timeout
// for any outstanding probe, detects pmtud-unsupported peers, and sends the
// next search probe (rescheduling itself) unless the peer has converged.
func (m *pmtudManager) tick(localIdx uint32) {
	v, ok := m.peers.Load(localIdx)
	if !ok {
		return
	}
	p := v.(*pmtudPeer)
	p.mu.Lock()
	if p.unsupported {
		p.mu.Unlock()
		return
	}
	// If a probe was outstanding, this tick is the loss timeout.
	if p.inFlightSize != 0 {
		p.losses++
		p.consecutiveFailures++
		if p.losses >= pmtudMaxLoss {
			// pmtudMaxLoss consecutive losses at this size: treat it as
			// "doesn't fit" and make it the new upper bound.
			p.high = p.inFlightSize
			p.inFlightSize = 0
			p.losses = 0
			if p.high-p.low <= pmtudConverged {
				p.converged = true
			}
		}
	}
	// If we've never gotten a reply from this peer and we've burned through our
	// failure budget, conclude the peer doesn't understand the MTUDProbeRequest
	// subtype and stop scheduling probes for it.
	if !p.everReplied && p.consecutiveFailures >= pmtudUnsupportedAfter {
		p.unsupported = true
		addr := p.addr
		p.mu.Unlock()
		m.l.Info("pmtud: peer not responding to probes, marking unsupported", "addr", addr, "failures", pmtudUnsupportedAfter)
		return
	}
	// Peer vanished from the hostmap: drop our state, nothing left to probe.
	hi := m.intf.hostMap.QueryIndex(localIdx)
	if hi == nil {
		p.mu.Unlock()
		m.peers.Delete(localIdx)
		return
	}
	// Once a peer converges, the wheel stops scheduling for it. Re-validation
	// (and the resulting black hole detection) is driven by connection_manager
	// via MaybeProbeAsTest at its natural test cadence, so a converged peer
	// has nothing for the wheel to do until OnRoam or a tunnel down/up cycle
	// triggers a fresh search.
	if p.converged {
		p.mu.Unlock()
		return
	}
	ceiling := int(m.ceiling.Load())
	var size int
	switch {
	case p.firstProbe:
		// Probe the ceiling directly. If the path supports it (the common case
		// when an operator has explicitly configured tun.max_mtu), we converge
		// in one round trip. If it fails, the standard binary search resumes
		// on the next tick from the (low, ceiling) bounds.
		size = ceiling
		p.firstProbe = false
	case p.losses > 0 && p.inFlightSize != 0:
		// Probe still within its loss budget: retry the same size.
		size = p.inFlightSize
	default:
		// No probe outstanding: bisect the current (low, high) window.
		size = p.midpoint()
	}
	// Clamp to the globally safe floor and the configured ceiling.
	if size < pmtudFloor {
		size = pmtudFloor
	}
	if size > ceiling {
		size = ceiling
	}
	id := rand.Uint32()
	p.inFlightSize = size
	p.inFlightID = id
	overhead := p.overhead()
	p.mu.Unlock()
	// Send outside the lock, then reschedule this peer for the loss timeout /
	// next search step.
	m.sendProbe(hi, size, id, overhead)
	m.wheel.Add(localIdx, pmtudProbeInterval)
}

// sendProbe builds an MTUDProbeRequest payload that will produce an outer IP
// packet of approximately `outerSize` bytes, then sends it.
func (m *pmtudManager) sendProbe(hi *HostInfo, outerSize int, id uint32, overhead int) {
	payloadLen := outerSize - overhead
	// Never shrink below the 8-byte magic+id header.
	if payloadLen < 8 {
		payloadLen = 8
	}
	p := make([]byte, payloadLen)
	binary.BigEndian.PutUint32(p[0:4], pmtudMagic)
	binary.BigEndian.PutUint32(p[4:8], id)
	// remaining bytes are zero-padding
	nb := make([]byte, 12)
	out := make([]byte, outerSize+128) // headroom for header/tag/relay framing
	m.intf.SendMessageToHostInfo(header.Test, header.MTUDProbeRequest, hi, p, nb, out)
}