mirror of
https://github.com/slackhq/nebula.git
synced 2026-05-15 20:37:36 +02:00
PMTUD exploration, start small then grow
This commit is contained in:
@@ -145,6 +145,7 @@ func (cm *connectionManager) getAndResetTrafficCheck(h *HostInfo, now time.Time)
|
|||||||
func (cm *connectionManager) AddTrafficWatch(h *HostInfo) {
|
func (cm *connectionManager) AddTrafficWatch(h *HostInfo) {
|
||||||
if h.out.Swap(true) == false {
|
if h.out.Swap(true) == false {
|
||||||
cm.trafficTimer.Add(h.localIndexId, cm.checkInterval)
|
cm.trafficTimer.Add(h.localIndexId, cm.checkInterval)
|
||||||
|
cm.intf.pmtudManager.OnTunnelUp(h)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -180,6 +181,7 @@ func (cm *connectionManager) doTrafficCheck(localIndex uint32, p, nb, out []byte
|
|||||||
|
|
||||||
switch decision {
|
switch decision {
|
||||||
case deleteTunnel:
|
case deleteTunnel:
|
||||||
|
cm.intf.pmtudManager.OnTunnelDown(hostinfo)
|
||||||
if cm.hostMap.DeleteHostInfo(hostinfo) {
|
if cm.hostMap.DeleteHostInfo(hostinfo) {
|
||||||
// Only clearing the lighthouse cache if this is the last hostinfo for this vpn ip in the hostmap
|
// Only clearing the lighthouse cache if this is the last hostinfo for this vpn ip in the hostmap
|
||||||
cm.intf.lightHouse.DeleteVpnAddrs(hostinfo.vpnAddrs)
|
cm.intf.lightHouse.DeleteVpnAddrs(hostinfo.vpnAddrs)
|
||||||
@@ -199,7 +201,14 @@ func (cm *connectionManager) doTrafficCheck(localIndex uint32, p, nb, out []byte
|
|||||||
cm.tryRehandshake(hostinfo)
|
cm.tryRehandshake(hostinfo)
|
||||||
|
|
||||||
case sendTestPacket:
|
case sendTestPacket:
|
||||||
cm.intf.SendMessageToHostInfo(header.Test, header.TestRequest, hostinfo, p, nb, out)
|
// Defer to pmtud if it has a confirmed PMTU > floor for this peer:
|
||||||
|
// the probe at the confirmed size verifies both liveness AND that
|
||||||
|
// the discovered PMTU still fits, so we don't burn a separate test
|
||||||
|
// packet on top of it. If pmtud declines (disabled, peer unsupported,
|
||||||
|
// or no confirmed size yet) we fall back to the regular test.
|
||||||
|
if !cm.intf.pmtudManager.MaybeProbeAsTest(hostinfo) {
|
||||||
|
cm.intf.SendMessageToHostInfo(header.Test, header.TestRequest, hostinfo, p, nb, out)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
cm.resetRelayTrafficCheck(hostinfo)
|
cm.resetRelayTrafficCheck(hostinfo)
|
||||||
|
|||||||
@@ -67,6 +67,8 @@ func Test_NewConnectionManagerTest(t *testing.T) {
|
|||||||
punchy := NewPunchyFromConfig(test.NewLogger(), conf)
|
punchy := NewPunchyFromConfig(test.NewLogger(), conf)
|
||||||
nc := newConnectionManagerFromConfig(test.NewLogger(), conf, hostMap, punchy)
|
nc := newConnectionManagerFromConfig(test.NewLogger(), conf, hostMap, punchy)
|
||||||
nc.intf = ifce
|
nc.intf = ifce
|
||||||
|
ifce.pmtudManager = newPMTUDManagerFromConfig(test.NewLogger(), conf, ifce.inside)
|
||||||
|
ifce.pmtudManager.intf = ifce
|
||||||
p := []byte("")
|
p := []byte("")
|
||||||
nb := make([]byte, 12, 12)
|
nb := make([]byte, 12, 12)
|
||||||
out := make([]byte, mtu)
|
out := make([]byte, mtu)
|
||||||
@@ -149,6 +151,8 @@ func Test_NewConnectionManagerTest2(t *testing.T) {
|
|||||||
punchy := NewPunchyFromConfig(test.NewLogger(), conf)
|
punchy := NewPunchyFromConfig(test.NewLogger(), conf)
|
||||||
nc := newConnectionManagerFromConfig(test.NewLogger(), conf, hostMap, punchy)
|
nc := newConnectionManagerFromConfig(test.NewLogger(), conf, hostMap, punchy)
|
||||||
nc.intf = ifce
|
nc.intf = ifce
|
||||||
|
ifce.pmtudManager = newPMTUDManagerFromConfig(test.NewLogger(), conf, ifce.inside)
|
||||||
|
ifce.pmtudManager.intf = ifce
|
||||||
p := []byte("")
|
p := []byte("")
|
||||||
nb := make([]byte, 12, 12)
|
nb := make([]byte, 12, 12)
|
||||||
out := make([]byte, mtu)
|
out := make([]byte, mtu)
|
||||||
@@ -361,6 +365,8 @@ func Test_NewConnectionManagerTest_DisconnectInvalid(t *testing.T) {
|
|||||||
punchy := NewPunchyFromConfig(test.NewLogger(), conf)
|
punchy := NewPunchyFromConfig(test.NewLogger(), conf)
|
||||||
nc := newConnectionManagerFromConfig(test.NewLogger(), conf, hostMap, punchy)
|
nc := newConnectionManagerFromConfig(test.NewLogger(), conf, hostMap, punchy)
|
||||||
nc.intf = ifce
|
nc.intf = ifce
|
||||||
|
ifce.pmtudManager = newPMTUDManagerFromConfig(test.NewLogger(), conf, ifce.inside)
|
||||||
|
ifce.pmtudManager.intf = ifce
|
||||||
ifce.connectionManager = nc
|
ifce.connectionManager = nc
|
||||||
|
|
||||||
hostinfo := &HostInfo{
|
hostinfo := &HostInfo{
|
||||||
|
|||||||
@@ -54,6 +54,7 @@ type Control struct {
|
|||||||
dnsStart func()
|
dnsStart func()
|
||||||
lighthouseStart func()
|
lighthouseStart func()
|
||||||
connectionManagerStart func(context.Context)
|
connectionManagerStart func(context.Context)
|
||||||
|
pmtudManagerStart func(context.Context)
|
||||||
}
|
}
|
||||||
|
|
||||||
type ControlHostInfo struct {
|
type ControlHostInfo struct {
|
||||||
@@ -107,6 +108,9 @@ func (c *Control) Start() (func() error, error) {
|
|||||||
if c.connectionManagerStart != nil {
|
if c.connectionManagerStart != nil {
|
||||||
go c.connectionManagerStart(c.ctx)
|
go c.connectionManagerStart(c.ctx)
|
||||||
}
|
}
|
||||||
|
if c.pmtudManagerStart != nil {
|
||||||
|
go c.pmtudManagerStart(c.ctx)
|
||||||
|
}
|
||||||
if c.lighthouseStart != nil {
|
if c.lighthouseStart != nil {
|
||||||
c.lighthouseStart()
|
c.lighthouseStart()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -55,8 +55,10 @@ const (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
TestRequest MessageSubType = 0
|
TestRequest MessageSubType = 0
|
||||||
TestReply MessageSubType = 1
|
TestReply MessageSubType = 1
|
||||||
|
MTUDProbeRequest MessageSubType = 2
|
||||||
|
MTUDProbeReply MessageSubType = 3
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@@ -67,8 +69,10 @@ const (
|
|||||||
var ErrHeaderTooShort = errors.New("header is too short")
|
var ErrHeaderTooShort = errors.New("header is too short")
|
||||||
|
|
||||||
var subTypeTestMap = map[MessageSubType]string{
|
var subTypeTestMap = map[MessageSubType]string{
|
||||||
TestRequest: "testRequest",
|
TestRequest: "testRequest",
|
||||||
TestReply: "testReply",
|
TestReply: "testReply",
|
||||||
|
MTUDProbeRequest: "mtudProbeRequest",
|
||||||
|
MTUDProbeReply: "mtudProbeReply",
|
||||||
}
|
}
|
||||||
|
|
||||||
var subTypeNoneMap = map[MessageSubType]string{0: "none"}
|
var subTypeNoneMap = map[MessageSubType]string{0: "none"}
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ type InterfaceConfig struct {
|
|||||||
HandshakeManager *HandshakeManager
|
HandshakeManager *HandshakeManager
|
||||||
lightHouse *LightHouse
|
lightHouse *LightHouse
|
||||||
connectionManager *connectionManager
|
connectionManager *connectionManager
|
||||||
|
pmtudManager *pmtudManager
|
||||||
DropLocalBroadcast bool
|
DropLocalBroadcast bool
|
||||||
DropMulticast bool
|
DropMulticast bool
|
||||||
routines int
|
routines int
|
||||||
@@ -57,6 +58,7 @@ type Interface struct {
|
|||||||
pki *PKI
|
pki *PKI
|
||||||
firewall *Firewall
|
firewall *Firewall
|
||||||
connectionManager *connectionManager
|
connectionManager *connectionManager
|
||||||
|
pmtudManager *pmtudManager
|
||||||
handshakeManager *HandshakeManager
|
handshakeManager *HandshakeManager
|
||||||
dnsServer *dnsServer
|
dnsServer *dnsServer
|
||||||
createTime time.Time
|
createTime time.Time
|
||||||
@@ -195,6 +197,7 @@ func NewInterface(ctx context.Context, c *InterfaceConfig) (*Interface, error) {
|
|||||||
myBroadcastAddrsTable: cs.myVpnBroadcastAddrsTable,
|
myBroadcastAddrsTable: cs.myVpnBroadcastAddrsTable,
|
||||||
relayManager: c.relayManager,
|
relayManager: c.relayManager,
|
||||||
connectionManager: c.connectionManager,
|
connectionManager: c.connectionManager,
|
||||||
|
pmtudManager: c.pmtudManager,
|
||||||
conntrackCacheTimeout: c.ConntrackCacheTimeout,
|
conntrackCacheTimeout: c.ConntrackCacheTimeout,
|
||||||
|
|
||||||
metricHandshakes: metrics.GetOrRegisterHistogram("handshakes", nil, metrics.NewExpDecaySample(1028, 0.015)),
|
metricHandshakes: metrics.GetOrRegisterHistogram("handshakes", nil, metrics.NewExpDecaySample(1028, 0.015)),
|
||||||
@@ -212,6 +215,7 @@ func NewInterface(ctx context.Context, c *InterfaceConfig) (*Interface, error) {
|
|||||||
ifce.reQueryWait.Store(int64(c.reQueryWait))
|
ifce.reQueryWait.Store(int64(c.reQueryWait))
|
||||||
|
|
||||||
ifce.connectionManager.intf = ifce
|
ifce.connectionManager.intf = ifce
|
||||||
|
ifce.pmtudManager.intf = ifce
|
||||||
|
|
||||||
return ifce, nil
|
return ifce, nil
|
||||||
}
|
}
|
||||||
|
|||||||
3
main.go
3
main.go
@@ -172,6 +172,7 @@ func Main(c *config.C, configTest bool, buildVersion string, l *slog.Logger, dev
|
|||||||
hostMap := NewHostMapFromConfig(l, c)
|
hostMap := NewHostMapFromConfig(l, c)
|
||||||
punchy := NewPunchyFromConfig(l, c)
|
punchy := NewPunchyFromConfig(l, c)
|
||||||
connManager := newConnectionManagerFromConfig(l, c, hostMap, punchy)
|
connManager := newConnectionManagerFromConfig(l, c, hostMap, punchy)
|
||||||
|
pmtudMgr := newPMTUDManagerFromConfig(l, c, tun)
|
||||||
lightHouse, err := NewLightHouseFromConfig(ctx, l, c, pki.getCertState(), udpConns[0], punchy)
|
lightHouse, err := NewLightHouseFromConfig(ctx, l, c, pki.getCertState(), udpConns[0], punchy)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, util.ContextualizeIfNeeded("Failed to initialize lighthouse handler", err)
|
return nil, util.ContextualizeIfNeeded("Failed to initialize lighthouse handler", err)
|
||||||
@@ -208,6 +209,7 @@ func Main(c *config.C, configTest bool, buildVersion string, l *slog.Logger, dev
|
|||||||
DnsServer: ds,
|
DnsServer: ds,
|
||||||
HandshakeManager: handshakeManager,
|
HandshakeManager: handshakeManager,
|
||||||
connectionManager: connManager,
|
connectionManager: connManager,
|
||||||
|
pmtudManager: pmtudMgr,
|
||||||
lightHouse: lightHouse,
|
lightHouse: lightHouse,
|
||||||
tryPromoteEvery: c.GetUint32("counters.try_promote", defaultPromoteEvery),
|
tryPromoteEvery: c.GetUint32("counters.try_promote", defaultPromoteEvery),
|
||||||
reQueryEvery: c.GetUint32("counters.requery_every_packets", defaultReQueryEvery),
|
reQueryEvery: c.GetUint32("counters.requery_every_packets", defaultReQueryEvery),
|
||||||
@@ -266,6 +268,7 @@ func Main(c *config.C, configTest bool, buildVersion string, l *slog.Logger, dev
|
|||||||
dnsStart: ds.Start,
|
dnsStart: ds.Start,
|
||||||
lighthouseStart: lightHouse.StartUpdateWorker,
|
lighthouseStart: lightHouse.StartUpdateWorker,
|
||||||
connectionManagerStart: connManager.Start,
|
connectionManagerStart: connManager.Start,
|
||||||
|
pmtudManagerStart: pmtudMgr.Start,
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
13
outside.go
13
outside.go
@@ -183,11 +183,20 @@ func (f *Interface) readOutsidePackets(via ViaSender, out []byte, packet []byte,
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if h.Subtype == header.TestRequest {
|
switch h.Subtype {
|
||||||
|
case header.TestRequest:
|
||||||
// This testRequest might be from TryPromoteBest, so we should roam
|
// This testRequest might be from TryPromoteBest, so we should roam
|
||||||
// to the new IP address before responding
|
// to the new IP address before responding
|
||||||
f.handleHostRoaming(hostinfo, via)
|
f.handleHostRoaming(hostinfo, via)
|
||||||
f.send(header.Test, header.TestReply, ci, hostinfo, d, nb, out)
|
f.send(header.Test, header.TestReply, ci, hostinfo, d, nb, out)
|
||||||
|
case header.MTUDProbeRequest:
|
||||||
|
// Reply with just the 8-byte ack header so the reverse path doesn't have to
|
||||||
|
// carry the full probe size; we only verify the forward direction.
|
||||||
|
if len(d) >= 8 {
|
||||||
|
f.send(header.Test, header.MTUDProbeReply, ci, hostinfo, d[:8], nb, out)
|
||||||
|
}
|
||||||
|
case header.MTUDProbeReply:
|
||||||
|
f.pmtudManager.HandleReply(hostinfo.localIndexId, d)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fallthrough to the bottom to record incoming traffic
|
// Fallthrough to the bottom to record incoming traffic
|
||||||
@@ -257,6 +266,7 @@ func (f *Interface) readOutsidePackets(via ViaSender, out []byte, packet []byte,
|
|||||||
|
|
||||||
// closeTunnel closes a tunnel locally, it does not send a closeTunnel packet to the remote
|
// closeTunnel closes a tunnel locally, it does not send a closeTunnel packet to the remote
|
||||||
func (f *Interface) closeTunnel(hostInfo *HostInfo) {
|
func (f *Interface) closeTunnel(hostInfo *HostInfo) {
|
||||||
|
f.pmtudManager.OnTunnelDown(hostInfo)
|
||||||
final := f.hostMap.DeleteHostInfo(hostInfo)
|
final := f.hostMap.DeleteHostInfo(hostInfo)
|
||||||
if final {
|
if final {
|
||||||
// We no longer have any tunnels with this vpn addr, clear learned lighthouse state to lower memory usage
|
// We no longer have any tunnels with this vpn addr, clear learned lighthouse state to lower memory usage
|
||||||
@@ -296,6 +306,7 @@ func (f *Interface) handleHostRoaming(hostinfo *HostInfo, via ViaSender) {
|
|||||||
hostinfo.lastRoam = time.Now()
|
hostinfo.lastRoam = time.Now()
|
||||||
hostinfo.lastRoamRemote = hostinfo.remote
|
hostinfo.lastRoamRemote = hostinfo.remote
|
||||||
hostinfo.SetRemote(via.UdpAddr)
|
hostinfo.SetRemote(via.UdpAddr)
|
||||||
|
f.pmtudManager.OnRoam(hostinfo)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,4 +15,13 @@ type Device interface {
|
|||||||
RoutesFor(netip.Addr) routing.Gateways
|
RoutesFor(netip.Addr) routing.Gateways
|
||||||
SupportsMultiqueue() bool
|
SupportsMultiqueue() bool
|
||||||
NewMultiQueueReader() (io.ReadWriteCloser, error)
|
NewMultiQueueReader() (io.ReadWriteCloser, error)
|
||||||
|
// SupportsPerPeerMTU reports whether SetPeerMTU is implemented for real on
|
||||||
|
// this platform. PMTUD requires this; the manager will refuse to enable when
|
||||||
|
// false even if the operator set tun.max_mtu, because a discovered MTU we
|
||||||
|
// can't actually install does the operator no good.
|
||||||
|
SupportsPerPeerMTU() bool
|
||||||
|
// SetPeerMTU installs a per-peer MTU on the routing table so the kernel will
|
||||||
|
// surface PTB / EMSGSIZE for inside packets to that peer that would exceed mtu.
|
||||||
|
// Pass mtu=0 to remove the override and let the device default apply.
|
||||||
|
SetPeerMTU(addr netip.Addr, mtu int) error
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -39,6 +39,14 @@ func (NoopTun) Write([]byte) (int, error) {
|
|||||||
return 0, nil
|
return 0, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (NoopTun) SupportsPerPeerMTU() bool {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func (NoopTun) SetPeerMTU(addr netip.Addr, mtu int) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (NoopTun) SupportsMultiqueue() bool {
|
func (NoopTun) SupportsMultiqueue() bool {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -95,6 +95,14 @@ func (t *tun) Name() string {
|
|||||||
return "android"
|
return "android"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *tun) SupportsPerPeerMTU() bool {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *tun) SetPeerMTU(addr netip.Addr, mtu int) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (t *tun) SupportsMultiqueue() bool {
|
func (t *tun) SupportsMultiqueue() bool {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -548,6 +548,14 @@ func (t *tun) Name() string {
|
|||||||
return t.Device
|
return t.Device
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *tun) SupportsPerPeerMTU() bool {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *tun) SetPeerMTU(addr netip.Addr, mtu int) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (t *tun) SupportsMultiqueue() bool {
|
func (t *tun) SupportsMultiqueue() bool {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -106,6 +106,14 @@ func (t *disabledTun) Write(b []byte) (int, error) {
|
|||||||
return len(b), nil
|
return len(b), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *disabledTun) SupportsPerPeerMTU() bool {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *disabledTun) SetPeerMTU(addr netip.Addr, mtu int) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (t *disabledTun) SupportsMultiqueue() bool {
|
func (t *disabledTun) SupportsMultiqueue() bool {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -561,6 +561,14 @@ func (t *tun) Name() string {
|
|||||||
return t.Device
|
return t.Device
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *tun) SupportsPerPeerMTU() bool {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *tun) SetPeerMTU(addr netip.Addr, mtu int) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (t *tun) SupportsMultiqueue() bool {
|
func (t *tun) SupportsMultiqueue() bool {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -151,6 +151,14 @@ func (t *tun) Name() string {
|
|||||||
return "iOS"
|
return "iOS"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *tun) SupportsPerPeerMTU() bool {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *tun) SetPeerMTU(addr netip.Addr, mtu int) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (t *tun) SupportsMultiqueue() bool {
|
func (t *tun) SupportsMultiqueue() bool {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -368,6 +368,13 @@ func (t *tun) reload(c *config.C, initial bool) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// tun.max_mtu raises the device MTU above tun.mtu so PMTUD has headroom to
|
||||||
|
// install per-peer routes between tun.mtu (floor) and tun.max_mtu (ceiling).
|
||||||
|
// When unset (default 0) the device MTU is unchanged from existing behavior.
|
||||||
|
if pmtudCeiling := c.GetInt("tun.max_mtu", 0); pmtudCeiling > newMaxMTU {
|
||||||
|
newMaxMTU = pmtudCeiling
|
||||||
|
}
|
||||||
|
|
||||||
t.MaxMTU = newMaxMTU
|
t.MaxMTU = newMaxMTU
|
||||||
t.DefaultMTU = newDefaultMTU
|
t.DefaultMTU = newDefaultMTU
|
||||||
|
|
||||||
@@ -596,7 +603,7 @@ func (t *tun) setDefaultRoute(cidr netip.Prefix) error {
|
|||||||
LinkIndex: t.deviceIndex,
|
LinkIndex: t.deviceIndex,
|
||||||
Dst: dr,
|
Dst: dr,
|
||||||
MTU: t.DefaultMTU,
|
MTU: t.DefaultMTU,
|
||||||
AdvMSS: t.advMSS(Route{}),
|
AdvMSS: t.advMSS(Route{Cidr: cidr}),
|
||||||
Scope: unix.RT_SCOPE_LINK,
|
Scope: unix.RT_SCOPE_LINK,
|
||||||
Src: net.IP(cidr.Addr().AsSlice()),
|
Src: net.IP(cidr.Addr().AsSlice()),
|
||||||
Protocol: unix.RTPROT_KERNEL,
|
Protocol: unix.RTPROT_KERNEL,
|
||||||
@@ -705,17 +712,68 @@ func (t *tun) Name() string {
|
|||||||
return t.Device
|
return t.Device
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *tun) SupportsPerPeerMTU() bool {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetPeerMTU installs a host route (/32 for an IPv4 vpn address, /128 for an IPv6
|
||||||
|
// vpn address) to addr through this tun device with the given MTU. This causes
|
||||||
|
// the kernel to reject (or surface PTB to apps for) inside packets to addr that
|
||||||
|
// would exceed mtu. Pass mtu=0 to remove the override and let the per-vpn-network
|
||||||
|
// route apply again. PoC: assumes addr is reachable directly via this device.
|
||||||
|
func (t *tun) SetPeerMTU(addr netip.Addr, mtu int) error {
|
||||||
|
bits := addr.BitLen()
|
||||||
|
prefix := netip.PrefixFrom(addr, bits)
|
||||||
|
|
||||||
|
dr := &net.IPNet{
|
||||||
|
IP: addr.AsSlice(),
|
||||||
|
Mask: net.CIDRMask(bits, bits),
|
||||||
|
}
|
||||||
|
|
||||||
|
if mtu == 0 {
|
||||||
|
nr := netlink.Route{
|
||||||
|
LinkIndex: t.deviceIndex,
|
||||||
|
Dst: dr,
|
||||||
|
Scope: unix.RT_SCOPE_LINK,
|
||||||
|
}
|
||||||
|
if err := netlink.RouteDel(&nr); err != nil {
|
||||||
|
return fmt.Errorf("failed to remove per-peer mtu route %v: %w", prefix, err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
nr := netlink.Route{
|
||||||
|
LinkIndex: t.deviceIndex,
|
||||||
|
Dst: dr,
|
||||||
|
MTU: mtu,
|
||||||
|
AdvMSS: t.advMSS(Route{Cidr: prefix, MTU: mtu}),
|
||||||
|
Scope: unix.RT_SCOPE_LINK,
|
||||||
|
}
|
||||||
|
if err := netlink.RouteReplace(&nr); err != nil {
|
||||||
|
return fmt.Errorf("failed to set per-peer mtu route %v mtu=%d: %w", prefix, mtu, err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (t *tun) advMSS(r Route) int {
|
func (t *tun) advMSS(r Route) int {
|
||||||
mtu := r.MTU
|
mtu := r.MTU
|
||||||
if r.MTU == 0 {
|
if r.MTU == 0 {
|
||||||
mtu = t.DefaultMTU
|
mtu = t.DefaultMTU
|
||||||
}
|
}
|
||||||
|
|
||||||
// We only need to set advmss if the route MTU does not match the device MTU
|
// We only need to set advmss if the route MTU does not match the device MTU.
|
||||||
if mtu != t.MaxMTU {
|
if mtu == t.MaxMTU {
|
||||||
return mtu - 40
|
return 0
|
||||||
}
|
}
|
||||||
return 0
|
|
||||||
|
// MSS = MTU - (IP header + TCP header). TCP is always 20 bytes; IP is 20 for
|
||||||
|
// v4 and 40 for v6. r.Cidr is the route destination so it tells us which
|
||||||
|
// family this route is in. If Cidr is unset (empty Route) we default to v4.
|
||||||
|
addr := r.Cidr.Addr()
|
||||||
|
if addr.Is6() && !addr.Is4In6() {
|
||||||
|
return mtu - 60
|
||||||
|
}
|
||||||
|
return mtu - 40
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *tun) watchRoutes() {
|
func (t *tun) watchRoutes() {
|
||||||
|
|||||||
@@ -390,6 +390,14 @@ func (t *tun) Name() string {
|
|||||||
return t.Device
|
return t.Device
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *tun) SupportsPerPeerMTU() bool {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *tun) SetPeerMTU(addr netip.Addr, mtu int) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (t *tun) SupportsMultiqueue() bool {
|
func (t *tun) SupportsMultiqueue() bool {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -310,6 +310,14 @@ func (t *tun) Name() string {
|
|||||||
return t.Device
|
return t.Device
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *tun) SupportsPerPeerMTU() bool {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *tun) SetPeerMTU(addr netip.Addr, mtu int) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (t *tun) SupportsMultiqueue() bool {
|
func (t *tun) SupportsMultiqueue() bool {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -105,6 +105,14 @@ func (t *TestTun) Name() string {
|
|||||||
return t.Device
|
return t.Device
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *TestTun) SupportsPerPeerMTU() bool {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *TestTun) SetPeerMTU(addr netip.Addr, mtu int) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (t *TestTun) Write(b []byte) (n int, err error) {
|
func (t *TestTun) Write(b []byte) (n int, err error) {
|
||||||
if t.closed.Load() {
|
if t.closed.Load() {
|
||||||
return 0, io.ErrClosedPipe
|
return 0, io.ErrClosedPipe
|
||||||
|
|||||||
@@ -229,6 +229,14 @@ func (t *winTun) Name() string {
|
|||||||
return t.Device
|
return t.Device
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *winTun) SupportsPerPeerMTU() bool {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *winTun) SetPeerMTU(addr netip.Addr, mtu int) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (t *winTun) Read(b []byte) (int, error) {
|
func (t *winTun) Read(b []byte) (int, error) {
|
||||||
return t.tun.Read(b, 0)
|
return t.tun.Read(b, 0)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -46,6 +46,14 @@ func (d *UserDevice) RoutesFor(ip netip.Addr) routing.Gateways {
|
|||||||
return routing.Gateways{routing.NewGateway(ip, 1)}
|
return routing.Gateways{routing.NewGateway(ip, 1)}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (d *UserDevice) SupportsPerPeerMTU() bool {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *UserDevice) SetPeerMTU(addr netip.Addr, mtu int) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (d *UserDevice) SupportsMultiqueue() bool {
|
func (d *UserDevice) SupportsMultiqueue() bool {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|||||||
623
pmtud_manager.go
Normal file
623
pmtud_manager.go
Normal file
@@ -0,0 +1,623 @@
|
|||||||
|
package nebula
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/binary"
|
||||||
|
"log/slog"
|
||||||
|
"math/rand/v2"
|
||||||
|
"net/netip"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/slackhq/nebula/config"
|
||||||
|
"github.com/slackhq/nebula/header"
|
||||||
|
"github.com/slackhq/nebula/overlay"
|
||||||
|
)
|
||||||
|
|
||||||
|
// PMTUD PoC: discover the path MTU per-tunnel via authenticated probes that ride
|
||||||
|
// the existing crypto session. We follow RFC 8899 PLPMTUD: a binary search
|
||||||
|
// between a known-good floor and a configured ceiling, with N consecutive probe
|
||||||
|
// losses at a size treated as "doesn't fit." Confirmed PMTU is pushed to the
|
||||||
|
// overlay device, which on Linux installs a per-host route with the discovered
|
||||||
|
// MTU. The kernel then surfaces EMSGSIZE / PTB to apps writing to the tun.
|
||||||
|
//
|
||||||
|
// Probe payload format (request):
|
||||||
|
//
|
||||||
|
// [magic uint32 BE][probeID uint32 BE][padding 0x00...]
|
||||||
|
//
|
||||||
|
// Reply is a small ack with the same magic and probeID and no padding. We do not
|
||||||
|
// verify the reverse-path MTU; only the forward direction matters for the
|
||||||
|
// receiver's MTU on the inside.
|
||||||
|
|
||||||
|
const (
|
||||||
|
pmtudMagic uint32 = 0x504D5544 // 'P' 'M' 'U' 'D'
|
||||||
|
pmtudFloor = 1280 // IPv6 minimum payload, also a safe internet MTU floor
|
||||||
|
|
||||||
|
// pmtudConverged is the bytes-tolerance for stopping the search.
|
||||||
|
pmtudConverged = 8
|
||||||
|
|
||||||
|
// pmtudMaxLoss matches RFC 8899 MAX_PROBES (default 3).
|
||||||
|
pmtudMaxLoss = 3
|
||||||
|
|
||||||
|
// pmtudProbeInterval is the time between probe ticks during the search phase.
|
||||||
|
// Once a peer converges the wheel stops ticking it; re-validation is driven
|
||||||
|
// by connection_manager via MaybeProbeAsTest at its natural test cadence.
|
||||||
|
pmtudProbeInterval = 500 * time.Millisecond
|
||||||
|
|
||||||
|
// pmtudWheelMax is the wheel's maximum supported scheduling duration. We
|
||||||
|
// only ever schedule at pmtudProbeInterval today, but the wheel needs a
|
||||||
|
// max greater than its tick to allocate its slot ring sensibly.
|
||||||
|
pmtudWheelMax = 5 * time.Second
|
||||||
|
|
||||||
|
// pmtudOverheadPessimistic assumes IPv6 underlay + relay framing:
|
||||||
|
// IPv6(40) + UDP(8) + outer nebula(16) + outer AEAD tag(16)
|
||||||
|
// + inner nebula(16) + inner AEAD tag(16) = 112 bytes.
|
||||||
|
// TODO: track underlay address family and per-peer relay state on the HostInfo
|
||||||
|
// so the manager can use the actual overhead for that tunnel and recover the
|
||||||
|
// 32 bytes we pessimistically give up on direct IPv6 paths and the 52 bytes on
|
||||||
|
// direct IPv4 paths.
|
||||||
|
pmtudOverheadPessimistic = 112
|
||||||
|
|
||||||
|
// pmtudUnsupportedAfter is the number of consecutive lost probes (across any
|
||||||
|
// sizes) without ever receiving a reply that we treat as evidence the peer
|
||||||
|
// does not understand the MTUDProbeRequest subtype (i.e. it's running an
|
||||||
|
// older nebula). After this many failures with everReplied=false we mark the
|
||||||
|
// peer pmtud-unsupported and stop scheduling probes. K is small enough that
|
||||||
|
// it fires before the binary search would naturally converge to floor (which
|
||||||
|
// would otherwise be ~30 wasted probes), but large enough to absorb a few
|
||||||
|
// transient probe losses on a path that's just starting to settle.
|
||||||
|
pmtudUnsupportedAfter = 5
|
||||||
|
)
|
||||||
|
|
||||||
|
// pmtudPeer tracks the binary-search state for one tunnel.
|
||||||
|
type pmtudPeer struct {
|
||||||
|
mu sync.Mutex
|
||||||
|
addr netip.Addr
|
||||||
|
localIdx uint32
|
||||||
|
|
||||||
|
// low is the largest outer IP packet size we have a confirmed ack for.
|
||||||
|
// high is the smallest size we believe fails (the search ceiling to start).
|
||||||
|
low, high int
|
||||||
|
|
||||||
|
// inFlightSize is the outer IP packet size of the probe currently awaiting
|
||||||
|
// an ack. 0 means no probe in flight.
|
||||||
|
inFlightSize int
|
||||||
|
// inFlightID matches the probeID echoed in the reply.
|
||||||
|
inFlightID uint32
|
||||||
|
// losses counts consecutive failures at inFlightSize.
|
||||||
|
losses int
|
||||||
|
|
||||||
|
// firstProbe is true until we have sent the first probe of a search. The
|
||||||
|
// first probe targets the ceiling directly (RFC 8899 permits this Search
|
||||||
|
// Algorithm choice); operators who set tun.max_mtu typically have a path
|
||||||
|
// that supports it, so we converge in one probe in the common case.
|
||||||
|
firstProbe bool
|
||||||
|
// everReplied is true once we have ever received any MTUDProbeReply from
|
||||||
|
// this peer. Combined with consecutiveFailures, this lets us detect peers
|
||||||
|
// that don't understand the new subtype and stop probing them.
|
||||||
|
everReplied bool
|
||||||
|
// consecutiveFailures counts probes lost without an intervening reply.
|
||||||
|
// Resets to 0 on any successful reply.
|
||||||
|
consecutiveFailures int
|
||||||
|
// unsupported is set true once we conclude the peer doesn't speak PMTUD.
|
||||||
|
// The manager skips probes for unsupported peers.
|
||||||
|
unsupported bool
|
||||||
|
|
||||||
|
// converged means we have a confirmed PMTU and are in the slow re-validation phase.
|
||||||
|
converged bool
|
||||||
|
// applied is the inner MTU we last pushed to the overlay device (0 if never).
|
||||||
|
applied int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *pmtudPeer) overhead() int {
|
||||||
|
// TODO: branch on actual underlay family + relay state for this peer.
|
||||||
|
return pmtudOverheadPessimistic
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *pmtudPeer) midpoint() int {
|
||||||
|
return (p.low + p.high) / 2
|
||||||
|
}
|
||||||
|
|
||||||
|
type pmtudManager struct {
|
||||||
|
intf *Interface
|
||||||
|
device overlay.Device
|
||||||
|
|
||||||
|
// peers is keyed by HostInfo.localIndexId.
|
||||||
|
peers sync.Map // map[uint32]*pmtudPeer
|
||||||
|
|
||||||
|
wheel *LockingTimerWheel[uint32]
|
||||||
|
|
||||||
|
// floor is the always-safe inner MTU (= tun.mtu). Per-peer routes start here
|
||||||
|
// on tunnel-up so unprobed traffic is always small enough to fit. Stored as
|
||||||
|
// atomic int64 so reload can update it without coordinating with the readers
|
||||||
|
// in tick/HandleReply/OnTunnelUp.
|
||||||
|
floor atomic.Int64
|
||||||
|
// ceiling is the search ceiling expressed as an outer IP packet size, derived
|
||||||
|
// from tun.max_mtu (which is the kernel's device MTU on the tun) plus our
|
||||||
|
// pessimistic overhead. PMTUD will not probe larger than this.
|
||||||
|
ceiling atomic.Int64
|
||||||
|
|
||||||
|
enabled atomic.Bool
|
||||||
|
|
||||||
|
l *slog.Logger
|
||||||
|
}
|
||||||
|
|
||||||
|
func newPMTUDManagerFromConfig(l *slog.Logger, c *config.C, device overlay.Device) *pmtudManager {
|
||||||
|
m := &pmtudManager{
|
||||||
|
device: device,
|
||||||
|
wheel: NewLockingTimerWheel[uint32](pmtudProbeInterval, pmtudWheelMax),
|
||||||
|
l: l,
|
||||||
|
}
|
||||||
|
c.RegisterReloadCallback(func(c *config.C) { m.reload(c, false) })
|
||||||
|
m.reload(c, true)
|
||||||
|
return m
|
||||||
|
}
|
||||||
|
|
||||||
|
// reload applies tun.mtu / tun.max_mtu changes to the manager. On the initial
|
||||||
|
// call (during construction) it just snapshots state; on a live reload it also
|
||||||
|
// transitions in-flight peers to match the new bounds: clearing per-peer routes
|
||||||
|
// when newly disabled, seeding peers from the hostmap and flipping DF on
|
||||||
|
// outside sockets when newly enabled, and rebounding existing searches in
|
||||||
|
// place when only the ceiling moved.
|
||||||
|
func (m *pmtudManager) reload(c *config.C, initial bool) {
|
||||||
|
if !initial && !c.HasChanged("tun.mtu") && !c.HasChanged("tun.max_mtu") {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
floor := c.GetInt("tun.mtu", overlay.DefaultMTU)
|
||||||
|
maxMTU := c.GetInt("tun.max_mtu", 0)
|
||||||
|
|
||||||
|
enable := maxMTU > floor && m.device.SupportsPerPeerMTU()
|
||||||
|
var ceiling int
|
||||||
|
if enable {
|
||||||
|
ceiling = maxMTU + pmtudOverheadPessimistic
|
||||||
|
}
|
||||||
|
|
||||||
|
if initial {
|
||||||
|
m.floor.Store(int64(floor))
|
||||||
|
m.ceiling.Store(int64(ceiling))
|
||||||
|
m.enabled.Store(enable)
|
||||||
|
switch {
|
||||||
|
case enable:
|
||||||
|
m.l.Info("pmtud enabled", "floor", floor, "ceiling", ceiling, "tun.max_mtu", maxMTU)
|
||||||
|
case maxMTU > floor:
|
||||||
|
m.l.Warn("pmtud disabled: this platform does not yet support per-peer MTU routes",
|
||||||
|
"tun.max_mtu", maxMTU)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
wasEnabled := m.enabled.Load()
|
||||||
|
m.floor.Store(int64(floor))
|
||||||
|
m.ceiling.Store(int64(ceiling))
|
||||||
|
m.enabled.Store(enable)
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case wasEnabled && !enable:
|
||||||
|
m.disableLive(floor, maxMTU)
|
||||||
|
case !wasEnabled && enable:
|
||||||
|
m.enableLive(floor, ceiling, maxMTU)
|
||||||
|
case wasEnabled && enable:
|
||||||
|
m.reboundLive(floor, ceiling, maxMTU)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// disableLive clears per-peer routes and drops all peer state. We do not
|
||||||
|
// disable DF on the outside sockets; once on, it stays on for the life of the
|
||||||
|
// process. Operators flipping pmtud off live get correct routing behavior; if
|
||||||
|
// they want the historical no-DF behavior back they need to restart.
|
||||||
|
func (m *pmtudManager) disableLive(floor, maxMTU int) {
|
||||||
|
m.peers.Range(func(k, v any) bool {
|
||||||
|
p := v.(*pmtudPeer)
|
||||||
|
p.mu.Lock()
|
||||||
|
applied := p.applied
|
||||||
|
addr := p.addr
|
||||||
|
p.applied = 0
|
||||||
|
p.mu.Unlock()
|
||||||
|
if applied != 0 {
|
||||||
|
if err := m.device.SetPeerMTU(addr, 0); err != nil {
|
||||||
|
m.l.Warn("pmtud: failed to clear per-peer mtu on disable", "addr", addr, "error", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
m.peers.Delete(k)
|
||||||
|
return true
|
||||||
|
})
|
||||||
|
m.l.Info("pmtud disabled (tun.max_mtu <= tun.mtu)", "tun.mtu", floor, "tun.max_mtu", maxMTU)
|
||||||
|
}
|
||||||
|
|
||||||
|
// enableLive flips DF on every outside socket. We don't pre-seed existing
|
||||||
|
// tunnels here; connection_manager's normal test cadence will eventually call
|
||||||
|
// MaybeProbeAsTest for each peer, which seeds on miss and lets the wheel pick
|
||||||
|
// up the search from there. New tunnels established after this point still
|
||||||
|
// take the OnTunnelUp fast path.
|
||||||
|
func (m *pmtudManager) enableLive(floor, ceiling, maxMTU int) {
|
||||||
|
m.enableDF()
|
||||||
|
m.l.Info("pmtud enabled", "floor", floor, "ceiling", ceiling, "tun.max_mtu", maxMTU)
|
||||||
|
}
|
||||||
|
|
||||||
|
// reboundLive resets each peer's search state to the new bounds. Peers whose
|
||||||
|
// confirmed PMTU still fits under the new ceiling keep their applied route in
|
||||||
|
// place during the new search; peers whose confirmed PMTU exceeds the new
|
||||||
|
// ceiling get cleared back to floor and re-search from scratch. The unsupported
|
||||||
|
// flag is preserved because peer software version doesn't change on reload.
|
||||||
|
func (m *pmtudManager) reboundLive(floor, ceiling, maxMTU int) {
|
||||||
|
overhead := pmtudOverheadPessimistic
|
||||||
|
m.peers.Range(func(k, v any) bool {
|
||||||
|
p := v.(*pmtudPeer)
|
||||||
|
p.mu.Lock()
|
||||||
|
if p.applied > 0 && p.applied+overhead > ceiling {
|
||||||
|
if err := m.device.SetPeerMTU(p.addr, 0); err != nil {
|
||||||
|
m.l.Warn("pmtud: failed to clear per-peer mtu on rebound", "addr", p.addr, "error", err)
|
||||||
|
} else {
|
||||||
|
p.applied = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
p.low = floor + overhead
|
||||||
|
p.high = ceiling
|
||||||
|
p.inFlightSize = 0
|
||||||
|
p.inFlightID = 0
|
||||||
|
p.losses = 0
|
||||||
|
p.firstProbe = !p.unsupported
|
||||||
|
p.converged = false
|
||||||
|
idx := p.localIdx
|
||||||
|
unsupported := p.unsupported
|
||||||
|
p.mu.Unlock()
|
||||||
|
if !unsupported {
|
||||||
|
m.wheel.Add(idx, pmtudProbeInterval)
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
})
|
||||||
|
m.l.Info("pmtud reloaded", "floor", floor, "ceiling", ceiling, "tun.max_mtu", maxMTU)
|
||||||
|
}
|
||||||
|
|
||||||
|
// enableDF asks every outside socket to set the don't-fragment bit on outbound
|
||||||
|
// packets. Idempotent: safe to call from both Start (initial enable) and from a
|
||||||
|
// live reload that flips pmtud on.
|
||||||
|
func (m *pmtudManager) enableDF() {
|
||||||
|
for i, w := range m.intf.writers {
|
||||||
|
if err := w.EnablePathMTUDiscovery(); err != nil {
|
||||||
|
m.l.Warn("pmtud: failed to enable path mtu discovery on outside socket; pmtud will not work correctly",
|
||||||
|
"writer", i, "error", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start runs the probe scheduler until ctx is done. The loop runs even when PMTUD
|
||||||
|
// is disabled at startup so a hot reload can turn it on without restarting nebula.
|
||||||
|
//
|
||||||
|
// When PMTUD is enabled at startup we ask each outside socket to enable
|
||||||
|
// path-MTU discovery (DF on every send). This is intentionally gated on the
|
||||||
|
// feature being on so that operators who haven't opted in keep the historical
|
||||||
|
// behavior where the kernel may fragment outbound nebula UDP packets. A live
|
||||||
|
// reload from disabled to enabled will also flip DF on via enableLive; the
|
||||||
|
// reverse direction does not turn DF off, so flipping pmtud back off live
|
||||||
|
// keeps DF on until restart.
|
||||||
|
func (m *pmtudManager) Start(ctx context.Context) {
|
||||||
|
if m.enabled.Load() {
|
||||||
|
m.enableDF()
|
||||||
|
}
|
||||||
|
|
||||||
|
ticker := time.NewTicker(m.wheel.t.tickDuration)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
case now := <-ticker.C:
|
||||||
|
m.wheel.Advance(now)
|
||||||
|
for {
|
||||||
|
idx, has := m.wheel.Purge()
|
||||||
|
if !has {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
m.tick(idx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// OnTunnelUp is called when a HostInfo becomes traffic-watched. The kernel
|
||||||
|
// already routes packets to this peer through the per-vpn-network route (mtu =
|
||||||
|
// tun.mtu), so the floor is in effect implicitly. We just kick off the search
|
||||||
|
// here; HandleReply will install a per-host /32 (or /128) route once a larger
|
||||||
|
// size is confirmed.
|
||||||
|
func (m *pmtudManager) OnTunnelUp(hi *HostInfo) {
|
||||||
|
if !m.enabled.Load() {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
m.seedPeer(hi)
|
||||||
|
}
|
||||||
|
|
||||||
|
// seedPeer is the shared body of OnTunnelUp and the live-reload enable path.
|
||||||
|
// LoadOrStore protects against double-seeding the same localIndexId from a
|
||||||
|
// race between OnTunnelUp and a reload-driven hostmap walk.
|
||||||
|
func (m *pmtudManager) seedPeer(hi *HostInfo) {
|
||||||
|
if hi == nil || len(hi.vpnAddrs) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
floor := int(m.floor.Load())
|
||||||
|
ceiling := int(m.ceiling.Load())
|
||||||
|
p := &pmtudPeer{
|
||||||
|
addr: hi.vpnAddrs[0],
|
||||||
|
localIdx: hi.localIndexId,
|
||||||
|
low: floor + pmtudOverheadPessimistic,
|
||||||
|
high: ceiling,
|
||||||
|
firstProbe: true,
|
||||||
|
}
|
||||||
|
if _, loaded := m.peers.LoadOrStore(hi.localIndexId, p); loaded {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
m.wheel.Add(hi.localIndexId, pmtudProbeInterval)
|
||||||
|
}
|
||||||
|
|
||||||
|
// OnTunnelDown is called when a HostInfo is being torn down. Removes any per-host
|
||||||
|
// MTU override so the device default applies again.
|
||||||
|
func (m *pmtudManager) OnTunnelDown(hi *HostInfo) {
|
||||||
|
if hi == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
v, ok := m.peers.LoadAndDelete(hi.localIndexId)
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
p := v.(*pmtudPeer)
|
||||||
|
p.mu.Lock()
|
||||||
|
applied := p.applied
|
||||||
|
addr := p.addr
|
||||||
|
p.applied = 0
|
||||||
|
p.mu.Unlock()
|
||||||
|
if applied != 0 {
|
||||||
|
if err := m.device.SetPeerMTU(addr, 0); err != nil {
|
||||||
|
m.l.Warn("pmtud: failed to clear per-peer mtu", "addr", addr, "error", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// OnRoam is called when a HostInfo's remote underlay address changes. The path
|
||||||
|
// MTU may now be different; drop the per-host route so the kernel falls back to
|
||||||
|
// the per-vpn-network route (mtu = tun.mtu floor), then restart the search.
|
||||||
|
// We do not reset the unsupported flag: peer software version doesn't change on
|
||||||
|
// roam, so once we've decided a peer doesn't speak PMTUD we stay decided.
|
||||||
|
func (m *pmtudManager) OnRoam(hi *HostInfo) {
|
||||||
|
if !m.enabled.Load() || hi == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
v, ok := m.peers.Load(hi.localIndexId)
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
p := v.(*pmtudPeer)
|
||||||
|
p.mu.Lock()
|
||||||
|
if p.unsupported {
|
||||||
|
p.mu.Unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
p.low = int(m.floor.Load()) + pmtudOverheadPessimistic
|
||||||
|
p.high = int(m.ceiling.Load())
|
||||||
|
p.inFlightSize = 0
|
||||||
|
p.inFlightID = 0
|
||||||
|
p.losses = 0
|
||||||
|
p.consecutiveFailures = 0
|
||||||
|
p.firstProbe = true
|
||||||
|
p.converged = false
|
||||||
|
if p.applied != 0 {
|
||||||
|
if err := m.device.SetPeerMTU(p.addr, 0); err != nil {
|
||||||
|
m.l.Warn("pmtud: failed to clear per-peer mtu on roam", "addr", p.addr, "error", err)
|
||||||
|
} else {
|
||||||
|
p.applied = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
p.mu.Unlock()
|
||||||
|
m.wheel.Add(hi.localIndexId, pmtudProbeInterval)
|
||||||
|
}
|
||||||
|
|
||||||
|
// MaybeProbeAsTest is called by connection_manager when it would otherwise send
|
||||||
|
// a TestRequest because a tunnel has gone silent. If we have a confirmed PMTU
|
||||||
|
// for this peer that's larger than the floor, we send a probe at that size
|
||||||
|
// instead. The reply confirms both liveness (consumed by connection_manager via
|
||||||
|
// the existing inbound traffic accounting fallthrough in outside.go) and that
|
||||||
|
// the confirmed PMTU still fits (consumed by HandleReply here). One synthetic
|
||||||
|
// packet does the work of two.
|
||||||
|
//
|
||||||
|
// Returns true if a probe was sent. False means the caller should send a
|
||||||
|
// regular TestRequest at the floor.
|
||||||
|
//
|
||||||
|
// On probe failure, connection_manager's existing pendingDeletion timeout will
|
||||||
|
// tear the tunnel down. Heavy hammer, but correct: a re-handshake re-runs PMTUD
|
||||||
|
// discovery against the now-shrunken path. A future EMSGSIZE-capture followup
|
||||||
|
// can replace this with a soft-drop-and-research flow.
|
||||||
|
func (m *pmtudManager) MaybeProbeAsTest(hi *HostInfo) bool {
|
||||||
|
if !m.enabled.Load() || hi == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
v, ok := m.peers.Load(hi.localIndexId)
|
||||||
|
if !ok {
|
||||||
|
// Tunnel pre-dates the manager being aware of it (e.g. pmtud was just
|
||||||
|
// enabled live, or AddTrafficWatch fired before this call). Seed the
|
||||||
|
// peer so the wheel picks up the search; let connection_manager send
|
||||||
|
// its regular TestRequest this cycle.
|
||||||
|
m.seedPeer(hi)
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
p := v.(*pmtudPeer)
|
||||||
|
p.mu.Lock()
|
||||||
|
if p.unsupported || p.applied == 0 {
|
||||||
|
p.mu.Unlock()
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
overhead := p.overhead()
|
||||||
|
size := p.applied + overhead
|
||||||
|
id := rand.Uint32()
|
||||||
|
p.inFlightSize = size
|
||||||
|
p.inFlightID = id
|
||||||
|
p.mu.Unlock()
|
||||||
|
|
||||||
|
m.sendProbe(hi, size, id, overhead)
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// HandleReply consumes an MTUDProbeReply payload from the receive path.
|
||||||
|
func (m *pmtudManager) HandleReply(localIdx uint32, payload []byte) {
|
||||||
|
if !m.enabled.Load() {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if len(payload) < 8 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if binary.BigEndian.Uint32(payload[0:4]) != pmtudMagic {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
id := binary.BigEndian.Uint32(payload[4:8])
|
||||||
|
|
||||||
|
v, ok := m.peers.Load(localIdx)
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
p := v.(*pmtudPeer)
|
||||||
|
p.mu.Lock()
|
||||||
|
defer p.mu.Unlock()
|
||||||
|
|
||||||
|
if p.inFlightSize == 0 || p.inFlightID != id {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
confirmed := p.inFlightSize
|
||||||
|
p.low = confirmed
|
||||||
|
p.inFlightSize = 0
|
||||||
|
p.losses = 0
|
||||||
|
p.everReplied = true
|
||||||
|
p.consecutiveFailures = 0
|
||||||
|
|
||||||
|
innerMTU := confirmed - p.overhead()
|
||||||
|
// Only install a /32 override when it would actually raise the MTU above the
|
||||||
|
// per-vpn-network floor route. If the discovered MTU is <= floor, the /24
|
||||||
|
// already covers it; installing a /32 at floor would just create roam churn.
|
||||||
|
if innerMTU > int(m.floor.Load()) && p.applied != innerMTU {
|
||||||
|
if err := m.device.SetPeerMTU(p.addr, innerMTU); err != nil {
|
||||||
|
m.l.Warn("pmtud: failed to apply per-peer mtu", "addr", p.addr, "innerMTU", innerMTU, "error", err)
|
||||||
|
} else {
|
||||||
|
m.l.Info("pmtud probe confirmed",
|
||||||
|
"addr", p.addr,
|
||||||
|
"outerMTU", confirmed,
|
||||||
|
"innerMTU", innerMTU,
|
||||||
|
"low", p.low,
|
||||||
|
"high", p.high,
|
||||||
|
)
|
||||||
|
p.applied = innerMTU
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if p.high-p.low <= pmtudConverged {
|
||||||
|
p.converged = true
|
||||||
|
} else {
|
||||||
|
p.converged = false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// tick handles one wheel firing for a single peer: it books a loss for any
// still-outstanding probe, detects peers that never reply (marking them
// unsupported), and — if the search hasn't converged — sends the next probe
// and reschedules itself on the wheel.
func (m *pmtudManager) tick(localIdx uint32) {
	v, ok := m.peers.Load(localIdx)
	if !ok {
		return
	}
	p := v.(*pmtudPeer)
	p.mu.Lock()

	if p.unsupported {
		p.mu.Unlock()
		return
	}

	// If a probe was outstanding, this tick is the loss timeout.
	if p.inFlightSize != 0 {
		p.losses++
		p.consecutiveFailures++
		if p.losses >= pmtudMaxLoss {
			// Enough losses at this size: treat it as too big and lower the
			// search ceiling to the failed size.
			p.high = p.inFlightSize
			p.inFlightSize = 0
			p.losses = 0
			if p.high-p.low <= pmtudConverged {
				p.converged = true
			}
		}
	}

	// If we've never gotten a reply from this peer and we've burned through our
	// failure budget, conclude the peer doesn't understand the MTUDProbeRequest
	// subtype and stop scheduling probes for it.
	if !p.everReplied && p.consecutiveFailures >= pmtudUnsupportedAfter {
		p.unsupported = true
		addr := p.addr
		p.mu.Unlock()
		m.l.Info("pmtud: peer not responding to probes, marking unsupported",
			"addr", addr, "failures", pmtudUnsupportedAfter)
		return
	}

	// The HostInfo is gone (tunnel torn down behind our back): drop our state.
	hi := m.intf.hostMap.QueryIndex(localIdx)
	if hi == nil {
		p.mu.Unlock()
		m.peers.Delete(localIdx)
		return
	}

	// Once a peer converges, the wheel stops scheduling for it. Re-validation
	// (and the resulting black hole detection) is driven by connection_manager
	// via MaybeProbeAsTest at its natural test cadence, so a converged peer
	// has nothing for the wheel to do until OnRoam or a tunnel down/up cycle
	// triggers a fresh search.
	if p.converged {
		p.mu.Unlock()
		return
	}

	// Pick the next probe size.
	ceiling := int(m.ceiling.Load())
	var size int
	switch {
	case p.firstProbe:
		// Probe the ceiling directly. If the path supports it (the common case
		// when an operator has explicitly configured tun.max_mtu), we converge
		// in one round trip. If it fails, the standard binary search resumes
		// on the next tick from the (low, ceiling) bounds.
		size = ceiling
		p.firstProbe = false
	case p.losses > 0 && p.inFlightSize != 0:
		// Retransmit the in-flight size until pmtudMaxLoss declares it dead.
		size = p.inFlightSize
	default:
		size = p.midpoint()
	}
	// Clamp to [pmtudFloor, ceiling].
	if size < pmtudFloor {
		size = pmtudFloor
	}
	if size > ceiling {
		size = ceiling
	}

	// Record the probe as in-flight, then send outside the lock.
	id := rand.Uint32()
	p.inFlightSize = size
	p.inFlightID = id
	overhead := p.overhead()
	p.mu.Unlock()

	m.sendProbe(hi, size, id, overhead)
	m.wheel.Add(localIdx, pmtudProbeInterval)
}
|
||||||
|
|
||||||
|
// sendProbe builds an MTUDProbeRequest payload that will produce an outer IP
|
||||||
|
// packet of approximately `outerSize` bytes, then sends it.
|
||||||
|
func (m *pmtudManager) sendProbe(hi *HostInfo, outerSize int, id uint32, overhead int) {
|
||||||
|
payloadLen := outerSize - overhead
|
||||||
|
if payloadLen < 8 {
|
||||||
|
payloadLen = 8
|
||||||
|
}
|
||||||
|
p := make([]byte, payloadLen)
|
||||||
|
binary.BigEndian.PutUint32(p[0:4], pmtudMagic)
|
||||||
|
binary.BigEndian.PutUint32(p[4:8], id)
|
||||||
|
// remaining bytes are zero-padding
|
||||||
|
|
||||||
|
nb := make([]byte, 12)
|
||||||
|
out := make([]byte, outerSize+128) // headroom for header/tag/relay framing
|
||||||
|
m.intf.SendMessageToHostInfo(header.Test, header.MTUDProbeRequest, hi, p, nb, out)
|
||||||
|
}
|
||||||
@@ -20,6 +20,12 @@ type Conn interface {
|
|||||||
WriteTo(b []byte, addr netip.AddrPort) error
|
WriteTo(b []byte, addr netip.AddrPort) error
|
||||||
ReloadConfig(c *config.C)
|
ReloadConfig(c *config.C)
|
||||||
SupportsMultipleReaders() bool
|
SupportsMultipleReaders() bool
|
||||||
|
// EnablePathMTUDiscovery sets the don't-fragment bit on outgoing packets for
|
||||||
|
// this socket. Called by the pmtud manager when PMTUD is enabled. A no-op on
|
||||||
|
// platforms that don't support it; nebula's default behavior (no DF, kernel
|
||||||
|
// fragmentation allowed) is preserved on those platforms and on this one when
|
||||||
|
// PMTUD is disabled.
|
||||||
|
EnablePathMTUDiscovery() error
|
||||||
Close() error
|
Close() error
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -43,6 +49,9 @@ func (NoopConn) WriteTo(_ []byte, _ netip.AddrPort) error {
|
|||||||
func (NoopConn) ReloadConfig(_ *config.C) {
|
func (NoopConn) ReloadConfig(_ *config.C) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
// EnablePathMTUDiscovery is a no-op: NoopConn never sends, so there is no
// socket to set the don't-fragment bit on.
func (NoopConn) EnablePathMTUDiscovery() error {
	return nil
}
|
||||||
func (NoopConn) Close() error {
|
func (NoopConn) Close() error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -44,3 +44,17 @@ func NewListenConfig(multi bool) net.ListenConfig {
|
|||||||
func (u *GenericConn) Rebind() error {
|
func (u *GenericConn) Rebind() error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// EnablePathMTUDiscovery sets the don't-fragment bit on outbound packets.
|
||||||
|
// Android is Linux underneath, so we use IP_PMTUDISC_PROBE (kernel sets DF but
|
||||||
|
// does not consume incoming ICMP frag-needed for its PMTU cache; the manager
|
||||||
|
// drives discovery via authenticated probes).
|
||||||
|
func (u *GenericConn) EnablePathMTUDiscovery() error {
|
||||||
|
v4 := u.isV4Socket()
|
||||||
|
return u.controlFD(func(fd uintptr) error {
|
||||||
|
if v4 {
|
||||||
|
return unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_MTU_DISCOVER, unix.IP_PMTUDISC_PROBE)
|
||||||
|
}
|
||||||
|
return unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_MTU_DISCOVER, unix.IPV6_PMTUDISC_PROBE)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|||||||
@@ -47,3 +47,8 @@ func NewListenConfig(multi bool) net.ListenConfig {
|
|||||||
func (u *GenericConn) Rebind() error {
|
func (u *GenericConn) Rebind() error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// EnablePathMTUDiscovery is split into per-OS files: udp_freebsd.go handles
|
||||||
|
// FreeBSD (which has both IP_DONTFRAG and IPV6_DONTFRAG in the unix package);
|
||||||
|
// udp_openbsd.go handles OpenBSD (v6 only; the kernel doesn't expose a v4 DF
|
||||||
|
// sockopt).
|
||||||
|
|||||||
@@ -187,6 +187,17 @@ func (u *StdConn) SupportsMultipleReaders() bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// EnablePathMTUDiscovery sets the don't-fragment bit on every outbound packet.
|
||||||
|
// On darwin we use IP_DONTFRAG (v4) / IPV6_DONTFRAG (v6). The kernel will return
|
||||||
|
// EMSGSIZE for sends that exceed the local interface MTU; ICMP-driven PMTU
|
||||||
|
// updates from upstream routers are processed by the kernel as usual.
|
||||||
|
func (u *StdConn) EnablePathMTUDiscovery() error {
|
||||||
|
if u.isV4 {
|
||||||
|
return syscall.SetsockoptInt(int(u.sysFd), syscall.IPPROTO_IP, unix.IP_DONTFRAG, 1)
|
||||||
|
}
|
||||||
|
return syscall.SetsockoptInt(int(u.sysFd), syscall.IPPROTO_IPV6, unix.IPV6_DONTFRAG, 1)
|
||||||
|
}
|
||||||
|
|
||||||
func (u *StdConn) Rebind() error {
|
func (u *StdConn) Rebind() error {
|
||||||
var err error
|
var err error
|
||||||
if u.isV4 {
|
if u.isV4 {
|
||||||
|
|||||||
25
udp/udp_freebsd.go
Normal file
25
udp/udp_freebsd.go
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
//go:build freebsd && !e2e_testing
|
||||||
|
// +build freebsd,!e2e_testing
|
||||||
|
|
||||||
|
package udp
|
||||||
|
|
||||||
|
import (
|
||||||
|
"golang.org/x/sys/unix"
|
||||||
|
)
|
||||||
|
|
||||||
|
// EnablePathMTUDiscovery sets the don't-fragment bit on outbound packets.
|
||||||
|
// FreeBSD exposes IP_DONTFRAG (v4) and IPV6_DONTFRAG (v6) in golang.org/x/sys/unix.
|
||||||
|
// Unlike Linux, BSDs don't have an explicit "don't consume incoming ICMP
|
||||||
|
// frag-needed" knob for unconnected UDP sockets; the kernel's PMTU cache will
|
||||||
|
// be updated from ICMP, which is benign for our usage (the cache only affects
|
||||||
|
// what EMSGSIZE gets surfaced for; the manager drives its own discovery via
|
||||||
|
// authenticated probes).
|
||||||
|
func (u *GenericConn) EnablePathMTUDiscovery() error {
|
||||||
|
v4 := u.isV4Socket()
|
||||||
|
return u.controlFD(func(fd uintptr) error {
|
||||||
|
if v4 {
|
||||||
|
return unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_DONTFRAG, 1)
|
||||||
|
}
|
||||||
|
return unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_DONTFRAG, 1)
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -100,3 +100,41 @@ func (u *GenericConn) ListenOut(r EncReader) error {
|
|||||||
func (u *GenericConn) SupportsMultipleReaders() bool {
|
func (u *GenericConn) SupportsMultipleReaders() bool {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// EnablePathMTUDiscovery is implemented per-platform alongside Rebind, in
|
||||||
|
// udp_android.go / udp_bsd.go / udp_netbsd.go / udp_windows.go.
|
||||||
|
|
||||||
|
// controlFD invokes f with the underlying UDP socket file descriptor (or
|
||||||
|
// handle, on Windows). Used by platform files for setsockopt calls that the
|
||||||
|
// stdlib net.UDPConn does not expose directly.
|
||||||
|
func (u *GenericConn) controlFD(f func(fd uintptr) error) error {
|
||||||
|
rc, err := u.UDPConn.SyscallConn()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
var sockErr error
|
||||||
|
err = rc.Control(func(fd uintptr) {
|
||||||
|
sockErr = f(fd)
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return sockErr
|
||||||
|
}
|
||||||
|
|
||||||
|
// isV4Socket reports whether the local bind address looks like an IPv4 socket.
|
||||||
|
// Used by EnablePathMTUDiscovery to pick IPPROTO_IP vs IPPROTO_IPV6 socket
|
||||||
|
// options. Assumes pure-v4 or pure-v6 sockets; a dual-stack v6 socket bound to
|
||||||
|
// :: will be treated as v6 (correct: setting IPV6_DONTFRAG covers v4-mapped
|
||||||
|
// traffic too on most stacks).
|
||||||
|
func (u *GenericConn) isV4Socket() bool {
|
||||||
|
la := u.UDPConn.LocalAddr()
|
||||||
|
if la == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
ua, ok := la.(*net.UDPAddr)
|
||||||
|
if !ok {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return ua.IP.To4() != nil
|
||||||
|
}
|
||||||
|
|||||||
@@ -73,6 +73,21 @@ func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int)
|
|||||||
return out, nil
|
return out, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// EnablePathMTUDiscovery sets IP_MTU_DISCOVER=IP_PMTUDISC_PROBE (IPV6 equivalent
|
||||||
|
// for v6 sockets). This sets the don't-fragment bit on every outbound packet but
|
||||||
|
// tells the kernel not to consume incoming ICMP frag-needed for its own PMTU
|
||||||
|
// cache; we drive PMTU discovery from the application via authenticated probes
|
||||||
|
// (RFC 8899). Called by the pmtud manager when PMTUD is enabled. Without this
|
||||||
|
// call the socket retains nebula's historical behavior (no DF, kernel may
|
||||||
|
// fragment), preserving compatibility with deployments that depend on UDP
|
||||||
|
// fragmentation.
|
||||||
|
func (u *StdConn) EnablePathMTUDiscovery() error {
|
||||||
|
if u.isV4 {
|
||||||
|
return u.setSockOptIPInt(unix.IPPROTO_IP, unix.IP_MTU_DISCOVER, unix.IP_PMTUDISC_PROBE)
|
||||||
|
}
|
||||||
|
return u.setSockOptIPInt(unix.IPPROTO_IPV6, unix.IPV6_MTU_DISCOVER, unix.IPV6_PMTUDISC_PROBE)
|
||||||
|
}
|
||||||
|
|
||||||
func (u *StdConn) SupportsMultipleReaders() bool {
|
func (u *StdConn) SupportsMultipleReaders() bool {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
@@ -110,6 +125,21 @@ func (u *StdConn) setSockOptInt(opt int, n int) error {
|
|||||||
return opErr
|
return opErr
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// setSockOptIPInt sets a socket option at a non-SOL_SOCKET level (e.g. IPPROTO_IP).
|
||||||
|
func (u *StdConn) setSockOptIPInt(level, opt, n int) error {
|
||||||
|
if u.rawConn == nil {
|
||||||
|
return fmt.Errorf("no UDP connection")
|
||||||
|
}
|
||||||
|
var opErr error
|
||||||
|
err := u.rawConn.Control(func(fd uintptr) {
|
||||||
|
opErr = unix.SetsockoptInt(int(fd), level, opt, n)
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return opErr
|
||||||
|
}
|
||||||
|
|
||||||
func (u *StdConn) SetRecvBuffer(n int) error {
|
func (u *StdConn) SetRecvBuffer(n int) error {
|
||||||
return u.setSockOptInt(unix.SO_RCVBUFFORCE, n)
|
return u.setSockOptInt(unix.SO_RCVBUFFORCE, n)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -46,3 +46,18 @@ func NewListenConfig(multi bool) net.ListenConfig {
|
|||||||
func (u *GenericConn) Rebind() error {
|
func (u *GenericConn) Rebind() error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// EnablePathMTUDiscovery sets the don't-fragment bit on outbound packets.
// NetBSD exposes IPV6_DONTFRAG via golang.org/x/sys/unix but the kernel does
// not provide a socket-level knob for setting DF on v4 UDP. The only IP-layer
// constant exposed is IP_DF, which is the wire header flag, not a sockopt.
// quic-go skips NetBSD for the same reason. So v4 sockets stay at nebula's
// historical behavior (kernel may fragment); v6 gets DF.
func (u *GenericConn) EnablePathMTUDiscovery() error {
	if u.isV4Socket() {
		// No v4 DF sockopt on this platform: report success without changing
		// anything so the pmtud manager does not log a warning.
		return nil
	}
	return u.controlFD(func(fd uintptr) error {
		return unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_DONTFRAG, 1)
	})
}
|
||||||
|
|||||||
23
udp/udp_openbsd.go
Normal file
23
udp/udp_openbsd.go
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
//go:build openbsd && !e2e_testing
|
||||||
|
// +build openbsd,!e2e_testing
|
||||||
|
|
||||||
|
package udp
|
||||||
|
|
||||||
|
import (
|
||||||
|
"golang.org/x/sys/unix"
|
||||||
|
)
|
||||||
|
|
||||||
|
// EnablePathMTUDiscovery sets the don't-fragment bit on outbound packets.
// OpenBSD exposes IPV6_DONTFRAG via golang.org/x/sys/unix but the kernel does
// not provide a socket-level knob for setting DF on v4 UDP. The only IP-layer
// constant exposed is IP_DF, which is the wire header flag, not a sockopt.
// quic-go skips OpenBSD for the same reason. So v4 sockets stay at nebula's
// historical behavior (kernel may fragment); v6 gets DF.
func (u *GenericConn) EnablePathMTUDiscovery() error {
	if u.isV4Socket() {
		// No v4 DF sockopt on this platform: report success without changing
		// anything so the pmtud manager does not log a warning.
		return nil
	}
	return u.controlFD(func(fd uintptr) error {
		return unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_DONTFRAG, 1)
	})
}
|
||||||
@@ -335,6 +335,12 @@ func (u *RIOConn) Rebind() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// EnablePathMTUDiscovery is a no-op on Windows for now. PMTUD is Linux-only in
// the initial PoC; Windows support would set IP_DONTFRAGMENT here.
// NOTE(review): only the RIO fast path is deferred — the GenericConn fallback
// path in udp_windows.go does set DF.
func (u *RIOConn) EnablePathMTUDiscovery() error {
	return nil
}
|
||||||
|
|
||||||
func (u *RIOConn) ReloadConfig(*config.C) {}
|
func (u *RIOConn) ReloadConfig(*config.C) {}
|
||||||
|
|
||||||
func (u *RIOConn) Close() error {
|
func (u *RIOConn) Close() error {
|
||||||
|
|||||||
@@ -152,6 +152,10 @@ func (u *TesterConn) Rebind() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// EnablePathMTUDiscovery is a no-op for the e2e test connection: there is no
// real socket to set the don't-fragment bit on.
func (u *TesterConn) EnablePathMTUDiscovery() error {
	return nil
}
|
||||||
|
|
||||||
func (u *TesterConn) Close() error {
|
func (u *TesterConn) Close() error {
|
||||||
u.closeOnce.Do(func() {
|
u.closeOnce.Do(func() {
|
||||||
close(u.done)
|
close(u.done)
|
||||||
|
|||||||
@@ -9,6 +9,8 @@ import (
|
|||||||
"net"
|
"net"
|
||||||
"net/netip"
|
"net/netip"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
|
||||||
|
"golang.org/x/sys/windows"
|
||||||
)
|
)
|
||||||
|
|
||||||
func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int) (Conn, error) {
|
func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int) (Conn, error) {
|
||||||
@@ -44,3 +46,27 @@ func NewListenConfig(multi bool) net.ListenConfig {
|
|||||||
func (u *GenericConn) Rebind() error {
|
func (u *GenericConn) Rebind() error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Windows IP_DONTFRAGMENT and IPV6_DONTFRAG are not exposed in the
// golang.org/x/sys/windows package. Defined locally per the values in
// ws2ipdef.h / ws2tcpip.h. These are stable Win32 constants that have not
// changed since at least Windows Vista.
const (
	winIPDontFragment = 14 // IP_DONTFRAGMENT, used at level IPPROTO_IP
	winIPv6DontFrag   = 14 // IPV6_DONTFRAG, used at level IPPROTO_IPV6
)
|
||||||
|
|
||||||
|
// EnablePathMTUDiscovery sets the don't-fragment bit on outbound packets.
|
||||||
|
// Windows uses IP_DONTFRAGMENT (v4) and IPV6_DONTFRAG (v6) at IPPROTO_IP /
|
||||||
|
// IPPROTO_IPV6 respectively. Note: this only enables DF on the GenericConn
|
||||||
|
// fallback path. The RIO path (RIOConn) has its own EnablePathMTUDiscovery
|
||||||
|
// in udp_rio_windows.go and is currently a no-op pending RIO-specific work.
|
||||||
|
func (u *GenericConn) EnablePathMTUDiscovery() error {
|
||||||
|
v4 := u.isV4Socket()
|
||||||
|
return u.controlFD(func(fd uintptr) error {
|
||||||
|
if v4 {
|
||||||
|
return windows.SetsockoptInt(windows.Handle(fd), windows.IPPROTO_IP, winIPDontFragment, 1)
|
||||||
|
}
|
||||||
|
return windows.SetsockoptInt(windows.Handle(fd), windows.IPPROTO_IPV6, winIPv6DontFrag, 1)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user