GSO/GRO offloads, with TCP+ECN and UDP support

2026-05-16 04:47:38 +02:00 · 2026-04-17 10:25:05 -05:00
parent f95857b4c3
commit 5d35351437
60 changed files with 6915 additions and 283 deletions
--- a/interface.go
+++ b/interface.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"log/slog"
 	"net/netip"
+	"runtime"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -48,7 +49,14 @@ type InterfaceConfig struct {
 	reQueryWait     time.Duration

 	ConntrackCacheTimeout time.Duration
-	l                     *slog.Logger
+
+	// CpuAffinity, when non-empty, names the CPUs each TUN reader goroutine
+	// should pin to. Queue i pins to CpuAffinity[i % len(CpuAffinity)], so a
+	// list shorter than `routines` wraps around. An empty list keeps the
+	// default pin-to-(i % NumCPU) behavior.
+	CpuAffinity []int
+
+	l *slog.Logger
 }

 type Interface struct {
@@ -72,7 +80,16 @@ type Interface struct {
 	routines              int
 	disconnectInvalid     atomic.Bool
 	closed                atomic.Bool
-	relayManager          *relayManager
+	// cpuAffinity, when non-empty, names the CPUs each TUN reader goroutine
+	// should pin to. Queue i pins to cpuAffinity[i % len(cpuAffinity)].
+	// Empty falls back to the default pin-to-(i % NumCPU) behavior.
+	cpuAffinity []int
+	// ecnEnabled gates RFC 6040 underlay ECN propagation. When true,
+	// inside.go copies the inner ECN onto the outer carrier on encap and
+	// decryptToTun folds outer CE into the inner header on decap. Toggle
+	// via tunnels.ecn (default true).
+	ecnEnabled   atomic.Bool
+	relayManager *relayManager

 	tryPromoteEvery atomic.Uint32
 	reQueryEvery    atomic.Uint32
@@ -202,6 +219,7 @@ func NewInterface(ctx context.Context, c *InterfaceConfig) (*Interface, error) {
 		relayManager:          c.relayManager,
 		connectionManager:     c.connectionManager,
 		conntrackCacheTimeout: c.ConntrackCacheTimeout,
+		cpuAffinity:           c.CpuAffinity,

 		metricHandshakes: metrics.GetOrRegisterHistogram("handshakes", nil, metrics.NewExpDecaySample(1028, 0.015)),
 		messageMetrics:   c.MessageMetrics,
@@ -260,7 +278,16 @@ func (f *Interface) activate() error {
 	}
 	f.readers = f.inside.Readers()
 	for i := range f.readers {
-		f.batchers[i] = batch.NewPassthrough(f.readers[i])
+		caps := tio.QueueCapabilities(f.readers[i])
+		if caps.TSO || caps.USO {
+			// Multi-lane: TCP is coalesced when TSO is on and UDP when USO
+			// is on. Everything else, including traffic for a disabled
+			// lane, falls through to passthrough so non-IP and
+			// non-TCP/UDP packets still reach the TUN.
+			f.batchers[i] = batch.NewMultiCoalescer(f.readers[i], caps.TSO, caps.USO)
+		} else {
+			f.batchers[i] = batch.NewPassthrough(f.readers[i])
+		}
 	}

 	f.wg.Add(1) // for us to wait on Close() to return
@@ -322,15 +349,13 @@ func (f *Interface) listenOut(i int) {
 	fwPacket := &firewall.Packet{}
 	nb := make([]byte, 12, 12)

-	coalescer := f.batchers[i]
-
-	listener := func(fromUdpAddr netip.AddrPort, payload []byte) {
+	listener := func(fromUdpAddr netip.AddrPort, payload []byte, meta udp.RxMeta) {
 		plaintext := f.batchers[i].Reserve(len(payload))
-		f.readOutsidePackets(ViaSender{UdpAddr: fromUdpAddr}, plaintext[:0], payload, h, fwPacket, lhh, nb, i, ctCache.Get())
+		f.readOutsidePackets(ViaSender{UdpAddr: fromUdpAddr}, plaintext[:0], payload, h, fwPacket, lhh, nb, i, ctCache.Get(), meta)
 	}

 	flusher := func() {
-		if err := coalescer.Flush(); err != nil {
+		if err := f.batchers[i].Flush(); err != nil {
 			f.l.Error("Failed to flush tun coalescer", "error", err)
 		}
 	}
@@ -346,8 +371,27 @@ func (f *Interface) listenOut(i int) {
 }

 func (f *Interface) listenIn(reader tio.Queue, i int) {
+	// Pin this goroutine to one CPU. LockOSThread alone keeps the goroutine
+	// on a single OS thread but the kernel can still migrate that thread
+	// across CPUs — XPS reads smp_processor_id() at sendmmsg time and picks
+	// the TX ring from the current CPU's xps_cpus map, so an unpinned
+	// thread bouncing between CPUs spreads one nebula flow's packets across
+	// multiple TX rings. Those rings drain at independent rates, so the
+	// packets can reach the wire out of order.
+	//
+	// Pinning keeps every sendmmsg from this goroutine going through the
+	// same TX ring, so the wire sees per-flow order. Cost: less scheduler
+	// flexibility — if two TUN reader goroutines map to the same CPU
+	// (i % NumCPU, or a repeated cpuAffinity entry) they share it.
+	cpu := i % runtime.NumCPU()
+	if n := len(f.cpuAffinity); n > 0 {
+		cpu = f.cpuAffinity[i%n]
+	}
+	if err := pinThreadToCPU(cpu); err != nil {
+		f.l.Warn("failed to pin tun reader to CPU", "queue", i, "cpu", cpu, "err", err)
+	}
 	rejectBuf := make([]byte, mtu)
-	sb := batch.NewSendBatch(batch.SendBatchCap, udp.MTU+32)
+	sb := batch.NewSendBatch(f.writers[i], batch.SendBatchCap, udp.MTU+32)
 	fwPacket := &firewall.Packet{}
 	nb := make([]byte, 12, 12)

@@ -363,35 +407,24 @@ func (f *Interface) listenIn(reader tio.Queue, i int) {
 			break
 		}

-		sb.Reset()
 		for _, pkt := range pkts {
-			if sb.Len() >= sb.Cap() {
-				f.flushBatch(sb, i)
-				sb.Reset()
-			}
 			f.consumeInsidePacket(pkt, fwPacket, nb, sb, rejectBuf, i, conntrackCache.Get())
 		}
-		if sb.Len() > 0 {
-			f.flushBatch(sb, i)
+		if err := sb.Flush(); err != nil {
+			f.l.Error("Failed to write outgoing batch", "error", err, "writer", i)
 		}
 	}

 	f.l.Debug("overlay reader is done", "reader", i)
 }

-func (f *Interface) flushBatch(sb batch.TxBatcher, q int) {
-	bufs, dsts := sb.Get()
-	if err := f.writers[q].WriteBatch(bufs, dsts); err != nil {
-		f.l.Error("Failed to write outgoing batch", "error", err, "writer", q)
-	}
-}
-
 func (f *Interface) RegisterConfigChangeCallbacks(c *config.C) {
 	c.RegisterReloadCallback(f.reloadFirewall)
 	c.RegisterReloadCallback(f.reloadSendRecvError)
 	c.RegisterReloadCallback(f.reloadAcceptRecvError)
 	c.RegisterReloadCallback(f.reloadDisconnectInvalid)
 	c.RegisterReloadCallback(f.reloadMisc)
+	c.RegisterReloadCallback(f.reloadEcn)

 	for _, udpConn := range f.writers {
 		c.RegisterReloadCallback(udpConn.ReloadConfig)
@@ -515,6 +548,20 @@ func (f *Interface) reloadMisc(c *config.C) {
 	}
 }

+// reloadEcn applies the tunnels.ecn config knob (default true, RFC 6040 normal
+// mode) to f.ecnEnabled; disable it when a middlebox mangles underlay ECN bits.
+func (f *Interface) reloadEcn(c *config.C) {
+	firstLoad := c.InitialLoad()
+	if !firstLoad && !c.HasChanged("tunnels.ecn") {
+		return
+	}
+	enabled := c.GetBool("tunnels.ecn", true)
+	f.ecnEnabled.Store(enabled)
+	if !firstLoad {
+		f.l.Info("tunnels.ecn changed", "enabled", enabled)
+	}
+}
+
 func (f *Interface) emitStats(ctx context.Context, i time.Duration) {
 	ticker := time.NewTicker(i)
 	defer ticker.Stop()