broken checkpt

2026-05-16 04:47:38 +02:00 · 2026-05-14 15:56:34 -05:00
parent d429dab5dd
commit d50c3028a2
17 changed files with 706 additions and 258 deletions
--- a/interface.go
+++ b/interface.go
@@ -283,7 +283,7 @@ func (f *Interface) activate() error {
 	}
 	f.readers = f.inside.Readers()
 	for i := range f.readers {
-		caps := tio.QueueCapabilities(f.readers[i])
+		caps := f.readers[i].Capabilities()
 		if caps.TSO || caps.USO {
 			// Multi-lane: TCP gets coalesced when TSO is on, UDP when USO
 			// is on, everything else (and either lane disabled) falls
@@ -387,7 +387,19 @@ func (f *Interface) listenIn(reader tio.Queue, q int) {
 		f.l.Warn("failed to pin tun reader to CPU", "queue", q, "cpu", cpu, "err", err)
 	}

+	const bonusInfo = 16
+	bufferScale := udp.MTU + bonusInfo
+	numTunPackets := 1
+	caps := reader.Capabilities()
+	if caps.TSO || caps.USO {
+		bufferScale = 65535 + bonusInfo
+		numTunPackets = f.batchSize
+	}
+
 	rejectBuf := make([]byte, mtu)
+	tunPackets := make([]wire.TunPacket, numTunPackets)
+	packetMem := make([]byte, bufferScale*numTunPackets)
+
 	arenaSize := batch.SendBatchCap * (udp.MTU + 32)
 	sb := batch.NewSendBatch(f.writers[q], batch.SendBatchCap, util.NewArena(arenaSize))
 	fwPacket := &firewall.Packet{}
@@ -396,7 +408,7 @@ func (f *Interface) listenIn(reader tio.Queue, q int) {
 	conntrackCache := firewall.NewConntrackCacheTicker(f.ctx, f.l, f.conntrackCacheTimeout)

 	for {
-		n, err := reader.Read(packets, packetMem)
+		n, err := reader.Read(tunPackets, packetMem)
 		if err != nil {
 			if !f.closed.Load() {
 				f.l.Error("Error while reading outbound packet, closing", "error", err, "reader", q)
@@ -407,7 +419,7 @@ func (f *Interface) listenIn(reader tio.Queue, q int) {

 		ctCache := conntrackCache.Get()
 		for i := range n {
-			f.consumeInsidePacket(packets[i], fwPacket, nb, sb, rejectBuf, q, ctCache)
+			f.consumeInsidePacket(tunPackets[i], fwPacket, nb, sb, rejectBuf, q, ctCache)
 		}
 		if err := sb.Flush(); err != nil {
 			f.l.Error("Failed to write outgoing batch", "error", err, "writer", q)
--- a/overlay/batch/coalesce_core.go
+++ b/overlay/batch/coalesce_core.go
@@ -147,44 +147,3 @@ func mergeECNIntoSeed(seedHdr, pktHdr []byte, isV6 bool) {
 		seedHdr[1] |= pktHdr[1] & 0x03
 	}
 }
-
-// Arena is an injectable byte-slab that hands out non-overlapping borrowed
-// slices via Reserve and releases them in bulk via Reset. Coalescers take
-// an *Arena at construction so the caller controls the slab lifetime and
-// can share one slab across multiple coalescers (MultiCoalescer hands the
-// same *Arena to every lane so the lanes don't carry their own backings).
-//
-// Reserve borrows; the slice is valid until the next Reset. The slab grows
-// (by allocating a fresh, larger backing array) if a Reserve doesn't fit;
-// pre-size the arena via NewArena to avoid that path on the hot path.
-type Arena struct {
-	buf []byte
-}
-
-// NewArena returns an Arena with a pre-allocated backing of the given
-// capacity. Pass 0 if you don't intend to call Reserve (e.g. a test that
-// only feeds the coalescer pre-made []byte packets via Commit).
-func NewArena(capacity int) *Arena {
-	return &Arena{buf: make([]byte, 0, capacity)}
-}
-
-// Reserve hands out a non-overlapping sz-byte slice from the arena. If the
-// request doesn't fit the current backing, a fresh, larger backing is
-// allocated; already-borrowed slices reference the old backing and remain
-// valid until Reset.
-func (a *Arena) Reserve(sz int) []byte {
-	if len(a.buf)+sz > cap(a.buf) {
-		newCap := max(cap(a.buf)*2, sz)
-		a.buf = make([]byte, 0, newCap)
-	}
-	start := len(a.buf)
-	a.buf = a.buf[:start+sz]
-	return a.buf[start : start+sz : start+sz]
-}
-
-// Reset releases every slice handed out since the last Reset. Callers must
-// not use any previously-borrowed slice after this returns. The underlying
-// backing array is retained so subsequent Reserves don't re-allocate.
-func (a *Arena) Reset() {
-	a.buf = a.buf[:0]
-}
--- a/overlay/batch/multi_coalesce.go
+++ b/overlay/batch/multi_coalesce.go
@@ -2,8 +2,10 @@ package batch

 import (
 	"errors"
-	"io"
 	"log/slog"
+
+	"github.com/slackhq/nebula/overlay/tio"
+	"github.com/slackhq/nebula/util"
 )

 // MultiCoalescer fans plaintext packets out to lane-specific batchers based
@@ -35,7 +37,7 @@ type MultiCoalescer struct {
 	// sequentially and never Reserves in between, so a later lane's
 	// slots stay readable across an earlier lane's Reset (the underlying
 	// bytes are still alive — Reset only re-slices len to 0).
-	arena *Arena
+	arena *util.Arena
 }

 // DefaultMultiArenaCap is the recommended arena capacity for a Multi-lane
@@ -49,9 +51,9 @@ const DefaultMultiArenaCap = initialSlots * 65535
 // Either lane disabled redirects its traffic into the passthrough lane.
 // arena is the single backing slab shared across every lane; the caller
 // pre-sizes it via NewArena so the hot path never allocates.
-func NewMultiCoalescer(w io.Writer, l *slog.Logger, arena *Arena, tcpEnabled, udpEnabled bool) *MultiCoalescer {
+func NewMultiCoalescer(w tio.Queue, l *slog.Logger, arena *util.Arena, tcpEnabled, udpEnabled bool) *MultiCoalescer {
 	m := &MultiCoalescer{
-		pt:    NewPassthrough(w, arena),
+		pt:    NewPassthrough(w, initialSlots, arena),
 		arena: arena,
 	}
 	if tcpEnabled {
--- a/overlay/batch/passthrough.go
+++ b/overlay/batch/passthrough.go
@@ -4,6 +4,7 @@ import (
 	"io"

 	"github.com/slackhq/nebula/udp"
+	"github.com/slackhq/nebula/util"
 )

 // Passthrough is a RxBatcher that doesn't batch anything, it just accumulates and then sends packets.
@@ -11,7 +12,7 @@ type Passthrough struct {
 	out   io.Writer
 	slots [][]byte
 	// arena is injected; see TCPCoalescer.arena for the contract.
-	arena  *Arena
+	arena  *util.Arena
 	cursor int
 }

@@ -21,7 +22,7 @@ const passthroughBaseNumSlots = 128
 // standalone Passthrough batcher: 128 slots × udp.MTU ≈ 1.1 MiB.
 const DefaultPassthroughArenaCap = passthroughBaseNumSlots * udp.MTU

-func NewPassthrough(w io.Writer, slots int, arena *Arena) *Passthrough {
+func NewPassthrough(w io.Writer, slots int, arena *util.Arena) *Passthrough {
 	return &Passthrough{
 		out:   w,
 		slots: make([][]byte, 0, slots),
--- a/overlay/batch/tcp_coalesce.go
+++ b/overlay/batch/tcp_coalesce.go
@@ -10,6 +10,8 @@ import (
 	"slices"

 	"github.com/slackhq/nebula/overlay/tio"
+	"github.com/slackhq/nebula/util"
+	"github.com/slackhq/nebula/wire"
 )

 // ipProtoTCP is the IANA protocol number for TCP. Hardcoded instead of
@@ -88,11 +90,11 @@ type TCPCoalescer struct {
 	// and tells it to release them via Reset on Flush. When wrapped in
 	// MultiCoalescer the same *Arena is shared with the other lanes so
 	// there's exactly one backing slab per Multi instance.
-	arena *Arena
+	arena *util.Arena
 	l     *slog.Logger
 }

-func NewTCPCoalescer(w io.Writer, l *slog.Logger, arena *Arena) *TCPCoalescer {
+func NewTCPCoalescer(w tio.Queue, l *slog.Logger, arena *util.Arena) *TCPCoalescer {
 	c := &TCPCoalescer{
 		plainW:    w,
 		slots:     make([]*coalesceSlot, 0, initialSlots),
@@ -101,7 +103,7 @@ func NewTCPCoalescer(w io.Writer, l *slog.Logger, arena *Arena) *TCPCoalescer {
 		arena:     arena,
 		l:         l,
 	}
-	if gw, ok := tio.SupportsGSO(w, tio.GSOProtoTCP); ok {
+	if gw, ok := tio.SupportsGSO(w, wire.GSOProtoTCP); ok {
 		c.gsoW = gw
 	}
 	return c
@@ -419,7 +421,7 @@ func (c *TCPCoalescer) flushSlot(s *coalesceSlot) error {
 	tcsum := s.ipHdrLen + 16
 	binary.BigEndian.PutUint16(hdr[tcsum:tcsum+2], foldOnceNoInvert(psum))

-	return c.gsoW.WriteGSO(hdr[:s.ipHdrLen], hdr[s.ipHdrLen:], s.payIovs, tio.GSOProtoTCP)
+	return c.gsoW.WriteGSO(hdr[:s.ipHdrLen], hdr[s.ipHdrLen:], s.payIovs, wire.GSOProtoTCP)
 }

 // headersMatch compares two IP+TCP header prefixes for byte-for-byte
--- a/overlay/batch/udp_coalesce.go
+++ b/overlay/batch/udp_coalesce.go
@@ -5,6 +5,8 @@ import (
 	"io"

 	"github.com/slackhq/nebula/overlay/tio"
+	"github.com/slackhq/nebula/util"
+	"github.com/slackhq/nebula/wire"
 )

 // ipProtoUDP is the IANA protocol number for UDP.
@@ -67,7 +69,7 @@ type UDPCoalescer struct {
 	pool      []*udpSlot

 	// arena is injected; see TCPCoalescer.arena for the contract.
-	arena *Arena
+	arena *util.Arena
 }

 // NewUDPCoalescer wraps w. The caller is responsible for only constructing
@@ -75,7 +77,7 @@ type UDPCoalescer struct {
 // the kernel may reject GSO_UDP_L4 writes. If w does not implement
 // tio.GSOWriter at all (single-packet Queue), the coalescer degrades to
 // plain Writes — same defensive shape as the TCP coalescer.
-func NewUDPCoalescer(w io.Writer, arena *Arena) *UDPCoalescer {
+func NewUDPCoalescer(w tio.Queue, arena *util.Arena) *UDPCoalescer {
 	c := &UDPCoalescer{
 		plainW:    w,
 		slots:     make([]*udpSlot, 0, initialSlots),
@@ -83,7 +85,7 @@ func NewUDPCoalescer(w io.Writer, arena *Arena) *UDPCoalescer {
 		pool:      make([]*udpSlot, 0, initialSlots),
 		arena:     arena,
 	}
-	if gw, ok := tio.SupportsGSO(w, tio.GSOProtoUDP); ok {
+	if gw, ok := tio.SupportsGSO(w, wire.GSOProtoUDP); ok {
 		c.gsoW = gw
 	}
 	return c
@@ -313,7 +315,7 @@ func (c *UDPCoalescer) flushSlot(s *udpSlot) error {
 	udpCsumOff := s.ipHdrLen + 6
 	binary.BigEndian.PutUint16(hdr[udpCsumOff:udpCsumOff+2], foldOnceNoInvert(psum))

-	return c.gsoW.WriteGSO(hdr[:s.ipHdrLen], hdr[s.ipHdrLen:], s.payIovs, tio.GSOProtoUDP)
+	return c.gsoW.WriteGSO(hdr[:s.ipHdrLen], hdr[s.ipHdrLen:], s.payIovs, wire.GSOProtoUDP)
 }

 // udpHeadersMatch compares two IP+UDP header prefixes for byte-equality on
--- a/overlay/tio/tio.go
+++ b/overlay/tio/tio.go
@@ -46,42 +46,6 @@ type Queue interface {
 	Capabilities() Capabilities
 }

-
-// GSOInfo describes a kernel-supplied superpacket sitting in Packet.Bytes.
-// The zero value means "not a superpacket" — Bytes is one regular IP
-// datagram and no segmentation is required.
-type GSOInfo struct {
-	// Size is the GSO segment size: max payload bytes per segment
-	// (== TCP MSS for TSO, == UDP payload chunk for USO). Zero means
-	// not a superpacket.
-	Size uint16
-	// HdrLen is the total L3+L4 header length within Bytes (already
-	// corrected via correctHdrLen, so safe to slice on).
-	HdrLen uint16
-	// CsumStart is the L4 header offset inside Bytes (== L3 header
-	// length).
-	CsumStart uint16
-	// Proto picks the L4 protocol (TCP or UDP) so the segmenter knows
-	// which checksum/header layout to apply.
-	Proto GSOProto
-}
-
-// IsSuperpacket reports whether g describes a multi-segment GSO/USO
-// superpacket that needs segmentation before its bytes can be encrypted
-// and sent on the wire.
-func (g GSOInfo) IsSuperpacket() bool { return g.Size > 0 }
-
-// GSOProto selects the L4 protocol for a GSO superpacket. Determines which
-// VIRTIO_NET_HDR_GSO_* type the writer stamps and which checksum offset
-// inside the transport header virtio NEEDS_CSUM expects.
-type GSOProto uint8
-
-const (
-	GSOProtoNone GSOProto = iota
-	GSOProtoTCP
-	GSOProtoUDP
-)
-
 // GSOWriter is implemented by Queues that can emit a TCP or UDP superpacket
 // assembled from a header prefix plus one or more borrowed payload
 // fragments, in a single vectored write (writev with a leading
@@ -104,24 +68,25 @@ const (
 // implementation of GSOWriter is necessary but not sufficient since USO
 // may not have been negotiated even when TSO was.
 type GSOWriter interface {
-	WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto GSOProto) error
+	WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto wire.GSOProto) error
 }

 // SupportsGSO reports whether w implements GSOWriter and the underlying
 // queue advertises the negotiated capability for `want`. A writer that
 // implements GSOWriter but not CapsProvider is treated as permissive
 // (used by tests and fakes that don't negotiate).
-func SupportsGSO(w Queue, want GSOProto) (GSOWriter, bool) {
+func SupportsGSO(w Queue, want wire.GSOProto) (GSOWriter, bool) {
 	gw, ok := w.(GSOWriter)
 	if !ok {
 		return nil, false
 	}
 	caps := w.Capabilities()
 	switch want {
-	case GSOProtoTCP:
+	case wire.GSOProtoTCP:
 		return gw, caps.TSO
-	case GSOProtoUDP:
+	case wire.GSOProtoUDP:
 		return gw, caps.USO
+	default:
+		return gw, false
 	}
-	return gw, false
 }
--- a/overlay/tio/tio_gso_linux.go
+++ b/overlay/tio/tio_gso_linux.go
@@ -10,6 +10,7 @@ import (
 	"syscall"
 	"unsafe"

+	"github.com/slackhq/nebula/wire"
 	"golang.org/x/sys/unix"

 	"github.com/slackhq/nebula/overlay/tio/virtio"
@@ -67,9 +68,6 @@ type Offload struct {
 	// events.
 	writeLock sync.Mutex
 	closed    atomic.Bool
-	rxBuf     []byte   // backing store for kernel-handed packets read this drain
-	rxOff     int      // cursor into rxBuf for the current Read drain
-	pending   []Packet // packets returned from the most recent Read

 	// readVnetScratch holds the 10-byte virtio_net_hdr split off the front of
 	// every TUN read via readv(2). Decoupling the header from the packet body
@@ -115,9 +113,7 @@ func newOffload(fd int, shutdownFd int, usoEnabled bool) (*Offload, error) {
 			{Fd: int32(shutdownFd), Events: unix.POLLIN},
 		},
 		writeLock: sync.Mutex{},
-
-		rxBuf:   make([]byte, tunRxBufCap),
-		gsoIovs: make([]unix.Iovec, 2, gsoMaxIovs),
+		gsoIovs:   make([]unix.Iovec, 2, gsoMaxIovs),
 	}

 	out.gsoIovs[0].Base = &out.gsoHdrBuf[0]
@@ -197,9 +193,9 @@ func (r *Offload) blockOnWrite() error {
 // hold one worst-case kernel-supplied packet body. Without that gate the
 // body iovec could be smaller than the next inbound packet and the
 // kernel would truncate.
-func (r *Offload) readPacket(block bool) (int, error) {
+func (r *Offload) readPacket(mem []byte, block bool) (int, error) {
 	for {
-		r.readIovs[1].Base = &r.rxBuf[r.rxOff]
+		r.readIovs[1].Base = &mem[0]
 		r.readIovs[1].SetLen(tunReadBufSize)
 		n, _, errno := syscall.Syscall(unix.SYS_READV, uintptr(r.fd), uintptr(unsafe.Pointer(&r.readIovs[0])), uintptr(len(r.readIovs)))
 		if errno == 0 {
@@ -237,29 +233,33 @@ func (r *Offload) readPacket(block bool) (int, error) {
 // bursts of small packets (e.g. TCP ACKs). Packet.Bytes slices point
 // into the Offload's internal buffer and are only valid until the next
 // Read or Close on this Queue.
-func (r *Offload) Read() ([]Packet, error) {
-	r.pending = r.pending[:0]
-	r.rxOff = 0
+func (r *Offload) Read(p []wire.TunPacket, mem []byte) (int, error) {
+	maxP := len(p)
+	maxM := len(mem)
+	p = p[:0]
+	rxOff := 0

 	// Initial (blocking) read. Retry on decode errors so a single bad
 	// packet does not stall the reader.
 	for {
-		n, err := r.readPacket(true)
+		n, err := r.readPacket(mem, true)
 		if err != nil {
-			return nil, err
+			return 0, err
 		}
-		if err := r.decodeRead(n); err != nil {
+		if p, err = r.decodeRead(p, mem, n); err != nil {
 			// Drop and read again — a bad packet should not kill the reader.
 			continue
 		}
+
+		rxOff += n
 		break
 	}

 	// Drain: non-blocking reads until the kernel queue is empty, the drain
 	// cap is reached, or rxBuf no longer has room for another worst-case
 	// kernel-supplied packet (tunRxBufSize).
-	for len(r.pending) < tunDrainCap && tunRxBufCap-r.rxOff >= tunRxBufSize {
-		n, err := r.readPacket(false)
+	for len(p) < maxP && maxM-rxOff >= tunRxBufSize {
+		n, err := r.readPacket(mem[rxOff:], false)
 		if err != nil {
 			// EAGAIN / EINTR / anything else: stop draining. We already
 			// have a valid batch from the first read.
@@ -268,14 +268,15 @@ func (r *Offload) Read() ([]Packet, error) {
 		if n <= 0 {
 			break
 		}
-		if err := r.decodeRead(n); err != nil {
+		if p, err = r.decodeRead(p, mem, n); err != nil {
 			// Drop this packet and stop the drain; we'd rather hand off
 			// what we have than keep spinning here.
 			break
 		}
+		rxOff += n
 	}

-	return r.pending, nil
+	return len(p), nil
 }

 // decodeRead processes the packet sitting in rxBuf at rxOff (length
@@ -285,24 +286,23 @@ func (r *Offload) Read() ([]Packet, error) {
 // caller can segment lazily at encrypt time. rxOff advances past the
 // kernel-supplied body and nothing else, since segmentation no longer
 // writes back into rxBuf.
-func (r *Offload) decodeRead(pktLen int) error {
+func (r *Offload) decodeRead(p []wire.TunPacket, mem []byte, pktLen int) ([]wire.TunPacket, error) {
 	if pktLen <= 0 {
-		return fmt.Errorf("short tun read: %d", pktLen)
+		return p, fmt.Errorf("short tun read: %d", pktLen)
 	}
 	var hdr virtio.Hdr
 	hdr.Decode(r.readVnetScratch[:])

-	body := r.rxBuf[r.rxOff : r.rxOff+pktLen]
+	body := mem[:pktLen]

 	if hdr.GSOType == unix.VIRTIO_NET_HDR_GSO_NONE {
 		if hdr.Flags&unix.VIRTIO_NET_HDR_F_NEEDS_CSUM != 0 {
 			if err := virtio.FinishChecksum(body, hdr); err != nil {
-				return err
+				return p, err
 			}
 		}
-		r.pending = append(r.pending, Packet{Bytes: body})
-		r.rxOff += pktLen
-		return nil
+		p = append(p, wire.TunPacket{Bytes: body})
+		return p, nil
 	}

 	// GSO superpacket: validate, fix the kernel-supplied HdrLen on the
@@ -310,26 +310,25 @@ func (r *Offload) decodeRead(pktLen int) error {
 	// the metadata. The bytes stay in rxBuf untouched, segmentation
 	// happens in SegmentSuperpacket at encrypt time.
 	if err := virtio.CheckValid(body, hdr); err != nil {
-		return err
+		return p, err
 	}
 	if err := virtio.CorrectHdrLen(body, &hdr); err != nil {
-		return err
+		return p, err
 	}
 	proto, err := protoFromGSOType(hdr.GSOType)
 	if err != nil {
-		return err
+		return p, err
 	}
-	r.pending = append(r.pending, Packet{
+	p = append(p, wire.TunPacket{
 		Bytes: body,
-		GSO: GSOInfo{
+		Meta: wire.GSOInfo{
 			Size:      hdr.GSOSize,
 			HdrLen:    hdr.HdrLen,
 			CsumStart: hdr.CsumStart,
 			Proto:     proto,
 		},
 	})
-	r.rxOff += pktLen
-	return nil
+	return p, nil
 }

 func (r *Offload) Write(buf []byte) (int, error) {
@@ -384,7 +383,7 @@ func (r *Offload) Capabilities() Capabilities {
 	return Capabilities{TSO: true, USO: r.usoEnabled}
 }

-func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto GSOProto) error {
+func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto wire.GSOProto) error {
 	if len(hdr) == 0 || len(pays) == 0 || len(transportHdr) == 0 {
 		return nil
 	}
@@ -392,7 +391,7 @@ func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto
 	// seq/ack/dataoff/flags/window), UDP=6 (after sport/dport/length).
 	var csumOff uint16
 	switch proto {
-	case GSOProtoUDP:
+	case wire.GSOProtoUDP:
 		csumOff = 6
 	default:
 		csumOff = 16
@@ -407,7 +406,7 @@ func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto
 	if len(pays) > 1 {
 		ipVer := hdr[0] >> 4
 		switch {
-		case proto == GSOProtoUDP && (ipVer == 4 || ipVer == 6):
+		case proto == wire.GSOProtoUDP && (ipVer == 4 || ipVer == 6):
 			vhdr.GSOType = unix.VIRTIO_NET_HDR_GSO_UDP_L4
 		case ipVer == 6:
 			vhdr.GSOType = unix.VIRTIO_NET_HDR_GSO_TCPV6
--- a/overlay/tio/tio_poll_linux.go
+++ b/overlay/tio/tio_poll_linux.go
@@ -5,6 +5,7 @@ import (
 	"os"
 	"sync/atomic"

+	"github.com/slackhq/nebula/wire"
 	"golang.org/x/sys/unix"
 )

@@ -19,9 +20,6 @@ type Poll struct {
 	readPoll  [2]unix.PollFd
 	writePoll [2]unix.PollFd
 	closed    atomic.Bool
-
-	readBuf  []byte
-	batchRet [1]Packet
 }

 func newPoll(fd int, shutdownFd int) (*Poll, error) {
@@ -31,8 +29,7 @@ func newPoll(fd int, shutdownFd int) (*Poll, error) {
 	}

 	out := &Poll{
-		fd:      fd,
-		readBuf: make([]byte, tunReadBufSize),
+		fd: fd,
 		readPoll: [2]unix.PollFd{
 			{Fd: int32(fd), Events: unix.POLLIN},
 			{Fd: int32(shutdownFd), Events: unix.POLLIN},
@@ -97,13 +94,17 @@ func (t *Poll) blockOnWrite() error {
 	return nil
 }

-func (t *Poll) Read() ([]Packet, error) {
-	n, err := t.readOne(t.readBuf)
-	if err != nil {
-		return nil, err
+func (t *Poll) Read(p []wire.TunPacket, mem []byte) (int, error) {
+	if len(p) == 0 || len(mem) == 0 {
+		return 0, nil //todo should this be an err?
 	}
-	t.batchRet[0] = Packet{Bytes: t.readBuf[:n]}
-	return t.batchRet[:], nil
+	p[0].Meta = wire.GSOInfo{}
+	n, err := t.readOne(mem)
+	if err != nil {
+		return 0, err
+	}
+	p[0].Bytes = mem[:n]
+	return 1, nil
 }

 func (t *Poll) readOne(to []byte) (int, error) {
@@ -162,3 +163,7 @@ func (t *Poll) Close() error {

 	return err
 }
+
+func (t *Poll) Capabilities() Capabilities {
+	return Capabilities{}
+}
--- a/overlay/tio/tun_linux_offload.go
+++ b/overlay/tio/tun_linux_offload.go
@@ -6,46 +6,20 @@ package tio
 import (
 	"fmt"

+	"github.com/slackhq/nebula/wire"
 	"golang.org/x/sys/unix"
-
-	"github.com/slackhq/nebula/overlay/tio/virtio"
 )

 // protoFromGSOType maps a virtio_net_hdr GSOType to the GSOProto value the
 // segment-time helpers use. Returns an error for GSO_NONE or any unknown
 // value — the caller should only invoke this on a confirmed superpacket.
-func protoFromGSOType(t uint8) (GSOProto, error) {
+func protoFromGSOType(t uint8) (wire.GSOProto, error) {
 	switch t {
 	case unix.VIRTIO_NET_HDR_GSO_TCPV4, unix.VIRTIO_NET_HDR_GSO_TCPV6:
-		return GSOProtoTCP, nil
+		return wire.GSOProtoTCP, nil
 	case unix.VIRTIO_NET_HDR_GSO_UDP_L4:
-		return GSOProtoUDP, nil
+		return wire.GSOProtoUDP, nil
 	default:
 		return 0, fmt.Errorf("unsupported virtio gso type: %d", t)
 	}
 }
-
-// SegmentSuperpacket invokes fn once per segment of pkt. For non-GSO pkts
-// fn is called once with pkt.Bytes (no segmentation, no copy). For GSO/USO
-// superpackets fn is called once per segment with a slice of pkt.Bytes
-// holding that segment's plaintext (a freshly-patched L3+L4 header sliced
-// in front of the original payload chunk). The slide is destructive: pkt is
-// consumed by this call and its bytes are in an undefined state when
-// SegmentSuperpacket returns. Callers must not retain pkt or any earlier
-// seg slice past fn's return for that segment. The scratch parameter is
-// unused on the destructive path and kept only for cross-platform
-// signature compatibility. Aborts and returns the first error from fn or
-// from per-segment construction.
-func SegmentSuperpacket(pkt Packet, fn func(seg []byte) error) error {
-	if !pkt.GSO.IsSuperpacket() {
-		return fn(pkt.Bytes)
-	}
-	switch pkt.GSO.Proto {
-	case GSOProtoTCP:
-		return virtio.SegmentTCP(pkt.Bytes, pkt.GSO.HdrLen, pkt.GSO.CsumStart, pkt.GSO.Size, fn)
-	case GSOProtoUDP:
-		return virtio.SegmentUDP(pkt.Bytes, pkt.GSO.HdrLen, pkt.GSO.CsumStart, pkt.GSO.Size, fn)
-	default:
-		return fmt.Errorf("unsupported gso proto: %d", pkt.GSO.Proto)
-	}
-}
--- a/overlay/tun_disabled.go
+++ b/overlay/tun_disabled.go
@@ -83,7 +83,7 @@ func (t *disabledTun) Read(p []wire.TunPacket, mem []byte) (int, error) {
 	if len(p) == 0 || len(mem) == 0 {
 		return 0, nil //todo should this be an err?
 	}
-	p[0].Meta = struct{}{}
+	p[0].Meta = wire.GSOInfo{}
 	n, err := t.readOne(mem)
 	if err != nil {
 		return 0, err
--- a/overlay/tun_linux.go
+++ b/overlay/tun_linux.go
@@ -146,6 +146,11 @@ func newTun(c *config.C, l *slog.Logger, vpnNetworks []netip.Prefix, multiqueue
 	}
 	nameStr := c.GetString("tun.dev", "")

+	// First try to enable IFF_VNET_HDR via TUNSETIFF and negotiate TUN_F_*
+	// offloads via TUNSETOFFLOAD so we can receive TSO/USO superpackets.
+	// We try TSO+USO first, fall back to TSO-only on kernels without USO
+	// (Linux < 6.2), and finally give up on virtio headers entirely and
+	// reopen as a plain TUN if neither offload mask is accepted.
 	fd, err := openTunDev()
 	if err != nil {
 		return nil, err
--- a/overlay/user.go
+++ b/overlay/user.go
@@ -48,7 +48,7 @@ func (d *UserDevice) Read(p []wire.TunPacket, mem []byte) (int, error) {
 	if len(p) == 0 || len(mem) == 0 {
 		return 0, nil //todo should this be an err?
 	}
-	p[0].Meta = struct{}{}
+	p[0].Meta = wire.GSOInfo{}
 	n, err := d.outboundReader.Read(mem)
 	if err != nil {
 		return 0, err
--- a/udp/udp_linux.go
+++ b/udp/udp_linux.go
@@ -6,10 +6,13 @@ package udp
 import (
 	"context"
 	"encoding/binary"
+	"errors"
 	"fmt"
 	"log/slog"
 	"net"
 	"net/netip"
+	"strconv"
+	"strings"
 	"syscall"
 	"unsafe"

@@ -32,14 +35,44 @@ type StdConn struct {
 	writeIovs  []iovec
 	writeNames [][]byte

-	// sendmmsg(2) callback state. sendmmsgCB is bound once in NewListener
-	// to the sendmmsgRun method value so passing it to rawConn.Write does
-	// not allocate a fresh closure per send; sendmmsgN/Sent/Errno carry
-	// the inputs and outputs across the call without escaping locals.
-	sendmmsgCB    func(fd uintptr) bool
-	sendmmsgN     int
-	sendmmsgSent  int
-	sendmmsgErrno syscall.Errno
+	// Per-entry cmsg scratch. writeCmsg is one contiguous slab of
+	// MaxWriteBatch * writeCmsgSpace bytes; each entry holds two cmsg
+	// headers (UDP_SEGMENT then IP_TOS / IPV6_TCLASS) pre-filled once in
+	// prepareWriteMessages. WriteBatch only rewrites the per-call data
+	// payloads and toggles Hdr.Control / Hdr.Controllen to point at
+	// whichever subset of the two cmsgs applies.
+	writeCmsg         []byte
+	writeCmsgSpace    int
+	writeCmsgSegSpace int
+	writeCmsgEcnSpace int
+
+	// writeEntryEnd[e] is the bufs index *after* the last packet packed
+	// into mmsghdr entry e. Used to rewind `i` on partial sendmmsg success.
+	writeEntryEnd []int
+
+	// rawSend wraps the sendmmsg(2) callback in a closure-free helper so
+	// the hot path doesn't heap-allocate a fresh closure per call.
+	rawSend rawSendmmsg
+
+	// UDP GSO (sendmsg with UDP_SEGMENT cmsg) support. gsoSupported is
+	// probed once at socket creation. When true, WriteBatch packs same-
+	// destination consecutive packets into a single sendmmsg entry with a
+	// UDP_SEGMENT cmsg; otherwise each packet is its own entry.
+	gsoSupported   bool
+	maxGSOSegments int
+
+	// UDP GRO (recvmsg with UDP_GRO cmsg) support. groSupported is probed
+	// once at socket creation. When true, listenOutBatch allocates larger
+	// RX buffers and a per-entry cmsg slot so the kernel can coalesce
+	// consecutive same-flow datagrams into a single recvmmsg entry; the
+	// delivered cmsg carries the gso_size used to split them back apart.
+	groSupported bool
+
+	// ecnRecvSupported is true when IP_RECVTOS / IPV6_RECVTCLASS was
+	// successfully enabled — the kernel will deliver the outer IP-ECN of
+	// each arriving datagram as a per-slot cmsg, and listenOutBatch passes
+	// the parsed value to the EncReader callback for RFC 6040 combine.
+	ecnRecvSupported bool
 }

 func setReusePort(network, address string, c syscall.RawConn) error {
@@ -73,10 +106,11 @@ func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int)
 	}
 	//gotta find out if we got an AF_INET6 socket or not:
 	out := &StdConn{
-		udpConn: udpConn,
-		rawConn: rawConn,
-		l:       l,
-		batch:   batch,
+		udpConn:        udpConn,
+		rawConn:        rawConn,
+		l:              l,
+		batch:          batch,
+		maxGSOSegments: 1,
 	}

 	af, err := out.getSockOptInt(unix.SO_DOMAIN)
@@ -87,15 +121,71 @@ func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int)
 	out.isV4 = af == unix.AF_INET

 	out.prepareWriteMessages(MaxWriteBatch)
-	out.sendmmsgCB = out.sendmmsgRun
+	out.rawSend.msgs = out.writeMsgs
+	out.rawSend.bind()
+
+	out.prepareGSO()
+	// GRO delivers coalesced superpackets that need a cmsg to split back
+	// into segments. The single-packet RX path uses ReadFromUDPAddrPort
+	// and cannot see that cmsg, so only enable GRO for the batch path.
+	if batch > 1 {
+		out.prepareGRO()
+	}
+	// Best-effort: ask the kernel to deliver outer IP-ECN as ancillary data
+	// on every recvmmsg slot so the decap side can apply RFC 6040 combine.
+	// On older kernels these may not exist; failing here just means we get
+	// 0 (Not-ECT) on every slot, which is the same as ecn_mode=disable.
+	out.prepareECNRecv()

 	return out, nil
 }

+// prepareWriteMessages allocates one mmsghdr/iovec/sockaddr/cmsg scratch
+// slot per sendmmsg entry. The iovec slab is sized to n so all entries'
+// iovecs share one allocation; per-entry fan-out is further capped at
+// maxGSOSegments. Hdr.Iov / Hdr.Iovlen / Hdr.Control / Hdr.Controllen are
+// wired per call since each entry can span a variable number of iovecs
+// and may or may not carry a cmsg.
+//
+// Per-mmsghdr cmsg layout. Each entry's slot of length writeCmsgSpace holds
+// up to two cmsg headers placed at fixed offsets:
+//
+//	[0 .. writeCmsgSegSpace)              UDP_SEGMENT (gso_size, uint16)
+//	[writeCmsgSegSpace .. writeCmsgSpace) IP_TOS or IPV6_TCLASS (int32)
+//
+// Both headers are pre-filled once here; per-call we only rewrite the data
+// payload and toggle Hdr.Control / Hdr.Controllen to point at whichever
+// subset applies (none / segment-only / ecn-only / both).
 func (u *StdConn) prepareWriteMessages(n int) {
 	u.writeMsgs = make([]rawMessage, n)
 	u.writeIovs = make([]iovec, n)
 	u.writeNames = make([][]byte, n)
+	u.writeEntryEnd = make([]int, n)
+
+	u.writeCmsgSegSpace = unix.CmsgSpace(2)
+	u.writeCmsgEcnSpace = unix.CmsgSpace(4)
+	u.writeCmsgSpace = u.writeCmsgSegSpace + u.writeCmsgEcnSpace
+	u.writeCmsg = make([]byte, n*u.writeCmsgSpace)
+
+	ecnLevel := int32(unix.IPPROTO_IP)
+	ecnType := int32(unix.IP_TOS)
+	if !u.isV4 {
+		ecnLevel = unix.IPPROTO_IPV6
+		ecnType = unix.IPV6_TCLASS
+	}
+
+	for k := 0; k < n; k++ {
+		base := k * u.writeCmsgSpace
+		seg := (*unix.Cmsghdr)(unsafe.Pointer(&u.writeCmsg[base]))
+		seg.Level = unix.SOL_UDP
+		seg.Type = unix.UDP_SEGMENT
+		setCmsgLen(seg, unix.CmsgLen(2))
+
+		ecn := (*unix.Cmsghdr)(unsafe.Pointer(&u.writeCmsg[base+u.writeCmsgSegSpace]))
+		ecn.Level = ecnLevel
+		ecn.Type = ecnType
+		setCmsgLen(ecn, unix.CmsgLen(4))
+	}

 	for i := range u.writeMsgs {
 		u.writeNames[i] = make([]byte, unix.SizeofSockaddrInet6)
@@ -103,6 +193,139 @@ func (u *StdConn) prepareWriteMessages(n int) {
 	}
 }

+// maxGSOBytes bounds the total payload per sendmsg() when UDP_SEGMENT is
+// set. The kernel stitches all iovecs into a single skb whose length the
+// UDP length field can represent, and also enforces sk_gso_max_size (which
+// on most devices is 65536). We use 65000 to leave headroom under the
+// 65535 UDP-length cap, avoiding EMSGSIZE on large TSO superpackets.
+const maxGSOBytes = 65000
+
+// prepareGSO probes UDP_SEGMENT support and sets u.gsoSupported on success.
+// Best-effort; failure leaves it false.
+func (u *StdConn) prepareGSO() {
+	u.maxGSOSegments = 63 //gotta be one less than the max so we can still attach a header
+
+	var probeErr error
+	if err := u.rawConn.Control(func(fd uintptr) {
+		probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_SEGMENT, 0)
+	}); err != nil {
+		u.l.Info("udp: GSO disabled", "reason", "rawconn control failed", "error", err)
+		recordCapability("udp.gso.enabled", false)
+		return
+	}
+	if probeErr != nil {
+		u.l.Info("udp: GSO disabled", "reason", "kernel rejected probe", "error", probeErr)
+		recordCapability("udp.gso.enabled", false)
+		return
+	}
+
+	var un unix.Utsname
+	if err := unix.Uname(&un); err != nil {
+		u.l.Info("udp: GSO disabled", "reason", "kernel uname probe failed", "error", err)
+		recordCapability("udp.gso.enabled", false)
+		return
+	}
+	major, minor := parseRelease(string(un.Release[:]))
+	if major > 5 || (major == 5 && minor >= 5) {
+		u.maxGSOSegments = 127
+	}
+
+	u.gsoSupported = true
+	u.l.Info("udp: GSO enabled", "maxGSOSegments", u.maxGSOSegments)
+	recordCapability("udp.gso.enabled", true)
+}
+
+// udpGROBufferSize sizes the per-entry recvmmsg buffer when UDP_GRO is on.
+// The kernel stitches a run of same-flow datagrams into a single skb whose
+// length is bounded by sk_gso_max_size (typically 65535); anything larger
+// would be MSG_TRUNCed. We use the maximum representable UDP length so a
+// full superpacket always lands intact.
+const udpGROBufferSize = 65535
+
+// udpGROCmsgPayload is the size of the UDP_GRO cmsg data delivered by the
+// kernel: a single int (gso_size in bytes). See udp_cmsg_recv() in
+// net/ipv4/udp.c.
+const udpGROCmsgPayload = 4
+
+// prepareGRO turns on UDP_GRO so the kernel coalesces consecutive same-flow
+// datagrams into one recvmmsg entry, with a cmsg carrying the gso_size used
+// to split them back apart on the application side.
+func (u *StdConn) prepareGRO() {
+	var probeErr error
+	if err := u.rawConn.Control(func(fd uintptr) {
+		probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_GRO, 1)
+	}); err != nil {
+		u.l.Info("udp: GRO disabled", "reason", "rawconn control failed", "error", err)
+		recordCapability("udp.gro.enabled", false)
+		return
+	}
+	if probeErr != nil {
+		u.l.Info("udp: GRO disabled", "reason", "kernel rejected probe", "error", probeErr)
+		recordCapability("udp.gro.enabled", false)
+		return
+	}
+	u.groSupported = true
+	u.l.Info("udp: GRO enabled")
+	recordCapability("udp.gro.enabled", true)
+}
+
+// prepareECNRecv turns on IP_RECVTOS / IPV6_RECVTCLASS so the outer IP-ECN
+// field of each arriving datagram is delivered as ancillary data alongside
+// the payload. listenOutBatch reads it via parseRecvCmsg and passes the
+// codepoint through the EncReader for RFC 6040 combine on the decap side.
+// Best-effort: we keep going on failure.
+func (u *StdConn) prepareECNRecv() {
+	var v4err, v6err error
+	if err := u.rawConn.Control(func(fd uintptr) {
+		v4err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_RECVTOS, 1)
+		if !u.isV4 {
+			v6err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_RECVTCLASS, 1)
+		}
+	}); err != nil {
+		u.l.Info("udp: outer-ECN RX disabled", "reason", "rawconn control failed", "error", err)
+		recordCapability("udp.ecn_rx.enabled", false)
+		return
+	}
+	if u.isV4 { //only check the V4 attempt
+		if v4err != nil {
+			u.l.Info("udp: outer-ECN RX disabled", "reason", "kernel rejected probe", "error", v4err)
+			recordCapability("udp.ecn_rx.enabled", false)
+		} else {
+			u.ecnRecvSupported = true
+			u.l.Info("udp: outer-ECN RX enabled")
+			recordCapability("udp.ecn_rx.enabled", true)
+		}
+		return
+	} else {
+		if v6err != nil { //no V6 ECN? disable it.
+			u.l.Info("udp: outer-ECN RX disabled", "reason", "kernel rejected probe", "error", errors.Join(v4err, v6err))
+			recordCapability("udp.ecn_rx.enabled", false)
+			return
+		} else if v4err != nil { //no V4, but yes V6? Low level warning. Could be a V6-specific bind.
+			u.l.Debug("udp: outer-ECN RX degraded", "reason", "kernel rejected probe on IPv4", "error", v4err)
+		}
+		// all good
+		u.ecnRecvSupported = true
+		u.l.Info("udp: outer-ECN RX enabled")
+		recordCapability("udp.ecn_rx.enabled", true)
+		return
+	}
+}
+
+// recordCapability registers (or updates) a boolean gauge for one of the
+// kernel-feature probes. Gauges go to 1 when the feature is enabled, 0 when
+// it is not — dashboards can show degraded state on partially-supported
+// kernels at a glance. Calling repeatedly with the same name updates the
+// existing gauge rather than registering a duplicate.
+func recordCapability(name string, enabled bool) {
+	g := metrics.GetOrRegisterGauge(name, nil)
+	if enabled {
+		g.Update(1)
+	} else {
+		g.Update(0)
+	}
+}
+
 func (u *StdConn) SupportsMultipleReaders() bool {
 	return true
 }
@@ -221,16 +444,15 @@ func (u *StdConn) listenOutSingle(r EncReader, flush func()) error {
 	}
 }

-// readSockaddr decodes the source address out of a recvmmsg name buffer
-func (u *StdConn) readSockaddr(name []byte) netip.AddrPort {
+func getFrom(names [][]byte, i int, isV4 bool) netip.AddrPort {
 	var ip netip.Addr
-	// It's ok to skip the ok check here, the slicing is the only error that can occur and it will panic
-	if u.isV4 {
-		ip, _ = netip.AddrFromSlice(name[4:8])
+	// Its ok to skip the ok check here, the slicing is the only error that can occur and it will panic
+	if isV4 {
+		ip, _ = netip.AddrFromSlice(names[i][4:8])
 	} else {
-		ip, _ = netip.AddrFromSlice(name[8:24])
+		ip, _ = netip.AddrFromSlice(names[i][8:24])
 	}
-	return netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(name[2:4]))
+	return netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(names[i][2:4]))
 }

 func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
@@ -239,6 +461,16 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {

 	bufSize := MTU
 	cmsgSpace := 0
+	if u.groSupported {
+		bufSize = udpGROBufferSize
+		cmsgSpace = unix.CmsgSpace(udpGROCmsgPayload)
+	}
+	if u.ecnRecvSupported {
+		// IP_TOS arrives as 1 byte; IPV6_TCLASS arrives as a 4-byte int.
+		// Reserve enough for the wider of the two so the same buffer fits
+		// either family alongside any UDP_GRO cmsg.
+		cmsgSpace += unix.CmsgSpace(4)
+	}
 	msgs, buffers, names, _ := u.PrepareRawMessages(u.batch, bufSize, cmsgSpace)

 	//reader needs to capture variables from this function, since it's used as a lambda with rawConn.Read
@@ -249,6 +481,11 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
 	}

 	for {
+		if cmsgSpace > 0 {
+			for i := range msgs {
+				setMsgControllen(&msgs[i].Hdr, cmsgSpace)
+			}
+		}
 		err := u.rawConn.Read(reader)
 		if err != nil {
 			return err
@@ -258,13 +495,75 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
 		}

 		for i := 0; i < n; i++ {
-			r(u.readSockaddr(names[i]), buffers[i][:msgs[i].Len], RxMeta{})
+			from := getFrom(names, i, u.isV4)
+			payload := buffers[i][:msgs[i].Len]
+
+			segSize := 0
+			outerECN := byte(0)
+			if cmsgSpace > 0 {
+				segSize, outerECN = parseRecvCmsg(&msgs[i].Hdr, u.groSupported, u.ecnRecvSupported, u.isV4)
+			}
+
+			if segSize <= 0 || segSize >= len(payload) {
+				r(from, payload, RxMeta{OuterECN: outerECN})
+			} else {
+				for off := 0; off < len(payload); off += segSize {
+					end := off + segSize
+					if end > len(payload) {
+						end = len(payload)
+					}
+					seg := payload[off:end]
+					r(from, seg, RxMeta{OuterECN: outerECN})
+				}
+			}
 		}

 		flush()
 	}
 }

+// parseRecvCmsg walks the per-slot ancillary buffer once and extracts up to
+// two values of interest in a single pass: the UDP_GRO gso_size (when
+// wantGRO is true) and the outer IP-level ECN codepoint stamped on the
+// carrier (when wantECN is true). Returns zeros for whichever field is not
+// requested or not present. isV4 selects between IP_TOS (1-byte) and
+// IPV6_TCLASS (4-byte int) cmsg payloads.
+func parseRecvCmsg(hdr *msghdr, wantGRO, wantECN bool, isV4 bool) (gso int, ecn byte) {
+	controllen := int(hdr.Controllen)
+	if controllen < unix.SizeofCmsghdr || hdr.Control == nil {
+		return 0, 0
+	}
+	ctrl := unsafe.Slice(hdr.Control, controllen)
+	off := 0
+	for off+unix.SizeofCmsghdr <= len(ctrl) {
+		ch := (*unix.Cmsghdr)(unsafe.Pointer(&ctrl[off]))
+		clen := int(ch.Len)
+		if clen < unix.SizeofCmsghdr || off+clen > len(ctrl) {
+			return gso, ecn
+		}
+		dataOff := off + unix.CmsgLen(0)
+		switch {
+		case wantGRO && ch.Level == unix.SOL_UDP && ch.Type == unix.UDP_GRO:
+			if dataOff+udpGROCmsgPayload <= len(ctrl) {
+				gso = int(int32(binary.NativeEndian.Uint32(ctrl[dataOff : dataOff+udpGROCmsgPayload])))
+			}
+		case wantECN && isV4 && ch.Level == unix.IPPROTO_IP && ch.Type == unix.IP_TOS:
+			// IP_TOS arrives as a single byte; only the low 2 bits are ECN.
+			if dataOff+1 <= len(ctrl) {
+				ecn = ctrl[dataOff] & 0x03
+			}
+		case wantECN && !isV4 && ch.Level == unix.IPPROTO_IPV6 && ch.Type == unix.IPV6_TCLASS:
+			// IPV6_TCLASS arrives as a 4-byte int; ECN is the low 2 bits.
+			if dataOff+4 <= len(ctrl) {
+				ecn = byte(binary.NativeEndian.Uint32(ctrl[dataOff:dataOff+4])) & 0x03
+			}
+		}
+		// Advance by the aligned cmsg space.
+		off += unix.CmsgSpace(clen - unix.CmsgLen(0))
+	}
+	return gso, ecn
+}
+
 func (u *StdConn) ListenOut(r EncReader, flush func()) error {
 	if u.batch == 1 {
 		return u.listenOutSingle(r, flush)
@@ -279,89 +578,222 @@ func (u *StdConn) WriteTo(b []byte, ip netip.AddrPort) error {
 }

 // WriteBatch sends bufs via sendmmsg(2) using the preallocated scratch on
-// StdConn. If supported, consecutive packets to the same destination with
-// matching segment sizes (all but possibly the last) are coalesced into a
-// single mmsghdr entry
+// StdConn. Consecutive packets to the same destination with matching segment
+// sizes (all but possibly the last) are coalesced into a single mmsghdr entry
+// carrying a UDP_SEGMENT cmsg, so one syscall can mix runs of GSO superpackets
+// with plain one-off datagrams. Without GSO support every packet is its own
+// entry, matching the prior behaviour.
 //
-// If sendmmsg returns an error and zero entries went out, we fall back to
+// Chunks larger than the scratch are processed across multiple syscalls. If
+// sendmmsg returns an error AND zero entries went out we fall back to
 // per-packet WriteTo for that chunk so the caller still gets best-effort
-// delivery. On a partial send we resume at the first un-acked entry on
-// the next iteration.
-func (u *StdConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, _ []byte) error {
-	for i := 0; i < len(bufs); {
-		chunk := min(len(bufs)-i, len(u.writeMsgs))
+// delivery; on a partial-success error we just replay the remainder.
+func (u *StdConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, ecns []byte) error {
+	if len(bufs) != len(addrs) {
+		return fmt.Errorf("WriteBatch: len(bufs)=%d != len(addrs)=%d", len(bufs), len(addrs))
+	}
+	if ecns != nil && len(ecns) != len(bufs) {
+		return fmt.Errorf("WriteBatch: len(ecns)=%d != len(bufs)=%d", len(ecns), len(bufs))
+	}

-		for k := 0; k < chunk; k++ {
-			u.writeIovs[k].Base = &bufs[i+k][0]
-			setIovLen(&u.writeIovs[k], len(bufs[i+k]))
+	// Callers deliver same-destination packets contiguously and in counter
+	// order, so we run the GSO planner directly without a pre-sort. A
+	// sorting pass measurably hurt throughput in microbenchmarks while
+	// providing no observed reordering benefit.

-			nlen, err := writeSockaddr(u.writeNames[k], addrs[i+k], u.isV4)
+	i := 0
+	for i < len(bufs) {
+		baseI := i
+		entry := 0
+		iovIdx := 0
+		for entry < len(u.writeMsgs) && i < len(bufs) {
+			iovBudget := len(u.writeIovs) - iovIdx
+			if iovBudget < 1 {
+				break
+			}
+			runLen, segSize := u.planRun(bufs, addrs, ecns, i, iovBudget)
+			if runLen == 0 {
+				break
+			}
+
+			for k := 0; k < runLen; k++ {
+				b := bufs[i+k]
+				if len(b) == 0 {
+					u.writeIovs[iovIdx+k].Base = nil
+					setIovLen(&u.writeIovs[iovIdx+k], 0)
+				} else {
+					u.writeIovs[iovIdx+k].Base = &b[0]
+					setIovLen(&u.writeIovs[iovIdx+k], len(b))
+				}
+			}
+
+			nlen, err := writeSockaddr(u.writeNames[entry], addrs[i], u.isV4)
 			if err != nil {
 				return err
 			}

-			hdr := &u.writeMsgs[k].Hdr
-			hdr.Iov = &u.writeIovs[k]
-			setMsgIovlen(hdr, 1)
+			hdr := &u.writeMsgs[entry].Hdr
+			hdr.Iov = &u.writeIovs[iovIdx]
+			setMsgIovlen(hdr, runLen)
 			hdr.Namelen = uint32(nlen)
+
+			var ecn byte
+			if ecns != nil {
+				ecn = ecns[i]
+			}
+			u.writeEntryCmsg(entry, runLen, segSize, ecn)
+
+			i += runLen
+			iovIdx += runLen
+			u.writeEntryEnd[entry] = i
+			entry++
 		}

-		sent, serr := u.sendmmsg(chunk)
+		if entry == 0 {
+			return fmt.Errorf("sendmmsg: no progress")
+		}
+
+		sent, serr := u.sendmmsg(entry)
 		if serr != nil && sent <= 0 {
-			// sendmmsg returns -1 / sent=0 when entry 0 itself failed; log
-			// that entry's destination and fall back to per-packet WriteTo
-			// for the whole chunk so the caller still gets best-effort
-			// delivery without duplicating packets the kernel accepted.
-			u.l.Warn("sendmmsg failed, falling back to per-packet WriteTo",
-				"err", serr,
-				"entries", chunk,
-				"entry0_dst", addrs[i],
+			// Nothing went out for this chunk; fall back to WriteTo for each
+			// packet that was queued this iteration. We only enter this path
+			// when sendmmsg returned an error AND zero entries succeeded —
+			// otherwise the partial-success advance below replays only the
+			// remainder, avoiding duplicates of already-sent packets.
+			//
+			// sent=-1 from sendmmsg means message 0 itself failed (partial
+			// success returns the count instead), so log entry 0's parameters
+			// — that's the entry the kernel rejected.
+			hdr0 := &u.writeMsgs[0].Hdr
+			runLen0 := u.writeEntryEnd[0] - baseI
+			seg0 := len(bufs[baseI])
+			ecn0 := byte(0)
+			if ecns != nil {
+				ecn0 = ecns[baseI]
+			}
+			u.l.Warn("sendmmsg had problem",
+				"sent", sent, "err", serr,
+				"entries", entry,
+				"entry0_runLen", runLen0,
+				"entry0_segSize", seg0,
+				"entry0_iovlen", hdr0.Iovlen,
+				"entry0_controllen", hdr0.Controllen,
+				"entry0_namelen", hdr0.Namelen,
+				"entry0_ecn", ecn0,
+				"entry0_dst", addrs[baseI],
 				"isV4", u.isV4,
+				"gso", u.gsoSupported,
+				"gro", u.groSupported,
 			)
-			for k := 0; k < chunk; k++ {
-				if werr := u.WriteTo(bufs[i+k], addrs[i+k]); werr != nil {
+			for k := baseI; k < i; k++ {
+				if werr := u.WriteTo(bufs[k], addrs[k]); werr != nil {
 					return werr
 				}
 			}
-			i += chunk
 			continue
 		}
-		i += sent
+		if sent == 0 {
+			return fmt.Errorf("sendmmsg made no progress")
+		}
+		// Rewind i to the end of the last successfully sent entry. For a
+		// full-success send this leaves i unchanged; for a partial send it
+		// replays the remainder on the next outer-loop iteration.
+		i = u.writeEntryEnd[sent-1]
 	}
 	return nil
 }

-// sendmmsg issues sendmmsg(2) against the first n entries of u.writeMsgs.
-// The bound u.sendmmsgCB is passed to rawConn.Write so no closure is
-// allocated per call; inputs and outputs ride on the StdConn fields.
-func (u *StdConn) sendmmsg(n int) (int, error) {
-	u.sendmmsgN = n
-	u.sendmmsgSent = 0
-	u.sendmmsgErrno = 0
-	if err := u.rawConn.Write(u.sendmmsgCB); err != nil {
-		return u.sendmmsgSent, err
+// planRun groups consecutive packets starting at `start` that can be sent as
+// a single UDP GSO superpacket (one sendmmsg entry with UDP_SEGMENT cmsg).
+// A run of length 1 means the entry carries no UDP_SEGMENT cmsg and the
+// kernel treats it as a plain datagram. Returns the run length and the
+// per-segment size (which equals len(bufs[start])). Without GSO support
+// every call returns runLen=1. Outer ECN (when ecns != nil) is also a run
+// boundary — the kernel stamps one outer codepoint per sendmsg entry, so
+// mixing values inside a run would lose information.
+func (u *StdConn) planRun(bufs [][]byte, addrs []netip.AddrPort, ecns []byte, start, iovBudget int) (int, int) {
+	if start >= len(bufs) || iovBudget < 1 {
+		return 0, 0
 	}
-	if u.sendmmsgErrno != 0 {
-		return u.sendmmsgSent, &net.OpError{Op: "sendmmsg", Err: u.sendmmsgErrno}
+	segSize := len(bufs[start])
+	if !u.gsoSupported || segSize == 0 || segSize > maxGSOBytes {
+		return 1, segSize
 	}
-	return u.sendmmsgSent, nil
+	dst := addrs[start]
+	var ecn byte
+	if ecns != nil {
+		ecn = ecns[start]
+	}
+	maxLen := u.maxGSOSegments
+	if iovBudget < maxLen {
+		maxLen = iovBudget
+	}
+	runLen := 1
+	total := segSize
+	for runLen < maxLen && start+runLen < len(bufs) {
+		nextLen := len(bufs[start+runLen])
+		if nextLen == 0 || nextLen > segSize {
+			break
+		}
+		if addrs[start+runLen] != dst {
+			break
+		}
+		if ecns != nil && ecns[start+runLen] != ecn {
+			break
+		}
+		if total+nextLen > maxGSOBytes {
+			break
+		}
+		total += nextLen
+		runLen++
+		if nextLen < segSize {
+			// A short packet must be the last in the run.
+			break
+		}
+	}
+	return runLen, segSize
 }

-// sendmmsgRun is the rawConn.Write callback. It is bound once into
-// u.sendmmsgCB at construction so it stays alloc-free in the hot path;
-// inputs (sendmmsgN) and outputs (sendmmsgSent, sendmmsgErrno) ride on
-// the receiver rather than escaping locals.
-func (u *StdConn) sendmmsgRun(fd uintptr) bool {
-	r1, _, errno := unix.Syscall6(unix.SYS_SENDMMSG, fd,
-		uintptr(unsafe.Pointer(&u.writeMsgs[0])), uintptr(u.sendmmsgN),
-		0, 0, 0,
-	)
-	if errno == syscall.EAGAIN || errno == syscall.EWOULDBLOCK {
-		return false
+// writeEntryCmsg sets up the per-mmsghdr Hdr.Control / Hdr.Controllen for one
+// entry. It writes the UDP_SEGMENT payload when runLen >= 2 and the
+// IP_TOS/IPV6_TCLASS payload when ecn != 0, then points hdr.Control at the
+// smallest contiguous span that covers whichever cmsg(s) actually apply.
+func (u *StdConn) writeEntryCmsg(entry, runLen, segSize int, ecn byte) {
+	hdr := &u.writeMsgs[entry].Hdr
+	useSeg := runLen >= 2
+	useEcn := ecn != 0
+	base := entry * u.writeCmsgSpace
+
+	if useSeg {
+		dataOff := base + unix.CmsgLen(0)
+		binary.NativeEndian.PutUint16(u.writeCmsg[dataOff:dataOff+2], uint16(segSize))
 	}
-	u.sendmmsgSent = int(r1)
-	u.sendmmsgErrno = errno
-	return true
+	if useEcn {
+		dataOff := base + u.writeCmsgSegSpace + unix.CmsgLen(0)
+		binary.NativeEndian.PutUint32(u.writeCmsg[dataOff:dataOff+4], uint32(ecn))
+	}
+
+	switch {
+	case useSeg && useEcn:
+		hdr.Control = &u.writeCmsg[base]
+		setMsgControllen(hdr, u.writeCmsgSpace)
+	case useSeg:
+		hdr.Control = &u.writeCmsg[base]
+		setMsgControllen(hdr, u.writeCmsgSegSpace)
+	case useEcn:
+		hdr.Control = &u.writeCmsg[base+u.writeCmsgSegSpace]
+		setMsgControllen(hdr, u.writeCmsgEcnSpace)
+	default:
+		hdr.Control = nil
+		setMsgControllen(hdr, 0)
+	}
+}
+
+// sendmmsg issues sendmmsg(2) over u.rawConn against the first n entries
+// of u.writeMsgs. Routes through u.rawSend so the per-call kernel callback
+// stays alloc-free.
+func (u *StdConn) sendmmsg(n int) (int, error) {
+	return u.rawSend.send(u.rawConn, n)
 }

 // writeSockaddr encodes addr into buf (which must be at least
@@ -497,3 +929,22 @@ func NewUDPStatsEmitter(udpConns []Conn) func() {
 		}
 	}
 }
+
+func parseRelease(r string) (major, minor int) {
+	// strip anything after the second dot or any non-digit
+	parts := strings.SplitN(r, ".", 3)
+	if len(parts) < 2 {
+		return 0, 0
+	}
+	major, _ = strconv.Atoi(parts[0])
+	// minor may have trailing junk like "15-generic"
+	mp := parts[1]
+	for i, c := range mp {
+		if c < '0' || c > '9' {
+			mp = mp[:i]
+			break
+		}
+	}
+	minor, _ = strconv.Atoi(mp)
+	return
+}
--- a/wire/wire.go
+++ b/wire/wire.go
@@ -7,11 +7,40 @@ type TunPacket struct {
 	Bytes []byte
 	// Meta contains other information to help process the packet correctly, such as offsets for segmentation offloads
 	// Fields in Meta should be as portable/platform-agnostic as possible.
-	Meta struct{}
+	Meta GSOInfo
 }

-// PerSegment invokes fn once per segment of pkt.
-// This is a stub implementation that does not actually support segmentation
-func (t *TunPacket) PerSegment(fn func(seg []byte) error) error {
-	return fn(t.Bytes)
+// GSOInfo describes a kernel-supplied superpacket sitting in Packet.Bytes.
+// The zero value means "not a superpacket" — Bytes is one regular IP
+// datagram and no segmentation is required.
+type GSOInfo struct {
+	// Size is the GSO segment size: max payload bytes per segment
+	// (== TCP MSS for TSO, == UDP payload chunk for USO). Zero means
+	// not a superpacket.
+	Size uint16
+	// HdrLen is the total L3+L4 header length within Bytes (already
+	// corrected via correctHdrLen, so safe to slice on).
+	HdrLen uint16
+	// CsumStart is the L4 header offset inside Bytes (== L3 header
+	// length).
+	CsumStart uint16
+	// Proto picks the L4 protocol (TCP or UDP) so the segmenter knows
+	// which checksum/header layout to apply.
+	Proto GSOProto
 }
+
+// GSOProto selects the L4 protocol for a GSO superpacket. Determines which
+// VIRTIO_NET_HDR_GSO_* type the writer stamps and which checksum offset
+// inside the transport header virtio NEEDS_CSUM expects.
+type GSOProto uint8
+
+const (
+	GSOProtoNone GSOProto = iota
+	GSOProtoTCP
+	GSOProtoUDP
+)
+
+// IsSuperpacket reports whether g describes a multi-segment GSO/USO
+// superpacket that needs segmentation before its bytes can be encrypted
+// and sent on the wire.
+func (g GSOInfo) IsSuperpacket() bool { return g.Size > 0 }
--- a/wire/wire_generic.go
+++ b/wire/wire_generic.go
@@ -0,0 +1,10 @@
+//go:build !linux
+// +build !linux
+
+package wire
+
+// PerSegment invokes fn once per segment of pkt.
+// This is a stub implementation that does not actually support segmentation
+func (t *TunPacket) PerSegment(fn func(seg []byte) error) error {
+	return fn(t.Bytes)
+}
--- a/wire/wire_linux.go
+++ b/wire/wire_linux.go
@@ -0,0 +1,32 @@
+package wire
+
+import (
+	"fmt"
+
+	"github.com/slackhq/nebula/overlay/tio/virtio"
+)
+
+// PerSegment invokes fn once per segment of pkt. For non-GSO pkts
+// fn is called once with pkt.Bytes (no segmentation, no copy). For GSO/USO
+// superpackets fn is called once per segment with a slice of pkt.Bytes
+// holding that segment's plaintext (a freshly-patched L3+L4 header sliced
+// in front of the original payload chunk). The slide is destructive: pkt is
+// consumed by this call and its bytes are in an undefined state when
+// PerSegment returns. Callers must not retain pkt or any earlier
+// seg slice past fn's return for that segment. The scratch parameter is
+// unused on the destructive path and kept only for cross-platform
+// signature compatibility. Aborts and returns the first error from fn or
+// from per-segment construction.
+func (t *TunPacket) PerSegment(fn func(seg []byte) error) error {
+	if !t.Meta.IsSuperpacket() {
+		return fn(t.Bytes)
+	}
+	switch t.Meta.Proto {
+	case GSOProtoTCP:
+		return virtio.SegmentTCP(t.Bytes, t.Meta.HdrLen, t.Meta.CsumStart, t.Meta.Size, fn)
+	case GSOProtoUDP:
+		return virtio.SegmentUDP(t.Bytes, t.Meta.HdrLen, t.Meta.CsumStart, t.Meta.Size, fn)
+	default:
+		return fmt.Errorf("unsupported gso proto: %d", t.Meta.Proto)
+	}
+}