GSO/GRO offloads, with TCP+ECN and UDP support

2026-05-16 04:47:38 +02:00 · 2026-04-17 10:25:05 -05:00
parent f95857b4c3
commit 5d35351437
60 changed files with 6915 additions and 283 deletions
--- a/udp/conn.go
+++ b/udp/conn.go
@@ -11,12 +11,25 @@ const MTU = 9001
 // MaxWriteBatch is the largest batch any Conn.WriteBatch implementation is
 // required to accept. Callers SHOULD NOT pass more than this per call; Linux
 // backends preallocate sendmmsg scratch sized to this value, so exceeding it
-// only costs a chunked retry.
+// only costs additional sendmmsg chunks within a single WriteBatch call.
 const MaxWriteBatch = 128

+// RxMeta carries per-packet metadata extracted from the RX path (ancillary
+// data, kernel offload state, etc.) and passed to EncReader callbacks.
+// Backends that do not produce a particular signal leave its zero value.
+//
+// OuterECN is the 2-bit IP-level ECN codepoint stamped on the carrier
+// datagram (extracted from IP_TOS / IPV6_TCLASS cmsg on Linux). Zero
+// means Not-ECT, which is also the value backends without ECN RX support
+// supply on every packet.
+type RxMeta struct {
+	OuterECN byte
+}
+
 type EncReader func(
 	addr netip.AddrPort,
 	payload []byte,
+	meta RxMeta,
 )

 type Conn interface {
@@ -30,11 +43,14 @@ type Conn interface {
 	ListenOut(r EncReader, flush func()) error
 	WriteTo(b []byte, addr netip.AddrPort) error
 	// WriteBatch sends a contiguous batch of packets, each with its own
-	// destination. bufs and addrs must have the same length. Linux uses
-	// sendmmsg(2) for a single syscall; other backends fall back to a
-	// WriteTo loop. Returns on the first error; callers may observe a
-	// partial send if some packets went out before the error.
-	WriteBatch(bufs [][]byte, addrs []netip.AddrPort) error
+	// destination. bufs and addrs must have the same length. outerECNs may
+	// be nil (treated as all-zero / Not-ECT); when non-nil it must have the
+	// same length as bufs, and outerECNs[i] is the 2-bit IP-level ECN
+	// codepoint to set on packet i's outer header. Linux uses sendmmsg(2)
+	// for a single syscall and attaches the value as IP_TOS / IPV6_TCLASS
+	// cmsg; other backends ignore it. Returns on the first error; callers
+	// may observe a partial send if some packets went out before the error.
+	WriteBatch(bufs [][]byte, addrs []netip.AddrPort, outerECNs []byte) error
 	ReloadConfig(c *config.C)
 	SupportsMultipleReaders() bool
 	Close() error
@@ -57,7 +73,7 @@ func (NoopConn) SupportsMultipleReaders() bool {
 func (NoopConn) WriteTo(_ []byte, _ netip.AddrPort) error {
 	return nil
 }
-func (NoopConn) WriteBatch(_ [][]byte, _ []netip.AddrPort) error {
+func (NoopConn) WriteBatch(_ [][]byte, _ []netip.AddrPort, _ []byte) error {
 	return nil
 }
 func (NoopConn) ReloadConfig(_ *config.C) {
--- a/udp/raw_sendmmsg_linux.go
+++ b/udp/raw_sendmmsg_linux.go
@@ -0,0 +1,62 @@
+//go:build !android && !e2e_testing
+// +build !android,!e2e_testing
+
+package udp
+
+import (
+	"net"
+	"syscall"
+	"unsafe"
+
+	"golang.org/x/sys/unix"
+)
+
+// rawSendmmsg performs sendmmsg(2) over a syscall.RawConn without
+// allocating a closure per call. The struct holds preallocated in/out
+// scratch (chunk/sent/errno) and a method-value bound at construction so
+// rawConn.Write receives a stable function pointer instead of a fresh
+// closure on every send.
+type rawSendmmsg struct {
+	msgs     []rawMessage
+	chunk    int
+	sent     int
+	errno    syscall.Errno
+	callback func(fd uintptr) bool
+}
+
+// bind wires r.callback to r.run. Must be called once after r.msgs is set;
+// subsequent send calls invoke r.callback without rebinding.
+func (r *rawSendmmsg) bind() { r.callback = r.run }
+
+// run is the preallocated callback rawConn.Write invokes. It reads its
+// input (r.chunk) and writes its outputs (r.sent, r.errno) through the
+// rawSendmmsg fields so the method value does not capture per-call locals
+// and therefore does not heap-allocate.
+func (r *rawSendmmsg) run(fd uintptr) bool {
+	r1, _, errno := unix.Syscall6(unix.SYS_SENDMMSG, fd,
+		uintptr(unsafe.Pointer(&r.msgs[0])), uintptr(r.chunk),
+		0, 0, 0,
+	)
+	if errno == syscall.EAGAIN || errno == syscall.EWOULDBLOCK {
+		return false
+	}
+	r.sent = int(r1)
+	r.errno = errno
+	return true
+}
+
+// send issues sendmmsg over rc against the first n entries of r.msgs.
+// Returns the number of entries the kernel processed and any error;
+// matches the original sendmmsg helper's contract.
+func (r *rawSendmmsg) send(rc syscall.RawConn, n int) (int, error) {
+	r.chunk = n
+	r.sent = 0
+	r.errno = 0
+	if err := rc.Write(r.callback); err != nil {
+		return r.sent, err
+	}
+	if r.errno != 0 {
+		return r.sent, &net.OpError{Op: "sendmmsg", Err: r.errno}
+	}
+	return r.sent, nil
+}
--- a/udp/rx_reorder_linux.go
+++ b/udp/rx_reorder_linux.go
@@ -0,0 +1,86 @@
+//go:build !android && !e2e_testing
+// +build !android,!e2e_testing
+
+package udp
+
+import (
+	"cmp"
+	"net/netip"
+	"slices"
+)
+
+// rxSegment is one nebula packet pulled out of a recvmmsg entry — either a
+// lone datagram or one segment of a GRO superpacket. cnt is the big-endian
+// uint64 message counter at bytes [8:16] of the nebula header; 0 if the
+// segment is too short to contain a header. ecn is the 2-bit IP-level ECN
+// codepoint stamped on the carrier (one value per slot, since GRO requires
+// equal ECN across coalesced datagrams).
+type rxSegment struct {
+	src netip.AddrPort
+	cnt uint64
+	buf []byte
+	ecn byte
+}
+
+// rxReorderBuffer accumulates one recvmmsg batch worth of segments,
+// splits any GRO superpackets at gso_size boundaries, stable-sorts by
+// (src, port, counter), then delivers in order. The reorder distance is
+// bounded by len(buf), which the caller sizes to stay well within the
+// receiver's ReplayWindow so older arrivals are not rejected as replays.
+type rxReorderBuffer struct {
+	buf []rxSegment
+}
+
+func newRxReorderBuffer(initialCap int) *rxReorderBuffer {
+	return &rxReorderBuffer{buf: make([]rxSegment, 0, initialCap)}
+}
+
+// reset prepares the buffer for the next recvmmsg batch.
+func (r *rxReorderBuffer) reset() { r.buf = r.buf[:0] }
+
+// addEntry expands one recvmmsg slot into rxSegments. When segSize <= 0 or
+// segSize >= len(payload) the payload is appended as a single segment;
+// otherwise the kernel-coalesced GRO superpacket is split at segSize
+// boundaries (the kernel guarantees every segment is exactly segSize bytes
+// except for the final one, which may be short). ecn applies uniformly to
+// every produced segment because GRO requires equal ECN across coalesced
+// datagrams.
+func (r *rxReorderBuffer) addEntry(from netip.AddrPort, payload []byte, segSize int, ecn byte) {
+	if segSize <= 0 || segSize >= len(payload) {
+		r.buf = append(r.buf, rxSegment{from, headerCounter(payload), payload, ecn})
+		return
+	}
+	for off := 0; off < len(payload); off += segSize {
+		end := off + segSize
+		if end > len(payload) {
+			end = len(payload)
+		}
+		seg := payload[off:end]
+		r.buf = append(r.buf, rxSegment{from, headerCounter(seg), seg, ecn})
+	}
+}
+
+// sortStable orders the accumulated segments by (src addr, src port,
+// counter). Same-source segments are reordered into counter order;
+// cross-source relative order is determined by a stable address compare so
+// the sort is total and predictable.
+func (r *rxReorderBuffer) sortStable() {
+	slices.SortStableFunc(r.buf, func(a, b rxSegment) int {
+		if c := a.src.Addr().Compare(b.src.Addr()); c != 0 {
+			return c
+		}
+		if c := cmp.Compare(a.src.Port(), b.src.Port()); c != 0 {
+			return c
+		}
+		return cmp.Compare(a.cnt, b.cnt)
+	})
+}
+
+// deliver invokes fn once per segment in sorted order, then nils the
+// per-entry buf reference so the next batch's append doesn't alias it.
+func (r *rxReorderBuffer) deliver(fn EncReader) {
+	for k := range r.buf {
+		fn(r.buf[k].src, r.buf[k].buf, RxMeta{OuterECN: r.buf[k].ecn})
+		r.buf[k].buf = nil
+	}
+}
--- a/udp/rx_reorder_linux_test.go
+++ b/udp/rx_reorder_linux_test.go
@@ -0,0 +1,203 @@
+//go:build !android && !e2e_testing
+// +build !android,!e2e_testing
+
+package udp
+
+import (
+	"encoding/binary"
+	"net/netip"
+	"testing"
+)
+
+// makeNebulaPkt returns a buffer whose [8:16] bytes encode the given
+// counter big-endian, the rest left zero. Anything shorter than 16 bytes
+// would yield counter 0; tests use this to simulate well-formed nebula
+// headers (the rxReorderBuffer doesn't care about anything else).
+func makeNebulaPkt(cnt uint64, payLen int) []byte {
+	if payLen < 16 {
+		payLen = 16
+	}
+	b := make([]byte, payLen)
+	binary.BigEndian.PutUint64(b[8:16], cnt)
+	return b
+}
+
+func srcOf(addr string, port uint16) netip.AddrPort {
+	return netip.AddrPortFrom(netip.MustParseAddr(addr), port)
+}
+
+func TestRxReorderBuffer_LonePassesThrough(t *testing.T) {
+	r := newRxReorderBuffer(8)
+	pkt := makeNebulaPkt(42, 100)
+	r.addEntry(srcOf("1.1.1.1", 4242), pkt, 0, 0x02)
+
+	if got := len(r.buf); got != 1 {
+		t.Fatalf("want 1 entry, got %d", got)
+	}
+	if r.buf[0].cnt != 42 {
+		t.Errorf("counter=%d want 42", r.buf[0].cnt)
+	}
+	if r.buf[0].ecn != 0x02 {
+		t.Errorf("ecn=%#x want 0x02", r.buf[0].ecn)
+	}
+	if len(r.buf[0].buf) != 100 {
+		t.Errorf("buf len=%d want 100", len(r.buf[0].buf))
+	}
+}
+
+func TestRxReorderBuffer_SegSizeGEPayloadIsLone(t *testing.T) {
+	// segSize >= len(payload) means the kernel did not coalesce this slot.
+	r := newRxReorderBuffer(8)
+	pkt := makeNebulaPkt(7, 50)
+	r.addEntry(srcOf("1.1.1.1", 1), pkt, 50, 0)
+	if got := len(r.buf); got != 1 {
+		t.Fatalf("segSize==len: want 1 entry, got %d", got)
+	}
+	r.reset()
+	r.addEntry(srcOf("1.1.1.1", 1), pkt, 60, 0)
+	if got := len(r.buf); got != 1 {
+		t.Fatalf("segSize>len: want 1 entry, got %d", got)
+	}
+}
+
+func TestRxReorderBuffer_GROSplitExactMultiple(t *testing.T) {
+	// 3 segments of 80 bytes each, packed into one 240-byte GRO superpacket.
+	const segSize = 80
+	const numSeg = 3
+	pkt := make([]byte, segSize*numSeg)
+	for i := range numSeg {
+		off := i * segSize
+		binary.BigEndian.PutUint64(pkt[off+8:off+16], uint64(100+i))
+	}
+
+	r := newRxReorderBuffer(8)
+	r.addEntry(srcOf("2.2.2.2", 5555), pkt, segSize, 0x03)
+	if got := len(r.buf); got != numSeg {
+		t.Fatalf("want %d segments, got %d", numSeg, got)
+	}
+	for i, seg := range r.buf {
+		if seg.cnt != uint64(100+i) {
+			t.Errorf("seg %d: cnt=%d want %d", i, seg.cnt, 100+i)
+		}
+		if len(seg.buf) != segSize {
+			t.Errorf("seg %d: buf len=%d want %d", i, len(seg.buf), segSize)
+		}
+		if seg.ecn != 0x03 {
+			t.Errorf("seg %d: ecn=%#x want 0x03 (uniform across GRO)", i, seg.ecn)
+		}
+	}
+}
+
+func TestRxReorderBuffer_GROSplitShortFinal(t *testing.T) {
+	// 200-byte payload, segSize=80 → segments of 80, 80, 40.
+	const segSize = 80
+	pkt := make([]byte, 200)
+	binary.BigEndian.PutUint64(pkt[8:16], 1)
+	binary.BigEndian.PutUint64(pkt[80+8:80+16], 2)
+	binary.BigEndian.PutUint64(pkt[160+8:160+16], 3)
+
+	r := newRxReorderBuffer(8)
+	r.addEntry(srcOf("3.3.3.3", 1), pkt, segSize, 0)
+	if got := len(r.buf); got != 3 {
+		t.Fatalf("want 3 segments, got %d", got)
+	}
+	wantLens := []int{80, 80, 40}
+	for i, seg := range r.buf {
+		if len(seg.buf) != wantLens[i] {
+			t.Errorf("seg %d: len=%d want %d", i, len(seg.buf), wantLens[i])
+		}
+	}
+}
+
+func TestRxReorderBuffer_SortGroupsBySrcThenCounter(t *testing.T) {
+	r := newRxReorderBuffer(8)
+	a := srcOf("1.1.1.1", 1)
+	b := srcOf("2.2.2.2", 1)
+	// Insert deliberately scrambled.
+	r.addEntry(a, makeNebulaPkt(3, 16), 0, 0)
+	r.addEntry(b, makeNebulaPkt(1, 16), 0, 0)
+	r.addEntry(a, makeNebulaPkt(1, 16), 0, 0)
+	r.addEntry(b, makeNebulaPkt(2, 16), 0, 0)
+	r.addEntry(a, makeNebulaPkt(2, 16), 0, 0)
+
+	r.sortStable()
+
+	want := []struct {
+		src netip.AddrPort
+		cnt uint64
+	}{
+		{a, 1}, {a, 2}, {a, 3}, {b, 1}, {b, 2},
+	}
+	if got := len(r.buf); got != len(want) {
+		t.Fatalf("len=%d want %d", got, len(want))
+	}
+	for i, w := range want {
+		if r.buf[i].src != w.src || r.buf[i].cnt != w.cnt {
+			t.Errorf("idx %d: got %v/%d want %v/%d",
+				i, r.buf[i].src, r.buf[i].cnt, w.src, w.cnt)
+		}
+	}
+}
+
+func TestRxReorderBuffer_SortStableAcrossPorts(t *testing.T) {
+	// Same source addr but different ports — must group by port.
+	r := newRxReorderBuffer(8)
+	addr := netip.MustParseAddr("4.4.4.4")
+	p1 := netip.AddrPortFrom(addr, 1)
+	p2 := netip.AddrPortFrom(addr, 2)
+	r.addEntry(p2, makeNebulaPkt(10, 16), 0, 0)
+	r.addEntry(p1, makeNebulaPkt(20, 16), 0, 0)
+	r.addEntry(p2, makeNebulaPkt(5, 16), 0, 0)
+
+	r.sortStable()
+
+	// Expect: p1/20 then p2/5 then p2/10.
+	if r.buf[0].src.Port() != 1 || r.buf[1].src.Port() != 2 || r.buf[2].src.Port() != 2 {
+		t.Fatalf("port order broken: %v %v %v",
+			r.buf[0].src.Port(), r.buf[1].src.Port(), r.buf[2].src.Port())
+	}
+	if r.buf[1].cnt != 5 || r.buf[2].cnt != 10 {
+		t.Errorf("counter order in p2: %d %d (want 5 10)", r.buf[1].cnt, r.buf[2].cnt)
+	}
+}
+
+func TestRxReorderBuffer_DeliverInOrderAndNilsRefs(t *testing.T) {
+	r := newRxReorderBuffer(4)
+	a := srcOf("5.5.5.5", 1)
+	r.addEntry(a, makeNebulaPkt(2, 32), 0, 0x01)
+	r.addEntry(a, makeNebulaPkt(1, 32), 0, 0x01)
+	r.sortStable()
+
+	var seenCnts []uint64
+	var seenECN []byte
+	r.deliver(func(src netip.AddrPort, buf []byte, meta RxMeta) {
+		seenCnts = append(seenCnts, binary.BigEndian.Uint64(buf[8:16]))
+		seenECN = append(seenECN, meta.OuterECN)
+	})
+
+	if len(seenCnts) != 2 || seenCnts[0] != 1 || seenCnts[1] != 2 {
+		t.Errorf("delivery order broken: %v", seenCnts)
+	}
+	if seenECN[0] != 0x01 || seenECN[1] != 0x01 {
+		t.Errorf("ecn passed wrong: %v", seenECN)
+	}
+	for i := range r.buf {
+		if r.buf[i].buf != nil {
+			t.Errorf("buf[%d].buf not nil after deliver", i)
+		}
+	}
+}
+
+func TestRxReorderBuffer_ResetIsReusable(t *testing.T) {
+	r := newRxReorderBuffer(2)
+	r.addEntry(srcOf("6.6.6.6", 1), makeNebulaPkt(1, 16), 0, 0)
+	r.addEntry(srcOf("6.6.6.6", 1), makeNebulaPkt(2, 16), 0, 0)
+	r.reset()
+	if got := len(r.buf); got != 0 {
+		t.Fatalf("after reset len=%d want 0", got)
+	}
+	r.addEntry(srcOf("6.6.6.6", 1), makeNebulaPkt(7, 16), 0, 0)
+	if r.buf[0].cnt != 7 {
+		t.Errorf("after reset+add: cnt=%d want 7", r.buf[0].cnt)
+	}
+}
--- a/udp/udp_darwin.go
+++ b/udp/udp_darwin.go
@@ -140,7 +140,7 @@ func (u *StdConn) WriteTo(b []byte, ap netip.AddrPort) error {
 	}
 }

-func (u *StdConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort) error {
+func (u *StdConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, _ []byte) error {
 	for i, b := range bufs {
 		if err := u.WriteTo(b, addrs[i]); err != nil {
 			return err
@@ -188,7 +188,7 @@ func (u *StdConn) ListenOut(r EncReader, flush func()) error {
 			u.l.Error("unexpected udp socket receive error", "error", err)
 		}

-		r(netip.AddrPortFrom(rua.Addr().Unmap(), rua.Port()), buffer[:n])
+		r(netip.AddrPortFrom(rua.Addr().Unmap(), rua.Port()), buffer[:n], RxMeta{})
 		flush()
 	}
 }
--- a/udp/udp_ecn_outer_linux_test.go
+++ b/udp/udp_ecn_outer_linux_test.go
@@ -0,0 +1,61 @@
+//go:build linux && !android && !e2e_testing
+
+package udp
+
+import (
+	"net/netip"
+	"testing"
+)
+
+// TestPlanRunBreaksOnECNChange confirms that two same-destination, same-size
+// packets with different outer ECN end up in separate sendmmsg entries (the
+// kernel stamps one outer codepoint per entry, so a run that straddled the
+// boundary would silently lose information).
+func TestPlanRunBreaksOnECNChange(t *testing.T) {
+	u := &StdConn{gsoSupported: true}
+	dst := netip.MustParseAddrPort("10.0.0.1:4242")
+
+	bufs := [][]byte{
+		make([]byte, 1200),
+		make([]byte, 1200),
+		make([]byte, 1200),
+	}
+	addrs := []netip.AddrPort{dst, dst, dst}
+
+	t.Run("uniform_ecn_runs_together", func(t *testing.T) {
+		ecns := []byte{0x02, 0x02, 0x02}
+		runLen, segSize := u.planRun(bufs, addrs, ecns, 0, 64)
+		if runLen != 3 {
+			t.Errorf("runLen=%d want 3 (uniform ECT(0))", runLen)
+		}
+		if segSize != 1200 {
+			t.Errorf("segSize=%d want 1200", segSize)
+		}
+	})
+
+	t.Run("ecn_change_truncates_run", func(t *testing.T) {
+		// 0,0,3: first two run together, CE seeds a fresh entry.
+		ecns := []byte{0x00, 0x00, 0x03}
+		runLen, _ := u.planRun(bufs, addrs, ecns, 0, 64)
+		if runLen != 2 {
+			t.Errorf("runLen=%d want 2 (ECN changes at index 2)", runLen)
+		}
+	})
+
+	t.Run("nil_ecns_runs_full", func(t *testing.T) {
+		runLen, _ := u.planRun(bufs, addrs, nil, 0, 64)
+		if runLen != 3 {
+			t.Errorf("runLen=%d want 3 (nil ecns means no break)", runLen)
+		}
+	})
+
+	t.Run("first_ecn_is_singleton", func(t *testing.T) {
+		// Second packet has different ECN from the first → run halts at 1
+		// (the first packet alone forms the run).
+		ecns := []byte{0x00, 0x03, 0x03}
+		runLen, _ := u.planRun(bufs, addrs, ecns, 0, 64)
+		if runLen != 1 {
+			t.Errorf("runLen=%d want 1 (different ECN immediately)", runLen)
+		}
+	})
+}
--- a/udp/udp_generic.go
+++ b/udp/udp_generic.go
@@ -44,7 +44,7 @@ func (u *GenericConn) WriteTo(b []byte, addr netip.AddrPort) error {
 	return err
 }

-func (u *GenericConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort) error {
+func (u *GenericConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, _ []byte) error {
 	for i, b := range bufs {
 		if _, err := u.UDPConn.WriteToUDPAddrPort(b, addrs[i]); err != nil {
 			return err
@@ -102,7 +102,7 @@ func (u *GenericConn) ListenOut(r EncReader, flush func()) error {
 			continue
 		}

-		r(netip.AddrPortFrom(rua.Addr().Unmap(), rua.Port()), buffer[:n])
+		r(netip.AddrPortFrom(rua.Addr().Unmap(), rua.Port()), buffer[:n], RxMeta{})
 		flush()
 	}
 }
--- a/udp/udp_linux.go
+++ b/udp/udp_linux.go
@@ -24,6 +24,58 @@ type StdConn struct {
 	isV4    bool
 	l       *slog.Logger
 	batch   int
+
+	// sendmmsg scratch. Each queue has its own StdConn, so no locking is
+	// needed. Sized to MaxWriteBatch at construction; WriteBatch chunks
+	// larger inputs.
+	writeMsgs  []rawMessage
+	writeIovs  []iovec
+	writeNames [][]byte
+
+	// Per-entry cmsg scratch. writeCmsg is one contiguous slab of
+	// MaxWriteBatch * writeCmsgSpace bytes; each entry holds two cmsg
+	// headers (UDP_SEGMENT then IP_TOS / IPV6_TCLASS) pre-filled once in
+	// prepareWriteMessages. WriteBatch only rewrites the per-call data
+	// payloads and toggles Hdr.Control / Hdr.Controllen to point at
+	// whichever subset of the two cmsgs applies.
+	writeCmsg         []byte
+	writeCmsgSpace    int
+	writeCmsgSegSpace int
+	writeCmsgEcnSpace int
+
+	// writeEntryEnd[e] is the bufs index *after* the last packet packed
+	// into mmsghdr entry e. Used to rewind `i` on partial sendmmsg success.
+	writeEntryEnd []int
+
+	// rawSend wraps the sendmmsg(2) callback in a closure-free helper so
+	// the hot path doesn't heap-allocate a fresh closure per call.
+	rawSend rawSendmmsg
+
+	// UDP GSO (sendmsg with UDP_SEGMENT cmsg) support. gsoSupported is
+	// probed once at socket creation. When true, WriteBatch packs same-
+	// destination consecutive packets into a single sendmmsg entry with a
+	// UDP_SEGMENT cmsg; otherwise each packet is its own entry.
+	gsoSupported bool
+
+	// UDP GRO (recvmsg with UDP_GRO cmsg) support. groSupported is probed
+	// once at socket creation. When true, listenOutBatch allocates larger
+	// RX buffers and a per-entry cmsg slot so the kernel can coalesce
+	// consecutive same-flow datagrams into a single recvmmsg entry; the
+	// delivered cmsg carries the gso_size used to split them back apart.
+	groSupported bool
+
+	// ecnRecvSupported is true when IP_RECVTOS / IPV6_RECVTCLASS was
+	// successfully enabled — the kernel will deliver the outer IP-ECN of
+	// each arriving datagram as a per-slot cmsg, and listenOutBatch passes
+	// the parsed value to the EncReader callback for RFC 6040 combine.
+	ecnRecvSupported bool
+
+	// rxOrder is the per-batch scratch listenOutBatch uses to gather every
+	// segment in a recvmmsg call (after splitting GRO superpackets) and
+	// stable-sort by (source, message-counter) before delivery. Reordering
+	// fits within the receiver's replay window so briefly out-of-order
+	// arrivals do not get rejected as replays.
+	rxOrder *rxReorderBuffer
 }

 func setReusePort(network, address string, c syscall.RawConn) error {
@@ -70,9 +122,196 @@ func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int)
 	}
 	out.isV4 = af == unix.AF_INET

+	out.prepareWriteMessages(MaxWriteBatch)
+	out.rawSend.msgs = out.writeMsgs
+	out.rawSend.bind()
+
+	out.prepareGSO()
+	// GRO delivers coalesced superpackets that need a cmsg to split back
+	// into segments. The single-packet RX path uses ReadFromUDPAddrPort
+	// and cannot see that cmsg, so only enable GRO for the batch path.
+	if batch > 1 {
+		out.prepareGRO()
+	}
+	// Best-effort: ask the kernel to deliver outer IP-ECN as ancillary data
+	// on every recvmmsg slot so the decap side can apply RFC 6040 combine.
+	// On older kernels these may not exist; failing here just means we get
+	// 0 (Not-ECT) on every slot, which is the same as ecn_mode=disable.
+	out.prepareECNRecv()
+
 	return out, nil
 }

+// prepareWriteMessages allocates one mmsghdr/iovec/sockaddr/cmsg scratch
+// slot per sendmmsg entry. The iovec slab is sized to n so all entries'
+// iovecs share one allocation; per-entry fan-out is further capped at
+// maxGSOSegments. Hdr.Iov / Hdr.Iovlen / Hdr.Control / Hdr.Controllen are
+// wired per call since each entry can span a variable number of iovecs
+// and may or may not carry a cmsg.
+//
+// Per-mmsghdr cmsg layout. Each entry's slot of length writeCmsgSpace holds
+// up to two cmsg headers placed at fixed offsets:
+//
+//	[0 .. writeCmsgSegSpace)              UDP_SEGMENT (gso_size, uint16)
+//	[writeCmsgSegSpace .. writeCmsgSpace) IP_TOS or IPV6_TCLASS (int32)
+//
+// Both headers are pre-filled once here; per-call we only rewrite the data
+// payload and toggle Hdr.Control / Hdr.Controllen to point at whichever
+// subset applies (none / segment-only / ecn-only / both).
+func (u *StdConn) prepareWriteMessages(n int) {
+	u.writeMsgs = make([]rawMessage, n)
+	u.writeIovs = make([]iovec, n)
+	u.writeNames = make([][]byte, n)
+	u.writeEntryEnd = make([]int, n)
+
+	u.writeCmsgSegSpace = unix.CmsgSpace(2)
+	u.writeCmsgEcnSpace = unix.CmsgSpace(4)
+	u.writeCmsgSpace = u.writeCmsgSegSpace + u.writeCmsgEcnSpace
+	u.writeCmsg = make([]byte, n*u.writeCmsgSpace)
+
+	ecnLevel := int32(unix.IPPROTO_IP)
+	ecnType := int32(unix.IP_TOS)
+	if !u.isV4 {
+		ecnLevel = unix.IPPROTO_IPV6
+		ecnType = unix.IPV6_TCLASS
+	}
+
+	for k := 0; k < n; k++ {
+		base := k * u.writeCmsgSpace
+		seg := (*unix.Cmsghdr)(unsafe.Pointer(&u.writeCmsg[base]))
+		seg.Level = unix.SOL_UDP
+		seg.Type = unix.UDP_SEGMENT
+		setCmsgLen(seg, unix.CmsgLen(2))
+
+		ecn := (*unix.Cmsghdr)(unsafe.Pointer(&u.writeCmsg[base+u.writeCmsgSegSpace]))
+		ecn.Level = ecnLevel
+		ecn.Type = ecnType
+		setCmsgLen(ecn, unix.CmsgLen(4))
+	}
+
+	for i := range u.writeMsgs {
+		u.writeNames[i] = make([]byte, unix.SizeofSockaddrInet6)
+		u.writeMsgs[i].Hdr.Name = &u.writeNames[i][0]
+	}
+}
+
+// maxGSOSegments caps the per-sendmsg GSO fan-out. Linux kernels have
+// historically capped UDP_MAX_SEGMENTS at 64; newer kernels raise it to 128.
+// We stay one below 64 because the kernel's check is
+//
+//	if (cork->length > cork->gso_size * UDP_MAX_SEGMENTS) return -EINVAL;
+//
+// and cork->length includes the 8-byte UDP header (udp_sendmsg passes
+// ulen = len + sizeof(udphdr) to ip_append_data). Packing exactly 64
+// same-size segments puts cork->length at gso_size*64 + 8, which is one
+// UDP-header over the bound and the kernel rejects the whole sendmmsg
+// with EINVAL. 63 leaves room for the header for any segSize >= 8.
+const maxGSOSegments = 63
+
+// maxGSOBytes bounds the total payload per sendmsg() when UDP_SEGMENT is
+// set. The kernel stitches all iovecs into a single skb whose length the
+// UDP length field can represent, and also enforces sk_gso_max_size (which
+// on most devices is 65536). We use 65000 to leave headroom under the
+// 65535 UDP-length cap, avoiding EMSGSIZE on large TSO superpackets.
+const maxGSOBytes = 65000
+
+// prepareGSO probes UDP_SEGMENT support and sets u.gsoSupported on success.
+// Best-effort; failure leaves it false.
+func (u *StdConn) prepareGSO() {
+	var probeErr error
+	if err := u.rawConn.Control(func(fd uintptr) {
+		probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_SEGMENT, 0)
+	}); err != nil {
+		u.l.Info("udp: GSO disabled", "reason", "rawconn control failed", "error", err)
+		recordCapability("udp.gso.enabled", false)
+		return
+	}
+	if probeErr != nil {
+		u.l.Info("udp: GSO disabled", "reason", "kernel rejected probe", "error", probeErr)
+		recordCapability("udp.gso.enabled", false)
+		return
+	}
+	u.gsoSupported = true
+	u.l.Info("udp: GSO enabled")
+	recordCapability("udp.gso.enabled", true)
+}
+
+// udpGROBufferSize sizes the per-entry recvmmsg buffer when UDP_GRO is on.
+// The kernel stitches a run of same-flow datagrams into a single skb whose
+// length is bounded by sk_gso_max_size (typically 65535); anything larger
+// would be MSG_TRUNCed. We use the maximum representable UDP length so a
+// full superpacket always lands intact.
+const udpGROBufferSize = 65535
+
+// udpGROCmsgPayload is the size of the UDP_GRO cmsg data delivered by the
+// kernel: a single int (gso_size in bytes). See udp_cmsg_recv() in
+// net/ipv4/udp.c.
+const udpGROCmsgPayload = 4
+
+// prepareGRO turns on UDP_GRO so the kernel coalesces consecutive same-flow
+// datagrams into one recvmmsg entry, with a cmsg carrying the gso_size used
+// to split them back apart on the application side.
+func (u *StdConn) prepareGRO() {
+	var probeErr error
+	if err := u.rawConn.Control(func(fd uintptr) {
+		probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_GRO, 1)
+	}); err != nil {
+		u.l.Info("udp: GRO disabled", "reason", "rawconn control failed", "error", err)
+		recordCapability("udp.gro.enabled", false)
+		return
+	}
+	if probeErr != nil {
+		u.l.Info("udp: GRO disabled", "reason", "kernel rejected probe", "error", probeErr)
+		recordCapability("udp.gro.enabled", false)
+		return
+	}
+	u.groSupported = true
+	u.l.Info("udp: GRO enabled")
+	recordCapability("udp.gro.enabled", true)
+}
+
+// prepareECNRecv turns on IP_RECVTOS / IPV6_RECVTCLASS so the outer IP-ECN
+// field of each arriving datagram is delivered as ancillary data alongside
+// the payload. listenOutBatch reads it via parseRecvCmsg and passes the
+// codepoint through the EncReader for RFC 6040 combine on the decap side.
+// Best-effort: we keep going on failure.
+func (u *StdConn) prepareECNRecv() {
+	var probeErr error
+	if err := u.rawConn.Control(func(fd uintptr) {
+		if u.isV4 {
+			probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_RECVTOS, 1)
+		} else {
+			probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_RECVTCLASS, 1)
+		}
+	}); err != nil {
+		u.l.Info("udp: outer-ECN RX disabled", "reason", "rawconn control failed", "error", err)
+		recordCapability("udp.ecn_rx.enabled", false)
+		return
+	}
+	if probeErr != nil {
+		u.l.Info("udp: outer-ECN RX disabled", "reason", "kernel rejected probe", "error", probeErr)
+		recordCapability("udp.ecn_rx.enabled", false)
+		return
+	}
+	u.ecnRecvSupported = true
+	u.l.Info("udp: outer-ECN RX enabled")
+	recordCapability("udp.ecn_rx.enabled", true)
+}
+
+// recordCapability registers (or updates) a boolean gauge for one of the
+// kernel-feature probes. Gauges go to 1 when the feature is enabled, 0 when
+// it is not — dashboards can show degraded state on partially-supported
+// kernels at a glance. Calling repeatedly with the same name updates the
+// existing gauge rather than registering a duplicate.
+func recordCapability(name string, enabled bool) {
+	g := metrics.GetOrRegisterGauge(name, nil)
+	if enabled {
+		g.Update(1)
+	} else {
+		g.Update(0)
+	}
+}
+
 func (u *StdConn) SupportsMultipleReaders() bool {
 	return true
 }
@@ -183,7 +422,10 @@ func (u *StdConn) listenOutSingle(r EncReader, flush func()) error {
 			return err
 		}
 		from = netip.AddrPortFrom(from.Addr().Unmap(), from.Port())
-		r(from, buffer[:n])
+		// listenOutSingle uses ReadFromUDPAddrPort which discards cmsgs,
+		// so the outer ECN field is not visible on this path. Zero RxMeta
+		// (Not-ECT) means RFC 6040 combine is a no-op.
+		r(from, buffer[:n], RxMeta{})
 		flush()
 	}
 }
@@ -194,7 +436,22 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
 	var operr error

 	bufSize := MTU
-	msgs, buffers, names := u.PrepareRawMessages(u.batch, bufSize)
+	cmsgSpace := 0
+	if u.groSupported {
+		bufSize = udpGROBufferSize
+		cmsgSpace = unix.CmsgSpace(udpGROCmsgPayload)
+	}
+	if u.ecnRecvSupported {
+		// IP_TOS arrives as 1 byte; IPV6_TCLASS arrives as a 4-byte int.
+		// Reserve enough for the wider of the two so the same buffer fits
+		// either family alongside any UDP_GRO cmsg.
+		cmsgSpace += unix.CmsgSpace(4)
+	}
+	msgs, buffers, names, _ := u.PrepareRawMessages(u.batch, bufSize, cmsgSpace)
+
+	if u.rxOrder == nil {
+		u.rxOrder = newRxReorderBuffer(u.batch * 64)
+	}

 	//reader needs to capture variables from this function, since it's used as a lambda with rawConn.Read
 	//defining it outside the loop so it gets re-used
@@ -204,6 +461,11 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
 	}

 	for {
+		if cmsgSpace > 0 {
+			for i := range msgs {
+				setMsgControllen(&msgs[i].Hdr, cmsgSpace)
+			}
+		}
 		err := u.rawConn.Read(reader)
 		if err != nil {
 			return err
@@ -212,6 +474,9 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
 			return operr
 		}

+		// Phase 1: gather every segment from this recvmmsg into rxOrder,
+		// splitting GRO superpackets into their constituent segments.
+		u.rxOrder.reset()
 		for i := 0; i < n; i++ {
 			// Its ok to skip the ok check here, the slicing is the only error that can occur and it will panic
 			if u.isV4 {
@@ -222,14 +487,77 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
 			from := netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(names[i][2:4]))
 			payload := buffers[i][:msgs[i].Len]

-			r(from, payload)
+			segSize := 0
+			outerECN := byte(0)
+			if cmsgSpace > 0 {
+				segSize, outerECN = parseRecvCmsg(&msgs[i].Hdr, u.groSupported, u.ecnRecvSupported, u.isV4)
+			}
+			u.rxOrder.addEntry(from, payload, segSize, outerECN)
 		}
+
+		// Phase 2 + 3: stable-sort by (src, port, counter), then deliver in
+		// order. Reorder distance is bounded by len(u.rxOrder.buf), which
+		// stays well within the receiver's ReplayWindow (currently 8192) so
+		// older arrivals are not rejected as replays.
+		u.rxOrder.sortStable()
+		u.rxOrder.deliver(r)
 		// End-of-batch: let callers (e.g. TUN write coalescer) flush any
 		// state they accumulated across this batch.
 		flush()
 	}
 }

+// headerCounter returns the big-endian uint64 message counter at bytes
+// [8:16] of a nebula packet, or 0 if the buffer is too short.
+func headerCounter(buf []byte) uint64 {
+	if len(buf) < 16 {
+		return 0
+	}
+	return binary.BigEndian.Uint64(buf[8:16])
+}
+
+// parseRecvCmsg walks the per-slot ancillary buffer once and extracts up to
+// two values of interest in a single pass: the UDP_GRO gso_size (when
+// wantGRO is true) and the outer IP-level ECN codepoint stamped on the
+// carrier (when wantECN is true). Returns zeros for whichever field is not
+// requested or not present. isV4 selects between IP_TOS (1-byte) and
+// IPV6_TCLASS (4-byte int) cmsg payloads.
+func parseRecvCmsg(hdr *msghdr, wantGRO, wantECN bool, isV4 bool) (gso int, ecn byte) {
+	controllen := int(hdr.Controllen)
+	if controllen < unix.SizeofCmsghdr || hdr.Control == nil {
+		return 0, 0
+	}
+	ctrl := unsafe.Slice(hdr.Control, controllen)
+	off := 0
+	for off+unix.SizeofCmsghdr <= len(ctrl) {
+		ch := (*unix.Cmsghdr)(unsafe.Pointer(&ctrl[off]))
+		clen := int(ch.Len)
+		if clen < unix.SizeofCmsghdr || off+clen > len(ctrl) {
+			return gso, ecn
+		}
+		dataOff := off + unix.CmsgLen(0)
+		switch {
+		case wantGRO && ch.Level == unix.SOL_UDP && ch.Type == unix.UDP_GRO:
+			if dataOff+udpGROCmsgPayload <= len(ctrl) {
+				gso = int(int32(binary.NativeEndian.Uint32(ctrl[dataOff : dataOff+udpGROCmsgPayload])))
+			}
+		case wantECN && isV4 && ch.Level == unix.IPPROTO_IP && ch.Type == unix.IP_TOS:
+			// IP_TOS arrives as a single byte; only the low 2 bits are ECN.
+			if dataOff+1 <= len(ctrl) {
+				ecn = ctrl[dataOff] & 0x03
+			}
+		case wantECN && !isV4 && ch.Level == unix.IPPROTO_IPV6 && ch.Type == unix.IPV6_TCLASS:
+			// IPV6_TCLASS arrives as a 4-byte int; ECN is the low 2 bits.
+			if dataOff+4 <= len(ctrl) {
+				ecn = byte(binary.NativeEndian.Uint32(ctrl[dataOff:dataOff+4])) & 0x03
+			}
+		}
+		// Advance by the aligned cmsg space.
+		off += unix.CmsgSpace(clen - unix.CmsgLen(0))
+	}
+	return gso, ecn
+}
+
 func (u *StdConn) ListenOut(r EncReader, flush func()) error {
 	if u.batch == 1 {
 		return u.listenOutSingle(r, flush)
@@ -243,19 +571,255 @@ func (u *StdConn) WriteTo(b []byte, ip netip.AddrPort) error {
 	return err
 }

-func (u *StdConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort) error {
+// WriteBatch sends bufs via sendmmsg(2) using the preallocated scratch on
+// StdConn. Consecutive packets to the same destination with matching segment
+// sizes (all but possibly the last) are coalesced into a single mmsghdr entry
+// carrying a UDP_SEGMENT cmsg, so one syscall can mix runs of GSO superpackets
+// with plain one-off datagrams. Without GSO support every packet is its own
+// entry, matching the prior behaviour.
+//
+// Chunks larger than the scratch are processed across multiple syscalls. If
+// sendmmsg returns an error AND zero entries went out we fall back to
+// per-packet WriteTo for that chunk so the caller still gets best-effort
+// delivery; on a partial-success error we just replay the remainder.
+func (u *StdConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, ecns []byte) error {
 	if len(bufs) != len(addrs) {
 		return fmt.Errorf("WriteBatch: len(bufs)=%d != len(addrs)=%d", len(bufs), len(addrs))
 	}
-	//todo use sendmmsg
-	for i := 0; i < len(bufs); i++ {
-		if _, err := u.udpConn.WriteToUDPAddrPort(bufs[i], addrs[i]); err != nil {
-			return err
+	if ecns != nil && len(ecns) != len(bufs) {
+		return fmt.Errorf("WriteBatch: len(ecns)=%d != len(bufs)=%d", len(ecns), len(bufs))
+	}
+
+	// Callers deliver same-destination packets contiguously and in counter
+	// order, so we run the GSO planner directly without a pre-sort. A
+	// sorting pass measurably hurt throughput in microbenchmarks while
+	// providing no observed reordering benefit.
+
+	i := 0
+	for i < len(bufs) {
+		baseI := i
+		entry := 0
+		iovIdx := 0
+		for entry < len(u.writeMsgs) && i < len(bufs) {
+			iovBudget := len(u.writeIovs) - iovIdx
+			if iovBudget < 1 {
+				break
+			}
+			runLen, segSize := u.planRun(bufs, addrs, ecns, i, iovBudget)
+			if runLen == 0 {
+				break
+			}
+
+			for k := 0; k < runLen; k++ {
+				b := bufs[i+k]
+				if len(b) == 0 {
+					u.writeIovs[iovIdx+k].Base = nil
+					setIovLen(&u.writeIovs[iovIdx+k], 0)
+				} else {
+					u.writeIovs[iovIdx+k].Base = &b[0]
+					setIovLen(&u.writeIovs[iovIdx+k], len(b))
+				}
+			}
+
+			nlen, err := writeSockaddr(u.writeNames[entry], addrs[i], u.isV4)
+			if err != nil {
+				return err
+			}
+
+			hdr := &u.writeMsgs[entry].Hdr
+			hdr.Iov = &u.writeIovs[iovIdx]
+			setMsgIovlen(hdr, runLen)
+			hdr.Namelen = uint32(nlen)
+
+			var ecn byte
+			if ecns != nil {
+				ecn = ecns[i]
+			}
+			u.writeEntryCmsg(entry, runLen, segSize, ecn)
+
+			i += runLen
+			iovIdx += runLen
+			u.writeEntryEnd[entry] = i
+			entry++
 		}
+
+		if entry == 0 {
+			return fmt.Errorf("sendmmsg: no progress")
+		}
+
+		sent, serr := u.sendmmsg(entry)
+		if serr != nil && sent <= 0 {
+			// Nothing went out for this chunk; fall back to WriteTo for each
+			// packet that was queued this iteration. We only enter this path
+			// when sendmmsg returned an error AND zero entries succeeded —
+			// otherwise the partial-success advance below replays only the
+			// remainder, avoiding duplicates of already-sent packets.
+			//
+			// sent=-1 from sendmmsg means message 0 itself failed (partial
+			// success returns the count instead), so log entry 0's parameters
+			// — that's the entry the kernel rejected.
+			hdr0 := &u.writeMsgs[0].Hdr
+			runLen0 := u.writeEntryEnd[0] - baseI
+			seg0 := len(bufs[baseI])
+			ecn0 := byte(0)
+			if ecns != nil {
+				ecn0 = ecns[baseI]
+			}
+			u.l.Warn("sendmmsg had problem",
+				"sent", sent, "err", serr,
+				"entries", entry,
+				"entry0_runLen", runLen0,
+				"entry0_segSize", seg0,
+				"entry0_iovlen", hdr0.Iovlen,
+				"entry0_controllen", hdr0.Controllen,
+				"entry0_namelen", hdr0.Namelen,
+				"entry0_ecn", ecn0,
+				"entry0_dst", addrs[baseI],
+				"isV4", u.isV4,
+				"gso", u.gsoSupported,
+				"gro", u.groSupported,
+			)
+			for k := baseI; k < i; k++ {
+				if werr := u.WriteTo(bufs[k], addrs[k]); werr != nil {
+					return werr
+				}
+			}
+			continue
+		}
+		if sent == 0 {
+			return fmt.Errorf("sendmmsg made no progress")
+		}
+		// Rewind i to the end of the last successfully sent entry. For a
+		// full-success send this leaves i unchanged; for a partial send it
+		// replays the remainder on the next outer-loop iteration.
+		i = u.writeEntryEnd[sent-1]
 	}
 	return nil
 }

+// planRun groups consecutive packets starting at `start` that can be sent as
+// a single UDP GSO superpacket (one sendmmsg entry with UDP_SEGMENT cmsg).
+// A run of length 1 means the entry carries no UDP_SEGMENT cmsg and the
+// kernel treats it as a plain datagram. Returns the run length and the
+// per-segment size (which equals len(bufs[start])). Without GSO support
+// every call returns runLen=1. Outer ECN (when ecns != nil) is also a run
+// boundary — the kernel stamps one outer codepoint per sendmsg entry, so
+// mixing values inside a run would lose information.
+func (u *StdConn) planRun(bufs [][]byte, addrs []netip.AddrPort, ecns []byte, start, iovBudget int) (int, int) {
+	if start >= len(bufs) || iovBudget < 1 {
+		return 0, 0
+	}
+	segSize := len(bufs[start])
+	if !u.gsoSupported || segSize == 0 || segSize > maxGSOBytes {
+		return 1, segSize
+	}
+	dst := addrs[start]
+	var ecn byte
+	if ecns != nil {
+		ecn = ecns[start]
+	}
+	maxLen := maxGSOSegments
+	if iovBudget < maxLen {
+		maxLen = iovBudget
+	}
+	runLen := 1
+	total := segSize
+	for runLen < maxLen && start+runLen < len(bufs) {
+		nextLen := len(bufs[start+runLen])
+		if nextLen == 0 || nextLen > segSize {
+			break
+		}
+		if addrs[start+runLen] != dst {
+			break
+		}
+		if ecns != nil && ecns[start+runLen] != ecn {
+			break
+		}
+		if total+nextLen > maxGSOBytes {
+			break
+		}
+		total += nextLen
+		runLen++
+		if nextLen < segSize {
+			// A short packet must be the last in the run.
+			break
+		}
+	}
+	return runLen, segSize
+}
+
+// writeEntryCmsg sets up the per-mmsghdr Hdr.Control / Hdr.Controllen for one
+// entry. It writes the UDP_SEGMENT payload when runLen >= 2 and the
+// IP_TOS/IPV6_TCLASS payload when ecn != 0, then points hdr.Control at the
+// smallest contiguous span that covers whichever cmsg(s) actually apply.
+func (u *StdConn) writeEntryCmsg(entry, runLen, segSize int, ecn byte) {
+	hdr := &u.writeMsgs[entry].Hdr
+	useSeg := runLen >= 2
+	useEcn := ecn != 0
+	base := entry * u.writeCmsgSpace
+
+	if useSeg {
+		dataOff := base + unix.CmsgLen(0)
+		binary.NativeEndian.PutUint16(u.writeCmsg[dataOff:dataOff+2], uint16(segSize))
+	}
+	if useEcn {
+		dataOff := base + u.writeCmsgSegSpace + unix.CmsgLen(0)
+		binary.NativeEndian.PutUint32(u.writeCmsg[dataOff:dataOff+4], uint32(ecn))
+	}
+
+	switch {
+	case useSeg && useEcn:
+		hdr.Control = &u.writeCmsg[base]
+		setMsgControllen(hdr, u.writeCmsgSpace)
+	case useSeg:
+		hdr.Control = &u.writeCmsg[base]
+		setMsgControllen(hdr, u.writeCmsgSegSpace)
+	case useEcn:
+		hdr.Control = &u.writeCmsg[base+u.writeCmsgSegSpace]
+		setMsgControllen(hdr, u.writeCmsgEcnSpace)
+	default:
+		hdr.Control = nil
+		setMsgControllen(hdr, 0)
+	}
+}
+
+// sendmmsg issues sendmmsg(2) over u.rawConn against the first n entries
+// of u.writeMsgs. Routes through u.rawSend so the per-call kernel callback
+// stays alloc-free.
+func (u *StdConn) sendmmsg(n int) (int, error) {
+	return u.rawSend.send(u.rawConn, n)
+}
+
+// writeSockaddr encodes addr into buf (which must be at least
+// SizeofSockaddrInet6 bytes). Returns the number of bytes used. If isV4 is
+// true and addr is not a v4 (or v4-in-v6) address, returns an error.
+func writeSockaddr(buf []byte, addr netip.AddrPort, isV4 bool) (int, error) {
+	ap := addr.Addr().Unmap()
+	if isV4 {
+		if !ap.Is4() {
+			return 0, ErrInvalidIPv6RemoteForSocket
+		}
+		// struct sockaddr_in: { sa_family_t(2), in_port_t(2, BE), in_addr(4), zero(8) }
+		// sa_family is host endian.
+		binary.NativeEndian.PutUint16(buf[0:2], unix.AF_INET)
+		binary.BigEndian.PutUint16(buf[2:4], addr.Port())
+		ip4 := ap.As4()
+		copy(buf[4:8], ip4[:])
+		for j := 8; j < 16; j++ {
+			buf[j] = 0
+		}
+		return unix.SizeofSockaddrInet4, nil
+	}
+	// struct sockaddr_in6: { sa_family_t(2), in_port_t(2, BE), flowinfo(4), in6_addr(16), scope_id(4) }
+	binary.NativeEndian.PutUint16(buf[0:2], unix.AF_INET6)
+	binary.BigEndian.PutUint16(buf[2:4], addr.Port())
+	binary.NativeEndian.PutUint32(buf[4:8], 0)
+	ip6 := addr.Addr().As16()
+	copy(buf[8:24], ip6[:])
+	binary.NativeEndian.PutUint32(buf[24:28], 0)
+	return unix.SizeofSockaddrInet6, nil
+}
+
 func (u *StdConn) ReloadConfig(c *config.C) {
 	b := c.GetInt("listen.read_buffer", 0)
 	if b > 0 {
--- a/udp/udp_linux_32.go
+++ b/udp/udp_linux_32.go
@@ -30,11 +30,16 @@ type rawMessage struct {
 	Len uint32
 }

-func (u *StdConn) PrepareRawMessages(n, bufSize int) ([]rawMessage, [][]byte, [][]byte) {
+func (u *StdConn) PrepareRawMessages(n, bufSize, cmsgSpace int) ([]rawMessage, [][]byte, [][]byte, []byte) {
 	msgs := make([]rawMessage, n)
 	buffers := make([][]byte, n)
 	names := make([][]byte, n)

+	var cmsgs []byte
+	if cmsgSpace > 0 {
+		cmsgs = make([]byte, n*cmsgSpace)
+	}
+
 	for i := range msgs {
 		buffers[i] = make([]byte, bufSize)
 		names[i] = make([]byte, unix.SizeofSockaddrInet6)
@@ -48,9 +53,14 @@ func (u *StdConn) PrepareRawMessages(n, bufSize int) ([]rawMessage, [][]byte, []

 		msgs[i].Hdr.Name = &names[i][0]
 		msgs[i].Hdr.Namelen = uint32(len(names[i]))
+
+		if cmsgSpace > 0 {
+			msgs[i].Hdr.Control = &cmsgs[i*cmsgSpace]
+			msgs[i].Hdr.Controllen = uint32(cmsgSpace)
+		}
 	}

-	return msgs, buffers, names
+	return msgs, buffers, names, cmsgs
 }

 func setIovLen(v *iovec, n int) {
--- a/udp/udp_linux_64.go
+++ b/udp/udp_linux_64.go
@@ -33,11 +33,16 @@ type rawMessage struct {
 	Pad0 [4]byte
 }

-func (u *StdConn) PrepareRawMessages(n, bufSize int) ([]rawMessage, [][]byte, [][]byte) {
+func (u *StdConn) PrepareRawMessages(n, bufSize, cmsgSpace int) ([]rawMessage, [][]byte, [][]byte, []byte) {
 	msgs := make([]rawMessage, n)
 	buffers := make([][]byte, n)
 	names := make([][]byte, n)

+	var cmsgs []byte
+	if cmsgSpace > 0 {
+		cmsgs = make([]byte, n*cmsgSpace)
+	}
+
 	for i := range msgs {
 		buffers[i] = make([]byte, bufSize)
 		names[i] = make([]byte, unix.SizeofSockaddrInet6)
@@ -51,9 +56,14 @@ func (u *StdConn) PrepareRawMessages(n, bufSize int) ([]rawMessage, [][]byte, []

 		msgs[i].Hdr.Name = &names[i][0]
 		msgs[i].Hdr.Namelen = uint32(len(names[i]))
+
+		if cmsgSpace > 0 {
+			msgs[i].Hdr.Control = &cmsgs[i*cmsgSpace]
+			msgs[i].Hdr.Controllen = uint64(cmsgSpace)
+		}
 	}

-	return msgs, buffers, names
+	return msgs, buffers, names, cmsgs
 }

 func setIovLen(v *iovec, n int) {
--- a/udp/udp_rio_windows.go
+++ b/udp/udp_rio_windows.go
@@ -161,7 +161,7 @@ func (u *RIOConn) ListenOut(r EncReader, flush func()) error {
 			continue
 		}

-		r(netip.AddrPortFrom(netip.AddrFrom16(rua.Addr).Unmap(), (rua.Port>>8)|((rua.Port&0xff)<<8)), buffer[:n])
+		r(netip.AddrPortFrom(netip.AddrFrom16(rua.Addr).Unmap(), (rua.Port>>8)|((rua.Port&0xff)<<8)), buffer[:n], RxMeta{})
 		flush()
 	}
 }
@@ -317,7 +317,7 @@ func (u *RIOConn) WriteTo(buf []byte, ip netip.AddrPort) error {
 	return winrio.SendEx(u.rq, dataBuffer, 1, nil, addressBuffer, nil, nil, 0, 0)
 }

-func (u *RIOConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort) error {
+func (u *RIOConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, _ []byte) error {
 	for i, b := range bufs {
 		if err := u.WriteTo(b, addrs[i]); err != nil {
 			return err
--- a/udp/udp_tester.go
+++ b/udp/udp_tester.go
@@ -157,7 +157,7 @@ func (u *TesterConn) WriteTo(b []byte, addr netip.AddrPort) error {
 		return nil
 	}
 }
-func (u *TesterConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort) error {
+func (u *TesterConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, _ []byte) error {
 	for i, b := range bufs {
 		if err := u.WriteTo(b, addrs[i]); err != nil {
 			return err
@@ -172,7 +172,7 @@ func (u *TesterConn) ListenOut(r EncReader, flush func()) error {
 		case <-u.done:
 			return os.ErrClosed
 		case p := <-u.RxPackets:
-			r(p.From, p.Data)
+			r(p.From, p.Data, RxMeta{})
 			p.Release()
 			flush()
 		}