diff --git a/inside.go b/inside.go
index 0fa841da..223a1c02 100644
--- a/inside.go
+++ b/inside.go
@@ -15,14 +15,7 @@ import (
 	"github.com/slackhq/nebula/routing"
 )
 
-func (f *Interface) consumeInsidePacket(pkt tio.Packet, fwPacket *firewall.Packet, nb []byte, sendBatch batch.TxBatcher, rejectBuf []byte, q int, localCache firewall.ConntrackCache) {
-	// borrowed: pkt.Bytes is owned by the originating tio.Queue and is
-	// only valid until the next Read on that queue. Every consumer below
-	// (parse, self-forward, handshake cache, sendInsideMessage) reads it
-	// synchronously; do not retain pkt outside this call. If a future
-	// caller needs to keep the packet, use pkt.Clone() to detach it from
-	// the borrow.
-	//
+func (f *Interface) consumeInsidePacket(pkt wire.Packet, fwPacket *firewall.Packet, nb []byte, sendBatch batch.TxBatcher, rejectBuf []byte, q int, localCache firewall.ConntrackCache) {
 	// pkt.Bytes is either one IP datagram (GSO zero) or a TSO/USO
 	// superpacket. In both cases the L3+L4 headers at the start describe
 	// the same 5-tuple every segment will share, so a single newPacket /
@@ -52,10 +45,6 @@ func (f *Interface) consumeInsidePacket(pkt tio.Packet, fwPacket *firewall.Packe
 		// routes packets from the Nebula addr to the Nebula addr through the Nebula
 		// TUN device.
 		if immediatelyForwardToSelf {
-			// Write copies into the kernel queue synchronously, so seg's lifetime ends at return.
-			// A self-forwarded superpacket would be re-handed to the
-			// kernel as one giant blob; segment first so the loopback
-			// path sees one IP datagram per Write.
 			err := tio.SegmentSuperpacket(pkt, func(seg []byte) error {
 				_, werr := f.readers[q].Write(seg)
 				return werr
@@ -107,7 +96,7 @@ func (f *Interface) consumeInsidePacket(pkt tio.Packet, fwPacket *firewall.Packe
 
 	dropReason := f.firewall.Drop(*fwPacket, false, hostinfo, f.pki.GetCAPool(), localCache)
 	if dropReason == nil {
-		f.sendInsideMessage(hostinfo, pkt, nb, sendBatch, rejectBuf, q)
+		f.sendInsideMessage(hostinfo, pkt, nb, sendBatch)
 	} else {
 		f.rejectInside(packet, rejectBuf, q)
 		if f.l.Enabled(context.Background(), slog.LevelDebug) {
@@ -521,6 +510,10 @@ func (f *Interface) SendVia(via *HostInfo,
 	nocopy bool,
 ) {
 	toSend, err := f.prepareSendVia(via, relay, ad, nb, out, nocopy)
+	if err != nil {
+		via.logger(f.l).Info("Failed to prepareSendVia", "error", err)
+		return
+	}
 	err = f.writers[0].WriteTo(toSend, via.remote)
 	if err != nil {
 		via.logger(f.l).Info("Failed to WriteTo in sendVia", "error", err)
diff --git a/interface.go b/interface.go
index 303771d6..0402e68b 100644
--- a/interface.go
+++ b/interface.go
@@ -407,7 +407,7 @@ func (f *Interface) listenIn(reader tio.Queue, q int) {
 			f.consumeInsidePacket(packets[i], fwPacket, nb, sb, rejectBuf, q, ctCache)
 		}
 		if err := sb.Flush(); err != nil {
-			f.l.Error("Failed to write outgoing batch", "error", err, "writer", i)
+			f.l.Error("Failed to write outgoing batch", "error", err, "writer", q)
 		}
 	}
 
diff --git a/overlay/batch/tx_batch.go b/overlay/batch/tx_batch.go
index 38f86b25..599bc306 100644
--- a/overlay/batch/tx_batch.go
+++ b/overlay/batch/tx_batch.go
@@ -1,9 +1,18 @@
 package batch
 
-import "net/netip"
+import (
+	"net/netip"
+
+	"github.com/slackhq/nebula/udp"
+)
 
 const SendBatchCap = 128
 
+// DefaultSendBatchArenaCap is the recommended arena capacity for a
+// standalone SendBatch: 128 slots × (udp.MTU + 32) ≈ 1.1 MiB. The +32 covers
+// the nebula header + AEAD tag tacked onto each plaintext segment.
+const DefaultSendBatchArenaCap = SendBatchCap * (udp.MTU + 32)
+
 // batchWriter is the minimal subset of udp.Conn needed by SendBatch to flush.
 type batchWriter interface {
 	WriteBatch(bufs [][]byte, addrs []netip.AddrPort, outerECNs []byte) error
@@ -11,38 +20,29 @@ type batchWriter interface {
 
 // SendBatch accumulates encrypted UDP packets and flushes them via WriteBatch.
 // One SendBatch is owned by each listenIn goroutine; no locking is needed.
-// The backing arena grows on demand: when there isn't room for the next slot
-// we allocate a fresh backing array. Already-committed slices keep referencing
-// the old array and remain valid until Flush drops them.
+// Slot bytes are borrowed from the injected Arena and remain valid until
+// Flush, which Resets the arena.
 type SendBatch struct {
-	out     batchWriter
-	bufs    [][]byte
-	dsts    []netip.AddrPort
-	ecns    []byte
-	backing []byte
+	out   batchWriter
+	bufs  [][]byte
+	dsts  []netip.AddrPort
+	ecns  []byte
+	arena *Arena
 }
 
-// NewSendBatch makes a SendBatch with batchCap slots and an arenaSize byte buffer for slices to back those slots
-func NewSendBatch(out batchWriter, batchCap, arenaSize int) *SendBatch {
+// NewSendBatch makes a SendBatch with batchCap slots backed by arena.
+func NewSendBatch(out batchWriter, batchCap int, arena *Arena) *SendBatch {
 	return &SendBatch{
-		out:     out,
-		bufs:    make([][]byte, 0, batchCap),
-		dsts:    make([]netip.AddrPort, 0, batchCap),
-		ecns:    make([]byte, 0, batchCap),
-		backing: make([]byte, 0, arenaSize),
+		out:   out,
+		bufs:  make([][]byte, 0, batchCap),
+		dsts:  make([]netip.AddrPort, 0, batchCap),
+		ecns:  make([]byte, 0, batchCap),
+		arena: arena,
 	}
 }
 
 func (b *SendBatch) Reserve(sz int) []byte {
-	if len(b.backing)+sz > cap(b.backing) {
-		// Grow: allocate a fresh backing. Already-committed slices still
-		// reference the old array and remain valid until Flush drops them.
-		newCap := max(cap(b.backing)*2, sz)
-		b.backing = make([]byte, 0, newCap)
-	}
-	start := len(b.backing)
-	b.backing = b.backing[:start+sz]
-	return b.backing[start : start+sz : start+sz]
+	return b.arena.Reserve(sz)
 }
 
 func (b *SendBatch) Commit(pkt []byte, dst netip.AddrPort, outerECN byte) {
@@ -60,6 +60,6 @@ func (b *SendBatch) Flush() error {
 	b.bufs = b.bufs[:0]
 	b.dsts = b.dsts[:0]
 	b.ecns = b.ecns[:0]
-	b.backing = b.backing[:0]
+	b.arena.Reset()
 	return err
 }
diff --git a/overlay/batch/tx_batch_test.go b/overlay/batch/tx_batch_test.go
index 454011dc..59b06e58 100644
--- a/overlay/batch/tx_batch_test.go
+++ b/overlay/batch/tx_batch_test.go
@@ -27,7 +27,7 @@ func (w *fakeBatchWriter) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, ecns
 
 func TestSendBatchReserveCommitFlush(t *testing.T) {
 	fw := &fakeBatchWriter{}
-	b := NewSendBatch(fw, 4, 32)
+	b := NewSendBatch(fw, 4, NewArena(32))
 
 	ap := netip.MustParseAddrPort("10.0.0.1:4242")
 	for i := 0; i < 4; i++ {
@@ -71,7 +71,7 @@ func TestSendBatchReserveCommitFlush(t *testing.T) {
 
 func TestSendBatchSlotsDoNotOverlap(t *testing.T) {
 	fw := &fakeBatchWriter{}
-	b := NewSendBatch(fw, 3, 8)
+	b := NewSendBatch(fw, 3, NewArena(8))
 	ap := netip.MustParseAddrPort("10.0.0.1:80")
 
 	for i := 0; i < 3; i++ {
@@ -93,7 +93,7 @@ func TestSendBatchSlotsDoNotOverlap(t *testing.T) {
 func TestSendBatchGrowPreservesCommitted(t *testing.T) {
 	fw := &fakeBatchWriter{}
 	// Tiny initial backing forces a grow on the second Reserve.
-	b := NewSendBatch(fw, 1, 4)
+	b := NewSendBatch(fw, 1, NewArena(4))
 	ap := netip.MustParseAddrPort("10.0.0.1:80")
 
 	s1 := b.Reserve(4)
diff --git a/overlay/device.go b/overlay/device.go
index cff3ac7d..8044ee75 100644
--- a/overlay/device.go
+++ b/overlay/device.go
@@ -8,6 +8,10 @@ import (
 	"github.com/slackhq/nebula/routing"
 )
 
+// defaultBatchBufSize is the per-Queue scratch size for Read on backends
+// that don't do TSO segmentation. 65535 covers any single IP packet.
+const defaultBatchBufSize = 65535
+
 type Device interface {
 	io.Closer
 	Activate() error
diff --git a/overlay/tio/segment.go b/overlay/tio/segment.go
new file mode 100644
index 00000000..67648ad2
--- /dev/null
+++ b/overlay/tio/segment.go
@@ -0,0 +1,12 @@
+package tio
+
+import "fmt"
+
+// SegmentSuperpacket invokes fn once per segment of pkt.
+// This is a stub implementation that does not actually support segmentation
+func SegmentSuperpacket(pkt Packet, fn func(seg []byte) error) error {
+	if pkt.GSO.IsSuperpacket() {
+		return fmt.Errorf("tio: GSO superpacket on platform without segmentation support")
+	}
+	return fn(pkt.Bytes)
+}
diff --git a/udp/udp_linux.go b/udp/udp_linux.go
index a1b43c5b..6465be32 100644
--- a/udp/udp_linux.go
+++ b/udp/udp_linux.go
@@ -6,13 +6,10 @@ package udp
 import (
 	"context"
 	"encoding/binary"
-	"errors"
 	"fmt"
 	"log/slog"
 	"net"
 	"net/netip"
-	"strconv"
-	"strings"
 	"syscall"
 	"unsafe"
 
@@ -35,44 +32,14 @@ type StdConn struct {
 	writeIovs  []iovec
 	writeNames [][]byte
 
-	// Per-entry cmsg scratch. writeCmsg is one contiguous slab of
-	// MaxWriteBatch * writeCmsgSpace bytes; each entry holds two cmsg
-	// headers (UDP_SEGMENT then IP_TOS / IPV6_TCLASS) pre-filled once in
-	// prepareWriteMessages. WriteBatch only rewrites the per-call data
-	// payloads and toggles Hdr.Control / Hdr.Controllen to point at
-	// whichever subset of the two cmsgs applies.
-	writeCmsg         []byte
-	writeCmsgSpace    int
-	writeCmsgSegSpace int
-	writeCmsgEcnSpace int
-
-	// writeEntryEnd[e] is the bufs index *after* the last packet packed
-	// into mmsghdr entry e. Used to rewind `i` on partial sendmmsg success.
-	writeEntryEnd []int
-
-	// rawSend wraps the sendmmsg(2) callback in a closure-free helper so
-	// the hot path doesn't heap-allocate a fresh closure per call.
-	rawSend rawSendmmsg
-
-	// UDP GSO (sendmsg with UDP_SEGMENT cmsg) support. gsoSupported is
-	// probed once at socket creation. When true, WriteBatch packs same-
-	// destination consecutive packets into a single sendmmsg entry with a
-	// UDP_SEGMENT cmsg; otherwise each packet is its own entry.
-	gsoSupported   bool
-	maxGSOSegments int
-
-	// UDP GRO (recvmsg with UDP_GRO cmsg) support. groSupported is probed
-	// once at socket creation. When true, listenOutBatch allocates larger
-	// RX buffers and a per-entry cmsg slot so the kernel can coalesce
-	// consecutive same-flow datagrams into a single recvmmsg entry; the
-	// delivered cmsg carries the gso_size used to split them back apart.
-	groSupported bool
-
-	// ecnRecvSupported is true when IP_RECVTOS / IPV6_RECVTCLASS was
-	// successfully enabled — the kernel will deliver the outer IP-ECN of
-	// each arriving datagram as a per-slot cmsg, and listenOutBatch passes
-	// the parsed value to the EncReader callback for RFC 6040 combine.
-	ecnRecvSupported bool
+	// sendmmsg(2) callback state. sendmmsgCB is bound once in NewListener
+	// to the sendmmsgRun method value so passing it to rawConn.Write does
+	// not allocate a fresh closure per send; sendmmsgN/Sent/Errno carry
+	// the inputs and outputs across the call without escaping locals.
+	sendmmsgCB    func(fd uintptr) bool
+	sendmmsgN     int
+	sendmmsgSent  int
+	sendmmsgErrno syscall.Errno
 }
 
 func setReusePort(network, address string, c syscall.RawConn) error {
@@ -106,11 +73,10 @@ func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int)
 	}
 	//gotta find out if we got an AF_INET6 socket or not:
 	out := &StdConn{
-		udpConn:        udpConn,
-		rawConn:        rawConn,
-		l:              l,
-		batch:          batch,
-		maxGSOSegments: 1,
+		udpConn: udpConn,
+		rawConn: rawConn,
+		l:       l,
+		batch:   batch,
 	}
 
 	af, err := out.getSockOptInt(unix.SO_DOMAIN)
@@ -121,71 +87,15 @@ func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int)
 	out.isV4 = af == unix.AF_INET
 
 	out.prepareWriteMessages(MaxWriteBatch)
-	out.rawSend.msgs = out.writeMsgs
-	out.rawSend.bind()
-
-	out.prepareGSO()
-	// GRO delivers coalesced superpackets that need a cmsg to split back
-	// into segments. The single-packet RX path uses ReadFromUDPAddrPort
-	// and cannot see that cmsg, so only enable GRO for the batch path.
-	if batch > 1 {
-		out.prepareGRO()
-	}
-	// Best-effort: ask the kernel to deliver outer IP-ECN as ancillary data
-	// on every recvmmsg slot so the decap side can apply RFC 6040 combine.
-	// On older kernels these may not exist; failing here just means we get
-	// 0 (Not-ECT) on every slot, which is the same as ecn_mode=disable.
-	out.prepareECNRecv()
+	out.sendmmsgCB = out.sendmmsgRun
 
 	return out, nil
 }
 
-// prepareWriteMessages allocates one mmsghdr/iovec/sockaddr/cmsg scratch
-// slot per sendmmsg entry. The iovec slab is sized to n so all entries'
-// iovecs share one allocation; per-entry fan-out is further capped at
-// maxGSOSegments. Hdr.Iov / Hdr.Iovlen / Hdr.Control / Hdr.Controllen are
-// wired per call since each entry can span a variable number of iovecs
-// and may or may not carry a cmsg.
-//
-// Per-mmsghdr cmsg layout. Each entry's slot of length writeCmsgSpace holds
-// up to two cmsg headers placed at fixed offsets:
-//
-//	[0 .. writeCmsgSegSpace)              UDP_SEGMENT (gso_size, uint16)
-//	[writeCmsgSegSpace .. writeCmsgSpace) IP_TOS or IPV6_TCLASS (int32)
-//
-// Both headers are pre-filled once here; per-call we only rewrite the data
-// payload and toggle Hdr.Control / Hdr.Controllen to point at whichever
-// subset applies (none / segment-only / ecn-only / both).
 func (u *StdConn) prepareWriteMessages(n int) {
 	u.writeMsgs = make([]rawMessage, n)
 	u.writeIovs = make([]iovec, n)
 	u.writeNames = make([][]byte, n)
-	u.writeEntryEnd = make([]int, n)
-
-	u.writeCmsgSegSpace = unix.CmsgSpace(2)
-	u.writeCmsgEcnSpace = unix.CmsgSpace(4)
-	u.writeCmsgSpace = u.writeCmsgSegSpace + u.writeCmsgEcnSpace
-	u.writeCmsg = make([]byte, n*u.writeCmsgSpace)
-
-	ecnLevel := int32(unix.IPPROTO_IP)
-	ecnType := int32(unix.IP_TOS)
-	if !u.isV4 {
-		ecnLevel = unix.IPPROTO_IPV6
-		ecnType = unix.IPV6_TCLASS
-	}
-
-	for k := 0; k < n; k++ {
-		base := k * u.writeCmsgSpace
-		seg := (*unix.Cmsghdr)(unsafe.Pointer(&u.writeCmsg[base]))
-		seg.Level = unix.SOL_UDP
-		seg.Type = unix.UDP_SEGMENT
-		setCmsgLen(seg, unix.CmsgLen(2))
-
-		ecn := (*unix.Cmsghdr)(unsafe.Pointer(&u.writeCmsg[base+u.writeCmsgSegSpace]))
-		ecn.Level = ecnLevel
-		ecn.Type = ecnType
-		setCmsgLen(ecn, unix.CmsgLen(4))
-	}
 
 	for i := range u.writeMsgs {
 		u.writeNames[i] = make([]byte, unix.SizeofSockaddrInet6)
@@ -193,139 +103,6 @@ func (u *StdConn) prepareWriteMessages(n int) {
 	}
 }
 
-// maxGSOBytes bounds the total payload per sendmsg() when UDP_SEGMENT is
-// set. The kernel stitches all iovecs into a single skb whose length the
-// UDP length field can represent, and also enforces sk_gso_max_size (which
-// on most devices is 65536). We use 65000 to leave headroom under the
-// 65535 UDP-length cap, avoiding EMSGSIZE on large TSO superpackets.
-const maxGSOBytes = 65000
-
-// prepareGSO probes UDP_SEGMENT support and sets u.gsoSupported on success.
-// Best-effort; failure leaves it false.
-func (u *StdConn) prepareGSO() {
-	u.maxGSOSegments = 63 //gotta be one less than the max so we can still attach a header
-
-	var probeErr error
-	if err := u.rawConn.Control(func(fd uintptr) {
-		probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_SEGMENT, 0)
-	}); err != nil {
-		u.l.Info("udp: GSO disabled", "reason", "rawconn control failed", "error", err)
-		recordCapability("udp.gso.enabled", false)
-		return
-	}
-	if probeErr != nil {
-		u.l.Info("udp: GSO disabled", "reason", "kernel rejected probe", "error", probeErr)
-		recordCapability("udp.gso.enabled", false)
-		return
-	}
-
-	var un unix.Utsname
-	if err := unix.Uname(&un); err != nil {
-		u.l.Info("udp: GSO disabled", "reason", "kernel uname probe failed", "error", err)
-		recordCapability("udp.gso.enabled", false)
-		return
-	}
-	major, minor := parseRelease(string(un.Release[:]))
-	if major > 5 || (major == 5 && minor >= 5) {
-		u.maxGSOSegments = 127
-	}
-
-	u.gsoSupported = true
-	u.l.Info("udp: GSO enabled", "maxGSOSegments", u.maxGSOSegments)
-	recordCapability("udp.gso.enabled", true)
-}
-
-// udpGROBufferSize sizes the per-entry recvmmsg buffer when UDP_GRO is on.
-// The kernel stitches a run of same-flow datagrams into a single skb whose
-// length is bounded by sk_gso_max_size (typically 65535); anything larger
-// would be MSG_TRUNCed. We use the maximum representable UDP length so a
-// full superpacket always lands intact.
-const udpGROBufferSize = 65535
-
-// udpGROCmsgPayload is the size of the UDP_GRO cmsg data delivered by the
-// kernel: a single int (gso_size in bytes). See udp_cmsg_recv() in
-// net/ipv4/udp.c.
-const udpGROCmsgPayload = 4
-
-// prepareGRO turns on UDP_GRO so the kernel coalesces consecutive same-flow
-// datagrams into one recvmmsg entry, with a cmsg carrying the gso_size used
-// to split them back apart on the application side.
-func (u *StdConn) prepareGRO() {
-	var probeErr error
-	if err := u.rawConn.Control(func(fd uintptr) {
-		probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_GRO, 1)
-	}); err != nil {
-		u.l.Info("udp: GRO disabled", "reason", "rawconn control failed", "error", err)
-		recordCapability("udp.gro.enabled", false)
-		return
-	}
-	if probeErr != nil {
-		u.l.Info("udp: GRO disabled", "reason", "kernel rejected probe", "error", probeErr)
-		recordCapability("udp.gro.enabled", false)
-		return
-	}
-	u.groSupported = true
-	u.l.Info("udp: GRO enabled")
-	recordCapability("udp.gro.enabled", true)
-}
-
-// prepareECNRecv turns on IP_RECVTOS / IPV6_RECVTCLASS so the outer IP-ECN
-// field of each arriving datagram is delivered as ancillary data alongside
-// the payload. listenOutBatch reads it via parseRecvCmsg and passes the
-// codepoint through the EncReader for RFC 6040 combine on the decap side.
-// Best-effort: we keep going on failure.
-func (u *StdConn) prepareECNRecv() {
-	var v4err, v6err error
-	if err := u.rawConn.Control(func(fd uintptr) {
-		v4err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_RECVTOS, 1)
-		if !u.isV4 {
-			v6err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_RECVTCLASS, 1)
-		}
-	}); err != nil {
-		u.l.Info("udp: outer-ECN RX disabled", "reason", "rawconn control failed", "error", err)
-		recordCapability("udp.ecn_rx.enabled", false)
-		return
-	}
-	if u.isV4 { //only check the V4 attempt
-		if v4err != nil {
-			u.l.Info("udp: outer-ECN RX disabled", "reason", "kernel rejected probe", "error", v4err)
-			recordCapability("udp.ecn_rx.enabled", false)
-		} else {
-			u.ecnRecvSupported = true
-			u.l.Info("udp: outer-ECN RX enabled")
-			recordCapability("udp.ecn_rx.enabled", true)
-		}
-		return
-	} else {
-		if v6err != nil { //no V6 ECN? disable it.
-			u.l.Info("udp: outer-ECN RX disabled", "reason", "kernel rejected probe", "error", errors.Join(v4err, v6err))
-			recordCapability("udp.ecn_rx.enabled", false)
-			return
-		} else if v4err != nil { //no V4, but yes V6? Low level warning. Could be a V6-specific bind.
-			u.l.Debug("udp: outer-ECN RX degraded", "reason", "kernel rejected probe on IPv4", "error", v4err)
-		}
-		// all good
-		u.ecnRecvSupported = true
-		u.l.Info("udp: outer-ECN RX enabled")
-		recordCapability("udp.ecn_rx.enabled", true)
-		return
-	}
-}
-
-// recordCapability registers (or updates) a boolean gauge for one of the
-// kernel-feature probes. Gauges go to 1 when the feature is enabled, 0 when
-// it is not — dashboards can show degraded state on partially-supported
-// kernels at a glance. Calling repeatedly with the same name updates the
-// existing gauge rather than registering a duplicate.
-func recordCapability(name string, enabled bool) {
-	g := metrics.GetOrRegisterGauge(name, nil)
-	if enabled {
-		g.Update(1)
-	} else {
-		g.Update(0)
-	}
-}
-
 func (u *StdConn) SupportsMultipleReaders() bool {
 	return true
 }
@@ -444,15 +221,16 @@ func (u *StdConn) listenOutSingle(r EncReader, flush func()) error {
 	}
 }
 
-func getFrom(names [][]byte, i int, isV4 bool) netip.AddrPort {
+// readSockaddr decodes the source address out of a recvmmsg name buffer
+func (u *StdConn) readSockaddr(name []byte) netip.AddrPort {
 	var ip netip.Addr
-	// Its ok to skip the ok check here, the slicing is the only error that can occur and it will panic
-	if isV4 {
-		ip, _ = netip.AddrFromSlice(names[i][4:8])
+	// It's ok to skip the ok check here, the slicing is the only error that can occur and it will panic
+	if u.isV4 {
+		ip, _ = netip.AddrFromSlice(name[4:8])
 	} else {
-		ip, _ = netip.AddrFromSlice(names[i][8:24])
+		ip, _ = netip.AddrFromSlice(name[8:24])
 	}
-	return netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(names[i][2:4]))
+	return netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(name[2:4]))
 }
 
 func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
@@ -461,16 +239,6 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
 
 	bufSize := MTU
 	cmsgSpace := 0
-	if u.groSupported {
-		bufSize = udpGROBufferSize
-		cmsgSpace = unix.CmsgSpace(udpGROCmsgPayload)
-	}
-	if u.ecnRecvSupported {
-		// IP_TOS arrives as 1 byte; IPV6_TCLASS arrives as a 4-byte int.
-		// Reserve enough for the wider of the two so the same buffer fits
-		// either family alongside any UDP_GRO cmsg.
-		cmsgSpace += unix.CmsgSpace(4)
-	}
 	msgs, buffers, names, _ := u.PrepareRawMessages(u.batch, bufSize, cmsgSpace)
 
 	//reader needs to capture variables from this function, since it's used as a lambda with rawConn.Read
@@ -481,11 +249,6 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
 	}
 
 	for {
-		if cmsgSpace > 0 {
-			for i := range msgs {
-				setMsgControllen(&msgs[i].Hdr, cmsgSpace)
-			}
-		}
 		err := u.rawConn.Read(reader)
 		if err != nil {
 			return err
@@ -495,84 +258,13 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
 		}
 
 		for i := 0; i < n; i++ {
-			from := getFrom(names, i, u.isV4)
-			payload := buffers[i][:msgs[i].Len]
-
-			segSize := 0
-			outerECN := byte(0)
-			if cmsgSpace > 0 {
-				segSize, outerECN = parseRecvCmsg(&msgs[i].Hdr, u.groSupported, u.ecnRecvSupported, u.isV4)
-			}
-
-			if segSize <= 0 || segSize >= len(payload) {
-				r(from, payload, RxMeta{OuterECN: outerECN})
-			} else {
-				for off := 0; off < len(payload); off += segSize {
-					end := off + segSize
-					if end > len(payload) {
-						end = len(payload)
-					}
-					seg := payload[off:end]
-					r(from, seg, RxMeta{OuterECN: outerECN})
-				}
-			}
+			r(u.readSockaddr(names[i]), buffers[i][:msgs[i].Len], RxMeta{})
 		}
 
 		flush()
 	}
 }
 
-// headerCounter returns the big-endian uint64 message counter at bytes
-// [8:16] of a nebula packet, or 0 if the buffer is too short.
-func headerCounter(buf []byte) uint64 {
-	if len(buf) < 16 {
-		return 0
-	}
-	return binary.BigEndian.Uint64(buf[8:16])
-}
-
-// parseRecvCmsg walks the per-slot ancillary buffer once and extracts up to
-// two values of interest in a single pass: the UDP_GRO gso_size (when
-// wantGRO is true) and the outer IP-level ECN codepoint stamped on the
-// carrier (when wantECN is true). Returns zeros for whichever field is not
-// requested or not present. isV4 selects between IP_TOS (1-byte) and
-// IPV6_TCLASS (4-byte int) cmsg payloads.
-func parseRecvCmsg(hdr *msghdr, wantGRO, wantECN bool, isV4 bool) (gso int, ecn byte) {
-	controllen := int(hdr.Controllen)
-	if controllen < unix.SizeofCmsghdr || hdr.Control == nil {
-		return 0, 0
-	}
-	ctrl := unsafe.Slice(hdr.Control, controllen)
-	off := 0
-	for off+unix.SizeofCmsghdr <= len(ctrl) {
-		ch := (*unix.Cmsghdr)(unsafe.Pointer(&ctrl[off]))
-		clen := int(ch.Len)
-		if clen < unix.SizeofCmsghdr || off+clen > len(ctrl) {
-			return gso, ecn
-		}
-		dataOff := off + unix.CmsgLen(0)
-		switch {
-		case wantGRO && ch.Level == unix.SOL_UDP && ch.Type == unix.UDP_GRO:
-			if dataOff+udpGROCmsgPayload <= len(ctrl) {
-				gso = int(int32(binary.NativeEndian.Uint32(ctrl[dataOff : dataOff+udpGROCmsgPayload])))
-			}
-		case wantECN && isV4 && ch.Level == unix.IPPROTO_IP && ch.Type == unix.IP_TOS:
-			// IP_TOS arrives as a single byte; only the low 2 bits are ECN.
-			if dataOff+1 <= len(ctrl) {
-				ecn = ctrl[dataOff] & 0x03
-			}
-		case wantECN && !isV4 && ch.Level == unix.IPPROTO_IPV6 && ch.Type == unix.IPV6_TCLASS:
-			// IPV6_TCLASS arrives as a 4-byte int; ECN is the low 2 bits.
-			if dataOff+4 <= len(ctrl) {
-				ecn = byte(binary.NativeEndian.Uint32(ctrl[dataOff:dataOff+4])) & 0x03
-			}
-		}
-		// Advance by the aligned cmsg space.
-		off += unix.CmsgSpace(clen - unix.CmsgLen(0))
-	}
-	return gso, ecn
-}
-
 func (u *StdConn) ListenOut(r EncReader, flush func()) error {
 	if u.batch == 1 {
 		return u.listenOutSingle(r, flush)
@@ -587,222 +279,89 @@ func (u *StdConn) WriteTo(b []byte, ip netip.AddrPort) error {
 }
 
 // WriteBatch sends bufs via sendmmsg(2) using the preallocated scratch on
-// StdConn. Consecutive packets to the same destination with matching segment
-// sizes (all but possibly the last) are coalesced into a single mmsghdr entry
-// carrying a UDP_SEGMENT cmsg, so one syscall can mix runs of GSO superpackets
-// with plain one-off datagrams. Without GSO support every packet is its own
-// entry, matching the prior behaviour.
+// StdConn. If supported, consecutive packets to the same destination with
+// matching segment sizes (all but possibly the last) are coalesced into a
+// single mmsghdr entry
 //
-// Chunks larger than the scratch are processed across multiple syscalls. If
-// sendmmsg returns an error AND zero entries went out we fall back to
+// If sendmmsg returns an error and zero entries went out, we fall back to
 // per-packet WriteTo for that chunk so the caller still gets best-effort
-// delivery; on a partial-success error we just replay the remainder.
-func (u *StdConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, ecns []byte) error {
-	if len(bufs) != len(addrs) {
-		return fmt.Errorf("WriteBatch: len(bufs)=%d != len(addrs)=%d", len(bufs), len(addrs))
-	}
-	if ecns != nil && len(ecns) != len(bufs) {
-		return fmt.Errorf("WriteBatch: len(ecns)=%d != len(bufs)=%d", len(ecns), len(bufs))
-	}
+// delivery. On a partial send we resume at the first un-acked entry on
+// the next iteration.
+func (u *StdConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, _ []byte) error {
+	for i := 0; i < len(bufs); {
+		chunk := min(len(bufs)-i, len(u.writeMsgs))
 
-	// Callers deliver same-destination packets contiguously and in counter
-	// order, so we run the GSO planner directly without a pre-sort. A
-	// sorting pass measurably hurt throughput in microbenchmarks while
-	// providing no observed reordering benefit.
+		for k := 0; k < chunk; k++ {
+			u.writeIovs[k].Base = &bufs[i+k][0]
+			setIovLen(&u.writeIovs[k], len(bufs[i+k]))
 
-	i := 0
-	for i < len(bufs) {
-		baseI := i
-		entry := 0
-		iovIdx := 0
-		for entry < len(u.writeMsgs) && i < len(bufs) {
-			iovBudget := len(u.writeIovs) - iovIdx
-			if iovBudget < 1 {
-				break
-			}
-			runLen, segSize := u.planRun(bufs, addrs, ecns, i, iovBudget)
-			if runLen == 0 {
-				break
-			}
-
-			for k := 0; k < runLen; k++ {
-				b := bufs[i+k]
-				if len(b) == 0 {
-					u.writeIovs[iovIdx+k].Base = nil
-					setIovLen(&u.writeIovs[iovIdx+k], 0)
-				} else {
-					u.writeIovs[iovIdx+k].Base = &b[0]
-					setIovLen(&u.writeIovs[iovIdx+k], len(b))
-				}
-			}
-
-			nlen, err := writeSockaddr(u.writeNames[entry], addrs[i], u.isV4)
+			nlen, err := writeSockaddr(u.writeNames[k], addrs[i+k], u.isV4)
 			if err != nil {
 				return err
 			}
 
-			hdr := &u.writeMsgs[entry].Hdr
-			hdr.Iov = &u.writeIovs[iovIdx]
-			setMsgIovlen(hdr, runLen)
+			hdr := &u.writeMsgs[k].Hdr
+			hdr.Iov = &u.writeIovs[k]
+			setMsgIovlen(hdr, 1)
 			hdr.Namelen = uint32(nlen)
-
-			var ecn byte
-			if ecns != nil {
-				ecn = ecns[i]
-			}
-			u.writeEntryCmsg(entry, runLen, segSize, ecn)
-
-			i += runLen
-			iovIdx += runLen
-			u.writeEntryEnd[entry] = i
-			entry++
 		}
 
-		if entry == 0 {
-			return fmt.Errorf("sendmmsg: no progress")
-		}
-
-		sent, serr := u.sendmmsg(entry)
+		sent, serr := u.sendmmsg(chunk)
 		if serr != nil && sent <= 0 {
-			// Nothing went out for this chunk; fall back to WriteTo for each
-			// packet that was queued this iteration. We only enter this path
-			// when sendmmsg returned an error AND zero entries succeeded —
-			// otherwise the partial-success advance below replays only the
-			// remainder, avoiding duplicates of already-sent packets.
-			//
-			// sent=-1 from sendmmsg means message 0 itself failed (partial
-			// success returns the count instead), so log entry 0's parameters
-			// — that's the entry the kernel rejected.
-			hdr0 := &u.writeMsgs[0].Hdr
-			runLen0 := u.writeEntryEnd[0] - baseI
-			seg0 := len(bufs[baseI])
-			ecn0 := byte(0)
-			if ecns != nil {
-				ecn0 = ecns[baseI]
-			}
-			u.l.Warn("sendmmsg had problem",
-				"sent", sent, "err", serr,
-				"entries", entry,
-				"entry0_runLen", runLen0,
-				"entry0_segSize", seg0,
-				"entry0_iovlen", hdr0.Iovlen,
-				"entry0_controllen", hdr0.Controllen,
-				"entry0_namelen", hdr0.Namelen,
-				"entry0_ecn", ecn0,
-				"entry0_dst", addrs[baseI],
+			// sendmmsg returns -1 / sent=0 when entry 0 itself failed; log
+			// that entry's destination and fall back to per-packet WriteTo
+			// for the whole chunk so the caller still gets best-effort
+			// delivery without duplicating packets the kernel accepted.
+			u.l.Warn("sendmmsg failed, falling back to per-packet WriteTo",
+				"err", serr,
+				"entries", chunk,
+				"entry0_dst", addrs[i],
 				"isV4", u.isV4,
-				"gso", u.gsoSupported,
-				"gro", u.groSupported,
 			)
-			for k := baseI; k < i; k++ {
-				if werr := u.WriteTo(bufs[k], addrs[k]); werr != nil {
+			for k := 0; k < chunk; k++ {
+				if werr := u.WriteTo(bufs[i+k], addrs[i+k]); werr != nil {
 					return werr
 				}
 			}
+			i += chunk
 			continue
 		}
-		if sent == 0 {
-			return fmt.Errorf("sendmmsg made no progress")
-		}
-		// Rewind i to the end of the last successfully sent entry. For a
-		// full-success send this leaves i unchanged; for a partial send it
-		// replays the remainder on the next outer-loop iteration.
-		i = u.writeEntryEnd[sent-1]
+		i += sent
 	}
 	return nil
 }
 
-// planRun groups consecutive packets starting at `start` that can be sent as
-// a single UDP GSO superpacket (one sendmmsg entry with UDP_SEGMENT cmsg).
-// A run of length 1 means the entry carries no UDP_SEGMENT cmsg and the
-// kernel treats it as a plain datagram. Returns the run length and the
-// per-segment size (which equals len(bufs[start])). Without GSO support
-// every call returns runLen=1. Outer ECN (when ecns != nil) is also a run
-// boundary — the kernel stamps one outer codepoint per sendmsg entry, so
-// mixing values inside a run would lose information.
-func (u *StdConn) planRun(bufs [][]byte, addrs []netip.AddrPort, ecns []byte, start, iovBudget int) (int, int) {
-	if start >= len(bufs) || iovBudget < 1 {
-		return 0, 0
-	}
-	segSize := len(bufs[start])
-	if !u.gsoSupported || segSize == 0 || segSize > maxGSOBytes {
-		return 1, segSize
-	}
-	dst := addrs[start]
-	var ecn byte
-	if ecns != nil {
-		ecn = ecns[start]
-	}
-	maxLen := u.maxGSOSegments
-	if iovBudget < maxLen {
-		maxLen = iovBudget
-	}
-	runLen := 1
-	total := segSize
-	for runLen < maxLen && start+runLen < len(bufs) {
-		nextLen := len(bufs[start+runLen])
-		if nextLen == 0 || nextLen > segSize {
-			break
-		}
-		if addrs[start+runLen] != dst {
-			break
-		}
-		if ecns != nil && ecns[start+runLen] != ecn {
-			break
-		}
-		if total+nextLen > maxGSOBytes {
-			break
-		}
-		total += nextLen
-		runLen++
-		if nextLen < segSize {
-			// A short packet must be the last in the run.
-			break
-		}
-	}
-	return runLen, segSize
-}
-
-// writeEntryCmsg sets up the per-mmsghdr Hdr.Control / Hdr.Controllen for one
-// entry. It writes the UDP_SEGMENT payload when runLen >= 2 and the
-// IP_TOS/IPV6_TCLASS payload when ecn != 0, then points hdr.Control at the
-// smallest contiguous span that covers whichever cmsg(s) actually apply.
-func (u *StdConn) writeEntryCmsg(entry, runLen, segSize int, ecn byte) {
-	hdr := &u.writeMsgs[entry].Hdr
-	useSeg := runLen >= 2
-	useEcn := ecn != 0
-	base := entry * u.writeCmsgSpace
-
-	if useSeg {
-		dataOff := base + unix.CmsgLen(0)
-		binary.NativeEndian.PutUint16(u.writeCmsg[dataOff:dataOff+2], uint16(segSize))
-	}
-	if useEcn {
-		dataOff := base + u.writeCmsgSegSpace + unix.CmsgLen(0)
-		binary.NativeEndian.PutUint32(u.writeCmsg[dataOff:dataOff+4], uint32(ecn))
-	}
-
-	switch {
-	case useSeg && useEcn:
-		hdr.Control = &u.writeCmsg[base]
-		setMsgControllen(hdr, u.writeCmsgSpace)
-	case useSeg:
-		hdr.Control = &u.writeCmsg[base]
-		setMsgControllen(hdr, u.writeCmsgSegSpace)
-	case useEcn:
-		hdr.Control = &u.writeCmsg[base+u.writeCmsgSegSpace]
-		setMsgControllen(hdr, u.writeCmsgEcnSpace)
-	default:
-		hdr.Control = nil
-		setMsgControllen(hdr, 0)
-	}
-}
-
-// sendmmsg issues sendmmsg(2) over u.rawConn against the first n entries
-// of u.writeMsgs. Routes through u.rawSend so the per-call kernel callback
-// stays alloc-free.
+// sendmmsg issues sendmmsg(2) against the first n entries of u.writeMsgs.
+// The bound u.sendmmsgCB is passed to rawConn.Write so no closure is
+// allocated per call; inputs and outputs ride on the StdConn fields.
 func (u *StdConn) sendmmsg(n int) (int, error) {
-	return u.rawSend.send(u.rawConn, n)
+	u.sendmmsgN = n
+	u.sendmmsgSent = 0
+	u.sendmmsgErrno = 0
+	if err := u.rawConn.Write(u.sendmmsgCB); err != nil {
+		return u.sendmmsgSent, err
+	}
+	if u.sendmmsgErrno != 0 {
+		return u.sendmmsgSent, &net.OpError{Op: "sendmmsg", Err: u.sendmmsgErrno}
+	}
+	return u.sendmmsgSent, nil
+}
+
+// sendmmsgRun is the rawConn.Write callback. It is bound once into
+// u.sendmmsgCB at construction so it stays alloc-free in the hot path;
+// inputs (sendmmsgN) and outputs (sendmmsgSent, sendmmsgErrno) ride on
+// the receiver rather than escaping locals.
+func (u *StdConn) sendmmsgRun(fd uintptr) bool {
+	r1, _, errno := unix.Syscall6(unix.SYS_SENDMMSG, fd,
+		uintptr(unsafe.Pointer(&u.writeMsgs[0])), uintptr(u.sendmmsgN),
+		0, 0, 0,
+	)
+	if errno == syscall.EAGAIN || errno == syscall.EWOULDBLOCK {
+		return false
+	}
+	u.sendmmsgSent = int(r1)
+	u.sendmmsgErrno = errno
+	return true
 }
 
 // writeSockaddr encodes addr into buf (which must be at least
@@ -820,9 +379,7 @@ func writeSockaddr(buf []byte, addr netip.AddrPort, isV4 bool) (int, error) {
 		binary.BigEndian.PutUint16(buf[2:4], addr.Port())
 		ip4 := ap.As4()
 		copy(buf[4:8], ip4[:])
-		for j := 8; j < 16; j++ {
-			buf[j] = 0
-		}
+		clear(buf[8:16])
 		return unix.SizeofSockaddrInet4, nil
 	}
 	// struct sockaddr_in6: { sa_family_t(2), in_port_t(2, BE), flowinfo(4), in6_addr(16), scope_id(4) }
@@ -940,22 +497,3 @@ func NewUDPStatsEmitter(udpConns []Conn) func() {
 		}
 	}
 }
-
-func parseRelease(r string) (major, minor int) {
-	// strip anything after the second dot or any non-digit
-	parts := strings.SplitN(r, ".", 3)
-	if len(parts) < 2 {
-		return 0, 0
-	}
-	major, _ = strconv.Atoi(parts[0])
-	// minor may have trailing junk like "15-generic"
-	mp := parts[1]
-	for i, c := range mp {
-		if c < '0' || c > '9' {
-			mp = mp[:i]
-			break
-		}
-	}
-	minor, _ = strconv.Atoi(mp)
-	return
-}