batched tun interface

JackDoan
2026-04-17 10:25:05 -05:00
parent 9a30c5b6a1
commit afcdf2163b
20 changed files with 939 additions and 68 deletions

overlay/batch/batch.go Normal file

@@ -0,0 +1,28 @@
package batch
import "net/netip"
type RxBatcher interface {
// Reserve returns a borrowed sz-byte slice for the caller to fill in place.
Reserve(sz int) []byte
// Commit queues pkt for emission. The caller must keep pkt valid until the next Flush.
Commit(pkt []byte) error
// Flush emits every queued packet in arrival order. Returns the
// first error observed; keeps draining so one bad packet doesn't hold up
// the rest. After Flush returns, borrowed payload slices may be recycled.
Flush() error
}
type TxBatcher interface {
// Reserve returns a borrowed sz-byte slice for the caller to fill in place.
Reserve(sz int) []byte
// Commit queues pkt and records its destination plus the 2-bit
// IP-level ECN codepoint to set on the outer (carrier) header. The
// caller must keep pkt valid until the next Flush. Pass 0 (Not-ECT)
// to leave the outer ECN field unset.
Commit(pkt []byte, dst netip.AddrPort, outerECN byte)
// Flush emits every queued packet via the underlying batch writer in
// arrival order. Returns an errors.Join of any errors encountered. After Flush returns,
// borrowed payload slices may be recycled.
Flush() error
}
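Not part of the diff: a minimal caller-side sketch of the Reserve/Commit/Flush contract shared by both interfaces, assuming a TxBatcher implementation is already in hand; sendAll, payloads and dst are illustrative names only.

// Hypothetical caller (illustrative, not in this commit): borrow a slot,
// fill it in place, commit it with its destination, then flush once per batch.
func sendAll(tx TxBatcher, payloads [][]byte, dst netip.AddrPort) error {
	for _, p := range payloads {
		slot := tx.Reserve(len(p)) // borrowed; must stay valid until Flush
		copy(slot, p)
		tx.Commit(slot, dst, 0) // 0 = Not-ECT: leave the outer ECN field unset
	}
	return tx.Flush() // emits in arrival order; slots may then be recycled
}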


@@ -0,0 +1,42 @@
package batch
// Arena is an injectable byte-slab that hands out non-overlapping borrowed
// slices via Reserve and releases them in bulk via Reset. Coalescers take
// an *Arena at construction so the caller controls the slab lifetime and
// can share one slab across multiple coalescers (MultiCoalescer hands the
// same *Arena to every lane so the lanes don't carry their own backings).
//
// Reserve borrows; the slice is valid until the next Reset. The slab grows
// (by allocating a fresh, larger backing array) if a Reserve doesn't fit;
// pre-size the arena via NewArena to avoid that path on the hot path.
type Arena struct {
buf []byte
}
// NewArena returns an Arena with a pre-allocated backing of the given
// capacity. Pass 0 if you don't intend to call Reserve (e.g. a test that
// only feeds the coalescer pre-made []byte packets via Commit).
func NewArena(capacity int) *Arena {
return &Arena{buf: make([]byte, 0, capacity)}
}
// Reserve hands out a non-overlapping sz-byte slice from the arena. If the
// request doesn't fit the current backing, a fresh, larger backing is
// allocated; already-borrowed slices reference the old backing and remain
// valid until Reset.
func (a *Arena) Reserve(sz int) []byte {
if len(a.buf)+sz > cap(a.buf) {
newCap := max(cap(a.buf)*2, sz)
a.buf = make([]byte, 0, newCap)
}
start := len(a.buf)
a.buf = a.buf[:start+sz]
return a.buf[start : start+sz : start+sz]
}
// Reset releases every slice handed out since the last Reset. Callers must
// not use any previously-borrowed slice after this returns. The underlying
// backing array is retained so subsequent Reserves don't re-allocate.
func (a *Arena) Reset() {
a.buf = a.buf[:0]
}
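Not part of the diff: a tiny sketch of the borrow/Reset lifetime using only the Arena API above; the sizes, the io.Writer and the function name are illustrative.

// Hypothetical lifetime walk-through (illustrative, not in this commit).
func arenaLifetimeExample(w io.Writer) error {
	a := NewArena(4096)
	p1 := a.Reserve(1500) // borrowed; valid until the next Reset
	p2 := a.Reserve(1500) // non-overlapping with p1, same backing array
	// Fill the borrowed slices in place and use them before Reset.
	if _, err := w.Write(p1); err != nil {
		return err
	}
	if _, err := w.Write(p2); err != nil {
		return err
	}
	a.Reset() // releases both slices in bulk; the backing is kept for reuse
	return nil
}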


@@ -0,0 +1,52 @@
package batch
import (
"io"
"github.com/slackhq/nebula/udp"
)
// Passthrough is an RxBatcher that doesn't coalesce anything: it accumulates packets and writes each one to out on Flush.
type Passthrough struct {
out io.Writer
slots [][]byte
arena *Arena
cursor int
}
const passthroughBaseNumSlots = 128
// DefaultPassthroughArenaCap is the recommended arena capacity for a
// standalone Passthrough batcher: 128 slots × udp.MTU ≈ 1.1 MiB.
const DefaultPassthroughArenaCap = passthroughBaseNumSlots * udp.MTU
func NewPassthrough(w io.Writer, arena *Arena) *Passthrough {
return &Passthrough{
out: w,
slots: make([][]byte, 0, passthroughBaseNumSlots),
arena: arena,
}
}
func (p *Passthrough) Reserve(sz int) []byte {
return p.arena.Reserve(sz)
}
func (p *Passthrough) Commit(pkt []byte) error {
p.slots = append(p.slots, pkt)
return nil
}
func (p *Passthrough) Flush() error {
var firstErr error
for _, s := range p.slots {
_, err := p.out.Write(s)
if err != nil && firstErr == nil {
firstErr = err
}
}
clear(p.slots)
p.slots = p.slots[:0]
p.arena.Reset()
return firstErr
}
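Not part of the diff: a short sketch of driving a Passthrough end to end; the bytes.Buffer sink and the literal bytes are illustrative, and any io.Writer (a TUN queue in practice) would do.

// Hypothetical usage (illustrative, not in this commit).
func passthroughExample() error {
	var sink bytes.Buffer // stand-in for the real io.Writer
	p := NewPassthrough(&sink, NewArena(DefaultPassthroughArenaCap))
	slot := p.Reserve(3)                 // borrow from the injected arena
	copy(slot, []byte{0x45, 0x00, 0x03}) // fill in place
	if err := p.Commit(slot); err != nil { // queue; slot stays valid until Flush
		return err
	}
	return p.Flush() // writes every queued packet, then recycles the arena
}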

overlay/batch/tx_batch.go Normal file

@@ -0,0 +1,65 @@
package batch
import (
"net/netip"
"github.com/slackhq/nebula/udp"
)
const SendBatchCap = 128
// DefaultSendBatchArenaCap is the recommended arena capacity for a
// standalone SendBatch: 128 slots × (udp.MTU + 32) ≈ 1.1 MiB. The +32 covers
// the nebula header + AEAD tag tacked onto each plaintext segment.
const DefaultSendBatchArenaCap = SendBatchCap * (udp.MTU + 32)
// batchWriter is the minimal subset of udp.Conn needed by SendBatch to flush.
type batchWriter interface {
WriteBatch(bufs [][]byte, addrs []netip.AddrPort, outerECNs []byte) error
}
// SendBatch accumulates encrypted UDP packets and flushes them via WriteBatch.
// One SendBatch is owned by each listenIn goroutine; no locking is needed.
// Slot bytes are borrowed from the injected Arena and remain valid until
// Flush, which Resets the arena.
type SendBatch struct {
out batchWriter
bufs [][]byte
dsts []netip.AddrPort
ecns []byte
arena *Arena
}
// NewSendBatch makes a SendBatch with batchCap slots backed by arena.
func NewSendBatch(out batchWriter, batchCap int, arena *Arena) *SendBatch {
return &SendBatch{
out: out,
bufs: make([][]byte, 0, batchCap),
dsts: make([]netip.AddrPort, 0, batchCap),
ecns: make([]byte, 0, batchCap),
arena: arena,
}
}
func (b *SendBatch) Reserve(sz int) []byte {
return b.arena.Reserve(sz)
}
func (b *SendBatch) Commit(pkt []byte, dst netip.AddrPort, outerECN byte) {
b.bufs = append(b.bufs, pkt)
b.dsts = append(b.dsts, dst)
b.ecns = append(b.ecns, outerECN)
}
func (b *SendBatch) Flush() error {
var err error
if len(b.bufs) > 0 {
err = b.out.WriteBatch(b.bufs, b.dsts, b.ecns)
}
clear(b.bufs)
b.bufs = b.bufs[:0]
b.dsts = b.dsts[:0]
b.ecns = b.ecns[:0]
b.arena.Reset()
return err
}


@@ -0,0 +1,124 @@
package batch
import (
"net/netip"
"testing"
)
type fakeBatchWriter struct {
bufs [][]byte
addrs []netip.AddrPort
ecns []byte
}
func (w *fakeBatchWriter) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, ecns []byte) error {
// Snapshot — SendBatch.Flush nils its slot pointers right after WriteBatch
// returns, so tests must capture data before that happens.
w.bufs = make([][]byte, len(bufs))
for i, b := range bufs {
cp := make([]byte, len(b))
copy(cp, b)
w.bufs[i] = cp
}
w.addrs = append(w.addrs[:0], addrs...)
w.ecns = append(w.ecns[:0], ecns...)
return nil
}
func TestSendBatchReserveCommitFlush(t *testing.T) {
fw := &fakeBatchWriter{}
b := NewSendBatch(fw, 4, NewArena(32))
ap := netip.MustParseAddrPort("10.0.0.1:4242")
for i := 0; i < 4; i++ {
slot := b.Reserve(32)
if cap(slot) != 32 {
t.Fatalf("slot %d: cap=%d want 32", i, cap(slot))
}
pkt := append(slot[:0], byte(i), byte(i+1), byte(i+2))
b.Commit(pkt, ap, 0)
}
if err := b.Flush(); err != nil {
t.Fatalf("Flush: %v", err)
}
if len(fw.bufs) != 4 {
t.Fatalf("WriteBatch got %d bufs want 4", len(fw.bufs))
}
for i, buf := range fw.bufs {
if len(buf) != 3 || buf[0] != byte(i) {
t.Errorf("buf %d: %x", i, buf)
}
if fw.addrs[i] != ap {
t.Errorf("addr %d: got %v want %v", i, fw.addrs[i], ap)
}
}
// Flush again with nothing committed — should be a no-op.
fw.bufs = nil
if err := b.Flush(); err != nil {
t.Fatalf("empty Flush: %v", err)
}
if fw.bufs != nil {
t.Fatalf("empty Flush triggered WriteBatch")
}
// Reuse after Flush.
slot := b.Reserve(32)
if cap(slot) != 32 {
t.Fatalf("after Flush Reserve wrong cap: %d", cap(slot))
}
}
func TestSendBatchSlotsDoNotOverlap(t *testing.T) {
fw := &fakeBatchWriter{}
b := NewSendBatch(fw, 3, NewArena(8))
ap := netip.MustParseAddrPort("10.0.0.1:80")
for i := 0; i < 3; i++ {
s := b.Reserve(8)
pkt := append(s[:0], byte(0xA0+i), byte(0xB0+i))
b.Commit(pkt, ap, 0)
}
if err := b.Flush(); err != nil {
t.Fatalf("Flush: %v", err)
}
for i, buf := range fw.bufs {
if buf[0] != byte(0xA0+i) || buf[1] != byte(0xB0+i) {
t.Errorf("slot %d corrupted: %x", i, buf)
}
}
}
func TestSendBatchGrowPreservesCommitted(t *testing.T) {
fw := &fakeBatchWriter{}
// Tiny initial backing forces a grow on the second Reserve.
b := NewSendBatch(fw, 1, NewArena(4))
ap := netip.MustParseAddrPort("10.0.0.1:80")
s1 := b.Reserve(4)
pkt1 := append(s1[:0], 0x11, 0x22, 0x33, 0x44)
b.Commit(pkt1, ap, 0)
s2 := b.Reserve(8) // exceeds remaining cap, triggers grow
pkt2 := append(s2[:0], 0xA, 0xB, 0xC, 0xD, 0xE)
b.Commit(pkt2, ap, 0)
// pkt1 must still be intact even though backing reallocated.
if pkt1[0] != 0x11 || pkt1[3] != 0x44 {
t.Fatalf("first packet corrupted by grow: %x", pkt1)
}
if err := b.Flush(); err != nil {
t.Fatalf("Flush: %v", err)
}
if len(fw.bufs) != 2 {
t.Fatalf("got %d bufs want 2", len(fw.bufs))
}
if fw.bufs[0][0] != 0x11 || fw.bufs[0][3] != 0x44 {
t.Errorf("first packet on the wire: %x", fw.bufs[0])
}
if fw.bufs[1][0] != 0xA || fw.bufs[1][4] != 0xE {
t.Errorf("second packet on the wire: %x", fw.bufs[1])
}
}


@@ -8,6 +8,10 @@ import (
"github.com/slackhq/nebula/routing"
)
// defaultBatchBufSize is the per-Queue scratch size for Read on backends
// that don't do TSO segmentation. 65535 covers any single IP packet.
const defaultBatchBufSize = 65535
type Device interface {
io.Closer
Activate() error

overlay/tio/segment.go Normal file

@@ -0,0 +1,12 @@
package tio
import "fmt"
// SegmentSuperpacket invokes fn once per segment of pkt.
// This is a stub implementation that does not support segmentation: a GSO
// superpacket is rejected with an error, while a regular packet is passed
// straight through to fn.
func SegmentSuperpacket(pkt Packet, fn func(seg []byte) error) error {
if pkt.GSO.IsSuperpacket() {
return fmt.Errorf("tio: GSO superpacket on platform without segmentation support")
}
return fn(pkt.Bytes)
}
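Not part of the diff: a rough sketch of what a segmenting variant of this stub could look like, assuming the Packet and GSOInfo shapes introduced later in this commit; per-segment IP length/ID, TCP sequence number and checksum rewrites are deliberately elided, so this is illustrative only.

// Hypothetical segmentation loop (illustrative, not in this commit). A real
// implementation must also rewrite per-segment IP total lengths/IDs, TCP
// sequence numbers and checksums before handing each segment to fn.
func segmentSketch(pkt Packet, fn func(seg []byte) error) error {
	hdrLen, segSize := int(pkt.GSO.HdrLen), int(pkt.GSO.Size)
	if segSize <= 0 {
		return fn(pkt.Bytes) // not a superpacket
	}
	hdr := pkt.Bytes[:hdrLen]
	payload := pkt.Bytes[hdrLen:]
	seg := make([]byte, 0, hdrLen+segSize)
	for off := 0; off < len(payload); off += segSize {
		end := min(off+segSize, len(payload))
		seg = append(append(seg[:0], hdr...), payload[off:end]...)
		if err := fn(seg); err != nil {
			return err
		}
	}
	return nil
}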


@@ -18,7 +18,12 @@ type QueueSet interface {
// Capabilities advertises which kernel offload features a Queue successfully negotiated.
// Callers consult this to decide which coalescers to wire onto the write path.
type Capabilities struct {
//none yet!
// TSO means the FD was opened with IFF_VNET_HDR and the kernel agreed
// to TUN_F_TSO4|TSO6 — i.e. WriteGSO with GSOProtoTCP is safe.
TSO bool
// USO means the kernel additionally agreed to TUN_F_USO4|USO6, so
// WriteGSO with GSOProtoUDP is safe. Linux ≥ 6.2.
USO bool
}
// Queue is a readable/writable Poll queue. One Queue is driven by a single
@@ -40,3 +45,78 @@ type Queue interface {
// or the zero value when q does not advertise any.
Capabilities() Capabilities
}
// GSOInfo describes a kernel-supplied superpacket sitting in Packet.Bytes.
// The zero value means "not a superpacket" — Bytes is one regular IP
// datagram and no segmentation is required.
type GSOInfo struct {
// Size is the GSO segment size: max payload bytes per segment
// (== TCP MSS for TSO, == UDP payload chunk for USO). Zero means
// not a superpacket.
Size uint16
// HdrLen is the total L3+L4 header length within Bytes (already
// corrected via correctHdrLen, so safe to slice on).
HdrLen uint16
// CsumStart is the L4 header offset inside Bytes (== L3 header
// length).
CsumStart uint16
// Proto picks the L4 protocol (TCP or UDP) so the segmenter knows
// which checksum/header layout to apply.
Proto GSOProto
}
// GSOProto selects the L4 protocol for a GSO superpacket. Determines which
// VIRTIO_NET_HDR_GSO_* type the writer stamps and which checksum offset
// inside the transport header virtio NEEDS_CSUM expects.
type GSOProto uint8
const (
GSOProtoNone GSOProto = iota
GSOProtoTCP
GSOProtoUDP
)
// GSOWriter is implemented by Queues that can emit a TCP or UDP superpacket
// assembled from a header prefix plus one or more borrowed payload
// fragments, in a single vectored write (writev with a leading
// virtio_net_hdr). This lets the coalescer avoid copying payload bytes
// between the caller's decrypt buffer and the TUN. Backends without GSO
// support do not implement this interface and coalescing is skipped.
//
// hdr contains the IPv4/IPv6 header prefix (mutable - callers will have
// filled in total length and IP csum). transportHdr is the TCP or UDP
// header (mutable - the L4 checksum field must hold the pseudo-header
// partial, single-fold not inverted, per virtio NEEDS_CSUM semantics).
// pays are non-overlapping payload fragments whose concatenation is the
// full superpacket payload; they are read-only from the writer's
// perspective and must remain valid until the call returns. Every segment
// in pays except possibly the last is exactly the same size. proto picks
// the L4 protocol so the writer knows which GSOType / CsumOffset to set.
//
// Callers should also consult CapsProvider (via SupportsGSO or
// QueueCapabilities) for the per-protocol negotiated capability; an
// implementation of GSOWriter is necessary but not sufficient since USO
// may not have been negotiated even when TSO was.
type GSOWriter interface {
WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto GSOProto) error
}
// SupportsGSO reports whether w implements GSOWriter and whether the
// underlying queue advertises the negotiated capability for `want`. Both
// checks must pass: a queue can implement GSOWriter while only TSO (and not
// USO) was negotiated, so callers gate per protocol before building a
// superpacket.
func SupportsGSO(w Queue, want GSOProto) (GSOWriter, bool) {
gw, ok := w.(GSOWriter)
if !ok {
return nil, false
}
caps := w.Capabilities()
switch want {
case GSOProtoTCP:
return gw, caps.TSO
case GSOProtoUDP:
return gw, caps.USO
default:
return gw, false
}
}
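Not part of the diff: a small write-path sketch of the gate SupportsGSO is meant for; the function, its arguments and the fallback closure are illustrative only.

// Hypothetical write-path gate (illustrative, not in this commit). fallback
// stands in for the existing non-GSO, per-segment write path.
func writeCoalesced(q Queue, ipHdr, udpHdr []byte, pays [][]byte, fallback func() error) error {
	if gw, ok := SupportsGSO(q, GSOProtoUDP); ok {
		// USO was negotiated: one vectored write carries the whole superpacket.
		return gw.WriteGSO(ipHdr, udpHdr, pays, GSOProtoUDP)
	}
	// USO not negotiated (or the backend has no GSOWriter): skip coalescing.
	return fallback()
}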