This commit is contained in:
JackDoan
2026-05-22 08:52:37 -05:00
parent aff46ce762
commit c610b712af
15 changed files with 202 additions and 381 deletions
+11
View File
@@ -62,6 +62,10 @@ func (c *offloadQueueSet) wakeForShutdown() error {
}
func (c *offloadQueueSet) Close() error {
if c.shutdownFd < 0 {
return nil
}
errs := []error{}
// Signal all readers blocked in poll to wake up and exit
@@ -75,5 +79,12 @@ func (c *offloadQueueSet) Close() error {
}
}
// All Offloads reference shutdownFd in their pollfd arrays, so close it
// only after every Offload.Close has returned.
if err := unix.Close(c.shutdownFd); err != nil {
errs = append(errs, err)
}
c.shutdownFd = -1
return errors.Join(errs...)
}
+15 -11
View File
@@ -2,7 +2,11 @@
package tio
import "testing"
import (
"testing"
"github.com/slackhq/nebula/wire"
)
// fakeBatch stands in for batch.TxBatcher inside the bench — same shape
// of pointer-capturing closure that sendInsideMessage builds.
@@ -21,27 +25,27 @@ type fakeIface struct {
}
// BenchmarkSegmentSuperpacketAllocsTSO measures allocation per
// SegmentSuperpacket call when a closure captures pointer-bearing
// receivers — the realistic shape of sendInsideMessage's closure.
// PerSegment call when a closure captures pointer-bearing receivers — the
// realistic shape of sendInsideMessage's closure.
func BenchmarkSegmentSuperpacketAllocsTSO(b *testing.B) {
const mss = 1400
const numSeg = 32
pkt := buildTSOv6(mss*numSeg, mss)
gso := GSOInfo{
gso := wire.GSOInfo{
Size: mss,
HdrLen: 60, // 40 (IPv6) + 20 (TCP)
CsumStart: 40,
Proto: GSOProtoTCP,
Proto: wire.GSOProtoTCP,
}
p := Packet{Bytes: pkt, GSO: gso}
p := wire.TunPacket{Bytes: pkt, Meta: gso}
hi := &fakeHostInfo{remoteIndexId: 0xdeadbeef}
f := &fakeIface{rebindCount: 7, hi: hi}
fb := &fakeBatch{}
// SegmentSuperpacket consumes pkt destructively; refresh from a master
// copy each iter (matches the production pattern where every TUN read
// hands the segmenter a fresh kernel-supplied buffer).
// PerSegment consumes pkt destructively; refresh from a master copy
// each iter (matches the production pattern where every TUN read hands
// the segmenter a fresh kernel-supplied buffer).
master := append([]byte(nil), pkt...)
work := make([]byte, len(pkt))
p.Bytes = work
@@ -50,7 +54,7 @@ func BenchmarkSegmentSuperpacketAllocsTSO(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
copy(work, master)
err := SegmentSuperpacket(p, func(seg []byte) error {
err := p.PerSegment(func(seg []byte) error {
out := fb.Reserve(16 + len(seg) + 16)
out[0] = byte(f.rebindCount)
out[1] = byte(hi.counter)
@@ -59,7 +63,7 @@ func BenchmarkSegmentSuperpacketAllocsTSO(b *testing.B) {
return nil
})
if err != nil {
b.Fatalf("SegmentSuperpacket: %v", err)
b.Fatalf("PerSegment: %v", err)
}
}
}
-22
View File
@@ -1,22 +0,0 @@
//go:build !linux || android || e2e_testing
package tio
import "fmt"
func protoFromGSOType(_ uint8) (GSOProto, error) {
return 0, fmt.Errorf("GSO unsupported")
}
// SegmentSuperpacket invokes fn once per segment of pkt. On non-Linux
// builds (and Android/e2e_testing) this package does not provide a Queue
// implementation, so any caller that does construct a Packet here can only
// be operating on non-superpacket bytes and the stub forwards them
// directly. A non-zero GSO field is a programming error from the caller
// and returns an explicit error rather than silently misbehaving.
func SegmentSuperpacket(pkt Packet, fn func(seg []byte) error) error {
if pkt.GSO.IsSuperpacket() {
return fmt.Errorf("tio: GSO superpacket on platform without segmentation support")
}
return fn(pkt.Bytes)
}
+5 -7
View File
@@ -63,18 +63,16 @@ type Queue interface {
// in pays except possibly the last is exactly the same size. proto picks
// the L4 protocol so the writer knows which GSOType / CsumOffset to set.
//
// Callers should also consult CapsProvider (via SupportsGSO or
// QueueCapabilities) for the per-protocol negotiated capability; an
// implementation of GSOWriter is necessary but not sufficient since USO
// may not have been negotiated even when TSO was.
// Callers should also consult Queue.Capabilities (via SupportsGSO) for
// the per-protocol negotiated capability; an implementation of GSOWriter
// is necessary but not sufficient since USO may not have been negotiated
// even when TSO was.
type GSOWriter interface {
WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto wire.GSOProto) error
}
// SupportsGSO reports whether w implements GSOWriter and the underlying
// queue advertises the negotiated capability for `want`. A writer that
// implements GSOWriter but not CapsProvider is treated as permissive
// (used by tests and fakes that don't negotiate).
// queue advertises the negotiated capability for `want` via Capabilities.
func SupportsGSO(w Queue, want wire.GSOProto) (GSOWriter, bool) {
gw, ok := w.(GSOWriter)
if !ok {
+42 -55
View File
@@ -16,26 +16,14 @@ import (
"github.com/slackhq/nebula/overlay/tio/virtio"
)
// tunRxBufSize is the per-Read worst-case footprint inside rxBuf: one
// kernel-supplied packet body, which is at most ~64 KiB (tunReadBufSize).
// Segmentation happens at encrypt time on a per-routine MTU-sized scratch
// (see SegmentSuperpacket), so rxBuf only holds raw kernel-supplied bytes.
// We round up to give comfortable margin for the drain headroom check
// below.
// tunRxBufSize is the per-Read worst-case footprint for one kernel-supplied
// packet body, which is at most ~64 KiB (tunReadBufSize). Segmentation
// happens at encrypt time via wire.TunPacket.PerSegment on a per-routine
// MTU-sized scratch, so the caller-supplied read buffer only holds raw
// kernel-supplied bytes. Used by Read's drain loop to gate further reads
// on whether the remaining buffer can still hold one worst-case packet.
const tunRxBufSize = 64 * 1024
// tunRxBufCap is the total size we allocate for the per-reader rx
// buffer. With reads landing directly in rxBuf, each drain iteration
// consumes up to tunRxBufSize of headroom for the kernel-supplied bytes.
// Sized to two such iterations so the initial blocking read plus one
// drain read both fit without partial-drop.
const tunRxBufCap = tunRxBufSize * 2
// tunDrainCap caps how many packets a single Read will accumulate via
// the post-wake drain loop. Sized to soak up a burst of small ACKs while
// bounding how much work a single caller holds before handing off.
const tunDrainCap = 64
// gsoMaxIovs caps the iovec budget WriteGSO assembles per call: 3 fixed
// entries (virtio_net_hdr, IP hdr, transport hdr) plus up to gsoMaxIovs-3
// payload fragments. Sized comfortably above the typical kernel GSO
@@ -50,7 +38,7 @@ const gsoMaxIovs = 256
// CHECKSUM_UNNECESSARY so the receiving network stack skips L4 checksum
// verification. All packets that reach the plain Write paths already carry
// a valid L4 checksum (either supplied by a remote peer whose ciphertext we
// AEAD-authenticated, produced by segmentTCPYield/segmentUDPYield during
// AEAD-authenticated, produced by virtio.SegmentTCP/SegmentUDP during
// superpacket segmentation, or built locally by CreateRejectPacket), so
// trusting them is safe.
var validVnetHdr = [virtio.Size]byte{unix.VIRTIO_NET_HDR_F_DATA_VALID}
@@ -71,12 +59,12 @@ type Offload struct {
// readVnetScratch holds the 10-byte virtio_net_hdr split off the front of
// every TUN read via readv(2). Decoupling the header from the packet body
// lets us read the body directly into rxBuf at the current rxOff with
// no userspace copy on the GSO_NONE fast path.
// lets us read the body directly into the caller-supplied mem at the
// current rxOff with no userspace copy on the GSO_NONE fast path.
readVnetScratch [virtio.Size]byte
// readIovs is the readv(2) iovec scratch wired once at construction —
// iovec[0] points at readVnetScratch; iovec[1].Base/Len is updated per
// read to address the current rxBuf slot.
// read to address the caller-supplied mem slot.
readIovs [2]unix.Iovec
// usoEnabled records whether the kernel agreed to TUN_F_USO* on this FD,
@@ -120,7 +108,8 @@ func newOffload(fd int, shutdownFd int, usoEnabled bool) (*Offload, error) {
out.gsoIovs[0].SetLen(virtio.Size)
// readIovs[0] is wired once to the virtio_net_hdr scratch; per-read we
// only repoint readIovs[1] at the next rxBuf slot (see readPacket).
// only repoint readIovs[1] at the next caller-supplied mem slot
// (see readPacket).
out.readIovs[0].Base = &out.readVnetScratch[0]
out.readIovs[0].SetLen(virtio.Size)
@@ -182,17 +171,16 @@ func (r *Offload) blockOnWrite() error {
}
// readPacket issues a single readv(2) splitting the virtio_net_hdr off
// into readVnetScratch and reading the packet body directly into rxBuf at
// the current rxOff. Returns the body length (zero virtio header bytes,
// just the IP packet/superpacket). block controls whether EAGAIN is
// retried via poll: the initial read of a drain blocks; subsequent drain
// reads do not.
// into readVnetScratch and reading the packet body directly into mem.
// Returns the body length (zero virtio header bytes, just the IP
// packet/superpacket). block controls whether EAGAIN is retried via poll:
// the initial read of a drain blocks; subsequent drain reads do not.
//
// The body iovec capacity is always tunReadBufSize; callers (the Read
// drain loop) gate entry on tunRxBufCap-rxOff >= tunRxBufSize, sized to
// hold one worst-case kernel-supplied packet body. Without that gate the
// body iovec could be smaller than the next inbound packet and the
// kernel would truncate.
// The body iovec capacity is always tunReadBufSize; the Read drain loop
// gates entry on len(mem)-rxOff >= tunRxBufSize, sized to hold one
// worst-case kernel-supplied packet body. Without that gate the body
// iovec could be smaller than the next inbound packet and the kernel
// would truncate.
func (r *Offload) readPacket(mem []byte, block bool) (int, error) {
for {
r.readIovs[1].Base = &mem[0]
@@ -223,16 +211,16 @@ func (r *Offload) readPacket(mem []byte, block bool) (int, error) {
}
}
// Read returns one or more packets from the tun. Each Packet either
// carries a single ready-to-use IP datagram (GSO zero) or a TSO/USO
// superpacket plus the GSOInfo a caller needs to segment it (see
// SegmentSuperpacket). The first read blocks via poll; once the fd is
// known readable we drain additional packets non-blocking until the
// kernel queue is empty (EAGAIN), we've collected tunDrainCap packets,
// or we're out of rxBuf headroom. This amortizes the poll wake over
// bursts of small packets (e.g. TCP ACKs). Packet.Bytes slices point
// into the Offload's internal buffer and are only valid until the next
// Read or Close on this Queue.
// Read returns one or more packets from the tun. Each wire.TunPacket
// either carries a single ready-to-use IP datagram (GSO zero) or a TSO/USO
// superpacket plus the wire.GSOInfo a caller needs to segment it (see
// wire.TunPacket.PerSegment). The first read blocks via poll; once the fd
// is known readable we drain additional packets non-blocking until the
// kernel queue is empty (EAGAIN), p is full, or mem no longer has room
// for another worst-case packet (tunRxBufSize). This amortizes the poll
// wake over bursts of small packets (e.g. TCP ACKs). The Bytes slices on
// returned packets point into the caller-supplied mem and are only valid
// until the next Read or Close on this Queue.
func (r *Offload) Read(p []wire.TunPacket, mem []byte) (int, error) {
maxP := len(p)
maxM := len(mem)
@@ -255,9 +243,9 @@ func (r *Offload) Read(p []wire.TunPacket, mem []byte) (int, error) {
break
}
// Drain: non-blocking reads until the kernel queue is empty, the drain
// cap is reached, or rxBuf no longer has room for another worst-case
// kernel-supplied packet (tunRxBufSize).
// Drain: non-blocking reads until the kernel queue is empty, p is full,
// or mem no longer has room for another worst-case kernel-supplied
// packet (tunRxBufSize).
for len(p) < maxP && maxM-rxOff >= tunRxBufSize {
n, err := r.readPacket(mem[rxOff:], false)
if err != nil {
@@ -279,13 +267,12 @@ func (r *Offload) Read(p []wire.TunPacket, mem []byte) (int, error) {
return len(p), nil
}
// decodeRead processes the packet sitting in rxBuf at rxOff (length
// pktLen). The bytes stay in rxBuf — for GSO_NONE we slice them as a
// regular IP datagram (running finishChecksum if NEEDS_CSUM is set);
// for TSO/USO superpackets we attach the corrected GSO metadata so the
// caller can segment lazily at encrypt time. rxOff advances past the
// kernel-supplied body and nothing else, since segmentation no longer
// writes back into rxBuf.
// decodeRead processes the packet sitting at mem[:pktLen]. The bytes stay
// in mem — for GSO_NONE we slice them as a regular IP datagram (running
// finishChecksum if NEEDS_CSUM is set); for TSO/USO superpackets we attach
// the corrected GSO metadata so the caller can segment lazily at encrypt
// time. The caller advances its own rxOff past the kernel-supplied body
// and nothing else, since segmentation no longer writes back into mem.
func (r *Offload) decodeRead(p []wire.TunPacket, mem []byte, pktLen int) ([]wire.TunPacket, error) {
if pktLen <= 0 {
return p, fmt.Errorf("short tun read: %d", pktLen)
@@ -307,8 +294,8 @@ func (r *Offload) decodeRead(p []wire.TunPacket, mem []byte, pktLen int) ([]wire
// GSO superpacket: validate, fix the kernel-supplied HdrLen on the
// FORWARD path (CorrectHdrLen), pick the L4 protocol, and attach
// the metadata. The bytes stay in rxBuf untouched, segmentation
// happens in SegmentSuperpacket at encrypt time.
// the metadata. The bytes stay in mem untouched; segmentation
// happens in wire.TunPacket.PerSegment at encrypt time.
if err := virtio.CheckValid(body, hdr); err != nil {
return p, err
}
+49 -61
View File
@@ -12,6 +12,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/checksum"
"github.com/slackhq/nebula/overlay/tio/virtio"
"github.com/slackhq/nebula/wire"
)
// testSegScratchSize is a generous segmentation scratch sized to fit any
@@ -26,12 +27,11 @@ func verifyChecksum(b []byte, pseudo uint16) bool {
}
// segmentForTest is the test-only counterpart to the production
// SegmentSuperpacket path. It handles GSO_NONE (with optional
// wire.TunPacket.PerSegment path. It handles GSO_NONE (with optional
// finishChecksum) inline and dispatches GSO superpackets through
// SegmentSuperpacket, draining each yielded segment into a
// freshly-copied [][]byte slot so callers can iterate after the call
// returns. Tests pre-set hdr.HdrLen correctly, so correctHdrLen is not
// invoked here.
// PerSegment, draining each yielded segment into a freshly-copied [][]byte
// slot so callers can iterate after the call returns. Tests pre-set
// hdr.HdrLen correctly, so correctHdrLen is not invoked here.
func segmentForTest(pkt []byte, hdr virtio.Hdr, out *[][]byte, scratch []byte) error {
if hdr.GSOType == unix.VIRTIO_NET_HDR_GSO_NONE {
cp := append([]byte(nil), pkt...)
@@ -47,13 +47,16 @@ func segmentForTest(pkt []byte, hdr virtio.Hdr, out *[][]byte, scratch []byte) e
if err != nil {
return err
}
gso := GSOInfo{
Size: hdr.GSOSize,
HdrLen: hdr.HdrLen,
CsumStart: hdr.CsumStart,
Proto: proto,
p := wire.TunPacket{
Bytes: pkt,
Meta: wire.GSOInfo{
Size: hdr.GSOSize,
HdrLen: hdr.HdrLen,
CsumStart: hdr.CsumStart,
Proto: proto,
},
}
return SegmentSuperpacket(Packet{Bytes: pkt, GSO: gso}, func(seg []byte) error {
return p.PerSegment(func(seg []byte) error {
*out = append(*out, append([]byte(nil), seg...))
return nil
})
@@ -592,8 +595,8 @@ func BenchmarkSegmentTCPv4(b *testing.B) {
scratch := make([]byte, testSegScratchSize)
out := make([][]byte, 0, 64)
// SegmentSuperpacket consumes its input destructively; restore
// pkt from a master copy each iteration. The restore mirrors the
// PerSegment consumes its input destructively; restore pkt from
// a master copy each iteration. The restore mirrors the
// kernel→userspace copy that hands a fresh GSO blob to the
// segmenter in production, so it's representative cost rather
// than bench overhead.
@@ -673,24 +676,21 @@ func buildTSOv6(payLen, gso int) []byte {
return pkt
}
// TestDecodeReadFitsMaxTSOAtDrainThreshold proves the rxBuf sizing is
// correct: when rxOff is at the maximum value the drain headroom check
// allows, decodeRead must still be able to absorb a worst-case 64KiB
// TSO superpacket without dropping the burst. With segmentation deferred
// to encrypt time, decodeRead writes only the kernel-supplied bytes into
// rxBuf, so the size requirement is just "fit one worst-case input."
// TestDecodeReadFitsMaxTSO proves decodeRead can absorb a worst-case
// 64KiB TSO superpacket without dropping it. With segmentation deferred to
// encrypt time, decodeRead writes nothing — it just slices the
// caller-supplied mem and attaches GSO metadata — so the size requirement
// is just "fit one worst-case input."
//
// Regression history: in a prior layout the rx buffer doubled as the
// segmentation output, a near-threshold drain read returned "scratch too
// small", the whole 45-segment TSO burst was dropped, and the remote's TCP
// fast-retransmit collapsed cwnd. Keeping this test in the new layout
// guards against re-introducing a drain headroom shortfall.
func TestDecodeReadFitsMaxTSOAtDrainThreshold(t *testing.T) {
// fast-retransmit collapsed cwnd. Keeping this test guards against
// re-introducing per-call sizing assumptions inside decodeRead.
func TestDecodeReadFitsMaxTSO(t *testing.T) {
const ipv6HdrLen = 40
const tcpHdrLen = 20
const headerLen = ipv6HdrLen + tcpHdrLen
// Maximum TUN read body. The tunReadBufSize cap on readv's body iovec
// is what bounds the kernel's superpacket length.
pktLen := tunReadBufSize
payLen := pktLen - headerLen
const targetSegs = 64
@@ -701,16 +701,12 @@ func TestDecodeReadFitsMaxTSOAtDrainThreshold(t *testing.T) {
t.Fatalf("buildTSOv6 produced %d bytes, want %d", len(pkt), pktLen)
}
o := &Offload{
rxBuf: make([]byte, tunRxBufCap),
}
// rxOff at the maximum value the drain headroom check permits before
// it would refuse another read. Any drain-time read up to this
// threshold MUST still process correctly.
o.rxOff = tunRxBufCap - tunRxBufSize
// Stage the body in rxBuf as if readv(2) just placed it there.
copy(o.rxBuf[o.rxOff:], pkt)
o := &Offload{}
// mem is sized exactly to one worst-case packet — the caller-side
// invariant the drain loop in Read enforces. decodeRead must process
// the burst within that window.
mem := make([]byte, pktLen)
copy(mem, pkt)
// Encode the matching virtio_net_hdr.
hdr := virtio.Hdr{
@@ -723,50 +719,42 @@ func TestDecodeReadFitsMaxTSOAtDrainThreshold(t *testing.T) {
}
hdr.Encode(o.readVnetScratch[:])
startRxOff := o.rxOff
if err := o.decodeRead(pktLen); err != nil {
t.Fatalf("decodeRead at drain threshold returned %v — rxBuf sizing regression: "+
var pkts []wire.TunPacket
pkts, err := o.decodeRead(pkts, mem, pktLen)
if err != nil {
t.Fatalf("decodeRead returned %v — sizing regression: "+
"tunRxBufSize=%d must hold one worst-case input (%d)",
err, tunRxBufSize, pktLen)
}
if len(o.pending) != 1 {
t.Fatalf("got %d packets, want 1 superpacket entry", len(o.pending))
if len(pkts) != 1 {
t.Fatalf("got %d packets, want 1 superpacket entry", len(pkts))
}
got := o.pending[0]
if !got.GSO.IsSuperpacket() {
t.Fatalf("expected superpacket GSO metadata, got %+v", got.GSO)
got := pkts[0]
if !got.Meta.IsSuperpacket() {
t.Fatalf("expected superpacket GSO metadata, got %+v", got.Meta)
}
if got.GSO.Proto != GSOProtoTCP {
t.Errorf("GSO.Proto=%d want TCP", got.GSO.Proto)
if got.Meta.Proto != wire.GSOProtoTCP {
t.Errorf("Meta.Proto=%d want TCP", got.Meta.Proto)
}
if got.GSO.Size != uint16(gsoSize) {
t.Errorf("GSO.Size=%d want %d", got.GSO.Size, gsoSize)
if got.Meta.Size != uint16(gsoSize) {
t.Errorf("Meta.Size=%d want %d", got.Meta.Size, gsoSize)
}
if got.GSO.HdrLen != uint16(headerLen) {
t.Errorf("GSO.HdrLen=%d want %d", got.GSO.HdrLen, headerLen)
if got.Meta.HdrLen != uint16(headerLen) {
t.Errorf("Meta.HdrLen=%d want %d", got.Meta.HdrLen, headerLen)
}
if got.GSO.CsumStart != uint16(ipv6HdrLen) {
t.Errorf("GSO.CsumStart=%d want %d", got.GSO.CsumStart, ipv6HdrLen)
if got.Meta.CsumStart != uint16(ipv6HdrLen) {
t.Errorf("Meta.CsumStart=%d want %d", got.Meta.CsumStart, ipv6HdrLen)
}
if len(got.Bytes) != pktLen {
t.Errorf("len(Bytes)=%d want %d", len(got.Bytes), pktLen)
}
// rxOff advances exactly by the kernel-supplied body length — no
// segmentation output to account for any more.
if o.rxOff != startRxOff+pktLen {
t.Errorf("rxOff=%d want %d", o.rxOff, startRxOff+pktLen)
}
if o.rxOff > tunRxBufCap {
t.Fatalf("rxOff=%d overran rxBuf (cap=%d)", o.rxOff, tunRxBufCap)
}
// Validate that segmenting the returned superpacket reproduces the
// expected per-segment IPv6 payload length and TCP checksum.
wantSegs := (payLen + gsoSize - 1) / gsoSize
gotSegs := 0
if err := SegmentSuperpacket(got, func(seg []byte) error {
if err := got.PerSegment(func(seg []byte) error {
defer func() { gotSegs++ }()
if len(seg) < headerLen+1 {
t.Errorf("seg %d too short: %d", gotSegs, len(seg))
@@ -786,7 +774,7 @@ func TestDecodeReadFitsMaxTSOAtDrainThreshold(t *testing.T) {
}
return nil
}); err != nil {
t.Fatalf("SegmentSuperpacket: %v", err)
t.Fatalf("PerSegment: %v", err)
}
if gotSegs != wantSegs {
t.Fatalf("got %d segments, want %d", gotSegs, wantSegs)