holy crap 2x

This commit is contained in:
JackDoan
2026-04-17 15:33:46 -05:00
parent 1fd24a19c7
commit 60e556866a
5 changed files with 643 additions and 307 deletions

View File

@@ -7,72 +7,114 @@ import (
"github.com/slackhq/nebula/overlay"
)
// ipProtoTCP is the IANA protocol number for TCP. Hardcoded instead of
// reaching for golang.org/x/sys/unix — that package doesn't define the
// constant on Windows, which would break cross-compiles even though this
// file runs unchanged on every platform.
const ipProtoTCP = 6
// tcpCoalesceBufSize caps total bytes per superpacket. Mirrors the kernel's
// sk_gso_max_size of ~64KiB; anything beyond this would be rejected anyway,
// and 65535 is also the ceiling of the 16-bit IP length fields we patch at
// flush time.
const tcpCoalesceBufSize = 65535
// tcpCoalesceMaxSegs caps how many segments we'll coalesce into a single
// superpacket. Keeping this well below the kernel's TSO ceiling bounds
// latency.
const tcpCoalesceMaxSegs = 64
// tcpCoalesceHdrCap is the scratch space we copy a seed's IP+TCP header
// into. IPv6 (40) + TCP with full options (60) = 100 bytes.
const tcpCoalesceHdrCap = 100

// initialSlots is the starting capacity of the slot pool. One flow per
// packet is the worst case so this matches a typical UDP recvmmsg batch.
const initialSlots = 64
// flowKey identifies a TCP flow by {src, dst, sport, dport, family}.
// Comparable (fixed-size arrays and scalars only), so it can key the
// openSlots map and be compared with ==.
type flowKey struct {
	// src and dst hold the IP addresses; for IPv4, only the first 4 bytes
	// are populated and the rest stay zero.
	src, dst     [16]byte
	sport, dport uint16
	isV6         bool
}
// coalesceSlot is one entry in the coalescer's ordered event queue. When
// passthrough is true the slot holds a single borrowed packet that must be
// emitted verbatim (non-TCP, non-admissible TCP, or oversize seed). When
// passthrough is false the slot is an in-progress coalesced superpacket:
// hdrBuf is a mutable copy of the seed's IP+TCP header (we patch total
// length and pseudo-header partial at flush), and payIovs are *borrowed*
// slices from the caller's plaintext buffers — no payload is ever copied.
// The caller (listenOut) must keep those buffers alive until Flush.
type coalesceSlot struct {
	passthrough bool
	rawPkt      []byte // borrowed when passthrough
	fk          flowKey
	hdrBuf      [tcpCoalesceHdrCap]byte // copy of the seed's IP+TCP header
	hdrLen      int                     // valid bytes in hdrBuf
	ipHdrLen    int                     // IP header length within hdrBuf
	isV6        bool
	gsoSize     int    // payload bytes per segment (= seed's payload size)
	numSeg      int    // segments accumulated so far
	totalPay    int    // total borrowed payload bytes across payIovs
	nextSeq     uint32 // expected TCP seq of the next extending segment
	// psh closes the chain: set when the last-accepted segment had PSH or
	// was sub-gsoSize. No further appends after that.
	psh     bool
	payIovs [][]byte
}
// tcpCoalescer accumulates adjacent in-flow TCP data segments across
// multiple concurrent flows and emits each flow's run as a single TSO
// superpacket via overlay.GSOWriter. All output — coalesced or not — is
// deferred until Flush so arrival order is preserved on the wire. Owns
// no locks; one coalescer per TUN write queue.
type tcpCoalescer struct {
	plainW io.Writer
	gsoW   overlay.GSOWriter // nil when the queue doesn't support TSO

	// NOTE(review): the fields from buf through psh appear to be the
	// pre-refactor single-flow state, still referenced by the legacy
	// reset/seed/flushLocked paths in this file. They look superseded by
	// the slot-based fields below — confirm and remove together with
	// those paths.
	buf      []byte
	bufLen   int  // valid bytes in buf — hdrLen plus accumulated payload
	active   bool // a seed packet is present
	numSeg   int
	gsoSize  int // payload length of each segment (= MSS of the seed)
	isV6     bool
	ipHdrLen int
	hdrLen   int    // ipHdrLen + tcpHdrLen, the offset where payload starts
	nextSeq  uint32 // expected TCP seq of the next packet to coalesce
	// psh indicates the last-accepted segment had PSH set. We accept a PSH
	// packet as the final segment but reject any further Adds after that.
	psh bool

	// slots is the ordered event queue. Flush walks it once and emits each
	// entry as either a WriteGSO (coalesced) or a plainW.Write (passthrough).
	slots []*coalesceSlot
	// openSlots maps a flow key to its most recent non-sealed slot, so new
	// segments can extend an in-progress superpacket in O(1). Slots are
	// removed from this map when they close (PSH or short-last-segment),
	// when a non-admissible packet for that flow arrives, or in Flush.
	openSlots map[flowKey]*coalesceSlot
	pool      []*coalesceSlot // free list for reuse
}
// newTCPCoalescer wires a coalescer to w. If w also implements
// overlay.GSOWriter and reports TSO support, coalescing is enabled;
// otherwise every Add degrades to a queued passthrough write.
func newTCPCoalescer(w io.Writer) *tcpCoalescer {
	c := &tcpCoalescer{
		plainW: w,
		// NOTE(review): buf backs the legacy single-flow flush path; drop
		// this allocation once that path is removed.
		buf:       make([]byte, tcpCoalesceBufSize),
		slots:     make([]*coalesceSlot, 0, initialSlots),
		openSlots: make(map[flowKey]*coalesceSlot, initialSlots),
		pool:      make([]*coalesceSlot, 0, initialSlots),
	}
	if gw, ok := w.(overlay.GSOWriter); ok && gw.GSOSupported() {
		c.gsoW = gw
	}
	return c
}
// parsedTCP holds the fields extracted from a single parse so later steps
// (admission, slot lookup, canAppend) don't re-walk the header.
type parsedTCP struct {
	isV6      bool
	fk        flowKey
	ipHdrLen  int
	tcpHdrLen int
	hdrLen    int // ipHdrLen + tcpHdrLen
	payLen    int
	seq       uint32
	flags     byte
}
// parseCoalesceable decides whether pkt is eligible for TCP coalescing. It
// accepts IPv4 (no options, DF set, no fragmentation) and IPv6 (no
// extension headers) carrying a TCP segment with flags in {ACK, ACK|PSH}
// and a non-empty payload. On success it returns the parsed offsets.
func parseCoalesceable(pkt []byte) (parsedTCP, bool) {
// parseTCPBase extracts the flow key and IP/TCP offsets for any TCP packet,
// regardless of whether it's admissible for coalescing. Returns ok=false
// for non-TCP or malformed input. Accepts IPv4 (no options, no fragmentation)
// and IPv6 (no extension headers).
func parseTCPBase(pkt []byte) (parsedTCP, bool) {
var p parsedTCP
if len(pkt) < 20 {
return p, false
@@ -80,42 +122,41 @@ func parseCoalesceable(pkt []byte) (parsedTCP, bool) {
v := pkt[0] >> 4
switch v {
case 4:
if len(pkt) < 20 {
return p, false
}
ihl := int(pkt[0]&0x0f) * 4
if ihl != 20 {
return p, false // reject IP options
return p, false
}
if pkt[9] != ipProtoTCP {
return p, false
}
// Fragment check: MF=0 and frag offset=0. Accept DF=1 or DF=0 —
// just reject any actual fragmentation.
fragField := binary.BigEndian.Uint16(pkt[6:8])
if fragField&0x3fff != 0 {
// Reject actual fragmentation (MF or non-zero frag offset).
if binary.BigEndian.Uint16(pkt[6:8])&0x3fff != 0 {
return p, false
}
totalLen := int(binary.BigEndian.Uint16(pkt[2:4]))
if totalLen > len(pkt) || totalLen < ihl {
return p, false
}
p.isV6 = false
p.ipHdrLen = ihl
p.ipHdrLen = 20
p.fk.isV6 = false
copy(p.fk.src[:4], pkt[12:16])
copy(p.fk.dst[:4], pkt[16:20])
pkt = pkt[:totalLen]
case 6:
if len(pkt) < 40 {
return p, false
}
if pkt[6] != ipProtoTCP {
return p, false // reject ext headers
return p, false
}
payloadLen := int(binary.BigEndian.Uint16(pkt[4:6]))
if 40+payloadLen > len(pkt) {
return p, false
}
p.isV6 = true
p.ipHdrLen = 40
p.fk.isV6 = true
copy(p.fk.src[:], pkt[8:24])
copy(p.fk.dst[:], pkt[24:40])
pkt = pkt[:40+payloadLen]
default:
return p, false
@@ -131,146 +172,216 @@ func parseCoalesceable(pkt []byte) (parsedTCP, bool) {
if len(pkt) < p.ipHdrLen+tcpOff {
return p, false
}
flags := pkt[p.ipHdrLen+13]
// Allow only ACK and ACK|PSH. In particular: no SYN/FIN/RST/URG/CWR/ECE.
const ack = 0x10
const psh = 0x08
if flags&^(ack|psh) != 0 || flags&ack == 0 {
return p, false
}
p.tcpHdrLen = tcpOff
p.hdrLen = p.ipHdrLen + tcpOff
p.payLen = len(pkt) - p.hdrLen
if p.payLen <= 0 {
return p, false
}
p.seq = binary.BigEndian.Uint32(pkt[p.ipHdrLen+4 : p.ipHdrLen+8])
p.flags = flags
p.flags = pkt[p.ipHdrLen+13]
p.fk.sport = binary.BigEndian.Uint16(pkt[p.ipHdrLen : p.ipHdrLen+2])
p.fk.dport = binary.BigEndian.Uint16(pkt[p.ipHdrLen+2 : p.ipHdrLen+4])
return p, true
}
// coalesceable reports whether a parsed TCP segment is eligible for
// coalescing. Accepts only ACK or ACK|PSH with a non-empty payload —
// any other flag bit (SYN/FIN/RST/URG/CWR/ECE) disqualifies the segment.
func (p parsedTCP) coalesceable() bool {
	const ack = 0x10
	const psh = 0x08
	if p.flags&^(ack|psh) != 0 || p.flags&ack == 0 {
		return false
	}
	return p.payLen > 0
}
// Add borrows pkt. The caller must keep pkt valid until the next Flush,
// whether or not the packet was coalesced — passthrough (non-admissible)
// packets are queued and written at Flush time, not synchronously.
func (c *tcpCoalescer) Add(pkt []byte) error {
if c.gsoW == nil {
_, err := c.plainW.Write(pkt)
return err
c.addPassthrough(pkt)
return nil
}
info, ok := parseCoalesceable(pkt)
info, ok := parseTCPBase(pkt)
if !ok {
if c.active {
if err := c.flushLocked(); err != nil {
return err
}
}
_, err := c.plainW.Write(pkt)
return err
// Non-TCP or malformed — can't possibly collide with an open flow.
c.addPassthrough(pkt)
return nil
}
if !info.coalesceable() {
// TCP but not admissible (SYN/FIN/RST/URG/CWR/ECE or zero-payload).
// Seal this flow's open slot so later in-flow packets don't extend
// it and accidentally reorder past this passthrough.
delete(c.openSlots, info.fk)
c.addPassthrough(pkt)
return nil
}
if c.active {
if c.canAppend(pkt, info) {
c.appendPayload(pkt, info)
if info.flags&0x08 != 0 {
c.psh = true
if open := c.openSlots[info.fk]; open != nil {
if c.canAppend(open, pkt, info) {
c.appendPayload(open, pkt, info)
if open.psh {
delete(c.openSlots, info.fk)
}
return nil
}
if err := c.flushLocked(); err != nil {
return err
}
// Can't extend — seal it and fall through to seed a fresh slot.
delete(c.openSlots, info.fk)
}
return c.seed(pkt, info)
}
// Flush emits any pending superpacket. Called by the UDP read loop at
// recvmmsg batch boundaries — "no more packets coming right now".
//
// NOTE(review): this single-flow Flush operates on the legacy
// active/flushLocked state and duplicates the slot-based Flush defined
// later in this file (a type cannot have two methods named Flush) —
// this appears to be stale pre-refactor code; confirm and delete one.
func (c *tcpCoalescer) Flush() error {
	if !c.active {
		return nil
	}
	return c.flushLocked()
}
// reset clears the legacy single-flow coalescing state so the next seed
// starts from a blank slate. The backing storage of buf is retained.
//
// NOTE(review): only the legacy flushLocked path calls this; it goes away
// with that path.
func (c *tcpCoalescer) reset() {
	c.active = false
	c.bufLen = 0
	c.numSeg = 0
	c.gsoSize = 0
	c.hdrLen = 0
	c.ipHdrLen = 0
	c.nextSeq = 0
	c.psh = false
}
// seed copies pkt into the legacy single-flow buffer and records header
// geometry, the expected next sequence number, and PSH state.
//
// NOTE(review): this is the pre-slot seed (returns error, copies payload
// into c.buf); the slot-based seed defined later supersedes it, and the
// trailing `c.seed(pkt, info)` / `return nil` lines here look like stray
// diff residue from the new Add (as written they would recurse forever) —
// confirm and remove this whole function.
func (c *tcpCoalescer) seed(pkt []byte, info parsedTCP) error {
	if info.hdrLen+info.payLen > len(c.buf) {
		// Oversize single packet — flush (already done above) and passthrough.
		_, err := c.plainW.Write(pkt)
		return err
	}
	copy(c.buf, pkt[:info.hdrLen+info.payLen])
	c.active = true
	c.bufLen = info.hdrLen + info.payLen
	c.numSeg = 1
	c.gsoSize = info.payLen
	c.isV6 = info.isV6
	c.ipHdrLen = info.ipHdrLen
	c.hdrLen = info.hdrLen
	c.nextSeq = info.seq + uint32(info.payLen)
	c.psh = info.flags&0x08 != 0
	c.seed(pkt, info)
	return nil
}
// canAppend reports whether info's packet extends the current seed: same
// flow, adjacent seq, payload size rule, and no-PSH-mid-chain.
func (c *tcpCoalescer) canAppend(pkt []byte, info parsedTCP) bool {
if c.psh {
return false // we already accepted a PSH — chain is closed
// Flush emits every queued event in arrival order. Coalesced slots go out
// via WriteGSO; passthrough slots go out via plainW.Write. Returns the
// first error observed; keeps draining so one bad packet doesn't hold up
// the rest. After Flush returns, borrowed payload slices may be recycled.
func (c *tcpCoalescer) Flush() error {
var first error
for _, s := range c.slots {
var err error
if s.passthrough {
_, err = c.plainW.Write(s.rawPkt)
} else {
err = c.flushSlot(s)
}
if err != nil && first == nil {
first = err
}
c.release(s)
}
if info.isV6 != c.isV6 {
for i := range c.slots {
c.slots[i] = nil
}
c.slots = c.slots[:0]
for k := range c.openSlots {
delete(c.openSlots, k)
}
return first
}
// addPassthrough queues pkt (borrowed, not copied) to be written verbatim
// through plainW at the next Flush, preserving its position in arrival
// order relative to coalesced superpackets.
func (c *tcpCoalescer) addPassthrough(pkt []byte) {
	slot := c.take()
	slot.passthrough = true
	slot.rawPkt = pkt
	c.slots = append(c.slots, slot)
}
// seed opens a fresh coalescing slot from pkt: the IP+TCP header is copied
// into the slot's scratch buffer and the payload is borrowed as the first
// iov. A packet whose header exceeds the scratch capacity or whose total
// size exceeds the superpacket budget is queued as passthrough instead.
// A seed carrying PSH is sealed on the spot and never registered as
// extendable in openSlots.
func (c *tcpCoalescer) seed(pkt []byte, info parsedTCP) {
	tooBig := info.hdrLen > tcpCoalesceHdrCap ||
		info.hdrLen+info.payLen > tcpCoalesceBufSize
	if tooBig {
		// Pathological shape — can't fit our scratch, emit as-is.
		c.addPassthrough(pkt)
		return
	}
	slot := c.take()
	slot.passthrough = false
	slot.rawPkt = nil
	copy(slot.hdrBuf[:], pkt[:info.hdrLen])
	slot.hdrLen = info.hdrLen
	slot.ipHdrLen = info.ipHdrLen
	slot.isV6 = info.fk.isV6
	slot.fk = info.fk
	slot.gsoSize = info.payLen
	slot.numSeg = 1
	slot.totalPay = info.payLen
	slot.nextSeq = info.seq + uint32(info.payLen)
	slot.psh = info.flags&0x08 != 0
	slot.payIovs = append(slot.payIovs[:0], pkt[info.hdrLen:info.hdrLen+info.payLen])
	c.slots = append(c.slots, slot)
	if !slot.psh {
		c.openSlots[info.fk] = slot
	}
}
// canAppend reports whether info's packet extends the slot's seed: same
// header shape and stable contents, adjacent seq, not oversized, chain not
// closed.
func (c *tcpCoalescer) canAppend(s *coalesceSlot, pkt []byte, info parsedTCP) bool {
if s.psh {
return false
}
if info.hdrLen != c.hdrLen {
if info.hdrLen != s.hdrLen {
return false
}
if info.seq != c.nextSeq {
if info.seq != s.nextSeq {
return false
}
if c.numSeg >= tcpCoalesceMaxSegs {
if s.numSeg >= tcpCoalesceMaxSegs {
return false
}
if c.bufLen+info.payLen > len(c.buf) {
if info.payLen > s.gsoSize {
return false
}
// Every mid-chain segment must be exactly gsoSize. The final segment may
// be shorter, but once a short segment is appended we can't add another.
if info.payLen > c.gsoSize {
if s.hdrLen+s.totalPay+info.payLen > tcpCoalesceBufSize {
return false
}
if info.payLen < c.gsoSize {
// Will become the last segment — always OK to append, just no more.
}
// Compare the stable parts of the header.
if !headersMatch(c.buf[:c.hdrLen], pkt[:info.hdrLen], c.isV6, c.ipHdrLen) {
if !headersMatch(s.hdrBuf[:s.hdrLen], pkt[:info.hdrLen], s.isV6, s.ipHdrLen) {
return false
}
return true
}
func (c *tcpCoalescer) appendPayload(pkt []byte, info parsedTCP) {
copy(c.buf[c.bufLen:], pkt[info.hdrLen:info.hdrLen+info.payLen])
c.bufLen += info.payLen
c.numSeg++
c.nextSeq = info.seq + uint32(info.payLen)
// If this was a sub-gsoSize last segment, mark chain as closed.
if info.payLen < c.gsoSize {
c.psh = true
func (c *tcpCoalescer) appendPayload(s *coalesceSlot, pkt []byte, info parsedTCP) {
s.payIovs = append(s.payIovs, pkt[info.hdrLen:info.hdrLen+info.payLen])
s.numSeg++
s.totalPay += info.payLen
s.nextSeq = info.seq + uint32(info.payLen)
if info.payLen < s.gsoSize || info.flags&0x08 != 0 {
s.psh = true
}
}
// take pops a slot from the free list, or allocates a fresh zero-value
// slot when the pool is empty. The vacated pool entry is nilled so the
// retained backing array doesn't pin the slot.
func (c *tcpCoalescer) take() *coalesceSlot {
	last := len(c.pool) - 1
	if last < 0 {
		return &coalesceSlot{}
	}
	slot := c.pool[last]
	c.pool[last] = nil
	c.pool = c.pool[:last]
	return slot
}
// release scrubs every borrowed reference out of s (so recycled slots
// never pin caller buffers past Flush) and parks it on the free list for
// a later take. Header geometry fields are left as-is; seed overwrites
// them all on reuse.
func (c *tcpCoalescer) release(s *coalesceSlot) {
	s.passthrough = false
	s.rawPkt = nil
	iovs := s.payIovs
	for i := 0; i < len(iovs); i++ {
		iovs[i] = nil
	}
	s.payIovs = iovs[:0]
	s.numSeg = 0
	s.totalPay = 0
	s.psh = false
	c.pool = append(c.pool, s)
}
// flushSlot patches the header and calls WriteGSO. Does not remove the
// slot from c.slots.
func (c *tcpCoalescer) flushSlot(s *coalesceSlot) error {
	total := s.hdrLen + s.totalPay
	l4Len := total - s.ipHdrLen
	hdr := s.hdrBuf[:s.hdrLen]
	if s.isV6 {
		// IPv6 payload-length field excludes the 40-byte fixed header.
		binary.BigEndian.PutUint16(hdr[4:6], uint16(l4Len))
	} else {
		// IPv4 total-length includes the header; recompute the header
		// checksum over the zeroed checksum field.
		binary.BigEndian.PutUint16(hdr[2:4], uint16(total))
		hdr[10] = 0
		hdr[11] = 0
		binary.BigEndian.PutUint16(hdr[10:12], ipv4HdrChecksum(hdr[:s.ipHdrLen]))
	}
	// Store the pseudo-header partial sum in the TCP checksum field so the
	// device finishes the checksum per segment (virtio NEEDS_CSUM style).
	var psum uint32
	if s.isV6 {
		psum = pseudoSumIPv6(hdr[8:24], hdr[24:40], ipProtoTCP, l4Len)
	} else {
		psum = pseudoSumIPv4(hdr[12:16], hdr[16:20], ipProtoTCP, l4Len)
	}
	// TCP checksum field sits 16 bytes into the TCP header.
	tcsum := s.ipHdrLen + 16
	binary.BigEndian.PutUint16(hdr[tcsum:tcsum+2], foldOnceNoInvert(psum))
	return c.gsoW.WriteGSO(hdr, s.payIovs, uint16(s.gsoSize), s.isV6, uint16(s.ipHdrLen))
}
// headersMatch compares two IP+TCP header prefixes for byte-for-byte
// equality on every field that must be identical across coalesced
// segments. Size/IPID/IPCsum/seq/flags/tcpCsum are masked out.
@@ -330,58 +441,6 @@ func bytesEq(a, b []byte) bool {
return true
}
// flushLocked emits the legacy single-flow buffer: single segments go out
// through plainW, multi-segment runs through WriteGSO after patching the
// IP length and TCP pseudo-header partial in place.
//
// NOTE(review): this is the pre-slot flush path (note its 5-argument
// WriteGSO call vs flushSlot's iov-based call — both cannot satisfy the
// same overlay.GSOWriter interface). Appears superseded by Flush/flushSlot;
// confirm and remove together with reset/the old seed.
func (c *tcpCoalescer) flushLocked() error {
	// Guarantee the coalescer is empty on exit regardless of how we leave.
	defer c.reset()
	if c.numSeg <= 1 {
		_, err := c.plainW.Write(c.buf[:c.bufLen])
		return err
	}
	total := c.bufLen
	l4Len := total - c.ipHdrLen
	// Fix IP header length field.
	if c.isV6 {
		if l4Len > 0xffff {
			// Shouldn't happen given buffer size, but guard against it.
			return c.flushAsPerSegment()
		}
		binary.BigEndian.PutUint16(c.buf[4:6], uint16(l4Len))
	} else {
		if total > 0xffff {
			return c.flushAsPerSegment()
		}
		binary.BigEndian.PutUint16(c.buf[2:4], uint16(total))
		// Recompute IPv4 header checksum.
		c.buf[10] = 0
		c.buf[11] = 0
		binary.BigEndian.PutUint16(c.buf[10:12], ipv4HdrChecksum(c.buf[:c.ipHdrLen]))
	}
	// Write the virtio NEEDS_CSUM pseudo-header partial into the TCP csum field.
	var psum uint32
	if c.isV6 {
		psum = pseudoSumIPv6(c.buf[8:24], c.buf[24:40], ipProtoTCP, l4Len)
	} else {
		psum = pseudoSumIPv4(c.buf[12:16], c.buf[16:20], ipProtoTCP, l4Len)
	}
	tcsum := c.ipHdrLen + 16
	binary.BigEndian.PutUint16(c.buf[tcsum:tcsum+2], foldOnceNoInvert(psum))
	return c.gsoW.WriteGSO(c.buf[:total], uint16(c.gsoSize), c.isV6, uint16(c.hdrLen), uint16(c.ipHdrLen))
}
// flushAsPerSegment is a defensive fallback used if the coalesced superpacket
// somehow exceeds 16-bit length fields. It writes the packet as-is through
// the plain writer (the kernel will reject it, but that's a visible error
// rather than silent corruption).
func (c *tcpCoalescer) flushAsPerSegment() error {
	pending := c.buf[:c.bufLen]
	_, err := c.plainW.Write(pending)
	return err
}
// ipv4HdrChecksum computes the IPv4 header checksum over hdr (which must
// already have its checksum field zeroed) and returns the folded/inverted
// 16-bit value to store.