holy crap 2x

This commit is contained in:
JackDoan
2026-04-17 15:33:46 -05:00
parent 1fd24a19c7
commit 60e556866a
5 changed files with 643 additions and 307 deletions

View File

@@ -7,72 +7,114 @@ import (
"github.com/slackhq/nebula/overlay"
)
// ipProtoTCP is the IANA protocol number for TCP. Hardcoded instead of
// reaching for golang.org/x/sys/unix — that package doesn't define the
// constant on Windows, which would break cross-compiles even though this
// file runs unchanged on every platform.
const ipProtoTCP = 6
// tcpCoalesceBufSize caps total bytes per superpacket. Mirrors the kernel's
// sk_gso_max_size of ~64KiB; anything beyond this would be rejected anyway,
// and 65535 is also the ceiling of the 16-bit IP length fields we patch at
// flush time.
const tcpCoalesceBufSize = 65535
// tcpCoalesceMaxSegs caps how many segments we'll coalesce into a single
// superpacket. Keeping this well below the kernel's TSO ceiling bounds
// latency.
const tcpCoalesceMaxSegs = 64
// tcpCoalesceHdrCap is the scratch space we copy a seed's IP+TCP header
// into. IPv6 (40) + TCP with full options (60) = 100 bytes.
const tcpCoalesceHdrCap = 100

// initialSlots is the starting capacity of the slot pool. One flow per
// packet is the worst case so this matches a typical UDP recvmmsg batch.
const initialSlots = 64
// flowKey identifies a TCP flow by {src, dst, sport, dport, family}.
// Comparable (fixed-size arrays and scalars only), so it can key the
// openSlots map and be compared with ==.
type flowKey struct {
	// src and dst hold the IP addresses; for IPv4, only the first 4 bytes
	// are populated and the rest stay zero.
	src, dst     [16]byte
	sport, dport uint16
	isV6         bool
}
// coalesceSlot is one entry in the coalescer's ordered event queue. When
// passthrough is true the slot holds a single borrowed packet that must be
// emitted verbatim (non-TCP, non-admissible TCP, or oversize seed). When
// passthrough is false the slot is an in-progress coalesced superpacket:
// hdrBuf is a mutable copy of the seed's IP+TCP header (we patch total
// length and pseudo-header partial at flush), and payIovs are *borrowed*
// slices from the caller's plaintext buffers — no payload is ever copied.
// The caller (listenOut) must keep those buffers alive until Flush.
type coalesceSlot struct {
	passthrough bool
	rawPkt      []byte // borrowed when passthrough
	fk          flowKey
	hdrBuf      [tcpCoalesceHdrCap]byte // copy of the seed's IP+TCP header
	hdrLen      int                     // valid bytes in hdrBuf
	ipHdrLen    int                     // IP header length within hdrBuf
	isV6        bool
	gsoSize     int    // payload bytes per segment (= seed's payload size)
	numSeg      int    // segments accumulated so far
	totalPay    int    // total borrowed payload bytes across payIovs
	nextSeq     uint32 // expected TCP seq of the next extending segment
	// psh closes the chain: set when the last-accepted segment had PSH or
	// was sub-gsoSize. No further appends after that.
	psh     bool
	payIovs [][]byte
}
// tcpCoalescer accumulates adjacent in-flow TCP data segments across
// multiple concurrent flows and emits each flow's run as a single TSO
// superpacket via overlay.GSOWriter. All output — coalesced or not — is
// deferred until Flush so arrival order is preserved on the wire. Owns
// no locks; one coalescer per TUN write queue.
type tcpCoalescer struct {
	plainW io.Writer
	gsoW   overlay.GSOWriter // nil when the queue doesn't support TSO

	// NOTE(review): the fields from buf through psh appear to be the
	// pre-refactor single-flow state, still referenced by the legacy
	// reset/seed/flushLocked paths in this file. They look superseded by
	// the slot-based fields below — confirm and remove together with
	// those paths.
	buf      []byte
	bufLen   int  // valid bytes in buf — hdrLen plus accumulated payload
	active   bool // a seed packet is present
	numSeg   int
	gsoSize  int // payload length of each segment (= MSS of the seed)
	isV6     bool
	ipHdrLen int
	hdrLen   int    // ipHdrLen + tcpHdrLen, the offset where payload starts
	nextSeq  uint32 // expected TCP seq of the next packet to coalesce
	// psh indicates the last-accepted segment had PSH set. We accept a PSH
	// packet as the final segment but reject any further Adds after that.
	psh bool

	// slots is the ordered event queue. Flush walks it once and emits each
	// entry as either a WriteGSO (coalesced) or a plainW.Write (passthrough).
	slots []*coalesceSlot
	// openSlots maps a flow key to its most recent non-sealed slot, so new
	// segments can extend an in-progress superpacket in O(1). Slots are
	// removed from this map when they close (PSH or short-last-segment),
	// when a non-admissible packet for that flow arrives, or in Flush.
	openSlots map[flowKey]*coalesceSlot
	pool      []*coalesceSlot // free list for reuse
}
// newTCPCoalescer wires a coalescer to w. If w also implements
// overlay.GSOWriter and reports TSO support, coalescing is enabled;
// otherwise every Add degrades to a queued passthrough write.
func newTCPCoalescer(w io.Writer) *tcpCoalescer {
	c := &tcpCoalescer{
		plainW: w,
		// NOTE(review): buf backs the legacy single-flow flush path; drop
		// this allocation once that path is removed.
		buf:       make([]byte, tcpCoalesceBufSize),
		slots:     make([]*coalesceSlot, 0, initialSlots),
		openSlots: make(map[flowKey]*coalesceSlot, initialSlots),
		pool:      make([]*coalesceSlot, 0, initialSlots),
	}
	if gw, ok := w.(overlay.GSOWriter); ok && gw.GSOSupported() {
		c.gsoW = gw
	}
	return c
}
// parsedTCP holds the fields extracted from a single parse so later steps
// (admission, slot lookup, canAppend) don't re-walk the header.
type parsedTCP struct {
	isV6      bool
	fk        flowKey
	ipHdrLen  int
	tcpHdrLen int
	hdrLen    int // ipHdrLen + tcpHdrLen
	payLen    int
	seq       uint32
	flags     byte
}
// parseCoalesceable decides whether pkt is eligible for TCP coalescing. It
// accepts IPv4 (no options, DF set, no fragmentation) and IPv6 (no
// extension headers) carrying a TCP segment with flags in {ACK, ACK|PSH}
// and a non-empty payload. On success it returns the parsed offsets.
func parseCoalesceable(pkt []byte) (parsedTCP, bool) {
// parseTCPBase extracts the flow key and IP/TCP offsets for any TCP packet,
// regardless of whether it's admissible for coalescing. Returns ok=false
// for non-TCP or malformed input. Accepts IPv4 (no options, no fragmentation)
// and IPv6 (no extension headers).
func parseTCPBase(pkt []byte) (parsedTCP, bool) {
var p parsedTCP
if len(pkt) < 20 {
return p, false
@@ -80,42 +122,41 @@ func parseCoalesceable(pkt []byte) (parsedTCP, bool) {
v := pkt[0] >> 4
switch v {
case 4:
if len(pkt) < 20 {
return p, false
}
ihl := int(pkt[0]&0x0f) * 4
if ihl != 20 {
return p, false // reject IP options
return p, false
}
if pkt[9] != ipProtoTCP {
return p, false
}
// Fragment check: MF=0 and frag offset=0. Accept DF=1 or DF=0 —
// just reject any actual fragmentation.
fragField := binary.BigEndian.Uint16(pkt[6:8])
if fragField&0x3fff != 0 {
// Reject actual fragmentation (MF or non-zero frag offset).
if binary.BigEndian.Uint16(pkt[6:8])&0x3fff != 0 {
return p, false
}
totalLen := int(binary.BigEndian.Uint16(pkt[2:4]))
if totalLen > len(pkt) || totalLen < ihl {
return p, false
}
p.isV6 = false
p.ipHdrLen = ihl
p.ipHdrLen = 20
p.fk.isV6 = false
copy(p.fk.src[:4], pkt[12:16])
copy(p.fk.dst[:4], pkt[16:20])
pkt = pkt[:totalLen]
case 6:
if len(pkt) < 40 {
return p, false
}
if pkt[6] != ipProtoTCP {
return p, false // reject ext headers
return p, false
}
payloadLen := int(binary.BigEndian.Uint16(pkt[4:6]))
if 40+payloadLen > len(pkt) {
return p, false
}
p.isV6 = true
p.ipHdrLen = 40
p.fk.isV6 = true
copy(p.fk.src[:], pkt[8:24])
copy(p.fk.dst[:], pkt[24:40])
pkt = pkt[:40+payloadLen]
default:
return p, false
@@ -131,146 +172,216 @@ func parseCoalesceable(pkt []byte) (parsedTCP, bool) {
if len(pkt) < p.ipHdrLen+tcpOff {
return p, false
}
flags := pkt[p.ipHdrLen+13]
// Allow only ACK and ACK|PSH. In particular: no SYN/FIN/RST/URG/CWR/ECE.
const ack = 0x10
const psh = 0x08
if flags&^(ack|psh) != 0 || flags&ack == 0 {
return p, false
}
p.tcpHdrLen = tcpOff
p.hdrLen = p.ipHdrLen + tcpOff
p.payLen = len(pkt) - p.hdrLen
if p.payLen <= 0 {
return p, false
}
p.seq = binary.BigEndian.Uint32(pkt[p.ipHdrLen+4 : p.ipHdrLen+8])
p.flags = flags
p.flags = pkt[p.ipHdrLen+13]
p.fk.sport = binary.BigEndian.Uint16(pkt[p.ipHdrLen : p.ipHdrLen+2])
p.fk.dport = binary.BigEndian.Uint16(pkt[p.ipHdrLen+2 : p.ipHdrLen+4])
return p, true
}
// coalesceable reports whether a parsed TCP segment is eligible for
// coalescing. Accepts only ACK or ACK|PSH with a non-empty payload —
// any other flag bit (SYN/FIN/RST/URG/CWR/ECE) disqualifies the segment.
func (p parsedTCP) coalesceable() bool {
	const ack = 0x10
	const psh = 0x08
	if p.flags&^(ack|psh) != 0 || p.flags&ack == 0 {
		return false
	}
	return p.payLen > 0
}
// Add borrows pkt. The caller must keep pkt valid until the next Flush,
// whether or not the packet was coalesced — passthrough (non-admissible)
// packets are queued and written at Flush time, not synchronously.
func (c *tcpCoalescer) Add(pkt []byte) error {
if c.gsoW == nil {
_, err := c.plainW.Write(pkt)
return err
c.addPassthrough(pkt)
return nil
}
info, ok := parseCoalesceable(pkt)
info, ok := parseTCPBase(pkt)
if !ok {
if c.active {
if err := c.flushLocked(); err != nil {
return err
}
}
_, err := c.plainW.Write(pkt)
return err
// Non-TCP or malformed — can't possibly collide with an open flow.
c.addPassthrough(pkt)
return nil
}
if !info.coalesceable() {
// TCP but not admissible (SYN/FIN/RST/URG/CWR/ECE or zero-payload).
// Seal this flow's open slot so later in-flow packets don't extend
// it and accidentally reorder past this passthrough.
delete(c.openSlots, info.fk)
c.addPassthrough(pkt)
return nil
}
if c.active {
if c.canAppend(pkt, info) {
c.appendPayload(pkt, info)
if info.flags&0x08 != 0 {
c.psh = true
if open := c.openSlots[info.fk]; open != nil {
if c.canAppend(open, pkt, info) {
c.appendPayload(open, pkt, info)
if open.psh {
delete(c.openSlots, info.fk)
}
return nil
}
if err := c.flushLocked(); err != nil {
return err
}
// Can't extend — seal it and fall through to seed a fresh slot.
delete(c.openSlots, info.fk)
}
return c.seed(pkt, info)
}
// Flush emits any pending superpacket. Called by the UDP read loop at
// recvmmsg batch boundaries — "no more packets coming right now".
//
// NOTE(review): this single-flow Flush operates on the legacy
// active/flushLocked state and duplicates the slot-based Flush defined
// later in this file (a type cannot have two methods named Flush) —
// this appears to be stale pre-refactor code; confirm and delete one.
func (c *tcpCoalescer) Flush() error {
	if !c.active {
		return nil
	}
	return c.flushLocked()
}
// reset clears the legacy single-flow coalescing state so the next seed
// starts from a blank slate. The backing storage of buf is retained.
//
// NOTE(review): only the legacy flushLocked path calls this; it goes away
// with that path.
func (c *tcpCoalescer) reset() {
	c.active = false
	c.bufLen = 0
	c.numSeg = 0
	c.gsoSize = 0
	c.hdrLen = 0
	c.ipHdrLen = 0
	c.nextSeq = 0
	c.psh = false
}
// seed copies pkt into the legacy single-flow buffer and records header
// geometry, the expected next sequence number, and PSH state.
//
// NOTE(review): this is the pre-slot seed (returns error, copies payload
// into c.buf); the slot-based seed defined later supersedes it, and the
// trailing `c.seed(pkt, info)` / `return nil` lines here look like stray
// diff residue from the new Add (as written they would recurse forever) —
// confirm and remove this whole function.
func (c *tcpCoalescer) seed(pkt []byte, info parsedTCP) error {
	if info.hdrLen+info.payLen > len(c.buf) {
		// Oversize single packet — flush (already done above) and passthrough.
		_, err := c.plainW.Write(pkt)
		return err
	}
	copy(c.buf, pkt[:info.hdrLen+info.payLen])
	c.active = true
	c.bufLen = info.hdrLen + info.payLen
	c.numSeg = 1
	c.gsoSize = info.payLen
	c.isV6 = info.isV6
	c.ipHdrLen = info.ipHdrLen
	c.hdrLen = info.hdrLen
	c.nextSeq = info.seq + uint32(info.payLen)
	c.psh = info.flags&0x08 != 0
	c.seed(pkt, info)
	return nil
}
// canAppend reports whether info's packet extends the current seed: same
// flow, adjacent seq, payload size rule, and no-PSH-mid-chain.
func (c *tcpCoalescer) canAppend(pkt []byte, info parsedTCP) bool {
if c.psh {
return false // we already accepted a PSH — chain is closed
// Flush emits every queued event in arrival order. Coalesced slots go out
// via WriteGSO; passthrough slots go out via plainW.Write. Returns the
// first error observed; keeps draining so one bad packet doesn't hold up
// the rest. After Flush returns, borrowed payload slices may be recycled.
func (c *tcpCoalescer) Flush() error {
var first error
for _, s := range c.slots {
var err error
if s.passthrough {
_, err = c.plainW.Write(s.rawPkt)
} else {
err = c.flushSlot(s)
}
if err != nil && first == nil {
first = err
}
c.release(s)
}
if info.isV6 != c.isV6 {
for i := range c.slots {
c.slots[i] = nil
}
c.slots = c.slots[:0]
for k := range c.openSlots {
delete(c.openSlots, k)
}
return first
}
// addPassthrough queues pkt (borrowed, not copied) to be written verbatim
// through plainW at the next Flush, preserving its position in arrival
// order relative to coalesced superpackets.
func (c *tcpCoalescer) addPassthrough(pkt []byte) {
	slot := c.take()
	slot.passthrough = true
	slot.rawPkt = pkt
	c.slots = append(c.slots, slot)
}
// seed opens a fresh coalescing slot from pkt: the IP+TCP header is copied
// into the slot's scratch buffer and the payload is borrowed as the first
// iov. A packet whose header exceeds the scratch capacity or whose total
// size exceeds the superpacket budget is queued as passthrough instead.
// A seed carrying PSH is sealed on the spot and never registered as
// extendable in openSlots.
func (c *tcpCoalescer) seed(pkt []byte, info parsedTCP) {
	tooBig := info.hdrLen > tcpCoalesceHdrCap ||
		info.hdrLen+info.payLen > tcpCoalesceBufSize
	if tooBig {
		// Pathological shape — can't fit our scratch, emit as-is.
		c.addPassthrough(pkt)
		return
	}
	slot := c.take()
	slot.passthrough = false
	slot.rawPkt = nil
	copy(slot.hdrBuf[:], pkt[:info.hdrLen])
	slot.hdrLen = info.hdrLen
	slot.ipHdrLen = info.ipHdrLen
	slot.isV6 = info.fk.isV6
	slot.fk = info.fk
	slot.gsoSize = info.payLen
	slot.numSeg = 1
	slot.totalPay = info.payLen
	slot.nextSeq = info.seq + uint32(info.payLen)
	slot.psh = info.flags&0x08 != 0
	slot.payIovs = append(slot.payIovs[:0], pkt[info.hdrLen:info.hdrLen+info.payLen])
	c.slots = append(c.slots, slot)
	if !slot.psh {
		c.openSlots[info.fk] = slot
	}
}
// canAppend reports whether info's packet extends the slot's seed: same
// header shape and stable contents, adjacent seq, not oversized, chain not
// closed.
func (c *tcpCoalescer) canAppend(s *coalesceSlot, pkt []byte, info parsedTCP) bool {
if s.psh {
return false
}
if info.hdrLen != c.hdrLen {
if info.hdrLen != s.hdrLen {
return false
}
if info.seq != c.nextSeq {
if info.seq != s.nextSeq {
return false
}
if c.numSeg >= tcpCoalesceMaxSegs {
if s.numSeg >= tcpCoalesceMaxSegs {
return false
}
if c.bufLen+info.payLen > len(c.buf) {
if info.payLen > s.gsoSize {
return false
}
// Every mid-chain segment must be exactly gsoSize. The final segment may
// be shorter, but once a short segment is appended we can't add another.
if info.payLen > c.gsoSize {
if s.hdrLen+s.totalPay+info.payLen > tcpCoalesceBufSize {
return false
}
if info.payLen < c.gsoSize {
// Will become the last segment — always OK to append, just no more.
}
// Compare the stable parts of the header.
if !headersMatch(c.buf[:c.hdrLen], pkt[:info.hdrLen], c.isV6, c.ipHdrLen) {
if !headersMatch(s.hdrBuf[:s.hdrLen], pkt[:info.hdrLen], s.isV6, s.ipHdrLen) {
return false
}
return true
}
func (c *tcpCoalescer) appendPayload(pkt []byte, info parsedTCP) {
copy(c.buf[c.bufLen:], pkt[info.hdrLen:info.hdrLen+info.payLen])
c.bufLen += info.payLen
c.numSeg++
c.nextSeq = info.seq + uint32(info.payLen)
// If this was a sub-gsoSize last segment, mark chain as closed.
if info.payLen < c.gsoSize {
c.psh = true
func (c *tcpCoalescer) appendPayload(s *coalesceSlot, pkt []byte, info parsedTCP) {
s.payIovs = append(s.payIovs, pkt[info.hdrLen:info.hdrLen+info.payLen])
s.numSeg++
s.totalPay += info.payLen
s.nextSeq = info.seq + uint32(info.payLen)
if info.payLen < s.gsoSize || info.flags&0x08 != 0 {
s.psh = true
}
}
// take pops a slot from the free list, or allocates a fresh zero-value
// slot when the pool is empty. The vacated pool entry is nilled so the
// retained backing array doesn't pin the slot.
func (c *tcpCoalescer) take() *coalesceSlot {
	last := len(c.pool) - 1
	if last < 0 {
		return &coalesceSlot{}
	}
	slot := c.pool[last]
	c.pool[last] = nil
	c.pool = c.pool[:last]
	return slot
}
// release scrubs every borrowed reference out of s (so recycled slots
// never pin caller buffers past Flush) and parks it on the free list for
// a later take. Header geometry fields are left as-is; seed overwrites
// them all on reuse.
func (c *tcpCoalescer) release(s *coalesceSlot) {
	s.passthrough = false
	s.rawPkt = nil
	iovs := s.payIovs
	for i := 0; i < len(iovs); i++ {
		iovs[i] = nil
	}
	s.payIovs = iovs[:0]
	s.numSeg = 0
	s.totalPay = 0
	s.psh = false
	c.pool = append(c.pool, s)
}
// flushSlot patches the header and calls WriteGSO. Does not remove the
// slot from c.slots.
func (c *tcpCoalescer) flushSlot(s *coalesceSlot) error {
	total := s.hdrLen + s.totalPay
	l4Len := total - s.ipHdrLen
	hdr := s.hdrBuf[:s.hdrLen]
	if s.isV6 {
		// IPv6 payload-length field excludes the 40-byte fixed header.
		binary.BigEndian.PutUint16(hdr[4:6], uint16(l4Len))
	} else {
		// IPv4 total-length includes the header; recompute the header
		// checksum over the zeroed checksum field.
		binary.BigEndian.PutUint16(hdr[2:4], uint16(total))
		hdr[10] = 0
		hdr[11] = 0
		binary.BigEndian.PutUint16(hdr[10:12], ipv4HdrChecksum(hdr[:s.ipHdrLen]))
	}
	// Store the pseudo-header partial sum in the TCP checksum field so the
	// device finishes the checksum per segment (virtio NEEDS_CSUM style).
	var psum uint32
	if s.isV6 {
		psum = pseudoSumIPv6(hdr[8:24], hdr[24:40], ipProtoTCP, l4Len)
	} else {
		psum = pseudoSumIPv4(hdr[12:16], hdr[16:20], ipProtoTCP, l4Len)
	}
	// TCP checksum field sits 16 bytes into the TCP header.
	tcsum := s.ipHdrLen + 16
	binary.BigEndian.PutUint16(hdr[tcsum:tcsum+2], foldOnceNoInvert(psum))
	return c.gsoW.WriteGSO(hdr, s.payIovs, uint16(s.gsoSize), s.isV6, uint16(s.ipHdrLen))
}
// headersMatch compares two IP+TCP header prefixes for byte-for-byte
// equality on every field that must be identical across coalesced
// segments. Size/IPID/IPCsum/seq/flags/tcpCsum are masked out.
@@ -330,58 +441,6 @@ func bytesEq(a, b []byte) bool {
return true
}
// flushLocked emits the legacy single-flow buffer: single segments go out
// through plainW, multi-segment runs through WriteGSO after patching the
// IP length and TCP pseudo-header partial in place.
//
// NOTE(review): this is the pre-slot flush path (note its 5-argument
// WriteGSO call vs flushSlot's iov-based call — both cannot satisfy the
// same overlay.GSOWriter interface). Appears superseded by Flush/flushSlot;
// confirm and remove together with reset/the old seed.
func (c *tcpCoalescer) flushLocked() error {
	// Guarantee the coalescer is empty on exit regardless of how we leave.
	defer c.reset()
	if c.numSeg <= 1 {
		_, err := c.plainW.Write(c.buf[:c.bufLen])
		return err
	}
	total := c.bufLen
	l4Len := total - c.ipHdrLen
	// Fix IP header length field.
	if c.isV6 {
		if l4Len > 0xffff {
			// Shouldn't happen given buffer size, but guard against it.
			return c.flushAsPerSegment()
		}
		binary.BigEndian.PutUint16(c.buf[4:6], uint16(l4Len))
	} else {
		if total > 0xffff {
			return c.flushAsPerSegment()
		}
		binary.BigEndian.PutUint16(c.buf[2:4], uint16(total))
		// Recompute IPv4 header checksum.
		c.buf[10] = 0
		c.buf[11] = 0
		binary.BigEndian.PutUint16(c.buf[10:12], ipv4HdrChecksum(c.buf[:c.ipHdrLen]))
	}
	// Write the virtio NEEDS_CSUM pseudo-header partial into the TCP csum field.
	var psum uint32
	if c.isV6 {
		psum = pseudoSumIPv6(c.buf[8:24], c.buf[24:40], ipProtoTCP, l4Len)
	} else {
		psum = pseudoSumIPv4(c.buf[12:16], c.buf[16:20], ipProtoTCP, l4Len)
	}
	tcsum := c.ipHdrLen + 16
	binary.BigEndian.PutUint16(c.buf[tcsum:tcsum+2], foldOnceNoInvert(psum))
	return c.gsoW.WriteGSO(c.buf[:total], uint16(c.gsoSize), c.isV6, uint16(c.hdrLen), uint16(c.ipHdrLen))
}
// flushAsPerSegment is a defensive fallback used if the coalesced superpacket
// somehow exceeds 16-bit length fields. It writes the packet as-is through
// the plain writer (the kernel will reject it, but that's a visible error
// rather than silent corruption).
func (c *tcpCoalescer) flushAsPerSegment() error {
	pending := c.buf[:c.bufLen]
	_, err := c.plainW.Write(pending)
	return err
}
// ipv4HdrChecksum computes the IPv4 header checksum over hdr (which must
// already have its checksum field zeroed) and returns the folded/inverted
// 16-bit value to store.