mirror of
https://github.com/slackhq/nebula.git
synced 2026-05-16 04:47:38 +02:00
holy crap 2x
This commit is contained in:
461
tcp_coalesce.go
461
tcp_coalesce.go
@@ -7,72 +7,114 @@ import (
|
||||
"github.com/slackhq/nebula/overlay"
|
||||
)
|
||||
|
||||
// IPPROTO_TCP is the IANA protocol number for TCP. Hardcoded instead of
|
||||
// reaching for ipProtoTCP because golang.org/x/sys/unix doesn't
|
||||
// define that constant on Windows, which would break cross-compiles even
|
||||
// though this file runs unchanged on every platform.
|
||||
// ipProtoTCP is the IANA protocol number for TCP. Hardcoded instead of
|
||||
// reaching for golang.org/x/sys/unix — that package doesn't define the
|
||||
// constant on Windows, which would break cross-compiles even though this
|
||||
// file runs unchanged on every platform.
|
||||
const ipProtoTCP = 6
|
||||
|
||||
// tcpCoalesceBufSize bounds the largest coalesced superpacket we will buffer.
|
||||
// Linux caps sk_gso_max_size around 64KiB; 65535 bytes covers IP hdr + TCP
|
||||
// hdr + up to ~65KB of payload, which is the most the kernel's TSO can
|
||||
// segment in one shot.
|
||||
// tcpCoalesceBufSize caps total bytes per superpacket. Mirrors the kernel's
|
||||
// sk_gso_max_size of ~64KiB; anything beyond this would be rejected anyway.
|
||||
const tcpCoalesceBufSize = 65535
|
||||
|
||||
// tcpCoalesceMaxSegs caps how many segments we are willing to coalesce into
|
||||
// a single superpacket regardless of byte budget. Kernel allows up to 64
|
||||
// for UDP GSO and 128 for many TSO engines; stop well before either limit
|
||||
// to keep latency bounded.
|
||||
// tcpCoalesceMaxSegs caps how many segments we'll coalesce into a single
|
||||
// superpacket. Keeping this well below the kernel's TSO ceiling bounds
|
||||
// latency.
|
||||
const tcpCoalesceMaxSegs = 64
|
||||
|
||||
// tcpCoalescer accumulates adjacent in-flow TCP data segments into a single
|
||||
// TSO superpacket and emits them via overlay.GSOWriter in one writev. When
|
||||
// a packet fails admission or fails to extend the pending flow, the
|
||||
// pending superpacket is flushed and the non-matching packet is written
|
||||
// through as-is. Owns no locks — one coalescer per TUN write queue.
|
||||
// tcpCoalesceHdrCap is the scratch space we copy a seed's IP+TCP header
|
||||
// into. IPv6 (40) + TCP with full options (60) = 100 bytes.
|
||||
const tcpCoalesceHdrCap = 100
|
||||
|
||||
// initialSlots is the starting capacity of the slot pool. One flow per
|
||||
// packet is the worst case so this matches a typical UDP recvmmsg batch.
|
||||
const initialSlots = 64
|
||||
|
||||
// flowKey is the lookup key for one TCP flow: source and destination
// address, source and destination port, and address family. Every field is
// a comparable value type, so a flowKey can be compared with == and used
// directly as a map key. The parser fills only the first four bytes of
// src/dst for IPv4 and all sixteen for IPv6.
type flowKey struct {
	src   [16]byte
	dst   [16]byte
	sport uint16
	dport uint16
	isV6  bool
}
|
||||
|
||||
// coalesceSlot is one entry in the coalescer's ordered event queue. When
|
||||
// passthrough is true the slot holds a single borrowed packet that must be
|
||||
// emitted verbatim (non-TCP, non-admissible TCP, or oversize seed). When
|
||||
// passthrough is false the slot is an in-progress coalesced superpacket:
|
||||
// hdrBuf is a mutable copy of the seed's IP+TCP header (we patch total
|
||||
// length and pseudo-header partial at flush), and payIovs are *borrowed*
|
||||
// slices from the caller's plaintext buffers — no payload is ever copied.
|
||||
// The caller (listenOut) must keep those buffers alive until Flush.
|
||||
type coalesceSlot struct {
|
||||
passthrough bool
|
||||
rawPkt []byte // borrowed when passthrough
|
||||
|
||||
fk flowKey
|
||||
hdrBuf [tcpCoalesceHdrCap]byte
|
||||
hdrLen int
|
||||
ipHdrLen int
|
||||
isV6 bool
|
||||
gsoSize int
|
||||
numSeg int
|
||||
totalPay int
|
||||
nextSeq uint32
|
||||
// psh closes the chain: set when the last-accepted segment had PSH or
|
||||
// was sub-gsoSize. No further appends after that.
|
||||
psh bool
|
||||
payIovs [][]byte
|
||||
}
|
||||
|
||||
// tcpCoalescer accumulates adjacent in-flow TCP data segments across
|
||||
// multiple concurrent flows and emits each flow's run as a single TSO
|
||||
// superpacket via overlay.GSOWriter. All output — coalesced or not — is
|
||||
// deferred until Flush so arrival order is preserved on the wire. Owns
|
||||
// no locks; one coalescer per TUN write queue.
|
||||
type tcpCoalescer struct {
|
||||
plainW io.Writer
|
||||
gsoW overlay.GSOWriter // nil when the queue doesn't support TSO
|
||||
|
||||
buf []byte
|
||||
bufLen int // valid bytes in buf — hdrLen plus accumulated payload
|
||||
active bool // a seed packet is present
|
||||
numSeg int
|
||||
gsoSize int // payload length of each segment (= MSS of the seed)
|
||||
isV6 bool
|
||||
ipHdrLen int
|
||||
hdrLen int // ipHdrLen + tcpHdrLen, the offset where payload starts
|
||||
nextSeq uint32 // expected TCP seq of the next packet to coalesce
|
||||
// psh indicates the last-accepted segment had PSH set. We accept a PSH
|
||||
// packet as the final segment but reject any further Adds after that.
|
||||
psh bool
|
||||
// slots is the ordered event queue. Flush walks it once and emits each
|
||||
// entry as either a WriteGSO (coalesced) or a plainW.Write (passthrough).
|
||||
slots []*coalesceSlot
|
||||
// openSlots maps a flow key to its most recent non-sealed slot, so new
|
||||
// segments can extend an in-progress superpacket in O(1). Slots are
|
||||
// removed from this map when they close (PSH or short-last-segment),
|
||||
// when a non-admissible packet for that flow arrives, or in Flush.
|
||||
openSlots map[flowKey]*coalesceSlot
|
||||
pool []*coalesceSlot // free list for reuse
|
||||
}
|
||||
|
||||
func newTCPCoalescer(w io.Writer) *tcpCoalescer {
|
||||
c := &tcpCoalescer{plainW: w, buf: make([]byte, tcpCoalesceBufSize)}
|
||||
c := &tcpCoalescer{
|
||||
plainW: w,
|
||||
slots: make([]*coalesceSlot, 0, initialSlots),
|
||||
openSlots: make(map[flowKey]*coalesceSlot, initialSlots),
|
||||
pool: make([]*coalesceSlot, 0, initialSlots),
|
||||
}
|
||||
if gw, ok := w.(overlay.GSOWriter); ok && gw.GSOSupported() {
|
||||
c.gsoW = gw
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// parsedTCP holds the byte offsets / values we extract from one admission
|
||||
// check so Add and canAppend don't re-parse the same header twice.
|
||||
// parsedTCP holds the fields extracted from a single parse so later steps
|
||||
// (admission, slot lookup, canAppend) don't re-walk the header.
|
||||
type parsedTCP struct {
|
||||
isV6 bool
|
||||
fk flowKey
|
||||
ipHdrLen int
|
||||
tcpHdrLen int
|
||||
hdrLen int // ipHdrLen + tcpHdrLen
|
||||
hdrLen int
|
||||
payLen int
|
||||
seq uint32
|
||||
flags byte
|
||||
}
|
||||
|
||||
// parseCoalesceable decides whether pkt is eligible for TCP coalescing. It
|
||||
// accepts IPv4 (no options, DF set, no fragmentation) and IPv6 (no
|
||||
// extension headers) carrying a TCP segment with flags in {ACK, ACK|PSH}
|
||||
// and a non-empty payload. On success it returns the parsed offsets.
|
||||
func parseCoalesceable(pkt []byte) (parsedTCP, bool) {
|
||||
// parseTCPBase extracts the flow key and IP/TCP offsets for any TCP packet,
|
||||
// regardless of whether it's admissible for coalescing. Returns ok=false
|
||||
// for non-TCP or malformed input. Accepts IPv4 (no options, no fragmentation)
|
||||
// and IPv6 (no extension headers).
|
||||
func parseTCPBase(pkt []byte) (parsedTCP, bool) {
|
||||
var p parsedTCP
|
||||
if len(pkt) < 20 {
|
||||
return p, false
|
||||
@@ -80,42 +122,41 @@ func parseCoalesceable(pkt []byte) (parsedTCP, bool) {
|
||||
v := pkt[0] >> 4
|
||||
switch v {
|
||||
case 4:
|
||||
if len(pkt) < 20 {
|
||||
return p, false
|
||||
}
|
||||
ihl := int(pkt[0]&0x0f) * 4
|
||||
if ihl != 20 {
|
||||
return p, false // reject IP options
|
||||
return p, false
|
||||
}
|
||||
if pkt[9] != ipProtoTCP {
|
||||
return p, false
|
||||
}
|
||||
// Fragment check: MF=0 and frag offset=0. Accept DF=1 or DF=0 —
|
||||
// just reject any actual fragmentation.
|
||||
fragField := binary.BigEndian.Uint16(pkt[6:8])
|
||||
if fragField&0x3fff != 0 {
|
||||
// Reject actual fragmentation (MF or non-zero frag offset).
|
||||
if binary.BigEndian.Uint16(pkt[6:8])&0x3fff != 0 {
|
||||
return p, false
|
||||
}
|
||||
totalLen := int(binary.BigEndian.Uint16(pkt[2:4]))
|
||||
if totalLen > len(pkt) || totalLen < ihl {
|
||||
return p, false
|
||||
}
|
||||
p.isV6 = false
|
||||
p.ipHdrLen = ihl
|
||||
p.ipHdrLen = 20
|
||||
p.fk.isV6 = false
|
||||
copy(p.fk.src[:4], pkt[12:16])
|
||||
copy(p.fk.dst[:4], pkt[16:20])
|
||||
pkt = pkt[:totalLen]
|
||||
case 6:
|
||||
if len(pkt) < 40 {
|
||||
return p, false
|
||||
}
|
||||
if pkt[6] != ipProtoTCP {
|
||||
return p, false // reject ext headers
|
||||
return p, false
|
||||
}
|
||||
payloadLen := int(binary.BigEndian.Uint16(pkt[4:6]))
|
||||
if 40+payloadLen > len(pkt) {
|
||||
return p, false
|
||||
}
|
||||
p.isV6 = true
|
||||
p.ipHdrLen = 40
|
||||
p.fk.isV6 = true
|
||||
copy(p.fk.src[:], pkt[8:24])
|
||||
copy(p.fk.dst[:], pkt[24:40])
|
||||
pkt = pkt[:40+payloadLen]
|
||||
default:
|
||||
return p, false
|
||||
@@ -131,146 +172,216 @@ func parseCoalesceable(pkt []byte) (parsedTCP, bool) {
|
||||
if len(pkt) < p.ipHdrLen+tcpOff {
|
||||
return p, false
|
||||
}
|
||||
flags := pkt[p.ipHdrLen+13]
|
||||
// Allow only ACK and ACK|PSH. In particular: no SYN/FIN/RST/URG/CWR/ECE.
|
||||
const ack = 0x10
|
||||
const psh = 0x08
|
||||
if flags&^(ack|psh) != 0 || flags&ack == 0 {
|
||||
return p, false
|
||||
}
|
||||
p.tcpHdrLen = tcpOff
|
||||
p.hdrLen = p.ipHdrLen + tcpOff
|
||||
p.payLen = len(pkt) - p.hdrLen
|
||||
if p.payLen <= 0 {
|
||||
return p, false
|
||||
}
|
||||
p.seq = binary.BigEndian.Uint32(pkt[p.ipHdrLen+4 : p.ipHdrLen+8])
|
||||
p.flags = flags
|
||||
p.flags = pkt[p.ipHdrLen+13]
|
||||
p.fk.sport = binary.BigEndian.Uint16(pkt[p.ipHdrLen : p.ipHdrLen+2])
|
||||
p.fk.dport = binary.BigEndian.Uint16(pkt[p.ipHdrLen+2 : p.ipHdrLen+4])
|
||||
return p, true
|
||||
}
|
||||
|
||||
// Add takes a plaintext inbound packet destined for the tun. If GSO is
|
||||
// unavailable or the packet isn't coalesceable, Add falls through to a
|
||||
// plain Write on the underlying queue (flushing any pending superpacket
|
||||
// first).
|
||||
// coalesceable reports whether a parsed TCP segment is eligible for
|
||||
// coalescing. Accepts only ACK or ACK|PSH with a non-empty payload.
|
||||
func (p parsedTCP) coalesceable() bool {
|
||||
const ack = 0x10
|
||||
const psh = 0x08
|
||||
if p.flags&^(ack|psh) != 0 || p.flags&ack == 0 {
|
||||
return false
|
||||
}
|
||||
return p.payLen > 0
|
||||
}
|
||||
|
||||
// Add borrows pkt. The caller must keep pkt valid until the next Flush,
|
||||
// whether or not the packet was coalesced — passthrough (non-admissible)
|
||||
// packets are queued and written at Flush time, not synchronously.
|
||||
func (c *tcpCoalescer) Add(pkt []byte) error {
|
||||
if c.gsoW == nil {
|
||||
_, err := c.plainW.Write(pkt)
|
||||
return err
|
||||
c.addPassthrough(pkt)
|
||||
return nil
|
||||
}
|
||||
|
||||
info, ok := parseCoalesceable(pkt)
|
||||
info, ok := parseTCPBase(pkt)
|
||||
if !ok {
|
||||
if c.active {
|
||||
if err := c.flushLocked(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
_, err := c.plainW.Write(pkt)
|
||||
return err
|
||||
// Non-TCP or malformed — can't possibly collide with an open flow.
|
||||
c.addPassthrough(pkt)
|
||||
return nil
|
||||
}
|
||||
if !info.coalesceable() {
|
||||
// TCP but not admissible (SYN/FIN/RST/URG/CWR/ECE or zero-payload).
|
||||
// Seal this flow's open slot so later in-flow packets don't extend
|
||||
// it and accidentally reorder past this passthrough.
|
||||
delete(c.openSlots, info.fk)
|
||||
c.addPassthrough(pkt)
|
||||
return nil
|
||||
}
|
||||
|
||||
if c.active {
|
||||
if c.canAppend(pkt, info) {
|
||||
c.appendPayload(pkt, info)
|
||||
if info.flags&0x08 != 0 {
|
||||
c.psh = true
|
||||
if open := c.openSlots[info.fk]; open != nil {
|
||||
if c.canAppend(open, pkt, info) {
|
||||
c.appendPayload(open, pkt, info)
|
||||
if open.psh {
|
||||
delete(c.openSlots, info.fk)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if err := c.flushLocked(); err != nil {
|
||||
return err
|
||||
}
|
||||
// Can't extend — seal it and fall through to seed a fresh slot.
|
||||
delete(c.openSlots, info.fk)
|
||||
}
|
||||
return c.seed(pkt, info)
|
||||
}
|
||||
|
||||
// Flush emits any pending superpacket. Called by the UDP read loop at
|
||||
// recvmmsg batch boundaries — "no more packets coming right now".
|
||||
func (c *tcpCoalescer) Flush() error {
|
||||
if !c.active {
|
||||
return nil
|
||||
}
|
||||
return c.flushLocked()
|
||||
}
|
||||
|
||||
func (c *tcpCoalescer) reset() {
|
||||
c.active = false
|
||||
c.bufLen = 0
|
||||
c.numSeg = 0
|
||||
c.gsoSize = 0
|
||||
c.hdrLen = 0
|
||||
c.ipHdrLen = 0
|
||||
c.nextSeq = 0
|
||||
c.psh = false
|
||||
}
|
||||
|
||||
func (c *tcpCoalescer) seed(pkt []byte, info parsedTCP) error {
|
||||
if info.hdrLen+info.payLen > len(c.buf) {
|
||||
// Oversize single packet — flush (already done above) and passthrough.
|
||||
_, err := c.plainW.Write(pkt)
|
||||
return err
|
||||
}
|
||||
copy(c.buf, pkt[:info.hdrLen+info.payLen])
|
||||
c.active = true
|
||||
c.bufLen = info.hdrLen + info.payLen
|
||||
c.numSeg = 1
|
||||
c.gsoSize = info.payLen
|
||||
c.isV6 = info.isV6
|
||||
c.ipHdrLen = info.ipHdrLen
|
||||
c.hdrLen = info.hdrLen
|
||||
c.nextSeq = info.seq + uint32(info.payLen)
|
||||
c.psh = info.flags&0x08 != 0
|
||||
c.seed(pkt, info)
|
||||
return nil
|
||||
}
|
||||
|
||||
// canAppend reports whether info's packet extends the current seed: same
|
||||
// flow, adjacent seq, payload size rule, and no-PSH-mid-chain.
|
||||
func (c *tcpCoalescer) canAppend(pkt []byte, info parsedTCP) bool {
|
||||
if c.psh {
|
||||
return false // we already accepted a PSH — chain is closed
|
||||
// Flush emits every queued event in arrival order. Coalesced slots go out
|
||||
// via WriteGSO; passthrough slots go out via plainW.Write. Returns the
|
||||
// first error observed; keeps draining so one bad packet doesn't hold up
|
||||
// the rest. After Flush returns, borrowed payload slices may be recycled.
|
||||
func (c *tcpCoalescer) Flush() error {
|
||||
var first error
|
||||
for _, s := range c.slots {
|
||||
var err error
|
||||
if s.passthrough {
|
||||
_, err = c.plainW.Write(s.rawPkt)
|
||||
} else {
|
||||
err = c.flushSlot(s)
|
||||
}
|
||||
if err != nil && first == nil {
|
||||
first = err
|
||||
}
|
||||
c.release(s)
|
||||
}
|
||||
if info.isV6 != c.isV6 {
|
||||
for i := range c.slots {
|
||||
c.slots[i] = nil
|
||||
}
|
||||
c.slots = c.slots[:0]
|
||||
for k := range c.openSlots {
|
||||
delete(c.openSlots, k)
|
||||
}
|
||||
return first
|
||||
}
|
||||
|
||||
func (c *tcpCoalescer) addPassthrough(pkt []byte) {
|
||||
s := c.take()
|
||||
s.passthrough = true
|
||||
s.rawPkt = pkt
|
||||
c.slots = append(c.slots, s)
|
||||
}
|
||||
|
||||
func (c *tcpCoalescer) seed(pkt []byte, info parsedTCP) {
|
||||
if info.hdrLen > tcpCoalesceHdrCap || info.hdrLen+info.payLen > tcpCoalesceBufSize {
|
||||
// Pathological shape — can't fit our scratch, emit as-is.
|
||||
c.addPassthrough(pkt)
|
||||
return
|
||||
}
|
||||
s := c.take()
|
||||
s.passthrough = false
|
||||
s.rawPkt = nil
|
||||
copy(s.hdrBuf[:], pkt[:info.hdrLen])
|
||||
s.hdrLen = info.hdrLen
|
||||
s.ipHdrLen = info.ipHdrLen
|
||||
s.isV6 = info.fk.isV6
|
||||
s.fk = info.fk
|
||||
s.gsoSize = info.payLen
|
||||
s.numSeg = 1
|
||||
s.totalPay = info.payLen
|
||||
s.nextSeq = info.seq + uint32(info.payLen)
|
||||
s.psh = info.flags&0x08 != 0
|
||||
s.payIovs = append(s.payIovs[:0], pkt[info.hdrLen:info.hdrLen+info.payLen])
|
||||
c.slots = append(c.slots, s)
|
||||
if !s.psh {
|
||||
c.openSlots[info.fk] = s
|
||||
}
|
||||
}
|
||||
|
||||
// canAppend reports whether info's packet extends the slot's seed: same
|
||||
// header shape and stable contents, adjacent seq, not oversized, chain not
|
||||
// closed.
|
||||
func (c *tcpCoalescer) canAppend(s *coalesceSlot, pkt []byte, info parsedTCP) bool {
|
||||
if s.psh {
|
||||
return false
|
||||
}
|
||||
if info.hdrLen != c.hdrLen {
|
||||
if info.hdrLen != s.hdrLen {
|
||||
return false
|
||||
}
|
||||
if info.seq != c.nextSeq {
|
||||
if info.seq != s.nextSeq {
|
||||
return false
|
||||
}
|
||||
if c.numSeg >= tcpCoalesceMaxSegs {
|
||||
if s.numSeg >= tcpCoalesceMaxSegs {
|
||||
return false
|
||||
}
|
||||
if c.bufLen+info.payLen > len(c.buf) {
|
||||
if info.payLen > s.gsoSize {
|
||||
return false
|
||||
}
|
||||
// Every mid-chain segment must be exactly gsoSize. The final segment may
|
||||
// be shorter, but once a short segment is appended we can't add another.
|
||||
if info.payLen > c.gsoSize {
|
||||
if s.hdrLen+s.totalPay+info.payLen > tcpCoalesceBufSize {
|
||||
return false
|
||||
}
|
||||
if info.payLen < c.gsoSize {
|
||||
// Will become the last segment — always OK to append, just no more.
|
||||
}
|
||||
// Compare the stable parts of the header.
|
||||
if !headersMatch(c.buf[:c.hdrLen], pkt[:info.hdrLen], c.isV6, c.ipHdrLen) {
|
||||
if !headersMatch(s.hdrBuf[:s.hdrLen], pkt[:info.hdrLen], s.isV6, s.ipHdrLen) {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (c *tcpCoalescer) appendPayload(pkt []byte, info parsedTCP) {
|
||||
copy(c.buf[c.bufLen:], pkt[info.hdrLen:info.hdrLen+info.payLen])
|
||||
c.bufLen += info.payLen
|
||||
c.numSeg++
|
||||
c.nextSeq = info.seq + uint32(info.payLen)
|
||||
// If this was a sub-gsoSize last segment, mark chain as closed.
|
||||
if info.payLen < c.gsoSize {
|
||||
c.psh = true
|
||||
func (c *tcpCoalescer) appendPayload(s *coalesceSlot, pkt []byte, info parsedTCP) {
|
||||
s.payIovs = append(s.payIovs, pkt[info.hdrLen:info.hdrLen+info.payLen])
|
||||
s.numSeg++
|
||||
s.totalPay += info.payLen
|
||||
s.nextSeq = info.seq + uint32(info.payLen)
|
||||
if info.payLen < s.gsoSize || info.flags&0x08 != 0 {
|
||||
s.psh = true
|
||||
}
|
||||
}
|
||||
|
||||
func (c *tcpCoalescer) take() *coalesceSlot {
|
||||
if n := len(c.pool); n > 0 {
|
||||
s := c.pool[n-1]
|
||||
c.pool[n-1] = nil
|
||||
c.pool = c.pool[:n-1]
|
||||
return s
|
||||
}
|
||||
return &coalesceSlot{}
|
||||
}
|
||||
|
||||
func (c *tcpCoalescer) release(s *coalesceSlot) {
|
||||
s.passthrough = false
|
||||
s.rawPkt = nil
|
||||
for i := range s.payIovs {
|
||||
s.payIovs[i] = nil
|
||||
}
|
||||
s.payIovs = s.payIovs[:0]
|
||||
s.numSeg = 0
|
||||
s.totalPay = 0
|
||||
s.psh = false
|
||||
c.pool = append(c.pool, s)
|
||||
}
|
||||
|
||||
// flushSlot patches the header and calls WriteGSO. Does not remove the
|
||||
// slot from c.slots.
|
||||
func (c *tcpCoalescer) flushSlot(s *coalesceSlot) error {
|
||||
total := s.hdrLen + s.totalPay
|
||||
l4Len := total - s.ipHdrLen
|
||||
hdr := s.hdrBuf[:s.hdrLen]
|
||||
|
||||
if s.isV6 {
|
||||
binary.BigEndian.PutUint16(hdr[4:6], uint16(l4Len))
|
||||
} else {
|
||||
binary.BigEndian.PutUint16(hdr[2:4], uint16(total))
|
||||
hdr[10] = 0
|
||||
hdr[11] = 0
|
||||
binary.BigEndian.PutUint16(hdr[10:12], ipv4HdrChecksum(hdr[:s.ipHdrLen]))
|
||||
}
|
||||
|
||||
var psum uint32
|
||||
if s.isV6 {
|
||||
psum = pseudoSumIPv6(hdr[8:24], hdr[24:40], ipProtoTCP, l4Len)
|
||||
} else {
|
||||
psum = pseudoSumIPv4(hdr[12:16], hdr[16:20], ipProtoTCP, l4Len)
|
||||
}
|
||||
tcsum := s.ipHdrLen + 16
|
||||
binary.BigEndian.PutUint16(hdr[tcsum:tcsum+2], foldOnceNoInvert(psum))
|
||||
|
||||
return c.gsoW.WriteGSO(hdr, s.payIovs, uint16(s.gsoSize), s.isV6, uint16(s.ipHdrLen))
|
||||
}
|
||||
|
||||
// headersMatch compares two IP+TCP header prefixes for byte-for-byte
|
||||
// equality on every field that must be identical across coalesced
|
||||
// segments. Size/IPID/IPCsum/seq/flags/tcpCsum are masked out.
|
||||
@@ -330,58 +441,6 @@ func bytesEq(a, b []byte) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
func (c *tcpCoalescer) flushLocked() error {
|
||||
// Guarantee the coalescer is empty on exit regardless of how we leave.
|
||||
defer c.reset()
|
||||
|
||||
if c.numSeg <= 1 {
|
||||
_, err := c.plainW.Write(c.buf[:c.bufLen])
|
||||
return err
|
||||
}
|
||||
|
||||
total := c.bufLen
|
||||
l4Len := total - c.ipHdrLen
|
||||
|
||||
// Fix IP header length field.
|
||||
if c.isV6 {
|
||||
if l4Len > 0xffff {
|
||||
// Shouldn't happen given buffer size, but guard against it.
|
||||
return c.flushAsPerSegment()
|
||||
}
|
||||
binary.BigEndian.PutUint16(c.buf[4:6], uint16(l4Len))
|
||||
} else {
|
||||
if total > 0xffff {
|
||||
return c.flushAsPerSegment()
|
||||
}
|
||||
binary.BigEndian.PutUint16(c.buf[2:4], uint16(total))
|
||||
// Recompute IPv4 header checksum.
|
||||
c.buf[10] = 0
|
||||
c.buf[11] = 0
|
||||
binary.BigEndian.PutUint16(c.buf[10:12], ipv4HdrChecksum(c.buf[:c.ipHdrLen]))
|
||||
}
|
||||
|
||||
// Write the virtio NEEDS_CSUM pseudo-header partial into the TCP csum field.
|
||||
var psum uint32
|
||||
if c.isV6 {
|
||||
psum = pseudoSumIPv6(c.buf[8:24], c.buf[24:40], ipProtoTCP, l4Len)
|
||||
} else {
|
||||
psum = pseudoSumIPv4(c.buf[12:16], c.buf[16:20], ipProtoTCP, l4Len)
|
||||
}
|
||||
tcsum := c.ipHdrLen + 16
|
||||
binary.BigEndian.PutUint16(c.buf[tcsum:tcsum+2], foldOnceNoInvert(psum))
|
||||
|
||||
return c.gsoW.WriteGSO(c.buf[:total], uint16(c.gsoSize), c.isV6, uint16(c.hdrLen), uint16(c.ipHdrLen))
|
||||
}
|
||||
|
||||
// flushAsPerSegment is a defensive fallback used if the coalesced superpacket
|
||||
// somehow exceeds 16-bit length fields. It writes the packet as-is through
|
||||
// the plain writer (the kernel will reject it, but that's a visible error
|
||||
// rather than silent corruption).
|
||||
func (c *tcpCoalescer) flushAsPerSegment() error {
|
||||
_, err := c.plainW.Write(c.buf[:c.bufLen])
|
||||
return err
|
||||
}
|
||||
|
||||
// ipv4HdrChecksum computes the IPv4 header checksum over hdr (which must
|
||||
// already have its checksum field zeroed) and returns the folded/inverted
|
||||
// 16-bit value to store.
|
||||
|
||||
Reference in New Issue
Block a user