package nebula

import (
	"bytes"
	"encoding/binary"
	"io"

	"github.com/slackhq/nebula/overlay"
)

// ipProtoTCP is the IANA protocol number for TCP. Hardcoded instead of
// reaching for unix.IPPROTO_TCP because golang.org/x/sys/unix doesn't
// define that constant on Windows, which would break cross-compiles even
// though this file runs unchanged on every platform.
const ipProtoTCP = 6

// tcpCoalesceBufSize bounds the largest coalesced superpacket we will buffer.
// Linux caps sk_gso_max_size around 64KiB; 65535 bytes covers IP hdr + TCP
// hdr + up to ~65KB of payload, which is the most the kernel's TSO can
// segment in one shot.
const tcpCoalesceBufSize = 65535

// tcpCoalesceMaxSegs caps how many segments we are willing to coalesce into
// a single superpacket regardless of byte budget. The kernel allows up to 64
// segments for UDP GSO and 128 for many TSO engines; stopping at the smaller
// limit keeps latency bounded.
const tcpCoalesceMaxSegs = 64

// tcpCoalescer accumulates adjacent in-flow TCP data segments into a single
// TSO superpacket and emits them via overlay.GSOWriter in one writev. When
// a packet fails admission or fails to extend the pending flow, the
// pending superpacket is flushed and the non-matching packet is written
// through as-is. Owns no locks; use one coalescer per TUN write queue.
type tcpCoalescer struct {
	plainW io.Writer
	gsoW   overlay.GSOWriter // nil when the queue doesn't support TSO

	buf      []byte
	bufLen   int  // valid bytes in buf: hdrLen plus accumulated payload
	active   bool // a seed packet is present
	numSeg   int
	gsoSize  int // payload length of each segment (= MSS of the seed)
	isV6     bool
	ipHdrLen int
	hdrLen   int    // ipHdrLen + tcpHdrLen, the offset where payload starts
	nextSeq  uint32 // expected TCP seq of the next packet to coalesce

	// psh indicates the last-accepted segment had PSH set. We accept a PSH
	// packet as the final segment but reject any further Adds after that.
	psh bool
}

func newTCPCoalescer(w io.Writer) *tcpCoalescer {
	c := &tcpCoalescer{plainW: w, buf: make([]byte, tcpCoalesceBufSize)}
	if gw, ok := w.(overlay.GSOWriter); ok && gw.GSOSupported() {
		c.gsoW = gw
	}
	return c
}
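
// Typical driving pattern, as a minimal sketch (queue and pkts are
// assumptions standing in for the real TUN write queue and one recvmmsg
// batch of decrypted packets):
//
//	c := newTCPCoalescer(queue)
//	for _, pkt := range pkts {
//		if err := c.Add(pkt); err != nil {
//			return err
//		}
//	}
//	// Batch boundary: no more packets are coming right now.
//	if err := c.Flush(); err != nil {
//		return err
//	}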

// parsedTCP holds the byte offsets / values we extract from one admission
// check so Add and canAppend don't re-parse the same header twice.
type parsedTCP struct {
	isV6      bool
	ipHdrLen  int
	tcpHdrLen int
	hdrLen    int // ipHdrLen + tcpHdrLen
	payLen    int
	seq       uint32
	flags     byte
}

// parseCoalesceable decides whether pkt is eligible for TCP coalescing. It
// accepts IPv4 (no options, no fragmentation, DF either way) and IPv6 (no
// extension headers) carrying a TCP segment with flags in {ACK, ACK|PSH}
// and a non-empty payload. On success it returns the parsed offsets.
func parseCoalesceable(pkt []byte) (parsedTCP, bool) {
	var p parsedTCP
	if len(pkt) < 20 {
		return p, false
	}
	switch pkt[0] >> 4 {
	case 4:
		ihl := int(pkt[0]&0x0f) * 4
		if ihl != 20 {
			return p, false // reject IP options
		}
		if pkt[9] != ipProtoTCP {
			return p, false
		}
		// Fragment check: MF=0 and frag offset=0. Accept DF=1 or DF=0;
		// just reject any actual fragmentation.
		fragField := binary.BigEndian.Uint16(pkt[6:8])
		if fragField&0x3fff != 0 {
			return p, false
		}
		totalLen := int(binary.BigEndian.Uint16(pkt[2:4]))
		if totalLen > len(pkt) || totalLen < ihl {
			return p, false
		}
		p.isV6 = false
		p.ipHdrLen = ihl
		pkt = pkt[:totalLen]
	case 6:
		if len(pkt) < 40 {
			return p, false
		}
		if pkt[6] != ipProtoTCP {
			return p, false // reject ext headers
		}
		payloadLen := int(binary.BigEndian.Uint16(pkt[4:6]))
		if 40+payloadLen > len(pkt) {
			return p, false
		}
		p.isV6 = true
		p.ipHdrLen = 40
		pkt = pkt[:40+payloadLen]
	default:
		return p, false
	}

	if len(pkt) < p.ipHdrLen+20 {
		return p, false
	}
	tcpOff := int(pkt[p.ipHdrLen+12]>>4) * 4
	if tcpOff < 20 || tcpOff > 60 {
		return p, false
	}
	if len(pkt) < p.ipHdrLen+tcpOff {
		return p, false
	}
	flags := pkt[p.ipHdrLen+13]
	// Allow only ACK and ACK|PSH. In particular: no SYN/FIN/RST/URG/CWR/ECE.
	const ack = 0x10
	const psh = 0x08
	if flags&^(ack|psh) != 0 || flags&ack == 0 {
		return p, false
	}
	p.tcpHdrLen = tcpOff
	p.hdrLen = p.ipHdrLen + tcpOff
	p.payLen = len(pkt) - p.hdrLen
	if p.payLen <= 0 {
		return p, false
	}
	p.seq = binary.BigEndian.Uint32(pkt[p.ipHdrLen+4 : p.ipHdrLen+8])
	p.flags = flags
	return p, true
}
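
// For reference, the smallest packet that passes admission is a bare IPv4
// ACK carrying one payload byte. A hand-built sketch (values illustrative
// only; addresses, ports, and seq left zero):
//
//	pkt := make([]byte, 20+20+1)             // IPv4 hdr + TCP hdr + 1 byte
//	pkt[0] = 0x45                            // version 4, IHL 5 (no options)
//	binary.BigEndian.PutUint16(pkt[2:4], 41) // total length
//	pkt[9] = ipProtoTCP
//	pkt[20+12] = 5 << 4                      // TCP data offset = 5 words
//	pkt[20+13] = 0x10                        // flags = ACK only
//	info, ok := parseCoalesceable(pkt)       // ok == true, info.payLen == 1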

// Add takes a plaintext inbound packet destined for the tun. If GSO is
// unavailable or the packet isn't coalesceable, Add falls through to a
// plain Write on the underlying queue (flushing any pending superpacket
// first).
func (c *tcpCoalescer) Add(pkt []byte) error {
	if c.gsoW == nil {
		_, err := c.plainW.Write(pkt)
		return err
	}

	info, ok := parseCoalesceable(pkt)
	if !ok {
		if c.active {
			if err := c.flushLocked(); err != nil {
				return err
			}
		}
		_, err := c.plainW.Write(pkt)
		return err
	}

	if c.active {
		if c.canAppend(pkt, info) {
			c.appendPayload(pkt, info)
			if info.flags&0x08 != 0 { // PSH: close the chain after this segment
				c.psh = true
			}
			return nil
		}
		if err := c.flushLocked(); err != nil {
			return err
		}
	}
	return c.seed(pkt, info)
}

// Flush emits any pending superpacket. Called by the UDP read loop at
// recvmmsg batch boundaries, i.e. "no more packets coming right now".
func (c *tcpCoalescer) Flush() error {
	if !c.active {
		return nil
	}
	return c.flushLocked()
}

func (c *tcpCoalescer) reset() {
	c.active = false
	c.bufLen = 0
	c.numSeg = 0
	c.gsoSize = 0
	c.hdrLen = 0
	c.ipHdrLen = 0
	c.nextSeq = 0
	c.psh = false
}

func (c *tcpCoalescer) seed(pkt []byte, info parsedTCP) error {
	if info.hdrLen+info.payLen > len(c.buf) {
		// Oversize single packet; Add already flushed any pending
		// superpacket, so just pass it through.
		_, err := c.plainW.Write(pkt)
		return err
	}
	copy(c.buf, pkt[:info.hdrLen+info.payLen])
	c.active = true
	c.bufLen = info.hdrLen + info.payLen
	c.numSeg = 1
	c.gsoSize = info.payLen
	c.isV6 = info.isV6
	c.ipHdrLen = info.ipHdrLen
	c.hdrLen = info.hdrLen
	c.nextSeq = info.seq + uint32(info.payLen)
	c.psh = info.flags&0x08 != 0
	return nil
}

// canAppend reports whether info's packet extends the current seed: same
// flow, adjacent seq, payload size rule, and no-PSH-mid-chain.
func (c *tcpCoalescer) canAppend(pkt []byte, info parsedTCP) bool {
	if c.psh {
		return false // we already accepted a PSH; the chain is closed
	}
	if info.isV6 != c.isV6 {
		return false
	}
	if info.hdrLen != c.hdrLen {
		return false
	}
	if info.seq != c.nextSeq {
		return false
	}
	if c.numSeg >= tcpCoalesceMaxSegs {
		return false
	}
	if c.bufLen+info.payLen > len(c.buf) {
		return false
	}
	// Every mid-chain segment must be exactly gsoSize. The final segment may
	// be shorter, but once a short segment is appended we can't add another
	// (appendPayload closes the chain), so a payload below gsoSize is always
	// OK to accept here.
	if info.payLen > c.gsoSize {
		return false
	}
	// Compare the stable parts of the header.
	return headersMatch(c.buf[:c.hdrLen], pkt[:info.hdrLen], c.isV6, c.ipHdrLen)
}
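
// Concretely, a chain might run like this (sizes illustrative, MSS 1448):
//
//	seed:   1448-byte payload                 → gsoSize = 1448
//	append: 1448-byte payload, seq == nextSeq → accepted
//	append: 600-byte payload, seq == nextSeq  → accepted, closes the chain
//	append: anything further                  → rejected, forces a flush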

func (c *tcpCoalescer) appendPayload(pkt []byte, info parsedTCP) {
	copy(c.buf[c.bufLen:], pkt[info.hdrLen:info.hdrLen+info.payLen])
	c.bufLen += info.payLen
	c.numSeg++
	c.nextSeq = info.seq + uint32(info.payLen)
	// If this was a sub-gsoSize last segment, mark the chain as closed.
	if info.payLen < c.gsoSize {
		c.psh = true
	}
}

// headersMatch compares two IP+TCP header prefixes for byte-for-byte
// equality on every field that must be identical across coalesced
// segments. Size/IPID/IPCsum/seq/flags/tcpCsum are masked out.
func headersMatch(a, b []byte, isV6 bool, ipHdrLen int) bool {
	if len(a) != len(b) {
		return false
	}
	if isV6 {
		// IPv6: bytes [0:4] = version/TC/flow-label, [6:8] = next_hdr/hop,
		// [8:40] = src+dst. Skip [4:6] payload length.
		if !bytesEq(a[0:4], b[0:4]) {
			return false
		}
		if !bytesEq(a[6:40], b[6:40]) {
			return false
		}
	} else {
		// IPv4: [0:2] version/IHL/TOS, [6:10] flags/fragoff/TTL/proto,
		// [12:20] src+dst. Skip [2:4] total len, [4:6] id, [10:12] csum.
		if !bytesEq(a[0:2], b[0:2]) {
			return false
		}
		if !bytesEq(a[6:10], b[6:10]) {
			return false
		}
		if !bytesEq(a[12:20], b[12:20]) {
			return false
		}
	}
	// TCP: compare [0:4] ports, [8:13] ack+dataoff, [14:16] window, and
	// [18:] urgent pointer + options. Skip [4:8] seq, [13] flags, [16:18] csum.
	tcp := ipHdrLen
	if !bytesEq(a[tcp:tcp+4], b[tcp:tcp+4]) {
		return false
	}
	if !bytesEq(a[tcp+8:tcp+13], b[tcp+8:tcp+13]) {
		return false
	}
	if !bytesEq(a[tcp+14:tcp+16], b[tcp+14:tcp+16]) {
		return false
	}
	return bytesEq(a[tcp+18:], b[tcp+18:])
}

func bytesEq(a, b []byte) bool {
	return bytes.Equal(a, b)
}

func (c *tcpCoalescer) flushLocked() error {
	// Guarantee the coalescer is empty on exit regardless of how we leave.
	defer c.reset()

	if c.numSeg <= 1 {
		_, err := c.plainW.Write(c.buf[:c.bufLen])
		return err
	}

	total := c.bufLen
	l4Len := total - c.ipHdrLen

	// Fix up the IP header length field.
	if c.isV6 {
		if l4Len > 0xffff {
			// Shouldn't happen given the buffer size, but guard against it.
			return c.flushAsPerSegment()
		}
		binary.BigEndian.PutUint16(c.buf[4:6], uint16(l4Len))
	} else {
		if total > 0xffff {
			return c.flushAsPerSegment()
		}
		binary.BigEndian.PutUint16(c.buf[2:4], uint16(total))
		// Recompute the IPv4 header checksum.
		c.buf[10] = 0
		c.buf[11] = 0
		binary.BigEndian.PutUint16(c.buf[10:12], ipv4HdrChecksum(c.buf[:c.ipHdrLen]))
	}

	// Write the virtio NEEDS_CSUM pseudo-header partial into the TCP csum field.
	var psum uint32
	if c.isV6 {
		psum = pseudoSumIPv6(c.buf[8:24], c.buf[24:40], ipProtoTCP, l4Len)
	} else {
		psum = pseudoSumIPv4(c.buf[12:16], c.buf[16:20], ipProtoTCP, l4Len)
	}
	tcsum := c.ipHdrLen + 16
	binary.BigEndian.PutUint16(c.buf[tcsum:tcsum+2], foldOnceNoInvert(psum))

	return c.gsoW.WriteGSO(c.buf[:total], uint16(c.gsoSize), c.isV6, uint16(c.hdrLen), uint16(c.ipHdrLen))
}
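
// The emitted superpacket, schematically (a sketch of what WriteGSO hands
// the kernel in one writev; the kernel re-segments at gsoSize boundaries):
//
//	[ IP hdr | TCP hdr | seg0 payload | seg1 payload | ... | segN payload ]
//
// with the IP length field patched to cover the whole buffer and the TCP
// checksum field holding the folded pseudo-header partial.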

// flushAsPerSegment is a defensive fallback used if the coalesced superpacket
// somehow exceeds the 16-bit length fields. It writes the packet as-is through
// the plain writer (the kernel will reject it, but that's a visible error
// rather than silent corruption).
func (c *tcpCoalescer) flushAsPerSegment() error {
	_, err := c.plainW.Write(c.buf[:c.bufLen])
	return err
}

// ipv4HdrChecksum computes the IPv4 header checksum over hdr (which must
// already have its checksum field zeroed) and returns the folded/inverted
// 16-bit value to store.
func ipv4HdrChecksum(hdr []byte) uint16 {
	var sum uint32
	for i := 0; i+1 < len(hdr); i += 2 {
		sum += uint32(binary.BigEndian.Uint16(hdr[i : i+2]))
	}
	if len(hdr)%2 == 1 {
		sum += uint32(hdr[len(hdr)-1]) << 8
	}
	for sum>>16 != 0 {
		sum = (sum & 0xffff) + (sum >> 16)
	}
	return ^uint16(sum)
}
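
// Worked example (values illustrative): if the header's 16-bit words sum to
// 0x2579c, the fold and invert proceed as
//
//	sum = 0x0002579c
//	sum = 0x579c + 0x0002 = 0x579e   // end-around carry fold
//	return ^0x579e                   // = 0xa861, stored big-endian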

// pseudoSumIPv4 / pseudoSumIPv6 build the TCP pseudo-header partial sum
// expected by the virtio NEEDS_CSUM kernel path: the 32-bit accumulator
// before folding.
func pseudoSumIPv4(src, dst []byte, proto byte, l4Len int) uint32 {
	var sum uint32
	sum += uint32(binary.BigEndian.Uint16(src[0:2]))
	sum += uint32(binary.BigEndian.Uint16(src[2:4]))
	sum += uint32(binary.BigEndian.Uint16(dst[0:2]))
	sum += uint32(binary.BigEndian.Uint16(dst[2:4]))
	sum += uint32(proto)
	sum += uint32(l4Len)
	return sum
}

func pseudoSumIPv6(src, dst []byte, proto byte, l4Len int) uint32 {
	var sum uint32
	for i := 0; i < 16; i += 2 {
		sum += uint32(binary.BigEndian.Uint16(src[i : i+2]))
		sum += uint32(binary.BigEndian.Uint16(dst[i : i+2]))
	}
	sum += uint32(l4Len >> 16)
	sum += uint32(l4Len & 0xffff)
	sum += uint32(proto)
	return sum
}
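
// Note: pseudoSumIPv4 adds l4Len in one shot while this version splits it
// into 16-bit halves; under end-around-carry folding (2^16 ≡ 1 mod 0xffff)
// the two are equivalent. The split form mirrors the 32-bit Upper-Layer
// Packet Length field in the IPv6 pseudo-header (RFC 2460).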

// foldOnceNoInvert folds the 32-bit accumulator to 16 bits and returns it
// unchanged (no one's complement). This is what virtio NEEDS_CSUM wants in
// the L4 checksum field; the kernel will add the payload sum and invert.
func foldOnceNoInvert(sum uint32) uint16 {
	for sum>>16 != 0 {
		sum = (sum & 0xffff) + (sum >> 16)
	}
	return uint16(sum)
}
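
// Worked example (values illustrative): foldOnceNoInvert(0x1fffe) folds as
//
//	sum = 0x0001fffe
//	sum = 0xfffe + 0x0001 = 0xffff   // one pass, no carry left
//
// and returns 0xffff uninverted; contrast ipv4HdrChecksum, which inverts.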