checkpoint: try to parse packets only once, pt 2

JackDoan
2026-05-07 11:26:17 -05:00
parent 0375aff451
commit 5bdf645b0b
10 changed files with 1150 additions and 92 deletions

@@ -0,0 +1,394 @@
package batch

import (
	"encoding/binary"
	"net/netip"
	"testing"

	"github.com/slackhq/nebula/firewall"
)
// parseV4InboundBaseline mirrors what outside.go's parseV4(incoming=true)
// does, so the "split" bench measures the *current* state: firewall-side
// parse, then m.Commit re-parses inside the coalescer. Two walks per
// packet. Kept faithful in shape (one read per field, AddrFromSlice for
// the addrs) so the CPU profile matches the production parseV4.
func parseV4InboundBaseline(pkt []byte, fp *firewall.Packet) bool {
if len(pkt) < 20 {
return false
}
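// IHL counts 32-bit words; shifting left by 2 gives the header length in bytes.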
ihl := int(pkt[0]&0x0f) << 2
if ihl < 20 {
return false
}
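// The low 13 bits of the flags/frag-offset word are the fragment offset;
// nonzero means a non-first fragment, which carries no L4 header to read.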
flagsfrags := binary.BigEndian.Uint16(pkt[6:8])
fp.Fragment = (flagsfrags & 0x1FFF) != 0
fp.Protocol = pkt[9]
minLen := ihl
if !fp.Fragment {
if fp.Protocol == firewall.ProtoICMP {
minLen += 4 + 2
} else {
minLen += 4
}
}
if len(pkt) < minLen {
return false
}
fp.RemoteAddr, _ = netip.AddrFromSlice(pkt[12:16])
fp.LocalAddr, _ = netip.AddrFromSlice(pkt[16:20])
switch {
case fp.Fragment:
fp.RemotePort = 0
fp.LocalPort = 0
case fp.Protocol == firewall.ProtoICMP:
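// ICMP has no ports; the echo identifier at ihl+4 stands in as the remote port.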
fp.RemotePort = binary.BigEndian.Uint16(pkt[ihl+4 : ihl+6])
fp.LocalPort = 0
default:
fp.RemotePort = binary.BigEndian.Uint16(pkt[ihl : ihl+2])
fp.LocalPort = binary.BigEndian.Uint16(pkt[ihl+2 : ihl+4])
}
return true
}
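
// TestParseV4InboundBaselineSmoke is an illustrative sketch rather than part
// of the benchmark suite: it hand-builds a minimal 28-byte IPv4+UDP packet
// and checks that the baseline parser fills every field from the expected
// offsets.
func TestParseV4InboundBaselineSmoke(t *testing.T) {
	pkt := make([]byte, 28)
	pkt[0] = 0x45                                // version 4, IHL 5 (20-byte header)
	pkt[9] = firewall.ProtoUDP                   // protocol
	copy(pkt[12:16], []byte{10, 0, 0, 1})        // source address
	copy(pkt[16:20], []byte{10, 0, 0, 2})        // destination address
	binary.BigEndian.PutUint16(pkt[20:22], 4242) // source port
	binary.BigEndian.PutUint16(pkt[22:24], 53)   // destination port
	var fp firewall.Packet
	if !parseV4InboundBaseline(pkt, &fp) {
		t.Fatal("parse failed")
	}
	if fp.RemoteAddr != netip.AddrFrom4([4]byte{10, 0, 0, 1}) || fp.RemotePort != 4242 {
		t.Fatalf("remote mismatch: %v:%d", fp.RemoteAddr, fp.RemotePort)
	}
	if fp.LocalAddr != netip.AddrFrom4([4]byte{10, 0, 0, 2}) || fp.LocalPort != 53 {
		t.Fatalf("local mismatch: %v:%d", fp.LocalAddr, fp.LocalPort)
	}
	if fp.Protocol != firewall.ProtoUDP || fp.Fragment {
		t.Fatalf("proto/fragment mismatch: %d %v", fp.Protocol, fp.Fragment)
	}
}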
// parseV6InboundBaseline is the v6 analogue: replicates parseV6's
// extension-header walk so the split bench captures its true cost.
func parseV6InboundBaseline(pkt []byte, fp *firewall.Packet) bool {
dataLen := len(pkt)
if dataLen < 40 {
return false
}
fp.RemoteAddr, _ = netip.AddrFromSlice(pkt[8:24])
fp.LocalAddr, _ = netip.AddrFromSlice(pkt[24:40])
protoAt := 6
offset := 40
next := 0
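// Walk the extension-header chain: protoAt indexes the Next Header byte
// naming the header that starts at offset. Each pass either returns on a
// terminal header or advances both indices past the current one.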
for {
if protoAt >= dataLen {
return false
}
proto := pkt[protoAt]
switch proto {
case ipProtoESP, ipProtoNoNextHdr:
fp.Protocol = proto
fp.RemotePort = 0
fp.LocalPort = 0
fp.Fragment = false
return true
case ipProtoICMPv6:
if dataLen < offset+6 {
return false
}
fp.Protocol = proto
fp.LocalPort = 0
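// Echo request/reply carry an identifier at offset+4; using it as the
// remote port gives each echo flow its own conntrack entry.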
switch pkt[offset] { // ICMPv6 type is the first byte of the ICMPv6 header
case icmpv6TypeEchoRequest, icmpv6TypeEchoReply:
fp.RemotePort = binary.BigEndian.Uint16(pkt[offset+4 : offset+6])
default:
fp.RemotePort = 0
}
fp.Fragment = false
return true
case ipProtoTCP, ipProtoUDP:
if dataLen < offset+4 {
return false
}
fp.Protocol = proto
fp.RemotePort = binary.BigEndian.Uint16(pkt[offset : offset+2])
fp.LocalPort = binary.BigEndian.Uint16(pkt[offset+2 : offset+4])
fp.Fragment = false
return true
case ipProtoIPv6Fragment:
if dataLen < offset+8 {
return false
}
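// Fragment header: bytes 2-3 hold the 13-bit fragment offset, with the
// low 3 bits occupied by the Res/M flags, so mask those off before testing.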
fragmentOffset := binary.BigEndian.Uint16(pkt[offset+2:offset+4]) &^ uint16(0x7)
if fragmentOffset != 0 {
fp.Protocol = pkt[offset]
fp.Fragment = true
fp.RemotePort = 0
fp.LocalPort = 0
return true
}
next = 8
case ipProtoAH:
if dataLen <= offset+1 {
return false
}
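// AH's Payload Len is in 4-octet units minus 2, so the full header spans
// (len+2)*4 bytes.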
next = int(pkt[offset+1]+2) << 2
default:
if dataLen <= offset+1 {
return false
}
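// Other extension headers encode Hdr Ext Len in 8-octet units, excluding
// the first 8 octets: (len+1)*8 bytes total.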
next = int(pkt[offset+1]+1) << 3
}
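// A malformed (or byte-wrapped) length field could stall the walk; force
// at least the 8-byte minimum step so offset always advances.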
if next <= 0 {
next = 8
}
protoAt = offset
offset = offset + next
}
}
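
// TestParseV6InboundBaselineSmoke is an illustrative sketch: a plain
// IPv6+TCP packet with no extension headers, exercising the walk's simplest
// terminating case.
func TestParseV6InboundBaselineSmoke(t *testing.T) {
	pkt := make([]byte, 60)
	pkt[0] = 0x60       // version 6
	pkt[6] = ipProtoTCP // Next Header goes straight to TCP
	pkt[23] = 1         // source ::1
	pkt[39] = 2         // destination ::2
	binary.BigEndian.PutUint16(pkt[40:42], 443) // source port
	binary.BigEndian.PutUint16(pkt[42:44], 80)  // destination port
	var fp firewall.Packet
	if !parseV6InboundBaseline(pkt, &fp) {
		t.Fatal("parse failed")
	}
	if fp.Protocol != ipProtoTCP || fp.RemotePort != 443 || fp.LocalPort != 80 || fp.Fragment {
		t.Fatalf("unexpected parse result: %+v", fp)
	}
	if fp.RemoteAddr != netip.MustParseAddr("::1") || fp.LocalAddr != netip.MustParseAddr("::2") {
		t.Fatalf("addr mismatch: %v -> %v", fp.RemoteAddr, fp.LocalAddr)
	}
}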
// runRxSplit drives the split path: faithful inbound parse for the firewall
// side, then m.Commit re-parses to coalesce. v6 controls which baseline
// parser we run.
func runRxSplit(b *testing.B, pkts [][]byte, batchSize int, v6 bool) {
b.Helper()
m := NewMultiCoalescer(nopTunWriter{}, true, true)
var fp firewall.Packet
b.ReportAllocs()
b.SetBytes(int64(len(pkts[0])))
b.ResetTimer()
for i := 0; i < b.N; i++ {
pkt := pkts[i%len(pkts)]
var ok bool
if v6 {
ok = parseV6InboundBaseline(pkt, &fp)
} else {
ok = parseV4InboundBaseline(pkt, &fp)
}
if !ok {
b.Fatal("baseline parse failed")
}
if err := m.Commit(pkt); err != nil {
b.Fatal(err)
}
if (i+1)%batchSize == 0 {
if err := m.Flush(); err != nil {
b.Fatal(err)
}
}
}
_ = m.Flush()
}
// runRxUnified drives the unified path: ParseInbound walks once, filling
// the conntrack key + coalescer hint in parsed; CommitInbound dispatches
// without re-parsing.
func runRxUnified(b *testing.B, pkts [][]byte, batchSize int) {
b.Helper()
m := NewMultiCoalescer(nopTunWriter{}, true, true)
var parsed RxParsed
b.ReportAllocs()
b.SetBytes(int64(len(pkts[0])))
b.ResetTimer()
for i := 0; i < b.N; i++ {
pkt := pkts[i%len(pkts)]
if err := ParseInbound(pkt, &parsed); err != nil {
b.Fatal(err)
}
if err := m.CommitInbound(pkt, &parsed); err != nil {
b.Fatal(err)
}
if (i+1)%batchSize == 0 {
if err := m.Flush(); err != nil {
b.Fatal(err)
}
}
}
_ = m.Flush()
}
// buildUDPv4Bulk returns N UDP packets on a single 5-tuple suitable for the
// UDP coalescer's append path.
func buildUDPv4Bulk(n, payloadLen int) [][]byte {
pkts := make([][]byte, n)
pay := make([]byte, payloadLen)
for i := range n {
pkts[i] = buildUDPv4(1000, 53, pay)
}
return pkts
}
func buildTCPv6Bulk(n, payloadLen int) [][]byte {
pkts := make([][]byte, n)
pay := make([]byte, payloadLen)
seq := uint32(1000)
for i := range n {
pkts[i] = buildTCPv6(0, seq, tcpAck, pay)
seq += uint32(payloadLen)
}
return pkts
}
func buildICMPv4Bulk(n int) [][]byte {
pkts := make([][]byte, n)
for i := range pkts {
pkts[i] = buildICMPv4()
}
return pkts
}
// === TCPv4 ===
func BenchmarkRxSplitTCPv4(b *testing.B) {
pkts := buildTCPv4BulkFlow(tcpCoalesceMaxSegs, 1200)
runRxSplit(b, pkts, tcpCoalesceMaxSegs, false)
}
func BenchmarkRxUnifiedTCPv4(b *testing.B) {
pkts := buildTCPv4BulkFlow(tcpCoalesceMaxSegs, 1200)
runRxUnified(b, pkts, tcpCoalesceMaxSegs)
}
// === TCPv4 interleaved (4 flows) ===
func BenchmarkRxSplitTCPv4Interleaved4(b *testing.B) {
pkts := buildTCPv4Interleaved(4, tcpCoalesceMaxSegs, 1200)
runRxSplit(b, pkts, len(pkts), false)
}
func BenchmarkRxUnifiedTCPv4Interleaved4(b *testing.B) {
pkts := buildTCPv4Interleaved(4, tcpCoalesceMaxSegs, 1200)
runRxUnified(b, pkts, len(pkts))
}
// === UDPv4 ===
func BenchmarkRxSplitUDPv4(b *testing.B) {
pkts := buildUDPv4Bulk(udpCoalesceMaxSegs, 1200)
runRxSplit(b, pkts, udpCoalesceMaxSegs, false)
}
func BenchmarkRxUnifiedUDPv4(b *testing.B) {
pkts := buildUDPv4Bulk(udpCoalesceMaxSegs, 1200)
runRxUnified(b, pkts, udpCoalesceMaxSegs)
}
// === TCPv6 ===
func BenchmarkRxSplitTCPv6(b *testing.B) {
pkts := buildTCPv6Bulk(tcpCoalesceMaxSegs, 1200)
runRxSplit(b, pkts, tcpCoalesceMaxSegs, true)
}
func BenchmarkRxUnifiedTCPv6(b *testing.B) {
pkts := buildTCPv6Bulk(tcpCoalesceMaxSegs, 1200)
runRxUnified(b, pkts, tcpCoalesceMaxSegs)
}
// === ICMPv4 (passthrough): measures the unified parser on the coalescer-
// rejected path, where both the split and unified paths must still fill fp. ===
func BenchmarkRxSplitICMPv4(b *testing.B) {
pkts := buildICMPv4Bulk(64)
runRxSplit(b, pkts, 64, false)
}
func BenchmarkRxUnifiedICMPv4(b *testing.B) {
pkts := buildICMPv4Bulk(64)
runRxUnified(b, pkts, 64)
}
// === Firewall fast-path (conntrack-hit) — exercises the savings from the
// dense PacketKey: smaller hash key for the per-routine ConntrackCache,
// and skipping the AddrFrom4 calls that the old path needed to fill the
// netip.Addr-rich firewall.Packet up-front. ===
//
// The "split" baseline simulates the legacy path: parseV4InboundBaseline
// fills a netip.Addr-rich Packet, then we probe a localCache keyed on
// Packet. The "unified" path: ParseInbound fills only the dense PacketKey,
// and we probe a localCache keyed on PacketKey. Both paths follow with
// the coalescer Commit so the bench captures end-to-end RX-side cost.
// runRxSplitWithCache mirrors runRxSplit but runs the legacy-style
// firewall fast path (localCache keyed on firewall.Packet) on every
// packet so we can compare against the unified path.
func runRxSplitWithCache(b *testing.B, pkts [][]byte, batchSize int) {
b.Helper()
m := NewMultiCoalescer(nopTunWriter{}, true, true)
var fp firewall.Packet
// Pre-warm the cache with each packet's netip.Addr-rich Packet key so the
// timed loop always takes the conntrack-hit path.
cache := make(map[firewall.Packet]struct{}, len(pkts))
for _, pkt := range pkts {
var seedFp firewall.Packet
if !parseV4InboundBaseline(pkt, &seedFp) {
b.Fatal("seed parse failed")
}
cache[seedFp] = struct{}{}
}
b.ReportAllocs()
b.SetBytes(int64(len(pkts[0])))
b.ResetTimer()
for i := 0; i < b.N; i++ {
pkt := pkts[i%len(pkts)]
if !parseV4InboundBaseline(pkt, &fp) {
b.Fatal("baseline parse failed")
}
if _, ok := cache[fp]; !ok {
b.Fatal("cache miss")
}
if err := m.Commit(pkt); err != nil {
b.Fatal(err)
}
if (i+1)%batchSize == 0 {
if err := m.Flush(); err != nil {
b.Fatal(err)
}
}
}
_ = m.Flush()
}
// runRxUnifiedWithCache: unified path with a PacketKey-keyed localCache.
// Each iteration: ParseInbound → conntrack-cache hit → CommitInbound.
func runRxUnifiedWithCache(b *testing.B, pkts [][]byte, batchSize int) {
b.Helper()
m := NewMultiCoalescer(nopTunWriter{}, true, true)
var parsed RxParsed
cache := make(firewall.ConntrackCache, len(pkts))
for _, pkt := range pkts {
var seed RxParsed
if err := ParseInbound(pkt, &seed); err != nil {
b.Fatal(err)
}
cache[seed.Key] = struct{}{}
}
b.ReportAllocs()
b.SetBytes(int64(len(pkts[0])))
b.ResetTimer()
for i := 0; i < b.N; i++ {
pkt := pkts[i%len(pkts)]
if err := ParseInbound(pkt, &parsed); err != nil {
b.Fatal(err)
}
if _, ok := cache[parsed.Key]; !ok {
b.Fatal("cache miss")
}
if err := m.CommitInbound(pkt, &parsed); err != nil {
b.Fatal(err)
}
if (i+1)%batchSize == 0 {
if err := m.Flush(); err != nil {
b.Fatal(err)
}
}
}
_ = m.Flush()
}
func BenchmarkRxSplitTCPv4WithCache(b *testing.B) {
pkts := buildTCPv4BulkFlow(tcpCoalesceMaxSegs, 1200)
runRxSplitWithCache(b, pkts, tcpCoalesceMaxSegs)
}
func BenchmarkRxUnifiedTCPv4WithCache(b *testing.B) {
pkts := buildTCPv4BulkFlow(tcpCoalesceMaxSegs, 1200)
runRxUnifiedWithCache(b, pkts, tcpCoalesceMaxSegs)
}
func BenchmarkRxSplitInterleaved4WithCache(b *testing.B) {
pkts := buildTCPv4Interleaved(4, tcpCoalesceMaxSegs, 1200)
runRxSplitWithCache(b, pkts, len(pkts))
}
func BenchmarkRxUnifiedInterleaved4WithCache(b *testing.B) {
pkts := buildTCPv4Interleaved(4, tcpCoalesceMaxSegs, 1200)
runRxUnifiedWithCache(b, pkts, len(pkts))
}
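
// To compare the two paths, a run along these lines works (the flags here
// are a suggestion, not something the suite depends on):
//
//	go test -run '^$' -bench 'BenchmarkRx(Split|Unified)' -benchmem
//
// Split/Unified pairs for the same traffic shape can then be read side by
// side, or fed to benchstat after normalizing the benchmark name prefixes.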