checkpoint: try to parse packets only once, pt 2

JackDoan
2026-05-07 11:26:17 -05:00
parent 0375aff451
commit 5bdf645b0b
10 changed files with 1150 additions and 92 deletions

@@ -0,0 +1,394 @@
package batch

import (
	"encoding/binary"
	"net/netip"
	"testing"

	"github.com/slackhq/nebula/firewall"
)
// parseV4InboundBaseline mirrors what outside.go's parseV4(incoming=true)
// does, so the "split" bench measures the *current* state: firewall-side
// parse, then m.Commit re-parses inside the coalescer. Two walks per
// packet. Kept faithful in shape (one read per field, AddrFromSlice for
// the addrs) so the CPU profile matches the production parseV4.
func parseV4InboundBaseline(pkt []byte, fp *firewall.Packet) bool {
if len(pkt) < 20 {
return false
}
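// IHL counts 32-bit words; shifting left by 2 gives the header length in bytes.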
ihl := int(pkt[0]&0x0f) << 2
if ihl < 20 {
return false
}
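// The low 13 bits of the flags/frag-offset word are the fragment offset;
// nonzero means a non-first fragment, which carries no L4 header to read.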
flagsfrags := binary.BigEndian.Uint16(pkt[6:8])
fp.Fragment = (flagsfrags & 0x1FFF) != 0
fp.Protocol = pkt[9]
minLen := ihl
if !fp.Fragment {
if fp.Protocol == firewall.ProtoICMP {
minLen += 4 + 2
} else {
minLen += 4
}
}
if len(pkt) < minLen {
return false
}
fp.RemoteAddr, _ = netip.AddrFromSlice(pkt[12:16])
fp.LocalAddr, _ = netip.AddrFromSlice(pkt[16:20])
switch {
case fp.Fragment:
fp.RemotePort = 0
fp.LocalPort = 0
case fp.Protocol == firewall.ProtoICMP:
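// ICMP has no ports; the echo identifier at ihl+4 stands in as the remote port.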
fp.RemotePort = binary.BigEndian.Uint16(pkt[ihl+4 : ihl+6])
fp.LocalPort = 0
default:
fp.RemotePort = binary.BigEndian.Uint16(pkt[ihl : ihl+2])
fp.LocalPort = binary.BigEndian.Uint16(pkt[ihl+2 : ihl+4])
}
return true
}
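
// TestParseV4InboundBaselineSmoke is an illustrative sketch rather than part
// of the benchmark suite: it hand-builds a minimal 28-byte IPv4+UDP packet
// and checks that the baseline parser fills every field from the expected
// offsets.
func TestParseV4InboundBaselineSmoke(t *testing.T) {
	pkt := make([]byte, 28)
	pkt[0] = 0x45                                // version 4, IHL 5 (20-byte header)
	pkt[9] = firewall.ProtoUDP                   // protocol
	copy(pkt[12:16], []byte{10, 0, 0, 1})        // source address
	copy(pkt[16:20], []byte{10, 0, 0, 2})        // destination address
	binary.BigEndian.PutUint16(pkt[20:22], 4242) // source port
	binary.BigEndian.PutUint16(pkt[22:24], 53)   // destination port
	var fp firewall.Packet
	if !parseV4InboundBaseline(pkt, &fp) {
		t.Fatal("parse failed")
	}
	if fp.RemoteAddr != netip.AddrFrom4([4]byte{10, 0, 0, 1}) || fp.RemotePort != 4242 {
		t.Fatalf("remote mismatch: %v:%d", fp.RemoteAddr, fp.RemotePort)
	}
	if fp.LocalAddr != netip.AddrFrom4([4]byte{10, 0, 0, 2}) || fp.LocalPort != 53 {
		t.Fatalf("local mismatch: %v:%d", fp.LocalAddr, fp.LocalPort)
	}
	if fp.Protocol != firewall.ProtoUDP || fp.Fragment {
		t.Fatalf("proto/fragment mismatch: %d %v", fp.Protocol, fp.Fragment)
	}
}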
// parseV6InboundBaseline is the v6 analogue: replicates parseV6's
// extension-header walk so the split bench captures its true cost.
func parseV6InboundBaseline(pkt []byte, fp *firewall.Packet) bool {
dataLen := len(pkt)
if dataLen < 40 {
return false
}
fp.RemoteAddr, _ = netip.AddrFromSlice(pkt[8:24])
fp.LocalAddr, _ = netip.AddrFromSlice(pkt[24:40])
protoAt := 6
offset := 40
next := 0
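// Walk the extension-header chain: protoAt indexes the Next Header byte
// naming the header that starts at offset. Each pass either returns on a
// terminal header or advances both indices past the current one.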
for {
if protoAt >= dataLen {
return false
}
proto := pkt[protoAt]
switch proto {
case ipProtoESP, ipProtoNoNextHdr:
fp.Protocol = proto
fp.RemotePort = 0
fp.LocalPort = 0
fp.Fragment = false
return true
case ipProtoICMPv6:
if dataLen < offset+6 {
return false
}
fp.Protocol = proto
fp.LocalPort = 0
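// Echo request/reply carry an identifier at offset+4; using it as the
// remote port gives each echo flow its own conntrack entry.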
switch pkt[offset] { // ICMPv6 type is the first byte of the ICMPv6 header
case icmpv6TypeEchoRequest, icmpv6TypeEchoReply:
fp.RemotePort = binary.BigEndian.Uint16(pkt[offset+4 : offset+6])
default:
fp.RemotePort = 0
}
fp.Fragment = false
return true
case ipProtoTCP, ipProtoUDP:
if dataLen < offset+4 {
return false
}
fp.Protocol = proto
fp.RemotePort = binary.BigEndian.Uint16(pkt[offset : offset+2])
fp.LocalPort = binary.BigEndian.Uint16(pkt[offset+2 : offset+4])
fp.Fragment = false
return true
case ipProtoIPv6Fragment:
if dataLen < offset+8 {
return false
}
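// Fragment header: bytes 2-3 hold the 13-bit fragment offset, with the
// low 3 bits occupied by the Res/M flags, so mask those off before testing.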
fragmentOffset := binary.BigEndian.Uint16(pkt[offset+2:offset+4]) &^ uint16(0x7)
if fragmentOffset != 0 {
fp.Protocol = pkt[offset]
fp.Fragment = true
fp.RemotePort = 0
fp.LocalPort = 0
return true
}
next = 8
case ipProtoAH:
if dataLen <= offset+1 {
return false
}
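// AH's Payload Len is in 4-octet units minus 2, so the full header spans
// (len+2)*4 bytes.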
next = int(pkt[offset+1]+2) << 2
default:
if dataLen <= offset+1 {
return false
}
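// Other extension headers encode Hdr Ext Len in 8-octet units, excluding
// the first 8 octets: (len+1)*8 bytes total.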
next = int(pkt[offset+1]+1) << 3
}
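// A malformed (or byte-wrapped) length field could stall the walk; force
// at least the 8-byte minimum step so offset always advances.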
if next <= 0 {
next = 8
}
protoAt = offset
offset = offset + next
}
}
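
// TestParseV6InboundBaselineSmoke is an illustrative sketch: a plain
// IPv6+TCP packet with no extension headers, exercising the walk's simplest
// terminating case.
func TestParseV6InboundBaselineSmoke(t *testing.T) {
	pkt := make([]byte, 60)
	pkt[0] = 0x60       // version 6
	pkt[6] = ipProtoTCP // Next Header goes straight to TCP
	pkt[23] = 1         // source ::1
	pkt[39] = 2         // destination ::2
	binary.BigEndian.PutUint16(pkt[40:42], 443) // source port
	binary.BigEndian.PutUint16(pkt[42:44], 80)  // destination port
	var fp firewall.Packet
	if !parseV6InboundBaseline(pkt, &fp) {
		t.Fatal("parse failed")
	}
	if fp.Protocol != ipProtoTCP || fp.RemotePort != 443 || fp.LocalPort != 80 || fp.Fragment {
		t.Fatalf("unexpected parse result: %+v", fp)
	}
	if fp.RemoteAddr != netip.MustParseAddr("::1") || fp.LocalAddr != netip.MustParseAddr("::2") {
		t.Fatalf("addr mismatch: %v -> %v", fp.RemoteAddr, fp.LocalAddr)
	}
}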
// runRxSplit drives the split path: faithful inbound parse for the firewall
// side, then m.Commit re-parses to coalesce. v6 controls which baseline
// parser we run.
func runRxSplit(b *testing.B, pkts [][]byte, batchSize int, v6 bool) {
b.Helper()
m := NewMultiCoalescer(nopTunWriter{}, true, true)
var fp firewall.Packet
b.ReportAllocs()
b.SetBytes(int64(len(pkts[0])))
b.ResetTimer()
for i := 0; i < b.N; i++ {
pkt := pkts[i%len(pkts)]
var ok bool
if v6 {
ok = parseV6InboundBaseline(pkt, &fp)
} else {
ok = parseV4InboundBaseline(pkt, &fp)
}
if !ok {
b.Fatal("baseline parse failed")
}
if err := m.Commit(pkt); err != nil {
b.Fatal(err)
}
if (i+1)%batchSize == 0 {
if err := m.Flush(); err != nil {
b.Fatal(err)
}
}
}
_ = m.Flush()
}
// runRxUnified drives the unified path: ParseInbound walks once, filling
// the conntrack key + coalescer hint in parsed; CommitInbound dispatches
// without re-parsing.
func runRxUnified(b *testing.B, pkts [][]byte, batchSize int) {
b.Helper()
m := NewMultiCoalescer(nopTunWriter{}, true, true)
var parsed RxParsed
b.ReportAllocs()
b.SetBytes(int64(len(pkts[0])))
b.ResetTimer()
for i := 0; i < b.N; i++ {
pkt := pkts[i%len(pkts)]
if err := ParseInbound(pkt, &parsed); err != nil {
b.Fatal(err)
}
if err := m.CommitInbound(pkt, &parsed); err != nil {
b.Fatal(err)
}
if (i+1)%batchSize == 0 {
if err := m.Flush(); err != nil {
b.Fatal(err)
}
}
}
_ = m.Flush()
}
// buildUDPv4Bulk returns N UDP packets on a single 5-tuple suitable for the
// UDP coalescer's append path.
func buildUDPv4Bulk(n, payloadLen int) [][]byte {
pkts := make([][]byte, n)
pay := make([]byte, payloadLen)
for i := range n {
pkts[i] = buildUDPv4(1000, 53, pay)
}
return pkts
}
func buildTCPv6Bulk(n, payloadLen int) [][]byte {
pkts := make([][]byte, n)
pay := make([]byte, payloadLen)
seq := uint32(1000)
for i := range n {
pkts[i] = buildTCPv6(0, seq, tcpAck, pay)
seq += uint32(payloadLen)
}
return pkts
}
func buildICMPv4Bulk(n int) [][]byte {
pkts := make([][]byte, n)
for i := range pkts {
pkts[i] = buildICMPv4()
}
return pkts
}
// === TCPv4 ===
func BenchmarkRxSplitTCPv4(b *testing.B) {
pkts := buildTCPv4BulkFlow(tcpCoalesceMaxSegs, 1200)
runRxSplit(b, pkts, tcpCoalesceMaxSegs, false)
}
func BenchmarkRxUnifiedTCPv4(b *testing.B) {
pkts := buildTCPv4BulkFlow(tcpCoalesceMaxSegs, 1200)
runRxUnified(b, pkts, tcpCoalesceMaxSegs)
}
// === TCPv4 interleaved (4 flows) ===
func BenchmarkRxSplitTCPv4Interleaved4(b *testing.B) {
pkts := buildTCPv4Interleaved(4, tcpCoalesceMaxSegs, 1200)
runRxSplit(b, pkts, len(pkts), false)
}
func BenchmarkRxUnifiedTCPv4Interleaved4(b *testing.B) {
pkts := buildTCPv4Interleaved(4, tcpCoalesceMaxSegs, 1200)
runRxUnified(b, pkts, len(pkts))
}
// === UDPv4 ===
func BenchmarkRxSplitUDPv4(b *testing.B) {
pkts := buildUDPv4Bulk(udpCoalesceMaxSegs, 1200)
runRxSplit(b, pkts, udpCoalesceMaxSegs, false)
}
func BenchmarkRxUnifiedUDPv4(b *testing.B) {
pkts := buildUDPv4Bulk(udpCoalesceMaxSegs, 1200)
runRxUnified(b, pkts, udpCoalesceMaxSegs)
}
// === TCPv6 ===
func BenchmarkRxSplitTCPv6(b *testing.B) {
pkts := buildTCPv6Bulk(tcpCoalesceMaxSegs, 1200)
runRxSplit(b, pkts, tcpCoalesceMaxSegs, true)
}
func BenchmarkRxUnifiedTCPv6(b *testing.B) {
pkts := buildTCPv6Bulk(tcpCoalesceMaxSegs, 1200)
runRxUnified(b, pkts, tcpCoalesceMaxSegs)
}
// === ICMPv4 (passthrough): measures the unified parser on the coalescer-
// rejected path, where both the split and unified paths must still fill fp. ===
func BenchmarkRxSplitICMPv4(b *testing.B) {
pkts := buildICMPv4Bulk(64)
runRxSplit(b, pkts, 64, false)
}
func BenchmarkRxUnifiedICMPv4(b *testing.B) {
pkts := buildICMPv4Bulk(64)
runRxUnified(b, pkts, 64)
}
// === Firewall fast-path (conntrack-hit) — exercises the savings from the
// dense PacketKey: smaller hash key for the per-routine ConntrackCache,
// and skipping the AddrFrom4 calls that the old path needed to fill the
// netip.Addr-rich firewall.Packet up-front. ===
//
// The "split" baseline simulates the legacy path: parseV4InboundBaseline
// fills a netip.Addr-rich Packet, then we probe a localCache keyed on
// Packet. The "unified" path: ParseInbound fills only the dense PacketKey,
// and we probe a localCache keyed on PacketKey. Both paths follow with
// the coalescer Commit so the bench captures end-to-end RX-side cost.
// runRxSplitWithCache mirrors runRxSplit but runs the legacy-style
// firewall fast path (localCache keyed on firewall.Packet) on every
// packet so we can compare against the unified path.
func runRxSplitWithCache(b *testing.B, pkts [][]byte, batchSize int) {
b.Helper()
m := NewMultiCoalescer(nopTunWriter{}, true, true)
var fp firewall.Packet
// Pre-warm the cache with each packet's netip.Addr-rich Packet key so the
// timed loop always takes the conntrack-hit path.
cache := make(map[firewall.Packet]struct{}, len(pkts))
for _, pkt := range pkts {
var seedFp firewall.Packet
if !parseV4InboundBaseline(pkt, &seedFp) {
b.Fatal("seed parse failed")
}
cache[seedFp] = struct{}{}
}
b.ReportAllocs()
b.SetBytes(int64(len(pkts[0])))
b.ResetTimer()
for i := 0; i < b.N; i++ {
pkt := pkts[i%len(pkts)]
if !parseV4InboundBaseline(pkt, &fp) {
b.Fatal("baseline parse failed")
}
if _, ok := cache[fp]; !ok {
b.Fatal("cache miss")
}
if err := m.Commit(pkt); err != nil {
b.Fatal(err)
}
if (i+1)%batchSize == 0 {
if err := m.Flush(); err != nil {
b.Fatal(err)
}
}
}
_ = m.Flush()
}
// runRxUnifiedWithCache: unified path with a PacketKey-keyed localCache.
// Each iteration: ParseInbound → conntrack-cache hit → CommitInbound.
func runRxUnifiedWithCache(b *testing.B, pkts [][]byte, batchSize int) {
b.Helper()
m := NewMultiCoalescer(nopTunWriter{}, true, true)
var parsed RxParsed
cache := make(firewall.ConntrackCache, len(pkts))
for _, pkt := range pkts {
var seed RxParsed
if err := ParseInbound(pkt, &seed); err != nil {
b.Fatal(err)
}
cache[seed.Key] = struct{}{}
}
b.ReportAllocs()
b.SetBytes(int64(len(pkts[0])))
b.ResetTimer()
for i := 0; i < b.N; i++ {
pkt := pkts[i%len(pkts)]
if err := ParseInbound(pkt, &parsed); err != nil {
b.Fatal(err)
}
if _, ok := cache[parsed.Key]; !ok {
b.Fatal("cache miss")
}
if err := m.CommitInbound(pkt, &parsed); err != nil {
b.Fatal(err)
}
if (i+1)%batchSize == 0 {
if err := m.Flush(); err != nil {
b.Fatal(err)
}
}
}
_ = m.Flush()
}
func BenchmarkRxSplitTCPv4WithCache(b *testing.B) {
pkts := buildTCPv4BulkFlow(tcpCoalesceMaxSegs, 1200)
runRxSplitWithCache(b, pkts, tcpCoalesceMaxSegs)
}
func BenchmarkRxUnifiedTCPv4WithCache(b *testing.B) {
pkts := buildTCPv4BulkFlow(tcpCoalesceMaxSegs, 1200)
runRxUnifiedWithCache(b, pkts, tcpCoalesceMaxSegs)
}
func BenchmarkRxSplitInterleaved4WithCache(b *testing.B) {
pkts := buildTCPv4Interleaved(4, tcpCoalesceMaxSegs, 1200)
runRxSplitWithCache(b, pkts, len(pkts))
}
func BenchmarkRxUnifiedInterleaved4WithCache(b *testing.B) {
pkts := buildTCPv4Interleaved(4, tcpCoalesceMaxSegs, 1200)
runRxUnifiedWithCache(b, pkts, len(pkts))
}
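
// To compare the two paths, a run along these lines works (the flags here
// are a suggestion, not something the suite depends on):
//
//	go test -run '^$' -bench 'BenchmarkRx(Split|Unified)' -benchmem
//
// Split/Unified pairs for the same traffic shape can then be read side by
// side, or fed to benchstat after normalizing the benchmark name prefixes.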