package batch

import (
	"encoding/binary"
	"net/netip"
	"testing"

	"github.com/slackhq/nebula/firewall"
)

// parseV4InboundBaseline mirrors what outside.go's parseV4(incoming=true)
// does, so the "split" bench measures the *current* state: firewall-side
// parse, then m.Commit re-parses inside the coalescer. Two walks per
// packet. Kept faithful in shape (one read per field, AddrFromSlice for
// the addrs) so the CPU profile matches the production parseV4.
func parseV4InboundBaseline(pkt []byte, fp *firewall.Packet) bool {
	if len(pkt) < 20 {
		return false
	}
	ihl := int(pkt[0]&0x0f) << 2
	if ihl < 20 {
		return false
	}
	flagsfrags := binary.BigEndian.Uint16(pkt[6:8])
	fp.Fragment = (flagsfrags & 0x1FFF) != 0
	fp.Protocol = pkt[9]
	minLen := ihl
	if !fp.Fragment {
		if fp.Protocol == firewall.ProtoICMP {
			minLen += 4 + 2
		} else {
			minLen += 4
		}
	}
	if len(pkt) < minLen {
		return false
	}
	fp.RemoteAddr, _ = netip.AddrFromSlice(pkt[12:16])
	fp.LocalAddr, _ = netip.AddrFromSlice(pkt[16:20])
	switch {
	case fp.Fragment:
		fp.RemotePort = 0
		fp.LocalPort = 0
	case fp.Protocol == firewall.ProtoICMP:
		fp.RemotePort = binary.BigEndian.Uint16(pkt[ihl+4 : ihl+6])
		fp.LocalPort = 0
	default:
		fp.RemotePort = binary.BigEndian.Uint16(pkt[ihl : ihl+2])
		fp.LocalPort = binary.BigEndian.Uint16(pkt[ihl+2 : ihl+4])
	}
	return true
}
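
// Illustrative only, not used by the benchmarks: a sketch of the smallest
// IPv4/UDP packet parseV4InboundBaseline accepts. Offsets are the standard
// IPv4 ones: version/IHL in byte 0, protocol in byte 9, src/dst addresses
// at bytes 12..20, and the L4 ports immediately after the option-free
// 20-byte header. The function name is ours; firewall.ProtoUDP comes from
// the firewall package.
func exampleMinimalV4UDP() []byte {
	pkt := make([]byte, 20+8) // 20-byte IPv4 header + 8-byte UDP header
	pkt[0] = 0x45             // version 4, IHL 5 words (no options)
	pkt[9] = firewall.ProtoUDP
	copy(pkt[12:16], []byte{10, 0, 0, 1})        // src -> fp.RemoteAddr
	copy(pkt[16:20], []byte{10, 0, 0, 2})        // dst -> fp.LocalAddr
	binary.BigEndian.PutUint16(pkt[20:22], 1000) // src port -> fp.RemotePort
	binary.BigEndian.PutUint16(pkt[22:24], 53)   // dst port -> fp.LocalPort
	return pkt
}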

// parseV6InboundBaseline is the v6 analogue: it replicates parseV6's
// extension-header walk so the split bench captures its true cost.
func parseV6InboundBaseline(pkt []byte, fp *firewall.Packet) bool {
	dataLen := len(pkt)
	if dataLen < 40 {
		return false
	}
	fp.RemoteAddr, _ = netip.AddrFromSlice(pkt[8:24])
	fp.LocalAddr, _ = netip.AddrFromSlice(pkt[24:40])

	protoAt := 6
	offset := 40
	next := 0
	for {
		if protoAt >= dataLen {
			return false
		}
		proto := pkt[protoAt]
		switch proto {
		case ipProtoESP, ipProtoNoNextHdr:
			fp.Protocol = proto
			fp.RemotePort = 0
			fp.LocalPort = 0
			fp.Fragment = false
			return true
		case ipProtoICMPv6:
			if dataLen < offset+6 {
				return false
			}
			fp.Protocol = proto
			fp.LocalPort = 0
			switch pkt[offset+1] {
			case icmpv6TypeEchoRequest, icmpv6TypeEchoReply:
				fp.RemotePort = binary.BigEndian.Uint16(pkt[offset+4 : offset+6])
			default:
				fp.RemotePort = 0
			}
			fp.Fragment = false
			return true
		case ipProtoTCP, ipProtoUDP:
			if dataLen < offset+4 {
				return false
			}
			fp.Protocol = proto
			fp.RemotePort = binary.BigEndian.Uint16(pkt[offset : offset+2])
			fp.LocalPort = binary.BigEndian.Uint16(pkt[offset+2 : offset+4])
			fp.Fragment = false
			return true
		case ipProtoIPv6Fragment:
			if dataLen < offset+8 {
				return false
			}
			fragmentOffset := binary.BigEndian.Uint16(pkt[offset+2:offset+4]) &^ uint16(0x7)
			if fragmentOffset != 0 {
				fp.Protocol = pkt[offset]
				fp.Fragment = true
				fp.RemotePort = 0
				fp.LocalPort = 0
				return true
			}
			next = 8
		case ipProtoAH:
			if dataLen <= offset+1 {
				return false
			}
			// Convert to int before the arithmetic so a large length byte
			// cannot wrap around in uint8 space.
			next = (int(pkt[offset+1]) + 2) << 2
		default:
			if dataLen <= offset+1 {
				return false
			}
			next = (int(pkt[offset+1]) + 1) << 3
		}
		if next <= 0 {
			next = 8
		}
		protoAt = offset
		offset += next
	}
}
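
// Illustrative only, not used by the benchmarks: a sketch of the smallest
// IPv6/TCP packet parseV6InboundBaseline accepts with no extension
// headers. The fixed 40-byte IPv6 header keeps the first next-header byte
// at offset 6 (where the walk starts) and the addresses at bytes 8..40,
// so the TCP ports land at bytes 40..44. The function name is ours;
// ipProtoTCP is the package constant the parser above already uses.
func exampleMinimalV6TCP() []byte {
	pkt := make([]byte, 40+20) // 40-byte IPv6 header + minimal TCP header
	pkt[0] = 0x60              // version 6
	pkt[6] = ipProtoTCP        // next header: TCP, read at protoAt == 6
	pkt[8] = 0xfd              // first byte of src -> fp.RemoteAddr
	pkt[24] = 0xfd             // first byte of dst -> fp.LocalAddr
	binary.BigEndian.PutUint16(pkt[40:42], 1000) // src port -> fp.RemotePort
	binary.BigEndian.PutUint16(pkt[42:44], 443)  // dst port -> fp.LocalPort
	return pkt
}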

// runRxSplit drives the split path: faithful inbound parse for the firewall
// side, then m.Commit re-parses to coalesce. v6 controls which baseline
// parser we run.
func runRxSplit(b *testing.B, pkts [][]byte, batchSize int, v6 bool) {
	b.Helper()
	m := NewMultiCoalescer(nopTunWriter{}, true, true)
	var fp firewall.Packet
	b.ReportAllocs()
	b.SetBytes(int64(len(pkts[0])))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		pkt := pkts[i%len(pkts)]
		var ok bool
		if v6 {
			ok = parseV6InboundBaseline(pkt, &fp)
		} else {
			ok = parseV4InboundBaseline(pkt, &fp)
		}
		if !ok {
			b.Fatal("baseline parse failed")
		}
		if err := m.Commit(pkt); err != nil {
			b.Fatal(err)
		}
		if (i+1)%batchSize == 0 {
			if err := m.Flush(); err != nil {
				b.Fatal(err)
			}
		}
	}
	_ = m.Flush()
}

// runRxUnified drives the unified path: ParsePacket (incoming=true) walks
// the packet once, filling the conntrack key + coalescer hint in parsed;
// CommitInbound then dispatches without re-parsing.
func runRxUnified(b *testing.B, pkts [][]byte, batchSize int) {
	b.Helper()
	m := NewMultiCoalescer(nopTunWriter{}, true, true)
	var parsed RxParsed
	b.ReportAllocs()
	b.SetBytes(int64(len(pkts[0])))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		pkt := pkts[i%len(pkts)]
		if err := ParsePacket(pkt, true, &parsed); err != nil {
			b.Fatal(err)
		}
		if err := m.CommitInbound(pkt, &parsed); err != nil {
			b.Fatal(err)
		}
		if (i+1)%batchSize == 0 {
			if err := m.Flush(); err != nil {
				b.Fatal(err)
			}
		}
	}
	_ = m.Flush()
}
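
// A typical side-by-side run of the two paths; the regex and package path
// below are only an example, adjust the path to wherever this package
// lives in your tree:
//
//	go test -run '^$' -bench 'BenchmarkRx(Split|Unified)' -benchmem ./batch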

// buildUDPv4Bulk returns n UDP packets on a single 5-tuple suitable for the
// UDP coalescer's append path.
func buildUDPv4Bulk(n, payloadLen int) [][]byte {
	pkts := make([][]byte, n)
	pay := make([]byte, payloadLen)
	for i := range n {
		pkts[i] = buildUDPv4(1000, 53, pay)
	}
	return pkts
}

func buildTCPv6Bulk(n, payloadLen int) [][]byte {
	pkts := make([][]byte, n)
	pay := make([]byte, payloadLen)
	seq := uint32(1000)
	for i := range n {
		pkts[i] = buildTCPv6(0, seq, tcpAck, pay)
		seq += uint32(payloadLen)
	}
	return pkts
}

func buildICMPv4Bulk(n int) [][]byte {
	pkts := make([][]byte, n)
	for i := range pkts {
		pkts[i] = buildICMPv4()
	}
	return pkts
}

// === TCPv4 ===

func BenchmarkRxSplitTCPv4(b *testing.B) {
	pkts := buildTCPv4BulkFlow(tcpCoalesceMaxSegs, 1200)
	runRxSplit(b, pkts, tcpCoalesceMaxSegs, false)
}

func BenchmarkRxUnifiedTCPv4(b *testing.B) {
	pkts := buildTCPv4BulkFlow(tcpCoalesceMaxSegs, 1200)
	runRxUnified(b, pkts, tcpCoalesceMaxSegs)
}

// === TCPv4 interleaved (4 flows) ===

func BenchmarkRxSplitTCPv4Interleaved4(b *testing.B) {
	pkts := buildTCPv4Interleaved(4, tcpCoalesceMaxSegs, 1200)
	runRxSplit(b, pkts, len(pkts), false)
}

func BenchmarkRxUnifiedTCPv4Interleaved4(b *testing.B) {
	pkts := buildTCPv4Interleaved(4, tcpCoalesceMaxSegs, 1200)
	runRxUnified(b, pkts, len(pkts))
}

// === UDPv4 ===

func BenchmarkRxSplitUDPv4(b *testing.B) {
	pkts := buildUDPv4Bulk(udpCoalesceMaxSegs, 1200)
	runRxSplit(b, pkts, udpCoalesceMaxSegs, false)
}

func BenchmarkRxUnifiedUDPv4(b *testing.B) {
	pkts := buildUDPv4Bulk(udpCoalesceMaxSegs, 1200)
	runRxUnified(b, pkts, udpCoalesceMaxSegs)
}

// === TCPv6 ===

func BenchmarkRxSplitTCPv6(b *testing.B) {
	pkts := buildTCPv6Bulk(tcpCoalesceMaxSegs, 1200)
	runRxSplit(b, pkts, tcpCoalesceMaxSegs, true)
}

func BenchmarkRxUnifiedTCPv6(b *testing.B) {
	pkts := buildTCPv6Bulk(tcpCoalesceMaxSegs, 1200)
	runRxUnified(b, pkts, tcpCoalesceMaxSegs)
}

// === ICMPv4 (passthrough): measures the unified parser on the
// coalescer-rejected path, where both the lenient and unified parsers
// must still fill fp. ===

func BenchmarkRxSplitICMPv4(b *testing.B) {
	pkts := buildICMPv4Bulk(64)
	runRxSplit(b, pkts, 64, false)
}

func BenchmarkRxUnifiedICMPv4(b *testing.B) {
	pkts := buildICMPv4Bulk(64)
	runRxUnified(b, pkts, 64)
}

// === Firewall fast-path (conntrack-hit): exercises the savings from the
// dense PacketKey, i.e. a smaller hash key for the per-routine
// ConntrackCache, plus skipping the AddrFrom4 calls the old path needed
// to fill the netip.Addr-rich firewall.Packet up-front. ===
//
// The "split" baseline simulates the legacy path: parseV4InboundBaseline
// fills a netip.Addr-rich Packet, then we probe a localCache keyed on
// Packet. The "unified" path: ParsePacket fills only the dense PacketKey,
// and we probe a localCache keyed on PacketKey. Both paths then run the
// coalescer Commit so the bench captures the end-to-end RX-side cost.
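
// Hypothetical shapes only: neither type below is the real firewall.Packet
// or PacketKey, whose definitions live elsewhere. They just illustrate the
// difference the comment above describes: a netip.Addr-rich struct drags
// pointer-bearing fields through map hashing, while a dense fixed-width
// key is a flat blob the runtime can hash and compare by value.
type exampleRichKey struct {
	RemoteAddr netip.Addr // 24 bytes apiece on 64-bit, including a zone pointer
	LocalAddr  netip.Addr
	RemotePort uint16
	LocalPort  uint16
	Protocol   uint8
	Fragment   bool
}

type exampleDenseKey [38]byte // 16+16 addr bytes, 2+2 port bytes, proto, fragment flag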

// runRxSplitWithCache mirrors runRxSplit but runs the legacy-style
// firewall fast path (localCache keyed on firewall.Packet) on every
// packet so we can compare against the unified path.
func runRxSplitWithCache(b *testing.B, pkts [][]byte, batchSize int) {
	b.Helper()
	m := NewMultiCoalescer(nopTunWriter{}, true, true)
	var fp firewall.Packet

	// Pre-warm a per-packet cache keyed on the netip.Addr-rich Packet form.
	cache := make(map[firewall.Packet]struct{}, len(pkts))
	for _, pkt := range pkts {
		var seedFp firewall.Packet
		if !parseV4InboundBaseline(pkt, &seedFp) {
			b.Fatal("seed parse failed")
		}
		cache[seedFp] = struct{}{}
	}

	b.ReportAllocs()
	b.SetBytes(int64(len(pkts[0])))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		pkt := pkts[i%len(pkts)]
		if !parseV4InboundBaseline(pkt, &fp) {
			b.Fatal("baseline parse failed")
		}
		if _, ok := cache[fp]; !ok {
			b.Fatal("cache miss")
		}
		if err := m.Commit(pkt); err != nil {
			b.Fatal(err)
		}
		if (i+1)%batchSize == 0 {
			if err := m.Flush(); err != nil {
				b.Fatal(err)
			}
		}
	}
	_ = m.Flush()
}

// runRxUnifiedWithCache: unified path with a PacketKey-keyed localCache.
// Each iteration: ParsePacket → conntrack-cache hit → CommitInbound.
func runRxUnifiedWithCache(b *testing.B, pkts [][]byte, batchSize int) {
	b.Helper()
	m := NewMultiCoalescer(nopTunWriter{}, true, true)
	var parsed RxParsed

	cache := make(firewall.ConntrackCache, len(pkts))
	for _, pkt := range pkts {
		var seed RxParsed
		if err := ParsePacket(pkt, true, &seed); err != nil {
			b.Fatal(err)
		}
		cache[seed.Key] = struct{}{}
	}

	b.ReportAllocs()
	b.SetBytes(int64(len(pkts[0])))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		pkt := pkts[i%len(pkts)]
		if err := ParsePacket(pkt, true, &parsed); err != nil {
			b.Fatal(err)
		}
		if _, ok := cache[parsed.Key]; !ok {
			b.Fatal("cache miss")
		}
		if err := m.CommitInbound(pkt, &parsed); err != nil {
			b.Fatal(err)
		}
		if (i+1)%batchSize == 0 {
			if err := m.Flush(); err != nil {
				b.Fatal(err)
			}
		}
	}
	_ = m.Flush()
}

func BenchmarkRxSplitTCPv4WithCache(b *testing.B) {
	pkts := buildTCPv4BulkFlow(tcpCoalesceMaxSegs, 1200)
	runRxSplitWithCache(b, pkts, tcpCoalesceMaxSegs)
}

func BenchmarkRxUnifiedTCPv4WithCache(b *testing.B) {
	pkts := buildTCPv4BulkFlow(tcpCoalesceMaxSegs, 1200)
	runRxUnifiedWithCache(b, pkts, tcpCoalesceMaxSegs)
}

func BenchmarkRxSplitInterleaved4WithCache(b *testing.B) {
	pkts := buildTCPv4Interleaved(4, tcpCoalesceMaxSegs, 1200)
	runRxSplitWithCache(b, pkts, len(pkts))
}

func BenchmarkRxUnifiedInterleaved4WithCache(b *testing.B) {
	pkts := buildTCPv4Interleaved(4, tcpCoalesceMaxSegs, 1200)
	runRxUnifiedWithCache(b, pkts, len(pkts))
}