package batch

import (
	"encoding/binary"
	"runtime"
	"testing"

	"github.com/slackhq/nebula/overlay/tio"
	"github.com/slackhq/nebula/test"
)

// nopTunWriter is a zero-alloc tio.GSOWriter for benchmarks. Discards
// everything but satisfies the interface the coalescer detects.
type nopTunWriter struct{}

func (nopTunWriter) Write(p []byte) (int, error) { return len(p), nil }

func (nopTunWriter) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, _ tio.GSOProto) error {
	return nil
}

func (nopTunWriter) Capabilities() tio.Capabilities {
	return tio.Capabilities{TSO: true, USO: true}
}
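
// Compile-time assertion that nopTunWriter implements tio.GSOWriter, the
// interface named in the comment above (a sketch assuming that interface's
// method set is covered by the three methods defined here).
var _ tio.GSOWriter = nopTunWriter{}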

// buildTCPv4BulkFlow returns a slice of n adjacent ACK-only TCP segments
// on a single 5-tuple, each carrying payloadLen bytes. Seq numbers are
// contiguous so every packet is coalesceable onto the previous one.
func buildTCPv4BulkFlow(n, payloadLen int) [][]byte {
	pkts := make([][]byte, n)
	pay := make([]byte, payloadLen)
	seq := uint32(1000)
	for i := range n {
		pkts[i] = buildTCPv4(seq, tcpAck, pay)
		seq += uint32(payloadLen)
	}
	return pkts
}

// buildTCPv4Interleaved returns nFlows * perFlow packets with per-flow
// seq continuity but round-robin across flows — worst case for any
// "last-slot" cache.
func buildTCPv4Interleaved(nFlows, perFlow, payloadLen int) [][]byte {
	pay := make([]byte, payloadLen)
	seqs := make([]uint32, nFlows)
	for i := range seqs {
		seqs[i] = uint32(1000 + i*1000000)
	}
	pkts := make([][]byte, 0, nFlows*perFlow)
	for range perFlow {
		for f := range nFlows {
			sport := uint16(10000 + f)
			pkts = append(pkts, buildTCPv4Ports(sport, 2000, seqs[f], tcpAck, pay))
			seqs[f] += uint32(payloadLen)
		}
	}
	return pkts
}

// buildICMPv4 returns a minimal non-TCP packet that takes the passthrough
// branch in Commit.
func buildICMPv4() []byte {
	pkt := make([]byte, 28)
	pkt[0] = 0x45                            // version 4, IHL 5 (20-byte header)
	binary.BigEndian.PutUint16(pkt[2:4], 28) // total length: 20 IP + 8 ICMP
	pkt[9] = 1                               // protocol: ICMP
	copy(pkt[12:16], []byte{10, 0, 0, 1})    // src 10.0.0.1
	copy(pkt[16:20], []byte{10, 0, 0, 2})    // dst 10.0.0.2
	return pkt
}

// runCommitBench drives Commit over pkts batchSize at a time, flushing
// between batches, and reports per-packet cost.
func runCommitBench(b *testing.B, pkts [][]byte, batchSize int) {
	b.Helper()
	c := NewTCPCoalescer(nopTunWriter{}, test.NewLogger())
	b.ReportAllocs()
	b.SetBytes(int64(len(pkts[0])))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		pkt := pkts[i%len(pkts)]
		if err := c.Commit(pkt); err != nil {
			b.Fatal(err)
		}
		if (i+1)%batchSize == 0 {
			if err := c.Flush(); err != nil {
				b.Fatal(err)
			}
		}
	}
	// Drain any trailing partial batch so slot state doesn't leak across runs.
	_ = c.Flush()
}

// BenchmarkCommitSingleFlow is the bulk-TCP steady state: one flow,
// contiguous seq, 1200-byte payloads. Every packet past the seed should
// append onto the open slot. This is the case we most care about.
func BenchmarkCommitSingleFlow(b *testing.B) {
	pkts := buildTCPv4BulkFlow(tcpCoalesceMaxSegs, 1200)
	runCommitBench(b, pkts, tcpCoalesceMaxSegs)
}

// BenchmarkCommitInterleaved4 has 4 concurrent bulk flows round-robined.
// A single-entry fast-path cache will miss on every packet; an N-way
// cache or map lookup carries the weight.
func BenchmarkCommitInterleaved4(b *testing.B) {
	pkts := buildTCPv4Interleaved(4, tcpCoalesceMaxSegs, 1200)
	runCommitBench(b, pkts, len(pkts))
}

// BenchmarkCommitInterleaved16 stresses the map at higher flow counts.
func BenchmarkCommitInterleaved16(b *testing.B) {
	pkts := buildTCPv4Interleaved(16, tcpCoalesceMaxSegs, 1200)
	runCommitBench(b, pkts, len(pkts))
}

// BenchmarkCommitPassthrough exercises the non-TCP branch: parseTCPBase
// bails early and addPassthrough is the only work.
func BenchmarkCommitPassthrough(b *testing.B) {
	pkt := buildICMPv4()
	pkts := make([][]byte, 64)
	for i := range pkts {
		pkts[i] = pkt
	}
	runCommitBench(b, pkts, 64)
}

// BenchmarkCommitNonCoalesceableTCP sends SYN|ACK packets on one flow.
// Each packet takes the "TCP but not admissible" branch which does a
// map delete + passthrough. Measures the seal-without-slot cost.
func BenchmarkCommitNonCoalesceableTCP(b *testing.B) {
	pay := make([]byte, 0)
	pkts := make([][]byte, 64)
	for i := range pkts {
		pkts[i] = buildTCPv4(uint32(1000+i), tcpSyn|tcpAck, pay)
	}
	runCommitBench(b, pkts, 64)
}

// runMultiCommitBench drives MultiCoalescer.Commit. The dispatcher does
// the IP/L4 parse once and passes the parsed struct to the lane, so this
// is the bench that shows the savings of skipping the lane's re-parse.
func runMultiCommitBench(b *testing.B, pkts [][]byte, batchSize int) {
	b.Helper()
	m := NewMultiCoalescer(nopTunWriter{}, test.NewLogger(), true, true)
	b.ReportAllocs()
	b.SetBytes(int64(len(pkts[0])))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		pkt := pkts[i%len(pkts)]
		if err := m.Commit(pkt); err != nil {
			b.Fatal(err)
		}
		if (i+1)%batchSize == 0 {
			if err := m.Flush(); err != nil {
				b.Fatal(err)
			}
		}
	}
	// Drain any trailing partial batch so slot state doesn't leak across runs.
	_ = m.Flush()
}

// BenchmarkMultiCommitSingleFlow is the multi-lane analogue of
// BenchmarkCommitSingleFlow — same workload but routed through the
// dispatcher. The delta vs the single-lane bench measures dispatcher
// overhead.
func BenchmarkMultiCommitSingleFlow(b *testing.B) {
	pkts := buildTCPv4BulkFlow(tcpCoalesceMaxSegs, 1200)
	runMultiCommitBench(b, pkts, tcpCoalesceMaxSegs)
}

// BenchmarkMultiCommitInterleaved4 mirrors BenchmarkCommitInterleaved4
// through the dispatcher.
func BenchmarkMultiCommitInterleaved4(b *testing.B) {
	pkts := buildTCPv4Interleaved(4, tcpCoalesceMaxSegs, 1200)
	runMultiCommitBench(b, pkts, len(pkts))
}
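
// BenchmarkMultiCommitInterleaved16 mirrors BenchmarkCommitInterleaved16
// through the dispatcher; a sketch reusing the existing helpers unchanged,
// assuming the dispatcher handles the 16-flow round-robin workload the
// same way the single lane does.
func BenchmarkMultiCommitInterleaved16(b *testing.B) {
	pkts := buildTCPv4Interleaved(16, tcpCoalesceMaxSegs, 1200)
	runMultiCommitBench(b, pkts, len(pkts))
}

// BenchmarkMultiCommitPassthrough mirrors BenchmarkCommitPassthrough
// through the dispatcher; a sketch assuming non-TCP packets take a
// passthrough branch behind the dispatch parse, as they do in the lane.
func BenchmarkMultiCommitPassthrough(b *testing.B) {
	pkt := buildICMPv4()
	pkts := make([][]byte, 64)
	for i := range pkts {
		pkts[i] = pkt
	}
	runMultiCommitBench(b, pkts, 64)
}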

// flowKeyPair is one comparison input for the flowKeyCompare bench.
type flowKeyPair struct{ a, b flowKey }

// makeFlowKey builds an IPv4 flowKey from compact inputs. The v4 addresses
// occupy the last four bytes of the 16-byte src/dst fields.
func makeFlowKey(srcLow, dstLow uint32, sport, dport uint16) flowKey {
	var fk flowKey
	binary.BigEndian.PutUint32(fk.src[12:16], srcLow)
	binary.BigEndian.PutUint32(fk.dst[12:16], dstLow)
	fk.sport = sport
	fk.dport = dport
	return fk
}

// flowKeyCases are the workload mixes flowKeyCompare sees in practice.
// - sameFlow: equal keys; tests the equal-path cost (sort runs hit this
//   repeatedly when many segments share a flow).
// - sportDiffers: same src/dst/dport, different sport — the typical
//   "sibling flows from one host to one server" pattern.
// - dstDiffers: same src/sport/dport, different dst — outbound to many
//   servers from a fixed local port.
// - allDiffer: every field differs; worst case for short-circuiting.
func flowKeyCases() map[string][]flowKeyPair {
	const n = 64
	cases := map[string][]flowKeyPair{
		"sameFlow":     make([]flowKeyPair, n),
		"sportDiffers": make([]flowKeyPair, n),
		"dstDiffers":   make([]flowKeyPair, n),
		"allDiffer":    make([]flowKeyPair, n),
	}
	for i := range n {
		base := makeFlowKey(0x0a000001, 0x0a000002, 40000, 443)
		cases["sameFlow"][i] = flowKeyPair{a: base, b: base}
		cases["sportDiffers"][i] = flowKeyPair{
			a: base,
			b: makeFlowKey(0x0a000001, 0x0a000002, uint16(40001+i), 443),
		}
		cases["dstDiffers"][i] = flowKeyPair{
			a: base,
			b: makeFlowKey(0x0a000001, uint32(0x0a000002+i+1), 40000, 443),
		}
		cases["allDiffer"][i] = flowKeyPair{
			a: makeFlowKey(uint32(0x0a000001+i), uint32(0x0a000002+i), uint16(40000+i), uint16(80+i)),
			b: makeFlowKey(uint32(0x0b000001+i), uint32(0x0b000002+i), uint16(50000+i), uint16(443+i)),
		}
	}
	return cases
}

// BenchmarkFlowKeyCompare measures flowKeyCompare across the workloads
// the sort step actually sees. Use this to compare reorderings.
func BenchmarkFlowKeyCompare(b *testing.B) {
	for name, pairs := range flowKeyCases() {
		b.Run(name, func(b *testing.B) {
			b.ReportAllocs()
			b.ResetTimer()
			var sink int
			for i := 0; i < b.N; i++ {
				// len(pairs) is 64, a power of two, so the mask is a cheap modulo.
				p := pairs[i&(len(pairs)-1)]
				sink += flowKeyCompare(p.a, p.b)
			}
			// Accumulate into sink and keep it alive so the compiler can't
			// discard the compares as dead code.
			runtime.KeepAlive(sink)
		})
	}
}
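
// These benchmarks run under the standard go test flags, e.g. from this
// package's directory:
//
//	go test -run '^$' -bench . -benchmem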