From 924268cc1fbe1210d343a3830da9a53b29c87a8d Mon Sep 17 00:00:00 2001
From: JackDoan
Date: Mon, 4 May 2026 11:56:58 -0500
Subject: [PATCH] switch to ASM vector checksum

---
 overlay/batch/batch.go                |   2 +-
 overlay/batch/multi_coalesce.go       |  44 +++---
 overlay/checksum/checksum_amd64.go    |  23 ++++
 overlay/checksum/checksum_amd64.s     | 157 +++++++++++++++++++++
 overlay/checksum/checksum_arm64.go    |  12 ++
 overlay/checksum/checksum_arm64.s     | 143 +++++++++++++++++++
 overlay/checksum/checksum_fallback.go |  10 ++
 overlay/checksum/checksum_test.go     | 190 ++++++++++++++++++++++++++
 overlay/tio/virtio/segment_linux.go   |   3 +-
 9 files changed, 560 insertions(+), 24 deletions(-)
 create mode 100644 overlay/checksum/checksum_amd64.go
 create mode 100644 overlay/checksum/checksum_amd64.s
 create mode 100644 overlay/checksum/checksum_arm64.go
 create mode 100644 overlay/checksum/checksum_arm64.s
 create mode 100644 overlay/checksum/checksum_fallback.go
 create mode 100644 overlay/checksum/checksum_test.go

diff --git a/overlay/batch/batch.go b/overlay/batch/batch.go
index 338c6008..d171d136 100644
--- a/overlay/batch/batch.go
+++ b/overlay/batch/batch.go
@@ -22,7 +22,7 @@ type TxBatcher interface {
 	// to leave the outer ECN field unset.
 	Commit(pkt []byte, dst netip.AddrPort, outerECN byte)
 	// Flush emits every queued packet via the underlying batch writer in
-	// arrival order. Returns the first error observed. After Flush returns,
+	// arrival order. Returns any errors observed, joined. After Flush returns,
 	// borrowed payload slices may be recycled.
 	Flush() error
 }
diff --git a/overlay/batch/multi_coalesce.go b/overlay/batch/multi_coalesce.go
index fbe59ccc..f6a8316c 100644
--- a/overlay/batch/multi_coalesce.go
+++ b/overlay/batch/multi_coalesce.go
@@ -1,6 +1,7 @@
 package batch

 import (
+	"errors"
 	"io"
 )

@@ -60,16 +61,12 @@ func (m *MultiCoalescer) Reserve(sz int) []byte {
 }

 // Commit dispatches pkt to the appropriate lane based on IP version + L4
-// proto. Borrowed slice contract is identical to the single-lane batchers
-// — pkt must remain valid until the next Flush.
+// proto. The borrowed slice contract is identical to the single-lane
+// batchers: pkt must remain valid until the next Flush.
 //
 // On the success path the IP/TCP-or-UDP parse happens here once and the
 // parsed struct is handed to the lane via commitParsed so the lane doesn't
-// re-walk the header. On a parse failure we fall through to the lane's
-// public Commit, which re-runs the parse before passthrough — that path
-// only fires for malformed/unsupported packets so the duplicated parse is
-// not on the hot path. The lane's public Commit still works for direct
-// callers.
+// re-walk the header.
 func (m *MultiCoalescer) Commit(pkt []byte) error {
 	if len(pkt) < 20 {
 		return m.pt.Commit(pkt)
 	}
@@ -92,9 +89,10 @@ func (m *MultiCoalescer) Commit(pkt []byte) error {
 	if m.tcp != nil {
 		info, ok := parseTCPBase(pkt)
 		if !ok {
-			// Malformed/unsupported TCP shape (IP options, fragments, ...)
-			// — the TCP lane handles this as passthrough.
-			return m.tcp.Commit(pkt)
+			// Malformed/unsupported TCP shape (IP options, fragments, ...).
+			// Route it through the TCP lane's passthrough queue so flow
+			// order is preserved.
+			m.tcp.addPassthrough(pkt)
+			return nil
 		}
 		return m.tcp.commitParsed(pkt, info)
 	}
@@ -102,7 +100,8 @@ func (m *MultiCoalescer) Commit(pkt []byte) error {
 	if m.udp != nil {
 		info, ok := parseUDP(pkt)
 		if !ok {
-			return m.udp.Commit(pkt)
+			// Malformed/unsupported UDP shape. Route it through the UDP
+			// lane's passthrough queue (rather than m.pt) to preserve
+			// flow order.
+			m.udp.addPassthrough(pkt)
+			return nil
 		}
 		return m.udp.commitParsed(pkt, info)
 	}
@@ -111,23 +110,24 @@ func (m *MultiCoalescer) Commit(pkt []byte) error {
 }

 // Flush drains every lane in a fixed order: TCP, UDP, passthrough. Errors
-// from a lane do not stop subsequent lanes from flushing — we keep
-// draining and return the first observed error so a single bad packet
-// doesn't strand the others.
+// from a lane do not stop subsequent lanes from flushing; we keep
+// draining and return every observed error, joined, so a single bad
+// packet doesn't strand the others.
 func (m *MultiCoalescer) Flush() error {
-	var first error
-	keep := func(err error) {
-		if err != nil && first == nil {
-			first = err
+	var errs []error
+	if m.tcp != nil {
+		if err := m.tcp.Flush(); err != nil {
+			errs = append(errs, err)
 		}
 	}
-	if m.tcp != nil {
-		keep(m.tcp.Flush())
-	}
 	if m.udp != nil {
-		keep(m.udp.Flush())
+		if err := m.udp.Flush(); err != nil {
+			errs = append(errs, err)
+		}
+	}
+	if err := m.pt.Flush(); err != nil {
+		errs = append(errs, err)
 	}
-	keep(m.pt.Flush())
 	m.backing = m.backing[:0]
-	return first
+	return errors.Join(errs...)
 }
diff --git a/overlay/checksum/checksum_amd64.go b/overlay/checksum/checksum_amd64.go
new file mode 100644
index 00000000..d504e73e
--- /dev/null
+++ b/overlay/checksum/checksum_amd64.go
@@ -0,0 +1,23 @@
+package checksum
+
+import (
+	"golang.org/x/sys/cpu"
+	gvisorchecksum "gvisor.dev/gvisor/pkg/tcpip/checksum"
+)
+
+//go:noescape
+func checksumAVX2(buf []byte, initial uint16) uint16
+
+var hasAVX2 = cpu.X86.HasAVX2
+
+// Checksum computes the RFC 1071 ones-complement sum of buf, seeded with
+// initial. It is a drop-in replacement for gvisor's checksum.Checksum that
+// dispatches to a hand-written AVX2 routine on amd64 CPUs that support it,
+// falling back to gvisor's pure-Go implementation otherwise. The result
+// matches gvisor's bit-for-bit for any buffer length and initial seed.
+func Checksum(buf []byte, initial uint16) uint16 {
+	if hasAVX2 {
+		return checksumAVX2(buf, initial)
+	}
+	return gvisorchecksum.Checksum(buf, initial)
+}
diff --git a/overlay/checksum/checksum_amd64.s b/overlay/checksum/checksum_amd64.s
new file mode 100644
index 00000000..5ee864d6
--- /dev/null
+++ b/overlay/checksum/checksum_amd64.s
@@ -0,0 +1,157 @@
+#include "textflag.h"
+
+// func checksumAVX2(buf []byte, initial uint16) uint16
+//
+// Computes the RFC 1071 ones-complement sum of buf, seeded with initial.
+//
+// Algorithm: sum the buffer treating it as a stream of uint32s in machine
+// (little-endian) byte order, accumulating into 64-bit lanes. The top 32
+// bits of each lane absorb carries, and each add grows them by at most
+// one, so there is headroom for ~2^32 iterations: far more than the
+// 16 KB/64 KB max practical inputs. At the end we fold to 16 bits and
+// byte-swap once to recover the on-wire (big-endian) result. RFC 1071
+// §1.2.B byte-order independence makes this equivalent to summing as
+// 16-bit big-endian words.
+//
+// The ymm accumulators (Y4..Y7) hold 4 uint64 lanes each = 16 parallel
+// partial sums. The main loop loads 64 bytes per iter as four 16-byte
+// chunks, zero-extending each chunk's four uint32s into a ymm via
+// VPMOVZXDQ-from-memory, then VPADDQ into a separate accumulator per
+// chunk to break the dep chain. After the vector loop the lane sums are
+// horizontally reduced and merged with a scalar accumulator that handles
+// the trailing 0..63 bytes plus the (byte-swapped) initial seed.
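+//
+// For exposition, an equivalent pure-Go model of the whole routine (an
+// illustrative sketch, not the shipped fallback; assumes encoding/binary
+// and math/bits):
+//
+//	func referenceChecksum(buf []byte, initial uint16) uint16 {
+//		sum := uint64(bits.ReverseBytes16(initial)) // seed into LE space
+//		for len(buf) >= 4 {
+//			sum += uint64(binary.LittleEndian.Uint32(buf))
+//			buf = buf[4:]
+//		}
+//		for i, b := range buf { // 0..3 trailing bytes, LE byte weights
+//			sum += uint64(b) << (8 * i)
+//		}
+//		for sum > 0xffff { // fold carries back into the low 16 bits
+//			sum = sum&0xffff + sum>>16
+//		}
+//		return bits.ReverseBytes16(uint16(sum)) // back to big-endian
+//	}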
+TEXT ·checksumAVX2(SB), NOSPLIT, $0-34
+	MOVQ buf_base+0(FP), SI
+	MOVQ buf_len+8(FP), CX
+	MOVWQZX initial+24(FP), AX
+
+	// Pre-byteswap initial into the LE-summing space so it merges directly
+	// with the rest of the accumulator. The final fold's bswap16 will undo
+	// this and convert the whole result back to BE.
+	XCHGB AH, AL
+
+	CMPQ CX, $32
+	JLT scalar_tail
+
+	VPXOR Y4, Y4, Y4
+	VPXOR Y5, Y5, Y5
+	VPXOR Y6, Y6, Y6
+	VPXOR Y7, Y7, Y7
+
+	CMPQ CX, $64
+	JLT loop32
+
+loop64:
+	VPMOVZXDQ (SI), Y0
+	VPMOVZXDQ 16(SI), Y1
+	VPMOVZXDQ 32(SI), Y2
+	VPMOVZXDQ 48(SI), Y3
+	VPADDQ Y0, Y4, Y4
+	VPADDQ Y1, Y5, Y5
+	VPADDQ Y2, Y6, Y6
+	VPADDQ Y3, Y7, Y7
+	ADDQ $64, SI
+	SUBQ $64, CX
+	CMPQ CX, $64
+	JGE loop64
+
+loop32:
+	CMPQ CX, $32
+	JLT reduce_vec
+	VPMOVZXDQ (SI), Y0
+	VPMOVZXDQ 16(SI), Y1
+	VPADDQ Y0, Y4, Y4
+	VPADDQ Y1, Y5, Y5
+	ADDQ $32, SI
+	SUBQ $32, CX
+	JMP loop32
+
+reduce_vec:
+	// Combine the four ymm accumulators into Y4.
+	VPADDQ Y5, Y4, Y4
+	VPADDQ Y7, Y6, Y6
+	VPADDQ Y6, Y4, Y4
+
+	// Horizontally reduce Y4's four uint64 lanes to a single scalar.
+	VEXTRACTI128 $1, Y4, X5
+	VPADDQ X5, X4, X4
+	VPSHUFD $0x4e, X4, X5
+	VPADDQ X5, X4, X4
+	VMOVQ X4, R8
+	VZEROUPPER
+
+	ADDQ R8, AX
+	ADCQ $0, AX
+
+scalar_tail:
+	// Handle remaining 0..63 bytes (or the entire buffer if it was < 32).
+	CMPQ CX, $8
+	JLT tail4
+
+loop8:
+	ADDQ (SI), AX
+	ADCQ $0, AX
+	ADDQ $8, SI
+	SUBQ $8, CX
+	CMPQ CX, $8
+	JGE loop8
+
+tail4:
+	CMPQ CX, $4
+	JLT tail2
+	MOVL (SI), R8
+	ADDQ R8, AX
+	ADCQ $0, AX
+	ADDQ $4, SI
+	SUBQ $4, CX
+
+tail2:
+	CMPQ CX, $2
+	JLT tail1
+	MOVWQZX (SI), R8
+	ADDQ R8, AX
+	ADCQ $0, AX
+	ADDQ $2, SI
+	SUBQ $2, CX
+
+tail1:
+	TESTQ CX, CX
+	JZ fold
+	MOVBQZX (SI), R8
+	ADDQ R8, AX
+	ADCQ $0, AX
+
+fold:
+	// Fold the 64-bit accumulator to 16 bits via four rounds, mirroring
+	// gvisor's reduce(). Each pair (split, add) halves the live width;
+	// the truncation steps absorb the single bit that may be left over
+	// after each add so the next round's bound holds.
+
+	// 64 → 33 bits.
+	MOVQ AX, R8
+	SHRQ $32, R8
+	MOVL AX, AX
+	ADDQ R8, AX
+
+	// 33 → 32 bits. AX += (AX>>32); truncate to 32. AX is now ≤ 0xFFFF_FFFF.
+	MOVQ AX, R8
+	SHRQ $32, R8
+	ADDQ R8, AX
+	MOVL AX, AX
+
+	// 32 → 17 bits.
+	MOVQ AX, R8
+	SHRQ $16, R8
+	MOVWQZX AX, AX
+	ADDQ R8, AX
+
+	// 17 → 16 bits. AX += (AX>>16); the trailing MOVW truncates bit 16.
+	MOVQ AX, R8
+	SHRQ $16, R8
+	ADDQ R8, AX
+
+	// AX low 16 bits hold the 16-bit sum in machine (LE) byte order; flip
+	// to big-endian to match the gvisor API contract.
+	XCHGB AH, AL
+
+	MOVW AX, ret+32(FP)
+	RET
diff --git a/overlay/checksum/checksum_arm64.go b/overlay/checksum/checksum_arm64.go
new file mode 100644
index 00000000..561ba712
--- /dev/null
+++ b/overlay/checksum/checksum_arm64.go
@@ -0,0 +1,12 @@
+package checksum
+
+//go:noescape
+func checksumNEON(buf []byte, initial uint16) uint16
+
+// Checksum computes the RFC 1071 ones-complement sum of buf, seeded with
+// initial. It is a drop-in replacement for gvisor's checksum.Checksum
+// that dispatches to a hand-written NEON routine. NEON is mandatory in
+// ARMv8, so no feature check is needed.
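+//
+// A minimal usage sketch from a caller's perspective (illustrative; hdr is
+// a hypothetical IPv4 header whose checksum field, bytes 10..11, has been
+// zeroed, and binary is encoding/binary):
+//
+//	sum := checksum.Checksum(hdr, 0)
+//	binary.BigEndian.PutUint16(hdr[10:12], ^sum)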
+func Checksum(buf []byte, initial uint16) uint16 {
+	return checksumNEON(buf, initial)
+}
diff --git a/overlay/checksum/checksum_arm64.s b/overlay/checksum/checksum_arm64.s
new file mode 100644
index 00000000..11499820
--- /dev/null
+++ b/overlay/checksum/checksum_arm64.s
@@ -0,0 +1,143 @@
+#include "textflag.h"
+
+// func checksumNEON(buf []byte, initial uint16) uint16
+//
+// Mirrors the algorithm in checksum_amd64.s: sum the buffer treating it as
+// a stream of uint32s in machine (little-endian) byte order, accumulating
+// into 64-bit lanes that have ample carry headroom; fold and byte-swap once
+// at the very end to recover the on-wire (big-endian) result.
+//
+// Each loop iteration loads 64 bytes via VLD1.P into V0..V3 (4 Q regs).
+// VUADDW takes the low two uint32 lanes of a Q reg, zero-extends them to
+// uint64, and adds them into a 2×uint64 accumulator; VUADDW2 does the same
+// for the high two lanes. Four accumulators (V8..V11, standing in for
+// amd64's four ymm regs) get updated twice per iter to break the dep
+// chain. Tail bytes go through a scalar ADDS/ADC chain; the byte-swapped
+// initial is merged into the accumulator just before the final fold.
+TEXT ·checksumNEON(SB), NOSPLIT, $0-34
+	MOVD buf_base+0(FP), R0
+	MOVD buf_len+8(FP), R1
+	MOVHU initial+24(FP), R2
+
+	// Pre-byteswap initial into the LE-summing space so it merges directly
+	// with the rest of the accumulator.
+	REV16W R2, R2
+
+	MOVD ZR, R3 // scalar accumulator
+
+	CMP $32, R1
+	BLT scalar_tail
+
+	VEOR V8.B16, V8.B16, V8.B16
+	VEOR V9.B16, V9.B16, V9.B16
+	VEOR V10.B16, V10.B16, V10.B16
+	VEOR V11.B16, V11.B16, V11.B16
+
+	CMP $64, R1
+	BLT loop16_init
+
+loop64:
+	VLD1.P 64(R0), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VUADDW V0.S2, V8.D2, V8.D2
+	VUADDW2 V0.S4, V9.D2, V9.D2
+	VUADDW V1.S2, V10.D2, V10.D2
+	VUADDW2 V1.S4, V11.D2, V11.D2
+	VUADDW V2.S2, V8.D2, V8.D2
+	VUADDW2 V2.S4, V9.D2, V9.D2
+	VUADDW V3.S2, V10.D2, V10.D2
+	VUADDW2 V3.S4, V11.D2, V11.D2
+	SUB $64, R1, R1
+	CMP $64, R1
+	BGE loop64
+
+loop16_init:
+	CMP $16, R1
+	BLT reduce_vec
+
+loop16:
+	VLD1.P 16(R0), [V0.B16]
+	VUADDW V0.S2, V8.D2, V8.D2
+	VUADDW2 V0.S4, V9.D2, V9.D2
+	SUB $16, R1, R1
+	CMP $16, R1
+	BGE loop16
+
+reduce_vec:
+	// Combine the four accumulators into V8.
+	VADD V9.D2, V8.D2, V8.D2
+	VADD V11.D2, V10.D2, V10.D2
+	VADD V10.D2, V8.D2, V8.D2
+
+	// Horizontal-add the two lanes of V8.D2 into a single uint64.
+	VADDP V8.D2, V8.D2, V8.D2
+	VMOV V8.D[0], R8
+
+	ADDS R8, R3, R3
+	ADC ZR, R3, R3
+
+scalar_tail:
+	CMP $8, R1
+	BLT tail4
+
+loop8:
+	MOVD.P 8(R0), R8
+	ADDS R8, R3, R3
+	ADC ZR, R3, R3
+	SUB $8, R1, R1
+	CMP $8, R1
+	BGE loop8
+
+tail4:
+	CMP $4, R1
+	BLT tail2
+	MOVWU.P 4(R0), R8
+	ADDS R8, R3, R3
+	ADC ZR, R3, R3
+	SUB $4, R1, R1
+
+tail2:
+	CMP $2, R1
+	BLT tail1
+	MOVHU.P 2(R0), R8
+	ADDS R8, R3, R3
+	ADC ZR, R3, R3
+	SUB $2, R1, R1
+
+tail1:
+	CBZ R1, fold
+	MOVBU (R0), R8
+	ADDS R8, R3, R3
+	ADC ZR, R3, R3
+
+fold:
+	// Merge the byte-swapped initial into our LE-form accumulator.
+	ADDS R2, R3, R3
+	ADC ZR, R3, R3
+
+	// 64 → 33 bits.
+	LSR $32, R3, R8
+	AND $0xffffffff, R3, R3
+	ADD R8, R3, R3
+
+	// 33 → 32 (truncate after adding bit 32 back).
+	LSR $32, R3, R8
+	ADD R8, R3, R3
+	AND $0xffffffff, R3, R3
+
+	// 32 → 17.
+	LSR $16, R3, R8
+	AND $0xffff, R3, R3
+	ADD R8, R3, R3
+
+	// 17 → 16 (truncation absorbs bit 16 below).
+	LSR $16, R3, R8
+	ADD R8, R3, R3
+
+	// R3's low 16 bits hold the 16-bit sum in machine (LE) byte order; flip
+	// to big-endian to match the gvisor API contract. REV16W swaps bytes
+	// within each 16-bit halfword of the low 32 bits, so it acts as a
+	// 16-bit byte-swap on the live low 16.
+	REV16W R3, R3
+	AND $0xffff, R3, R3
+
+	MOVH R3, ret+32(FP)
+	RET
diff --git a/overlay/checksum/checksum_fallback.go b/overlay/checksum/checksum_fallback.go
new file mode 100644
index 00000000..89ac90a5
--- /dev/null
+++ b/overlay/checksum/checksum_fallback.go
@@ -0,0 +1,10 @@
+//go:build !amd64 && !arm64
+
+package checksum
+
+import gvisorchecksum "gvisor.dev/gvisor/pkg/tcpip/checksum"
+
+// Checksum delegates to gvisor on architectures without a hand-written
+// assembly body.
+func Checksum(buf []byte, initial uint16) uint16 {
+	return gvisorchecksum.Checksum(buf, initial)
+}
diff --git a/overlay/checksum/checksum_test.go b/overlay/checksum/checksum_test.go
new file mode 100644
index 00000000..08c8876b
--- /dev/null
+++ b/overlay/checksum/checksum_test.go
@@ -0,0 +1,190 @@
+package checksum
+
+import (
+	"fmt"
+	"math/rand/v2"
+	"testing"
+
+	gvisorchecksum "gvisor.dev/gvisor/pkg/tcpip/checksum"
+)
+
+// TestChecksumMatchesGvisor walks lengths from 0 to 4096, with several initial
+// seeds and a handful of starting alignments, asserting that our local
+// Checksum matches gvisor's reference bit-for-bit.
+func TestChecksumMatchesGvisor(t *testing.T) {
+	rng := rand.New(rand.NewPCG(1, 2))
+	const padFront = 16
+
+	// Random pool large enough for the longest case + alignment slop.
+	pool := make([]byte, 4096+padFront)
+	for i := range pool {
+		pool[i] = byte(rng.Uint32())
+	}
+
+	seeds := []uint16{0, 0x0001, 0xabcd, 0xffff, 0x1234, 0xfedc}
+	offsets := []int{0, 1, 2, 3, 4, 5, 7, 8, 15, 16}
+
+	for length := 0; length <= 4096; length++ {
+		for _, seed := range seeds {
+			for _, off := range offsets {
+				if off+length > len(pool) {
+					continue
+				}
+				buf := pool[off : off+length]
+				want := gvisorchecksum.Checksum(buf, seed)
+				got := Checksum(buf, seed)
+				if got != want {
+					t.Fatalf("len=%d off=%d seed=%#x: got %#04x want %#04x",
+						length, off, seed, got, want)
+				}
+			}
+		}
+	}
+}
+
+// TestChecksumPatternedBuffers exercises specific byte patterns that have
+// historically tripped up checksum implementations: all-zero, all-0xff,
+// alternating, and ascending sequences.
+func TestChecksumPatternedBuffers(t *testing.T) {
+	for length := 0; length <= 256; length++ {
+		patterns := map[string][]byte{
+			"zeros":       make([]byte, length),
+			"ones":        bytes(length, 0xff),
+			"alternating": pattern(length, []byte{0xa5, 0x5a}),
+			"ascending":   ascending(length),
+		}
+		for name, buf := range patterns {
+			for _, seed := range []uint16{0, 0xffff, 0x8000} {
+				want := gvisorchecksum.Checksum(buf, seed)
+				got := Checksum(buf, seed)
+				if got != want {
+					t.Fatalf("%s len=%d seed=%#x: got %#04x want %#04x",
+						name, length, seed, got, want)
+				}
+			}
+		}
+	}
+}
+
+func bytes(n int, v byte) []byte {
+	b := make([]byte, n)
+	for i := range b {
+		b[i] = v
+	}
+	return b
+}
+
+func pattern(n int, p []byte) []byte {
+	b := make([]byte, n)
+	for i := range b {
+		b[i] = p[i%len(p)]
+	}
+	return b
+}
+
+func ascending(n int) []byte {
+	b := make([]byte, n)
+	for i := range b {
+		b[i] = byte(i)
+	}
+	return b
+}
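+
+// TestChecksumSeedChaining sketches the incremental property the initial
+// seed exists for (an illustrative addition, not required by the suites
+// above). The seed contract mirrors gvisor's: a checksum may be continued
+// by seeding with the sum of an even-length prefix, so Checksum(a++b, 0)
+// must equal Checksum(b, Checksum(a, 0)) whenever len(a) is even.
+func TestChecksumSeedChaining(t *testing.T) {
+	rng := rand.New(rand.NewPCG(7, 9))
+	full := make([]byte, 1024)
+	for i := range full {
+		full[i] = byte(rng.Uint32())
+	}
+	want := Checksum(full, 0)
+	for split := 0; split <= len(full); split += 2 { // even prefixes only
+		got := Checksum(full[split:], Checksum(full[:split], 0))
+		if got != want {
+			t.Fatalf("split=%d: got %#04x want %#04x", split, got, want)
+		}
+	}
+}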
+
+// TestChecksumTailPaths targets every combination of (SIMD body iterations,
+// trailing tail bytes) the asm handlers walk through. The tail handlers
+// peel off 8 → 4 → 2 → 1 byte chunks in turn; this test exercises each by
+// constructing lengths of the form 64*k + tail for tail ∈ [0, 63] and a
+// representative spread of k values, including k=0 (no main loop, all tail)
+// and k=1 (one main loop iter, then tail). It's explicit coverage for
+// payload sizes that are odd, or not divisible by 4, 8, or 32.
+func TestChecksumTailPaths(t *testing.T) {
+	rng := rand.New(rand.NewPCG(42, 17))
+	const padFront = 16
+	const maxK = 8
+
+	pool := make([]byte, 64*maxK+padFront+64)
+	for i := range pool {
+		pool[i] = byte(rng.Uint32())
+	}
+
+	seeds := []uint16{0, 0xffff, 0xabcd}
+	offsets := []int{0, 1, 3, 7, 15} // mix of aligned and odd starts
+
+	for k := 0; k <= maxK; k++ {
+		for tail := 0; tail < 64; tail++ {
+			length := 64*k + tail
+			for _, seed := range seeds {
+				for _, off := range offsets {
+					if off+length > len(pool) {
+						continue
+					}
+					buf := pool[off : off+length]
+					want := gvisorchecksum.Checksum(buf, seed)
+					got := Checksum(buf, seed)
+					if got != want {
+						t.Fatalf("k=%d tail=%d (len=%d) off=%d seed=%#x: got %#04x want %#04x",
+							k, tail, length, off, seed, got, want)
+					}
+				}
+			}
+		}
+	}
+}
+
+// BenchmarkChecksumTailSizes covers payload sizes that aren't clean multiples
+// of the SIMD body's 32-byte (amd64) or 16-byte (arm64) chunks, so the tail
+// handler is meaningfully on the hot path. Sizes are picked to either exercise
+// every tail branch (tiny lengths) or sit slightly off realistic packet
+// boundaries (e.g. 1499 = MTU − 1).
+func BenchmarkChecksumTailSizes(b *testing.B) {
+	sizes := []int{
+		1, 3, 7, 15, 31, // sub-SIMD; entire work is scalar tail
+		33, 35, 47, 63, // one loop32 + assorted tails
+		65, 95, 127, // one loop64 + assorted tails
+		1447, 1471, 1499, 1501, // around MTU
+		8191, 8193, // around USO
+		65531, 65533, // near the kernel max
+	}
+	for _, size := range sizes {
+		buf := make([]byte, size)
+		for i := range buf {
+			buf[i] = byte(i)
+		}
+		b.Run(fmt.Sprintf("size=%d/local", size), func(b *testing.B) {
+			b.SetBytes(int64(size))
+			for i := 0; i < b.N; i++ {
+				_ = Checksum(buf, 0)
+			}
+		})
+		b.Run(fmt.Sprintf("size=%d/gvisor", size), func(b *testing.B) {
+			b.SetBytes(int64(size))
+			for i := 0; i < b.N; i++ {
+				_ = gvisorchecksum.Checksum(buf, 0)
+			}
+		})
+	}
+}
+
+// BenchmarkChecksum compares the local Checksum to gvisor's at sizes that
+// match real traffic: a TCP/IP header (60), a typical MSS (1448), a typical
+// USO size (8192), and the kernel's max GSO superpacket (65535).
+func BenchmarkChecksum(b *testing.B) {
+	for _, size := range []int{60, 1448, 8192, 65535} {
+		buf := make([]byte, size)
+		for i := range buf {
+			buf[i] = byte(i)
+		}
+		b.Run(fmt.Sprintf("size=%d/local", size), func(b *testing.B) {
+			b.SetBytes(int64(size))
+			for i := 0; i < b.N; i++ {
+				_ = Checksum(buf, 0)
+			}
+		})
+		b.Run(fmt.Sprintf("size=%d/gvisor", size), func(b *testing.B) {
+			b.SetBytes(int64(size))
+			for i := 0; i < b.N; i++ {
+				_ = gvisorchecksum.Checksum(buf, 0)
+			}
+		})
+	}
+}
diff --git a/overlay/tio/virtio/segment_linux.go b/overlay/tio/virtio/segment_linux.go
index f0e90c0f..1244595f 100644
--- a/overlay/tio/virtio/segment_linux.go
+++ b/overlay/tio/virtio/segment_linux.go
@@ -14,7 +14,8 @@ import (
 	"fmt"

 	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/pkg/tcpip/checksum"
+
+	"github.com/slackhq/nebula/overlay/checksum"
 )

 // Protocol header size bounds used to validate / cap kernel-supplied offsets.