mirror of
https://github.com/slackhq/nebula.git
synced 2026-05-16 04:47:38 +02:00
switch to ASM vector checksum
This commit is contained in:
143
overlay/checksum/checksum_arm64.s
Normal file
143
overlay/checksum/checksum_arm64.s
Normal file
@@ -0,0 +1,143 @@
|
||||
#include "textflag.h"

// func checksumNEON(buf []byte, initial uint16) uint16
//
// Mirrors the algorithm in checksum_amd64.s: sum the buffer treating it as
// a stream of uint32s in machine (little-endian) byte order, accumulating
// into 64-bit lanes that have ample carry headroom; fold and byte-swap once
// at the very end to recover the on-wire (big-endian) result.
//
// Each loop iteration loads 64 bytes via VLD1.P into V0..V3 (4 Q regs).
// VUADDW takes the low two uint32 lanes of a Q reg, zero-extends them to
// uint64, and adds them into a 2×uint64 accumulator; VUADDW2 does the same
// for the high two lanes. Four Q-register accumulators (V8..V11) — the
// counterpart of the four ymm accumulators in the amd64 version — get
// updated twice per iter to break the dep chain. Tail bytes go through a
// scalar ADCS chain seeded with the byte-swapped initial.
//
// Frame layout follows the amd64 version: buf slice header at FP+0..23,
// initial at FP+24, uint16 result at FP+32 (hence the $0-34 frame spec).
TEXT ·checksumNEON(SB), NOSPLIT, $0-34
	MOVD	buf_base+0(FP), R0
	MOVD	buf_len+8(FP), R1
	MOVHU	initial+24(FP), R2

	// Pre-byteswap initial into the LE-summing space so it merges directly
	// with the rest of the accumulator.
	REV16W	R2, R2

	MOVD	ZR, R3			// scalar accumulator

	// Fewer than 32 bytes: not worth spinning up the vector path.
	CMP	$32, R1
	BLT	scalar_tail

	// Zero the four vector accumulators.
	VEOR	V8.B16, V8.B16, V8.B16
	VEOR	V9.B16, V9.B16, V9.B16
	VEOR	V10.B16, V10.B16, V10.B16
	VEOR	V11.B16, V11.B16, V11.B16

	CMP	$64, R1
	BLT	loop16_init

loop64:
	// Load 64 bytes, post-incrementing the pointer, then widen-accumulate
	// each Q reg's four uint32 lanes into the 2×uint64 accumulators.
	VLD1.P	64(R0), [V0.B16, V1.B16, V2.B16, V3.B16]
	VUADDW	V0.S2, V8.D2, V8.D2
	VUADDW2	V0.S4, V9.D2, V9.D2
	VUADDW	V1.S2, V10.D2, V10.D2
	VUADDW2	V1.S4, V11.D2, V11.D2
	VUADDW	V2.S2, V8.D2, V8.D2
	VUADDW2	V2.S4, V9.D2, V9.D2
	VUADDW	V3.S2, V10.D2, V10.D2
	VUADDW2	V3.S4, V11.D2, V11.D2
	SUB	$64, R1, R1
	CMP	$64, R1
	BGE	loop64

loop16_init:
	CMP	$16, R1
	BLT	reduce_vec

loop16:
	// Drain remaining 16-byte chunks through two of the accumulators.
	VLD1.P	16(R0), [V0.B16]
	VUADDW	V0.S2, V8.D2, V8.D2
	VUADDW2	V0.S4, V9.D2, V9.D2
	SUB	$16, R1, R1
	CMP	$16, R1
	BGE	loop16

reduce_vec:
	// Combine the four accumulators into V8.
	VADD	V9.D2, V8.D2, V8.D2
	VADD	V11.D2, V10.D2, V10.D2
	VADD	V10.D2, V8.D2, V8.D2

	// Horizontal-add the two lanes of V8.D2 into a single uint64.
	VADDP	V8.D2, V8.D2, V8.D2
	VMOV	V8.D[0], R8

	// Merge the vector total into the scalar accumulator with
	// end-around carry.
	ADDS	R8, R3, R3
	ADC	ZR, R3, R3

scalar_tail:
	CMP	$8, R1
	BLT	tail4

loop8:
	// 8 bytes at a time with an add-with-carry chain; the ADC folds the
	// carry back in, preserving the ones'-complement sum.
	MOVD.P	8(R0), R8
	ADDS	R8, R3, R3
	ADC	ZR, R3, R3
	SUB	$8, R1, R1
	CMP	$8, R1
	BGE	loop8

tail4:
	CMP	$4, R1
	BLT	tail2
	MOVWU.P	4(R0), R8
	ADDS	R8, R3, R3
	ADC	ZR, R3, R3
	SUB	$4, R1, R1

tail2:
	CMP	$2, R1
	BLT	tail1
	MOVHU.P	2(R0), R8
	ADDS	R8, R3, R3
	ADC	ZR, R3, R3
	SUB	$2, R1, R1

tail1:
	CBZ	R1, fold
	MOVBU	(R0), R8
	ADDS	R8, R3, R3
	ADC	ZR, R3, R3

fold:
	// Merge the byte-swapped initial into our LE-form accumulator.
	ADDS	R2, R3, R3
	ADC	ZR, R3, R3

	// 64 → 33 bits.
	LSR	$32, R3, R8
	AND	$0xffffffff, R3, R3
	ADD	R8, R3, R3

	// 33 → 32 (truncate after adding bit 32 back).
	LSR	$32, R3, R8
	ADD	R8, R3, R3
	AND	$0xffffffff, R3, R3

	// 32 → 17.
	LSR	$16, R3, R8
	AND	$0xffff, R3, R3
	ADD	R8, R3, R3

	// 17 → 16 (truncation absorbs bit 16 below).
	LSR	$16, R3, R8
	ADD	R8, R3, R3

	// R3's low 16 bits hold the 16-bit sum in machine (LE) byte order;
	// flip to big-endian to match the gvisor API contract. REV16W swaps
	// bytes within each 16-bit halfword of the low 32 bits, so it acts as
	// a 16-bit byte-swap on the live low 16.
	REV16W	R3, R3
	AND	$0xffff, R3, R3

	MOVH	R3, ret+32(FP)
	RET
|
||||
Reference in New Issue
Block a user