Files
nebula/overlay/checksum/checksum_arm64.s
2026-05-11 11:32:57 -05:00

144 lines
3.2 KiB
ArmAsm
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include "textflag.h"
// func checksumNEON(buf []byte, initial uint16) uint16
//
// Computes the 16-bit ones-complement ("internet") checksum of buf merged
// with initial, returning the result in big-endian (on-wire) byte order.
//
// Mirrors the algorithm in checksum_amd64.s: sum the buffer treating it as
// a stream of uint32s in machine (little-endian) byte order, accumulating
// into 64-bit lanes that have ample carry headroom; fold and byte-swap once
// at the very end to recover the on-wire (big-endian) result.
//
// Each loop iteration loads 64 bytes via VLD1.P into V0..V3 (4 Q regs).
// VUADDW takes the low two uint32 lanes of a Q reg, zero-extends them to
// uint64, and adds them into a 2x-uint64 accumulator; VUADDW2 does the same
// for the high two lanes. Four 128-bit accumulators (V8..V11) get
// updated twice per iter to break the dep chain. Tail bytes go through a
// scalar ADDS/ADC chain seeded with the byte-swapped initial.
//
// Register roles:
//   R0 = current read pointer into buf
//   R1 = bytes remaining
//   R2 = byte-swapped initial
//   R3 = 64-bit scalar accumulator
//   R8 = scratch
TEXT ·checksumNEON(SB), NOSPLIT, $0-34
MOVD buf_base+0(FP), R0
MOVD buf_len+8(FP), R1
MOVHU initial+24(FP), R2
// Pre-byteswap initial into the LE-summing space so it merges directly
// with the rest of the accumulator.
REV16W R2, R2
MOVD ZR, R3 // scalar accumulator
// Short buffers (< 32 bytes) skip the vector path entirely.
CMP $32, R1
BLT scalar_tail
// Zero the four vector accumulators.
VEOR V8.B16, V8.B16, V8.B16
VEOR V9.B16, V9.B16, V9.B16
VEOR V10.B16, V10.B16, V10.B16
VEOR V11.B16, V11.B16, V11.B16
CMP $64, R1
BLT loop16_init
// Main loop: 64 bytes per iteration, widening-add into all four
// accumulators so no VUADDW depends on the immediately preceding one.
loop64:
VLD1.P 64(R0), [V0.B16, V1.B16, V2.B16, V3.B16]
VUADDW V0.S2, V8.D2, V8.D2
VUADDW2 V0.S4, V9.D2, V9.D2
VUADDW V1.S2, V10.D2, V10.D2
VUADDW2 V1.S4, V11.D2, V11.D2
VUADDW V2.S2, V8.D2, V8.D2
VUADDW2 V2.S4, V9.D2, V9.D2
VUADDW V3.S2, V10.D2, V10.D2
VUADDW2 V3.S4, V11.D2, V11.D2
SUB $64, R1, R1
CMP $64, R1
BGE loop64
// Mop up remaining 16-byte chunks (at most three iterations).
loop16_init:
CMP $16, R1
BLT reduce_vec
loop16:
VLD1.P 16(R0), [V0.B16]
VUADDW V0.S2, V8.D2, V8.D2
VUADDW2 V0.S4, V9.D2, V9.D2
SUB $16, R1, R1
CMP $16, R1
BGE loop16
reduce_vec:
// Combine the four accumulators into V8.
VADD V9.D2, V8.D2, V8.D2
VADD V11.D2, V10.D2, V10.D2
VADD V10.D2, V8.D2, V8.D2
// Horizontal-add the two lanes of V8.D2 into a single uint64.
VADDP V8.D2, V8.D2, V8.D2
VMOV V8.D[0], R8
// Merge into the scalar accumulator; the ADC ZR folds the end-around
// carry straight back in (ones-complement addition).
ADDS R8, R3, R3
ADC ZR, R3, R3
scalar_tail:
// Fewer than 16 bytes remain (or the buffer was short to begin with);
// consume 8-, 4-, 2-, then 1-byte pieces. Every ADDS is paired with
// ADC ZR so its carry is re-absorbed immediately.
CMP $8, R1
BLT tail4
loop8:
MOVD.P 8(R0), R8
ADDS R8, R3, R3
ADC ZR, R3, R3
SUB $8, R1, R1
CMP $8, R1
BGE loop8
tail4:
CMP $4, R1
BLT tail2
MOVWU.P 4(R0), R8
ADDS R8, R3, R3
ADC ZR, R3, R3
SUB $4, R1, R1
tail2:
CMP $2, R1
BLT tail1
MOVHU.P 2(R0), R8
ADDS R8, R3, R3
ADC ZR, R3, R3
SUB $2, R1, R1
tail1:
CBZ R1, fold
// Final odd byte, zero-extended: in LE summing space the pad byte is
// logically the high half of a halfword, which zero-extension provides.
MOVBU (R0), R8
ADDS R8, R3, R3
ADC ZR, R3, R3
fold:
// Merge the byte-swapped initial into our LE-form accumulator.
ADDS R2, R3, R3
ADC ZR, R3, R3
// Fold 64 -> 33 bits (high 32 added into low 32 can carry into bit 32).
LSR $32, R3, R8
AND $0xffffffff, R3, R3
ADD R8, R3, R3
// Fold 33 -> 32 (truncate after adding bit 32 back; cannot re-carry
// because the low word is at most 0xfffffffe when bit 32 is set).
LSR $32, R3, R8
ADD R8, R3, R3
AND $0xffffffff, R3, R3
// Fold 32 -> 17 bits.
LSR $16, R3, R8
AND $0xffff, R3, R3
ADD R8, R3, R3
// Fold 17 -> 16 (the final AND below absorbs the leftover bit 16).
LSR $16, R3, R8
ADD R8, R3, R3
// R3's low 16 bits hold the 16-bit sum in machine (LE) byte order; flip
// to big-endian to match the gvisor API contract. REV16W swaps bytes
// within each 16-bit halfword of the low 32 bits, so it acts as a
// 16-bit byte-swap on the live low 16.
REV16W R3, R3
AND $0xffff, R3, R3
MOVH R3, ret+32(FP)
RET