From 7c38aa7e6bbde23878965b96b9b10edff22b5366 Mon Sep 17 00:00:00 2001 From: rawdigits Date: Fri, 24 Apr 2026 21:40:51 +0000 Subject: [PATCH] overlay/tio: KeepAlive writev iovec and payloads through the syscall rawWrite passed the iovec pointer to syscall.Syscall as a uintptr, so the Go compiler's liveness analysis could not keep the underlying []unix.Iovec (or the payload slices its Base pointers reach) rooted across the syscall. Under heavy sustained write load, GC could collect or move these before tun_chr_write_iter finished reading them, at which point the kernel was reading freed memory. Observed on a UniFi UXG-Pro (Annapurna Labs Alpine V2, arm64, Linux 4.19.152) forwarding 1 Gbps iperf3 -R between LAN and a remote Nebula peer, as two paired kernel warnings in the same second: refcount_t: underflow; use-after-free sock_wfree -> skb_release_head_state -> kfree_skb -> skb_release_data -> __kfree_skb -> tcp_recvmsg ... refcount_t: addition on 0; use-after-free skb_set_owner_w -> sock_alloc_send_pskb -> tun_get_user -> tun_chr_write_iter -> do_iter_write -> vfs_writev -> do_writev -> __arm64_sys_writev The Annapurna watchdog then soft-rebooted the device. No crash or kernel WARN after patching; the box ran sustained 1 Gbps iperf3 -R without issue. Fix: add a variadic `keepAlive ...interface{}` parameter to rawWrite, and call runtime.KeepAlive on the iovec plus every supplied root after the syscall returns. writeWithScratch now passes its buffer + iovec; WriteGSO passes the iovec array, the header buffer, and the payload fragment slice. runtime.KeepAlive is a compiler directive, not a runtime barrier, so the cost is effectively zero: it just forces the compiler's liveness analysis to treat the object as used at that point. 
--- overlay/tio/tio_gso_linux.go | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/overlay/tio/tio_gso_linux.go b/overlay/tio/tio_gso_linux.go index 70e8038b..27e54adb 100644 --- a/overlay/tio/tio_gso_linux.go +++ b/overlay/tio/tio_gso_linux.go @@ -4,6 +4,7 @@ import ( "fmt" "io" "os" + "runtime" "sync/atomic" "syscall" "unsafe" @@ -264,12 +265,23 @@ func (r *Offload) writeWithScratch(buf []byte, iovs *[2]unix.Iovec) (int, error) iovs[1].Base = &buf[0] iovs[1].SetLen(len(buf)) iovPtr := unsafe.Pointer(&iovs[0]) - return r.rawWrite(iovPtr, 2) + // Pin the caller's buffer AND the iovec array through the syscall. + return r.rawWrite(iovPtr, 2, buf, iovs) } -func (r *Offload) rawWrite(iovs unsafe.Pointer, iovcnt int) (int, error) { +func (r *Offload) rawWrite(iovs unsafe.Pointer, iovcnt int, keepAlive ...interface{}) (int, error) { for { n, _, errno := syscall.Syscall(unix.SYS_WRITEV, uintptr(r.fd), uintptr(iovs), uintptr(iovcnt)) + // Anchor the iovec array + every user-supplied payload slice + // through the syscall return. Without these, Go's GC may move or + // collect the underlying backing arrays while the kernel is still + // reading them via DMA (we pass the iovec as uintptr, so the + // compiler does not keep it live). Observed in practice as a + // kernel refcount underflow on tun_chr_write_iter / sock_wfree. + runtime.KeepAlive(iovs) + for _, ka := range keepAlive { + runtime.KeepAlive(ka) + } if errno == 0 { if int(n) < virtioNetHdrLen { return 0, io.ErrShortWrite @@ -354,7 +366,11 @@ func (r *Offload) WriteGSO(hdr []byte, pays [][]byte, gsoSize uint16, isV6 bool, iovPtr := unsafe.Pointer(&r.gsoIovs[0]) iovCnt := len(r.gsoIovs) - _, err := r.rawWrite(iovPtr, iovCnt) + // Pin EVERYTHING the kernel might still read via DMA: the backing iovec + // slice, the IP/TCP header buffer, and every individual payload + // fragment. Skipping any of these risks a use-after-free in + // tun_chr_write_iter if GC runs mid-syscall. 
+ _, err := r.rawWrite(iovPtr, iovCnt, r.gsoIovs, hdr, pays) return err }