mirror of
https://github.com/slackhq/nebula.git
synced 2026-05-16 12:57:38 +02:00
overlay/tio: KeepAlive writev iovec and payloads through the syscall
rawWrite passed the iovec pointer to syscall.Syscall as a uintptr, so
the Go compiler's liveness analysis could not keep the underlying
[]unix.Iovec (or the payload slices its Base pointers reach) rooted
across the syscall. Under heavy sustained write load, GC could
collect or move these before tun_chr_write_iter finished reading
them, at which point the kernel read freed memory.
Observed on a UniFi UXG-Pro (Annapurna Labs Alpine V2, arm64, Linux
4.19.152) forwarding 1 Gbps iperf3 -R between LAN and a remote
Nebula peer, as two paired kernel warnings in the same second:
refcount_t: underflow; use-after-free
sock_wfree -> skb_release_head_state -> kfree_skb
-> skb_release_data -> __kfree_skb -> tcp_recvmsg ...
refcount_t: addition on 0; use-after-free
skb_set_owner_w -> sock_alloc_send_pskb
-> tun_get_user -> tun_chr_write_iter -> do_iter_write
-> vfs_writev -> do_writev -> __arm64_sys_writev
The Annapurna watchdog then soft-rebooted the device. No crash or
kernel WARN after patching; box ran sustained 1 Gbps iperf3 -R
without issue.
Fix: add a variadic `keepAlive ...interface{}` parameter to
rawWrite, and call runtime.KeepAlive on the iovec plus every
supplied root after the syscall returns. writeWithScratch now
passes its buffer + iovec; WriteGSO passes the iovec array, the
header buffer, and the payload fragment slice.
runtime.KeepAlive is a compiler-recognized no-op, not a runtime barrier,
so the cost is effectively zero: it just forces the compiler's
liveness analysis to treat the object as used at that point.
This commit is contained in:
@@ -4,6 +4,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
|
"runtime"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"syscall"
|
"syscall"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
@@ -264,12 +265,23 @@ func (r *Offload) writeWithScratch(buf []byte, iovs *[2]unix.Iovec) (int, error)
|
|||||||
iovs[1].Base = &buf[0]
|
iovs[1].Base = &buf[0]
|
||||||
iovs[1].SetLen(len(buf))
|
iovs[1].SetLen(len(buf))
|
||||||
iovPtr := unsafe.Pointer(&iovs[0])
|
iovPtr := unsafe.Pointer(&iovs[0])
|
||||||
return r.rawWrite(iovPtr, 2)
|
// Pin the caller's buffer AND the iovec array through the syscall.
|
||||||
|
return r.rawWrite(iovPtr, 2, buf, iovs)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *Offload) rawWrite(iovs unsafe.Pointer, iovcnt int) (int, error) {
|
func (r *Offload) rawWrite(iovs unsafe.Pointer, iovcnt int, keepAlive ...interface{}) (int, error) {
|
||||||
for {
|
for {
|
||||||
n, _, errno := syscall.Syscall(unix.SYS_WRITEV, uintptr(r.fd), uintptr(iovs), uintptr(iovcnt))
|
n, _, errno := syscall.Syscall(unix.SYS_WRITEV, uintptr(r.fd), uintptr(iovs), uintptr(iovcnt))
|
||||||
|
// Anchor the iovec array + every user-supplied payload slice
|
||||||
|
// through the syscall return. Without these, Go's GC may move or
|
||||||
|
// collect the underlying backing arrays while the kernel is still
|
||||||
|
// reading them via DMA (we pass the iovec as uintptr, so the
|
||||||
|
// compiler does not keep it live). Observed in practice as a
|
||||||
|
// kernel refcount underflow on tun_chr_write_iter / sock_wfree.
|
||||||
|
runtime.KeepAlive(iovs)
|
||||||
|
for _, ka := range keepAlive {
|
||||||
|
runtime.KeepAlive(ka)
|
||||||
|
}
|
||||||
if errno == 0 {
|
if errno == 0 {
|
||||||
if int(n) < virtioNetHdrLen {
|
if int(n) < virtioNetHdrLen {
|
||||||
return 0, io.ErrShortWrite
|
return 0, io.ErrShortWrite
|
||||||
@@ -354,7 +366,11 @@ func (r *Offload) WriteGSO(hdr []byte, pays [][]byte, gsoSize uint16, isV6 bool,
|
|||||||
|
|
||||||
iovPtr := unsafe.Pointer(&r.gsoIovs[0])
|
iovPtr := unsafe.Pointer(&r.gsoIovs[0])
|
||||||
iovCnt := len(r.gsoIovs)
|
iovCnt := len(r.gsoIovs)
|
||||||
_, err := r.rawWrite(iovPtr, iovCnt)
|
// Pin EVERYTHING the kernel might still read via DMA: the backing iovec
|
||||||
|
// slice, the IP/TCP header buffer, and every individual payload
|
||||||
|
// fragment. Skipping any of these risks a use-after-free in
|
||||||
|
// tun_chr_write_iter if GC runs mid-syscall.
|
||||||
|
_, err := r.rawWrite(iovPtr, iovCnt, r.gsoIovs, hdr, pays)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user