Files
nebula/overlay/tio/tio_poll_linux.go
rawdigits 3dea496c7f overlay/tio: KeepAlive poll-path readv/writev buffers too
The Poll fallback (used when IFF_VNET_HDR can't be enabled) has the
same unsafe-pointer-via-uintptr pattern as Offload.rawWrite:
readOne/writeOne build a [2]syscall.Iovec on the stack, pass it to
syscall.Syscall as uintptr, and the kernel then DMAs in/out of the
to/from slices whose Base pointers the iovec holds.

Escape analysis can't see that use, so under GC pressure the
backing memory could be collected or moved mid-syscall.

Add runtime.KeepAlive on the iovec and the user buffer around both
the SYS_READV and SYS_WRITEV syscalls. Same pattern and rationale
as the prior commit on the offload path.
2026-04-24 21:44:37 +00:00

218 lines
4.8 KiB
Go

package tio
import (
"fmt"
"os"
"runtime"
"sync/atomic"
"syscall"
"unsafe"
"golang.org/x/sys/unix"
)
// Maximum size we accept for a single read from a TUN with IFF_VNET_HDR. A
// TSO superpacket can be up to 64KiB of payload plus a single L2/L3/L4 header
// prefix plus the virtio header.
const tunReadBufSize = 65535
type Poll struct {
fd int
readPoll [2]unix.PollFd
writePoll [2]unix.PollFd
closed atomic.Bool
readBuf []byte
batchRet [1][]byte
}
func newPoll(fd int, shutdownFd int) (*Poll, error) {
if err := unix.SetNonblock(fd, true); err != nil {
_ = unix.Close(fd)
return nil, fmt.Errorf("failed to set Poll device as nonblocking: %w", err)
}
out := &Poll{
fd: fd,
readBuf: make([]byte, tunReadBufSize),
readPoll: [2]unix.PollFd{
{Fd: int32(fd), Events: unix.POLLIN},
{Fd: int32(shutdownFd), Events: unix.POLLIN},
},
writePoll: [2]unix.PollFd{
{Fd: int32(fd), Events: unix.POLLOUT},
{Fd: int32(shutdownFd), Events: unix.POLLIN},
},
}
return out, nil
}
// blockOnRead waits until the Poll fd is readable or shutdown has been signaled.
// Returns os.ErrClosed if Close was called.
func (t *Poll) blockOnRead() error {
const problemFlags = unix.POLLHUP | unix.POLLNVAL | unix.POLLERR
var err error
for {
_, err = unix.Poll(t.readPoll[:], -1)
if err != unix.EINTR {
break
}
}
tunEvents := t.readPoll[0].Revents
shutdownEvents := t.readPoll[1].Revents
t.readPoll[0].Revents = 0
t.readPoll[1].Revents = 0
if err != nil {
return err
}
if shutdownEvents&(unix.POLLIN|problemFlags) != 0 {
return os.ErrClosed
}
if tunEvents&problemFlags != 0 {
return os.ErrClosed
}
return nil
}
func (t *Poll) blockOnWrite() error {
const problemFlags = unix.POLLHUP | unix.POLLNVAL | unix.POLLERR
var err error
for {
_, err = unix.Poll(t.writePoll[:], -1)
if err != unix.EINTR {
break
}
}
tunEvents := t.writePoll[0].Revents
shutdownEvents := t.writePoll[1].Revents
t.writePoll[0].Revents = 0
t.writePoll[1].Revents = 0
if err != nil {
return err
}
if shutdownEvents&(unix.POLLIN|problemFlags) != 0 {
return os.ErrClosed
}
if tunEvents&problemFlags != 0 {
return os.ErrClosed
}
return nil
}
func (t *Poll) Read() ([][]byte, error) {
if t.readBuf == nil {
t.readBuf = make([]byte, defaultBatchBufSize)
}
n, err := t.readOne(t.readBuf)
if err != nil {
return nil, err
}
t.batchRet[0] = t.readBuf[:n]
return t.batchRet[:], nil
}
func (t *Poll) readOne(to []byte) (int, error) {
// first 4 bytes is protocol family, in network byte order
var head [4]byte
iovecs := [2]syscall.Iovec{ //todo plat-specific
{&head[0], 4},
{&to[0], uint64(len(to))},
}
for {
n, _, errno := syscall.Syscall(syscall.SYS_READV, uintptr(t.fd), uintptr(unsafe.Pointer(&iovecs[0])), 2)
// Pin the iovec + destination buffer backing array across the syscall.
// Without these the Go runtime may move/GC them while the kernel is
// still writing via DMA (we pass the iovec as uintptr, which hides it
// from escape analysis). Same class of bug as rawWrite in the Offload
// path.
runtime.KeepAlive(iovecs)
runtime.KeepAlive(to)
if errno == 0 {
bytesRead := int(n)
if bytesRead < 4 {
return 0, nil
}
return bytesRead - 4, nil
}
switch errno {
case unix.EAGAIN:
if err := t.blockOnRead(); err != nil {
return 0, err
}
case unix.EINTR:
// retry
case unix.EBADF:
return 0, os.ErrClosed
default:
return 0, errno
}
}
}
// Write is only valid for single threaded use
func (t *Poll) Write(from []byte) (int, error) {
if len(from) <= 1 {
return 0, syscall.EIO
}
ipVer := from[0] >> 4
var head [4]byte
// first 4 bytes is protocol family, in network byte order
switch ipVer {
case 4:
head[3] = syscall.AF_INET
case 6:
head[3] = syscall.AF_INET6
default:
return 0, fmt.Errorf("unable to determine IP version from packet")
}
iovecs := [2]syscall.Iovec{ //todo plat specific
{&head[0], 4},
{&from[0], uint64(len(from))},
}
for {
n, _, errno := syscall.Syscall(syscall.SYS_WRITEV, uintptr(t.fd), uintptr(unsafe.Pointer(&iovecs[0])), 2)
// Pin the iovec + source buffer backing array across the syscall.
// See readOne's KeepAlive comment for rationale.
runtime.KeepAlive(iovecs)
runtime.KeepAlive(from)
if errno == 0 {
return int(n) - 4, nil
}
switch errno {
case unix.EAGAIN:
if err := t.blockOnWrite(); err != nil {
return 0, err
}
case unix.EINTR:
// retry
case unix.EBADF:
return 0, os.ErrClosed
default:
return 0, errno
}
}
}
func (t *Poll) Close() error {
if t.closed.Swap(true) {
return nil
}
//shutdownFd is owned by the container, so we should not close it
var err error
if t.fd >= 0 {
err = unix.Close(t.fd)
t.fd = -1
}
return err
}
func (t *Poll) WriteReject(p []byte) (int, error) {
return t.Write(p)
}