broken checkpt

This commit is contained in:
JackDoan
2026-05-14 15:56:34 -05:00
parent d429dab5dd
commit d50c3028a2
17 changed files with 706 additions and 258 deletions

View File

@@ -46,42 +46,6 @@ type Queue interface {
Capabilities() Capabilities
}
// GSOInfo carries the segmentation metadata for a kernel-supplied
// superpacket held in Packet.Bytes. Its zero value means "not a
// superpacket": Bytes is a single ordinary IP datagram and no segmentation
// is required.
type GSOInfo struct {
	// Size is the GSO segment size, i.e. the maximum payload bytes per
	// segment (the TCP MSS for TSO, the UDP payload chunk for USO). A zero
	// Size marks a packet that is not a superpacket.
	Size uint16

	// HdrLen is the combined L3+L4 header length inside Bytes. It has
	// already been corrected (virtio.CorrectHdrLen) by the time it is stored
	// here, so it is safe to slice on.
	HdrLen uint16

	// CsumStart is the offset of the L4 header inside Bytes, which equals
	// the L3 header length.
	CsumStart uint16

	// Proto names the L4 protocol (TCP or UDP) so the segmenter knows which
	// checksum/header layout to apply.
	Proto GSOProto
}

// IsSuperpacket reports whether g describes a multi-segment GSO/USO
// superpacket, i.e. one that must be segmented before its bytes can be
// encrypted and sent on the wire. Only Size is consulted: any non-zero
// segment size counts as a superpacket.
func (g GSOInfo) IsSuperpacket() bool {
	return g.Size != 0
}

// GSOProto selects the L4 protocol of a GSO superpacket. It decides which
// VIRTIO_NET_HDR_GSO_* type the writer stamps and which checksum offset
// inside the transport header virtio NEEDS_CSUM expects.
type GSOProto uint8

const (
	// GSOProtoNone is the zero value: no L4 protocol selected.
	GSOProtoNone GSOProto = iota
	// GSOProtoTCP marks a TCP superpacket (TSO).
	GSOProtoTCP
	// GSOProtoUDP marks a UDP superpacket (USO).
	GSOProtoUDP
)
// GSOWriter is implemented by Queues that can emit a TCP or UDP superpacket
// assembled from a header prefix plus one or more borrowed payload
// fragments, in a single vectored write (writev with a leading
@@ -104,24 +68,25 @@ const (
// implementation of GSOWriter is necessary but not sufficient since USO
// may not have been negotiated even when TSO was.
type GSOWriter interface {
WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto GSOProto) error
WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto wire.GSOProto) error
}
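// SupportsGSO reports whether w implements GSOWriter and the underlying
// queue advertises the negotiated capability for `want`. Capabilities is
// part of the Queue interface, so every writer is asked: TCP requires
// caps.TSO, UDP requires caps.USO, and any other proto is never supported.
// A nil writer is returned only when w does not implement GSOWriter;
// otherwise the writer is returned even when the bool is false.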
func SupportsGSO(w Queue, want GSOProto) (GSOWriter, bool) {
func SupportsGSO(w Queue, want wire.GSOProto) (GSOWriter, bool) {
gw, ok := w.(GSOWriter)
if !ok {
return nil, false
}
caps := w.Capabilities()
switch want {
case GSOProtoTCP:
case wire.GSOProtoTCP:
return gw, caps.TSO
case GSOProtoUDP:
case wire.GSOProtoUDP:
return gw, caps.USO
default:
return gw, false
}
return gw, false
}

View File

@@ -10,6 +10,7 @@ import (
"syscall"
"unsafe"
"github.com/slackhq/nebula/wire"
"golang.org/x/sys/unix"
"github.com/slackhq/nebula/overlay/tio/virtio"
@@ -67,9 +68,6 @@ type Offload struct {
// events.
writeLock sync.Mutex
closed atomic.Bool
rxBuf []byte // backing store for kernel-handed packets read this drain
rxOff int // cursor into rxBuf for the current Read drain
pending []Packet // packets returned from the most recent Read
// readVnetScratch holds the 10-byte virtio_net_hdr split off the front of
// every TUN read via readv(2). Decoupling the header from the packet body
@@ -115,9 +113,7 @@ func newOffload(fd int, shutdownFd int, usoEnabled bool) (*Offload, error) {
{Fd: int32(shutdownFd), Events: unix.POLLIN},
},
writeLock: sync.Mutex{},
rxBuf: make([]byte, tunRxBufCap),
gsoIovs: make([]unix.Iovec, 2, gsoMaxIovs),
gsoIovs: make([]unix.Iovec, 2, gsoMaxIovs),
}
out.gsoIovs[0].Base = &out.gsoHdrBuf[0]
@@ -197,9 +193,9 @@ func (r *Offload) blockOnWrite() error {
// hold one worst-case kernel-supplied packet body. Without that gate the
// body iovec could be smaller than the next inbound packet and the
// kernel would truncate.
func (r *Offload) readPacket(block bool) (int, error) {
func (r *Offload) readPacket(mem []byte, block bool) (int, error) {
for {
r.readIovs[1].Base = &r.rxBuf[r.rxOff]
r.readIovs[1].Base = &mem[0]
r.readIovs[1].SetLen(tunReadBufSize)
n, _, errno := syscall.Syscall(unix.SYS_READV, uintptr(r.fd), uintptr(unsafe.Pointer(&r.readIovs[0])), uintptr(len(r.readIovs)))
if errno == 0 {
@@ -237,29 +233,33 @@ func (r *Offload) readPacket(block bool) (int, error) {
// bursts of small packets (e.g. TCP ACKs). Packet.Bytes slices point
// into the Offload's internal buffer and are only valid until the next
// Read or Close on this Queue.
func (r *Offload) Read() ([]Packet, error) {
r.pending = r.pending[:0]
r.rxOff = 0
func (r *Offload) Read(p []wire.TunPacket, mem []byte) (int, error) {
maxP := len(p)
maxM := len(mem)
p = p[:0]
rxOff := 0
// Initial (blocking) read. Retry on decode errors so a single bad
// packet does not stall the reader.
for {
n, err := r.readPacket(true)
n, err := r.readPacket(mem, true)
if err != nil {
return nil, err
return 0, err
}
if err := r.decodeRead(n); err != nil {
if p, err = r.decodeRead(p, mem, n); err != nil {
// Drop and read again — a bad packet should not kill the reader.
continue
}
rxOff += n
break
}
// Drain: non-blocking reads until the kernel queue is empty, the drain
// cap is reached, or rxBuf no longer has room for another worst-case
// kernel-supplied packet (tunRxBufSize).
for len(r.pending) < tunDrainCap && tunRxBufCap-r.rxOff >= tunRxBufSize {
n, err := r.readPacket(false)
for len(p) < maxP && maxM-rxOff >= tunRxBufSize {
n, err := r.readPacket(mem[rxOff:], false)
if err != nil {
// EAGAIN / EINTR / anything else: stop draining. We already
// have a valid batch from the first read.
@@ -268,14 +268,15 @@ func (r *Offload) Read() ([]Packet, error) {
if n <= 0 {
break
}
if err := r.decodeRead(n); err != nil {
if p, err = r.decodeRead(p, mem, n); err != nil {
// Drop this packet and stop the drain; we'd rather hand off
// what we have than keep spinning here.
break
}
rxOff += n
}
return r.pending, nil
return len(p), nil
}
// decodeRead processes the packet sitting in rxBuf at rxOff (length
@@ -285,24 +286,23 @@ func (r *Offload) Read() ([]Packet, error) {
// caller can segment lazily at encrypt time. rxOff advances past the
// kernel-supplied body and nothing else, since segmentation no longer
// writes back into rxBuf.
func (r *Offload) decodeRead(pktLen int) error {
func (r *Offload) decodeRead(p []wire.TunPacket, mem []byte, pktLen int) ([]wire.TunPacket, error) {
if pktLen <= 0 {
return fmt.Errorf("short tun read: %d", pktLen)
return p, fmt.Errorf("short tun read: %d", pktLen)
}
var hdr virtio.Hdr
hdr.Decode(r.readVnetScratch[:])
body := r.rxBuf[r.rxOff : r.rxOff+pktLen]
body := mem[:pktLen]
if hdr.GSOType == unix.VIRTIO_NET_HDR_GSO_NONE {
if hdr.Flags&unix.VIRTIO_NET_HDR_F_NEEDS_CSUM != 0 {
if err := virtio.FinishChecksum(body, hdr); err != nil {
return err
return p, err
}
}
r.pending = append(r.pending, Packet{Bytes: body})
r.rxOff += pktLen
return nil
p = append(p, wire.TunPacket{Bytes: body})
return p, nil
}
// GSO superpacket: validate, fix the kernel-supplied HdrLen on the
@@ -310,26 +310,25 @@ func (r *Offload) decodeRead(pktLen int) error {
// the metadata. The bytes stay in rxBuf untouched, segmentation
// happens in SegmentSuperpacket at encrypt time.
if err := virtio.CheckValid(body, hdr); err != nil {
return err
return p, err
}
if err := virtio.CorrectHdrLen(body, &hdr); err != nil {
return err
return p, err
}
proto, err := protoFromGSOType(hdr.GSOType)
if err != nil {
return err
return p, err
}
r.pending = append(r.pending, Packet{
p = append(p, wire.TunPacket{
Bytes: body,
GSO: GSOInfo{
Meta: wire.GSOInfo{
Size: hdr.GSOSize,
HdrLen: hdr.HdrLen,
CsumStart: hdr.CsumStart,
Proto: proto,
},
})
r.rxOff += pktLen
return nil
return p, nil
}
func (r *Offload) Write(buf []byte) (int, error) {
@@ -384,7 +383,7 @@ func (r *Offload) Capabilities() Capabilities {
return Capabilities{TSO: true, USO: r.usoEnabled}
}
func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto GSOProto) error {
func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto wire.GSOProto) error {
if len(hdr) == 0 || len(pays) == 0 || len(transportHdr) == 0 {
return nil
}
@@ -392,7 +391,7 @@ func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto
// seq/ack/dataoff/flags/window), UDP=6 (after sport/dport/length).
var csumOff uint16
switch proto {
case GSOProtoUDP:
case wire.GSOProtoUDP:
csumOff = 6
default:
csumOff = 16
@@ -407,7 +406,7 @@ func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto
if len(pays) > 1 {
ipVer := hdr[0] >> 4
switch {
case proto == GSOProtoUDP && (ipVer == 4 || ipVer == 6):
case proto == wire.GSOProtoUDP && (ipVer == 4 || ipVer == 6):
vhdr.GSOType = unix.VIRTIO_NET_HDR_GSO_UDP_L4
case ipVer == 6:
vhdr.GSOType = unix.VIRTIO_NET_HDR_GSO_TCPV6

View File

@@ -5,6 +5,7 @@ import (
"os"
"sync/atomic"
"github.com/slackhq/nebula/wire"
"golang.org/x/sys/unix"
)
@@ -19,9 +20,6 @@ type Poll struct {
readPoll [2]unix.PollFd
writePoll [2]unix.PollFd
closed atomic.Bool
readBuf []byte
batchRet [1]Packet
}
func newPoll(fd int, shutdownFd int) (*Poll, error) {
@@ -31,8 +29,7 @@ func newPoll(fd int, shutdownFd int) (*Poll, error) {
}
out := &Poll{
fd: fd,
readBuf: make([]byte, tunReadBufSize),
fd: fd,
readPoll: [2]unix.PollFd{
{Fd: int32(fd), Events: unix.POLLIN},
{Fd: int32(shutdownFd), Events: unix.POLLIN},
@@ -97,13 +94,17 @@ func (t *Poll) blockOnWrite() error {
return nil
}
func (t *Poll) Read() ([]Packet, error) {
n, err := t.readOne(t.readBuf)
if err != nil {
return nil, err
func (t *Poll) Read(p []wire.TunPacket, mem []byte) (int, error) {
if len(p) == 0 || len(mem) == 0 {
return 0, nil //todo should this be an err?
}
t.batchRet[0] = Packet{Bytes: t.readBuf[:n]}
return t.batchRet[:], nil
p[0].Meta = wire.GSOInfo{}
n, err := t.readOne(mem)
if err != nil {
return 0, err
}
p[0].Bytes = mem[:n]
return 1, nil
}
func (t *Poll) readOne(to []byte) (int, error) {
@@ -162,3 +163,7 @@ func (t *Poll) Close() error {
return err
}
// Capabilities returns the zero Capabilities value: a Poll queue advertises
// no offload capability at all.
func (t *Poll) Capabilities() Capabilities {
	var none Capabilities
	return none
}

View File

@@ -6,46 +6,20 @@ package tio
import (
"fmt"
"github.com/slackhq/nebula/wire"
"golang.org/x/sys/unix"
"github.com/slackhq/nebula/overlay/tio/virtio"
)
// protoFromGSOType maps a virtio_net_hdr GSOType to the GSOProto value the
// segment-time helpers use. Returns an error for GSO_NONE or any unknown
// value — the caller should only invoke this on a confirmed superpacket.
func protoFromGSOType(t uint8) (GSOProto, error) {
func protoFromGSOType(t uint8) (wire.GSOProto, error) {
switch t {
case unix.VIRTIO_NET_HDR_GSO_TCPV4, unix.VIRTIO_NET_HDR_GSO_TCPV6:
return GSOProtoTCP, nil
return wire.GSOProtoTCP, nil
case unix.VIRTIO_NET_HDR_GSO_UDP_L4:
return GSOProtoUDP, nil
return wire.GSOProtoUDP, nil
default:
return 0, fmt.Errorf("unsupported virtio gso type: %d", t)
}
}
// SegmentSuperpacket calls fn once for every segment carried by pkt.
//
// A pkt whose GSO metadata does not mark it as a superpacket is passed
// through untouched: fn is called exactly once with pkt.Bytes, with no
// segmentation and no copy. A TCP or UDP superpacket is handed to
// virtio.SegmentTCP or virtio.SegmentUDP along with the header length,
// checksum start and segment size recorded in pkt.GSO; fn is then called
// once per segment with a slice of pkt.Bytes holding that segment's
// plaintext (a freshly-patched L3+L4 header sliced in front of the original
// payload chunk).
//
// Segmenting is destructive: pkt is consumed by this call and its bytes are
// in an undefined state when SegmentSuperpacket returns. Callers must not
// retain pkt, or any earlier seg slice, past fn's return for that segment.
//
// The first error from fn or from per-segment construction aborts the walk
// and is returned. A superpacket whose Proto is neither TCP nor UDP yields
// an "unsupported gso proto" error without calling fn.
func SegmentSuperpacket(pkt Packet, fn func(seg []byte) error) error {
	gso := pkt.GSO
	if !gso.IsSuperpacket() {
		// Plain datagram: nothing to split.
		return fn(pkt.Bytes)
	}

	if gso.Proto == GSOProtoTCP {
		return virtio.SegmentTCP(pkt.Bytes, gso.HdrLen, gso.CsumStart, gso.Size, fn)
	}
	if gso.Proto == GSOProtoUDP {
		return virtio.SegmentUDP(pkt.Bytes, gso.HdrLen, gso.CsumStart, gso.Size, fn)
	}
	return fmt.Errorf("unsupported gso proto: %d", gso.Proto)
}