broken checkpt

JackDoan
2026-05-14 15:56:34 -05:00
parent d429dab5dd
commit d50c3028a2
17 changed files with 706 additions and 258 deletions

View File

@@ -283,7 +283,7 @@ func (f *Interface) activate() error {
 	}
 	f.readers = f.inside.Readers()
 	for i := range f.readers {
-		caps := tio.QueueCapabilities(f.readers[i])
+		caps := f.readers[i].Capabilities()
 		if caps.TSO || caps.USO {
 			// Multi-lane: TCP gets coalesced when TSO is on, UDP when USO
 			// is on, everything else (and either lane disabled) falls
@@ -387,7 +387,19 @@ func (f *Interface) listenIn(reader tio.Queue, q int) {
 		f.l.Warn("failed to pin tun reader to CPU", "queue", q, "cpu", cpu, "err", err)
 	}
+	const bonusInfo = 16
+	bufferScale := udp.MTU + bonusInfo
+	numTunPackets := 1
+	caps := reader.Capabilities()
+	if caps.TSO || caps.USO {
+		bufferScale = 65535 + bonusInfo
+		numTunPackets = f.batchSize
+	}
 	rejectBuf := make([]byte, mtu)
+	tunPackets := make([]wire.TunPacket, numTunPackets)
+	packetMem := make([]byte, bufferScale*numTunPackets)
 	arenaSize := batch.SendBatchCap * (udp.MTU + 32)
 	sb := batch.NewSendBatch(f.writers[q], batch.SendBatchCap, util.NewArena(arenaSize))
 	fwPacket := &firewall.Packet{}
@@ -396,7 +408,7 @@ func (f *Interface) listenIn(reader tio.Queue, q int) {
 	conntrackCache := firewall.NewConntrackCacheTicker(f.ctx, f.l, f.conntrackCacheTimeout)
 	for {
-		n, err := reader.Read(packets, packetMem)
+		n, err := reader.Read(tunPackets, packetMem)
 		if err != nil {
 			if !f.closed.Load() {
 				f.l.Error("Error while reading outbound packet, closing", "error", err, "reader", q)
@@ -407,7 +419,7 @@ func (f *Interface) listenIn(reader tio.Queue, q int) {
 		ctCache := conntrackCache.Get()
 		for i := range n {
-			f.consumeInsidePacket(packets[i], fwPacket, nb, sb, rejectBuf, q, ctCache)
+			f.consumeInsidePacket(tunPackets[i], fwPacket, nb, sb, rejectBuf, q, ctCache)
 		}
 		if err := sb.Flush(); err != nil {
 			f.l.Error("Failed to write outgoing batch", "error", err, "writer", q)

View File

@@ -147,44 +147,3 @@ func mergeECNIntoSeed(seedHdr, pktHdr []byte, isV6 bool) {
 		seedHdr[1] |= pktHdr[1] & 0x03
 	}
 }
-
-// Arena is an injectable byte-slab that hands out non-overlapping borrowed
-// slices via Reserve and releases them in bulk via Reset. Coalescers take
-// an *Arena at construction so the caller controls the slab lifetime and
-// can share one slab across multiple coalescers (MultiCoalescer hands the
-// same *Arena to every lane so the lanes don't carry their own backings).
-//
-// Reserve borrows; the slice is valid until the next Reset. The slab grows
-// (by allocating a fresh, larger backing array) if a Reserve doesn't fit;
-// pre-size the arena via NewArena to avoid that path on the hot path.
-type Arena struct {
-	buf []byte
-}
-
-// NewArena returns an Arena with a pre-allocated backing of the given
-// capacity. Pass 0 if you don't intend to call Reserve (e.g. a test that
-// only feeds the coalescer pre-made []byte packets via Commit).
-func NewArena(capacity int) *Arena {
-	return &Arena{buf: make([]byte, 0, capacity)}
-}
-
-// Reserve hands out a non-overlapping sz-byte slice from the arena. If the
-// request doesn't fit the current backing, a fresh, larger backing is
-// allocated; already-borrowed slices reference the old backing and remain
-// valid until Reset.
-func (a *Arena) Reserve(sz int) []byte {
-	if len(a.buf)+sz > cap(a.buf) {
-		newCap := max(cap(a.buf)*2, sz)
-		a.buf = make([]byte, 0, newCap)
-	}
-	start := len(a.buf)
-	a.buf = a.buf[:start+sz]
-	return a.buf[start : start+sz : start+sz]
-}
-
-// Reset releases every slice handed out since the last Reset. Callers must
-// not use any previously-borrowed slice after this returns. The underlying
-// backing array is retained so subsequent Reserves don't re-allocate.
-func (a *Arena) Reset() {
-	a.buf = a.buf[:0]
-}
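The deleted type moves to util (see the util.Arena references in the files that follow). Its borrow/release contract is easiest to see in a caller sketch, assuming util exports the type unchanged from what was deleted above:

	arena := util.NewArena(64 * 1024) // pre-sized so the hot path never grows
	var slots [][]byte
	for _, pkt := range batchOfPackets { // hypothetical input batch
		slot := arena.Reserve(len(pkt)) // borrowed; valid only until Reset
		copy(slot, pkt)
		slots = append(slots, slot)
	}
	// ... flush every slot to the writer ...
	arena.Reset() // bulk release: every borrowed slice is now invalid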

View File

@@ -2,8 +2,10 @@ package batch

 import (
 	"errors"
-	"io"
 	"log/slog"
+
+	"github.com/slackhq/nebula/overlay/tio"
+	"github.com/slackhq/nebula/util"
 )

 // MultiCoalescer fans plaintext packets out to lane-specific batchers based
@@ -35,7 +37,7 @@ type MultiCoalescer struct {
 	// sequentially and never Reserves in between, so a later lane's
 	// slots stay readable across an earlier lane's Reset (the underlying
 	// bytes are still alive — Reset only re-slices len to 0).
-	arena *Arena
+	arena *util.Arena
 }

 // DefaultMultiArenaCap is the recommended arena capacity for a Multi-lane
@@ -49,9 +51,9 @@ const DefaultMultiArenaCap = initialSlots * 65535
 // Either lane disabled redirects its traffic into the passthrough lane.
 // arena is the single backing slab shared across every lane; the caller
 // pre-sizes it via NewArena so the hot path never allocates.
-func NewMultiCoalescer(w io.Writer, l *slog.Logger, arena *Arena, tcpEnabled, udpEnabled bool) *MultiCoalescer {
+func NewMultiCoalescer(w tio.Queue, l *slog.Logger, arena *util.Arena, tcpEnabled, udpEnabled bool) *MultiCoalescer {
 	m := &MultiCoalescer{
-		pt:    NewPassthrough(w, arena),
+		pt:    NewPassthrough(w, initialSlots, arena),
 		arena: arena,
 	}
 	if tcpEnabled {

View File

@@ -4,6 +4,7 @@ import (
 	"io"

 	"github.com/slackhq/nebula/udp"
+	"github.com/slackhq/nebula/util"
 )

 // Passthrough is a RxBatcher that doesn't batch anything, it just accumulates and then sends packets.
@@ -11,7 +12,7 @@ type Passthrough struct {
 	out   io.Writer
 	slots [][]byte
 	// arena is injected; see TCPCoalescer.arena for the contract.
-	arena  *Arena
+	arena  *util.Arena
 	cursor int
 }

@@ -21,7 +22,7 @@ const passthroughBaseNumSlots = 128
 // standalone Passthrough batcher: 128 slots × udp.MTU ≈ 1.1 MiB.
 const DefaultPassthroughArenaCap = passthroughBaseNumSlots * udp.MTU

-func NewPassthrough(w io.Writer, slots int, arena *Arena) *Passthrough {
+func NewPassthrough(w io.Writer, slots int, arena *util.Arena) *Passthrough {
 	return &Passthrough{
 		out:   w,
 		slots: make([][]byte, 0, slots),

View File

@@ -10,6 +10,8 @@ import (
 	"slices"

 	"github.com/slackhq/nebula/overlay/tio"
+	"github.com/slackhq/nebula/util"
+	"github.com/slackhq/nebula/wire"
 )

 // ipProtoTCP is the IANA protocol number for TCP. Hardcoded instead of
@@ -88,11 +90,11 @@ type TCPCoalescer struct {
 	// and tells it to release them via Reset on Flush. When wrapped in
 	// MultiCoalescer the same *Arena is shared with the other lanes so
 	// there's exactly one backing slab per Multi instance.
-	arena *Arena
+	arena *util.Arena
 	l     *slog.Logger
 }

-func NewTCPCoalescer(w io.Writer, l *slog.Logger, arena *Arena) *TCPCoalescer {
+func NewTCPCoalescer(w tio.Queue, l *slog.Logger, arena *util.Arena) *TCPCoalescer {
 	c := &TCPCoalescer{
 		plainW: w,
 		slots:  make([]*coalesceSlot, 0, initialSlots),
@@ -101,7 +103,7 @@ func NewTCPCoalescer(w io.Writer, l *slog.Logger, arena *Arena) *TCPCoalescer {
 		arena: arena,
 		l:     l,
 	}
-	if gw, ok := tio.SupportsGSO(w, tio.GSOProtoTCP); ok {
+	if gw, ok := tio.SupportsGSO(w, wire.GSOProtoTCP); ok {
 		c.gsoW = gw
 	}
 	return c
@@ -419,7 +421,7 @@ func (c *TCPCoalescer) flushSlot(s *coalesceSlot) error {
 	tcsum := s.ipHdrLen + 16
 	binary.BigEndian.PutUint16(hdr[tcsum:tcsum+2], foldOnceNoInvert(psum))
-	return c.gsoW.WriteGSO(hdr[:s.ipHdrLen], hdr[s.ipHdrLen:], s.payIovs, tio.GSOProtoTCP)
+	return c.gsoW.WriteGSO(hdr[:s.ipHdrLen], hdr[s.ipHdrLen:], s.payIovs, wire.GSOProtoTCP)
 }

 // headersMatch compares two IP+TCP header prefixes for byte-for-byte

View File

@@ -5,6 +5,8 @@ import (
 	"io"

 	"github.com/slackhq/nebula/overlay/tio"
+	"github.com/slackhq/nebula/util"
+	"github.com/slackhq/nebula/wire"
 )

 // ipProtoUDP is the IANA protocol number for UDP.
@@ -67,7 +69,7 @@ type UDPCoalescer struct {
 	pool []*udpSlot
 	// arena is injected; see TCPCoalescer.arena for the contract.
-	arena *Arena
+	arena *util.Arena
 }

 // NewUDPCoalescer wraps w. The caller is responsible for only constructing
@@ -75,7 +77,7 @@ type UDPCoalescer struct {
 // the kernel may reject GSO_UDP_L4 writes. If w does not implement
 // tio.GSOWriter at all (single-packet Queue), the coalescer degrades to
 // plain Writes — same defensive shape as the TCP coalescer.
-func NewUDPCoalescer(w io.Writer, arena *Arena) *UDPCoalescer {
+func NewUDPCoalescer(w tio.Queue, arena *util.Arena) *UDPCoalescer {
 	c := &UDPCoalescer{
 		plainW: w,
 		slots:  make([]*udpSlot, 0, initialSlots),
@@ -83,7 +85,7 @@ func NewUDPCoalescer(w io.Writer, arena *Arena) *UDPCoalescer {
 		pool:  make([]*udpSlot, 0, initialSlots),
 		arena: arena,
 	}
-	if gw, ok := tio.SupportsGSO(w, tio.GSOProtoUDP); ok {
+	if gw, ok := tio.SupportsGSO(w, wire.GSOProtoUDP); ok {
 		c.gsoW = gw
 	}
 	return c
@@ -313,7 +315,7 @@ func (c *UDPCoalescer) flushSlot(s *udpSlot) error {
 	udpCsumOff := s.ipHdrLen + 6
 	binary.BigEndian.PutUint16(hdr[udpCsumOff:udpCsumOff+2], foldOnceNoInvert(psum))
-	return c.gsoW.WriteGSO(hdr[:s.ipHdrLen], hdr[s.ipHdrLen:], s.payIovs, tio.GSOProtoUDP)
+	return c.gsoW.WriteGSO(hdr[:s.ipHdrLen], hdr[s.ipHdrLen:], s.payIovs, wire.GSOProtoUDP)
 }

 // udpHeadersMatch compares two IP+UDP header prefixes for byte-equality on

View File

@@ -46,42 +46,6 @@ type Queue interface {
 	Capabilities() Capabilities
 }

-// GSOInfo describes a kernel-supplied superpacket sitting in Packet.Bytes.
-// The zero value means "not a superpacket" — Bytes is one regular IP
-// datagram and no segmentation is required.
-type GSOInfo struct {
-	// Size is the GSO segment size: max payload bytes per segment
-	// (== TCP MSS for TSO, == UDP payload chunk for USO). Zero means
-	// not a superpacket.
-	Size uint16
-	// HdrLen is the total L3+L4 header length within Bytes (already
-	// corrected via correctHdrLen, so safe to slice on).
-	HdrLen uint16
-	// CsumStart is the L4 header offset inside Bytes (== L3 header
-	// length).
-	CsumStart uint16
-	// Proto picks the L4 protocol (TCP or UDP) so the segmenter knows
-	// which checksum/header layout to apply.
-	Proto GSOProto
-}
-
-// IsSuperpacket reports whether g describes a multi-segment GSO/USO
-// superpacket that needs segmentation before its bytes can be encrypted
-// and sent on the wire.
-func (g GSOInfo) IsSuperpacket() bool { return g.Size > 0 }
-
-// GSOProto selects the L4 protocol for a GSO superpacket. Determines which
-// VIRTIO_NET_HDR_GSO_* type the writer stamps and which checksum offset
-// inside the transport header virtio NEEDS_CSUM expects.
-type GSOProto uint8
-
-const (
-	GSOProtoNone GSOProto = iota
-	GSOProtoTCP
-	GSOProtoUDP
-)
-
 // GSOWriter is implemented by Queues that can emit a TCP or UDP superpacket
 // assembled from a header prefix plus one or more borrowed payload
 // fragments, in a single vectored write (writev with a leading
@@ -104,24 +68,25 @@
 // implementation of GSOWriter is necessary but not sufficient since USO
 // may not have been negotiated even when TSO was.
 type GSOWriter interface {
-	WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto GSOProto) error
+	WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto wire.GSOProto) error
 }

 // SupportsGSO reports whether w implements GSOWriter and the underlying
 // queue advertises the negotiated capability for `want`. A writer that
 // implements GSOWriter but not CapsProvider is treated as permissive
 // (used by tests and fakes that don't negotiate).
-func SupportsGSO(w Queue, want GSOProto) (GSOWriter, bool) {
+func SupportsGSO(w Queue, want wire.GSOProto) (GSOWriter, bool) {
 	gw, ok := w.(GSOWriter)
 	if !ok {
 		return nil, false
 	}
 	caps := w.Capabilities()
 	switch want {
-	case GSOProtoTCP:
+	case wire.GSOProtoTCP:
 		return gw, caps.TSO
-	case GSOProtoUDP:
+	case wire.GSOProtoUDP:
 		return gw, caps.USO
+	default:
+		return gw, false
 	}
-	return gw, false
 }
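Probing then has one shape on both lanes; a minimal sketch of the intended call pattern, mirroring the TCP/UDP coalescer constructors elsewhere in this commit:

	// Construct-time probe; w is a tio.Queue.
	if gw, ok := tio.SupportsGSO(w, wire.GSOProtoTCP); ok {
		c.gsoW = gw // vectored WriteGSO path available and negotiated
	}
	// else: c.gsoW stays nil and the coalescer degrades to plain Writes on w.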

View File

@@ -10,6 +10,7 @@ import (
 	"syscall"
 	"unsafe"

+	"github.com/slackhq/nebula/wire"
 	"golang.org/x/sys/unix"

 	"github.com/slackhq/nebula/overlay/tio/virtio"
@@ -67,9 +68,6 @@ type Offload struct {
 	// events.
 	writeLock sync.Mutex
 	closed    atomic.Bool
-	rxBuf     []byte   // backing store for kernel-handed packets read this drain
-	rxOff     int      // cursor into rxBuf for the current Read drain
-	pending   []Packet // packets returned from the most recent Read

 	// readVnetScratch holds the 10-byte virtio_net_hdr split off the front of
 	// every TUN read via readv(2). Decoupling the header from the packet body
@@ -115,9 +113,7 @@ func newOffload(fd int, shutdownFd int, usoEnabled bool) (*Offload, error) {
 			{Fd: int32(shutdownFd), Events: unix.POLLIN},
 		},
 		writeLock: sync.Mutex{},
-		gsoIovs:   make([]unix.Iovec, 2, gsoMaxIovs),
-		rxBuf:     make([]byte, tunRxBufCap),
+		gsoIovs: make([]unix.Iovec, 2, gsoMaxIovs),
 	}

 	out.gsoIovs[0].Base = &out.gsoHdrBuf[0]
@@ -197,9 +193,9 @@ func (r *Offload) blockOnWrite() error {
 // hold one worst-case kernel-supplied packet body. Without that gate the
 // body iovec could be smaller than the next inbound packet and the
 // kernel would truncate.
-func (r *Offload) readPacket(block bool) (int, error) {
+func (r *Offload) readPacket(mem []byte, block bool) (int, error) {
 	for {
-		r.readIovs[1].Base = &r.rxBuf[r.rxOff]
+		r.readIovs[1].Base = &mem[0]
 		r.readIovs[1].SetLen(tunReadBufSize)
 		n, _, errno := syscall.Syscall(unix.SYS_READV, uintptr(r.fd), uintptr(unsafe.Pointer(&r.readIovs[0])), uintptr(len(r.readIovs)))
 		if errno == 0 {
@@ -237,29 +233,33 @@
 // bursts of small packets (e.g. TCP ACKs). Packet.Bytes slices point
 // into the Offload's internal buffer and are only valid until the next
 // Read or Close on this Queue.
-func (r *Offload) Read() ([]Packet, error) {
-	r.pending = r.pending[:0]
-	r.rxOff = 0
+func (r *Offload) Read(p []wire.TunPacket, mem []byte) (int, error) {
+	maxP := len(p)
+	maxM := len(mem)
+	p = p[:0]
+	rxOff := 0

 	// Initial (blocking) read. Retry on decode errors so a single bad
 	// packet does not stall the reader.
 	for {
-		n, err := r.readPacket(true)
+		n, err := r.readPacket(mem, true)
 		if err != nil {
-			return nil, err
+			return 0, err
 		}
-		if err := r.decodeRead(n); err != nil {
+		if p, err = r.decodeRead(p, mem, n); err != nil {
 			// Drop and read again — a bad packet should not kill the reader.
 			continue
 		}
+		rxOff += n
 		break
 	}

 	// Drain: non-blocking reads until the kernel queue is empty, the drain
 	// cap is reached, or rxBuf no longer has room for another worst-case
 	// kernel-supplied packet (tunRxBufSize).
-	for len(r.pending) < tunDrainCap && tunRxBufCap-r.rxOff >= tunRxBufSize {
-		n, err := r.readPacket(false)
+	for len(p) < maxP && maxM-rxOff >= tunRxBufSize {
+		n, err := r.readPacket(mem[rxOff:], false)
 		if err != nil {
 			// EAGAIN / EINTR / anything else: stop draining. We already
 			// have a valid batch from the first read.
@@ -268,14 +268,15 @@
 		if n <= 0 {
 			break
 		}
-		if err := r.decodeRead(n); err != nil {
+		if p, err = r.decodeRead(p, mem, n); err != nil {
 			// Drop this packet and stop the drain; we'd rather hand off
 			// what we have than keep spinning here.
 			break
 		}
+		rxOff += n
 	}

-	return r.pending, nil
+	return len(p), nil
 }

 // decodeRead processes the packet sitting in rxBuf at rxOff (length
@@ -285,24 +286,23 @@ func (r *Offload) Read() ([]Packet, error) {
 // caller can segment lazily at encrypt time. rxOff advances past the
 // kernel-supplied body and nothing else, since segmentation no longer
 // writes back into rxBuf.
-func (r *Offload) decodeRead(pktLen int) error {
+func (r *Offload) decodeRead(p []wire.TunPacket, mem []byte, pktLen int) ([]wire.TunPacket, error) {
 	if pktLen <= 0 {
-		return fmt.Errorf("short tun read: %d", pktLen)
+		return p, fmt.Errorf("short tun read: %d", pktLen)
 	}
 	var hdr virtio.Hdr
 	hdr.Decode(r.readVnetScratch[:])
-	body := r.rxBuf[r.rxOff : r.rxOff+pktLen]
+	body := mem[:pktLen]
 	if hdr.GSOType == unix.VIRTIO_NET_HDR_GSO_NONE {
 		if hdr.Flags&unix.VIRTIO_NET_HDR_F_NEEDS_CSUM != 0 {
 			if err := virtio.FinishChecksum(body, hdr); err != nil {
-				return err
+				return p, err
 			}
 		}
-		r.pending = append(r.pending, Packet{Bytes: body})
-		r.rxOff += pktLen
-		return nil
+		p = append(p, wire.TunPacket{Bytes: body})
+		return p, nil
 	}

 	// GSO superpacket: validate, fix the kernel-supplied HdrLen on the
@@ -310,26 +310,25 @@
 	// the metadata. The bytes stay in rxBuf untouched, segmentation
 	// happens in SegmentSuperpacket at encrypt time.
 	if err := virtio.CheckValid(body, hdr); err != nil {
-		return err
+		return p, err
 	}
 	if err := virtio.CorrectHdrLen(body, &hdr); err != nil {
-		return err
+		return p, err
 	}
 	proto, err := protoFromGSOType(hdr.GSOType)
 	if err != nil {
-		return err
+		return p, err
 	}
-	r.pending = append(r.pending, Packet{
+	p = append(p, wire.TunPacket{
 		Bytes: body,
-		GSO: GSOInfo{
+		Meta: wire.GSOInfo{
 			Size:      hdr.GSOSize,
 			HdrLen:    hdr.HdrLen,
 			CsumStart: hdr.CsumStart,
 			Proto:     proto,
 		},
 	})
-	r.rxOff += pktLen
-	return nil
+	return p, nil
 }

 func (r *Offload) Write(buf []byte) (int, error) {
@@ -384,7 +383,7 @@ func (r *Offload) Capabilities() Capabilities {
 	return Capabilities{TSO: true, USO: r.usoEnabled}
 }

-func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto GSOProto) error {
+func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto wire.GSOProto) error {
 	if len(hdr) == 0 || len(pays) == 0 || len(transportHdr) == 0 {
 		return nil
 	}
@@ -392,7 +391,7 @@
 	// seq/ack/dataoff/flags/window), UDP=6 (after sport/dport/length).
 	var csumOff uint16
 	switch proto {
-	case GSOProtoUDP:
+	case wire.GSOProtoUDP:
 		csumOff = 6
 	default:
 		csumOff = 16
@@ -407,7 +406,7 @@
 	if len(pays) > 1 {
 		ipVer := hdr[0] >> 4
 		switch {
-		case proto == GSOProtoUDP && (ipVer == 4 || ipVer == 6):
+		case proto == wire.GSOProtoUDP && (ipVer == 4 || ipVer == 6):
 			vhdr.GSOType = unix.VIRTIO_NET_HDR_GSO_UDP_L4
 		case ipVer == 6:
 			vhdr.GSOType = unix.VIRTIO_NET_HDR_GSO_TCPV6
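The csumOff switch above encodes the standard transport-header layouts; an equivalent hypothetical helper, for reference only (the file uses the inline switch, not this function):

	// csumOffset returns the checksum field's offset within the L4 header.
	func csumOffset(proto wire.GSOProto) uint16 {
		if proto == wire.GSOProtoUDP {
			return 6 // UDP: after source port (2), dest port (2), length (2)
		}
		return 16 // TCP: after ports (4), seq (4), ack (4), dataoff/flags (2), window (2)
	}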

View File

@@ -5,6 +5,7 @@ import (
 	"os"
 	"sync/atomic"

+	"github.com/slackhq/nebula/wire"
 	"golang.org/x/sys/unix"
 )

@@ -19,9 +20,6 @@ type Poll struct {
 	readPoll  [2]unix.PollFd
 	writePoll [2]unix.PollFd
 	closed    atomic.Bool
-	readBuf   []byte
-	batchRet  [1]Packet
 }

 func newPoll(fd int, shutdownFd int) (*Poll, error) {
@@ -31,8 +29,7 @@
 	}
 	out := &Poll{
 		fd: fd,
-		readBuf: make([]byte, tunReadBufSize),
 		readPoll: [2]unix.PollFd{
 			{Fd: int32(fd), Events: unix.POLLIN},
 			{Fd: int32(shutdownFd), Events: unix.POLLIN},
@@ -97,13 +94,17 @@ func (t *Poll) blockOnWrite() error {
 	return nil
 }

-func (t *Poll) Read() ([]Packet, error) {
-	n, err := t.readOne(t.readBuf)
-	if err != nil {
-		return nil, err
+func (t *Poll) Read(p []wire.TunPacket, mem []byte) (int, error) {
+	if len(p) == 0 || len(mem) == 0 {
+		return 0, nil //todo should this be an err?
 	}
-	t.batchRet[0] = Packet{Bytes: t.readBuf[:n]}
-	return t.batchRet[:], nil
+	p[0].Meta = wire.GSOInfo{}
+	n, err := t.readOne(mem)
+	if err != nil {
+		return 0, err
+	}
+	p[0].Bytes = mem[:n]
+	return 1, nil
 }

 func (t *Poll) readOne(to []byte) (int, error) {
@@ -162,3 +163,7 @@ func (t *Poll) Close() error {
 	return err
 }
+
+func (t *Poll) Capabilities() Capabilities {
+	return Capabilities{}
+}

View File

@@ -6,46 +6,20 @@ package tio

 import (
 	"fmt"

+	"github.com/slackhq/nebula/wire"
 	"golang.org/x/sys/unix"
-
-	"github.com/slackhq/nebula/overlay/tio/virtio"
 )

 // protoFromGSOType maps a virtio_net_hdr GSOType to the GSOProto value the
 // segment-time helpers use. Returns an error for GSO_NONE or any unknown
 // value — the caller should only invoke this on a confirmed superpacket.
-func protoFromGSOType(t uint8) (GSOProto, error) {
+func protoFromGSOType(t uint8) (wire.GSOProto, error) {
 	switch t {
 	case unix.VIRTIO_NET_HDR_GSO_TCPV4, unix.VIRTIO_NET_HDR_GSO_TCPV6:
-		return GSOProtoTCP, nil
+		return wire.GSOProtoTCP, nil
 	case unix.VIRTIO_NET_HDR_GSO_UDP_L4:
-		return GSOProtoUDP, nil
+		return wire.GSOProtoUDP, nil
 	default:
 		return 0, fmt.Errorf("unsupported virtio gso type: %d", t)
 	}
 }
-
-// SegmentSuperpacket invokes fn once per segment of pkt. For non-GSO pkts
-// fn is called once with pkt.Bytes (no segmentation, no copy). For GSO/USO
-// superpackets fn is called once per segment with a slice of pkt.Bytes
-// holding that segment's plaintext (a freshly-patched L3+L4 header sliced
-// in front of the original payload chunk). The slide is destructive: pkt is
-// consumed by this call and its bytes are in an undefined state when
-// SegmentSuperpacket returns. Callers must not retain pkt or any earlier
-// seg slice past fn's return for that segment. The scratch parameter is
-// unused on the destructive path and kept only for cross-platform
-// signature compatibility. Aborts and returns the first error from fn or
-// from per-segment construction.
-func SegmentSuperpacket(pkt Packet, fn func(seg []byte) error) error {
-	if !pkt.GSO.IsSuperpacket() {
-		return fn(pkt.Bytes)
-	}
-	switch pkt.GSO.Proto {
-	case GSOProtoTCP:
-		return virtio.SegmentTCP(pkt.Bytes, pkt.GSO.HdrLen, pkt.GSO.CsumStart, pkt.GSO.Size, fn)
-	case GSOProtoUDP:
-		return virtio.SegmentUDP(pkt.Bytes, pkt.GSO.HdrLen, pkt.GSO.CsumStart, pkt.GSO.Size, fn)
-	default:
-		return fmt.Errorf("unsupported gso proto: %d", pkt.GSO.Proto)
-	}
-}
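The mapping is total over the three virtio GSO types this code accepts; a hypothetical table test pinning it down (not part of the commit):

	func TestProtoFromGSOType(t *testing.T) {
		cases := map[uint8]wire.GSOProto{
			unix.VIRTIO_NET_HDR_GSO_TCPV4:  wire.GSOProtoTCP,
			unix.VIRTIO_NET_HDR_GSO_TCPV6:  wire.GSOProtoTCP,
			unix.VIRTIO_NET_HDR_GSO_UDP_L4: wire.GSOProtoUDP,
		}
		for in, want := range cases {
			got, err := protoFromGSOType(in)
			if err != nil || got != want {
				t.Fatalf("protoFromGSOType(%d) = %v, %v; want %v", in, got, err, want)
			}
		}
		if _, err := protoFromGSOType(unix.VIRTIO_NET_HDR_GSO_NONE); err == nil {
			t.Fatal("GSO_NONE should be rejected")
		}
	}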

View File

@@ -83,7 +83,7 @@ func (t *disabledTun) Read(p []wire.TunPacket, mem []byte) (int, error) {
 	if len(p) == 0 || len(mem) == 0 {
 		return 0, nil //todo should this be an err?
 	}
-	p[0].Meta = struct{}{}
+	p[0].Meta = wire.GSOInfo{}
 	n, err := t.readOne(mem)
 	if err != nil {
 		return 0, err

View File

@@ -146,6 +146,11 @@ func newTun(c *config.C, l *slog.Logger, vpnNetworks []netip.Prefix, multiqueue
 	}
 	nameStr := c.GetString("tun.dev", "")

+	// First try to enable IFF_VNET_HDR via TUNSETIFF and negotiate TUN_F_*
+	// offloads via TUNSETOFFLOAD so we can receive TSO/USO superpackets.
+	// We try TSO+USO first, fall back to TSO-only on kernels without USO
+	// (Linux < 6.2), and finally give up on virtio headers entirely and
+	// reopen as a plain TUN if neither offload mask is accepted.
 	fd, err := openTunDev()
 	if err != nil {
 		return nil, err
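The ladder that comment describes can be sketched with raw ioctls. This is an illustrative shape only: openTunDev and the real negotiation helper are not shown in this diff, and the TUN_F_USO4/TUN_F_USO6 values (0x20/0x40) are hardcoded here as assumptions because older x/sys/unix releases may not export them:

	const (
		tunFUSO4 = 0x20 // TUN_F_USO4 (assumed kernel value, Linux >= 6.2)
		tunFUSO6 = 0x40 // TUN_F_USO6 (assumed kernel value, Linux >= 6.2)
	)

	// negotiateOffloads is hypothetical; it mirrors the fallback order above.
	func negotiateOffloads(fd int) (tso, uso bool) {
		base := unix.TUN_F_CSUM | unix.TUN_F_TSO4 | unix.TUN_F_TSO6
		if unix.IoctlSetInt(fd, unix.TUNSETOFFLOAD, base|tunFUSO4|tunFUSO6) == nil {
			return true, true // TSO+USO accepted
		}
		if unix.IoctlSetInt(fd, unix.TUNSETOFFLOAD, base) == nil {
			return true, false // TSO-only kernel, no USO
		}
		return false, false // caller reopens as a plain TUN without IFF_VNET_HDR
	}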

View File

@@ -48,7 +48,7 @@ func (d *UserDevice) Read(p []wire.TunPacket, mem []byte) (int, error) {
 	if len(p) == 0 || len(mem) == 0 {
 		return 0, nil //todo should this be an err?
 	}
-	p[0].Meta = struct{}{}
+	p[0].Meta = wire.GSOInfo{}
 	n, err := d.outboundReader.Read(mem)
 	if err != nil {
 		return 0, err

View File

@@ -6,10 +6,13 @@ package udp

 import (
 	"context"
 	"encoding/binary"
+	"errors"
 	"fmt"
 	"log/slog"
 	"net"
 	"net/netip"
+	"strconv"
+	"strings"
 	"syscall"
 	"unsafe"

@@ -32,14 +35,44 @@ type StdConn struct {
 	writeIovs  []iovec
 	writeNames [][]byte

-	// sendmmsg(2) callback state. sendmmsgCB is bound once in NewListener
-	// to the sendmmsgRun method value so passing it to rawConn.Write does
-	// not allocate a fresh closure per send; sendmmsgN/Sent/Errno carry
-	// the inputs and outputs across the call without escaping locals.
-	sendmmsgCB    func(fd uintptr) bool
-	sendmmsgN     int
-	sendmmsgSent  int
-	sendmmsgErrno syscall.Errno
+	// Per-entry cmsg scratch. writeCmsg is one contiguous slab of
+	// MaxWriteBatch * writeCmsgSpace bytes; each entry holds two cmsg
+	// headers (UDP_SEGMENT then IP_TOS / IPV6_TCLASS) pre-filled once in
+	// prepareWriteMessages. WriteBatch only rewrites the per-call data
+	// payloads and toggles Hdr.Control / Hdr.Controllen to point at
+	// whichever subset of the two cmsgs applies.
+	writeCmsg         []byte
+	writeCmsgSpace    int
+	writeCmsgSegSpace int
+	writeCmsgEcnSpace int
+
+	// writeEntryEnd[e] is the bufs index *after* the last packet packed
+	// into mmsghdr entry e. Used to rewind `i` on partial sendmmsg success.
+	writeEntryEnd []int
+
+	// rawSend wraps the sendmmsg(2) callback in a closure-free helper so
+	// the hot path doesn't heap-allocate a fresh closure per call.
+	rawSend rawSendmmsg
+
+	// UDP GSO (sendmsg with UDP_SEGMENT cmsg) support. gsoSupported is
+	// probed once at socket creation. When true, WriteBatch packs same-
+	// destination consecutive packets into a single sendmmsg entry with a
+	// UDP_SEGMENT cmsg; otherwise each packet is its own entry.
+	gsoSupported   bool
+	maxGSOSegments int
+
+	// UDP GRO (recvmsg with UDP_GRO cmsg) support. groSupported is probed
+	// once at socket creation. When true, listenOutBatch allocates larger
+	// RX buffers and a per-entry cmsg slot so the kernel can coalesce
+	// consecutive same-flow datagrams into a single recvmmsg entry; the
+	// delivered cmsg carries the gso_size used to split them back apart.
+	groSupported bool
+
+	// ecnRecvSupported is true when IP_RECVTOS / IPV6_RECVTCLASS was
+	// successfully enabled — the kernel will deliver the outer IP-ECN of
+	// each arriving datagram as a per-slot cmsg, and listenOutBatch passes
+	// the parsed value to the EncReader callback for RFC 6040 combine.
+	ecnRecvSupported bool
 }
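The fixed offsets inside each per-entry slot follow directly from cmsg alignment; worked values on linux/amd64, where the cmsghdr is 16 bytes and data pads to 8 (illustrative, the code relies only on unix.CmsgSpace):

	segSpace := unix.CmsgSpace(2) // UDP_SEGMENT carries a uint16: 16 + 8 = 24 bytes
	ecnSpace := unix.CmsgSpace(4) // IP_TOS / IPV6_TCLASS carries an int32: 24 bytes
	slot := segSpace + ecnSpace   // 48-byte stride per mmsghdr entry
	// entry k's two pre-filled headers then live at:
	//   writeCmsg[k*slot]            UDP_SEGMENT
	//   writeCmsg[k*slot+segSpace]   IP_TOS or IPV6_TCLASS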

 func setReusePort(network, address string, c syscall.RawConn) error {
@@ -73,10 +106,11 @@ func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int)
 	}

 	//gotta find out if we got an AF_INET6 socket or not:
 	out := &StdConn{
-		udpConn: udpConn,
-		rawConn: rawConn,
-		l:       l,
-		batch:   batch,
+		udpConn:        udpConn,
+		rawConn:        rawConn,
+		l:              l,
+		batch:          batch,
+		maxGSOSegments: 1,
 	}

 	af, err := out.getSockOptInt(unix.SO_DOMAIN)
@@ -87,15 +121,71 @@
 	out.isV4 = af == unix.AF_INET
 	out.prepareWriteMessages(MaxWriteBatch)
-	out.sendmmsgCB = out.sendmmsgRun
+	out.rawSend.msgs = out.writeMsgs
+	out.rawSend.bind()
+
+	out.prepareGSO()
+
+	// GRO delivers coalesced superpackets that need a cmsg to split back
+	// into segments. The single-packet RX path uses ReadFromUDPAddrPort
+	// and cannot see that cmsg, so only enable GRO for the batch path.
+	if batch > 1 {
+		out.prepareGRO()
+	}
+
+	// Best-effort: ask the kernel to deliver outer IP-ECN as ancillary data
+	// on every recvmmsg slot so the decap side can apply RFC 6040 combine.
+	// On older kernels these may not exist; failing here just means we get
+	// 0 (Not-ECT) on every slot, which is the same as ecn_mode=disable.
+	out.prepareECNRecv()

 	return out, nil
 }
+// prepareWriteMessages allocates one mmsghdr/iovec/sockaddr/cmsg scratch
+// slot per sendmmsg entry. The iovec slab is sized to n so all entries'
+// iovecs share one allocation; per-entry fan-out is further capped at
+// maxGSOSegments. Hdr.Iov / Hdr.Iovlen / Hdr.Control / Hdr.Controllen are
+// wired per call since each entry can span a variable number of iovecs
+// and may or may not carry a cmsg.
+//
+// Per-mmsghdr cmsg layout. Each entry's slot of length writeCmsgSpace holds
+// up to two cmsg headers placed at fixed offsets:
+//
+//	[0 .. writeCmsgSegSpace)               UDP_SEGMENT (gso_size, uint16)
+//	[writeCmsgSegSpace .. writeCmsgSpace)  IP_TOS or IPV6_TCLASS (int32)
+//
+// Both headers are pre-filled once here; per-call we only rewrite the data
+// payload and toggle Hdr.Control / Hdr.Controllen to point at whichever
+// subset applies (none / segment-only / ecn-only / both).
 func (u *StdConn) prepareWriteMessages(n int) {
 	u.writeMsgs = make([]rawMessage, n)
 	u.writeIovs = make([]iovec, n)
 	u.writeNames = make([][]byte, n)
+	u.writeEntryEnd = make([]int, n)
+
+	u.writeCmsgSegSpace = unix.CmsgSpace(2)
+	u.writeCmsgEcnSpace = unix.CmsgSpace(4)
+	u.writeCmsgSpace = u.writeCmsgSegSpace + u.writeCmsgEcnSpace
+	u.writeCmsg = make([]byte, n*u.writeCmsgSpace)
+	ecnLevel := int32(unix.IPPROTO_IP)
+	ecnType := int32(unix.IP_TOS)
+	if !u.isV4 {
+		ecnLevel = unix.IPPROTO_IPV6
+		ecnType = unix.IPV6_TCLASS
+	}
+	for k := 0; k < n; k++ {
+		base := k * u.writeCmsgSpace
+		seg := (*unix.Cmsghdr)(unsafe.Pointer(&u.writeCmsg[base]))
+		seg.Level = unix.SOL_UDP
+		seg.Type = unix.UDP_SEGMENT
+		setCmsgLen(seg, unix.CmsgLen(2))
+		ecn := (*unix.Cmsghdr)(unsafe.Pointer(&u.writeCmsg[base+u.writeCmsgSegSpace]))
+		ecn.Level = ecnLevel
+		ecn.Type = ecnType
+		setCmsgLen(ecn, unix.CmsgLen(4))
+	}
+
 	for i := range u.writeMsgs {
 		u.writeNames[i] = make([]byte, unix.SizeofSockaddrInet6)
@@ -103,6 +193,139 @@ func (u *StdConn) prepareWriteMessages(n int) {
 	}
 }

+// maxGSOBytes bounds the total payload per sendmsg() when UDP_SEGMENT is
+// set. The kernel stitches all iovecs into a single skb whose length the
+// UDP length field can represent, and also enforces sk_gso_max_size (which
+// on most devices is 65536). We use 65000 to leave headroom under the
+// 65535 UDP-length cap, avoiding EMSGSIZE on large TSO superpackets.
+const maxGSOBytes = 65000
+
+// prepareGSO probes UDP_SEGMENT support and sets u.gsoSupported on success.
+// Best-effort; failure leaves it false.
+func (u *StdConn) prepareGSO() {
+	u.maxGSOSegments = 63 //gotta be one less than the max so we can still attach a header
+	var probeErr error
+	if err := u.rawConn.Control(func(fd uintptr) {
+		probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_SEGMENT, 0)
+	}); err != nil {
+		u.l.Info("udp: GSO disabled", "reason", "rawconn control failed", "error", err)
+		recordCapability("udp.gso.enabled", false)
+		return
+	}
+	if probeErr != nil {
+		u.l.Info("udp: GSO disabled", "reason", "kernel rejected probe", "error", probeErr)
+		recordCapability("udp.gso.enabled", false)
+		return
+	}
+	var un unix.Utsname
+	if err := unix.Uname(&un); err != nil {
+		u.l.Info("udp: GSO disabled", "reason", "kernel uname probe failed", "error", err)
+		recordCapability("udp.gso.enabled", false)
+		return
+	}
+	major, minor := parseRelease(string(un.Release[:]))
+	if major > 5 || (major == 5 && minor >= 5) {
+		u.maxGSOSegments = 127
+	}
+	u.gsoSupported = true
+	u.l.Info("udp: GSO enabled", "maxGSOSegments", u.maxGSOSegments)
+	recordCapability("udp.gso.enabled", true)
+}
+
+// udpGROBufferSize sizes the per-entry recvmmsg buffer when UDP_GRO is on.
+// The kernel stitches a run of same-flow datagrams into a single skb whose
+// length is bounded by sk_gso_max_size (typically 65535); anything larger
+// would be MSG_TRUNCed. We use the maximum representable UDP length so a
+// full superpacket always lands intact.
+const udpGROBufferSize = 65535
+
+// udpGROCmsgPayload is the size of the UDP_GRO cmsg data delivered by the
+// kernel: a single int (gso_size in bytes). See udp_cmsg_recv() in
+// net/ipv4/udp.c.
+const udpGROCmsgPayload = 4
+
+// prepareGRO turns on UDP_GRO so the kernel coalesces consecutive same-flow
+// datagrams into one recvmmsg entry, with a cmsg carrying the gso_size used
+// to split them back apart on the application side.
+func (u *StdConn) prepareGRO() {
+	var probeErr error
+	if err := u.rawConn.Control(func(fd uintptr) {
+		probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_GRO, 1)
+	}); err != nil {
+		u.l.Info("udp: GRO disabled", "reason", "rawconn control failed", "error", err)
+		recordCapability("udp.gro.enabled", false)
+		return
+	}
+	if probeErr != nil {
+		u.l.Info("udp: GRO disabled", "reason", "kernel rejected probe", "error", probeErr)
+		recordCapability("udp.gro.enabled", false)
+		return
+	}
+	u.groSupported = true
+	u.l.Info("udp: GRO enabled")
+	recordCapability("udp.gro.enabled", true)
+}
+
+// prepareECNRecv turns on IP_RECVTOS / IPV6_RECVTCLASS so the outer IP-ECN
+// field of each arriving datagram is delivered as ancillary data alongside
+// the payload. listenOutBatch reads it via parseRecvCmsg and passes the
+// codepoint through the EncReader for RFC 6040 combine on the decap side.
+// Best-effort: we keep going on failure.
+func (u *StdConn) prepareECNRecv() {
+	var v4err, v6err error
+	if err := u.rawConn.Control(func(fd uintptr) {
+		v4err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_RECVTOS, 1)
+		if !u.isV4 {
+			v6err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_RECVTCLASS, 1)
+		}
+	}); err != nil {
+		u.l.Info("udp: outer-ECN RX disabled", "reason", "rawconn control failed", "error", err)
+		recordCapability("udp.ecn_rx.enabled", false)
+		return
+	}
+	if u.isV4 { //only check the V4 attempt
+		if v4err != nil {
+			u.l.Info("udp: outer-ECN RX disabled", "reason", "kernel rejected probe", "error", v4err)
+			recordCapability("udp.ecn_rx.enabled", false)
+		} else {
+			u.ecnRecvSupported = true
+			u.l.Info("udp: outer-ECN RX enabled")
+			recordCapability("udp.ecn_rx.enabled", true)
+		}
+		return
+	} else {
+		if v6err != nil { //no V6 ECN? disable it.
+			u.l.Info("udp: outer-ECN RX disabled", "reason", "kernel rejected probe", "error", errors.Join(v4err, v6err))
+			recordCapability("udp.ecn_rx.enabled", false)
+			return
+		} else if v4err != nil { //no V4, but yes V6? Low level warning. Could be a V6-specific bind.
+			u.l.Debug("udp: outer-ECN RX degraded", "reason", "kernel rejected probe on IPv4", "error", v4err)
+		}
+		// all good
+		u.ecnRecvSupported = true
+		u.l.Info("udp: outer-ECN RX enabled")
+		recordCapability("udp.ecn_rx.enabled", true)
+		return
+	}
+}
+
+// recordCapability registers (or updates) a boolean gauge for one of the
+// kernel-feature probes. Gauges go to 1 when the feature is enabled, 0 when
+// it is not — dashboards can show degraded state on partially-supported
+// kernels at a glance. Calling repeatedly with the same name updates the
+// existing gauge rather than registering a duplicate.
+func recordCapability(name string, enabled bool) {
+	g := metrics.GetOrRegisterGauge(name, nil)
+	if enabled {
+		g.Update(1)
+	} else {
+		g.Update(0)
+	}
+}
+
 func (u *StdConn) SupportsMultipleReaders() bool {
 	return true
 }
@@ -221,16 +444,15 @@ func (u *StdConn) listenOutSingle(r EncReader, flush func()) error {
 	}
 }

-// readSockaddr decodes the source address out of a recvmmsg name buffer
-func (u *StdConn) readSockaddr(name []byte) netip.AddrPort {
+func getFrom(names [][]byte, i int, isV4 bool) netip.AddrPort {
 	var ip netip.Addr
-	// It's ok to skip the ok check here, the slicing is the only error that can occur and it will panic
-	if u.isV4 {
-		ip, _ = netip.AddrFromSlice(name[4:8])
+	// Its ok to skip the ok check here, the slicing is the only error that can occur and it will panic
+	if isV4 {
+		ip, _ = netip.AddrFromSlice(names[i][4:8])
 	} else {
-		ip, _ = netip.AddrFromSlice(name[8:24])
+		ip, _ = netip.AddrFromSlice(names[i][8:24])
 	}
-	return netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(name[2:4]))
+	return netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(names[i][2:4]))
 }

 func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
@@ -239,6 +461,16 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
 	bufSize := MTU
 	cmsgSpace := 0
+	if u.groSupported {
+		bufSize = udpGROBufferSize
+		cmsgSpace = unix.CmsgSpace(udpGROCmsgPayload)
+	}
+	if u.ecnRecvSupported {
+		// IP_TOS arrives as 1 byte; IPV6_TCLASS arrives as a 4-byte int.
+		// Reserve enough for the wider of the two so the same buffer fits
+		// either family alongside any UDP_GRO cmsg.
+		cmsgSpace += unix.CmsgSpace(4)
+	}
 	msgs, buffers, names, _ := u.PrepareRawMessages(u.batch, bufSize, cmsgSpace)

 	//reader needs to capture variables from this function, since it's used as a lambda with rawConn.Read
@@ -249,6 +481,11 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
 	}

 	for {
+		if cmsgSpace > 0 {
+			for i := range msgs {
+				setMsgControllen(&msgs[i].Hdr, cmsgSpace)
+			}
+		}
 		err := u.rawConn.Read(reader)
 		if err != nil {
 			return err
@@ -258,13 +495,75 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
 		}

 		for i := 0; i < n; i++ {
-			r(u.readSockaddr(names[i]), buffers[i][:msgs[i].Len], RxMeta{})
+			from := getFrom(names, i, u.isV4)
+			payload := buffers[i][:msgs[i].Len]
+			segSize := 0
+			outerECN := byte(0)
+			if cmsgSpace > 0 {
+				segSize, outerECN = parseRecvCmsg(&msgs[i].Hdr, u.groSupported, u.ecnRecvSupported, u.isV4)
+			}
+			if segSize <= 0 || segSize >= len(payload) {
+				r(from, payload, RxMeta{OuterECN: outerECN})
+			} else {
+				for off := 0; off < len(payload); off += segSize {
+					end := off + segSize
+					if end > len(payload) {
+						end = len(payload)
+					}
+					seg := payload[off:end]
+					r(from, seg, RxMeta{OuterECN: outerECN})
+				}
+			}
 		}
 		flush()
 	}
 }

+// parseRecvCmsg walks the per-slot ancillary buffer once and extracts up to
+// two values of interest in a single pass: the UDP_GRO gso_size (when
+// wantGRO is true) and the outer IP-level ECN codepoint stamped on the
+// carrier (when wantECN is true). Returns zeros for whichever field is not
+// requested or not present. isV4 selects between IP_TOS (1-byte) and
+// IPV6_TCLASS (4-byte int) cmsg payloads.
+func parseRecvCmsg(hdr *msghdr, wantGRO, wantECN bool, isV4 bool) (gso int, ecn byte) {
+	controllen := int(hdr.Controllen)
+	if controllen < unix.SizeofCmsghdr || hdr.Control == nil {
+		return 0, 0
+	}
+	ctrl := unsafe.Slice(hdr.Control, controllen)
+	off := 0
+	for off+unix.SizeofCmsghdr <= len(ctrl) {
+		ch := (*unix.Cmsghdr)(unsafe.Pointer(&ctrl[off]))
+		clen := int(ch.Len)
+		if clen < unix.SizeofCmsghdr || off+clen > len(ctrl) {
+			return gso, ecn
+		}
+		dataOff := off + unix.CmsgLen(0)
+		switch {
+		case wantGRO && ch.Level == unix.SOL_UDP && ch.Type == unix.UDP_GRO:
+			if dataOff+udpGROCmsgPayload <= len(ctrl) {
+				gso = int(int32(binary.NativeEndian.Uint32(ctrl[dataOff : dataOff+udpGROCmsgPayload])))
+			}
+		case wantECN && isV4 && ch.Level == unix.IPPROTO_IP && ch.Type == unix.IP_TOS:
+			// IP_TOS arrives as a single byte; only the low 2 bits are ECN.
+			if dataOff+1 <= len(ctrl) {
+				ecn = ctrl[dataOff] & 0x03
+			}
+		case wantECN && !isV4 && ch.Level == unix.IPPROTO_IPV6 && ch.Type == unix.IPV6_TCLASS:
+			// IPV6_TCLASS arrives as a 4-byte int; ECN is the low 2 bits.
+			if dataOff+4 <= len(ctrl) {
+				ecn = byte(binary.NativeEndian.Uint32(ctrl[dataOff:dataOff+4])) & 0x03
+			}
+		}
+		// Advance by the aligned cmsg space.
+		off += unix.CmsgSpace(clen - unix.CmsgLen(0))
+	}
+	return gso, ecn
+}
+
 func (u *StdConn) ListenOut(r EncReader, flush func()) error {
 	if u.batch == 1 {
 		return u.listenOutSingle(r, flush)
@@ -279,89 +578,222 @@ func (u *StdConn) WriteTo(b []byte, ip netip.AddrPort) error {
 }

 // WriteBatch sends bufs via sendmmsg(2) using the preallocated scratch on
-// StdConn. If supported, consecutive packets to the same destination with
-// matching segment sizes (all but possibly the last) are coalesced into a
-// single mmsghdr entry
+// StdConn. Consecutive packets to the same destination with matching segment
+// sizes (all but possibly the last) are coalesced into a single mmsghdr entry
+// carrying a UDP_SEGMENT cmsg, so one syscall can mix runs of GSO superpackets
+// with plain one-off datagrams. Without GSO support every packet is its own
+// entry, matching the prior behaviour.
 //
-// If sendmmsg returns an error and zero entries went out, we fall back to
+// Chunks larger than the scratch are processed across multiple syscalls. If
+// sendmmsg returns an error AND zero entries went out we fall back to
 // per-packet WriteTo for that chunk so the caller still gets best-effort
-// delivery. On a partial send we resume at the first un-acked entry on
-// the next iteration.
-func (u *StdConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, _ []byte) error {
-	for i := 0; i < len(bufs); {
-		chunk := min(len(bufs)-i, len(u.writeMsgs))
-
-		for k := 0; k < chunk; k++ {
-			u.writeIovs[k].Base = &bufs[i+k][0]
-			setIovLen(&u.writeIovs[k], len(bufs[i+k]))
-
-			nlen, err := writeSockaddr(u.writeNames[k], addrs[i+k], u.isV4)
+// delivery; on a partial-success error we just replay the remainder.
+func (u *StdConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, ecns []byte) error {
+	if len(bufs) != len(addrs) {
+		return fmt.Errorf("WriteBatch: len(bufs)=%d != len(addrs)=%d", len(bufs), len(addrs))
+	}
+	if ecns != nil && len(ecns) != len(bufs) {
+		return fmt.Errorf("WriteBatch: len(ecns)=%d != len(bufs)=%d", len(ecns), len(bufs))
+	}
+
+	// Callers deliver same-destination packets contiguously and in counter
+	// order, so we run the GSO planner directly without a pre-sort. A
+	// sorting pass measurably hurt throughput in microbenchmarks while
+	// providing no observed reordering benefit.
+	i := 0
+	for i < len(bufs) {
+		baseI := i
+		entry := 0
+		iovIdx := 0
+		for entry < len(u.writeMsgs) && i < len(bufs) {
+			iovBudget := len(u.writeIovs) - iovIdx
+			if iovBudget < 1 {
+				break
+			}
+			runLen, segSize := u.planRun(bufs, addrs, ecns, i, iovBudget)
+			if runLen == 0 {
+				break
+			}
+			for k := 0; k < runLen; k++ {
+				b := bufs[i+k]
+				if len(b) == 0 {
+					u.writeIovs[iovIdx+k].Base = nil
+					setIovLen(&u.writeIovs[iovIdx+k], 0)
+				} else {
+					u.writeIovs[iovIdx+k].Base = &b[0]
+					setIovLen(&u.writeIovs[iovIdx+k], len(b))
+				}
+			}
+
+			nlen, err := writeSockaddr(u.writeNames[entry], addrs[i], u.isV4)
 			if err != nil {
 				return err
 			}

-			hdr := &u.writeMsgs[k].Hdr
-			hdr.Iov = &u.writeIovs[k]
-			setMsgIovlen(hdr, 1)
+			hdr := &u.writeMsgs[entry].Hdr
+			hdr.Iov = &u.writeIovs[iovIdx]
+			setMsgIovlen(hdr, runLen)
 			hdr.Namelen = uint32(nlen)
+
+			var ecn byte
+			if ecns != nil {
+				ecn = ecns[i]
+			}
+			u.writeEntryCmsg(entry, runLen, segSize, ecn)
+
+			i += runLen
+			iovIdx += runLen
+			u.writeEntryEnd[entry] = i
+			entry++
 		}

-		sent, serr := u.sendmmsg(chunk)
+		if entry == 0 {
+			return fmt.Errorf("sendmmsg: no progress")
+		}
+
+		sent, serr := u.sendmmsg(entry)
 		if serr != nil && sent <= 0 {
-			// sendmmsg returns -1 / sent=0 when entry 0 itself failed; log
-			// that entry's destination and fall back to per-packet WriteTo
-			// for the whole chunk so the caller still gets best-effort
-			// delivery without duplicating packets the kernel accepted.
-			u.l.Warn("sendmmsg failed, falling back to per-packet WriteTo",
-				"err", serr,
-				"entries", chunk,
-				"entry0_dst", addrs[i],
+			// Nothing went out for this chunk; fall back to WriteTo for each
+			// packet that was queued this iteration. We only enter this path
+			// when sendmmsg returned an error AND zero entries succeeded —
+			// otherwise the partial-success advance below replays only the
+			// remainder, avoiding duplicates of already-sent packets.
+			//
+			// sent=-1 from sendmmsg means message 0 itself failed (partial
+			// success returns the count instead), so log entry 0's parameters
+			// — that's the entry the kernel rejected.
+			hdr0 := &u.writeMsgs[0].Hdr
+			runLen0 := u.writeEntryEnd[0] - baseI
+			seg0 := len(bufs[baseI])
+			ecn0 := byte(0)
+			if ecns != nil {
+				ecn0 = ecns[baseI]
+			}
+			u.l.Warn("sendmmsg had problem",
+				"sent", sent, "err", serr,
+				"entries", entry,
+				"entry0_runLen", runLen0,
+				"entry0_segSize", seg0,
+				"entry0_iovlen", hdr0.Iovlen,
+				"entry0_controllen", hdr0.Controllen,
+				"entry0_namelen", hdr0.Namelen,
+				"entry0_ecn", ecn0,
+				"entry0_dst", addrs[baseI],
 				"isV4", u.isV4,
+				"gso", u.gsoSupported,
+				"gro", u.groSupported,
 			)
-			for k := 0; k < chunk; k++ {
-				if werr := u.WriteTo(bufs[i+k], addrs[i+k]); werr != nil {
+			for k := baseI; k < i; k++ {
+				if werr := u.WriteTo(bufs[k], addrs[k]); werr != nil {
 					return werr
 				}
 			}
-			i += chunk
 			continue
 		}

-		i += sent
+		if sent == 0 {
+			return fmt.Errorf("sendmmsg made no progress")
+		}
+		// Rewind i to the end of the last successfully sent entry. For a
+		// full-success send this leaves i unchanged; for a partial send it
+		// replays the remainder on the next outer-loop iteration.
+		i = u.writeEntryEnd[sent-1]
 	}
 	return nil
 }
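Given the planner's run boundaries (same destination, same outer ECN, equal segment sizes with at most one short tail, total under maxGSOBytes; see planRun below), a caller gets GSO packing by presenting packets in that order. A hedged sketch with hypothetical buffers and peers:

	// Four datagrams to peerA (1200, 1200, 1200, 400 bytes) form one sendmmsg
	// entry with a UDP_SEGMENT cmsg of 1200; the 900-byte packet to peerB
	// becomes its own plain entry in the same syscall.
	bufs := [][]byte{segA1, segA2, segA3, segATail, pktB}
	addrs := []netip.AddrPort{peerA, peerA, peerA, peerA, peerB}
	ecns := []byte{0, 0, 0, 0, 0} // uniform outer ECN keeps the run intact
	if err := conn.WriteBatch(bufs, addrs, ecns); err != nil {
		// handle: WriteBatch already fell back to per-packet WriteTo where possible
	}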
-// sendmmsg issues sendmmsg(2) against the first n entries of u.writeMsgs.
-// The bound u.sendmmsgCB is passed to rawConn.Write so no closure is
-// allocated per call; inputs and outputs ride on the StdConn fields.
-func (u *StdConn) sendmmsg(n int) (int, error) {
-	u.sendmmsgN = n
-	u.sendmmsgSent = 0
-	u.sendmmsgErrno = 0
-	if err := u.rawConn.Write(u.sendmmsgCB); err != nil {
-		return u.sendmmsgSent, err
-	}
-	if u.sendmmsgErrno != 0 {
-		return u.sendmmsgSent, &net.OpError{Op: "sendmmsg", Err: u.sendmmsgErrno}
-	}
-	return u.sendmmsgSent, nil
-}
-
-// sendmmsgRun is the rawConn.Write callback. It is bound once into
-// u.sendmmsgCB at construction so it stays alloc-free in the hot path;
-// inputs (sendmmsgN) and outputs (sendmmsgSent, sendmmsgErrno) ride on
-// the receiver rather than escaping locals.
-func (u *StdConn) sendmmsgRun(fd uintptr) bool {
-	r1, _, errno := unix.Syscall6(unix.SYS_SENDMMSG, fd,
-		uintptr(unsafe.Pointer(&u.writeMsgs[0])), uintptr(u.sendmmsgN),
-		0, 0, 0,
-	)
-	if errno == syscall.EAGAIN || errno == syscall.EWOULDBLOCK {
-		return false
-	}
-	u.sendmmsgSent = int(r1)
-	u.sendmmsgErrno = errno
-	return true
-}
+// planRun groups consecutive packets starting at `start` that can be sent as
+// a single UDP GSO superpacket (one sendmmsg entry with UDP_SEGMENT cmsg).
+// A run of length 1 means the entry carries no UDP_SEGMENT cmsg and the
+// kernel treats it as a plain datagram. Returns the run length and the
+// per-segment size (which equals len(bufs[start])). Without GSO support
+// every call returns runLen=1. Outer ECN (when ecns != nil) is also a run
+// boundary — the kernel stamps one outer codepoint per sendmsg entry, so
+// mixing values inside a run would lose information.
+func (u *StdConn) planRun(bufs [][]byte, addrs []netip.AddrPort, ecns []byte, start, iovBudget int) (int, int) {
+	if start >= len(bufs) || iovBudget < 1 {
+		return 0, 0
+	}
+	segSize := len(bufs[start])
+	if !u.gsoSupported || segSize == 0 || segSize > maxGSOBytes {
+		return 1, segSize
+	}
+	dst := addrs[start]
+	var ecn byte
+	if ecns != nil {
+		ecn = ecns[start]
+	}
+	maxLen := u.maxGSOSegments
+	if iovBudget < maxLen {
+		maxLen = iovBudget
+	}
+	runLen := 1
+	total := segSize
+	for runLen < maxLen && start+runLen < len(bufs) {
+		nextLen := len(bufs[start+runLen])
+		if nextLen == 0 || nextLen > segSize {
+			break
+		}
+		if addrs[start+runLen] != dst {
+			break
+		}
+		if ecns != nil && ecns[start+runLen] != ecn {
+			break
+		}
+		if total+nextLen > maxGSOBytes {
+			break
+		}
+		total += nextLen
+		runLen++
+		if nextLen < segSize {
+			// A short packet must be the last in the run.
+			break
+		}
+	}
+	return runLen, segSize
+}
+
+// writeEntryCmsg sets up the per-mmsghdr Hdr.Control / Hdr.Controllen for one
+// entry. It writes the UDP_SEGMENT payload when runLen >= 2 and the
+// IP_TOS/IPV6_TCLASS payload when ecn != 0, then points hdr.Control at the
+// smallest contiguous span that covers whichever cmsg(s) actually apply.
+func (u *StdConn) writeEntryCmsg(entry, runLen, segSize int, ecn byte) {
+	hdr := &u.writeMsgs[entry].Hdr
+	useSeg := runLen >= 2
+	useEcn := ecn != 0
+	base := entry * u.writeCmsgSpace
+
+	if useSeg {
+		dataOff := base + unix.CmsgLen(0)
+		binary.NativeEndian.PutUint16(u.writeCmsg[dataOff:dataOff+2], uint16(segSize))
+	}
+	if useEcn {
+		dataOff := base + u.writeCmsgSegSpace + unix.CmsgLen(0)
+		binary.NativeEndian.PutUint32(u.writeCmsg[dataOff:dataOff+4], uint32(ecn))
+	}
+	switch {
+	case useSeg && useEcn:
+		hdr.Control = &u.writeCmsg[base]
+		setMsgControllen(hdr, u.writeCmsgSpace)
+	case useSeg:
+		hdr.Control = &u.writeCmsg[base]
+		setMsgControllen(hdr, u.writeCmsgSegSpace)
+	case useEcn:
+		hdr.Control = &u.writeCmsg[base+u.writeCmsgSegSpace]
+		setMsgControllen(hdr, u.writeCmsgEcnSpace)
+	default:
+		hdr.Control = nil
+		setMsgControllen(hdr, 0)
+	}
+}
+
+// sendmmsg issues sendmmsg(2) over u.rawConn against the first n entries
+// of u.writeMsgs. Routes through u.rawSend so the per-call kernel callback
+// stays alloc-free.
+func (u *StdConn) sendmmsg(n int) (int, error) {
+	return u.rawSend.send(u.rawConn, n)
+}

 // writeSockaddr encodes addr into buf (which must be at least
@@ -497,3 +929,22 @@ func NewUDPStatsEmitter(udpConns []Conn) func() {
 		}
 	}
 }
+
+func parseRelease(r string) (major, minor int) {
+	// strip anything after the second dot or any non-digit
+	parts := strings.SplitN(r, ".", 3)
+	if len(parts) < 2 {
+		return 0, 0
+	}
+	major, _ = strconv.Atoi(parts[0])
+	// minor may have trailing junk like "15-generic"
+	mp := parts[1]
+	for i, c := range mp {
+		if c < '0' || c > '9' {
+			mp = mp[:i]
+			break
+		}
+	}
+	minor, _ = strconv.Atoi(mp)
+	return
+}
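parseRelease is deliberately forgiving of distro suffixes; worked examples of its behaviour (not part of the diff):

	parseRelease("6.8.0-45-generic") // (6, 8)
	parseRelease("5.15-generic")     // (5, 15): trailing junk after the minor is trimmed
	parseRelease("bogus")            // (0, 0): fewer than two dot-separated parts

prepareGSO feeds the result into its >= 5.5 gate, bumping maxGSOSegments from 63 to 127 (one less than the kernel's segment cap, per the comment there).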

View File

@@ -7,11 +7,40 @@ type TunPacket struct {
 	Bytes []byte
 	// Meta contains other information to help process the packet correctly, such as offsets for segmentation offloads
 	// Fields in Meta should be as portable/platform-agnostic as possible.
-	Meta struct{}
+	Meta GSOInfo
 }

-// PerSegment invokes fn once per segment of pkt.
-// This is a stub implementation that does not actually support segmentation
-func (t *TunPacket) PerSegment(fn func(seg []byte) error) error {
-	return fn(t.Bytes)
+// GSOInfo describes a kernel-supplied superpacket sitting in Packet.Bytes.
+// The zero value means "not a superpacket" — Bytes is one regular IP
+// datagram and no segmentation is required.
+type GSOInfo struct {
+	// Size is the GSO segment size: max payload bytes per segment
+	// (== TCP MSS for TSO, == UDP payload chunk for USO). Zero means
+	// not a superpacket.
+	Size uint16
+	// HdrLen is the total L3+L4 header length within Bytes (already
+	// corrected via correctHdrLen, so safe to slice on).
+	HdrLen uint16
+	// CsumStart is the L4 header offset inside Bytes (== L3 header
+	// length).
+	CsumStart uint16
+	// Proto picks the L4 protocol (TCP or UDP) so the segmenter knows
+	// which checksum/header layout to apply.
+	Proto GSOProto
 }
+
+// GSOProto selects the L4 protocol for a GSO superpacket. Determines which
+// VIRTIO_NET_HDR_GSO_* type the writer stamps and which checksum offset
+// inside the transport header virtio NEEDS_CSUM expects.
+type GSOProto uint8
+
+const (
+	GSOProtoNone GSOProto = iota
+	GSOProtoTCP
+	GSOProtoUDP
+)
+
+// IsSuperpacket reports whether g describes a multi-segment GSO/USO
+// superpacket that needs segmentation before its bytes can be encrypted
+// and sent on the wire.
+func (g GSOInfo) IsSuperpacket() bool { return g.Size > 0 }

10  wire/wire_generic.go  Normal file
View File

@@ -0,0 +1,10 @@
+//go:build !linux
+// +build !linux
+
+package wire
+
+// PerSegment invokes fn once per segment of pkt.
+// This is a stub implementation that does not actually support segmentation
+func (t *TunPacket) PerSegment(fn func(seg []byte) error) error {
+	return fn(t.Bytes)
+}

32  wire/wire_linux.go  Normal file
View File

@@ -0,0 +1,32 @@
+package wire
+
+import (
+	"fmt"
+
+	"github.com/slackhq/nebula/overlay/tio/virtio"
+)
+
+// PerSegment invokes fn once per segment of pkt. For non-GSO pkts
+// fn is called once with pkt.Bytes (no segmentation, no copy). For GSO/USO
+// superpackets fn is called once per segment with a slice of pkt.Bytes
+// holding that segment's plaintext (a freshly-patched L3+L4 header sliced
+// in front of the original payload chunk). The slide is destructive: pkt is
+// consumed by this call and its bytes are in an undefined state when
+// PerSegment returns. Callers must not retain pkt or any earlier
+// seg slice past fn's return for that segment. The scratch parameter is
+// unused on the destructive path and kept only for cross-platform
+// signature compatibility. Aborts and returns the first error from fn or
+// from per-segment construction.
+func (t *TunPacket) PerSegment(fn func(seg []byte) error) error {
+	if !t.Meta.IsSuperpacket() {
+		return fn(t.Bytes)
+	}
+	switch t.Meta.Proto {
+	case GSOProtoTCP:
+		return virtio.SegmentTCP(t.Bytes, t.Meta.HdrLen, t.Meta.CsumStart, t.Meta.Size, fn)
+	case GSOProtoUDP:
+		return virtio.SegmentUDP(t.Bytes, t.Meta.HdrLen, t.Meta.CsumStart, t.Meta.Size, fn)
+	default:
+		return fmt.Errorf("unsupported gso proto: %d", t.Meta.Proto)
+	}
+}
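The call shape on the encrypt path is uniform for both cases; a minimal sketch (encryptAndSend is hypothetical):

	err := pkt.PerSegment(func(seg []byte) error {
		// seg is one wire-sized plaintext: pkt.Bytes itself for a plain
		// packet, or a patched header plus payload chunk carved out of
		// the superpacket. Don't retain seg: the destructive walk reuses
		// the backing bytes.
		return encryptAndSend(seg)
	})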