broken checkpt
@@ -147,44 +147,3 @@ func mergeECNIntoSeed(seedHdr, pktHdr []byte, isV6 bool) {
 		seedHdr[1] |= pktHdr[1] & 0x03
 	}
 }
-
-// Arena is an injectable byte-slab that hands out non-overlapping borrowed
-// slices via Reserve and releases them in bulk via Reset. Coalescers take
-// an *Arena at construction so the caller controls the slab lifetime and
-// can share one slab across multiple coalescers (MultiCoalescer hands the
-// same *Arena to every lane so the lanes don't carry their own backings).
-//
-// Reserve borrows; the slice is valid until the next Reset. The slab grows
-// (by allocating a fresh, larger backing array) if a Reserve doesn't fit;
-// pre-size the arena via NewArena to avoid that path on the hot path.
-type Arena struct {
-	buf []byte
-}
-
-// NewArena returns an Arena with a pre-allocated backing of the given
-// capacity. Pass 0 if you don't intend to call Reserve (e.g. a test that
-// only feeds the coalescer pre-made []byte packets via Commit).
-func NewArena(capacity int) *Arena {
-	return &Arena{buf: make([]byte, 0, capacity)}
-}
-
-// Reserve hands out a non-overlapping sz-byte slice from the arena. If the
-// request doesn't fit the current backing, a fresh, larger backing is
-// allocated; already-borrowed slices reference the old backing and remain
-// valid until Reset.
-func (a *Arena) Reserve(sz int) []byte {
-	if len(a.buf)+sz > cap(a.buf) {
-		newCap := max(cap(a.buf)*2, sz)
-		a.buf = make([]byte, 0, newCap)
-	}
-	start := len(a.buf)
-	a.buf = a.buf[:start+sz]
-	return a.buf[start : start+sz : start+sz]
-}
-
-// Reset releases every slice handed out since the last Reset. Callers must
-// not use any previously-borrowed slice after this returns. The underlying
-// backing array is retained so subsequent Reserves don't re-allocate.
-func (a *Arena) Reset() {
-	a.buf = a.buf[:0]
-}
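For orientation, a minimal sketch of the contract the removed block documents, assuming the type lands in util exactly as removed above:

```go
package main

import (
	"fmt"

	"github.com/slackhq/nebula/util"
)

func main() {
	// Pre-size the slab so hot-path Reserves never hit the grow path.
	arena := util.NewArena(64 * 1024)

	// Two non-overlapping borrows into the same backing array; both stay
	// valid until the next Reset.
	a := arena.Reserve(1500)
	b := arena.Reserve(1500)
	copy(a, "first")
	copy(b, "second")
	fmt.Println(len(a), len(b)) // 1500 1500

	// Bulk release: a and b must not be used past this point, but the
	// backing array is retained for the next cycle.
	arena.Reset()
}
```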
@@ -2,8 +2,10 @@ package batch
 import (
 	"errors"
 	"io"
 	"log/slog"
 
+	"github.com/slackhq/nebula/overlay/tio"
+	"github.com/slackhq/nebula/util"
 )
 
 // MultiCoalescer fans plaintext packets out to lane-specific batchers based
@@ -35,7 +37,7 @@ type MultiCoalescer struct {
 	// sequentially and never Reserves in between, so a later lane's
 	// slots stay readable across an earlier lane's Reset (the underlying
 	// bytes are still alive — Reset only re-slices len to 0).
-	arena *Arena
+	arena *util.Arena
 }
 
 // DefaultMultiArenaCap is the recommended arena capacity for a Multi-lane
@@ -49,9 +51,9 @@ const DefaultMultiArenaCap = initialSlots * 65535
 // Either lane disabled redirects its traffic into the passthrough lane.
 // arena is the single backing slab shared across every lane; the caller
 // pre-sizes it via NewArena so the hot path never allocates.
-func NewMultiCoalescer(w io.Writer, l *slog.Logger, arena *Arena, tcpEnabled, udpEnabled bool) *MultiCoalescer {
+func NewMultiCoalescer(w tio.Queue, l *slog.Logger, arena *util.Arena, tcpEnabled, udpEnabled bool) *MultiCoalescer {
 	m := &MultiCoalescer{
-		pt:    NewPassthrough(w, arena),
+		pt:    NewPassthrough(w, initialSlots, arena),
 		arena: arena,
 	}
 	if tcpEnabled {
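The constructor change above implies call-site wiring roughly like this; hypothetical, since the setup code is not part of this diff (q is whatever tio.Queue the caller already holds, l its *slog.Logger):

```go
// One slab shared by every lane; DefaultMultiArenaCap pre-sizes it so
// the hot path never allocates.
arena := util.NewArena(batch.DefaultMultiArenaCap)
mc := batch.NewMultiCoalescer(q, l, arena, true /*tcp*/, true /*udp*/)
```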
@@ -4,6 +4,7 @@ import (
 	"io"
 
 	"github.com/slackhq/nebula/udp"
+	"github.com/slackhq/nebula/util"
 )
 
 // Passthrough is a RxBatcher that doesn't batch anything, it just accumulates and then sends packets.
@@ -11,7 +12,7 @@ type Passthrough struct {
 	out   io.Writer
 	slots [][]byte
 	// arena is injected; see TCPCoalescer.arena for the contract.
-	arena  *Arena
+	arena  *util.Arena
 	cursor int
 }
 
@@ -21,7 +22,7 @@ const passthroughBaseNumSlots = 128
 // standalone Passthrough batcher: 128 slots × udp.MTU ≈ 1.1 MiB.
 const DefaultPassthroughArenaCap = passthroughBaseNumSlots * udp.MTU
 
-func NewPassthrough(w io.Writer, slots int, arena *Arena) *Passthrough {
+func NewPassthrough(w io.Writer, slots int, arena *util.Arena) *Passthrough {
 	return &Passthrough{
 		out:   w,
 		slots: make([][]byte, 0, slots),
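A standalone Passthrough (outside a MultiCoalescer) would be sized against its own constant; a sketch, where w is the caller's io.Writer and the literal 128 mirrors the unexported passthroughBaseNumSlots:

```go
arena := util.NewArena(batch.DefaultPassthroughArenaCap) // 128 × udp.MTU ≈ 1.1 MiB
pt := batch.NewPassthrough(w, 128, arena)
```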
@@ -10,6 +10,8 @@ import (
 	"slices"
 
 	"github.com/slackhq/nebula/overlay/tio"
+	"github.com/slackhq/nebula/util"
+	"github.com/slackhq/nebula/wire"
 )
 
 // ipProtoTCP is the IANA protocol number for TCP. Hardcoded instead of
@@ -88,11 +90,11 @@ type TCPCoalescer struct {
 	// and tells it to release them via Reset on Flush. When wrapped in
 	// MultiCoalescer the same *Arena is shared with the other lanes so
 	// there's exactly one backing slab per Multi instance.
-	arena *Arena
+	arena *util.Arena
 	l     *slog.Logger
 }
 
-func NewTCPCoalescer(w io.Writer, l *slog.Logger, arena *Arena) *TCPCoalescer {
+func NewTCPCoalescer(w tio.Queue, l *slog.Logger, arena *util.Arena) *TCPCoalescer {
 	c := &TCPCoalescer{
 		plainW: w,
 		slots:  make([]*coalesceSlot, 0, initialSlots),
@@ -101,7 +103,7 @@ func NewTCPCoalescer(w io.Writer, l *slog.Logger, arena *Arena) *TCPCoalescer {
 		arena: arena,
 		l:     l,
 	}
-	if gw, ok := tio.SupportsGSO(w, tio.GSOProtoTCP); ok {
+	if gw, ok := tio.SupportsGSO(w, wire.GSOProtoTCP); ok {
 		c.gsoW = gw
 	}
 	return c
@@ -419,7 +421,7 @@ func (c *TCPCoalescer) flushSlot(s *coalesceSlot) error {
 	tcsum := s.ipHdrLen + 16
 	binary.BigEndian.PutUint16(hdr[tcsum:tcsum+2], foldOnceNoInvert(psum))
 
-	return c.gsoW.WriteGSO(hdr[:s.ipHdrLen], hdr[s.ipHdrLen:], s.payIovs, tio.GSOProtoTCP)
+	return c.gsoW.WriteGSO(hdr[:s.ipHdrLen], hdr[s.ipHdrLen:], s.payIovs, wire.GSOProtoTCP)
 }
 
 // headersMatch compares two IP+TCP header prefixes for byte-for-byte
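flushSlot seeds the TCP checksum field (offset 16 in the TCP header) with a folded pseudo-header sum rather than a finished checksum, because virtio NEEDS_CSUM leaves completion to the kernel or NIC. foldOnceNoInvert is not shown in this diff; under that convention it would look roughly like:

```go
// Fold a 32-bit ones-complement accumulator down to 16 bits WITHOUT the
// final inversion: virtio NEEDS_CSUM expects the seed, not the result.
// A pseudo-header sum is small enough that one fold plus its carry is
// sufficient. (Sketch only; the real helper lives in the batch package.)
func foldOnceNoInvert(sum uint32) uint16 {
	sum = (sum >> 16) + (sum & 0xffff) // fold the high half into the low half
	return uint16(sum + sum>>16)       // absorb a possible carry
}
```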
@@ -5,6 +5,8 @@ import (
 	"io"
 
 	"github.com/slackhq/nebula/overlay/tio"
+	"github.com/slackhq/nebula/util"
+	"github.com/slackhq/nebula/wire"
 )
 
 // ipProtoUDP is the IANA protocol number for UDP.
@@ -67,7 +69,7 @@ type UDPCoalescer struct {
 	pool []*udpSlot
 
 	// arena is injected; see TCPCoalescer.arena for the contract.
-	arena *Arena
+	arena *util.Arena
 }
 
 // NewUDPCoalescer wraps w. The caller is responsible for only constructing
@@ -75,7 +77,7 @@ type UDPCoalescer struct {
 // the kernel may reject GSO_UDP_L4 writes. If w does not implement
 // tio.GSOWriter at all (single-packet Queue), the coalescer degrades to
 // plain Writes — same defensive shape as the TCP coalescer.
-func NewUDPCoalescer(w io.Writer, arena *Arena) *UDPCoalescer {
+func NewUDPCoalescer(w tio.Queue, arena *util.Arena) *UDPCoalescer {
 	c := &UDPCoalescer{
 		plainW: w,
 		slots:  make([]*udpSlot, 0, initialSlots),
@@ -83,7 +85,7 @@ func NewUDPCoalescer(w io.Writer, arena *Arena) *UDPCoalescer {
 		pool:  make([]*udpSlot, 0, initialSlots),
 		arena: arena,
 	}
-	if gw, ok := tio.SupportsGSO(w, tio.GSOProtoUDP); ok {
+	if gw, ok := tio.SupportsGSO(w, wire.GSOProtoUDP); ok {
 		c.gsoW = gw
 	}
 	return c
@@ -313,7 +315,7 @@ func (c *UDPCoalescer) flushSlot(s *udpSlot) error {
 	udpCsumOff := s.ipHdrLen + 6
 	binary.BigEndian.PutUint16(hdr[udpCsumOff:udpCsumOff+2], foldOnceNoInvert(psum))
 
-	return c.gsoW.WriteGSO(hdr[:s.ipHdrLen], hdr[s.ipHdrLen:], s.payIovs, tio.GSOProtoUDP)
+	return c.gsoW.WriteGSO(hdr[:s.ipHdrLen], hdr[s.ipHdrLen:], s.payIovs, wire.GSOProtoUDP)
 }
 
 // udpHeadersMatch compares two IP+UDP header prefixes for byte-equality on
@@ -46,42 +46,6 @@ type Queue interface {
 	Capabilities() Capabilities
 }
 
-
-// GSOInfo describes a kernel-supplied superpacket sitting in Packet.Bytes.
-// The zero value means "not a superpacket" — Bytes is one regular IP
-// datagram and no segmentation is required.
-type GSOInfo struct {
-	// Size is the GSO segment size: max payload bytes per segment
-	// (== TCP MSS for TSO, == UDP payload chunk for USO). Zero means
-	// not a superpacket.
-	Size uint16
-	// HdrLen is the total L3+L4 header length within Bytes (already
-	// corrected via correctHdrLen, so safe to slice on).
-	HdrLen uint16
-	// CsumStart is the L4 header offset inside Bytes (== L3 header
-	// length).
-	CsumStart uint16
-	// Proto picks the L4 protocol (TCP or UDP) so the segmenter knows
-	// which checksum/header layout to apply.
-	Proto GSOProto
-}
-
-// IsSuperpacket reports whether g describes a multi-segment GSO/USO
-// superpacket that needs segmentation before its bytes can be encrypted
-// and sent on the wire.
-func (g GSOInfo) IsSuperpacket() bool { return g.Size > 0 }
-
-// GSOProto selects the L4 protocol for a GSO superpacket. Determines which
-// VIRTIO_NET_HDR_GSO_* type the writer stamps and which checksum offset
-// inside the transport header virtio NEEDS_CSUM expects.
-type GSOProto uint8
-
-const (
-	GSOProtoNone GSOProto = iota
-	GSOProtoTCP
-	GSOProtoUDP
-)
-
 // GSOWriter is implemented by Queues that can emit a TCP or UDP superpacket
 // assembled from a header prefix plus one or more borrowed payload
 // fragments, in a single vectored write (writev with a leading
@@ -104,24 +68,25 @@ const (
 // implementation of GSOWriter is necessary but not sufficient since USO
 // may not have been negotiated even when TSO was.
 type GSOWriter interface {
-	WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto GSOProto) error
+	WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto wire.GSOProto) error
 }
 
 // SupportsGSO reports whether w implements GSOWriter and the underlying
 // queue advertises the negotiated capability for `want`. A writer that
 // implements GSOWriter but not CapsProvider is treated as permissive
 // (used by tests and fakes that don't negotiate).
-func SupportsGSO(w Queue, want GSOProto) (GSOWriter, bool) {
+func SupportsGSO(w Queue, want wire.GSOProto) (GSOWriter, bool) {
 	gw, ok := w.(GSOWriter)
 	if !ok {
 		return nil, false
 	}
 	caps := w.Capabilities()
 	switch want {
-	case GSOProtoTCP:
+	case wire.GSOProtoTCP:
 		return gw, caps.TSO
-	case GSOProtoUDP:
+	case wire.GSOProtoUDP:
 		return gw, caps.USO
+	default:
+		return gw, false
 	}
-	return gw, false
 }
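SupportsGSO reduces to a small truth table: the type assertion must pass and the negotiated capability must match the requested proto. A sketch of the "tests and fakes" case the comment alludes to; fakeQueue is hypothetical and satisfies the rest of Queue by embedding:

```go
// Hypothetical test fake: embedding Queue satisfies the methods we never
// call; only Capabilities and WriteGSO matter to SupportsGSO.
type fakeQueue struct{ Queue }

func (f fakeQueue) Capabilities() Capabilities { return Capabilities{TSO: true} }
func (f fakeQueue) WriteGSO(hdr, transportHdr []byte, pays [][]byte, proto wire.GSOProto) error {
	return nil
}

// SupportsGSO(f, wire.GSOProtoTCP) → (gw, true):  TSO was negotiated.
// SupportsGSO(f, wire.GSOProtoUDP) → (gw, false): WriteGSO exists, but USO
// was never negotiated, exactly the "necessary but not sufficient" case.
```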
@@ -10,6 +10,7 @@ import (
 	"syscall"
 	"unsafe"
 
+	"github.com/slackhq/nebula/wire"
 	"golang.org/x/sys/unix"
 
 	"github.com/slackhq/nebula/overlay/tio/virtio"
@@ -67,9 +68,6 @@ type Offload struct {
 	// events.
 	writeLock sync.Mutex
 	closed    atomic.Bool
-	rxBuf     []byte   // backing store for kernel-handed packets read this drain
-	rxOff     int      // cursor into rxBuf for the current Read drain
-	pending   []Packet // packets returned from the most recent Read
 
 	// readVnetScratch holds the 10-byte virtio_net_hdr split off the front of
 	// every TUN read via readv(2). Decoupling the header from the packet body
@@ -115,9 +113,7 @@ func newOffload(fd int, shutdownFd int, usoEnabled bool) (*Offload, error) {
 			{Fd: int32(shutdownFd), Events: unix.POLLIN},
 		},
 		writeLock: sync.Mutex{},
-
-		rxBuf:   make([]byte, tunRxBufCap),
-		gsoIovs: make([]unix.Iovec, 2, gsoMaxIovs),
+		gsoIovs:   make([]unix.Iovec, 2, gsoMaxIovs),
 	}
 
 	out.gsoIovs[0].Base = &out.gsoHdrBuf[0]
@@ -197,9 +193,9 @@ func (r *Offload) blockOnWrite() error {
 // hold one worst-case kernel-supplied packet body. Without that gate the
 // body iovec could be smaller than the next inbound packet and the
 // kernel would truncate.
-func (r *Offload) readPacket(block bool) (int, error) {
+func (r *Offload) readPacket(mem []byte, block bool) (int, error) {
 	for {
-		r.readIovs[1].Base = &r.rxBuf[r.rxOff]
+		r.readIovs[1].Base = &mem[0]
 		r.readIovs[1].SetLen(tunReadBufSize)
 		n, _, errno := syscall.Syscall(unix.SYS_READV, uintptr(r.fd), uintptr(unsafe.Pointer(&r.readIovs[0])), uintptr(len(r.readIovs)))
 		if errno == 0 {
@@ -237,29 +233,33 @@ func (r *Offload) readPacket(block bool) (int, error) {
 // bursts of small packets (e.g. TCP ACKs). Packet.Bytes slices point
 // into the Offload's internal buffer and are only valid until the next
 // Read or Close on this Queue.
-func (r *Offload) Read() ([]Packet, error) {
-	r.pending = r.pending[:0]
-	r.rxOff = 0
+func (r *Offload) Read(p []wire.TunPacket, mem []byte) (int, error) {
+	maxP := len(p)
+	maxM := len(mem)
+	p = p[:0]
+	rxOff := 0
 
 	// Initial (blocking) read. Retry on decode errors so a single bad
 	// packet does not stall the reader.
 	for {
-		n, err := r.readPacket(true)
+		n, err := r.readPacket(mem, true)
 		if err != nil {
-			return nil, err
+			return 0, err
 		}
-		if err := r.decodeRead(n); err != nil {
+		if p, err = r.decodeRead(p, mem, n); err != nil {
 			// Drop and read again — a bad packet should not kill the reader.
 			continue
 		}
+
+		rxOff += n
 		break
 	}
 
 	// Drain: non-blocking reads until the kernel queue is empty, the drain
 	// cap is reached, or rxBuf no longer has room for another worst-case
 	// kernel-supplied packet (tunRxBufSize).
-	for len(r.pending) < tunDrainCap && tunRxBufCap-r.rxOff >= tunRxBufSize {
-		n, err := r.readPacket(false)
+	for len(p) < maxP && maxM-rxOff >= tunRxBufSize {
+		n, err := r.readPacket(mem[rxOff:], false)
 		if err != nil {
 			// EAGAIN / EINTR / anything else: stop draining. We already
 			// have a valid batch from the first read.
@@ -268,14 +268,15 @@ func (r *Offload) Read() ([]Packet, error) {
 		if n <= 0 {
 			break
 		}
-		if err := r.decodeRead(n); err != nil {
+		if p, err = r.decodeRead(p, mem, n); err != nil {
 			// Drop this packet and stop the drain; we'd rather hand off
 			// what we have than keep spinning here.
 			break
 		}
+		rxOff += n
 	}
 
-	return r.pending, nil
+	return len(p), nil
 }
 
 // decodeRead processes the packet sitting in rxBuf at rxOff (length
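The new signature moves buffer ownership to the caller: a slice of packet slots plus one flat byte slab, both reused across calls. A sketch of the intended calling pattern (capacities are illustrative; handle is a hypothetical consumer):

```go
func drain(q tio.Queue, handle func(*wire.TunPacket)) error {
	pkts := make([]wire.TunPacket, 64) // drain cap is now the caller's choice
	mem := make([]byte, 64*0x10000)    // flat slab the packet bytes alias
	for {
		n, err := q.Read(pkts, mem)
		if err != nil {
			return err
		}
		for i := range pkts[:n] {
			// pkts[i].Bytes points into mem: consume before the next Read.
			handle(&pkts[i])
		}
	}
}
```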
@@ -285,24 +286,23 @@ func (r *Offload) Read() ([]Packet, error) {
 // caller can segment lazily at encrypt time. rxOff advances past the
 // kernel-supplied body and nothing else, since segmentation no longer
 // writes back into rxBuf.
-func (r *Offload) decodeRead(pktLen int) error {
+func (r *Offload) decodeRead(p []wire.TunPacket, mem []byte, pktLen int) ([]wire.TunPacket, error) {
 	if pktLen <= 0 {
-		return fmt.Errorf("short tun read: %d", pktLen)
+		return p, fmt.Errorf("short tun read: %d", pktLen)
 	}
 	var hdr virtio.Hdr
 	hdr.Decode(r.readVnetScratch[:])
 
-	body := r.rxBuf[r.rxOff : r.rxOff+pktLen]
+	body := mem[:pktLen]
 
 	if hdr.GSOType == unix.VIRTIO_NET_HDR_GSO_NONE {
 		if hdr.Flags&unix.VIRTIO_NET_HDR_F_NEEDS_CSUM != 0 {
 			if err := virtio.FinishChecksum(body, hdr); err != nil {
-				return err
+				return p, err
 			}
 		}
-		r.pending = append(r.pending, Packet{Bytes: body})
-		r.rxOff += pktLen
-		return nil
+		p = append(p, wire.TunPacket{Bytes: body})
+		return p, nil
 	}
 
 	// GSO superpacket: validate, fix the kernel-supplied HdrLen on the
@@ -310,26 +310,25 @@ func (r *Offload) decodeRead(pktLen int) error {
 	// the metadata. The bytes stay in rxBuf untouched, segmentation
 	// happens in SegmentSuperpacket at encrypt time.
 	if err := virtio.CheckValid(body, hdr); err != nil {
-		return err
+		return p, err
 	}
 	if err := virtio.CorrectHdrLen(body, &hdr); err != nil {
-		return err
+		return p, err
 	}
 	proto, err := protoFromGSOType(hdr.GSOType)
 	if err != nil {
-		return err
+		return p, err
 	}
-	r.pending = append(r.pending, Packet{
+	p = append(p, wire.TunPacket{
 		Bytes: body,
-		GSO: GSOInfo{
+		Meta: wire.GSOInfo{
 			Size:      hdr.GSOSize,
 			HdrLen:    hdr.HdrLen,
 			CsumStart: hdr.CsumStart,
 			Proto:     proto,
 		},
 	})
-	r.rxOff += pktLen
-	return nil
+	return p, nil
 }
 
 func (r *Offload) Write(buf []byte) (int, error) {
@@ -384,7 +383,7 @@ func (r *Offload) Capabilities() Capabilities {
 	return Capabilities{TSO: true, USO: r.usoEnabled}
 }
 
-func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto GSOProto) error {
+func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto wire.GSOProto) error {
 	if len(hdr) == 0 || len(pays) == 0 || len(transportHdr) == 0 {
 		return nil
 	}
@@ -392,7 +391,7 @@ func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto
 	// seq/ack/dataoff/flags/window), UDP=6 (after sport/dport/length).
 	var csumOff uint16
 	switch proto {
-	case GSOProtoUDP:
+	case wire.GSOProtoUDP:
 		csumOff = 6
 	default:
 		csumOff = 16
@@ -407,7 +406,7 @@
 	if len(pays) > 1 {
 		ipVer := hdr[0] >> 4
 		switch {
-		case proto == GSOProtoUDP && (ipVer == 4 || ipVer == 6):
+		case proto == wire.GSOProtoUDP && (ipVer == 4 || ipVer == 6):
 			vhdr.GSOType = unix.VIRTIO_NET_HDR_GSO_UDP_L4
 		case ipVer == 6:
 			vhdr.GSOType = unix.VIRTIO_NET_HDR_GSO_TCPV6
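For reference, the 10-byte preface that readPacket splits off and WriteGSO stamps is the kernel's struct virtio_net_hdr (include/uapi/linux/virtio_net.h); the tio/virtio package's Hdr presumably mirrors these fields:

```go
// struct virtio_net_hdr, the 10-byte header preceding every packet on a
// vnet-hdr TUN fd. Field comments reflect how this diff uses them.
type virtioNetHdr struct {
	Flags      uint8  // VIRTIO_NET_HDR_F_NEEDS_CSUM: checksum must be finished
	GSOType    uint8  // VIRTIO_NET_HDR_GSO_{NONE,TCPV4,UDP_L4,TCPV6}
	HdrLen     uint16 // header bytes in front of the payload (L3+L4 here, no L2 on TUN)
	GSOSize    uint16 // payload bytes per segment: TCP MSS (TSO) / UDP chunk (USO)
	CsumStart  uint16 // offset where checksumming starts (the L4 header)
	CsumOffset uint16 // where to store the result, relative to CsumStart (TCP 16, UDP 6)
}
```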
@@ -5,6 +5,7 @@ import (
 	"os"
 	"sync/atomic"
 
+	"github.com/slackhq/nebula/wire"
 	"golang.org/x/sys/unix"
 )
 
@@ -19,9 +20,6 @@ type Poll struct {
 	readPoll  [2]unix.PollFd
 	writePoll [2]unix.PollFd
 	closed    atomic.Bool
-
-	readBuf  []byte
-	batchRet [1]Packet
 }
 
 func newPoll(fd int, shutdownFd int) (*Poll, error) {
@@ -31,8 +29,7 @@ func newPoll(fd int, shutdownFd int) (*Poll, error) {
 	}
 
 	out := &Poll{
-		fd:      fd,
-		readBuf: make([]byte, tunReadBufSize),
+		fd: fd,
 		readPoll: [2]unix.PollFd{
 			{Fd: int32(fd), Events: unix.POLLIN},
 			{Fd: int32(shutdownFd), Events: unix.POLLIN},
@@ -97,13 +94,17 @@ func (t *Poll) blockOnWrite() error {
 	return nil
 }
 
-func (t *Poll) Read() ([]Packet, error) {
-	n, err := t.readOne(t.readBuf)
-	if err != nil {
-		return nil, err
+func (t *Poll) Read(p []wire.TunPacket, mem []byte) (int, error) {
+	if len(p) == 0 || len(mem) == 0 {
+		return 0, nil //todo should this be an err?
 	}
-	t.batchRet[0] = Packet{Bytes: t.readBuf[:n]}
-	return t.batchRet[:], nil
+	p[0].Meta = wire.GSOInfo{}
+	n, err := t.readOne(mem)
+	if err != nil {
+		return 0, err
+	}
+	p[0].Bytes = mem[:n]
+	return 1, nil
 }
 
 func (t *Poll) readOne(to []byte) (int, error) {
@@ -162,3 +163,7 @@ func (t *Poll) Close() error {
 
 	return err
 }
+
+func (t *Poll) Capabilities() Capabilities {
+	return Capabilities{}
+}
@@ -6,46 +6,20 @@ package tio
 import (
 	"fmt"
 
+	"github.com/slackhq/nebula/wire"
 	"golang.org/x/sys/unix"
 
 	"github.com/slackhq/nebula/overlay/tio/virtio"
 )
 
 // protoFromGSOType maps a virtio_net_hdr GSOType to the GSOProto value the
 // segment-time helpers use. Returns an error for GSO_NONE or any unknown
 // value — the caller should only invoke this on a confirmed superpacket.
-func protoFromGSOType(t uint8) (GSOProto, error) {
+func protoFromGSOType(t uint8) (wire.GSOProto, error) {
 	switch t {
 	case unix.VIRTIO_NET_HDR_GSO_TCPV4, unix.VIRTIO_NET_HDR_GSO_TCPV6:
-		return GSOProtoTCP, nil
+		return wire.GSOProtoTCP, nil
 	case unix.VIRTIO_NET_HDR_GSO_UDP_L4:
-		return GSOProtoUDP, nil
+		return wire.GSOProtoUDP, nil
 	default:
 		return 0, fmt.Errorf("unsupported virtio gso type: %d", t)
 	}
 }
-
-// SegmentSuperpacket invokes fn once per segment of pkt. For non-GSO pkts
-// fn is called once with pkt.Bytes (no segmentation, no copy). For GSO/USO
-// superpackets fn is called once per segment with a slice of pkt.Bytes
-// holding that segment's plaintext (a freshly-patched L3+L4 header sliced
-// in front of the original payload chunk). The slide is destructive: pkt is
-// consumed by this call and its bytes are in an undefined state when
-// SegmentSuperpacket returns. Callers must not retain pkt or any earlier
-// seg slice past fn's return for that segment. The scratch parameter is
-// unused on the destructive path and kept only for cross-platform
-// signature compatibility. Aborts and returns the first error from fn or
-// from per-segment construction.
-func SegmentSuperpacket(pkt Packet, fn func(seg []byte) error) error {
-	if !pkt.GSO.IsSuperpacket() {
-		return fn(pkt.Bytes)
-	}
-	switch pkt.GSO.Proto {
-	case GSOProtoTCP:
-		return virtio.SegmentTCP(pkt.Bytes, pkt.GSO.HdrLen, pkt.GSO.CsumStart, pkt.GSO.Size, fn)
-	case GSOProtoUDP:
-		return virtio.SegmentUDP(pkt.Bytes, pkt.GSO.HdrLen, pkt.GSO.CsumStart, pkt.GSO.Size, fn)
-	default:
-		return fmt.Errorf("unsupported gso proto: %d", pkt.GSO.Proto)
-	}
-}
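SegmentSuperpacket, deleted above and presumably superseded by the wire/TunPacket plumbing, was driven once per segment at encrypt time. A sketch of the old contract, with encryptAndSend standing in for the real per-segment sink:

```go
// Old-style consumption: fn runs once per segment; pkt is consumed by the
// call and neither pkt nor an earlier seg may be retained after fn returns.
err := tio.SegmentSuperpacket(pkt, func(seg []byte) error {
	return encryptAndSend(seg) // hypothetical: encrypt, then write to the wire
})
```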
@@ -83,7 +83,7 @@ func (t *disabledTun) Read(p []wire.TunPacket, mem []byte) (int, error) {
 	if len(p) == 0 || len(mem) == 0 {
 		return 0, nil //todo should this be an err?
 	}
-	p[0].Meta = struct{}{}
+	p[0].Meta = wire.GSOInfo{}
 	n, err := t.readOne(mem)
 	if err != nil {
 		return 0, err

@@ -146,6 +146,11 @@ func newTun(c *config.C, l *slog.Logger, vpnNetworks []netip.Prefix, multiqueue
 	}
 	nameStr := c.GetString("tun.dev", "")
 
+	// First try to enable IFF_VNET_HDR via TUNSETIFF and negotiate TUN_F_*
+	// offloads via TUNSETOFFLOAD so we can receive TSO/USO superpackets.
+	// We try TSO+USO first, fall back to TSO-only on kernels without USO
+	// (Linux < 6.2), and finally give up on virtio headers entirely and
+	// reopen as a plain TUN if neither offload mask is accepted.
 	fd, err := openTunDev()
 	if err != nil {
 		return nil, err
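The ladder that comment describes presumably reduces to a pair of TUNSETOFFLOAD ioctls; a hedged sketch using golang.org/x/sys/unix (TUN_F_USO4/TUN_F_USO6 need a recent x/sys and a 6.2+ kernel; the function name, error handling, and return shape are invented):

```go
// Negotiate offloads, strongest mask first. Reports whether USO is on.
func negotiateOffloads(fd int) (usoEnabled bool, err error) {
	base := unix.TUN_F_CSUM | unix.TUN_F_TSO4 | unix.TUN_F_TSO6
	if unix.IoctlSetInt(fd, unix.TUNSETOFFLOAD, base|unix.TUN_F_USO4|unix.TUN_F_USO6) == nil {
		return true, nil // TSO+USO accepted
	}
	if unix.IoctlSetInt(fd, unix.TUNSETOFFLOAD, base) == nil {
		return false, nil // TSO-only (kernel without USO)
	}
	// Neither mask accepted: the caller reopens the device as a plain TUN.
	return false, errors.New("no acceptable TUNSETOFFLOAD mask")
}
```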
@@ -48,7 +48,7 @@ func (d *UserDevice) Read(p []wire.TunPacket, mem []byte) (int, error) {
 	if len(p) == 0 || len(mem) == 0 {
 		return 0, nil //todo should this be an err?
 	}
-	p[0].Meta = struct{}{}
+	p[0].Meta = wire.GSOInfo{}
 	n, err := d.outboundReader.Read(mem)
 	if err != nil {
 		return 0, err