From d50c3028a2c88ff2a58a79a006d867f334cb293c Mon Sep 17 00:00:00 2001 From: JackDoan Date: Thu, 14 May 2026 15:56:34 -0500 Subject: [PATCH] broken checkpt --- interface.go | 18 +- overlay/batch/coalesce_core.go | 41 --- overlay/batch/multi_coalesce.go | 10 +- overlay/batch/passthrough.go | 5 +- overlay/batch/tcp_coalesce.go | 10 +- overlay/batch/udp_coalesce.go | 10 +- overlay/tio/tio.go | 47 +-- overlay/tio/tio_gso_linux.go | 69 ++-- overlay/tio/tio_poll_linux.go | 27 +- overlay/tio/tun_linux_offload.go | 34 +- overlay/tun_disabled.go | 2 +- overlay/tun_linux.go | 5 + overlay/user.go | 2 +- udp/udp_linux.go | 603 +++++++++++++++++++++++++++---- wire/wire.go | 39 +- wire/wire_generic.go | 10 + wire/wire_linux.go | 32 ++ 17 files changed, 706 insertions(+), 258 deletions(-) create mode 100644 wire/wire_generic.go create mode 100644 wire/wire_linux.go diff --git a/interface.go b/interface.go index 100500d5..9dcbf346 100644 --- a/interface.go +++ b/interface.go @@ -283,7 +283,7 @@ func (f *Interface) activate() error { } f.readers = f.inside.Readers() for i := range f.readers { - caps := tio.QueueCapabilities(f.readers[i]) + caps := f.readers[i].Capabilities() if caps.TSO || caps.USO { // Multi-lane: TCP gets coalesced when TSO is on, UDP when USO // is on, everything else (and either lane disabled) falls @@ -387,7 +387,19 @@ func (f *Interface) listenIn(reader tio.Queue, q int) { f.l.Warn("failed to pin tun reader to CPU", "queue", q, "cpu", cpu, "err", err) } + const bonusInfo = 16 + bufferScale := udp.MTU + bonusInfo + numTunPackets := 1 + caps := reader.Capabilities() + if caps.TSO || caps.USO { + bufferScale = 65535 + bonusInfo + numTunPackets = f.batchSize + } + rejectBuf := make([]byte, mtu) + tunPackets := make([]wire.TunPacket, numTunPackets) + packetMem := make([]byte, bufferScale*numTunPackets) + arenaSize := batch.SendBatchCap * (udp.MTU + 32) sb := batch.NewSendBatch(f.writers[q], batch.SendBatchCap, util.NewArena(arenaSize)) fwPacket := &firewall.Packet{} @@ -396,7 +408,7 @@ func (f *Interface) listenIn(reader tio.Queue, q int) { conntrackCache := firewall.NewConntrackCacheTicker(f.ctx, f.l, f.conntrackCacheTimeout) for { - n, err := reader.Read(packets, packetMem) + n, err := reader.Read(tunPackets, packetMem) if err != nil { if !f.closed.Load() { f.l.Error("Error while reading outbound packet, closing", "error", err, "reader", q) @@ -407,7 +419,7 @@ func (f *Interface) listenIn(reader tio.Queue, q int) { ctCache := conntrackCache.Get() for i := range n { - f.consumeInsidePacket(packets[i], fwPacket, nb, sb, rejectBuf, q, ctCache) + f.consumeInsidePacket(tunPackets[i], fwPacket, nb, sb, rejectBuf, q, ctCache) } if err := sb.Flush(); err != nil { f.l.Error("Failed to write outgoing batch", "error", err, "writer", q) diff --git a/overlay/batch/coalesce_core.go b/overlay/batch/coalesce_core.go index d46addc3..f0399989 100644 --- a/overlay/batch/coalesce_core.go +++ b/overlay/batch/coalesce_core.go @@ -147,44 +147,3 @@ func mergeECNIntoSeed(seedHdr, pktHdr []byte, isV6 bool) { seedHdr[1] |= pktHdr[1] & 0x03 } } - -// Arena is an injectable byte-slab that hands out non-overlapping borrowed -// slices via Reserve and releases them in bulk via Reset. Coalescers take -// an *Arena at construction so the caller controls the slab lifetime and -// can share one slab across multiple coalescers (MultiCoalescer hands the -// same *Arena to every lane so the lanes don't carry their own backings). -// -// Reserve borrows; the slice is valid until the next Reset. 
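// A minimal usage sketch of that contract (caller-owned arena, as the
// coalescers use it; the sizes here are illustrative only):
//
//	a := util.NewArena(64 * 1024) // pre-size so Reserve stays off the grow path
//	buf := a.Reserve(1500)        // borrow; valid until the next Reset
//	copy(buf, pkt)
//	// ... batch flushed ...
//	a.Reset() // bulk-release: buf must not be touched after this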
The slab grows -// (by allocating a fresh, larger backing array) if a Reserve doesn't fit; -// pre-size the arena via NewArena to avoid that path on the hot path. -type Arena struct { - buf []byte -} - -// NewArena returns an Arena with a pre-allocated backing of the given -// capacity. Pass 0 if you don't intend to call Reserve (e.g. a test that -// only feeds the coalescer pre-made []byte packets via Commit). -func NewArena(capacity int) *Arena { - return &Arena{buf: make([]byte, 0, capacity)} -} - -// Reserve hands out a non-overlapping sz-byte slice from the arena. If the -// request doesn't fit the current backing, a fresh, larger backing is -// allocated; already-borrowed slices reference the old backing and remain -// valid until Reset. -func (a *Arena) Reserve(sz int) []byte { - if len(a.buf)+sz > cap(a.buf) { - newCap := max(cap(a.buf)*2, sz) - a.buf = make([]byte, 0, newCap) - } - start := len(a.buf) - a.buf = a.buf[:start+sz] - return a.buf[start : start+sz : start+sz] -} - -// Reset releases every slice handed out since the last Reset. Callers must -// not use any previously-borrowed slice after this returns. The underlying -// backing array is retained so subsequent Reserves don't re-allocate. -func (a *Arena) Reset() { - a.buf = a.buf[:0] -} diff --git a/overlay/batch/multi_coalesce.go b/overlay/batch/multi_coalesce.go index b6f76fe6..5eef43f2 100644 --- a/overlay/batch/multi_coalesce.go +++ b/overlay/batch/multi_coalesce.go @@ -2,8 +2,10 @@ package batch import ( "errors" - "io" "log/slog" + + "github.com/slackhq/nebula/overlay/tio" + "github.com/slackhq/nebula/util" ) // MultiCoalescer fans plaintext packets out to lane-specific batchers based @@ -35,7 +37,7 @@ type MultiCoalescer struct { // sequentially and never Reserves in between, so a later lane's // slots stay readable across an earlier lane's Reset (the underlying // bytes are still alive — Reset only re-slices len to 0). - arena *Arena + arena *util.Arena } // DefaultMultiArenaCap is the recommended arena capacity for a Multi-lane @@ -49,9 +51,9 @@ const DefaultMultiArenaCap = initialSlots * 65535 // Either lane disabled redirects its traffic into the passthrough lane. // arena is the single backing slab shared across every lane; the caller // pre-sizes it via NewArena so the hot path never allocates. -func NewMultiCoalescer(w io.Writer, l *slog.Logger, arena *Arena, tcpEnabled, udpEnabled bool) *MultiCoalescer { +func NewMultiCoalescer(w tio.Queue, l *slog.Logger, arena *util.Arena, tcpEnabled, udpEnabled bool) *MultiCoalescer { m := &MultiCoalescer{ - pt: NewPassthrough(w, arena), + pt: NewPassthrough(w, initialSlots, arena), arena: arena, } if tcpEnabled { diff --git a/overlay/batch/passthrough.go b/overlay/batch/passthrough.go index db62ab85..39aaca2e 100644 --- a/overlay/batch/passthrough.go +++ b/overlay/batch/passthrough.go @@ -4,6 +4,7 @@ import ( "io" "github.com/slackhq/nebula/udp" + "github.com/slackhq/nebula/util" ) // Passthrough is a RxBatcher that doesn't batch anything, it just accumulates and then sends packets. @@ -11,7 +12,7 @@ type Passthrough struct { out io.Writer slots [][]byte // arena is injected; see TCPCoalescer.arena for the contract. - arena *Arena + arena *util.Arena cursor int } @@ -21,7 +22,7 @@ const passthroughBaseNumSlots = 128 // standalone Passthrough batcher: 128 slots × udp.MTU ≈ 1.1 MiB. 
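// For example (sketch; w is any io.Writer, and the slot count is the
// base value above):
//
//	arena := util.NewArena(DefaultPassthroughArenaCap)
//	pt := NewPassthrough(w, passthroughBaseNumSlots, arena)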
const DefaultPassthroughArenaCap = passthroughBaseNumSlots * udp.MTU -func NewPassthrough(w io.Writer, slots int, arena *Arena) *Passthrough { +func NewPassthrough(w io.Writer, slots int, arena *util.Arena) *Passthrough { return &Passthrough{ out: w, slots: make([][]byte, 0, slots), diff --git a/overlay/batch/tcp_coalesce.go b/overlay/batch/tcp_coalesce.go index c50667e0..2f811454 100644 --- a/overlay/batch/tcp_coalesce.go +++ b/overlay/batch/tcp_coalesce.go @@ -10,6 +10,8 @@ import ( "slices" "github.com/slackhq/nebula/overlay/tio" + "github.com/slackhq/nebula/util" + "github.com/slackhq/nebula/wire" ) // ipProtoTCP is the IANA protocol number for TCP. Hardcoded instead of @@ -88,11 +90,11 @@ type TCPCoalescer struct { // and tells it to release them via Reset on Flush. When wrapped in // MultiCoalescer the same *Arena is shared with the other lanes so // there's exactly one backing slab per Multi instance. - arena *Arena + arena *util.Arena l *slog.Logger } -func NewTCPCoalescer(w io.Writer, l *slog.Logger, arena *Arena) *TCPCoalescer { +func NewTCPCoalescer(w tio.Queue, l *slog.Logger, arena *util.Arena) *TCPCoalescer { c := &TCPCoalescer{ plainW: w, slots: make([]*coalesceSlot, 0, initialSlots), @@ -101,7 +103,7 @@ func NewTCPCoalescer(w io.Writer, l *slog.Logger, arena *Arena) *TCPCoalescer { arena: arena, l: l, } - if gw, ok := tio.SupportsGSO(w, tio.GSOProtoTCP); ok { + if gw, ok := tio.SupportsGSO(w, wire.GSOProtoTCP); ok { c.gsoW = gw } return c @@ -419,7 +421,7 @@ func (c *TCPCoalescer) flushSlot(s *coalesceSlot) error { tcsum := s.ipHdrLen + 16 binary.BigEndian.PutUint16(hdr[tcsum:tcsum+2], foldOnceNoInvert(psum)) - return c.gsoW.WriteGSO(hdr[:s.ipHdrLen], hdr[s.ipHdrLen:], s.payIovs, tio.GSOProtoTCP) + return c.gsoW.WriteGSO(hdr[:s.ipHdrLen], hdr[s.ipHdrLen:], s.payIovs, wire.GSOProtoTCP) } // headersMatch compares two IP+TCP header prefixes for byte-for-byte diff --git a/overlay/batch/udp_coalesce.go b/overlay/batch/udp_coalesce.go index 410d2982..6cdf1ff4 100644 --- a/overlay/batch/udp_coalesce.go +++ b/overlay/batch/udp_coalesce.go @@ -5,6 +5,8 @@ import ( "io" "github.com/slackhq/nebula/overlay/tio" + "github.com/slackhq/nebula/util" + "github.com/slackhq/nebula/wire" ) // ipProtoUDP is the IANA protocol number for UDP. @@ -67,7 +69,7 @@ type UDPCoalescer struct { pool []*udpSlot // arena is injected; see TCPCoalescer.arena for the contract. - arena *Arena + arena *util.Arena } // NewUDPCoalescer wraps w. The caller is responsible for only constructing @@ -75,7 +77,7 @@ type UDPCoalescer struct { // the kernel may reject GSO_UDP_L4 writes. If w does not implement // tio.GSOWriter at all (single-packet Queue), the coalescer degrades to // plain Writes — same defensive shape as the TCP coalescer. 
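// For example (sketch): constructing over a queue without negotiated USO
// leaves the GSO path disarmed, so every flush takes the plain-Write path:
//
//	c := NewUDPCoalescer(q, arena) // q.Capabilities().USO == false
//	// SupportsGSO returned ok=false, so c.gsoW stays nil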
-func NewUDPCoalescer(w io.Writer, arena *Arena) *UDPCoalescer { +func NewUDPCoalescer(w tio.Queue, arena *util.Arena) *UDPCoalescer { c := &UDPCoalescer{ plainW: w, slots: make([]*udpSlot, 0, initialSlots), @@ -83,7 +85,7 @@ func NewUDPCoalescer(w io.Writer, arena *Arena) *UDPCoalescer { pool: make([]*udpSlot, 0, initialSlots), arena: arena, } - if gw, ok := tio.SupportsGSO(w, tio.GSOProtoUDP); ok { + if gw, ok := tio.SupportsGSO(w, wire.GSOProtoUDP); ok { c.gsoW = gw } return c @@ -313,7 +315,7 @@ func (c *UDPCoalescer) flushSlot(s *udpSlot) error { udpCsumOff := s.ipHdrLen + 6 binary.BigEndian.PutUint16(hdr[udpCsumOff:udpCsumOff+2], foldOnceNoInvert(psum)) - return c.gsoW.WriteGSO(hdr[:s.ipHdrLen], hdr[s.ipHdrLen:], s.payIovs, tio.GSOProtoUDP) + return c.gsoW.WriteGSO(hdr[:s.ipHdrLen], hdr[s.ipHdrLen:], s.payIovs, wire.GSOProtoUDP) } // udpHeadersMatch compares two IP+UDP header prefixes for byte-equality on diff --git a/overlay/tio/tio.go b/overlay/tio/tio.go index 6611ffa5..08c5060c 100644 --- a/overlay/tio/tio.go +++ b/overlay/tio/tio.go @@ -46,42 +46,6 @@ type Queue interface { Capabilities() Capabilities } - -// GSOInfo describes a kernel-supplied superpacket sitting in Packet.Bytes. -// The zero value means "not a superpacket" — Bytes is one regular IP -// datagram and no segmentation is required. -type GSOInfo struct { - // Size is the GSO segment size: max payload bytes per segment - // (== TCP MSS for TSO, == UDP payload chunk for USO). Zero means - // not a superpacket. - Size uint16 - // HdrLen is the total L3+L4 header length within Bytes (already - // corrected via correctHdrLen, so safe to slice on). - HdrLen uint16 - // CsumStart is the L4 header offset inside Bytes (== L3 header - // length). - CsumStart uint16 - // Proto picks the L4 protocol (TCP or UDP) so the segmenter knows - // which checksum/header layout to apply. - Proto GSOProto -} - -// IsSuperpacket reports whether g describes a multi-segment GSO/USO -// superpacket that needs segmentation before its bytes can be encrypted -// and sent on the wire. -func (g GSOInfo) IsSuperpacket() bool { return g.Size > 0 } - -// GSOProto selects the L4 protocol for a GSO superpacket. Determines which -// VIRTIO_NET_HDR_GSO_* type the writer stamps and which checksum offset -// inside the transport header virtio NEEDS_CSUM expects. -type GSOProto uint8 - -const ( - GSOProtoNone GSOProto = iota - GSOProtoTCP - GSOProtoUDP -) - // GSOWriter is implemented by Queues that can emit a TCP or UDP superpacket // assembled from a header prefix plus one or more borrowed payload // fragments, in a single vectored write (writev with a leading @@ -104,24 +68,25 @@ const ( // implementation of GSOWriter is necessary but not sufficient since USO // may not have been negotiated even when TSO was. type GSOWriter interface { - WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto GSOProto) error + WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto wire.GSOProto) error } // SupportsGSO reports whether w implements GSOWriter and the underlying // queue advertises the negotiated capability for `want`. A writer that // implements GSOWriter but not CapsProvider is treated as permissive // (used by tests and fakes that don't negotiate). 
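// For example (sketch, mirroring the coalescer constructors):
//
//	if gw, ok := SupportsGSO(q, wire.GSOProtoUDP); ok {
//		// USO was negotiated; gw.WriteGSO may emit UDP superpackets
//	}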
-func SupportsGSO(w Queue, want GSOProto) (GSOWriter, bool) { +func SupportsGSO(w Queue, want wire.GSOProto) (GSOWriter, bool) { gw, ok := w.(GSOWriter) if !ok { return nil, false } caps := w.Capabilities() switch want { - case GSOProtoTCP: + case wire.GSOProtoTCP: return gw, caps.TSO - case GSOProtoUDP: + case wire.GSOProtoUDP: return gw, caps.USO + default: + return gw, false } - return gw, false } diff --git a/overlay/tio/tio_gso_linux.go b/overlay/tio/tio_gso_linux.go index 583bad35..fe154283 100644 --- a/overlay/tio/tio_gso_linux.go +++ b/overlay/tio/tio_gso_linux.go @@ -10,6 +10,7 @@ import ( "syscall" "unsafe" + "github.com/slackhq/nebula/wire" "golang.org/x/sys/unix" "github.com/slackhq/nebula/overlay/tio/virtio" @@ -67,9 +68,6 @@ type Offload struct { // events. writeLock sync.Mutex closed atomic.Bool - rxBuf []byte // backing store for kernel-handed packets read this drain - rxOff int // cursor into rxBuf for the current Read drain - pending []Packet // packets returned from the most recent Read // readVnetScratch holds the 10-byte virtio_net_hdr split off the front of // every TUN read via readv(2). Decoupling the header from the packet body @@ -115,9 +113,7 @@ func newOffload(fd int, shutdownFd int, usoEnabled bool) (*Offload, error) { {Fd: int32(shutdownFd), Events: unix.POLLIN}, }, writeLock: sync.Mutex{}, - - rxBuf: make([]byte, tunRxBufCap), - gsoIovs: make([]unix.Iovec, 2, gsoMaxIovs), + gsoIovs: make([]unix.Iovec, 2, gsoMaxIovs), } out.gsoIovs[0].Base = &out.gsoHdrBuf[0] @@ -197,9 +193,9 @@ func (r *Offload) blockOnWrite() error { // hold one worst-case kernel-supplied packet body. Without that gate the // body iovec could be smaller than the next inbound packet and the // kernel would truncate. -func (r *Offload) readPacket(block bool) (int, error) { +func (r *Offload) readPacket(mem []byte, block bool) (int, error) { for { - r.readIovs[1].Base = &r.rxBuf[r.rxOff] + r.readIovs[1].Base = &mem[0] r.readIovs[1].SetLen(tunReadBufSize) n, _, errno := syscall.Syscall(unix.SYS_READV, uintptr(r.fd), uintptr(unsafe.Pointer(&r.readIovs[0])), uintptr(len(r.readIovs))) if errno == 0 { @@ -237,29 +233,33 @@ func (r *Offload) readPacket(block bool) (int, error) { // bursts of small packets (e.g. TCP ACKs). Packet.Bytes slices point // into the Offload's internal buffer and are only valid until the next // Read or Close on this Queue. -func (r *Offload) Read() ([]Packet, error) { - r.pending = r.pending[:0] - r.rxOff = 0 +func (r *Offload) Read(p []wire.TunPacket, mem []byte) (int, error) { + maxP := len(p) + maxM := len(mem) + p = p[:0] + rxOff := 0 // Initial (blocking) read. Retry on decode errors so a single bad // packet does not stall the reader. for { - n, err := r.readPacket(true) + n, err := r.readPacket(mem, true) if err != nil { - return nil, err + return 0, err } - if err := r.decodeRead(n); err != nil { + if p, err = r.decodeRead(p, mem, n); err != nil { // Drop and read again — a bad packet should not kill the reader. continue } + + rxOff += n break } // Drain: non-blocking reads until the kernel queue is empty, the drain // cap is reached, or rxBuf no longer has room for another worst-case // kernel-supplied packet (tunRxBufSize). - for len(r.pending) < tunDrainCap && tunRxBufCap-r.rxOff >= tunRxBufSize { - n, err := r.readPacket(false) + for len(p) < maxP && maxM-rxOff >= tunRxBufSize { + n, err := r.readPacket(mem[rxOff:], false) if err != nil { // EAGAIN / EINTR / anything else: stop draining. We already // have a valid batch from the first read. 
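// Caller-side sizing sketch (mirrors listenIn in interface.go): with
// offloads on, every slot must be able to absorb a worst-case
// superpacket, so mem is sized batchSize*(65535+16) and the drain stops
// once the remaining tail is smaller than tunRxBufSize:
//
//	tunPackets := make([]wire.TunPacket, batchSize)
//	packetMem := make([]byte, batchSize*(65535+16))
//	n, err := reader.Read(tunPackets, packetMem)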
@@ -268,14 +268,15 @@ func (r *Offload) Read() ([]Packet, error) { if n <= 0 { break } - if err := r.decodeRead(n); err != nil { + if p, err = r.decodeRead(p, mem, n); err != nil { // Drop this packet and stop the drain; we'd rather hand off // what we have than keep spinning here. break } + rxOff += n } - return r.pending, nil + return len(p), nil } // decodeRead processes the packet sitting in rxBuf at rxOff (length @@ -285,24 +286,23 @@ func (r *Offload) Read() ([]Packet, error) { // caller can segment lazily at encrypt time. rxOff advances past the // kernel-supplied body and nothing else, since segmentation no longer // writes back into rxBuf. -func (r *Offload) decodeRead(pktLen int) error { +func (r *Offload) decodeRead(p []wire.TunPacket, mem []byte, pktLen int) ([]wire.TunPacket, error) { if pktLen <= 0 { - return fmt.Errorf("short tun read: %d", pktLen) + return p, fmt.Errorf("short tun read: %d", pktLen) } var hdr virtio.Hdr hdr.Decode(r.readVnetScratch[:]) - body := r.rxBuf[r.rxOff : r.rxOff+pktLen] + body := mem[:pktLen] if hdr.GSOType == unix.VIRTIO_NET_HDR_GSO_NONE { if hdr.Flags&unix.VIRTIO_NET_HDR_F_NEEDS_CSUM != 0 { if err := virtio.FinishChecksum(body, hdr); err != nil { - return err + return p, err } } - r.pending = append(r.pending, Packet{Bytes: body}) - r.rxOff += pktLen - return nil + p = append(p, wire.TunPacket{Bytes: body}) + return p, nil } // GSO superpacket: validate, fix the kernel-supplied HdrLen on the @@ -310,26 +310,25 @@ func (r *Offload) decodeRead(pktLen int) error { // the metadata. The bytes stay in rxBuf untouched, segmentation // happens in SegmentSuperpacket at encrypt time. if err := virtio.CheckValid(body, hdr); err != nil { - return err + return p, err } if err := virtio.CorrectHdrLen(body, &hdr); err != nil { - return err + return p, err } proto, err := protoFromGSOType(hdr.GSOType) if err != nil { - return err + return p, err } - r.pending = append(r.pending, Packet{ + p = append(p, wire.TunPacket{ Bytes: body, - GSO: GSOInfo{ + Meta: wire.GSOInfo{ Size: hdr.GSOSize, HdrLen: hdr.HdrLen, CsumStart: hdr.CsumStart, Proto: proto, }, }) - r.rxOff += pktLen - return nil + return p, nil } func (r *Offload) Write(buf []byte) (int, error) { @@ -384,7 +383,7 @@ func (r *Offload) Capabilities() Capabilities { return Capabilities{TSO: true, USO: r.usoEnabled} } -func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto GSOProto) error { +func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto wire.GSOProto) error { if len(hdr) == 0 || len(pays) == 0 || len(transportHdr) == 0 { return nil } @@ -392,7 +391,7 @@ func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto // seq/ack/dataoff/flags/window), UDP=6 (after sport/dport/length). 
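// Worked layout (standard TCP/UDP header offsets, for reference):
//
//	TCP: sport(0-1) dport(2-3) seq(4-7) ack(8-11) dataoff/flags(12-13)
//	     window(14-15) checksum(16-17)                  -> csumOff = 16
//	UDP: sport(0-1) dport(2-3) length(4-5) checksum(6-7) -> csumOff = 6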
var csumOff uint16 switch proto { - case GSOProtoUDP: + case wire.GSOProtoUDP: csumOff = 6 default: csumOff = 16 @@ -407,7 +406,7 @@ func (r *Offload) WriteGSO(hdr []byte, transportHdr []byte, pays [][]byte, proto if len(pays) > 1 { ipVer := hdr[0] >> 4 switch { - case proto == GSOProtoUDP && (ipVer == 4 || ipVer == 6): + case proto == wire.GSOProtoUDP && (ipVer == 4 || ipVer == 6): vhdr.GSOType = unix.VIRTIO_NET_HDR_GSO_UDP_L4 case ipVer == 6: vhdr.GSOType = unix.VIRTIO_NET_HDR_GSO_TCPV6 diff --git a/overlay/tio/tio_poll_linux.go b/overlay/tio/tio_poll_linux.go index 1b8c1f21..b81247d2 100644 --- a/overlay/tio/tio_poll_linux.go +++ b/overlay/tio/tio_poll_linux.go @@ -5,6 +5,7 @@ import ( "os" "sync/atomic" + "github.com/slackhq/nebula/wire" "golang.org/x/sys/unix" ) @@ -19,9 +20,6 @@ type Poll struct { readPoll [2]unix.PollFd writePoll [2]unix.PollFd closed atomic.Bool - - readBuf []byte - batchRet [1]Packet } func newPoll(fd int, shutdownFd int) (*Poll, error) { @@ -31,8 +29,7 @@ func newPoll(fd int, shutdownFd int) (*Poll, error) { } out := &Poll{ - fd: fd, - readBuf: make([]byte, tunReadBufSize), + fd: fd, readPoll: [2]unix.PollFd{ {Fd: int32(fd), Events: unix.POLLIN}, {Fd: int32(shutdownFd), Events: unix.POLLIN}, @@ -97,13 +94,17 @@ func (t *Poll) blockOnWrite() error { return nil } -func (t *Poll) Read() ([]Packet, error) { - n, err := t.readOne(t.readBuf) - if err != nil { - return nil, err +func (t *Poll) Read(p []wire.TunPacket, mem []byte) (int, error) { + if len(p) == 0 || len(mem) == 0 { + return 0, nil //todo should this be an err? } - t.batchRet[0] = Packet{Bytes: t.readBuf[:n]} - return t.batchRet[:], nil + p[0].Meta = wire.GSOInfo{} + n, err := t.readOne(mem) + if err != nil { + return 0, err + } + p[0].Bytes = mem[:n] + return 1, nil } func (t *Poll) readOne(to []byte) (int, error) { @@ -162,3 +163,7 @@ func (t *Poll) Close() error { return err } + +func (t *Poll) Capabilities() Capabilities { + return Capabilities{} +} diff --git a/overlay/tio/tun_linux_offload.go b/overlay/tio/tun_linux_offload.go index 9eb46729..bdf76683 100644 --- a/overlay/tio/tun_linux_offload.go +++ b/overlay/tio/tun_linux_offload.go @@ -6,46 +6,20 @@ package tio import ( "fmt" + "github.com/slackhq/nebula/wire" "golang.org/x/sys/unix" - - "github.com/slackhq/nebula/overlay/tio/virtio" ) // protoFromGSOType maps a virtio_net_hdr GSOType to the GSOProto value the // segment-time helpers use. Returns an error for GSO_NONE or any unknown // value — the caller should only invoke this on a confirmed superpacket. -func protoFromGSOType(t uint8) (GSOProto, error) { +func protoFromGSOType(t uint8) (wire.GSOProto, error) { switch t { case unix.VIRTIO_NET_HDR_GSO_TCPV4, unix.VIRTIO_NET_HDR_GSO_TCPV6: - return GSOProtoTCP, nil + return wire.GSOProtoTCP, nil case unix.VIRTIO_NET_HDR_GSO_UDP_L4: - return GSOProtoUDP, nil + return wire.GSOProtoUDP, nil default: return 0, fmt.Errorf("unsupported virtio gso type: %d", t) } } - -// SegmentSuperpacket invokes fn once per segment of pkt. For non-GSO pkts -// fn is called once with pkt.Bytes (no segmentation, no copy). For GSO/USO -// superpackets fn is called once per segment with a slice of pkt.Bytes -// holding that segment's plaintext (a freshly-patched L3+L4 header sliced -// in front of the original payload chunk). The slide is destructive: pkt is -// consumed by this call and its bytes are in an undefined state when -// SegmentSuperpacket returns. Callers must not retain pkt or any earlier -// seg slice past fn's return for that segment. 
The scratch parameter is -// unused on the destructive path and kept only for cross-platform -// signature compatibility. Aborts and returns the first error from fn or -// from per-segment construction. -func SegmentSuperpacket(pkt Packet, fn func(seg []byte) error) error { - if !pkt.GSO.IsSuperpacket() { - return fn(pkt.Bytes) - } - switch pkt.GSO.Proto { - case GSOProtoTCP: - return virtio.SegmentTCP(pkt.Bytes, pkt.GSO.HdrLen, pkt.GSO.CsumStart, pkt.GSO.Size, fn) - case GSOProtoUDP: - return virtio.SegmentUDP(pkt.Bytes, pkt.GSO.HdrLen, pkt.GSO.CsumStart, pkt.GSO.Size, fn) - default: - return fmt.Errorf("unsupported gso proto: %d", pkt.GSO.Proto) - } -} diff --git a/overlay/tun_disabled.go b/overlay/tun_disabled.go index d96f912b..e1d1600b 100644 --- a/overlay/tun_disabled.go +++ b/overlay/tun_disabled.go @@ -83,7 +83,7 @@ func (t *disabledTun) Read(p []wire.TunPacket, mem []byte) (int, error) { if len(p) == 0 || len(mem) == 0 { return 0, nil //todo should this be an err? } - p[0].Meta = struct{}{} + p[0].Meta = wire.GSOInfo{} n, err := t.readOne(mem) if err != nil { return 0, err diff --git a/overlay/tun_linux.go b/overlay/tun_linux.go index 1245027f..c18fc38e 100644 --- a/overlay/tun_linux.go +++ b/overlay/tun_linux.go @@ -146,6 +146,11 @@ func newTun(c *config.C, l *slog.Logger, vpnNetworks []netip.Prefix, multiqueue } nameStr := c.GetString("tun.dev", "") + // First try to enable IFF_VNET_HDR via TUNSETIFF and negotiate TUN_F_* + // offloads via TUNSETOFFLOAD so we can receive TSO/USO superpackets. + // We try TSO+USO first, fall back to TSO-only on kernels without USO + // (Linux < 6.2), and finally give up on virtio headers entirely and + // reopen as a plain TUN if neither offload mask is accepted. fd, err := openTunDev() if err != nil { return nil, err diff --git a/overlay/user.go b/overlay/user.go index 64d772e5..5345119d 100644 --- a/overlay/user.go +++ b/overlay/user.go @@ -48,7 +48,7 @@ func (d *UserDevice) Read(p []wire.TunPacket, mem []byte) (int, error) { if len(p) == 0 || len(mem) == 0 { return 0, nil //todo should this be an err? } - p[0].Meta = struct{}{} + p[0].Meta = wire.GSOInfo{} n, err := d.outboundReader.Read(mem) if err != nil { return 0, err diff --git a/udp/udp_linux.go b/udp/udp_linux.go index 6465be32..53564e1a 100644 --- a/udp/udp_linux.go +++ b/udp/udp_linux.go @@ -6,10 +6,13 @@ package udp import ( "context" "encoding/binary" + "errors" "fmt" "log/slog" "net" "net/netip" + "strconv" + "strings" "syscall" "unsafe" @@ -32,14 +35,44 @@ type StdConn struct { writeIovs []iovec writeNames [][]byte - // sendmmsg(2) callback state. sendmmsgCB is bound once in NewListener - // to the sendmmsgRun method value so passing it to rawConn.Write does - // not allocate a fresh closure per send; sendmmsgN/Sent/Errno carry - // the inputs and outputs across the call without escaping locals. - sendmmsgCB func(fd uintptr) bool - sendmmsgN int - sendmmsgSent int - sendmmsgErrno syscall.Errno + // Per-entry cmsg scratch. writeCmsg is one contiguous slab of + // MaxWriteBatch * writeCmsgSpace bytes; each entry holds two cmsg + // headers (UDP_SEGMENT then IP_TOS / IPV6_TCLASS) pre-filled once in + // prepareWriteMessages. WriteBatch only rewrites the per-call data + // payloads and toggles Hdr.Control / Hdr.Controllen to point at + // whichever subset of the two cmsgs applies. + writeCmsg []byte + writeCmsgSpace int + writeCmsgSegSpace int + writeCmsgEcnSpace int + + // writeEntryEnd[e] is the bufs index *after* the last packet packed + // into mmsghdr entry e. 
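// For example, entries 0..2 packing bufs [0,3), [3,4), [4,8) give
// writeEntryEnd = [3, 4, 8]; if sendmmsg reports only sent=2 entries
// out, WriteBatch resumes at i = writeEntryEnd[1] = 4.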
Used to rewind `i` on partial sendmmsg success. + writeEntryEnd []int + + // rawSend wraps the sendmmsg(2) callback in a closure-free helper so + // the hot path doesn't heap-allocate a fresh closure per call. + rawSend rawSendmmsg + + // UDP GSO (sendmsg with UDP_SEGMENT cmsg) support. gsoSupported is + // probed once at socket creation. When true, WriteBatch packs same- + // destination consecutive packets into a single sendmmsg entry with a + // UDP_SEGMENT cmsg; otherwise each packet is its own entry. + gsoSupported bool + maxGSOSegments int + + // UDP GRO (recvmsg with UDP_GRO cmsg) support. groSupported is probed + // once at socket creation. When true, listenOutBatch allocates larger + // RX buffers and a per-entry cmsg slot so the kernel can coalesce + // consecutive same-flow datagrams into a single recvmmsg entry; the + // delivered cmsg carries the gso_size used to split them back apart. + groSupported bool + + // ecnRecvSupported is true when IP_RECVTOS / IPV6_RECVTCLASS was + // successfully enabled — the kernel will deliver the outer IP-ECN of + // each arriving datagram as a per-slot cmsg, and listenOutBatch passes + // the parsed value to the EncReader callback for RFC 6040 combine. + ecnRecvSupported bool } func setReusePort(network, address string, c syscall.RawConn) error { @@ -73,10 +106,11 @@ func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int) } //gotta find out if we got an AF_INET6 socket or not: out := &StdConn{ - udpConn: udpConn, - rawConn: rawConn, - l: l, - batch: batch, + udpConn: udpConn, + rawConn: rawConn, + l: l, + batch: batch, + maxGSOSegments: 1, } af, err := out.getSockOptInt(unix.SO_DOMAIN) @@ -87,15 +121,71 @@ func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int) out.isV4 = af == unix.AF_INET out.prepareWriteMessages(MaxWriteBatch) - out.sendmmsgCB = out.sendmmsgRun + out.rawSend.msgs = out.writeMsgs + out.rawSend.bind() + + out.prepareGSO() + // GRO delivers coalesced superpackets that need a cmsg to split back + // into segments. The single-packet RX path uses ReadFromUDPAddrPort + // and cannot see that cmsg, so only enable GRO for the batch path. + if batch > 1 { + out.prepareGRO() + } + // Best-effort: ask the kernel to deliver outer IP-ECN as ancillary data + // on every recvmmsg slot so the decap side can apply RFC 6040 combine. + // On older kernels these may not exist; failing here just means we get + // 0 (Not-ECT) on every slot, which is the same as ecn_mode=disable. + out.prepareECNRecv() return out, nil } +// prepareWriteMessages allocates one mmsghdr/iovec/sockaddr/cmsg scratch +// slot per sendmmsg entry. The iovec slab is sized to n so all entries' +// iovecs share one allocation; per-entry fan-out is further capped at +// maxGSOSegments. Hdr.Iov / Hdr.Iovlen / Hdr.Control / Hdr.Controllen are +// wired per call since each entry can span a variable number of iovecs +// and may or may not carry a cmsg. +// +// Per-mmsghdr cmsg layout. Each entry's slot of length writeCmsgSpace holds +// up to two cmsg headers placed at fixed offsets: +// +// [0 .. writeCmsgSegSpace) UDP_SEGMENT (gso_size, uint16) +// [writeCmsgSegSpace .. writeCmsgSpace) IP_TOS or IPV6_TCLASS (int32) +// +// Both headers are pre-filled once here; per-call we only rewrite the data +// payload and toggle Hdr.Control / Hdr.Controllen to point at whichever +// subset applies (none / segment-only / ecn-only / both). 
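// Concrete layout on 64-bit Linux (an assumption: 16-byte cmsghdr,
// 8-byte alignment), where CmsgSpace(2) == CmsgSpace(4) == 24, so each
// entry owns a 48-byte slot:
//
//	[ 0..24)  UDP_SEGMENT header + uint16 gso_size (+ padding)
//	[24..48)  IP_TOS/IPV6_TCLASS header + int32 payload (+ padding)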
func (u *StdConn) prepareWriteMessages(n int) { u.writeMsgs = make([]rawMessage, n) u.writeIovs = make([]iovec, n) u.writeNames = make([][]byte, n) + u.writeEntryEnd = make([]int, n) + + u.writeCmsgSegSpace = unix.CmsgSpace(2) + u.writeCmsgEcnSpace = unix.CmsgSpace(4) + u.writeCmsgSpace = u.writeCmsgSegSpace + u.writeCmsgEcnSpace + u.writeCmsg = make([]byte, n*u.writeCmsgSpace) + + ecnLevel := int32(unix.IPPROTO_IP) + ecnType := int32(unix.IP_TOS) + if !u.isV4 { + ecnLevel = unix.IPPROTO_IPV6 + ecnType = unix.IPV6_TCLASS + } + + for k := 0; k < n; k++ { + base := k * u.writeCmsgSpace + seg := (*unix.Cmsghdr)(unsafe.Pointer(&u.writeCmsg[base])) + seg.Level = unix.SOL_UDP + seg.Type = unix.UDP_SEGMENT + setCmsgLen(seg, unix.CmsgLen(2)) + + ecn := (*unix.Cmsghdr)(unsafe.Pointer(&u.writeCmsg[base+u.writeCmsgSegSpace])) + ecn.Level = ecnLevel + ecn.Type = ecnType + setCmsgLen(ecn, unix.CmsgLen(4)) + } for i := range u.writeMsgs { u.writeNames[i] = make([]byte, unix.SizeofSockaddrInet6) @@ -103,6 +193,139 @@ func (u *StdConn) prepareWriteMessages(n int) { } } +// maxGSOBytes bounds the total payload per sendmsg() when UDP_SEGMENT is +// set. The kernel stitches all iovecs into a single skb whose length the +// UDP length field can represent, and also enforces sk_gso_max_size (which +// on most devices is 65536). We use 65000 to leave headroom under the +// 65535 UDP-length cap, avoiding EMSGSIZE on large TSO superpackets. +const maxGSOBytes = 65000 + +// prepareGSO probes UDP_SEGMENT support and sets u.gsoSupported on success. +// Best-effort; failure leaves it false. +func (u *StdConn) prepareGSO() { + u.maxGSOSegments = 63 //gotta be one less than the max so we can still attach a header + + var probeErr error + if err := u.rawConn.Control(func(fd uintptr) { + probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_SEGMENT, 0) + }); err != nil { + u.l.Info("udp: GSO disabled", "reason", "rawconn control failed", "error", err) + recordCapability("udp.gso.enabled", false) + return + } + if probeErr != nil { + u.l.Info("udp: GSO disabled", "reason", "kernel rejected probe", "error", probeErr) + recordCapability("udp.gso.enabled", false) + return + } + + var un unix.Utsname + if err := unix.Uname(&un); err != nil { + u.l.Info("udp: GSO disabled", "reason", "kernel uname probe failed", "error", err) + recordCapability("udp.gso.enabled", false) + return + } + major, minor := parseRelease(string(un.Release[:])) + if major > 5 || (major == 5 && minor >= 5) { + u.maxGSOSegments = 127 + } + + u.gsoSupported = true + u.l.Info("udp: GSO enabled", "maxGSOSegments", u.maxGSOSegments) + recordCapability("udp.gso.enabled", true) +} + +// udpGROBufferSize sizes the per-entry recvmmsg buffer when UDP_GRO is on. +// The kernel stitches a run of same-flow datagrams into a single skb whose +// length is bounded by sk_gso_max_size (typically 65535); anything larger +// would be MSG_TRUNCed. We use the maximum representable UDP length so a +// full superpacket always lands intact. +const udpGROBufferSize = 65535 + +// udpGROCmsgPayload is the size of the UDP_GRO cmsg data delivered by the +// kernel: a single int (gso_size in bytes). See udp_cmsg_recv() in +// net/ipv4/udp.c. +const udpGROCmsgPayload = 4 + +// prepareGRO turns on UDP_GRO so the kernel coalesces consecutive same-flow +// datagrams into one recvmmsg entry, with a cmsg carrying the gso_size used +// to split them back apart on the application side. 
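// For example, three back-to-back 1300-byte datagrams from one flow can
// land as a single 3900-byte entry whose UDP_GRO cmsg carries
// gso_size=1300; listenOutBatch re-slices [0:1300), [1300:2600) and
// [2600:3900) before invoking the callback once per segment.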
+func (u *StdConn) prepareGRO() { + var probeErr error + if err := u.rawConn.Control(func(fd uintptr) { + probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_GRO, 1) + }); err != nil { + u.l.Info("udp: GRO disabled", "reason", "rawconn control failed", "error", err) + recordCapability("udp.gro.enabled", false) + return + } + if probeErr != nil { + u.l.Info("udp: GRO disabled", "reason", "kernel rejected probe", "error", probeErr) + recordCapability("udp.gro.enabled", false) + return + } + u.groSupported = true + u.l.Info("udp: GRO enabled") + recordCapability("udp.gro.enabled", true) +} + +// prepareECNRecv turns on IP_RECVTOS / IPV6_RECVTCLASS so the outer IP-ECN +// field of each arriving datagram is delivered as ancillary data alongside +// the payload. listenOutBatch reads it via parseRecvCmsg and passes the +// codepoint through the EncReader for RFC 6040 combine on the decap side. +// Best-effort: we keep going on failure. +func (u *StdConn) prepareECNRecv() { + var v4err, v6err error + if err := u.rawConn.Control(func(fd uintptr) { + v4err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_RECVTOS, 1) + if !u.isV4 { + v6err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_RECVTCLASS, 1) + } + }); err != nil { + u.l.Info("udp: outer-ECN RX disabled", "reason", "rawconn control failed", "error", err) + recordCapability("udp.ecn_rx.enabled", false) + return + } + if u.isV4 { //only check the V4 attempt + if v4err != nil { + u.l.Info("udp: outer-ECN RX disabled", "reason", "kernel rejected probe", "error", v4err) + recordCapability("udp.ecn_rx.enabled", false) + } else { + u.ecnRecvSupported = true + u.l.Info("udp: outer-ECN RX enabled") + recordCapability("udp.ecn_rx.enabled", true) + } + return + } else { + if v6err != nil { //no V6 ECN? disable it. + u.l.Info("udp: outer-ECN RX disabled", "reason", "kernel rejected probe", "error", errors.Join(v4err, v6err)) + recordCapability("udp.ecn_rx.enabled", false) + return + } else if v4err != nil { //no V4, but yes V6? Low level warning. Could be a V6-specific bind. + u.l.Debug("udp: outer-ECN RX degraded", "reason", "kernel rejected probe on IPv4", "error", v4err) + } + // all good + u.ecnRecvSupported = true + u.l.Info("udp: outer-ECN RX enabled") + recordCapability("udp.ecn_rx.enabled", true) + return + } +} + +// recordCapability registers (or updates) a boolean gauge for one of the +// kernel-feature probes. Gauges go to 1 when the feature is enabled, 0 when +// it is not — dashboards can show degraded state on partially-supported +// kernels at a glance. Calling repeatedly with the same name updates the +// existing gauge rather than registering a duplicate. 
+func recordCapability(name string, enabled bool) { + g := metrics.GetOrRegisterGauge(name, nil) + if enabled { + g.Update(1) + } else { + g.Update(0) + } +} + func (u *StdConn) SupportsMultipleReaders() bool { return true } @@ -221,16 +444,15 @@ func (u *StdConn) listenOutSingle(r EncReader, flush func()) error { } } -// readSockaddr decodes the source address out of a recvmmsg name buffer -func (u *StdConn) readSockaddr(name []byte) netip.AddrPort { +func getFrom(names [][]byte, i int, isV4 bool) netip.AddrPort { var ip netip.Addr - // It's ok to skip the ok check here, the slicing is the only error that can occur and it will panic - if u.isV4 { - ip, _ = netip.AddrFromSlice(name[4:8]) + // Its ok to skip the ok check here, the slicing is the only error that can occur and it will panic + if isV4 { + ip, _ = netip.AddrFromSlice(names[i][4:8]) } else { - ip, _ = netip.AddrFromSlice(name[8:24]) + ip, _ = netip.AddrFromSlice(names[i][8:24]) } - return netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(name[2:4])) + return netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(names[i][2:4])) } func (u *StdConn) listenOutBatch(r EncReader, flush func()) error { @@ -239,6 +461,16 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error { bufSize := MTU cmsgSpace := 0 + if u.groSupported { + bufSize = udpGROBufferSize + cmsgSpace = unix.CmsgSpace(udpGROCmsgPayload) + } + if u.ecnRecvSupported { + // IP_TOS arrives as 1 byte; IPV6_TCLASS arrives as a 4-byte int. + // Reserve enough for the wider of the two so the same buffer fits + // either family alongside any UDP_GRO cmsg. + cmsgSpace += unix.CmsgSpace(4) + } msgs, buffers, names, _ := u.PrepareRawMessages(u.batch, bufSize, cmsgSpace) //reader needs to capture variables from this function, since it's used as a lambda with rawConn.Read @@ -249,6 +481,11 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error { } for { + if cmsgSpace > 0 { + for i := range msgs { + setMsgControllen(&msgs[i].Hdr, cmsgSpace) + } + } err := u.rawConn.Read(reader) if err != nil { return err @@ -258,13 +495,75 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error { } for i := 0; i < n; i++ { - r(u.readSockaddr(names[i]), buffers[i][:msgs[i].Len], RxMeta{}) + from := getFrom(names, i, u.isV4) + payload := buffers[i][:msgs[i].Len] + + segSize := 0 + outerECN := byte(0) + if cmsgSpace > 0 { + segSize, outerECN = parseRecvCmsg(&msgs[i].Hdr, u.groSupported, u.ecnRecvSupported, u.isV4) + } + + if segSize <= 0 || segSize >= len(payload) { + r(from, payload, RxMeta{OuterECN: outerECN}) + } else { + for off := 0; off < len(payload); off += segSize { + end := off + segSize + if end > len(payload) { + end = len(payload) + } + seg := payload[off:end] + r(from, seg, RxMeta{OuterECN: outerECN}) + } + } } flush() } } +// parseRecvCmsg walks the per-slot ancillary buffer once and extracts up to +// two values of interest in a single pass: the UDP_GRO gso_size (when +// wantGRO is true) and the outer IP-level ECN codepoint stamped on the +// carrier (when wantECN is true). Returns zeros for whichever field is not +// requested or not present. isV4 selects between IP_TOS (1-byte) and +// IPV6_TCLASS (4-byte int) cmsg payloads. 
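// For example, a v4 slot carrying both cmsgs decodes as (sketch):
//
//	gso, ecn := parseRecvCmsg(&msgs[i].Hdr, true, true, true)
//	// gso == 1300 from UDP_GRO; ecn == 0x02 (ECT(0)) from IP_TOS & 0x03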
+func parseRecvCmsg(hdr *msghdr, wantGRO, wantECN bool, isV4 bool) (gso int, ecn byte) { + controllen := int(hdr.Controllen) + if controllen < unix.SizeofCmsghdr || hdr.Control == nil { + return 0, 0 + } + ctrl := unsafe.Slice(hdr.Control, controllen) + off := 0 + for off+unix.SizeofCmsghdr <= len(ctrl) { + ch := (*unix.Cmsghdr)(unsafe.Pointer(&ctrl[off])) + clen := int(ch.Len) + if clen < unix.SizeofCmsghdr || off+clen > len(ctrl) { + return gso, ecn + } + dataOff := off + unix.CmsgLen(0) + switch { + case wantGRO && ch.Level == unix.SOL_UDP && ch.Type == unix.UDP_GRO: + if dataOff+udpGROCmsgPayload <= len(ctrl) { + gso = int(int32(binary.NativeEndian.Uint32(ctrl[dataOff : dataOff+udpGROCmsgPayload]))) + } + case wantECN && isV4 && ch.Level == unix.IPPROTO_IP && ch.Type == unix.IP_TOS: + // IP_TOS arrives as a single byte; only the low 2 bits are ECN. + if dataOff+1 <= len(ctrl) { + ecn = ctrl[dataOff] & 0x03 + } + case wantECN && !isV4 && ch.Level == unix.IPPROTO_IPV6 && ch.Type == unix.IPV6_TCLASS: + // IPV6_TCLASS arrives as a 4-byte int; ECN is the low 2 bits. + if dataOff+4 <= len(ctrl) { + ecn = byte(binary.NativeEndian.Uint32(ctrl[dataOff:dataOff+4])) & 0x03 + } + } + // Advance by the aligned cmsg space. + off += unix.CmsgSpace(clen - unix.CmsgLen(0)) + } + return gso, ecn +} + func (u *StdConn) ListenOut(r EncReader, flush func()) error { if u.batch == 1 { return u.listenOutSingle(r, flush) @@ -279,89 +578,222 @@ func (u *StdConn) WriteTo(b []byte, ip netip.AddrPort) error { } // WriteBatch sends bufs via sendmmsg(2) using the preallocated scratch on -// StdConn. If supported, consecutive packets to the same destination with -// matching segment sizes (all but possibly the last) are coalesced into a -// single mmsghdr entry +// StdConn. Consecutive packets to the same destination with matching segment +// sizes (all but possibly the last) are coalesced into a single mmsghdr entry +// carrying a UDP_SEGMENT cmsg, so one syscall can mix runs of GSO superpackets +// with plain one-off datagrams. Without GSO support every packet is its own +// entry, matching the prior behaviour. // -// If sendmmsg returns an error and zero entries went out, we fall back to +// Chunks larger than the scratch are processed across multiple syscalls. If +// sendmmsg returns an error AND zero entries went out we fall back to // per-packet WriteTo for that chunk so the caller still gets best-effort -// delivery. On a partial send we resume at the first un-acked entry on -// the next iteration. -func (u *StdConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, _ []byte) error { - for i := 0; i < len(bufs); { - chunk := min(len(bufs)-i, len(u.writeMsgs)) +// delivery; on a partial-success error we just replay the remainder. +func (u *StdConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, ecns []byte) error { + if len(bufs) != len(addrs) { + return fmt.Errorf("WriteBatch: len(bufs)=%d != len(addrs)=%d", len(bufs), len(addrs)) + } + if ecns != nil && len(ecns) != len(bufs) { + return fmt.Errorf("WriteBatch: len(ecns)=%d != len(bufs)=%d", len(ecns), len(bufs)) + } - for k := 0; k < chunk; k++ { - u.writeIovs[k].Base = &bufs[i+k][0] - setIovLen(&u.writeIovs[k], len(bufs[i+k])) + // Callers deliver same-destination packets contiguously and in counter + // order, so we run the GSO planner directly without a pre-sort. A + // sorting pass measurably hurt throughput in microbenchmarks while + // providing no observed reordering benefit. 
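// For example, bufs of sizes [1300 1300 1300 900 1300] to one addr plan
// as a single entry with runLen=4 and segSize=1300 (the short 900 must
// close the run); the trailing 1300 then starts a fresh entry.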
- nlen, err := writeSockaddr(u.writeNames[k], addrs[i+k], u.isV4) + i := 0 + for i < len(bufs) { + baseI := i + entry := 0 + iovIdx := 0 + for entry < len(u.writeMsgs) && i < len(bufs) { + iovBudget := len(u.writeIovs) - iovIdx + if iovBudget < 1 { + break + } + runLen, segSize := u.planRun(bufs, addrs, ecns, i, iovBudget) + if runLen == 0 { + break + } + + for k := 0; k < runLen; k++ { + b := bufs[i+k] + if len(b) == 0 { + u.writeIovs[iovIdx+k].Base = nil + setIovLen(&u.writeIovs[iovIdx+k], 0) + } else { + u.writeIovs[iovIdx+k].Base = &b[0] + setIovLen(&u.writeIovs[iovIdx+k], len(b)) + } + } + + nlen, err := writeSockaddr(u.writeNames[entry], addrs[i], u.isV4) if err != nil { return err } - hdr := &u.writeMsgs[k].Hdr - hdr.Iov = &u.writeIovs[k] - setMsgIovlen(hdr, 1) + hdr := &u.writeMsgs[entry].Hdr + hdr.Iov = &u.writeIovs[iovIdx] + setMsgIovlen(hdr, runLen) hdr.Namelen = uint32(nlen) + + var ecn byte + if ecns != nil { + ecn = ecns[i] + } + u.writeEntryCmsg(entry, runLen, segSize, ecn) + + i += runLen + iovIdx += runLen + u.writeEntryEnd[entry] = i + entry++ } - sent, serr := u.sendmmsg(chunk) + if entry == 0 { + return fmt.Errorf("sendmmsg: no progress") + } + + sent, serr := u.sendmmsg(entry) if serr != nil && sent <= 0 { - // sendmmsg returns -1 / sent=0 when entry 0 itself failed; log - // that entry's destination and fall back to per-packet WriteTo - // for the whole chunk so the caller still gets best-effort - // delivery without duplicating packets the kernel accepted. - u.l.Warn("sendmmsg failed, falling back to per-packet WriteTo", - "err", serr, - "entries", chunk, - "entry0_dst", addrs[i], + // Nothing went out for this chunk; fall back to WriteTo for each + // packet that was queued this iteration. We only enter this path + // when sendmmsg returned an error AND zero entries succeeded — + // otherwise the partial-success advance below replays only the + // remainder, avoiding duplicates of already-sent packets. + // + // sent=-1 from sendmmsg means message 0 itself failed (partial + // success returns the count instead), so log entry 0's parameters + // — that's the entry the kernel rejected. + hdr0 := &u.writeMsgs[0].Hdr + runLen0 := u.writeEntryEnd[0] - baseI + seg0 := len(bufs[baseI]) + ecn0 := byte(0) + if ecns != nil { + ecn0 = ecns[baseI] + } + u.l.Warn("sendmmsg had problem", + "sent", sent, "err", serr, + "entries", entry, + "entry0_runLen", runLen0, + "entry0_segSize", seg0, + "entry0_iovlen", hdr0.Iovlen, + "entry0_controllen", hdr0.Controllen, + "entry0_namelen", hdr0.Namelen, + "entry0_ecn", ecn0, + "entry0_dst", addrs[baseI], "isV4", u.isV4, + "gso", u.gsoSupported, + "gro", u.groSupported, ) - for k := 0; k < chunk; k++ { - if werr := u.WriteTo(bufs[i+k], addrs[i+k]); werr != nil { + for k := baseI; k < i; k++ { + if werr := u.WriteTo(bufs[k], addrs[k]); werr != nil { return werr } } - i += chunk continue } - i += sent + if sent == 0 { + return fmt.Errorf("sendmmsg made no progress") + } + // Rewind i to the end of the last successfully sent entry. For a + // full-success send this leaves i unchanged; for a partial send it + // replays the remainder on the next outer-loop iteration. + i = u.writeEntryEnd[sent-1] } return nil } -// sendmmsg issues sendmmsg(2) against the first n entries of u.writeMsgs. -// The bound u.sendmmsgCB is passed to rawConn.Write so no closure is -// allocated per call; inputs and outputs ride on the StdConn fields. 
-func (u *StdConn) sendmmsg(n int) (int, error) { - u.sendmmsgN = n - u.sendmmsgSent = 0 - u.sendmmsgErrno = 0 - if err := u.rawConn.Write(u.sendmmsgCB); err != nil { - return u.sendmmsgSent, err +// planRun groups consecutive packets starting at `start` that can be sent as +// a single UDP GSO superpacket (one sendmmsg entry with UDP_SEGMENT cmsg). +// A run of length 1 means the entry carries no UDP_SEGMENT cmsg and the +// kernel treats it as a plain datagram. Returns the run length and the +// per-segment size (which equals len(bufs[start])). Without GSO support +// every call returns runLen=1. Outer ECN (when ecns != nil) is also a run +// boundary — the kernel stamps one outer codepoint per sendmsg entry, so +// mixing values inside a run would lose information. +func (u *StdConn) planRun(bufs [][]byte, addrs []netip.AddrPort, ecns []byte, start, iovBudget int) (int, int) { + if start >= len(bufs) || iovBudget < 1 { + return 0, 0 } - if u.sendmmsgErrno != 0 { - return u.sendmmsgSent, &net.OpError{Op: "sendmmsg", Err: u.sendmmsgErrno} + segSize := len(bufs[start]) + if !u.gsoSupported || segSize == 0 || segSize > maxGSOBytes { + return 1, segSize } - return u.sendmmsgSent, nil + dst := addrs[start] + var ecn byte + if ecns != nil { + ecn = ecns[start] + } + maxLen := u.maxGSOSegments + if iovBudget < maxLen { + maxLen = iovBudget + } + runLen := 1 + total := segSize + for runLen < maxLen && start+runLen < len(bufs) { + nextLen := len(bufs[start+runLen]) + if nextLen == 0 || nextLen > segSize { + break + } + if addrs[start+runLen] != dst { + break + } + if ecns != nil && ecns[start+runLen] != ecn { + break + } + if total+nextLen > maxGSOBytes { + break + } + total += nextLen + runLen++ + if nextLen < segSize { + // A short packet must be the last in the run. + break + } + } + return runLen, segSize } -// sendmmsgRun is the rawConn.Write callback. It is bound once into -// u.sendmmsgCB at construction so it stays alloc-free in the hot path; -// inputs (sendmmsgN) and outputs (sendmmsgSent, sendmmsgErrno) ride on -// the receiver rather than escaping locals. -func (u *StdConn) sendmmsgRun(fd uintptr) bool { - r1, _, errno := unix.Syscall6(unix.SYS_SENDMMSG, fd, - uintptr(unsafe.Pointer(&u.writeMsgs[0])), uintptr(u.sendmmsgN), - 0, 0, 0, - ) - if errno == syscall.EAGAIN || errno == syscall.EWOULDBLOCK { - return false +// writeEntryCmsg sets up the per-mmsghdr Hdr.Control / Hdr.Controllen for one +// entry. It writes the UDP_SEGMENT payload when runLen >= 2 and the +// IP_TOS/IPV6_TCLASS payload when ecn != 0, then points hdr.Control at the +// smallest contiguous span that covers whichever cmsg(s) actually apply. 
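// For example, with the 24/24-byte slot layout from prepareWriteMessages:
// segment-only points Control at base with Controllen=24, ecn-only at
// base+24 with Controllen=24, both at base with Controllen=48, and
// neither clears Control and Controllen entirely.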
+func (u *StdConn) writeEntryCmsg(entry, runLen, segSize int, ecn byte) { + hdr := &u.writeMsgs[entry].Hdr + useSeg := runLen >= 2 + useEcn := ecn != 0 + base := entry * u.writeCmsgSpace + + if useSeg { + dataOff := base + unix.CmsgLen(0) + binary.NativeEndian.PutUint16(u.writeCmsg[dataOff:dataOff+2], uint16(segSize)) } - u.sendmmsgSent = int(r1) - u.sendmmsgErrno = errno - return true + if useEcn { + dataOff := base + u.writeCmsgSegSpace + unix.CmsgLen(0) + binary.NativeEndian.PutUint32(u.writeCmsg[dataOff:dataOff+4], uint32(ecn)) + } + + switch { + case useSeg && useEcn: + hdr.Control = &u.writeCmsg[base] + setMsgControllen(hdr, u.writeCmsgSpace) + case useSeg: + hdr.Control = &u.writeCmsg[base] + setMsgControllen(hdr, u.writeCmsgSegSpace) + case useEcn: + hdr.Control = &u.writeCmsg[base+u.writeCmsgSegSpace] + setMsgControllen(hdr, u.writeCmsgEcnSpace) + default: + hdr.Control = nil + setMsgControllen(hdr, 0) + } +} + +// sendmmsg issues sendmmsg(2) over u.rawConn against the first n entries +// of u.writeMsgs. Routes through u.rawSend so the per-call kernel callback +// stays alloc-free. +func (u *StdConn) sendmmsg(n int) (int, error) { + return u.rawSend.send(u.rawConn, n) } // writeSockaddr encodes addr into buf (which must be at least @@ -497,3 +929,22 @@ func NewUDPStatsEmitter(udpConns []Conn) func() { } } } + +func parseRelease(r string) (major, minor int) { + // strip anything after the second dot or any non-digit + parts := strings.SplitN(r, ".", 3) + if len(parts) < 2 { + return 0, 0 + } + major, _ = strconv.Atoi(parts[0]) + // minor may have trailing junk like "15-generic" + mp := parts[1] + for i, c := range mp { + if c < '0' || c > '9' { + mp = mp[:i] + break + } + } + minor, _ = strconv.Atoi(mp) + return +} diff --git a/wire/wire.go b/wire/wire.go index 9f580fd8..ac6c4814 100644 --- a/wire/wire.go +++ b/wire/wire.go @@ -7,11 +7,40 @@ type TunPacket struct { Bytes []byte // Meta contains other information to help process the packet correctly, such as offsets for segmentation offloads // Fields in Meta should be as portable/platform-agnostic as possible. - Meta struct{} + Meta GSOInfo } -// PerSegment invokes fn once per segment of pkt. -// This is a stub implementation that does not actually support segmentation -func (t *TunPacket) PerSegment(fn func(seg []byte) error) error { - return fn(t.Bytes) +// GSOInfo describes a kernel-supplied superpacket sitting in Packet.Bytes. +// The zero value means "not a superpacket" — Bytes is one regular IP +// datagram and no segmentation is required. +type GSOInfo struct { + // Size is the GSO segment size: max payload bytes per segment + // (== TCP MSS for TSO, == UDP payload chunk for USO). Zero means + // not a superpacket. + Size uint16 + // HdrLen is the total L3+L4 header length within Bytes (already + // corrected via correctHdrLen, so safe to slice on). + HdrLen uint16 + // CsumStart is the L4 header offset inside Bytes (== L3 header + // length). + CsumStart uint16 + // Proto picks the L4 protocol (TCP or UDP) so the segmenter knows + // which checksum/header layout to apply. + Proto GSOProto } + +// GSOProto selects the L4 protocol for a GSO superpacket. Determines which +// VIRTIO_NET_HDR_GSO_* type the writer stamps and which checksum offset +// inside the transport header virtio NEEDS_CSUM expects. 
type GSOProto uint8
+
+const (
+	GSOProtoNone GSOProto = iota
+	GSOProtoTCP
+	GSOProtoUDP
+)
+
+// IsSuperpacket reports whether g describes a multi-segment GSO/USO
+// superpacket that needs segmentation before its bytes can be encrypted
+// and sent on the wire.
+func (g GSOInfo) IsSuperpacket() bool { return g.Size > 0 }
diff --git a/wire/wire_generic.go b/wire/wire_generic.go
new file mode 100644
index 00000000..0499ad8a
--- /dev/null
+++ b/wire/wire_generic.go
@@ -0,0 +1,10 @@
+//go:build !linux
+// +build !linux
+
+package wire
+
+// PerSegment invokes fn once per segment of pkt.
+// This is a stub implementation that does not actually support segmentation.
+func (t *TunPacket) PerSegment(fn func(seg []byte) error) error {
+	return fn(t.Bytes)
+}
diff --git a/wire/wire_linux.go b/wire/wire_linux.go
new file mode 100644
index 00000000..eab006df
--- /dev/null
+++ b/wire/wire_linux.go
@@ -0,0 +1,32 @@
+package wire
+
+import (
+	"fmt"
+
+	"github.com/slackhq/nebula/overlay/tio/virtio"
+)
+
+// PerSegment invokes fn once per segment of pkt. For non-GSO packets
+// fn is called once with pkt.Bytes (no segmentation, no copy). For GSO/USO
+// superpackets fn is called once per segment with a slice of pkt.Bytes
+// holding that segment's plaintext (a freshly-patched L3+L4 header sliced
+// in front of the original payload chunk). Segmentation is destructive:
+// pkt is consumed by this call and its bytes are in an undefined state
+// when PerSegment returns. Callers must not retain pkt, or any earlier
+// seg slice, past fn's return for that segment. No scratch buffer is
+// needed here: segment headers are patched in place rather than copied.
+// Aborts and returns the first error from fn or from per-segment
+// construction.
+func (t *TunPacket) PerSegment(fn func(seg []byte) error) error {
+	if !t.Meta.IsSuperpacket() {
+		return fn(t.Bytes)
+	}
+	switch t.Meta.Proto {
+	case GSOProtoTCP:
+		return virtio.SegmentTCP(t.Bytes, t.Meta.HdrLen, t.Meta.CsumStart, t.Meta.Size, fn)
+	case GSOProtoUDP:
+		return virtio.SegmentUDP(t.Bytes, t.Meta.HdrLen, t.Meta.CsumStart, t.Meta.Size, fn)
+	default:
+		return fmt.Errorf("unsupported gso proto: %d", t.Meta.Proto)
+	}
+}
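A usage sketch for PerSegment (hypothetical caller; encryptAndSend stands
in for the real encrypt-and-transmit path):

	err := pkt.PerSegment(func(seg []byte) error {
		// seg is only valid inside this callback: the destructive
		// segmentation reuses pkt.Bytes for the next segment's header.
		return encryptAndSend(seg)
	})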