diff --git a/inside.go b/inside.go index 0fa841da..223a1c02 100644 --- a/inside.go +++ b/inside.go @@ -15,14 +15,7 @@ import ( "github.com/slackhq/nebula/routing" ) -func (f *Interface) consumeInsidePacket(pkt tio.Packet, fwPacket *firewall.Packet, nb []byte, sendBatch batch.TxBatcher, rejectBuf []byte, q int, localCache firewall.ConntrackCache) { - // borrowed: pkt.Bytes is owned by the originating tio.Queue and is - // only valid until the next Read on that queue. Every consumer below - // (parse, self-forward, handshake cache, sendInsideMessage) reads it - // synchronously; do not retain pkt outside this call. If a future - // caller needs to keep the packet, use pkt.Clone() to detach it from - // the borrow. - // +func (f *Interface) consumeInsidePacket(pkt wire.Packet, fwPacket *firewall.Packet, nb []byte, sendBatch batch.TxBatcher, rejectBuf []byte, q int, localCache firewall.ConntrackCache) { // pkt.Bytes is either one IP datagram (GSO zero) or a TSO/USO // superpacket. In both cases the L3+L4 headers at the start describe // the same 5-tuple every segment will share, so a single newPacket / @@ -52,10 +45,6 @@ func (f *Interface) consumeInsidePacket(pkt tio.Packet, fwPacket *firewall.Packe // routes packets from the Nebula addr to the Nebula addr through the Nebula // TUN device. if immediatelyForwardToSelf { - // Write copies into the kernel queue synchronously, so seg's lifetime ends at return. - // A self-forwarded superpacket would be re-handed to the - // kernel as one giant blob; segment first so the loopback - // path sees one IP datagram per Write. err := tio.SegmentSuperpacket(pkt, func(seg []byte) error { _, werr := f.readers[q].Write(seg) return werr @@ -107,7 +96,7 @@ func (f *Interface) consumeInsidePacket(pkt tio.Packet, fwPacket *firewall.Packe dropReason := f.firewall.Drop(*fwPacket, false, hostinfo, f.pki.GetCAPool(), localCache) if dropReason == nil { - f.sendInsideMessage(hostinfo, pkt, nb, sendBatch, rejectBuf, q) + f.sendInsideMessage(hostinfo, pkt, nb, sendBatch) } else { f.rejectInside(packet, rejectBuf, q) if f.l.Enabled(context.Background(), slog.LevelDebug) { @@ -521,6 +510,10 @@ func (f *Interface) SendVia(via *HostInfo, nocopy bool, ) { toSend, err := f.prepareSendVia(via, relay, ad, nb, out, nocopy) + if err != nil { + via.logger(f.l).Info("Failed to prepareSendVia", "error", err) + return + } err = f.writers[0].WriteTo(toSend, via.remote) if err != nil { via.logger(f.l).Info("Failed to WriteTo in sendVia", "error", err) diff --git a/interface.go b/interface.go index 303771d6..0402e68b 100644 --- a/interface.go +++ b/interface.go @@ -407,7 +407,7 @@ func (f *Interface) listenIn(reader tio.Queue, q int) { f.consumeInsidePacket(packets[i], fwPacket, nb, sb, rejectBuf, q, ctCache) } if err := sb.Flush(); err != nil { - f.l.Error("Failed to write outgoing batch", "error", err, "writer", i) + f.l.Error("Failed to write outgoing batch", "error", err, "writer", q) } } diff --git a/overlay/batch/tx_batch.go b/overlay/batch/tx_batch.go index 38f86b25..599bc306 100644 --- a/overlay/batch/tx_batch.go +++ b/overlay/batch/tx_batch.go @@ -1,9 +1,18 @@ package batch -import "net/netip" +import ( + "net/netip" + + "github.com/slackhq/nebula/udp" +) const SendBatchCap = 128 +// DefaultSendBatchArenaCap is the recommended arena capacity for a +// standalone SendBatch: 128 slots × (udp.MTU + 32) ≈ 1.1 MiB. The +32 covers +// the nebula header + AEAD tag tacked onto each plaintext segment. +const DefaultSendBatchArenaCap = SendBatchCap * (udp.MTU + 32) + // batchWriter is the minimal subset of udp.Conn needed by SendBatch to flush. type batchWriter interface { WriteBatch(bufs [][]byte, addrs []netip.AddrPort, outerECNs []byte) error @@ -11,38 +20,29 @@ type batchWriter interface { // SendBatch accumulates encrypted UDP packets and flushes them via WriteBatch. // One SendBatch is owned by each listenIn goroutine; no locking is needed. -// The backing arena grows on demand: when there isn't room for the next slot -// we allocate a fresh backing array. Already-committed slices keep referencing -// the old array and remain valid until Flush drops them. +// Slot bytes are borrowed from the injected Arena and remain valid until +// Flush, which Resets the arena. type SendBatch struct { - out batchWriter - bufs [][]byte - dsts []netip.AddrPort - ecns []byte - backing []byte + out batchWriter + bufs [][]byte + dsts []netip.AddrPort + ecns []byte + arena *Arena } -// NewSendBatch makes a SendBatch with batchCap slots and an arenaSize byte buffer for slices to back those slots -func NewSendBatch(out batchWriter, batchCap, arenaSize int) *SendBatch { +// NewSendBatch makes a SendBatch with batchCap slots backed by arena. +func NewSendBatch(out batchWriter, batchCap int, arena *Arena) *SendBatch { return &SendBatch{ - out: out, - bufs: make([][]byte, 0, batchCap), - dsts: make([]netip.AddrPort, 0, batchCap), - ecns: make([]byte, 0, batchCap), - backing: make([]byte, 0, arenaSize), + out: out, + bufs: make([][]byte, 0, batchCap), + dsts: make([]netip.AddrPort, 0, batchCap), + ecns: make([]byte, 0, batchCap), + arena: arena, } } func (b *SendBatch) Reserve(sz int) []byte { - if len(b.backing)+sz > cap(b.backing) { - // Grow: allocate a fresh backing. Already-committed slices still - // reference the old array and remain valid until Flush drops them. - newCap := max(cap(b.backing)*2, sz) - b.backing = make([]byte, 0, newCap) - } - start := len(b.backing) - b.backing = b.backing[:start+sz] - return b.backing[start : start+sz : start+sz] + return b.arena.Reserve(sz) } func (b *SendBatch) Commit(pkt []byte, dst netip.AddrPort, outerECN byte) { @@ -60,6 +60,6 @@ func (b *SendBatch) Flush() error { b.bufs = b.bufs[:0] b.dsts = b.dsts[:0] b.ecns = b.ecns[:0] - b.backing = b.backing[:0] + b.arena.Reset() return err } diff --git a/overlay/batch/tx_batch_test.go b/overlay/batch/tx_batch_test.go index 454011dc..59b06e58 100644 --- a/overlay/batch/tx_batch_test.go +++ b/overlay/batch/tx_batch_test.go @@ -27,7 +27,7 @@ func (w *fakeBatchWriter) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, ecns func TestSendBatchReserveCommitFlush(t *testing.T) { fw := &fakeBatchWriter{} - b := NewSendBatch(fw, 4, 32) + b := NewSendBatch(fw, 4, NewArena(32)) ap := netip.MustParseAddrPort("10.0.0.1:4242") for i := 0; i < 4; i++ { @@ -71,7 +71,7 @@ func TestSendBatchReserveCommitFlush(t *testing.T) { func TestSendBatchSlotsDoNotOverlap(t *testing.T) { fw := &fakeBatchWriter{} - b := NewSendBatch(fw, 3, 8) + b := NewSendBatch(fw, 3, NewArena(8)) ap := netip.MustParseAddrPort("10.0.0.1:80") for i := 0; i < 3; i++ { @@ -93,7 +93,7 @@ func TestSendBatchSlotsDoNotOverlap(t *testing.T) { func TestSendBatchGrowPreservesCommitted(t *testing.T) { fw := &fakeBatchWriter{} // Tiny initial backing forces a grow on the second Reserve. - b := NewSendBatch(fw, 1, 4) + b := NewSendBatch(fw, 1, NewArena(4)) ap := netip.MustParseAddrPort("10.0.0.1:80") s1 := b.Reserve(4) diff --git a/overlay/device.go b/overlay/device.go index cff3ac7d..8044ee75 100644 --- a/overlay/device.go +++ b/overlay/device.go @@ -8,6 +8,10 @@ import ( "github.com/slackhq/nebula/routing" ) +// defaultBatchBufSize is the per-Queue scratch size for Read on backends +// that don't do TSO segmentation. 65535 covers any single IP packet. +const defaultBatchBufSize = 65535 + type Device interface { io.Closer Activate() error diff --git a/overlay/tio/segment.go b/overlay/tio/segment.go new file mode 100644 index 00000000..67648ad2 --- /dev/null +++ b/overlay/tio/segment.go @@ -0,0 +1,12 @@ +package tio + +import "fmt" + +// SegmentSuperpacket invokes fn once per segment of pkt. +// This is a stub implementation that does not actually support segmentation +func SegmentSuperpacket(pkt Packet, fn func(seg []byte) error) error { + if pkt.GSO.IsSuperpacket() { + return fmt.Errorf("tio: GSO superpacket on platform without segmentation support") + } + return fn(pkt.Bytes) +} diff --git a/udp/udp_linux.go b/udp/udp_linux.go index a1b43c5b..6465be32 100644 --- a/udp/udp_linux.go +++ b/udp/udp_linux.go @@ -6,13 +6,10 @@ package udp import ( "context" "encoding/binary" - "errors" "fmt" "log/slog" "net" "net/netip" - "strconv" - "strings" "syscall" "unsafe" @@ -35,44 +32,14 @@ type StdConn struct { writeIovs []iovec writeNames [][]byte - // Per-entry cmsg scratch. writeCmsg is one contiguous slab of - // MaxWriteBatch * writeCmsgSpace bytes; each entry holds two cmsg - // headers (UDP_SEGMENT then IP_TOS / IPV6_TCLASS) pre-filled once in - // prepareWriteMessages. WriteBatch only rewrites the per-call data - // payloads and toggles Hdr.Control / Hdr.Controllen to point at - // whichever subset of the two cmsgs applies. - writeCmsg []byte - writeCmsgSpace int - writeCmsgSegSpace int - writeCmsgEcnSpace int - - // writeEntryEnd[e] is the bufs index *after* the last packet packed - // into mmsghdr entry e. Used to rewind `i` on partial sendmmsg success. - writeEntryEnd []int - - // rawSend wraps the sendmmsg(2) callback in a closure-free helper so - // the hot path doesn't heap-allocate a fresh closure per call. - rawSend rawSendmmsg - - // UDP GSO (sendmsg with UDP_SEGMENT cmsg) support. gsoSupported is - // probed once at socket creation. When true, WriteBatch packs same- - // destination consecutive packets into a single sendmmsg entry with a - // UDP_SEGMENT cmsg; otherwise each packet is its own entry. - gsoSupported bool - maxGSOSegments int - - // UDP GRO (recvmsg with UDP_GRO cmsg) support. groSupported is probed - // once at socket creation. When true, listenOutBatch allocates larger - // RX buffers and a per-entry cmsg slot so the kernel can coalesce - // consecutive same-flow datagrams into a single recvmmsg entry; the - // delivered cmsg carries the gso_size used to split them back apart. - groSupported bool - - // ecnRecvSupported is true when IP_RECVTOS / IPV6_RECVTCLASS was - // successfully enabled — the kernel will deliver the outer IP-ECN of - // each arriving datagram as a per-slot cmsg, and listenOutBatch passes - // the parsed value to the EncReader callback for RFC 6040 combine. - ecnRecvSupported bool + // sendmmsg(2) callback state. sendmmsgCB is bound once in NewListener + // to the sendmmsgRun method value so passing it to rawConn.Write does + // not allocate a fresh closure per send; sendmmsgN/Sent/Errno carry + // the inputs and outputs across the call without escaping locals. + sendmmsgCB func(fd uintptr) bool + sendmmsgN int + sendmmsgSent int + sendmmsgErrno syscall.Errno } func setReusePort(network, address string, c syscall.RawConn) error { @@ -106,11 +73,10 @@ func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int) } //gotta find out if we got an AF_INET6 socket or not: out := &StdConn{ - udpConn: udpConn, - rawConn: rawConn, - l: l, - batch: batch, - maxGSOSegments: 1, + udpConn: udpConn, + rawConn: rawConn, + l: l, + batch: batch, } af, err := out.getSockOptInt(unix.SO_DOMAIN) @@ -121,71 +87,15 @@ func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int) out.isV4 = af == unix.AF_INET out.prepareWriteMessages(MaxWriteBatch) - out.rawSend.msgs = out.writeMsgs - out.rawSend.bind() - - out.prepareGSO() - // GRO delivers coalesced superpackets that need a cmsg to split back - // into segments. The single-packet RX path uses ReadFromUDPAddrPort - // and cannot see that cmsg, so only enable GRO for the batch path. - if batch > 1 { - out.prepareGRO() - } - // Best-effort: ask the kernel to deliver outer IP-ECN as ancillary data - // on every recvmmsg slot so the decap side can apply RFC 6040 combine. - // On older kernels these may not exist; failing here just means we get - // 0 (Not-ECT) on every slot, which is the same as ecn_mode=disable. - out.prepareECNRecv() + out.sendmmsgCB = out.sendmmsgRun return out, nil } -// prepareWriteMessages allocates one mmsghdr/iovec/sockaddr/cmsg scratch -// slot per sendmmsg entry. The iovec slab is sized to n so all entries' -// iovecs share one allocation; per-entry fan-out is further capped at -// maxGSOSegments. Hdr.Iov / Hdr.Iovlen / Hdr.Control / Hdr.Controllen are -// wired per call since each entry can span a variable number of iovecs -// and may or may not carry a cmsg. -// -// Per-mmsghdr cmsg layout. Each entry's slot of length writeCmsgSpace holds -// up to two cmsg headers placed at fixed offsets: -// -// [0 .. writeCmsgSegSpace) UDP_SEGMENT (gso_size, uint16) -// [writeCmsgSegSpace .. writeCmsgSpace) IP_TOS or IPV6_TCLASS (int32) -// -// Both headers are pre-filled once here; per-call we only rewrite the data -// payload and toggle Hdr.Control / Hdr.Controllen to point at whichever -// subset applies (none / segment-only / ecn-only / both). func (u *StdConn) prepareWriteMessages(n int) { u.writeMsgs = make([]rawMessage, n) u.writeIovs = make([]iovec, n) u.writeNames = make([][]byte, n) - u.writeEntryEnd = make([]int, n) - - u.writeCmsgSegSpace = unix.CmsgSpace(2) - u.writeCmsgEcnSpace = unix.CmsgSpace(4) - u.writeCmsgSpace = u.writeCmsgSegSpace + u.writeCmsgEcnSpace - u.writeCmsg = make([]byte, n*u.writeCmsgSpace) - - ecnLevel := int32(unix.IPPROTO_IP) - ecnType := int32(unix.IP_TOS) - if !u.isV4 { - ecnLevel = unix.IPPROTO_IPV6 - ecnType = unix.IPV6_TCLASS - } - - for k := 0; k < n; k++ { - base := k * u.writeCmsgSpace - seg := (*unix.Cmsghdr)(unsafe.Pointer(&u.writeCmsg[base])) - seg.Level = unix.SOL_UDP - seg.Type = unix.UDP_SEGMENT - setCmsgLen(seg, unix.CmsgLen(2)) - - ecn := (*unix.Cmsghdr)(unsafe.Pointer(&u.writeCmsg[base+u.writeCmsgSegSpace])) - ecn.Level = ecnLevel - ecn.Type = ecnType - setCmsgLen(ecn, unix.CmsgLen(4)) - } for i := range u.writeMsgs { u.writeNames[i] = make([]byte, unix.SizeofSockaddrInet6) @@ -193,139 +103,6 @@ func (u *StdConn) prepareWriteMessages(n int) { } } -// maxGSOBytes bounds the total payload per sendmsg() when UDP_SEGMENT is -// set. The kernel stitches all iovecs into a single skb whose length the -// UDP length field can represent, and also enforces sk_gso_max_size (which -// on most devices is 65536). We use 65000 to leave headroom under the -// 65535 UDP-length cap, avoiding EMSGSIZE on large TSO superpackets. -const maxGSOBytes = 65000 - -// prepareGSO probes UDP_SEGMENT support and sets u.gsoSupported on success. -// Best-effort; failure leaves it false. -func (u *StdConn) prepareGSO() { - u.maxGSOSegments = 63 //gotta be one less than the max so we can still attach a header - - var probeErr error - if err := u.rawConn.Control(func(fd uintptr) { - probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_SEGMENT, 0) - }); err != nil { - u.l.Info("udp: GSO disabled", "reason", "rawconn control failed", "error", err) - recordCapability("udp.gso.enabled", false) - return - } - if probeErr != nil { - u.l.Info("udp: GSO disabled", "reason", "kernel rejected probe", "error", probeErr) - recordCapability("udp.gso.enabled", false) - return - } - - var un unix.Utsname - if err := unix.Uname(&un); err != nil { - u.l.Info("udp: GSO disabled", "reason", "kernel uname probe failed", "error", err) - recordCapability("udp.gso.enabled", false) - return - } - major, minor := parseRelease(string(un.Release[:])) - if major > 5 || (major == 5 && minor >= 5) { - u.maxGSOSegments = 127 - } - - u.gsoSupported = true - u.l.Info("udp: GSO enabled", "maxGSOSegments", u.maxGSOSegments) - recordCapability("udp.gso.enabled", true) -} - -// udpGROBufferSize sizes the per-entry recvmmsg buffer when UDP_GRO is on. -// The kernel stitches a run of same-flow datagrams into a single skb whose -// length is bounded by sk_gso_max_size (typically 65535); anything larger -// would be MSG_TRUNCed. We use the maximum representable UDP length so a -// full superpacket always lands intact. -const udpGROBufferSize = 65535 - -// udpGROCmsgPayload is the size of the UDP_GRO cmsg data delivered by the -// kernel: a single int (gso_size in bytes). See udp_cmsg_recv() in -// net/ipv4/udp.c. -const udpGROCmsgPayload = 4 - -// prepareGRO turns on UDP_GRO so the kernel coalesces consecutive same-flow -// datagrams into one recvmmsg entry, with a cmsg carrying the gso_size used -// to split them back apart on the application side. -func (u *StdConn) prepareGRO() { - var probeErr error - if err := u.rawConn.Control(func(fd uintptr) { - probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_GRO, 1) - }); err != nil { - u.l.Info("udp: GRO disabled", "reason", "rawconn control failed", "error", err) - recordCapability("udp.gro.enabled", false) - return - } - if probeErr != nil { - u.l.Info("udp: GRO disabled", "reason", "kernel rejected probe", "error", probeErr) - recordCapability("udp.gro.enabled", false) - return - } - u.groSupported = true - u.l.Info("udp: GRO enabled") - recordCapability("udp.gro.enabled", true) -} - -// prepareECNRecv turns on IP_RECVTOS / IPV6_RECVTCLASS so the outer IP-ECN -// field of each arriving datagram is delivered as ancillary data alongside -// the payload. listenOutBatch reads it via parseRecvCmsg and passes the -// codepoint through the EncReader for RFC 6040 combine on the decap side. -// Best-effort: we keep going on failure. -func (u *StdConn) prepareECNRecv() { - var v4err, v6err error - if err := u.rawConn.Control(func(fd uintptr) { - v4err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_RECVTOS, 1) - if !u.isV4 { - v6err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_RECVTCLASS, 1) - } - }); err != nil { - u.l.Info("udp: outer-ECN RX disabled", "reason", "rawconn control failed", "error", err) - recordCapability("udp.ecn_rx.enabled", false) - return - } - if u.isV4 { //only check the V4 attempt - if v4err != nil { - u.l.Info("udp: outer-ECN RX disabled", "reason", "kernel rejected probe", "error", v4err) - recordCapability("udp.ecn_rx.enabled", false) - } else { - u.ecnRecvSupported = true - u.l.Info("udp: outer-ECN RX enabled") - recordCapability("udp.ecn_rx.enabled", true) - } - return - } else { - if v6err != nil { //no V6 ECN? disable it. - u.l.Info("udp: outer-ECN RX disabled", "reason", "kernel rejected probe", "error", errors.Join(v4err, v6err)) - recordCapability("udp.ecn_rx.enabled", false) - return - } else if v4err != nil { //no V4, but yes V6? Low level warning. Could be a V6-specific bind. - u.l.Debug("udp: outer-ECN RX degraded", "reason", "kernel rejected probe on IPv4", "error", v4err) - } - // all good - u.ecnRecvSupported = true - u.l.Info("udp: outer-ECN RX enabled") - recordCapability("udp.ecn_rx.enabled", true) - return - } -} - -// recordCapability registers (or updates) a boolean gauge for one of the -// kernel-feature probes. Gauges go to 1 when the feature is enabled, 0 when -// it is not — dashboards can show degraded state on partially-supported -// kernels at a glance. Calling repeatedly with the same name updates the -// existing gauge rather than registering a duplicate. -func recordCapability(name string, enabled bool) { - g := metrics.GetOrRegisterGauge(name, nil) - if enabled { - g.Update(1) - } else { - g.Update(0) - } -} - func (u *StdConn) SupportsMultipleReaders() bool { return true } @@ -444,15 +221,16 @@ func (u *StdConn) listenOutSingle(r EncReader, flush func()) error { } } -func getFrom(names [][]byte, i int, isV4 bool) netip.AddrPort { +// readSockaddr decodes the source address out of a recvmmsg name buffer +func (u *StdConn) readSockaddr(name []byte) netip.AddrPort { var ip netip.Addr - // Its ok to skip the ok check here, the slicing is the only error that can occur and it will panic - if isV4 { - ip, _ = netip.AddrFromSlice(names[i][4:8]) + // It's ok to skip the ok check here, the slicing is the only error that can occur and it will panic + if u.isV4 { + ip, _ = netip.AddrFromSlice(name[4:8]) } else { - ip, _ = netip.AddrFromSlice(names[i][8:24]) + ip, _ = netip.AddrFromSlice(name[8:24]) } - return netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(names[i][2:4])) + return netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(name[2:4])) } func (u *StdConn) listenOutBatch(r EncReader, flush func()) error { @@ -461,16 +239,6 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error { bufSize := MTU cmsgSpace := 0 - if u.groSupported { - bufSize = udpGROBufferSize - cmsgSpace = unix.CmsgSpace(udpGROCmsgPayload) - } - if u.ecnRecvSupported { - // IP_TOS arrives as 1 byte; IPV6_TCLASS arrives as a 4-byte int. - // Reserve enough for the wider of the two so the same buffer fits - // either family alongside any UDP_GRO cmsg. - cmsgSpace += unix.CmsgSpace(4) - } msgs, buffers, names, _ := u.PrepareRawMessages(u.batch, bufSize, cmsgSpace) //reader needs to capture variables from this function, since it's used as a lambda with rawConn.Read @@ -481,11 +249,6 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error { } for { - if cmsgSpace > 0 { - for i := range msgs { - setMsgControllen(&msgs[i].Hdr, cmsgSpace) - } - } err := u.rawConn.Read(reader) if err != nil { return err @@ -495,84 +258,13 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error { } for i := 0; i < n; i++ { - from := getFrom(names, i, u.isV4) - payload := buffers[i][:msgs[i].Len] - - segSize := 0 - outerECN := byte(0) - if cmsgSpace > 0 { - segSize, outerECN = parseRecvCmsg(&msgs[i].Hdr, u.groSupported, u.ecnRecvSupported, u.isV4) - } - - if segSize <= 0 || segSize >= len(payload) { - r(from, payload, RxMeta{OuterECN: outerECN}) - } else { - for off := 0; off < len(payload); off += segSize { - end := off + segSize - if end > len(payload) { - end = len(payload) - } - seg := payload[off:end] - r(from, seg, RxMeta{OuterECN: outerECN}) - } - } + r(u.readSockaddr(names[i]), buffers[i][:msgs[i].Len], RxMeta{}) } flush() } } -// headerCounter returns the big-endian uint64 message counter at bytes -// [8:16] of a nebula packet, or 0 if the buffer is too short. -func headerCounter(buf []byte) uint64 { - if len(buf) < 16 { - return 0 - } - return binary.BigEndian.Uint64(buf[8:16]) -} - -// parseRecvCmsg walks the per-slot ancillary buffer once and extracts up to -// two values of interest in a single pass: the UDP_GRO gso_size (when -// wantGRO is true) and the outer IP-level ECN codepoint stamped on the -// carrier (when wantECN is true). Returns zeros for whichever field is not -// requested or not present. isV4 selects between IP_TOS (1-byte) and -// IPV6_TCLASS (4-byte int) cmsg payloads. -func parseRecvCmsg(hdr *msghdr, wantGRO, wantECN bool, isV4 bool) (gso int, ecn byte) { - controllen := int(hdr.Controllen) - if controllen < unix.SizeofCmsghdr || hdr.Control == nil { - return 0, 0 - } - ctrl := unsafe.Slice(hdr.Control, controllen) - off := 0 - for off+unix.SizeofCmsghdr <= len(ctrl) { - ch := (*unix.Cmsghdr)(unsafe.Pointer(&ctrl[off])) - clen := int(ch.Len) - if clen < unix.SizeofCmsghdr || off+clen > len(ctrl) { - return gso, ecn - } - dataOff := off + unix.CmsgLen(0) - switch { - case wantGRO && ch.Level == unix.SOL_UDP && ch.Type == unix.UDP_GRO: - if dataOff+udpGROCmsgPayload <= len(ctrl) { - gso = int(int32(binary.NativeEndian.Uint32(ctrl[dataOff : dataOff+udpGROCmsgPayload]))) - } - case wantECN && isV4 && ch.Level == unix.IPPROTO_IP && ch.Type == unix.IP_TOS: - // IP_TOS arrives as a single byte; only the low 2 bits are ECN. - if dataOff+1 <= len(ctrl) { - ecn = ctrl[dataOff] & 0x03 - } - case wantECN && !isV4 && ch.Level == unix.IPPROTO_IPV6 && ch.Type == unix.IPV6_TCLASS: - // IPV6_TCLASS arrives as a 4-byte int; ECN is the low 2 bits. - if dataOff+4 <= len(ctrl) { - ecn = byte(binary.NativeEndian.Uint32(ctrl[dataOff:dataOff+4])) & 0x03 - } - } - // Advance by the aligned cmsg space. - off += unix.CmsgSpace(clen - unix.CmsgLen(0)) - } - return gso, ecn -} - func (u *StdConn) ListenOut(r EncReader, flush func()) error { if u.batch == 1 { return u.listenOutSingle(r, flush) @@ -587,222 +279,89 @@ func (u *StdConn) WriteTo(b []byte, ip netip.AddrPort) error { } // WriteBatch sends bufs via sendmmsg(2) using the preallocated scratch on -// StdConn. Consecutive packets to the same destination with matching segment -// sizes (all but possibly the last) are coalesced into a single mmsghdr entry -// carrying a UDP_SEGMENT cmsg, so one syscall can mix runs of GSO superpackets -// with plain one-off datagrams. Without GSO support every packet is its own -// entry, matching the prior behaviour. +// StdConn. If supported, consecutive packets to the same destination with +// matching segment sizes (all but possibly the last) are coalesced into a +// single mmsghdr entry // -// Chunks larger than the scratch are processed across multiple syscalls. If -// sendmmsg returns an error AND zero entries went out we fall back to +// If sendmmsg returns an error and zero entries went out, we fall back to // per-packet WriteTo for that chunk so the caller still gets best-effort -// delivery; on a partial-success error we just replay the remainder. -func (u *StdConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, ecns []byte) error { - if len(bufs) != len(addrs) { - return fmt.Errorf("WriteBatch: len(bufs)=%d != len(addrs)=%d", len(bufs), len(addrs)) - } - if ecns != nil && len(ecns) != len(bufs) { - return fmt.Errorf("WriteBatch: len(ecns)=%d != len(bufs)=%d", len(ecns), len(bufs)) - } +// delivery. On a partial send we resume at the first un-acked entry on +// the next iteration. +func (u *StdConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, _ []byte) error { + for i := 0; i < len(bufs); { + chunk := min(len(bufs)-i, len(u.writeMsgs)) - // Callers deliver same-destination packets contiguously and in counter - // order, so we run the GSO planner directly without a pre-sort. A - // sorting pass measurably hurt throughput in microbenchmarks while - // providing no observed reordering benefit. + for k := 0; k < chunk; k++ { + u.writeIovs[k].Base = &bufs[i+k][0] + setIovLen(&u.writeIovs[k], len(bufs[i+k])) - i := 0 - for i < len(bufs) { - baseI := i - entry := 0 - iovIdx := 0 - for entry < len(u.writeMsgs) && i < len(bufs) { - iovBudget := len(u.writeIovs) - iovIdx - if iovBudget < 1 { - break - } - runLen, segSize := u.planRun(bufs, addrs, ecns, i, iovBudget) - if runLen == 0 { - break - } - - for k := 0; k < runLen; k++ { - b := bufs[i+k] - if len(b) == 0 { - u.writeIovs[iovIdx+k].Base = nil - setIovLen(&u.writeIovs[iovIdx+k], 0) - } else { - u.writeIovs[iovIdx+k].Base = &b[0] - setIovLen(&u.writeIovs[iovIdx+k], len(b)) - } - } - - nlen, err := writeSockaddr(u.writeNames[entry], addrs[i], u.isV4) + nlen, err := writeSockaddr(u.writeNames[k], addrs[i+k], u.isV4) if err != nil { return err } - hdr := &u.writeMsgs[entry].Hdr - hdr.Iov = &u.writeIovs[iovIdx] - setMsgIovlen(hdr, runLen) + hdr := &u.writeMsgs[k].Hdr + hdr.Iov = &u.writeIovs[k] + setMsgIovlen(hdr, 1) hdr.Namelen = uint32(nlen) - - var ecn byte - if ecns != nil { - ecn = ecns[i] - } - u.writeEntryCmsg(entry, runLen, segSize, ecn) - - i += runLen - iovIdx += runLen - u.writeEntryEnd[entry] = i - entry++ } - if entry == 0 { - return fmt.Errorf("sendmmsg: no progress") - } - - sent, serr := u.sendmmsg(entry) + sent, serr := u.sendmmsg(chunk) if serr != nil && sent <= 0 { - // Nothing went out for this chunk; fall back to WriteTo for each - // packet that was queued this iteration. We only enter this path - // when sendmmsg returned an error AND zero entries succeeded — - // otherwise the partial-success advance below replays only the - // remainder, avoiding duplicates of already-sent packets. - // - // sent=-1 from sendmmsg means message 0 itself failed (partial - // success returns the count instead), so log entry 0's parameters - // — that's the entry the kernel rejected. - hdr0 := &u.writeMsgs[0].Hdr - runLen0 := u.writeEntryEnd[0] - baseI - seg0 := len(bufs[baseI]) - ecn0 := byte(0) - if ecns != nil { - ecn0 = ecns[baseI] - } - u.l.Warn("sendmmsg had problem", - "sent", sent, "err", serr, - "entries", entry, - "entry0_runLen", runLen0, - "entry0_segSize", seg0, - "entry0_iovlen", hdr0.Iovlen, - "entry0_controllen", hdr0.Controllen, - "entry0_namelen", hdr0.Namelen, - "entry0_ecn", ecn0, - "entry0_dst", addrs[baseI], + // sendmmsg returns -1 / sent=0 when entry 0 itself failed; log + // that entry's destination and fall back to per-packet WriteTo + // for the whole chunk so the caller still gets best-effort + // delivery without duplicating packets the kernel accepted. + u.l.Warn("sendmmsg failed, falling back to per-packet WriteTo", + "err", serr, + "entries", chunk, + "entry0_dst", addrs[i], "isV4", u.isV4, - "gso", u.gsoSupported, - "gro", u.groSupported, ) - for k := baseI; k < i; k++ { - if werr := u.WriteTo(bufs[k], addrs[k]); werr != nil { + for k := 0; k < chunk; k++ { + if werr := u.WriteTo(bufs[i+k], addrs[i+k]); werr != nil { return werr } } + i += chunk continue } - if sent == 0 { - return fmt.Errorf("sendmmsg made no progress") - } - // Rewind i to the end of the last successfully sent entry. For a - // full-success send this leaves i unchanged; for a partial send it - // replays the remainder on the next outer-loop iteration. - i = u.writeEntryEnd[sent-1] + i += sent } return nil } -// planRun groups consecutive packets starting at `start` that can be sent as -// a single UDP GSO superpacket (one sendmmsg entry with UDP_SEGMENT cmsg). -// A run of length 1 means the entry carries no UDP_SEGMENT cmsg and the -// kernel treats it as a plain datagram. Returns the run length and the -// per-segment size (which equals len(bufs[start])). Without GSO support -// every call returns runLen=1. Outer ECN (when ecns != nil) is also a run -// boundary — the kernel stamps one outer codepoint per sendmsg entry, so -// mixing values inside a run would lose information. -func (u *StdConn) planRun(bufs [][]byte, addrs []netip.AddrPort, ecns []byte, start, iovBudget int) (int, int) { - if start >= len(bufs) || iovBudget < 1 { - return 0, 0 - } - segSize := len(bufs[start]) - if !u.gsoSupported || segSize == 0 || segSize > maxGSOBytes { - return 1, segSize - } - dst := addrs[start] - var ecn byte - if ecns != nil { - ecn = ecns[start] - } - maxLen := u.maxGSOSegments - if iovBudget < maxLen { - maxLen = iovBudget - } - runLen := 1 - total := segSize - for runLen < maxLen && start+runLen < len(bufs) { - nextLen := len(bufs[start+runLen]) - if nextLen == 0 || nextLen > segSize { - break - } - if addrs[start+runLen] != dst { - break - } - if ecns != nil && ecns[start+runLen] != ecn { - break - } - if total+nextLen > maxGSOBytes { - break - } - total += nextLen - runLen++ - if nextLen < segSize { - // A short packet must be the last in the run. - break - } - } - return runLen, segSize -} - -// writeEntryCmsg sets up the per-mmsghdr Hdr.Control / Hdr.Controllen for one -// entry. It writes the UDP_SEGMENT payload when runLen >= 2 and the -// IP_TOS/IPV6_TCLASS payload when ecn != 0, then points hdr.Control at the -// smallest contiguous span that covers whichever cmsg(s) actually apply. -func (u *StdConn) writeEntryCmsg(entry, runLen, segSize int, ecn byte) { - hdr := &u.writeMsgs[entry].Hdr - useSeg := runLen >= 2 - useEcn := ecn != 0 - base := entry * u.writeCmsgSpace - - if useSeg { - dataOff := base + unix.CmsgLen(0) - binary.NativeEndian.PutUint16(u.writeCmsg[dataOff:dataOff+2], uint16(segSize)) - } - if useEcn { - dataOff := base + u.writeCmsgSegSpace + unix.CmsgLen(0) - binary.NativeEndian.PutUint32(u.writeCmsg[dataOff:dataOff+4], uint32(ecn)) - } - - switch { - case useSeg && useEcn: - hdr.Control = &u.writeCmsg[base] - setMsgControllen(hdr, u.writeCmsgSpace) - case useSeg: - hdr.Control = &u.writeCmsg[base] - setMsgControllen(hdr, u.writeCmsgSegSpace) - case useEcn: - hdr.Control = &u.writeCmsg[base+u.writeCmsgSegSpace] - setMsgControllen(hdr, u.writeCmsgEcnSpace) - default: - hdr.Control = nil - setMsgControllen(hdr, 0) - } -} - -// sendmmsg issues sendmmsg(2) over u.rawConn against the first n entries -// of u.writeMsgs. Routes through u.rawSend so the per-call kernel callback -// stays alloc-free. +// sendmmsg issues sendmmsg(2) against the first n entries of u.writeMsgs. +// The bound u.sendmmsgCB is passed to rawConn.Write so no closure is +// allocated per call; inputs and outputs ride on the StdConn fields. func (u *StdConn) sendmmsg(n int) (int, error) { - return u.rawSend.send(u.rawConn, n) + u.sendmmsgN = n + u.sendmmsgSent = 0 + u.sendmmsgErrno = 0 + if err := u.rawConn.Write(u.sendmmsgCB); err != nil { + return u.sendmmsgSent, err + } + if u.sendmmsgErrno != 0 { + return u.sendmmsgSent, &net.OpError{Op: "sendmmsg", Err: u.sendmmsgErrno} + } + return u.sendmmsgSent, nil +} + +// sendmmsgRun is the rawConn.Write callback. It is bound once into +// u.sendmmsgCB at construction so it stays alloc-free in the hot path; +// inputs (sendmmsgN) and outputs (sendmmsgSent, sendmmsgErrno) ride on +// the receiver rather than escaping locals. +func (u *StdConn) sendmmsgRun(fd uintptr) bool { + r1, _, errno := unix.Syscall6(unix.SYS_SENDMMSG, fd, + uintptr(unsafe.Pointer(&u.writeMsgs[0])), uintptr(u.sendmmsgN), + 0, 0, 0, + ) + if errno == syscall.EAGAIN || errno == syscall.EWOULDBLOCK { + return false + } + u.sendmmsgSent = int(r1) + u.sendmmsgErrno = errno + return true } // writeSockaddr encodes addr into buf (which must be at least @@ -820,9 +379,7 @@ func writeSockaddr(buf []byte, addr netip.AddrPort, isV4 bool) (int, error) { binary.BigEndian.PutUint16(buf[2:4], addr.Port()) ip4 := ap.As4() copy(buf[4:8], ip4[:]) - for j := 8; j < 16; j++ { - buf[j] = 0 - } + clear(buf[8:16]) return unix.SizeofSockaddrInet4, nil } // struct sockaddr_in6: { sa_family_t(2), in_port_t(2, BE), flowinfo(4), in6_addr(16), scope_id(4) } @@ -940,22 +497,3 @@ func NewUDPStatsEmitter(udpConns []Conn) func() { } } } - -func parseRelease(r string) (major, minor int) { - // strip anything after the second dot or any non-digit - parts := strings.SplitN(r, ".", 3) - if len(parts) < 2 { - return 0, 0 - } - major, _ = strconv.Atoi(parts[0]) - // minor may have trailing junk like "15-generic" - mp := parts[1] - for i, c := range mp { - if c < '0' || c > '9' { - mp = mp[:i] - break - } - } - minor, _ = strconv.Atoi(mp) - return -}