mirror of https://github.com/slackhq/nebula.git

Commit: broken checkpt

udp/udp_linux.go (603)
@@ -6,10 +6,13 @@ package udp
import (
    "context"
    "encoding/binary"
    "errors"
    "fmt"
    "log/slog"
    "net"
    "net/netip"
    "strconv"
    "strings"
    "syscall"
    "unsafe"

@@ -32,14 +35,44 @@ type StdConn struct {
    writeIovs  []iovec
    writeNames [][]byte

    // sendmmsg(2) callback state. sendmmsgCB is bound once in NewListener
    // to the sendmmsgRun method value so passing it to rawConn.Write does
    // not allocate a fresh closure per send; sendmmsgN/Sent/Errno carry
    // the inputs and outputs across the call without escaping locals.
    sendmmsgCB    func(fd uintptr) bool
    sendmmsgN     int
    sendmmsgSent  int
    sendmmsgErrno syscall.Errno
    // Per-entry cmsg scratch. writeCmsg is one contiguous slab of
    // MaxWriteBatch * writeCmsgSpace bytes; each entry holds two cmsg
    // headers (UDP_SEGMENT then IP_TOS / IPV6_TCLASS) pre-filled once in
    // prepareWriteMessages. WriteBatch only rewrites the per-call data
    // payloads and toggles Hdr.Control / Hdr.Controllen to point at
    // whichever subset of the two cmsgs applies.
    writeCmsg         []byte
    writeCmsgSpace    int
    writeCmsgSegSpace int
    writeCmsgEcnSpace int

    // writeEntryEnd[e] is the bufs index *after* the last packet packed
    // into mmsghdr entry e. Used to rewind `i` on partial sendmmsg success.
    writeEntryEnd []int

    // rawSend wraps the sendmmsg(2) callback in a closure-free helper so
    // the hot path doesn't heap-allocate a fresh closure per call.
    rawSend rawSendmmsg

    // UDP GSO (sendmsg with UDP_SEGMENT cmsg) support. gsoSupported is
    // probed once at socket creation. When true, WriteBatch packs same-
    // destination consecutive packets into a single sendmmsg entry with a
    // UDP_SEGMENT cmsg; otherwise each packet is its own entry.
    gsoSupported   bool
    maxGSOSegments int

    // UDP GRO (recvmsg with UDP_GRO cmsg) support. groSupported is probed
    // once at socket creation. When true, listenOutBatch allocates larger
    // RX buffers and a per-entry cmsg slot so the kernel can coalesce
    // consecutive same-flow datagrams into a single recvmmsg entry; the
    // delivered cmsg carries the gso_size used to split them back apart.
    groSupported bool

    // ecnRecvSupported is true when IP_RECVTOS / IPV6_RECVTCLASS was
    // successfully enabled — the kernel will deliver the outer IP-ECN of
    // each arriving datagram as a per-slot cmsg, and listenOutBatch passes
    // the parsed value to the EncReader callback for RFC 6040 combine.
    ecnRecvSupported bool
}

func setReusePort(network, address string, c syscall.RawConn) error {
@@ -73,10 +106,11 @@ func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int)
    }
    //gotta find out if we got an AF_INET6 socket or not:
    out := &StdConn{
        udpConn: udpConn,
        rawConn: rawConn,
        l:       l,
        batch:   batch,
        udpConn:        udpConn,
        rawConn:        rawConn,
        l:              l,
        batch:          batch,
        maxGSOSegments: 1,
    }

    af, err := out.getSockOptInt(unix.SO_DOMAIN)
@@ -87,15 +121,71 @@ func NewListener(l *slog.Logger, ip netip.Addr, port int, multi bool, batch int)
    out.isV4 = af == unix.AF_INET

    out.prepareWriteMessages(MaxWriteBatch)
    out.sendmmsgCB = out.sendmmsgRun
    out.rawSend.msgs = out.writeMsgs
    out.rawSend.bind()

    out.prepareGSO()
    // GRO delivers coalesced superpackets that need a cmsg to split back
    // into segments. The single-packet RX path uses ReadFromUDPAddrPort
    // and cannot see that cmsg, so only enable GRO for the batch path.
    if batch > 1 {
        out.prepareGRO()
    }
    // Best-effort: ask the kernel to deliver outer IP-ECN as ancillary data
    // on every recvmmsg slot so the decap side can apply RFC 6040 combine.
    // On older kernels these may not exist; failing here just means we get
    // 0 (Not-ECT) on every slot, which is the same as ecn_mode=disable.
    out.prepareECNRecv()

    return out, nil
}

// prepareWriteMessages allocates one mmsghdr/iovec/sockaddr/cmsg scratch
// slot per sendmmsg entry. The iovec slab is sized to n so all entries'
// iovecs share one allocation; per-entry fan-out is further capped at
// maxGSOSegments. Hdr.Iov / Hdr.Iovlen / Hdr.Control / Hdr.Controllen are
// wired per call since each entry can span a variable number of iovecs
// and may or may not carry a cmsg.
//
// Per-mmsghdr cmsg layout. Each entry's slot of length writeCmsgSpace holds
// up to two cmsg headers placed at fixed offsets:
//
//   [0 .. writeCmsgSegSpace)              UDP_SEGMENT (gso_size, uint16)
//   [writeCmsgSegSpace .. writeCmsgSpace) IP_TOS or IPV6_TCLASS (int32)
//
// Both headers are pre-filled once here; per-call we only rewrite the data
// payload and toggle Hdr.Control / Hdr.Controllen to point at whichever
// subset applies (none / segment-only / ecn-only / both).
func (u *StdConn) prepareWriteMessages(n int) {
    u.writeMsgs = make([]rawMessage, n)
    u.writeIovs = make([]iovec, n)
    u.writeNames = make([][]byte, n)
    u.writeEntryEnd = make([]int, n)

    u.writeCmsgSegSpace = unix.CmsgSpace(2)
    u.writeCmsgEcnSpace = unix.CmsgSpace(4)
    u.writeCmsgSpace = u.writeCmsgSegSpace + u.writeCmsgEcnSpace
    u.writeCmsg = make([]byte, n*u.writeCmsgSpace)

    ecnLevel := int32(unix.IPPROTO_IP)
    ecnType := int32(unix.IP_TOS)
    if !u.isV4 {
        ecnLevel = unix.IPPROTO_IPV6
        ecnType = unix.IPV6_TCLASS
    }

    for k := 0; k < n; k++ {
        base := k * u.writeCmsgSpace
        seg := (*unix.Cmsghdr)(unsafe.Pointer(&u.writeCmsg[base]))
        seg.Level = unix.SOL_UDP
        seg.Type = unix.UDP_SEGMENT
        setCmsgLen(seg, unix.CmsgLen(2))

        ecn := (*unix.Cmsghdr)(unsafe.Pointer(&u.writeCmsg[base+u.writeCmsgSegSpace]))
        ecn.Level = ecnLevel
        ecn.Type = ecnType
        setCmsgLen(ecn, unix.CmsgLen(4))
    }

    for i := range u.writeMsgs {
        u.writeNames[i] = make([]byte, unix.SizeofSockaddrInet6)
@@ -103,6 +193,139 @@ func (u *StdConn) prepareWriteMessages(n int) {
    }
}
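
// For reference, a minimal standalone sketch (not part of this diff) that
// prints the slab offsets described in the layout comment above. Only
// golang.org/x/sys/unix is assumed; the values are architecture-dependent,
// so nothing is hard-coded.
package main

import (
    "fmt"

    "golang.org/x/sys/unix"
)

func main() {
    segSpace := unix.CmsgSpace(2) // UDP_SEGMENT carries a uint16 gso_size
    ecnSpace := unix.CmsgSpace(4) // IP_TOS / IPV6_TCLASS carries an int32
    slot := segSpace + ecnSpace   // == writeCmsgSpace, one slot per mmsghdr entry

    fmt.Printf("UDP_SEGMENT cmsg: [0, %d)\n", segSpace)
    fmt.Printf("TOS/TCLASS cmsg:  [%d, %d)\n", segSpace, slot)
    fmt.Printf("cmsg data begins %d bytes into each header\n", unix.CmsgLen(0))
}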

// maxGSOBytes bounds the total payload per sendmsg() when UDP_SEGMENT is
// set. The kernel stitches all iovecs into a single skb whose length the
// UDP length field can represent, and also enforces sk_gso_max_size (which
// on most devices is 65536). We use 65000 to leave headroom under the
// 65535 UDP-length cap, avoiding EMSGSIZE on large TSO superpackets.
const maxGSOBytes = 65000

// prepareGSO probes UDP_SEGMENT support and sets u.gsoSupported on success.
// Best-effort; failure leaves it false.
func (u *StdConn) prepareGSO() {
    u.maxGSOSegments = 63 //gotta be one less than the max so we can still attach a header

    var probeErr error
    if err := u.rawConn.Control(func(fd uintptr) {
        probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_SEGMENT, 0)
    }); err != nil {
        u.l.Info("udp: GSO disabled", "reason", "rawconn control failed", "error", err)
        recordCapability("udp.gso.enabled", false)
        return
    }
    if probeErr != nil {
        u.l.Info("udp: GSO disabled", "reason", "kernel rejected probe", "error", probeErr)
        recordCapability("udp.gso.enabled", false)
        return
    }

    var un unix.Utsname
    if err := unix.Uname(&un); err != nil {
        u.l.Info("udp: GSO disabled", "reason", "kernel uname probe failed", "error", err)
        recordCapability("udp.gso.enabled", false)
        return
    }
    major, minor := parseRelease(string(un.Release[:]))
    if major > 5 || (major == 5 && minor >= 5) {
        u.maxGSOSegments = 127
    }

    u.gsoSupported = true
    u.l.Info("udp: GSO enabled", "maxGSOSegments", u.maxGSOSegments)
    recordCapability("udp.gso.enabled", true)
}
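
// A standalone version of the same UDP_SEGMENT capability probe, assuming
// only the standard library and golang.org/x/sys/unix. Setting the option
// to 0 is a no-op where GSO exists and typically fails (e.g. ENOPROTOOPT)
// on kernels that predate it, which is what makes it usable as a probe.
package main

import (
    "fmt"
    "net"

    "golang.org/x/sys/unix"
)

func main() {
    c, err := net.ListenUDP("udp4", &net.UDPAddr{})
    if err != nil {
        panic(err)
    }
    defer c.Close()

    raw, err := c.SyscallConn()
    if err != nil {
        panic(err)
    }

    var probeErr error
    if err := raw.Control(func(fd uintptr) {
        probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_SEGMENT, 0)
    }); err != nil {
        probeErr = err
    }
    fmt.Println("UDP GSO supported:", probeErr == nil)
}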

// udpGROBufferSize sizes the per-entry recvmmsg buffer when UDP_GRO is on.
// The kernel stitches a run of same-flow datagrams into a single skb whose
// length is bounded by sk_gso_max_size (typically 65535); anything larger
// would be MSG_TRUNCed. We use the maximum representable UDP length so a
// full superpacket always lands intact.
const udpGROBufferSize = 65535

// udpGROCmsgPayload is the size of the UDP_GRO cmsg data delivered by the
// kernel: a single int (gso_size in bytes). See udp_cmsg_recv() in
// net/ipv4/udp.c.
const udpGROCmsgPayload = 4

// prepareGRO turns on UDP_GRO so the kernel coalesces consecutive same-flow
// datagrams into one recvmmsg entry, with a cmsg carrying the gso_size used
// to split them back apart on the application side.
func (u *StdConn) prepareGRO() {
    var probeErr error
    if err := u.rawConn.Control(func(fd uintptr) {
        probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_GRO, 1)
    }); err != nil {
        u.l.Info("udp: GRO disabled", "reason", "rawconn control failed", "error", err)
        recordCapability("udp.gro.enabled", false)
        return
    }
    if probeErr != nil {
        u.l.Info("udp: GRO disabled", "reason", "kernel rejected probe", "error", probeErr)
        recordCapability("udp.gro.enabled", false)
        return
    }
    u.groSupported = true
    u.l.Info("udp: GRO enabled")
    recordCapability("udp.gro.enabled", true)
}
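
// Sketch of consuming GRO with the standard library instead of recvmmsg
// (assumes Linux and golang.org/x/sys/unix; port 4242 is arbitrary). The
// kernel reports the segment size through a SOL_UDP/UDP_GRO cmsg and the
// coalesced payload must be re-split in userspace, as listenOutBatch does.
package main

import (
    "encoding/binary"
    "fmt"
    "net"

    "golang.org/x/sys/unix"
)

func main() {
    c, err := net.ListenUDP("udp4", &net.UDPAddr{Port: 4242})
    if err != nil {
        panic(err)
    }
    defer c.Close()

    raw, err := c.SyscallConn()
    if err != nil {
        panic(err)
    }
    raw.Control(func(fd uintptr) {
        // Best-effort: older kernels reject UDP_GRO and we just get plain datagrams.
        _ = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_GRO, 1)
    })

    buf := make([]byte, 65535)
    oob := make([]byte, unix.CmsgSpace(4))
    n, oobn, _, addr, err := c.ReadMsgUDPAddrPort(buf, oob)
    if err != nil {
        panic(err)
    }

    segSize := 0
    cmsgs, _ := unix.ParseSocketControlMessage(oob[:oobn])
    for _, m := range cmsgs {
        if m.Header.Level == unix.SOL_UDP && m.Header.Type == unix.UDP_GRO && len(m.Data) >= 4 {
            segSize = int(binary.NativeEndian.Uint32(m.Data[:4]))
        }
    }
    fmt.Printf("read %d bytes from %v, gso_size=%d\n", n, addr, segSize)
}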

// prepareECNRecv turns on IP_RECVTOS / IPV6_RECVTCLASS so the outer IP-ECN
// field of each arriving datagram is delivered as ancillary data alongside
// the payload. listenOutBatch reads it via parseRecvCmsg and passes the
// codepoint through the EncReader for RFC 6040 combine on the decap side.
// Best-effort: we keep going on failure.
func (u *StdConn) prepareECNRecv() {
    var v4err, v6err error
    if err := u.rawConn.Control(func(fd uintptr) {
        v4err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_RECVTOS, 1)
        if !u.isV4 {
            v6err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_RECVTCLASS, 1)
        }
    }); err != nil {
        u.l.Info("udp: outer-ECN RX disabled", "reason", "rawconn control failed", "error", err)
        recordCapability("udp.ecn_rx.enabled", false)
        return
    }
    if u.isV4 { //only check the V4 attempt
        if v4err != nil {
            u.l.Info("udp: outer-ECN RX disabled", "reason", "kernel rejected probe", "error", v4err)
            recordCapability("udp.ecn_rx.enabled", false)
        } else {
            u.ecnRecvSupported = true
            u.l.Info("udp: outer-ECN RX enabled")
            recordCapability("udp.ecn_rx.enabled", true)
        }
        return
    } else {
        if v6err != nil { //no V6 ECN? disable it.
            u.l.Info("udp: outer-ECN RX disabled", "reason", "kernel rejected probe", "error", errors.Join(v4err, v6err))
            recordCapability("udp.ecn_rx.enabled", false)
            return
        } else if v4err != nil { //no V4, but yes V6? Low level warning. Could be a V6-specific bind.
            u.l.Debug("udp: outer-ECN RX degraded", "reason", "kernel rejected probe on IPv4", "error", v4err)
        }
        // all good
        u.ecnRecvSupported = true
        u.l.Info("udp: outer-ECN RX enabled")
        recordCapability("udp.ecn_rx.enabled", true)
        return
    }
}
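
// The "RFC 6040 combine" the comments refer to is the decapsulation rule
// sketched below — an illustration, not code from this changeset. It maps
// the inner packet's ECN field plus the outer codepoint delivered by the
// sockopts above to the ECN that should appear on the decapsulated packet,
// with one drop case (outer CE over an inner Not-ECT packet).
// Codepoints: Not-ECT=0b00, ECT(1)=0b01, ECT(0)=0b10, CE=0b11.
func combineECN(inner, outer byte) (ecn byte, drop bool) {
    const (
        notECT = 0b00
        ect1   = 0b01
        ect0   = 0b10
        ce     = 0b11
    )
    switch {
    case inner == notECT:
        // A Not-ECT inner packet cannot carry a congestion signal onward.
        return notECT, outer == ce
    case outer == ce:
        return ce, false
    case outer == ect1 && (inner == ect0 || inner == ect1):
        return ect1, false
    default:
        // Outer Not-ECT or ECT(0): keep the inner marking.
        return inner, false
    }
}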

// recordCapability registers (or updates) a boolean gauge for one of the
// kernel-feature probes. Gauges go to 1 when the feature is enabled, 0 when
// it is not — dashboards can show degraded state on partially-supported
// kernels at a glance. Calling repeatedly with the same name updates the
// existing gauge rather than registering a duplicate.
func recordCapability(name string, enabled bool) {
    g := metrics.GetOrRegisterGauge(name, nil)
    if enabled {
        g.Update(1)
    } else {
        g.Update(0)
    }
}

func (u *StdConn) SupportsMultipleReaders() bool {
    return true
}
@@ -221,16 +444,15 @@ func (u *StdConn) listenOutSingle(r EncReader, flush func()) error {
    }
}

// readSockaddr decodes the source address out of a recvmmsg name buffer
func (u *StdConn) readSockaddr(name []byte) netip.AddrPort {
func getFrom(names [][]byte, i int, isV4 bool) netip.AddrPort {
    var ip netip.Addr
    // It's ok to skip the ok check here, the slicing is the only error that can occur and it will panic
    if u.isV4 {
        ip, _ = netip.AddrFromSlice(name[4:8])
    // It's ok to skip the ok check here, the slicing is the only error that can occur and it will panic
    if isV4 {
        ip, _ = netip.AddrFromSlice(names[i][4:8])
    } else {
        ip, _ = netip.AddrFromSlice(name[8:24])
        ip, _ = netip.AddrFromSlice(names[i][8:24])
    }
    return netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(name[2:4]))
    return netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(names[i][2:4]))
}
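
// Tiny illustration (hypothetical values) of the name-buffer layout getFrom
// decodes: in a Linux sockaddr_in the port sits at bytes [2:4] in network
// byte order and the IPv4 address at [4:8]; the family in [0:2] is ignored
// by getFrom.
package main

import (
    "encoding/binary"
    "fmt"
    "net/netip"
)

func main() {
    name := make([]byte, 16) // sizeof(struct sockaddr_in)
    binary.BigEndian.PutUint16(name[2:4], 4242)
    copy(name[4:8], []byte{192, 0, 2, 1})

    ip, _ := netip.AddrFromSlice(name[4:8])
    fmt.Println(netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(name[2:4]))) // 192.0.2.1:4242
}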

func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
@@ -239,6 +461,16 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {

    bufSize := MTU
    cmsgSpace := 0
    if u.groSupported {
        bufSize = udpGROBufferSize
        cmsgSpace = unix.CmsgSpace(udpGROCmsgPayload)
    }
    if u.ecnRecvSupported {
        // IP_TOS arrives as 1 byte; IPV6_TCLASS arrives as a 4-byte int.
        // Reserve enough for the wider of the two so the same buffer fits
        // either family alongside any UDP_GRO cmsg.
        cmsgSpace += unix.CmsgSpace(4)
    }
    msgs, buffers, names, _ := u.PrepareRawMessages(u.batch, bufSize, cmsgSpace)

    //reader needs to capture variables from this function, since it's used as a lambda with rawConn.Read
@@ -249,6 +481,11 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
    }

    for {
        if cmsgSpace > 0 {
            for i := range msgs {
                setMsgControllen(&msgs[i].Hdr, cmsgSpace)
            }
        }
        err := u.rawConn.Read(reader)
        if err != nil {
            return err
@@ -258,13 +495,75 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
        }

        for i := 0; i < n; i++ {
            r(u.readSockaddr(names[i]), buffers[i][:msgs[i].Len], RxMeta{})
            from := getFrom(names, i, u.isV4)
            payload := buffers[i][:msgs[i].Len]

            segSize := 0
            outerECN := byte(0)
            if cmsgSpace > 0 {
                segSize, outerECN = parseRecvCmsg(&msgs[i].Hdr, u.groSupported, u.ecnRecvSupported, u.isV4)
            }

            if segSize <= 0 || segSize >= len(payload) {
                r(from, payload, RxMeta{OuterECN: outerECN})
            } else {
                for off := 0; off < len(payload); off += segSize {
                    end := off + segSize
                    if end > len(payload) {
                        end = len(payload)
                    }
                    seg := payload[off:end]
                    r(from, seg, RxMeta{OuterECN: outerECN})
                }
            }
        }

        flush()
    }
}

// parseRecvCmsg walks the per-slot ancillary buffer once and extracts up to
// two values of interest in a single pass: the UDP_GRO gso_size (when
// wantGRO is true) and the outer IP-level ECN codepoint stamped on the
// carrier (when wantECN is true). Returns zeros for whichever field is not
// requested or not present. isV4 selects between IP_TOS (1-byte) and
// IPV6_TCLASS (4-byte int) cmsg payloads.
func parseRecvCmsg(hdr *msghdr, wantGRO, wantECN bool, isV4 bool) (gso int, ecn byte) {
    controllen := int(hdr.Controllen)
    if controllen < unix.SizeofCmsghdr || hdr.Control == nil {
        return 0, 0
    }
    ctrl := unsafe.Slice(hdr.Control, controllen)
    off := 0
    for off+unix.SizeofCmsghdr <= len(ctrl) {
        ch := (*unix.Cmsghdr)(unsafe.Pointer(&ctrl[off]))
        clen := int(ch.Len)
        if clen < unix.SizeofCmsghdr || off+clen > len(ctrl) {
            return gso, ecn
        }
        dataOff := off + unix.CmsgLen(0)
        switch {
        case wantGRO && ch.Level == unix.SOL_UDP && ch.Type == unix.UDP_GRO:
            if dataOff+udpGROCmsgPayload <= len(ctrl) {
                gso = int(int32(binary.NativeEndian.Uint32(ctrl[dataOff : dataOff+udpGROCmsgPayload])))
            }
        case wantECN && isV4 && ch.Level == unix.IPPROTO_IP && ch.Type == unix.IP_TOS:
            // IP_TOS arrives as a single byte; only the low 2 bits are ECN.
            if dataOff+1 <= len(ctrl) {
                ecn = ctrl[dataOff] & 0x03
            }
        case wantECN && !isV4 && ch.Level == unix.IPPROTO_IPV6 && ch.Type == unix.IPV6_TCLASS:
            // IPV6_TCLASS arrives as a 4-byte int; ECN is the low 2 bits.
            if dataOff+4 <= len(ctrl) {
                ecn = byte(binary.NativeEndian.Uint32(ctrl[dataOff:dataOff+4])) & 0x03
            }
        }
        // Advance by the aligned cmsg space.
        off += unix.CmsgSpace(clen - unix.CmsgLen(0))
    }
    return gso, ecn
}
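
// For comparison, a standalone sketch that builds the same two-cmsg control
// buffer (SOL_UDP/UDP_GRO gso_size followed by IPPROTO_IP/IP_TOS) and walks
// it with the stock x/sys/unix parser; the hand-rolled loop above presumably
// exists to avoid that helper's per-read allocations. The gso_size and TOS
// values here are made up.
package main

import (
    "encoding/binary"
    "fmt"
    "unsafe"

    "golang.org/x/sys/unix"
)

func main() {
    buf := make([]byte, unix.CmsgSpace(4)+unix.CmsgSpace(1))

    gro := (*unix.Cmsghdr)(unsafe.Pointer(&buf[0]))
    gro.Level = unix.SOL_UDP
    gro.Type = unix.UDP_GRO
    gro.SetLen(unix.CmsgLen(4))
    binary.NativeEndian.PutUint32(buf[unix.CmsgLen(0):unix.CmsgLen(0)+4], 1350)

    tosOff := unix.CmsgSpace(4)
    tos := (*unix.Cmsghdr)(unsafe.Pointer(&buf[tosOff]))
    tos.Level = unix.IPPROTO_IP
    tos.Type = unix.IP_TOS
    tos.SetLen(unix.CmsgLen(1))
    buf[tosOff+unix.CmsgLen(0)] = 0x02 // ECT(0)

    cmsgs, err := unix.ParseSocketControlMessage(buf)
    if err != nil {
        panic(err)
    }
    for _, m := range cmsgs {
        fmt.Printf("level=%d type=%d data=%v\n", m.Header.Level, m.Header.Type, m.Data)
    }
}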

func (u *StdConn) ListenOut(r EncReader, flush func()) error {
    if u.batch == 1 {
        return u.listenOutSingle(r, flush)
@@ -279,89 +578,222 @@ func (u *StdConn) WriteTo(b []byte, ip netip.AddrPort) error {
}

// WriteBatch sends bufs via sendmmsg(2) using the preallocated scratch on
// StdConn. If supported, consecutive packets to the same destination with
// matching segment sizes (all but possibly the last) are coalesced into a
// single mmsghdr entry
// StdConn. Consecutive packets to the same destination with matching segment
// sizes (all but possibly the last) are coalesced into a single mmsghdr entry
// carrying a UDP_SEGMENT cmsg, so one syscall can mix runs of GSO superpackets
// with plain one-off datagrams. Without GSO support every packet is its own
// entry, matching the prior behaviour.
//
// If sendmmsg returns an error and zero entries went out, we fall back to
// Chunks larger than the scratch are processed across multiple syscalls. If
// sendmmsg returns an error AND zero entries went out we fall back to
// per-packet WriteTo for that chunk so the caller still gets best-effort
// delivery. On a partial send we resume at the first un-acked entry on
// the next iteration.
func (u *StdConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, _ []byte) error {
    for i := 0; i < len(bufs); {
        chunk := min(len(bufs)-i, len(u.writeMsgs))
// delivery; on a partial-success error we just replay the remainder.
func (u *StdConn) WriteBatch(bufs [][]byte, addrs []netip.AddrPort, ecns []byte) error {
    if len(bufs) != len(addrs) {
        return fmt.Errorf("WriteBatch: len(bufs)=%d != len(addrs)=%d", len(bufs), len(addrs))
    }
    if ecns != nil && len(ecns) != len(bufs) {
        return fmt.Errorf("WriteBatch: len(ecns)=%d != len(bufs)=%d", len(ecns), len(bufs))
    }

        for k := 0; k < chunk; k++ {
            u.writeIovs[k].Base = &bufs[i+k][0]
            setIovLen(&u.writeIovs[k], len(bufs[i+k]))
    // Callers deliver same-destination packets contiguously and in counter
    // order, so we run the GSO planner directly without a pre-sort. A
    // sorting pass measurably hurt throughput in microbenchmarks while
    // providing no observed reordering benefit.

            nlen, err := writeSockaddr(u.writeNames[k], addrs[i+k], u.isV4)
    i := 0
    for i < len(bufs) {
        baseI := i
        entry := 0
        iovIdx := 0
        for entry < len(u.writeMsgs) && i < len(bufs) {
            iovBudget := len(u.writeIovs) - iovIdx
            if iovBudget < 1 {
                break
            }
            runLen, segSize := u.planRun(bufs, addrs, ecns, i, iovBudget)
            if runLen == 0 {
                break
            }

            for k := 0; k < runLen; k++ {
                b := bufs[i+k]
                if len(b) == 0 {
                    u.writeIovs[iovIdx+k].Base = nil
                    setIovLen(&u.writeIovs[iovIdx+k], 0)
                } else {
                    u.writeIovs[iovIdx+k].Base = &b[0]
                    setIovLen(&u.writeIovs[iovIdx+k], len(b))
                }
            }

            nlen, err := writeSockaddr(u.writeNames[entry], addrs[i], u.isV4)
            if err != nil {
                return err
            }

            hdr := &u.writeMsgs[k].Hdr
            hdr.Iov = &u.writeIovs[k]
            setMsgIovlen(hdr, 1)
            hdr := &u.writeMsgs[entry].Hdr
            hdr.Iov = &u.writeIovs[iovIdx]
            setMsgIovlen(hdr, runLen)
            hdr.Namelen = uint32(nlen)

            var ecn byte
            if ecns != nil {
                ecn = ecns[i]
            }
            u.writeEntryCmsg(entry, runLen, segSize, ecn)

            i += runLen
            iovIdx += runLen
            u.writeEntryEnd[entry] = i
            entry++
        }

        sent, serr := u.sendmmsg(chunk)
        if entry == 0 {
            return fmt.Errorf("sendmmsg: no progress")
        }

        sent, serr := u.sendmmsg(entry)
        if serr != nil && sent <= 0 {
            // sendmmsg returns -1 / sent=0 when entry 0 itself failed; log
            // that entry's destination and fall back to per-packet WriteTo
            // for the whole chunk so the caller still gets best-effort
            // delivery without duplicating packets the kernel accepted.
            u.l.Warn("sendmmsg failed, falling back to per-packet WriteTo",
                "err", serr,
                "entries", chunk,
                "entry0_dst", addrs[i],
            // Nothing went out for this chunk; fall back to WriteTo for each
            // packet that was queued this iteration. We only enter this path
            // when sendmmsg returned an error AND zero entries succeeded —
            // otherwise the partial-success advance below replays only the
            // remainder, avoiding duplicates of already-sent packets.
            //
            // sent=-1 from sendmmsg means message 0 itself failed (partial
            // success returns the count instead), so log entry 0's parameters
            // — that's the entry the kernel rejected.
            hdr0 := &u.writeMsgs[0].Hdr
            runLen0 := u.writeEntryEnd[0] - baseI
            seg0 := len(bufs[baseI])
            ecn0 := byte(0)
            if ecns != nil {
                ecn0 = ecns[baseI]
            }
            u.l.Warn("sendmmsg had a problem",
                "sent", sent, "err", serr,
                "entries", entry,
                "entry0_runLen", runLen0,
                "entry0_segSize", seg0,
                "entry0_iovlen", hdr0.Iovlen,
                "entry0_controllen", hdr0.Controllen,
                "entry0_namelen", hdr0.Namelen,
                "entry0_ecn", ecn0,
                "entry0_dst", addrs[baseI],
                "isV4", u.isV4,
                "gso", u.gsoSupported,
                "gro", u.groSupported,
            )
            for k := 0; k < chunk; k++ {
                if werr := u.WriteTo(bufs[i+k], addrs[i+k]); werr != nil {
            for k := baseI; k < i; k++ {
                if werr := u.WriteTo(bufs[k], addrs[k]); werr != nil {
                    return werr
                }
            }
            i += chunk
            continue
        }
        i += sent
        if sent == 0 {
            return fmt.Errorf("sendmmsg made no progress")
        }
        // Rewind i to the end of the last successfully sent entry. For a
        // full-success send this leaves i unchanged; for a partial send it
        // replays the remainder on the next outer-loop iteration.
        i = u.writeEntryEnd[sent-1]
    }
    return nil
}
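
// Hypothetical caller sketch (peer addresses and sizes invented): the first
// two packets share a destination, size, and ECN byte, so planRun can fold
// them into one UDP_SEGMENT entry; the third goes to a different peer and
// becomes its own mmsghdr entry. `u` is the *StdConn built by NewListener.
func sendBurst(u *StdConn) error {
    peerA := netip.MustParseAddrPort("192.0.2.10:4242")
    peerB := netip.MustParseAddrPort("192.0.2.20:4242")

    bufs := [][]byte{
        make([]byte, 1300), // segment 1 of the GSO run
        make([]byte, 1300), // segment 2 of the GSO run
        make([]byte, 64),   // unrelated small datagram
    }
    addrs := []netip.AddrPort{peerA, peerA, peerB}
    ecns := []byte{0b10, 0b10, 0} // ECT(0) on the run, Not-ECT on the stray

    return u.WriteBatch(bufs, addrs, ecns)
}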

// sendmmsg issues sendmmsg(2) against the first n entries of u.writeMsgs.
// The bound u.sendmmsgCB is passed to rawConn.Write so no closure is
// allocated per call; inputs and outputs ride on the StdConn fields.
func (u *StdConn) sendmmsg(n int) (int, error) {
    u.sendmmsgN = n
    u.sendmmsgSent = 0
    u.sendmmsgErrno = 0
    if err := u.rawConn.Write(u.sendmmsgCB); err != nil {
        return u.sendmmsgSent, err
// planRun groups consecutive packets starting at `start` that can be sent as
// a single UDP GSO superpacket (one sendmmsg entry with UDP_SEGMENT cmsg).
// A run of length 1 means the entry carries no UDP_SEGMENT cmsg and the
// kernel treats it as a plain datagram. Returns the run length and the
// per-segment size (which equals len(bufs[start])). Without GSO support
// every call returns runLen=1. Outer ECN (when ecns != nil) is also a run
// boundary — the kernel stamps one outer codepoint per sendmsg entry, so
// mixing values inside a run would lose information.
func (u *StdConn) planRun(bufs [][]byte, addrs []netip.AddrPort, ecns []byte, start, iovBudget int) (int, int) {
    if start >= len(bufs) || iovBudget < 1 {
        return 0, 0
    }
    if u.sendmmsgErrno != 0 {
        return u.sendmmsgSent, &net.OpError{Op: "sendmmsg", Err: u.sendmmsgErrno}
    segSize := len(bufs[start])
    if !u.gsoSupported || segSize == 0 || segSize > maxGSOBytes {
        return 1, segSize
    }
    return u.sendmmsgSent, nil
    dst := addrs[start]
    var ecn byte
    if ecns != nil {
        ecn = ecns[start]
    }
    maxLen := u.maxGSOSegments
    if iovBudget < maxLen {
        maxLen = iovBudget
    }
    runLen := 1
    total := segSize
    for runLen < maxLen && start+runLen < len(bufs) {
        nextLen := len(bufs[start+runLen])
        if nextLen == 0 || nextLen > segSize {
            break
        }
        if addrs[start+runLen] != dst {
            break
        }
        if ecns != nil && ecns[start+runLen] != ecn {
            break
        }
        if total+nextLen > maxGSOBytes {
            break
        }
        total += nextLen
        runLen++
        if nextLen < segSize {
            // A short packet must be the last in the run.
            break
        }
    }
    return runLen, segSize
}
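
// Illustrative in-package test sketch (hypothetical; assumes a _test file
// with "testing" imported): four equal-sized packets to one peer plus a
// shorter tail plan as a single run of five, because a short segment is
// only legal in last position; a different destination or ECN value at any
// point would have ended the run there instead.
func TestPlanRunShortTail(t *testing.T) {
    u := &StdConn{gsoSupported: true, maxGSOSegments: 64}
    peer := netip.MustParseAddrPort("192.0.2.1:4242")

    bufs := [][]byte{
        make([]byte, 1200), make([]byte, 1200), make([]byte, 1200), make([]byte, 1200),
        make([]byte, 200), // shorter: must be the final segment of the run
    }
    addrs := []netip.AddrPort{peer, peer, peer, peer, peer}

    runLen, segSize := u.planRun(bufs, addrs, nil, 0, len(bufs))
    if runLen != 5 || segSize != 1200 {
        t.Fatalf("got runLen=%d segSize=%d, want 5 and 1200", runLen, segSize)
    }
}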

// sendmmsgRun is the rawConn.Write callback. It is bound once into
// u.sendmmsgCB at construction so it stays alloc-free in the hot path;
// inputs (sendmmsgN) and outputs (sendmmsgSent, sendmmsgErrno) ride on
// the receiver rather than escaping locals.
func (u *StdConn) sendmmsgRun(fd uintptr) bool {
    r1, _, errno := unix.Syscall6(unix.SYS_SENDMMSG, fd,
        uintptr(unsafe.Pointer(&u.writeMsgs[0])), uintptr(u.sendmmsgN),
        0, 0, 0,
    )
    if errno == syscall.EAGAIN || errno == syscall.EWOULDBLOCK {
        return false
// writeEntryCmsg sets up the per-mmsghdr Hdr.Control / Hdr.Controllen for one
// entry. It writes the UDP_SEGMENT payload when runLen >= 2 and the
// IP_TOS/IPV6_TCLASS payload when ecn != 0, then points hdr.Control at the
// smallest contiguous span that covers whichever cmsg(s) actually apply.
func (u *StdConn) writeEntryCmsg(entry, runLen, segSize int, ecn byte) {
    hdr := &u.writeMsgs[entry].Hdr
    useSeg := runLen >= 2
    useEcn := ecn != 0
    base := entry * u.writeCmsgSpace

    if useSeg {
        dataOff := base + unix.CmsgLen(0)
        binary.NativeEndian.PutUint16(u.writeCmsg[dataOff:dataOff+2], uint16(segSize))
    }
    u.sendmmsgSent = int(r1)
    u.sendmmsgErrno = errno
    return true
    if useEcn {
        dataOff := base + u.writeCmsgSegSpace + unix.CmsgLen(0)
        binary.NativeEndian.PutUint32(u.writeCmsg[dataOff:dataOff+4], uint32(ecn))
    }

    switch {
    case useSeg && useEcn:
        hdr.Control = &u.writeCmsg[base]
        setMsgControllen(hdr, u.writeCmsgSpace)
    case useSeg:
        hdr.Control = &u.writeCmsg[base]
        setMsgControllen(hdr, u.writeCmsgSegSpace)
    case useEcn:
        hdr.Control = &u.writeCmsg[base+u.writeCmsgSegSpace]
        setMsgControllen(hdr, u.writeCmsgEcnSpace)
    default:
        hdr.Control = nil
        setMsgControllen(hdr, 0)
    }
}

// sendmmsg issues sendmmsg(2) over u.rawConn against the first n entries
// of u.writeMsgs. Routes through u.rawSend so the per-call kernel callback
// stays alloc-free.
func (u *StdConn) sendmmsg(n int) (int, error) {
    return u.rawSend.send(u.rawConn, n)
}

// writeSockaddr encodes addr into buf (which must be at least
@@ -497,3 +929,22 @@ func NewUDPStatsEmitter(udpConns []Conn) func() {
        }
    }
}

func parseRelease(r string) (major, minor int) {
    // strip anything after the second dot or any non-digit
    parts := strings.SplitN(r, ".", 3)
    if len(parts) < 2 {
        return 0, 0
    }
    major, _ = strconv.Atoi(parts[0])
    // minor may have trailing junk like "15-generic"
    mp := parts[1]
    for i, c := range mp {
        if c < '0' || c > '9' {
            mp = mp[:i]
            break
        }
    }
    minor, _ = strconv.Atoi(mp)
    return
}
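
// Hypothetical usage illustration (release strings are examples only;
// assumes an in-package _test file with "fmt" imported):
func Example_parseRelease() {
    fmt.Println(parseRelease("5.15.0-91-generic")) // kernel 5.15
    fmt.Println(parseRelease("6.8.4-arch1-1"))
    fmt.Println(parseRelease("garbage"))
    // Output:
    // 5 15
    // 6 8
    // 0 0
}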