mirror of
https://github.com/slackhq/nebula.git
synced 2026-05-16 04:47:38 +02:00
GRO
This commit is contained in:
109
udp/udp_linux.go
109
udp/udp_linux.go
@@ -54,6 +54,13 @@ type StdConn struct {
|
|||||||
// probed once at socket creation. When true, WriteSegmented takes a
|
// probed once at socket creation. When true, WriteSegmented takes a
|
||||||
// single-syscall GSO path; otherwise it falls back to a WriteTo loop.
|
// single-syscall GSO path; otherwise it falls back to a WriteTo loop.
|
||||||
gsoSupported bool
|
gsoSupported bool
|
||||||
|
|
||||||
|
// UDP GRO (recvmsg with UDP_GRO cmsg) support. groSupported is probed
|
||||||
|
// once at socket creation. When true, listenOutBatch allocates larger
|
||||||
|
// RX buffers and a per-entry cmsg slot so the kernel can coalesce
|
||||||
|
// consecutive same-flow datagrams into a single recvmmsg entry; the
|
||||||
|
// delivered cmsg carries the gso_size used to split them back apart.
|
||||||
|
groSupported bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func setReusePort(network, address string, c syscall.RawConn) error {
|
func setReusePort(network, address string, c syscall.RawConn) error {
|
||||||
@@ -104,6 +111,12 @@ func NewListener(l *logrus.Logger, ip netip.Addr, port int, multi bool, batch in
|
|||||||
out.writeFunc = out.sendmmsgRawWrite
|
out.writeFunc = out.sendmmsgRawWrite
|
||||||
|
|
||||||
out.prepareGSO()
|
out.prepareGSO()
|
||||||
|
// GRO delivers coalesced superpackets that need a cmsg to split back
|
||||||
|
// into segments. The single-packet RX path uses ReadFromUDPAddrPort
|
||||||
|
// and cannot see that cmsg, so only enable GRO for the batch path.
|
||||||
|
if batch > 1 {
|
||||||
|
out.prepareGRO()
|
||||||
|
}
|
||||||
|
|
||||||
return out, nil
|
return out, nil
|
||||||
}
|
}
|
||||||
@@ -162,6 +175,34 @@ func (u *StdConn) prepareGSO() {
|
|||||||
u.gsoSupported = true
|
u.gsoSupported = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const (
	// udpGROBufferSize is the size of each per-entry recvmmsg buffer used
	// while UDP_GRO is enabled. The kernel merges a run of same-flow
	// datagrams into one skb whose length is capped by sk_gso_max_size
	// (typically 65535), and anything beyond the buffer would be truncated
	// (MSG_TRUNC). Using the largest representable UDP length means a full
	// superpacket is always received intact.
	udpGROBufferSize = 65535

	// udpGROCmsgPayload is the number of data bytes in the UDP_GRO cmsg the
	// kernel delivers: one int holding gso_size in bytes. See
	// udp_cmsg_recv() in net/ipv4/udp.c.
	udpGROCmsgPayload = 4
)
|
||||||
|
|
||||||
|
// prepareGRO turns on UDP_GRO so the kernel coalesces consecutive same-flow
|
||||||
|
// datagrams into one recvmmsg entry, with a cmsg carrying the gso_size used
|
||||||
|
// to split them back apart on the application side.
|
||||||
|
func (u *StdConn) prepareGRO() {
|
||||||
|
var probeErr error
|
||||||
|
if err := u.rawConn.Control(func(fd uintptr) {
|
||||||
|
probeErr = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_GRO, 1)
|
||||||
|
}); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if probeErr != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
u.groSupported = true
|
||||||
|
}
|
||||||
|
|
||||||
func (u *StdConn) SupportsMultipleReaders() bool {
|
func (u *StdConn) SupportsMultipleReaders() bool {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
@@ -282,7 +323,13 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
|
|||||||
var n int
|
var n int
|
||||||
var operr error
|
var operr error
|
||||||
|
|
||||||
msgs, buffers, names := u.PrepareRawMessages(u.batch)
|
bufSize := MTU
|
||||||
|
cmsgSpace := 0
|
||||||
|
if u.groSupported {
|
||||||
|
bufSize = udpGROBufferSize
|
||||||
|
cmsgSpace = unix.CmsgSpace(udpGROCmsgPayload)
|
||||||
|
}
|
||||||
|
msgs, buffers, names, _ := u.PrepareRawMessages(u.batch, bufSize, cmsgSpace)
|
||||||
|
|
||||||
//reader needs to capture variables from this function, since it's used as a lambda with rawConn.Read
|
//reader needs to capture variables from this function, since it's used as a lambda with rawConn.Read
|
||||||
//defining it outside the loop so it gets re-used
|
//defining it outside the loop so it gets re-used
|
||||||
@@ -292,6 +339,11 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for {
|
for {
|
||||||
|
if cmsgSpace > 0 {
|
||||||
|
for i := range msgs {
|
||||||
|
setMsgControllen(&msgs[i].Hdr, cmsgSpace)
|
||||||
|
}
|
||||||
|
}
|
||||||
err := u.rawConn.Read(reader)
|
err := u.rawConn.Read(reader)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -307,7 +359,28 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
|
|||||||
} else {
|
} else {
|
||||||
ip, _ = netip.AddrFromSlice(names[i][8:24])
|
ip, _ = netip.AddrFromSlice(names[i][8:24])
|
||||||
}
|
}
|
||||||
r(netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(names[i][2:4])), buffers[i][:msgs[i].Len])
|
from := netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(names[i][2:4]))
|
||||||
|
payload := buffers[i][:msgs[i].Len]
|
||||||
|
|
||||||
|
segSize := 0
|
||||||
|
if u.groSupported {
|
||||||
|
segSize = parseUDPGRO(&msgs[i].Hdr)
|
||||||
|
}
|
||||||
|
if segSize <= 0 || segSize >= len(payload) {
|
||||||
|
// No coalescing happened (or a lone datagram).
|
||||||
|
r(from, payload)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// GRO superpacket: the kernel guarantees every segment is
|
||||||
|
// exactly segSize bytes except for the final one, which may be
|
||||||
|
// short.
|
||||||
|
for off := 0; off < len(payload); off += segSize {
|
||||||
|
end := off + segSize
|
||||||
|
if end > len(payload) {
|
||||||
|
end = len(payload)
|
||||||
|
}
|
||||||
|
r(from, payload[off:end])
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// End-of-batch: let callers (e.g. TUN write coalescer) flush any
|
// End-of-batch: let callers (e.g. TUN write coalescer) flush any
|
||||||
// state they accumulated across this batch.
|
// state they accumulated across this batch.
|
||||||
@@ -315,6 +388,38 @@ func (u *StdConn) listenOutBatch(r EncReader, flush func()) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseUDPGRO walks the control buffer on hdr looking for a SOL_UDP/UDP_GRO
|
||||||
|
// cmsg and returns the gso_size (bytes per coalesced segment) it carries.
|
||||||
|
// Returns 0 when no UDP_GRO cmsg is present, which is the normal case for
|
||||||
|
// lone datagrams that the kernel did not coalesce.
|
||||||
|
func parseUDPGRO(hdr *msghdr) int {
|
||||||
|
controllen := int(hdr.Controllen)
|
||||||
|
if controllen < unix.SizeofCmsghdr || hdr.Control == nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
ctrl := unsafe.Slice(hdr.Control, controllen)
|
||||||
|
off := 0
|
||||||
|
for off+unix.SizeofCmsghdr <= len(ctrl) {
|
||||||
|
ch := (*unix.Cmsghdr)(unsafe.Pointer(&ctrl[off]))
|
||||||
|
clen := int(ch.Len)
|
||||||
|
if clen < unix.SizeofCmsghdr || off+clen > len(ctrl) {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
if ch.Level == unix.SOL_UDP && ch.Type == unix.UDP_GRO {
|
||||||
|
dataOff := off + unix.CmsgLen(0)
|
||||||
|
if dataOff+udpGROCmsgPayload <= len(ctrl) {
|
||||||
|
return int(int32(binary.NativeEndian.Uint32(ctrl[dataOff : dataOff+udpGROCmsgPayload])))
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
// Advance by the aligned cmsg space. CmsgSpace(n) is the stride
|
||||||
|
// from one header to the next (len aligned up to the platform's
|
||||||
|
// cmsg alignment).
|
||||||
|
off += unix.CmsgSpace(clen - unix.CmsgLen(0))
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
func (u *StdConn) ListenOut(r EncReader, flush func()) error {
|
func (u *StdConn) ListenOut(r EncReader, flush func()) error {
|
||||||
if u.batch == 1 {
|
if u.batch == 1 {
|
||||||
return u.listenOutSingle(r, flush)
|
return u.listenOutSingle(r, flush)
|
||||||
|
|||||||
@@ -30,13 +30,18 @@ type rawMessage struct {
|
|||||||
Len uint32
|
Len uint32
|
||||||
}
|
}
|
||||||
|
|
||||||
func (u *StdConn) PrepareRawMessages(n int) ([]rawMessage, [][]byte, [][]byte) {
|
func (u *StdConn) PrepareRawMessages(n, bufSize, cmsgSpace int) ([]rawMessage, [][]byte, [][]byte, []byte) {
|
||||||
msgs := make([]rawMessage, n)
|
msgs := make([]rawMessage, n)
|
||||||
buffers := make([][]byte, n)
|
buffers := make([][]byte, n)
|
||||||
names := make([][]byte, n)
|
names := make([][]byte, n)
|
||||||
|
|
||||||
|
var cmsgs []byte
|
||||||
|
if cmsgSpace > 0 {
|
||||||
|
cmsgs = make([]byte, n*cmsgSpace)
|
||||||
|
}
|
||||||
|
|
||||||
for i := range msgs {
|
for i := range msgs {
|
||||||
buffers[i] = make([]byte, MTU)
|
buffers[i] = make([]byte, bufSize)
|
||||||
names[i] = make([]byte, unix.SizeofSockaddrInet6)
|
names[i] = make([]byte, unix.SizeofSockaddrInet6)
|
||||||
|
|
||||||
vs := []iovec{
|
vs := []iovec{
|
||||||
@@ -48,9 +53,14 @@ func (u *StdConn) PrepareRawMessages(n int) ([]rawMessage, [][]byte, [][]byte) {
|
|||||||
|
|
||||||
msgs[i].Hdr.Name = &names[i][0]
|
msgs[i].Hdr.Name = &names[i][0]
|
||||||
msgs[i].Hdr.Namelen = uint32(len(names[i]))
|
msgs[i].Hdr.Namelen = uint32(len(names[i]))
|
||||||
|
|
||||||
|
if cmsgSpace > 0 {
|
||||||
|
msgs[i].Hdr.Control = &cmsgs[i*cmsgSpace]
|
||||||
|
msgs[i].Hdr.Controllen = uint32(cmsgSpace)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return msgs, buffers, names
|
return msgs, buffers, names, cmsgs
|
||||||
}
|
}
|
||||||
|
|
||||||
func setIovLen(v *iovec, n int) {
|
func setIovLen(v *iovec, n int) {
|
||||||
|
|||||||
@@ -33,13 +33,18 @@ type rawMessage struct {
|
|||||||
Pad0 [4]byte
|
Pad0 [4]byte
|
||||||
}
|
}
|
||||||
|
|
||||||
func (u *StdConn) PrepareRawMessages(n int) ([]rawMessage, [][]byte, [][]byte) {
|
func (u *StdConn) PrepareRawMessages(n, bufSize, cmsgSpace int) ([]rawMessage, [][]byte, [][]byte, []byte) {
|
||||||
msgs := make([]rawMessage, n)
|
msgs := make([]rawMessage, n)
|
||||||
buffers := make([][]byte, n)
|
buffers := make([][]byte, n)
|
||||||
names := make([][]byte, n)
|
names := make([][]byte, n)
|
||||||
|
|
||||||
|
var cmsgs []byte
|
||||||
|
if cmsgSpace > 0 {
|
||||||
|
cmsgs = make([]byte, n*cmsgSpace)
|
||||||
|
}
|
||||||
|
|
||||||
for i := range msgs {
|
for i := range msgs {
|
||||||
buffers[i] = make([]byte, MTU)
|
buffers[i] = make([]byte, bufSize)
|
||||||
names[i] = make([]byte, unix.SizeofSockaddrInet6)
|
names[i] = make([]byte, unix.SizeofSockaddrInet6)
|
||||||
|
|
||||||
vs := []iovec{
|
vs := []iovec{
|
||||||
@@ -51,9 +56,14 @@ func (u *StdConn) PrepareRawMessages(n int) ([]rawMessage, [][]byte, [][]byte) {
|
|||||||
|
|
||||||
msgs[i].Hdr.Name = &names[i][0]
|
msgs[i].Hdr.Name = &names[i][0]
|
||||||
msgs[i].Hdr.Namelen = uint32(len(names[i]))
|
msgs[i].Hdr.Namelen = uint32(len(names[i]))
|
||||||
|
|
||||||
|
if cmsgSpace > 0 {
|
||||||
|
msgs[i].Hdr.Control = &cmsgs[i*cmsgSpace]
|
||||||
|
msgs[i].Hdr.Controllen = uint64(cmsgSpace)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return msgs, buffers, names
|
return msgs, buffers, names, cmsgs
|
||||||
}
|
}
|
||||||
|
|
||||||
func setIovLen(v *iovec, n int) {
|
func setIovLen(v *iovec, n int) {
|
||||||
|
|||||||
Reference in New Issue
Block a user