holy crap 2x

This commit is contained in:
JackDoan
2026-04-17 15:33:46 -05:00
parent 1fd24a19c7
commit 60e556866a
5 changed files with 643 additions and 307 deletions

View File

@@ -31,21 +31,24 @@ type Device interface {
NewMultiQueueReader() (Queue, error)
}
// GSOWriter is implemented by Queues that can write a TCP TSO superpacket as
// a single virtio_net_hdr + payload writev, letting the kernel segment on
// egress. Callers type-assert on it; backends that don't support GSO return
// false from Supported and all coalescing logic is skipped.
// GSOWriter is implemented by Queues that can emit a TCP TSO superpacket
// assembled from a header prefix plus one or more borrowed payload
// fragments, in a single vectored write (writev with a leading
// virtio_net_hdr). This lets the coalescer avoid copying payload bytes
// between the caller's decrypt buffer and the TUN. Backends without GSO
// support return false from GSOSupported and coalescing is skipped.
//
// pkt must contain the IPv4/IPv6 + TCP header plus the concatenated
// coalesced payload. hdrLen is the total L3+L4 header length (where the
// payload starts). csumStart is the byte offset where the TCP header
// begins (= IP header length). gsoSize is the MSS — every segment except
// possibly the last must be exactly this many payload bytes. isV6 selects
// GSO_TCPV4 vs GSO_TCPV6.
// hdr contains the IPv4/IPv6 + TCP header prefix (mutable — callers will
// have filled in total length and pseudo-header partial). pays are
// non-overlapping payload fragments whose concatenation is the full
// superpacket payload; they are read-only from the writer's perspective
// and must remain valid until the call returns. gsoSize is the MSS:
// every segment except possibly the last is exactly that many bytes.
// csumStart is the byte offset where the TCP header begins within hdr.
//
// pkt's TCP checksum field must already hold the pseudo-header partial
// hdr's TCP checksum field must already hold the pseudo-header partial
// sum (single-fold, not inverted), per virtio NEEDS_CSUM semantics.
type GSOWriter interface {
WriteGSO(pkt []byte, gsoSize uint16, isV6 bool, hdrLen, csumStart uint16) error
WriteGSO(hdr []byte, pays [][]byte, gsoSize uint16, isV6 bool, csumStart uint16) error
GSOSupported() bool
}

View File

@@ -53,9 +53,17 @@ type tunFile struct {
// by WriteGSO. Separate from zeroVnetHdr so a concurrent non-GSO Write on
// another queue never observes a half-written header.
gsoHdrBuf [virtioNetHdrLen]byte
gsoIovs [2]unix.Iovec
// gsoIovs is the writev iovec scratch for WriteGSO. Sized to hold the
// virtio header + IP/TCP header + up to gsoInitialPayIovs payload
// fragments; grown on demand if a coalescer pushes more.
gsoIovs []unix.Iovec
}
// gsoInitialPayIovs is the starting capacity (in payload fragments) of
// tunFile.gsoIovs. Sized to cover the default coalesce segment cap without
// any reallocations.
const gsoInitialPayIovs = 66
// zeroVnetHdr is the 10-byte virtio_net_hdr we prepend to every TUN write when
// IFF_VNET_HDR is active. All-zero signals "no GSO, no checksum offload"; the
// kernel accepts the packet as-is.
@@ -84,6 +92,7 @@ func (r *tunFile) newFriend(fd int) (*tunFile, error) {
out.segBuf = make([]byte, tunSegBufCap)
out.writeIovs[0].Base = &zeroVnetHdr[0]
out.writeIovs[0].SetLen(virtioNetHdrLen)
out.gsoIovs = make([]unix.Iovec, 2, 2+gsoInitialPayIovs)
out.gsoIovs[0].Base = &out.gsoHdrBuf[0]
out.gsoIovs[0].SetLen(virtioNetHdrLen)
}
@@ -119,6 +128,7 @@ func newTunFd(fd int, vnetHdr bool) (*tunFile, error) {
out.segBuf = make([]byte, tunSegBufCap)
out.writeIovs[0].Base = &zeroVnetHdr[0]
out.writeIovs[0].SetLen(virtioNetHdrLen)
out.gsoIovs = make([]unix.Iovec, 2, 2+gsoInitialPayIovs)
out.gsoIovs[0].Base = &out.gsoHdrBuf[0]
out.gsoIovs[0].SetLen(virtioNetHdrLen)
}
@@ -346,46 +356,79 @@ func (r *tunFile) Write(buf []byte) (int, error) {
// Write calls.
func (r *tunFile) GSOSupported() bool { return r.vnetHdr }
// WriteGSO emits pkt as a single TCP TSO superpacket via writev. pkt must
// contain a full IPv4/IPv6 + TCP header prefix followed by the concatenated
// coalesced payload. The TCP checksum field must already hold the
// pseudo-header partial (NEEDS_CSUM semantics). gsoSize is the MSS; every
// segment except the last must be exactly that many payload bytes.
func (r *tunFile) WriteGSO(pkt []byte, gsoSize uint16, isV6 bool, hdrLen, csumStart uint16) error {
// WriteGSO emits a TCP TSO superpacket in a single writev. hdr is the
// IPv4/IPv6 + TCP header prefix (already finalized — total length, IP csum,
// and TCP pseudo-header partial set by the caller). pays are payload
// fragments whose concatenation forms the full coalesced payload; each
// slice is read-only and must stay valid until return. gsoSize is the MSS;
// every segment except possibly the last is exactly gsoSize bytes.
// csumStart is the byte offset where the TCP header begins within hdr.
func (r *tunFile) WriteGSO(hdr []byte, pays [][]byte, gsoSize uint16, isV6 bool, csumStart uint16) error {
if !r.vnetHdr {
return fmt.Errorf("WriteGSO called on tun without IFF_VNET_HDR")
}
if len(pkt) == 0 {
if len(hdr) == 0 || len(pays) == 0 {
return nil
}
hdr := virtioNetHdr{
// Build the virtio_net_hdr. When pays total to <= gsoSize the kernel
// would produce a single segment; keep NEEDS_CSUM semantics but skip
// the GSO type so the kernel doesn't spuriously mark this as TSO.
vhdr := virtioNetHdr{
Flags: unix.VIRTIO_NET_HDR_F_NEEDS_CSUM,
HdrLen: hdrLen,
HdrLen: uint16(len(hdr)),
GSOSize: gsoSize,
CsumStart: csumStart,
CsumOffset: 16, // TCP checksum field lives 16 bytes into the TCP header
}
if isV6 {
hdr.GSOType = unix.VIRTIO_NET_HDR_GSO_TCPV6
} else {
hdr.GSOType = unix.VIRTIO_NET_HDR_GSO_TCPV4
var totalPay int
for _, p := range pays {
totalPay += len(p)
}
if totalPay > int(gsoSize) {
if isV6 {
vhdr.GSOType = unix.VIRTIO_NET_HDR_GSO_TCPV6
} else {
vhdr.GSOType = unix.VIRTIO_NET_HDR_GSO_TCPV4
}
} else {
vhdr.GSOType = unix.VIRTIO_NET_HDR_GSO_NONE
vhdr.GSOSize = 0
}
vhdr.encode(r.gsoHdrBuf[:])
// Build the iovec array: [virtio_hdr, hdr, pays...]. r.gsoIovs[0] is
// wired to gsoHdrBuf at construction and never changes.
need := 2 + len(pays)
if cap(r.gsoIovs) < need {
grown := make([]unix.Iovec, need)
grown[0] = r.gsoIovs[0]
r.gsoIovs = grown
} else {
r.gsoIovs = r.gsoIovs[:need]
}
r.gsoIovs[1].Base = &hdr[0]
r.gsoIovs[1].SetLen(len(hdr))
for i, p := range pays {
r.gsoIovs[2+i].Base = &p[0]
r.gsoIovs[2+i].SetLen(len(p))
}
hdr.encode(r.gsoHdrBuf[:])
r.gsoIovs[1].Base = &pkt[0]
r.gsoIovs[1].SetLen(len(pkt))
iovPtr := uintptr(unsafe.Pointer(&r.gsoIovs[0]))
iovCnt := uintptr(len(r.gsoIovs))
for {
n, _, errno := syscall.RawSyscall(unix.SYS_WRITEV, uintptr(r.fd), iovPtr, 2)
n, _, errno := syscall.RawSyscall(unix.SYS_WRITEV, uintptr(r.fd), iovPtr, iovCnt)
if errno == 0 {
runtime.KeepAlive(pkt)
runtime.KeepAlive(hdr)
runtime.KeepAlive(pays)
if int(n) < virtioNetHdrLen {
return io.ErrShortWrite
}
return nil
}
if errno == unix.EAGAIN {
runtime.KeepAlive(pkt)
runtime.KeepAlive(hdr)
runtime.KeepAlive(pays)
if err := r.blockOnWrite(); err != nil {
return err
}
@@ -394,7 +437,8 @@ func (r *tunFile) WriteGSO(pkt []byte, gsoSize uint16, isV6 bool, hdrLen, csumSt
if errno == unix.EINTR {
continue
}
runtime.KeepAlive(pkt)
runtime.KeepAlive(hdr)
runtime.KeepAlive(pays)
return errno
}
}