holy crap 2x

This commit is contained in:
JackDoan
2026-04-17 14:56:18 -05:00
parent f60cbfdc71
commit 1fd24a19c7
13 changed files with 928 additions and 13 deletions

View File

@@ -30,3 +30,22 @@ type Device interface {
SupportsMultiqueue() bool
NewMultiQueueReader() (Queue, error)
}
// GSOWriter is an optional capability of a Queue: the ability to hand the
// kernel one TCP TSO superpacket (a virtio_net_hdr followed by the payload,
// written with a single writev) and let the kernel do the segmentation on
// egress. Callers discover it with a type assertion. A backend without GSO
// support reports false from GSOSupported, and callers then skip all
// coalescing logic.
//
// WriteGSO arguments:
//   - pkt holds the IPv4/IPv6 header, the TCP header, and then the
//     concatenated coalesced payload.
//   - gsoSize is the MSS; every segment except possibly the last must carry
//     exactly this many payload bytes.
//   - isV6 selects GSO_TCPV6 instead of GSO_TCPV4.
//   - hdrLen is the combined L3+L4 header length, i.e. the offset at which
//     the payload starts.
//   - csumStart is the byte offset of the TCP header (equal to the IP header
//     length).
//
// Before calling WriteGSO, the TCP checksum field inside pkt must already
// contain the pseudo-header partial sum (single-fold, not inverted), as
// required by virtio NEEDS_CSUM semantics.
type GSOWriter interface {
	// GSOSupported reports whether WriteGSO may be used on this queue.
	GSOSupported() bool
	// WriteGSO writes pkt as one TSO superpacket described by the remaining
	// arguments.
	WriteGSO(pkt []byte, gsoSize uint16, isV6 bool, hdrLen uint16, csumStart uint16) error
}

View File

@@ -48,6 +48,12 @@ type tunFile struct {
pending [][]byte // segments waiting to be drained by Read
pendingIdx int
writeIovs [2]unix.Iovec // preallocated iovecs for vnetHdr writes; iovs[0] is fixed to zeroVnetHdr
// gsoHdrBuf is a per-queue 10-byte scratch for the virtio_net_hdr emitted
// by WriteGSO. Separate from zeroVnetHdr so a concurrent non-GSO Write on
// another queue never observes a half-written header.
gsoHdrBuf [virtioNetHdrLen]byte
gsoIovs [2]unix.Iovec
}
// zeroVnetHdr is the 10-byte virtio_net_hdr we prepend to every TUN write when
@@ -78,6 +84,8 @@ func (r *tunFile) newFriend(fd int) (*tunFile, error) {
out.segBuf = make([]byte, tunSegBufCap)
out.writeIovs[0].Base = &zeroVnetHdr[0]
out.writeIovs[0].SetLen(virtioNetHdrLen)
out.gsoIovs[0].Base = &out.gsoHdrBuf[0]
out.gsoIovs[0].SetLen(virtioNetHdrLen)
}
return out, nil
}
@@ -111,6 +119,8 @@ func newTunFd(fd int, vnetHdr bool) (*tunFile, error) {
out.segBuf = make([]byte, tunSegBufCap)
out.writeIovs[0].Base = &zeroVnetHdr[0]
out.writeIovs[0].SetLen(virtioNetHdrLen)
out.gsoIovs[0].Base = &out.gsoHdrBuf[0]
out.gsoIovs[0].SetLen(virtioNetHdrLen)
}
return out, nil
@@ -331,6 +341,64 @@ func (r *tunFile) Write(buf []byte) (int, error) {
}
}
// GSOSupported tells the caller whether WriteGSO is usable on this queue.
// It is true exactly when the queue was opened with IFF_VNET_HDR (the
// vnetHdr flag); otherwise callers should fall back to issuing one Write
// per segment.
func (r *tunFile) GSOSupported() bool {
	return r.vnetHdr
}
// WriteGSO emits pkt as a single TCP TSO superpacket: one writev consisting
// of a freshly encoded virtio_net_hdr (in r.gsoHdrBuf) followed by pkt.
//
// pkt must contain a full IPv4/IPv6 + TCP header prefix followed by the
// concatenated coalesced payload, and its TCP checksum field must already
// hold the pseudo-header partial sum (NEEDS_CSUM semantics). gsoSize is the
// MSS; every segment except the last must be exactly that many payload
// bytes. isV6 selects GSO_TCPV6 over GSO_TCPV4. hdrLen and csumStart are
// copied verbatim into the virtio_net_hdr.
//
// Returns an error if the queue was not opened with IFF_VNET_HDR, nil
// without writing anything when pkt is empty, io.ErrShortWrite if the kernel
// reports fewer than virtioNetHdrLen bytes written, the error from
// blockOnWrite if waiting after EAGAIN fails, or the raw errno for any other
// writev failure. EINTR is retried immediately.
func (r *tunFile) WriteGSO(pkt []byte, gsoSize uint16, isV6 bool, hdrLen, csumStart uint16) error {
	if !r.vnetHdr {
		return fmt.Errorf("WriteGSO called on tun without IFF_VNET_HDR")
	}
	if len(pkt) == 0 {
		return nil
	}

	hdr := virtioNetHdr{
		Flags:      unix.VIRTIO_NET_HDR_F_NEEDS_CSUM,
		GSOType:    unix.VIRTIO_NET_HDR_GSO_TCPV4,
		HdrLen:     hdrLen,
		GSOSize:    gsoSize,
		CsumStart:  csumStart,
		CsumOffset: 16, // TCP checksum field lives 16 bytes into the TCP header
	}
	if isV6 {
		hdr.GSOType = unix.VIRTIO_NET_HDR_GSO_TCPV6
	}
	hdr.encode(r.gsoHdrBuf[:])

	// gsoIovs[0] was pointed at gsoHdrBuf when the queue was constructed;
	// only the payload iovec changes per call.
	r.gsoIovs[1].Base = &pkt[0]
	r.gsoIovs[1].SetLen(len(pkt))

	for {
		// The unsafe.Pointer -> uintptr conversion must appear in the
		// syscall's argument list itself (unsafe.Pointer rule 4); a uintptr
		// held in a local variable is just an integer and does not keep the
		// referenced object alive across the call.
		n, _, errno := syscall.RawSyscall(
			unix.SYS_WRITEV,
			uintptr(r.fd),
			uintptr(unsafe.Pointer(&r.gsoIovs[0])),
			2,
		)
		// The kernel only saw pkt through a raw address; keep the slice
		// reachable until the syscall has returned.
		runtime.KeepAlive(pkt)

		switch errno {
		case 0:
			if int(n) < virtioNetHdrLen {
				return io.ErrShortWrite
			}
			return nil
		case unix.EAGAIN:
			// Not writable right now: blockOnWrite is presumably a wait for
			// writability (confirm against its definition), then retry.
			if err := r.blockOnWrite(); err != nil {
				return err
			}
		case unix.EINTR:
			// Interrupted before any data was written; retry immediately.
		default:
			return errno
		}
	}
}
func (r *tunFile) wakeForShutdown() error {
var buf [8]byte
binary.NativeEndian.PutUint64(buf[:], 1)

View File

@@ -54,6 +54,18 @@ func (h *virtioNetHdr) decode(b []byte) {
h.CsumOffset = binary.NativeEndian.Uint16(b[8:10])
}
// encode serialises h into the first virtioNetHdrLen (10) bytes of b, which
// must be at least that long. It mirrors decode: Flags and GSOType occupy
// bytes 0 and 1, followed by HdrLen, GSOSize, CsumStart and CsumOffset as
// consecutive native-endian 16-bit words. It is used to build the header
// that precedes a TSO superpacket on egress.
func (h *virtioNetHdr) encode(b []byte) {
	b[0] = h.Flags
	b[1] = h.GSOType

	// The four 16-bit fields are laid out back to back starting at offset 2.
	words := [...]uint16{h.HdrLen, h.GSOSize, h.CsumStart, h.CsumOffset}
	for i, w := range words {
		off := 2 + 2*i
		binary.NativeEndian.PutUint16(b[off:off+2], w)
	}
}
// segmentInto splits a TUN-side packet described by hdr into one or more
// IP packets, each appended to *out as a slice of scratch. scratch must be
// sized to hold every segment (including replicated headers).