mirror of
https://github.com/slackhq/nebula.git
synced 2026-05-16 04:47:38 +02:00
new tun interface
This commit is contained in:
80
overlay/tio/queueset_poll_linux.go
Normal file
80
overlay/tio/queueset_poll_linux.go
Normal file
@@ -0,0 +1,80 @@
|
||||
package tio
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// pollQueueSet owns a shared shutdown eventfd and the Poll queues that
// reference it; closing the set wakes every queue before tearing it down.
type pollQueueSet struct {
	pq []*Poll
	// pqi is exactly the same as pq, but stored as the interface type
	// so Queues() can return it without building a new slice per call.
	pqi []Queue
	// shutdownFd is an eventfd watched by every Poll; writing to it wakes
	// blocked readers/writers. Set to -1 once Close has released it.
	shutdownFd int
}
|
||||
|
||||
func NewPollQueueSet() (QueueSet, error) {
|
||||
shutdownFd, err := unix.Eventfd(0, unix.EFD_NONBLOCK|unix.EFD_CLOEXEC)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create eventfd: %w", err)
|
||||
}
|
||||
|
||||
out := &pollQueueSet{
|
||||
pq: []*Poll{},
|
||||
pqi: []Queue{},
|
||||
shutdownFd: shutdownFd,
|
||||
}
|
||||
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (c *pollQueueSet) Queues() []Queue {
|
||||
return c.pqi
|
||||
}
|
||||
|
||||
func (c *pollQueueSet) Add(fd int) error {
|
||||
x, err := newPoll(fd, c.shutdownFd)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
c.pq = append(c.pq, x)
|
||||
c.pqi = append(c.pqi, x)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *pollQueueSet) wakeForShutdown() error {
|
||||
var buf [8]byte
|
||||
binary.NativeEndian.PutUint64(buf[:], 1)
|
||||
_, err := unix.Write(c.shutdownFd, buf[:])
|
||||
return err
|
||||
}
|
||||
|
||||
func (c *pollQueueSet) Close() error {
|
||||
if c.shutdownFd < 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
errs := []error{}
|
||||
|
||||
if err := c.wakeForShutdown(); err != nil {
|
||||
errs = append(errs, err)
|
||||
}
|
||||
|
||||
for _, x := range c.pq {
|
||||
if err := x.Close(); err != nil {
|
||||
errs = append(errs, err)
|
||||
}
|
||||
}
|
||||
|
||||
// All Polls reference shutdownFd in their pollfd arrays, so close it
|
||||
// only after every Poll.Close has returned.
|
||||
if err := unix.Close(c.shutdownFd); err != nil {
|
||||
errs = append(errs, err)
|
||||
}
|
||||
c.shutdownFd = -1
|
||||
|
||||
return errors.Join(errs...)
|
||||
}
|
||||
123
overlay/tio/tio.go
Normal file
123
overlay/tio/tio.go
Normal file
@@ -0,0 +1,123 @@
|
||||
package tio
|
||||
|
||||
import (
|
||||
"io"
|
||||
)
|
||||
|
||||
// QueueSet holds one or many Queue objects and helps close them in an orderly way.
type QueueSet interface {
	io.Closer
	// Queues returns the current members of the set.
	Queues() []Queue

	// Add takes a tun fd, adds it to the set, and prepares it for use as a Queue.
	Add(fd int) error
}
|
||||
|
||||
// Capabilities advertises which kernel offload features a Queue
// successfully negotiated. Callers consult this to decide which coalescers
// to wire onto the write path — a Queue without TSO can't usefully accept a
// TCPCoalescer, and a Queue without USO can't accept a UDPCoalescer.
type Capabilities struct {
	// TSO means the FD was opened with IFF_VNET_HDR and the kernel agreed
	// to TUN_F_TSO4|TSO6 — i.e. WriteGSO with GSOProtoTCP is safe.
	TSO bool
	// USO means the kernel additionally agreed to TUN_F_USO4|USO6, so
	// WriteGSO with GSOProtoUDP is safe. Linux ≥ 6.2.
	USO bool
}
|
||||
|
||||
// Queue is a readable/writable Poll queue. One Queue is driven by a single
// read goroutine plus a single writer (see Write below).
type Queue interface {
	io.Closer

	// Read returns one or more packets. The returned Packet.Bytes slices
	// are borrowed from the Queue's internal buffer and are only valid
	// until the next Read or Close on this Queue - callers must encrypt
	// or copy each slice before the next call. A Packet may carry a
	// GSO/USO superpacket (see GSOInfo); when GSO.IsSuperpacket() is
	// true the caller must segment Bytes before treating it as a single
	// IP datagram. Not safe for concurrent Reads.
	Read() ([]Packet, error)

	// Write emits a single packet on the plaintext (outside→inside)
	// delivery path. Not safe for concurrent Writes.
	Write(p []byte) (int, error)
}
|
||||
|
||||
// Packet is the unit Queue.Read returns. Bytes points into the queue's
// internal buffer and is only valid until the next Read or Close on the
// queue that produced it. GSO is the zero value for an already-segmented
// IP datagram; when non-zero it describes a kernel-supplied TSO/USO
// superpacket the caller must segment before consuming.
type Packet struct {
	// Bytes is borrowed from the producing queue; use Clone to retain it.
	Bytes []byte
	// GSO is zero unless Bytes holds a superpacket (see GSOInfo).
	GSO GSOInfo
}
|
||||
|
||||
// GSOInfo describes a kernel-supplied superpacket sitting in Packet.Bytes.
// The zero value means "not a superpacket" — Bytes is one regular IP
// datagram and no segmentation is required.
type GSOInfo struct {
	// Size is the GSO segment size: max payload bytes per segment
	// (== TCP MSS for TSO, == UDP payload chunk for USO). Zero means
	// not a superpacket.
	Size uint16
	// HdrLen is the total L3+L4 header length within Bytes (already
	// corrected via correctHdrLen, so safe to slice on).
	HdrLen uint16
	// CsumStart is the L4 header offset inside Bytes (== L3 header
	// length).
	CsumStart uint16
	// Proto picks the L4 protocol (TCP or UDP) so the segmenter knows
	// which checksum/header layout to apply.
	Proto GSOProto
}
|
||||
|
||||
// IsSuperpacket reports whether g describes a multi-segment GSO/USO
|
||||
// superpacket that needs segmentation before its bytes can be encrypted
|
||||
// and sent on the wire.
|
||||
func (g GSOInfo) IsSuperpacket() bool { return g.Size > 0 }
|
||||
|
||||
// Clone returns a Packet whose Bytes is a freshly allocated copy of p.Bytes,
|
||||
// safe to retain past the next Read or Close on the originating Queue.
|
||||
// GSO metadata is copied verbatim. Use this only when a caller genuinely
|
||||
// needs to outlive the borrowed-slice contract — the hot path reads should
|
||||
// continue to consume the borrow synchronously to avoid the allocation.
|
||||
func (p Packet) Clone() Packet {
|
||||
if p.Bytes == nil {
|
||||
return p
|
||||
}
|
||||
cp := make([]byte, len(p.Bytes))
|
||||
copy(cp, p.Bytes)
|
||||
return Packet{Bytes: cp, GSO: p.GSO}
|
||||
}
|
||||
|
||||
// CapsProvider is an optional interface implemented by Queues that
// successfully negotiated kernel offload features at open time. Callers
// pick a write-path coalescer based on the result. Queues that don't
// implement it are treated as having no offload capability — callers must
// fall back to plain per-packet writes.
type CapsProvider interface {
	// Capabilities reports the offloads negotiated when the Queue opened.
	Capabilities() Capabilities
}
|
||||
|
||||
// QueueCapabilities returns q's negotiated offload capabilities, or the
|
||||
// zero value when q does not advertise any.
|
||||
func QueueCapabilities(q Queue) Capabilities {
|
||||
if cp, ok := q.(CapsProvider); ok {
|
||||
return cp.Capabilities()
|
||||
}
|
||||
return Capabilities{}
|
||||
}
|
||||
|
||||
// GSOProto selects the L4 protocol for a GSO superpacket. Determines which
// VIRTIO_NET_HDR_GSO_* type the writer stamps and which checksum offset
// inside the transport header virtio NEEDS_CSUM expects.
type GSOProto uint8

const (
	// GSOProtoNone is the zero value: not a superpacket.
	GSOProtoNone GSOProto = iota
	// GSOProtoTCP marks a TSO (TCP segmentation) superpacket.
	GSOProtoTCP
	// GSOProtoUDP marks a USO (UDP segmentation) superpacket.
	GSOProtoUDP
)
|
||||
167
overlay/tio/tio_poll_linux.go
Normal file
167
overlay/tio/tio_poll_linux.go
Normal file
@@ -0,0 +1,167 @@
|
||||
package tio
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// Poll is a single tun queue driven by nonblocking read/write plus
// poll(2). Each Poll also watches the owning set's shutdown eventfd so
// that blocked calls wake when the set closes.
type Poll struct {
	fd int

	// readPoll / writePoll are the pollfd pairs handed to poll(2):
	// index 0 is the tun fd, index 1 is the shared shutdown eventfd.
	readPoll  [2]unix.PollFd
	writePoll [2]unix.PollFd
	writeLock sync.Mutex
	// closed makes Close idempotent.
	closed atomic.Bool

	// readBuf backs Read; returned Packet.Bytes borrow from it.
	readBuf []byte
	// batchRet lets Read return a one-element batch without allocating.
	batchRet [1]Packet
}
|
||||
|
||||
func newPoll(fd int, shutdownFd int) (*Poll, error) {
|
||||
if err := unix.SetNonblock(fd, true); err != nil {
|
||||
_ = unix.Close(fd)
|
||||
return nil, fmt.Errorf("failed to set Poll device as nonblocking: %w", err)
|
||||
}
|
||||
|
||||
out := &Poll{
|
||||
fd: fd,
|
||||
readBuf: make([]byte, 65535),
|
||||
readPoll: [2]unix.PollFd{
|
||||
{Fd: int32(fd), Events: unix.POLLIN},
|
||||
{Fd: int32(shutdownFd), Events: unix.POLLIN},
|
||||
},
|
||||
writePoll: [2]unix.PollFd{
|
||||
{Fd: int32(fd), Events: unix.POLLOUT},
|
||||
{Fd: int32(shutdownFd), Events: unix.POLLIN},
|
||||
},
|
||||
writeLock: sync.Mutex{},
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// blockOnRead waits until the Poll fd is readable or shutdown has been signaled.
|
||||
// Returns os.ErrClosed if Close was called.
|
||||
func (t *Poll) blockOnRead() error {
|
||||
const problemFlags = unix.POLLHUP | unix.POLLNVAL | unix.POLLERR
|
||||
var err error
|
||||
for {
|
||||
_, err = unix.Poll(t.readPoll[:], -1)
|
||||
if err != unix.EINTR {
|
||||
break
|
||||
}
|
||||
}
|
||||
tunEvents := t.readPoll[0].Revents
|
||||
shutdownEvents := t.readPoll[1].Revents
|
||||
t.readPoll[0].Revents = 0
|
||||
t.readPoll[1].Revents = 0
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if shutdownEvents&(unix.POLLIN|problemFlags) != 0 {
|
||||
return os.ErrClosed
|
||||
}
|
||||
if tunEvents&problemFlags != 0 {
|
||||
return os.ErrClosed
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// blockOnWrite waits until the Poll fd is writable or shutdown has been
// signaled. Returns os.ErrClosed on shutdown or on fd error conditions.
func (t *Poll) blockOnWrite() error {
	const problemFlags = unix.POLLHUP | unix.POLLNVAL | unix.POLLERR
	var err error
	for {
		// poll(2) with no timeout; retry when interrupted by a signal.
		_, err = unix.Poll(t.writePoll[:], -1)
		if err != unix.EINTR {
			break
		}
	}
	// NOTE(review): this lock guards only the Revents snapshot/reset, yet
	// unix.Poll above mutates writePoll outside it and the symmetric
	// blockOnRead takes no lock at all — confirm what writeLock is meant
	// to protect.
	t.writeLock.Lock()
	tunEvents := t.writePoll[0].Revents
	shutdownEvents := t.writePoll[1].Revents
	t.writePoll[0].Revents = 0
	t.writePoll[1].Revents = 0
	t.writeLock.Unlock()
	if err != nil {
		return err
	}
	if shutdownEvents&(unix.POLLIN|problemFlags) != 0 {
		// The shutdown eventfd fired (or errored): the set is closing.
		return os.ErrClosed
	}
	if tunEvents&problemFlags != 0 {
		// The tun fd itself reported hangup/error; treat as closed.
		return os.ErrClosed
	}
	return nil
}
|
||||
|
||||
func (t *Poll) Read() ([]Packet, error) {
|
||||
n, err := t.readOne(t.readBuf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
t.batchRet[0] = Packet{Bytes: t.readBuf[:n]}
|
||||
return t.batchRet[:], nil
|
||||
}
|
||||
|
||||
func (t *Poll) readOne(to []byte) (int, error) {
|
||||
for {
|
||||
n, errno := unix.Read(t.fd, to)
|
||||
if errno == nil {
|
||||
return n, nil
|
||||
}
|
||||
switch errno {
|
||||
case unix.EAGAIN:
|
||||
if err := t.blockOnRead(); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
case unix.EINTR:
|
||||
// retry
|
||||
case unix.EBADF:
|
||||
return 0, os.ErrClosed
|
||||
default:
|
||||
return 0, errno
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Poll) Write(from []byte) (int, error) {
|
||||
for {
|
||||
n, errno := unix.Write(t.fd, from)
|
||||
if errno == nil {
|
||||
return n, nil
|
||||
}
|
||||
switch errno {
|
||||
case unix.EAGAIN:
|
||||
if err := t.blockOnWrite(); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
case unix.EINTR:
|
||||
// retry
|
||||
case unix.EBADF:
|
||||
return 0, os.ErrClosed
|
||||
default:
|
||||
return 0, errno
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Close closes the tun fd exactly once; later calls are no-ops. The
// shared shutdown eventfd is deliberately left open — it is owned by the
// containing pollQueueSet.
func (t *Poll) Close() error {
	if t.closed.Swap(true) {
		// Someone already closed us.
		return nil
	}
	//shutdownFd is owned by the container, so we should not close it
	var err error
	// NOTE(review): t.fd is read by Read/Write without synchronization;
	// writing -1 here while a reader is still running is a data race —
	// confirm callers stop I/O (via the shutdown eventfd) before Close,
	// or consider making fd atomic.
	if t.fd >= 0 {
		err = unix.Close(t.fd)
		t.fd = -1
	}

	return err
}
|
||||
|
||||
func (t *Poll) Capabilities() Capabilities {
|
||||
return Capabilities{TSO: false, USO: false}
|
||||
}
|
||||
103
overlay/tio/tun_file_linux_test.go
Normal file
103
overlay/tio/tun_file_linux_test.go
Normal file
@@ -0,0 +1,103 @@
|
||||
//go:build linux && !android && !e2e_testing
|
||||
// +build linux,!android,!e2e_testing
|
||||
|
||||
package tio
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// newReadPipe returns a read fd. The matching write fd is registered for cleanup.
|
||||
// The caller takes ownership of the read fd (pass it into a QueueSet).
|
||||
func newReadPipe(t *testing.T) int {
|
||||
t.Helper()
|
||||
var fds [2]int
|
||||
if err := unix.Pipe2(fds[:], unix.O_CLOEXEC); err != nil {
|
||||
t.Fatalf("pipe2: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { _ = unix.Close(fds[1]) })
|
||||
return fds[0]
|
||||
}
|
||||
|
||||
// TestPoll_WakeForShutdown_WakesFriends verifies that closing the
// QueueSet wakes every blocked reader and that each returns os.ErrClosed.
func TestPoll_WakeForShutdown_WakesFriends(t *testing.T) {
	pipe1 := newReadPipe(t)
	pipe2 := newReadPipe(t)
	parent, err := NewPollQueueSet()
	require.NoError(t, err)
	require.NoError(t, parent.Add(pipe1))
	require.NoError(t, parent.Add(pipe2))
	// NOTE(review): parent.Close below closes pipe1/pipe2 via Poll.Close,
	// so these Cleanups double-close the fds (errors deliberately
	// ignored) — confirm that is intentional.
	t.Cleanup(func() {
		_ = unix.Close(pipe1)
		_ = unix.Close(pipe2)
	})

	readers := parent.Queues()
	// errs[i] is written only by goroutine i and read only after wg.Wait
	// has completed (observed via done), so no extra locking is needed.
	errs := make([]error, len(readers))
	var wg sync.WaitGroup
	for i, r := range readers {
		wg.Add(1)
		go func(i int, r Queue) {
			defer wg.Done()
			_, errs[i] = r.Read()
		}(i, r)
	}

	// Give the readers time to park in poll(2) before shutting down.
	time.Sleep(50 * time.Millisecond)

	if err := parent.Close(); err != nil {
		t.Fatalf("Close: %v", err)
	}

	// Wait for all readers to wake, but bound it so a missed wakeup fails
	// the test instead of hanging it.
	done := make(chan struct{})
	go func() { wg.Wait(); close(done) }()
	select {
	case <-done:
	case <-time.After(2 * time.Second):
		t.Fatal("readers did not wake")
	}

	for i, err := range errs {
		if !errors.Is(err, os.ErrClosed) {
			t.Errorf("reader %d: expected os.ErrClosed, got %v", i, err)
		}
	}
}
|
||||
|
||||
func TestPoll_Close_Idempotent(t *testing.T) {
|
||||
tf, err := newPoll(newReadPipe(t), 1)
|
||||
require.NoError(t, err)
|
||||
if err := tf.Close(); err != nil {
|
||||
t.Fatalf("first Close: %v", err)
|
||||
}
|
||||
if err := tf.Close(); err != nil {
|
||||
t.Fatalf("second Close should be a no-op, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPollQueueSet_Close_ClosesEventfd(t *testing.T) {
|
||||
qs, err := NewPollQueueSet()
|
||||
require.NoError(t, err)
|
||||
require.NoError(t, qs.Add(newReadPipe(t)))
|
||||
|
||||
fd := qs.(*pollQueueSet).shutdownFd
|
||||
require.NoError(t, qs.Close())
|
||||
|
||||
// Closing the eventfd again should fail with EBADF, proving Close
|
||||
// actually released it.
|
||||
if err := unix.Close(fd); err == nil {
|
||||
t.Fatalf("eventfd %d still open after QueueSet.Close", fd)
|
||||
}
|
||||
|
||||
// Second Close must be a no-op (and must not double-close the eventfd
|
||||
// in case the kernel handed it out to another caller in the meantime).
|
||||
if err := qs.Close(); err != nil {
|
||||
t.Fatalf("second Close: %v", err)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user