Compare commits

..

13 Commits

Author SHA1 Message Date
JackDoan
e7423d39f9 cursed 2025-11-06 09:18:33 -06:00
JackDoan
befba57366 hmmm 2025-11-05 15:38:47 -06:00
Ryan
2d128a3254 add locking for stop crash 2025-11-05 11:58:25 -05:00
Ryan
c8980d34cf fixes 2025-11-05 10:54:08 -05:00
Ryan
98f264cf14 works well 2025-11-04 19:33:52 -05:00
Ryan
aa44f4c7c9 hmmmmmm it works i guess maybe 2025-11-04 16:08:31 -05:00
Ryan Huber
419157c407 passes traffic 2025-11-04 04:50:35 +00:00
Ryan Huber
0864852d33 updated bind 2025-11-04 04:39:07 +00:00
Ryan Huber
2b5aec9a18 updated udp 2025-11-04 04:34:59 +00:00
Ryan Huber
f0665bee20 pem.go restored 2025-11-04 04:32:22 +00:00
Ryan Huber
11da0baab1 quick fix 2025-11-04 04:21:27 +00:00
Ryan Huber
608904b9dd add new files for compat layer 2025-11-04 04:10:51 +00:00
Ryan Huber
fd1c52127f first try 2025-11-04 04:00:29 +00:00
99 changed files with 4938 additions and 5564 deletions

View File

@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: actions/setup-go@v6
with:

View File

@@ -10,7 +10,7 @@ jobs:
name: Build Linux/BSD All
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: actions/setup-go@v6
with:
@@ -24,7 +24,7 @@ jobs:
mv build/*.tar.gz release
- name: Upload artifacts
uses: actions/upload-artifact@v5
uses: actions/upload-artifact@v4
with:
name: linux-latest
path: release
@@ -33,7 +33,7 @@ jobs:
name: Build Windows
runs-on: windows-latest
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: actions/setup-go@v6
with:
@@ -55,7 +55,7 @@ jobs:
mv dist\windows\wintun build\dist\windows\
- name: Upload artifacts
uses: actions/upload-artifact@v5
uses: actions/upload-artifact@v4
with:
name: windows-latest
path: build
@@ -66,7 +66,7 @@ jobs:
HAS_SIGNING_CREDS: ${{ secrets.AC_USERNAME != '' }}
runs-on: macos-latest
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: actions/setup-go@v6
with:
@@ -104,7 +104,7 @@ jobs:
fi
- name: Upload artifacts
uses: actions/upload-artifact@v5
uses: actions/upload-artifact@v4
with:
name: darwin-latest
path: ./release/*
@@ -124,11 +124,11 @@ jobs:
# be overwritten
- name: Checkout code
if: ${{ env.HAS_DOCKER_CREDS == 'true' }}
uses: actions/checkout@v5
uses: actions/checkout@v4
- name: Download artifacts
if: ${{ env.HAS_DOCKER_CREDS == 'true' }}
uses: actions/download-artifact@v6
uses: actions/download-artifact@v4
with:
name: linux-latest
path: artifacts
@@ -160,10 +160,10 @@ jobs:
needs: [build-linux, build-darwin, build-windows]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- name: Download artifacts
uses: actions/download-artifact@v6
uses: actions/download-artifact@v4
with:
path: artifacts

View File

@@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: actions/setup-go@v6
with:

View File

@@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: actions/setup-go@v6
with:

View File

@@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: actions/setup-go@v6
with:
@@ -32,7 +32,7 @@ jobs:
run: make vet
- name: golangci-lint
uses: golangci/golangci-lint-action@v9
uses: golangci/golangci-lint-action@v8
with:
version: v2.5
@@ -45,7 +45,7 @@ jobs:
- name: Build test mobile
run: make build-test-mobile
- uses: actions/upload-artifact@v5
- uses: actions/upload-artifact@v4
with:
name: e2e packet flow linux-latest
path: e2e/mermaid/linux-latest
@@ -56,7 +56,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: actions/setup-go@v6
with:
@@ -77,7 +77,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: actions/setup-go@v6
with:
@@ -98,7 +98,7 @@ jobs:
os: [windows-latest, macos-latest]
steps:
- uses: actions/checkout@v5
- uses: actions/checkout@v4
- uses: actions/setup-go@v6
with:
@@ -115,7 +115,7 @@ jobs:
run: make vet
- name: golangci-lint
uses: golangci/golangci-lint-action@v9
uses: golangci/golangci-lint-action@v8
with:
version: v2.5
@@ -125,7 +125,7 @@ jobs:
- name: End 2 end
run: make e2evv
- uses: actions/upload-artifact@v5
- uses: actions/upload-artifact@v4
with:
name: e2e packet flow ${{ matrix.os }}
path: e2e/mermaid/${{ matrix.os }}

164
batch_pipeline.go Normal file
View File

@@ -0,0 +1,164 @@
package nebula
import (
"net/netip"
"github.com/slackhq/nebula/overlay"
"github.com/slackhq/nebula/udp"
)
// batchPipelines tracks whether the inside device can operate on packet batches
// and, if so, holds the shared packet pool sized for the virtio headroom and
// payload limits advertised by the device. It also owns the fan-in/fan-out
// queues between the TUN readers, encrypt/decrypt workers, and the UDP writers.
type batchPipelines struct {
	enabled    bool                        // true only after init succeeds on a BatchCapableDevice
	inside     overlay.BatchCapableDevice  // the batch-capable view of the inside (TUN) device
	headroom   int                         // per-packet headroom reported by BatchHeadroom()
	payloadCap int                         // per-packet payload capacity; at least udp.MTU * maxSegments
	pool       *overlay.PacketPool         // shared pool sized with headroom/payloadCap
	batchSize  int                         // device batch size reported by BatchSize()
	routines   int                         // number of worker routines; one queue of each kind per routine
	rxQueues   []chan *overlay.Packet      // per-routine queues: UDP reader -> decrypt workers
	txQueues   []chan queuedDatagram       // per-routine queues: encrypt workers -> UDP writers
	tunQueues  []chan *overlay.Packet      // per-routine queues: decrypt workers -> TUN writers
}
// queuedDatagram pairs an encrypted packet with the remote address it should
// be sent to; it is the element type of the txQueues.
type queuedDatagram struct {
	packet *overlay.Packet // payload to transmit (ownership transfers to the UDP writer)
	addr   netip.AddrPort  // destination remote address
}
// init probes device for batch capability and, when present, sizes the shared
// packet pool and allocates one rx/tx/tun queue per routine. It is a no-op
// (leaving bp disabled) when the device is nil, routines is non-positive, the
// device is not batch capable, or the device advertises non-positive limits.
func (bp *batchPipelines) init(device overlay.Device, routines int, queueDepth int, maxSegments int) {
	if device == nil || routines <= 0 {
		return
	}
	capable, ok := device.(overlay.BatchCapableDevice)
	if !ok {
		return
	}

	head := capable.BatchHeadroom()
	segments := maxSegments
	if segments < 1 {
		segments = 1
	}
	// Guarantee room for the largest UDP datagram we may coalesce.
	payload := capable.BatchPayloadCap()
	if required := udp.MTU * segments; payload < required {
		payload = required
	}
	size := capable.BatchSize()
	if head <= 0 || payload <= 0 || size <= 0 {
		return
	}

	bp.enabled = true
	bp.inside = capable
	bp.headroom = head
	bp.payloadCap = payload
	bp.batchSize = size
	bp.routines = routines
	bp.pool = overlay.NewPacketPool(head, payload)

	// Queue depth: explicit config wins, otherwise a multiple of the batch
	// size; either way never below one full batch.
	depth := size * defaultBatchQueueDepthFactor
	if queueDepth > 0 {
		depth = queueDepth
	}
	if depth < size {
		depth = size
	}

	bp.rxQueues = make([]chan *overlay.Packet, routines)
	bp.txQueues = make([]chan queuedDatagram, routines)
	bp.tunQueues = make([]chan *overlay.Packet, routines)
	for i := range bp.rxQueues {
		bp.rxQueues[i] = make(chan *overlay.Packet, depth)
		bp.txQueues[i] = make(chan queuedDatagram, depth)
		bp.tunQueues[i] = make(chan *overlay.Packet, depth)
	}
}
// Pool returns the shared packet pool, or nil when batching is disabled.
func (bp *batchPipelines) Pool() *overlay.PacketPool {
	if bp != nil && bp.enabled {
		return bp.pool
	}
	return nil
}
// Enabled reports whether batch processing was successfully initialized.
// Safe to call on a nil receiver.
func (bp *batchPipelines) Enabled() bool {
	if bp == nil {
		return false
	}
	return bp.enabled
}
// batchSizeHint returns the device batch size, falling back to 1 when the
// pipelines are nil or no positive batch size was recorded.
func (bp *batchPipelines) batchSizeHint() int {
	if bp != nil && bp.batchSize > 0 {
		return bp.batchSize
	}
	return 1
}
// rxQueue returns routine i's inbound (UDP -> decrypt) queue, or nil when
// batching is disabled or i is out of range.
func (bp *batchPipelines) rxQueue(i int) chan *overlay.Packet {
	if bp != nil && bp.enabled && i >= 0 && i < len(bp.rxQueues) {
		return bp.rxQueues[i]
	}
	return nil
}
// txQueue returns routine i's outbound (encrypt -> UDP writer) queue, or nil
// when batching is disabled or i is out of range.
func (bp *batchPipelines) txQueue(i int) chan queuedDatagram {
	if bp != nil && bp.enabled && i >= 0 && i < len(bp.txQueues) {
		return bp.txQueues[i]
	}
	return nil
}
// tunQueue returns routine i's (decrypt -> TUN writer) queue, or nil when
// batching is disabled or i is out of range.
func (bp *batchPipelines) tunQueue(i int) chan *overlay.Packet {
	if bp != nil && bp.enabled && i >= 0 && i < len(bp.tunQueues) {
		return bp.tunQueues[i]
	}
	return nil
}
// txQueueLen reports the number of datagrams currently buffered in routine
// i's tx queue; 0 when the queue does not exist.
func (bp *batchPipelines) txQueueLen(i int) int {
	if q := bp.txQueue(i); q != nil {
		return len(q)
	}
	return 0
}
// tunQueueLen reports the number of packets currently buffered in routine
// i's tun queue; 0 when the queue does not exist.
func (bp *batchPipelines) tunQueueLen(i int) int {
	if q := bp.tunQueue(i); q != nil {
		return len(q)
	}
	return 0
}
// enqueueRx hands pkt to routine i's rx queue, blocking if the queue is full.
// Returns false (without consuming pkt) when the queue does not exist.
func (bp *batchPipelines) enqueueRx(i int, pkt *overlay.Packet) bool {
	queue := bp.rxQueue(i)
	if queue == nil {
		return false
	}
	queue <- pkt
	return true
}
// enqueueTx hands pkt (destined for addr) to routine i's tx queue, blocking
// if the queue is full. Returns false (without consuming pkt) when the queue
// does not exist.
func (bp *batchPipelines) enqueueTx(i int, pkt *overlay.Packet, addr netip.AddrPort) bool {
	queue := bp.txQueue(i)
	if queue == nil {
		return false
	}
	queue <- queuedDatagram{packet: pkt, addr: addr}
	return true
}
// enqueueTun hands pkt to routine i's tun queue, blocking if the queue is
// full. Returns false (without consuming pkt) when the queue does not exist.
func (bp *batchPipelines) enqueueTun(i int, pkt *overlay.Packet) bool {
	queue := bp.tunQueue(i)
	if queue == nil {
		return false
	}
	queue <- pkt
	return true
}
// newPacket gets a packet from the shared pool, or nil when batching is
// disabled or no pool was created.
func (bp *batchPipelines) newPacket() *overlay.Packet {
	if bp != nil && bp.enabled && bp.pool != nil {
		return bp.pool.Get()
	}
	return nil
}

View File

@@ -43,7 +43,7 @@ func (b *Bits) Check(l logrus.FieldLogger, i uint64) bool {
}
// Not within the window
l.Debugf("rejected a packet (top) %d %d delta %d\n", b.current, i, b.current-i)
l.Debugf("rejected a packet (top) %d %d\n", b.current, i)
return false
}

View File

@@ -1,8 +1,10 @@
package cert
import (
"encoding/hex"
"encoding/pem"
"fmt"
"time"
"golang.org/x/crypto/ed25519"
)
@@ -138,6 +140,101 @@ func MarshalSigningPrivateKeyToPEM(curve Curve, b []byte) []byte {
}
}
// Backward compatibility functions for older API
func MarshalX25519PublicKey(b []byte) []byte {
return MarshalPublicKeyToPEM(Curve_CURVE25519, b)
}
func MarshalX25519PrivateKey(b []byte) []byte {
return MarshalPrivateKeyToPEM(Curve_CURVE25519, b)
}
func MarshalPublicKey(curve Curve, b []byte) []byte {
return MarshalPublicKeyToPEM(curve, b)
}
func MarshalPrivateKey(curve Curve, b []byte) []byte {
return MarshalPrivateKeyToPEM(curve, b)
}
// NebulaCertificate is a compatibility wrapper for the old API
type NebulaCertificate struct {
Details NebulaCertificateDetails
Signature []byte
cert Certificate
}
// NebulaCertificateDetails is a compatibility wrapper for certificate details
type NebulaCertificateDetails struct {
Name string
NotBefore time.Time
NotAfter time.Time
PublicKey []byte
IsCA bool
Issuer []byte
Curve Curve
}
// UnmarshalNebulaCertificateFromPEM provides backward compatibility with the old API
func UnmarshalNebulaCertificateFromPEM(b []byte) (*NebulaCertificate, []byte, error) {
c, rest, err := UnmarshalCertificateFromPEM(b)
if err != nil {
return nil, rest, err
}
issuerBytes, err := func() ([]byte, error) {
issuer := c.Issuer()
if issuer == "" {
return nil, nil
}
decoded, err := hex.DecodeString(issuer)
if err != nil {
return nil, fmt.Errorf("failed to decode issuer fingerprint: %w", err)
}
return decoded, nil
}()
if err != nil {
return nil, rest, err
}
pubKey := c.PublicKey()
if pubKey != nil {
pubKey = append([]byte(nil), pubKey...)
}
sig := c.Signature()
if sig != nil {
sig = append([]byte(nil), sig...)
}
return &NebulaCertificate{
Details: NebulaCertificateDetails{
Name: c.Name(),
NotBefore: c.NotBefore(),
NotAfter: c.NotAfter(),
PublicKey: pubKey,
IsCA: c.IsCA(),
Issuer: issuerBytes,
Curve: c.Curve(),
},
Signature: sig,
cert: c,
}, rest, nil
}
// IssuerString returns the issuer in hex format for compatibility
func (n *NebulaCertificate) IssuerString() string {
if n.Details.Issuer == nil {
return ""
}
return hex.EncodeToString(n.Details.Issuer)
}
// Certificate returns the underlying certificate (read-only)
func (n *NebulaCertificate) Certificate() Certificate {
return n.cert
}
// UnmarshalPrivateKeyFromPEM will try to unmarshal the first pem block in a byte array, returning any non
// consumed data or an error on failure
func UnmarshalPrivateKeyFromPEM(b []byte) ([]byte, []byte, Curve, error) {

View File

@@ -3,9 +3,6 @@ package main
import (
"flag"
"fmt"
"log"
"net/http"
_ "net/http/pprof"
"os"
"github.com/sirupsen/logrus"
@@ -61,10 +58,6 @@ func main() {
os.Exit(1)
}
go func() {
log.Println(http.ListenAndServe("0.0.0.0:6060", nil))
}()
if !*configTest {
ctrl.Start()
notifyReady(l)

View File

@@ -17,7 +17,7 @@ import (
"dario.cat/mergo"
"github.com/sirupsen/logrus"
"go.yaml.in/yaml/v3"
"gopkg.in/yaml.v3"
)
type C struct {

View File

@@ -10,7 +10,7 @@ import (
"github.com/slackhq/nebula/test"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.yaml.in/yaml/v3"
"gopkg.in/yaml.v3"
)
func TestConfig_Load(t *testing.T) {

View File

@@ -13,7 +13,7 @@ import (
"github.com/slackhq/nebula/noiseutil"
)
const ReplayWindow = 4096
const ReplayWindow = 1024
type ConnectionState struct {
eKey *NebulaCipherState

View File

@@ -174,10 +174,6 @@ func (c *Control) GetHostmap() *HostMap {
return c.f.hostMap
}
func (c *Control) GetF() *Interface {
return c.f
}
func (c *Control) GetCertState() *CertState {
return c.f.pki.getCertState()
}

View File

@@ -20,7 +20,7 @@ import (
"github.com/slackhq/nebula/udp"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.yaml.in/yaml/v3"
"gopkg.in/yaml.v3"
)
func BenchmarkHotPath(b *testing.B) {
@@ -97,41 +97,6 @@ func TestGoodHandshake(t *testing.T) {
theirControl.Stop()
}
func TestGoodHandshakeNoOverlap(t *testing.T) {
ca, _, caKey, _ := cert_test.NewTestCaCert(cert.Version2, cert.Curve_CURVE25519, time.Now(), time.Now().Add(10*time.Minute), nil, nil, []string{})
myControl, myVpnIpNet, myUdpAddr, _ := newSimpleServer(cert.Version2, ca, caKey, "me", "10.128.0.1/24", nil)
theirControl, theirVpnIpNet, theirUdpAddr, _ := newSimpleServer(cert.Version2, ca, caKey, "them", "2001::69/24", nil) //look ma, cross-stack!
// Put their info in our lighthouse
myControl.InjectLightHouseAddr(theirVpnIpNet[0].Addr(), theirUdpAddr)
// Start the servers
myControl.Start()
theirControl.Start()
empty := []byte{}
t.Log("do something to cause a handshake")
myControl.GetF().SendMessageToVpnAddr(header.Test, header.MessageNone, theirVpnIpNet[0].Addr(), empty, empty, empty)
t.Log("Have them consume my stage 0 packet. They have a tunnel now")
theirControl.InjectUDPPacket(myControl.GetFromUDP(true))
t.Log("Get their stage 1 packet")
stage1Packet := theirControl.GetFromUDP(true)
t.Log("Have me consume their stage 1 packet. I have a tunnel now")
myControl.InjectUDPPacket(stage1Packet)
t.Log("Wait until we see a test packet come through to make sure we give the tunnel time to complete")
myControl.WaitForType(header.Test, 0, theirControl)
t.Log("Make sure our host infos are correct")
assertHostInfoPair(t, myUdpAddr, theirUdpAddr, myVpnIpNet, theirVpnIpNet, myControl, theirControl)
myControl.Stop()
theirControl.Stop()
}
func TestWrongResponderHandshake(t *testing.T) {
ca, _, caKey, _ := cert_test.NewTestCaCert(cert.Version1, cert.Curve_CURVE25519, time.Now(), time.Now().Add(10*time.Minute), nil, nil, []string{})
@@ -499,35 +464,6 @@ func TestRelays(t *testing.T) {
r.RenderHostmaps("Final hostmaps", myControl, relayControl, theirControl)
}
func TestRelaysDontCareAboutIps(t *testing.T) {
ca, _, caKey, _ := cert_test.NewTestCaCert(cert.Version2, cert.Curve_CURVE25519, time.Now(), time.Now().Add(10*time.Minute), nil, nil, []string{})
myControl, myVpnIpNet, _, _ := newSimpleServer(cert.Version2, ca, caKey, "me ", "10.128.0.1/24", m{"relay": m{"use_relays": true}})
relayControl, relayVpnIpNet, relayUdpAddr, _ := newSimpleServer(cert.Version2, ca, caKey, "relay ", "2001::9999/24", m{"relay": m{"am_relay": true}})
theirControl, theirVpnIpNet, theirUdpAddr, _ := newSimpleServer(cert.Version2, ca, caKey, "them ", "10.128.0.2/24", m{"relay": m{"use_relays": true}})
// Teach my how to get to the relay and that their can be reached via the relay
myControl.InjectLightHouseAddr(relayVpnIpNet[0].Addr(), relayUdpAddr)
myControl.InjectRelays(theirVpnIpNet[0].Addr(), []netip.Addr{relayVpnIpNet[0].Addr()})
relayControl.InjectLightHouseAddr(theirVpnIpNet[0].Addr(), theirUdpAddr)
// Build a router so we don't have to reason who gets which packet
r := router.NewR(t, myControl, relayControl, theirControl)
defer r.RenderFlow()
// Start the servers
myControl.Start()
relayControl.Start()
theirControl.Start()
t.Log("Trigger a handshake from me to them via the relay")
myControl.InjectTunUDPPacket(theirVpnIpNet[0].Addr(), 80, myVpnIpNet[0].Addr(), 80, []byte("Hi from me"))
p := r.RouteForAllUntilTxTun(theirControl)
r.Log("Assert the tunnel works")
assertUdpPacket(t, []byte("Hi from me"), p, myVpnIpNet[0].Addr(), theirVpnIpNet[0].Addr(), 80, 80)
r.RenderHostmaps("Final hostmaps", myControl, relayControl, theirControl)
}
func TestReestablishRelays(t *testing.T) {
ca, _, caKey, _ := cert_test.NewTestCaCert(cert.Version1, cert.Curve_CURVE25519, time.Now(), time.Now().Add(10*time.Minute), nil, nil, []string{})
myControl, myVpnIpNet, _, _ := newSimpleServer(cert.Version1, ca, caKey, "me ", "10.128.0.1/24", m{"relay": m{"use_relays": true}})
@@ -1291,109 +1227,3 @@ func TestV2NonPrimaryWithLighthouse(t *testing.T) {
myControl.Stop()
theirControl.Stop()
}
func TestV2NonPrimaryWithOffNetLighthouse(t *testing.T) {
ca, _, caKey, _ := cert_test.NewTestCaCert(cert.Version2, cert.Curve_CURVE25519, time.Now(), time.Now().Add(10*time.Minute), nil, nil, []string{})
lhControl, lhVpnIpNet, lhUdpAddr, _ := newSimpleServer(cert.Version2, ca, caKey, "lh ", "2001::1/64", m{"lighthouse": m{"am_lighthouse": true}})
o := m{
"static_host_map": m{
lhVpnIpNet[0].Addr().String(): []string{lhUdpAddr.String()},
},
"lighthouse": m{
"hosts": []string{lhVpnIpNet[0].Addr().String()},
"local_allow_list": m{
// Try and block our lighthouse updates from using the actual addresses assigned to this computer
// If we start discovering addresses the test router doesn't know about then test traffic cant flow
"10.0.0.0/24": true,
"::/0": false,
},
},
}
myControl, myVpnIpNet, _, _ := newSimpleServer(cert.Version2, ca, caKey, "me ", "10.128.0.2/24, ff::2/64", o)
theirControl, theirVpnIpNet, _, _ := newSimpleServer(cert.Version2, ca, caKey, "them", "10.128.0.3/24, ff::3/64", o)
// Build a router so we don't have to reason who gets which packet
r := router.NewR(t, lhControl, myControl, theirControl)
defer r.RenderFlow()
// Start the servers
lhControl.Start()
myControl.Start()
theirControl.Start()
t.Log("Stand up an ipv6 tunnel between me and them")
assert.True(t, myVpnIpNet[1].Addr().Is6())
assert.True(t, theirVpnIpNet[1].Addr().Is6())
assertTunnel(t, myVpnIpNet[1].Addr(), theirVpnIpNet[1].Addr(), myControl, theirControl, r)
lhControl.Stop()
myControl.Stop()
theirControl.Stop()
}
func TestGoodHandshakeUnsafeDest(t *testing.T) {
unsafePrefix := "192.168.6.0/24"
ca, _, caKey, _ := cert_test.NewTestCaCert(cert.Version2, cert.Curve_CURVE25519, time.Now(), time.Now().Add(10*time.Minute), nil, nil, []string{})
theirControl, theirVpnIpNet, theirUdpAddr, _ := newSimpleServerWithUdpAndUnsafeNetworks(cert.Version2, ca, caKey, "spooky", "10.128.0.2/24", netip.MustParseAddrPort("10.64.0.2:4242"), unsafePrefix, nil)
route := m{"route": unsafePrefix, "via": theirVpnIpNet[0].Addr().String()}
myCfg := m{
"tun": m{
"unsafe_routes": []m{route},
},
}
myControl, myVpnIpNet, myUdpAddr, myConfig := newSimpleServer(cert.Version2, ca, caKey, "me", "10.128.0.1/24", myCfg)
t.Logf("my config %v", myConfig)
// Put their info in our lighthouse
myControl.InjectLightHouseAddr(theirVpnIpNet[0].Addr(), theirUdpAddr)
spookyDest := netip.MustParseAddr("192.168.6.4")
// Start the servers
myControl.Start()
theirControl.Start()
t.Log("Send a udp packet through to begin standing up the tunnel, this should come out the other side")
myControl.InjectTunUDPPacket(spookyDest, 80, myVpnIpNet[0].Addr(), 80, []byte("Hi from me"))
t.Log("Have them consume my stage 0 packet. They have a tunnel now")
theirControl.InjectUDPPacket(myControl.GetFromUDP(true))
t.Log("Get their stage 1 packet so that we can play with it")
stage1Packet := theirControl.GetFromUDP(true)
t.Log("I consume a garbage packet with a proper nebula header for our tunnel")
// this should log a statement and get ignored, allowing the real handshake packet to complete the tunnel
badPacket := stage1Packet.Copy()
badPacket.Data = badPacket.Data[:len(badPacket.Data)-header.Len]
myControl.InjectUDPPacket(badPacket)
t.Log("Have me consume their real stage 1 packet. I have a tunnel now")
myControl.InjectUDPPacket(stage1Packet)
t.Log("Wait until we see my cached packet come through")
myControl.WaitForType(1, 0, theirControl)
t.Log("Make sure our host infos are correct")
assertHostInfoPair(t, myUdpAddr, theirUdpAddr, myVpnIpNet, theirVpnIpNet, myControl, theirControl)
t.Log("Get that cached packet and make sure it looks right")
myCachedPacket := theirControl.GetFromTun(true)
assertUdpPacket(t, []byte("Hi from me"), myCachedPacket, myVpnIpNet[0].Addr(), spookyDest, 80, 80)
//reply
theirControl.InjectTunUDPPacket(myVpnIpNet[0].Addr(), 80, spookyDest, 80, []byte("Hi from the spookyman"))
//wait for reply
theirControl.WaitForType(1, 0, myControl)
theirCachedPacket := myControl.GetFromTun(true)
assertUdpPacket(t, []byte("Hi from the spookyman"), theirCachedPacket, spookyDest, myVpnIpNet[0].Addr(), 80, 80)
t.Log("Do a bidirectional tunnel test")
r := router.NewR(t, myControl, theirControl)
defer r.RenderFlow()
assertTunnel(t, myVpnIpNet[0].Addr(), theirVpnIpNet[0].Addr(), myControl, theirControl, r)
r.RenderHostmaps("Final hostmaps", myControl, theirControl)
myControl.Stop()
theirControl.Stop()
}

View File

@@ -22,14 +22,15 @@ import (
"github.com/slackhq/nebula/config"
"github.com/slackhq/nebula/e2e/router"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.yaml.in/yaml/v3"
"gopkg.in/yaml.v3"
)
type m = map[string]any
// newSimpleServer creates a nebula instance with many assumptions
func newSimpleServer(v cert.Version, caCrt cert.Certificate, caKey []byte, name string, sVpnNetworks string, overrides m) (*nebula.Control, []netip.Prefix, netip.AddrPort, *config.C) {
l := NewTestLogger()
var vpnNetworks []netip.Prefix
for _, sn := range strings.Split(sVpnNetworks, ",") {
vpnIpNet, err := netip.ParsePrefix(strings.TrimSpace(sn))
@@ -55,54 +56,7 @@ func newSimpleServer(v cert.Version, caCrt cert.Certificate, caKey []byte, name
budpIp[3] = 239
udpAddr = netip.AddrPortFrom(netip.AddrFrom16(budpIp), 4242)
}
return newSimpleServerWithUdp(v, caCrt, caKey, name, sVpnNetworks, udpAddr, overrides)
}
func newSimpleServerWithUdp(v cert.Version, caCrt cert.Certificate, caKey []byte, name string, sVpnNetworks string, udpAddr netip.AddrPort, overrides m) (*nebula.Control, []netip.Prefix, netip.AddrPort, *config.C) {
return newSimpleServerWithUdpAndUnsafeNetworks(v, caCrt, caKey, name, sVpnNetworks, udpAddr, "", overrides)
}
func newSimpleServerWithUdpAndUnsafeNetworks(v cert.Version, caCrt cert.Certificate, caKey []byte, name string, sVpnNetworks string, udpAddr netip.AddrPort, sUnsafeNetworks string, overrides m) (*nebula.Control, []netip.Prefix, netip.AddrPort, *config.C) {
l := NewTestLogger()
var vpnNetworks []netip.Prefix
for _, sn := range strings.Split(sVpnNetworks, ",") {
vpnIpNet, err := netip.ParsePrefix(strings.TrimSpace(sn))
if err != nil {
panic(err)
}
vpnNetworks = append(vpnNetworks, vpnIpNet)
}
if len(vpnNetworks) == 0 {
panic("no vpn networks")
}
firewallInbound := []m{{
"proto": "any",
"port": "any",
"host": "any",
}}
var unsafeNetworks []netip.Prefix
if sUnsafeNetworks != "" {
firewallInbound = []m{{
"proto": "any",
"port": "any",
"host": "any",
"local_cidr": "0.0.0.0/0",
}}
for _, sn := range strings.Split(sUnsafeNetworks, ",") {
x, err := netip.ParsePrefix(strings.TrimSpace(sn))
if err != nil {
panic(err)
}
unsafeNetworks = append(unsafeNetworks, x)
}
}
_, _, myPrivKey, myPEM := cert_test.NewTestCert(v, cert.Curve_CURVE25519, caCrt, caKey, name, time.Now(), time.Now().Add(5*time.Minute), vpnNetworks, unsafeNetworks, []string{})
_, _, myPrivKey, myPEM := cert_test.NewTestCert(v, cert.Curve_CURVE25519, caCrt, caKey, name, time.Now(), time.Now().Add(5*time.Minute), vpnNetworks, nil, []string{})
caB, err := caCrt.MarshalPEM()
if err != nil {
@@ -122,7 +76,11 @@ func newSimpleServerWithUdpAndUnsafeNetworks(v cert.Version, caCrt cert.Certific
"port": "any",
"host": "any",
}},
"inbound": firewallInbound,
"inbound": []m{{
"proto": "any",
"port": "any",
"host": "any",
}},
},
//"handshakes": m{
// "try_interval": "1s",
@@ -308,10 +266,10 @@ func assertHostInfoPair(t *testing.T, addrA, addrB netip.AddrPort, vpnNetsA, vpn
// Get both host infos
//TODO: CERT-V2 we may want to loop over each vpnAddr and assert all the things
hBinA := controlA.GetHostInfoByVpnAddr(vpnNetsB[0].Addr(), false)
require.NotNil(t, hBinA, "Host B was not found by vpnAddr in controlA")
assert.NotNil(t, hBinA, "Host B was not found by vpnAddr in controlA")
hAinB := controlB.GetHostInfoByVpnAddr(vpnNetsA[0].Addr(), false)
require.NotNil(t, hAinB, "Host A was not found by vpnAddr in controlB")
assert.NotNil(t, hAinB, "Host A was not found by vpnAddr in controlB")
// Check that both vpn and real addr are correct
assert.EqualValues(t, getAddrs(vpnNetsB), hBinA.VpnAddrs, "Host B VpnIp is wrong in control A")

View File

@@ -318,50 +318,3 @@ func TestCertMismatchCorrection(t *testing.T) {
myControl.Stop()
theirControl.Stop()
}
func TestCrossStackRelaysWork(t *testing.T) {
ca, _, caKey, _ := cert_test.NewTestCaCert(cert.Version2, cert.Curve_CURVE25519, time.Now(), time.Now().Add(10*time.Minute), nil, nil, []string{})
myControl, myVpnIpNet, _, _ := newSimpleServer(cert.Version2, ca, caKey, "me ", "10.128.0.1/24,fc00::1/64", m{"relay": m{"use_relays": true}})
relayControl, relayVpnIpNet, relayUdpAddr, _ := newSimpleServer(cert.Version2, ca, caKey, "relay ", "10.128.0.128/24,fc00::128/64", m{"relay": m{"am_relay": true}})
theirUdp := netip.MustParseAddrPort("10.0.0.2:4242")
theirControl, theirVpnIpNet, theirUdpAddr, _ := newSimpleServerWithUdp(cert.Version2, ca, caKey, "them ", "fc00::2/64", theirUdp, m{"relay": m{"use_relays": true}})
//myVpnV4 := myVpnIpNet[0]
myVpnV6 := myVpnIpNet[1]
relayVpnV4 := relayVpnIpNet[0]
relayVpnV6 := relayVpnIpNet[1]
theirVpnV6 := theirVpnIpNet[0]
// Teach my how to get to the relay and that their can be reached via the relay
myControl.InjectLightHouseAddr(relayVpnV4.Addr(), relayUdpAddr)
myControl.InjectLightHouseAddr(relayVpnV6.Addr(), relayUdpAddr)
myControl.InjectRelays(theirVpnV6.Addr(), []netip.Addr{relayVpnV6.Addr()})
relayControl.InjectLightHouseAddr(theirVpnV6.Addr(), theirUdpAddr)
// Build a router so we don't have to reason who gets which packet
r := router.NewR(t, myControl, relayControl, theirControl)
defer r.RenderFlow()
// Start the servers
myControl.Start()
relayControl.Start()
theirControl.Start()
t.Log("Trigger a handshake from me to them via the relay")
myControl.InjectTunUDPPacket(theirVpnV6.Addr(), 80, myVpnV6.Addr(), 80, []byte("Hi from me"))
p := r.RouteForAllUntilTxTun(theirControl)
r.Log("Assert the tunnel works")
assertUdpPacket(t, []byte("Hi from me"), p, myVpnV6.Addr(), theirVpnV6.Addr(), 80, 80)
t.Log("reply?")
theirControl.InjectTunUDPPacket(myVpnV6.Addr(), 80, theirVpnV6.Addr(), 80, []byte("Hi from them"))
p = r.RouteForAllUntilTxTun(myControl)
assertUdpPacket(t, []byte("Hi from them"), p, theirVpnV6.Addr(), myVpnV6.Addr(), 80, 80)
r.RenderHostmaps("Final hostmaps", myControl, relayControl, theirControl)
//t.Log("finish up")
//myControl.Stop()
//theirControl.Stop()
//relayControl.Stop()
}

View File

@@ -417,45 +417,30 @@ func AddFirewallRulesFromConfig(l *logrus.Logger, inbound bool, c *config.C, fw
return nil
}
var ErrUnknownNetworkType = errors.New("unknown network type")
var ErrPeerRejected = errors.New("remote address is not within a network that we handle")
var ErrInvalidRemoteIP = errors.New("remote address is not in remote certificate networks")
var ErrInvalidLocalIP = errors.New("local address is not in list of handled local addresses")
var ErrInvalidRemoteIP = errors.New("remote IP is not in remote certificate subnets")
var ErrInvalidLocalIP = errors.New("local IP is not in list of handled local IPs")
var ErrNoMatchingRule = errors.New("no matching rule in firewall table")
// Drop returns an error if the packet should be dropped, explaining why. It
// returns nil if the packet should not be dropped.
func (f *Firewall) Drop(fp firewall.Packet, incoming bool, h *HostInfo, caPool *cert.CAPool, localCache firewall.ConntrackCache, now time.Time) error {
func (f *Firewall) Drop(fp firewall.Packet, incoming bool, h *HostInfo, caPool *cert.CAPool, localCache *firewall.ConntrackCache) error {
// Check if we spoke to this tuple, if we did then allow this packet
if f.inConns(fp, h, caPool, localCache, now) {
if f.inConns(fp, h, caPool, localCache) {
return nil
}
// Make sure remote address matches nebula certificate, and determine how to treat it
if h.networks == nil {
// Simple case: Certificate has one address and no unsafe networks
if h.vpnAddrs[0] != fp.RemoteAddr {
// Make sure remote address matches nebula certificate
if h.networks != nil {
if !h.networks.Contains(fp.RemoteAddr) {
f.metrics(incoming).droppedRemoteAddr.Inc(1)
return ErrInvalidRemoteIP
}
} else {
nwType, ok := h.networks.Lookup(fp.RemoteAddr)
if !ok {
// Simple case: Certificate has one address and no unsafe networks
if h.vpnAddrs[0] != fp.RemoteAddr {
f.metrics(incoming).droppedRemoteAddr.Inc(1)
return ErrInvalidRemoteIP
}
switch nwType {
case NetworkTypeVPN:
break // nothing special
case NetworkTypeVPNPeer:
f.metrics(incoming).droppedRemoteAddr.Inc(1)
return ErrPeerRejected // reject for now, one day this may have different FW rules
case NetworkTypeUnsafe:
break // nothing special, one day this may have different FW rules
default:
f.metrics(incoming).droppedRemoteAddr.Inc(1)
return ErrUnknownNetworkType //should never happen
}
}
// Make sure we are supposed to be handling this local ip address
@@ -476,7 +461,7 @@ func (f *Firewall) Drop(fp firewall.Packet, incoming bool, h *HostInfo, caPool *
}
// We always want to conntrack since it is a faster operation
f.addConn(fp, incoming, now)
f.addConn(fp, incoming)
return nil
}
@@ -505,11 +490,9 @@ func (f *Firewall) EmitStats() {
metrics.GetOrRegisterGauge("firewall.rules.hash", nil).Update(int64(f.GetRuleHashFNV()))
}
func (f *Firewall) inConns(fp firewall.Packet, h *HostInfo, caPool *cert.CAPool, localCache firewall.ConntrackCache, now time.Time) bool {
if localCache != nil {
if _, ok := localCache[fp]; ok {
return true
}
func (f *Firewall) inConns(fp firewall.Packet, h *HostInfo, caPool *cert.CAPool, localCache *firewall.ConntrackCache) bool {
if localCache != nil && localCache.Has(fp) {
return true
}
conntrack := f.Conntrack
conntrack.Lock()
@@ -517,7 +500,7 @@ func (f *Firewall) inConns(fp firewall.Packet, h *HostInfo, caPool *cert.CAPool,
// Purge every time we test
ep, has := conntrack.TimerWheel.Purge()
if has {
f.evict(ep, now)
f.evict(ep)
}
c, ok := conntrack.Conns[fp]
@@ -564,23 +547,23 @@ func (f *Firewall) inConns(fp firewall.Packet, h *HostInfo, caPool *cert.CAPool,
switch fp.Protocol {
case firewall.ProtoTCP:
c.Expires = now.Add(f.TCPTimeout)
c.Expires = time.Now().Add(f.TCPTimeout)
case firewall.ProtoUDP:
c.Expires = now.Add(f.UDPTimeout)
c.Expires = time.Now().Add(f.UDPTimeout)
default:
c.Expires = now.Add(f.DefaultTimeout)
c.Expires = time.Now().Add(f.DefaultTimeout)
}
conntrack.Unlock()
if localCache != nil {
localCache[fp] = struct{}{}
localCache.Add(fp)
}
return true
}
func (f *Firewall) addConn(fp firewall.Packet, incoming bool, now time.Time) {
func (f *Firewall) addConn(fp firewall.Packet, incoming bool) {
var timeout time.Duration
c := &conn{}
@@ -596,7 +579,7 @@ func (f *Firewall) addConn(fp firewall.Packet, incoming bool, now time.Time) {
conntrack := f.Conntrack
conntrack.Lock()
if _, ok := conntrack.Conns[fp]; !ok {
conntrack.TimerWheel.Advance(now)
conntrack.TimerWheel.Advance(time.Now())
conntrack.TimerWheel.Add(fp, timeout)
}
@@ -604,14 +587,14 @@ func (f *Firewall) addConn(fp firewall.Packet, incoming bool, now time.Time) {
// firewall reload
c.incoming = incoming
c.rulesVersion = f.rulesVersion
c.Expires = now.Add(timeout)
c.Expires = time.Now().Add(timeout)
conntrack.Conns[fp] = c
conntrack.Unlock()
}
// Evict checks if a conntrack entry has expired, if so it is removed, if not it is re-added to the wheel
// Caller must own the connMutex lock!
func (f *Firewall) evict(p firewall.Packet, now time.Time) {
func (f *Firewall) evict(p firewall.Packet) {
// Are we still tracking this conn?
conntrack := f.Conntrack
t, ok := conntrack.Conns[p]
@@ -619,11 +602,11 @@ func (f *Firewall) evict(p firewall.Packet, now time.Time) {
return
}
newT := t.Expires.Sub(now)
newT := t.Expires.Sub(time.Now())
// Timeout is in the future, re-add the timer
if newT > 0 {
conntrack.TimerWheel.Advance(now)
conntrack.TimerWheel.Advance(time.Now())
conntrack.TimerWheel.Add(p, newT)
return
}

View File

@@ -1,6 +1,7 @@
package firewall
import (
"sync"
"sync/atomic"
"time"
@@ -9,13 +10,58 @@ import (
// ConntrackCache is used as a local routine cache to know if a given flow
// has been seen in the conntrack table.
type ConntrackCache map[Packet]struct{}
type ConntrackCache struct {
	mu      sync.Mutex          // guards entries
	entries map[Packet]struct{} // set of flows already observed by this routine
}
// newConntrackCache returns an empty, ready-to-use cache.
func newConntrackCache() *ConntrackCache {
	c := &ConntrackCache{}
	c.entries = make(map[Packet]struct{})
	return c
}
// Has reports whether p has already been recorded in the cache.
// A nil receiver behaves as an always-empty cache.
func (c *ConntrackCache) Has(p Packet) bool {
	if c == nil {
		return false
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	_, present := c.entries[p]
	return present
}
// Add records p in the cache. Adding to a nil receiver is a no-op.
func (c *ConntrackCache) Add(p Packet) {
	if c == nil {
		return
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	c.entries[p] = struct{}{}
}
// Len returns the number of flows currently cached; a nil receiver has length 0.
func (c *ConntrackCache) Len() int {
	if c == nil {
		return 0
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	return len(c.entries)
}
// Reset discards all entries, pre-sizing the replacement map with capHint.
// Resetting a nil receiver is a no-op.
func (c *ConntrackCache) Reset(capHint int) {
	if c == nil {
		return
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	c.entries = make(map[Packet]struct{}, capHint)
}
type ConntrackCacheTicker struct {
cacheV uint64
cacheTick atomic.Uint64
cache ConntrackCache
cache *ConntrackCache
}
func NewConntrackCacheTicker(d time.Duration) *ConntrackCacheTicker {
@@ -23,9 +69,7 @@ func NewConntrackCacheTicker(d time.Duration) *ConntrackCacheTicker {
return nil
}
c := &ConntrackCacheTicker{
cache: ConntrackCache{},
}
c := &ConntrackCacheTicker{cache: newConntrackCache()}
go c.tick(d)
@@ -41,17 +85,17 @@ func (c *ConntrackCacheTicker) tick(d time.Duration) {
// Get checks if the cache ticker has moved to the next version before returning
// the map. If it has moved, we reset the map.
func (c *ConntrackCacheTicker) Get(l *logrus.Logger) ConntrackCache {
func (c *ConntrackCacheTicker) Get(l *logrus.Logger) *ConntrackCache {
if c == nil {
return nil
}
if tick := c.cacheTick.Load(); tick != c.cacheV {
c.cacheV = tick
if ll := len(c.cache); ll > 0 {
if ll := c.cache.Len(); ll > 0 {
if l.Level == logrus.DebugLevel {
l.WithField("len", ll).Debug("resetting conntrack cache")
}
c.cache = make(ConntrackCache, ll)
c.cache.Reset(ll)
}
}

View File

@@ -8,8 +8,6 @@ import (
"testing"
"time"
"github.com/gaissmai/bart"
"github.com/sirupsen/logrus"
"github.com/slackhq/nebula/cert"
"github.com/slackhq/nebula/config"
"github.com/slackhq/nebula/firewall"
@@ -151,8 +149,7 @@ func TestFirewall_Drop(t *testing.T) {
l := test.NewLogger()
ob := &bytes.Buffer{}
l.SetOutput(ob)
myVpnNetworksTable := new(bart.Lite)
myVpnNetworksTable.Insert(netip.MustParsePrefix("1.1.1.1/8"))
p := firewall.Packet{
LocalAddr: netip.MustParseAddr("1.2.3.4"),
RemoteAddr: netip.MustParseAddr("1.2.3.4"),
@@ -177,7 +174,7 @@ func TestFirewall_Drop(t *testing.T) {
},
vpnAddrs: []netip.Addr{netip.MustParseAddr("1.2.3.4")},
}
h.buildNetworks(myVpnNetworksTable, &c)
h.buildNetworks(c.networks, c.unsafeNetworks)
fw := NewFirewall(l, time.Second, time.Minute, time.Hour, &c)
require.NoError(t, fw.AddRule(true, firewall.ProtoAny, 0, 0, []string{"any"}, "", netip.Prefix{}, netip.Prefix{}, "", ""))
@@ -229,9 +226,6 @@ func TestFirewall_DropV6(t *testing.T) {
ob := &bytes.Buffer{}
l.SetOutput(ob)
myVpnNetworksTable := new(bart.Lite)
myVpnNetworksTable.Insert(netip.MustParsePrefix("fd00::/7"))
p := firewall.Packet{
LocalAddr: netip.MustParseAddr("fd12::34"),
RemoteAddr: netip.MustParseAddr("fd12::34"),
@@ -256,7 +250,7 @@ func TestFirewall_DropV6(t *testing.T) {
},
vpnAddrs: []netip.Addr{netip.MustParseAddr("fd12::34")},
}
h.buildNetworks(myVpnNetworksTable, &c)
h.buildNetworks(c.networks, c.unsafeNetworks)
fw := NewFirewall(l, time.Second, time.Minute, time.Hour, &c)
require.NoError(t, fw.AddRule(true, firewall.ProtoAny, 0, 0, []string{"any"}, "", netip.Prefix{}, netip.Prefix{}, "", ""))
@@ -459,8 +453,6 @@ func TestFirewall_Drop2(t *testing.T) {
l := test.NewLogger()
ob := &bytes.Buffer{}
l.SetOutput(ob)
myVpnNetworksTable := new(bart.Lite)
myVpnNetworksTable.Insert(netip.MustParsePrefix("1.1.1.1/8"))
p := firewall.Packet{
LocalAddr: netip.MustParseAddr("1.2.3.4"),
@@ -486,7 +478,7 @@ func TestFirewall_Drop2(t *testing.T) {
},
vpnAddrs: []netip.Addr{network.Addr()},
}
h.buildNetworks(myVpnNetworksTable, c.Certificate)
h.buildNetworks(c.Certificate.Networks(), c.Certificate.UnsafeNetworks())
c1 := cert.CachedCertificate{
Certificate: &dummyCert{
@@ -501,7 +493,7 @@ func TestFirewall_Drop2(t *testing.T) {
peerCert: &c1,
},
}
h1.buildNetworks(myVpnNetworksTable, c1.Certificate)
h1.buildNetworks(c1.Certificate.Networks(), c1.Certificate.UnsafeNetworks())
fw := NewFirewall(l, time.Second, time.Minute, time.Hour, c.Certificate)
require.NoError(t, fw.AddRule(true, firewall.ProtoAny, 0, 0, []string{"default-group", "test-group"}, "", netip.Prefix{}, netip.Prefix{}, "", ""))
@@ -518,8 +510,6 @@ func TestFirewall_Drop3(t *testing.T) {
l := test.NewLogger()
ob := &bytes.Buffer{}
l.SetOutput(ob)
myVpnNetworksTable := new(bart.Lite)
myVpnNetworksTable.Insert(netip.MustParsePrefix("1.1.1.1/8"))
p := firewall.Packet{
LocalAddr: netip.MustParseAddr("1.2.3.4"),
@@ -551,7 +541,7 @@ func TestFirewall_Drop3(t *testing.T) {
},
vpnAddrs: []netip.Addr{network.Addr()},
}
h1.buildNetworks(myVpnNetworksTable, c1.Certificate)
h1.buildNetworks(c1.Certificate.Networks(), c1.Certificate.UnsafeNetworks())
c2 := cert.CachedCertificate{
Certificate: &dummyCert{
@@ -566,7 +556,7 @@ func TestFirewall_Drop3(t *testing.T) {
},
vpnAddrs: []netip.Addr{network.Addr()},
}
h2.buildNetworks(myVpnNetworksTable, c2.Certificate)
h2.buildNetworks(c2.Certificate.Networks(), c2.Certificate.UnsafeNetworks())
c3 := cert.CachedCertificate{
Certificate: &dummyCert{
@@ -581,7 +571,7 @@ func TestFirewall_Drop3(t *testing.T) {
},
vpnAddrs: []netip.Addr{network.Addr()},
}
h3.buildNetworks(myVpnNetworksTable, c3.Certificate)
h3.buildNetworks(c3.Certificate.Networks(), c3.Certificate.UnsafeNetworks())
fw := NewFirewall(l, time.Second, time.Minute, time.Hour, c.Certificate)
require.NoError(t, fw.AddRule(true, firewall.ProtoAny, 1, 1, []string{}, "host1", netip.Prefix{}, netip.Prefix{}, "", ""))
@@ -607,8 +597,6 @@ func TestFirewall_Drop3V6(t *testing.T) {
l := test.NewLogger()
ob := &bytes.Buffer{}
l.SetOutput(ob)
myVpnNetworksTable := new(bart.Lite)
myVpnNetworksTable.Insert(netip.MustParsePrefix("fd00::/7"))
p := firewall.Packet{
LocalAddr: netip.MustParseAddr("fd12::34"),
@@ -632,7 +620,7 @@ func TestFirewall_Drop3V6(t *testing.T) {
},
vpnAddrs: []netip.Addr{network.Addr()},
}
h.buildNetworks(myVpnNetworksTable, c.Certificate)
h.buildNetworks(c.Certificate.Networks(), c.Certificate.UnsafeNetworks())
// Test a remote address match
fw := NewFirewall(l, time.Second, time.Minute, time.Hour, c.Certificate)
@@ -645,8 +633,6 @@ func TestFirewall_DropConntrackReload(t *testing.T) {
l := test.NewLogger()
ob := &bytes.Buffer{}
l.SetOutput(ob)
myVpnNetworksTable := new(bart.Lite)
myVpnNetworksTable.Insert(netip.MustParsePrefix("1.1.1.1/8"))
p := firewall.Packet{
LocalAddr: netip.MustParseAddr("1.2.3.4"),
@@ -673,7 +659,7 @@ func TestFirewall_DropConntrackReload(t *testing.T) {
},
vpnAddrs: []netip.Addr{network.Addr()},
}
h.buildNetworks(myVpnNetworksTable, c.Certificate)
h.buildNetworks(c.Certificate.Networks(), c.Certificate.UnsafeNetworks())
fw := NewFirewall(l, time.Second, time.Minute, time.Hour, c.Certificate)
require.NoError(t, fw.AddRule(true, firewall.ProtoAny, 0, 0, []string{"any"}, "", netip.Prefix{}, netip.Prefix{}, "", ""))
@@ -710,8 +696,6 @@ func TestFirewall_DropIPSpoofing(t *testing.T) {
l := test.NewLogger()
ob := &bytes.Buffer{}
l.SetOutput(ob)
myVpnNetworksTable := new(bart.Lite)
myVpnNetworksTable.Insert(netip.MustParsePrefix("192.0.2.1/24"))
c := cert.CachedCertificate{
Certificate: &dummyCert{
@@ -733,7 +717,7 @@ func TestFirewall_DropIPSpoofing(t *testing.T) {
},
vpnAddrs: []netip.Addr{c1.Certificate.Networks()[0].Addr()},
}
h1.buildNetworks(myVpnNetworksTable, c1.Certificate)
h1.buildNetworks(c1.Certificate.Networks(), c1.Certificate.UnsafeNetworks())
fw := NewFirewall(l, time.Second, time.Minute, time.Hour, c.Certificate)
@@ -1063,171 +1047,6 @@ func TestFirewall_convertRule(t *testing.T) {
assert.Equal(t, "group1", r.Group)
}
// testcase bundles a peer HostInfo, the packet to evaluate against the
// firewall, the peer's certificate, and the error fw.Drop is expected to
// return for that packet (nil when the packet should be allowed).
type testcase struct {
	h   *HostInfo
	p   firewall.Packet
	c   cert.Certificate
	err error
}
// Test runs the firewall's Drop on the case's packet with a fresh conntrack
// state and asserts the result matches the expected error (nil = allowed).
func (c *testcase) Test(t *testing.T, fw *Firewall) {
	t.Helper()
	pool := cert.NewCAPool()
	resetConntrack(fw)
	got := fw.Drop(c.p, true, c.h, pool, nil)
	if c.err != nil {
		require.ErrorIs(t, c.err, got, "failed to drop remote address %s", c.p.RemoteAddr)
		return
	}
	require.NoError(t, got, "failed to not drop remote address %s", c.p.RemoteAddr)
}
// buildTestCase constructs a peer host ("host1") whose certificate owns
// theirPrefixes, plus a UDP packet from that peer's first address to our
// first vpn address, paired with the error fw.Drop is expected to return
// (nil when the packet should pass).
func buildTestCase(setup testsetup, err error, theirPrefixes ...netip.Prefix) testcase {
	c1 := dummyCert{
		name:     "host1",
		networks: theirPrefixes,
		groups:   []string{"default-group"},
		issuer:   "signer-shasum",
	}
	h := HostInfo{
		ConnectionState: &ConnectionState{
			peerCert: &cert.CachedCertificate{
				Certificate:    &c1,
				InvertedGroups: map[string]struct{}{"default-group": {}},
			},
		},
		vpnAddrs: make([]netip.Addr, len(theirPrefixes)),
	}
	// The peer claims one vpn address per prefix issued in its certificate.
	for i := range theirPrefixes {
		h.vpnAddrs[i] = theirPrefixes[i].Addr()
	}
	h.buildNetworks(setup.myVpnNetworksTable, &c1)
	p := firewall.Packet{
		LocalAddr:  setup.c.Networks()[0].Addr(), //todo?
		RemoteAddr: theirPrefixes[0].Addr(),
		LocalPort:  10,
		RemotePort: 90,
		Protocol:   firewall.ProtoUDP,
		Fragment:   false,
	}
	return testcase{
		h:   &h,
		p:   p,
		c:   &c1,
		err: err,
	}
}
// testsetup carries the local certificate, the bart table of our vpn
// networks, and a firewall pre-loaded with an allow-any inbound rule.
type testsetup struct {
	c                  dummyCert
	myVpnNetworksTable *bart.Lite
	fw                 *Firewall
}
// newSetup builds a testsetup for a local host named "me" whose certificate
// owns the given vpn prefixes.
func newSetup(t *testing.T, l *logrus.Logger, myPrefixes ...netip.Prefix) testsetup {
	c := dummyCert{
		name:     "me",
		networks: myPrefixes,
		groups:   []string{"default-group"},
		issuer:   "signer-shasum",
	}
	return newSetupFromCert(t, l, c)
}
// newSetupFromCert builds a testsetup from an explicit local certificate:
// it indexes the cert's networks into a bart table and creates a firewall
// carrying a single allow-any inbound rule.
func newSetupFromCert(t *testing.T, l *logrus.Logger, c dummyCert) testsetup {
	networks := new(bart.Lite)
	for _, prefix := range c.Networks() {
		networks.Insert(prefix)
	}
	fw := NewFirewall(l, time.Second, time.Minute, time.Hour, &c)
	require.NoError(t, fw.AddRule(true, firewall.ProtoAny, 0, 0, []string{"any"}, "", netip.Prefix{}, netip.Prefix{}, "", ""))
	return testsetup{c: c, fw: fw, myVpnNetworksTable: networks}
}
// TestFirewall_Drop_EnforceIPMatch exercises the firewall's enforcement that
// a packet's local and remote addresses must line up with the certificates
// on each end of the tunnel, across single-prefix, multi-prefix, and unsafe
// route scenarios.
func TestFirewall_Drop_EnforceIPMatch(t *testing.T) {
	t.Parallel()
	l := test.NewLogger()
	ob := &bytes.Buffer{}
	l.SetOutput(ob)
	myPrefix := netip.MustParsePrefix("1.1.1.1/8")
	// for now, it's okay that these are all "incoming", the logic this test tries to check doesn't care about in/out
	t.Run("allow inbound all matching", func(t *testing.T) {
		t.Parallel()
		setup := newSetup(t, l, myPrefix)
		tc := buildTestCase(setup, nil, netip.MustParsePrefix("1.2.3.4/24"))
		tc.Test(t, setup.fw)
	})
	t.Run("allow inbound local matching", func(t *testing.T) {
		t.Parallel()
		setup := newSetup(t, l, myPrefix)
		tc := buildTestCase(setup, ErrInvalidLocalIP, netip.MustParsePrefix("1.2.3.4/24"))
		// NOTE(review): despite the subtest name this expects a drop —
		// 1.2.3.8 is inside our /8 but is not our exact vpn address
		// (1.1.1.1); presumably the local address must match exactly.
		tc.p.LocalAddr = netip.MustParseAddr("1.2.3.8")
		tc.Test(t, setup.fw)
	})
	t.Run("block inbound remote mismatched", func(t *testing.T) {
		t.Parallel()
		setup := newSetup(t, l, myPrefix)
		tc := buildTestCase(setup, ErrInvalidRemoteIP, netip.MustParsePrefix("1.2.3.4/24"))
		// 9.9.9.9 lies outside every network in the peer's certificate.
		tc.p.RemoteAddr = netip.MustParseAddr("9.9.9.9")
		tc.Test(t, setup.fw)
	})
	t.Run("Block a vpn peer packet", func(t *testing.T) {
		t.Parallel()
		setup := newSetup(t, l, myPrefix)
		// The peer's 2.2.2.2/24 does not overlap our 1.1.1.1/8 network.
		tc := buildTestCase(setup, ErrPeerRejected, netip.MustParsePrefix("2.2.2.2/24"))
		tc.Test(t, setup.fw)
	})
	twoPrefixes := []netip.Prefix{
		netip.MustParsePrefix("1.2.3.4/24"), netip.MustParsePrefix("2.2.2.2/24"),
	}
	t.Run("allow inbound one matching", func(t *testing.T) {
		t.Parallel()
		setup := newSetup(t, l, myPrefix)
		tc := buildTestCase(setup, nil, twoPrefixes...)
		tc.Test(t, setup.fw)
	})
	t.Run("block inbound multimismatch", func(t *testing.T) {
		t.Parallel()
		setup := newSetup(t, l, myPrefix)
		tc := buildTestCase(setup, ErrInvalidRemoteIP, twoPrefixes...)
		// Remote address matches neither of the peer's two prefixes.
		tc.p.RemoteAddr = netip.MustParseAddr("9.9.9.9")
		tc.Test(t, setup.fw)
	})
	t.Run("allow inbound 2nd one matching", func(t *testing.T) {
		t.Parallel()
		setup2 := newSetup(t, l, netip.MustParsePrefix("2.2.2.1/24"))
		tc := buildTestCase(setup2, nil, twoPrefixes...)
		// Source the packet from the peer's second prefix, the one that
		// overlaps this setup's 2.2.2.1/24 network.
		tc.p.RemoteAddr = twoPrefixes[1].Addr()
		tc.Test(t, setup2.fw)
	})
	t.Run("allow inbound unsafe route", func(t *testing.T) {
		t.Parallel()
		unsafePrefix := netip.MustParsePrefix("192.168.0.0/24")
		c := dummyCert{
			name:           "me",
			networks:       []netip.Prefix{myPrefix},
			unsafeNetworks: []netip.Prefix{unsafePrefix},
			groups:         []string{"default-group"},
			issuer:         "signer-shasum",
		}
		unsafeSetup := newSetupFromCert(t, l, c)
		tc := buildTestCase(unsafeSetup, nil, twoPrefixes...)
		// A local address inside our unsafe network passes the IP-match
		// check but must still match a firewall rule.
		tc.p.LocalAddr = netip.MustParseAddr("192.168.0.3")
		tc.err = ErrNoMatchingRule
		tc.Test(t, unsafeSetup.fw) //should hit firewall and bounce off
		require.NoError(t, unsafeSetup.fw.AddRule(true, firewall.ProtoAny, 0, 0, []string{"any"}, "", netip.Prefix{}, unsafePrefix, "", ""))
		tc.err = nil
		tc.Test(t, unsafeSetup.fw) //should pass
	})
}
type addRuleCall struct {
incoming bool
proto uint8

16
go.mod
View File

@@ -6,6 +6,7 @@ require (
dario.cat/mergo v1.0.2
github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be
github.com/armon/go-radix v1.0.0
github.com/cilium/ebpf v0.12.3
github.com/cyberdelia/go-metrics-graphite v0.0.0-20161219230853-39f87cc3b432
github.com/flynn/noise v1.1.0
github.com/gaissmai/bart v0.25.0
@@ -22,17 +23,16 @@ require (
github.com/stefanberger/go-pkcs11uri v0.0.0-20230803200340-78284954bff6
github.com/stretchr/testify v1.11.1
github.com/vishvananda/netlink v1.3.1
go.yaml.in/yaml/v3 v3.0.4
golang.org/x/crypto v0.44.0
golang.org/x/crypto v0.43.0
golang.org/x/exp v0.0.0-20230725093048-515e97ebf090
golang.org/x/net v0.46.0
golang.org/x/sync v0.18.0
golang.org/x/sys v0.38.0
golang.org/x/term v0.37.0
golang.org/x/net v0.45.0
golang.org/x/sync v0.17.0
golang.org/x/sys v0.37.0
golang.org/x/term v0.36.0
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2
golang.zx2c4.com/wireguard v0.0.0-20230325221338-052af4a8072b
golang.zx2c4.com/wireguard/windows v0.5.3
google.golang.org/protobuf v1.36.10
google.golang.org/protobuf v1.36.8
gopkg.in/yaml.v3 v3.0.1
gvisor.dev/gvisor v0.0.0-20240423190808-9d7a357edefe
)
@@ -50,6 +50,6 @@ require (
github.com/vishvananda/netns v0.0.5 // indirect
go.yaml.in/yaml/v2 v2.4.2 // indirect
golang.org/x/mod v0.24.0 // indirect
golang.org/x/time v0.7.0 // indirect
golang.org/x/time v0.5.0 // indirect
golang.org/x/tools v0.33.0 // indirect
)

37
go.sum
View File

@@ -17,6 +17,8 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cilium/ebpf v0.12.3 h1:8ht6F9MquybnY97at+VDZb3eQQr8ev79RueWeVaEcG4=
github.com/cilium/ebpf v0.12.3/go.mod h1:TctK1ivibvI3znr66ljgi4hqOT8EYQjz1KWBfb1UVgM=
github.com/cyberdelia/go-metrics-graphite v0.0.0-20161219230853-39f87cc3b432 h1:M5QgkYacWj0Xs8MhpIK/5uwU02icXpEoSo9sM2aRCps=
github.com/cyberdelia/go-metrics-graphite v0.0.0-20161219230853-39f87cc3b432/go.mod h1:xwIwAxMvYnVrGJPe2FKx5prTrnAjGOD8zvDOnxnrrkM=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -24,6 +26,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/flynn/noise v1.1.0 h1:KjPQoQCEFdZDiP03phOvGi11+SVVhBG2wOWAorLsstg=
github.com/flynn/noise v1.1.0/go.mod h1:xbMo+0i6+IGbYdJhF31t2eR1BIU0CYc12+BNAKwUTag=
github.com/frankban/quicktest v1.14.5 h1:dfYrrRyLtiqT9GyKXgdh+k4inNeTvmGbuSgZ3lx3GhA=
github.com/frankban/quicktest v1.14.5/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0=
github.com/gaissmai/bart v0.25.0 h1:eqiokVPqM3F94vJ0bTHXHtH91S8zkKL+bKh+BsGOsJM=
github.com/gaissmai/bart v0.25.0/go.mod h1:GREWQfTLRWz/c5FTOsIw+KkscuFkIV5t8Rp7Nd1Td5c=
github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=
@@ -78,8 +82,9 @@ github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfn
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
@@ -155,15 +160,13 @@ go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20210322153248-0c34fe9e7dc2/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4=
golang.org/x/crypto v0.44.0 h1:A97SsFvM3AIwEEmTBiaxPPTYpDC47w720rdiiUvgoAU=
golang.org/x/crypto v0.44.0/go.mod h1:013i+Nw79BMiQiMsOPcVCB5ZIJbYkerPrGnOa00tvmc=
golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04=
golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0=
golang.org/x/exp v0.0.0-20230725093048-515e97ebf090 h1:Di6/M8l0O2lCLc6VVRWhgCiApHV8MnQurBnFSHsQtNY=
golang.org/x/exp v0.0.0-20230725093048-515e97ebf090/go.mod h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc=
golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY=
@@ -182,8 +185,8 @@ golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLL
golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
golang.org/x/net v0.45.0 h1:RLBg5JKixCy82FtLJpeNlVM0nrSqpCRYzVU1n8kj0tM=
golang.org/x/net v0.45.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -191,8 +194,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I=
golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug=
golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -209,16 +212,16 @@ golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc=
golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU=
golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254=
golang.org/x/term v0.36.0 h1:zMPR+aF8gfksFprF/Nc/rd1wRS1EI6nDBGyWAvDzx2Q=
golang.org/x/term v0.36.0/go.mod h1:Qu394IJq6V6dCBRgwqshf3mPF85AqzYEzofzRdZkWss=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ=
golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk=
golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
@@ -244,8 +247,8 @@ google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miE
google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE=
google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=

View File

@@ -2,6 +2,7 @@ package nebula
import (
"net/netip"
"slices"
"time"
"github.com/flynn/noise"
@@ -191,17 +192,17 @@ func ixHandshakeStage1(f *Interface, addr netip.AddrPort, via *ViaSender, packet
return
}
var vpnAddrs []netip.Addr
var filteredNetworks []netip.Prefix
certName := remoteCert.Certificate.Name()
certVersion := remoteCert.Certificate.Version()
fingerprint := remoteCert.Fingerprint
issuer := remoteCert.Certificate.Issuer()
vpnNetworks := remoteCert.Certificate.Networks()
anyVpnAddrsInCommon := false
vpnAddrs := make([]netip.Addr, len(vpnNetworks))
for i, network := range vpnNetworks {
if f.myVpnAddrsTable.Contains(network.Addr()) {
f.l.WithField("vpnNetworks", vpnNetworks).WithField("udpAddr", addr).
for _, network := range remoteCert.Certificate.Networks() {
vpnAddr := network.Addr()
if f.myVpnAddrsTable.Contains(vpnAddr) {
f.l.WithField("vpnAddr", vpnAddr).WithField("udpAddr", addr).
WithField("certName", certName).
WithField("certVersion", certVersion).
WithField("fingerprint", fingerprint).
@@ -209,10 +210,24 @@ func ixHandshakeStage1(f *Interface, addr netip.AddrPort, via *ViaSender, packet
WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).Error("Refusing to handshake with myself")
return
}
vpnAddrs[i] = network.Addr()
if f.myVpnNetworksTable.Contains(network.Addr()) {
anyVpnAddrsInCommon = true
// vpnAddrs outside our vpn networks are of no use to us, filter them out
if !f.myVpnNetworksTable.Contains(vpnAddr) {
continue
}
filteredNetworks = append(filteredNetworks, network)
vpnAddrs = append(vpnAddrs, vpnAddr)
}
if len(vpnAddrs) == 0 {
f.l.WithError(err).WithField("udpAddr", addr).
WithField("certName", certName).
WithField("certVersion", certVersion).
WithField("fingerprint", fingerprint).
WithField("issuer", issuer).
WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).Error("No usable vpn addresses from host, refusing handshake")
return
}
if addr.IsValid() {
@@ -249,30 +264,26 @@ func ixHandshakeStage1(f *Interface, addr netip.AddrPort, via *ViaSender, packet
},
}
msgRxL := f.l.WithFields(m{
"vpnAddrs": vpnAddrs,
"udpAddr": addr,
"certName": certName,
"certVersion": certVersion,
"fingerprint": fingerprint,
"issuer": issuer,
"initiatorIndex": hs.Details.InitiatorIndex,
"responderIndex": hs.Details.ResponderIndex,
"remoteIndex": h.RemoteIndex,
"handshake": m{"stage": 1, "style": "ix_psk0"},
})
if anyVpnAddrsInCommon {
msgRxL.Info("Handshake message received")
} else {
//todo warn if not lighthouse or relay?
msgRxL.Info("Handshake message received, but no vpnNetworks in common.")
}
f.l.WithField("vpnAddrs", vpnAddrs).WithField("udpAddr", addr).
WithField("certName", certName).
WithField("certVersion", certVersion).
WithField("fingerprint", fingerprint).
WithField("issuer", issuer).
WithField("initiatorIndex", hs.Details.InitiatorIndex).WithField("responderIndex", hs.Details.ResponderIndex).
WithField("remoteIndex", h.RemoteIndex).WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).
Info("Handshake message received")
hs.Details.ResponderIndex = myIndex
hs.Details.Cert = cs.getHandshakeBytes(ci.myCert.Version())
if hs.Details.Cert == nil {
msgRxL.WithField("myCertVersion", ci.myCert.Version()).
f.l.WithField("vpnAddrs", vpnAddrs).WithField("udpAddr", addr).
WithField("certName", certName).
WithField("certVersion", certVersion).
WithField("fingerprint", fingerprint).
WithField("issuer", issuer).
WithField("initiatorIndex", hs.Details.InitiatorIndex).WithField("responderIndex", hs.Details.ResponderIndex).
WithField("remoteIndex", h.RemoteIndex).WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).
WithField("certVersion", ci.myCert.Version()).
Error("Unable to handshake with host because no certificate handshake bytes is available")
return
}
@@ -330,7 +341,7 @@ func ixHandshakeStage1(f *Interface, addr netip.AddrPort, via *ViaSender, packet
hostinfo.remotes = f.lightHouse.QueryCache(vpnAddrs)
hostinfo.SetRemote(addr)
hostinfo.buildNetworks(f.myVpnNetworksTable, remoteCert.Certificate)
hostinfo.buildNetworks(filteredNetworks, remoteCert.Certificate.UnsafeNetworks())
existing, err := f.handshakeManager.CheckAndComplete(hostinfo, 0, f)
if err != nil {
@@ -571,22 +582,31 @@ func ixHandshakeStage2(f *Interface, addr netip.AddrPort, via *ViaSender, hh *Ha
hostinfo.relayState.InsertRelayTo(via.relayHI.vpnAddrs[0])
}
correctHostResponded := false
anyVpnAddrsInCommon := false
vpnAddrs := make([]netip.Addr, len(vpnNetworks))
for i, network := range vpnNetworks {
vpnAddrs[i] = network.Addr()
if f.myVpnNetworksTable.Contains(network.Addr()) {
anyVpnAddrsInCommon = true
}
if hostinfo.vpnAddrs[0] == network.Addr() {
// todo is it more correct to see if any of hostinfo.vpnAddrs are in the cert? it should have len==1, but one day it might not?
correctHostResponded = true
var vpnAddrs []netip.Addr
var filteredNetworks []netip.Prefix
for _, network := range vpnNetworks {
// vpnAddrs outside our vpn networks are of no use to us, filter them out
vpnAddr := network.Addr()
if !f.myVpnNetworksTable.Contains(vpnAddr) {
continue
}
filteredNetworks = append(filteredNetworks, network)
vpnAddrs = append(vpnAddrs, vpnAddr)
}
if len(vpnAddrs) == 0 {
f.l.WithError(err).WithField("udpAddr", addr).
WithField("certName", certName).
WithField("certVersion", certVersion).
WithField("fingerprint", fingerprint).
WithField("issuer", issuer).
WithField("handshake", m{"stage": 2, "style": "ix_psk0"}).Error("No usable vpn addresses from host, refusing handshake")
return true
}
// Ensure the right host responded
if !correctHostResponded {
if !slices.Contains(vpnAddrs, hostinfo.vpnAddrs[0]) {
f.l.WithField("intendedVpnAddrs", hostinfo.vpnAddrs).WithField("haveVpnNetworks", vpnNetworks).
WithField("udpAddr", addr).
WithField("certName", certName).
@@ -598,7 +618,6 @@ func ixHandshakeStage2(f *Interface, addr netip.AddrPort, via *ViaSender, hh *Ha
f.handshakeManager.DeleteHostInfo(hostinfo)
// Create a new hostinfo/handshake for the intended vpn ip
//TODO is hostinfo.vpnAddrs[0] always the address to use?
f.handshakeManager.StartHandshake(hostinfo.vpnAddrs[0], func(newHH *HandshakeHostInfo) {
// Block the current used address
newHH.hostinfo.remotes = hostinfo.remotes
@@ -625,7 +644,7 @@ func ixHandshakeStage2(f *Interface, addr netip.AddrPort, via *ViaSender, hh *Ha
ci.window.Update(f.l, 2)
duration := time.Since(hh.startTime).Nanoseconds()
msgRxL := f.l.WithField("vpnAddrs", vpnAddrs).WithField("udpAddr", addr).
f.l.WithField("vpnAddrs", vpnAddrs).WithField("udpAddr", addr).
WithField("certName", certName).
WithField("certVersion", certVersion).
WithField("fingerprint", fingerprint).
@@ -633,17 +652,12 @@ func ixHandshakeStage2(f *Interface, addr netip.AddrPort, via *ViaSender, hh *Ha
WithField("initiatorIndex", hs.Details.InitiatorIndex).WithField("responderIndex", hs.Details.ResponderIndex).
WithField("remoteIndex", h.RemoteIndex).WithField("handshake", m{"stage": 2, "style": "ix_psk0"}).
WithField("durationNs", duration).
WithField("sentCachedPackets", len(hh.packetStore))
if anyVpnAddrsInCommon {
msgRxL.Info("Handshake message received")
} else {
//todo warn if not lighthouse or relay?
msgRxL.Info("Handshake message received, but no vpnNetworks in common.")
}
WithField("sentCachedPackets", len(hh.packetStore)).
Info("Handshake message received")
// Build up the radix for the firewall if we have subnets in the cert
hostinfo.vpnAddrs = vpnAddrs
hostinfo.buildNetworks(f.myVpnNetworksTable, remoteCert.Certificate)
hostinfo.buildNetworks(filteredNetworks, remoteCert.Certificate.UnsafeNetworks())
// Complete our handshake and update metrics, this will replace any existing tunnels for the vpnAddrs here
f.handshakeManager.Complete(hostinfo, f)

View File

@@ -269,12 +269,12 @@ func (hm *HandshakeManager) handleOutbound(vpnIp netip.Addr, lighthouseTriggered
hostinfo.logger(hm.l).WithField("relays", hostinfo.remotes.relays).Info("Attempt to relay through hosts")
// Send a RelayRequest to all known Relay IP's
for _, relay := range hostinfo.remotes.relays {
// Don't relay through the host I'm trying to connect to
// Don't relay to myself
if relay == vpnIp {
continue
}
// Don't relay to myself
// Don't relay through the host I'm trying to connect to
if hm.f.myVpnAddrsTable.Contains(relay) {
continue
}

View File

@@ -212,18 +212,6 @@ func (rs *RelayState) InsertRelay(ip netip.Addr, idx uint32, r *Relay) {
rs.relayForByIdx[idx] = r
}
// NetworkType classifies how a network associated with a HostInfo relates to
// the networks in our own certificate.
type NetworkType uint8

const (
	// NetworkTypeUnknown is the zero value of NetworkType.
	NetworkTypeUnknown NetworkType = iota
	// NetworkTypeVPN is a network that overlaps one or more of the vpnNetworks in our certificate
	NetworkTypeVPN
	// NetworkTypeVPNPeer is a network that does not overlap one of our networks
	NetworkTypeVPNPeer
	// NetworkTypeUnsafe is a network from Certificate.UnsafeNetworks()
	NetworkTypeUnsafe
)
type HostInfo struct {
remote netip.AddrPort
remotes *RemoteList
@@ -237,8 +225,8 @@ type HostInfo struct {
// vpn networks but were removed because they are not usable
vpnAddrs []netip.Addr
// networks is a combination of specific vpn addresses (not prefixes!) and full unsafe networks assigned to this host.
networks *bart.Table[NetworkType]
// networks are both all vpn and unsafe networks assigned to this host
networks *bart.Lite
relayState RelayState
// HandshakePacket records the packets used to create this hostinfo
@@ -742,26 +730,20 @@ func (i *HostInfo) SetRemoteIfPreferred(hm *HostMap, newRemote netip.AddrPort) b
return false
}
// buildNetworks fills in the networks field of HostInfo. It accepts a cert.Certificate so you never ever mix the network types up.
func (i *HostInfo) buildNetworks(myVpnNetworksTable *bart.Lite, c cert.Certificate) {
if len(c.Networks()) == 1 && len(c.UnsafeNetworks()) == 0 {
if myVpnNetworksTable.Contains(c.Networks()[0].Addr()) {
return // Simple case, no BART needed
}
func (i *HostInfo) buildNetworks(networks, unsafeNetworks []netip.Prefix) {
if len(networks) == 1 && len(unsafeNetworks) == 0 {
// Simple case, no CIDRTree needed
return
}
i.networks = new(bart.Table[NetworkType])
for _, network := range c.Networks() {
i.networks = new(bart.Lite)
for _, network := range networks {
nprefix := netip.PrefixFrom(network.Addr(), network.Addr().BitLen())
if myVpnNetworksTable.Contains(network.Addr()) {
i.networks.Insert(nprefix, NetworkTypeVPN)
} else {
i.networks.Insert(nprefix, NetworkTypeVPNPeer)
}
i.networks.Insert(nprefix)
}
for _, network := range c.UnsafeNetworks() {
i.networks.Insert(network, NetworkTypeUnsafe)
for _, network := range unsafeNetworks {
i.networks.Insert(network)
}
}

198
inside.go
View File

@@ -2,18 +2,18 @@ package nebula
import (
"net/netip"
"time"
"unsafe"
"github.com/sirupsen/logrus"
"github.com/slackhq/nebula/firewall"
"github.com/slackhq/nebula/header"
"github.com/slackhq/nebula/iputil"
"github.com/slackhq/nebula/noiseutil"
"github.com/slackhq/nebula/packet"
"github.com/slackhq/nebula/overlay"
"github.com/slackhq/nebula/routing"
)
func (f *Interface) consumeInsidePacket(packet []byte, fwPacket *firewall.Packet, nb []byte, out *packet.Packet, q int, localCache firewall.ConntrackCache, now time.Time) {
func (f *Interface) consumeInsidePacket(packet []byte, fwPacket *firewall.Packet, nb, out []byte, q int, localCache *firewall.ConntrackCache) {
err := newPacket(packet, false, fwPacket)
if err != nil {
if f.l.Level >= logrus.DebugLevel {
@@ -55,7 +55,7 @@ func (f *Interface) consumeInsidePacket(packet []byte, fwPacket *firewall.Packet
})
if hostinfo == nil {
f.rejectInside(packet, out.Payload, q) //todo vector?
f.rejectInside(packet, out, q)
if f.l.Level >= logrus.DebugLevel {
f.l.WithField("vpnAddr", fwPacket.RemoteAddr).
WithField("fwPacket", fwPacket).
@@ -68,11 +68,12 @@ func (f *Interface) consumeInsidePacket(packet []byte, fwPacket *firewall.Packet
return
}
dropReason := f.firewall.Drop(*fwPacket, false, hostinfo, f.pki.GetCAPool(), localCache, now)
dropReason := f.firewall.Drop(*fwPacket, false, hostinfo, f.pki.GetCAPool(), localCache)
if dropReason == nil {
f.sendNoMetricsDelayed(header.Message, 0, hostinfo.ConnectionState, hostinfo, netip.AddrPort{}, packet, nb, out, q)
f.sendNoMetrics(header.Message, 0, hostinfo.ConnectionState, hostinfo, netip.AddrPort{}, packet, nb, out, q)
} else {
f.rejectInside(packet, out.Payload, q) //todo vector?
f.rejectInside(packet, out, q)
if f.l.Level >= logrus.DebugLevel {
hostinfo.logger(f.l).
WithField("fwPacket", fwPacket).
@@ -121,10 +122,9 @@ func (f *Interface) rejectOutside(packet []byte, ci *ConnectionState, hostinfo *
f.sendNoMetrics(header.Message, 0, ci, hostinfo, netip.AddrPort{}, out, nb, packet, q)
}
// Handshake will attempt to initiate a tunnel with the provided vpn address. This is a no-op if the tunnel is already established or being established
// it does not check if it is within our vpn networks!
// Handshake will attempt to initiate a tunnel with the provided vpn address if it is within our vpn networks. This is a no-op if the tunnel is already established or being established
func (f *Interface) Handshake(vpnAddr netip.Addr) {
f.handshakeManager.GetOrHandshake(vpnAddr, nil)
f.getOrHandshakeNoRouting(vpnAddr, nil)
}
// getOrHandshakeNoRouting returns nil if the vpnAddr is not routable.
@@ -140,6 +140,7 @@ func (f *Interface) getOrHandshakeNoRouting(vpnAddr netip.Addr, cacheCallback fu
// getOrHandshakeConsiderRouting will try to find the HostInfo to handle this packet, starting a handshake if necessary.
// If the 2nd return var is false then the hostinfo is not ready to be used in a tunnel.
func (f *Interface) getOrHandshakeConsiderRouting(fwPacket *firewall.Packet, cacheCallback func(*HandshakeHostInfo)) (*HostInfo, bool) {
destinationAddr := fwPacket.RemoteAddr
hostinfo, ready := f.getOrHandshakeNoRouting(destinationAddr, cacheCallback)
@@ -219,7 +220,7 @@ func (f *Interface) sendMessageNow(t header.MessageType, st header.MessageSubTyp
}
// check if packet is in outbound fw rules
dropReason := f.firewall.Drop(*fp, false, hostinfo, f.pki.GetCAPool(), nil, time.Now())
dropReason := f.firewall.Drop(*fp, false, hostinfo, f.pki.GetCAPool(), nil)
if dropReason != nil {
if f.l.Level >= logrus.DebugLevel {
f.l.WithField("fwPacket", fp).
@@ -232,10 +233,9 @@ func (f *Interface) sendMessageNow(t header.MessageType, st header.MessageSubTyp
f.sendNoMetrics(header.Message, st, hostinfo.ConnectionState, hostinfo, netip.AddrPort{}, p, nb, out, 0)
}
// SendMessageToVpnAddr handles real addr:port lookup and sends to the current best known address for vpnAddr.
// This function ignores myVpnNetworksTable, and will always attempt to treat the address as a vpnAddr
// SendMessageToVpnAddr handles real addr:port lookup and sends to the current best known address for vpnAddr
func (f *Interface) SendMessageToVpnAddr(t header.MessageType, st header.MessageSubType, vpnAddr netip.Addr, p, nb, out []byte) {
hostInfo, ready := f.handshakeManager.GetOrHandshake(vpnAddr, func(hh *HandshakeHostInfo) {
hostInfo, ready := f.getOrHandshakeNoRouting(vpnAddr, func(hh *HandshakeHostInfo) {
hh.cachePacket(f.l, t, st, p, f.SendMessageToHostInfo, f.cachedPacketMetrics)
})
@@ -337,9 +337,21 @@ func (f *Interface) sendNoMetrics(t header.MessageType, st header.MessageSubType
if ci.eKey == nil {
return
}
useRelay := !remote.IsValid() && !hostinfo.remote.IsValid()
target := remote
if !target.IsValid() {
target = hostinfo.remote
}
useRelay := !target.IsValid()
fullOut := out
var pkt *overlay.Packet
if !useRelay && f.batches.Enabled() {
pkt = f.batches.newPacket()
if pkt != nil {
out = pkt.Payload()[:0]
}
}
if useRelay {
if len(out) < header.Len {
// out always has a capacity of mtu, but not always a length greater than the header.Len.
@@ -373,119 +385,85 @@ func (f *Interface) sendNoMetrics(t header.MessageType, st header.MessageSubType
}
var err error
if len(p) > 0 && slicesOverlap(out, p) {
tmp := make([]byte, len(p))
copy(tmp, p)
p = tmp
}
out, err = ci.eKey.EncryptDanger(out, out, p, c, nb)
if noiseutil.EncryptLockNeeded {
ci.writeLock.Unlock()
}
if err != nil {
if pkt != nil {
pkt.Release()
}
hostinfo.logger(f.l).WithError(err).
WithField("udpAddr", remote).WithField("counter", c).
WithField("udpAddr", target).WithField("counter", c).
WithField("attemptedCounter", c).
Error("Failed to encrypt outgoing packet")
return
}
if remote.IsValid() {
err = f.writers[q].WriteTo(out, remote)
if err != nil {
hostinfo.logger(f.l).WithError(err).
WithField("udpAddr", remote).Error("Failed to write outgoing packet")
}
} else if hostinfo.remote.IsValid() {
err = f.writers[q].WriteTo(out, hostinfo.remote)
if err != nil {
hostinfo.logger(f.l).WithError(err).
WithField("udpAddr", remote).Error("Failed to write outgoing packet")
}
} else {
// Try to send via a relay
for _, relayIP := range hostinfo.relayState.CopyRelayIps() {
relayHostInfo, relay, err := f.hostMap.QueryVpnAddrsRelayFor(hostinfo.vpnAddrs, relayIP)
if err != nil {
hostinfo.relayState.DeleteRelay(relayIP)
hostinfo.logger(f.l).WithField("relay", relayIP).WithError(err).Info("sendNoMetrics failed to find HostInfo")
continue
if target.IsValid() {
if pkt != nil {
pkt.Len = len(out)
if f.l.Level >= logrus.DebugLevel {
f.l.WithFields(logrus.Fields{
"queue": q,
"dest": target,
"payload_len": pkt.Len,
"use_batches": true,
"remote_index": hostinfo.remoteIndexId,
}).Debug("enqueueing packet to UDP batch queue")
}
f.SendVia(relayHostInfo, relay, out, nb, fullOut[:header.Len+len(out)], true)
break
if f.tryQueuePacket(q, pkt, target) {
return
}
if f.l.Level >= logrus.DebugLevel {
f.l.WithFields(logrus.Fields{
"queue": q,
"dest": target,
}).Debug("failed to enqueue packet; falling back to immediate send")
}
f.writeImmediatePacket(q, pkt, target, hostinfo)
return
}
}
}
func (f *Interface) sendNoMetricsDelayed(t header.MessageType, st header.MessageSubType, ci *ConnectionState, hostinfo *HostInfo, remote netip.AddrPort, p, nb []byte, out *packet.Packet, q int) {
if ci.eKey == nil {
return
}
useRelay := !remote.IsValid() && !hostinfo.remote.IsValid()
fullOut := out.Payload
if useRelay {
if len(out.Payload) < header.Len {
// out always has a capacity of mtu, but not always a length greater than the header.Len.
// Grow it to make sure the next operation works.
out.Payload = out.Payload[:header.Len]
if f.tryQueueDatagram(q, out, target) {
return
}
// Save a header's worth of data at the front of the 'out' buffer.
out.Payload = out.Payload[header.Len:]
}
if noiseutil.EncryptLockNeeded {
// NOTE: for goboring AESGCMTLS we need to lock because of the nonce check
ci.writeLock.Lock()
}
c := ci.messageCounter.Add(1)
//l.WithField("trace", string(debug.Stack())).Error("out Header ", &Header{Version, t, st, 0, hostinfo.remoteIndexId, c}, p)
out.Payload = header.Encode(out.Payload, header.Version, t, st, hostinfo.remoteIndexId, c)
f.connectionManager.Out(hostinfo)
// Query our LH if we haven't since the last time we've been rebound, this will cause the remote to punch against
// all our addrs and enable a faster roaming.
if t != header.CloseTunnel && hostinfo.lastRebindCount != f.rebindCount {
//NOTE: there is an update hole if a tunnel isn't used and exactly 256 rebinds occur before the tunnel is
// finally used again. This tunnel would eventually be torn down and recreated if this action didn't help.
f.lightHouse.QueryServer(hostinfo.vpnAddrs[0])
hostinfo.lastRebindCount = f.rebindCount
if f.l.Level >= logrus.DebugLevel {
f.l.WithField("vpnAddrs", hostinfo.vpnAddrs).Debug("Lighthouse update triggered for punch due to rebind counter")
}
}
var err error
out.Payload, err = ci.eKey.EncryptDanger(out.Payload, out.Payload, p, c, nb)
if noiseutil.EncryptLockNeeded {
ci.writeLock.Unlock()
}
if err != nil {
hostinfo.logger(f.l).WithError(err).
WithField("udpAddr", remote).WithField("counter", c).
WithField("attemptedCounter", c).
Error("Failed to encrypt outgoing packet")
f.writeImmediate(q, out, target, hostinfo)
return
}
if remote.IsValid() {
err = f.writers[q].Prep(out, remote)
// fall back to relay path
if pkt != nil {
pkt.Release()
}
// Try to send via a relay
for _, relayIP := range hostinfo.relayState.CopyRelayIps() {
relayHostInfo, relay, err := f.hostMap.QueryVpnAddrsRelayFor(hostinfo.vpnAddrs, relayIP)
if err != nil {
hostinfo.logger(f.l).WithError(err).WithField("udpAddr", remote).Error("Failed to write outgoing packet")
}
} else if hostinfo.remote.IsValid() {
err = f.writers[q].Prep(out, hostinfo.remote)
if err != nil {
hostinfo.logger(f.l).WithError(err).WithField("udpAddr", remote).Error("Failed to write outgoing packet")
}
} else {
// Try to send via a relay
for _, relayIP := range hostinfo.relayState.CopyRelayIps() {
relayHostInfo, relay, err := f.hostMap.QueryVpnAddrsRelayFor(hostinfo.vpnAddrs, relayIP)
if err != nil {
hostinfo.relayState.DeleteRelay(relayIP)
hostinfo.logger(f.l).WithField("relay", relayIP).WithError(err).Info("sendNoMetrics failed to find HostInfo")
continue
}
//todo vector!!
f.SendVia(relayHostInfo, relay, out.Payload, nb, fullOut[:header.Len+len(out.Payload)], true)
break
hostinfo.relayState.DeleteRelay(relayIP)
hostinfo.logger(f.l).WithField("relay", relayIP).WithError(err).Info("sendNoMetrics failed to find HostInfo")
continue
}
f.SendVia(relayHostInfo, relay, out, nb, fullOut[:header.Len+len(out)], true)
break
}
}
// slicesOverlap reports whether the two byte slices share any backing
// memory. cipher.AEAD.Seal requires that dst and plaintext occupy
// disjoint regions, so callers use this to decide when a copy is needed.
func slicesOverlap(a, b []byte) bool {
	if len(a) == 0 || len(b) == 0 {
		// An empty slice occupies no memory and can never overlap.
		return false
	}
	aLo := uintptr(unsafe.Pointer(&a[0]))
	bLo := uintptr(unsafe.Pointer(&b[0]))
	aHi := aLo + uintptr(len(a))
	bHi := bLo + uintptr(len(b))
	// Two half-open intervals [lo, hi) intersect iff each starts before
	// the other ends.
	return bLo < aHi && aLo < bHi
}

View File

@@ -4,9 +4,11 @@ import (
"context"
"errors"
"fmt"
"io"
"net/netip"
"os"
"runtime"
"strings"
"sync/atomic"
"time"
@@ -17,12 +19,16 @@ import (
"github.com/slackhq/nebula/firewall"
"github.com/slackhq/nebula/header"
"github.com/slackhq/nebula/overlay"
"github.com/slackhq/nebula/packet"
"github.com/slackhq/nebula/udp"
)
const mtu = 9001
const batch = 1024 //todo config!
const (
mtu = 9001
defaultGSOFlushInterval = 150 * time.Microsecond
defaultBatchQueueDepthFactor = 4
defaultGSOMaxSegments = 8
maxKernelGSOSegments = 64
)
type InterfaceConfig struct {
HostMap *HostMap
@@ -37,6 +43,9 @@ type InterfaceConfig struct {
connectionManager *connectionManager
DropLocalBroadcast bool
DropMulticast bool
EnableGSO bool
EnableGRO bool
GSOMaxSegments int
routines int
MessageMetrics *MessageMetrics
version string
@@ -48,6 +57,8 @@ type InterfaceConfig struct {
reQueryWait time.Duration
ConntrackCacheTimeout time.Duration
BatchFlushInterval time.Duration
BatchQueueDepth int
l *logrus.Logger
}
@@ -85,20 +96,25 @@ type Interface struct {
version string
conntrackCacheTimeout time.Duration
batchQueueDepth int
enableGSO bool
enableGRO bool
gsoMaxSegments int
batchUDPQueueGauge metrics.Gauge
batchUDPFlushCounter metrics.Counter
batchTunQueueGauge metrics.Gauge
batchTunFlushCounter metrics.Counter
batchFlushInterval atomic.Int64
sendSem chan struct{}
writers []udp.Conn
readers []overlay.TunDev
readers []io.ReadWriteCloser
batches batchPipelines
metricHandshakes metrics.Histogram
messageMetrics *MessageMetrics
cachedPacketMetrics *cachedPacketMetrics
listenInN int
listenOutN int
listenInMetric metrics.Histogram
listenOutMetric metrics.Histogram
l *logrus.Logger
}
@@ -168,6 +184,22 @@ func NewInterface(ctx context.Context, c *InterfaceConfig) (*Interface, error) {
return nil, errors.New("no connection manager")
}
if c.GSOMaxSegments <= 0 {
c.GSOMaxSegments = defaultGSOMaxSegments
}
if c.GSOMaxSegments > maxKernelGSOSegments {
c.GSOMaxSegments = maxKernelGSOSegments
}
if c.BatchQueueDepth <= 0 {
c.BatchQueueDepth = c.routines * defaultBatchQueueDepthFactor
}
if c.BatchFlushInterval < 0 {
c.BatchFlushInterval = 0
}
if c.BatchFlushInterval == 0 && c.EnableGSO {
c.BatchFlushInterval = defaultGSOFlushInterval
}
cs := c.pki.getCertState()
ifce := &Interface{
pki: c.pki,
@@ -184,7 +216,7 @@ func NewInterface(ctx context.Context, c *InterfaceConfig) (*Interface, error) {
routines: c.routines,
version: c.version,
writers: make([]udp.Conn, c.routines),
readers: make([]overlay.TunDev, c.routines),
readers: make([]io.ReadWriteCloser, c.routines),
myVpnNetworks: cs.myVpnNetworks,
myVpnNetworksTable: cs.myVpnNetworksTable,
myVpnAddrs: cs.myVpnAddrs,
@@ -193,6 +225,10 @@ func NewInterface(ctx context.Context, c *InterfaceConfig) (*Interface, error) {
relayManager: c.relayManager,
connectionManager: c.connectionManager,
conntrackCacheTimeout: c.ConntrackCacheTimeout,
batchQueueDepth: c.BatchQueueDepth,
enableGSO: c.EnableGSO,
enableGRO: c.EnableGRO,
gsoMaxSegments: c.GSOMaxSegments,
metricHandshakes: metrics.GetOrRegisterHistogram("handshakes", nil, metrics.NewExpDecaySample(1028, 0.015)),
messageMetrics: c.MessageMetrics,
@@ -203,12 +239,27 @@ func NewInterface(ctx context.Context, c *InterfaceConfig) (*Interface, error) {
l: c.l,
}
ifce.listenInMetric = metrics.GetOrRegisterHistogram("vhost.listenIn.n", nil, metrics.NewExpDecaySample(1028, 0.015))
ifce.listenOutMetric = metrics.GetOrRegisterHistogram("vhost.listenOut.n", nil, metrics.NewExpDecaySample(1028, 0.015))
ifce.tryPromoteEvery.Store(c.tryPromoteEvery)
ifce.batchUDPQueueGauge = metrics.GetOrRegisterGauge("batch.udp.queue_depth", nil)
ifce.batchUDPFlushCounter = metrics.GetOrRegisterCounter("batch.udp.flushes", nil)
ifce.batchTunQueueGauge = metrics.GetOrRegisterGauge("batch.tun.queue_depth", nil)
ifce.batchTunFlushCounter = metrics.GetOrRegisterCounter("batch.tun.flushes", nil)
ifce.batchFlushInterval.Store(int64(c.BatchFlushInterval))
ifce.sendSem = make(chan struct{}, c.routines)
ifce.batches.init(c.Inside, c.routines, c.BatchQueueDepth, c.GSOMaxSegments)
ifce.reQueryEvery.Store(c.reQueryEvery)
ifce.reQueryWait.Store(int64(c.reQueryWait))
if c.l.Level >= logrus.DebugLevel {
c.l.WithFields(logrus.Fields{
"enableGSO": c.EnableGSO,
"enableGRO": c.EnableGRO,
"gsoMaxSegments": c.GSOMaxSegments,
"batchQueueDepth": c.BatchQueueDepth,
"batchFlush": c.BatchFlushInterval,
"batching": ifce.batches.Enabled(),
}).Debug("initialized batch pipelines")
}
ifce.connectionManager.intf = ifce
@@ -234,7 +285,7 @@ func (f *Interface) activate() {
metrics.GetOrRegisterGauge("routines", nil).Update(int64(f.routines))
// Prepare n tun queues
var reader overlay.TunDev = f.inside
var reader io.ReadWriteCloser = f.inside
for i := 0; i < f.routines; i++ {
if i > 0 {
reader, err = f.inside.NewMultiQueueReader()
@@ -257,77 +308,69 @@ func (f *Interface) run() {
go f.listenOut(i)
}
if f.l.Level >= logrus.DebugLevel {
f.l.WithField("batching", f.batches.Enabled()).Debug("starting interface run loops")
}
if f.batches.Enabled() {
for i := 0; i < f.routines; i++ {
go f.runInsideBatchWorker(i)
go f.runTunWriteQueue(i)
go f.runSendQueue(i)
}
}
// Launch n queues to read packets from tun dev
for i := 0; i < f.routines; i++ {
go f.listenIn(f.readers[i], i)
}
}
func (f *Interface) listenOut(q int) {
func (f *Interface) listenOut(i int) {
runtime.LockOSThread()
var li udp.Conn
if q > 0 {
li = f.writers[q]
if i > 0 {
li = f.writers[i]
} else {
li = f.outside
}
ctCache := firewall.NewConntrackCacheTicker(f.conntrackCacheTimeout)
lhh := f.lightHouse.NewRequestHandler()
outPackets := make([]*packet.OutPacket, batch)
for i := 0; i < batch; i++ {
outPackets[i] = packet.NewOut()
}
plaintext := make([]byte, udp.MTU)
h := &header.H{}
fwPacket := &firewall.Packet{}
nb := make([]byte, 12, 12)
toSend := make([][]byte, batch)
li.ListenOut(func(pkts []*packet.Packet) {
toSend = toSend[:0]
for i := range outPackets {
outPackets[i].Valid = false
outPackets[i].SegCounter = 0
}
f.readOutsidePacketsMany(pkts, outPackets, h, fwPacket, lhh, nb, q, ctCache.Get(f.l), time.Now())
//we opportunistically tx, but try to also send stragglers
if _, err := f.readers[q].WriteMany(outPackets, q); err != nil {
f.l.WithError(err).Error("Failed to send packets")
}
//todo I broke this
//n := len(toSend)
//if f.l.Level == logrus.DebugLevel {
// f.listenOutMetric.Update(int64(n))
//}
//f.listenOutN = n
li.ListenOut(func(fromUdpAddr netip.AddrPort, payload []byte) {
f.readOutsidePackets(fromUdpAddr, nil, plaintext[:0], payload, h, fwPacket, lhh, nb, i, ctCache.Get(f.l))
})
}
func (f *Interface) listenIn(reader overlay.TunDev, queueNum int) {
func (f *Interface) listenIn(reader io.ReadWriteCloser, i int) {
runtime.LockOSThread()
if f.batches.Enabled() {
if br, ok := reader.(overlay.BatchReader); ok {
f.listenInBatchLocked(reader, br, i)
return
}
}
f.listenInLegacyLocked(reader, i)
}
func (f *Interface) listenInLegacyLocked(reader io.ReadWriteCloser, i int) {
packet := make([]byte, mtu)
out := make([]byte, mtu)
fwPacket := &firewall.Packet{}
nb := make([]byte, 12, 12)
conntrackCache := firewall.NewConntrackCacheTicker(f.conntrackCacheTimeout)
packets := make([]*packet.VirtIOPacket, batch)
outPackets := make([]*packet.Packet, batch)
for i := 0; i < batch; i++ {
packets[i] = packet.NewVIO()
outPackets[i] = packet.New(false) //todo?
}
for {
n, err := reader.ReadMany(packets, queueNum)
//todo!!
n, err := reader.Read(packet)
if err != nil {
if errors.Is(err, os.ErrClosed) && f.closed.Load() {
return
@@ -338,21 +381,581 @@ func (f *Interface) listenIn(reader overlay.TunDev, queueNum int) {
os.Exit(2)
}
if f.l.Level == logrus.DebugLevel {
f.listenInMetric.Update(int64(n))
}
f.listenInN = n
f.consumeInsidePacket(packet[:n], fwPacket, nb, out, i, conntrackCache.Get(f.l))
}
}
now := time.Now()
for i, pkt := range packets[:n] {
outPackets[i].OutLen = -1
f.consumeInsidePacket(pkt.Payload, fwPacket, nb, outPackets[i], queueNum, conntrackCache.Get(f.l), now)
reader.RecycleRxSeg(pkt, i == (n-1), queueNum) //todo handle err?
pkt.Reset()
}
_, err = f.writers[queueNum].WriteBatch(outPackets[:n])
// listenInBatchLocked reads packets from the tun device in batches and
// enqueues them onto the per-queue rx channels consumed by
// runInsideBatchWorker. It runs with the OS thread locked (see listenIn).
// On unrecoverable read errors it exits the process, matching the legacy
// read loop's behavior.
func (f *Interface) listenInBatchLocked(raw io.ReadWriteCloser, reader overlay.BatchReader, i int) {
	pool := f.batches.Pool()
	if pool == nil {
		f.l.Warn("batch pipeline enabled without an allocated pool; falling back to single-packet reads")
		f.listenInLegacyLocked(raw, i)
		return
	}
	for {
		packets, err := reader.ReadIntoBatch(pool)
		if err != nil {
			// Expected during shutdown: the reader was closed under us.
			if errors.Is(err, os.ErrClosed) && f.closed.Load() {
				return
			}
			// Headroom/virtio problems mean this device cannot service
			// batch reads; degrade to the single-packet path instead of
			// dying. (Removed a stray duplicate log line here that
			// mislabeled this read failure as a write failure and fired
			// even on clean shutdown.)
			if isVirtioHeadroomError(err) {
				f.l.WithError(err).Warn("Batch reader fell back due to tun headroom issue")
				f.listenInLegacyLocked(raw, i)
				return
			}
			f.l.WithError(err).Error("Error while reading outbound packet batch")
			os.Exit(2)
		}
		if len(packets) == 0 {
			continue
		}
		for _, pkt := range packets {
			if pkt == nil {
				continue
			}
			// enqueueRx takes ownership on success; on a full queue we
			// must release the buffer ourselves to avoid leaking it.
			if !f.batches.enqueueRx(i, pkt) {
				pkt.Release()
			}
		}
	}
}
// runInsideBatchWorker drains the rx queue for tun reader i, pushing each
// buffered packet through the inside-path pipeline (firewall, encrypt,
// send) and returning the buffer to its pool afterwards. It exits when
// the queue is closed.
func (f *Interface) runInsideBatchWorker(i int) {
	rx := f.batches.rxQueue(i)
	if rx == nil {
		return
	}
	// Per-worker scratch state, allocated once and reused for every packet.
	scratch := make([]byte, mtu)
	fwp := &firewall.Packet{}
	nonceBuf := make([]byte, 12)
	ctTicker := firewall.NewConntrackCacheTicker(f.conntrackCacheTimeout)
	for p := range rx {
		if p == nil {
			continue
		}
		f.consumeInsidePacket(p.Payload(), fwp, nonceBuf, scratch, i, ctTicker.Get(f.l))
		p.Release()
	}
}
// runSendQueue is the per-writer worker that batches queued datagrams and
// flushes them to the UDP socket. Datagrams accumulate in `pending` until
// one of: the GSO segment limit is reached, the slice is full, the flush
// timer fires, or the flush interval is zero (flush immediately).
func (f *Interface) runSendQueue(i int) {
	queue := f.batches.txQueue(i)
	if queue == nil {
		if f.l.Level >= logrus.DebugLevel {
			f.l.WithField("queue", i).Debug("tx queue not initialized; batching disabled for writer")
		}
		return
	}
	writer := f.writerForIndex(i)
	if writer == nil {
		if f.l.Level >= logrus.DebugLevel {
			f.l.WithField("queue", i).Debug("no UDP writer for batch queue")
		}
		return
	}
	if f.l.Level >= logrus.DebugLevel {
		f.l.WithField("queue", i).Debug("send queue worker started")
	}
	// This worker is expected to live for the life of the interface; an
	// exit is noteworthy.
	defer func() {
		if f.l.Level >= logrus.WarnLevel {
			f.l.WithField("queue", i).Warn("send queue worker exited")
		}
	}()
	// Size the pending buffer from the pipeline hint, but never smaller
	// than the GSO segment limit so a full GSO burst fits in one batch.
	batchCap := f.batches.batchSizeHint()
	if batchCap <= 0 {
		batchCap = 1
	}
	gsoLimit := f.effectiveGSOMaxSegments()
	if gsoLimit > batchCap {
		batchCap = gsoLimit
	}
	pending := make([]queuedDatagram, 0, batchCap)
	var (
		flushTimer *time.Timer
		flushC     <-chan time.Time
	)
	// dispatch flushes everything in pending and resets the flush timer.
	// timerFired tells us whether we got here from the timer channel, in
	// which case the timer needs no Stop/drain.
	dispatch := func(reason string, timerFired bool) {
		if len(pending) == 0 {
			return
		}
		batch := pending
		f.flushAndReleaseBatch(i, writer, batch, reason)
		// Zero the entries so released packet pointers are not retained
		// by the reused backing array.
		for idx := range batch {
			batch[idx] = queuedDatagram{}
		}
		pending = pending[:0]
		if flushTimer != nil {
			if !timerFired {
				// Stop the timer and drain its channel if it already
				// fired, so a stale tick can't trigger a spurious flush.
				if !flushTimer.Stop() {
					select {
					case <-flushTimer.C:
					default:
					}
				}
			}
			flushTimer = nil
			flushC = nil
		}
	}
	// armTimer starts the flush timer if one isn't running. A
	// non-positive interval means "no coalescing": flush right away.
	armTimer := func() {
		delay := f.currentBatchFlushInterval()
		if delay <= 0 {
			dispatch("nogso", false)
			return
		}
		if flushTimer == nil {
			flushTimer = time.NewTimer(delay)
			flushC = flushTimer.C
		}
	}
	for {
		select {
		case d := <-queue:
			if d.packet == nil {
				continue
			}
			if f.l.Level >= logrus.DebugLevel {
				f.l.WithFields(logrus.Fields{
					"queue":       i,
					"payload_len": d.packet.Len,
					"dest":        d.addr,
				}).Debug("send queue received packet")
			}
			pending = append(pending, d)
			// Flush as soon as a full GSO burst is ready...
			if gsoLimit > 0 && len(pending) >= gsoLimit {
				dispatch("gso", false)
				continue
			}
			// ...or when the pending buffer is at capacity.
			if len(pending) >= cap(pending) {
				dispatch("cap", false)
				continue
			}
			armTimer()
			f.observeUDPQueueLen(i)
		case <-flushC:
			dispatch("timer", true)
		}
	}
}
// runTunWriteQueue is the per-queue worker that batches decrypted packets
// destined for the tun device and writes them with WriteBatch. Packets
// accumulate in `pending` until the slice fills or the flush timer fires.
// Each packet must have the device's required headroom before being
// written; ones that can't be fixed up are dropped.
func (f *Interface) runTunWriteQueue(i int) {
	queue := f.batches.tunQueue(i)
	if queue == nil {
		return
	}
	writer := f.batches.inside
	if writer == nil {
		return
	}
	requiredHeadroom := writer.BatchHeadroom()
	batchCap := f.batches.batchSizeHint()
	if batchCap <= 0 {
		batchCap = 1
	}
	pending := make([]*overlay.Packet, 0, batchCap)
	var (
		flushTimer *time.Timer
		flushC     <-chan time.Time
	)
	// flush writes out everything in pending and resets the flush timer.
	// timerFired indicates we arrived via the timer channel (no Stop/drain
	// needed in that case).
	flush := func(reason string, timerFired bool) {
		if len(pending) == 0 {
			return
		}
		// Re-check headroom in place and compact the batch, filtering out
		// packets that could not be given enough headroom. `valid` aliases
		// pending's backing array, which is safe because we only ever keep
		// or skip entries in order.
		valid := pending[:0]
		for idx := range pending {
			if !f.ensurePacketHeadroom(&pending[idx], requiredHeadroom, i, reason) {
				pending[idx] = nil
				continue
			}
			if pending[idx] != nil {
				valid = append(valid, pending[idx])
			}
		}
		if len(valid) > 0 {
			if _, err := writer.WriteBatch(valid); err != nil {
				f.l.WithError(err).
					WithField("queue", i).
					WithField("reason", reason).
					Warn("Failed to write tun batch")
				// Fall back to writing each packet individually so one bad
				// batch doesn't drop everything.
				for _, pkt := range valid {
					if pkt != nil {
						f.writePacketToTun(i, pkt)
					}
				}
			}
		}
		pending = pending[:0]
		if flushTimer != nil {
			if !timerFired {
				// Stop and drain so a stale tick can't cause an empty flush.
				if !flushTimer.Stop() {
					select {
					case <-flushTimer.C:
					default:
					}
				}
			}
			flushTimer = nil
			flushC = nil
		}
	}
	// armTimer starts the flush timer if not already running; a
	// non-positive interval disables timed flushing (cap-only batching).
	armTimer := func() {
		delay := f.currentBatchFlushInterval()
		if delay <= 0 {
			return
		}
		if flushTimer == nil {
			flushTimer = time.NewTimer(delay)
			flushC = flushTimer.C
		}
	}
	for {
		select {
		case pkt := <-queue:
			if pkt == nil {
				continue
			}
			if f.ensurePacketHeadroom(&pkt, requiredHeadroom, i, "queue") {
				pending = append(pending, pkt)
			}
			if len(pending) >= cap(pending) {
				flush("cap", false)
				continue
			}
			armTimer()
			f.observeTunQueueLen(i)
		case <-flushC:
			flush("timer", true)
		}
	}
}
// flushAndReleaseBatch writes a batch of queued datagrams to the UDP
// writer, then returns every packet buffer to its pool and bumps the
// flush counter by the number of datagrams handled.
func (f *Interface) flushAndReleaseBatch(index int, writer udp.Conn, batch []queuedDatagram, reason string) {
	n := len(batch)
	if n == 0 {
		return
	}
	f.flushDatagrams(index, writer, batch, reason)
	for idx := range batch {
		p := batch[idx].packet
		if p == nil {
			continue
		}
		p.Release()
		batch[idx].packet = nil
	}
	if counter := f.batchUDPFlushCounter; counter != nil {
		counter.Inc(int64(n))
	}
}
// flushDatagrams writes the queued datagrams out the given UDP writer.
// When the writer supports batched sends (udp.BatchConn), consecutive
// datagrams to the same destination are grouped into chunks of at most
// maxSeg segments so the kernel can coalesce them (GSO); otherwise each
// datagram is written individually. Packet buffers are NOT released here;
// that is the caller's job (see flushAndReleaseBatch).
func (f *Interface) flushDatagrams(index int, writer udp.Conn, batch []queuedDatagram, reason string) {
	if len(batch) == 0 {
		return
	}
	if f.l.Level >= logrus.DebugLevel {
		f.l.WithFields(logrus.Fields{
			"writer":  index,
			"reason":  reason,
			"pending": len(batch),
		}).Debug("udp batch flush summary")
	}
	maxSeg := f.effectiveGSOMaxSegments()
	if bw, ok := writer.(udp.BatchConn); ok {
		chunkCap := maxSeg
		if chunkCap <= 0 {
			chunkCap = len(batch)
		}
		chunk := make([]udp.Datagram, 0, chunkCap)
		var (
			currentAddr netip.AddrPort
			segments    int
		)
		// flushChunk sends the accumulated same-destination chunk and
		// resets the chunk state for the next destination.
		flushChunk := func() {
			if len(chunk) == 0 {
				return
			}
			if f.l.Level >= logrus.DebugLevel {
				f.l.WithFields(logrus.Fields{
					"writer":        index,
					"segments":      len(chunk),
					"dest":          chunk[0].Addr,
					"reason":        reason,
					"pending_total": len(batch),
				}).Debug("flushing UDP batch")
			}
			if err := bw.WriteBatch(chunk); err != nil {
				f.l.WithError(err).
					WithField("writer", index).
					WithField("reason", reason).
					Warn("Failed to write UDP batch")
			}
			chunk = chunk[:0]
			segments = 0
		}
		for _, item := range batch {
			if item.packet == nil || !item.addr.IsValid() {
				continue
			}
			payload := item.packet.Payload()[:item.packet.Len]
			// First datagram of a fresh chunk establishes the destination.
			if segments == 0 {
				currentAddr = item.addr
			}
			// Destination changed or the chunk hit the GSO segment cap:
			// send what we have before starting a new chunk.
			if item.addr != currentAddr || (maxSeg > 0 && segments >= maxSeg) {
				flushChunk()
				currentAddr = item.addr
			}
			chunk = append(chunk, udp.Datagram{Payload: payload, Addr: item.addr})
			segments++
		}
		flushChunk()
		return
	}
	// Non-batching writer: one WriteTo per datagram.
	for _, item := range batch {
		if item.packet == nil || !item.addr.IsValid() {
			continue
		}
		if f.l.Level >= logrus.DebugLevel {
			f.l.WithFields(logrus.Fields{
				"writer":   index,
				"reason":   reason,
				"dest":     item.addr,
				"segments": 1,
			}).Debug("flushing UDP batch")
		}
		if err := writer.WriteTo(item.packet.Payload()[:item.packet.Len], item.addr); err != nil {
			f.l.WithError(err).
				WithField("writer", index).
				WithField("udpAddr", item.addr).
				WithField("reason", reason).
				Warn("Failed to write UDP packet")
		}
	}
}
// tryQueueDatagram copies buf into a pooled packet and hands it to the
// batch sender for queue q. It returns false — without taking ownership
// of buf — when batching is disabled, addr is invalid, no pooled packet
// is available, the payload does not fit, or the queue is full.
func (f *Interface) tryQueueDatagram(q int, buf []byte, addr netip.AddrPort) bool {
	if !f.batches.Enabled() || !addr.IsValid() {
		return false
	}
	p := f.batches.newPacket()
	if p == nil {
		return false
	}
	dst := p.Payload()
	if len(dst) < len(buf) {
		p.Release()
		return false
	}
	p.Len = copy(dst, buf)
	if !f.batches.enqueueTx(q, p, addr) {
		p.Release()
		return false
	}
	f.observeUDPQueueLen(q)
	return true
}
// writerForIndex returns the UDP writer for queue i, or nil when i is
// out of range.
func (f *Interface) writerForIndex(i int) udp.Conn {
	if 0 <= i && i < len(f.writers) {
		return f.writers[i]
	}
	return nil
}
// writeImmediate sends buf to addr on queue q's UDP writer right away,
// bypassing the batch queues. Failures are logged, never returned.
func (f *Interface) writeImmediate(q int, buf []byte, addr netip.AddrPort, hostinfo *HostInfo) {
	w := f.writerForIndex(q)
	if w == nil {
		f.l.WithField("udpAddr", addr).
			WithField("writer", q).
			Error("Failed to write outgoing packet: no writer available")
		return
	}
	err := w.WriteTo(buf, addr)
	if err == nil {
		return
	}
	hostinfo.logger(f.l).
		WithError(err).
		WithField("udpAddr", addr).
		Error("Failed to write outgoing packet")
}
// tryQueuePacket attempts to hand pkt to the batch sender for queue q.
// It reports true when ownership of pkt transferred to the queue; on
// false the caller still owns (and must release) pkt.
func (f *Interface) tryQueuePacket(q int, pkt *overlay.Packet, addr netip.AddrPort) bool {
	if pkt == nil || !addr.IsValid() || !f.batches.Enabled() {
		return false
	}
	if !f.batches.enqueueTx(q, pkt, addr) {
		return false
	}
	f.observeUDPQueueLen(q)
	return true
}
// writeImmediatePacket sends pkt to addr directly on queue q's UDP
// writer, bypassing the batch queues, and always releases pkt before
// returning. Failures are logged, never returned.
func (f *Interface) writeImmediatePacket(q int, pkt *overlay.Packet, addr netip.AddrPort, hostinfo *HostInfo) {
	if pkt == nil {
		return
	}
	defer pkt.Release()
	w := f.writerForIndex(q)
	if w == nil {
		f.l.WithField("udpAddr", addr).
			WithField("writer", q).
			Error("Failed to write outgoing packet: no writer available")
		return
	}
	if err := w.WriteTo(pkt.Payload()[:pkt.Len], addr); err != nil {
		hostinfo.logger(f.l).
			WithError(err).
			WithField("udpAddr", addr).
			Error("Failed to write outgoing packet")
	}
}
// writePacketToTun writes a single decrypted packet to the tun reader for
// queue q, preferring the batch-write interface when the device supports
// it. Used as the per-packet fallback when a WriteBatch of many packets
// fails (see runTunWriteQueue).
func (f *Interface) writePacketToTun(q int, pkt *overlay.Packet) {
	if pkt == nil {
		return
	}
	writer := f.readers[q]
	if writer == nil {
		pkt.Release()
		return
	}
	if bw, ok := writer.(interface {
		WriteBatch([]*overlay.Packet) (int, error)
	}); ok {
		if _, err := bw.WriteBatch([]*overlay.Packet{pkt}); err != nil {
			f.l.WithError(err).WithField("queue", q).Warn("Failed to write tun packet via batch writer")
			pkt.Release()
		}
		// NOTE(review): on WriteBatch success pkt is NOT released here —
		// presumably WriteBatch takes ownership of the buffers it is
		// handed; confirm against the overlay batch-writer contract.
		return
	}
	if _, err := writer.Write(pkt.Payload()[:pkt.Len]); err != nil {
		f.l.WithError(err).Error("Failed to write to tun")
	}
	// Plain Write copies the payload, so the buffer is always returned
	// to the pool on this path, success or failure.
	pkt.Release()
}
// clonePacketWithHeadroom returns a packet carrying the same payload as
// pkt but with at least `required` bytes of headroom before it. On any
// path that returns a new packet, the original pkt is released; callers
// must use only the returned value.
func (f *Interface) clonePacketWithHeadroom(pkt *overlay.Packet, required int) *overlay.Packet {
	if pkt == nil {
		return nil
	}
	payload := pkt.Payload()[:pkt.Len]
	if len(payload) == 0 && required <= 0 {
		// Nothing to copy and no headroom demanded: reuse pkt as-is.
		return pkt
	}
	// Prefer a pooled buffer to avoid an allocation.
	pool := f.batches.Pool()
	if pool != nil {
		if clone := pool.Get(); clone != nil {
			if len(clone.Payload()) >= len(payload) {
				// NOTE(review): this path does not check clone.Offset
				// against `required` — it assumes pool packets are
				// allocated with the full standard headroom; confirm
				// against the pool's packet layout.
				clone.Len = copy(clone.Payload(), payload)
				pkt.Release()
				return clone
			}
			clone.Release()
		}
	}
	// Fall back to a fresh allocation sized for payload plus headroom.
	if required < 0 {
		required = 0
	}
	buf := make([]byte, required+len(payload))
	n := copy(buf[required:], payload)
	pkt.Release()
	return &overlay.Packet{
		Buf:    buf,
		Offset: required,
		Len:    n,
	}
}
// observeUDPQueueLen records the current depth of UDP tx queue i on the
// batch gauge, when metrics are enabled.
func (f *Interface) observeUDPQueueLen(i int) {
	g := f.batchUDPQueueGauge
	if g == nil {
		return
	}
	g.Update(int64(f.batches.txQueueLen(i)))
}
// observeTunQueueLen records the current depth of tun write queue i on
// the batch gauge, when metrics are enabled.
func (f *Interface) observeTunQueueLen(i int) {
	g := f.batchTunQueueGauge
	if g == nil {
		return
	}
	g.Update(int64(f.batches.tunQueueLen(i)))
}
// currentBatchFlushInterval reports the configured flush coalescing
// delay; a zero or negative stored value means "flush immediately".
func (f *Interface) currentBatchFlushInterval() time.Duration {
	stored := f.batchFlushInterval.Load()
	if stored <= 0 {
		return 0
	}
	return time.Duration(stored)
}
// ensurePacketHeadroom guarantees that *pkt carries at least `required`
// bytes of headroom before its payload, cloning it into a roomier buffer
// when necessary (in which case *pkt is replaced). It returns false when
// the packet is nil or no suitable clone could be produced; the drop is
// logged with the queue and reason.
func (f *Interface) ensurePacketHeadroom(pkt **overlay.Packet, required int, queue int, reason string) bool {
	cur := *pkt
	if cur == nil {
		return false
	}
	if required <= 0 || cur.Offset >= required {
		// Already has enough room in front of the payload.
		return true
	}
	replacement := f.clonePacketWithHeadroom(cur, required)
	if replacement == nil {
		f.l.WithFields(logrus.Fields{
			"queue":  queue,
			"reason": reason,
		}).Warn("dropping packet lacking tun headroom")
		return false
	}
	*pkt = replacement
	return true
}
func isVirtioHeadroomError(err error) bool {
if err == nil {
return false
}
msg := err.Error()
return strings.Contains(msg, "headroom") || strings.Contains(msg, "virtio")
}
// effectiveGSOMaxSegments returns the number of segments a single GSO batch
// may carry: 1 when GSO is disabled, otherwise the configured value defaulted
// and clamped to the kernel maximum.
func (f *Interface) effectiveGSOMaxSegments() int {
	if !f.enableGSO {
		return 1
	}
	// Renamed from `max` to avoid shadowing the Go 1.21 builtin.
	segs := f.gsoMaxSegments
	if segs <= 0 {
		segs = defaultGSOMaxSegments
	}
	if segs > maxKernelGSOSegments {
		segs = maxKernelGSOSegments
	}
	return segs
}
// udpOffloadConfigurator is implemented by UDP writers that can have their
// GSO/GRO offload settings changed at runtime (see applyOffloadConfig).
type udpOffloadConfigurator interface {
	ConfigureOffload(enableGSO, enableGRO bool, maxSegments int)
}
// applyOffloadConfig stores the GSO/GRO settings on the interface and fans
// them out to every UDP writer that supports runtime reconfiguration.
// maxSegments is defaulted when non-positive and clamped to the kernel limit.
func (f *Interface) applyOffloadConfig(enableGSO, enableGRO bool, maxSegments int) {
	segs := maxSegments
	if segs <= 0 {
		segs = defaultGSOMaxSegments
	}
	if segs > maxKernelGSOSegments {
		segs = maxKernelGSOSegments
	}
	f.enableGSO = enableGSO
	f.enableGRO = enableGRO
	f.gsoMaxSegments = segs
	for _, w := range f.writers {
		cfg, ok := w.(udpOffloadConfigurator)
		if !ok {
			continue
		}
		cfg.ConfigureOffload(enableGSO, enableGRO, segs)
	}
}
@@ -459,6 +1062,42 @@ func (f *Interface) reloadMisc(c *config.C) {
f.reQueryWait.Store(int64(n))
f.l.Info("timers.requery_wait_duration has changed")
}
if c.HasChanged("listen.gso_flush_timeout") {
d := c.GetDuration("listen.gso_flush_timeout", defaultGSOFlushInterval)
if d < 0 {
d = 0
}
f.batchFlushInterval.Store(int64(d))
f.l.WithField("duration", d).Info("listen.gso_flush_timeout has changed")
} else if c.HasChanged("batch.flush_interval") {
d := c.GetDuration("batch.flush_interval", defaultGSOFlushInterval)
if d < 0 {
d = 0
}
f.batchFlushInterval.Store(int64(d))
f.l.WithField("duration", d).Warn("batch.flush_interval is deprecated; use listen.gso_flush_timeout")
}
if c.HasChanged("batch.queue_depth") {
n := c.GetInt("batch.queue_depth", f.batchQueueDepth)
if n != f.batchQueueDepth {
f.batchQueueDepth = n
f.l.Warn("batch.queue_depth changes require a restart to take effect")
}
}
if c.HasChanged("listen.enable_gso") || c.HasChanged("listen.enable_gro") || c.HasChanged("listen.gso_max_segments") {
enableGSO := c.GetBool("listen.enable_gso", f.enableGSO)
enableGRO := c.GetBool("listen.enable_gro", f.enableGRO)
maxSeg := c.GetInt("listen.gso_max_segments", f.gsoMaxSegments)
f.applyOffloadConfig(enableGSO, enableGRO, maxSeg)
f.l.WithFields(logrus.Fields{
"enableGSO": enableGSO,
"enableGRO": enableGRO,
"gsoMaxSegments": maxSeg,
}).Info("listen GSO/GRO configuration updated")
}
}
func (f *Interface) emitStats(ctx context.Context, i time.Duration) {
@@ -491,11 +1130,6 @@ func (f *Interface) emitStats(ctx context.Context, i time.Duration) {
} else {
certMaxVersion.Update(int64(certState.v1Cert.Version()))
}
if f.l.Level != logrus.DebugLevel {
f.listenInMetric.Update(int64(f.listenInN))
f.listenOutMetric.Update(int64(f.listenOutN))
}
}
}
}

View File

@@ -360,8 +360,7 @@ func (lh *LightHouse) parseLighthouses(c *config.C) ([]netip.Addr, error) {
}
if !lh.myVpnNetworksTable.Contains(addr) {
lh.l.WithFields(m{"vpnAddr": addr, "networks": lh.myVpnNetworks}).
Warn("lighthouse host is not within our networks, lighthouse functionality will work but layer 3 network traffic to the lighthouse will not")
return nil, util.NewContextualError("lighthouse host is not in our networks, invalid", m{"vpnAddr": addr, "networks": lh.myVpnNetworks}, nil)
}
out[i] = addr
}
@@ -432,8 +431,7 @@ func (lh *LightHouse) loadStaticMap(c *config.C, staticList map[netip.Addr]struc
}
if !lh.myVpnNetworksTable.Contains(vpnAddr) {
lh.l.WithFields(m{"vpnAddr": vpnAddr, "networks": lh.myVpnNetworks, "entry": i + 1}).
Warn("static_host_map key is not within our networks, layer 3 network traffic to this host will not work")
return util.NewContextualError("static_host_map key is not in our network, invalid", m{"vpnAddr": vpnAddr, "networks": lh.myVpnNetworks, "entry": i + 1}, nil)
}
vals, ok := v.([]any)
@@ -1339,19 +1337,12 @@ func (lhh *LightHouseHandler) handleHostPunchNotification(n *NebulaMeta, fromVpn
}
}
remoteAllowList := lhh.lh.GetRemoteAllowList()
for _, a := range n.Details.V4AddrPorts {
b := protoV4AddrPortToNetAddrPort(a)
if remoteAllowList.Allow(detailsVpnAddr, b.Addr()) {
punch(b, detailsVpnAddr)
}
punch(protoV4AddrPortToNetAddrPort(a), detailsVpnAddr)
}
for _, a := range n.Details.V6AddrPorts {
b := protoV6AddrPortToNetAddrPort(a)
if remoteAllowList.Allow(detailsVpnAddr, b.Addr()) {
punch(b, detailsVpnAddr)
}
punch(protoV6AddrPortToNetAddrPort(a), detailsVpnAddr)
}
// This sends a nebula test packet to the host trying to contact us. In the case

View File

@@ -14,7 +14,7 @@ import (
"github.com/slackhq/nebula/test"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.yaml.in/yaml/v3"
"gopkg.in/yaml.v3"
)
func TestOldIPv4Only(t *testing.T) {

43
main.go
View File

@@ -5,6 +5,7 @@ import (
"fmt"
"net"
"net/netip"
"runtime"
"time"
"github.com/sirupsen/logrus"
@@ -13,7 +14,7 @@ import (
"github.com/slackhq/nebula/sshd"
"github.com/slackhq/nebula/udp"
"github.com/slackhq/nebula/util"
"go.yaml.in/yaml/v3"
"gopkg.in/yaml.v3"
)
type m = map[string]any
@@ -75,8 +76,7 @@ func Main(c *config.C, configTest bool, buildVersion string, logger *logrus.Logg
if c.GetBool("sshd.enabled", false) {
sshStart, err = configSSH(l, ssh, c)
if err != nil {
l.WithError(err).Warn("Failed to configure sshd, ssh debugging will not be available")
sshStart = nil
return nil, util.ContextualizeIfNeeded("Error while configuring the sshd", err)
}
}
@@ -144,6 +144,20 @@ func Main(c *config.C, configTest bool, buildVersion string, logger *logrus.Logg
// set up our UDP listener
udpConns := make([]udp.Conn, routines)
port := c.GetInt("listen.port", 0)
enableGSO := c.GetBool("listen.enable_gso", true)
enableGRO := c.GetBool("listen.enable_gro", true)
gsoMaxSegments := c.GetInt("listen.gso_max_segments", defaultGSOMaxSegments)
if gsoMaxSegments <= 0 {
gsoMaxSegments = defaultGSOMaxSegments
}
if gsoMaxSegments > maxKernelGSOSegments {
gsoMaxSegments = maxKernelGSOSegments
}
gsoFlushTimeout := c.GetDuration("listen.gso_flush_timeout", defaultGSOFlushInterval)
if gsoFlushTimeout < 0 {
gsoFlushTimeout = 0
}
batchQueueDepth := c.GetInt("batch.queue_depth", 0)
if !configTest {
rawListenHost := c.GetString("listen.host", "0.0.0.0")
@@ -163,13 +177,28 @@ func Main(c *config.C, configTest bool, buildVersion string, logger *logrus.Logg
listenHost = ips[0].Unmap()
}
useWGDefault := runtime.GOOS == "linux"
useWG := c.GetBool("listen.use_wireguard_stack", useWGDefault)
var mkListener func(*logrus.Logger, netip.Addr, int, bool, int, int) (udp.Conn, error)
if useWG {
mkListener = udp.NewWireguardListener
} else {
mkListener = udp.NewListener
}
for i := 0; i < routines; i++ {
l.Infof("listening on %v", netip.AddrPortFrom(listenHost, uint16(port)))
udpServer, err := udp.NewListener(l, listenHost, port, routines > 1, c.GetInt("listen.batch", 64))
udpServer, err := mkListener(l, listenHost, port, routines > 1, c.GetInt("listen.batch", 64), i)
if err != nil {
return nil, util.NewContextualError("Failed to open udp listener", m{"queue": i}, err)
}
//todo set bpf on zeroth socket
udpServer.ReloadConfig(c)
if cfg, ok := udpServer.(interface {
ConfigureOffload(bool, bool, int)
}); ok {
cfg.ConfigureOffload(enableGSO, enableGRO, gsoMaxSegments)
}
udpConns[i] = udpServer
// If port is dynamic, discover it before the next pass through the for loop
@@ -237,12 +266,17 @@ func Main(c *config.C, configTest bool, buildVersion string, logger *logrus.Logg
reQueryWait: c.GetDuration("timers.requery_wait_duration", defaultReQueryWait),
DropLocalBroadcast: c.GetBool("tun.drop_local_broadcast", false),
DropMulticast: c.GetBool("tun.drop_multicast", false),
EnableGSO: enableGSO,
EnableGRO: enableGRO,
GSOMaxSegments: gsoMaxSegments,
routines: routines,
MessageMetrics: messageMetrics,
version: buildVersion,
relayManager: NewRelayManager(ctx, l, hostMap, c),
punchy: punchy,
ConntrackCacheTimeout: conntrackCacheTimeout,
BatchFlushInterval: gsoFlushTimeout,
BatchQueueDepth: batchQueueDepth,
l: l,
}
@@ -254,6 +288,7 @@ func Main(c *config.C, configTest bool, buildVersion string, logger *logrus.Logg
}
ifce.writers = udpConns
ifce.applyOffloadConfig(enableGSO, enableGRO, gsoMaxSegments)
lightHouse.ifce = ifce
ifce.RegisterConfigChangeCallbacks(c)

View File

@@ -7,12 +7,12 @@ import (
"time"
"github.com/google/gopacket/layers"
"github.com/slackhq/nebula/packet"
"golang.org/x/net/ipv6"
"github.com/sirupsen/logrus"
"github.com/slackhq/nebula/firewall"
"github.com/slackhq/nebula/header"
"github.com/slackhq/nebula/overlay"
"golang.org/x/net/ipv4"
)
@@ -20,7 +20,7 @@ const (
minFwPacketLen = 4
)
func (f *Interface) readOutsidePackets(ip netip.AddrPort, via *ViaSender, out []byte, packet []byte, h *header.H, fwPacket *firewall.Packet, lhf *LightHouseHandler, nb []byte, q int, localCache firewall.ConntrackCache, now time.Time) {
func (f *Interface) readOutsidePackets(ip netip.AddrPort, via *ViaSender, out []byte, packet []byte, h *header.H, fwPacket *firewall.Packet, lhf *LightHouseHandler, nb []byte, q int, localCache *firewall.ConntrackCache) {
err := h.Parse(packet)
if err != nil {
// Hole punch packets are 0 or 1 byte big, so lets ignore printing those errors
@@ -62,7 +62,7 @@ func (f *Interface) readOutsidePackets(ip netip.AddrPort, via *ViaSender, out []
switch h.Subtype {
case header.MessageNone:
if !f.decryptToTun(hostinfo, h.MessageCounter, out, packet, fwPacket, nb, q, localCache, now) {
if !f.decryptToTun(hostinfo, h.MessageCounter, out, packet, fwPacket, nb, q, localCache, ip, h.RemoteIndex) {
return
}
case header.MessageRelay:
@@ -97,7 +97,7 @@ func (f *Interface) readOutsidePackets(ip netip.AddrPort, via *ViaSender, out []
case TerminalType:
// If I am the target of this relay, process the unwrapped packet
// From this recursive point, all these variables are 'burned'. We shouldn't rely on them again.
f.readOutsidePackets(netip.AddrPort{}, &ViaSender{relayHI: hostinfo, remoteIdx: relay.RemoteIndex, relay: relay}, out[:0], signedPayload, h, fwPacket, lhf, nb, q, localCache, now)
f.readOutsidePackets(netip.AddrPort{}, &ViaSender{relayHI: hostinfo, remoteIdx: relay.RemoteIndex, relay: relay}, out[:0], signedPayload, h, fwPacket, lhf, nb, q, localCache)
return
case ForwardingType:
// Find the target HostInfo relay object
@@ -217,217 +217,6 @@ func (f *Interface) readOutsidePackets(ip netip.AddrPort, via *ViaSender, out []
f.connectionManager.In(hostinfo)
}
func (f *Interface) readOutsidePacketsMany(packets []*packet.Packet, out []*packet.OutPacket, h *header.H, fwPacket *firewall.Packet, lhf *LightHouseHandler, nb []byte, q int, localCache firewall.ConntrackCache, now time.Time) {
for i, pkt := range packets {
out[i].Scratch = out[i].Scratch[:0]
ip := pkt.AddrPort()
//l.Error("in packet ", header, packet[HeaderLen:])
if ip.IsValid() {
if f.myVpnNetworksTable.Contains(ip.Addr()) {
if f.l.Level >= logrus.DebugLevel {
f.l.WithField("udpAddr", ip).Debug("Refusing to process double encrypted packet")
}
return
}
}
//todo per-segment!
for segment := range pkt.Segments() {
err := h.Parse(segment)
if err != nil {
// Hole punch packets are 0 or 1 byte big, so lets ignore printing those errors
if len(segment) > 1 {
f.l.WithField("packet", pkt).Infof("Error while parsing inbound packet from %s: %s", ip, err)
}
return
}
var hostinfo *HostInfo
// verify if we've seen this index before, otherwise respond to the handshake initiation
if h.Type == header.Message && h.Subtype == header.MessageRelay {
hostinfo = f.hostMap.QueryRelayIndex(h.RemoteIndex)
} else {
hostinfo = f.hostMap.QueryIndex(h.RemoteIndex)
}
var ci *ConnectionState
if hostinfo != nil {
ci = hostinfo.ConnectionState
}
switch h.Type {
case header.Message:
// TODO handleEncrypted sends directly to addr on error. Handle this in the tunneling case.
if !f.handleEncrypted(ci, ip, h) {
return
}
switch h.Subtype {
case header.MessageNone:
if !f.decryptToTunDelayWrite(hostinfo, h.MessageCounter, out[i], pkt, segment, fwPacket, nb, q, localCache, now) {
return
}
case header.MessageRelay:
// The entire body is sent as AD, not encrypted.
// The packet consists of a 16-byte parsed Nebula header, Associated Data-protected payload, and a trailing 16-byte AEAD signature value.
// The packet is guaranteed to be at least 16 bytes at this point, b/c it got past the h.Parse() call above. If it's
// otherwise malformed (meaning, there is no trailing 16 byte AEAD value), then this will result in at worst a 0-length slice
// which will gracefully fail in the DecryptDanger call.
signedPayload := segment[:len(segment)-hostinfo.ConnectionState.dKey.Overhead()]
signatureValue := segment[len(segment)-hostinfo.ConnectionState.dKey.Overhead():]
out[i].Scratch, err = hostinfo.ConnectionState.dKey.DecryptDanger(out[i].Scratch, signedPayload, signatureValue, h.MessageCounter, nb)
if err != nil {
return
}
// Successfully validated the thing. Get rid of the Relay header.
signedPayload = signedPayload[header.Len:]
// Pull the Roaming parts up here, and return in all call paths.
f.handleHostRoaming(hostinfo, ip)
// Track usage of both the HostInfo and the Relay for the received & authenticated packet
f.connectionManager.In(hostinfo)
f.connectionManager.RelayUsed(h.RemoteIndex)
relay, ok := hostinfo.relayState.QueryRelayForByIdx(h.RemoteIndex)
if !ok {
// The only way this happens is if hostmap has an index to the correct HostInfo, but the HostInfo is missing
// its internal mapping. This should never happen.
hostinfo.logger(f.l).WithFields(logrus.Fields{"vpnAddrs": hostinfo.vpnAddrs, "remoteIndex": h.RemoteIndex}).Error("HostInfo missing remote relay index")
return
}
switch relay.Type {
case TerminalType:
// If I am the target of this relay, process the unwrapped packet
// From this recursive point, all these variables are 'burned'. We shouldn't rely on them again.
f.readOutsidePackets(netip.AddrPort{}, &ViaSender{relayHI: hostinfo, remoteIdx: relay.RemoteIndex, relay: relay}, out[i].Scratch[:0], signedPayload, h, fwPacket, lhf, nb, q, localCache, now)
return
case ForwardingType:
// Find the target HostInfo relay object
targetHI, targetRelay, err := f.hostMap.QueryVpnAddrsRelayFor(hostinfo.vpnAddrs, relay.PeerAddr)
if err != nil {
hostinfo.logger(f.l).WithField("relayTo", relay.PeerAddr).WithError(err).WithField("hostinfo.vpnAddrs", hostinfo.vpnAddrs).Info("Failed to find target host info by ip")
return
}
// If that relay is Established, forward the payload through it
if targetRelay.State == Established {
switch targetRelay.Type {
case ForwardingType:
// Forward this packet through the relay tunnel
// Find the target HostInfo
f.SendVia(targetHI, targetRelay, signedPayload, nb, out[i].Scratch, false)
return
case TerminalType:
hostinfo.logger(f.l).Error("Unexpected Relay Type of Terminal")
}
} else {
hostinfo.logger(f.l).WithFields(logrus.Fields{"relayTo": relay.PeerAddr, "relayFrom": hostinfo.vpnAddrs[0], "targetRelayState": targetRelay.State}).Info("Unexpected target relay state")
return
}
}
}
case header.LightHouse:
f.messageMetrics.Rx(h.Type, h.Subtype, 1)
if !f.handleEncrypted(ci, ip, h) {
return
}
d, err := f.decrypt(hostinfo, h.MessageCounter, out[i].Scratch, segment, h, nb)
if err != nil {
hostinfo.logger(f.l).WithError(err).WithField("udpAddr", ip).
WithField("packet", segment).
Error("Failed to decrypt lighthouse packet")
return
}
lhf.HandleRequest(ip, hostinfo.vpnAddrs, d, f)
// Fallthrough to the bottom to record incoming traffic
case header.Test:
f.messageMetrics.Rx(h.Type, h.Subtype, 1)
if !f.handleEncrypted(ci, ip, h) {
return
}
d, err := f.decrypt(hostinfo, h.MessageCounter, out[i].Scratch, segment, h, nb)
if err != nil {
hostinfo.logger(f.l).WithError(err).WithField("udpAddr", ip).
WithField("packet", segment).
Error("Failed to decrypt test packet")
return
}
if h.Subtype == header.TestRequest {
// This testRequest might be from TryPromoteBest, so we should roam
// to the new IP address before responding
f.handleHostRoaming(hostinfo, ip)
f.send(header.Test, header.TestReply, ci, hostinfo, d, nb, out[i].Scratch)
}
// Fallthrough to the bottom to record incoming traffic
// Non encrypted messages below here, they should not fall through to avoid tracking incoming traffic since they
// are unauthenticated
case header.Handshake:
f.messageMetrics.Rx(h.Type, h.Subtype, 1)
f.handshakeManager.HandleIncoming(ip, nil, segment, h)
return
case header.RecvError:
f.messageMetrics.Rx(h.Type, h.Subtype, 1)
f.handleRecvError(ip, h)
return
case header.CloseTunnel:
f.messageMetrics.Rx(h.Type, h.Subtype, 1)
if !f.handleEncrypted(ci, ip, h) {
return
}
hostinfo.logger(f.l).WithField("udpAddr", ip).
Info("Close tunnel received, tearing down.")
f.closeTunnel(hostinfo)
return
case header.Control:
if !f.handleEncrypted(ci, ip, h) {
return
}
d, err := f.decrypt(hostinfo, h.MessageCounter, out[i].Scratch, segment, h, nb)
if err != nil {
hostinfo.logger(f.l).WithError(err).WithField("udpAddr", ip).
WithField("packet", segment).
Error("Failed to decrypt Control packet")
return
}
f.relayManager.HandleControlMsg(hostinfo, d, f)
default:
f.messageMetrics.Rx(h.Type, h.Subtype, 1)
hostinfo.logger(f.l).Debugf("Unexpected packet received from %s", ip)
return
}
f.handleHostRoaming(hostinfo, ip)
f.connectionManager.In(hostinfo)
}
_, err := f.readers[q].WriteOne(out[i], false, q)
if err != nil {
f.l.WithError(err).Error("Failed to write packet")
}
}
}
// closeTunnel closes a tunnel locally, it does not send a closeTunnel packet to the remote
func (f *Interface) closeTunnel(hostInfo *HostInfo) {
final := f.hostMap.DeleteHostInfo(hostInfo)
@@ -677,78 +466,55 @@ func (f *Interface) decrypt(hostinfo *HostInfo, mc uint64, out []byte, packet []
return out, nil
}
func (f *Interface) decryptToTunDelayWrite(hostinfo *HostInfo, messageCounter uint64, out *packet.OutPacket, pkt *packet.Packet, inSegment []byte, fwPacket *firewall.Packet, nb []byte, q int, localCache firewall.ConntrackCache, now time.Time) bool {
var err error
func (f *Interface) decryptToTun(hostinfo *HostInfo, messageCounter uint64, out []byte, packet []byte, fwPacket *firewall.Packet, nb []byte, q int, localCache *firewall.ConntrackCache, addr netip.AddrPort, recvIndex uint32) bool {
var (
err error
pkt *overlay.Packet
)
seg, err := f.readers[q].AllocSeg(out, q)
if err != nil {
f.l.WithError(err).Errorln("decryptToTunDelayWrite: failed to allocate segment")
return false
}
out.SegmentPayloads[seg] = out.SegmentPayloads[seg][:0]
out.SegmentPayloads[seg], err = hostinfo.ConnectionState.dKey.DecryptDanger(out.SegmentPayloads[seg], inSegment[:header.Len], inSegment[header.Len:], messageCounter, nb)
if err != nil {
hostinfo.logger(f.l).WithError(err).Error("Failed to decrypt packet")
return false
}
err = newPacket(out.SegmentPayloads[seg], true, fwPacket)
if err != nil {
hostinfo.logger(f.l).WithError(err).WithField("packet", out).
Warnf("Error while validating inbound packet")
return false
}
if !hostinfo.ConnectionState.window.Update(f.l, messageCounter) {
hostinfo.logger(f.l).WithField("fwPacket", fwPacket).
Debugln("dropping out of window packet")
return false
}
dropReason := f.firewall.Drop(*fwPacket, true, hostinfo, f.pki.GetCAPool(), localCache, now)
if dropReason != nil {
// NOTE: We give `packet` as the `out` here since we already decrypted from it and we don't need it anymore
// This gives us a buffer to build the reject packet in
f.rejectOutside(out.SegmentPayloads[seg], hostinfo.ConnectionState, hostinfo, nb, inSegment, q)
if f.l.Level >= logrus.DebugLevel {
hostinfo.logger(f.l).WithField("fwPacket", fwPacket).
WithField("reason", dropReason).
Debugln("dropping inbound packet")
if f.batches.tunQueue(q) != nil {
pkt = f.batches.newPacket()
if pkt != nil {
out = pkt.Payload()[:0]
}
return false
}
f.connectionManager.In(hostinfo)
pkt.OutLen += len(inSegment)
out.Segments[seg] = out.Segments[seg][:len(out.SegmentHeaders[seg])+len(out.SegmentPayloads[seg])]
return true
}
func (f *Interface) decryptToTun(hostinfo *HostInfo, messageCounter uint64, out []byte, packet []byte, fwPacket *firewall.Packet, nb []byte, q int, localCache firewall.ConntrackCache, now time.Time) bool {
var err error
out, err = hostinfo.ConnectionState.dKey.DecryptDanger(out, packet[:header.Len], packet[header.Len:], messageCounter, nb)
if err != nil {
if pkt != nil {
pkt.Release()
}
hostinfo.logger(f.l).WithError(err).Error("Failed to decrypt packet")
if addr.IsValid() {
f.maybeSendRecvError(addr, recvIndex)
}
return false
}
err = newPacket(out, true, fwPacket)
if err != nil {
if pkt != nil {
pkt.Release()
}
hostinfo.logger(f.l).WithError(err).WithField("packet", out).
Warnf("Error while validating inbound packet")
return false
}
if !hostinfo.ConnectionState.window.Update(f.l, messageCounter) {
if pkt != nil {
pkt.Release()
}
hostinfo.logger(f.l).WithField("fwPacket", fwPacket).
Debugln("dropping out of window packet")
return false
}
dropReason := f.firewall.Drop(*fwPacket, true, hostinfo, f.pki.GetCAPool(), localCache, now)
dropReason := f.firewall.Drop(*fwPacket, true, hostinfo, f.pki.GetCAPool(), localCache)
if dropReason != nil {
if pkt != nil {
pkt.Release()
}
// NOTE: We give `packet` as the `out` here since we already decrypted from it and we don't need it anymore
// This gives us a buffer to build the reject packet in
f.rejectOutside(out, hostinfo.ConnectionState, hostinfo, nb, packet, q)
@@ -761,8 +527,17 @@ func (f *Interface) decryptToTun(hostinfo *HostInfo, messageCounter uint64, out
}
f.connectionManager.In(hostinfo)
_, err = f.readers[q].Write(out)
if err != nil {
if pkt != nil {
pkt.Len = len(out)
if f.batches.enqueueTun(q, pkt) {
f.observeTunQueueLen(q)
return true
}
f.writePacketToTun(q, pkt)
return true
}
if _, err = f.readers[q].Write(out); err != nil {
f.l.WithError(err).Error("Failed to write to tun")
}
return true

View File

@@ -1,16 +1,99 @@
package overlay
import (
"io"
"net/netip"
"sync"
"github.com/slackhq/nebula/routing"
)
type Device interface {
TunDev
io.ReadWriteCloser
Activate() error
Networks() []netip.Prefix
Name() string
RoutesFor(netip.Addr) routing.Gateways
NewMultiQueueReader() (TunDev, error)
NewMultiQueueReader() (io.ReadWriteCloser, error)
}
// Packet represents a single packet buffer with optional headroom to carry
// metadata (for example virtio-net headers).
type Packet struct {
	Buf     []byte // backing storage, including any reserved headroom before the payload
	Offset  int    // index into Buf where the payload begins
	Len     int    // payload length in bytes, starting at Offset
	release func() // returns the packet to its pool; nil when the packet is not pooled
}
// Payload returns the in-use portion of the buffer: Len bytes starting at Offset.
func (p *Packet) Payload() []byte {
	return p.Buf[p.Offset : p.Offset+p.Len]
}
// Reset clears the packet's bookkeeping while keeping the backing buffer for
// reuse. Note that Offset is zeroed here; PacketPool.Get re-establishes the
// pool's headroom offset when the packet is handed out again.
func (p *Packet) Reset() {
	p.Len = 0
	p.Offset = 0
	p.release = nil
}
// Release returns the packet to its owning pool, if it has one. Subsequent
// calls are no-ops until the packet is handed out again.
func (p *Packet) Release() {
	rel := p.release
	if rel == nil {
		return
	}
	p.release = nil
	rel()
}
// Capacity reports how many payload bytes fit in the buffer after Offset.
func (p *Packet) Capacity() int {
	return len(p.Buf) - p.Offset
}
// PacketPool manages reusable buffers with headroom.
type PacketPool struct {
	headroom int       // bytes reserved before the payload in every packet
	blksz    int       // total buffer size: headroom + payload capacity
	pool     sync.Pool // recycled *Packet values
}
// NewPacketPool builds a pool whose packets reserve `headroom` bytes in front
// of a `payload`-byte payload region.
func NewPacketPool(headroom, payload int) *PacketPool {
	pool := &PacketPool{
		headroom: headroom,
		blksz:    headroom + payload,
	}
	pool.pool.New = func() any {
		return &Packet{
			Buf:    make([]byte, pool.blksz),
			Offset: headroom,
		}
	}
	return pool
}
// Get returns a packet with the pool's headroom applied and zero length.
// The release hook must be re-armed on every Get because Reset clears it
// when the packet goes back into the pool.
func (p *PacketPool) Get() *Packet {
	pkt := p.pool.Get().(*Packet)
	pkt.Offset = p.headroom
	pkt.Len = 0
	pkt.release = func() { p.put(pkt) }
	return pkt
}
// put resets a packet and returns it to the pool; called via Packet.Release.
func (p *PacketPool) put(pkt *Packet) {
	pkt.Reset()
	p.pool.Put(pkt)
}
// BatchReader allows reading multiple packets into a shared pool with
// preallocated headroom (e.g. virtio-net headers). Returned packets are
// owned by the caller, which is responsible for releasing them.
type BatchReader interface {
	ReadIntoBatch(pool *PacketPool) ([]*Packet, error)
}
// BatchWriter writes a slice of packets that carry their own metadata.
// It returns the number of packets written along with any error.
type BatchWriter interface {
	WriteBatch(packets []*Packet) (int, error)
}
// BatchCapableDevice describes a device that can efficiently read and write
// batches of packets with virtio headroom.
//
// NOTE(review): the accessor semantics below are inferred from their names —
// confirm against the implementations: BatchHeadroom (headroom bytes per
// packet), BatchPayloadCap (payload capacity per packet), BatchSize (packets
// per batch).
type BatchCapableDevice interface {
	Device
	BatchReader
	BatchWriter
	BatchHeadroom() int
	BatchPayloadCap() int
	BatchSize() int
}

View File

@@ -1,91 +0,0 @@
package eventfd
import (
"encoding/binary"
"syscall"
"golang.org/x/sys/unix"
)
// EventFD wraps a Linux eventfd descriptor used to signal another thread
// waiting in epoll.
type EventFD struct {
	fd  int     // eventfd descriptor; 0 means "not initialized"
	buf [8]byte // scratch buffer for the 8-byte counter written by Kick
}
// New creates a non-blocking eventfd with an initial counter of zero.
func New() (EventFD, error) {
	fd, err := unix.Eventfd(0, unix.EFD_NONBLOCK)
	if err != nil {
		return EventFD{}, err
	}
	// buf starts at its zero value; no explicit initialization needed.
	return EventFD{fd: fd}, nil
}
// Kick adds 1 to the eventfd counter, waking any waiter blocked on it.
func (e *EventFD) Kick() error {
	// eventfd(2) expects an 8-byte integer in host byte order; little-endian
	// matches the platforms this code targets. NOTE(review): revisit if
	// big-endian support is ever needed.
	binary.LittleEndian.PutUint64(e.buf[:], 1)
	_, err := syscall.Write(int(e.fd), e.buf[:])
	return err
}
// Close releases the eventfd descriptor. It is a no-op on a zero-value
// EventFD and is safe to call more than once.
func (e *EventFD) Close() error {
	if e.fd == 0 {
		return nil
	}
	err := unix.Close(e.fd)
	// Forget the descriptor so a second Close cannot release an fd number
	// the kernel may have already reused for something else.
	e.fd = 0
	return err
}
// FD returns the raw eventfd descriptor, e.g. for registration with epoll.
func (e *EventFD) FD() int {
	return e.fd
}
// Epoll wraps a Linux epoll instance used to block on eventfd activity.
type Epoll struct {
	fd     int                  // epoll descriptor; 0 means "not initialized"
	buf    [8]byte              // scratch buffer used by Clear to drain the eventfd counter
	events []syscall.EpollEvent // ready-event storage for EpollWait (capacity 1)
}
// NewEpoll creates an epoll instance sized to report a single ready event.
func NewEpoll() (Epoll, error) {
	fd, err := unix.EpollCreate1(0)
	if err != nil {
		return Epoll{}, err
	}
	ep := Epoll{
		fd:     fd,
		events: make([]syscall.EpollEvent, 1),
	}
	// buf starts at its zero value; no explicit initialization needed.
	return ep, nil
}
// AddEvent registers fdToAdd with the epoll instance for readability (EPOLLIN).
func (ep *Epoll) AddEvent(fdToAdd int) error {
	event := syscall.EpollEvent{
		Events: syscall.EPOLLIN,
		Fd:     int32(fdToAdd),
	}
	return syscall.EpollCtl(ep.fd, syscall.EPOLL_CTL_ADD, fdToAdd, &event)
}
// Block waits indefinitely for a registered fd to become ready and returns
// the number of ready events. An EINTR interruption is reported as (0, nil),
// so callers should treat a zero count as "retry", not as readiness.
func (ep *Epoll) Block() (int, error) {
	n, err := syscall.EpollWait(ep.fd, ep.events, -1)
	if err != nil {
		//goland:noinspection GoDirectComparisonOfErrors
		if err == syscall.EINTR {
			// Interrupted by a signal; not an error, but nothing is ready.
			return 0, nil
		}
		return -1, err
	}
	return n, nil
}
// Clear drains the counter of the fd reported in events[0], resetting the
// eventfd so it can signal again. NOTE(review): this is only meaningful
// immediately after Block returned a count >= 1; otherwise events[0] holds
// stale data — confirm callers respect that ordering.
func (ep *Epoll) Clear() error {
	_, err := syscall.Read(int(ep.events[0].Fd), ep.buf[:])
	return err
}
// Close releases the epoll descriptor. It is a no-op on a zero-value Epoll
// and is safe to call more than once.
func (ep *Epoll) Close() error {
	if ep.fd == 0 {
		return nil
	}
	err := unix.Close(ep.fd)
	// Forget the descriptor so a second Close cannot release an fd number
	// the kernel may have already reused for something else.
	ep.fd = 0
	return err
}

View File

@@ -2,29 +2,16 @@ package overlay
import (
"fmt"
"io"
"net"
"net/netip"
"github.com/sirupsen/logrus"
"github.com/slackhq/nebula/config"
"github.com/slackhq/nebula/packet"
"github.com/slackhq/nebula/util"
)
const DefaultMTU = 1300
type TunDev interface {
io.WriteCloser
ReadMany(x []*packet.VirtIOPacket, q int) (int, error)
//todo this interface sux
AllocSeg(pkt *packet.OutPacket, q int) (int, error)
WriteOne(x *packet.OutPacket, kick bool, q int) (int, error)
WriteMany(x []*packet.OutPacket, q int) (int, error)
RecycleRxSeg(pkt *packet.VirtIOPacket, kick bool, q int) error
}
// TODO: We may be able to remove routines
type DeviceFactory func(c *config.C, l *logrus.Logger, vpnNetworks []netip.Prefix, routines int) (Device, error)
@@ -39,11 +26,11 @@ func NewDeviceFromConfig(c *config.C, l *logrus.Logger, vpnNetworks []netip.Pref
}
}
//func NewFdDeviceFromConfig(fd *int) DeviceFactory {
// return func(c *config.C, l *logrus.Logger, vpnNetworks []netip.Prefix, routines int) (Device, error) {
// return newTunFromFd(c, l, *fd, vpnNetworks)
// }
//}
func NewFdDeviceFromConfig(fd *int) DeviceFactory {
return func(c *config.C, l *logrus.Logger, vpnNetworks []netip.Prefix, routines int) (Device, error) {
return newTunFromFd(c, l, *fd, vpnNetworks)
}
}
func getAllRoutesFromConfig(c *config.C, vpnNetworks []netip.Prefix, initial bool) (bool, []Route, error) {
if !initial && !c.HasChanged("tun.routes") && !c.HasChanged("tun.unsafe_routes") {

View File

@@ -9,8 +9,6 @@ import (
"github.com/rcrowley/go-metrics"
"github.com/sirupsen/logrus"
"github.com/slackhq/nebula/iputil"
"github.com/slackhq/nebula/overlay/virtqueue"
"github.com/slackhq/nebula/packet"
"github.com/slackhq/nebula/routing"
)
@@ -24,10 +22,6 @@ type disabledTun struct {
l *logrus.Logger
}
func (*disabledTun) RecycleRxSeg(pkt *packet.VirtIOPacket, kick bool, q int) error {
return nil
}
func newDisabledTun(vpnNetworks []netip.Prefix, queueLen int, metricsEnabled bool, l *logrus.Logger) *disabledTun {
tun := &disabledTun{
vpnNetworks: vpnNetworks,
@@ -46,10 +40,6 @@ func newDisabledTun(vpnNetworks []netip.Prefix, queueLen int, metricsEnabled boo
return tun
}
func (*disabledTun) GetQueues() []*virtqueue.SplitQueue {
return nil
}
func (*disabledTun) Activate() error {
return nil
}
@@ -115,23 +105,7 @@ func (t *disabledTun) Write(b []byte) (int, error) {
return len(b), nil
}
func (t *disabledTun) AllocSeg(pkt *packet.OutPacket, q int) (int, error) {
return 0, fmt.Errorf("tun_disabled: AllocSeg not implemented")
}
func (t *disabledTun) WriteOne(x *packet.OutPacket, kick bool, q int) (int, error) {
return 0, fmt.Errorf("tun_disabled: WriteOne not implemented")
}
func (t *disabledTun) WriteMany(x []*packet.OutPacket, q int) (int, error) {
return 0, fmt.Errorf("tun_disabled: WriteMany not implemented")
}
func (t *disabledTun) ReadMany(b []*packet.VirtIOPacket, _ int) (int, error) {
return t.Read(b[0].Payload)
}
func (t *disabledTun) NewMultiQueueReader() (TunDev, error) {
func (t *disabledTun) NewMultiQueueReader() (io.ReadWriteCloser, error) {
return t, nil
}

View File

@@ -5,9 +5,11 @@ package overlay
import (
"fmt"
"io"
"net"
"net/netip"
"os"
"runtime"
"strings"
"sync/atomic"
"time"
@@ -16,19 +18,16 @@ import (
"github.com/gaissmai/bart"
"github.com/sirupsen/logrus"
"github.com/slackhq/nebula/config"
"github.com/slackhq/nebula/overlay/vhostnet"
"github.com/slackhq/nebula/packet"
"github.com/slackhq/nebula/routing"
"github.com/slackhq/nebula/util"
"github.com/slackhq/nebula/util/virtio"
wgtun "github.com/slackhq/nebula/wgstack/tun"
"github.com/vishvananda/netlink"
"golang.org/x/sys/unix"
)
type tun struct {
file *os.File
io.ReadWriteCloser
fd int
vdev []*vhostnet.Device
Device string
vpnNetworks []netip.Prefix
MaxMTU int
@@ -36,6 +35,7 @@ type tun struct {
TXQueueLen int
deviceIndex int
ioctlFd uintptr
wgDevice wgtun.Device
Routes atomic.Pointer[[]Route]
routeTree atomic.Pointer[bart.Table[routing.Gateways]]
@@ -43,8 +43,7 @@ type tun struct {
useSystemRoutes bool
useSystemRoutesBufferSize int
isV6 bool
l *logrus.Logger
l *logrus.Logger
}
func (t *tun) Networks() []netip.Prefix {
@@ -72,7 +71,9 @@ type ifreqQLEN struct {
func newTunFromFd(c *config.C, l *logrus.Logger, deviceFd int, vpnNetworks []netip.Prefix) (*tun, error) {
file := os.NewFile(uintptr(deviceFd), "/dev/net/tun")
t, err := newTunGeneric(c, l, file, vpnNetworks)
useWGDefault := runtime.GOOS == "linux"
useWG := c.GetBool("tun.use_wireguard_stack", c.GetBool("listen.use_wireguard_stack", useWGDefault))
t, err := newTunGeneric(c, l, file, vpnNetworks, useWG)
if err != nil {
return nil, err
}
@@ -106,7 +107,7 @@ func newTun(c *config.C, l *logrus.Logger, vpnNetworks []netip.Prefix, multiqueu
}
var req ifReq
req.Flags = uint16(unix.IFF_TUN | unix.IFF_NO_PI | unix.IFF_TUN_EXCL | unix.IFF_VNET_HDR | unix.IFF_NAPI)
req.Flags = uint16(unix.IFF_TUN | unix.IFF_NO_PI)
if multiqueue {
req.Flags |= unix.IFF_MULTI_QUEUE
}
@@ -116,56 +117,57 @@ func newTun(c *config.C, l *logrus.Logger, vpnNetworks []netip.Prefix, multiqueu
}
name := strings.Trim(string(req.Name[:]), "\x00")
if err = unix.SetNonblock(fd, true); err != nil {
_ = unix.Close(fd)
return nil, fmt.Errorf("make file descriptor non-blocking: %w", err)
}
file := os.NewFile(uintptr(fd), "/dev/net/tun")
err = unix.IoctlSetPointerInt(fd, unix.TUNSETVNETHDRSZ, virtio.NetHdrSize)
if err != nil {
return nil, fmt.Errorf("set vnethdr size: %w", err)
}
flags := 0
//flags = //unix.TUN_F_CSUM //| unix.TUN_F_TSO4 | unix.TUN_F_USO4 | unix.TUN_F_TSO6 | unix.TUN_F_USO6
err = unix.IoctlSetInt(fd, unix.TUNSETOFFLOAD, flags)
if err != nil {
return nil, fmt.Errorf("set offloads: %w", err)
}
t, err := newTunGeneric(c, l, file, vpnNetworks)
useWGDefault := runtime.GOOS == "linux"
useWG := c.GetBool("tun.use_wireguard_stack", c.GetBool("listen.use_wireguard_stack", useWGDefault))
t, err := newTunGeneric(c, l, file, vpnNetworks, useWG)
if err != nil {
return nil, err
}
t.fd = fd
t.Device = name
vdev, err := vhostnet.NewDevice(
vhostnet.WithBackendFD(fd),
vhostnet.WithQueueSize(8192), //todo config
)
if err != nil {
return nil, err
}
t.vdev = []*vhostnet.Device{vdev}
return t, nil
}
func newTunGeneric(c *config.C, l *logrus.Logger, file *os.File, vpnNetworks []netip.Prefix) (*tun, error) {
func newTunGeneric(c *config.C, l *logrus.Logger, file *os.File, vpnNetworks []netip.Prefix, useWireguard bool) (*tun, error) {
var (
rw io.ReadWriteCloser = file
fd = int(file.Fd())
wgDev wgtun.Device
)
if useWireguard {
dev, err := wgtun.CreateTUNFromFile(file, c.GetInt("tun.mtu", DefaultMTU))
if err != nil {
return nil, fmt.Errorf("failed to initialize wireguard tun device: %w", err)
}
wgDev = dev
rw = newWireguardTunIO(dev, c.GetInt("tun.mtu", DefaultMTU))
fd = int(dev.File().Fd())
}
t := &tun{
file: file,
fd: int(file.Fd()),
ReadWriteCloser: rw,
fd: fd,
vpnNetworks: vpnNetworks,
TXQueueLen: c.GetInt("tun.tx_queue", 500),
useSystemRoutes: c.GetBool("tun.use_system_route_table", false),
useSystemRoutesBufferSize: c.GetInt("tun.use_system_route_table_buffer_size", 0),
l: l,
}
if len(vpnNetworks) != 0 {
t.isV6 = vpnNetworks[0].Addr().Is6() //todo what about multi-IP?
if wgDev != nil {
t.wgDevice = wgDev
}
if wgDev != nil {
// replace ioctl fd with device file descriptor to keep route management working
file = wgDev.File()
t.fd = int(file.Fd())
t.ioctlFd = file.Fd()
}
if t.ioctlFd == 0 {
t.ioctlFd = file.Fd()
}
err := t.reload(c, true)
@@ -250,7 +252,7 @@ func (t *tun) reload(c *config.C, initial bool) error {
return nil
}
func (t *tun) NewMultiQueueReader() (TunDev, error) {
func (t *tun) NewMultiQueueReader() (io.ReadWriteCloser, error) {
fd, err := unix.Open("/dev/net/tun", os.O_RDWR, 0)
if err != nil {
return nil, err
@@ -263,17 +265,9 @@ func (t *tun) NewMultiQueueReader() (TunDev, error) {
return nil, err
}
vdev, err := vhostnet.NewDevice(
vhostnet.WithBackendFD(fd),
vhostnet.WithQueueSize(8192), //todo config
)
if err != nil {
return nil, err
}
file := os.NewFile(uintptr(fd), "/dev/net/tun")
t.vdev = append(t.vdev, vdev)
return t, nil
return file, nil
}
func (t *tun) RoutesFor(ip netip.Addr) routing.Gateways {
@@ -281,6 +275,29 @@ func (t *tun) RoutesFor(ip netip.Addr) routing.Gateways {
return r
}
func (t *tun) Write(b []byte) (int, error) {
var nn int
maximum := len(b)
for {
n, err := unix.Write(t.fd, b[nn:maximum])
if n > 0 {
nn += n
}
if nn == len(b) {
return nn, err
}
if err != nil {
return nn, err
}
if n == 0 {
return nn, io.ErrUnexpectedEOF
}
}
}
func (t *tun) deviceBytes() (o [16]byte) {
for i, c := range t.Device {
o[i] = byte(c)
@@ -693,14 +710,16 @@ func (t *tun) Close() error {
close(t.routeChan)
}
for _, v := range t.vdev {
if v != nil {
_ = v.Close()
}
if t.ReadWriteCloser != nil {
_ = t.ReadWriteCloser.Close()
}
if t.file != nil {
_ = t.file.Close()
if t.wgDevice != nil {
_ = t.wgDevice.Close()
if t.ioctlFd > 0 {
// underlying fd already closed by the device
t.ioctlFd = 0
}
}
if t.ioctlFd > 0 {
@@ -709,65 +728,3 @@ func (t *tun) Close() error {
return nil
}
func (t *tun) ReadMany(p []*packet.VirtIOPacket, q int) (int, error) {
n, err := t.vdev[q].ReceivePackets(p) //we are TXing
if err != nil {
return 0, err
}
return n, nil
}
func (t *tun) Write(b []byte) (int, error) {
maximum := len(b) //we are RXing
//todo garbagey
out := packet.NewOut()
x, err := t.AllocSeg(out, 0)
if err != nil {
return 0, err
}
copy(out.SegmentPayloads[x], b)
err = t.vdev[0].TransmitPacket(out, true)
if err != nil {
t.l.WithError(err).Error("Transmitting packet")
return 0, err
}
return maximum, nil
}
func (t *tun) AllocSeg(pkt *packet.OutPacket, q int) (int, error) {
idx, buf, err := t.vdev[q].GetPacketForTx()
if err != nil {
return 0, err
}
x := pkt.UseSegment(idx, buf, t.isV6)
return x, nil
}
func (t *tun) WriteOne(x *packet.OutPacket, kick bool, q int) (int, error) {
if err := t.vdev[q].TransmitPacket(x, kick); err != nil {
t.l.WithError(err).Error("Transmitting packet")
return 0, err
}
return 1, nil
}
func (t *tun) WriteMany(x []*packet.OutPacket, q int) (int, error) {
maximum := len(x) //we are RXing
if maximum == 0 {
return 0, nil
}
err := t.vdev[q].TransmitPackets(x)
if err != nil {
t.l.WithError(err).Error("Transmitting packet")
return 0, err
}
return maximum, nil
}
func (t *tun) RecycleRxSeg(pkt *packet.VirtIOPacket, kick bool, q int) error {
return t.vdev[q].ReceiveQueue.OfferDescriptorChains(pkt.Chains, kick)
}

View File

@@ -0,0 +1,56 @@
//go:build linux && !android && !e2e_testing
package overlay
import "fmt"
func (t *tun) batchIO() (*wireguardTunIO, bool) {
io, ok := t.ReadWriteCloser.(*wireguardTunIO)
return io, ok
}
func (t *tun) ReadIntoBatch(pool *PacketPool) ([]*Packet, error) {
io, ok := t.batchIO()
if !ok {
return nil, fmt.Errorf("wireguard batch I/O not enabled")
}
return io.ReadIntoBatch(pool)
}
func (t *tun) WriteBatch(packets []*Packet) (int, error) {
io, ok := t.batchIO()
if ok {
return io.WriteBatch(packets)
}
for _, pkt := range packets {
if pkt == nil {
continue
}
if _, err := t.Write(pkt.Payload()[:pkt.Len]); err != nil {
return 0, err
}
pkt.Release()
}
return len(packets), nil
}
func (t *tun) BatchHeadroom() int {
if io, ok := t.batchIO(); ok {
return io.BatchHeadroom()
}
return 0
}
func (t *tun) BatchPayloadCap() int {
if io, ok := t.batchIO(); ok {
return io.BatchPayloadCap()
}
return 0
}
func (t *tun) BatchSize() int {
if io, ok := t.batchIO(); ok {
return io.BatchSize()
}
return 1
}

View File

@@ -1,13 +1,11 @@
package overlay
import (
"fmt"
"io"
"net/netip"
"github.com/sirupsen/logrus"
"github.com/slackhq/nebula/config"
"github.com/slackhq/nebula/packet"
"github.com/slackhq/nebula/routing"
)
@@ -38,10 +36,6 @@ type UserDevice struct {
inboundWriter *io.PipeWriter
}
func (d *UserDevice) RecycleRxSeg(pkt *packet.VirtIOPacket, kick bool, q int) error {
return nil
}
func (d *UserDevice) Activate() error {
return nil
}
@@ -52,7 +46,7 @@ func (d *UserDevice) RoutesFor(ip netip.Addr) routing.Gateways {
return routing.Gateways{routing.NewGateway(ip, 1)}
}
func (d *UserDevice) NewMultiQueueReader() (TunDev, error) {
func (d *UserDevice) NewMultiQueueReader() (io.ReadWriteCloser, error) {
return d, nil
}
@@ -71,19 +65,3 @@ func (d *UserDevice) Close() error {
d.outboundWriter.Close()
return nil
}
func (d *UserDevice) ReadMany(b []*packet.VirtIOPacket, _ int) (int, error) {
return d.Read(b[0].Payload)
}
func (d *UserDevice) AllocSeg(pkt *packet.OutPacket, q int) (int, error) {
return 0, fmt.Errorf("user: AllocSeg not implemented")
}
func (d *UserDevice) WriteOne(x *packet.OutPacket, kick bool, q int) (int, error) {
return 0, fmt.Errorf("user: WriteOne not implemented")
}
func (d *UserDevice) WriteMany(x []*packet.OutPacket, q int) (int, error) {
return 0, fmt.Errorf("user: WriteMany not implemented")
}

View File

@@ -1,23 +0,0 @@
Significant portions of this code are derived from https://pkg.go.dev/github.com/hetznercloud/virtio-go
MIT License
Copyright (c) 2025 Hetzner Cloud GmbH
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,4 +0,0 @@
// Package vhost implements the basic ioctl requests needed to interact with the
// kernel-level virtio server that provides accelerated virtio devices for
// networking and more.
package vhost

View File

@@ -1,218 +0,0 @@
package vhost
import (
"fmt"
"unsafe"
"github.com/slackhq/nebula/overlay/virtqueue"
"github.com/slackhq/nebula/util/virtio"
"golang.org/x/sys/unix"
)
const (
// vhostIoctlGetFeatures can be used to retrieve the features supported by
// the vhost implementation in the kernel.
//
// Response payload: [virtio.Feature]
// Kernel name: VHOST_GET_FEATURES
vhostIoctlGetFeatures = 0x8008af00
// vhostIoctlSetFeatures can be used to communicate the features supported
// by this virtio implementation to the kernel.
//
// Request payload: [virtio.Feature]
// Kernel name: VHOST_SET_FEATURES
vhostIoctlSetFeatures = 0x4008af00
// vhostIoctlSetOwner can be used to set the current process as the
// exclusive owner of a control file descriptor.
//
// Request payload: none
// Kernel name: VHOST_SET_OWNER
vhostIoctlSetOwner = 0x0000af01
// vhostIoctlSetMemoryLayout can be used to set up or modify the memory
// layout which describes the IOTLB mappings in the kernel.
//
// Request payload: [MemoryLayout] with custom serialization
// Kernel name: VHOST_SET_MEM_TABLE
vhostIoctlSetMemoryLayout = 0x4008af03
// vhostIoctlSetQueueSize can be used to set the size of the virtqueue.
//
// Request payload: [QueueState]
// Kernel name: VHOST_SET_VRING_NUM
vhostIoctlSetQueueSize = 0x4008af10
// vhostIoctlSetQueueAddress can be used to set the addresses of the
// different parts of the virtqueue.
//
// Request payload: [QueueAddresses]
// Kernel name: VHOST_SET_VRING_ADDR
vhostIoctlSetQueueAddress = 0x4028af11
// vhostIoctlSetAvailableRingBase can be used to set the index of the next
// available ring entry the device will process.
//
// Request payload: [QueueState]
// Kernel name: VHOST_SET_VRING_BASE
vhostIoctlSetAvailableRingBase = 0x4008af12
// vhostIoctlSetQueueKickEventFD can be used to set the event file
// descriptor to signal the device when descriptor chains were added to the
// available ring.
//
// Request payload: [QueueFile]
// Kernel name: VHOST_SET_VRING_KICK
vhostIoctlSetQueueKickEventFD = 0x4008af20
// vhostIoctlSetQueueCallEventFD can be used to set the event file
// descriptor that gets signaled by the device when descriptor chains have
// been used by it.
//
// Request payload: [QueueFile]
// Kernel name: VHOST_SET_VRING_CALL
vhostIoctlSetQueueCallEventFD = 0x4008af21
)
// QueueState is an ioctl request payload that can hold a queue index and any
// 32-bit number.
//
// Kernel name: vhost_vring_state
type QueueState struct {
// QueueIndex is the index of the virtqueue.
QueueIndex uint32
// Num is any 32-bit number, depending on the request.
Num uint32
}
// QueueAddresses is an ioctl request payload that can hold the addresses of the
// different parts of a virtqueue.
//
// Kernel name: vhost_vring_addr
type QueueAddresses struct {
// QueueIndex is the index of the virtqueue.
QueueIndex uint32
// Flags that are not used in this implementation.
Flags uint32
// DescriptorTableAddress is the address of the descriptor table in user
// space memory. It must be 16-byte aligned.
DescriptorTableAddress uintptr
// UsedRingAddress is the address of the used ring in user space memory. It
// must be 4-byte aligned.
UsedRingAddress uintptr
// AvailableRingAddress is the address of the available ring in user space
// memory. It must be 2-byte aligned.
AvailableRingAddress uintptr
// LogAddress is used for an optional logging support, not supported by this
// implementation.
LogAddress uintptr
}
// QueueFile is an ioctl request payload that can hold a queue index and a file
// descriptor.
//
// Kernel name: vhost_vring_file
type QueueFile struct {
// QueueIndex is the index of the virtqueue.
QueueIndex uint32
// FD is the file descriptor of the file. Pass -1 to unbind from a file.
FD int32
}
// IoctlPtr is a copy of the similarly named unexported function from the Go
// unix package. This is needed to do custom ioctl requests not supported by the
// standard library.
func IoctlPtr(fd int, req uint, arg unsafe.Pointer) error {
_, _, err := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(arg))
if err != 0 {
return fmt.Errorf("ioctl request %d: %w", req, err)
}
return nil
}
// GetFeatures requests the supported feature bits from the virtio device
// associated with the given control file descriptor.
func GetFeatures(controlFD int) (virtio.Feature, error) {
var features virtio.Feature
if err := IoctlPtr(controlFD, vhostIoctlGetFeatures, unsafe.Pointer(&features)); err != nil {
return 0, fmt.Errorf("get features: %w", err)
}
return features, nil
}
// SetFeatures communicates the feature bits supported by this implementation
// to the virtio device associated with the given control file descriptor.
func SetFeatures(controlFD int, features virtio.Feature) error {
if err := IoctlPtr(controlFD, vhostIoctlSetFeatures, unsafe.Pointer(&features)); err != nil {
return fmt.Errorf("set features: %w", err)
}
return nil
}
// OwnControlFD sets the current process as the exclusive owner for the
// given control file descriptor. This must be called before interacting with
// the control file descriptor in any other way.
func OwnControlFD(controlFD int) error {
if err := IoctlPtr(controlFD, vhostIoctlSetOwner, unsafe.Pointer(nil)); err != nil {
return fmt.Errorf("set control file descriptor owner: %w", err)
}
return nil
}
// SetMemoryLayout sets up or modifies the memory layout for the kernel-level
// virtio device associated with the given control file descriptor.
func SetMemoryLayout(controlFD int, layout MemoryLayout) error {
payload := layout.serializePayload()
if err := IoctlPtr(controlFD, vhostIoctlSetMemoryLayout, unsafe.Pointer(&payload[0])); err != nil {
return fmt.Errorf("set memory layout: %w", err)
}
return nil
}
// RegisterQueue registers a virtio queue with the kernel-level virtio server.
// The virtqueue will be linked to the given control file descriptor and will
// have the given index. The kernel will use this queue until the control file
// descriptor is closed.
func RegisterQueue(controlFD int, queueIndex uint32, queue *virtqueue.SplitQueue) error {
if err := IoctlPtr(controlFD, vhostIoctlSetQueueSize, unsafe.Pointer(&QueueState{
QueueIndex: queueIndex,
Num: uint32(queue.Size()),
})); err != nil {
return fmt.Errorf("set queue size: %w", err)
}
if err := IoctlPtr(controlFD, vhostIoctlSetQueueAddress, unsafe.Pointer(&QueueAddresses{
QueueIndex: queueIndex,
Flags: 0,
DescriptorTableAddress: queue.DescriptorTable().Address(),
UsedRingAddress: queue.UsedRing().Address(),
AvailableRingAddress: queue.AvailableRing().Address(),
LogAddress: 0,
})); err != nil {
return fmt.Errorf("set queue addresses: %w", err)
}
if err := IoctlPtr(controlFD, vhostIoctlSetAvailableRingBase, unsafe.Pointer(&QueueState{
QueueIndex: queueIndex,
Num: 0,
})); err != nil {
return fmt.Errorf("set available ring base: %w", err)
}
if err := IoctlPtr(controlFD, vhostIoctlSetQueueKickEventFD, unsafe.Pointer(&QueueFile{
QueueIndex: queueIndex,
FD: int32(queue.KickEventFD()),
})); err != nil {
return fmt.Errorf("set kick event file descriptor: %w", err)
}
if err := IoctlPtr(controlFD, vhostIoctlSetQueueCallEventFD, unsafe.Pointer(&QueueFile{
QueueIndex: queueIndex,
FD: int32(queue.CallEventFD()),
})); err != nil {
return fmt.Errorf("set call event file descriptor: %w", err)
}
return nil
}

View File

@@ -1,21 +0,0 @@
package vhost_test
import (
"testing"
"unsafe"
"github.com/slackhq/nebula/overlay/vhost"
"github.com/stretchr/testify/assert"
)
func TestQueueState_Size(t *testing.T) {
assert.EqualValues(t, 8, unsafe.Sizeof(vhost.QueueState{}))
}
func TestQueueAddresses_Size(t *testing.T) {
assert.EqualValues(t, 40, unsafe.Sizeof(vhost.QueueAddresses{}))
}
func TestQueueFile_Size(t *testing.T) {
assert.EqualValues(t, 8, unsafe.Sizeof(vhost.QueueFile{}))
}

View File

@@ -1,73 +0,0 @@
package vhost
import (
"encoding/binary"
"fmt"
"unsafe"
"github.com/slackhq/nebula/overlay/virtqueue"
)
// MemoryRegion describes a region of userspace memory which is being made
// accessible to a vhost device.
//
// Kernel name: vhost_memory_region
type MemoryRegion struct {
// GuestPhysicalAddress is the physical address of the memory region within
// the guest, when virtualization is used. When no virtualization is used,
// this should be the same as UserspaceAddress.
GuestPhysicalAddress uintptr
// Size is the size of the memory region.
Size uint64
// UserspaceAddress is the virtual address in the userspace of the host
// where the memory region can be found.
UserspaceAddress uintptr
// Padding and room for flags. Currently unused.
_ uint64
}
// MemoryLayout is a list of [MemoryRegion]s.
type MemoryLayout []MemoryRegion
// NewMemoryLayoutForQueues returns a new [MemoryLayout] that describes the
// memory pages used by the descriptor tables of the given queues.
func NewMemoryLayoutForQueues(queues []*virtqueue.SplitQueue) MemoryLayout {
regions := make([]MemoryRegion, 0)
for _, queue := range queues {
for address, size := range queue.DescriptorTable().BufferAddresses() {
regions = append(regions, MemoryRegion{
// There is no virtualization in play here, so the guest address
// is the same as in the host's userspace.
GuestPhysicalAddress: address,
Size: uint64(size),
UserspaceAddress: address,
})
}
}
return regions
}
// serializePayload serializes the list of memory regions into a format that is
// compatible to the vhost_memory kernel struct. The returned byte slice can be
// used as a payload for the vhostIoctlSetMemoryLayout ioctl.
func (regions MemoryLayout) serializePayload() []byte {
regionCount := len(regions)
regionSize := int(unsafe.Sizeof(MemoryRegion{}))
payload := make([]byte, 8+regionCount*regionSize)
// The first 32 bits contain the number of memory regions. The following 32
// bits are padding.
binary.LittleEndian.PutUint32(payload[0:4], uint32(regionCount))
if regionCount > 0 {
// The underlying byte array of the slice should already have the correct
// format, so just copy that.
copied := copy(payload[8:], unsafe.Slice((*byte)(unsafe.Pointer(&regions[0])), regionCount*regionSize))
if copied != regionCount*regionSize {
panic(fmt.Sprintf("copied only %d bytes of the memory regions, but expected %d",
copied, regionCount*regionSize))
}
}
return payload
}

View File

@@ -1,42 +0,0 @@
package vhost
import (
"testing"
"unsafe"
"github.com/stretchr/testify/assert"
)
func TestMemoryRegion_Size(t *testing.T) {
assert.EqualValues(t, 32, unsafe.Sizeof(MemoryRegion{}))
}
func TestMemoryLayout_SerializePayload(t *testing.T) {
layout := MemoryLayout([]MemoryRegion{
{
GuestPhysicalAddress: 42,
Size: 100,
UserspaceAddress: 142,
}, {
GuestPhysicalAddress: 99,
Size: 100,
UserspaceAddress: 99,
},
})
payload := layout.serializePayload()
assert.Equal(t, []byte{
0x02, 0x00, 0x00, 0x00, // nregions
0x00, 0x00, 0x00, 0x00, // padding
// region 0
0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // guest_phys_addr
0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // memory_size
0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // userspace_addr
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // flags_padding
// region 1
0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // guest_phys_addr
0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // memory_size
0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // userspace_addr
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // flags_padding
}, payload)
}

View File

@@ -1,23 +0,0 @@
Significant portions of this code are derived from https://pkg.go.dev/github.com/hetznercloud/virtio-go
MIT License
Copyright (c) 2025 Hetzner Cloud GmbH
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,427 +0,0 @@
package vhostnet
import (
"context"
"errors"
"fmt"
"os"
"runtime"
"github.com/slackhq/nebula/overlay/vhost"
"github.com/slackhq/nebula/overlay/virtqueue"
"github.com/slackhq/nebula/packet"
"github.com/slackhq/nebula/util/virtio"
"golang.org/x/sys/unix"
)
// ErrDeviceClosed is returned when the [Device] is closed while operations are
// still running.
var ErrDeviceClosed = errors.New("device was closed")
// The indexes for the receive and transmit queues.
const (
receiveQueueIndex = 0
transmitQueueIndex = 1
)
// Device represents a vhost networking device within the kernel-level virtio
// implementation and provides methods to interact with it.
type Device struct {
initialized bool
controlFD int
fullTable bool
ReceiveQueue *virtqueue.SplitQueue
TransmitQueue *virtqueue.SplitQueue
}
// NewDevice initializes a new vhost networking device within the
// kernel-level virtio implementation, sets up the virtqueues and returns a
// [Device] instance that can be used to communicate with that vhost device.
//
// There are multiple options that can be passed to this constructor to
// influence device creation:
// - [WithQueueSize]
// - [WithBackendFD]
// - [WithBackendDevice]
//
// Remember to call [Device.Close] after use to free up resources.
func NewDevice(options ...Option) (*Device, error) {
var err error
opts := optionDefaults
opts.apply(options)
if err = opts.validate(); err != nil {
return nil, fmt.Errorf("invalid options: %w", err)
}
dev := Device{
controlFD: -1,
}
// Clean up a partially initialized device when something fails.
defer func() {
if err != nil {
_ = dev.Close()
}
}()
// Retrieve a new control file descriptor. This will be used to configure
// the vhost networking device in the kernel.
dev.controlFD, err = unix.Open("/dev/vhost-net", os.O_RDWR, 0666)
if err != nil {
return nil, fmt.Errorf("get control file descriptor: %w", err)
}
if err = vhost.OwnControlFD(dev.controlFD); err != nil {
return nil, fmt.Errorf("own control file descriptor: %w", err)
}
// Advertise the supported features. This isn't much for now.
// TODO: Add feature options and implement proper feature negotiation.
getFeatures, err := vhost.GetFeatures(dev.controlFD) //0x1033D008000 but why
if err != nil {
return nil, fmt.Errorf("get features: %w", err)
}
if getFeatures == 0 {
}
//const funky = virtio.Feature(1 << 27)
//features := virtio.FeatureVersion1 | funky // | todo virtio.FeatureNetMergeRXBuffers
features := virtio.FeatureVersion1 | virtio.FeatureNetMergeRXBuffers
if err = vhost.SetFeatures(dev.controlFD, features); err != nil {
return nil, fmt.Errorf("set features: %w", err)
}
itemSize := os.Getpagesize() * 4 //todo config
// Initialize and register the queues needed for the networking device.
if dev.ReceiveQueue, err = createQueue(dev.controlFD, receiveQueueIndex, opts.queueSize, itemSize); err != nil {
return nil, fmt.Errorf("create receive queue: %w", err)
}
if dev.TransmitQueue, err = createQueue(dev.controlFD, transmitQueueIndex, opts.queueSize, itemSize); err != nil {
return nil, fmt.Errorf("create transmit queue: %w", err)
}
// Set up memory mappings for all buffers used by the queues. This has to
// happen before a backend for the queues can be registered.
memoryLayout := vhost.NewMemoryLayoutForQueues(
[]*virtqueue.SplitQueue{dev.ReceiveQueue, dev.TransmitQueue},
)
if err = vhost.SetMemoryLayout(dev.controlFD, memoryLayout); err != nil {
return nil, fmt.Errorf("setup memory layout: %w", err)
}
// Set the queue backends. This activates the queues within the kernel.
if err = SetQueueBackend(dev.controlFD, receiveQueueIndex, opts.backendFD); err != nil {
return nil, fmt.Errorf("set receive queue backend: %w", err)
}
if err = SetQueueBackend(dev.controlFD, transmitQueueIndex, opts.backendFD); err != nil {
return nil, fmt.Errorf("set transmit queue backend: %w", err)
}
// Fully populate the receive queue with available buffers which the device
// can write new packets into.
if err = dev.refillReceiveQueue(); err != nil {
return nil, fmt.Errorf("refill receive queue: %w", err)
}
if err = dev.refillTransmitQueue(); err != nil {
return nil, fmt.Errorf("refill receive queue: %w", err)
}
dev.initialized = true
// Make sure to clean up even when the device gets garbage collected without
// Close being called first.
devPtr := &dev
runtime.SetFinalizer(devPtr, (*Device).Close)
return devPtr, nil
}
// refillReceiveQueue offers as many new device-writable buffers to the device
// as the queue can fit. The device will then use these to write received
// packets.
func (dev *Device) refillReceiveQueue() error {
for {
_, err := dev.ReceiveQueue.OfferInDescriptorChains()
if err != nil {
if errors.Is(err, virtqueue.ErrNotEnoughFreeDescriptors) {
// Queue is full, job is done.
return nil
}
return fmt.Errorf("offer descriptor chain: %w", err)
}
}
}
func (dev *Device) refillTransmitQueue() error {
//for {
// desc, err := dev.TransmitQueue.DescriptorTable().CreateDescriptorForOutputs()
// if err != nil {
// if errors.Is(err, virtqueue.ErrNotEnoughFreeDescriptors) {
// // Queue is full, job is done.
// return nil
// }
// return fmt.Errorf("offer descriptor chain: %w", err)
// } else {
// dev.TransmitQueue.UsedRing().InitOfferSingle(desc, 0)
// }
//}
return nil
}
// Close cleans up the vhost networking device within the kernel and releases
// all resources used for it.
// The implementation will try to release as many resources as possible and
// collect potential errors before returning them.
func (dev *Device) Close() error {
dev.initialized = false
// Closing the control file descriptor will unregister all queues from the
// kernel.
if dev.controlFD >= 0 {
if err := unix.Close(dev.controlFD); err != nil {
// Return an error and do not continue, because the memory used for
// the queues should not be released before they were unregistered
// from the kernel.
return fmt.Errorf("close control file descriptor: %w", err)
}
dev.controlFD = -1
}
var errs []error
if dev.ReceiveQueue != nil {
if err := dev.ReceiveQueue.Close(); err == nil {
dev.ReceiveQueue = nil
} else {
errs = append(errs, fmt.Errorf("close receive queue: %w", err))
}
}
if dev.TransmitQueue != nil {
if err := dev.TransmitQueue.Close(); err == nil {
dev.TransmitQueue = nil
} else {
errs = append(errs, fmt.Errorf("close transmit queue: %w", err))
}
}
if len(errs) == 0 {
// Everything was cleaned up. No need to run the finalizer anymore.
runtime.SetFinalizer(dev, nil)
}
return errors.Join(errs...)
}
// ensureInitialized is used as a guard to prevent methods to be called on an
// uninitialized instance.
func (dev *Device) ensureInitialized() {
if !dev.initialized {
panic("device is not initialized")
}
}
// createQueue creates a new virtqueue and registers it with the vhost device
// using the given index.
func createQueue(controlFD int, queueIndex int, queueSize int, itemSize int) (*virtqueue.SplitQueue, error) {
var (
queue *virtqueue.SplitQueue
err error
)
if queue, err = virtqueue.NewSplitQueue(queueSize, itemSize); err != nil {
return nil, fmt.Errorf("create virtqueue: %w", err)
}
if err = vhost.RegisterQueue(controlFD, uint32(queueIndex), queue); err != nil {
return nil, fmt.Errorf("register virtqueue with index %d: %w", queueIndex, err)
}
return queue, nil
}
// truncateBuffers returns a new list of buffers whose combined length matches
// exactly the specified length. When the specified length exceeds the length of
// the buffers, this is an error. When it is smaller, the buffer list will be
// truncated accordingly.
func truncateBuffers(buffers [][]byte, length int) (out [][]byte) {
for _, buffer := range buffers {
if length < len(buffer) {
out = append(out, buffer[:length])
return
}
out = append(out, buffer)
length -= len(buffer)
}
if length > 0 {
panic("length exceeds the combined length of all buffers")
}
return
}
func (dev *Device) GetPacketForTx() (uint16, []byte, error) {
var err error
var idx uint16
if !dev.fullTable {
idx, err = dev.TransmitQueue.DescriptorTable().CreateDescriptorForOutputs()
if err == virtqueue.ErrNotEnoughFreeDescriptors {
dev.fullTable = true
idx, err = dev.TransmitQueue.TakeSingle(context.TODO())
}
} else {
idx, err = dev.TransmitQueue.TakeSingle(context.TODO())
}
if err != nil {
return 0, nil, fmt.Errorf("transmit queue: %w", err)
}
buf, err := dev.TransmitQueue.GetDescriptorItem(idx)
if err != nil {
return 0, nil, fmt.Errorf("get descriptor chain: %w", err)
}
return idx, buf, nil
}
func (dev *Device) TransmitPacket(pkt *packet.OutPacket, kick bool) error {
if len(pkt.SegmentIDs) == 0 {
return nil
}
for idx := range pkt.SegmentIDs {
segmentID := pkt.SegmentIDs[idx]
dev.TransmitQueue.SetDescSize(segmentID, len(pkt.Segments[idx]))
}
err := dev.TransmitQueue.OfferDescriptorChains(pkt.SegmentIDs, false)
if err != nil {
return fmt.Errorf("offer descriptor chains: %w", err)
}
pkt.Reset()
if kick {
if err := dev.TransmitQueue.Kick(); err != nil {
return err
}
}
return nil
}
func (dev *Device) TransmitPackets(pkts []*packet.OutPacket) error {
if len(pkts) == 0 {
return nil
}
for i := range pkts {
if err := dev.TransmitPacket(pkts[i], false); err != nil {
return err
}
}
if err := dev.TransmitQueue.Kick(); err != nil {
return err
}
return nil
}
// TODO: Make above methods cancelable by taking a context.Context argument?
// TODO: Implement zero-copy variants to transmit and receive packets?
// processChains processes as many chains as needed to create one packet. The number of processed chains is returned.
func (dev *Device) processChains(pkt *packet.VirtIOPacket, chains []virtqueue.UsedElement) (int, error) {
//read first element to see how many descriptors we need:
pkt.Reset()
err := dev.ReceiveQueue.GetDescriptorInbuffers(uint16(chains[0].DescriptorIndex), &pkt.ChainRefs)
if err != nil {
return 0, fmt.Errorf("get descriptor chain: %w", err)
}
if len(pkt.ChainRefs) == 0 {
return 1, nil
}
// The specification requires that the first descriptor chain starts
// with a virtio-net header. It is not clear, whether it is also
// required to be fully contained in the first buffer of that
// descriptor chain, but it is reasonable to assume that this is
// always the case.
// The decode method already does the buffer length check.
if err = pkt.Header.Decode(pkt.ChainRefs[0][0:]); err != nil {
// The device misbehaved. There is no way we can gracefully
// recover from this, because we don't know how many of the
// following descriptor chains belong to this packet.
return 0, fmt.Errorf("decode vnethdr: %w", err)
}
//we have the header now: what do we need to do?
if int(pkt.Header.NumBuffers) > len(chains) {
return 0, fmt.Errorf("number of buffers is greater than number of chains %d", len(chains))
}
if int(pkt.Header.NumBuffers) != 1 {
return 0, fmt.Errorf("too smol-brain to handle more than one chain right now: %d chains", len(chains))
}
if chains[0].Length > 16000 {
//todo!
return 1, fmt.Errorf("too big packet length: %d", chains[0].Length)
}
//shift the buffer out of out:
pkt.Payload = pkt.ChainRefs[0][virtio.NetHdrSize:chains[0].Length]
pkt.Chains = append(pkt.Chains, uint16(chains[0].DescriptorIndex))
return 1, nil
//cursor := n - virtio.NetHdrSize
//
//if uint32(n) >= chains[0].Length && pkt.Header.NumBuffers == 1 {
// pkt.Payload = pkt.Payload[:chains[0].Length-virtio.NetHdrSize]
// return 1, nil
//}
//
//i := 1
//// we used chain 0 already
//for i = 1; i < len(chains); i++ {
// n, err = dev.ReceiveQueue.GetDescriptorChainContents(uint16(chains[i].DescriptorIndex), pkt.Payload[cursor:], int(chains[i].Length))
// if err != nil {
// // When this fails we may miss to free some descriptor chains. We
// // could try to mitigate this by deferring the freeing somehow, but
// // it's not worth the hassle. When this method fails, the queue will
// // be in a broken state anyway.
// return i, fmt.Errorf("get descriptor chain: %w", err)
// }
// cursor += n
//}
////todo this has to be wrong
//pkt.Payload = pkt.Payload[:cursor]
//return i, nil
}
// ReceivePackets blocks until the receive queue reports used descriptor
// chains, converts them into packets stored in out, and returns the number of
// packets written.
//
// NOTE(review): when processChains fails mid-loop this returns (0, err) even
// though earlier slots of out were already populated, and none of the consumed
// chains are recycled; the recycle call below is commented out — confirm the
// queue is torn down on error, otherwise descriptors leak.
func (dev *Device) ReceivePackets(out []*packet.VirtIOPacket) (int, error) {
	//todo optimize?
	var chains []virtqueue.UsedElement
	var err error
	//if len(dev.extraRx) == 0 {
	// Block until at least one chain is used, capped to the capacity of out.
	chains, err = dev.ReceiveQueue.BlockAndGetHeadsCapped(context.TODO(), len(out))
	if err != nil {
		return 0, err
	}
	if len(chains) == 0 {
		return 0, nil
	}
	//} else {
	//	chains = dev.extraRx
	//}
	numPackets := 0
	chainsIdx := 0
	// One packet may consume one or more descriptor chains (see processChains),
	// so the two counters advance independently.
	for numPackets = 0; chainsIdx < len(chains); numPackets++ {
		if numPackets >= len(out) {
			// Out of output slots; the remaining chains are dropped (and,
			// per the note above, currently not recycled).
			return numPackets, fmt.Errorf("dropping %d packets, no room", len(chains)-numPackets)
		}
		numChains, err := dev.processChains(out[numPackets], chains[chainsIdx:])
		if err != nil {
			return 0, err
		}
		chainsIdx += numChains
	}
	// Now that we have copied all buffers, we can recycle the used descriptor chains
	//if err = dev.ReceiveQueue.OfferDescriptorChains(chains); err != nil {
	//	return 0, err
	//}
	return numPackets, nil
}

View File

@@ -1,3 +0,0 @@
// Package vhostnet implements methods to initialize vhost networking devices
// within the kernel-level virtio implementation and communicate with them.
package vhostnet

View File

@@ -1,31 +0,0 @@
package vhostnet
import (
"fmt"
"unsafe"
"github.com/slackhq/nebula/overlay/vhost"
)
const (
	// vhostNetIoctlSetBackend can be used to attach a virtqueue to a RAW socket
	// or TAP device.
	//
	// Request payload: [vhost.QueueFile]
	// Kernel name: VHOST_NET_SET_BACKEND
	// NOTE(review): value looks like _IOW(0xAF, 0x30, struct vhost_vring_file)
	// — confirm against <linux/vhost.h>.
	vhostNetIoctlSetBackend = 0x4008af30
)
// SetQueueBackend attaches a virtqueue of the vhost networking device
// described by controlFD to the given backend file descriptor.
// The backend file descriptor can either be a RAW socket or a TAP device.
// Passing -1 detaches the queue instead.
func SetQueueBackend(controlFD int, queueIndex uint32, backendFD int) error {
	// Build the ioctl payload up front so the pointer passed to the kernel is
	// easy to see.
	payload := vhost.QueueFile{
		QueueIndex: queueIndex,
		FD:         int32(backendFD),
	}
	err := vhost.IoctlPtr(controlFD, vhostNetIoctlSetBackend, unsafe.Pointer(&payload))
	if err != nil {
		return fmt.Errorf("set queue backend file descriptor: %w", err)
	}
	return nil
}

View File

@@ -1,69 +0,0 @@
package vhostnet
import (
"errors"
"github.com/slackhq/nebula/overlay/virtqueue"
)
// optionValues holds the resolved configuration produced by applying a list
// of [Option] values; see optionDefaults for the initial state.
type optionValues struct {
	queueSize int // size of the TX/RX queues; -1 means unset (required)
	backendFD int // backend file descriptor; -1 means unset (required)
}
// apply runs every option in order, mutating o in place.
func (o *optionValues) apply(options []Option) {
	for i := range options {
		options[i](o)
	}
}
// validate reports an error when a required option is missing or when the
// configured queue size is rejected by the virtqueue package.
func (o *optionValues) validate() error {
	if o.queueSize == -1 {
		return errors.New("queue size is required")
	}
	err := virtqueue.CheckQueueSize(o.queueSize)
	if err != nil {
		return err
	}
	if o.backendFD == -1 {
		return errors.New("backend file descriptor is required")
	}
	return nil
}
// optionDefaults is the starting state before options are applied; the -1
// sentinels mark the required fields as unset (see validate).
var optionDefaults = optionValues{
	// Required.
	queueSize: -1,
	// Required.
	backendFD: -1,
}

// Option can be passed to [NewDevice] to influence device creation.
type Option func(*optionValues)
// WithQueueSize returns an [Option] that sets the size of the TX and RX queues
// that are to be created for the device, i.e. the number of entries/buffers
// each queue can hold (which also affects memory consumption).
// This is required and must be an integer from 1 to 32768 that is also a power
// of 2.
func WithQueueSize(queueSize int) Option {
	return func(o *optionValues) {
		o.queueSize = queueSize
	}
}
// WithBackendFD returns an [Option] that sets the file descriptor of the
// backend used for the queues of the device. The device will write and read
// packets to/from that backend. The file descriptor can either be of a RAW
// socket or TUN/TAP device.
// Either this or [WithBackendDevice] is required.
func WithBackendFD(backendFD int) Option {
	return func(o *optionValues) {
		o.backendFD = backendFD
	}
}
//// WithBackendDevice returns an [Option] that sets the given TAP device as the
//// backend that will be used for the queues of the device. The device will
//// write and read packets to/from that backend. The TAP device should have been
//// created with the [tuntap.WithVirtioNetHdr] option enabled.
//// Either this or [WithBackendFD] is required.
//func WithBackendDevice(dev *tuntap.Device) Option {
// return func(o *optionValues) { o.backendFD = int(dev.File().Fd()) }
//}

View File

@@ -1,23 +0,0 @@
Significant portions of this code are derived from https://pkg.go.dev/github.com/hetznercloud/virtio-go
MIT License
Copyright (c) 2025 Hetzner Cloud GmbH
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,140 +0,0 @@
package virtqueue
import (
"fmt"
"unsafe"
)
// availableRingFlag is a flag that describes an [AvailableRing].
type availableRingFlag uint16

const (
	// availableRingFlagNoInterrupt is used by the guest to advise the host to
	// not interrupt it when consuming a buffer. It's unreliable, so it's simply
	// an optimization. (Bit 0; VIRTQ_AVAIL_F_NO_INTERRUPT in the virtio spec —
	// TODO confirm.)
	availableRingFlagNoInterrupt availableRingFlag = 1 << iota
)
// availableRingSize is the number of bytes needed to store an [AvailableRing]
// with the given queue size in memory: two bytes each for the flags, ring
// index and trailing used-event fields, plus two bytes per ring entry.
func availableRingSize(queueSize int) int {
	const fixedOverhead = 6 // flags (2) + ring index (2) + used event (2)
	return fixedOverhead + queueSize*2
}
// availableRingAlignment is the minimum alignment of an [AvailableRing]
// in memory (in bytes), as required by the virtio spec.
const availableRingAlignment = 2
// AvailableRing is used by the driver to offer descriptor chains to the device.
// Each ring entry refers to the head of a descriptor chain. It is only written
// to by the driver and read by the device.
//
// Because the size of the ring depends on the queue size, we cannot define a
// Go struct with a static size that maps to the memory of the ring. Instead,
// this struct only contains pointers to the corresponding memory areas.
type AvailableRing struct {
	// initialized is set by newAvailableRing; Address panics when it is false.
	initialized bool
	// flags that describe this ring.
	flags *availableRingFlag
	// ringIndex indicates where the driver would put the next entry into the
	// ring (modulo the queue size).
	ringIndex *uint16
	// ring references buffers using the index of the head of the descriptor
	// chain in the [DescriptorTable]. It wraps around at queue size.
	ring []uint16
	// usedEvent is not used by this implementation, but we reserve it anyway to
	// avoid issues in case a device may try to access it, contrary to the
	// virtio specification.
	usedEvent *uint16
}
// newAvailableRing maps an [AvailableRing] onto the given memory. The length
// of mem must be exactly availableRingSize(queueSize) bytes; anything else
// panics, as it indicates a bug in the queue layout code.
func newAvailableRing(queueSize int, mem []byte) *AvailableRing {
	want := availableRingSize(queueSize)
	if len(mem) != want {
		panic(fmt.Sprintf("memory size (%v) does not match required size "+
			"for available ring: %v", len(mem), want))
	}
	// Layout: flags (2 bytes), ring index (2), ring entries (2*queueSize),
	// used event (2, at the very end).
	ring := AvailableRing{
		initialized: true,
		flags:       (*availableRingFlag)(unsafe.Pointer(&mem[0])),
		ringIndex:   (*uint16)(unsafe.Pointer(&mem[2])),
		ring:        unsafe.Slice((*uint16)(unsafe.Pointer(&mem[4])), queueSize),
		usedEvent:   (*uint16)(unsafe.Pointer(&mem[want-2])),
	}
	return &ring
}
// Address returns the pointer to the beginning of the ring in memory.
// Do not modify the memory directly to not interfere with this implementation.
func (r *AvailableRing) Address() uintptr {
	if r.initialized {
		return uintptr(unsafe.Pointer(r.flags))
	}
	panic("available ring is not initialized")
}
// offerElements publishes the chain head of every used element to the ring and
// then advances the ring index so the device will process the new chains.
// Must be called with the queue lock held.
func (r *AvailableRing) offerElements(chains []UsedElement) {
	base := *r.ringIndex
	for i, elem := range chains {
		// The 16-bit index arithmetic is allowed to wrap: the ring length is a
		// power of two below 2^16, so the modulo result stays correct across
		// overflow.
		slot := int(base+uint16(i)) % len(r.ring)
		r.ring[slot] = elem.GetHead()
	}
	// Publish all new entries at once by bumping the index past them.
	*r.ringIndex = base + uint16(len(chains))
}
// offer publishes the given descriptor chain heads to the ring and advances
// the ring index accordingly. Must be called with the queue lock held.
func (r *AvailableRing) offer(chains []uint16) {
	base := *r.ringIndex
	for i, head := range chains {
		// The 16-bit index arithmetic is allowed to wrap: the ring length is a
		// power of two below 2^16, so the modulo result stays correct across
		// overflow.
		slot := int(base+uint16(i)) % len(r.ring)
		r.ring[slot] = head
	}
	// Publish all new entries at once by bumping the index past them.
	*r.ringIndex = base + uint16(len(chains))
}
// offerSingle publishes one descriptor chain head to the ring and advances the
// ring index by one. Must be called with the queue lock held.
func (r *AvailableRing) offerSingle(head uint16) {
	// 16-bit overflow of the index is fine: the ring length is a power of two
	// below 2^16, so the modulo keeps pointing at the right slot.
	slot := int(*r.ringIndex) % len(r.ring)
	r.ring[slot] = head
	*r.ringIndex++
}

View File

@@ -1,71 +0,0 @@
package virtqueue
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestAvailableRing_MemoryLayout verifies that the ring's field pointers map
// onto the expected little-endian byte layout: flags, ring index, ring
// entries, then the reserved used-event word.
func TestAvailableRing_MemoryLayout(t *testing.T) {
	const queueSize = 2
	memory := make([]byte, availableRingSize(queueSize))
	r := newAvailableRing(queueSize, memory)
	*r.flags = 0x01ff
	*r.ringIndex = 1
	r.ring[0] = 0x1234
	r.ring[1] = 0x5678
	assert.Equal(t, []byte{
		0xff, 0x01, // flags
		0x01, 0x00, // ring index
		0x34, 0x12, // ring[0]
		0x78, 0x56, // ring[1]
		0x00, 0x00, // used event (untouched)
	}, memory)
}
// TestAvailableRing_Offer verifies offer's slot placement and index
// arithmetic, including wrap-around of the ring array and overflow of the
// 16-bit ring index.
func TestAvailableRing_Offer(t *testing.T) {
	const queueSize = 8
	chainHeads := []uint16{42, 33, 69}
	tests := []struct {
		name              string
		startRingIndex    uint16
		expectedRingIndex uint16
		expectedRing      []uint16
	}{
		{
			name:              "no overflow",
			startRingIndex:    0,
			expectedRingIndex: 3,
			expectedRing:      []uint16{42, 33, 69, 0, 0, 0, 0, 0},
		},
		{
			// Slots wrap modulo the queue size.
			name:              "ring overflow",
			startRingIndex:    6,
			expectedRingIndex: 9,
			expectedRing:      []uint16{69, 0, 0, 0, 0, 0, 42, 33},
		},
		{
			// The 16-bit ring index itself may wrap past 65535.
			name:              "index overflow",
			startRingIndex:    65535,
			expectedRingIndex: 2,
			expectedRing:      []uint16{33, 69, 0, 0, 0, 0, 0, 42},
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			memory := make([]byte, availableRingSize(queueSize))
			r := newAvailableRing(queueSize, memory)
			*r.ringIndex = tt.startRingIndex
			r.offer(chainHeads)
			assert.Equal(t, tt.expectedRingIndex, *r.ringIndex)
			assert.Equal(t, tt.expectedRing, r.ring)
		})
	}
}

View File

@@ -1,43 +0,0 @@
package virtqueue
// descriptorFlag is a flag that describes a [Descriptor].
type descriptorFlag uint16

const (
	// descriptorFlagHasNext marks a descriptor chain as continuing via the next
	// field. (Bit 0.)
	descriptorFlagHasNext descriptorFlag = 1 << iota
	// descriptorFlagWritable marks a buffer as device write-only (otherwise
	// device read-only). (Bit 1.)
	descriptorFlagWritable
	// descriptorFlagIndirect means the buffer contains a list of buffer
	// descriptors to provide an additional layer of indirection.
	// Only allowed when the [virtio.FeatureIndirectDescriptors] feature was
	// negotiated. (Bit 2.)
	descriptorFlagIndirect
)

// descriptorSize is the number of bytes needed to store a [Descriptor] in
// memory (asserted by TestDescriptor_Size).
const descriptorSize = 16

// Descriptor describes (a part of) a buffer which is either read-only for the
// device or write-only for the device (depending on [descriptorFlagWritable]).
// Multiple descriptors can be chained to produce a "descriptor chain" that can
// contain both device-readable and device-writable buffers. Device-readable
// descriptors always come first in a chain. A single, large buffer may be
// split up by chaining multiple similar descriptors that reference different
// memory pages. This is required, because buffers may exceed a single page size
// and the memory accessed by the device is expected to be continuous.
type Descriptor struct {
	// address is the address to the continuous memory holding the data for this
	// descriptor.
	address uintptr
	// length is the amount of bytes stored at address.
	length uint32
	// flags that describe this descriptor.
	flags descriptorFlag
	// next contains the index of the next descriptor continuing this descriptor
	// chain when the [descriptorFlagHasNext] flag is set.
	next uint16
}

View File

@@ -1,12 +0,0 @@
package virtqueue
import (
"testing"
"unsafe"
"github.com/stretchr/testify/assert"
)
// TestDescriptor_Size guards the memory layout: the Go struct must occupy
// exactly descriptorSize (16) bytes, as the device reads it directly.
func TestDescriptor_Size(t *testing.T) {
	assert.EqualValues(t, descriptorSize, unsafe.Sizeof(Descriptor{}))
}

View File

@@ -1,641 +0,0 @@
package virtqueue
import (
"errors"
"fmt"
"math"
"unsafe"
"golang.org/x/sys/unix"
)
var (
	// ErrDescriptorChainEmpty is returned when a descriptor chain would contain
	// no buffers, which is not allowed.
	ErrDescriptorChainEmpty = errors.New("empty descriptor chains are not allowed")
	// ErrNotEnoughFreeDescriptors is returned when the free descriptors are
	// exhausted, meaning that the queue is full. Callers may retry after the
	// device has returned some chains.
	ErrNotEnoughFreeDescriptors = errors.New("not enough free descriptors, queue is full")
	// ErrInvalidDescriptorChain is returned when a descriptor chain is not
	// valid for a given operation.
	ErrInvalidDescriptorChain = errors.New("invalid descriptor chain")
)

// noFreeHead is used to mark when all descriptors are in use and we have no
// free chain. This value is impossible to occur as an index naturally, because
// it exceeds the maximum queue size.
const noFreeHead = uint16(math.MaxUint16)
// descriptorTableSize is the number of bytes needed to store a
// [DescriptorTable] with the given queue size in memory (16 bytes per
// descriptor, see descriptorSize).
func descriptorTableSize(queueSize int) int {
	return descriptorSize * queueSize
}

// descriptorTableAlignment is the minimum alignment of a [DescriptorTable]
// in memory (in bytes), as required by the virtio spec.
const descriptorTableAlignment = 16
// DescriptorTable is a table that holds [Descriptor]s, addressed via their
// index in the slice.
type DescriptorTable struct {
	descriptors []Descriptor
	// freeHeadIndex is the index of the head of the descriptor chain which
	// contains all currently unused descriptors. When all descriptors are in
	// use, this has the special value of noFreeHead.
	freeHeadIndex uint16
	// freeNum tracks the number of descriptors which are currently not in use.
	freeNum uint16
	// bufferBase is the start address of the single mmap'ed region that backs
	// all descriptor buffers (set by initializeDescriptors).
	bufferBase uintptr
	// bufferSize is the total byte size of that mmap'ed region.
	bufferSize int
	// itemSize is the fixed buffer size reserved per descriptor.
	itemSize int
}
// newDescriptorTable maps a descriptor table onto the given memory. The length
// of mem must be exactly descriptorTableSize(queueSize) bytes; anything else
// panics, as it indicates a bug in the queue layout code.
//
// Before this descriptor table can be used, [initialize] must be called.
func newDescriptorTable(queueSize int, mem []byte, itemSize int) *DescriptorTable {
	want := descriptorTableSize(queueSize)
	if len(mem) != want {
		panic(fmt.Sprintf("memory size (%v) does not match required size "+
			"for descriptor table: %v", len(mem), want))
	}
	dt := DescriptorTable{
		descriptors: unsafe.Slice((*Descriptor)(unsafe.Pointer(&mem[0])), queueSize),
		// Nothing is free until initializeDescriptors has run.
		freeHeadIndex: noFreeHead,
		freeNum:       0,
		itemSize:      itemSize, //todo configurable? needs to be page-aligned
	}
	return &dt
}
// Address returns the pointer to the beginning of the descriptor table in
// memory. Do not modify the memory directly to not interfere with this
// implementation.
func (dt *DescriptorTable) Address() uintptr {
	if dt.descriptors == nil {
		panic("descriptor table is not initialized")
	}
	first := &dt.descriptors[0]
	return uintptr(unsafe.Pointer(first))
}
// Size returns the total byte size of the mmap'ed region backing the
// descriptor buffers. Panics when the table was not initialized.
func (dt *DescriptorTable) Size() uintptr {
	if dt.descriptors != nil {
		return uintptr(dt.bufferSize)
	}
	panic("descriptor table is not initialized")
}
// BufferAddresses returns a map of pointer->size for all allocations used by
// the table. With the single-region allocation scheme this is exactly one
// entry.
func (dt *DescriptorTable) BufferAddresses() map[uintptr]int {
	if dt.descriptors == nil {
		panic("descriptor table is not initialized")
	}
	addrs := make(map[uintptr]int, 1)
	addrs[dt.bufferBase] = dt.bufferSize
	return addrs
}
// initializeDescriptors allocates buffers with the size of a full memory page
// for each descriptor in the table. While this may be a bit wasteful, it makes
// dealing with descriptors way easier. Without this preallocation, we would
// have to allocate and free memory on demand, increasing complexity.
//
// All descriptors will be marked as free and will form a free chain. The
// addresses of all descriptors will be populated while their length remains
// zero.
func (dt *DescriptorTable) initializeDescriptors() error {
	numDescriptors := len(dt.descriptors)
	// Allocate ONE large region for all buffers; each descriptor gets an
	// itemSize-sized slice of it, addressed by offset.
	totalSize := dt.itemSize * numDescriptors
	basePtr, err := unix.MmapPtr(-1, 0, nil, uintptr(totalSize),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_PRIVATE|unix.MAP_ANONYMOUS)
	if err != nil {
		return fmt.Errorf("allocate buffer memory for descriptors: %w", err)
	}

	// Store the base for cleanup later (see releaseBuffers).
	dt.bufferBase = uintptr(basePtr)
	dt.bufferSize = totalSize

	for i := range dt.descriptors {
		dt.descriptors[i] = Descriptor{
			address: dt.bufferBase + uintptr(i*dt.itemSize),
			length:  0,
			// All descriptors should form a free chain that loops around.
			flags: descriptorFlagHasNext,
			next:  uint16((i + 1) % len(dt.descriptors)),
		}
	}

	// All descriptors are free to use now.
	dt.freeHeadIndex = 0
	dt.freeNum = uint16(len(dt.descriptors))

	return nil
}
// releaseBuffers releases the single mmap'ed region that backs all descriptor
// buffers of this table and clears every descriptor's (now dangling) address.
// The descriptor table should no longer be used after calling this.
//
// Returns an error when unmapping the region fails; in that case the region
// may be leaked, but the table is still marked unusable.
func (dt *DescriptorTable) releaseBuffers() error {
	for i := range dt.descriptors {
		descriptor := &dt.descriptors[i]
		descriptor.address = 0
	}

	// As a safety measure, make sure no descriptors can be used anymore.
	dt.freeHeadIndex = noFreeHead
	dt.freeNum = 0

	if dt.bufferBase != 0 {
		// BUG FIX: unmap BEFORE clearing bufferBase. The previous version
		// zeroed the field first and then passed address 0 to munmap, which
		// silently leaked the whole buffer region.
		// The pointer points to memory not managed by Go, so this conversion
		// is safe. See https://github.com/golang/go/issues/58625
		//goland:noinspection GoVetUnsafePointer
		if err := unix.MunmapPtr(unsafe.Pointer(dt.bufferBase), uintptr(dt.bufferSize)); err != nil {
			return fmt.Errorf("release buffer memory: %w", err)
		}
		dt.bufferBase = 0
		dt.bufferSize = 0
	}
	return nil
}
// createDescriptorChain creates a new descriptor chain within the descriptor
// table which contains a number of device-readable buffers (out buffers) and
// device-writable buffers (in buffers).
//
// All buffers in the outBuffers slice will be concatenated by chaining
// descriptors, one for each buffer in the slice. The size of the single buffers
// must not exceed the size of a memory page (see [os.Getpagesize]).
// When numInBuffers is greater than zero, the given number of device-writable
// descriptors will be appended to the end of the chain, each referencing a
// whole memory page.
//
// The index of the head of the new descriptor chain will be returned. Callers
// should make sure to free the descriptor chain using [freeDescriptorChain]
// after it was used by the device.
//
// When there are not enough free descriptors to hold the given number of
// buffers, an [ErrNotEnoughFreeDescriptors] will be returned. In this case, the
// caller should try again after some descriptor chains were used by the device
// and returned back into the free chain.
func (dt *DescriptorTable) createDescriptorChain(outBuffers [][]byte, numInBuffers int) (uint16, error) {
	// Calculate the number of descriptors needed to build the chain.
	numDesc := uint16(len(outBuffers) + numInBuffers)

	// Descriptor chains must always contain at least one descriptor.
	if numDesc < 1 {
		return 0, ErrDescriptorChainEmpty
	}

	// Do we still have enough free descriptors?
	if numDesc > dt.freeNum {
		return 0, ErrNotEnoughFreeDescriptors
	}

	// Above validation ensured that there is at least one free descriptor, so
	// the free descriptor chain head should be valid.
	if dt.freeHeadIndex == noFreeHead {
		panic("free descriptor chain head is unset but there should be free descriptors")
	}

	// To avoid having to iterate over the whole table to find the descriptor
	// pointing to the head just to replace the free head, we instead always
	// create descriptor chains from the descriptors coming after the head.
	// This way we only have to touch the head as a last resort, when all other
	// descriptors are already used.
	head := dt.descriptors[dt.freeHeadIndex].next
	next := head
	tail := head

	// First the device-readable buffers: copy each caller buffer into the
	// descriptor's preallocated memory.
	for i, buffer := range outBuffers {
		desc := &dt.descriptors[next]
		checkUnusedDescriptorLength(next, desc)
		if len(buffer) > dt.itemSize {
			// The caller should already prevent that from happening.
			panic(fmt.Sprintf("out buffer %d has size %d which exceeds desc length %d", i, len(buffer), dt.itemSize))
		}

		// Copy the buffer to the memory referenced by the descriptor.
		// The descriptor address points to memory not managed by Go, so this
		// conversion is safe. See https://github.com/golang/go/issues/58625
		//goland:noinspection GoVetUnsafePointer
		copy(unsafe.Slice((*byte)(unsafe.Pointer(desc.address)), dt.itemSize), buffer)
		desc.length = uint32(len(buffer))

		// Clear the flags in case there were any others set.
		desc.flags = descriptorFlagHasNext

		tail = next
		next = desc.next
	}

	// Then the device-writable buffers, which always span a full item.
	for range numInBuffers {
		desc := &dt.descriptors[next]
		checkUnusedDescriptorLength(next, desc)

		// Give the device the maximum available number of bytes to write into.
		desc.length = uint32(dt.itemSize)

		// Mark the descriptor as device-writable.
		desc.flags = descriptorFlagHasNext | descriptorFlagWritable

		tail = next
		next = desc.next
	}

	// The last descriptor should end the chain.
	tailDesc := &dt.descriptors[tail]
	tailDesc.flags &= ^descriptorFlagHasNext
	tailDesc.next = 0 // Not necessary to clear this, it's just for looks.

	dt.freeNum -= numDesc
	if dt.freeNum == 0 {
		// The last descriptor in the chain should be the free chain head
		// itself.
		if tail != dt.freeHeadIndex {
			panic("descriptor chain takes up all free descriptors but does not end with the free chain head")
		}

		// When this new chain takes up all remaining descriptors, we no longer
		// have a free chain.
		dt.freeHeadIndex = noFreeHead
	} else {
		// We took some descriptors out of the free chain, so make sure to close
		// the circle again.
		dt.descriptors[dt.freeHeadIndex].next = next
	}

	return head, nil
}
// CreateDescriptorForOutputs takes a single descriptor out of the free chain
// and prepares it as a standalone device-readable buffer spanning the full
// itemSize. It returns the index of that descriptor.
// Returns [ErrNotEnoughFreeDescriptors] when the table is exhausted.
func (dt *DescriptorTable) CreateDescriptorForOutputs() (uint16, error) {
	//todo just fill the damn table
	// Do we still have enough free descriptors?
	if 1 > dt.freeNum {
		return 0, ErrNotEnoughFreeDescriptors
	}

	// Above validation ensured that there is at least one free descriptor, so
	// the free descriptor chain head should be valid.
	if dt.freeHeadIndex == noFreeHead {
		panic("free descriptor chain head is unset but there should be free descriptors")
	}

	// To avoid having to iterate over the whole table to find the descriptor
	// pointing to the head just to replace the free head, we instead always
	// create descriptor chains from the descriptors coming after the head.
	// This way we only have to touch the head as a last resort, when all other
	// descriptors are already used.
	head := dt.descriptors[dt.freeHeadIndex].next
	desc := &dt.descriptors[head]
	next := desc.next
	checkUnusedDescriptorLength(head, desc)

	// Give the device the maximum available number of bytes to write into.
	desc.length = uint32(dt.itemSize)
	// No flags: a zero flags field marks the buffer as device-readable and
	// ends the (single-descriptor) chain.
	desc.flags = 0
	desc.next = 0 // Not necessary to clear this, it's just for looks.

	dt.freeNum -= 1
	if dt.freeNum == 0 {
		// The last descriptor in the chain should be the free chain head
		// itself.
		if next != dt.freeHeadIndex {
			panic("descriptor chain takes up all free descriptors but does not end with the free chain head")
		}

		// When this new chain takes up all remaining descriptors, we no longer
		// have a free chain.
		dt.freeHeadIndex = noFreeHead
	} else {
		// We took some descriptors out of the free chain, so make sure to close
		// the circle again.
		dt.descriptors[dt.freeHeadIndex].next = next
	}

	return head, nil
}
// createDescriptorForInputs takes a single descriptor out of the free chain
// and prepares it as a standalone device-writable buffer spanning the full
// itemSize. It returns the index of that descriptor.
// Returns [ErrNotEnoughFreeDescriptors] when the table is exhausted.
func (dt *DescriptorTable) createDescriptorForInputs() (uint16, error) {
	// Do we still have enough free descriptors?
	if 1 > dt.freeNum {
		return 0, ErrNotEnoughFreeDescriptors
	}

	// Above validation ensured that there is at least one free descriptor, so
	// the free descriptor chain head should be valid.
	if dt.freeHeadIndex == noFreeHead {
		panic("free descriptor chain head is unset but there should be free descriptors")
	}

	// To avoid having to iterate over the whole table to find the descriptor
	// pointing to the head just to replace the free head, we instead always
	// create descriptor chains from the descriptors coming after the head.
	// This way we only have to touch the head as a last resort, when all other
	// descriptors are already used.
	head := dt.descriptors[dt.freeHeadIndex].next
	desc := &dt.descriptors[head]
	next := desc.next
	checkUnusedDescriptorLength(head, desc)

	// Give the device the maximum available number of bytes to write into.
	desc.length = uint32(dt.itemSize)
	// Device-writable, single-descriptor chain (no next flag).
	desc.flags = descriptorFlagWritable
	desc.next = 0 // Not necessary to clear this, it's just for looks.

	dt.freeNum -= 1
	if dt.freeNum == 0 {
		// The last descriptor in the chain should be the free chain head
		// itself.
		if next != dt.freeHeadIndex {
			panic("descriptor chain takes up all free descriptors but does not end with the free chain head")
		}

		// When this new chain takes up all remaining descriptors, we no longer
		// have a free chain.
		dt.freeHeadIndex = noFreeHead
	} else {
		// We took some descriptors out of the free chain, so make sure to close
		// the circle again.
		dt.descriptors[dt.freeHeadIndex].next = next
	}

	return head, nil
}
// TODO: Implement a zero-copy variant of createDescriptorChain?
// getDescriptorChain returns the device-readable buffers (out buffers) and
// device-writable buffers (in buffers) of the descriptor chain that starts with
// the given head index. The descriptor chain must have been created using
// [createDescriptorChain] and must not have been freed yet (meaning that the
// head index must not be contained in the free chain).
//
// Be careful to only access the returned buffer slices when the device has not
// yet or is no longer using them. They must not be accessed after
// [freeDescriptorChain] has been called.
func (dt *DescriptorTable) getDescriptorChain(head uint16) (outBuffers, inBuffers [][]byte, err error) {
	// BUG FIX: the bounds check must be >=, not >; head == len(dt.descriptors)
	// would panic on the slice access below.
	if int(head) >= len(dt.descriptors) {
		return nil, nil, fmt.Errorf("%w: index out of range", ErrInvalidDescriptorChain)
	}

	// Iterate over the chain. The iteration is limited to the queue size to
	// avoid ending up in an endless loop when things go very wrong.
	next := head
	for range len(dt.descriptors) {
		if next == dt.freeHeadIndex {
			return nil, nil, fmt.Errorf("%w: must not be part of the free chain", ErrInvalidDescriptorChain)
		}

		desc := &dt.descriptors[next]

		// The descriptor address points to memory not managed by Go, so this
		// conversion is safe. See https://github.com/golang/go/issues/58625
		//goland:noinspection GoVetUnsafePointer
		bs := unsafe.Slice((*byte)(unsafe.Pointer(desc.address)), desc.length)
		if desc.flags&descriptorFlagWritable == 0 {
			outBuffers = append(outBuffers, bs)
		} else {
			inBuffers = append(inBuffers, bs)
		}

		// Is this the tail of the chain?
		if desc.flags&descriptorFlagHasNext == 0 {
			break
		}

		// Detect loops.
		if desc.next == head {
			return nil, nil, fmt.Errorf("%w: contains a loop", ErrInvalidDescriptorChain)
		}

		next = desc.next
	}

	return
}
// getDescriptorItem returns the buffer of the single descriptor at the given
// index, sized to the descriptor's current length. It performs no chain or
// free-list validation beyond the bounds check.
func (dt *DescriptorTable) getDescriptorItem(head uint16) ([]byte, error) {
	// BUG FIX: the bounds check must be >=, not >; head == len(dt.descriptors)
	// would panic on the index below.
	if int(head) >= len(dt.descriptors) {
		return nil, fmt.Errorf("%w: index out of range", ErrInvalidDescriptorChain)
	}
	desc := &dt.descriptors[head] //todo this is a pretty nasty hack with no checks
	// The descriptor address points to memory not managed by Go, so this
	// conversion is safe. See https://github.com/golang/go/issues/58625
	//goland:noinspection GoVetUnsafePointer
	bs := unsafe.Slice((*byte)(unsafe.Pointer(desc.address)), desc.length)
	return bs, nil
}
// getDescriptorInbuffers appends the device-writable buffers of the chain
// starting at head to inBuffers. The chain must consist exclusively of
// device-writable descriptors; encountering a device-readable one is an error.
func (dt *DescriptorTable) getDescriptorInbuffers(head uint16, inBuffers *[][]byte) error {
	// BUG FIX: the bounds check must be >=, not >; head == len(dt.descriptors)
	// would panic on the slice access below.
	if int(head) >= len(dt.descriptors) {
		return fmt.Errorf("%w: index out of range", ErrInvalidDescriptorChain)
	}

	// Iterate over the chain. The iteration is limited to the queue size to
	// avoid ending up in an endless loop when things go very wrong.
	next := head
	for range len(dt.descriptors) {
		if next == dt.freeHeadIndex {
			return fmt.Errorf("%w: must not be part of the free chain", ErrInvalidDescriptorChain)
		}

		desc := &dt.descriptors[next]

		// The descriptor address points to memory not managed by Go, so this
		// conversion is safe. See https://github.com/golang/go/issues/58625
		//goland:noinspection GoVetUnsafePointer
		bs := unsafe.Slice((*byte)(unsafe.Pointer(desc.address)), desc.length)
		if desc.flags&descriptorFlagWritable == 0 {
			return fmt.Errorf("there should not be an outbuffer in %d", head)
		} else {
			*inBuffers = append(*inBuffers, bs)
		}

		// Is this the tail of the chain?
		if desc.flags&descriptorFlagHasNext == 0 {
			break
		}

		// Detect loops.
		if desc.next == head {
			return fmt.Errorf("%w: contains a loop", ErrInvalidDescriptorChain)
		}

		next = desc.next
	}
	return nil
}
// getDescriptorChainContents copies the contents of the device-writable
// descriptor chain starting at head into out and returns the number of bytes
// copied. When maxLen is greater than zero, at most maxLen bytes are copied.
// out must have enough capacity for the resulting length.
func (dt *DescriptorTable) getDescriptorChainContents(head uint16, out []byte, maxLen int) (int, error) {
	// BUG FIX: the bounds check must be >=, not >; head == len(dt.descriptors)
	// would panic on the index accesses below.
	if int(head) >= len(dt.descriptors) {
		return 0, fmt.Errorf("%w: index out of range", ErrInvalidDescriptorChain)
	}

	// First pass: walk the chain to determine its total length while
	// validating it. The iteration is limited to the queue size to avoid
	// ending up in an endless loop when things go very wrong.
	length := 0
	next := head
	for range len(dt.descriptors) {
		if next == dt.freeHeadIndex {
			return 0, fmt.Errorf("%w: must not be part of the free chain", ErrInvalidDescriptorChain)
		}
		desc := &dt.descriptors[next]
		if desc.flags&descriptorFlagWritable == 0 {
			return 0, fmt.Errorf("receive queue contains device-readable buffer")
		}
		length += int(desc.length)
		// Is this the tail of the chain?
		if desc.flags&descriptorFlagHasNext == 0 {
			break
		}
		// Detect loops.
		if desc.next == head {
			return 0, fmt.Errorf("%w: contains a loop", ErrInvalidDescriptorChain)
		}
		next = desc.next
	}

	// Honor the caller-provided limit (resolves the previous todo).
	if maxLen > 0 {
		length = min(maxLen, length)
	}

	// Second pass: copy the chain contents into out. BUG FIX: restart from
	// the chain head — the previous implementation resumed from the tail left
	// over by the first pass, which mis-copied every multi-descriptor chain
	// and then hit the consistency panic below.
	out = out[:length]
	copied := 0
	next = head
	for range len(dt.descriptors) {
		desc := &dt.descriptors[next]
		// The descriptor address points to memory not managed by Go, so this
		// conversion is safe. See https://github.com/golang/go/issues/58625
		//goland:noinspection GoVetUnsafePointer
		bs := unsafe.Slice((*byte)(unsafe.Pointer(desc.address)), min(uint32(length-copied), desc.length))
		copied += copy(out[copied:], bs)
		// Is this the tail of the chain?
		if desc.flags&descriptorFlagHasNext == 0 {
			break
		}
		// The first pass already validated the chain; no need to detect loops.
		next = desc.next
	}
	if copied != length {
		panic(fmt.Sprintf("expected to copy %d bytes but only copied %d bytes", length, copied))
	}
	return length, nil
}
// freeDescriptorChain can be used to free a descriptor chain when it is no
// longer in use. The descriptor chain that starts with the given index will be
// put back into the free chain, so the descriptors can be used for later calls
// of [createDescriptorChain].
// The descriptor chain must have been created using [createDescriptorChain] and
// must not have been freed yet (meaning that the head index must not be
// contained in the free chain).
func (dt *DescriptorTable) freeDescriptorChain(head uint16) error {
	// BUG FIX: the bounds check must be >=, not >; head == len(dt.descriptors)
	// would panic on the index accesses below.
	if int(head) >= len(dt.descriptors) {
		return fmt.Errorf("%w: index out of range", ErrInvalidDescriptorChain)
	}

	// Iterate over the chain. The iteration is limited to the queue size to
	// avoid ending up in an endless loop when things go very wrong.
	next := head
	var tailDesc *Descriptor
	var chainLen uint16
	for range len(dt.descriptors) {
		if next == dt.freeHeadIndex {
			return fmt.Errorf("%w: must not be part of the free chain", ErrInvalidDescriptorChain)
		}

		desc := &dt.descriptors[next]
		chainLen++

		// Set the length of all unused descriptors back to zero.
		desc.length = 0

		// Unset all flags except the next flag.
		desc.flags &= descriptorFlagHasNext

		// Is this the tail of the chain?
		if desc.flags&descriptorFlagHasNext == 0 {
			tailDesc = desc
			break
		}

		// Detect loops.
		if desc.next == head {
			return fmt.Errorf("%w: contains a loop", ErrInvalidDescriptorChain)
		}

		next = desc.next
	}
	if tailDesc == nil {
		// A descriptor chain longer than the queue size but without loops
		// should be impossible.
		panic(fmt.Sprintf("could not find a tail for descriptor chain starting at %d", head))
	}

	// The tail descriptor does not have the next flag set, but when it comes
	// back into the free chain, it should have.
	tailDesc.flags = descriptorFlagHasNext

	if dt.freeHeadIndex == noFreeHead {
		// The whole free chain was used up, so we turn this returned descriptor
		// chain into the new free chain by completing the circle and using its
		// head.
		tailDesc.next = head
		dt.freeHeadIndex = head
	} else {
		// Attach the returned chain at the beginning of the free chain but
		// right after the free chain head.
		freeHeadDesc := &dt.descriptors[dt.freeHeadIndex]
		tailDesc.next = freeHeadDesc.next
		freeHeadDesc.next = head
	}
	dt.freeNum += chainLen

	return nil
}
// checkUnusedDescriptorLength asserts that the length of an unused descriptor
// is zero, as it should be.
// This is not a requirement by the virtio spec but rather a thing we do to
// notice when our algorithm goes sideways.
func checkUnusedDescriptorLength(index uint16, desc *Descriptor) {
	if desc.length == 0 {
		return
	}
	panic(fmt.Sprintf("descriptor %d should be unused but has a non-zero length", index))
}

View File

@@ -1,407 +0,0 @@
package virtqueue
import (
"os"
"testing"
"unsafe"
"github.com/stretchr/testify/assert"
)
func TestDescriptorTable_InitializeDescriptors(t *testing.T) {
const queueSize = 32
dt := DescriptorTable{
descriptors: make([]Descriptor, queueSize),
}
assert.NoError(t, dt.initializeDescriptors())
t.Cleanup(func() {
assert.NoError(t, dt.releaseBuffers())
})
for i, descriptor := range dt.descriptors {
assert.NotZero(t, descriptor.address)
assert.Zero(t, descriptor.length)
assert.EqualValues(t, descriptorFlagHasNext, descriptor.flags)
assert.EqualValues(t, (i+1)%queueSize, descriptor.next)
}
}
func TestDescriptorTable_DescriptorChains(t *testing.T) {
// Use a very short queue size to not make this test overly verbose.
const queueSize = 8
pageSize := os.Getpagesize() * 2
// Initialize descriptor table.
dt := DescriptorTable{
descriptors: make([]Descriptor, queueSize),
}
assert.NoError(t, dt.initializeDescriptors())
t.Cleanup(func() {
assert.NoError(t, dt.releaseBuffers())
})
// Some utilities for easier checking if the descriptor table looks as
// expected.
type desc struct {
buffer []byte
flags descriptorFlag
next uint16
}
assertDescriptorTable := func(expected [queueSize]desc) {
for i := 0; i < queueSize; i++ {
actualDesc := &dt.descriptors[i]
expectedDesc := &expected[i]
assert.Equal(t, uint32(len(expectedDesc.buffer)), actualDesc.length)
if len(expectedDesc.buffer) > 0 {
//goland:noinspection GoVetUnsafePointer
assert.EqualValues(t,
unsafe.Slice((*byte)(unsafe.Pointer(actualDesc.address)), actualDesc.length),
expectedDesc.buffer)
}
assert.Equal(t, expectedDesc.flags, actualDesc.flags)
if expectedDesc.flags&descriptorFlagHasNext != 0 {
assert.Equal(t, expectedDesc.next, actualDesc.next)
}
}
}
// Initial state: All descriptors are in the free chain.
assert.Equal(t, uint16(0), dt.freeHeadIndex)
assert.Equal(t, uint16(8), dt.freeNum)
assertDescriptorTable([queueSize]desc{
{
// Free head.
flags: descriptorFlagHasNext,
next: 1,
},
{
flags: descriptorFlagHasNext,
next: 2,
},
{
flags: descriptorFlagHasNext,
next: 3,
},
{
flags: descriptorFlagHasNext,
next: 4,
},
{
flags: descriptorFlagHasNext,
next: 5,
},
{
flags: descriptorFlagHasNext,
next: 6,
},
{
flags: descriptorFlagHasNext,
next: 7,
},
{
flags: descriptorFlagHasNext,
next: 0,
},
})
// Create the first chain.
firstChain, err := dt.createDescriptorChain([][]byte{
makeTestBuffer(t, 26),
makeTestBuffer(t, 256),
}, 1)
assert.NoError(t, err)
assert.Equal(t, uint16(1), firstChain)
// Now there should be a new chain next to the free chain.
assert.Equal(t, uint16(0), dt.freeHeadIndex)
assert.Equal(t, uint16(5), dt.freeNum)
assertDescriptorTable([queueSize]desc{
{
// Free head.
flags: descriptorFlagHasNext,
next: 4,
},
{
// Head of first chain.
buffer: makeTestBuffer(t, 26),
flags: descriptorFlagHasNext,
next: 2,
},
{
buffer: makeTestBuffer(t, 256),
flags: descriptorFlagHasNext,
next: 3,
},
{
// Tail of first chain.
buffer: make([]byte, pageSize),
flags: descriptorFlagWritable,
},
{
flags: descriptorFlagHasNext,
next: 5,
},
{
flags: descriptorFlagHasNext,
next: 6,
},
{
flags: descriptorFlagHasNext,
next: 7,
},
{
flags: descriptorFlagHasNext,
next: 0,
},
})
// Create a second chain with only a single in buffer.
secondChain, err := dt.createDescriptorChain(nil, 1)
assert.NoError(t, err)
assert.Equal(t, uint16(4), secondChain)
// Now there should be two chains next to the free chain.
assert.Equal(t, uint16(0), dt.freeHeadIndex)
assert.Equal(t, uint16(4), dt.freeNum)
assertDescriptorTable([queueSize]desc{
{
// Free head.
flags: descriptorFlagHasNext,
next: 5,
},
{
// Head of the first chain.
buffer: makeTestBuffer(t, 26),
flags: descriptorFlagHasNext,
next: 2,
},
{
buffer: makeTestBuffer(t, 256),
flags: descriptorFlagHasNext,
next: 3,
},
{
// Tail of the first chain.
buffer: make([]byte, pageSize),
flags: descriptorFlagWritable,
},
{
// Head and tail of the second chain.
buffer: make([]byte, pageSize),
flags: descriptorFlagWritable,
},
{
flags: descriptorFlagHasNext,
next: 6,
},
{
flags: descriptorFlagHasNext,
next: 7,
},
{
flags: descriptorFlagHasNext,
next: 0,
},
})
// Create a third chain taking up all remaining descriptors.
thirdChain, err := dt.createDescriptorChain([][]byte{
makeTestBuffer(t, 42),
makeTestBuffer(t, 96),
makeTestBuffer(t, 33),
makeTestBuffer(t, 222),
}, 0)
assert.NoError(t, err)
assert.Equal(t, uint16(5), thirdChain)
// Now there should be three chains and no free chain.
assert.Equal(t, noFreeHead, dt.freeHeadIndex)
assert.Equal(t, uint16(0), dt.freeNum)
assertDescriptorTable([queueSize]desc{
{
// Tail of the third chain.
buffer: makeTestBuffer(t, 222),
},
{
// Head of the first chain.
buffer: makeTestBuffer(t, 26),
flags: descriptorFlagHasNext,
next: 2,
},
{
buffer: makeTestBuffer(t, 256),
flags: descriptorFlagHasNext,
next: 3,
},
{
// Tail of the first chain.
buffer: make([]byte, pageSize),
flags: descriptorFlagWritable,
},
{
// Head and tail of the second chain.
buffer: make([]byte, pageSize),
flags: descriptorFlagWritable,
},
{
// Head of the third chain.
buffer: makeTestBuffer(t, 42),
flags: descriptorFlagHasNext,
next: 6,
},
{
buffer: makeTestBuffer(t, 96),
flags: descriptorFlagHasNext,
next: 7,
},
{
buffer: makeTestBuffer(t, 33),
flags: descriptorFlagHasNext,
next: 0,
},
})
// Free the third chain.
assert.NoError(t, dt.freeDescriptorChain(thirdChain))
// Now there should be two chains and a free chain again.
assert.Equal(t, uint16(5), dt.freeHeadIndex)
assert.Equal(t, uint16(4), dt.freeNum)
assertDescriptorTable([queueSize]desc{
{
flags: descriptorFlagHasNext,
next: 5,
},
{
// Head of the first chain.
buffer: makeTestBuffer(t, 26),
flags: descriptorFlagHasNext,
next: 2,
},
{
buffer: makeTestBuffer(t, 256),
flags: descriptorFlagHasNext,
next: 3,
},
{
// Tail of the first chain.
buffer: make([]byte, pageSize),
flags: descriptorFlagWritable,
},
{
// Head and tail of the second chain.
buffer: make([]byte, pageSize),
flags: descriptorFlagWritable,
},
{
// Free head.
flags: descriptorFlagHasNext,
next: 6,
},
{
flags: descriptorFlagHasNext,
next: 7,
},
{
flags: descriptorFlagHasNext,
next: 0,
},
})
// Free the first chain.
assert.NoError(t, dt.freeDescriptorChain(firstChain))
// Now there should be only a single chain next to the free chain.
assert.Equal(t, uint16(5), dt.freeHeadIndex)
assert.Equal(t, uint16(7), dt.freeNum)
assertDescriptorTable([queueSize]desc{
{
flags: descriptorFlagHasNext,
next: 5,
},
{
flags: descriptorFlagHasNext,
next: 2,
},
{
flags: descriptorFlagHasNext,
next: 3,
},
{
flags: descriptorFlagHasNext,
next: 6,
},
{
// Head and tail of the second chain.
buffer: make([]byte, pageSize),
flags: descriptorFlagWritable,
},
{
// Free head.
flags: descriptorFlagHasNext,
next: 1,
},
{
flags: descriptorFlagHasNext,
next: 7,
},
{
flags: descriptorFlagHasNext,
next: 0,
},
})
// Free the second chain.
assert.NoError(t, dt.freeDescriptorChain(secondChain))
// Now all descriptors should be in the free chain again.
assert.Equal(t, uint16(5), dt.freeHeadIndex)
assert.Equal(t, uint16(8), dt.freeNum)
assertDescriptorTable([queueSize]desc{
{
flags: descriptorFlagHasNext,
next: 5,
},
{
flags: descriptorFlagHasNext,
next: 2,
},
{
flags: descriptorFlagHasNext,
next: 3,
},
{
flags: descriptorFlagHasNext,
next: 6,
},
{
flags: descriptorFlagHasNext,
next: 1,
},
{
// Free head.
flags: descriptorFlagHasNext,
next: 4,
},
{
flags: descriptorFlagHasNext,
next: 7,
},
{
flags: descriptorFlagHasNext,
next: 0,
},
})
}
// makeTestBuffer returns a buffer of the given length whose bytes count down
// from length to 1, so buffers of different lengths have distinct, easily
// recognizable contents.
func makeTestBuffer(t *testing.T, length int) []byte {
	t.Helper()
	buf := make([]byte, length)
	for i := range buf {
		buf[i] = byte(length - i)
	}
	return buf
}

View File

@@ -1,7 +0,0 @@
// Package virtqueue implements the driver-side for a virtio queue as described
// in the specification:
// https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html#x1-270006
// This package does not make assumptions about the device that consumes the
// queue. It rather just allocates the queue structures in memory and provides
// methods to interact with it.
package virtqueue

View File

@@ -1,45 +0,0 @@
package virtqueue
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"gvisor.dev/gvisor/pkg/eventfd"
)
// Tests how an eventfd and a waiting goroutine can be gracefully closed.
// Extends the eventfd test suite:
// https://github.com/google/gvisor/blob/0799336d64be65eb97d330606c30162dc3440cab/pkg/eventfd/eventfd_test.go
func TestEventFD_CancelWait(t *testing.T) {
	efd, err := eventfd.Create()
	require.NoError(t, err)
	t.Cleanup(func() {
		assert.NoError(t, efd.Close())
	})

	// The previous version used a plain `var stop bool` written by the test
	// goroutine and read by the worker goroutine without synchronization — a
	// data race that fails under -race. Closing a channel gives the same
	// "stop after next wakeup" semantics with a proper happens-before edge.
	stop := make(chan struct{})
	done := make(chan struct{})
	go func() {
		defer close(done)
		for {
			select {
			case <-stop:
				return
			default:
			}
			_ = efd.Wait()
		}
	}()

	// The goroutine must still be blocked in Wait after a grace period.
	select {
	case <-done:
		t.Fatalf("goroutine ended early")
	case <-time.After(500 * time.Millisecond):
	}

	// Request the stop first, then wake the goroutine out of its blocking
	// Wait so it can observe the request.
	close(stop)
	assert.NoError(t, efd.Notify())

	select {
	case <-done:
	case <-time.After(5 * time.Second):
		t.Error("goroutine did not end")
	}
}

View File

@@ -1,33 +0,0 @@
package virtqueue
import (
"errors"
"fmt"
)
// ErrQueueSizeInvalid is returned when a queue size is invalid.
var ErrQueueSizeInvalid = errors.New("queue size is invalid")

// CheckQueueSize checks if the given value would be a valid size for a
// virtqueue and returns an [ErrQueueSizeInvalid], if not.
func CheckQueueSize(queueSize int) error {
	switch {
	case queueSize <= 0:
		return fmt.Errorf("%w: %d is too small", ErrQueueSizeInvalid, queueSize)
	case queueSize&(queueSize-1) != 0:
		// The queue size must always be a power of 2. This ensures that ring
		// indexes wrap correctly when the 16-bit integers overflow.
		return fmt.Errorf("%w: %d is not a power of 2", ErrQueueSizeInvalid, queueSize)
	case queueSize > 32768:
		// The largest power of 2 that fits into a 16-bit integer is 32768.
		// 2 * 32768 would be 65536 which no longer fits.
		return fmt.Errorf("%w: %d is larger than the maximum possible queue size 32768",
			ErrQueueSizeInvalid, queueSize)
	default:
		return nil
	}
}

View File

@@ -1,59 +0,0 @@
package virtqueue
import (
"testing"
"github.com/stretchr/testify/assert"
)
func TestCheckQueueSize(t *testing.T) {
tests := []struct {
name string
queueSize int
containsErr string
}{
{
name: "negative",
queueSize: -1,
containsErr: "too small",
},
{
name: "zero",
queueSize: 0,
containsErr: "too small",
},
{
name: "not a power of 2",
queueSize: 24,
containsErr: "not a power of 2",
},
{
name: "too large",
queueSize: 65536,
containsErr: "larger than the maximum",
},
{
name: "valid 1",
queueSize: 1,
},
{
name: "valid 256",
queueSize: 256,
},
{
name: "valid 32768",
queueSize: 32768,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := CheckQueueSize(tt.queueSize)
if tt.containsErr != "" {
assert.ErrorContains(t, err, tt.containsErr)
} else {
assert.NoError(t, err)
}
})
}
}

View File

@@ -1,530 +0,0 @@
package virtqueue
import (
"context"
"errors"
"fmt"
"os"
"syscall"
"github.com/slackhq/nebula/overlay/eventfd"
"golang.org/x/sys/unix"
)
// SplitQueue is a virtqueue that consists of several parts, where each part is
// writeable by either the driver or the device, but not both.
type SplitQueue struct {
	// size is the size of the queue, i.e. the number of descriptors it holds.
	size int
	// buf is the underlying memory used for the queue. All three queue parts
	// (descriptor table, available ring, used ring) live inside this mapping.
	buf []byte
	descriptorTable *DescriptorTable
	availableRing *AvailableRing
	usedRing *UsedRing
	// kickEventFD is used to signal the device when descriptor chains were
	// added to the available ring.
	kickEventFD eventfd.EventFD
	// callEventFD is used by the device to signal when it has used descriptor
	// chains and put them in the used ring.
	callEventFD eventfd.EventFD
	// stop is used by [SplitQueue.Close] to cancel the goroutine that handles
	// used buffer notifications. It blocks until the goroutine ended.
	stop func() error
	// itemSize is the per-descriptor buffer size in bytes; NewSplitQueue
	// rejects values that are not a multiple of the system page size.
	itemSize int
	// epoll watches callEventFD so consumers can block until the device has
	// placed entries in the used ring.
	epoll eventfd.Epoll
	// more records how many used-ring entries were left untaken by the last
	// capped take; see [SplitQueue.BlockAndGetHeadsCapped].
	more int
}
// NewSplitQueue allocates a new [SplitQueue] in memory. The given queue size
// specifies the number of entries/buffers the queue can hold. This also affects
// the memory consumption.
// itemSize is the buffer size used for each descriptor and must be a multiple
// of the system page size (see [os.Getpagesize]).
func NewSplitQueue(queueSize int, itemSize int) (_ *SplitQueue, err error) {
	if err = CheckQueueSize(queueSize); err != nil {
		return nil, err
	}
	if itemSize%os.Getpagesize() != 0 {
		// Fixed message: the value being validated here is the item size, not
		// the queue size as the previous message claimed.
		return nil, errors.New("item size must be a multiple of os.Getpagesize()")
	}
	sq := SplitQueue{
		size:     queueSize,
		itemSize: itemSize,
	}
	// Clean up a partially initialized queue when something fails.
	defer func() {
		if err != nil {
			_ = sq.Close()
		}
	}()
	// There are multiple ways for how the memory for the virtqueue could be
	// allocated. We could use Go native structs with arrays inside them, but
	// this wouldn't allow us to make the queue size configurable. And including
	// a slice in the Go structs wouldn't work, because this would just put the
	// Go slice descriptor into the memory region which the virtio device will
	// not understand.
	// Additionally, Go does not allow us to ensure a correct alignment of the
	// parts of the virtqueue, as it is required by the virtio specification.
	//
	// To resolve this, let's just allocate the memory manually by allocating
	// one or more memory pages, depending on the queue size. Making the
	// virtqueue start at the beginning of a page is not strictly necessary, as
	// the virtio specification does not require it to be continuous in the
	// physical memory of the host (e.g. the vhost implementation in the kernel
	// always uses copy_from_user to access it), but this makes it very easy to
	// guarantee the alignment. Also, it is not required for the virtqueue parts
	// to be in the same memory region, as we pass separate pointers to them to
	// the device, but this design just makes things easier to implement.
	//
	// One added benefit of allocating the memory manually is, that we have full
	// control over its lifetime and don't risk the garbage collector to collect
	// our valuable structures while the device still works with them.

	// The descriptor table is at the start of the page, so alignment is not an
	// issue here.
	descriptorTableStart := 0
	descriptorTableEnd := descriptorTableStart + descriptorTableSize(queueSize)
	availableRingStart := align(descriptorTableEnd, availableRingAlignment)
	availableRingEnd := availableRingStart + availableRingSize(queueSize)
	usedRingStart := align(availableRingEnd, usedRingAlignment)
	usedRingEnd := usedRingStart + usedRingSize(queueSize)
	sq.buf, err = unix.Mmap(-1, 0, usedRingEnd,
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_PRIVATE|unix.MAP_ANONYMOUS)
	if err != nil {
		return nil, fmt.Errorf("allocate virtqueue buffer: %w", err)
	}
	sq.descriptorTable = newDescriptorTable(queueSize, sq.buf[descriptorTableStart:descriptorTableEnd], sq.itemSize)
	sq.availableRing = newAvailableRing(queueSize, sq.buf[availableRingStart:availableRingEnd])
	sq.usedRing = newUsedRing(queueSize, sq.buf[usedRingStart:usedRingEnd])
	sq.kickEventFD, err = eventfd.New()
	if err != nil {
		return nil, fmt.Errorf("create kick event file descriptor: %w", err)
	}
	sq.callEventFD, err = eventfd.New()
	if err != nil {
		return nil, fmt.Errorf("create call event file descriptor: %w", err)
	}
	if err = sq.descriptorTable.initializeDescriptors(); err != nil {
		return nil, fmt.Errorf("initialize descriptors: %w", err)
	}
	sq.epoll, err = eventfd.NewEpoll()
	if err != nil {
		return nil, err
	}
	err = sq.epoll.AddEvent(sq.callEventFD.FD())
	if err != nil {
		return nil, err
	}
	// Consume used buffer notifications in the background.
	sq.stop = sq.startConsumeUsedRing()
	return &sq, nil
}
// Size returns the size of this queue, which is the number of entries/buffers
// this queue can hold.
func (sq *SplitQueue) Size() int {
	return sq.size
}

// DescriptorTable returns the [DescriptorTable] behind this queue.
// Mutating it directly bypasses the queue's bookkeeping; prefer the
// [SplitQueue] methods where possible.
func (sq *SplitQueue) DescriptorTable() *DescriptorTable {
	return sq.descriptorTable
}

// AvailableRing returns the [AvailableRing] behind this queue.
// Mutating it directly bypasses the queue's bookkeeping; prefer the
// [SplitQueue] methods where possible.
func (sq *SplitQueue) AvailableRing() *AvailableRing {
	return sq.availableRing
}

// UsedRing returns the [UsedRing] behind this queue.
// Mutating it directly bypasses the queue's bookkeeping; prefer the
// [SplitQueue] methods where possible.
func (sq *SplitQueue) UsedRing() *UsedRing {
	return sq.usedRing
}

// KickEventFD returns the kick event file descriptor behind this queue.
// The returned file descriptor should be used with great care to not interfere
// with this implementation.
func (sq *SplitQueue) KickEventFD() int {
	return sq.kickEventFD.FD()
}

// CallEventFD returns the call event file descriptor behind this queue.
// The returned file descriptor should be used with great care to not interfere
// with this implementation.
func (sq *SplitQueue) CallEventFD() int {
	return sq.callEventFD.FD()
}
// startConsumeUsedRing returns a stop function that is stored as sq.stop and
// invoked from [SplitQueue.Close].
// NOTE(review): despite its name, this no longer starts a goroutine — it only
// builds the cancellation helper. The original "todo rename" still applies.
func (sq *SplitQueue) startConsumeUsedRing() func() error {
	return func() error {
		// The goroutine blocks until it receives a signal on the event file
		// descriptor, so it will never notice the context being canceled.
		// To resolve this, we can just produce a fake-signal ourselves to wake
		// it up.
		if err := sq.callEventFD.Kick(); err != nil {
			return fmt.Errorf("wake up goroutine: %w", err)
		}
		return nil
	}
}
// BlockAndGetHeads waits for the device to signal that it has used descriptor
// chains and returns all [UsedElement]s it could take from the used ring.
// It returns ctx.Err() once the context is canceled.
// NOTE(review): epoll.Block itself is not interrupted by ctx — cancellation is
// only observed after a wakeup. Confirm shutdown paths kick the call event fd
// (see the stop func built in startConsumeUsedRing).
func (sq *SplitQueue) BlockAndGetHeads(ctx context.Context) ([]UsedElement, error) {
	var n int
	var err error
	for ctx.Err() == nil {
		// Wait for a signal from the device.
		if n, err = sq.epoll.Block(); err != nil {
			return nil, fmt.Errorf("wait: %w", err)
		}
		if n > 0 {
			// take(-1) drains as much as it can; stillNeedToTake reports how
			// many entries remain in the ring.
			stillNeedToTake, out := sq.usedRing.take(-1)
			sq.more = stillNeedToTake
			if stillNeedToTake == 0 {
				// Ring drained — reset epoll readiness. The error is
				// deliberately dropped here (the original "???" marker);
				// confirm Clear failures are truly ignorable.
				_ = sq.epoll.Clear() //???
			}
			return out, nil
		}
	}
	return nil, ctx.Err()
}

// TakeSingle returns the head index of a single used descriptor chain,
// blocking on the call event fd when the used ring is currently empty.
// It returns ctx.Err() once the context is canceled.
func (sq *SplitQueue) TakeSingle(ctx context.Context) (uint16, error) {
	var n int
	var err error
	for ctx.Err() == nil {
		// Fast path: an entry may already be available without blocking.
		out, ok := sq.usedRing.takeOne()
		if ok {
			return out, nil
		}
		// Wait for a signal from the device.
		if n, err = sq.epoll.Block(); err != nil {
			return 0, fmt.Errorf("wait: %w", err)
		}
		if n > 0 {
			out, ok = sq.usedRing.takeOne()
			if ok {
				// NOTE(review): Clear is only called on the success path and
				// its error is dropped ("???" in the original) — verify this
				// cannot leave a stale readiness state.
				_ = sq.epoll.Clear() //???
				return out, nil
			} else {
				// Spurious wakeup: signaled but nothing to take; retry.
				continue //???
			}
		}
	}
	return 0, ctx.Err()
}

// BlockAndGetHeadsCapped behaves like [SplitQueue.BlockAndGetHeads] but takes
// at most maxToTake entries, remembering the remainder in sq.more so the next
// call can return leftovers without blocking.
func (sq *SplitQueue) BlockAndGetHeadsCapped(ctx context.Context, maxToTake int) ([]UsedElement, error) {
	var n int
	var err error
	for ctx.Err() == nil {
		//we have leftovers in the fridge
		if sq.more > 0 {
			stillNeedToTake, out := sq.usedRing.take(maxToTake)
			sq.more = stillNeedToTake
			return out, nil
		}
		//look inside the fridge
		stillNeedToTake, out := sq.usedRing.take(maxToTake)
		if len(out) > 0 {
			sq.more = stillNeedToTake
			return out, nil
		}
		//fridge is empty I guess
		// Wait for a signal from the device.
		if n, err = sq.epoll.Block(); err != nil {
			return nil, fmt.Errorf("wait: %w", err)
		}
		if n > 0 {
			// NOTE(review): unlike BlockAndGetHeads, this returns even when
			// the post-wakeup take yields nothing — confirm callers tolerate
			// an empty, nil-error result.
			_ = sq.epoll.Clear() //???
			stillNeedToTake, out = sq.usedRing.take(maxToTake)
			sq.more = stillNeedToTake
			return out, nil
		}
	}
	return nil, ctx.Err()
}
// OfferInDescriptorChains creates a single device-writable (in) descriptor
// chain and makes it available to the device via the available ring, then
// kicks the device.
//
// (The previous comment block documented a removed method with outBuffers and
// numInBuffers parameters; this method takes no arguments.)
//
// When the queue is full, [ErrNotEnoughFreeDescriptors] is returned unwrapped
// so callers can compare against it directly; any other failure is wrapped.
//
// On success, the head index of the new chain is returned. Callers should
// observe used chains through the used ring (e.g. [SplitQueue.BlockAndGetHeads])
// and eventually return them with [SplitQueue.FreeDescriptorChain] or re-offer
// them via [SplitQueue.OfferDescriptorChains]; otherwise the queue runs full
// and further offers will fail.
func (sq *SplitQueue) OfferInDescriptorChains() (uint16, error) {
	// The original retry loop was degenerate — every branch either broke or
	// returned on the first iteration — so it is a single call here.
	head, err := sq.descriptorTable.createDescriptorForInputs()
	if err != nil {
		// I don't wanna use errors.Is, it's slow
		//goland:noinspection GoDirectComparisonOfErrors
		if err == ErrNotEnoughFreeDescriptors {
			// Propagate unmodified so callers can match on the sentinel.
			return 0, err
		}
		return 0, fmt.Errorf("create descriptor chain: %w", err)
	}
	// Make the descriptor chain available to the device.
	sq.availableRing.offerSingle(head)
	// Notify the device to make it process the updated available ring.
	if err := sq.kickEventFD.Kick(); err != nil {
		return head, fmt.Errorf("notify device: %w", err)
	}
	return head, nil
}
// OfferOutDescriptorChains offers one descriptor chain per out buffer to the
// device, each chain consisting of the shared prepend buffer followed by one
// (possibly split) out buffer. It returns the head indexes of all chains that
// were made available.
// NOTE(review): the prepend slice is referenced by every chain — confirm the
// device has consumed all chains before the caller reuses it.
func (sq *SplitQueue) OfferOutDescriptorChains(prepend []byte, outBuffers [][]byte) ([]uint16, error) {
	// TODO change this
	// Each descriptor can only hold a whole memory page, so split large out
	// buffers into multiple smaller ones.
	outBuffers = splitBuffers(outBuffers, sq.itemSize)
	chains := make([]uint16, len(outBuffers))
	// Create a descriptor chain for the given buffers.
	var (
		head uint16
		err  error
	)
	for i := range outBuffers {
		for {
			bufs := [][]byte{prepend, outBuffers[i]}
			head, err = sq.descriptorTable.createDescriptorChain(bufs, 0)
			if err == nil {
				break
			}
			// I don't wanna use errors.Is, it's slow
			//goland:noinspection GoDirectComparisonOfErrors
			if err == ErrNotEnoughFreeDescriptors {
				// Wait for more free descriptors to be put back into the queue.
				// If the number of free descriptors is still not sufficient, we'll
				// land here again.
				//todo should never happen
				// NOTE(review): this is a busy-wait that spins on sched_yield
				// until descriptors are freed from another goroutine — verify
				// it cannot deadlock when nothing else frees descriptors.
				syscall.Syscall(syscall.SYS_SCHED_YIELD, 0, 0, 0) // Cheap barrier
				continue
			}
			return nil, fmt.Errorf("create descriptor chain: %w", err)
		}
		chains[i] = head
	}
	// Make the descriptor chain available to the device.
	sq.availableRing.offer(chains)
	// Notify the device to make it process the updated available ring.
	if err := sq.kickEventFD.Kick(); err != nil {
		return chains, fmt.Errorf("notify device: %w", err)
	}
	return chains, nil
}
// GetDescriptorChain returns the device-readable buffers (out buffers) and
// device-writable buffers (in buffers) of the descriptor chain with the given
// head index.
// The head index must be one that was returned by a previous call to
// [SplitQueue.OfferDescriptorChain] and the descriptor chain must not have been
// freed yet.
//
// Be careful to only access the returned buffer slices when the device is no
// longer using them. They must not be accessed after
// [SplitQueue.FreeDescriptorChain] has been called.
func (sq *SplitQueue) GetDescriptorChain(head uint16) (outBuffers, inBuffers [][]byte, err error) {
	return sq.descriptorTable.getDescriptorChain(head)
}

// GetDescriptorItem returns the buffer backing the descriptor with the given
// head index, after resetting the descriptor's length to the queue's full
// item size.
// NOTE(review): head is used to index the descriptor table without a bounds
// check — an out-of-range index panics rather than returning an error.
func (sq *SplitQueue) GetDescriptorItem(head uint16) ([]byte, error) {
	sq.descriptorTable.descriptors[head].length = uint32(sq.descriptorTable.itemSize)
	return sq.descriptorTable.getDescriptorItem(head)
}

// GetDescriptorChainContents copies the contents of the descriptor chain with
// the given head index into out, limited by maxLen, and returns the number of
// bytes copied.
func (sq *SplitQueue) GetDescriptorChainContents(head uint16, out []byte, maxLen int) (int, error) {
	return sq.descriptorTable.getDescriptorChainContents(head, out, maxLen)
}

// GetDescriptorInbuffers fills inBuffers with the device-writable buffers of
// the descriptor chain with the given head index.
func (sq *SplitQueue) GetDescriptorInbuffers(head uint16, inBuffers *[][]byte) error {
	return sq.descriptorTable.getDescriptorInbuffers(head, inBuffers)
}
// FreeDescriptorChain frees the descriptor chain with the given head index.
// The head index must be one that was returned by a previous call to
// [SplitQueue.OfferDescriptorChain] and the descriptor chain must not have been
// freed yet.
//
// This creates new room in the queue which can be used by following
// [SplitQueue.OfferDescriptorChain] calls.
// When there are outstanding calls for [SplitQueue.OfferDescriptorChain] that
// are waiting for free room in the queue, they may become unblocked by this.
func (sq *SplitQueue) FreeDescriptorChain(head uint16) error {
	//not called under lock
	if err := sq.descriptorTable.freeDescriptorChain(head); err != nil {
		return fmt.Errorf("free: %w", err)
	}
	return nil
}

// SetDescSize overwrites the length field of the descriptor with the given
// head index, e.g. to shrink it to the payload actually written.
// NOTE(review): no bounds check on head — an out-of-range index panics.
func (sq *SplitQueue) SetDescSize(head uint16, sz int) {
	//not called under lock
	sq.descriptorTable.descriptors[int(head)].length = uint32(sz)
}

// OfferDescriptorChains re-offers already-built descriptor chains to the
// device by placing their heads in the available ring, kicking the device
// only when kick is true (so callers can batch several offers per kick).
// NOTE(review): unlike FreeDescriptorChain, the chains are NOT returned to the
// free list first — the commented-out code and the "may break eventually"
// todo below suggest this is a known open question.
func (sq *SplitQueue) OfferDescriptorChains(chains []uint16, kick bool) error {
	//todo not doing this may break eventually?
	//not called under lock
	//if err := sq.descriptorTable.freeDescriptorChain(head); err != nil {
	//	return fmt.Errorf("free: %w", err)
	//}
	// Make the descriptor chain available to the device.
	sq.availableRing.offer(chains)
	// Notify the device to make it process the updated available ring.
	if kick {
		return sq.Kick()
	}
	return nil
}
// Kick notifies the device that the available ring has been updated by
// signaling the kick event file descriptor.
func (sq *SplitQueue) Kick() error {
	err := sq.kickEventFD.Kick()
	if err == nil {
		return nil
	}
	return fmt.Errorf("notify device: %w", err)
}
// Close releases all resources used for this queue.
// The implementation will try to release as many resources as possible and
// collect potential errors before returning them.
// NOTE(review): Close is also invoked by NewSplitQueue's cleanup on partial
// initialization — confirm descriptorTable.releaseBuffers tolerates a nil
// receiver for the case where mmap failed before the table was created.
func (sq *SplitQueue) Close() error {
	var errs []error
	if sq.stop != nil {
		// This has to happen before the event file descriptors may be closed.
		if err := sq.stop(); err != nil {
			errs = append(errs, fmt.Errorf("stop consume used ring: %w", err))
		}
		// Make sure that this code block is executed only once.
		sq.stop = nil
	}
	if err := sq.kickEventFD.Close(); err != nil {
		errs = append(errs, fmt.Errorf("close kick event file descriptor: %w", err))
	}
	if err := sq.callEventFD.Close(); err != nil {
		errs = append(errs, fmt.Errorf("close call event file descriptor: %w", err))
	}
	if err := sq.descriptorTable.releaseBuffers(); err != nil {
		errs = append(errs, fmt.Errorf("release descriptor buffers: %w", err))
	}
	if sq.buf != nil {
		// Only forget the mapping once the kernel has actually released it.
		if err := unix.Munmap(sq.buf); err == nil {
			sq.buf = nil
		} else {
			errs = append(errs, fmt.Errorf("unmap virtqueue buffer: %w", err))
		}
	}
	return errors.Join(errs...)
}
// ensureInitialized is used as a guard to prevent methods from being called on
// an uninitialized instance. It panics because calling into an uninitialized
// queue is a programmer error, not a condition callers could recover from.
func (sq *SplitQueue) ensureInitialized() {
	if sq.buf == nil {
		// The previous message said "used ring is not initialized", which was
		// copy-pasted from UsedRing and misleading when this guard fires for
		// the queue as a whole.
		panic("split queue is not initialized")
	}
}
// align rounds index up to the next multiple of alignment, returning it
// unchanged when it is already aligned.
func align(index, alignment int) int {
	if rem := index % alignment; rem != 0 {
		return index + (alignment - rem)
	}
	return index
}
// splitBuffers processes a list of buffers and splits each buffer that is
// larger than the size limit into multiple smaller buffers.
// When no buffer exceeds the limit, the input slice is returned as-is so the
// common case stays allocation-free.
func splitBuffers(buffers [][]byte, sizeLimit int) [][]byte {
	needsSplit := false
	for _, b := range buffers {
		if len(b) > sizeLimit {
			needsSplit = true
			break
		}
	}
	if !needsSplit {
		return buffers
	}
	return reallySplitBuffers(buffers, sizeLimit)
}

// reallySplitBuffers builds a new buffer list in which every entry is at most
// sizeLimit bytes, slicing oversized buffers into consecutive chunks that
// share the original backing array. Zero-length buffers produce no output
// entry, matching the original element-skipping behavior.
func reallySplitBuffers(buffers [][]byte, sizeLimit int) [][]byte {
	result := make([][]byte, 0, len(buffers))
	for _, buffer := range buffers {
		rest := buffer
		for len(rest) > sizeLimit {
			result = append(result, rest[:sizeLimit])
			rest = rest[sizeLimit:]
		}
		if len(rest) > 0 {
			result = append(result, rest)
		}
	}
	return result
}

View File

@@ -1,105 +0,0 @@
package virtqueue
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestSplitQueue_MemoryAlignment(t *testing.T) {
tests := []struct {
name string
queueSize int
}{
{
name: "minimal queue size",
queueSize: 1,
},
{
name: "small queue size",
queueSize: 8,
},
{
name: "large queue size",
queueSize: 256,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
sq, err := NewSplitQueue(tt.queueSize)
require.NoError(t, err)
assert.Zero(t, sq.descriptorTable.Address()%descriptorTableAlignment)
assert.Zero(t, sq.availableRing.Address()%availableRingAlignment)
assert.Zero(t, sq.usedRing.Address()%usedRingAlignment)
})
}
}
func TestSplitBuffers(t *testing.T) {
const sizeLimit = 16
tests := []struct {
name string
buffers [][]byte
expected [][]byte
}{
{
name: "no buffers",
buffers: make([][]byte, 0),
expected: make([][]byte, 0),
},
{
name: "small",
buffers: [][]byte{
make([]byte, 11),
},
expected: [][]byte{
make([]byte, 11),
},
},
{
name: "exact size",
buffers: [][]byte{
make([]byte, sizeLimit),
},
expected: [][]byte{
make([]byte, sizeLimit),
},
},
{
name: "large",
buffers: [][]byte{
make([]byte, 42),
},
expected: [][]byte{
make([]byte, 16),
make([]byte, 16),
make([]byte, 10),
},
},
{
name: "mixed",
buffers: [][]byte{
make([]byte, 7),
make([]byte, 30),
make([]byte, 15),
make([]byte, 32),
},
expected: [][]byte{
make([]byte, 7),
make([]byte, 16),
make([]byte, 14),
make([]byte, 15),
make([]byte, 16),
make([]byte, 16),
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
actual := splitBuffers(tt.buffers, sizeLimit)
assert.Equal(t, tt.expected, actual)
})
}
}

View File

@@ -1,21 +0,0 @@
package virtqueue
// usedElementSize is the number of bytes needed to store a [UsedElement] in
// memory.
const usedElementSize = 8
// UsedElement is an element of the [UsedRing] and describes a descriptor chain
// that was used by the device.
type UsedElement struct {
// DescriptorIndex is the index of the head of the used descriptor chain in
// the [DescriptorTable].
// The index is 32-bit here for padding reasons.
DescriptorIndex uint32
// Length is the number of bytes written into the device writable portion of
// the buffer described by the descriptor chain.
Length uint32
}
func (u *UsedElement) GetHead() uint16 {
return uint16(u.DescriptorIndex)
}

View File

@@ -1,12 +0,0 @@
package virtqueue
import (
"testing"
"unsafe"
"github.com/stretchr/testify/assert"
)
func TestUsedElement_Size(t *testing.T) {
assert.EqualValues(t, usedElementSize, unsafe.Sizeof(UsedElement{}))
}

View File

@@ -1,184 +0,0 @@
package virtqueue
import (
"fmt"
"unsafe"
)
// usedRingFlag is a flag that describes a [UsedRing].
type usedRingFlag uint16
const (
// usedRingFlagNoNotify is used by the host to advise the guest to not
// kick it when adding a buffer. It's unreliable, so it's simply an
// optimization. Guest will still kick when it's out of buffers.
usedRingFlagNoNotify usedRingFlag = 1 << iota
)
// usedRingSize is the number of bytes needed to store a [UsedRing] with the
// given queue size in memory: a 16-bit flags field and a 16-bit ring index up
// front, one [UsedElement] per queue entry, and the trailing 16-bit
// availableEvent field.
func usedRingSize(queueSize int) int {
	const header = 2 + 2 // flags + ring index
	const trailer = 2    // availableEvent
	return header + queueSize*usedElementSize + trailer
}
// usedRingAlignment is the minimum alignment of a [UsedRing] in memory, as
// required by the virtio spec.
const usedRingAlignment = 4

// UsedRing is where the device returns descriptor chains once it is done with
// them. Each ring entry is a [UsedElement]. It is only written to by the device
// and read by the driver.
//
// Because the size of the ring depends on the queue size, we cannot define a
// Go struct with a static size that maps to the memory of the ring. Instead,
// this struct only contains pointers to the corresponding memory areas.
type UsedRing struct {
	// initialized is set by newUsedRing; methods panic when it is false.
	initialized bool
	// flags that describe this ring.
	flags *usedRingFlag
	// ringIndex indicates where the device would put the next entry into the
	// ring (modulo the queue size).
	ringIndex *uint16
	// ring contains the [UsedElement]s. It wraps around at queue size.
	ring []UsedElement
	// availableEvent is not used by this implementation, but we reserve it
	// anyway to avoid issues in case a device may try to write to it, contrary
	// to the virtio specification.
	availableEvent *uint16
	// lastIndex is the internal ringIndex up to which all [UsedElement]s were
	// processed.
	lastIndex uint16
	// NOTE(review): the mutex was removed; take/takeOne are not safe for
	// concurrent use and rely on external serialization.
	//mu sync.Mutex
}
// newUsedRing creates a used ring that uses the given underlying memory. The
// length of the memory slice must match the size needed for the ring (see
// [usedRingSize]) for the given queue size.
func newUsedRing(queueSize int, mem []byte) *UsedRing {
	ringSize := usedRingSize(queueSize)
	if len(mem) != ringSize {
		panic(fmt.Sprintf("memory size (%v) does not match required size "+
			"for used ring: %v", len(mem), ringSize))
	}

	// Overlay the virtio wire layout directly onto mem: flags at offset 0,
	// ring index at offset 2, the element array at offset 4, and the
	// available-event word in the final two bytes.
	r := UsedRing{
		initialized:    true,
		flags:          (*usedRingFlag)(unsafe.Pointer(&mem[0])),
		ringIndex:      (*uint16)(unsafe.Pointer(&mem[2])),
		ring:           unsafe.Slice((*UsedElement)(unsafe.Pointer(&mem[4])), queueSize),
		availableEvent: (*uint16)(unsafe.Pointer(&mem[ringSize-2])),
	}
	// Start processing from whatever index the memory currently holds.
	r.lastIndex = *r.ringIndex
	return &r
}
// Address returns the pointer to the beginning of the ring in memory.
// Do not modify the memory directly to not interfere with this implementation.
func (r *UsedRing) Address() uintptr {
	if !r.initialized {
		panic("used ring is not initialized")
	}
	// flags is the first field of the ring's memory layout.
	return uintptr(unsafe.Pointer(r.flags))
}
// take returns new [UsedElement]s that the device put into the ring and that
// weren't already returned by a previous call to this method. When maxToTake
// is > 0 at most that many elements are returned and the first return value
// reports how many remain unread; otherwise everything available is taken.
//
// Not safe for concurrent use; callers must serialize access externally.
func (r *UsedRing) take(maxToTake int) (int, []UsedElement) {
	ringIndex := *r.ringIndex
	if ringIndex == r.lastIndex {
		// Nothing new.
		return 0, nil
	}

	// Calculate the number of new used elements that we can read from the
	// ring. The 16-bit ring index may wrap, but unsigned uint16 subtraction
	// already yields the correct count modulo 2^16, so the result is always
	// in [1, 0xffff] here. (A previous version carried an unreachable
	// "count < 0" correction branch; int(uint16) can never be negative.)
	count := int(ringIndex - r.lastIndex)

	stillNeedToTake := 0
	if maxToTake > 0 {
		stillNeedToTake = count - maxToTake
		if stillNeedToTake < 0 {
			stillNeedToTake = 0
		}
		count = min(count, maxToTake)
	}

	// The number of new elements can never exceed the queue size.
	if count > len(r.ring) {
		panic("used ring contains more new elements than the ring is long")
	}

	elems := make([]UsedElement, count)
	for i := range count {
		elems[i] = r.ring[r.lastIndex%uint16(len(r.ring))]
		r.lastIndex++
	}
	return stillNeedToTake, elems
}
// takeOne returns the head descriptor index of the next unprocessed
// [UsedElement], or (0xffff, false) when the device has produced nothing new.
//
// Not safe for concurrent use; callers must serialize access externally.
func (r *UsedRing) takeOne() (uint16, bool) {
	ringIndex := *r.ringIndex
	if ringIndex == r.lastIndex {
		// Nothing new.
		return 0xffff, false
	}

	// The 16-bit ring index may wrap, but unsigned uint16 subtraction
	// already yields the correct count modulo 2^16, so count is always in
	// [1, 0xffff] here. (Previous "count < 0" and "count == 0" branches
	// were unreachable and have been removed.)
	count := int(ringIndex - r.lastIndex)

	// The number of new elements can never exceed the queue size.
	if count > len(r.ring) {
		panic("used ring contains more new elements than the ring is long")
	}

	out := r.ring[r.lastIndex%uint16(len(r.ring))].GetHead()
	r.lastIndex++
	return out, true
}
// InitOfferSingle places a single [UsedElement] describing descriptor chain
// head x with the given byte length into the ring and advances the ring
// index. It is only used to pre-fill the used queue at startup, and should
// not be used if the device is running! Callers are expected to hold the
// queue's lock.
func (r *UsedRing) InitOfferSingle(x uint16, size int) {
	// The 16-bit ring index may overflow. This is expected and is not an
	// issue because the size of the ring array (which equals the queue
	// size) is always a power of 2 and smaller than the highest possible
	// 16-bit value. (The vestigial always-zero "offset" variable from the
	// multi-element version of this code has been removed.)
	insertIndex := int(*r.ringIndex) % len(r.ring)
	r.ring[insertIndex] = UsedElement{
		DescriptorIndex: uint32(x),
		Length:          uint32(size),
	}
	// Publish the element by advancing the ring index.
	*r.ringIndex += 1
}

View File

@@ -1,136 +0,0 @@
package virtqueue
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestUsedRing_MemoryLayout verifies that the UsedRing accessors map onto the
// little-endian wire layout required by virtio: 16-bit flags, 16-bit ring
// index, queueSize elements of (32-bit descriptor index, 32-bit length), and
// a trailing 16-bit available-event slot.
func TestUsedRing_MemoryLayout(t *testing.T) {
	const queueSize = 2
	memory := make([]byte, usedRingSize(queueSize))
	r := newUsedRing(queueSize, memory)
	*r.flags = 0x01ff
	*r.ringIndex = 1
	r.ring[0] = UsedElement{
		DescriptorIndex: 0x0123,
		Length:          0x4567,
	}
	r.ring[1] = UsedElement{
		DescriptorIndex: 0x89ab,
		Length:          0xcdef,
	}
	assert.Equal(t, []byte{
		0xff, 0x01, // flags
		0x01, 0x00, // ring index
		0x23, 0x01, 0x00, 0x00, // ring[0].DescriptorIndex
		0x67, 0x45, 0x00, 0x00, // ring[0].Length
		0xab, 0x89, 0x00, 0x00, // ring[1].DescriptorIndex
		0xef, 0xcd, 0x00, 0x00, // ring[1].Length
		0x00, 0x00, // available event
	}, memory)
}
//func TestUsedRing_Take(t *testing.T) {
// const queueSize = 8
//
// tests := []struct {
// name string
// ring []UsedElement
// ringIndex uint16
// lastIndex uint16
// expected []UsedElement
// }{
// {
// name: "nothing new",
// ring: []UsedElement{
// {DescriptorIndex: 1},
// {DescriptorIndex: 2},
// {DescriptorIndex: 3},
// {DescriptorIndex: 4},
// {},
// {},
// {},
// {},
// },
// ringIndex: 4,
// lastIndex: 4,
// expected: nil,
// },
// {
// name: "no overflow",
// ring: []UsedElement{
// {DescriptorIndex: 1},
// {DescriptorIndex: 2},
// {DescriptorIndex: 3},
// {DescriptorIndex: 4},
// {},
// {},
// {},
// {},
// },
// ringIndex: 4,
// lastIndex: 1,
// expected: []UsedElement{
// {DescriptorIndex: 2},
// {DescriptorIndex: 3},
// {DescriptorIndex: 4},
// },
// },
// {
// name: "ring overflow",
// ring: []UsedElement{
// {DescriptorIndex: 9},
// {DescriptorIndex: 10},
// {DescriptorIndex: 3},
// {DescriptorIndex: 4},
// {DescriptorIndex: 5},
// {DescriptorIndex: 6},
// {DescriptorIndex: 7},
// {DescriptorIndex: 8},
// },
// ringIndex: 10,
// lastIndex: 7,
// expected: []UsedElement{
// {DescriptorIndex: 8},
// {DescriptorIndex: 9},
// {DescriptorIndex: 10},
// },
// },
// {
// name: "index overflow",
// ring: []UsedElement{
// {DescriptorIndex: 9},
// {DescriptorIndex: 10},
// {DescriptorIndex: 3},
// {DescriptorIndex: 4},
// {DescriptorIndex: 5},
// {DescriptorIndex: 6},
// {DescriptorIndex: 7},
// {DescriptorIndex: 8},
// },
// ringIndex: 2,
// lastIndex: 65535,
// expected: []UsedElement{
// {DescriptorIndex: 8},
// {DescriptorIndex: 9},
// {DescriptorIndex: 10},
// },
// },
// }
// for _, tt := range tests {
// t.Run(tt.name, func(t *testing.T) {
// memory := make([]byte, usedRingSize(queueSize))
// r := newUsedRing(queueSize, memory)
//
// copy(r.ring, tt.ring)
// *r.ringIndex = tt.ringIndex
// r.lastIndex = tt.lastIndex
//
// assert.Equal(t, tt.expected, r.take())
// })
// }
//}

View File

@@ -0,0 +1,220 @@
//go:build linux && !android && !e2e_testing
package overlay
import (
"fmt"
"sync"
wgtun "github.com/slackhq/nebula/wgstack/tun"
)
// wireguardTunIO bridges a WireGuard TUN device (which performs batched reads
// and writes of virtio-net-framed buffers) to Nebula's overlay packet I/O.
type wireguardTunIO struct {
	dev       wgtun.Device
	mtu       int
	batchSize int

	// readMu guards the shared read-path scratch state below.
	readMu      sync.Mutex
	readBuffers [][]byte
	readLens    []int
	// legacyBuf backs the single-packet io.Reader-style Read path.
	legacyBuf []byte

	// writeMu guards the shared write-path scratch state below.
	writeMu      sync.Mutex
	writeBuf     []byte
	writeWrap    [][]byte
	writeBuffers [][]byte
}
// newWireguardTunIO wraps dev with Nebula-facing read/write adapters. A
// non-positive device batch size is clamped to 1 and a non-positive mtu falls
// back to DefaultMTU. The scratch buffers reserve virtio-net header headroom.
func newWireguardTunIO(dev wgtun.Device, mtu int) *wireguardTunIO {
	batch := dev.BatchSize()
	if batch < 1 {
		batch = 1
	}
	if mtu < 1 {
		mtu = DefaultMTU
	}
	frameSize := wgtun.VirtioNetHdrLen + mtu
	return &wireguardTunIO{
		dev:       dev,
		mtu:       mtu,
		batchSize: batch,
		readLens:  make([]int, batch),
		legacyBuf: make([]byte, frameSize),
		writeBuf:  make([]byte, frameSize),
		writeWrap: make([][]byte, 1),
	}
}
// Read implements the legacy single-packet read path: it reads one frame from
// the device into legacyBuf and copies the payload (without the virtio-net
// header) into p, returning the payload length.
//
// Fix: the previous implementation read into w.readBuffers[0] but always
// copied the payload out of w.legacyBuf. Once ReadIntoBatch had run,
// readBuffers[0] pointed at a released pool packet buffer, so Read returned
// stale legacyBuf contents instead of the data just received. Reading through
// legacyBuf directly keeps the read target and the copy source consistent.
func (w *wireguardTunIO) Read(p []byte) (int, error) {
	w.readMu.Lock()
	defer w.readMu.Unlock()
	bufs := [][]byte{w.legacyBuf}
	n, err := w.dev.Read(bufs, w.readLens[:1], wgtun.VirtioNetHdrLen)
	if err != nil {
		return 0, err
	}
	if n == 0 {
		return 0, nil
	}
	length := w.readLens[0]
	copy(p, w.legacyBuf[wgtun.VirtioNetHdrLen:wgtun.VirtioNetHdrLen+length])
	return length, nil
}
// Write sends a single payload to the TUN device, prefixing it with a zeroed
// virtio-net header in the shared write buffer. Returns the number of payload
// bytes consumed. Serialized by writeMu.
func (w *wireguardTunIO) Write(p []byte) (int, error) {
	if len(p) > w.mtu {
		return 0, fmt.Errorf("wireguard tun: payload exceeds MTU (%d > %d)", len(p), w.mtu)
	}
	w.writeMu.Lock()
	defer w.writeMu.Unlock()
	frame := w.writeBuf[:wgtun.VirtioNetHdrLen+len(p)]
	header := frame[:wgtun.VirtioNetHdrLen]
	for i := range header {
		header[i] = 0
	}
	copy(frame[wgtun.VirtioNetHdrLen:], p)
	w.writeWrap[0] = frame
	if n, err := w.dev.Write(w.writeWrap, wgtun.VirtioNetHdrLen); err != nil {
		return n, err
	}
	return len(p), nil
}
// ReadIntoBatch reads up to batchSize frames from the TUN device directly
// into buffers obtained from pool. Every pooled packet must provide at least
// the virtio headroom and the same Offset, because the device read accepts a
// single offset for the whole batch. Returns the filled packets; on error or
// an empty read, every obtained packet is released back to the pool.
func (w *wireguardTunIO) ReadIntoBatch(pool *PacketPool) ([]*Packet, error) {
	if pool == nil {
		return nil, fmt.Errorf("wireguard tun: packet pool is nil")
	}
	w.readMu.Lock()
	defer w.readMu.Unlock()
	// Grow the scratch slices lazily up to the batch size.
	if len(w.readBuffers) < w.batchSize {
		w.readBuffers = make([][]byte, w.batchSize)
	}
	if len(w.readLens) < w.batchSize {
		w.readLens = make([]int, w.batchSize)
	}
	packets := make([]*Packet, w.batchSize)
	requiredHeadroom := w.BatchHeadroom()
	requiredPayload := w.BatchPayloadCap()
	headroom := 0
	for i := 0; i < w.batchSize; i++ {
		pkt := pool.Get()
		if pkt == nil {
			releasePackets(packets[:i])
			return nil, fmt.Errorf("wireguard tun: packet pool returned nil packet")
		}
		if pkt.Capacity() < requiredPayload {
			pkt.Release()
			releasePackets(packets[:i])
			return nil, fmt.Errorf("wireguard tun: packet capacity %d below required %d", pkt.Capacity(), requiredPayload)
		}
		if i == 0 {
			// The first packet's offset fixes the headroom for the batch.
			headroom = pkt.Offset
			if headroom < requiredHeadroom {
				pkt.Release()
				releasePackets(packets[:i])
				return nil, fmt.Errorf("wireguard tun: packet headroom %d below virtio requirement %d", headroom, requiredHeadroom)
			}
		} else if pkt.Offset != headroom {
			// The device read takes one offset for all buffers.
			pkt.Release()
			releasePackets(packets[:i])
			return nil, fmt.Errorf("wireguard tun: inconsistent packet headroom (%d != %d)", pkt.Offset, headroom)
		}
		packets[i] = pkt
		w.readBuffers[i] = pkt.Buf
	}
	n, err := w.dev.Read(w.readBuffers[:w.batchSize], w.readLens[:w.batchSize], headroom)
	if err != nil {
		releasePackets(packets)
		return nil, err
	}
	if n == 0 {
		releasePackets(packets)
		return nil, nil
	}
	for i := 0; i < n; i++ {
		packets[i].Len = w.readLens[i]
	}
	// Return the unused tail of the batch to the pool.
	for i := n; i < w.batchSize; i++ {
		packets[i].Release()
		packets[i] = nil
	}
	return packets[:n], nil
}
// WriteBatch writes the packets to the TUN device in one batched write. All
// non-nil packets must share the same Offset (headroom), since the device
// write takes a single offset for the whole batch. On success the packets are
// released back to their pool.
//
// NOTE(review): on a device write error the packets are returned to the
// caller without being released — presumably so the caller can retry; confirm
// callers release them on that path, otherwise pool packets leak.
func (w *wireguardTunIO) WriteBatch(packets []*Packet) (int, error) {
	if len(packets) == 0 {
		return 0, nil
	}
	requiredHeadroom := w.BatchHeadroom()
	offset := packets[0].Offset
	if offset < requiredHeadroom {
		releasePackets(packets)
		return 0, fmt.Errorf("wireguard tun: packet offset %d smaller than required headroom %d", offset, requiredHeadroom)
	}
	// Validate the batch before touching the device.
	for _, pkt := range packets {
		if pkt == nil {
			continue
		}
		if pkt.Offset != offset {
			releasePackets(packets)
			return 0, fmt.Errorf("wireguard tun: mixed packet offsets not supported")
		}
		limit := pkt.Offset + pkt.Len
		if limit > len(pkt.Buf) {
			releasePackets(packets)
			return 0, fmt.Errorf("wireguard tun: packet length %d exceeds buffer capacity %d", pkt.Len, len(pkt.Buf)-pkt.Offset)
		}
	}
	w.writeMu.Lock()
	defer w.writeMu.Unlock()
	if len(w.writeBuffers) < len(packets) {
		w.writeBuffers = make([][]byte, len(packets))
	}
	for i, pkt := range packets {
		if pkt == nil {
			w.writeBuffers[i] = nil
			continue
		}
		limit := pkt.Offset + pkt.Len
		w.writeBuffers[i] = pkt.Buf[:limit]
	}
	n, err := w.dev.Write(w.writeBuffers[:len(packets)], offset)
	if err != nil {
		return n, err
	}
	releasePackets(packets)
	return n, nil
}
// BatchHeadroom returns the per-packet headroom the device requires: the
// virtio-net header length.
func (w *wireguardTunIO) BatchHeadroom() int {
	return wgtun.VirtioNetHdrLen
}

// BatchPayloadCap returns the maximum payload size per packet (the MTU).
func (w *wireguardTunIO) BatchPayloadCap() int {
	return w.mtu
}

// BatchSize returns how many packets one batched read or write may carry.
func (w *wireguardTunIO) BatchSize() int {
	return w.batchSize
}

// Close is a no-op. NOTE(review): the wrapped device is presumably closed by
// its owner elsewhere — confirm.
func (w *wireguardTunIO) Close() error {
	return nil
}
// releasePackets returns every non-nil packet in pkts to its pool.
func releasePackets(pkts []*Packet) {
	for _, p := range pkts {
		if p == nil {
			continue
		}
		p.Release()
	}
}

View File

@@ -1,70 +0,0 @@
package packet
import (
"github.com/slackhq/nebula/util/virtio"
"golang.org/x/sys/unix"
)
// OutPacket accumulates outbound segments destined for the TUN device.
// Parallel slices hold, per segment: the full buffer, its payload view, its
// virtio+ethernet header view, and the descriptor chain id it came from.
type OutPacket struct {
	Segments        [][]byte
	SegmentPayloads [][]byte
	SegmentHeaders  [][]byte
	SegmentIDs      []uint16
	//todo virtio header?
	SegSize      int
	SegCounter   int
	Valid        bool
	wasSegmented bool
	// Scratch is a reusable working buffer of the maximum packet size.
	Scratch []byte
}
// NewOut allocates an OutPacket with capacity for 64 segments and a
// maximum-size scratch buffer.
func NewOut() *OutPacket {
	return &OutPacket{
		Segments:        make([][]byte, 0, 64),
		SegmentHeaders:  make([][]byte, 0, 64),
		SegmentPayloads: make([][]byte, 0, 64),
		SegmentIDs:      make([]uint16, 0, 64),
		Scratch:         make([]byte, Size),
	}
}
// Reset clears all segment state while keeping the backing arrays so the
// OutPacket can be reused without reallocating.
func (pkt *OutPacket) Reset() {
	pkt.Valid = false
	pkt.wasSegmented = false
	pkt.SegSize = 0
	pkt.Segments = pkt.Segments[:0]
	pkt.SegmentPayloads = pkt.SegmentPayloads[:0]
	pkt.SegmentHeaders = pkt.SegmentHeaders[:0]
	pkt.SegmentIDs = pkt.SegmentIDs[:0]
}
// UseSegment registers a descriptor-chain buffer as the next outbound
// segment. It stamps a pass-through virtio-net header plus the EtherType of a
// synthetic Ethernet header at the front of seg, records the header and
// payload views, and returns the index of the segment just added.
func (pkt *OutPacket) UseSegment(segID uint16, seg []byte, isV6 bool) int {
	const etherHdrLen = 14
	pkt.Valid = true
	pkt.SegmentIDs = append(pkt.SegmentIDs, segID)
	pkt.Segments = append(pkt.Segments, seg) //todo do we need this?

	// No GSO/checksum offload requested; all other header fields stay zero.
	vhdr := virtio.NetHdr{
		Flags:   unix.VIRTIO_NET_HDR_F_DATA_VALID,
		GSOType: unix.VIRTIO_NET_HDR_GSO_NONE,
	}
	hdr := seg[:virtio.NetHdrSize+etherHdrLen]
	_ = vhdr.Encode(hdr)

	// EtherType lives in the final two bytes of the Ethernet header.
	typeHi := virtio.NetHdrSize + etherHdrLen - 2
	if isV6 {
		hdr[typeHi] = 0x86
		hdr[typeHi+1] = 0xdd
	} else {
		hdr[typeHi] = 0x08
		hdr[typeHi+1] = 0x00
	}
	pkt.SegmentHeaders = append(pkt.SegmentHeaders, hdr)
	pkt.SegmentPayloads = append(pkt.SegmentPayloads, seg[virtio.NetHdrSize+etherHdrLen:])
	return len(pkt.SegmentIDs) - 1
}

View File

@@ -1,119 +0,0 @@
package packet
import (
"encoding/binary"
"iter"
"net/netip"
"slices"
"syscall"
"unsafe"
"golang.org/x/sys/unix"
)
// Size is the maximum UDP datagram payload size (64 KiB - 1).
const Size = 0xffff

// Packet is a reusable receive/transmit buffer for one (possibly GRO
// coalesced) UDP datagram, plus the scratch areas recvmsg/sendmsg need:
// ancillary control messages and the raw peer sockaddr.
type Packet struct {
	Payload []byte
	Control []byte
	// Name holds the raw peer sockaddr bytes (IPv4 or IPv6 form).
	Name []byte
	// SegSize is the GRO/GSO segment size; equals len(Payload) when the
	// datagram was not coalesced.
	SegSize int
	//todo should this hold out as well?
	OutLen       int
	wasSegmented bool
	isV4         bool
}
// New allocates a Packet with a maximum-size payload buffer, room for one
// 16-bit control message, and an IPv6-sized sockaddr name buffer.
func New(isV4 bool) *Packet {
	p := &Packet{isV4: isV4}
	p.Payload = make([]byte, Size)
	p.Control = make([]byte, unix.CmsgSpace(2))
	p.Name = make([]byte, unix.SizeofSockaddrInet6)
	return p
}
// AddrPort decodes the raw sockaddr in Name into a netip.AddrPort, unmapping
// IPv4-in-IPv6 addresses.
func (p *Packet) AddrPort() netip.AddrPort {
	port := binary.BigEndian.Uint16(p.Name[2:4])
	var raw []byte
	if p.isV4 {
		raw = p.Name[4:8]
	} else {
		raw = p.Name[8:24]
	}
	// AddrFromSlice only fails on a bad length; the fixed 4/16-byte slices
	// above make that impossible (slicing would panic first).
	addr, _ := netip.AddrFromSlice(raw)
	return netip.AddrPortFrom(addr.Unmap(), port)
}
// updateCtrl refreshes the GRO segmentation state of a just-received packet
// from its socket control messages. SegSize defaults to the full payload
// length; when the kernel delivered a UDP_GRO cmsg, the payload is a train of
// coalesced segments of SegSize bytes each and wasSegmented is set.
//
// Fix: only the ctrlLen bytes the kernel actually wrote are parsed. The
// previous code parsed the entire Control buffer, which could misinterpret
// stale control-message bytes left over from an earlier receive.
func (p *Packet) updateCtrl(ctrlLen int) {
	p.SegSize = len(p.Payload)
	p.wasSegmented = false
	if ctrlLen == 0 {
		return
	}
	if len(p.Control) == 0 {
		return
	}
	cmsgs, err := unix.ParseSocketControlMessage(p.Control[:ctrlLen])
	if err != nil {
		return // oh well
	}
	for _, c := range cmsgs {
		if c.Header.Level == unix.SOL_UDP && c.Header.Type == unix.UDP_GRO && len(c.Data) >= 2 {
			p.wasSegmented = true
			p.SegSize = int(binary.LittleEndian.Uint16(c.Data[:2]))
			return
		}
	}
}
// Update sets a Packet into "just received, not processed" state: OutLen is
// invalidated and GRO segmentation state is refreshed from the ctrlLen bytes
// of control messages the kernel delivered.
func (p *Packet) Update(ctrlLen int) {
	p.OutLen = -1
	p.updateCtrl(ctrlLen)
}
// SetSegSizeForTX stamps a UDP_SEGMENT control message into Control so the
// kernel will GSO-split the payload into SegSize-byte segments on send.
// SegSize is set to the full payload length; the cmsg header is written in
// place via unsafe and the size value in native byte order, as the kernel
// expects.
func (p *Packet) SetSegSizeForTX() {
	p.SegSize = len(p.Payload)
	hdr := (*unix.Cmsghdr)(unsafe.Pointer(&p.Control[0]))
	hdr.Level = unix.SOL_UDP
	hdr.Type = unix.UDP_SEGMENT
	hdr.SetLen(syscall.CmsgLen(2))
	binary.NativeEndian.PutUint16(p.Control[unix.CmsgLen(0):unix.CmsgLen(0)+2], uint16(p.SegSize))
}
// CompatibleForSegmentationWith reports whether p can be coalesced into the
// same GSO batch as otherP, given currentTotalSize bytes already queued.
func (p *Packet) CompatibleForSegmentationWith(otherP *Packet, currentTotalSize int) bool {
	switch {
	case !slices.Equal(p.Name, otherP.Name):
		// Different destination sockaddr.
		return false
	case len(p.Payload)+currentTotalSize >= 0xffff:
		// Would exceed the 16-bit UDP length limit.
		return false
	case len(p.Payload) != len(otherP.Payload):
		// GSO requires equal-sized segments. todo: a single shorter
		// trailing segment is technically allowed but not implemented.
		return false
	default:
		return true
	}
}
// Segments yields consecutive SegSize-byte views of the payload (the final
// segment may be shorter), matching how the kernel coalesced a GRO datagram.
func (p *Packet) Segments() iter.Seq[[]byte] {
	return func(yield func([]byte) bool) {
		payload := p.Payload
		for start := 0; start < len(payload); start += p.SegSize {
			end := min(start+p.SegSize, len(payload))
			if !yield(payload[start:end]) {
				return
			}
		}
	}
}

View File

@@ -1,37 +0,0 @@
package packet
import (
"github.com/slackhq/nebula/util/virtio"
)
// VirtIOPacket is a reusable receive buffer backed by virtqueue descriptor
// chains: Payload views the packet bytes, Header holds the decoded virtio-net
// header, Chains records the chain heads to hand back to the device, and
// ChainRefs pins the backing memory slices while the packet is in flight.
type VirtIOPacket struct {
	Payload []byte
	Header  virtio.NetHdr
	Chains  []uint16

	ChainRefs [][]byte
	// OfferDescriptorChains(chains []uint16, kick bool) error
}
// NewVIO allocates an empty VirtIOPacket with pre-sized chain bookkeeping;
// Payload stays nil until a read fills it in.
func NewVIO() *VirtIOPacket {
	v := new(VirtIOPacket)
	v.ChainRefs = make([][]byte, 0, 4)
	v.Chains = make([]uint16, 0, 8)
	return v
}
// Reset drops the payload view and empties the chain bookkeeping while
// keeping the backing arrays for reuse.
func (v *VirtIOPacket) Reset() {
	v.Chains = v.Chains[:0]
	v.ChainRefs = v.ChainRefs[:0]
	v.Payload = nil
}
// VirtIOTXPacket is a VirtIOPacket used on the transmit path.
type VirtIOTXPacket struct {
	VirtIOPacket
}

// NewVIOTX allocates a transmit packet. The isV4 parameter is currently
// unused — NOTE(review): confirm whether address-family handling is still
// planned here or the parameter can be dropped.
func NewVIOTX(isV4 bool) *VirtIOTXPacket {
	out := new(VirtIOTXPacket)
	out.VirtIOPacket = *NewVIO()
	return out
}

10
pki.go
View File

@@ -523,13 +523,9 @@ func loadCAPoolFromConfig(l *logrus.Logger, c *config.C) (*cert.CAPool, error) {
return nil, fmt.Errorf("error while adding CA certificate to CA trust store: %s", err)
}
bl := c.GetStringSlice("pki.blocklist", []string{})
if len(bl) > 0 {
for _, fp := range bl {
caPool.BlocklistFingerprint(fp)
}
l.WithField("fingerprintCount", len(bl)).Info("Blocklisted certificates")
for _, fp := range c.GetStringSlice("pki.blocklist", []string{}) {
l.WithField("fingerprint", fp).Info("Blocklisting cert")
caPool.BlocklistFingerprint(fp)
}
return caPool, nil

View File

@@ -16,8 +16,8 @@ import (
"github.com/slackhq/nebula/cert_test"
"github.com/slackhq/nebula/config"
"github.com/slackhq/nebula/overlay"
"go.yaml.in/yaml/v3"
"golang.org/x/sync/errgroup"
"gopkg.in/yaml.v3"
)
type m = map[string]any

View File

@@ -4,13 +4,13 @@ import (
"net/netip"
"github.com/slackhq/nebula/config"
"github.com/slackhq/nebula/packet"
)
const MTU = 9001
type EncReader func(
[]*packet.Packet,
addr netip.AddrPort,
payload []byte,
)
type Conn interface {
@@ -19,11 +19,21 @@ type Conn interface {
ListenOut(r EncReader)
WriteTo(b []byte, addr netip.AddrPort) error
ReloadConfig(c *config.C)
Prep(pkt *packet.Packet, addr netip.AddrPort) error
WriteBatch(pkt []*packet.Packet) (int, error)
Close() error
}
// Datagram represents a UDP payload destined to a specific address.
type Datagram struct {
Payload []byte
Addr netip.AddrPort
}
// BatchConn can send multiple datagrams in one syscall.
type BatchConn interface {
Conn
WriteBatch(pkts []Datagram) error
}
type NoopConn struct{}
func (NoopConn) Rebind() error {

View File

@@ -14,25 +14,25 @@ import (
"github.com/rcrowley/go-metrics"
"github.com/sirupsen/logrus"
"github.com/slackhq/nebula/config"
"github.com/slackhq/nebula/packet"
"golang.org/x/sys/unix"
)
const iovMax = 128 //1024 //no unix constant for this? from limits.h
//todo I'd like this to be 1024 but we seem to hit errors around ~130?
type StdConn struct {
sysFd int
isV4 bool
l *logrus.Logger
batch int
enableGRO bool
msgs []rawMessage
iovs [][]iovec
sysFd int
isV4 bool
l *logrus.Logger
batch int
}
func NewListener(l *logrus.Logger, ip netip.Addr, port int, multi bool, batch int) (Conn, error) {
func maybeIPV4(ip net.IP) (net.IP, bool) {
ip4 := ip.To4()
if ip4 != nil {
return ip4, true
}
return ip, false
}
func NewListener(l *logrus.Logger, ip netip.Addr, port int, multi bool, batch int, q int) (Conn, error) {
af := unix.AF_INET6
if ip.Is4() {
af = unix.AF_INET
@@ -69,20 +69,7 @@ func NewListener(l *logrus.Logger, ip netip.Addr, port int, multi bool, batch in
return nil, fmt.Errorf("unable to bind to socket: %s", err)
}
const batchSize = 8192
msgs := make([]rawMessage, 0, batchSize) //todo configure
iovs := make([][]iovec, batchSize)
for i := range iovs {
iovs[i] = make([]iovec, iovMax)
}
return &StdConn{
sysFd: fd,
isV4: ip.Is4(),
l: l,
batch: batch,
msgs: msgs,
iovs: iovs,
}, err
return &StdConn{sysFd: fd, isV4: ip.Is4(), l: l, batch: batch}, err
}
func (u *StdConn) Rebind() error {
@@ -132,7 +119,9 @@ func (u *StdConn) LocalAddr() (netip.AddrPort, error) {
}
func (u *StdConn) ListenOut(r EncReader) {
msgs, packets := u.PrepareRawMessages(u.batch, u.isV4)
var ip netip.Addr
msgs, buffers, names := u.PrepareRawMessages(u.batch)
read := u.ReadMulti
if u.batch == 1 {
read = u.ReadSingle
@@ -146,12 +135,13 @@ func (u *StdConn) ListenOut(r EncReader) {
}
for i := 0; i < n; i++ {
packets[i].Payload = packets[i].Payload[:msgs[i].Len]
packets[i].Update(getRawMessageControlLen(&msgs[i]))
}
r(packets[:n])
for i := 0; i < n; i++ { //todo reset this in prev loop, but this makes debug ez
msgs[i].Hdr.Controllen = uint64(unix.CmsgSpace(2))
// Its ok to skip the ok check here, the slicing is the only error that can occur and it will panic
if u.isV4 {
ip, _ = netip.AddrFromSlice(names[i][4:8])
} else {
ip, _ = netip.AddrFromSlice(names[i][8:24])
}
r(netip.AddrPortFrom(ip.Unmap(), binary.BigEndian.Uint16(names[i][2:4])), buffers[i][:msgs[i].Len])
}
}
}
@@ -204,147 +194,6 @@ func (u *StdConn) WriteTo(b []byte, ip netip.AddrPort) error {
return u.writeTo6(b, ip)
}
func (u *StdConn) WriteToBatch(b []byte, ip netip.AddrPort) error {
if u.isV4 {
return u.writeTo4(b, ip)
}
return u.writeTo6(b, ip)
}
func (u *StdConn) Prep(pkt *packet.Packet, addr netip.AddrPort) error {
nl, err := u.encodeSockaddr(pkt.Name, addr)
if err != nil {
return err
}
pkt.Name = pkt.Name[:nl]
pkt.OutLen = len(pkt.Payload)
return nil
}
func (u *StdConn) WriteBatch(pkts []*packet.Packet) (int, error) {
if len(pkts) == 0 {
return 0, nil
}
u.msgs = u.msgs[:0]
//u.iovs = u.iovs[:0]
sent := 0
var mostRecentPkt *packet.Packet
mostRecentPktSize := 0
//segmenting := false
idx := 0
for _, pkt := range pkts {
if len(pkt.Payload) == 0 || pkt.OutLen == -1 {
sent++
continue
}
lastIdx := idx - 1
if mostRecentPkt != nil && pkt.CompatibleForSegmentationWith(mostRecentPkt, mostRecentPktSize) && u.msgs[lastIdx].Hdr.Iovlen < iovMax {
u.msgs[lastIdx].Hdr.Controllen = uint64(len(mostRecentPkt.Control))
u.msgs[lastIdx].Hdr.Control = &mostRecentPkt.Control[0]
u.iovs[lastIdx][u.msgs[lastIdx].Hdr.Iovlen].Base = &pkt.Payload[0]
u.iovs[lastIdx][u.msgs[lastIdx].Hdr.Iovlen].Len = uint64(len(pkt.Payload))
u.msgs[lastIdx].Hdr.Iovlen++
mostRecentPktSize += len(pkt.Payload)
mostRecentPkt.SetSegSizeForTX()
} else {
u.msgs = append(u.msgs, rawMessage{})
u.iovs[idx][0] = iovec{
Base: &pkt.Payload[0],
Len: uint64(len(pkt.Payload)),
}
msg := &u.msgs[idx]
iov := &u.iovs[idx][0]
idx++
msg.Hdr.Iov = iov
msg.Hdr.Iovlen = 1
setRawMessageControl(msg, nil)
msg.Hdr.Flags = 0
msg.Hdr.Name = &pkt.Name[0]
msg.Hdr.Namelen = uint32(len(pkt.Name))
mostRecentPkt = pkt
mostRecentPktSize = len(pkt.Payload)
}
}
if len(u.msgs) == 0 {
return sent, nil
}
offset := 0
for offset < len(u.msgs) {
n, _, errno := unix.Syscall6(
unix.SYS_SENDMMSG,
uintptr(u.sysFd),
uintptr(unsafe.Pointer(&u.msgs[offset])),
uintptr(len(u.msgs)-offset),
0,
0,
0,
)
if errno != 0 {
if errno == unix.EINTR {
continue
}
//for i := 0; i < len(u.msgs); i++ {
// for j := 0; j < int(u.msgs[i].Hdr.Iovlen); j++ {
// u.l.WithFields(logrus.Fields{
// "msg_index": i,
// "iov idx": j,
// "iov": fmt.Sprintf("%+v", u.iovs[i][j]),
// }).Warn("failed to send message")
// }
//
//}
u.l.WithFields(logrus.Fields{
"errno": errno,
"idx": idx,
"len": len(u.msgs),
"deets": fmt.Sprintf("%+v", u.msgs),
"lastIOV": fmt.Sprintf("%+v", u.iovs[len(u.msgs)-1][u.msgs[len(u.msgs)-1].Hdr.Iovlen-1]),
}).Error("failed to send message")
return sent + offset, &net.OpError{Op: "sendmmsg", Err: errno}
}
if n == 0 {
break
}
offset += int(n)
}
return sent + len(u.msgs), nil
}
func (u *StdConn) encodeSockaddr(dst []byte, addr netip.AddrPort) (uint32, error) {
if u.isV4 {
if !addr.Addr().Is4() {
return 0, fmt.Errorf("Listener is IPv4, but writing to IPv6 remote")
}
var sa unix.RawSockaddrInet4
sa.Family = unix.AF_INET
sa.Addr = addr.Addr().As4()
binary.BigEndian.PutUint16((*[2]byte)(unsafe.Pointer(&sa.Port))[:], addr.Port())
size := unix.SizeofSockaddrInet4
copy(dst[:size], (*(*[unix.SizeofSockaddrInet4]byte)(unsafe.Pointer(&sa)))[:])
return uint32(size), nil
}
var sa unix.RawSockaddrInet6
sa.Family = unix.AF_INET6
sa.Addr = addr.Addr().As16()
binary.BigEndian.PutUint16((*[2]byte)(unsafe.Pointer(&sa.Port))[:], addr.Port())
size := unix.SizeofSockaddrInet6
copy(dst[:size], (*(*[unix.SizeofSockaddrInet6]byte)(unsafe.Pointer(&sa)))[:])
return uint32(size), nil
}
func (u *StdConn) writeTo6(b []byte, ip netip.AddrPort) error {
var rsa unix.RawSockaddrInet6
rsa.Family = unix.AF_INET6
@@ -445,27 +294,6 @@ func (u *StdConn) ReloadConfig(c *config.C) {
u.l.WithError(err).Error("Failed to set listen.so_mark")
}
}
u.configureGRO(true)
}
func (u *StdConn) configureGRO(enable bool) {
if enable == u.enableGRO {
return
}
if enable {
if err := unix.SetsockoptInt(u.sysFd, unix.SOL_UDP, unix.UDP_GRO, 1); err != nil {
u.l.WithError(err).Warn("Failed to enable UDP GRO")
return
}
u.enableGRO = true
u.l.Info("UDP GRO enabled")
} else {
if err := unix.SetsockoptInt(u.sysFd, unix.SOL_UDP, unix.UDP_GRO, 0); err != nil && err != unix.ENOPROTOOPT {
u.l.WithError(err).Warn("Failed to disable UDP GRO")
}
u.enableGRO = false
}
}
func (u *StdConn) getMemInfo(meminfo *[unix.SK_MEMINFO_VARS]uint32) error {
@@ -482,31 +310,51 @@ func (u *StdConn) Close() error {
}
func NewUDPStatsEmitter(udpConns []Conn) func() {
// Check if our kernel supports SO_MEMINFO before registering the gauges
var udpGauges [][unix.SK_MEMINFO_VARS]metrics.Gauge
if len(udpConns) == 0 {
return func() {}
}
type statsProvider struct {
index int
conn *StdConn
}
providers := make([]statsProvider, 0, len(udpConns))
for i, c := range udpConns {
if sc, ok := c.(*StdConn); ok {
providers = append(providers, statsProvider{index: i, conn: sc})
}
}
if len(providers) == 0 {
return func() {}
}
var meminfo [unix.SK_MEMINFO_VARS]uint32
if err := udpConns[0].(*StdConn).getMemInfo(&meminfo); err == nil {
udpGauges = make([][unix.SK_MEMINFO_VARS]metrics.Gauge, len(udpConns))
for i := range udpConns {
udpGauges[i] = [unix.SK_MEMINFO_VARS]metrics.Gauge{
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.rmem_alloc", i), nil),
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.rcvbuf", i), nil),
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.wmem_alloc", i), nil),
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.sndbuf", i), nil),
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.fwd_alloc", i), nil),
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.wmem_queued", i), nil),
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.optmem", i), nil),
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.backlog", i), nil),
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.drops", i), nil),
}
if err := providers[0].conn.getMemInfo(&meminfo); err != nil {
return func() {}
}
udpGauges := make([][unix.SK_MEMINFO_VARS]metrics.Gauge, len(providers))
for i, provider := range providers {
udpGauges[i] = [unix.SK_MEMINFO_VARS]metrics.Gauge{
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.rmem_alloc", provider.index), nil),
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.rcvbuf", provider.index), nil),
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.wmem_alloc", provider.index), nil),
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.sndbuf", provider.index), nil),
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.fwd_alloc", provider.index), nil),
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.wmem_queued", provider.index), nil),
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.optmem", provider.index), nil),
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.backlog", provider.index), nil),
metrics.GetOrRegisterGauge(fmt.Sprintf("udp.%d.drops", provider.index), nil),
}
}
return func() {
for i, gauges := range udpGauges {
if err := udpConns[i].(*StdConn).getMemInfo(&meminfo); err == nil {
for i, provider := range providers {
if err := provider.conn.getMemInfo(&meminfo); err == nil {
for j := 0; j < unix.SK_MEMINFO_VARS; j++ {
gauges[j].Update(int64(meminfo[j]))
udpGauges[i][j].Update(int64(meminfo[j]))
}
}
}

View File

@@ -7,7 +7,6 @@
package udp
import (
"github.com/slackhq/nebula/packet"
"golang.org/x/sys/unix"
)
@@ -34,59 +33,25 @@ type rawMessage struct {
Pad0 [4]byte
}
func setRawMessageControl(msg *rawMessage, buf []byte) {
if len(buf) == 0 {
msg.Hdr.Control = nil
msg.Hdr.Controllen = 0
return
}
msg.Hdr.Control = &buf[0]
msg.Hdr.Controllen = uint64(len(buf))
}
func getRawMessageControlLen(msg *rawMessage) int {
return int(msg.Hdr.Controllen)
}
func setCmsgLen(h *unix.Cmsghdr, l int) {
h.Len = uint64(l)
}
func (u *StdConn) PrepareRawMessages(n int, isV4 bool) ([]rawMessage, []*packet.Packet) {
func (u *StdConn) PrepareRawMessages(n int) ([]rawMessage, [][]byte, [][]byte) {
msgs := make([]rawMessage, n)
packets := make([]*packet.Packet, n)
buffers := make([][]byte, n)
names := make([][]byte, n)
for i := range msgs {
packets[i] = packet.New(isV4)
buffers[i] = make([]byte, MTU)
names[i] = make([]byte, unix.SizeofSockaddrInet6)
vs := []iovec{
{Base: &packets[i].Payload[0], Len: uint64(packet.Size)},
{Base: &buffers[i][0], Len: uint64(len(buffers[i]))},
}
msgs[i].Hdr.Iov = &vs[0]
msgs[i].Hdr.Iovlen = uint64(len(vs))
msgs[i].Hdr.Name = &packets[i].Name[0]
msgs[i].Hdr.Namelen = uint32(len(packets[i].Name))
if u.enableGRO {
msgs[i].Hdr.Control = &packets[i].Control[0]
msgs[i].Hdr.Controllen = uint64(len(packets[i].Control))
} else {
msgs[i].Hdr.Control = nil
msgs[i].Hdr.Controllen = 0
}
msgs[i].Hdr.Name = &names[i][0]
msgs[i].Hdr.Namelen = uint32(len(names[i]))
}
return msgs, packets
}
func setIovecSlice(iov *iovec, b []byte) {
if len(b) == 0 {
iov.Base = nil
iov.Len = 0
return
}
iov.Base = &b[0]
iov.Len = uint64(len(b))
return msgs, buffers, names
}

226
udp/wireguard_conn_linux.go Normal file
View File

@@ -0,0 +1,226 @@
//go:build linux && !android && !e2e_testing
package udp
import (
"errors"
"net"
"net/netip"
"sync"
"sync/atomic"
"github.com/sirupsen/logrus"
"github.com/slackhq/nebula/config"
wgconn "github.com/slackhq/nebula/wgstack/conn"
)
// WGConn adapts WireGuard's batched UDP bind implementation to Nebula's udp.Conn interface.
type WGConn struct {
	l    *logrus.Logger
	bind *wgconn.StdNetBind
	// recvers holds one receive function per underlying socket/queue.
	recvers []wgconn.ReceiveFunc
	// batch is the effective batch size in use; reqBatch remembers the size
	// originally requested by configuration (ConfigureOffload may raise batch).
	batch    int
	reqBatch int
	// localIP/localPort describe the bound address for LocalAddr.
	localIP   netip.Addr
	localPort uint16
	enableGSO bool
	enableGRO bool
	gsoMaxSeg int
	// closed gates the receive loops and all send paths.
	closed    atomic.Bool
	q         int
	closeOnce sync.Once
}
// NewWireguardListener creates a UDP listener backed by WireGuard's StdNetBind.
// The requested batch size is clamped into (0, bind.BatchSize()].
func NewWireguardListener(l *logrus.Logger, ip netip.Addr, port int, multi bool, batch int, q int) (Conn, error) {
	bind := wgconn.NewStdNetBindForAddr(ip, multi, q)
	recvers, actualPort, err := bind.Open(uint16(port))
	if err != nil {
		return nil, err
	}
	if maxBatch := bind.BatchSize(); batch <= 0 || batch > maxBatch {
		batch = maxBatch
	}
	return &WGConn{
		l:         l,
		bind:      bind,
		recvers:   recvers,
		batch:     batch,
		reqBatch:  batch,
		localIP:   ip,
		localPort: actualPort,
		q:         q,
	}, nil
}
// Rebind is a no-op: WireGuard's bind does not support rebinding in place.
func (c *WGConn) Rebind() error {
	return nil
}

// LocalAddr reports the bound address; an invalid or unspecified bind IP is
// rendered as the IPv4 wildcard.
func (c *WGConn) LocalAddr() (netip.AddrPort, error) {
	if !c.localIP.IsValid() || c.localIP.IsUnspecified() {
		// Fallback to wildcard IPv4 for display purposes.
		return netip.AddrPortFrom(netip.IPv4Unspecified(), c.localPort), nil
	}
	return netip.AddrPortFrom(c.localIP, c.localPort), nil
}
// listen runs one receive loop for a single ReceiveFunc, invoking r for every
// datagram until the connection is closed. The packet buffers are reused
// across iterations, so r must not retain the payload slice after returning.
func (c *WGConn) listen(fn wgconn.ReceiveFunc, r EncReader) {
	batchSize := c.batch
	packets := make([][]byte, batchSize)
	for i := range packets {
		packets[i] = make([]byte, 0xffff)
	}
	sizes := make([]int, batchSize)
	endpoints := make([]wgconn.Endpoint, batchSize)
	for {
		if c.closed.Load() {
			return
		}
		n, err := fn(packets, sizes, endpoints)
		if err != nil {
			if errors.Is(err, net.ErrClosed) {
				return
			}
			if c.l != nil {
				c.l.WithError(err).Debug("wireguard UDP listener receive error")
			}
			// Transient error; keep receiving.
			continue
		}
		for i := 0; i < n; i++ {
			if sizes[i] == 0 {
				continue
			}
			stdEp, ok := endpoints[i].(*wgconn.StdNetEndpoint)
			if !ok {
				if c.l != nil {
					c.l.Warn("wireguard UDP listener received unexpected endpoint type")
				}
				continue
			}
			addr := stdEp.AddrPort
			r(addr, packets[i][:sizes[i]])
			// Drop the endpoint reference so it can be collected.
			endpoints[i] = nil
		}
	}
}
// ListenOut starts one background receive loop per underlying receiver,
// each delivering datagrams to r.
func (c *WGConn) ListenOut(r EncReader) {
	for _, recv := range c.recvers {
		go c.listen(recv, r)
	}
}
// WriteTo sends a single datagram to addr. Empty payloads are dropped
// silently; writes after Close report net.ErrClosed.
func (c *WGConn) WriteTo(b []byte, addr netip.AddrPort) error {
	switch {
	case len(b) == 0:
		return nil
	case c.closed.Load():
		return net.ErrClosed
	}
	return c.bind.Send([][]byte{b}, &wgconn.StdNetEndpoint{AddrPort: addr})
}
// WriteBatch sends a group of datagrams, coalescing consecutive entries that
// share a destination into a single bind.Send call. Entries with an empty
// payload or invalid address are skipped.
func (c *WGConn) WriteBatch(datagrams []Datagram) error {
	if len(datagrams) == 0 {
		return nil
	}
	if c.closed.Load() {
		return net.ErrClosed
	}

	// Cap each Send at the configured batch size; fall back to the input
	// length if no batch size was configured.
	limit := c.batch
	if limit <= 0 {
		limit = len(datagrams)
		if limit == 0 {
			limit = 1
		}
	}

	pending := make([][]byte, 0, limit)
	var (
		dst     netip.AddrPort
		dstEp   *wgconn.StdNetEndpoint
		haveDst bool
	)

	// sendPending ships the accumulated payloads to the current endpoint and
	// resets the buffer, preserving its capacity.
	sendPending := func() error {
		if len(pending) == 0 || dstEp == nil {
			pending = pending[:0]
			return nil
		}
		err := c.bind.Send(pending, dstEp)
		pending = pending[:0]
		return err
	}

	for _, dg := range datagrams {
		if len(dg.Payload) == 0 || !dg.Addr.IsValid() {
			continue
		}
		// A destination change ends the current run.
		if !haveDst || dg.Addr != dst {
			if err := sendPending(); err != nil {
				return err
			}
			dst = dg.Addr
			dstEp = &wgconn.StdNetEndpoint{AddrPort: dst}
			haveDst = true
		}
		pending = append(pending, dg.Payload)
		if len(pending) >= limit {
			if err := sendPending(); err != nil {
				return err
			}
		}
	}
	return sendPending()
}
// ConfigureOffload records the GSO/GRO offload settings and recomputes the
// effective batch size used by the send/receive paths.
func (c *WGConn) ConfigureOffload(enableGSO, enableGRO bool, maxSegments int) {
	c.enableGSO = enableGSO
	c.enableGRO = enableGRO

	// Clamp the segment count into [1, wgconn.IdealBatchSize].
	switch {
	case maxSegments <= 0:
		maxSegments = 1
	case maxSegments > wgconn.IdealBatchSize:
		maxSegments = wgconn.IdealBatchSize
	}
	c.gsoMaxSeg = maxSegments

	// GSO needs at least the bind's batch size; grow the effective batch (and
	// warn) if the configured value is smaller.
	batch := c.reqBatch
	if enableGSO && c.bind != nil {
		if bindBatch := c.bind.BatchSize(); batch < bindBatch {
			if c.l != nil {
				c.l.WithFields(logrus.Fields{
					"requested": c.reqBatch,
					"effective": bindBatch,
				}).Warn("listen.batch below wireguard minimum; using bind batch size for UDP GSO support")
			}
			batch = bindBatch
		}
	}
	c.batch = batch

	if c.l != nil {
		c.l.WithFields(logrus.Fields{
			"enableGSO":      enableGSO,
			"enableGRO":      enableGRO,
			"gsoMaxSegments": maxSegments,
		}).Debug("configured wireguard UDP offload")
	}
}
// ReloadConfig is a no-op; the WireGuard bind exposes no runtime-tunable
// configuration.
func (c *WGConn) ReloadConfig(*config.C) {}
// Close shuts down the bind exactly once; subsequent calls return nil.
func (c *WGConn) Close() error {
	var closeErr error
	c.closeOnce.Do(func() {
		c.closed.Store(true)
		closeErr = c.bind.Close()
	})
	return closeErr
}

View File

@@ -0,0 +1,15 @@
//go:build !linux || android || e2e_testing
package udp
import (
"fmt"
"net/netip"
"github.com/sirupsen/logrus"
)
// NewWireguardListener is only available on Linux builds. The signature
// mirrors the Linux implementation (logger, listen address, port, multi-queue
// flag, batch size, queue index) so callers compile identically on every
// platform.
func NewWireguardListener(*logrus.Logger, netip.Addr, int, bool, int, int) (Conn, error) {
	return nil, fmt.Errorf("wireguard experimental UDP listener is only supported on Linux")
}

View File

@@ -1,3 +0,0 @@
// Package virtio contains some generic types and concepts related to the virtio
// protocol.
package virtio

View File

@@ -1,136 +0,0 @@
package virtio
// Feature contains feature bits that describe a virtio device or driver.
// Individual bits are combined with bitwise OR into a 64-bit feature word.
type Feature uint64

// Device-independent feature bits.
//
// Source: https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html#x1-6600006
const (
	// FeatureIndirectDescriptors indicates that the driver can use descriptors
	// with an additional layer of indirection.
	FeatureIndirectDescriptors Feature = 1 << 28
	// FeatureVersion1 indicates compliance with version 1.0 of the virtio
	// specification.
	FeatureVersion1 Feature = 1 << 32
)

// Feature bits for networking devices.
//
// Source: https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html#x1-2200003
const (
	// FeatureNetDeviceCsum indicates that the device can handle packets with
	// partial checksum (checksum offload).
	FeatureNetDeviceCsum Feature = 1 << 0
	// FeatureNetDriverCsum indicates that the driver can handle packets with
	// partial checksum.
	FeatureNetDriverCsum Feature = 1 << 1
	// FeatureNetCtrlDriverOffloads indicates support for dynamic offload state
	// reconfiguration.
	FeatureNetCtrlDriverOffloads Feature = 1 << 2
	// FeatureNetMTU indicates that the device reports a maximum MTU value.
	FeatureNetMTU Feature = 1 << 3
	// FeatureNetMAC indicates that the device provides a MAC address.
	FeatureNetMAC Feature = 1 << 5
	// FeatureNetDriverTSO4 indicates that the driver supports the TCP
	// segmentation offload for received IPv4 packets.
	FeatureNetDriverTSO4 Feature = 1 << 7
	// FeatureNetDriverTSO6 indicates that the driver supports the TCP
	// segmentation offload for received IPv6 packets.
	FeatureNetDriverTSO6 Feature = 1 << 8
	// FeatureNetDriverECN indicates that the driver supports the TCP
	// segmentation offload with ECN for received packets.
	FeatureNetDriverECN Feature = 1 << 9
	// FeatureNetDriverUFO indicates that the driver supports the UDP
	// fragmentation offload for received packets.
	FeatureNetDriverUFO Feature = 1 << 10
	// FeatureNetDeviceTSO4 indicates that the device supports the TCP
	// segmentation offload for received IPv4 packets.
	FeatureNetDeviceTSO4 Feature = 1 << 11
	// FeatureNetDeviceTSO6 indicates that the device supports the TCP
	// segmentation offload for received IPv6 packets.
	FeatureNetDeviceTSO6 Feature = 1 << 12
	// FeatureNetDeviceECN indicates that the device supports the TCP
	// segmentation offload with ECN for received packets.
	FeatureNetDeviceECN Feature = 1 << 13
	// FeatureNetDeviceUFO indicates that the device supports the UDP
	// fragmentation offload for received packets.
	FeatureNetDeviceUFO Feature = 1 << 14
	// FeatureNetMergeRXBuffers indicates that the driver can handle merged
	// receive buffers.
	// When this feature is negotiated, devices may merge multiple descriptor
	// chains together to transport large received packets. [NetHdr.NumBuffers]
	// will then contain the number of merged descriptor chains.
	FeatureNetMergeRXBuffers Feature = 1 << 15
	// FeatureNetStatus indicates that the device configuration status field is
	// available.
	FeatureNetStatus Feature = 1 << 16
	// FeatureNetCtrlVQ indicates that a control channel virtqueue is
	// available.
	FeatureNetCtrlVQ Feature = 1 << 17
	// FeatureNetCtrlRX indicates support for RX mode control (e.g. promiscuous
	// or all-multicast) for packet receive filtering.
	FeatureNetCtrlRX Feature = 1 << 18
	// FeatureNetCtrlVLAN indicates support for VLAN filtering through the
	// control channel.
	FeatureNetCtrlVLAN Feature = 1 << 19
	// FeatureNetDriverAnnounce indicates that the driver can send gratuitous
	// packets.
	FeatureNetDriverAnnounce Feature = 1 << 21
	// FeatureNetMQ indicates that the device supports multiqueue with automatic
	// receive steering.
	FeatureNetMQ Feature = 1 << 22
	// FeatureNetCtrlMACAddr indicates that the MAC address can be set through
	// the control channel.
	FeatureNetCtrlMACAddr Feature = 1 << 23
	// FeatureNetDeviceUSO indicates that the device supports the UDP
	// segmentation offload for received packets.
	FeatureNetDeviceUSO Feature = 1 << 56
	// FeatureNetHashReport indicates that the device can report a per-packet
	// hash value and type.
	FeatureNetHashReport Feature = 1 << 57
	// FeatureNetDriverHdrLen indicates that the driver can provide the exact
	// header length value (see [NetHdr.HdrLen]).
	// Devices may benefit from knowing the exact header length.
	FeatureNetDriverHdrLen Feature = 1 << 59
	// FeatureNetRSS indicates that the device supports RSS (receive-side
	// scaling) with configurable hash parameters.
	FeatureNetRSS Feature = 1 << 60
	// FeatureNetRSCExt indicates that the device can process duplicated ACKs
	// and report the number of coalesced segments and duplicated ACKs.
	FeatureNetRSCExt Feature = 1 << 61
	// FeatureNetStandby indicates that the device may act as a standby for a
	// primary device with the same MAC address.
	FeatureNetStandby Feature = 1 << 62
	// FeatureNetSpeedDuplex indicates that the device can report link speed and
	// duplex mode.
	FeatureNetSpeedDuplex Feature = 1 << 63
)

View File

@@ -1,77 +0,0 @@
package virtio
import (
"errors"
"unsafe"
"golang.org/x/sys/unix"
)
// Workaround to make Go doc links work.
var _ unix.Errno

// NetHdrSize is the number of bytes needed to store a [NetHdr] in memory.
// It must equal unsafe.Sizeof(NetHdr{}); Encode/Decode copy exactly this
// many bytes.
const NetHdrSize = 12

// ErrNetHdrBufferTooSmall is returned when a buffer is too small to fit a
// virtio_net_hdr.
var ErrNetHdrBufferTooSmall = errors.New("the buffer is too small to fit a virtio_net_hdr")
// NetHdr defines the virtio_net_hdr as described by the virtio specification.
//
// The field order and widths mirror the on-wire C struct: Encode/Decode copy
// the struct's memory directly, so fields must not be reordered or resized.
type NetHdr struct {
	// Flags that describe the packet.
	// Possible values are:
	//   - [unix.VIRTIO_NET_HDR_F_NEEDS_CSUM]
	//   - [unix.VIRTIO_NET_HDR_F_DATA_VALID]
	//   - [unix.VIRTIO_NET_HDR_F_RSC_INFO]
	Flags uint8
	// GSOType contains the type of segmentation offload that should be used for
	// the packet.
	// Possible values are:
	//   - [unix.VIRTIO_NET_HDR_GSO_NONE]
	//   - [unix.VIRTIO_NET_HDR_GSO_TCPV4]
	//   - [unix.VIRTIO_NET_HDR_GSO_UDP]
	//   - [unix.VIRTIO_NET_HDR_GSO_TCPV6]
	//   - [unix.VIRTIO_NET_HDR_GSO_UDP_L4]
	//   - [unix.VIRTIO_NET_HDR_GSO_ECN]
	GSOType uint8
	// HdrLen contains the length of the headers that need to be replicated by
	// segmentation offloads. It's the number of bytes from the beginning of the
	// packet to the beginning of the transport payload.
	// Only used when [FeatureNetDriverHdrLen] is negotiated.
	HdrLen uint16
	// GSOSize contains the maximum size of each segmented packet beyond the
	// header (payload size). In case of TCP, this is the MSS.
	GSOSize uint16
	// CsumStart contains the offset within the packet from which on the
	// checksum should be computed.
	CsumStart uint16
	// CsumOffset specifies how many bytes after [NetHdr.CsumStart] the computed
	// 16-bit checksum should be inserted.
	CsumOffset uint16
	// NumBuffers contains the number of merged descriptor chains when
	// [FeatureNetMergeRXBuffers] is negotiated.
	// This field is only used for packets received by the driver and should be
	// zero for transmitted packets.
	NumBuffers uint16
}
// Decode decodes the [NetHdr] from the given byte slice. The slice must
// contain at least [NetHdrSize] bytes.
func (v *NetHdr) Decode(data []byte) error {
	if len(data) < NetHdrSize {
		return ErrNetHdrBufferTooSmall
	}
	// View the struct's memory as a byte slice and copy the input into it.
	dst := unsafe.Slice((*byte)(unsafe.Pointer(v)), NetHdrSize)
	copy(dst, data[:NetHdrSize])
	return nil
}
// Encode encodes the [NetHdr] into the given byte slice. The slice must have
// room for at least [NetHdrSize] bytes.
func (v *NetHdr) Encode(data []byte) error {
	if len(data) < NetHdrSize {
		return ErrNetHdrBufferTooSmall
	}
	// View the struct's memory as a byte slice and copy it out.
	src := unsafe.Slice((*byte)(unsafe.Pointer(v)), NetHdrSize)
	copy(data[:NetHdrSize], src)
	return nil
}

View File

@@ -1,43 +0,0 @@
package virtio
import (
"testing"
"unsafe"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/sys/unix"
)
func TestNetHdr_Size(t *testing.T) {
assert.EqualValues(t, NetHdrSize, unsafe.Sizeof(NetHdr{}))
}
// TestNetHdr_Encoding round-trips a header through Encode/Decode and checks
// the exact little-endian byte layout.
func TestNetHdr_Encoding(t *testing.T) {
	hdr := NetHdr{
		Flags:      unix.VIRTIO_NET_HDR_F_NEEDS_CSUM,
		GSOType:    unix.VIRTIO_NET_HDR_GSO_UDP_L4,
		HdrLen:     42,
		GSOSize:    1472,
		CsumStart:  34,
		CsumOffset: 6,
		NumBuffers: 16,
	}

	encoded := make([]byte, NetHdrSize)
	require.NoError(t, hdr.Encode(encoded))
	want := []byte{
		0x01, 0x05,
		0x2a, 0x00,
		0xc0, 0x05,
		0x22, 0x00,
		0x06, 0x00,
		0x10, 0x00,
	}
	assert.Equal(t, want, encoded)

	var roundTripped NetHdr
	require.NoError(t, roundTripped.Decode(encoded))
	assert.Equal(t, hdr, roundTripped)
}

587
wgstack/conn/bind_std.go Normal file
View File

@@ -0,0 +1,587 @@
/* SPDX-License-Identifier: MIT
*
* Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
*/
package conn
import (
"context"
"errors"
"fmt"
"net"
"net/netip"
"runtime"
"strconv"
"sync"
"syscall"
"golang.org/x/net/ipv4"
"golang.org/x/net/ipv6"
)
var (
_ Bind = (*StdNetBind)(nil)
)
// StdNetBind implements Bind for all platforms. While Windows has its own Bind
// (see bind_windows.go), it may fall back to StdNetBind.
// TODO: Remove usage of ipv{4,6}.PacketConn when net.UDPConn has comparable
// methods for sending and receiving multiple datagrams per-syscall. See the
// proposal in https://github.com/golang/go/issues/45886#issuecomment-1218301564.
type StdNetBind struct {
	mu   sync.Mutex // protects all fields except as specified
	ipv4 *net.UDPConn
	ipv6 *net.UDPConn
	// Batched packet conns wrapping the sockets above.
	ipv4PC *ipv4.PacketConn // will be nil on non-Linux
	ipv6PC *ipv6.PacketConn // will be nil on non-Linux
	// Whether UDP GSO (Tx) / GRO (Rx) offload is usable, per address family.
	ipv4TxOffload bool
	ipv4RxOffload bool
	ipv6TxOffload bool
	ipv6RxOffload bool

	// udpAddrPool and msgsPool are not guarded by mu; sync.Pool is safe for
	// concurrent use.
	udpAddrPool sync.Pool
	msgsPool    sync.Pool

	blackhole4 bool
	blackhole6 bool
	// q is the queue index passed through to listenNet/listenConfig.
	q int
}
// NewStdNetBind creates a bind that listens on all interfaces.
func NewStdNetBind() *StdNetBind {
	b := newStdNetBind()
	return b.(*StdNetBind)
}
// NewStdNetBindForAddr creates a bind intended to listen on a specific
// address.
//
// NOTE(review): addr and reusePort are currently ignored — the bind still
// listens with dual-stack wildcard defaults. Only the queue index q is
// recorded; it is consumed later by listenNet. Address-specific binding and
// SO_REUSEPORT handling remain to be implemented.
func NewStdNetBindForAddr(addr netip.Addr, reusePort bool, q int) *StdNetBind {
	b := NewStdNetBind()
	b.q = q
	// Explicitly discard the unused parameters until they are wired up, so
	// the intent is visible to linters and readers.
	_ = addr
	_ = reusePort
	return b
}
// newStdNetBind builds a StdNetBind with its address and message pools
// initialized.
func newStdNetBind() Bind {
	b := &StdNetBind{}
	b.udpAddrPool = sync.Pool{
		New: func() any {
			return &net.UDPAddr{IP: make([]byte, 16)}
		},
	}
	b.msgsPool = sync.Pool{
		New: func() any {
			// ipv6.Message and ipv4.Message are interchangeable as they are
			// both aliases for x/net/internal/socket.Message.
			msgs := make([]ipv6.Message, IdealBatchSize)
			for i := range msgs {
				msgs[i].Buffers = make(net.Buffers, 1)
				msgs[i].OOB = make([]byte, 0, stickyControlSize+gsoControlSize)
			}
			return &msgs
		},
	}
	return b
}
// StdNetEndpoint is the Endpoint implementation used by StdNetBind.
type StdNetEndpoint struct {
	// AddrPort is the endpoint destination.
	netip.AddrPort
	// src is the current sticky source address and interface index, if
	// supported. Typically this is a PKTINFO structure from/for control
	// messages, see unix.PKTINFO for an example.
	src []byte
}
// Compile-time interface satisfaction checks. Using typed nil pointers avoids
// an allocation at package init.
var (
	_ Bind     = (*StdNetBind)(nil)
	_ Endpoint = (*StdNetEndpoint)(nil)
)
// ParseEndpoint creates a new endpoint from an "ip:port" string.
func (*StdNetBind) ParseEndpoint(s string) (Endpoint, error) {
	ap, err := netip.ParseAddrPort(s)
	if err != nil {
		return nil, err
	}
	return &StdNetEndpoint{AddrPort: ap}, nil
}
// ClearSrc drops the sticky source address while keeping its allocation for
// reuse.
func (e *StdNetEndpoint) ClearSrc() {
	if e.src == nil {
		return
	}
	// Truncate src, no need to reallocate.
	e.src = e.src[:0]
}
// DstIP returns the destination IP of the endpoint.
func (e *StdNetEndpoint) DstIP() netip.Addr { return e.AddrPort.Addr() }
// See control_default,linux, etc for implementations of SrcIP and SrcIfidx.

// DstToBytes returns the destination in binary form, used for mac2 cookie
// calculations.
func (e *StdNetEndpoint) DstToBytes() []byte {
	buf, _ := e.AddrPort.MarshalBinary()
	return buf
}
// DstToString renders the destination as "ip:port".
func (e *StdNetEndpoint) DstToString() string { return e.AddrPort.String() }
// listenNet opens a UDP listener for the given network ("udp4"/"udp6") on
// port, applying the platform socket options from listenConfig(q). It returns
// the connection and the actual bound port (useful when port is 0).
//
// For queue 0 it also attaches the SO_ATTACH_REUSEPORT_EBPF program through
// the captured fd (EvilFdZero) so the kernel can steer packets across the
// SO_REUSEPORT group.
func listenNet(network string, port int, q int) (*net.UDPConn, int, error) {
	lc := listenConfig(q)
	conn, err := lc.ListenPacket(context.Background(), network, ":"+strconv.Itoa(port))
	if err != nil {
		return nil, 0, err
	}
	if q == 0 {
		if EvilFdZero == 0 {
			// listenConfig's control hook is expected to capture the fd for
			// queue 0; reaching this point means that wiring is broken.
			// Return an error instead of panicking so callers can clean up.
			conn.Close()
			return nil, 0, errors.New("listenNet: fd for queue 0 was not captured")
		}
		if err := reusePortHax(EvilFdZero); err != nil {
			conn.Close()
			return nil, 0, fmt.Errorf("reuse port hax: %w", err)
		}
	}
	// Retrieve the actual bound port.
	laddr := conn.LocalAddr()
	uaddr, err := net.ResolveUDPAddr(
		laddr.Network(),
		laddr.String(),
	)
	if err != nil {
		conn.Close()
		return nil, 0, err
	}
	return conn.(*net.UDPConn), uaddr.Port, nil
}
// Open binds the IPv4 and IPv6 sockets on the requested port (0 picks a
// random port shared by both families) and returns one ReceiveFunc per
// successfully opened socket plus the actual port.
func (s *StdNetBind) Open(uport uint16) ([]ReceiveFunc, uint16, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	var err error
	var tries int
	if s.ipv4 != nil || s.ipv6 != nil {
		return nil, 0, ErrBindAlreadyOpen
	}
	// Attempt to open ipv4 and ipv6 listeners on the same port.
	// If uport is 0, we can retry on failure.
again:
	port := int(uport)
	var v4conn, v6conn *net.UDPConn
	var v4pc *ipv4.PacketConn
	var v6pc *ipv6.PacketConn
	// EAFNOSUPPORT is tolerated so a single-stack host still works.
	v4conn, port, err = listenNet("udp4", port, s.q)
	if err != nil && !errors.Is(err, syscall.EAFNOSUPPORT) {
		return nil, 0, err
	}
	// Listen on the same port as we're using for ipv4.
	v6conn, port, err = listenNet("udp6", port, s.q)
	// A random port free for v4 may already be taken for v6; retry the whole
	// pair with a fresh random port, up to 100 times.
	if uport == 0 && errors.Is(err, syscall.EADDRINUSE) && tries < 100 {
		v4conn.Close()
		tries++
		goto again
	}
	if err != nil && !errors.Is(err, syscall.EAFNOSUPPORT) {
		v4conn.Close()
		return nil, 0, err
	}
	var fns []ReceiveFunc
	if v4conn != nil {
		s.ipv4TxOffload, s.ipv4RxOffload = supportsUDPOffload(v4conn)
		// Batched I/O via ipv4.PacketConn is only wired up on Linux/Android.
		if runtime.GOOS == "linux" || runtime.GOOS == "android" {
			v4pc = ipv4.NewPacketConn(v4conn)
			s.ipv4PC = v4pc
		}
		fns = append(fns, s.makeReceiveIPv4(v4pc, v4conn, s.ipv4RxOffload))
		s.ipv4 = v4conn
	}
	if v6conn != nil {
		s.ipv6TxOffload, s.ipv6RxOffload = supportsUDPOffload(v6conn)
		if runtime.GOOS == "linux" || runtime.GOOS == "android" {
			v6pc = ipv6.NewPacketConn(v6conn)
			s.ipv6PC = v6pc
		}
		fns = append(fns, s.makeReceiveIPv6(v6pc, v6conn, s.ipv6RxOffload))
		s.ipv6 = v6conn
	}
	// Neither family could be opened at all.
	if len(fns) == 0 {
		return nil, 0, syscall.EAFNOSUPPORT
	}
	return fns, uint16(port), nil
}
// putMessages scrubs per-message state and returns the slice to the pool,
// keeping the Buffers and OOB allocations for reuse.
func (s *StdNetBind) putMessages(msgs *[]ipv6.Message) {
	for i := range *msgs {
		m := &(*msgs)[i]
		m.OOB = m.OOB[:0]
		*m = ipv6.Message{Buffers: m.Buffers, OOB: m.OOB}
	}
	s.msgsPool.Put(msgs)
}
// getMessages takes a message slice (length IdealBatchSize) from the pool;
// callers must return it with putMessages.
func (s *StdNetBind) getMessages() *[]ipv6.Message {
	return s.msgsPool.Get().(*[]ipv6.Message)
}
var (
	// Compile-time guarantee that ipv4.Message and ipv6.Message stay the same
	// underlying type (both alias x/net/internal/socket.Message); the shared
	// message pool relies on this interchangeability.
	// If compilation fails here these are no longer the same underlying type.
	_ ipv6.Message = ipv4.Message{}
)
// batchReader abstracts the batched read method shared by ipv4.PacketConn and
// ipv6.PacketConn.
type batchReader interface {
	ReadBatch([]ipv6.Message, int) (int, error)
}

// batchWriter abstracts the batched write method shared by ipv4.PacketConn
// and ipv6.PacketConn.
type batchWriter interface {
	WriteBatch([]ipv6.Message, int) (int, error)
}
// receiveIP performs one receive on either address family, filling
// bufs/sizes/eps and returning the number of messages. When rxOffload is set
// the kernel may deliver GRO-coalesced datagrams, which are split back into
// individual packets before returning.
func (s *StdNetBind) receiveIP(
	br batchReader,
	conn *net.UDPConn,
	rxOffload bool,
	bufs [][]byte,
	sizes []int,
	eps []Endpoint,
) (n int, err error) {
	msgs := s.getMessages()
	for i := range bufs {
		(*msgs)[i].Buffers[0] = bufs[i]
		(*msgs)[i].OOB = (*msgs)[i].OOB[:cap((*msgs)[i].OOB)]
	}
	defer s.putMessages(msgs)
	var numMsgs int
	if runtime.GOOS == "linux" || runtime.GOOS == "android" {
		if rxOffload {
			// Read into the tail of msgs, leaving headroom at the front so
			// splitCoalescedMessages can fan segments out into earlier slots.
			readAt := len(*msgs) - (IdealBatchSize / udpSegmentMaxDatagrams)
			numMsgs, err = br.ReadBatch((*msgs)[readAt:], 0)
			if err != nil {
				return 0, err
			}
			numMsgs, err = splitCoalescedMessages(*msgs, readAt, getGSOSize)
			if err != nil {
				return 0, err
			}
		} else {
			numMsgs, err = br.ReadBatch(*msgs, 0)
			if err != nil {
				return 0, err
			}
		}
	} else {
		// Non-Linux: single-message read path.
		msg := &(*msgs)[0]
		msg.N, msg.NN, _, msg.Addr, err = conn.ReadMsgUDP(msg.Buffers[0], msg.OOB)
		if err != nil {
			return 0, err
		}
		numMsgs = 1
	}
	for i := 0; i < numMsgs; i++ {
		msg := &(*msgs)[i]
		sizes[i] = msg.N
		if sizes[i] == 0 {
			continue
		}
		addrPort := msg.Addr.(*net.UDPAddr).AddrPort()
		ep := &StdNetEndpoint{AddrPort: addrPort} // TODO: remove allocation
		// Capture the sticky source (PKTINFO) so replies go out the same
		// local address/interface the packet arrived on.
		getSrcFromControl(msg.OOB[:msg.NN], ep)
		eps[i] = ep
	}
	return numMsgs, nil
}
// makeReceiveIPv4 wraps the IPv4 socket pair in a ReceiveFunc closure.
func (s *StdNetBind) makeReceiveIPv4(pc *ipv4.PacketConn, conn *net.UDPConn, rxOffload bool) ReceiveFunc {
	return func(bufs [][]byte, sizes []int, eps []Endpoint) (int, error) {
		return s.receiveIP(pc, conn, rxOffload, bufs, sizes, eps)
	}
}
// makeReceiveIPv6 wraps the IPv6 socket pair in a ReceiveFunc closure.
func (s *StdNetBind) makeReceiveIPv6(pc *ipv6.PacketConn, conn *net.UDPConn, rxOffload bool) ReceiveFunc {
	return func(bufs [][]byte, sizes []int, eps []Endpoint) (int, error) {
		return s.receiveIP(pc, conn, rxOffload, bufs, sizes, eps)
	}
}
// BatchSize reports how many buffers callers should pass per send/receive.
// TODO: When all Binds handle IdealBatchSize, remove this dynamic function and
// rename the IdealBatchSize constant to BatchSize.
func (s *StdNetBind) BatchSize() int {
	switch runtime.GOOS {
	case "linux", "android":
		return IdealBatchSize
	}
	return 1
}
// Close tears down both sockets and resets all offload/blackhole state. If
// both closes fail, the IPv4 error takes precedence.
func (s *StdNetBind) Close() error {
	s.mu.Lock()
	defer s.mu.Unlock()

	var errV4, errV6 error
	if s.ipv4 != nil {
		errV4 = s.ipv4.Close()
		s.ipv4, s.ipv4PC = nil, nil
	}
	if s.ipv6 != nil {
		errV6 = s.ipv6.Close()
		s.ipv6, s.ipv6PC = nil, nil
	}

	s.blackhole4, s.blackhole6 = false, false
	s.ipv4TxOffload, s.ipv4RxOffload = false, false
	s.ipv6TxOffload, s.ipv6RxOffload = false, false

	if errV4 != nil {
		return errV4
	}
	return errV6
}
// ErrUDPGSODisabled is returned after UDP GSO was disabled on a socket
// because the kernel/NIC rejected an offloaded send; it wraps the result of
// the non-GSO retry.
type ErrUDPGSODisabled struct {
	// onLaddr is the local address of the socket GSO was disabled on.
	onLaddr string
	// RetryErr is the error, if any, from resending without GSO.
	RetryErr error
}
// Error implements the error interface.
func (e ErrUDPGSODisabled) Error() string {
	return "disabled UDP GSO on " + e.onLaddr + ", NIC(s) may not support checksum offload"
}
// Unwrap exposes the retry error for errors.Is / errors.As.
func (e ErrUDPGSODisabled) Unwrap() error { return e.RetryErr }
// Send transmits bufs to endpoint. It selects the socket by address family,
// attempts a GSO-coalesced send when TX offload is enabled, and transparently
// disables GSO and retries once if the kernel rejects the offloaded send.
func (s *StdNetBind) Send(bufs [][]byte, endpoint Endpoint) error {
	// Snapshot the family-specific state under the lock.
	s.mu.Lock()
	blackhole := s.blackhole4
	conn := s.ipv4
	offload := s.ipv4TxOffload
	br := batchWriter(s.ipv4PC)
	is6 := false
	if endpoint.DstIP().Is6() {
		blackhole = s.blackhole6
		conn = s.ipv6
		br = s.ipv6PC
		is6 = true
		offload = s.ipv6TxOffload
	}
	s.mu.Unlock()
	// Blackholed families silently drop; missing sockets are an error.
	if blackhole {
		return nil
	}
	if conn == nil {
		return syscall.EAFNOSUPPORT
	}
	msgs := s.getMessages()
	defer s.putMessages(msgs)
	// Build the destination *net.UDPAddr from a pooled allocation.
	ua := s.udpAddrPool.Get().(*net.UDPAddr)
	defer s.udpAddrPool.Put(ua)
	if is6 {
		as16 := endpoint.DstIP().As16()
		copy(ua.IP, as16[:])
		ua.IP = ua.IP[:16]
	} else {
		as4 := endpoint.DstIP().As4()
		copy(ua.IP, as4[:])
		ua.IP = ua.IP[:4]
	}
	ua.Port = int(endpoint.(*StdNetEndpoint).Port())
	var (
		retried bool
		err     error
	)
retry:
	if offload {
		// Coalesce bufs into fewer GSO messages before sending.
		n := coalesceMessages(ua, endpoint.(*StdNetEndpoint), bufs, *msgs, setGSOSize)
		err = s.send(conn, br, (*msgs)[:n])
		if err != nil && offload && errShouldDisableUDPGSO(err) {
			// The kernel/NIC refused the GSO send: permanently disable TX
			// offload for this family and resend without coalescing.
			offload = false
			s.mu.Lock()
			if is6 {
				s.ipv6TxOffload = false
			} else {
				s.ipv4TxOffload = false
			}
			s.mu.Unlock()
			retried = true
			goto retry
		}
	} else {
		// One message per buf; attach the sticky source control message.
		for i := range bufs {
			(*msgs)[i].Addr = ua
			(*msgs)[i].Buffers[0] = bufs[i]
			setSrcControl(&(*msgs)[i].OOB, endpoint.(*StdNetEndpoint))
		}
		err = s.send(conn, br, (*msgs)[:len(bufs)])
	}
	if retried {
		return ErrUDPGSODisabled{onLaddr: conn.LocalAddr().String(), RetryErr: err}
	}
	return err
}
// send transmits msgs, using batched writes on Linux/Android and per-message
// WriteMsgUDP elsewhere.
func (s *StdNetBind) send(conn *net.UDPConn, pc batchWriter, msgs []ipv6.Message) error {
	if runtime.GOOS == "linux" || runtime.GOOS == "android" {
		sent := 0
		for {
			n, err := pc.WriteBatch(msgs[sent:], 0)
			if err != nil {
				return err
			}
			sent += n
			if sent >= len(msgs) {
				return nil
			}
		}
	}
	for _, msg := range msgs {
		if _, _, err := conn.WriteMsgUDP(msg.Buffers[0], msg.OOB, msg.Addr.(*net.UDPAddr)); err != nil {
			return err
		}
	}
	return nil
}
const (
	// Exceeding these values results in EMSGSIZE. They account for layer3 and
	// layer4 headers (20-byte IPv4 header + 8-byte UDP header). IPv6 does not
	// need to account for itself as the payload length field is self
	// excluding.
	maxIPv4PayloadLen = 1<<16 - 1 - 20 - 8
	maxIPv6PayloadLen = 1<<16 - 1 - 8
	// This is a hard limit imposed by the kernel.
	udpSegmentMaxDatagrams = 64
)
// setGSOFunc writes a UDP GSO segment size into a control-message buffer.
type setGSOFunc func(control *[]byte, gsoSize uint16)
// coalesceMessages packs bufs (all destined for ep) into as few messages as
// possible for UDP GSO: consecutive payloads no larger than the batch's
// segment size are appended into one buffer, which is tagged with the segment
// size via setGSO. It returns the number of msgs entries populated.
func coalesceMessages(addr *net.UDPAddr, ep *StdNetEndpoint, bufs [][]byte, msgs []ipv6.Message, setGSO setGSOFunc) int {
	var (
		base     = -1 // index of msg we are currently coalescing into
		gsoSize  int  // segmentation size of msgs[base]
		dgramCnt int  // number of dgrams coalesced into msgs[base]
		endBatch bool // tracking flag to start a new batch on next iteration of bufs
	)
	maxPayloadLen := maxIPv4PayloadLen
	if ep.DstIP().Is6() {
		maxPayloadLen = maxIPv6PayloadLen
	}
	for i, buf := range bufs {
		if i > 0 {
			msgLen := len(buf)
			baseLenBefore := len(msgs[base].Buffers[0])
			freeBaseCap := cap(msgs[base].Buffers[0]) - baseLenBefore
			// buf may join the current batch only if it fits the payload and
			// capacity limits, is not larger than the batch's segment size,
			// stays under the kernel's datagram cap, and the batch was not
			// already terminated by a short tail.
			if msgLen+baseLenBefore <= maxPayloadLen &&
				msgLen <= gsoSize &&
				msgLen <= freeBaseCap &&
				dgramCnt < udpSegmentMaxDatagrams &&
				!endBatch {
				msgs[base].Buffers[0] = append(msgs[base].Buffers[0], buf...)
				if i == len(bufs)-1 {
					setGSO(&msgs[base].OOB, uint16(gsoSize))
				}
				dgramCnt++
				if msgLen < gsoSize {
					// A smaller than gsoSize packet on the tail is legal, but
					// it must end the batch.
					endBatch = true
				}
				continue
			}
		}
		// Only multi-datagram batches need the GSO control message.
		if dgramCnt > 1 {
			setGSO(&msgs[base].OOB, uint16(gsoSize))
		}
		// Reset prior to incrementing base since we are preparing to start a
		// new potential batch.
		endBatch = false
		base++
		gsoSize = len(buf)
		setSrcControl(&msgs[base].OOB, ep)
		msgs[base].Buffers[0] = buf
		msgs[base].Addr = addr
		dgramCnt = 1
	}
	return base + 1
}
// getGSOFunc extracts the UDP GRO segment size from a control-message buffer.
type getGSOFunc func(control []byte) (int, error)
// splitCoalescedMessages expands GRO-coalesced messages (read starting at
// firstMsgAt) back into individual datagrams at the front of msgs, using the
// GSO size extracted from each message's control data. It returns the total
// number of datagrams produced.
func splitCoalescedMessages(msgs []ipv6.Message, firstMsgAt int, getGSO getGSOFunc) (n int, err error) {
	for i := firstMsgAt; i < len(msgs); i++ {
		msg := &msgs[i]
		// A zero-length message marks the end of the received batch.
		if msg.N == 0 {
			return n, err
		}
		var (
			gsoSize    int
			start      int
			end        = msg.N
			numToSplit = 1
		)
		gsoSize, err = getGSO(msg.OOB[:msg.NN])
		if err != nil {
			return n, err
		}
		// gsoSize > 0 means the kernel coalesced several segments of that
		// size into this message; compute how many pieces to fan out.
		if gsoSize > 0 {
			numToSplit = (msg.N + gsoSize - 1) / gsoSize
			end = gsoSize
		}
		for j := 0; j < numToSplit; j++ {
			// Splitting must never produce more output datagrams than there
			// are slots ahead of the source message.
			if n > i {
				return n, errors.New("splitting coalesced packet resulted in overflow")
			}
			copied := copy(msgs[n].Buffers[0], msg.Buffers[0][start:end])
			msgs[n].N = copied
			msgs[n].Addr = msg.Addr
			start = end
			end += gsoSize
			if end > msg.N {
				end = msg.N
			}
			n++
		}
		if i != n-1 {
			// It is legal for bytes to move within msg.Buffers[0] as a result
			// of splitting, so we only zero the source msg len when it is not
			// the destination of the last split operation above.
			msg.N = 0
		}
	}
	return n, nil
}

131
wgstack/conn/conn.go Normal file
View File

@@ -0,0 +1,131 @@
// SPDX-License-Identifier: MIT
//
// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
package conn
import (
"errors"
"fmt"
"net/netip"
"reflect"
"runtime"
"strings"
)
const (
	// IdealBatchSize is the maximum number of packets handled per read and
	// write; batching Binds size their message pools to this.
	IdealBatchSize = 128
)

// A ReceiveFunc receives at least one packet from the network and writes them
// into packets. On a successful read it returns the number of elements of
// sizes, packets, and endpoints that should be evaluated. Some elements of
// sizes may be zero, and callers should ignore them. Callers must pass a sizes
// and eps slice with a length greater than or equal to the length of packets.
// These lengths must not exceed the length of the associated Bind.BatchSize().
type ReceiveFunc func(packets [][]byte, sizes []int, eps []Endpoint) (n int, err error)
// A Bind listens on a port for both IPv6 and IPv4 UDP traffic.
//
// A Bind interface may also be a PeekLookAtSocketFd or BindSocketToInterface,
// depending on the platform-specific implementation.
type Bind interface {
	// Open puts the Bind into a listening state on a given port and reports the actual
	// port that it bound to. Passing zero results in a random selection.
	// fns is the set of functions that will be called to receive packets.
	Open(port uint16) (fns []ReceiveFunc, actualPort uint16, err error)

	// Close closes the Bind listener.
	// All fns returned by Open must return net.ErrClosed after a call to Close.
	Close() error

	// SetMark sets the mark for each packet sent through this Bind.
	// This mark is passed to the kernel as the socket option SO_MARK.
	SetMark(mark uint32) error

	// Send writes one or more packets in bufs to address ep. The length of
	// bufs must not exceed BatchSize().
	Send(bufs [][]byte, ep Endpoint) error

	// ParseEndpoint creates a new endpoint from a string.
	ParseEndpoint(s string) (Endpoint, error)

	// BatchSize is the number of buffers expected to be passed to
	// the ReceiveFuncs, and the maximum expected to be passed to SendBatch.
	BatchSize() int
}
// BindSocketToInterface is implemented by Bind objects that support being
// tied to a single network interface. Used by wireguard-windows.
type BindSocketToInterface interface {
	BindSocketToInterface4(interfaceIndex uint32, blackhole bool) error
	BindSocketToInterface6(interfaceIndex uint32, blackhole bool) error
}

// PeekLookAtSocketFd is implemented by Bind objects that support having their
// file descriptor peeked at. Used by wireguard-android.
type PeekLookAtSocketFd interface {
	PeekLookAtSocketFd4() (fd int, err error)
	PeekLookAtSocketFd6() (fd int, err error)
}
// An Endpoint maintains the source/destination caching for a peer.
//
//	dst: the remote address of a peer ("endpoint" in uapi terminology)
//	src: the local address from which datagrams originate going to the peer
type Endpoint interface {
	ClearSrc()           // clears the source address
	SrcToString() string // returns the local source address (ip:port)
	DstToString() string // returns the destination address (ip:port)
	DstToBytes() []byte  // used for mac2 cookie calculations
	DstIP() netip.Addr   // destination IP only
	SrcIP() netip.Addr   // local source IP only
}
var (
	// ErrBindAlreadyOpen is returned by Open when the bind is already listening.
	ErrBindAlreadyOpen = errors.New("bind is already open")
	// ErrWrongEndpointType is returned when an Endpoint of a different
	// concrete type than the Bind expects is supplied.
	ErrWrongEndpointType = errors.New("endpoint type does not correspond with bind type")
)
// PrettyName derives a short human-readable name for a ReceiveFunc from its
// runtime function name: it strips the package path and generated closure
// suffixes (".funcNN"), and maps names ending in IPv4/IPv6 to "v4"/"v6".
func (fn ReceiveFunc) PrettyName() string {
	name := runtime.FuncForPC(reflect.ValueOf(fn).Pointer()).Name()
	// 0. cheese/taco.beansIPv6.func12.func21218-fm
	name = strings.TrimSuffix(name, "-fm")
	// 1. cheese/taco.beansIPv6.func12.func21218
	if idx := strings.LastIndexByte(name, '/'); idx != -1 {
		name = name[idx+1:]
		// 2. taco.beansIPv6.func12.func21218
	}
	// Repeatedly strip one trailing ".func<digits>" segment per iteration.
	for {
		var idx int
		// Walk back over trailing digits.
		for idx = len(name) - 1; idx >= 0; idx-- {
			if name[idx] < '0' || name[idx] > '9' {
				break
			}
		}
		// No trailing digits at all: nothing more to strip.
		if idx == len(name)-1 {
			break
		}
		const dotFunc = ".func"
		if !strings.HasSuffix(name[:idx+1], dotFunc) {
			break
		}
		name = name[:idx+1-len(dotFunc)]
		// 3. taco.beansIPv6.func12
		// 4. taco.beansIPv6
	}
	if idx := strings.LastIndexByte(name, '.'); idx != -1 {
		name = name[idx+1:]
		// 5. beansIPv6
	}
	// Fall back to the function's address when nothing readable remains.
	if name == "" {
		return fmt.Sprintf("%p", fn)
	}
	if strings.HasSuffix(name, "IPv4") {
		return "v4"
	}
	if strings.HasSuffix(name, "IPv6") {
		return "v6"
	}
	return name
}

222
wgstack/conn/controlfns.go Normal file
View File

@@ -0,0 +1,222 @@
// SPDX-License-Identifier: MIT
//
// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
package conn
import (
"fmt"
"net"
"syscall"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
)
// UDP socket read/write buffer size (7MB). The value of 7MB is chosen as it is
// the max supported by a default configuration of macOS. Some platforms will
// silently clamp the value to other maximums, such as linux clamping to
// net.core.{r,w}mem_max (see _linux.go for additional implementation that works
// around this limitation)
const socketBufferSize = 7 << 20
// controlFn is the callback function signature from net.ListenConfig.Control.
// It is used to apply platform specific configuration to the socket prior to
// bind.
type controlFn func(network, address string, c syscall.RawConn) error

// controlFns is a list of functions that are called from the listen config
// that can apply socket options. Platform-specific files append to this list.
var controlFns = []controlFn{}
// SO_ATTACH_REUSEPORT_EBPF is the SOL_SOCKET option used to attach an eBPF
// program that selects the receiving socket within a SO_REUSEPORT group
// (value 52 from the Linux UAPI; not exported by the syscall package).
const SO_ATTACH_REUSEPORT_EBPF = 52
// createReuseportProgram builds the eBPF program attached via
// SO_ATTACH_REUSEPORT_EBPF to pick a socket within the SO_REUSEPORT group.
//
// NOTE(review): the current program is debug-only — it writes a short marker
// string to the trace buffer via bpf_trace_printk and then returns 0, which
// steers every packet to socket index 0. Real fan-out (e.g. hash-based
// selection from sk_reuseport_md) still needs to be implemented.
func createReuseportProgram() (*ebpf.Program, error) {
	instructions := asm.Instructions{
		// R6 = ctx (sk_reuseport_md pointer), saved across the helper call.
		asm.Mov.Reg(asm.R6, asm.R1),

		// Build the marker string on the stack and point R1 at it.
		asm.Mov.Reg(asm.R1, asm.R10),                     // R1 = frame pointer
		asm.Add.Imm(asm.R1, -16),                         // R1 = stack slot for string
		asm.StoreImm(asm.R1, 0, 0x2066706220, asm.DWord), // "bpf "
		asm.StoreImm(asm.R1, 8, 0x0a21, asm.DWord),       // "!\n"

		// Call bpf_trace_printk(fmt, fmt_size); R1 already holds fmt.
		asm.Mov.Imm(asm.R2, 16), // R2 = format size
		asm.Call.Label("bpf_printk"),

		// Return 0: select socket 0 (debug behavior).
		asm.Mov.Imm(asm.R0, 0),
		asm.Return(),
	}
	return ebpf.NewProgram(&ebpf.ProgramSpec{
		Type:         ebpf.SkReuseport,
		Instructions: instructions,
		License:      "GPL",
	})
}
//func createReuseportProgram() (*ebpf.Program, error) {
// // Try offset 20 (common in newer kernels)
// instructions := asm.Instructions{
// asm.LoadMem(asm.R0, asm.R1, 20, asm.Word),
// asm.Return(),
// }
//
// prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
// Type: ebpf.SkReuseport,
// Instructions: instructions,
// License: "GPL",
// })
//
// return prog, err
//}
// reusePortHax builds the SO_REUSEPORT selection eBPF program and attaches it
// to the reuseport group that fd belongs to.
//
// Once attached via setsockopt, the kernel holds its own reference to the
// program, so our local handle can be closed without detaching it; the
// original leaked the program fd here (note the commented-out Close).
func reusePortHax(fd uintptr) error {
	prog, err := createReuseportProgram()
	if err != nil {
		return fmt.Errorf("failed to create eBPF program: %w", err)
	}
	// Safe to close after attach: the socket keeps the program alive.
	defer prog.Close()
	if err := syscall.SetsockoptInt(int(fd), syscall.SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, prog.FD()); err != nil {
		return fmt.Errorf("attaching reuseport eBPF program: %w", err)
	}
	return nil
}
var EvilFdZero uintptr
// listenConfig returns a net.ListenConfig that applies the controlFns to the
// socket prior to bind. This is used to apply socket buffer sizing and packet
// information OOB configuration for sticky sockets.
//
// q is the receive-queue index; for queue 0 the raw fd is additionally
// captured into EvilFdZero for later use.
func listenConfig(q int) *net.ListenConfig {
	return &net.ListenConfig{
		Control: func(network, address string, c syscall.RawConn) error {
			// Platform socket options run first; any failure aborts the bind.
			for _, fn := range controlFns {
				if err := fn(network, address, c); err != nil {
					return err
				}
			}
			if q == 0 {
				// Control only fails when the conn is closed; surface that
				// instead of silently losing the fd (was previously ignored).
				if err := c.Control(func(fd uintptr) {
					EvilFdZero = fd
				}); err != nil {
					return err
				}
			}
			return nil
		},
	}
}

View File

@@ -0,0 +1,66 @@
//go:build linux
// SPDX-License-Identifier: MIT
//
// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
package conn
import (
"fmt"
"runtime"
"syscall"
"golang.org/x/sys/unix"
)
// init registers the Linux-specific socket control functions applied to every
// UDP socket before bind (consumed by listenConfig via controlFns).
func init() {
	controlFns = append(controlFns,
		// Attempt to set the socket buffer size beyond net.core.{r,w}mem_max by
		// using SO_*BUFFORCE. This requires CAP_NET_ADMIN, and is allowed here to
		// fail silently - the result of failure is lower performance on very fast
		// links or high latency links.
		func(network, address string, c syscall.RawConn) error {
			return c.Control(func(fd uintptr) {
				// Set up to *mem_max
				_ = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_RCVBUF, socketBufferSize)
				_ = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_SNDBUF, socketBufferSize)
				// Set beyond *mem_max if CAP_NET_ADMIN
				_ = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_RCVBUFFORCE, socketBufferSize)
				_ = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_SNDBUFFORCE, socketBufferSize)
				// NOTE(review): the three experimental options below swallow
				// their errors — confirm REUSEPORT/UDP_GRO/UDP_SEGMENT are
				// intended to be best-effort before shipping.
				_ = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_REUSEPORT, 1) //todo!!!
				_ = unix.SetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_GRO, 1) //todo!!!
				_ = unix.SetsockoptInt(int(fd), unix.SOL_UDP, unix.UDP_SEGMENT, 0xffff) //todo!!!
				//print(err.Error())
			})
		},
		// Enable receiving of the packet information (IP_PKTINFO for IPv4,
		// IPV6_PKTINFO for IPv6) that is used to implement sticky socket support.
		func(network, address string, c syscall.RawConn) error {
			var err error
			switch network {
			case "udp4":
				if runtime.GOOS != "android" {
					// NOTE(review): the error returned by c.Control itself is
					// discarded here; only the setsockopt error is surfaced.
					c.Control(func(fd uintptr) {
						err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_PKTINFO, 1)
					})
				}
			case "udp6":
				c.Control(func(fd uintptr) {
					if runtime.GOOS != "android" {
						err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_RECVPKTINFO, 1)
						if err != nil {
							return
						}
					}
					// Restrict the socket to IPv6 only (no v4-mapped traffic).
					err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_V6ONLY, 1)
				})
			default:
				err = fmt.Errorf("unhandled network: %s: %w", network, unix.EINVAL)
			}
			return err
		},
	)
}

9
wgstack/conn/default.go Normal file
View File

@@ -0,0 +1,9 @@
//go:build !windows
// SPDX-License-Identifier: MIT
//
// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
package conn
func NewDefaultBind() Bind { return NewStdNetBind() }

View File

@@ -0,0 +1,12 @@
//go:build !linux
/* SPDX-License-Identifier: MIT
*
* Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
*/
package conn
// errShouldDisableUDPGSO reports whether err indicates UDP GSO must be
// disabled. Non-Linux platforms never use UDP GSO, so this is always false.
func errShouldDisableUDPGSO(err error) bool {
	return false
}

View File

@@ -0,0 +1,26 @@
/* SPDX-License-Identifier: MIT
*
* Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
*/
package conn
import (
"errors"
"os"
"golang.org/x/sys/unix"
)
// errShouldDisableUDPGSO reports whether err is the specific syscall failure
// (EIO) that udp_send_skb() returns when the device driver lacks tx
// checksumming, a hard requirement of UDP_SEGMENT. See:
// https://git.kernel.org/pub/scm/docs/man-pages/man-pages.git/tree/man7/udp.7?id=806eabd74910447f21005160e90957bde4db0183#n228
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/net/ipv4/udp.c?h=v6.2&id=c9c3395d5e3dcc6daee66c6908354d47bf98cb0c#n942
func errShouldDisableUDPGSO(err error) bool {
	var sysErr *os.SyscallError
	if !errors.As(err, &sysErr) {
		return false
	}
	return sysErr.Err == unix.EIO
}

View File

@@ -0,0 +1,15 @@
//go:build !linux
// +build !linux
/* SPDX-License-Identifier: MIT
*
* Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
*/
package conn
import "net"
// supportsUDPOffload reports whether conn supports UDP segmentation (tx) and
// receive (rx) offload. Always (false, false) on platforms without
// UDP_SEGMENT/UDP_GRO.
func supportsUDPOffload(conn *net.UDPConn) (txOffload, rxOffload bool) {
	return
}

View File

@@ -0,0 +1,33 @@
/* SPDX-License-Identifier: MIT
*
* Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
*/
package conn
import (
"fmt"
"net"
"golang.org/x/sys/unix"
)
// supportsUDPOffload probes conn for UDP_SEGMENT (tx) and UDP_GRO (rx)
// support by reading the socket options back off the raw fd.
// NOTE(review): the fmt.Printf below looks like leftover debug output —
// confirm whether it can be removed (kept here to preserve behavior exactly).
func supportsUDPOffload(conn *net.UDPConn) (txOffload, rxOffload bool) {
	rawConn, err := conn.SyscallConn()
	if err != nil {
		return
	}
	segment := 0
	err = rawConn.Control(func(fd uintptr) {
		segment, err = unix.GetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_SEGMENT)
		txOffload = err == nil
		gro, groErr := unix.GetsockoptInt(int(fd), unix.IPPROTO_UDP, unix.UDP_GRO)
		rxOffload = groErr == nil && gro == 1
	})
	fmt.Printf("%d", segment)
	if err != nil {
		return false, false
	}
	return txOffload, rxOffload
}

View File

@@ -0,0 +1,21 @@
//go:build !linux
/* SPDX-License-Identifier: MIT
*
* Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
*/
package conn
// getGSOSize parses control for UDP_GRO and if found returns its GSO size data.
// Platforms without UDP_GRO have no such cmsg, so this always returns (0, nil).
func getGSOSize(control []byte) (int, error) {
	return 0, nil
}
// setGSOSize sets a UDP_SEGMENT in control based on gsoSize. It is a no-op on
// platforms without UDP_SEGMENT support.
func setGSOSize(control *[]byte, gsoSize uint16) {
}
// gsoControlSize returns the recommended buffer size for pooling sticky and UDP
// offloading control data.
const gsoControlSize = 0

65
wgstack/conn/gso_linux.go Normal file
View File

@@ -0,0 +1,65 @@
//go:build linux
/* SPDX-License-Identifier: MIT
*
* Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
*/
package conn
import (
"fmt"
"unsafe"
"golang.org/x/sys/unix"
)
const (
sizeOfGSOData = 2
)
// getGSOSize walks the socket control messages in control looking for a
// UDP_GRO cmsg and, when present, returns the native-endian uint16 GSO size
// it carries. A missing cmsg yields (0, nil); a malformed cmsg chain yields
// an error.
func getGSOSize(control []byte) (int, error) {
	remaining := control
	for len(remaining) > unix.SizeofCmsghdr {
		hdr, data, next, err := unix.ParseOneSocketControlMessage(remaining)
		if err != nil {
			return 0, fmt.Errorf("error parsing socket control message: %w", err)
		}
		if hdr.Level == unix.SOL_UDP && hdr.Type == unix.UDP_GRO && len(data) >= sizeOfGSOData {
			var gso uint16
			copy(unsafe.Slice((*byte)(unsafe.Pointer(&gso)), sizeOfGSOData), data[:sizeOfGSOData])
			return int(gso), nil
		}
		remaining = next
	}
	return 0, nil
}
// setGSOSize sets a UDP_SEGMENT in control based on gsoSize. It leaves existing
// data in control untouched.
func setGSOSize(control *[]byte, gsoSize uint16) {
	existingLen := len(*control)
	avail := cap(*control) - existingLen
	space := unix.CmsgSpace(sizeOfGSOData)
	// If the caller did not reserve room for the cmsg, silently skip it; the
	// datagram is then sent without UDP_SEGMENT.
	if avail < space {
		return
	}
	// Extend into the unused capacity and build the cmsg header in place.
	*control = (*control)[:cap(*control)]
	gsoControl := (*control)[existingLen:]
	hdr := (*unix.Cmsghdr)(unsafe.Pointer(&(gsoControl)[0]))
	hdr.Level = unix.SOL_UDP
	hdr.Type = unix.UDP_SEGMENT
	hdr.SetLen(unix.CmsgLen(sizeOfGSOData))
	// The payload is the raw native-endian uint16 GSO size.
	copy((gsoControl)[unix.CmsgLen(0):], unsafe.Slice((*byte)(unsafe.Pointer(&gsoSize)), sizeOfGSOData))
	*control = (*control)[:existingLen+space]
}
// gsoControlSize returns the recommended buffer size for pooling UDP
// offloading control data.
var gsoControlSize = unix.CmsgSpace(sizeOfGSOData)

64
wgstack/conn/mark_unix.go Normal file
View File

@@ -0,0 +1,64 @@
//go:build linux || openbsd || freebsd
// SPDX-License-Identifier: MIT
//
// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
package conn
import (
"runtime"
"golang.org/x/sys/unix"
)
var fwmarkIoctl int
// init selects the per-OS socket option number used by SetMark to tag
// sockets with a routing mark. A zero value means the platform has no such
// option and SetMark becomes a no-op.
func init() {
	switch runtime.GOOS {
	case "linux", "android":
		fwmarkIoctl = 36 /* unix.SO_MARK */
	case "freebsd":
		fwmarkIoctl = 0x1015 /* unix.SO_USER_COOKIE */
	case "openbsd":
		fwmarkIoctl = 0x1021 /* unix.SO_RTABLE */
	}
}
// SetMark applies the platform routing-mark socket option (SO_MARK on Linux,
// SO_USER_COOKIE on FreeBSD, SO_RTABLE on OpenBSD) to whichever of the IPv4
// and IPv6 sockets exist. It is a no-op on platforms with no mark option.
func (s *StdNetBind) SetMark(mark uint32) error {
	if fwmarkIoctl == 0 {
		return nil
	}
	var sockoptErr error
	if s.ipv4 != nil {
		rc, err := s.ipv4.SyscallConn()
		if err != nil {
			return err
		}
		ctrlErr := rc.Control(func(fd uintptr) {
			sockoptErr = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, fwmarkIoctl, int(mark))
		})
		if ctrlErr != nil {
			return ctrlErr
		}
		if sockoptErr != nil {
			return sockoptErr
		}
	}
	if s.ipv6 != nil {
		rc, err := s.ipv6.SyscallConn()
		if err != nil {
			return err
		}
		ctrlErr := rc.Control(func(fd uintptr) {
			sockoptErr = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, fwmarkIoctl, int(mark))
		})
		if ctrlErr != nil {
			return ctrlErr
		}
		if sockoptErr != nil {
			return sockoptErr
		}
	}
	return nil
}

View File

@@ -0,0 +1,42 @@
//go:build !linux || android
/* SPDX-License-Identifier: MIT
*
* Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
*/
package conn
import "net/netip"
// SrcIP returns the sticky source address; always the zero Addr on platforms
// without sticky socket support.
func (e *StdNetEndpoint) SrcIP() netip.Addr {
	return netip.Addr{}
}
// SrcIfidx returns the sticky source interface index; always 0 on platforms
// without sticky socket support.
func (e *StdNetEndpoint) SrcIfidx() int32 {
	return 0
}
// SrcToString renders the sticky source address; always empty on platforms
// without sticky socket support.
func (e *StdNetEndpoint) SrcToString() string {
	return ""
}
// TODO: macOS, FreeBSD and other BSDs likely do support the sticky sockets
// {get,set}srcControl feature set, but use alternatively named flags and need
// ports and require testing.
// getSrcFromControl parses the control for PKTINFO and if found updates ep with
// the source information found. No-op on platforms without sticky sockets.
func getSrcFromControl(control []byte, ep *StdNetEndpoint) {
}
// setSrcControl writes PKTINFO source information from ep into control.
// No-op on platforms without sticky sockets.
func setSrcControl(control *[]byte, ep *StdNetEndpoint) {
}
// stickyControlSize returns the recommended buffer size for pooling sticky
// offloading control data.
const stickyControlSize = 0
const StdNetSupportsStickySockets = false

View File

@@ -0,0 +1,105 @@
package conn
import (
"net/netip"
"unsafe"
"golang.org/x/sys/unix"
)
// SrcIP returns the local source address held in e.src, which stores a raw
// IP_PKTINFO / IPV6_PKTINFO control message. The address family is inferred
// purely from the serialized cmsg length; any other length yields the zero
// Addr.
func (e *StdNetEndpoint) SrcIP() netip.Addr {
	switch len(e.src) {
	case unix.CmsgSpace(unix.SizeofInet4Pktinfo):
		info := (*unix.Inet4Pktinfo)(unsafe.Pointer(&e.src[unix.CmsgLen(0)]))
		return netip.AddrFrom4(info.Spec_dst)
	case unix.CmsgSpace(unix.SizeofInet6Pktinfo):
		info := (*unix.Inet6Pktinfo)(unsafe.Pointer(&e.src[unix.CmsgLen(0)]))
		// TODO: set zone. in order to do so we need to check if the address is
		// link local, and if it is perform a syscall to turn the ifindex into a
		// zone string because netip uses string zones.
		return netip.AddrFrom16(info.Addr)
	}
	return netip.Addr{}
}
// SrcIfidx returns the interface index recorded in the pktinfo cmsg held in
// e.src (family inferred from cmsg length, as in SrcIP), or 0 when no source
// information is recorded.
func (e *StdNetEndpoint) SrcIfidx() int32 {
	switch len(e.src) {
	case unix.CmsgSpace(unix.SizeofInet4Pktinfo):
		info := (*unix.Inet4Pktinfo)(unsafe.Pointer(&e.src[unix.CmsgLen(0)]))
		return info.Ifindex
	case unix.CmsgSpace(unix.SizeofInet6Pktinfo):
		info := (*unix.Inet6Pktinfo)(unsafe.Pointer(&e.src[unix.CmsgLen(0)]))
		return int32(info.Ifindex)
	}
	return 0
}
// SrcToString returns the string form of the recorded source IP.
func (e *StdNetEndpoint) SrcToString() string {
	return e.SrcIP().String()
}
// getSrcFromControl parses the control for PKTINFO and if found updates ep with
// the source information found. The raw cmsg (header + payload) is copied into
// ep.src so that SrcIP/SrcIfidx can later re-read it, and so setSrcControl can
// replay it verbatim on send. Parse errors are treated as "no source info".
func getSrcFromControl(control []byte, ep *StdNetEndpoint) {
	ep.ClearSrc()
	var (
		hdr  unix.Cmsghdr
		data []byte
		rem  []byte = control
		err  error
	)
	for len(rem) > unix.SizeofCmsghdr {
		hdr, data, rem, err = unix.ParseOneSocketControlMessage(rem)
		if err != nil {
			return
		}
		if hdr.Level == unix.IPPROTO_IP &&
			hdr.Type == unix.IP_PKTINFO {
			// Reuse ep.src's backing storage when it is big enough.
			if ep.src == nil || cap(ep.src) < unix.CmsgSpace(unix.SizeofInet4Pktinfo) {
				ep.src = make([]byte, 0, unix.CmsgSpace(unix.SizeofInet4Pktinfo))
			}
			ep.src = ep.src[:unix.CmsgSpace(unix.SizeofInet4Pktinfo)]
			hdrBuf := unsafe.Slice((*byte)(unsafe.Pointer(&hdr)), unix.SizeofCmsghdr)
			copy(ep.src, hdrBuf)
			copy(ep.src[unix.CmsgLen(0):], data)
			return
		}
		if hdr.Level == unix.IPPROTO_IPV6 &&
			hdr.Type == unix.IPV6_PKTINFO {
			// Same as above for the IPv6 pktinfo shape.
			if ep.src == nil || cap(ep.src) < unix.CmsgSpace(unix.SizeofInet6Pktinfo) {
				ep.src = make([]byte, 0, unix.CmsgSpace(unix.SizeofInet6Pktinfo))
			}
			ep.src = ep.src[:unix.CmsgSpace(unix.SizeofInet6Pktinfo)]
			hdrBuf := unsafe.Slice((*byte)(unsafe.Pointer(&hdr)), unix.SizeofCmsghdr)
			copy(ep.src, hdrBuf)
			copy(ep.src[unix.CmsgLen(0):], data)
			return
		}
	}
}
// setSrcControl sets an IP{V6}_PKTINFO in control based on the source address
// and source ifindex found in ep. control's len will be set to 0 in the event
// that ep is a default value.
func setSrcControl(control *[]byte, ep *StdNetEndpoint) {
	src := ep.src
	if cap(*control) < len(src) {
		return
	}
	*control = append((*control)[:0], src...)
}
// stickyControlSize returns the recommended buffer size for pooling sticky
// offloading control data.
var stickyControlSize = unix.CmsgSpace(unix.SizeofInet6Pktinfo)
const StdNetSupportsStickySockets = true

42
wgstack/tun/checksum.go Normal file
View File

@@ -0,0 +1,42 @@
package tun
import "encoding/binary"
// TODO: Explore SIMD and/or other assembly optimizations.

// checksumNoFold accumulates the ones-complement sum of b into initial
// without folding carries back into 16 bits. Bytes are consumed big-endian,
// four at a time while possible, then two, and a trailing odd byte counts as
// the high octet of a final 16-bit word.
func checksumNoFold(b []byte, initial uint64) uint64 {
	sum := initial
	for len(b) >= 4 {
		sum += uint64(binary.BigEndian.Uint32(b[:4]))
		b = b[4:]
	}
	for len(b) >= 2 {
		sum += uint64(binary.BigEndian.Uint16(b[:2]))
		b = b[2:]
	}
	if len(b) == 1 {
		sum += uint64(b[0]) << 8
	}
	return sum
}
// checksum folds the 64-bit accumulator from checksumNoFold down to a 16-bit
// ones-complement checksum. Four fold rounds are enough to absorb any carry a
// 64-bit sum can produce.
func checksum(b []byte, initial uint64) uint16 {
	acc := checksumNoFold(b, initial)
	for i := 0; i < 4; i++ {
		acc = (acc >> 16) + (acc & 0xffff)
	}
	return uint16(acc)
}
// pseudoHeaderChecksumNoFold computes the unfolded checksum of a TCP/UDP
// pseudo header: source address, destination address, a zero byte plus the
// protocol number, and the big-endian payload length.
func pseudoHeaderChecksumNoFold(protocol uint8, srcAddr, dstAddr []byte, totalLen uint16) uint64 {
	acc := checksumNoFold(srcAddr, 0)
	acc = checksumNoFold(dstAddr, acc)
	acc = checksumNoFold([]byte{0, protocol}, acc)
	var lenField [2]byte
	binary.BigEndian.PutUint16(lenField[:], totalLen)
	return checksumNoFold(lenField[:], acc)
}

3
wgstack/tun/export.go Normal file
View File

@@ -0,0 +1,3 @@
package tun
const VirtioNetHdrLen = virtioNetHdrLen

View File

@@ -0,0 +1,630 @@
//go:build linux
// SPDX-License-Identifier: MIT
//
// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
package tun
import (
"bytes"
"encoding/binary"
"errors"
"io"
"unsafe"
wgconn "github.com/slackhq/nebula/wgstack/conn"
"golang.org/x/sys/unix"
)
var ErrTooManySegments = errors.New("tun: too many segments for TSO")
const tcpFlagsOffset = 13
const (
tcpFlagFIN uint8 = 0x01
tcpFlagPSH uint8 = 0x08
tcpFlagACK uint8 = 0x10
)
// virtioNetHdr is defined in the kernel in include/uapi/linux/virtio_net.h. The
// kernel symbol is virtio_net_hdr.
type virtioNetHdr struct {
flags uint8
gsoType uint8
hdrLen uint16
gsoSize uint16
csumStart uint16
csumOffset uint16
}
// decode deserializes the first virtioNetHdrLen bytes of b into v by direct
// memory copy (the struct layout matches the kernel's C ABI). Returns
// io.ErrShortBuffer when b is too small.
func (v *virtioNetHdr) decode(b []byte) error {
	if len(b) < virtioNetHdrLen {
		return io.ErrShortBuffer
	}
	copy(unsafe.Slice((*byte)(unsafe.Pointer(v)), virtioNetHdrLen), b[:virtioNetHdrLen])
	return nil
}
// encode serializes v into the first virtioNetHdrLen bytes of b by direct
// memory copy. Returns io.ErrShortBuffer when b is too small.
func (v *virtioNetHdr) encode(b []byte) error {
	if len(b) < virtioNetHdrLen {
		return io.ErrShortBuffer
	}
	copy(b[:virtioNetHdrLen], unsafe.Slice((*byte)(unsafe.Pointer(v)), virtioNetHdrLen))
	return nil
}
const (
// virtioNetHdrLen is the length in bytes of virtioNetHdr. This matches the
// shape of the C ABI for its kernel counterpart -- sizeof(virtio_net_hdr).
virtioNetHdrLen = int(unsafe.Sizeof(virtioNetHdr{}))
)
// flowKey represents the key for a flow.
type flowKey struct {
srcAddr, dstAddr [16]byte
srcPort, dstPort uint16
rxAck uint32 // varying ack values should not be coalesced. Treat them as separate flows.
}
// tcpGROTable holds flow and coalescing information for the purposes of GRO.
type tcpGROTable struct {
itemsByFlow map[flowKey][]tcpGROItem
itemsPool [][]tcpGROItem
}
// newTCPGROTable allocates a tcpGROTable with its flow map and per-flow item
// slices presized to the ideal batch size.
func newTCPGROTable() *tcpGROTable {
	table := &tcpGROTable{
		itemsByFlow: make(map[flowKey][]tcpGROItem, wgconn.IdealBatchSize),
		itemsPool:   make([][]tcpGROItem, wgconn.IdealBatchSize),
	}
	for i := 0; i < len(table.itemsPool); i++ {
		table.itemsPool[i] = make([]tcpGROItem, 0, wgconn.IdealBatchSize)
	}
	return table
}
// newFlowKey builds the flow lookup key for pkt from the byte offsets of its
// source address, destination address, and TCP header.
func newFlowKey(pkt []byte, srcAddr, dstAddr, tcphOffset int) flowKey {
	addrSize := dstAddr - srcAddr
	k := flowKey{
		srcPort: binary.BigEndian.Uint16(pkt[tcphOffset:]),
		dstPort: binary.BigEndian.Uint16(pkt[tcphOffset+2:]),
		rxAck:   binary.BigEndian.Uint32(pkt[tcphOffset+8:]),
	}
	copy(k.srcAddr[:], pkt[srcAddr:dstAddr])
	copy(k.dstAddr[:], pkt[dstAddr:dstAddr+addrSize])
	return k
}
// lookupOrInsert looks up a flow for the provided packet and metadata,
// returning the packets found for the flow, or inserting a new one if none
// is found.
func (t *tcpGROTable) lookupOrInsert(pkt []byte, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex int) ([]tcpGROItem, bool) {
	key := newFlowKey(pkt, srcAddrOffset, dstAddrOffset, tcphOffset)
	if items, ok := t.itemsByFlow[key]; ok {
		return items, true
	}
	// TODO: insert() performs another map lookup. This could be rearranged to avoid.
	t.insert(pkt, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex)
	return nil, false
}
// insert records bookkeeping for pkt under its flow key, drawing the item
// slice from the pool when this is the first packet of the flow.
func (t *tcpGROTable) insert(pkt []byte, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex int) {
	key := newFlowKey(pkt, srcAddrOffset, dstAddrOffset, tcphOffset)
	newItem := tcpGROItem{
		key:       key,
		bufsIndex: uint16(bufsIndex),
		gsoSize:   uint16(len(pkt[tcphOffset+tcphLen:])),
		iphLen:    uint8(tcphOffset),
		tcphLen:   uint8(tcphLen),
		sentSeq:   binary.BigEndian.Uint32(pkt[tcphOffset+4:]),
		pshSet:    pkt[tcphOffset+tcpFlagsOffset]&tcpFlagPSH != 0,
	}
	items, ok := t.itemsByFlow[key]
	if !ok {
		items = t.newItems()
	}
	t.itemsByFlow[key] = append(items, newItem)
}
// updateAt overwrites the i-th bookkeeping entry stored for item's flow.
func (t *tcpGROTable) updateAt(item tcpGROItem, i int) {
	t.itemsByFlow[item.key][i] = item
}
// deleteAt removes the i-th bookkeeping entry for the flow identified by key.
func (t *tcpGROTable) deleteAt(key flowKey, i int) {
	items := t.itemsByFlow[key]
	t.itemsByFlow[key] = append(items[:i], items[i+1:]...)
}
// tcpGROItem represents bookkeeping data for a TCP packet during the lifetime
// of a GRO evaluation across a vector of packets.
type tcpGROItem struct {
key flowKey
sentSeq uint32 // the sequence number
bufsIndex uint16 // the index into the original bufs slice
numMerged uint16 // the number of packets merged into this item
gsoSize uint16 // payload size
iphLen uint8 // ip header len
tcphLen uint8 // tcp header len
pshSet bool // psh flag is set
}
// newItems pops a reusable, zero-length item slice from the pool.
func (t *tcpGROTable) newItems() []tcpGROItem {
	last := len(t.itemsPool) - 1
	items := t.itemsPool[last]
	t.itemsPool = t.itemsPool[:last]
	return items
}
// reset returns every per-flow slice (truncated, capacity kept) to the pool
// and clears the flow map for the next batch.
func (t *tcpGROTable) reset() {
	for key, items := range t.itemsByFlow {
		t.itemsPool = append(t.itemsPool, items[:0])
		delete(t.itemsByFlow, key)
	}
}
// canCoalesce represents the outcome of checking if two TCP packets are
// candidates for coalescing.
type canCoalesce int
const (
coalescePrepend canCoalesce = -1
coalesceUnavailable canCoalesce = 0
coalesceAppend canCoalesce = 1
)
// tcpPacketsCanCoalesce evaluates if pkt can be coalesced with the packet
// described by item. This function makes considerations that match the kernel's
// GRO self tests, which can be found in tools/testing/selftests/net/gro.c.
func tcpPacketsCanCoalesce(pkt []byte, iphLen, tcphLen uint8, seq uint32, pshSet bool, gsoSize uint16, item tcpGROItem, bufs [][]byte, bufsOffset int) canCoalesce {
	pktTarget := bufs[item.bufsIndex][bufsOffset:]
	if tcphLen != item.tcphLen {
		// cannot coalesce with unequal tcp options len
		return coalesceUnavailable
	}
	if tcphLen > 20 {
		// Bound the target's options with its own header lengths. The original
		// used the candidate's iphLen+tcphLen here, which only coincides with
		// item's bounds because v4 coalescing is gated to no-options headers;
		// upstream wireguard-go uses item.iphLen+item.tcphLen.
		if !bytes.Equal(pkt[iphLen+20:iphLen+tcphLen], pktTarget[item.iphLen+20:item.iphLen+item.tcphLen]) {
			// cannot coalesce with unequal tcp options
			return coalesceUnavailable
		}
	}
	if pkt[0]>>4 == 6 {
		if pkt[0] != pktTarget[0] || pkt[1]>>4 != pktTarget[1]>>4 {
			// cannot coalesce with unequal Traffic class values
			return coalesceUnavailable
		}
		if pkt[7] != pktTarget[7] {
			// cannot coalesce with unequal Hop limit values
			return coalesceUnavailable
		}
	} else {
		if pkt[1] != pktTarget[1] {
			// cannot coalesce with unequal ToS values
			return coalesceUnavailable
		}
		if pkt[6]>>5 != pktTarget[6]>>5 {
			// cannot coalesce with unequal DF or reserved bits. MF is checked
			// further up the stack.
			return coalesceUnavailable
		}
		if pkt[8] != pktTarget[8] {
			// cannot coalesce with unequal TTL values
			return coalesceUnavailable
		}
	}
	// seq adjacency
	lhsLen := item.gsoSize
	lhsLen += item.numMerged * item.gsoSize
	if seq == item.sentSeq+uint32(lhsLen) { // pkt aligns following item from a seq num perspective
		if item.pshSet {
			// We cannot append to a segment that has the PSH flag set, PSH
			// can only be set on the final segment in a reassembled group.
			return coalesceUnavailable
		}
		if len(pktTarget[iphLen+tcphLen:])%int(item.gsoSize) != 0 {
			// A smaller than gsoSize packet has been appended previously.
			// Nothing can come after a smaller packet on the end.
			return coalesceUnavailable
		}
		if gsoSize > item.gsoSize {
			// We cannot have a larger packet following a smaller one.
			return coalesceUnavailable
		}
		return coalesceAppend
	} else if seq+uint32(gsoSize) == item.sentSeq { // pkt aligns in front of item from a seq num perspective
		if pshSet {
			// We cannot prepend with a segment that has the PSH flag set, PSH
			// can only be set on the final segment in a reassembled group.
			return coalesceUnavailable
		}
		if gsoSize < item.gsoSize {
			// We cannot have a larger packet following a smaller one.
			return coalesceUnavailable
		}
		if gsoSize > item.gsoSize && item.numMerged > 0 {
			// There's at least one previous merge, and we're larger than all
			// previous. This would put multiple smaller packets on the end.
			return coalesceUnavailable
		}
		return coalescePrepend
	}
	return coalesceUnavailable
}
// tcpChecksumValid reports whether the TCP checksum of pkt (an IP header of
// iphLen bytes followed by the TCP segment) verifies against its pseudo
// header.
func tcpChecksumValid(pkt []byte, iphLen uint8, isV6 bool) bool {
	addrOffset, addrSize := ipv4SrcAddrOffset, 4
	if isV6 {
		addrOffset, addrSize = ipv6SrcAddrOffset, 16
	}
	payloadLen := uint16(len(pkt) - int(iphLen))
	pseudo := pseudoHeaderChecksumNoFold(unix.IPPROTO_TCP, pkt[addrOffset:addrOffset+addrSize], pkt[addrOffset+addrSize:addrOffset+addrSize*2], payloadLen)
	return ^checksum(pkt[iphLen:], pseudo) == 0
}
// coalesceResult represents the result of attempting to coalesce two TCP
// packets.
type coalesceResult int
const (
coalesceInsufficientCap coalesceResult = 0
coalescePSHEnding coalesceResult = 1
coalesceItemInvalidCSum coalesceResult = 2
coalescePktInvalidCSum coalesceResult = 3
coalesceSuccess coalesceResult = 4
)
// coalesceTCPPackets attempts to coalesce pkt with the packet described by
// item, returning the outcome. This function may swap bufs elements in the
// event of a prepend as item's bufs index is already being tracked for writing
// to a Device.
func coalesceTCPPackets(mode canCoalesce, pkt []byte, pktBuffsIndex int, gsoSize uint16, seq uint32, pshSet bool, item *tcpGROItem, bufs [][]byte, bufsOffset int, isV6 bool) coalesceResult {
	var pktHead []byte // the packet that will end up at the front
	headersLen := item.iphLen + item.tcphLen
	coalescedLen := len(bufs[item.bufsIndex][bufsOffset:]) + len(pkt) - int(headersLen)
	// Copy data
	if mode == coalescePrepend {
		pktHead = pkt
		if cap(pkt)-bufsOffset < coalescedLen {
			// We don't want to allocate a new underlying array if capacity is
			// too small.
			return coalesceInsufficientCap
		}
		if pshSet {
			return coalescePSHEnding
		}
		// Checksums are validated lazily: the item's own packet is only
		// checked the first time it participates in a merge.
		if item.numMerged == 0 {
			if !tcpChecksumValid(bufs[item.bufsIndex][bufsOffset:], item.iphLen, isV6) {
				return coalesceItemInvalidCSum
			}
		}
		if !tcpChecksumValid(pkt, item.iphLen, isV6) {
			return coalescePktInvalidCSum
		}
		item.sentSeq = seq
		extendBy := coalescedLen - len(pktHead)
		bufs[pktBuffsIndex] = append(bufs[pktBuffsIndex], make([]byte, extendBy)...)
		copy(bufs[pktBuffsIndex][bufsOffset+len(pkt):], bufs[item.bufsIndex][bufsOffset+int(headersLen):])
		// Flip the slice headers in bufs as part of prepend. The index of item
		// is already being tracked for writing.
		bufs[item.bufsIndex], bufs[pktBuffsIndex] = bufs[pktBuffsIndex], bufs[item.bufsIndex]
	} else {
		pktHead = bufs[item.bufsIndex][bufsOffset:]
		if cap(pktHead)-bufsOffset < coalescedLen {
			// We don't want to allocate a new underlying array if capacity is
			// too small.
			return coalesceInsufficientCap
		}
		if item.numMerged == 0 {
			if !tcpChecksumValid(bufs[item.bufsIndex][bufsOffset:], item.iphLen, isV6) {
				return coalesceItemInvalidCSum
			}
		}
		if !tcpChecksumValid(pkt, item.iphLen, isV6) {
			return coalescePktInvalidCSum
		}
		if pshSet {
			// We are appending a segment with PSH set.
			item.pshSet = pshSet
			pktHead[item.iphLen+tcpFlagsOffset] |= tcpFlagPSH
		}
		extendBy := len(pkt) - int(headersLen)
		bufs[item.bufsIndex] = append(bufs[item.bufsIndex], make([]byte, extendBy)...)
		copy(bufs[item.bufsIndex][bufsOffset+len(pktHead):], pkt[headersLen:])
	}
	if gsoSize > item.gsoSize {
		item.gsoSize = gsoSize
	}
	// 16 is the offset of the checksum field within the TCP header.
	hdr := virtioNetHdr{
		flags:      unix.VIRTIO_NET_HDR_F_NEEDS_CSUM, // this turns into CHECKSUM_PARTIAL in the skb
		hdrLen:     uint16(headersLen),
		gsoSize:    uint16(item.gsoSize),
		csumStart:  uint16(item.iphLen),
		csumOffset: 16,
	}
	// Recalculate the total len (IPv4) or payload len (IPv6). Recalculate the
	// (IPv4) header checksum.
	if isV6 {
		hdr.gsoType = unix.VIRTIO_NET_HDR_GSO_TCPV6
		binary.BigEndian.PutUint16(pktHead[4:], uint16(coalescedLen)-uint16(item.iphLen)) // set new payload len
	} else {
		hdr.gsoType = unix.VIRTIO_NET_HDR_GSO_TCPV4
		pktHead[10], pktHead[11] = 0, 0 // clear checksum field
		binary.BigEndian.PutUint16(pktHead[2:], uint16(coalescedLen)) // set new total length
		iphCSum := ^checksum(pktHead[:item.iphLen], 0) // compute checksum
		binary.BigEndian.PutUint16(pktHead[10:], iphCSum) // set checksum field
	}
	// Write the updated virtio header in front of the coalesced packet.
	hdr.encode(bufs[item.bufsIndex][bufsOffset-virtioNetHdrLen:])
	// Calculate the pseudo header checksum and place it at the TCP checksum
	// offset. Downstream checksum offloading will combine this with computation
	// of the tcp header and payload checksum.
	addrLen := 4
	addrOffset := ipv4SrcAddrOffset
	if isV6 {
		addrLen = 16
		addrOffset = ipv6SrcAddrOffset
	}
	srcAddrAt := bufsOffset + addrOffset
	srcAddr := bufs[item.bufsIndex][srcAddrAt : srcAddrAt+addrLen]
	dstAddr := bufs[item.bufsIndex][srcAddrAt+addrLen : srcAddrAt+addrLen*2]
	psum := pseudoHeaderChecksumNoFold(unix.IPPROTO_TCP, srcAddr, dstAddr, uint16(coalescedLen-int(item.iphLen)))
	binary.BigEndian.PutUint16(pktHead[hdr.csumStart+hdr.csumOffset:], checksum([]byte{}, psum))
	item.numMerged++
	return coalesceSuccess
}
const (
ipv4FlagMoreFragments uint8 = 0x20
)
const (
ipv4SrcAddrOffset = 12
ipv6SrcAddrOffset = 8
maxUint16 = 1<<16 - 1
)
// tcpGRO evaluates the TCP packet at pktI in bufs for coalescing with
// existing packets tracked in table. It will return false when pktI is not
// coalesced, otherwise true. This indicates to the caller if bufs[pktI]
// should be written to the Device.
func tcpGRO(bufs [][]byte, offset int, pktI int, table *tcpGROTable, isV6 bool) (pktCoalesced bool) {
	pkt := bufs[pktI][offset:]
	if len(pkt) > maxUint16 {
		// A valid IPv4 or IPv6 packet will never exceed this.
		return false
	}
	// IHL is in 32-bit words; for IPv6 the header is a fixed 40 bytes.
	iphLen := int((pkt[0] & 0x0F) * 4)
	if isV6 {
		iphLen = 40
		ipv6HPayloadLen := int(binary.BigEndian.Uint16(pkt[4:]))
		if ipv6HPayloadLen != len(pkt)-iphLen {
			return false
		}
	} else {
		totalLen := int(binary.BigEndian.Uint16(pkt[2:]))
		if totalLen != len(pkt) {
			return false
		}
	}
	if len(pkt) < iphLen {
		return false
	}
	tcphLen := int((pkt[iphLen+12] >> 4) * 4)
	if tcphLen < 20 || tcphLen > 60 {
		return false
	}
	if len(pkt) < iphLen+tcphLen {
		return false
	}
	if !isV6 {
		// Reject any fragment: MF set or a nonzero fragment offset.
		if pkt[6]&ipv4FlagMoreFragments != 0 || pkt[6]<<3 != 0 || pkt[7] != 0 {
			// no GRO support for fragmented segments for now
			return false
		}
	}
	tcpFlags := pkt[iphLen+tcpFlagsOffset]
	var pshSet bool
	// not a candidate if any non-ACK flags (except PSH+ACK) are set
	if tcpFlags != tcpFlagACK {
		if pkt[iphLen+tcpFlagsOffset] != tcpFlagACK|tcpFlagPSH {
			return false
		}
		pshSet = true
	}
	gsoSize := uint16(len(pkt) - tcphLen - iphLen)
	// not a candidate if payload len is 0
	if gsoSize < 1 {
		return false
	}
	seq := binary.BigEndian.Uint32(pkt[iphLen+4:])
	srcAddrOffset := ipv4SrcAddrOffset
	addrLen := 4
	if isV6 {
		srcAddrOffset = ipv6SrcAddrOffset
		addrLen = 16
	}
	items, existing := table.lookupOrInsert(pkt, srcAddrOffset, srcAddrOffset+addrLen, iphLen, tcphLen, pktI)
	if !existing {
		return false
	}
	for i := len(items) - 1; i >= 0; i-- {
		// In the best case of packets arriving in order iterating in reverse is
		// more efficient if there are multiple items for a given flow. This
		// also enables a natural table.deleteAt() in the
		// coalesceItemInvalidCSum case without the need for index tracking.
		// This algorithm makes a best effort to coalesce in the event of
		// unordered packets, where pkt may land anywhere in items from a
		// sequence number perspective, however once an item is inserted into
		// the table it is never compared across other items later.
		item := items[i]
		can := tcpPacketsCanCoalesce(pkt, uint8(iphLen), uint8(tcphLen), seq, pshSet, gsoSize, item, bufs, offset)
		if can != coalesceUnavailable {
			result := coalesceTCPPackets(can, pkt, pktI, gsoSize, seq, pshSet, &item, bufs, offset, isV6)
			switch result {
			case coalesceSuccess:
				table.updateAt(item, i)
				return true
			case coalesceItemInvalidCSum:
				// delete the item with an invalid csum
				table.deleteAt(item.key, i)
			case coalescePktInvalidCSum:
				// no point in inserting an item that we can't coalesce
				return false
			default:
			}
		}
	}
	// failed to coalesce with any other packets; store the item in the flow
	table.insert(pkt, srcAddrOffset, srcAddrOffset+addrLen, iphLen, tcphLen, pktI)
	return false
}
// isTCP4NoIPOptions reports whether b begins with an IPv4 header that carries
// no IP options (IHL == 5) and whose protocol is TCP, with at least enough
// bytes for a maximal (60-byte header) TCP segment start.
func isTCP4NoIPOptions(b []byte) bool {
	return len(b) >= 40 &&
		b[0]>>4 == 4 &&
		b[0]&0x0F == 5 &&
		b[9] == unix.IPPROTO_TCP
}
// isTCP6NoEH reports whether b begins with an IPv6 header whose Next Header
// is TCP directly (no extension headers), with enough bytes for the fixed
// IPv6 header plus a maximal TCP header.
func isTCP6NoEH(b []byte) bool {
	return len(b) >= 60 &&
		b[0]>>4 == 6 &&
		b[6] == unix.IPPROTO_TCP
}
// handleGRO evaluates bufs for GRO, and writes the indices of the resulting
// packets into toWrite. toWrite, tcp4Table, and tcp6Table should initially be
// empty (but non-nil), and are passed in to save allocs as the caller may reset
// and recycle them across vectors of packets.
func handleGRO(bufs [][]byte, offset int, tcp4Table, tcp6Table *tcpGROTable, toWrite *[]int) error {
	for i := range bufs {
		if offset < virtioNetHdrLen || offset > len(bufs[i])-1 {
			return errors.New("invalid offset")
		}
		pkt := bufs[i][offset:]
		coalesced := false
		switch {
		case isTCP4NoIPOptions(pkt): // ipv4 packets w/IP options do not coalesce
			coalesced = tcpGRO(bufs, offset, i, tcp4Table, false)
		case isTCP6NoEH(pkt): // ipv6 packets w/extension headers do not coalesce
			coalesced = tcpGRO(bufs, offset, i, tcp6Table, true)
		}
		if coalesced {
			continue
		}
		// Not coalesced: prepend a zeroed virtio header and queue for write.
		var hdr virtioNetHdr
		if err := hdr.encode(bufs[i][offset-virtioNetHdrLen:]); err != nil {
			return err
		}
		*toWrite = append(*toWrite, i)
	}
	return nil
}
// tcpTSO splits packets from in into outBuffs, writing the size of each
// element into sizes. It returns the number of buffers populated, and/or an
// error.
func tcpTSO(in []byte, hdr virtioNetHdr, outBuffs [][]byte, sizes []int, outOffset int) (int, error) {
	iphLen := int(hdr.csumStart)
	srcAddrOffset := ipv6SrcAddrOffset
	addrLen := 16
	if hdr.gsoType == unix.VIRTIO_NET_HDR_GSO_TCPV4 {
		in[10], in[11] = 0, 0 // clear ipv4 header checksum
		srcAddrOffset = ipv4SrcAddrOffset
		addrLen = 4
	}
	tcpCSumAt := int(hdr.csumStart + hdr.csumOffset)
	in[tcpCSumAt], in[tcpCSumAt+1] = 0, 0 // clear tcp checksum
	firstTCPSeqNum := binary.BigEndian.Uint32(in[hdr.csumStart+4:])
	nextSegmentDataAt := int(hdr.hdrLen)
	i := 0
	for ; nextSegmentDataAt < len(in); i++ {
		if i == len(outBuffs) {
			return i - 1, ErrTooManySegments
		}
		// Each segment carries up to gsoSize payload bytes; the last segment
		// carries whatever remains.
		nextSegmentEnd := nextSegmentDataAt + int(hdr.gsoSize)
		if nextSegmentEnd > len(in) {
			nextSegmentEnd = len(in)
		}
		segmentDataLen := nextSegmentEnd - nextSegmentDataAt
		totalLen := int(hdr.hdrLen) + segmentDataLen
		sizes[i] = totalLen
		out := outBuffs[i][outOffset:]
		// Copy the original IP header, then patch it per segment below.
		copy(out, in[:iphLen])
		if hdr.gsoType == unix.VIRTIO_NET_HDR_GSO_TCPV4 {
			// For IPv4 we are responsible for incrementing the ID field,
			// updating the total len field, and recalculating the header
			// checksum.
			if i > 0 {
				id := binary.BigEndian.Uint16(out[4:])
				id += uint16(i)
				binary.BigEndian.PutUint16(out[4:], id)
			}
			binary.BigEndian.PutUint16(out[2:], uint16(totalLen))
			ipv4CSum := ^checksum(out[:iphLen], 0)
			binary.BigEndian.PutUint16(out[10:], ipv4CSum)
		} else {
			// For IPv6 we are responsible for updating the payload length field.
			binary.BigEndian.PutUint16(out[4:], uint16(totalLen-iphLen))
		}
		// TCP header: copy, then advance the sequence number by gsoSize per
		// segment. (The uint16 multiply cannot overflow: i*gsoSize is bounded
		// by len(in), itself bounded by the 64KiB read buffer.)
		copy(out[hdr.csumStart:hdr.hdrLen], in[hdr.csumStart:hdr.hdrLen])
		tcpSeq := firstTCPSeqNum + uint32(hdr.gsoSize*uint16(i))
		binary.BigEndian.PutUint32(out[hdr.csumStart+4:], tcpSeq)
		if nextSegmentEnd != len(in) {
			// FIN and PSH should only be set on last segment
			clearFlags := tcpFlagFIN | tcpFlagPSH
			out[hdr.csumStart+tcpFlagsOffset] &^= clearFlags
		}
		// payload
		copy(out[hdr.hdrLen:], in[nextSegmentDataAt:nextSegmentEnd])
		// TCP checksum over the pseudo-header plus this segment's header+data.
		tcpHLen := int(hdr.hdrLen - hdr.csumStart)
		tcpLenForPseudo := uint16(tcpHLen + segmentDataLen)
		tcpCSumNoFold := pseudoHeaderChecksumNoFold(unix.IPPROTO_TCP, in[srcAddrOffset:srcAddrOffset+addrLen], in[srcAddrOffset+addrLen:srcAddrOffset+addrLen*2], tcpLenForPseudo)
		tcpCSum := ^checksum(out[hdr.csumStart:totalLen], tcpCSumNoFold)
		binary.BigEndian.PutUint16(out[hdr.csumStart+hdr.csumOffset:], tcpCSum)
		nextSegmentDataAt += int(hdr.gsoSize)
	}
	return i, nil
}
// gsoNoneChecksum finishes the transport checksum of a GSO_NONE packet. The
// 16-bit value already stored at the checksum offset (typically the
// pseudo-header checksum) is used as the seed of the computation, and the
// folded, complemented result is written back in its place.
func gsoNoneChecksum(in []byte, cSumStart, cSumOffset uint16) error {
	at := cSumStart + cSumOffset
	seed := uint64(binary.BigEndian.Uint16(in[at:]))
	in[at] = 0
	in[at+1] = 0
	binary.BigEndian.PutUint16(in[at:], ^checksum(in[cSumStart:], seed))
	return nil
}

52
wgstack/tun/tun.go Normal file
View File

@@ -0,0 +1,52 @@
// SPDX-License-Identifier: MIT
//
// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
package tun
import (
"os"
)
// Event is a device state-change notification delivered on the channel
// returned by Device.Events.
type Event int

// Event bit flags.
const (
	EventUp = 1 << iota
	EventDown
	EventMTUUpdate
)
// Device abstracts a TUN network device supporting batched packet I/O.
type Device interface {
	// File returns the file descriptor of the device.
	File() *os.File

	// Read one or more packets from the Device (without any additional headers).
	// On a successful read it returns the number of packets read, and sets
	// packet lengths within the sizes slice. len(sizes) must be >= len(bufs).
	// A nonzero offset can be used to instruct the Device on where to begin
	// reading into each element of the bufs slice.
	Read(bufs [][]byte, sizes []int, offset int) (n int, err error)

	// Write one or more packets to the device (without any additional headers).
	// On a successful write it returns the number of packets written. A nonzero
	// offset can be used to instruct the Device on where to begin writing from
	// each packet contained within the bufs slice.
	Write(bufs [][]byte, offset int) (int, error)

	// MTU returns the MTU of the Device.
	MTU() (int, error)

	// Name returns the current name of the Device.
	Name() (string, error)

	// Events returns a channel of type Event, which is fed Device events.
	Events() <-chan Event

	// Close stops the Device and closes the Event channel.
	Close() error

	// BatchSize returns the preferred/max number of packets that can be read or
	// written in a single read/write call. BatchSize must not change over the
	// lifetime of a Device.
	BatchSize() int
}

664
wgstack/tun/tun_linux.go Normal file
View File

@@ -0,0 +1,664 @@
//go:build linux
// SPDX-License-Identifier: MIT
//
// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
package tun
/* Implementation of the TUN device interface for linux
*/
import (
"errors"
"fmt"
"os"
"sync"
"syscall"
"time"
"unsafe"
wgconn "github.com/slackhq/nebula/wgstack/conn"
"golang.org/x/sys/unix"
"golang.zx2c4.com/wireguard/rwcancel"
)
const (
	// cloneDevicePath is the tun clone device opened by CreateTUN.
	cloneDevicePath = "/dev/net/tun"
	// ifReqSize is a buffer large enough to hold a struct ifreq: the
	// IFNAMSIZ-byte name followed by a generously padded union payload.
	ifReqSize = unix.IFNAMSIZ + 64
)
// NativeTun is the Linux implementation of the Device interface, backed by a
// file descriptor for /dev/net/tun.
type NativeTun struct {
	tunFile                 *os.File
	index                   int32      // if index
	errors                  chan error // async error handling
	events                  chan Event // device related events
	netlinkSock             int
	netlinkCancel           *rwcancel.RWCancel
	hackListenerClosed      sync.Mutex // held while routineHackListener runs; taken before closing events
	statusListenersShutdown chan struct{}
	batchSize               int
	vnetHdr                 bool // true when IFF_VNET_HDR is set: packets are framed with a virtioNetHdr

	closeOnce sync.Once

	nameOnce  sync.Once // guards calling initNameCache, which sets following fields
	nameCache string    // name of interface
	nameErr   error

	readOpMu sync.Mutex                    // readOpMu guards readBuff
	readBuff [virtioNetHdrLen + 65535]byte // if vnetHdr every read() is prefixed by virtioNetHdr

	writeOpMu sync.Mutex // writeOpMu guards toWrite, tcp4GROTable, tcp6GROTable
	toWrite   []int
	tcp4GROTable, tcp6GROTable *tcpGROTable
}
// File returns the underlying tun device file.
func (tun *NativeTun) File() *os.File {
	return tun.tunFile
}
// routineHackListener infers interface up/down state by issuing zero-length
// writes to the tun fd once per second: EINVAL means writes are accepted (up),
// EIO means no I/O is possible (down). It exits when statusListenersShutdown
// closes or the fd becomes unusable, releasing hackListenerClosed on exit.
// NOTE(review): nothing visible in this chunk starts this goroutine or
// pre-locks hackListenerClosed — confirm at the call site.
func (tun *NativeTun) routineHackListener() {
	defer tun.hackListenerClosed.Unlock()
	/* This is needed for the detection to work across network namespaces
	 * If you are reading this and know a better method, please get in touch.
	 */
	last := 0
	const (
		up   = 1
		down = 2
	)
	for {
		sysconn, err := tun.tunFile.SyscallConn()
		if err != nil {
			return
		}
		err2 := sysconn.Control(func(fd uintptr) {
			_, err = unix.Write(int(fd), nil)
		})
		if err2 != nil {
			return
		}
		switch err {
		case unix.EINVAL:
			if last != up {
				// If the tunnel is up, it reports that write() is
				// allowed but we provided invalid data.
				tun.events <- EventUp
				last = up
			}
		case unix.EIO:
			if last != down {
				// If the tunnel is down, it reports that no I/O
				// is possible, without checking our provided data.
				tun.events <- EventDown
				last = down
			}
		default:
			return
		}
		select {
		case <-time.After(time.Second):
			// nothing
		case <-tun.statusListenersShutdown:
			return
		}
	}
}
// createNetlinkSocket opens a raw NETLINK_ROUTE socket subscribed to link and
// IPv4/IPv6 address change notifications and returns its descriptor.
func createNetlinkSocket() (int, error) {
	sock, err := unix.Socket(unix.AF_NETLINK, unix.SOCK_RAW|unix.SOCK_CLOEXEC, unix.NETLINK_ROUTE)
	if err != nil {
		return -1, err
	}
	saddr := &unix.SockaddrNetlink{
		Family: unix.AF_NETLINK,
		Groups: unix.RTMGRP_LINK | unix.RTMGRP_IPV4_IFADDR | unix.RTMGRP_IPV6_IFADDR,
	}
	if err = unix.Bind(sock, saddr); err != nil {
		// Close the socket on bind failure so the descriptor is not leaked.
		unix.Close(sock)
		return -1, err
	}
	return sock, nil
}
// routineNetlinkListener consumes RTM_NEWLINK netlink messages and translates
// IFF_RUNNING flag changes for this interface's index into EventUp/EventDown,
// plus an EventMTUUpdate per link message. On exit it closes the netlink
// socket, waits for the hack listener to finish (by taking
// hackListenerClosed), closes tun.events, and releases the cancel handle.
func (tun *NativeTun) routineNetlinkListener() {
	defer func() {
		unix.Close(tun.netlinkSock)
		tun.hackListenerClosed.Lock()
		close(tun.events)
		tun.netlinkCancel.Close()
	}()
	for msg := make([]byte, 1<<16); ; {
		var err error
		var msgn int
		for {
			msgn, _, _, _, err = unix.Recvmsg(tun.netlinkSock, msg[:], nil, 0)
			if err == nil || !rwcancel.RetryAfterError(err) {
				break
			}
			// Recv would block: wait until readable or cancelled by Close.
			if !tun.netlinkCancel.ReadyRead() {
				tun.errors <- fmt.Errorf("netlink socket closed: %w", err)
				return
			}
		}
		if err != nil {
			tun.errors <- fmt.Errorf("failed to receive netlink message: %w", err)
			return
		}
		// Bail out promptly if the device is shutting down.
		select {
		case <-tun.statusListenersShutdown:
			return
		default:
		}
		wasEverUp := false
		// Walk the chain of netlink message headers in this datagram.
		for remain := msg[:msgn]; len(remain) >= unix.SizeofNlMsghdr; {
			hdr := *(*unix.NlMsghdr)(unsafe.Pointer(&remain[0]))
			if int(hdr.Len) > len(remain) {
				break
			}
			switch hdr.Type {
			case unix.NLMSG_DONE:
				remain = []byte{}
			case unix.RTM_NEWLINK:
				info := *(*unix.IfInfomsg)(unsafe.Pointer(&remain[unix.SizeofNlMsghdr]))
				remain = remain[hdr.Len:]
				if info.Index != tun.index {
					// not our interface
					continue
				}
				if info.Flags&unix.IFF_RUNNING != 0 {
					tun.events <- EventUp
					wasEverUp = true
				}
				if info.Flags&unix.IFF_RUNNING == 0 {
					// Don't emit EventDown before we've ever emitted EventUp.
					// This avoids a startup race with HackListener, which
					// might detect Up before we have finished reporting Down.
					if wasEverUp {
						tun.events <- EventDown
					}
				}
				tun.events <- EventMTUUpdate
			default:
				remain = remain[hdr.Len:]
			}
		}
	}
}
// getIFIndex resolves the interface index for name via the SIOCGIFINDEX ioctl
// issued on a throwaway AF_INET datagram socket.
func getIFIndex(name string) (int32, error) {
	fd, err := unix.Socket(
		unix.AF_INET,
		unix.SOCK_DGRAM|unix.SOCK_CLOEXEC,
		0,
	)
	if err != nil {
		return 0, err
	}
	defer unix.Close(fd)
	var ifr [ifReqSize]byte
	copy(ifr[:], name)
	_, _, errno := unix.Syscall(
		unix.SYS_IOCTL,
		uintptr(fd),
		uintptr(unix.SIOCGIFINDEX),
		uintptr(unsafe.Pointer(&ifr[0])),
	)
	if errno != 0 {
		return 0, errno
	}
	// The kernel writes the index into the ifreq union, which starts right
	// after the IFNAMSIZ-byte name field.
	return *(*int32)(unsafe.Pointer(&ifr[unix.IFNAMSIZ])), nil
}
// setMTU sets the interface MTU to n via the SIOCSIFMTU ioctl on a throwaway
// AF_INET datagram socket.
func (tun *NativeTun) setMTU(n int) error {
	name, err := tun.Name()
	if err != nil {
		return err
	}
	// open datagram socket
	fd, err := unix.Socket(
		unix.AF_INET,
		unix.SOCK_DGRAM|unix.SOCK_CLOEXEC,
		0,
	)
	if err != nil {
		return err
	}
	defer unix.Close(fd)
	var ifr [ifReqSize]byte
	copy(ifr[:], name)
	// Place the MTU in the ifreq union, immediately after the name field.
	*(*uint32)(unsafe.Pointer(&ifr[unix.IFNAMSIZ])) = uint32(n)
	_, _, errno := unix.Syscall(
		unix.SYS_IOCTL,
		uintptr(fd),
		uintptr(unix.SIOCSIFMTU),
		uintptr(unsafe.Pointer(&ifr[0])),
	)
	if errno != 0 {
		return errno
	}
	return nil
}
// routineNetlinkRead is a near-verbatim duplicate of routineNetlinkListener,
// differing only in that it does not check statusListenersShutdown between
// messages.
// NOTE(review): nothing in this file starts this goroutine — routineNetlink
// launches routineNetlinkListener — so this appears to be dead code. Confirm
// and remove: if both ever ran, their deferred cleanups would double-close
// tun.events and the netlink socket.
func (tun *NativeTun) routineNetlinkRead() {
	defer func() {
		unix.Close(tun.netlinkSock)
		tun.hackListenerClosed.Lock()
		close(tun.events)
		tun.netlinkCancel.Close()
	}()
	for msg := make([]byte, 1<<16); ; {
		var err error
		var msgn int
		for {
			msgn, _, _, _, err = unix.Recvmsg(tun.netlinkSock, msg[:], nil, 0)
			if err == nil || !rwcancel.RetryAfterError(err) {
				break
			}
			if !tun.netlinkCancel.ReadyRead() {
				tun.errors <- fmt.Errorf("netlink socket closed: %w", err)
				return
			}
		}
		if err != nil {
			tun.errors <- fmt.Errorf("failed to receive netlink message: %w", err)
			return
		}
		wasEverUp := false
		for remain := msg[:msgn]; len(remain) >= unix.SizeofNlMsghdr; {
			hdr := *(*unix.NlMsghdr)(unsafe.Pointer(&remain[0]))
			if int(hdr.Len) > len(remain) {
				break
			}
			switch hdr.Type {
			case unix.NLMSG_DONE:
				remain = []byte{}
			case unix.RTM_NEWLINK:
				info := *(*unix.IfInfomsg)(unsafe.Pointer(&remain[unix.SizeofNlMsghdr]))
				remain = remain[hdr.Len:]
				if info.Index != tun.index {
					continue
				}
				if info.Flags&unix.IFF_RUNNING != 0 {
					tun.events <- EventUp
					wasEverUp = true
				}
				if info.Flags&unix.IFF_RUNNING == 0 {
					if wasEverUp {
						tun.events <- EventDown
					}
				}
				tun.events <- EventMTUUpdate
			default:
				remain = remain[hdr.Len:]
			}
		}
	}
}
// routineNetlink creates the netlink socket and its cancellation handle, then
// starts the listener goroutine that turns netlink messages into Events.
// Setup errors are reported on tun.errors rather than returned.
func (tun *NativeTun) routineNetlink() {
	var err error
	tun.netlinkSock, err = createNetlinkSocket()
	if err != nil {
		tun.errors <- fmt.Errorf("failed to create netlink socket: %w", err)
		return
	}
	tun.netlinkCancel, err = rwcancel.NewRWCancel(tun.netlinkSock)
	if err != nil {
		// The listener goroutine (which normally closes the socket on exit)
		// will never start, so close it here to avoid leaking the descriptor.
		unix.Close(tun.netlinkSock)
		tun.errors <- fmt.Errorf("failed to create netlink cancel: %w", err)
		return
	}
	go tun.routineNetlinkListener()
}
// Close stops the device, shuts down the listener goroutines, and closes the
// tun file. It is safe to call multiple times; only the first call acts.
func (tun *NativeTun) Close() error {
	var err1, err2 error
	tun.closeOnce.Do(func() {
		if tun.statusListenersShutdown != nil {
			close(tun.statusListenersShutdown)
			if tun.netlinkCancel != nil {
				// Unblock the netlink listener; it closes tun.events on exit.
				err1 = tun.netlinkCancel.Cancel()
			}
		} else if tun.events != nil {
			// No listener goroutines were started; close events ourselves.
			close(tun.events)
		}
		err2 = tun.tunFile.Close()
	})
	if err1 != nil {
		return err1
	}
	return err2
}
// BatchSize returns the preferred number of packets per Read/Write call, as
// established by initFromFlags (IdealBatchSize with vnet-hdr, otherwise 1).
func (tun *NativeTun) BatchSize() int {
	return tun.batchSize
}
const (
	// tunOffloads is the offload set passed to TUNSETOFFLOAD: checksum
	// offload plus TCPv4/TCPv6 segmentation offload.
	// TODO: support TSO with ECN bits
	tunOffloads = unix.TUN_F_CSUM | unix.TUN_F_TSO4 | unix.TUN_F_TSO6
)
// initFromFlags queries the device flags via TUNGETIFF. If IFF_VNET_HDR is
// set it enables checksum/TSO offloads (TUNSETOFFLOAD) and switches to
// batched, virtio-net-header-framed I/O; otherwise batchSize is 1.
func (tun *NativeTun) initFromFlags(name string) error {
	sc, err := tun.tunFile.SyscallConn()
	if err != nil {
		return err
	}
	if e := sc.Control(func(fd uintptr) {
		var (
			ifr *unix.Ifreq
		)
		ifr, err = unix.NewIfreq(name)
		if err != nil {
			return
		}
		err = unix.IoctlIfreq(int(fd), unix.TUNGETIFF, ifr)
		if err != nil {
			return
		}
		got := ifr.Uint16()
		if got&unix.IFF_VNET_HDR != 0 {
			err = unix.IoctlSetInt(int(fd), unix.TUNSETOFFLOAD, tunOffloads)
			if err != nil {
				return
			}
			tun.vnetHdr = true
			tun.batchSize = wgconn.IdealBatchSize
		} else {
			tun.batchSize = 1
		}
	}); e != nil {
		return e
	}
	// err may have been set inside the Control closure above.
	return err
}
// CreateTUN creates a Device with the provided name and MTU by opening the
// tun clone device and configuring the resulting descriptor.
func CreateTUN(name string, mtu int) (Device, error) {
	nfd, err := unix.Open(cloneDevicePath, unix.O_RDWR|unix.O_CLOEXEC, 0)
	if err != nil {
		// Wrap the real error: the open can fail for reasons other than a
		// missing clone device (e.g. permissions).
		return nil, fmt.Errorf("CreateTUN(%q) failed to open %s: %w", name, cloneDevicePath, err)
	}
	fd := os.NewFile(uintptr(nfd), cloneDevicePath)
	tun, err := CreateTUNFromFile(fd, mtu)
	if err != nil {
		// CreateTUNFromFile does not take ownership on failure; close the
		// descriptor here so it is not leaked.
		fd.Close()
		return nil, err
	}
	if name != "tun" {
		if err := tun.(*NativeTun).initFromFlags(name); err != nil {
			tun.Close()
			return nil, fmt.Errorf("CreateTUN(%q) failed to set flags: %w", name, err)
		}
	}
	return tun, nil
}
// CreateTUNFromFile creates a Device from an os.File with the provided MTU.
// It resolves the interface name, configures offloads and batching, sets the
// MTU, and starts the netlink event listener.
func CreateTUNFromFile(file *os.File, mtu int) (Device, error) {
	tun := &NativeTun{
		tunFile: file,
		errors:  make(chan error, 5),
		events:  make(chan Event, 5),
	}
	name, err := tun.Name()
	if err != nil {
		return nil, fmt.Errorf("failed to determine TUN name: %w", err)
	}
	if err := tun.initFromFlags(name); err != nil {
		return nil, fmt.Errorf("failed to query TUN flags: %w", err)
	}
	// Fall back to unbatched I/O if initFromFlags left batchSize unset.
	// (Previously this check was duplicated after setMTU; once is enough.)
	if tun.batchSize == 0 {
		tun.batchSize = 1
	}
	tun.index, err = getIFIndex(name)
	if err != nil {
		return nil, fmt.Errorf("failed to get TUN index: %w", err)
	}
	if err = tun.setMTU(mtu); err != nil {
		return nil, fmt.Errorf("failed to set MTU: %w", err)
	}
	tun.statusListenersShutdown = make(chan struct{})
	go tun.routineNetlink()
	tun.tcp4GROTable = newTCPGROTable()
	tun.tcp6GROTable = newTCPGROTable()
	return tun, nil
}
// Name returns the interface name, resolving it once via initNameCache and
// caching the result (and any error) for subsequent calls.
func (tun *NativeTun) Name() (string, error) {
	tun.nameOnce.Do(tun.initNameCache)
	return tun.nameCache, tun.nameErr
}
// initNameCache populates nameCache/nameErr by issuing TUNGETIFF on the tun
// fd and extracting the NUL-terminated interface name. Called exactly once
// via nameOnce from Name.
func (tun *NativeTun) initNameCache() {
	sysconn, err := tun.tunFile.SyscallConn()
	if err != nil {
		tun.nameErr = err
		return
	}
	err = sysconn.Control(func(fd uintptr) {
		var ifr [ifReqSize]byte
		_, _, errno := unix.Syscall(
			unix.SYS_IOCTL,
			fd,
			uintptr(unix.TUNGETIFF),
			uintptr(unsafe.Pointer(&ifr[0])),
		)
		if errno != 0 {
			tun.nameErr = errno
			return
		}
		tun.nameCache = unix.ByteSliceToString(ifr[:])
	})
	// Don't clobber a more specific error set inside the closure.
	if err != nil && tun.nameErr == nil {
		tun.nameErr = err
	}
}
// MTU reads the interface MTU via the SIOCGIFMTU ioctl on a throwaway
// AF_INET datagram socket.
func (tun *NativeTun) MTU() (int, error) {
	name, err := tun.Name()
	if err != nil {
		return 0, err
	}
	// open datagram socket
	fd, err := unix.Socket(
		unix.AF_INET,
		unix.SOCK_DGRAM|unix.SOCK_CLOEXEC,
		0,
	)
	if err != nil {
		return 0, err
	}
	defer unix.Close(fd)
	var ifr [ifReqSize]byte
	copy(ifr[:], name)
	_, _, errno := unix.Syscall(
		unix.SYS_IOCTL,
		uintptr(fd),
		uintptr(unix.SIOCGIFMTU),
		uintptr(unsafe.Pointer(&ifr[0])),
	)
	if errno != 0 {
		return 0, errno
	}
	// The kernel writes the MTU into the ifreq union after the name field.
	return int(*(*uint32)(unsafe.Pointer(&ifr[unix.IFNAMSIZ]))), nil
}
// Events returns the channel on which device events (up/down/MTU changes)
// are delivered. The channel is closed when the device shuts down.
func (tun *NativeTun) Events() <-chan Event {
	return tun.events
}
// Write writes one or more packets to the device. With vnetHdr enabled, the
// packets are first run through GRO coalescing (handleGRO encodes a
// virtio-net header into the region before offset for each surviving packet);
// otherwise every packet is written as-is. Serialized by writeOpMu, which
// also guards toWrite and the GRO tables.
// NOTE(review): total accumulates the byte counts returned by tunFile.Write,
// while the Device interface documents a packet count — confirm intent.
func (tun *NativeTun) Write(bufs [][]byte, offset int) (int, error) {
	tun.writeOpMu.Lock()
	defer func() {
		tun.tcp4GROTable.reset()
		tun.tcp6GROTable.reset()
		tun.writeOpMu.Unlock()
	}()
	var (
		errs  error
		total int
	)
	tun.toWrite = tun.toWrite[:0]
	if tun.vnetHdr {
		err := handleGRO(bufs, offset, tun.tcp4GROTable, tun.tcp6GROTable, &tun.toWrite)
		if err != nil {
			return 0, err
		}
		// Include the virtio-net header written by handleGRO in each write.
		offset -= virtioNetHdrLen
	} else {
		for i := range bufs {
			tun.toWrite = append(tun.toWrite, i)
		}
	}
	for _, bufsI := range tun.toWrite {
		n, err := tun.tunFile.Write(bufs[bufsI][offset:])
		if errors.Is(err, syscall.EBADFD) {
			// The fd was closed underneath us; surface the canonical error.
			return total, os.ErrClosed
		}
		if err != nil {
			errs = errors.Join(errs, err)
		} else {
			total += n
		}
	}
	return total, errs
}
// handleVirtioRead splits in into bufs, leaving offset bytes at the front of
// each buffer. It mutates sizes to reflect the size of each element of bufs,
// and returns the number of packets read.
func handleVirtioRead(in []byte, bufs [][]byte, sizes []int, offset int) (int, error) {
	var hdr virtioNetHdr
	if err := hdr.decode(in); err != nil {
		return 0, err
	}
	in = in[virtioNetHdrLen:]
	if hdr.gsoType == unix.VIRTIO_NET_HDR_GSO_NONE {
		if hdr.flags&unix.VIRTIO_NET_HDR_F_NEEDS_CSUM != 0 {
			// The kernel left checksum completion to us; finish it in place.
			if err := gsoNoneChecksum(in, hdr.csumStart, hdr.csumOffset); err != nil {
				return 0, err
			}
		}
		if len(in) > len(bufs[0][offset:]) {
			return 0, fmt.Errorf("read len %d overflows bufs element len %d", len(in), len(bufs[0][offset:]))
		}
		// Not a GSO frame: deliver as a single packet.
		n := copy(bufs[0][offset:], in)
		sizes[0] = n
		return 1, nil
	}
	if hdr.gsoType != unix.VIRTIO_NET_HDR_GSO_TCPV4 && hdr.gsoType != unix.VIRTIO_NET_HDR_GSO_TCPV6 {
		return 0, fmt.Errorf("unsupported virtio GSO type: %d", hdr.gsoType)
	}
	// The GSO type must agree with the packet's IP version.
	ipVersion := in[0] >> 4
	switch ipVersion {
	case 4:
		if hdr.gsoType != unix.VIRTIO_NET_HDR_GSO_TCPV4 {
			return 0, fmt.Errorf("ip header version: %d, GSO type: %d", ipVersion, hdr.gsoType)
		}
	case 6:
		if hdr.gsoType != unix.VIRTIO_NET_HDR_GSO_TCPV6 {
			return 0, fmt.Errorf("ip header version: %d, GSO type: %d", ipVersion, hdr.gsoType)
		}
	default:
		return 0, fmt.Errorf("invalid ip header version: %d", ipVersion)
	}
	if len(in) <= int(hdr.csumStart+12) {
		return 0, errors.New("packet is too short")
	}
	// TCP header length from the data-offset field (upper nibble, in 32-bit
	// words). Note `>> 4 * 4` parses as `(x >> 4) * 4` in Go.
	tcpHLen := uint16(in[hdr.csumStart+12] >> 4 * 4)
	if tcpHLen < 20 || tcpHLen > 60 {
		return 0, fmt.Errorf("tcp header len is invalid: %d", tcpHLen)
	}
	hdr.hdrLen = hdr.csumStart + tcpHLen
	if len(in) < int(hdr.hdrLen) {
		return 0, fmt.Errorf("length of packet (%d) < virtioNetHdr.hdrLen (%d)", len(in), hdr.hdrLen)
	}
	if hdr.hdrLen < hdr.csumStart {
		return 0, fmt.Errorf("virtioNetHdr.hdrLen (%d) < virtioNetHdr.csumStart (%d)", hdr.hdrLen, hdr.csumStart)
	}
	cSumAt := int(hdr.csumStart + hdr.csumOffset)
	if cSumAt+1 >= len(in) {
		return 0, fmt.Errorf("end of checksum offset (%d) exceeds packet length (%d)", cSumAt+1, len(in))
	}
	// Split the oversized frame into MTU-sized segments.
	return tcpTSO(in, hdr, bufs, sizes, offset)
}
// Read reads one or more packets from the device into bufs at offset, setting
// sizes per packet and returning the packet count. Serialized by readOpMu
// because readBuff is shared scratch space.
func (tun *NativeTun) Read(bufs [][]byte, sizes []int, offset int) (int, error) {
	tun.readOpMu.Lock()
	defer tun.readOpMu.Unlock()
	select {
	case err := <-tun.errors:
		// Surface any asynchronous error (e.g. from the netlink listener).
		return 0, err
	default:
		readInto := bufs[0][offset:]
		if tun.vnetHdr {
			// Read the virtio-net header plus up to a full 64KiB GSO frame
			// into scratch, then split it into bufs below.
			readInto = tun.readBuff[:]
		}
		n, err := tun.tunFile.Read(readInto)
		if errors.Is(err, syscall.EBADFD) {
			err = os.ErrClosed
		}
		if err != nil {
			return 0, err
		}
		if tun.vnetHdr {
			return handleVirtioRead(readInto[:n], bufs, sizes, offset)
		}
		sizes[0] = n
		return 1, nil
	}
}