diff --git a/cmd/nebula-service/main.go b/cmd/nebula-service/main.go index efbcc8b8..992f9c72 100644 --- a/cmd/nebula-service/main.go +++ b/cmd/nebula-service/main.go @@ -85,7 +85,12 @@ func main() { } go ctrl.ShutdownBlock() - wait() + + if err := wait(); err != nil { + l.WithError(err).Error("Nebula stopped due to fatal error") + l.Info("Goodbye") + os.Exit(2) + } l.Info("Goodbye") } diff --git a/cmd/nebula/main.go b/cmd/nebula/main.go index fa2c5e7f..079ffbe7 100644 --- a/cmd/nebula/main.go +++ b/cmd/nebula/main.go @@ -80,7 +80,12 @@ func main() { go ctrl.ShutdownBlock() notifyReady(l) - wait() + + if err := wait(); err != nil { + l.WithError(err).Error("Nebula stopped due to fatal error") + l.Info("Goodbye") + os.Exit(2) + } l.Info("Goodbye") } diff --git a/control.go b/control.go index bb071933..6c0d6a34 100644 --- a/control.go +++ b/control.go @@ -65,8 +65,11 @@ type ControlHostInfo struct { } // Start actually runs nebula, this is a nonblocking call. -// The returned function can be used to wait for nebula to fully stop. -func (c *Control) Start() (func(), error) { +// The returned function blocks until nebula has fully stopped and returns the +// first fatal reader error (if any). A nil error means nebula shut down +// gracefully; a non-nil error means a reader hit an unexpected failure that +// triggered the shutdown. +func (c *Control) Start() (func() error, error) { c.stateLock.Lock() if c.state != Stopped { c.stateLock.Unlock() @@ -97,6 +100,8 @@ func (c *Control) Start() (func(), error) { c.lighthouseStart() } + c.f.triggerShutdown = c.Stop + // Start reading packets. c.state = Started c.stateLock.Unlock() diff --git a/interface.go b/interface.go index c38130c5..c94043d7 100644 --- a/interface.go +++ b/interface.go @@ -89,6 +89,12 @@ type Interface struct { readers []io.ReadWriteCloser wg sync.WaitGroup + // fatalErr holds the first unexpected reader error that caused shutdown. + // nil means "no fatal error" (yet) + fatalErr atomic.Pointer[error] + // triggerShutdown is a function that will be run exactly once, when onFatal swaps something non-nil into fatalErr + triggerShutdown func() + metricHandshakes metrics.Histogram messageMetrics *MessageMetrics cachedPacketMetrics *cachedPacketMetrics @@ -244,6 +250,7 @@ func (f *Interface) activate() error { f.readers[i] = reader } + f.wg.Add(1) // for us to wait on Close() to return if err = f.inside.Activate(); err != nil { f.inside.Close() return err @@ -252,7 +259,7 @@ func (f *Interface) activate() error { return nil } -func (f *Interface) run() (func(), error) { +func (f *Interface) run() (func() error, error) { // Launch n queues to read packets from udp for i := 0; i < f.routines; i++ { f.wg.Go(func() { @@ -267,7 +274,24 @@ func (f *Interface) run() (func(), error) { }) } - return f.wg.Wait, nil + return func() error { + f.wg.Wait() + if e := f.fatalErr.Load(); e != nil { + return *e + } + return nil + }, nil +} + +// onFatal stores the first fatal reader error, and calls triggerShutdown if it was the first one +func (f *Interface) onFatal(err error) { + swapped := f.fatalErr.CompareAndSwap(nil, &err) + if !swapped { + return + } + if f.triggerShutdown != nil { + f.triggerShutdown() + } } func (f *Interface) listenOut(i int) { @@ -291,7 +315,7 @@ func (f *Interface) listenOut(i int) { if err != nil && !f.closed.Load() { f.l.WithError(err).Error("Error while reading inbound packet, closing") - //TODO: Trigger Control to close + f.onFatal(err) } f.l.Infof("underlay reader %v is done", i) @@ -310,7 +334,7 @@ func (f *Interface) listenIn(reader io.ReadWriteCloser, i int) { if err != nil { if !f.closed.Load() { f.l.WithError(err).WithField("reader", i).Error("Error while reading outbound packet, closing") - //TODO: Trigger Control to close + f.onFatal(err) } break } @@ -505,5 +529,7 @@ func (f *Interface) Close() error { } // Release the tun device (closing the tun also closes all readers) - return f.inside.Close() + err = f.inside.Close() + f.wg.Done() + return err } diff --git a/overlay/tun_linux.go b/overlay/tun_linux.go index 8bd1609b..d5c6c745 100644 --- a/overlay/tun_linux.go +++ b/overlay/tun_linux.go @@ -831,18 +831,18 @@ func (t *tun) Close() error { } err := t.readers[i].Close() if err != nil { - t.l.WithField("reader", i).WithError(err).Error("Error closing tun reader") + t.l.WithField("reader", i).WithError(err).Error("error closing tun reader") } else { - t.l.WithField("reader", i).Info("Closed tun reader") + t.l.WithField("reader", i).Info("closed tun reader") } } //this is t.readers[0] too err := t.tunFile.Close() if err != nil { - t.l.WithField("reader", 0).WithError(err).Error("Error closing tun reader") + t.l.WithField("reader", 0).WithError(err).Error("error closing tun reader") } else { - t.l.WithField("reader", 0).Info("Closed tun reader") + t.l.WithField("reader", 0).Info("closed tun reader") } return err } diff --git a/service/service.go b/service/service.go index c86d08c3..899e851d 100644 --- a/service/service.go +++ b/service/service.go @@ -144,10 +144,10 @@ func New(control *nebula.Control) (*Service, error) { } }) - // Add the nebula wait function to the group + // Add the nebula wait function to the group so a fatal reader error + // propagates out through errgroup.Wait(). eg.Go(func() error { - wait() - return nil + return wait() }) return &s, nil