Add experimental host_query API for local identity lookups

Programs running alongside nebula have no simple way to ask "who is this
vpn address?" when making authorization decisions, e.g. a nebula-aware
webapp that wants to identify an inbound connection by its source
address instead of presenting a login form. The existing surfaces are
the sshd admin interface (not scriptable from app code) and the
lighthouse-only DNS TXT lookup, which returns raw cert JSON over an
awkward transport.

This adds an opt-in `host_query` config section that serves a small
HTTP+JSON API on a unix socket or tcp address, requiring no client
library to consume:

  GET /v1/host?addr=<vpn addr>  identity of the host owning the address
                                (an established peer, or this node).
                                addr may include a port so a server can
                                pass a connection's RemoteAddr through
                                unparsed.
  GET /v1/self                  this node's own identity.

Responses carry the certificate-derived identity only: name, vpn
addresses, networks, unsafe networks, groups, fingerprint, issuer,
validity window, and cert version.

The self-vs-peer lookup logic is shared with the DNS TXT handler via a
new findCertificateForVpnAddr helper, which also swaps the panicking
GetDefaultCertificate call for the nil-returning accessor so a missing
certificate yields an empty answer instead of a crash.

The listener follows the statsServer lifecycle: the whole section is
reloadable via SIGHUP, including moving between socket paths and tcp
addresses. Unix sockets default to mode 0600, stale sockets left by an
unclean exit are removed at bind time, and a non-socket file at the
configured path is never replaced.

https://claude.ai/code/session_01Nibp24Pgk2JMue8VyWHq7o
This commit is contained in:
Claude
2026-06-13 01:46:02 +00:00
committed by JackDoan
parent 7d3166a19d
commit e87ccdc9ea
7 changed files with 962 additions and 22 deletions
+459
View File
@@ -0,0 +1,459 @@
package nebula
import (
"context"
"encoding/json"
"errors"
"fmt"
"io/fs"
"log/slog"
"net"
"net/http"
"net/netip"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/slackhq/nebula/cert"
"github.com/slackhq/nebula/config"
)
// hostQueryServer owns the local host query API: a small HTTP+JSON listener
// on a unix socket or tcp address that lets other programs on this machine
// resolve a vpn address to its certificate identity (name, groups, networks)
// for making authorization decisions. It mirrors the lifecycle shape of
// statsServer: constructor wires the reload callback, reload records config,
// Start builds and runs the runtime, Stop tears it down.
type hostQueryServer struct {
l *slog.Logger
ctx context.Context
hostMap *HostMap
pki *PKI
// enabled mirrors `host_query.enabled`. Start consults it so callers
// don't need to know the gating rules.
enabled atomic.Bool
runMu sync.Mutex
runCfg *hostQueryConfig
run *hostQueryRuntime // non-nil while a runtime is live
}
// hostQueryRuntime is the live state owned by a single Start invocation.
// Start stashes a pointer under runMu; Stop and Start's own exit path use
// pointer equality to tell "my runtime" apart from one that replaced it
// after a reload.
type hostQueryRuntime struct {
server *http.Server
listener net.Listener
}
// hostQueryConfig is the snapshot of host_query config that drives the
// runtime. It is comparable with == so reload can detect "no change" cheaply.
type hostQueryConfig struct {
enabled bool
listen string // raw config value, for error messages
network string // "unix" or "tcp"
addr string // socket path or host:port
// socketMode is the file mode applied to the unix socket after bind.
socketMode fs.FileMode
}
// newHostQueryServerFromConfig builds a hostQueryServer, applies the initial
// config, and registers a reload callback. The reload callback is registered
// before the initial config is applied so a SIGHUP can later enable, fix, or
// disable the listener even if the initial application failed.
//
// Construction never binds the listener; that happens in Start, so config
// tests are side effect free. Start is safe to call unconditionally: it
// no-ops when the host query API is disabled. The returned pointer is always
// non-nil, even on error.
func newHostQueryServerFromConfig(ctx context.Context, l *slog.Logger, pki *PKI, hostMap *HostMap, c *config.C) (*hostQueryServer, error) {
h := &hostQueryServer{
l: l,
ctx: ctx,
hostMap: hostMap,
pki: pki,
}
c.RegisterReloadCallback(func(c *config.C) {
if err := h.reload(c, false); err != nil {
h.l.Error("Failed to reload host query API from config", "error", err)
}
})
if err := h.reload(c, true); err != nil {
return h, err
}
return h, nil
}
// reload records the latest config. On the initial call it only records it;
// Control.Start is what launches the first runtime via hostQueryStart. On
// later calls it reconciles the running runtime with the new config:
//
// - newly enabled -> spawn Start
// - newly disabled -> Stop the runtime
// - config changed (still enabled) -> Stop the old, Start the new
// - no change -> no-op
func (h *hostQueryServer) reload(c *config.C, initial bool) error {
newCfg, err := loadHostQueryConfig(c)
if err != nil {
return err
}
h.runMu.Lock()
sameCfg := h.runCfg != nil && *h.runCfg == newCfg
h.runCfg = &newCfg
running := h.run != nil
h.runMu.Unlock()
h.enabled.Store(newCfg.enabled)
if initial || sameCfg {
return nil
}
if running {
h.Stop()
}
if newCfg.enabled {
go h.Start()
}
return nil
}
// Start binds the listener from the latest config and serves until Stop is
// called or ctx fires. Safe to call when the host query API is disabled or
// already running (both no-op).
func (h *hostQueryServer) Start() {
if !h.enabled.Load() {
return
}
h.runMu.Lock()
if h.ctx.Err() != nil || h.run != nil || h.runCfg == nil {
h.runMu.Unlock()
return
}
cfg := *h.runCfg
ln, err := h.listen(cfg)
if err != nil {
// Drop the cached config so a SIGHUP with the same config re-triggers
// Start once the user fixes the underlying problem.
h.runCfg = nil
h.runMu.Unlock()
h.l.Error("Failed to start host query listener", "listen", cfg.listen, "error", err)
return
}
mux := http.NewServeMux()
mux.HandleFunc("GET /v1/host", h.handleHost)
mux.HandleFunc("GET /v1/self", h.handleSelf)
srv := &http.Server{Handler: mux, ReadHeaderTimeout: 5 * time.Second}
rt := &hostQueryRuntime{server: srv, listener: ln}
h.run = rt
h.runMu.Unlock()
h.l.Info("Starting host query listener", "network", cfg.network, "addr", ln.Addr())
cleanExit := h.serve(srv, ln)
// A Stop that raced our bind shut the server down before Serve could
// adopt the listener; closing it again is harmless and guarantees a unix
// socket file gets unlinked.
_ = ln.Close()
// Clear our runtime only if nothing has replaced it. Stop races through
// here too but leaves h.run == nil, so the pointer check skips.
h.runMu.Lock()
if h.run == rt {
h.run = nil
// A listener that exited with an error leaves runCfg cached as if it
// were applied. Drop it so a SIGHUP with the same config re-triggers
// Start once the user fixes the underlying problem.
if !cleanExit {
h.runCfg = nil
}
}
h.runMu.Unlock()
}
// serve runs srv.Serve and ensures ctx cancellation unblocks it. Returns true
// if the listener exited cleanly (Stop, ctx cancellation, or any other
// http.ErrServerClosed path), false on an unexpected error.
func (h *hostQueryServer) serve(srv *http.Server, ln net.Listener) bool {
// Per-invocation watcher: ctx cancellation triggers a server shutdown
// which in turn unblocks Serve. Closing `done` on exit keeps the watcher
// from outliving this call.
done := make(chan struct{})
go func() {
select {
case <-h.ctx.Done():
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := srv.Shutdown(shutdownCtx); err != nil {
h.l.Warn("Failed to shut down host query listener", "error", err)
}
case <-done:
}
}()
defer close(done)
err := srv.Serve(ln)
if err == nil || errors.Is(err, http.ErrServerClosed) {
return true
}
h.l.Error("Host query listener exited", "error", err)
return false
}
// Stop tears down the active runtime, if any. Idempotent.
func (h *hostQueryServer) Stop() {
h.runMu.Lock()
rt := h.run
h.run = nil
h.runMu.Unlock()
if rt == nil {
return
}
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := rt.server.Shutdown(shutdownCtx); err != nil {
h.l.Warn("Failed to shut down host query listener", "error", err)
}
}
// listen binds the configured address. For unix sockets it also clears a
// stale socket file left by an unclean exit and applies the configured file
// mode.
func (h *hostQueryServer) listen(cfg hostQueryConfig) (net.Listener, error) {
if cfg.network == "unix" {
return h.listenUnix(cfg)
}
if host, _, err := net.SplitHostPort(cfg.addr); err == nil {
ip, ipErr := netip.ParseAddr(host)
if host == "" || (ipErr == nil && !ip.IsLoopback()) {
h.l.Warn("host_query is listening on a non-loopback tcp address; anything that can reach it can query host identities", "addr", cfg.addr)
}
}
return net.Listen("tcp", cfg.addr)
}
func (h *hostQueryServer) listenUnix(cfg hostQueryConfig) (net.Listener, error) {
if fi, err := os.Stat(cfg.addr); err == nil {
if fi.Mode()&os.ModeSocket == 0 {
return nil, fmt.Errorf("host_query.listen path %s exists and is not a socket, refusing to replace it", cfg.addr)
}
// A normal shutdown unlinks the socket (unlink-on-close), so a file
// here means a previous process exited uncleanly. Remove it so the
// bind below can succeed.
if err = os.Remove(cfg.addr); err != nil {
return nil, fmt.Errorf("failed to remove stale socket %s: %w", cfg.addr, err)
}
}
ln, err := net.Listen("unix", cfg.addr)
if err != nil {
return nil, err
}
// The socket is briefly live with umask-derived permissions before this
// chmod lands; tolerated because connections accepted in that window
// still only reach this read-only API.
if err = os.Chmod(cfg.addr, cfg.socketMode); err != nil {
_ = ln.Close()
return nil, fmt.Errorf("failed to set mode on socket %s: %w", cfg.addr, err)
}
return ln, nil
}
func (h *hostQueryServer) certState() *CertState {
if h.pki == nil {
return nil
}
return h.pki.getCertState()
}
// handleHost serves GET /v1/host?addr=<vpn addr>, answering with the identity
// of the host that owns the address: a peer with an active tunnel, or this
// node itself. addr may include a port, which is ignored, so clients can pass
// a connection's remote address through without parsing it.
func (h *hostQueryServer) handleHost(w http.ResponseWriter, r *http.Request) {
q := r.URL.Query().Get("addr")
if q == "" {
writeJSONError(w, http.StatusBadRequest, "missing addr parameter")
return
}
ip, err := parseQueryAddrParam(q)
if err != nil {
writeJSONError(w, http.StatusBadRequest, "invalid address")
return
}
crt := findCertificateForVpnAddr(h.certState(), h.hostMap, ip)
if crt == nil {
writeJSONError(w, http.StatusNotFound, "no active tunnel for address")
return
}
h.writeHostIdentity(w, crt)
}
// handleSelf serves GET /v1/self, answering with this node's own identity.
func (h *hostQueryServer) handleSelf(w http.ResponseWriter, r *http.Request) {
var crt cert.Certificate
if cs := h.certState(); cs != nil {
crt = cs.getCertificate(cs.initiatingVersion)
}
if crt == nil {
writeJSONError(w, http.StatusInternalServerError, "no certificate available")
return
}
h.writeHostIdentity(w, crt)
}
func (h *hostQueryServer) writeHostIdentity(w http.ResponseWriter, crt cert.Certificate) {
id, err := newHostIdentity(crt)
if err != nil {
writeJSONError(w, http.StatusInternalServerError, "failed to fingerprint certificate")
return
}
w.Header().Set("Content-Type", "application/json")
if err = json.NewEncoder(w).Encode(id); err != nil {
h.l.Debug("Failed to write host query response", "error", err)
}
}
// findCertificateForVpnAddr answers "who owns this vpn address": ourselves
// (from local cert state, since the hostmap never carries an entry for this
// node) or a peer with an active tunnel. Returns nil when the address is
// unknown or the tunnel is mid-teardown.
func findCertificateForVpnAddr(cs *CertState, hostMap *HostMap, ip netip.Addr) cert.Certificate {
if cs != nil && cs.myVpnAddrsTable != nil && cs.myVpnAddrsTable.Contains(ip) {
return cs.getCertificate(cs.initiatingVersion)
}
hostinfo := hostMap.QueryVpnAddr(ip)
if hostinfo == nil {
return nil
}
cc := hostinfo.GetCert()
if cc == nil {
return nil
}
return cc.Certificate
}
// hostIdentity is the JSON document served for both /v1/host and /v1/self.
// Every field is derived from the authenticated certificate alone.
type hostIdentity struct {
Name string `json:"name"`
VpnAddrs []netip.Addr `json:"vpnAddrs"`
Networks []netip.Prefix `json:"networks"`
UnsafeNetworks []netip.Prefix `json:"unsafeNetworks"`
Groups []string `json:"groups"`
Fingerprint string `json:"fingerprint"`
Issuer string `json:"issuer"`
NotBefore time.Time `json:"notBefore"`
NotAfter time.Time `json:"notAfter"`
CertVersion int `json:"certVersion"`
}
func newHostIdentity(crt cert.Certificate) (hostIdentity, error) {
fp, err := crt.Fingerprint()
if err != nil {
return hostIdentity{}, err
}
// Slices are always allocated so they marshal as [] rather than null;
// consumers iterate groups without a presence check.
networks := crt.Networks()
id := hostIdentity{
Name: crt.Name(),
VpnAddrs: make([]netip.Addr, 0, len(networks)),
Networks: append(make([]netip.Prefix, 0, len(networks)), networks...),
UnsafeNetworks: append(make([]netip.Prefix, 0, len(crt.UnsafeNetworks())), crt.UnsafeNetworks()...),
Groups: append(make([]string, 0, len(crt.Groups())), crt.Groups()...),
Fingerprint: fp,
Issuer: crt.Issuer(),
NotBefore: crt.NotBefore(),
NotAfter: crt.NotAfter(),
CertVersion: int(crt.Version()),
}
for _, n := range networks {
id.VpnAddrs = append(id.VpnAddrs, n.Addr())
}
return id, nil
}
func writeJSONError(w http.ResponseWriter, status int, msg string) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(status)
_ = json.NewEncoder(w).Encode(map[string]string{"error": msg})
}
// parseQueryAddrParam parses the addr query parameter, accepting a bare
// address or an address with a port (`192.168.100.7:54321`, `[fd00::1]:443`)
// so callers can pass a connection's RemoteAddr straight through. The result
// is unmapped: 4in6 addresses (::ffff:a.b.c.d) are normalized to ipv4.
func parseQueryAddrParam(s string) (netip.Addr, error) {
if ip, err := netip.ParseAddr(s); err == nil {
return ip.Unmap(), nil
}
ap, err := netip.ParseAddrPort(s)
if err != nil {
return netip.Addr{}, err
}
return ap.Addr().Unmap(), nil
}
func loadHostQueryConfig(c *config.C) (hostQueryConfig, error) {
cfg := hostQueryConfig{
enabled: c.GetBool("host_query.enabled", false),
listen: c.GetString("host_query.listen", ""),
}
if !cfg.enabled {
return cfg, nil
}
if cfg.listen == "" {
return cfg, errors.New("host_query.listen can not be empty when host_query is enabled")
}
network, addr, err := parseHostQueryListen(cfg.listen)
if err != nil {
return cfg, err
}
cfg.network = network
cfg.addr = addr
if network == "unix" {
// Read as a string so YAML can't reinterpret the octal literal.
modeStr := c.GetString("host_query.socket_mode", "0600")
mode, err := strconv.ParseUint(modeStr, 8, 32)
if err != nil || fs.FileMode(mode)&^fs.ModePerm != 0 {
return cfg, fmt.Errorf("host_query.socket_mode was not a valid octal file mode: %s", modeStr)
}
cfg.socketMode = fs.FileMode(mode)
}
return cfg, nil
}
// parseHostQueryListen splits the host_query.listen config value into a
// network and address for net.Listen: `unix:///abs/path.sock` selects a unix
// socket, anything else must be a tcp host:port.
func parseHostQueryListen(listen string) (network string, addr string, err error) {
if path, ok := strings.CutPrefix(listen, "unix://"); ok {
if !filepath.IsAbs(path) {
return "", "", fmt.Errorf("host_query.listen unix socket path must be absolute: %s", listen)
}
return "unix", path, nil
}
if _, _, err = net.SplitHostPort(listen); err != nil {
return "", "", fmt.Errorf("host_query.listen must be a unix:// socket path or a host:port address: %s", listen)
}
return "tcp", listen, nil
}