Ver Fonte

cmd/{k8s-proxy,containerboot,k8s-operator},kube: add health check and metrics endpoints for k8s-proxy (#16540)

* Modifies the k8s-proxy to expose health check and metrics
endpoints on the Pod's IP.

* Moves cmd/containerboot/healthz.go and cmd/containerboot/metrics.go to
  /kube to be shared with /k8s-proxy.

Updates #13358

Signed-off-by: David Bond <[email protected]>
David Bond há 7 meses atrás
pai
commit
4494705496

+ 0 - 57
cmd/containerboot/healthz.go

@@ -1,57 +0,0 @@
-// Copyright (c) Tailscale Inc & AUTHORS
-// SPDX-License-Identifier: BSD-3-Clause
-
-//go:build linux
-
-package main
-
-import (
-	"fmt"
-	"log"
-	"net/http"
-	"sync"
-
-	"tailscale.com/kube/kubetypes"
-)
-
-// healthz is a simple health check server, if enabled it returns 200 OK if
-// this tailscale node currently has at least one tailnet IP address else
-// returns 503.
-type healthz struct {
-	sync.Mutex
-	hasAddrs bool
-	podIPv4  string
-}
-
-func (h *healthz) ServeHTTP(w http.ResponseWriter, r *http.Request) {
-	h.Lock()
-	defer h.Unlock()
-
-	if h.hasAddrs {
-		w.Header().Add(kubetypes.PodIPv4Header, h.podIPv4)
-		if _, err := w.Write([]byte("ok")); err != nil {
-			http.Error(w, fmt.Sprintf("error writing status: %v", err), http.StatusInternalServerError)
-		}
-	} else {
-		http.Error(w, "node currently has no tailscale IPs", http.StatusServiceUnavailable)
-	}
-}
-
-func (h *healthz) update(healthy bool) {
-	h.Lock()
-	defer h.Unlock()
-
-	if h.hasAddrs != healthy {
-		log.Println("Setting healthy", healthy)
-	}
-	h.hasAddrs = healthy
-}
-
-// registerHealthHandlers registers a simple health handler at /healthz.
-// A containerized tailscale instance is considered healthy if
-// it has at least one tailnet IP address.
-func registerHealthHandlers(mux *http.ServeMux, podIPv4 string) *healthz {
-	h := &healthz{podIPv4: podIPv4}
-	mux.Handle("GET /healthz", h)
-	return h
-}

+ 9 - 7
cmd/containerboot/main.go

@@ -121,7 +121,9 @@ import (
 	"tailscale.com/client/tailscale"
 	"tailscale.com/ipn"
 	kubeutils "tailscale.com/k8s-operator"
+	healthz "tailscale.com/kube/health"
 	"tailscale.com/kube/kubetypes"
+	"tailscale.com/kube/metrics"
 	"tailscale.com/kube/services"
 	"tailscale.com/tailcfg"
 	"tailscale.com/types/logger"
@@ -232,13 +234,13 @@ func run() error {
 	}
 	defer killTailscaled()
 
-	var healthCheck *healthz
+	var healthCheck *healthz.Healthz
 	ep := &egressProxy{}
 	if cfg.HealthCheckAddrPort != "" {
 		mux := http.NewServeMux()
 
 		log.Printf("Running healthcheck endpoint at %s/healthz", cfg.HealthCheckAddrPort)
-		healthCheck = registerHealthHandlers(mux, cfg.PodIPv4)
+		healthCheck = healthz.RegisterHealthHandlers(mux, cfg.PodIPv4, log.Printf)
 
 		close := runHTTPServer(mux, cfg.HealthCheckAddrPort)
 		defer close()
@@ -249,12 +251,12 @@ func run() error {
 
 		if cfg.localMetricsEnabled() {
 			log.Printf("Running metrics endpoint at %s/metrics", cfg.LocalAddrPort)
-			registerMetricsHandlers(mux, client, cfg.DebugAddrPort)
+			metrics.RegisterMetricsHandlers(mux, client, cfg.DebugAddrPort)
 		}
 
 		if cfg.localHealthEnabled() {
 			log.Printf("Running healthcheck endpoint at %s/healthz", cfg.LocalAddrPort)
-			healthCheck = registerHealthHandlers(mux, cfg.PodIPv4)
+			healthCheck = healthz.RegisterHealthHandlers(mux, cfg.PodIPv4, log.Printf)
 		}
 
 		if cfg.egressSvcsTerminateEPEnabled() {
@@ -438,8 +440,8 @@ authLoop:
 	)
 	// egressSvcsErrorChan will get an error sent to it if this containerboot instance is configured to expose 1+
 	// egress services in HA mode and errored.
-	var egressSvcsErrorChan = make(chan error)
-	var ingressSvcsErrorChan = make(chan error)
+	egressSvcsErrorChan := make(chan error)
+	ingressSvcsErrorChan := make(chan error)
 	defer t.Stop()
 	// resetTimer resets timer for when to next attempt to resolve the DNS
 	// name for the proxy configured with TS_EXPERIMENTAL_DEST_DNS_NAME. The
@@ -644,7 +646,7 @@ runLoop:
 				}
 
 				if healthCheck != nil {
-					healthCheck.update(len(addrs) != 0)
+					healthCheck.Update(len(addrs) != 0)
 				}
 
 				if cfg.ServeConfigPath != "" {

+ 7 - 1
cmd/k8s-operator/proxygroup.go

@@ -826,6 +826,8 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p
 						// as containerboot does for ingress-pg-reconciler.
 						IssueCerts: opt.NewBool(i == 0),
 					},
+					LocalPort:          ptr.To(uint16(9002)),
+					HealthCheckEnabled: opt.NewBool(true),
 				},
 			}
 
@@ -849,7 +851,11 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p
 			}
 
 			if proxyClass != nil && proxyClass.Spec.TailscaleConfig != nil {
-				cfg.AcceptRoutes = &proxyClass.Spec.TailscaleConfig.AcceptRoutes
+				cfg.AcceptRoutes = opt.NewBool(proxyClass.Spec.TailscaleConfig.AcceptRoutes)
+			}
+
+			if proxyClass != nil && proxyClass.Spec.Metrics != nil {
+				cfg.MetricsEnabled = opt.NewBool(proxyClass.Spec.Metrics.Enable)
 			}
 
 			if len(endpoints[nodePortSvcName]) > 0 {

+ 2 - 0
cmd/k8s-operator/proxygroup_test.go

@@ -1379,6 +1379,8 @@ func TestKubeAPIServerType_DoesNotOverwriteServicesConfig(t *testing.T) {
 				Mode:       ptr.To(kubetypes.APIServerProxyModeNoAuth),
 				IssueCerts: opt.NewBool(true),
 			},
+			LocalPort:          ptr.To(uint16(9002)),
+			HealthCheckEnabled: opt.NewBool(true),
 		},
 	}
 	cfgB, err := json.Marshal(cfg)

+ 62 - 5
cmd/k8s-proxy/k8s-proxy.go

@@ -12,9 +12,12 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"net"
+	"net/http"
 	"os"
 	"os/signal"
 	"reflect"
+	"strconv"
 	"strings"
 	"syscall"
 	"time"
@@ -33,9 +36,11 @@ import (
 	"tailscale.com/ipn/store"
 	apiproxy "tailscale.com/k8s-operator/api-proxy"
 	"tailscale.com/kube/certs"
+	healthz "tailscale.com/kube/health"
 	"tailscale.com/kube/k8s-proxy/conf"
 	"tailscale.com/kube/kubetypes"
 	klc "tailscale.com/kube/localclient"
+	"tailscale.com/kube/metrics"
 	"tailscale.com/kube/services"
 	"tailscale.com/kube/state"
 	"tailscale.com/tailcfg"
@@ -63,6 +68,7 @@ func run(logger *zap.SugaredLogger) error {
 	var (
 		configPath = os.Getenv("TS_K8S_PROXY_CONFIG")
 		podUID     = os.Getenv("POD_UID")
+		podIP      = os.Getenv("POD_IP")
 	)
 	if configPath == "" {
 		return errors.New("TS_K8S_PROXY_CONFIG unset")
@@ -201,10 +207,57 @@ func run(logger *zap.SugaredLogger) error {
 		})
 	}
 
-	if cfg.Parsed.AcceptRoutes != nil {
+	if cfg.Parsed.HealthCheckEnabled.EqualBool(true) || cfg.Parsed.MetricsEnabled.EqualBool(true) {
+		addr := podIP
+		if addr == "" {
+			addr = cfg.GetLocalAddr()
+		}
+
+		addrPort := getLocalAddrPort(addr, cfg.GetLocalPort())
+		mux := http.NewServeMux()
+		localSrv := &http.Server{Addr: addrPort, Handler: mux}
+
+		if cfg.Parsed.MetricsEnabled.EqualBool(true) {
+			logger.Infof("Running metrics endpoint at %s/metrics", addrPort)
+			metrics.RegisterMetricsHandlers(mux, lc, "")
+		}
+
+		if cfg.Parsed.HealthCheckEnabled.EqualBool(true) {
+			ipV4, _ := ts.TailscaleIPs()
+			hz := healthz.RegisterHealthHandlers(mux, ipV4.String(), logger.Infof)
+			group.Go(func() error {
+				err := hz.MonitorHealth(ctx, lc)
+				if err == nil || errors.Is(err, context.Canceled) {
+					return nil
+				}
+				return err
+			})
+		}
+
+		group.Go(func() error {
+			errChan := make(chan error)
+			go func() {
+				if err := localSrv.ListenAndServe(); err != nil {
+					errChan <- err
+				}
+				close(errChan)
+			}()
+
+			select {
+			case <-ctx.Done():
+				sCtx, scancel := context.WithTimeout(serveCtx, 10*time.Second)
+				defer scancel()
+				return localSrv.Shutdown(sCtx)
+			case err := <-errChan:
+				return err
+			}
+		})
+	}
+
+	if v, ok := cfg.Parsed.AcceptRoutes.Get(); ok {
 		_, err = lc.EditPrefs(ctx, &ipn.MaskedPrefs{
 			RouteAllSet: true,
-			Prefs:       ipn.Prefs{RouteAll: *cfg.Parsed.AcceptRoutes},
+			Prefs:       ipn.Prefs{RouteAll: v},
 		})
 		if err != nil {
 			return fmt.Errorf("error editing prefs: %w", err)
@@ -285,10 +338,10 @@ func run(logger *zap.SugaredLogger) error {
 				prefs.HostnameSet = true
 				prefs.Hostname = *cfg.Parsed.Hostname
 			}
-			if cfg.Parsed.AcceptRoutes != nil && *cfg.Parsed.AcceptRoutes != currentPrefs.RouteAll {
-				cfgLogger = cfgLogger.With("AcceptRoutes", fmt.Sprintf("%v -> %v", currentPrefs.RouteAll, *cfg.Parsed.AcceptRoutes))
+			if v, ok := cfg.Parsed.AcceptRoutes.Get(); ok && v != currentPrefs.RouteAll {
+				cfgLogger = cfgLogger.With("AcceptRoutes", fmt.Sprintf("%v -> %v", currentPrefs.RouteAll, v))
 				prefs.RouteAllSet = true
-				prefs.Prefs.RouteAll = *cfg.Parsed.AcceptRoutes
+				prefs.Prefs.RouteAll = v
 			}
 			if !prefs.IsEmpty() {
 				if _, err := lc.EditPrefs(ctx, &prefs); err != nil {
@@ -304,6 +357,10 @@ func run(logger *zap.SugaredLogger) error {
 	}
 }
 
+// getLocalAddrPort joins addr and port into a "host:port" listen address.
+//
+// addr may be a hostname, an IPv4 literal, or an IPv6 literal that is either
+// bare ("::") or already bracketed ("[::]", the form the config falls back to
+// when no local address is set). net.JoinHostPort adds brackets around any
+// host containing a colon, so an already-bracketed literal would otherwise
+// become "[[::]]:9002", which is not a valid listen address. Surrounding
+// brackets are therefore stripped before joining.
+func getLocalAddrPort(addr string, port uint16) string {
+	if strings.HasPrefix(addr, "[") && strings.HasSuffix(addr, "]") {
+		addr = addr[1 : len(addr)-1]
+	}
+	return net.JoinHostPort(addr, strconv.FormatUint(uint64(port), 10))
+}
+
 func getStateStore(path *string, logger *zap.SugaredLogger) (ipn.StateStore, error) {
 	p := "mem:"
 	if path != nil {

+ 84 - 0
kube/health/healthz.go

@@ -0,0 +1,84 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+//go:build !plan9
+
+// Package health contains shared types and underlying methods for serving
+// a `/healthz` endpoint for containerboot and k8s-proxy.
+package health
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"sync"
+
+	"tailscale.com/client/local"
+	"tailscale.com/ipn"
+	"tailscale.com/kube/kubetypes"
+	"tailscale.com/types/logger"
+)
+
+// Healthz is a simple health check handler. When served, it returns 200 OK
+// if this tailscale node is currently marked as having at least one tailnet
+// IP address, and 503 otherwise.
+type Healthz struct {
+	// The embedded mutex is held by ServeHTTP and Update while they read or
+	// write hasAddrs.
+	sync.Mutex
+	// hasAddrs is the most recent state recorded via Update.
+	hasAddrs bool
+	// podIPv4 is sent back in the kubetypes.PodIPv4Header response header on
+	// healthy responses.
+	podIPv4  string
+	// logger is called whenever Update changes the recorded state.
+	logger   logger.Logf
+}
+
+// ServeHTTP reports the recorded health state. While the node is marked
+// healthy it sets the pod IPv4 header and writes "ok"; otherwise it replies
+// with 503 Service Unavailable.
+func (h *Healthz) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+	h.Lock()
+	defer h.Unlock()
+
+	// Unhealthy: bail out early so the healthy path stays unindented.
+	if !h.hasAddrs {
+		http.Error(w, "node currently has no tailscale IPs", http.StatusServiceUnavailable)
+		return
+	}
+
+	w.Header().Add(kubetypes.PodIPv4Header, h.podIPv4)
+	_, writeErr := w.Write([]byte("ok"))
+	if writeErr != nil {
+		http.Error(w, fmt.Sprintf("error writing status: %v", writeErr), http.StatusInternalServerError)
+	}
+}
+
+// Update records whether the node is currently healthy, logging only when
+// the recorded state actually changes.
+func (h *Healthz) Update(healthy bool) {
+	h.Lock()
+	defer h.Unlock()
+
+	changed := h.hasAddrs != healthy
+	h.hasAddrs = healthy
+	if changed {
+		h.logger("Setting healthy %v", healthy)
+	}
+}
+
+// MonitorHealth watches the IPN bus through lc and calls Update for every
+// notification that carries a netmap, marking the node healthy when its self
+// node has at least one address.
+//
+// It blocks until opening the watch or reading the next notification fails,
+// and returns that error. Presumably this includes cancellation of ctx; the
+// k8s-proxy caller treats context.Canceled as a clean exit.
+func (h *Healthz) MonitorHealth(ctx context.Context, lc *local.Client) error {
+	w, err := lc.WatchIPNBus(ctx, ipn.NotifyInitialNetMap)
+	if err != nil {
+		return fmt.Errorf("failed to watch IPN bus: %w", err)
+	}
+	// Release the watcher on every return path; previously it was never
+	// closed once the loop exited.
+	defer w.Close()
+
+	for {
+		n, err := w.Next()
+		if err != nil {
+			return err
+		}
+
+		// Only notifications that include a netmap say anything about the
+		// node's addresses; ignore the rest.
+		if n.NetMap != nil {
+			h.Update(n.NetMap.SelfNode.Addresses().Len() != 0)
+		}
+	}
+}
+
+// RegisterHealthHandlers installs a health handler on mux at "GET /healthz"
+// and returns it so the caller can report state changes through Update.
+// A containerized tailscale instance is considered healthy if it has at
+// least one tailnet IP address. podIPv4 is echoed in the pod IPv4 response
+// header, and logger is used to report health state transitions.
+func RegisterHealthHandlers(mux *http.ServeMux, podIPv4 string, logger logger.Logf) *Healthz {
+	hz := new(Healthz)
+	hz.podIPv4 = podIPv4
+	hz.logger = logger
+
+	mux.Handle("GET /healthz", hz)
+	return hz
+}

+ 27 - 9
kube/k8s-proxy/conf/conf.go

@@ -49,21 +49,23 @@ type VersionedConfig struct {
 }
 
 type ConfigV1Alpha1 struct {
-	AuthKey   *string `json:",omitempty"` // Tailscale auth key to use.
-	State     *string `json:",omitempty"` // Path to the Tailscale state.
-	LogLevel  *string `json:",omitempty"` // "debug", "info". Defaults to "info".
-	App       *string `json:",omitempty"` // e.g. kubetypes.AppProxyGroupKubeAPIServer
-	ServerURL *string `json:",omitempty"` // URL of the Tailscale coordination server.
-	// StaticEndpoints are additional, user-defined endpoints that this node
-	// should advertise amongst its wireguard endpoints.
-	StaticEndpoints []netip.AddrPort `json:",omitempty"`
+	AuthKey            *string  `json:",omitempty"` // Tailscale auth key to use.
+	State              *string  `json:",omitempty"` // Path to the Tailscale state.
+	LogLevel           *string  `json:",omitempty"` // "debug", "info". Defaults to "info".
+	App                *string  `json:",omitempty"` // e.g. kubetypes.AppProxyGroupKubeAPIServer
+	ServerURL          *string  `json:",omitempty"` // URL of the Tailscale coordination server.
+	LocalAddr          *string  `json:",omitempty"` // The address to use for serving HTTP health checks and metrics (defaults to all interfaces).
+	LocalPort          *uint16  `json:",omitempty"` // The port to use for serving HTTP health checks and metrics (defaults to 9002).
+	MetricsEnabled     opt.Bool `json:",omitempty"` // Serve metrics on <LocalAddr>:<LocalPort>/metrics.
+	HealthCheckEnabled opt.Bool `json:",omitempty"` // Serve health check on <LocalAddr>:<LocalPort>/healthz.
 
 	// TODO(tomhjp): The remaining fields should all be reloadable during
 	// runtime, but currently missing most of the APIServerProxy fields.
 	Hostname          *string               `json:",omitempty"` // Tailscale device hostname.
-	AcceptRoutes      *bool                 `json:",omitempty"` // Accepts routes advertised by other Tailscale nodes.
+	AcceptRoutes      opt.Bool              `json:",omitempty"` // Accepts routes advertised by other Tailscale nodes.
 	AdvertiseServices []string              `json:",omitempty"` // Tailscale Services to advertise.
 	APIServerProxy    *APIServerProxyConfig `json:",omitempty"` // Config specific to the API Server proxy.
+	StaticEndpoints   []netip.AddrPort      `json:",omitempty"` // StaticEndpoints are additional, user-defined endpoints that this node should advertise amongst its wireguard endpoints.
 }
 
 type APIServerProxyConfig struct {
@@ -108,3 +110,19 @@ func Load(raw []byte) (c Config, err error) {
 
 	return c, nil
 }
+
+// GetLocalAddr returns the configured LocalAddr, or "[::]" when none is set.
+func (c *Config) GetLocalAddr() string {
+	if addr := c.Parsed.LocalAddr; addr != nil {
+		return *addr
+	}
+
+	return "[::]"
+}
+
+// GetLocalPort returns the configured LocalPort, or 9002 when none is set.
+func (c *Config) GetLocalPort() uint16 {
+	if port := c.Parsed.LocalPort; port != nil {
+		return *port
+	}
+
+	return 9002
+}

+ 5 - 3
cmd/containerboot/metrics.go → kube/metrics/metrics.go

@@ -1,9 +1,11 @@
 // Copyright (c) Tailscale Inc & AUTHORS
 // SPDX-License-Identifier: BSD-3-Clause
 
-//go:build linux
+//go:build !plan9
 
-package main
+// Package metrics contains shared types and underlying methods for serving
+// localapi metrics. This is primarily consumed by containerboot and k8s-proxy.
+package metrics
 
 import (
 	"fmt"
@@ -68,7 +70,7 @@ func (m *metrics) handleDebug(w http.ResponseWriter, r *http.Request) {
 // In 1.78.x and 1.80.x, it also proxies debug paths to tailscaled's debug
 // endpoint if configured to ease migration for a breaking change serving user
 // metrics instead of debug metrics on the "metrics" port.
-func registerMetricsHandlers(mux *http.ServeMux, lc *local.Client, debugAddrPort string) {
+func RegisterMetricsHandlers(mux *http.ServeMux, lc *local.Client, debugAddrPort string) {
 	m := &metrics{
 		lc:            lc,
 		debugEndpoint: debugAddrPort,