Browse Source

tailcfg, health: add way for control plane to add problems to health check

So if the control plane knows that something's broken about the node, it can
include problem(s) in MapResponse and "tailscale status" will show it.
(and GUIs in the future, as it's in ipnstate.Status/JSON)

This also bumps the MapRequest.Version, though it's not strictly
required. Doesn't hurt.

Signed-off-by: Brad Fitzpatrick <[email protected]>
Brad Fitzpatrick 4 years ago
parent
commit
aae622314e
5 changed files with 39 additions and 1 deletions
  1. 5 0
      control/controlclient/map.go
  2. 11 0
      health/health.go
  3. 6 0
      ipn/ipnlocal/local.go
  4. 10 1
      tailcfg/tailcfg.go
  5. 7 0
      types/netmap/netmap.go

+ 5 - 0
control/controlclient/map.go

@@ -44,6 +44,7 @@ type mapSession struct {
 	collectServices        bool
 	previousPeers          []*tailcfg.Node // for delta-purposes
 	lastDomain             string
+	lastHealth             []string
 
 	// netMapBuilding is non-nil during a netmapForResponse call,
 	// containing the value to be returned, once fully populated.
@@ -105,6 +106,9 @@ func (ms *mapSession) netmapForResponse(resp *tailcfg.MapResponse) *netmap.Netwo
 	if resp.Domain != "" {
 		ms.lastDomain = resp.Domain
 	}
+	if resp.Health != nil {
+		ms.lastHealth = resp.Health
+	}
 
 	nm := &netmap.NetworkMap{
 		NodeKey:         tailcfg.NodeKey(ms.privateNodeKey.Public()),
@@ -118,6 +122,7 @@ func (ms *mapSession) netmapForResponse(resp *tailcfg.MapResponse) *netmap.Netwo
 		CollectServices: ms.collectServices,
 		DERPMap:         ms.lastDERPMap,
 		Debug:           resp.Debug,
+		ControlHealth:   ms.lastHealth,
 	}
 	ms.netMapBuilding = nm
 

+ 11 - 0
health/health.go

@@ -40,6 +40,7 @@ var (
 	ipnWantRunning          bool
 	anyInterfaceUp          = true // until told otherwise
 	udp4Unbound             bool
+	controlHealth           []string
 )
 
 // Subsystem is the name of a subsystem whose health can be monitored.
@@ -141,6 +142,13 @@ func setLocked(key Subsystem, err error) {
 	}
 }
 
+// SetControlHealth replaces the stored list of health problems reported
+// by the control plane with problems and then calls selfCheckLocked,
+// all while holding mu. Each stored string is later surfaced as its own
+// error by overallErrorLocked; a nil or empty slice contributes none.
+func SetControlHealth(problems []string) {
+	mu.Lock()
+	defer mu.Unlock()
+	// The slice is stored as-is (not copied).
+	controlHealth = problems
+	selfCheckLocked()
+}
+
 // GotStreamedMapResponse notes that we got a tailcfg.MapResponse
 // message in streaming mode, even if it's just a keep-alive message.
 func GotStreamedMapResponse() {
@@ -318,6 +326,9 @@ func overallErrorLocked() error {
 	for regionID, problem := range derpRegionHealthProblem {
 		errs = append(errs, fmt.Errorf("derp%d: %v", regionID, problem))
 	}
+	for _, s := range controlHealth {
+		errs = append(errs, errors.New(s))
+	}
 	if e := fakeErrForTesting; len(errs) == 0 && e != "" {
 		return errors.New(e)
 	}

+ 6 - 0
ipn/ipnlocal/local.go

@@ -2548,6 +2548,12 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) {
 	}
 	b.maybePauseControlClientLocked()
 
+	if nm != nil {
+		health.SetControlHealth(nm.ControlHealth)
+	} else {
+		health.SetControlHealth(nil)
+	}
+
 	// Determine if file sharing is enabled
 	fs := hasCapability(nm, tailcfg.CapabilityFileSharing)
 	if fs != b.capFileSharing {

+ 10 - 1
tailcfg/tailcfg.go

@@ -47,7 +47,8 @@ import (
 //    21: 2021-06-15: added MapResponse.DNSConfig.CertDomains
 //    22: 2021-06-16: added MapResponse.DNSConfig.ExtraRecords
 //    23: 2021-08-25: DNSConfig.Routes values may be empty (for ExtraRecords support in 1.14.1+)
-const CurrentMapRequestVersion = 23
+//    24: 2021-09-18: MapResponse.Health from control to node; node shows in "tailscale status"
+const CurrentMapRequestVersion = 24
 
 type StableID string
 
@@ -1028,6 +1029,14 @@ type MapResponse struct {
 	// user profiles only.
 	UserProfiles []UserProfile `json:",omitempty"`
 
+	// Health, if non-nil, sets the health state
+	// of the node from the control plane's perspective.
+	// A nil value means no change from the previous MapResponse.
+	// A non-nil 0-length slice restores the health to good (no known problems).
+	// A non-zero length slice is the list of problems that the control plane
+	// sees.
+	Health []string `json:",omitempty"`
+
 	// Debug is normally nil, except for when the control server
 	// is setting debug settings on a node.
 	Debug *Debug `json:",omitempty"`

+ 7 - 0
types/netmap/netmap.go

@@ -54,6 +54,13 @@ type NetworkMap struct {
 	// Debug knobs from control server for debug or feature gating.
 	Debug *tailcfg.Debug
 
+	// ControlHealth is the list of health check problems for this
+	// node from the perspective of the control plane.
+	// If empty, there are no known problems from the control plane's
+	// point of view, but the node might know about its own health
+	// check problems.
+	ControlHealth []string
+
 	// ACLs
 
 	User   tailcfg.UserID