Browse Source

health: support delayed Warnable visibility (#12783)

Updates tailscale/tailscale#4136

To reduce the likelihood of presenting spurious warnings, add the ability to delay the visibility of certain Warnables, based on a TimeToVisible time.Duration field on each Warnable. The default is zero, meaning that a Warnable is immediately visible to the user when it enters an unhealthy state.

Signed-off-by: Andrea Gottardo <[email protected]>
Andrea Gottardo 1 year ago
parent
commit
b7c3cfe049
4 changed files with 107 additions and 2 deletions
  1. 50 1
      health/health.go
  2. 47 0
      health/health_test.go
  3. 5 1
      health/state.go
  4. 5 0
      health/warnings.go

+ 50 - 1
health/health.go

@@ -69,6 +69,9 @@ type Tracker struct {
 
 	warnables   []*Warnable // keys ever set
 	warnableVal map[*Warnable]*warningState
+	// pendingVisibleTimers contains timers for Warnables that are unhealthy, but are
+	// not visible to the user yet, because they haven't been unhealthy for TimeToVisible
+	pendingVisibleTimers map[*Warnable]*time.Timer
 
 	// sysErr maps subsystems to their current error (or nil if the subsystem is healthy)
 	// Deprecated: using Warnables should be preferred
@@ -162,6 +165,7 @@ func Register(w *Warnable) *Warnable {
 	if registeredWarnables[w.Code] != nil {
 		panic(fmt.Sprintf("health: a Warnable with code %q was already registered", w.Code))
 	}
+
 	mak.Set(&registeredWarnables, w.Code, w)
 	return w
 }
@@ -218,6 +222,11 @@ type Warnable struct {
 	// the client GUI supports a tray icon, the client will display an exclamation mark
 	// on the tray icon when ImpactsConnectivity is set to true and the Warnable is unhealthy.
 	ImpactsConnectivity bool
+
+	// TimeToVisible is the Duration that the Warnable has to be in an unhealthy state before it
+	// should be surfaced as unhealthy to the user. This is used to prevent transient errors from being
+	// displayed to the user.
+	TimeToVisible time.Duration
 }
 
 // StaticMessage returns a function that always returns the input string, to be used in
@@ -291,6 +300,15 @@ func (ws *warningState) Equal(other *warningState) bool {
 	return ws.BrokenSince.Equal(other.BrokenSince) && maps.Equal(ws.Args, other.Args)
 }
 
+// IsVisible returns whether the Warnable should be visible to the user, based on the TimeToVisible
+// field of the Warnable and the BrokenSince time when the Warnable became unhealthy.
+func (w *Warnable) IsVisible(ws *warningState) bool {
+	if ws == nil || w.TimeToVisible == 0 {
+		return true
+	}
+	return time.Since(ws.BrokenSince) >= w.TimeToVisible
+}
+
 // SetUnhealthy sets a warningState for the given Warnable with the provided Args, and should be
 // called when a Warnable becomes unhealthy, or its unhealthy status needs to be updated.
 // SetUnhealthy takes ownership of args. The args can be nil if no additional information is
@@ -327,7 +345,27 @@ func (t *Tracker) setUnhealthyLocked(w *Warnable, args Args) {
 	mak.Set(&t.warnableVal, w, ws)
 	if !ws.Equal(prevWs) {
 		for _, cb := range t.watchers {
-			go cb(w, w.unhealthyState(ws))
+			// If the Warnable has been unhealthy for more than its TimeToVisible, the callback should be
+			// executed immediately. Otherwise, the callback should be enqueued to run once the Warnable
+			// becomes visible.
+			if w.IsVisible(ws) {
+				go cb(w, w.unhealthyState(ws))
+				continue
+			}
+
+			// The time remaining until the Warnable will be visible to the user is the TimeToVisible
+			// minus the time that has already passed since the Warnable became unhealthy.
+			visibleIn := w.TimeToVisible - time.Since(brokenSince)
+			mak.Set(&t.pendingVisibleTimers, w, time.AfterFunc(visibleIn, func() {
+				t.mu.Lock()
+				defer t.mu.Unlock()
+				// Check if the Warnable is still unhealthy, as it could have become healthy between the time
+				// the timer was set for and the time it was executed.
+				if t.warnableVal[w] != nil {
+					go cb(w, w.unhealthyState(ws))
+					delete(t.pendingVisibleTimers, w)
+				}
+			}))
 		}
 	}
 }
@@ -349,6 +387,13 @@ func (t *Tracker) setHealthyLocked(w *Warnable) {
 	}
 
 	delete(t.warnableVal, w)
+
+	// Stop any pending visiblity timers for this Warnable
+	if canc, ok := t.pendingVisibleTimers[w]; ok {
+		canc.Stop()
+		delete(t.pendingVisibleTimers, w)
+	}
+
 	for _, cb := range t.watchers {
 		go cb(w, nil)
 	}
@@ -861,6 +906,10 @@ func (t *Tracker) Strings() []string {
 func (t *Tracker) stringsLocked() []string {
 	result := []string{}
 	for w, ws := range t.warnableVal {
+		if !w.IsVisible(ws) {
+			// Do not append invisible warnings.
+			continue
+		}
 		if ws.Args == nil {
 			result = append(result, w.Text(Args{}))
 		} else {

+ 47 - 0
health/health_test.go

@@ -162,6 +162,53 @@ func TestWatcher(t *testing.T) {
 	}
 }
 
+// TestWatcherWithTimeToVisible tests that a registered watcher function gets called with the correct
+// Warnable and non-nil/nil UnhealthyState upon setting a Warnable to unhealthy/healthy, but the Warnable
+// has a TimeToVisible set, which means that a watcher should only be notified of an unhealthy state after
+// the TimeToVisible duration has passed.
+func TestSetUnhealthyWithTimeToVisible(t *testing.T) {
+	ht := Tracker{}
+	mw := Register(&Warnable{
+		Code:                "test-warnable-3-secs-to-visible",
+		Title:               "Test Warnable with 3 seconds to visible",
+		Text:                StaticMessage("Hello world"),
+		TimeToVisible:       2 * time.Second,
+		ImpactsConnectivity: true,
+	})
+	defer unregister(mw)
+
+	becameUnhealthy := make(chan struct{})
+	becameHealthy := make(chan struct{})
+
+	watchFunc := func(w *Warnable, us *UnhealthyState) {
+		if w != mw {
+			t.Fatalf("watcherFunc was called, but with an unexpected Warnable: %v, want: %v", w, w)
+		}
+
+		if us != nil {
+			t.Logf("watcherFunc was called with an UnhealthyState: %v", us)
+			becameUnhealthy <- struct{}{}
+		} else {
+			t.Logf("watcherFunc was called with an healthy state: %v", us)
+			becameHealthy <- struct{}{}
+		}
+	}
+
+	ht.RegisterWatcher(watchFunc)
+	ht.SetUnhealthy(mw, Args{ArgError: "Hello world"})
+
+	select {
+	case <-becameUnhealthy:
+		// Test failed because the watcher got notified of an unhealthy state
+		t.Fatalf("watcherFunc was called with an unhealthy state")
+	case <-becameHealthy:
+		// Test failed because the watcher got of a healthy state
+		t.Fatalf("watcherFunc was called with a healthy state")
+	case <-time.After(1 * time.Second):
+		// As expected, watcherFunc still had not been called after 1 second
+	}
+}
+
 func TestRegisterWarnablePanicsWithDuplicate(t *testing.T) {
 	w := &Warnable{
 		Code: "test-warnable-1",

+ 5 - 1
health/state.go

@@ -20,7 +20,7 @@ type State struct {
 	Warnings map[WarnableCode]UnhealthyState
 }
 
-// Representation contains information to be shown to the user to inform them
+// UnhealthyState contains information to be shown to the user to inform them
 // that a Warnable is currently unhealthy.
 type UnhealthyState struct {
 	WarnableCode        WarnableCode
@@ -86,6 +86,10 @@ func (t *Tracker) CurrentState() *State {
 	wm := map[WarnableCode]UnhealthyState{}
 
 	for w, ws := range t.warnableVal {
+		if !w.IsVisible(ws) {
+			// Skip invisible Warnables.
+			continue
+		}
 		wm[w.Code] = *w.unhealthyState(ws)
 	}
 

+ 5 - 0
health/warnings.go

@@ -59,6 +59,7 @@ var NetworkStatusWarnable = Register(&Warnable{
 	Severity:            SeverityMedium,
 	Text:                StaticMessage("Tailscale cannot connect because the network is down. Check your Internet connection."),
 	ImpactsConnectivity: true,
+	TimeToVisible:       5 * time.Second,
 })
 
 // IPNStateWarnable is a Warnable that warns the user that Tailscale is stopped.
@@ -101,6 +102,8 @@ var notInMapPollWarnable = Register(&Warnable{
 	Severity:  SeverityMedium,
 	DependsOn: []*Warnable{NetworkStatusWarnable},
 	Text:      StaticMessage("Unable to connect to the Tailscale coordination server to synchronize the state of your tailnet. Peer reachability might degrade over time."),
+	// 8 minutes reflects a maximum maintenance window for the coordination server.
+	TimeToVisible: 8 * time.Minute,
 })
 
 // noDERPHomeWarnable is a Warnable that warns the user that Tailscale doesn't have a home DERP.
@@ -111,6 +114,7 @@ var noDERPHomeWarnable = Register(&Warnable{
 	DependsOn:           []*Warnable{NetworkStatusWarnable},
 	Text:                StaticMessage("Tailscale could not connect to any relay server. Check your Internet connection."),
 	ImpactsConnectivity: true,
+	TimeToVisible:       10 * time.Second,
 })
 
 // noDERPConnectionWarnable is a Warnable that warns the user that Tailscale couldn't connect to a specific DERP server.
@@ -127,6 +131,7 @@ var noDERPConnectionWarnable = Register(&Warnable{
 		}
 	},
 	ImpactsConnectivity: true,
+	TimeToVisible:       10 * time.Second,
 })
 
 // derpTimeoutWarnable is a Warnable that warns the user that Tailscale hasn't heard from the home DERP region for a while.